trying a simple shell script and fixing archives
This commit is contained in:
125
shell/email_processor.py
Executable file
125
shell/email_processor.py
Executable file
@@ -0,0 +1,125 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Email content processor for run_himalaya.sh
|
||||
Cleans up email content for better readability
|
||||
"""
|
||||
|
||||
import sys
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def extract_domain(url):
    """Return the network location of *url* with a leading ``www.`` removed.

    Args:
        url: An absolute ``http://`` / ``https://`` URL, or a bare
            ``host/path`` string (``http://`` is assumed when no scheme
            is present).

    Returns:
        The netloc portion of the URL (it may still carry a port or
        user-info, and may be empty when no host is present), or ``None``
        when *url* is not a string or cannot be parsed.
    """
    try:
        # Test for the full scheme separator rather than just the "http"
        # prefix, so hosts such as "httpbin.org" still get a scheme prepended.
        has_scheme = url.lower().startswith(("http://", "https://"))
        parsed = urlparse(url if has_scheme else "http://" + url)
    except (AttributeError, TypeError, ValueError):
        # Non-string input, or a malformed netloc rejected by urlparse.
        return None

    netloc = parsed.netloc
    # Strip only a leading "www." -- str.replace would also mangle hosts
    # that merely contain the substring (e.g. "awww.example.com").
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc
|
||||
|
||||
|
||||
# Substrings looked for in a domain to decide that a long URL is "important".
# Entries ending in "." (e.g. "jira.") are meant to match any host that
# contains that label prefix, such as "jira.example.com".
_IMPORTANT_DOMAINS = (
    "github.com",
    "gitlab.com",
    "jira.",
    "confluence.",
    "docs.google.com",
    "drive.google.com",
    "sharepoint.com",
    "slack.com",
    "teams.microsoft.com",
    "zoom.us",
)


def is_important_url(url, domain, *, max_full_length=60, important_domains=None):
    """Decide whether *url* is worth surfacing to the reader.

    Args:
        url: The full URL text.
        domain: The URL's domain, or a falsy value when it is unknown.
        max_full_length: URLs strictly shorter than this are always
            considered important.
        important_domains: Optional iterable of substrings that replaces the
            built-in list of important domains.

    Returns:
        ``True`` when the URL is short, or when *domain* contains one of the
        important-domain substrings (compared case-insensitively, since host
        names are not case-sensitive); ``False`` otherwise.
    """
    # Short URLs are always shown.
    if len(url) < max_full_length:
        return True

    # Without a domain there is nothing to match against.
    if not domain:
        return False

    markers = _IMPORTANT_DOMAINS if important_domains is None else important_domains
    host = domain.lower()
    # NOTE: this is a substring test, so "notgithub.com" also matches
    # "github.com".
    return any(marker.lower() in host for marker in markers)
|
||||
|
||||
|
||||
def clean_url_defenses(text):
    """Strip security-gateway URL wrappers from *text*.

    Two rewrites are applied, in order:

    * ``https://urldefense.com/v3/__<inner>__;<suffix>$`` is replaced by
      ``<inner>``.
    * ``https://<host containing "phishalarm">/<segment>/<rest>$`` is
      replaced by ``[Security Link Removed]``.

    Args:
        text: The email text to clean.

    Returns:
        The text with matching wrappers rewritten; everything else is left
        untouched.
    """
    # Every character class excludes whitespace: a URL never contains it, and
    # without that guard a truncated wrapper (one missing its closing "$")
    # would swallow ordinary text up to the next "$" anywhere in the email.
    text = re.sub(
        r"https://urldefense\.com/v3/__([^;\s]+)__;[^$\s]*\$", r"\1", text
    )

    # Remove other common URL wrappers.
    text = re.sub(
        r"https://[^/\s]*phishalarm[^/\s]*/[^/\s]*/[^$\s]*\$",
        "[Security Link Removed]",
        text,
    )

    return text
|
||||
|
||||
|
||||
def process_email_content(content):
    """Clean up raw email text so it reads well in a terminal.

    The steps, in order:

    1. Unwrap or remove security-gateway URL wrappers.
    2. Replace the ``ZjQcmQRYFpfpt...End`` banner block with a placeholder.
    3. Drop ``mailto:`` links.
    4. Shorten URLs: short ones are kept verbatim, long ones become
       ``[domain]`` (or ``[Link]`` when no domain can be extracted).
    5. Bold the ``From``/``To``/``Subject``/``Date`` header names.
    6. Remove ``<address@host>`` leftovers, duplicated ``[domain]`` tags and
       trailing angle brackets.
    7. Normalize whitespace while keeping single blank lines as paragraph
       breaks.

    Args:
        content: The raw email text.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """
    # Remove URL defense wrappers first so the inner URLs get shortened below.
    content = clean_url_defenses(content)

    # The banner can span several lines, hence DOTALL.
    content = re.sub(
        r"ZjQcmQRYFpfpt.*?ZjQcmQRYFpfpt\w+End",
        "[Security Banner Removed]",
        content,
        flags=re.DOTALL,
    )

    # Clean up mailto links that clutter the display.
    content = re.sub(r"mailto:[^\s>]+", "", content)

    # Conservative URL pattern: the final character may not be sentence
    # punctuation, so "see https://x.org/a." leaves the full stop in place.
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]\(\)]+[^\s<>"{}|\\^`\[\]\(\).,;:!?]'

    def replace_url(match):
        url = match.group(0)
        domain = extract_domain(url)
        # Important URLs are kept verbatim unless they are long and have a
        # usable domain. NOTE: every other URL collapses to "[domain]", so
        # long URLs currently render the same whether or not their domain
        # is on the important list.
        if is_important_url(url, domain) and not (domain and len(url) > 60):
            return url
        return f"[{domain}]" if domain else "[Link]"

    content = re.sub(url_pattern, replace_url, content)

    # Bold the header names. "[ \t]*" (not "\s*") keeps an empty header such
    # as "To:" from swallowing the line that follows it.
    content = re.sub(
        r"^(From|To|Subject|Date):[ \t]*(.+?)$",
        r"**\1:** \2",
        content,
        flags=re.MULTILINE,
    )

    # Remove "<user@host>" leftovers. Whitespace and nested brackets are
    # excluded so a stray "<" cannot pair with a ">" quote marker on a later
    # line and delete everything in between.
    content = re.sub(r"<[^<>\s]*@[^<>\s]*>", "", content)

    # Collapse runs of spaces/tabs.
    content = re.sub(r"[ \t]+", " ", content)

    # Collapse "[domain] <[domain]>"-style repeats. Only brackets and blanks
    # may sit between the repeats, so real text between two references to
    # the same domain is never dropped.
    content = re.sub(r"\[([^\]]+)\](?:[ \t<>()]*\[\1\])+", r"[\1]", content)

    # Clean up trailing angle brackets, staying on the same line.
    content = re.sub(r"[<>]+[ \t]*$", "", content, flags=re.MULTILINE)

    # Strip trailing whitespace (this also empties whitespace-only lines),
    # then cap consecutive blank lines at one so paragraph breaks survive.
    content = re.sub(r"[ \t\r]+$", "", content, flags=re.MULTILINE)
    content = re.sub(r"\n{3,}", "\n\n", content)

    return content.strip()
|
||||
|
||||
|
||||
def main():
    """Read an email from stdin, clean it up, and print it to stdout."""
    raw_email = sys.stdin.read()
    print(process_email_content(raw_email))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user