126 lines
3.3 KiB
Python
Executable File
126 lines
3.3 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
Email content processor for run_himalaya.sh

Cleans up email content for better readability.
"""
|
|
|
|
import sys
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
|
|
def extract_domain(url):
    """Return the network location of *url* without a leading ``www.``.

    A value with no ``scheme://`` prefix (for example ``example.com/page``)
    is treated as an HTTP URL so that ``urlparse`` can locate the host.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The ``netloc`` component reported by ``urlparse`` with a leading
        ``www.`` removed (this may be an empty string when no host is
        present), or ``None`` when *url* is not a string or cannot be
        parsed.
    """
    try:
        # urlparse only populates netloc when a "scheme://" prefix exists.
        # Testing for the full prefix (rather than just "http") keeps hosts
        # such as "httpbin.org" from being mistaken for qualified URLs.
        if not re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url):
            url = "http://" + url
        netloc = urlparse(url).netloc
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a malformed URL (e.g. an unclosed IPv6
        # bracket): callers treat None as "no domain available".
        return None

    # Strip only a leading "www." so hosts that merely contain that text
    # (e.g. "awww.example.com") are left intact.
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc
|
|
|
|
|
|
def is_important_url(url, domain):
    """Decide whether a URL qualifies for preferential display.

    Args:
        url: The full URL text.
        domain: Host extracted from *url*; may be ``None`` or empty.

    Returns:
        ``True`` when *url* is shorter than 60 characters, or when *domain*
        is non-empty and contains one of the known fragments below as a
        substring; ``False`` otherwise.
    """
    known_fragments = (
        "github.com",
        "gitlab.com",
        "jira.",
        "confluence.",
        "docs.google.com",
        "drive.google.com",
        "sharepoint.com",
        "slack.com",
        "teams.microsoft.com",
        "zoom.us",
    )

    # Anything under 60 characters is short enough to keep as-is.
    if len(url) < 60:
        return True

    # Without a domain there is nothing to compare against the list.
    if not domain:
        return False

    # Substring match, so "jira." also covers hosts like "jira.corp.net".
    for fragment in known_fragments:
        if fragment in domain:
            return True
    return False
|
|
|
|
|
|
def clean_url_defenses(text):
    """Strip link-protection wrappers from *text*.

    Proofpoint ``urldefense.com/v3`` links are replaced by the URL they
    wrap; links whose host contains ``phishalarm`` are replaced by a fixed
    placeholder.

    Args:
        text: Raw text that may contain wrapped links.

    Returns:
        The text with both kinds of wrapper rewritten.
    """
    # Group 1 captures the wrapped target that sits between the "__" marks.
    proofpoint_wrapper = re.compile(
        r"https://urldefense\.com/v3/__([^;]+)__;[^$]*\$"
    )
    phishalarm_link = re.compile(r"https://[^/]*phishalarm[^/]*/[^/]*/[^$]*\$")

    unwrapped = proofpoint_wrapper.sub(r"\1", text)
    return phishalarm_link.sub("[Security Link Removed]", unwrapped)
|
|
|
|
|
|
def process_email_content(content):
    """Clean up raw email text for easier reading.

    Steps, in order: unwrap URL-defense links, drop the security banner
    delimited by ``ZjQcmQRYFpfpt`` markers, remove ``mailto:`` targets,
    shorten URLs, bold the From/To/Subject/Date header names, remove
    angle-bracketed email addresses, collapse repeated ``[domain]`` tags,
    strip trailing angle brackets, and normalise whitespace.

    Args:
        content: The email text to process.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
        Paragraph breaks are kept as a single blank line.
    """
    # Remove URL defense wrappers first so the real targets get shortened.
    content = clean_url_defenses(content)

    # DOTALL lets the banner span several lines.
    content = re.sub(
        r"ZjQcmQRYFpfpt.*?ZjQcmQRYFpfpt\w+End",
        "[Security Banner Removed]",
        content,
        flags=re.DOTALL,
    )

    # Clean up mailto links that clutter the display.
    content = re.sub(r"mailto:[^\s>]+", "", content)

    # Pattern to match URLs (conservative: the final character may not be
    # sentence punctuation, so a trailing "." or "," is left in the text).
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]\(\)]+[^\s<>"{}|\\^`\[\]\(\).,;:!?]'

    def replace_url(match):
        """Return the display form for a single matched URL."""
        url = match.group(0)
        domain = extract_domain(url)

        if is_important_url(url, domain):
            # Long URLs collapse to their domain tag; short ones stay whole.
            if domain and len(url) > 60:
                return f"[{domain}]"
            return url
        return f"[{domain}]" if domain else "[Link]"

    content = re.sub(url_pattern, replace_url, content)

    # Bold the header names. "[ \t]*" (not "\s*") keeps an empty header
    # from swallowing the following line, since "\s" also matches "\n".
    content = re.sub(
        r"^(From|To|Subject|Date):[ \t]*(.+?)$",
        r"**\1:** \2",
        content,
        flags=re.MULTILINE,
    )

    # Clean up angle brackets around email addresses that are left over.
    content = re.sub(r"<[^>]*@[^>]*>", "", content)

    # Clean up repeated domain references on the same line.
    # NOTE(review): this also drops any text between the two references.
    content = re.sub(r"\[([^\]]+)\].*?\[\1\]", r"[\1]", content)

    # Strip trailing angle brackets; only horizontal whitespace may follow
    # them, so the line's own newline is never consumed.
    content = re.sub(r"[<>]+[ \t]*$", "", content, flags=re.MULTILINE)

    # Whitespace normalisation runs last so blank lines produced by the
    # removals above are collapsed as well.
    content = re.sub(r"[ \t]+", " ", content)
    # Empty out whitespace-only lines without deleting the line break.
    content = re.sub(r"^[^\S\n]+$", "", content, flags=re.MULTILINE)
    # Cap runs of blank lines at one, preserving paragraph breaks.
    content = re.sub(r"\n{3,}", "\n\n", content)

    return content.strip()
|
|
|
|
|
|
if __name__ == "__main__":
    # Filter mode: read the whole message from stdin and print the cleaned
    # text to stdout.
    print(process_email_content(sys.stdin.read()))
|