Files
luk/shell/email_processor.py
2025-07-15 22:13:46 -04:00

126 lines
3.3 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Email content processor for run_himalaya.sh
Cleans up email content for better readability
"""
import sys
import re
from urllib.parse import urlparse
def extract_domain(url):
    """Extract the domain (network location) from a URL.

    When ``url`` does not start with "http", "http://" is prepended before
    parsing so that bare hosts such as "example.com/page" are treated as
    hosts rather than paths.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The parsed netloc with a single leading "www." removed (this may be
        an empty string when the URL has no host part), or None when the
        value cannot be parsed as a URL.
    """
    try:
        parsed = urlparse(url if url.startswith("http") else "http://" + url)
    except (AttributeError, TypeError, ValueError):
        # AttributeError/TypeError: url is not a str (e.g. None or bytes).
        # ValueError: urlparse rejects malformed input such as an unclosed
        # IPv6 bracket. Anything else is a real error and should propagate.
        return None

    netloc = parsed.netloc
    # Strip only a leading "www." label; removing every occurrence would
    # corrupt hosts that merely contain it, e.g. "awww.example.com".
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc
def is_important_url(url, domain):
    """Decide whether a URL qualifies as "important".

    Args:
        url: The full URL string.
        domain: The URL's domain, or a falsy value when it is unknown.

    Returns:
        True when the URL is shorter than 60 characters, or when ``domain``
        contains one of the listed domain fragments; False otherwise.
    """
    # Short URLs always qualify, whatever their domain.
    if len(url) < 60:
        return True

    # Without a domain there is nothing left to match against.
    if not domain:
        return False

    # Fragments are matched as substrings, so "jira." also covers hosts
    # such as "jira.example.com".
    watched_fragments = (
        "github.com",
        "gitlab.com",
        "jira.",
        "confluence.",
        "docs.google.com",
        "drive.google.com",
        "sharepoint.com",
        "slack.com",
        "teams.microsoft.com",
        "zoom.us",
    )
    for fragment in watched_fragments:
        if fragment in domain:
            return True
    return False
def clean_url_defenses(text):
    """Strip security-gateway URL wrappers from ``text``.

    Args:
        text: The text to clean.

    Returns:
        The text with urldefense.com v3 wrappers replaced by the portion
        captured between "__" and "__;", and with phishalarm links replaced
        by the placeholder "[Security Link Removed]".
    """
    # Proofpoint URL defense: keep only the wrapped target.
    proofpoint_wrapper = re.compile(
        r"https://urldefense\.com/v3/__([^;]+)__;[^$]*\$"
    )
    # Links on a host containing "phishalarm", up to the closing "$".
    phishalarm_link = re.compile(
        r"https://[^/]*phishalarm[^/]*/[^/]*/[^$]*\$"
    )

    unwrapped = proofpoint_wrapper.sub(r"\1", text)
    return phishalarm_link.sub("[Security Link Removed]", unwrapped)
def process_email_content(content):
    """Clean up email text for better readability.

    URL defense wrappers are removed first, then a fixed sequence of regex
    substitutions is applied in order, and finally the result is stripped
    of leading and trailing whitespace.

    Args:
        content: The raw email text.

    Returns:
        The cleaned text.
    """

    def _shorten_link(found):
        """Return the matched URL unchanged or a bracketed placeholder."""
        link = found.group(0)
        host = extract_domain(link)
        keep_as_is = is_important_url(link, host) and not (host and len(link) > 60)
        if keep_as_is:
            return link
        return f"[{host}]" if host else "[Link]"

    # Ordered (pattern, replacement, flags) steps; later steps rely on the
    # output of earlier ones, so the order must not change.
    steps = (
        # Security banner delimited by the "ZjQcmQRYFpfpt" markers.
        (
            r"ZjQcmQRYFpfpt.*?ZjQcmQRYFpfpt\w+End",
            "[Security Banner Removed]",
            re.DOTALL,
        ),
        # mailto links that clutter the display.
        (r"mailto:[^\s>]+", "", 0),
        # http(s) URLs (conservative match), kept or abbreviated.
        (
            r'https?://[^\s<>"{}|\\^`\[\]\(\)]+[^\s<>"{}|\\^`\[\]\(\).,;:!?]',
            _shorten_link,
            0,
        ),
        # Wrap the From/To/Subject/Date header labels in ** markers.
        (r"^(From|To|Subject|Date):\s*(.+?)$", r"**\1:** \2", re.MULTILINE),
        # Leftover angle-bracketed email addresses.
        (r"<[^>]*@[^>]*>", "", 0),
        # Collapse three or more line breaks into one blank line.
        (r"\n\s*\n\s*\n+", "\n\n", 0),
        # Squeeze runs of spaces and tabs into a single space.
        (r"[ \t]+", " ", 0),
        # Drop whitespace-only lines.
        # NOTE(review): this also removes the blank lines kept by the
        # collapse step above, so the output has no empty lines - confirm
        # that is intended.
        (r"^\s*$\n", "", re.MULTILINE),
        # Repeated bracketed reference on one line: everything from the
        # first occurrence through the second collapses to a single one.
        (r"\[([^\]]+)\].*?\[\1\]", r"[\1]", 0),
        # Trailing angle brackets at the end of a line.
        (r"[<>]+\s*$", "", re.MULTILINE),
    )

    cleaned = clean_url_defenses(content)
    for pattern, replacement, flags in steps:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()
if __name__ == "__main__":
    # Filter mode: read the whole message from stdin, print the cleaned text.
    print(process_email_content(sys.stdin.read()))