#!/usr/bin/env python3
"""
Email content processor for run_himalaya.sh
Cleans up email content for better readability
"""

import re
import sys
from urllib.parse import urlparse

# Substrings looked for inside a URL's domain by is_important_url().
_IMPORTANT_DOMAINS = (
    "github.com",
    "gitlab.com",
    "jira.",
    "confluence.",
    "docs.google.com",
    "drive.google.com",
    "sharepoint.com",
    "slack.com",
    "teams.microsoft.com",
    "zoom.us",
)

# URLs shorter than this are always displayed verbatim.
_SHORT_URL_LIMIT = 60

# Matches an explicit "scheme://" prefix at the start of a URL.
_SCHEME_RE = re.compile(r"[A-Za-z][A-Za-z0-9+.-]*://")

# Pattern to match URLs (more conservative): the last character may not be
# sentence punctuation, so a trailing "." or "," stays outside the match.
_URL_PATTERN = r'https?://[^\s<>"{}|\\^`\[\]\(\)]+[^\s<>"{}|\\^`\[\]\(\).,;:!?]'


def extract_domain(url):
    """Extract domain from URL.

    A URL without an explicit ``scheme://`` prefix is parsed as if it
    started with ``http://``.  A single leading ``www.`` label is dropped
    from the host; ``www.`` occurring elsewhere in the host is kept.

    Returns the network location of the URL (an empty string when the URL
    has none), or ``None`` when ``urlparse`` rejects the URL.
    """
    # Test for a real "scheme://" prefix: a bare startswith("http") would
    # treat hosts such as "httpbin.org" as if they already had a scheme.
    candidate = url if _SCHEME_RE.match(url) else "http://" + url
    try:
        netloc = urlparse(candidate).netloc
    except ValueError:
        # urlparse raises ValueError for malformed URLs, e.g. an
        # unbalanced "[" in the host part.
        return None
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc


def is_important_url(url, domain):
    """Determine if URL should be shown in full.

    Returns ``True`` when the URL is shorter than 60 characters, or when
    ``domain`` is non-empty and contains one of the important-domain
    substrings; ``False`` otherwise.
    """
    # Show short URLs in full
    if len(url) < _SHORT_URL_LIMIT:
        return True

    # Longer URLs only count as important when the domain is a known one;
    # the caller still abbreviates them to "[domain]".
    return bool(domain) and any(imp in domain for imp in _IMPORTANT_DOMAINS)


def clean_url_defenses(text):
    """Remove URL defense wrappers.

    Proofpoint ``urldefense.com/v3`` wrappers are replaced by the URL they
    wrap; links whose host contains ``phishalarm`` are replaced by the
    placeholder ``[Security Link Removed]``.  Both patterns only match a
    wrapper that ends with a ``$`` character.
    """
    # Remove Proofpoint URL defense
    text = re.sub(r"https://urldefense\.com/v3/__([^;]+)__;[^$]*\$", r"\1", text)

    # Remove other common URL wrappers
    text = re.sub(
        r"https://[^/]*phishalarm[^/]*/[^/]*/[^$]*\$", "[Security Link Removed]", text
    )
    return text


def _replace_url(match):
    """Return the display form for one URL matched by ``_URL_PATTERN``.

    Short URLs are kept verbatim; anything else is abbreviated to
    ``[domain]``, or ``[Link]`` when no domain could be extracted.
    """
    url = match.group(0)
    domain = extract_domain(url)

    if is_important_url(url, domain):
        if domain and len(url) > 60:
            return f"[{domain}]"
        return url

    if domain:
        return f"[{domain}]"
    return "[Link]"


def process_email_content(content):
    """Process email content for better readability.

    Applies, in order: URL-defense unwrapping, security-banner removal,
    ``mailto:`` removal, URL abbreviation, bolding of From/To/Subject/Date
    header lines, removal of ``<...@...>`` address brackets, whitespace
    normalisation, de-duplication of repeated ``[reference]`` tokens and
    trimming of stray angle brackets at line ends.

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    # Remove URL defense wrappers first
    content = clean_url_defenses(content)

    # Clean up common email artifacts first: everything between the
    # banner's opening marker and its "...End" marker is replaced.
    content = re.sub(
        r"ZjQcmQRYFpfpt.*?ZjQcmQRYFpfpt\w+End",
        "[Security Banner Removed]",
        content,
        flags=re.DOTALL,
    )

    # Clean up mailto links that clutter the display
    content = re.sub(r"mailto:[^\s>]+", "", content)

    # Replace URLs
    content = re.sub(_URL_PATTERN, _replace_url, content)

    # Clean up email headers formatting.  Only spaces/tabs may separate the
    # colon from the value: matching any whitespace here would let an empty
    # header swallow the following line as its value.
    content = re.sub(
        r"^(From|To|Subject|Date):[ \t]*(\S.*?)$",
        r"**\1:** \2",
        content,
        flags=re.MULTILINE,
    )

    # Clean up angle brackets around email addresses that are left over
    content = re.sub(r"<[^>]*@[^>]*>", "", content)

    # Collapse runs of three or more newlines and runs of spaces/tabs
    content = re.sub(r"\n\s*\n\s*\n+", "\n\n", content)
    content = re.sub(r"[ \t]+", " ", content)

    # Remove lines that are just whitespace.
    # NOTE: this pattern also matches completely empty lines, so the
    # paragraph breaks kept by the step above do not survive it.
    content = re.sub(r"^\s*$\n", "", content, flags=re.MULTILINE)

    # Clean up repeated domain references on same line.
    # NOTE: everything between two identical bracketed tokens on one line
    # is dropped together with the duplicate.
    content = re.sub(r"\[([^\]]+)\].*?\[\1\]", r"[\1]", content)

    # Clean up trailing angle brackets and other artifacts
    content = re.sub(r"[<>]+\s*$", "", content, flags=re.MULTILINE)

    return content.strip()


def main():
    """Read an email from stdin and print the cleaned version to stdout."""
    content = sys.stdin.read()
    processed = process_email_content(content)
    print(processed)


if __name__ == "__main__":
    main()