# luk/shell/email_processor.py

#!/usr/bin/env python3
"""
Email content processor for run_himalaya.sh
Cleans up email content for better readability
"""

import sys
import re
from urllib.parse import urlparse


def extract_domain(url):
    """Return the network location of ``url`` without a leading ``www.``.

    A URL that does not start with a ``scheme://`` prefix is parsed as if it
    were prefixed with ``http://`` so that bare hosts such as
    ``example.com/path`` still yield a domain.

    Args:
        url: URL string, with or without a scheme.

    Returns:
        The ``netloc`` component reported by ``urlparse`` with one leading
        ``www.`` removed, or ``None`` when ``url`` is not a string or cannot
        be parsed.
    """
    try:
        # Look for an actual "scheme://" prefix rather than the bare text
        # "http", so hosts like "httpbin.org" are not mistaken for a scheme.
        has_scheme = re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url) is not None
        parsed = urlparse(url if has_scheme else "http://" + url)
    except (AttributeError, TypeError, ValueError):
        # Non-string input or a malformed URL (for example an unbalanced
        # IPv6 bracket): keep the best-effort contract and report no domain.
        return None

    netloc = parsed.netloc
    # Strip only a leading "www."; removing the substring anywhere would
    # corrupt hosts such as "awww.example.com".
    if netloc.startswith("www."):
        netloc = netloc[len("www."):]
    return netloc


def is_important_url(url, domain):
    """Tell whether a URL is short or belongs to a notable service.

    Args:
        url: The full URL text.
        domain: Domain extracted from ``url``; may be ``None`` or empty.

    Returns:
        ``True`` when ``url`` is shorter than 60 characters, or when
        ``domain`` contains one of the notable-service fragments listed
        below; ``False`` otherwise.
    """
    # Anything under 60 characters is short enough to count on its own.
    if len(url) < 60:
        return True

    # Without a domain there is nothing left to compare.
    if not domain:
        return False

    # Fragments are matched as substrings of the domain.
    noteworthy_fragments = (
        "github.com",
        "gitlab.com",
        "jira.",
        "confluence.",
        "docs.google.com",
        "drive.google.com",
        "sharepoint.com",
        "slack.com",
        "teams.microsoft.com",
        "zoom.us",
    )
    for fragment in noteworthy_fragments:
        if fragment in domain:
            return True
    return False


def clean_url_defenses(text):
    """Undo link rewriting added by URL-protection wrappers.

    Two wrapper shapes are handled:

    * ``https://urldefense.com/v3/__<url>__;<suffix>$`` is replaced by the
      wrapped ``<url>``.
    * ``https://<host containing "phishalarm">/<segment>/<rest>$`` is
      replaced by the literal text ``[Security Link Removed]``.

    Every part of a wrapper must be free of whitespace. A wrapper that is
    missing its closing ``$`` is therefore left untouched instead of
    swallowing ordinary text up to the next ``$`` in the message.

    Args:
        text: Email text that may contain wrapped links.

    Returns:
        ``text`` with the recognised wrappers rewritten.
    """
    # urldefense.com v3: keep only the URL captured between the "__" markers.
    text = re.sub(
        r"https://urldefense\.com/v3/__([^;\s]+)__;[^$\s]*\$", r"\1", text
    )

    # phishalarm-style links carry no recoverable target, so use a placeholder.
    text = re.sub(
        r"https://[^/\s]*phishalarm[^/\s]*/[^/\s]*/[^$\s]*\$",
        "[Security Link Removed]",
        text,
    )

    return text


def process_email_content(content):
    """Clean up raw email text so it is easier to read.

    The steps run in this order: rewrite URL-defense wrappers, drop the
    ``ZjQcmQRYFpfpt``-delimited banner, strip ``mailto:`` targets, replace
    URLs with either the full URL or a ``[domain]`` tag, bold the
    From/To/Subject/Date header names, remove angle-bracketed addresses,
    and tidy whitespace and leftover artifacts.

    Args:
        content: Email text as a single string.

    Returns:
        The cleaned text with leading and trailing whitespace stripped.
    """

    # Remove URL defense wrappers first so the wrapped URLs are seen by the
    # URL replacement below.
    content = clean_url_defenses(content)

    # Drop the banner block delimited by the ZjQcmQRYFpfpt markers; DOTALL
    # lets the block span several lines.
    content = re.sub(
        r"ZjQcmQRYFpfpt.*?ZjQcmQRYFpfpt\w+End",
        "[Security Banner Removed]",
        content,
        flags=re.DOTALL,
    )

    # Clean up mailto links that clutter the display
    content = re.sub(r"mailto:[^\s>]+", "", content)

    # Pattern to match URLs (more conservative): the final character may not
    # be sentence punctuation, so a trailing "." or "," stays in the text.
    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]\(\)]+[^\s<>"{}|\\^`\[\]\(\).,;:!?]'

    def replace_url(match):
        """Return the display form for one matched URL."""
        url = match.group(0)
        domain = extract_domain(url)
        shortened = f"[{domain}]" if domain else "[Link]"

        if not is_important_url(url, domain):
            return shortened
        # Short or important URLs are kept verbatim, except that anything
        # longer than 60 characters with a known domain is still collapsed.
        if domain and len(url) > 60:
            return shortened
        return url

    # Replace URLs
    content = re.sub(url_pattern, replace_url, content)

    # Bold the header names. Only spaces/tabs are skipped after the colon:
    # "\s*" would also match the newline, so a header with an empty value
    # would pull the following line into the header.
    content = re.sub(
        r"^(From|To|Subject|Date):[ \t]*(.+?)$",
        r"**\1:** \2",
        content,
        flags=re.MULTILINE,
    )

    # Clean up angle brackets around email addresses that are left over
    content = re.sub(r"<[^>]*@[^>]*>", "", content)

    # Collapse runs of three or more line breaks to two, then squeeze
    # horizontal whitespace.
    content = re.sub(r"\n\s*\n\s*\n+", "\n\n", content)
    content = re.sub(r"[ \t]+", " ", content)

    # Remove lines that are empty or contain only whitespace.
    # NOTE(review): this also removes the empty lines that the collapse
    # above kept as paragraph breaks - confirm that is intended.
    content = re.sub(r"^\s*$\n", "", content, flags=re.MULTILINE)

    # When the same [tag] appears twice on one line, keep a single copy;
    # the text between the two occurrences is dropped as well.
    content = re.sub(r"\[([^\]]+)\].*?\[\1\]", r"[\1]", content)

    # Clean up trailing angle brackets and other artifacts
    content = re.sub(r"[<>]+\s*$", "", content, flags=re.MULTILINE)

    return content.strip()


if __name__ == "__main__":
    # Filter mode: read the whole message from stdin, write the cleaned
    # text to stdout.
    print(process_email_content(sys.stdin.read()))