trying a simple shell script and fixing archives

2025-07-15 22:13:46 -04:00
parent f7474a3805
commit df4c49c3ef
18 changed files with 1273 additions and 389 deletions
--- a/shell/email_processor.py
+++ b/shell/email_processor.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python3
+"""
+Email content processor for run_himalaya.sh
+Cleans up email content for better readability
+"""
+
+import sys
+import re
+from urllib.parse import urlparse
+
+
+def extract_domain(url):
+    """Extract domain from URL
+
+    Returns the network-location part of ``url`` (as reported by
+    ``urlparse``) with a leading ``www.`` removed. Input that does not begin
+    with a ``scheme://`` prefix is parsed as if it started with ``http://``.
+
+    The result is an empty string when the URL has no network location, and
+    None when ``url`` is not a string or cannot be parsed.
+    """
+    try:
+        # Look for a real "scheme://" prefix instead of just the letters
+        # "http", so hosts such as "httpbin.org/get" still get a scheme
+        # prepended and "ftp://..." / "HTTP://..." are left intact.
+        has_scheme = re.match(r"[A-Za-z][A-Za-z0-9+.-]*://", url) is not None
+        netloc = urlparse(url if has_scheme else "http://" + url).netloc
+    except (TypeError, ValueError):
+        # TypeError: url is not a str. ValueError: urlparse rejected the
+        # authority (e.g. unbalanced IPv6 brackets). Stay best-effort.
+        return None
+
+    # Strip "www." only as a prefix; a plain replace() would also corrupt
+    # hosts that merely contain it, e.g. "awww.example.com".
+    if netloc.startswith("www."):
+        netloc = netloc[len("www."):]
+    return netloc
+
+
+def is_important_url(url, domain):
+    """Decide whether a URL counts as important.
+
+    Returns True when ``url`` is shorter than 60 characters, or when
+    ``domain`` is non-empty and contains one of the known collaboration /
+    code-hosting fragments listed below. Returns False otherwise.
+    """
+    # Short URLs are always considered important, whatever their domain.
+    if len(url) < 60:
+        return True
+
+    # Without a domain there is nothing left to match against.
+    if not domain:
+        return False
+
+    # Matched as plain substrings of the domain ("jira." and "confluence."
+    # are deliberately partial so they can hit hosts like jira.<company>).
+    known_fragments = (
+        "github.com",
+        "gitlab.com",
+        "jira.",
+        "confluence.",
+        "docs.google.com",
+        "drive.google.com",
+        "sharepoint.com",
+        "slack.com",
+        "teams.microsoft.com",
+        "zoom.us",
+    )
+    for fragment in known_fragments:
+        if fragment in domain:
+            return True
+    return False
+
+
+def clean_url_defenses(text):
+    """Strip link-rewriting wrappers from ``text`` and return the result.
+
+    Proofpoint ``urldefense.com/v3`` links are replaced by the URL captured
+    between the ``__`` markers; links whose host contains ``phishalarm`` are
+    replaced by the placeholder ``[Security Link Removed]``.
+    """
+    # Wrapper shape: https://urldefense.com/v3/__<target>__;<trailer>$
+    proofpoint_wrapper = re.compile(
+        r"https://urldefense\.com/v3/__([^;]+)__;[^$]*\$"
+    )
+    # Any https link on a *phishalarm* host with two path segments, up to "$".
+    phishalarm_wrapper = re.compile(
+        r"https://[^/]*phishalarm[^/]*/[^/]*/[^$]*\$"
+    )
+
+    unwrapped = proofpoint_wrapper.sub(r"\1", text)
+    return phishalarm_wrapper.sub("[Security Link Removed]", unwrapped)
+
+
+def process_email_content(content):
+    """Process email content for better readability
+
+    Applies, in order:
+
+    1. removal of URL-defense wrappers (``clean_url_defenses``);
+    2. replacement of ``ZjQcmQRYFpfpt...End`` banner blocks with
+       ``[Security Banner Removed]``;
+    3. removal of ``mailto:`` targets;
+    4. shortening of http(s) URLs: a URL is kept verbatim or replaced by
+       ``[domain]`` / ``[Link]`` depending on ``is_important_url`` and its
+       length;
+    5. bold formatting of ``From``/``To``/``Subject``/``Date`` header lines;
+    6. removal of ``<...@...>`` address brackets;
+    7. clean-up of duplicated ``[domain]`` references, trailing angle
+       brackets and surplus whitespace, keeping single blank lines as
+       paragraph breaks.
+
+    Returns the cleaned text with leading/trailing whitespace stripped.
+    """
+    # Remove URL defense wrappers first so the real targets get shortened.
+    content = clean_url_defenses(content)
+
+    # Security banners are delimited by marker tokens and may span lines.
+    content = re.sub(
+        r"ZjQcmQRYFpfpt.*?ZjQcmQRYFpfpt\w+End",
+        "[Security Banner Removed]",
+        content,
+        flags=re.DOTALL,
+    )
+
+    # Clean up mailto links that clutter the display
+    content = re.sub(r"mailto:[^\s>]+", "", content)
+
+    # Pattern to match URLs (more conservative): the final character class
+    # keeps trailing punctuation out of the match.
+    url_pattern = r'https?://[^\s<>"{}|\\^`\[\]\(\)]+[^\s<>"{}|\\^`\[\]\(\).,;:!?]'
+
+    def replace_url(match):
+        url = match.group(0)
+        domain = extract_domain(url)
+
+        # Keep the URL verbatim only when it is flagged important and is not
+        # a long URL with a known domain; everything else is abbreviated.
+        if is_important_url(url, domain) and not (domain and len(url) > 60):
+            return url
+        return f"[{domain}]" if domain else "[Link]"
+
+    # Replace URLs
+    content = re.sub(url_pattern, replace_url, content)
+
+    # Format header lines. "[^\S\n]" is "whitespace except newline", so a
+    # header with an empty value can no longer swallow the following line.
+    content = re.sub(
+        r"^(From|To|Subject|Date):[^\S\n]*(.+?)$",
+        r"**\1:** \2",
+        content,
+        flags=re.MULTILINE,
+    )
+
+    # Clean up angle brackets around email addresses that are left over
+    content = re.sub(r"<[^>]*@[^>]*>", "", content)
+
+    # Collapse runs of spaces/tabs inside lines.
+    content = re.sub(r"[ \t]+", " ", content)
+
+    # Collapse a reference that is immediately repeated (separated only by
+    # spaces, angle brackets or parentheses). Text between two identical
+    # references is kept rather than deleted.
+    content = re.sub(r"\[([^\]]+)\](?:[ \t<>()]*\[\1\])+", r"[\1]", content)
+
+    # Clean up trailing angle brackets at line ends without consuming the
+    # line break itself.
+    content = re.sub(r"[<>]+[^\S\n]*$", "", content, flags=re.MULTILINE)
+
+    # Empty out whitespace-only lines (the newline stays), then squeeze any
+    # run of blank lines down to one so paragraph breaks are preserved.
+    content = re.sub(r"^[^\S\n]+$", "", content, flags=re.MULTILINE)
+    content = re.sub(r"\n{3,}", "\n\n", content)
+
+    return content.strip()
+
+
+if __name__ == "__main__":
+    # Filter mode: read the whole message from stdin, print the cleaned
+    # version to stdout.
+    print(process_email_content(sys.stdin.read()))