adding UV

2025-05-08 12:02:24 -06:00
parent 7e42644224
commit eba883a465
10 changed files with 856 additions and 126 deletions
--- a/fetch_outlook.py
+++ b/fetch_outlook.py
@@ -1,22 +1,26 @@
+import glob
+import json
 import os
 import re
-from typing import Set
-import msal
-import json
-import glob
-from datetime import datetime
-from dateutil import parser
-from dateutil.tz import UTC
+import time
+
+from datetime import datetime, timedelta
 from email.message import EmailMessage
 from email.utils import format_datetime
+from typing import Set
+
+from dateutil import parser
+from dateutil.tz import UTC
 from rich import print
-from rich.progress import Progress, SpinnerColumn, MofNCompleteColumn
 from rich.panel import Panel
-import time
-import html2text
-import asyncio
-import argparse
+from rich.progress import Progress, SpinnerColumn, MofNCompleteColumn
+
 import aiohttp
+import argparse
+import asyncio
+import html2text
+import msal
+import orjson

 # Filepath for caching timestamp
 cache_timestamp_file = 'cache_timestamp.json'
@@ -44,32 +48,39 @@ args = arg_parser.parse_args()

 dry_run = args.dry_run

+# Define a global semaphore for throttling
+semaphore = asyncio.Semaphore(4)
+
 async def fetch_with_aiohttp(url, headers):
-    async with aiohttp.ClientSession() as session:
-        async with session.get(url, headers=headers) as response:
-            if response.status != 200:
-                raise Exception(f"Failed to fetch {url}: {response.status} {await response.text()}")
-            return await response.json()
+    async with semaphore:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=headers) as response:
+                if response.status != 200:
+                    raise Exception(f"Failed to fetch {url}: {response.status} {await response.text()}")
+                raw_bytes = await response.read()
+                content_length = response.headers.get('Content-Length')
+                if content_length and len(raw_bytes) != int(content_length):
+                    print("Warning: Incomplete response received!")
+                    return None
+                return orjson.loads(raw_bytes)

 async def post_with_aiohttp(url, headers, json_data):
-    async with aiohttp.ClientSession() as session:
-        async with session.post(url, headers=headers, json=json_data) as response:
-            if response.status != 201:
-                raise Exception(f"Failed to post to {url}: {response.status} {await response.text()}")
-            return await response.json()
+    async with semaphore:
+        async with aiohttp.ClientSession() as session:
+            async with session.post(url, headers=headers, json=json_data) as response:
+                return response.status

 async def patch_with_aiohttp(url, headers, json_data):
-    async with aiohttp.ClientSession() as session:
-        async with session.patch(url, headers=headers, json=json_data) as response:
-            if response.status != 200:
-                raise Exception(f"Failed to patch {url}: {response.status} {await response.text()}")
-            return await response.json()
+    async with semaphore:
+        async with aiohttp.ClientSession() as session:
+            async with session.patch(url, headers=headers, json=json_data) as response:
+                return response.status

 async def delete_with_aiohttp(url, headers):
-    async with aiohttp.ClientSession() as session:
-        async with session.delete(url, headers=headers) as response:
-            if response.status != 204:
-                raise Exception(f"Failed to delete {url}: {response.status} {await response.text()}")
+    async with semaphore:
+        async with aiohttp.ClientSession() as session:
+            async with session.delete(url, headers=headers) as response:
+                return response.status

 async def synchronize_maildir_async(maildir_path, headers, progress, task_id):
    last_sync = load_last_sync_timestamp()
@@ -78,19 +89,26 @@ async def synchronize_maildir_async(maildir_path, headers, progress, task_id):
    # Find messages moved from "new" to "cur" and mark them as read
    new_dir = os.path.join(maildir_path, 'new')
    cur_dir = os.path.join(maildir_path, 'cur')
-    new_files = set(glob.glob(os.path.join(new_dir, '*.eml')))
-    cur_files = set(glob.glob(os.path.join(cur_dir, '*.eml')))
+    new_files = set(glob.glob(os.path.join(new_dir, '*.eml*')))
+    cur_files = set(glob.glob(os.path.join(cur_dir, '*.eml*')))

    moved_to_cur = [os.path.basename(f) for f in cur_files - new_files]
    progress.update(task_id, total=len(moved_to_cur))
    for filename in moved_to_cur:
-        message_id = filename.split('.')[0]  # Extract the Message-ID from the filename
+        # TODO: this isn't scalable, we should use a more efficient way to check if the file was modified
+        if os.path.getmtime(os.path.join(cur_dir, filename)) < last_sync:
+            progress.update(task_id, advance=1)
+            continue
+        message_id = re.sub(r"\:2.+", "", filename.split('.')[0])  # Extract the Message-ID from the filename
        if not dry_run:
-            await patch_with_aiohttp(
+            status = await patch_with_aiohttp(
                f'https://graph.microsoft.com/v1.0/me/messages/{message_id}',
                headers,
                {'isRead': True}
            )
+            if status == 404:
+                os.remove(os.path.join(cur_dir, filename))
+
        else:
            progress.console.print(f"[DRY-RUN] Would mark message as read: {message_id}")
        progress.advance(task_id)
@@ -102,18 +120,23 @@ async def synchronize_maildir_async(maildir_path, headers, progress, task_id):
        progress.console.print("[DRY-RUN] Would save sync timestamp.")

 async def fetch_mail_async(maildir_path, attachments_dir, headers, progress, task_id):
-    mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc&$select=id,subject,from,toRecipients,ccRecipients,receivedDateTime,isRead,body,attachments'
+    mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc&$select=id,subject,from,toRecipients,ccRecipients,receivedDateTime,isRead'
    messages = []

    # Fetch the total count of messages in the inbox
    inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox'
+
    response = await fetch_with_aiohttp(inbox_url, headers)

    total_messages = response.get('totalItemCount', 0)
    progress.update(task_id, total=total_messages)

    while mail_url:
-        response_data = await fetch_with_aiohttp(mail_url, headers)
+        try:
+            response_data = await fetch_with_aiohttp(mail_url, headers)
+        except Exception as e:
+            progress.console.print(f"Error fetching messages: {e}")
+            continue
        messages.extend(response_data.get('value', []))
        progress.advance(task_id, len(response_data.get('value', [])))

@@ -124,8 +147,8 @@ async def fetch_mail_async(maildir_path, attachments_dir, headers, progress, tas
    progress.update(task_id, completed=(len(messages) / 2))
    new_dir = os.path.join(maildir_path, 'new')
    cur_dir = os.path.join(maildir_path, 'cur')
-    new_files = set(glob.glob(os.path.join(new_dir, '*.eml')))
-    cur_files = set(glob.glob(os.path.join(cur_dir, '*.eml')))
+    new_files = set(glob.glob(os.path.join(new_dir, '*.eml*')))
+    cur_files = set(glob.glob(os.path.join(cur_dir, '*.eml*')))

    for filename in Set.union(cur_files, new_files):
        message_id = filename.split('.')[0].split('/')[-1]  # Extract the Message-ID from the filename
@@ -138,14 +161,14 @@ async def fetch_mail_async(maildir_path, attachments_dir, headers, progress, tas

    for message in messages:
        progress.console.print(f"Processing message: {message.get('subject', 'No Subject')}", end='\r')
-        save_email_to_maildir(maildir_path, message, attachments_dir, progress)
+        await save_mime_to_maildir_async(maildir_path, message, attachments_dir, headers, progress)
        progress.update(task_id, advance=0.5)
    progress.update(task_id, completed=len(messages))
    progress.console.print(f"\nFinished saving {len(messages)} messages.")

 async def archive_mail_async(maildir_path, headers, progress, task_id):
    archive_dir = os.path.join(maildir_path, '.Archives')
-    archive_files = glob.glob(os.path.join(archive_dir, '**', '*.eml'), recursive=True)
+    archive_files = glob.glob(os.path.join(archive_dir, '**', '*.eml*'), recursive=True)
    progress.update(task_id, total=len(archive_files))

    folder_response = await fetch_with_aiohttp('https://graph.microsoft.com/v1.0/me/mailFolders', headers)
@@ -157,17 +180,20 @@ async def archive_mail_async(maildir_path, headers, progress, task_id):

    for filepath in archive_files:
        message_id = os.path.basename(filepath).split('.')[0]  # Extract the Message-ID from the filename
-        progress.console.print(f"Moving message to 'Archive' folder: {message_id}")
+
        if not dry_run:
-            response = await post_with_aiohttp(
-                f'https://graph.microsoft.com/v1.0/me/messages/{message_id}/move',
+            status = await post_with_aiohttp(
+                f'https://graph.microsoft.com/v1.0/me/messages/{message_id}/microsoft.graph.move',
                headers,
                {'destinationId': archive_folder_id}
            )
-            if response.status_code != 201:  # 201 Created indicates success
-                progress.console.print(f"Failed to move message to 'Archive': {message_id}, {response.status_code}, {response.text}")
-            if response.status_code == 404:
-                os.remove(filepath)  # Remove the file from local archive if not found on server
+            if status != 201:  # 201 Created indicates success
+                progress.console.print(f"Failed to move message to 'Archive': {message_id}, {status}")
+            if status == 404:
+                os.remove(filepath)  # Remove the file from local archive if not found on server
+                progress.console.print(f"Message not found on server, removed local copy: {message_id}")
+            elif status == 204:
+                progress.console.print(f"Moved message to 'Archive': {message_id}")
        else:
            progress.console.print(f"[DRY-RUN] Would move message to 'Archive' folder: {message_id}")
        progress.advance(task_id)
@@ -175,106 +201,134 @@ async def archive_mail_async(maildir_path, headers, progress, task_id):

 async def delete_mail_async(maildir_path, headers, progress, task_id):
    trash_dir = os.path.join(maildir_path, '.Trash', 'cur')
-    trash_files = set(glob.glob(os.path.join(trash_dir, '*.eml')))
+    trash_files = set(glob.glob(os.path.join(trash_dir, '*.eml*')))
    progress.update(task_id, total=len(trash_files))

    for filepath in trash_files:
        message_id = os.path.basename(filepath).split('.')[0]  # Extract the Message-ID from the filename
        if not dry_run:
            progress.console.print(f"Moving message to trash: {message_id}")
-            await delete_with_aiohttp(
+            status = await delete_with_aiohttp(
                f'https://graph.microsoft.com/v1.0/me/messages/{message_id}',
                headers
            )
-            os.remove(filepath)  # Remove the file from local trash
+            if status == 204 or status == 404:
+                os.remove(filepath)  # Remove the file from local trash
        else:
            progress.console.print(f"[DRY-RUN] Would delete message: {message_id}")
        progress.advance(task_id)

 async def fetch_calendar_async(headers, progress, task_id):
-    total_event_url = 'https://graph.microsoft.com/v1.0/me/events?$count=true'
+    yesterday = datetime.now().replace(hour=0, minute=0, second=0) - timedelta(days=1)
+    end_of_today = datetime.now().replace(hour=23, minute=59, second=59)
+    six_days_future = end_of_today + timedelta(days=6)
+    # example         https://graph.microsoft.com/v1.0/me/calendarView?startDateTime=2025-05-06T00:00:00&endDateTime=2025-05-13T23:59:59.999999&$count=true&$select=id
+    event_base_url =f"https://graph.microsoft.com/v1.0/me/calendarView?startDateTime={yesterday.isoformat()}&endDateTime={six_days_future.isoformat()}"
+    total_event_url = f"{event_base_url}&$count=true&$select=id"
+
    total = await fetch_with_aiohttp(total_event_url, headers)

-    total_events = total.get('@odata.count', 0)
+    total_events = total.get('@odata.count', 0) + 1
    progress.update(task_id, total=total_events)
-    calendar_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$orderby=start/dateTime asc'
+    calendar_url = f"{event_base_url}&$top=100&$select=start,end,iCalUid,subject,bodyPreview,webLink,location,recurrence,showAs,responseStatus,onlineMeeting"
    events = []
-
+    if total_events > 100:
+        progress.update(task_id, total=total_events + total_events % 100)
    while calendar_url:
        response_data = await fetch_with_aiohttp(calendar_url, headers)
        events.extend(response_data.get('value', []))
-        progress.advance(task_id, len(response_data.get('value', [])))
+        progress.advance(task_id, 1)

        # Get the next page URL from @odata.nextLink
        calendar_url = response_data.get('@odata.nextLink')

-async def download_calendar_events(headers, progress, task_id):
-    # Fetch the total count of events in the calendar
-    total_event_url = 'https://graph.microsoft.com/v1.0/me/events?$count=true'
-    total = await fetch_with_aiohttp(total_event_url, headers)
+    output_file = f'output_ics/outlook_events_latest.ics'
+    if not dry_run:
+        os.makedirs(os.path.dirname(output_file), exist_ok=True)
+        progress.console.print(f"Saving events to {output_file}...")
+        with open(output_file, 'w') as f:
+            f.write("BEGIN:VCALENDAR\nVERSION:2.0\n")
+            for event in events:
+                progress.advance(task_id)
+                if 'start' in event and 'end' in event:
+                    start = parser.isoparse(event['start']['dateTime']).astimezone(UTC)
+                    end = parser.isoparse(event['end']['dateTime']).astimezone(UTC)
+                    f.write(f"BEGIN:VEVENT\nSUMMARY:{event['subject']}\nDESCRIPTION:{event.get('bodyPreview', '')}\n")
+                    f.write(f"UID:{event.get('iCalUId', '')}\n")
+                    f.write(f"LOCATION:{event.get('location', {})['displayName']}\n")
+                    f.write(f"CLASS:{event.get('showAs', '')}\n")
+                    f.write(f"STATUS:{event.get('responseStatus', {})['response']}\n")
+                    if 'onlineMeeting' in event and event['onlineMeeting']:
+                        f.write(f"URL:{event.get('onlineMeeting', {}).get('joinUrl', '')}\n")
+                    f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
+                    f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")
+                    if 'recurrence' in event and event['recurrence']:  # Check if 'recurrence' exists and is not None
+                        for rule in event['recurrence']:
+                            if rule.startswith('RRULE'):
+                                rule_parts = rule.split(';')
+                                new_rule_parts = []
+                                for part in rule_parts:
+                                    if part.startswith('UNTIL='):
+                                        until_value = part.split('=')[1]
+                                        until_date = parser.isoparse(until_value)
+                                        if start.tzinfo is not None and until_date.tzinfo is None:
+                                            until_date = until_date.replace(tzinfo=UTC)
+                                        new_rule_parts.append(f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}")
+                                    else:
+                                        new_rule_parts.append(part)
+                                rule = ';'.join(new_rule_parts)
+                            f.write(f"{rule}\n")
+                    f.write("END:VEVENT\n")
+            f.write("END:VCALENDAR\n")
+
+        progress.console.print(f"Saved events to {output_file}")
+    else:
+        progress.console.print(f"[DRY-RUN] Would save events to {output_file}")

-    total_events = total.get('@odata.count', 0)
-    progress.update(task_id, total=total_events)
-    print(f"Total events in calendar: {total_events}")

-    # Fetch events with pagination and expand recurring events
-    events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances'
-    events = []
-    progress.console.print("Fetching Calendar events...")
-    while events_url:
-        response_data = await fetch_with_aiohttp(events_url, headers)
-        events.extend(response_data.get('value', []))
-        events_url = response_data.get('@odata.nextLink')
-        progress.advance(task_id, len(response_data.get('value', [])))
-        # Save events to a file in iCalendar format
-        output_file = f'output_ics/outlook_events_latest.ics'
-        if not dry_run:
-            os.makedirs(os.path.dirname(output_file), exist_ok=True)
-            progress.console.print(f"Saving events to {output_file}...")
-            with open(output_file, 'w') as f:
-                f.write("BEGIN:VCALENDAR\nVERSION:2.0\n")
-                for event in events:
-                    if 'start' in event and 'end' in event:
-                        start = parser.isoparse(event['start']['dateTime'])
-                        end = parser.isoparse(event['end']['dateTime'])
-                        f.write(f"BEGIN:VEVENT\nSUMMARY:{event['subject']}\n")
-                        f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
-                        f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")
-                        if 'recurrence' in event and event['recurrence']:  # Check if 'recurrence' exists and is not None
-                            for rule in event['recurrence']:
-                                if rule.startswith('RRULE'):
-                                    rule_parts = rule.split(';')
-                                    new_rule_parts = []
-                                    for part in rule_parts:
-                                        if part.startswith('UNTIL='):
-                                            until_value = part.split('=')[1]
-                                            until_date = parser.isoparse(until_value)
-                                            if start.tzinfo is not None and until_date.tzinfo is None:
-                                                until_date = until_date.replace(tzinfo=UTC)
-                                            new_rule_parts.append(f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}")
-                                        else:
-                                            new_rule_parts.append(part)
-                                    rule = ';'.join(new_rule_parts)
-                                f.write(f"{rule}\n")
-                        f.write("END:VEVENT\n")
-                f.write("END:VCALENDAR\n")
-            progress.console.print(f"Saved events to {output_file}")
-        else:
-            progress.console.print(f"[DRY-RUN] Would save events to {output_file}")

-# Function to check if the cache is still valid
-def is_cache_valid():
-    if 'timestamp' in cache_timestamp and 'max_age' in cache_timestamp:
-        current_time = time.time()
-        cache_expiry_time = cache_timestamp['timestamp'] + cache_timestamp['max_age']
-        return current_time < cache_expiry_time
-    return False
 # Function to create Maildir structure
 def create_maildir_structure(base_path):
    os.makedirs(os.path.join(base_path, 'cur'), exist_ok=True)
    os.makedirs(os.path.join(base_path, 'new'), exist_ok=True)
    os.makedirs(os.path.join(base_path, 'tmp'), exist_ok=True)

+async def save_mime_to_maildir_async(maildir_path, email_data, attachments_dir, headers, progress):
+    # Create a new EmailMessage object
+    msg = EmailMessage()
+    # Determine the directory based on isRead
+    target_dir = 'cur' if email_data.get('isRead', False) else 'new'
+    id = email_data.get('id', '')
+    if not id:
+        progress.console.print(f"Message ID not found. Skipping save.")
+        return
+    email_filename = f"{id}.eml"
+    email_filepath = os.path.join(maildir_path, target_dir, email_filename)
+
+    # Check if the file already exists
+    if os.path.exists(email_filepath):
+        progress.console.print(f"Message {id} already exists in {target_dir}. Skipping save.")
+        return
+
+    # Fetch the full MIME payload from the API
+
+    mime_url = f'https://graph.microsoft.com/v1.0/me/messages/{id}/$value'
+    try:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(mime_url, headers=headers) as response:
+                if response.status != 200:
+                    raise Exception(f"Failed to fetch MIME payload for {id}: {response.status} {await response.text()}")
+                mime_payload = await response.text()
+
+        # Save the MIME payload to the Maildir
+        os.makedirs(os.path.dirname(email_filepath), exist_ok=True)
+        with open(email_filepath, 'w') as f:
+            f.write(mime_payload)
+        progress.console.print(f"Saved message {id} to {target_dir}.")
+
+    except Exception as e:
+        progress.console.print(f"Failed to save message {id}: {e}")
+
 def save_email_to_maildir(maildir_path, email_data, attachments_dir, progress):
    # Create a new EmailMessage object
    msg = EmailMessage()
@@ -402,7 +456,7 @@ async def main():
        f.write(cache.serialize())

    access_token = token_response['access_token']
-    headers = {'Authorization': f'Bearer {access_token}'}
+    headers = {'Authorization': f'Bearer {access_token}', 'Prefer': 'outlook.body-content-type="text"'}
    accounts = app.get_accounts()

    if not accounts: