Bendt
2025-12-18 22:11:47 -05:00
parent 0ed7800575
commit a41d59e529
26 changed files with 4187 additions and 373 deletions

@@ -5,10 +5,11 @@ Mail operations for Microsoft Graph API.
import os
import re
import glob
+import json
import asyncio
from email.parser import Parser
from email.utils import getaddresses
-from typing import List, Dict, Any
+from typing import List, Dict, Any, Set
from .client import (
fetch_with_aiohttp,
@@ -27,6 +28,7 @@ async def fetch_mail_async(
task_id,
dry_run=False,
download_attachments=False,
+is_cancelled=None,
):
"""
Fetch mail from Microsoft Graph API and save to Maildir.
@@ -39,6 +41,7 @@ async def fetch_mail_async(
task_id: ID of the task in the progress bar.
dry_run (bool): If True, don't actually make changes.
download_attachments (bool): If True, download email attachments.
+is_cancelled (callable, optional): Callback that returns True if task should stop.
Returns:
None
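# A minimal sketch of how a caller might wire up is_cancelled; the
# threading.Event and the call site below are illustrative assumptions,
# not part of this commit:
#
#     import threading
#
#     stop = threading.Event()          # set from a signal handler or UI toggle
#     await fetch_mail_async(
#         maildir_path, attachments_dir, headers, progress, task_id,
#         is_cancelled=stop.is_set,     # any zero-arg callable returning bool works
#     )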
@@ -105,8 +108,14 @@ async def fetch_mail_async(
# Update progress to reflect only the messages we actually need to download
progress.update(task_id, total=len(messages_to_download), completed=0)
+downloaded_count = 0
for message in messages_to_download:
+# Check if task was cancelled/disabled
+if is_cancelled and is_cancelled():
+progress.console.print("Task cancelled, stopping inbox fetch")
+break
progress.console.print(
f"Processing message: {message.get('subject', 'No Subject')}", end="\r"
)
@@ -120,44 +129,92 @@ async def fetch_mail_async(
download_attachments,
)
progress.update(task_id, advance=1)
-progress.update(task_id, completed=len(messages_to_download))
-progress.console.print(
-f"\nFinished downloading {len(messages_to_download)} new messages."
-)
+downloaded_count += 1
+progress.update(task_id, completed=downloaded_count)
+progress.console.print(f"\nFinished downloading {downloaded_count} new messages.")
progress.console.print(
f"Total messages on server: {len(messages)}, Already local: {len(local_msg_ids)}"
)
-async def archive_mail_async(maildir_path, headers, progress, task_id, dry_run=False):
+def _get_archive_sync_state_path(maildir_path: str) -> str:
+"""Get the path to the archive sync state file."""
+return os.path.join(maildir_path, ".Archive", ".sync_state.json")
+def _load_archive_sync_state(maildir_path: str) -> Set[str]:
+"""Load the set of message IDs that have been synced to server."""
+state_path = _get_archive_sync_state_path(maildir_path)
+if os.path.exists(state_path):
+try:
+with open(state_path, "r") as f:
+data = json.load(f)
+return set(data.get("synced_to_server", []))
+except Exception:
+pass
+return set()
+def _save_archive_sync_state(maildir_path: str, synced_ids: Set[str]) -> None:
+"""Save the set of message IDs that have been synced to server."""
+state_path = _get_archive_sync_state_path(maildir_path)
+os.makedirs(os.path.dirname(state_path), exist_ok=True)
+with open(state_path, "w") as f:
+json.dump({"synced_to_server": list(synced_ids)}, f, indent=2)
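# For reference, the state file these helpers read and write is plain JSON;
# a hypothetical .Archive/.sync_state.json might look like:
#
#     {
#       "synced_to_server": ["AAMkAGI2...", "AAMkAGI3..."]
#     }
#
# The set/list round-trip matters: json.dump cannot serialize a set, so the
# ids are stored as a list and rebuilt into a set on load for O(1) lookups.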
+async def archive_mail_async(
+maildir_path, headers, progress, task_id, dry_run=False, is_cancelled=None
+):
"""
Archive mail from Maildir to Microsoft Graph API archive folder using batch operations.
+Messages are moved to the server's Archive folder, but local copies are kept.
+A sync state file tracks which messages have already been synced to avoid
+re-processing them on subsequent runs.
Args:
maildir_path (str): Path to the Maildir.
headers (dict): Headers including authentication.
progress: Progress instance for updating progress bars.
task_id: ID of the task in the progress bar.
dry_run (bool): If True, don't actually make changes.
+is_cancelled (callable, optional): Callback that returns True if task should stop.
Returns:
None
"""
-# Check both possible archive folder names locally
+# Load already-synced message IDs
+synced_ids = _load_archive_sync_state(maildir_path)
+# Check both possible archive folder names locally (prefer .Archive)
archive_files = []
-for archive_folder_name in [".Archives", ".Archive"]:
+for archive_folder_name in [".Archive", ".Archives"]:
archive_dir = os.path.join(maildir_path, archive_folder_name)
if os.path.exists(archive_dir):
archive_files.extend(
glob.glob(os.path.join(archive_dir, "**", "*.eml*"), recursive=True)
)
-if not archive_files:
+# Filter out already-synced messages
+files_to_sync = []
+for filepath in archive_files:
+message_id = os.path.basename(filepath).split(".")[0]
+if message_id not in synced_ids:
+files_to_sync.append(filepath)
+if not files_to_sync:
progress.update(task_id, total=0, completed=0)
progress.console.print("No messages to archive")
progress.console.print(
f"No new messages to archive ({len(archive_files)} already synced)"
)
return
-progress.update(task_id, total=len(archive_files))
+progress.update(task_id, total=len(files_to_sync))
+progress.console.print(
+f"Found {len(files_to_sync)} new messages to sync to server Archive"
+)
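# The filter above assumes the Maildir filename starts with the Graph message
# id, so stripping everything after the first dot recovers it. A hypothetical
# example:
#
#     >>> os.path.basename("/mail/.Archive/cur/AAMkAGI2.eml:2,S").split(".")[0]
#     'AAMkAGI2'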
# Get archive folder ID from server
folder_response = await fetch_with_aiohttp(
@@ -179,9 +236,15 @@ async def archive_mail_async(maildir_path, headers, progress, task_id, dry_run=F
# Process files in batches of 20 (Microsoft Graph batch limit)
batch_size = 20
successful_moves = []
+newly_synced_ids: Set[str] = set()
-for i in range(0, len(archive_files), batch_size):
-batch_files = archive_files[i : i + batch_size]
+for i in range(0, len(files_to_sync), batch_size):
+# Check if task was cancelled/disabled
+if is_cancelled and is_cancelled():
+progress.console.print("Task cancelled, stopping archive sync")
+break
+batch_files = files_to_sync[i : i + batch_size]
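# Sketch of the $batch payload shape this loop presumably builds from
# batch_files (format per Graph's JSON batching docs; the actual builder
# sits in context lines elided from this diff):
#
#     {
#         "requests": [
#             {
#                 "id": "1",
#                 "method": "POST",
#                 "url": f"/me/messages/{message_id}/move",
#                 "headers": {"Content-Type": "application/json"},
#                 "body": {"destinationId": archive_folder_id},
#             },
#             ...  # up to 20 requests per batch
#         ]
#     }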
# Add small delay between batches to respect API limits
if i > 0:
@@ -216,23 +279,22 @@ async def archive_mail_async(maildir_path, headers, progress, task_id, dry_run=F
status = response["status"]
if status == 201: # 201 Created indicates successful move
-os.remove(
-filepath
-) # Remove the local file since it's now archived on server
+# Keep local file, just mark as synced
+newly_synced_ids.add(message_id)
successful_moves.append(message_id)
progress.console.print(
-f"Moved message to 'Archive': {message_id}"
+f"Moved message to server Archive: {message_id}"
)
elif status == 404:
-os.remove(
-filepath
-) # Remove the file from local archive if not found on server
+# Message not in Inbox (maybe already archived or deleted on server)
+# Mark as synced so we don't retry, but keep local copy
+newly_synced_ids.add(message_id)
progress.console.print(
-f"Message not found on server, removed local copy: {message_id}"
+f"Message not in Inbox (already archived?): {message_id}"
)
else:
progress.console.print(
-f"Failed to move message to 'Archive': {message_id}, status: {status}"
+f"Failed to move message to Archive: {message_id}, status: {status}"
)
)
except Exception as e:
@@ -247,19 +309,19 @@ async def archive_mail_async(maildir_path, headers, progress, task_id, dry_run=F
{"destinationId": archive_folder_id},
)
if status == 201:
-os.remove(filepath)
+newly_synced_ids.add(message_id)
successful_moves.append(message_id)
progress.console.print(
-f"Moved message to 'Archive' (fallback): {message_id}"
+f"Moved message to server Archive (fallback): {message_id}"
)
elif status == 404:
-os.remove(filepath)
+newly_synced_ids.add(message_id)
progress.console.print(
-f"Message not found on server, removed local copy: {message_id}"
+f"Message not in Inbox (already archived?): {message_id}"
)
else:
progress.console.print(
-f"Failed to move message to 'Archive': {message_id}, status: {status}"
+f"Failed to move message to Archive: {message_id}, status: {status}"
)
)
except Exception as individual_error:
progress.console.print(
@@ -270,18 +332,184 @@ async def archive_mail_async(maildir_path, headers, progress, task_id, dry_run=F
for filepath in batch_files:
message_id = os.path.basename(filepath).split(".")[0]
progress.console.print(
f"[DRY-RUN] Would move message to 'Archive' folder: {message_id}"
f"[DRY-RUN] Would move message to server Archive: {message_id}"
)
progress.advance(task_id, len(batch_files))
-if not dry_run:
+# Save sync state after each batch for resilience
+if not dry_run and newly_synced_ids:
+synced_ids.update(newly_synced_ids)
+_save_archive_sync_state(maildir_path, synced_ids)
+# Final summary
+if not dry_run and successful_moves:
progress.console.print(
-f"Successfully archived {len(successful_moves)} messages in batches"
+f"Successfully synced {len(successful_moves)} messages to server Archive (kept local copies)"
)
return
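# The individual fallback above maps to Graph's single-message move endpoint;
# a raw sketch (request shape per the Graph docs, helper names elided here):
#
#     POST https://graph.microsoft.com/v1.0/me/messages/{message_id}/move
#     {"destinationId": archive_folder_id}
#
# 201 Created means the move succeeded; 404 means the message is no longer
# in the source folder, which this code treats as "already archived".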
+async def fetch_archive_mail_async(
+maildir_path,
+attachments_dir,
+headers,
+progress,
+task_id,
+dry_run=False,
+download_attachments=False,
+max_messages=None,
+is_cancelled=None,
+):
+"""
+Fetch archived mail from Microsoft Graph API Archive folder and save to local .Archive Maildir.
+Args:
+maildir_path (str): Path to the Maildir.
+attachments_dir (str): Path to save attachments.
+headers (dict): Headers including authentication.
+progress: Progress instance for updating progress bars.
+task_id: ID of the task in the progress bar.
+dry_run (bool): If True, don't actually make changes.
+download_attachments (bool): If True, download email attachments.
+max_messages (int, optional): Maximum number of messages to fetch. None = all.
+is_cancelled (callable, optional): Callback that returns True if task should stop.
+Returns:
+None
+"""
+from src.utils.mail_utils.maildir import save_mime_to_maildir_async
+# Use the well-known 'archive' folder name
+mail_url = "https://graph.microsoft.com/v1.0/me/mailFolders/archive/messages?$top=100&$orderby=receivedDateTime desc&$select=id,subject,from,toRecipients,ccRecipients,receivedDateTime,isRead"
+messages = []
+# Fetch the total count of messages in the archive
+archive_info_url = "https://graph.microsoft.com/v1.0/me/mailFolders/archive"
+try:
+response = await fetch_with_aiohttp(archive_info_url, headers)
+total_messages = response.get("totalItemCount", 0) if response else 0
+except Exception as e:
+progress.console.print(f"Error fetching archive folder info: {e}")
+total_messages = 0
+# Apply max_messages limit if specified
+effective_total = (
+min(total_messages, max_messages) if max_messages else total_messages
+)
+progress.update(task_id, total=effective_total)
+progress.console.print(
+f"Archive folder has {total_messages} messages"
++ (f", fetching up to {max_messages}" if max_messages else "")
+)
+# Fetch messages from archive
+fetched_count = 0
+while mail_url:
+try:
+response_data = await fetch_with_aiohttp(mail_url, headers)
+except Exception as e:
+progress.console.print(f"Error fetching archive messages: {e}")
+break
+batch = response_data.get("value", []) if response_data else []
+# Apply max_messages limit
+if max_messages and fetched_count + len(batch) > max_messages:
+batch = batch[: max_messages - fetched_count]
+messages.extend(batch)
+fetched_count += len(batch)
+break
+messages.extend(batch)
+fetched_count += len(batch)
+progress.advance(task_id, len(batch))
+# Get the next page URL from @odata.nextLink
+mail_url = response_data.get("@odata.nextLink") if response_data else None
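# Worked example of the max_messages cap: with max_messages=150 and $top=100,
# page 1 adds 100 messages; on page 2, fetched_count=100 and len(batch)=100,
# so the slice keeps batch[:50] and the loop breaks at exactly 150 fetched.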
+# Set up local archive directory paths
+archive_dir = os.path.join(maildir_path, ".Archive")
+cur_dir = os.path.join(archive_dir, "cur")
+new_dir = os.path.join(archive_dir, "new")
+# Ensure directories exist
+os.makedirs(cur_dir, exist_ok=True)
+os.makedirs(new_dir, exist_ok=True)
+os.makedirs(os.path.join(archive_dir, "tmp"), exist_ok=True)
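# Standard Maildir layout assumed throughout: tmp/ holds in-progress writes,
# new/ unseen mail, cur/ seen mail. A hypothetical .Archive tree after a sync:
#
#     .Archive/
#     ├── .sync_state.json
#     ├── cur/AAMkAGI2.eml:2,S
#     ├── new/AAMkAGI3.eml
#     └── tmp/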
+# Get local message IDs in archive
+cur_files = set(glob.glob(os.path.join(cur_dir, "*.eml*")))
+new_files = set(glob.glob(os.path.join(new_dir, "*.eml*")))
+local_msg_ids = set()
+for filename in set.union(cur_files, new_files):
+message_id = os.path.basename(filename).split(".")[0]
+local_msg_ids.add(message_id)
+# Filter messages to only include those not already local
+messages_to_download = [msg for msg in messages if msg["id"] not in local_msg_ids]
+progress.console.print(
+f"Found {len(messages)} messages on server Archive, {len(local_msg_ids)} already local"
+)
+progress.console.print(
+f"Downloading {len(messages_to_download)} new archived messages"
+)
+# Update progress to reflect only the messages we actually need to download
+progress.update(task_id, total=len(messages_to_download), completed=0)
+# Load sync state once, we'll update it incrementally
+synced_ids = _load_archive_sync_state(maildir_path) if not dry_run else set()
+downloaded_count = 0
+for message in messages_to_download:
+# Check if task was cancelled/disabled
+if is_cancelled and is_cancelled():
+progress.console.print("Task cancelled, stopping archive fetch")
+break
+progress.console.print(
+f"Processing archived message: {message.get('subject', 'No Subject')[:50]}",
+end="\r",
+)
+# Save to .Archive folder instead of main maildir
+await save_mime_to_maildir_async(
+archive_dir, # Use archive_dir instead of maildir_path
+message,
+attachments_dir,
+headers,
+progress,
+dry_run,
+download_attachments,
+)
+progress.update(task_id, advance=1)
+downloaded_count += 1
+# Update sync state after each message for resilience
+# This ensures we don't try to re-upload this message in archive_mail_async
+if not dry_run:
+synced_ids.add(message["id"])
+_save_archive_sync_state(maildir_path, synced_ids)
+progress.update(task_id, completed=downloaded_count)
+progress.console.print(
+f"\nFinished downloading {downloaded_count} archived messages."
+)
+progress.console.print(
+f"Total in server Archive: {total_messages}, Already local: {len(local_msg_ids)}"
+)
+# Also add any messages we already had locally (from the full server list)
+# to ensure they're marked as synced
+if not dry_run and messages:
+for msg in messages:
+synced_ids.add(msg["id"])
+_save_archive_sync_state(maildir_path, synced_ids)
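# Net effect of the sync-state updates above: every id known to be in the
# server-side Archive (just downloaded or already local) is recorded, so a
# later archive_mail_async() run only uploads messages archived locally since
# the last fetch. Typical ordering a caller might use (illustrative):
#
#     await fetch_archive_mail_async(...)   # pull server Archive -> .Archive/
#     await archive_mail_async(...)         # push new local archives -> server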
async def delete_mail_async(maildir_path, headers, progress, task_id, dry_run=False):
"""
Delete mail from Maildir and Microsoft Graph API using batch operations.