# Filepath for sync timestamp
sync_timestamp_file = 'sync_timestamp.json'


def _load_sync_state():
    """Return the sync-state dict, or {} when the file is missing or unusable."""
    try:
        with open(sync_timestamp_file, 'r') as f:
            state = json.load(f)
    except FileNotFoundError:
        return {}
    except ValueError:
        # json.JSONDecodeError is a ValueError: an empty or truncated file is
        # treated as "never synced" instead of aborting the whole run.
        return {}
    return state if isinstance(state, dict) else {}


def _write_sync_state(state):
    """Write *state* to the sync-state file as JSON."""
    with open(sync_timestamp_file, 'w') as f:
        json.dump(state, f)


def _message_ids_in(directory):
    """Return the sorted message ids of the ``.eml`` files found in *directory*.

    Both ``<id>.eml`` and ``<id>.eml:2,<flags>`` are recognised; the second
    form is what Maildir clients produce when they move a message into
    ``cur``. A missing directory yields an empty list.
    """
    try:
        names = os.listdir(directory)
    except (FileNotFoundError, NotADirectoryError):
        return []

    ids = set()
    for name in names:
        if name.startswith('.'):
            continue
        stem, marker, info = name.partition('.eml')
        if marker and stem and (not info or info.startswith(':2,')):
            ids.add(stem)
    return sorted(ids)


# Function to load the last sync timestamp
def load_last_sync_timestamp():
    """Return the epoch time stored under ``last_sync``, or 0 if there is none."""
    return _load_sync_state().get('last_sync', 0)


# Function to save the current sync timestamp
def save_sync_timestamp():
    """Store the current time under ``last_sync``, keeping any other stored keys."""
    state = _load_sync_state()
    state['last_sync'] = time.time()
    _write_sync_state(state)


# Function to synchronize maildir with the server
def synchronize_maildir(maildir_path, headers, session=None):
    """Push local read/trash changes in a maildir to Microsoft Graph.

    Every message id found in ``<maildir_path>/cur`` is PATCHed with
    ``isRead: True`` and every id found in ``<maildir_path>/.Trash/cur`` is
    DELETEd. Ids that were pushed successfully are remembered in the
    sync-state file (keys ``marked_read`` and ``deleted``) so later runs do
    not repeat the same request for every file; failed requests are not
    remembered and are retried on the next run.

    Args:
        maildir_path: Root of the maildir (the directory holding ``new``/``cur``).
        headers: HTTP headers sent with every request (authorization etc.).
        session: Optional object exposing ``patch`` and ``delete`` with the
            ``requests`` call signature, e.g. a ``requests.Session``.
            Defaults to the ``requests`` module.
    """
    http = requests if session is None else session
    state = _load_sync_state()
    marked_read = set(state.get('marked_read', []))
    deleted = set(state.get('deleted', []))

    # Messages sitting in "cur" are considered read locally.
    for message_id in _message_ids_in(os.path.join(maildir_path, 'cur')):
        if message_id in marked_read:
            continue
        print(f"Marking message as read: {message_id}")
        response = http.patch(
            f'https://graph.microsoft.com/v1.0/me/messages/{message_id}',
            headers=headers,
            json={'isRead': True},
        )
        if response.status_code == 200:
            marked_read.add(message_id)
        else:
            print(f"Failed to mark message as read: {message_id}, {response.status_code}, {response.text}")

    # Messages moved to ".Trash/cur" are deleted on the server.
    for message_id in _message_ids_in(os.path.join(maildir_path, '.Trash', 'cur')):
        if message_id in deleted:
            continue
        print(f"Moving message to trash: {message_id}")
        response = http.delete(
            f'https://graph.microsoft.com/v1.0/me/messages/{message_id}',
            headers=headers,
        )
        # 204 No Content indicates success; 404 means the server no longer
        # has the message, so there is nothing left to delete either way.
        if response.status_code in (204, 404):
            deleted.add(message_id)
        else:
            print(f"Failed to move message to trash: {message_id}, {response.status_code}, {response.text}")

    # Save what was pushed together with the current sync timestamp
    state['marked_read'] = sorted(marked_read)
    state['deleted'] = sorted(deleted)
    state['last_sync'] = time.time()
    _write_sync_state(state)
def _recipient_addresses(recipients):
    """Join the addresses of Graph recipient entries, skipping entries without one."""
    found = []
    for recipient in recipients or []:
        address = ((recipient or {}).get('emailAddress') or {}).get('address')
        if address:
            found.append(address)
    return ', '.join(found)


def _attach_files(msg, attachments, attachments_dir):
    """Decode each attachment, write it to *attachments_dir* and add it to *msg*."""
    # Imported here because the file's import block does not bring these in.
    import base64
    import binascii

    for attachment in attachments or []:
        encoded = attachment.get('contentBytes')
        if not encoded:
            continue

        # The name comes from the sender: keep only its last path component
        # so it cannot point outside attachments_dir.
        raw_name = (attachment.get('name') or 'unknown').replace('\\', '/')
        name = os.path.basename(raw_name)
        if name in ('', '.', '..'):
            name = 'unknown'

        # contentBytes carries base64 text; decode it to the real file bytes.
        try:
            payload = base64.b64decode(encoded)
        except (binascii.Error, ValueError):
            print(f"Skipping attachment with undecodable content: {name}")
            continue

        with open(os.path.join(attachments_dir, name), 'wb') as f:
            f.write(payload)

        # add_attachment() needs an explicit maintype/subtype for bytes.
        declared = (attachment.get('contentType') or '').split(';')[0].strip().lower()
        maintype, _, subtype = declared.partition('/')
        if not maintype or not subtype or maintype in ('multipart', 'message'):
            maintype, subtype = 'application', 'octet-stream'
        msg.add_attachment(payload, maintype=maintype, subtype=subtype, filename=name)


def save_email_to_maildir(maildir_path, email_data, attachments_dir):
    """Write one Microsoft Graph message into the maildir as ``<id>.eml``.

    The message goes to ``cur`` when ``isRead`` is true, otherwise to ``new``.
    HTML bodies are converted to Markdown, ``...BannerStart`` to
    ``...BannerEnd`` spans are stripped, and attachments carrying
    ``contentBytes`` are written to *attachments_dir* and attached to the
    saved message. Nothing is written when a file for the same id already
    exists anywhere below *maildir_path*.

    Args:
        maildir_path: Root of the maildir; ``new`` and ``cur`` must exist.
        email_data: Message dict as returned by the Graph messages endpoint.
        attachments_dir: Existing directory that receives decoded attachments.
    """
    message_id = email_data.get('id') or ''
    email_filename = f"{message_id}.eml"

    # Check if the file already exists in any subfolder before doing any
    # work. "<name>:2,<flags>" is the same message after a Maildir client
    # moved it into "cur".
    for root, _, files in os.walk(maildir_path):
        if any(name == email_filename or name.startswith(email_filename + ':')
               for name in files):
            print(f"Message {message_id} already exists in {root}. Skipping save.")
            return

    # Create a new EmailMessage object; optional headers are only emitted
    # when there is a value to put in them.
    msg = EmailMessage()
    received_datetime = email_data.get('receivedDateTime')
    if received_datetime:
        msg['Date'] = format_datetime(parser.isoparse(received_datetime))
    msg['Message-ID'] = message_id
    msg['Subject'] = email_data.get('subject') or 'No Subject'
    sender = (email_data.get('from') or {}).get('emailAddress') or {}
    msg['From'] = sender.get('address') or 'unknown@unknown.com'
    to_line = _recipient_addresses(email_data.get('toRecipients'))
    if to_line:
        msg['To'] = to_line
    cc_line = _recipient_addresses(email_data.get('ccRecipients'))
    if cc_line:
        msg['Cc'] = cc_line

    # Convert the email body from HTML to Markdown
    body = email_data.get('body') or {}
    body_markdown = body.get('content') or ''
    if (body.get('contentType') or '').lower() == 'html':
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_images = True
        markdown_converter.ignore_links = False
        body_markdown = markdown_converter.handle(body_markdown)

    # Remove lines between any alphanumeric BannerStart and BannerEnd
    body_markdown = re.sub(r'\w+BannerStart.*?\w+BannerEnd', '', body_markdown, flags=re.DOTALL)
    msg.set_content(body_markdown)

    # Download attachments
    _attach_files(msg, email_data.get('attachments'), attachments_dir)

    # Determine the directory based on isRead
    target_dir = 'cur' if email_data.get('isRead', False) else 'new'
    email_filepath = os.path.join(maildir_path, target_dir, email_filename)

    # Save the email to the Maildir
    with open(email_filepath, 'w') as f:
        f.write(msg.as_string())
    print(f"Saved message {message_id} to {email_filepath}")
+259,17 @@ while mail_url: mail_url = response_data.get('@odata.nextLink') -print("\nFinished fetching mail.") +print("\nFinished fetching mail. Now saving them to maildir.") -maildir_path = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')) + "/corteva/INBOX" +# Save emails to Maildir +maildir_path = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')) + "/corteva" +attachments_dir = os.path.join(maildir_path, 'attachments') +os.makedirs(attachments_dir, exist_ok=True) create_maildir_structure(maildir_path) for message in messages: print(f"Processing message: {message.get('subject', 'No Subject')}", end='\r') - save_email_to_maildir(maildir_path, message) + save_email_to_maildir(maildir_path, message, attachments_dir) print(f"\nFinished processing {len(messages)} messages.")