import os import re import msal import requests import json import glob from datetime import datetime from dateutil import parser from dateutil.tz import UTC from email.message import EmailMessage from email.utils import format_datetime import time import html2text # Filepath for caching timestamp cache_timestamp_file = 'cache_timestamp.json' # Filepath for sync timestamp sync_timestamp_file = 'sync_timestamp.json' # Function to load the last sync timestamp def load_last_sync_timestamp(): if os.path.exists(sync_timestamp_file): with open(sync_timestamp_file, 'r') as f: return json.load(f).get('last_sync', 0) return 0 # Function to save the current sync timestamp def save_sync_timestamp(): with open(sync_timestamp_file, 'w') as f: json.dump({'last_sync': time.time()}, f) def synchronize_maildir(maildir_path, headers): last_sync = load_last_sync_timestamp() current_time = time.time() # Find messages moved from "new" to "cur" and mark them as read new_dir = os.path.join(maildir_path, 'new') cur_dir = os.path.join(maildir_path, 'cur') new_files = set(glob.glob(os.path.join(new_dir, '*.eml'))) cur_files = set(glob.glob(os.path.join(cur_dir, '*.eml'))) moved_to_cur = [os.path.basename(f) for f in cur_files - new_files] for filename in moved_to_cur: message_id = filename.split('.')[0] # Extract the Message-ID from the filename print(f"Marking message as read: {message_id}") response = requests.patch( f'https://graph.microsoft.com/v1.0/me/messages/{message_id}', headers=headers, json={'isRead': True} ) if response.status_code != 200: print(f"Failed to mark message as read: {message_id}, {response.status_code}, {response.text}") # Find messages moved to ".Trash/cur" and delete them on the server trash_dir = os.path.join(maildir_path, '.Trash', 'cur') trash_files = set(glob.glob(os.path.join(trash_dir, '*.eml'))) for filepath in trash_files: message_id = os.path.basename(filepath).split('.')[0] # Extract the Message-ID from the filename print(f"Moving message to trash: {message_id}") response = requests.delete( f'https://graph.microsoft.com/v1.0/me/messages/{message_id}', headers=headers ) if response.status_code != 204: # 204 No Content indicates success print(f"Failed to move message to trash: {message_id}, {response.status_code}, {response.text}") # Find messages moved to ".Archives/**/*" and move them to the "Archive" folder on the server archive_dir = os.path.join(maildir_path, '.Archives') archive_files = glob.glob(os.path.join(archive_dir, '**', '*.eml'), recursive=True) # Fetch the list of folders to find the "Archive" folder ID print("Fetching server folders to locate 'Archive' folder...") folder_response = requests.get('https://graph.microsoft.com/v1.0/me/mailFolders', headers=headers) if folder_response.status_code != 200: raise Exception(f"Failed to fetch mail folders: {folder_response.status_code}, {folder_response.text}") folders = folder_response.json().get('value', []) archive_folder_id = None for folder in folders: if folder.get('displayName', '').lower() == 'archive': archive_folder_id = folder.get('id') break if not archive_folder_id: raise Exception("No folder named 'Archive' found on the server.") for filepath in archive_files: message_id = os.path.basename(filepath).split('.')[0] # Extract the Message-ID from the filename print(f"Moving message to 'Archive' folder: {message_id}") response = requests.post( f'https://graph.microsoft.com/v1.0/me/messages/{message_id}/move', headers=headers, json={'destinationId': archive_folder_id} ) if response.status_code != 201: # 201 Created indicates success print(f"Failed to move message to 'Archive': {message_id}, {response.status_code}, {response.text}") # Save the current sync timestamp save_sync_timestamp() # Load cached timestamp if it exists if os.path.exists(cache_timestamp_file): with open(cache_timestamp_file, 'r') as f: cache_timestamp = json.load(f) else: cache_timestamp = {} # Function to check if the cache is still valid def is_cache_valid(): if 'timestamp' in cache_timestamp and 'max_age' in cache_timestamp: current_time = time.time() cache_expiry_time = cache_timestamp['timestamp'] + cache_timestamp['max_age'] return current_time < cache_expiry_time return False # Function to create Maildir structure def create_maildir_structure(base_path): os.makedirs(os.path.join(base_path, 'cur'), exist_ok=True) os.makedirs(os.path.join(base_path, 'new'), exist_ok=True) os.makedirs(os.path.join(base_path, 'tmp'), exist_ok=True) def save_email_to_maildir(maildir_path, email_data, attachments_dir): # Create a new EmailMessage object msg = EmailMessage() received_datetime = email_data.get('receivedDateTime', '') if received_datetime: parsed_datetime = parser.isoparse(received_datetime) msg['Date'] = format_datetime(parsed_datetime) else: msg['Date'] = '' msg['Message-ID'] = email_data.get('id', '') msg['Subject'] = email_data.get('subject', 'No Subject') msg['From'] = email_data.get('from', {}).get('emailAddress', {}).get('address', 'unknown@unknown.com') msg['To'] = ', '.join([recipient['emailAddress']['address'] for recipient in email_data.get('toRecipients', [])]) msg['Cc'] = ', '.join([recipient['emailAddress']['address'] for recipient in email_data.get('ccRecipients', [])]) # Convert the email body from HTML to Markdown body_html = email_data.get('body', {}).get('content', '') if email_data.get('body', {}).get('contentType', '').lower() == 'html': markdown_converter = html2text.HTML2Text() markdown_converter.ignore_images = True markdown_converter.ignore_links = True body_markdown = markdown_converter.handle(body_html) else: body_markdown = body_html # Remove lines between any alphanumeric BannerStart and BannerEnd body_markdown = re.sub(r'\w+BannerStart.*?\w+BannerEnd', '', body_markdown, flags=re.DOTALL) msg.set_content(body_markdown) # Download attachments for attachment in email_data.get('attachments', []): attachment_id = attachment.get('id') attachment_name = attachment.get('name', 'unknown') attachment_content = attachment.get('contentBytes') if attachment_content: attachment_path = os.path.join(attachments_dir, attachment_name) with open(attachment_path, 'wb') as f: f.write(attachment_content.encode('utf-8')) msg.add_attachment(attachment_content.encode('utf-8'), filename=attachment_name) # Determine the directory based on isRead target_dir = 'cur' if email_data.get('isRead', False) else 'new' email_filename = f"{msg['Message-ID']}.eml" email_filepath = os.path.join(maildir_path, target_dir, email_filename) # Check if the file already exists in any subfolder for root, _, files in os.walk(maildir_path): if email_filename in files: print(f"Message {msg['Message-ID']} already exists in {root}. Skipping save.") return # Save the email to the Maildir with open(email_filepath, 'w') as f: f.write(msg.as_string()) print(f"Saved message {msg['Message-ID']} to {email_filepath}") # Read Azure app credentials from environment variables client_id = os.getenv('AZURE_CLIENT_ID') tenant_id = os.getenv('AZURE_TENANT_ID') if not client_id or not tenant_id: raise ValueError("Please set the AZURE_CLIENT_ID and AZURE_TENANT_ID environment variables.") # Token cache cache = msal.SerializableTokenCache() cache_file = 'token_cache.bin' if os.path.exists(cache_file): cache.deserialize(open(cache_file, 'r').read()) # Filepath for caching ETag etag_cache_file = 'etag_cache.json' # Load cached ETag if it exists if os.path.exists(etag_cache_file): with open(etag_cache_file, 'r') as f: etag_cache = json.load(f) else: etag_cache = {} # Authentication authority = f'https://login.microsoftonline.com/{tenant_id}' scopes = ['https://graph.microsoft.com/Calendars.Read', 'https://graph.microsoft.com/Mail.ReadWrite'] app = msal.PublicClientApplication(client_id, authority=authority, token_cache=cache) accounts = app.get_accounts() if accounts: token_response = app.acquire_token_silent(scopes, account=accounts[0]) else: flow = app.initiate_device_flow(scopes=scopes) if 'user_code' not in flow: raise Exception("Failed to create device flow") print(flow['message']) token_response = app.acquire_token_by_device_flow(flow) if 'access_token' not in token_response: raise Exception("Failed to acquire token") # Save token cache with open(cache_file, 'w') as f: f.write(cache.serialize()) access_token = token_response['access_token'] headers = {'Authorization': f'Bearer {access_token}'} accounts = app.get_accounts() if not accounts: raise Exception("No accounts found") # Call the synchronization function before fetching mail print("Synchronizing maildir with server...") synchronize_maildir(maildir_path=os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')) + "/corteva", headers=headers) print("Synchronization complete.") mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc&$select=id,subject,from,toRecipients,ccRecipients,receivedDateTime,isRead,body,attachments' messages = [] print("Fetching mail...") # Fetch the total count of messages in the inbox inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox' response = requests.get(inbox_url, headers=headers) if response.status_code != 200: raise Exception(f"Failed to fetch inbox details: {response.status_code} {response.text}") total_messages = response.json().get('totalItemCount', 0) print(f"Total messages in inbox: {total_messages}") while mail_url: if is_cache_valid(): print("Using cached messages...") break # No need to fetch further, cache is still valid response = requests.get(mail_url, headers=headers) if response.status_code != 200: raise Exception(f"Failed to fetch mail: {response.status_code} {response.text}") # Parse the Cache-Control header to get the max-age value cache_control = response.headers.get('Cache-Control', '') max_age = 0 if 'max-age=' in cache_control: max_age = int(cache_control.split('max-age=')[1].split(',')[0]) # Update the cache timestamp and max-age cache_timestamp['timestamp'] = time.time() cache_timestamp['max_age'] = max_age with open(cache_timestamp_file, 'w') as f: json.dump(cache_timestamp, f) # Process the response response_data = response.json() messages.extend(response_data.get('value', [])) # Add the current page of messages to the list # Calculate and display progress percentage progress = (len(messages) / total_messages) * 100 if total_messages > 0 else 0 print(f"Fetched {len(messages)} of {total_messages} messages ({progress:.2f}%)", end='\r') # Get the next page URL from @odata.nextLink mail_url = response_data.get('@odata.nextLink') print("\nFinished fetching mail. Now saving them to maildir.") # Save emails to Maildir maildir_path = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')) + "/corteva" attachments_dir = os.path.join(maildir_path, 'attachments') os.makedirs(attachments_dir, exist_ok=True) create_maildir_structure(maildir_path) for message in messages: print(f"Processing message: {message.get('subject', 'No Subject')}", end='\r') save_email_to_maildir(maildir_path, message, attachments_dir) print(f"\nFinished processing {len(messages)} messages.") # Fetch events with pagination and expand recurring events events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances' events = [] print("Fetching Calendar events...") while events_url: response = requests.get(events_url, headers=headers) response_data = response.json() events.extend(response_data.get('value', [])) print(f"Fetched {len(events)} events so far...", end='\r') events_url = response_data.get('@odata.nextLink') # Save events to a file in iCalendar format output_file = f'output_ics/outlook_events_latest.ics' print(f"Saving events to {output_file}...") with open(output_file, 'w') as f: f.write("BEGIN:VCALENDAR\nVERSION:2.0\n") for event in events: if 'start' in event and 'end' in event: start = parser.isoparse(event['start']['dateTime']) end = parser.isoparse(event['end']['dateTime']) f.write(f"BEGIN:VEVENT\nSUMMARY:{event['subject']}\n") f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n") f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n") if 'recurrence' in event and event['recurrence']: # Check if 'recurrence' exists and is not None for rule in event['recurrence']: if rule.startswith('RRULE'): rule_parts = rule.split(';') new_rule_parts = [] for part in rule_parts: if part.startswith('UNTIL='): until_value = part.split('=')[1] until_date = parser.isoparse(until_value) if start.tzinfo is not None and until_date.tzinfo is None: until_date = until_date.replace(tzinfo=UTC) new_rule_parts.append(f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}") else: new_rule_parts.append(part) rule = ';'.join(new_rule_parts) f.write(f"{rule}\n") f.write("END:VEVENT\n") f.write("END:VCALENDAR\n")