"""Mirror an Outlook inbox into a local Maildir (bodies converted to Markdown)
and export calendar events to an iCalendar file, via Microsoft Graph.

Authentication uses MSAL's device-code flow; the token cache is persisted on
disk so subsequent runs are silent.

Required environment variables:
    AZURE_CLIENT_ID, AZURE_TENANT_ID -- Azure AD app registration.
Optional:
    MAILDIR_PATH -- base mail directory (default: ~/Mail).
"""

import json
import os
import re
import time
from datetime import datetime  # noqa: F401  (kept: imported by the original)
from email.message import EmailMessage
from email.utils import format_datetime

import html2text
import msal
import requests
from dateutil import parser
from dateutil.tz import UTC

# Cross-run cache metadata for the mail fetch: the timestamp of the last fetch
# plus the max-age the server advertised in its Cache-Control header.
cache_timestamp_file = 'cache_timestamp.json'

if os.path.exists(cache_timestamp_file):
    with open(cache_timestamp_file, 'r') as f:
        cache_timestamp = json.load(f)
else:
    cache_timestamp = {}

# ETag cache for conditional requests.
# NOTE(review): loaded here but never consulted anywhere in this file --
# either wire it up or drop it.
etag_cache_file = 'etag_cache.json'

if os.path.exists(etag_cache_file):
    with open(etag_cache_file, 'r') as f:
        etag_cache = json.load(f)
else:
    etag_cache = {}


def is_cache_valid():
    """Return True while the recorded Cache-Control max-age window has not
    yet expired; False when no cache metadata exists."""
    if 'timestamp' in cache_timestamp and 'max_age' in cache_timestamp:
        cache_expiry_time = cache_timestamp['timestamp'] + cache_timestamp['max_age']
        return time.time() < cache_expiry_time
    return False


def create_maildir_structure(base_path):
    """Create the standard Maildir subdirectories (cur/new/tmp) under base_path."""
    for subdir in ('cur', 'new', 'tmp'):
        os.makedirs(os.path.join(base_path, subdir), exist_ok=True)


def save_email_to_maildir(maildir_path, email_data):
    """Convert one Graph message dict into an RFC 822 message (HTML body
    converted to Markdown) and write it into the Maildir's 'new' folder.

    :param maildir_path: Maildir root containing cur/new/tmp.
    :param email_data: one message object from the Graph /messages endpoint.
    """
    msg = EmailMessage()

    received_datetime = email_data.get('receivedDateTime', '')
    if received_datetime:
        # Graph returns ISO 8601; the Date header needs RFC 5322 formatting.
        msg['Date'] = format_datetime(parser.isoparse(received_datetime))
    else:
        msg['Date'] = ''  # leave empty if no receivedDateTime is available

    # NOTE(review): Graph ids are not RFC-shaped Message-IDs (<...@...>), but
    # they are unique, which is what this script relies on.
    msg['Message-ID'] = email_data.get('id', '')
    msg['Subject'] = email_data.get('subject', 'No Subject')
    msg['From'] = email_data.get('from', {}).get('emailAddress', {}).get(
        'address', 'unknown@unknown.com')
    msg['To'] = ', '.join(
        recipient['emailAddress']['address']
        for recipient in email_data.get('toRecipients', []))

    # Convert the body from HTML to Markdown; plain text passes through as-is.
    body_html = email_data.get('body', {}).get('content', '')
    if email_data.get('body', {}).get('contentType', '').lower() == 'html':
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_images = True
        markdown_converter.ignore_links = False  # keep links in the Markdown output
        body_markdown = markdown_converter.handle(body_html)
    else:
        body_markdown = body_html

    # Strip "...BannerStart ... BannerEnd" blocks (external-sender banners).
    # BUG FIX: this previously ran AFTER set_content(), so the stripped text
    # was computed and then discarded.
    body_markdown = re.sub(r'\w+BannerStart.*?\w+BannerEnd', '', body_markdown,
                           flags=re.DOTALL)
    msg.set_content(body_markdown)

    # BUG FIX: Graph ids may contain '/', '+' and '=' -- a raw id used as a
    # filename could escape the target directory or fail outright.
    raw_id = email_data.get('id', '') or 'unknown'
    safe_id = re.sub(r'[^A-Za-z0-9._-]', '_', raw_id)
    email_filepath = os.path.join(maildir_path, 'new', f'{safe_id}.eml')
    # BUG FIX: explicit encoding so non-ASCII bodies don't raise on write.
    with open(email_filepath, 'w', encoding='utf-8') as f:
        f.write(msg.as_string())


def _acquire_auth_headers():
    """Authenticate against Azure AD (silently when a cached account exists,
    otherwise via device-code flow) and return Graph Authorization headers.

    :raises ValueError: when the required environment variables are missing.
    :raises Exception: when no token or no account can be obtained.
    """
    client_id = os.getenv('AZURE_CLIENT_ID')
    tenant_id = os.getenv('AZURE_TENANT_ID')
    if not client_id or not tenant_id:
        raise ValueError("Please set the AZURE_CLIENT_ID and AZURE_TENANT_ID environment variables.")

    # Persist the MSAL token cache so re-runs don't prompt again.
    cache = msal.SerializableTokenCache()
    cache_file = 'token_cache.bin'
    if os.path.exists(cache_file):
        # BUG FIX: the original leaked the file handle (bare open().read()).
        with open(cache_file, 'r') as f:
            cache.deserialize(f.read())

    authority = f'https://login.microsoftonline.com/{tenant_id}'
    scopes = ['https://graph.microsoft.com/Calendars.Read',
              'https://graph.microsoft.com/Mail.Read']
    app = msal.PublicClientApplication(client_id, authority=authority, token_cache=cache)

    accounts = app.get_accounts()
    if accounts:
        token_response = app.acquire_token_silent(scopes, account=accounts[0])
    else:
        flow = app.initiate_device_flow(scopes=scopes)
        if 'user_code' not in flow:
            raise Exception("Failed to create device flow")
        print(flow['message'])  # instructs the user to visit the device-login URL
        token_response = app.acquire_token_by_device_flow(flow)

    # BUG FIX: acquire_token_silent() returns None on a cache miss; the bare
    # membership test in the original raised TypeError instead of this error.
    if not token_response or 'access_token' not in token_response:
        raise Exception("Failed to acquire token")

    # Save token cache
    with open(cache_file, 'w') as f:
        f.write(cache.serialize())

    if not app.get_accounts():
        raise Exception("No accounts found")

    access_token = token_response['access_token']
    return {'Authorization': f'Bearer {access_token}'}


def _fetch_messages(headers):
    """Page through the inbox (oldest first) and return the message dicts."""
    mail_url = ('https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages'
                '?$top=100&$orderby=receivedDateTime asc')
    messages = []
    print("Fetching mail...")

    # Fetch the total count of messages in the inbox (for progress reporting).
    inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox'
    response = requests.get(inbox_url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch inbox details: {response.status_code} {response.text}")
    total_messages = response.json().get('totalItemCount', 0)
    print(f"Total messages in inbox: {total_messages}")

    while mail_url:
        if is_cache_valid():
            # NOTE(review): nothing is actually re-loaded from disk here -- a
            # valid cache only stops fetching, so `messages` may stay empty.
            print("Using cached messages...")
            break

        response = requests.get(mail_url, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch mail: {response.status_code} {response.text}")

        # Record the server's max-age so subsequent runs can skip fetching.
        cache_control = response.headers.get('Cache-Control', '')
        max_age = 0
        if 'max-age=' in cache_control:
            max_age = int(cache_control.split('max-age=')[1].split(',')[0])
        cache_timestamp['timestamp'] = time.time()
        cache_timestamp['max_age'] = max_age
        with open(cache_timestamp_file, 'w') as f:
            json.dump(cache_timestamp, f)

        response_data = response.json()
        messages.extend(response_data.get('value', []))

        # Progress on one line (carriage return keeps it in place).
        progress = (len(messages) / total_messages) * 100 if total_messages > 0 else 0
        print(f"Fetched {len(messages)} of {total_messages} messages ({progress:.2f}%)", end='\r')

        # Follow server-driven paging.
        mail_url = response_data.get('@odata.nextLink')

    print("\nFinished fetching mail.")
    return messages


def _fetch_events(headers):
    """Page through calendar events (recurring instances expanded) and return them."""
    # NOTE(review): confirm $expand=instances is honored on /me/events.
    events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances'
    events = []
    print("Fetching Calendar events...")
    while events_url:
        response = requests.get(events_url, headers=headers)
        # CONSISTENCY FIX: the mail loop checked status codes; this one didn't,
        # so an error payload was silently treated as an empty page.
        if response.status_code != 200:
            raise Exception(f"Failed to fetch events: {response.status_code} {response.text}")
        response_data = response.json()
        events.extend(response_data.get('value', []))
        print(f"Fetched {len(events)} events so far...", end='\r')
        events_url = response_data.get('@odata.nextLink')
    return events


def _write_ics(events, output_file):
    """Write the events to output_file as a minimal VCALENDAR."""
    print(f"Saving events to {output_file}...")
    # ROBUSTNESS FIX: the output directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(output_file) or '.', exist_ok=True)
    with open(output_file, 'w') as f:
        f.write("BEGIN:VCALENDAR\nVERSION:2.0\n")
        for event in events:
            if 'start' in event and 'end' in event:
                start = parser.isoparse(event['start']['dateTime'])
                end = parser.isoparse(event['end']['dateTime'])
                f.write(f"BEGIN:VEVENT\nSUMMARY:{event['subject']}\n")
                f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
                f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")
                # NOTE(review): Graph normally returns 'recurrence' as a dict
                # (pattern/range), not a list of RRULE strings -- confirm this
                # branch is ever taken with real payloads.
                if event.get('recurrence'):
                    for rule in event['recurrence']:
                        if rule.startswith('RRULE'):
                            new_rule_parts = []
                            for part in rule.split(';'):
                                if part.startswith('UNTIL='):
                                    until_date = parser.isoparse(part.split('=')[1])
                                    # UNTIL must be UTC when DTSTART is tz-aware.
                                    if start.tzinfo is not None and until_date.tzinfo is None:
                                        until_date = until_date.replace(tzinfo=UTC)
                                    new_rule_parts.append(
                                        f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}")
                                else:
                                    new_rule_parts.append(part)
                            f.write(';'.join(new_rule_parts) + '\n')
                f.write("END:VEVENT\n")
        f.write("END:VCALENDAR\n")


def main():
    """Entry point: authenticate, mirror the inbox to Maildir, export events."""
    headers = _acquire_auth_headers()

    messages = _fetch_messages(headers)

    maildir_path = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')) + "/corteva/INBOX"
    create_maildir_structure(maildir_path)
    for message in messages:
        print(f"Processing message: {message.get('subject', 'No Subject')}", end='\r')
        save_email_to_maildir(maildir_path, message)
    print(f"\nFinished processing {len(messages)} messages.")

    events = _fetch_events(headers)
    _write_ics(events, 'output_ics/outlook_events_latest.ics')


if __name__ == '__main__':
    main()