# luk/fetch_outlook.py
import base64
import glob
import json
import os
import re
import time
from datetime import datetime

import html2text
import msal
import requests
from dateutil import parser
from dateutil.tz import UTC
from email.message import EmailMessage
from email.utils import format_datetime
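
# High-level flow: authenticate against Microsoft Graph with MSAL (token cache
# plus device-code flow), push local Maildir changes (read/trashed messages)
# back to the server, fetch the inbox into a local Maildir, then export
# calendar events to an iCalendar file.
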
# Filepath for caching timestamp
cache_timestamp_file = 'cache_timestamp.json'
# Filepath for sync timestamp
sync_timestamp_file = 'sync_timestamp.json'
# Function to load the last sync timestamp
def load_last_sync_timestamp():
    if os.path.exists(sync_timestamp_file):
        with open(sync_timestamp_file, 'r') as f:
            return json.load(f).get('last_sync', 0)
    return 0
# Function to save the current sync timestamp
def save_sync_timestamp():
    with open(sync_timestamp_file, 'w') as f:
        json.dump({'last_sync': time.time()}, f)
# Function to synchronize maildir with the server
def synchronize_maildir(maildir_path, headers):
    last_sync = load_last_sync_timestamp()  # loaded for reference; the sync below is unconditional
    # Find messages moved from "new" to "cur" and mark them as read.
    # Compare basenames: the two path sets live in different directories,
    # so a direct set difference of full paths would match everything in "cur".
    new_dir = os.path.join(maildir_path, 'new')
    cur_dir = os.path.join(maildir_path, 'cur')
    new_names = {os.path.basename(f) for f in glob.glob(os.path.join(new_dir, '*.eml'))}
    cur_names = {os.path.basename(f) for f in glob.glob(os.path.join(cur_dir, '*.eml'))}
    moved_to_cur = cur_names - new_names
    for filename in moved_to_cur:
        message_id = os.path.splitext(filename)[0]  # Extract the Message-ID from the filename
        print(f"Marking message as read: {message_id}")
        response = requests.patch(
            f'https://graph.microsoft.com/v1.0/me/messages/{message_id}',
            headers=headers,
            json={'isRead': True}
        )
        if response.status_code != 200:
            print(f"Failed to mark message as read: {message_id}, {response.status_code}, {response.text}")
    # Find messages moved to ".Trash/cur" and delete them on the server
    trash_dir = os.path.join(maildir_path, '.Trash', 'cur')
    trash_files = set(glob.glob(os.path.join(trash_dir, '*.eml')))
    for filepath in trash_files:
        message_id = os.path.splitext(os.path.basename(filepath))[0]  # Extract the Message-ID from the filename
        print(f"Moving message to trash: {message_id}")
        response = requests.delete(
            f'https://graph.microsoft.com/v1.0/me/messages/{message_id}',
            headers=headers
        )
        if response.status_code != 204:  # 204 No Content indicates success
            print(f"Failed to move message to trash: {message_id}, {response.status_code}, {response.text}")
    # Save the current sync timestamp
    save_sync_timestamp()
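
# Note on Maildir semantics (assumption): this script writes bare
# "<graph-id>.eml" filenames and the globs above expect them to survive
# unchanged. A strictly Maildir-conformant client appends ":2,<flags>" info
# when moving a message into cur/, which "*.eml" would no longer match; if
# your client does that, widen the glob or strip the suffix before comparing.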
# Load cached timestamp if it exists
if os.path.exists(cache_timestamp_file):
    with open(cache_timestamp_file, 'r') as f:
        cache_timestamp = json.load(f)
else:
    cache_timestamp = {}
# Function to check if the cache is still valid
def is_cache_valid():
    if 'timestamp' in cache_timestamp and 'max_age' in cache_timestamp:
        current_time = time.time()
        cache_expiry_time = cache_timestamp['timestamp'] + cache_timestamp['max_age']
        return current_time < cache_expiry_time
    return False
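
# If the server never sends a max-age directive, max_age is stored as 0 and
# this check always returns False, so mail is re-fetched on every run.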
# Function to create Maildir structure
def create_maildir_structure(base_path):
    os.makedirs(os.path.join(base_path, 'cur'), exist_ok=True)
    os.makedirs(os.path.join(base_path, 'new'), exist_ok=True)
    os.makedirs(os.path.join(base_path, 'tmp'), exist_ok=True)
def save_email_to_maildir(maildir_path, email_data, attachments_dir):
    # Create a new EmailMessage object
    msg = EmailMessage()
    received_datetime = email_data.get('receivedDateTime', '')
    if received_datetime:
        parsed_datetime = parser.isoparse(received_datetime)
        msg['Date'] = format_datetime(parsed_datetime)
    else:
        msg['Date'] = ''
    msg['Message-ID'] = email_data.get('id', '')
    msg['Subject'] = email_data.get('subject', 'No Subject')
    msg['From'] = email_data.get('from', {}).get('emailAddress', {}).get('address', 'unknown@unknown.com')
    msg['To'] = ', '.join(recipient['emailAddress']['address'] for recipient in email_data.get('toRecipients', []))
    msg['Cc'] = ', '.join(recipient['emailAddress']['address'] for recipient in email_data.get('ccRecipients', []))
    # Convert the email body from HTML to Markdown
    body_html = email_data.get('body', {}).get('content', '')
    if email_data.get('body', {}).get('contentType', '').lower() == 'html':
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_images = True
        markdown_converter.ignore_links = False
        body_markdown = markdown_converter.handle(body_html)
    else:
        body_markdown = body_html
    # Remove lines between any alphanumeric BannerStart and BannerEnd
    body_markdown = re.sub(r'\w+BannerStart.*?\w+BannerEnd', '', body_markdown, flags=re.DOTALL)
    msg.set_content(body_markdown)
    # Save attachments; Graph delivers contentBytes base64-encoded, so decode
    # it before writing rather than dumping the base64 text itself
    for attachment in email_data.get('attachments', []):
        attachment_name = attachment.get('name', 'unknown')
        attachment_content = attachment.get('contentBytes')
        if attachment_content:
            attachment_bytes = base64.b64decode(attachment_content)
            attachment_path = os.path.join(attachments_dir, attachment_name)
            with open(attachment_path, 'wb') as f:
                f.write(attachment_bytes)
            # add_attachment requires an explicit MIME type when given raw bytes
            maintype, _, subtype = attachment.get('contentType', 'application/octet-stream').partition('/')
            msg.add_attachment(attachment_bytes, maintype=maintype,
                               subtype=subtype or 'octet-stream', filename=attachment_name)
    # Unread messages belong in "new"; read messages go straight to "cur"
    target_dir = 'cur' if email_data.get('isRead', False) else 'new'
    email_filename = f"{msg['Message-ID']}.eml"
    email_filepath = os.path.join(maildir_path, target_dir, email_filename)
    # Check if the file already exists in any subfolder
    for root, _, files in os.walk(maildir_path):
        if email_filename in files:
            print(f"Message {msg['Message-ID']} already exists in {root}. Skipping save.")
            return
    # Save the email to the Maildir
    with open(email_filepath, 'w') as f:
        f.write(msg.as_string())
    print(f"Saved message {msg['Message-ID']} to {email_filepath}")
# Read Azure app credentials from environment variables
client_id = os.getenv('AZURE_CLIENT_ID')
tenant_id = os.getenv('AZURE_TENANT_ID')
if not client_id or not tenant_id:
    raise ValueError("Please set the AZURE_CLIENT_ID and AZURE_TENANT_ID environment variables.")
# Token cache
cache = msal.SerializableTokenCache()
cache_file = 'token_cache.bin'
if os.path.exists(cache_file):
    with open(cache_file, 'r') as f:
        cache.deserialize(f.read())
# Filepath for caching ETag
etag_cache_file = 'etag_cache.json'
# Load cached ETag if it exists
if os.path.exists(etag_cache_file):
    with open(etag_cache_file, 'r') as f:
        etag_cache = json.load(f)
else:
    etag_cache = {}
# Authentication
authority = f'https://login.microsoftonline.com/{tenant_id}'
scopes = ['https://graph.microsoft.com/Calendars.Read', 'https://graph.microsoft.com/Mail.ReadWrite']
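# Scope notes: Mail.ReadWrite lets synchronize_maildir PATCH isRead and
# DELETE messages; Calendars.Read covers the events export at the bottom.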
app = msal.PublicClientApplication(client_id, authority=authority, token_cache=cache)
accounts = app.get_accounts()
token_response = None
if accounts:
    # May return None if the cached token is expired and cannot be refreshed
    token_response = app.acquire_token_silent(scopes, account=accounts[0])
if not token_response:
    flow = app.initiate_device_flow(scopes=scopes)
    if 'user_code' not in flow:
        raise Exception("Failed to create device flow")
    print(flow['message'])
    token_response = app.acquire_token_by_device_flow(flow)
if 'access_token' not in token_response:
    raise Exception("Failed to acquire token")
# Save token cache
with open(cache_file, 'w') as f:
    f.write(cache.serialize())
access_token = token_response['access_token']
headers = {'Authorization': f'Bearer {access_token}'}
accounts = app.get_accounts()
if not accounts:
    raise Exception("No accounts found")
# Resolve the Maildir path once; reused below when saving mail
maildir_path = os.path.join(os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')), 'corteva')
# Call the synchronization function before fetching mail
print("Synchronizing maildir with server...")
synchronize_maildir(maildir_path=maildir_path, headers=headers)
print("Synchronization complete.")
mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc&$select=id,subject,from,toRecipients,ccRecipients,receivedDateTime,isRead,body,attachments'
messages = []
print("Fetching mail...")
# Fetch the total count of messages in the inbox
inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox'
response = requests.get(inbox_url, headers=headers)
if response.status_code != 200:
    raise Exception(f"Failed to fetch inbox details: {response.status_code} {response.text}")
total_messages = response.json().get('totalItemCount', 0)
print(f"Total messages in inbox: {total_messages}")
while mail_url:
    if is_cache_valid():
        print("Using cached messages...")
        break  # No need to fetch further, cache is still valid
    response = requests.get(mail_url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch mail: {response.status_code} {response.text}")
    # Parse the Cache-Control header to get the max-age value
    cache_control = response.headers.get('Cache-Control', '')
    max_age = 0
    if 'max-age=' in cache_control:
        max_age = int(cache_control.split('max-age=')[1].split(',')[0])
    # Update the cache timestamp and max-age
    cache_timestamp['timestamp'] = time.time()
    cache_timestamp['max_age'] = max_age
    with open(cache_timestamp_file, 'w') as f:
        json.dump(cache_timestamp, f)
    # Process the response
    response_data = response.json()
    messages.extend(response_data.get('value', []))  # Add the current page of messages to the list
    # Calculate and display progress percentage
    progress = (len(messages) / total_messages) * 100 if total_messages > 0 else 0
    print(f"Fetched {len(messages)} of {total_messages} messages ({progress:.2f}%)", end='\r')
    # Get the next page URL from @odata.nextLink
    mail_url = response_data.get('@odata.nextLink')
print("\nFinished fetching mail. Now saving them to maildir.")
# Save emails to Maildir (maildir_path was resolved above, before the sync)
attachments_dir = os.path.join(maildir_path, 'attachments')
os.makedirs(attachments_dir, exist_ok=True)
create_maildir_structure(maildir_path)
for message in messages:
    print(f"Processing message: {message.get('subject', 'No Subject')}", end='\r')
    save_email_to_maildir(maildir_path, message, attachments_dir)
print(f"\nFinished processing {len(messages)} messages.")
# Fetch events with pagination and expand recurring events
events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances'
events = []
print("Fetching Calendar events...")
while events_url:
    response = requests.get(events_url, headers=headers)
    response_data = response.json()
    events.extend(response_data.get('value', []))
    print(f"Fetched {len(events)} events so far...", end='\r')
    events_url = response_data.get('@odata.nextLink')
# Save events to a file in iCalendar format
os.makedirs('output_ics', exist_ok=True)
output_file = 'output_ics/outlook_events_latest.ics'
print(f"Saving events to {output_file}...")
with open(output_file, 'w') as f:
    # PRODID is required by RFC 5545
    f.write("BEGIN:VCALENDAR\nVERSION:2.0\nPRODID:-//fetch_outlook//EN\n")
    for event in events:
        if 'start' in event and 'end' in event:
            start = parser.isoparse(event['start']['dateTime'])
            end = parser.isoparse(event['end']['dateTime'])
            f.write(f"BEGIN:VEVENT\nSUMMARY:{event['subject']}\n")
            f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
            f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")
            # Check if 'recurrence' exists and is not None. Note: this branch
            # expects RRULE-style strings; the Graph API normally returns a
            # structured patternedRecurrence object instead, in which case
            # nothing is written here.
            if 'recurrence' in event and event['recurrence']:
                for rule in event['recurrence']:
                    if rule.startswith('RRULE'):
                        # Normalize UNTIL to UTC so it matches the Z suffix
                        rule_parts = rule.split(';')
                        new_rule_parts = []
                        for part in rule_parts:
                            if part.startswith('UNTIL='):
                                until_value = part.split('=')[1]
                                until_date = parser.isoparse(until_value)
                                if start.tzinfo is not None and until_date.tzinfo is None:
                                    until_date = until_date.replace(tzinfo=UTC)
                                new_rule_parts.append(f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}")
                            else:
                                new_rule_parts.append(part)
                        rule = ';'.join(new_rule_parts)
                        f.write(f"{rule}\n")
            f.write("END:VEVENT\n")
    f.write("END:VCALENDAR\n")