# luk/fetch_outlook.py
# Last modified: 2025-04-22 15:28:14 -06:00
# 211 lines, 8.0 KiB, Python
import os
import msal
import requests
import json
from datetime import datetime
from dateutil import parser
from dateutil.tz import UTC
from email.message import EmailMessage
import time
# Filepath for caching timestamp
cache_timestamp_file = 'cache_timestamp.json'
# Load persisted cache metadata (EAFP): a missing file on first run simply
# means there is no cache yet.
try:
    with open(cache_timestamp_file, 'r') as f:
        cache_timestamp = json.load(f)
except FileNotFoundError:
    cache_timestamp = {}
# Function to check if the cache is still valid
def is_cache_valid(cache_info=None):
    """Return True while the cached fetch is still within its freshness window.

    Args:
        cache_info: Optional mapping with 'timestamp' (epoch seconds when the
            cache was written) and 'max_age' (lifetime in seconds). Defaults
            to the module-level ``cache_timestamp`` loaded at startup.

    Returns:
        True if the cache has not yet expired; False when it has expired or
        when either metadata key is missing (e.g. first run).
    """
    if cache_info is None:
        cache_info = cache_timestamp
    if 'timestamp' in cache_info and 'max_age' in cache_info:
        # Expiry is write-time plus the server-advertised max-age.
        return time.time() < cache_info['timestamp'] + cache_info['max_age']
    return False
# Function to create Maildir structure
def create_maildir_structure(base_path):
    """Ensure the three standard Maildir subdirectories exist under *base_path*."""
    for subdir in ('cur', 'new', 'tmp'):
        os.makedirs(os.path.join(base_path, subdir), exist_ok=True)
# Function to save email to Maildir format
def save_email_to_maildir(maildir_path, email_data):
# Create a new EmailMessage object
msg = EmailMessage()
# Add required headers
msg['Date'] = email_data.get('receivedDateTime', '') # Use the receivedDateTime field
msg['Message-ID'] = email_data.get('id', '') # Use the unique ID of the message
msg['Subject'] = email_data.get('subject', 'No Subject') # Default to 'No Subject' if missing
msg['From'] = email_data.get('from', {}).get('emailAddress', {}).get('address', 'unknown@unknown.com')
msg['To'] = ', '.join([recipient['emailAddress']['address'] for recipient in email_data.get('toRecipients', [])])
# Add the email body
body = email_data.get('body', {}).get('content', '')
msg.set_content(body)
# Save the email to the Maildir 'new' folder
email_filename = f"{msg['Message-ID'] or email_data.get('id', 'unknown')}.eml"
email_filepath = os.path.join(maildir_path, 'new', email_filename)
with open(email_filepath, 'w') as f:
f.write(msg.as_string())
# Read Azure app credentials from environment variables; fail fast when
# either is absent rather than erroring later inside MSAL.
client_id = os.getenv('AZURE_CLIENT_ID')
tenant_id = os.getenv('AZURE_TENANT_ID')
if not (client_id and tenant_id):
    raise ValueError("Please set the AZURE_CLIENT_ID and AZURE_TENANT_ID environment variables.")
# Token cache: persist MSAL tokens across runs so the interactive
# device-code flow is only needed once.
cache = msal.SerializableTokenCache()
cache_file = 'token_cache.bin'
if os.path.exists(cache_file):
    # Use a context manager — the original `open(...).read()` left the
    # file handle dangling until garbage collection.
    with open(cache_file, 'r') as f:
        cache.deserialize(f.read())
# Filepath for caching ETag
etag_cache_file = 'etag_cache.json'
# Load cached ETag if it exists
if os.path.exists(etag_cache_file):
    with open(etag_cache_file, 'r') as f:
        etag_cache = json.load(f)
else:
    etag_cache = {}
# Authentication: try a silent (cached) token first, then fall back to the
# interactive device-code flow.
authority = f'https://login.microsoftonline.com/{tenant_id}'
scopes = ['https://graph.microsoft.com/Calendars.Read', 'https://graph.microsoft.com/Mail.Read']
app = msal.PublicClientApplication(client_id, authority=authority, token_cache=cache)
accounts = app.get_accounts()
token_response = None
if accounts:
    # Returns None when no usable cached/refreshable token exists — the
    # original then crashed with `'access_token' not in None` (TypeError).
    token_response = app.acquire_token_silent(scopes, account=accounts[0])
if not token_response or 'access_token' not in token_response:
    # Silent acquisition unavailable or failed: run the device-code flow.
    flow = app.initiate_device_flow(scopes=scopes)
    if 'user_code' not in flow:
        raise Exception("Failed to create device flow")
    print(flow['message'])
    token_response = app.acquire_token_by_device_flow(flow)
if not token_response or 'access_token' not in token_response:
    raise Exception("Failed to acquire token")
# Save token cache so the next run can authenticate silently.
with open(cache_file, 'w') as f:
    f.write(cache.serialize())
access_token = token_response['access_token']
headers = {'Authorization': f'Bearer {access_token}'}
accounts = app.get_accounts()
if not accounts:
    raise Exception("No accounts found")
mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc'
messages = []
print("Fetching mail...")
# Fetch the total count of messages in the inbox (for progress reporting).
inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox'
response = requests.get(inbox_url, headers=headers)
if response.status_code != 200:
    raise Exception(f"Failed to fetch inbox details: {response.status_code} {response.text}")
total_messages = response.json().get('totalItemCount', 0)
print(f"Total messages in inbox: {total_messages}")
# Decide ONCE, before paginating, whether the local cache is still fresh.
# The original re-checked inside the loop AFTER writing a new timestamp on
# every page, so any nonzero server max-age made the next iteration see a
# "valid" cache and abort the fetch after a single page of 100 messages.
if is_cache_valid():
    print("Using cached messages...")
else:
    max_age = 0
    while mail_url:
        response = requests.get(mail_url, headers=headers)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch mail: {response.status_code} {response.text}")
        # Remember the server-advertised freshness window, if any.
        cache_control = response.headers.get('Cache-Control', '')
        if 'max-age=' in cache_control:
            max_age = int(cache_control.split('max-age=')[1].split(',')[0])
        response_data = response.json()
        # Accumulate the current page of messages.
        messages.extend(response_data.get('value', []))
        progress = (len(messages) / total_messages) * 100 if total_messages > 0 else 0
        print(f"Fetched {len(messages)} of {total_messages} messages ({progress:.2f}%)", end='\r')
        # Follow server-driven pagination until exhausted.
        mail_url = response_data.get('@odata.nextLink')
    # Persist cache metadata only after a COMPLETE fetch.
    cache_timestamp['timestamp'] = time.time()
    cache_timestamp['max_age'] = max_age
    with open(cache_timestamp_file, 'w') as f:
        json.dump(cache_timestamp, f)
print("\nFinished fetching mail.")
maildir_path = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail')) + "/corteva/INBOX"
create_maildir_structure(maildir_path)
for message in messages:
    print(f"Processing message: {message.get('subject', 'No Subject')}", end='\r')
    save_email_to_maildir(maildir_path, message)
print(f"\nFinished processing {len(messages)} messages.")
# Fetch events with pagination and expand recurring events
events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances'
events = []
print("Fetching events...")
while events_url:
    response = requests.get(events_url, headers=headers)
    # Mirror the mail fetch: surface HTTP failures explicitly instead of
    # silently parsing an error payload as if it were event data.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch events: {response.status_code} {response.text}")
    response_data = response.json()
    events.extend(response_data.get('value', []))
    print(f"Fetched {len(events)} events so far...", end='\r')
    # Follow server-driven pagination until exhausted.
    events_url = response_data.get('@odata.nextLink')
# Save events to a file in iCalendar format
output_file = 'output_ics/outlook_events_latest.ics'
# Ensure the output directory exists — open() would otherwise fail with
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
print(f"Saving events to {output_file}...")
with open(output_file, 'w') as f:
    f.write("BEGIN:VCALENDAR\nVERSION:2.0\n")
    for event in events:
        if 'start' not in event or 'end' not in event:
            continue
        # All-day events carry a 'date' field instead of 'dateTime';
        # accept either rather than raising KeyError (as the original did).
        start_str = event['start'].get('dateTime') or event['start'].get('date')
        end_str = event['end'].get('dateTime') or event['end'].get('date')
        if not start_str or not end_str:
            continue
        start = parser.isoparse(start_str)
        end = parser.isoparse(end_str)
        f.write(f"BEGIN:VEVENT\nSUMMARY:{event.get('subject', '')}\n")
        f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
        f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")
        # NOTE(review): this assumes 'recurrence' is an iterable of RRULE
        # strings; Graph normally returns a structured patternedRecurrence
        # object — confirm against real payloads.
        if event.get('recurrence'):
            for rule in event['recurrence']:
                if rule.startswith('RRULE'):
                    # RFC 5545: UNTIL must be in UTC when DTSTART is
                    # timezone-aware, so normalize naive UNTIL values.
                    new_rule_parts = []
                    for part in rule.split(';'):
                        if part.startswith('UNTIL='):
                            until_date = parser.isoparse(part.split('=')[1])
                            if start.tzinfo is not None and until_date.tzinfo is None:
                                until_date = until_date.replace(tzinfo=UTC)
                            new_rule_parts.append(f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}")
                        else:
                            new_rule_parts.append(part)
                    rule = ';'.join(new_rule_parts)
                f.write(f"{rule}\n")
        f.write("END:VEVENT\n")
    f.write("END:VCALENDAR\n")