Files
luk/fetch_outlook.py
Tim Bendt 4770bcb459 wip
2025-04-23 07:14:53 -06:00

230 lines
9.0 KiB
Python

import os
import re
import msal
import requests
import json
from datetime import datetime
from dateutil import parser
from dateutil.tz import UTC
from email.message import EmailMessage
from email.utils import format_datetime
import time
import html2text
# JSON file holding the cached fetch timestamp and its max-age
cache_timestamp_file = 'cache_timestamp.json'

# Start from an empty record and replace it with the stored one when the file is present
cache_timestamp = {}
if os.path.exists(cache_timestamp_file):
    with open(cache_timestamp_file, 'r') as f:
        cache_timestamp = json.load(f)
def is_cache_valid():
    """Report whether the cached fetch timestamp has not expired yet.

    Reads the module-level ``cache_timestamp`` mapping. The cache counts as
    valid only when both ``'timestamp'`` and ``'max_age'`` are present and the
    current time is still earlier than their sum.

    Returns:
        True while the cache is fresh, False otherwise (including when either
        key is missing).
    """
    has_required_keys = 'timestamp' in cache_timestamp and 'max_age' in cache_timestamp
    if not has_required_keys:
        return False
    expires_at = cache_timestamp['timestamp'] + cache_timestamp['max_age']
    return time.time() < expires_at
def create_maildir_structure(base_path):
    """Ensure the ``cur``, ``new`` and ``tmp`` subdirectories exist under *base_path*.

    Missing parent directories are created as well; directories that already
    exist are left untouched.
    """
    for subdir in ('cur', 'new', 'tmp'):
        os.makedirs(os.path.join(base_path, subdir), exist_ok=True)
def save_email_to_maildir(maildir_path, email_data):
    """Write one fetched message into the Maildir ``new`` folder as an ``.eml`` file.

    HTML bodies are converted to Markdown with html2text (images dropped,
    links kept); any other body is stored unchanged. Every
    ``<word>BannerStart ... <word>BannerEnd`` span is removed from the body
    before it is attached to the message.

    Args:
        maildir_path: Maildir root directory; its ``new`` subdirectory must
            already exist.
        email_data: Message dictionary. The keys read here are ``id``,
            ``subject``, ``from``, ``toRecipients``, ``receivedDateTime`` and
            ``body`` (with ``content`` / ``contentType``). Missing or null
            values fall back to defaults instead of raising.
    """
    msg = EmailMessage()

    received_datetime = email_data.get('receivedDateTime', '')
    if received_datetime:
        # ISO 8601 timestamp -> RFC 5322 date string for the header
        msg['Date'] = format_datetime(parser.isoparse(received_datetime))
    else:
        msg['Date'] = ''  # Header is kept, but empty, when no receive time is known

    msg['Message-ID'] = email_data.get('id', '')  # The message's unique ID doubles as Message-ID

    subject = email_data.get('subject', 'No Subject')
    msg['Subject'] = 'No Subject' if subject is None else subject

    # 'from' / 'emailAddress' may be absent or null; fall back to a placeholder sender
    sender = (email_data.get('from') or {}).get('emailAddress') or {}
    msg['From'] = sender.get('address') or 'unknown@unknown.com'

    # Skip recipients that carry no usable address instead of raising KeyError/TypeError
    to_addresses = []
    for recipient in email_data.get('toRecipients') or []:
        address = ((recipient or {}).get('emailAddress') or {}).get('address')
        if address:
            to_addresses.append(address)
    msg['To'] = ', '.join(to_addresses)

    body = email_data.get('body') or {}
    body_content = body.get('content') or ''
    if (body.get('contentType') or '').lower() == 'html':
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_images = True
        markdown_converter.ignore_links = False  # Keep links in the Markdown output
        body_markdown = markdown_converter.handle(body_content)
    else:
        body_markdown = body_content  # Not HTML: store the text as-is

    # Strip banner blocks BEFORE attaching the body; doing it after set_content()
    # would leave the banners in the saved message.
    body_markdown = re.sub(r'\w+BannerStart.*?\w+BannerEnd', '', body_markdown, flags=re.DOTALL)
    msg.set_content(body_markdown)

    email_filename = f"{msg['Message-ID'] or email_data.get('id', 'unknown')}.eml"
    email_filepath = os.path.join(maildir_path, 'new', email_filename)
    with open(email_filepath, 'w', encoding='utf-8') as f:
        f.write(msg.as_string())
# Read Azure app credentials from environment variables
client_id = os.getenv('AZURE_CLIENT_ID')
tenant_id = os.getenv('AZURE_TENANT_ID')
if not client_id or not tenant_id:
    raise ValueError("Please set the AZURE_CLIENT_ID and AZURE_TENANT_ID environment variables.")

# Token cache, restored from disk when a previous run saved one
cache = msal.SerializableTokenCache()
cache_file = 'token_cache.bin'
if os.path.exists(cache_file):
    # Use a context manager so the file handle is closed right after reading
    with open(cache_file, 'r') as token_cache_handle:
        cache.deserialize(token_cache_handle.read())

# Filepath for caching ETag
etag_cache_file = 'etag_cache.json'
# Load cached ETag if it exists, otherwise start with an empty mapping
if os.path.exists(etag_cache_file):
    with open(etag_cache_file, 'r') as f:
        etag_cache = json.load(f)
else:
    etag_cache = {}
# Authentication
authority = f'https://login.microsoftonline.com/{tenant_id}'
scopes = ['https://graph.microsoft.com/Calendars.Read', 'https://graph.microsoft.com/Mail.Read']
app = msal.PublicClientApplication(client_id, authority=authority, token_cache=cache)

accounts = app.get_accounts()
token_response = None
if accounts:
    # Try the cached account first. acquire_token_silent() returns None when the
    # cache holds no usable token, so the result must be checked before use.
    token_response = app.acquire_token_silent(scopes, account=accounts[0])
if not token_response:
    # No cached account, or silent acquisition failed: fall back to the
    # interactive device-code flow instead of crashing on a None response.
    flow = app.initiate_device_flow(scopes=scopes)
    if 'user_code' not in flow:
        raise Exception("Failed to create device flow")
    print(flow['message'])
    token_response = app.acquire_token_by_device_flow(flow)
if 'access_token' not in token_response:
    raise Exception("Failed to acquire token")

# Save token cache so the next run can authenticate silently
with open(cache_file, 'w') as f:
    f.write(cache.serialize())

access_token = token_response['access_token']
headers = {'Authorization': f'Bearer {access_token}'}

# Re-read the accounts: after a successful sign-in at least one must be known
accounts = app.get_accounts()
if not accounts:
    raise Exception("No accounts found")
mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc'
messages = []
print("Fetching mail...")

# Fetch the total count of messages in the inbox (used only for the progress display)
inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox'
response = requests.get(inbox_url, headers=headers)
if response.status_code != 200:
    raise Exception(f"Failed to fetch inbox details: {response.status_code} {response.text}")
total_messages = response.json().get('totalItemCount', 0)
print(f"Total messages in inbox: {total_messages}")

# Consult the cache exactly once, before paging starts. The loop below refreshes
# the cache timestamp after every page, so re-checking inside the loop would
# declare the cache valid right after page one and silently drop all later pages
# whenever the server sends a positive max-age.
if is_cache_valid():
    print("Using cached messages...")
    mail_url = None  # Skip the fetch loop entirely

while mail_url:
    response = requests.get(mail_url, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch mail: {response.status_code} {response.text}")

    # Parse the Cache-Control header to get the max-age value; a regex tolerates
    # extra directives or separators that a plain int() conversion would choke on
    cache_control = response.headers.get('Cache-Control', '')
    max_age_match = re.search(r'max-age=(\d+)', cache_control)
    max_age = int(max_age_match.group(1)) if max_age_match else 0

    # Update the cache timestamp and max-age
    cache_timestamp['timestamp'] = time.time()
    cache_timestamp['max_age'] = max_age
    with open(cache_timestamp_file, 'w') as f:
        json.dump(cache_timestamp, f)

    # Process the response: add the current page of messages to the list
    response_data = response.json()
    messages.extend(response_data.get('value', []))

    # Calculate and display progress percentage
    progress = (len(messages) / total_messages) * 100 if total_messages > 0 else 0
    print(f"Fetched {len(messages)} of {total_messages} messages ({progress:.2f}%)", end='\r')

    # Get the next page URL from @odata.nextLink; absent on the last page, which ends the loop
    mail_url = response_data.get('@odata.nextLink')
print("\nFinished fetching mail.")
# Destination Maildir: $MAILDIR_PATH (default ~/Mail) followed by the fixed account/folder suffix
maildir_root = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail'))
maildir_path = maildir_root + "/corteva/INBOX"
create_maildir_structure(maildir_path)

# Write every fetched message into the Maildir, showing the subject being processed
for message in messages:
    subject = message.get('subject', 'No Subject')
    print(f"Processing message: {subject}", end='\r')
    save_email_to_maildir(maildir_path, message)
print(f"\nFinished processing {len(messages)} messages.")
# Fetch events with pagination and expand recurring events
events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances'
events = []
print("Fetching Calendar events...")
while events_url:
    response = requests.get(events_url, headers=headers)
    # Fail loudly on an error response, as the mail fetch does; otherwise an
    # error payload (no 'value', no nextLink) would silently end the loop.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch events: {response.status_code} {response.text}")
    response_data = response.json()
    events.extend(response_data.get('value', []))
    print(f"Fetched {len(events)} events so far...", end='\r')
    # Follow @odata.nextLink until the last page, where it is absent
    events_url = response_data.get('@odata.nextLink')
# Save events to a file in iCalendar format
output_file = 'output_ics/outlook_events_latest.ics'
print(f"Saving events to {output_file}...")
# open() does not create missing parent directories, so make sure the folder exists
os.makedirs(os.path.dirname(output_file), exist_ok=True)


def _escape_ics_text(value):
    """Escape *value* for use as an iCalendar TEXT value (backslash, ';', ',' and newlines)."""
    text = '' if value is None else str(value)
    text = text.replace('\\', '\\\\').replace(';', '\\;').replace(',', '\\,')
    return text.replace('\r\n', '\\n').replace('\n', '\\n').replace('\r', '\\n')


def _normalize_rrule(rule, start):
    """Return *rule* with any ``UNTIL=`` part rewritten as a ``YYYYMMDDTHHMMSSZ`` stamp."""
    normalized_parts = []
    for part in rule.split(';'):
        if part.startswith('UNTIL='):
            until_date = parser.isoparse(part.split('=', 1)[1])
            if until_date.tzinfo is not None:
                # The trailing 'Z' means UTC, so convert aware values instead of
                # stamping a local wall-clock time as if it were UTC
                until_date = until_date.astimezone(UTC)
            elif start.tzinfo is not None:
                # Naive UNTIL next to an aware start: treat it as UTC
                until_date = until_date.replace(tzinfo=UTC)
            part = f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}"
        normalized_parts.append(part)
    return ';'.join(normalized_parts)


with open(output_file, 'w', encoding='utf-8') as f:
    f.write("BEGIN:VCALENDAR\nVERSION:2.0\n")
    for event in events:
        # Events without both a start and an end cannot be represented; skip them
        if not (event.get('start') and event.get('end')):
            continue
        start = parser.isoparse(event['start']['dateTime'])
        end = parser.isoparse(event['end']['dateTime'])
        # .get() plus escaping: a missing/null subject yields an empty SUMMARY
        # rather than a KeyError or the literal text "None"
        f.write(f"BEGIN:VEVENT\nSUMMARY:{_escape_ics_text(event.get('subject'))}\n")
        f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
        f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")
        recurrence = event.get('recurrence')
        if recurrence:
            # NOTE(review): this loop expects an iterable of RRULE strings. The
            # Graph API appears to describe 'recurrence' as an object with
            # 'pattern'/'range' members, in which case nothing matches here and
            # no RRULE is emitted - confirm against the API response.
            for rule in recurrence:
                # Only RRULE strings are written; anything else is ignored
                if isinstance(rule, str) and rule.startswith('RRULE'):
                    f.write(f"{_normalize_rrule(rule, start)}\n")
        f.write("END:VEVENT\n")
    f.write("END:VCALENDAR\n")