# luk/fetch_outlook.py

import os
import re
import msal
import requests
import json
from datetime import datetime
from dateutil import parser
from dateutil.tz import UTC
from email.message import EmailMessage
from email.utils import format_datetime
import time
import html2text

# JSON file that stores the time of the last mail fetch and its max-age
cache_timestamp_file = 'cache_timestamp.json'

# Start with an empty record; replace it with the stored one when the file exists
cache_timestamp = {}
if os.path.exists(cache_timestamp_file):
    with open(cache_timestamp_file, 'r') as f:
        cache_timestamp = json.load(f)

# Function to check if the cache is still valid
def is_cache_valid():
    """Report whether the module-level ``cache_timestamp`` record is still fresh.

    Returns:
        True when the record holds both ``'timestamp'`` and ``'max_age'`` and
        the current time is earlier than their sum; False otherwise, including
        when either key is absent.
    """
    if 'timestamp' not in cache_timestamp or 'max_age' not in cache_timestamp:
        return False

    now = time.time()
    expires_at = cache_timestamp['timestamp'] + cache_timestamp['max_age']
    return now < expires_at


# Function to create Maildir structure
def create_maildir_structure(base_path):
    """Ensure the ``cur``, ``new`` and ``tmp`` directories exist under *base_path*.

    Missing parent directories are created as well; directories that already
    exist are left untouched.

    Args:
        base_path: Root directory of the Maildir folder.
    """
    for subdir in ('cur', 'new', 'tmp'):
        os.makedirs(os.path.join(base_path, subdir), exist_ok=True)

# Function to save email to Maildir format with Markdown conversion
def save_email_to_maildir(maildir_path, email_data):
    """Write one message dict as an ``.eml`` file in the Maildir ``new`` folder.

    Headers (Date, Message-ID, Subject, From, To) are taken from *email_data*.
    An HTML body is converted to Markdown with html2text; any other body is
    used as-is. Spans matching ``<word>BannerStart ... <word>BannerEnd`` are
    removed from the body before it is attached to the message.

    Args:
        maildir_path: Maildir root; its ``new`` subdirectory must already exist.
        email_data: Message dict with the keys read below (``id``, ``subject``,
            ``receivedDateTime``, ``from``, ``toRecipients``, ``body``). Keys
            that are absent or ``None`` fall back to defaults.

    Raises:
        OSError: If the target file cannot be written.
        ValueError: If ``receivedDateTime`` is present but not valid ISO 8601.
    """
    msg = EmailMessage()

    received_datetime = email_data.get('receivedDateTime', '')
    if received_datetime:
        # Parse the ISO 8601 datetime and convert it to RFC 5322 format
        parsed_datetime = parser.isoparse(received_datetime)
        msg['Date'] = format_datetime(parsed_datetime)
    else:
        msg['Date'] = ''  # Leave empty if no receivedDateTime is available

    message_id = email_data.get('id') or ''
    msg['Message-ID'] = message_id

    subject = email_data.get('subject')
    msg['Subject'] = 'No Subject' if subject is None else subject

    # `or {}` guards against keys that are present but null
    sender = (email_data.get('from') or {}).get('emailAddress') or {}
    msg['From'] = sender.get('address') or 'unknown@unknown.com'

    to_addresses = []
    for recipient in email_data.get('toRecipients') or []:
        address = ((recipient or {}).get('emailAddress') or {}).get('address')
        if address:
            to_addresses.append(address)
    msg['To'] = ', '.join(to_addresses)

    # Convert the email body from HTML to Markdown
    body = email_data.get('body') or {}
    body_content = body.get('content') or ''
    if (body.get('contentType') or '').lower() == 'html':
        markdown_converter = html2text.HTML2Text()
        markdown_converter.ignore_images = True
        markdown_converter.ignore_links = False  # Keep links in the Markdown output
        body_markdown = markdown_converter.handle(body_content)
    else:
        body_markdown = body_content  # Use plain text if the body is not HTML

    # Strip banner blocks BEFORE attaching the body; doing it after
    # set_content() would leave the written message unchanged.
    body_markdown = re.sub(r'\w+BannerStart.*?\w+BannerEnd', '', body_markdown, flags=re.DOTALL)
    msg.set_content(body_markdown)

    # Path separators in the id would point outside (or into a missing
    # subdirectory of) the 'new' folder, so neutralise them in the filename.
    safe_id = re.sub(r'[\\/]', '_', message_id) or 'unknown'
    email_filepath = os.path.join(maildir_path, 'new', f"{safe_id}.eml")
    with open(email_filepath, 'w', encoding='utf-8') as f:
        f.write(msg.as_string())

# Azure app registration identifiers come from the environment
client_id = os.getenv('AZURE_CLIENT_ID')
tenant_id = os.getenv('AZURE_TENANT_ID')

# Both values are mandatory; abort early when either is unset or empty
if not (client_id and tenant_id):
    raise ValueError("Please set the AZURE_CLIENT_ID and AZURE_TENANT_ID environment variables.")

# Token cache: restored from disk when a previous run saved one
cache = msal.SerializableTokenCache()
cache_file = 'token_cache.bin'

if os.path.exists(cache_file):
    # Use a context manager so the file handle is closed after reading
    with open(cache_file, 'r') as f:
        cache.deserialize(f.read())

# JSON file holding cached ETag data
etag_cache_file = 'etag_cache.json'

# Default to an empty mapping; replace it with the stored one when the file exists.
# NOTE(review): etag_cache is not referenced again in the visible script.
etag_cache = {}
if os.path.exists(etag_cache_file):
    with open(etag_cache_file, 'r') as f:
        etag_cache = json.load(f)

# Authentication: try the cached account silently first, otherwise run the
# interactive device-code flow.
authority = f'https://login.microsoftonline.com/{tenant_id}'
scopes = ['https://graph.microsoft.com/Calendars.Read', 'https://graph.microsoft.com/Mail.Read']

app = msal.PublicClientApplication(client_id, authority=authority, token_cache=cache)
accounts = app.get_accounts()

token_response = None
if accounts:
    token_response = app.acquire_token_silent(scopes, account=accounts[0])

# acquire_token_silent() may yield None when the cache holds no usable token.
# Fall back to the device flow in that case rather than failing on the
# membership test below with a TypeError.
if not token_response:
    flow = app.initiate_device_flow(scopes=scopes)
    if 'user_code' not in flow:
        raise Exception("Failed to create device flow")
    print(flow['message'])
    token_response = app.acquire_token_by_device_flow(flow)

if 'access_token' not in token_response:
    raise Exception("Failed to acquire token")

# Save token cache so the next run can authenticate silently
with open(cache_file, 'w') as f:
    f.write(cache.serialize())

access_token = token_response['access_token']
headers = {'Authorization': f'Bearer {access_token}'}

# Re-read the accounts after token acquisition and require at least one
accounts = app.get_accounts()

if not accounts:
    raise Exception("No accounts found")
# First page of inbox messages (100 per page, oldest first); later pages are
# taken from each response's @odata.nextLink.
mail_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox/messages?$top=100&$orderby=receivedDateTime asc'
messages = []
print("Fetching mail...")

# Fetch the total count of messages in the inbox (used for the progress display)
inbox_url = 'https://graph.microsoft.com/v1.0/me/mailFolders/inbox'
response = requests.get(inbox_url, headers=headers)

if response.status_code != 200:
    raise Exception(f"Failed to fetch inbox details: {response.status_code} {response.text}")

total_messages = response.json().get('totalItemCount', 0)
print(f"Total messages in inbox: {total_messages}")

# Check cache freshness ONCE, before paging starts. Checking inside the loop
# would see the timestamp written for the page just fetched and, whenever the
# server sends max-age > 0, stop after the first page.
if is_cache_valid():
    print("Using cached messages...")
    mail_url = None  # Skip the fetch loop entirely

pages_fetched = 0
max_age = 0
while mail_url:
    response = requests.get(mail_url, headers=headers)

    if response.status_code != 200:
        raise Exception(f"Failed to fetch mail: {response.status_code} {response.text}")
    pages_fetched += 1

    # Parse the Cache-Control header to get the max-age value (0 when absent)
    max_age_match = re.search(r'max-age=(\d+)', response.headers.get('Cache-Control', ''))
    max_age = int(max_age_match.group(1)) if max_age_match else 0

    # Process the response
    response_data = response.json()
    messages.extend(response_data.get('value', []))  # Add the current page of messages to the list

    # Calculate and display progress percentage
    progress = (len(messages) / total_messages) * 100 if total_messages > 0 else 0
    print(f"Fetched {len(messages)} of {total_messages} messages ({progress:.2f}%)", end='\r')

    # Get the next page URL from @odata.nextLink
    mail_url = response_data.get('@odata.nextLink')

# Record the fetch time only after every page was retrieved, so a failure
# part-way through does not mark the cache as fresh.
if pages_fetched:
    cache_timestamp['timestamp'] = time.time()
    cache_timestamp['max_age'] = max_age
    with open(cache_timestamp_file, 'w') as f:
        json.dump(cache_timestamp, f)

print("\nFinished fetching mail.")

# Maildir folder for the inbox, rooted at $MAILDIR_PATH (default: ~/Mail)
mail_root = os.getenv('MAILDIR_PATH', os.path.expanduser('~/Mail'))
maildir_path = mail_root + "/corteva/INBOX"
create_maildir_structure(maildir_path)

# Write every fetched message, showing the subject currently being handled
for message in messages:
    subject = message.get('subject', 'No Subject')
    print(f"Processing message: {subject}", end='\r')
    save_email_to_maildir(maildir_path, message)

print(f"\nFinished processing {len(messages)} messages.")


# Fetch events with pagination and expand recurring events
events_url = 'https://graph.microsoft.com/v1.0/me/events?$top=100&$expand=instances'
events = []
print("Fetching Calendar events...")
while events_url:
    response = requests.get(events_url, headers=headers)

    # Fail loudly on an error response, as the mail fetch does; otherwise an
    # error payload (no 'value', no next link) would silently end the loop.
    if response.status_code != 200:
        raise Exception(f"Failed to fetch events: {response.status_code} {response.text}")

    response_data = response.json()
    events.extend(response_data.get('value', []))
    print(f"Fetched {len(events)} events so far...", end='\r')

    # Follow the next page link until the service stops providing one
    events_url = response_data.get('@odata.nextLink')

# Save events to a file in iCalendar format
output_file = 'output_ics/outlook_events_latest.ics'
print(f"Saving events to {output_file}...")

# The output directory may not exist on a fresh checkout
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, 'w', encoding='utf-8') as f:
    f.write("BEGIN:VCALENDAR\nVERSION:2.0\n")
    for event in events:
        # Events without both endpoints cannot be expressed as a VEVENT here
        if 'start' not in event or 'end' not in event:
            continue

        start = parser.isoparse(event['start']['dateTime'])
        end = parser.isoparse(event['end']['dateTime'])
        # A missing or null subject becomes an empty summary rather than
        # a KeyError or the literal text "None"
        summary = event.get('subject') or ''
        f.write(f"BEGIN:VEVENT\nSUMMARY:{summary}\n")
        # NOTE(review): times are written without a zone suffix or TZID;
        # confirm consumers interpret them in the intended time zone.
        f.write(f"DTSTART:{start.strftime('%Y%m%dT%H%M%S')}\n")
        f.write(f"DTEND:{end.strftime('%Y%m%dT%H%M%S')}\n")

        # Only a sequence of iCalendar property strings can be written
        # verbatim. Iterating a dict would emit its key names as bogus lines.
        # NOTE(review): Microsoft Graph documents `recurrence` as a
        # patternedRecurrence object; converting it to an RRULE is not
        # implemented here, so such values are skipped.
        recurrence = event.get('recurrence')
        rules = recurrence if isinstance(recurrence, (list, tuple)) else []
        for rule in rules:
            if not isinstance(rule, str):
                continue
            if rule.startswith('RRULE'):
                # Rewrite any UNTIL part into the compact UTC form
                new_rule_parts = []
                for part in rule.split(';'):
                    if part.startswith('UNTIL='):
                        until_value = part.split('=', 1)[1]
                        until_date = parser.isoparse(until_value)
                        if start.tzinfo is not None and until_date.tzinfo is None:
                            until_date = until_date.replace(tzinfo=UTC)
                        new_rule_parts.append(f"UNTIL={until_date.strftime('%Y%m%dT%H%M%SZ')}")
                    else:
                        new_rule_parts.append(part)
                rule = ';'.join(new_rule_parts)
            f.write(f"{rule}\n")
        f.write("END:VEVENT\n")
    f.write("END:VCALENDAR\n")