import { DOMParser, initParser } from '@b-fuze/deno-dom/wasm-noinit';
import { Feed, parseFeed } from '@mikaelporttila/rss';

import { fetchUrl, fetchUrlAsGooglebot, fetchUrlWithProxy, fetchUrlWithRetries } from './utils/misc.ts';
import { NewsFeed, NewsFeedCrawlType, NewsFeedType } from './types.ts';

/** A single entry of a JSON-formatted feed, as consumed by this module. */
export interface JsonFeedItem {
  id: string;
  url: string;
  title: string;
  content_text?: string;
  content_html?: string;
  summary?: string;
  date_modified?: string;
  date_published: string;
}

/** Top-level shape of a JSON-formatted feed, as consumed by this module. */
export interface JsonFeed {
  version: string;
  title: string;
  home_page_url?: string;
  description?: string;
  authors?: { name: string; url?: string }[];
  language?: string;
  items: JsonFeedItem[];
}

/** CSS selectors for `<link>` elements that may point at a feed, in the order they are tried. */
const FEED_LINK_SELECTORS = [
  'link[type="application/rss+xml"]',
  'link[type="application/atom+xml"]',
  'link[rel="alternate"]',
];

/** Common feed paths that are appended to the page URL when looking for a feed. */
const COMMON_FEED_PATHS = ['feed', 'rss', 'rss.xml', 'feed.xml', 'atom.xml', 'atom', 'feeds/posts/default'];

/**
 * Parses raw response text into a feed.
 *
 * The text is first tried as JSON; only a plain JSON object is accepted as a JSON feed (its fields are not
 * validated any further). Anything else — invalid JSON, or JSON that is `null`, a primitive or an array — is
 * handed to `parseFeed`.
 *
 * @param urlContents Raw text of the fetched resource.
 * @returns The parsed JSON feed or the result of `parseFeed`.
 * @throws Whatever `parseFeed` throws when the text is not a plain JSON object and cannot be parsed by it.
 */
async function getFeedFromUrlContents(urlContents: string): Promise<JsonFeed | Feed> {
  let parsedJson: unknown;

  try {
    parsedJson = JSON.parse(urlContents);
  } catch (_error) {
    parsedJson = undefined;
  }

  // `JSON.parse` also succeeds for `null`, numbers, strings and arrays; none of those can be a feed.
  if (typeof parsedJson === 'object' && parsedJson !== null && !Array.isArray(parsedJson)) {
    return parsedJson as JsonFeed;
  }

  return await parseFeed(urlContents);
}

/**
 * Fetches `feedUrl` with `fetchUrl` and parses the response as a feed.
 *
 * @throws If fetching or parsing fails.
 */
export async function parseUrl(feedUrl: string): Promise<JsonFeed | Feed> {
  const urlContents = await fetchUrl(feedUrl);
  return await getFeedFromUrlContents(urlContents);
}

/**
 * Fetches `feedUrl` with `fetchUrlAsGooglebot` and parses the response as a feed.
 *
 * @throws If fetching or parsing fails.
 */
export async function parseUrlAsGooglebot(feedUrl: string): Promise<JsonFeed | Feed> {
  const urlContents = await fetchUrlAsGooglebot(feedUrl);
  return await getFeedFromUrlContents(urlContents);
}

/**
 * Fetches `feedUrl` with `fetchUrlWithProxy` and parses the response as a feed.
 *
 * @throws If fetching or parsing fails.
 */
export async function parseUrlWithProxy(feedUrl: string): Promise<JsonFeed | Feed> {
  const urlContents = await fetchUrlWithProxy(feedUrl);
  return await getFeedFromUrlContents(urlContents);
}

/**
 * Parses a feed, trying each fetch strategy in turn: direct, then Googlebot, then proxy.
 *
 * @returns The parsed feed together with the strategy that succeeded.
 * @throws The error of the proxy attempt when all three strategies fail.
 */
async function parseUrlWithRetries(feedUrl: string): Promise<{ feed: JsonFeed | Feed; crawlType: NewsFeedCrawlType }> {
  try {
    return { feed: await parseUrl(feedUrl), crawlType: 'direct' };
  } catch (_directError) {
    // Fall through to the next strategy.
  }

  try {
    return { feed: await parseUrlAsGooglebot(feedUrl), crawlType: 'googlebot' };
  } catch (_googlebotError) {
    // Fall through to the last strategy.
  }

  return { feed: await parseUrlWithProxy(feedUrl), crawlType: 'proxy' };
}

/**
 * Checks whether `feedUrl` resolves to something that looks like a feed: it must parse and expose a title,
 * items, links or a description.
 *
 * @param feedUrl URL to check.
 * @param fastFail When `true`, only a direct fetch is attempted; otherwise all fetch strategies are tried.
 * @returns `true` when the URL parses as a feed with some content; `false` otherwise. Failures are logged,
 * never thrown.
 */
export async function isValid(feedUrl: string, fastFail = false): Promise<boolean> {
  try {
    console.log('Checking if URL is a valid feed URL', feedUrl);

    const { feed } = fastFail ? { feed: await parseUrl(feedUrl) } : await parseUrlWithRetries(feedUrl);

    return Boolean(
      (feed as Feed).title?.value ||
        (feed as JsonFeed).title ||
        (feed as JsonFeed).items?.length ||
        // `links` is absent on JSON feeds, so default the length instead of comparing `undefined > 0`.
        ((feed as Feed).links?.length ?? 0) > 0 ||
        feed.description,
    );
  } catch (error) {
    console.log('Failed parsing feed to check validity', feedUrl);
    console.log(error);
  }

  return false;
}

/**
 * Fetches a feed and extracts its title, its format and the fetch strategy that worked.
 *
 * NOTE(review): the type argument of the returned `Promise` was missing in the reviewed source; the shape
 * below mirrors what this function actually returns — confirm it against the intended `NewsFeed`-derived type.
 *
 * @param feedUrl URL of the feed.
 * @param fastFail When `true`, only a direct fetch is attempted; otherwise all fetch strategies are tried.
 * @returns The feed info, or an empty object when the feed could not be fetched or parsed (the error is logged).
 */
export async function getFeedInfo(
  feedUrl: string,
  fastFail = false,
): Promise<{ title?: string; feed_type?: NewsFeedType; crawl_type?: NewsFeedCrawlType }> {
  try {
    console.log('Getting Feed URL info', feedUrl);

    const { feed, crawlType } = fastFail
      ? { feed: await parseUrl(feedUrl), crawlType: 'direct' as const }
      : await parseUrlWithRetries(feedUrl);

    let feedType: NewsFeedType = 'rss';

    if ((feed as JsonFeed).version) {
      feedType = 'json';
    } else if ((feed as Feed).type === 'ATOM') {
      feedType = 'atom';
    }

    return {
      title: (feed as Feed).title?.value || (feed as JsonFeed).title || '',
      feed_type: feedType,
      crawl_type: crawlType,
    };
  } catch (error) {
    console.log('Failed parsing feed to get info', feedUrl);
    console.log(error);
  }

  return {};
}

/** Appends `path` to `baseUrl`, inserting a single `/` between them when neither side provides one. */
function appendPathToUrl(baseUrl: string, path: string): string {
  const optionalSlash = path.startsWith('/') || baseUrl.endsWith('/') ? '' : '/';
  return `${baseUrl}${optionalSlash}${path}`;
}

/**
 * Turns the `href` of a `<link>` element found on `pageUrl` into an absolute URL.
 *
 * Absolute http(s) URLs are returned untouched. Everything else (root-relative, protocol-relative, relative)
 * is resolved against the page URL with the `URL` constructor; if that throws, plain concatenation is used.
 */
function resolveLinkedFeedUrl(href: string, pageUrl: string): string {
  if (/^https?:\/\//i.test(href)) {
    return href;
  }

  try {
    return new URL(href, pageUrl).href;
  } catch (_error) {
    return appendPathToUrl(pageUrl, href);
  }
}

/**
 * Tries to discover a feed URL starting from a page (or feed) URL.
 *
 * Candidates are checked in this order, and the first one accepted by `isValid` (direct fetch only) wins:
 * the URL itself, feed `<link>` elements found in the page, then a list of common feed paths appended to the URL.
 *
 * @param url URL of the page to inspect.
 * @returns The first valid feed URL, `url` itself when the page is too large to parse, or `null` when the page
 * could not be fetched or no candidate is a valid feed.
 */
export async function findFeedInUrl(url: string): Promise<string | null> {
  let urlContents = '';

  try {
    urlContents = await fetchUrl(url);
  } catch (error) {
    console.log('Failed to fetch URL to find feed', url);
    console.log(error);
    return null;
  }

  await initParser();

  try {
    const document = new DOMParser().parseFromString(urlContents, 'text/html');

    // A Set keeps insertion order and avoids fetching the same candidate twice.
    const candidateUrls = new Set<string>([url]);

    for (const selector of FEED_LINK_SELECTORS) {
      const href = document?.querySelector(selector)?.getAttribute('href');

      if (href) {
        candidateUrls.add(resolveLinkedFeedUrl(href, url));
      }
    }

    // Try some common URL paths
    for (const path of COMMON_FEED_PATHS) {
      candidateUrls.add(appendPathToUrl(url, path));
    }

    for (const candidateUrl of candidateUrls) {
      // `isValid` logs and swallows its own failures, so it never rejects.
      if (await isValid(candidateUrl, true)) {
        return candidateUrl;
      }
    }
  } catch (error) {
    // This error can happen for huge responses, but that usually means the URL works
    if (String(error).includes('RangeError: Maximum call stack size exceeded')) {
      return url;
    }

    console.error(error);
  }

  return null;
}

/**
 * Picks the article URL out of a feed entry's links.
 *
 * The first link with `rel="alternate"` and a `text/html` type is preferred; otherwise the first link is used.
 *
 * @param links Links of a feed entry.
 * @returns The chosen `href`, or an empty string when there is none or `links` cannot be read.
 */
export function getArticleUrl(links: Feed['entries'][0]['links']): string {
  try {
    const htmlAlternate = links.find((link) => link.rel === 'alternate' && link.type?.startsWith('text/html'));

    if (htmlAlternate) {
      return htmlAlternate.href || '';
    }

    return links[0]?.href || '';
  } catch (_error) {
    // `links` may be missing or malformed at runtime despite its type.
    return '';
  }
}

/**
 * Fetches a page (with `fetchUrlWithRetries`) and extracts its title and main content.
 *
 * The content is taken from `<main>` when it has both HTML and text, otherwise from `<article>` under the same
 * condition, otherwise from `<body>`.
 *
 * @param url URL of the page.
 * @returns The title plus the HTML and text of the chosen element, or `null` when the page could not be
 * fetched or parsed, or when any of the three values is empty.
 */
export async function getUrlInfo(url: string): Promise<{ title: string; htmlBody: string; textBody: string } | null> {
  let urlContents = '';

  try {
    urlContents = await fetchUrlWithRetries(url);
  } catch (error) {
    console.log('Failed to fetch URL to get info', url);
    console.log(error);
    return null;
  }

  await initParser();

  const document = new DOMParser().parseFromString(urlContents, 'text/html');

  if (!document) {
    return null;
  }

  const readElement = (selector: string) => {
    const element = document.querySelector(selector);
    return { html: element?.innerHTML, text: element?.textContent };
  };

  const title = document.querySelector('title')?.textContent;
  const body = readElement('body');

  // Prefer the more specific containers, but only when they provide both HTML and text.
  const content = [readElement('main'), readElement('article')].find((candidate) =>
    candidate.html && candidate.text
  ) ?? body;

  const htmlBody = content.html;
  const textBody = content.text;

  if (!title || !htmlBody || !textBody) {
    return null;
  }

  return { title, htmlBody, textBody };
}

/**
 * Extracts the text content of an HTML string.
 *
 * @param html HTML markup to parse.
 * @returns The parsed document's `textContent`, or an empty string when the document could not be parsed or
 * has no text content.
 */
export async function parseTextFromHtml(html: string): Promise<string> {
  await initParser();

  const document = new DOMParser().parseFromString(html, 'text/html');

  return document?.textContent ?? '';
}