Make it public!

Bruno Bernardino
2024-03-16 08:40:24 +00:00
commit a5cafdddca
114 changed files with 9569 additions and 0 deletions

lib/feed.ts

@@ -0,0 +1,233 @@
import { DOMParser, initParser } from 'https://deno.land/x/deno_dom@v0.1.45/deno-dom-wasm-noinit.ts';
import { Feed, parseFeed } from 'https://deno.land/x/rss@1.0.0/mod.ts';
import { fetchUrl, fetchUrlAsGooglebot, fetchUrlWithProxy, fetchUrlWithRetries } from './utils.ts';
import { NewsFeed, NewsFeedCrawlType, NewsFeedType } from './types.ts';
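
// Minimal JSON Feed types (https://www.jsonfeed.org/version/1.1/), covering only the fields used below.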
export interface JsonFeedItem {
id: string;
url: string;
title: string;
content_text?: string;
content_html?: string;
summary?: string;
date_modified?: string;
date_published: string;
}

export interface JsonFeed {
version: string;
title: string;
home_page_url?: string;
description?: string;
authors?: { name: string; url?: string }[];
language?: string;
items: JsonFeedItem[];
}
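
// Tries to parse the fetched contents as a JSON Feed first; if JSON parsing fails, falls back to the RSS/Atom parser.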
async function getFeedFromUrlContents(urlContents: string) {
try {
const jsonFeed = JSON.parse(urlContents) as JsonFeed;
return jsonFeed;
} catch (_error) {
const feed = await parseFeed(urlContents);
return feed;
}
}
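
// Fetch-and-parse variants, one per crawl strategy (see the fetchUrl* helpers in utils.ts).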
export async function parseUrl(feedUrl: string) {
const urlContents = await fetchUrl(feedUrl);
const feed = await getFeedFromUrlContents(urlContents);
return feed;
}

export async function parseUrlAsGooglebot(feedUrl: string) {
const urlContents = await fetchUrlAsGooglebot(feedUrl);
const feed = await getFeedFromUrlContents(urlContents);
return feed;
}

export async function parseUrlWithProxy(feedUrl: string) {
const urlContents = await fetchUrlWithProxy(feedUrl);
const feed = await getFeedFromUrlContents(urlContents);
return feed;
}
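
// Escalates crawl strategies until one succeeds: direct fetch, then fetching as Googlebot, then via a proxy. Returns the parsed feed along with the strategy that worked.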
async function parseUrlWithRetries(feedUrl: string): Promise<{ feed: JsonFeed | Feed; crawlType: NewsFeedCrawlType }> {
try {
const feed = await parseUrl(feedUrl);
return { feed, crawlType: 'direct' };
} catch (_error) {
try {
const feed = await parseUrlAsGooglebot(feedUrl);
return { feed, crawlType: 'googlebot' };
} catch (_error) {
const feed = await parseUrlWithProxy(feedUrl);
return { feed, crawlType: 'proxy' };
}
}
}
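
// A URL is considered a valid feed if it parses and exposes at least a title, items, links, or a description.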
export async function isValid(feedUrl: string, fastFail = false) {
try {
console.log('Checking if URL is a valid feed URL', feedUrl);
const { feed } = fastFail ? { feed: await parseUrl(feedUrl) } : await parseUrlWithRetries(feedUrl);
return Boolean(
(feed as Feed).title?.value || (feed as JsonFeed).title || (feed as JsonFeed).items?.length ||
(feed as Feed).links?.length > 0 || feed.description,
);
} catch (error) {
console.log('Failed parsing feed to check validity', feedUrl);
console.log(error);
}
return false;
}
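
// Parses the feed and returns its title, detected type (json, atom, or rss), and the crawl strategy used, shaped for NewsFeed['extra'].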
export async function getFeedInfo(feedUrl: string, fastFail = false): Promise<NewsFeed['extra']> {
try {
console.log('Getting Feed URL info', feedUrl);
const { feed, crawlType } = fastFail
? { feed: await parseUrl(feedUrl), crawlType: 'direct' as const }
: await parseUrlWithRetries(feedUrl);
let feedType: NewsFeedType = 'rss';
if ((feed as JsonFeed).version) {
feedType = 'json';
} else if ((feed as Feed).type === 'ATOM') {
feedType = 'atom';
}
return {
title: (feed as Feed).title?.value || (feed as JsonFeed).title || '',
feed_type: feedType,
crawl_type: crawlType,
};
} catch (error) {
    console.log('Failed parsing feed to get feed info', feedUrl);
console.log(error);
}
return {};
}
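
// Tries to discover a feed for any URL: checks <link rel="alternate"> feed tags in the fetched HTML, then probes common feed paths, returning the first candidate that validates (or null).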
export async function findFeedInUrl(url: string) {
let urlContents = '';
try {
urlContents = await fetchUrl(url);
} catch (error) {
console.log('Failed to fetch URL to find feed', url);
console.log(error);
return null;
}
await initParser();
try {
const document = new DOMParser().parseFromString(urlContents, 'text/html');
const urlOptions = [
url,
document!.querySelector('link[type="application/rss+xml"]')?.getAttribute('href'),
document!.querySelector('link[type="application/atom+xml"]')?.getAttribute('href'),
document!.querySelector('link[rel="alternate"]')?.getAttribute('href'),
// Try some common URL paths
'feed',
'rss',
'rss.xml',
'feed.xml',
'atom.xml',
'atom',
'feeds/posts/default',
].filter(Boolean);
for (const urlOption of urlOptions) {
const optionalSlash = urlOption!.startsWith('/') || url.endsWith('/') ? '' : '/';
const potentialFeedUrl = urlOption!.startsWith('http') ? urlOption : `${url}${optionalSlash}${urlOption}`;
try {
const isValidFeed = await isValid(potentialFeedUrl!, true);
if (isValidFeed) {
return potentialFeedUrl;
}
} catch (_error) {
// Do nothing.
}
}
} catch (error) {
// This error can happen for huge responses, but that usually means the URL works
if (error.toString().includes('RangeError: Maximum call stack size exceeded')) {
return url;
} else {
console.error(error);
}
}
return null;
}
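
// Picks the best article URL from a feed entry's links, preferring the text/html alternate and falling back to the first link.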
export function getArticleUrl(links: Feed['entries'][0]['links']) {
try {
for (const link of links) {
if (link.rel === 'alternate' && link.type?.startsWith('text/html')) {
return link.href || '';
}
}
return links[0]?.href || '';
} catch (_error) {
return '';
}
}
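
// Fetches a page and extracts its title plus HTML and text bodies, preferring <main> or <article> contents over the full <body>.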
export async function getUrlInfo(url: string): Promise<{ title: string; htmlBody: string; textBody: string } | null> {
let urlContents = '';
try {
urlContents = await fetchUrlWithRetries(url);
} catch (error) {
console.log('Failed to fetch URL to get info', url);
console.log(error);
return null;
}
await initParser();
const document = new DOMParser().parseFromString(urlContents, 'text/html');
const title = document!.querySelector('title')?.textContent;
let htmlBody = document!.querySelector('body')?.innerHTML;
let textBody = document!.querySelector('body')?.textContent;
const mainHtml = document!.querySelector('main')?.innerHTML;
const mainText = document!.querySelector('main')?.textContent;
const articleHtml = document!.querySelector('article')?.innerHTML;
const articleText = document!.querySelector('article')?.textContent;
if (mainHtml && mainText) {
htmlBody = mainHtml;
textBody = mainText;
} else if (articleHtml && articleText) {
htmlBody = articleHtml;
textBody = articleText;
}
if (!title || !htmlBody || !textBody) {
return null;
}
return { title, htmlBody, textBody };
}
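
// Strips all markup from an HTML string, returning just its text content.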
export async function parseTextFromHtml(html: string): Promise<string> {
  await initParser();
  const document = new DOMParser().parseFromString(html, 'text/html');
  return document!.textContent;
}
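
// Example usage (an illustrative sketch; the site URL is a placeholder, and this assumes
// network access plus working fetchUrl* helpers in utils.ts):
//
//   const feedUrl = await findFeedInUrl('https://example.com');
//   if (feedUrl) {
//     const info = await getFeedInfo(feedUrl);
//     console.log(info); // e.g. { title: '...', feed_type: 'rss', crawl_type: 'direct' }
//   }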