Make it public!

Bruno Bernardino
2024-03-16 08:40:24 +00:00
commit a5cafdddca
114 changed files with 9569 additions and 0 deletions

lib/feed.ts

@@ -0,0 +1,233 @@
import { DOMParser, initParser } from 'https://deno.land/x/deno_dom@v0.1.45/deno-dom-wasm-noinit.ts';
import { Feed, parseFeed } from 'https://deno.land/x/rss@1.0.0/mod.ts';
import { fetchUrl, fetchUrlAsGooglebot, fetchUrlWithProxy, fetchUrlWithRetries } from './utils.ts';
import { NewsFeed, NewsFeedCrawlType, NewsFeedType } from './types.ts';
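
// Minimal JSON Feed types (https://www.jsonfeed.org/version/1.1/), covering only the fields used below.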
export interface JsonFeedItem {
id: string;
url: string;
title: string;
content_text?: string;
content_html?: string;
summary?: string;
date_modified?: string;
date_published: string;
}

export interface JsonFeed {
version: string;
title: string;
home_page_url?: string;
description?: string;
authors?: { name: string; url?: string }[];
language?: string;
items: JsonFeedItem[];
}
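
// Tries to parse the fetched contents as a JSON Feed first; if JSON parsing fails, falls back to the RSS/Atom parser.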
async function getFeedFromUrlContents(urlContents: string) {
try {
const jsonFeed = JSON.parse(urlContents) as JsonFeed;
return jsonFeed;
} catch (_error) {
const feed = await parseFeed(urlContents);
return feed;
}
}
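
// Fetch-and-parse variants, one per crawl strategy (see the fetchUrl* helpers in utils.ts).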
export async function parseUrl(feedUrl: string) {
const urlContents = await fetchUrl(feedUrl);
const feed = await getFeedFromUrlContents(urlContents);
return feed;
}

export async function parseUrlAsGooglebot(feedUrl: string) {
const urlContents = await fetchUrlAsGooglebot(feedUrl);
const feed = await getFeedFromUrlContents(urlContents);
return feed;
}

export async function parseUrlWithProxy(feedUrl: string) {
const urlContents = await fetchUrlWithProxy(feedUrl);
const feed = await getFeedFromUrlContents(urlContents);
return feed;
}
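
// Escalates crawl strategies until one succeeds: direct fetch, then fetching as Googlebot, then via a proxy. Returns the parsed feed along with the strategy that worked.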
async function parseUrlWithRetries(feedUrl: string): Promise<{ feed: JsonFeed | Feed; crawlType: NewsFeedCrawlType }> {
try {
const feed = await parseUrl(feedUrl);
return { feed, crawlType: 'direct' };
} catch (_error) {
try {
const feed = await parseUrlAsGooglebot(feedUrl);
return { feed, crawlType: 'googlebot' };
} catch (_error) {
const feed = await parseUrlWithProxy(feedUrl);
return { feed, crawlType: 'proxy' };
}
}
}
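
// A URL is considered a valid feed if it parses and exposes at least a title, items, links, or a description.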
export async function isValid(feedUrl: string, fastFail = false) {
try {
console.log('Checking if URL is a valid feed URL', feedUrl);
const { feed } = fastFail ? { feed: await parseUrl(feedUrl) } : await parseUrlWithRetries(feedUrl);
return Boolean(
(feed as Feed).title?.value || (feed as JsonFeed).title || (feed as JsonFeed).items?.length ||
(feed as Feed).links?.length > 0 || feed.description,
);
} catch (error) {
console.log('Failed parsing feed to check validity', feedUrl);
console.log(error);
}
return false;
}
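
// Parses the feed and returns its title, detected type (json, atom, or rss), and the crawl strategy used, shaped for NewsFeed['extra'].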
export async function getFeedInfo(feedUrl: string, fastFail = false): Promise<NewsFeed['extra']> {
try {
console.log('Getting Feed URL info', feedUrl);
const { feed, crawlType } = fastFail
? { feed: await parseUrl(feedUrl), crawlType: 'direct' as const }
: await parseUrlWithRetries(feedUrl);
let feedType: NewsFeedType = 'rss';
if ((feed as JsonFeed).version) {
feedType = 'json';
} else if ((feed as Feed).type === 'ATOM') {
feedType = 'atom';
}
return {
title: (feed as Feed).title?.value || (feed as JsonFeed).title || '',
feed_type: feedType,
crawl_type: crawlType,
};
} catch (error) {
    console.log('Failed parsing feed to get feed info', feedUrl);
console.log(error);
}
return {};
}
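
// Tries to discover a feed for any URL: checks <link rel="alternate"> feed tags in the fetched HTML, then probes common feed paths, returning the first candidate that validates (or null).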
export async function findFeedInUrl(url: string) {
let urlContents = '';
try {
urlContents = await fetchUrl(url);
} catch (error) {
console.log('Failed to fetch URL to find feed', url);
console.log(error);
return null;
}
await initParser();
try {
const document = new DOMParser().parseFromString(urlContents, 'text/html');
const urlOptions = [
url,
document!.querySelector('link[type="application/rss+xml"]')?.getAttribute('href'),
document!.querySelector('link[type="application/atom+xml"]')?.getAttribute('href'),
document!.querySelector('link[rel="alternate"]')?.getAttribute('href'),
// Try some common URL paths
'feed',
'rss',
'rss.xml',
'feed.xml',
'atom.xml',
'atom',
'feeds/posts/default',
].filter(Boolean);
for (const urlOption of urlOptions) {
const optionalSlash = urlOption!.startsWith('/') || url.endsWith('/') ? '' : '/';
const potentialFeedUrl = urlOption!.startsWith('http') ? urlOption : `${url}${optionalSlash}${urlOption}`;
try {
const isValidFeed = await isValid(potentialFeedUrl!, true);
if (isValidFeed) {
return potentialFeedUrl;
}
} catch (_error) {
// Do nothing.
}
}
} catch (error) {
// This error can happen for huge responses, but that usually means the URL works
if (error.toString().includes('RangeError: Maximum call stack size exceeded')) {
return url;
} else {
console.error(error);
}
}
return null;
}
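
// Picks the best article URL from a feed entry's links, preferring the text/html alternate and falling back to the first link.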
export function getArticleUrl(links: Feed['entries'][0]['links']) {
try {
for (const link of links) {
if (link.rel === 'alternate' && link.type?.startsWith('text/html')) {
return link.href || '';
}
}
return links[0]?.href || '';
} catch (_error) {
return '';
}
}
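
// Fetches a page and extracts its title plus HTML and text bodies, preferring <main> or <article> contents over the full <body>.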
export async function getUrlInfo(url: string): Promise<{ title: string; htmlBody: string; textBody: string } | null> {
let urlContents = '';
try {
urlContents = await fetchUrlWithRetries(url);
} catch (error) {
console.log('Failed to fetch URL to get info', url);
console.log(error);
return null;
}
await initParser();
const document = new DOMParser().parseFromString(urlContents, 'text/html');
const title = document!.querySelector('title')?.textContent;
let htmlBody = document!.querySelector('body')?.innerHTML;
let textBody = document!.querySelector('body')?.textContent;
const mainHtml = document!.querySelector('main')?.innerHTML;
const mainText = document!.querySelector('main')?.textContent;
const articleHtml = document!.querySelector('article')?.innerHTML;
const articleText = document!.querySelector('article')?.textContent;
if (mainHtml && mainText) {
htmlBody = mainHtml;
textBody = mainText;
} else if (articleHtml && articleText) {
htmlBody = articleHtml;
textBody = articleText;
}
if (!title || !htmlBody || !textBody) {
return null;
}
return { title, htmlBody, textBody };
}
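
// Strips all markup from an HTML string, returning just its text content.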
export async function parseTextFromHtml(html: string): Promise<string> {
  await initParser();
  const document = new DOMParser().parseFromString(html, 'text/html');
  return document!.textContent;
}
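
// Example usage (an illustrative sketch; the site URL is a placeholder, and this assumes
// network access plus working fetchUrl* helpers in utils.ts):
//
//   const feedUrl = await findFeedInUrl('https://example.com');
//   if (feedUrl) {
//     const info = await getFeedInfo(feedUrl);
//     console.log(info); // e.g. { title: '...', feed_type: 'rss', crawl_type: 'direct' }
//   }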