import { DOMParser, initParser } from 'https://deno.land/x/deno_dom@v0.1.45/deno-dom-wasm-noinit.ts';
import { Feed, parseFeed } from 'https://deno.land/x/rss@1.0.0/mod.ts';
import { fetchUrl, fetchUrlAsGooglebot, fetchUrlWithProxy, fetchUrlWithRetries } from './utils/misc.ts';
import { NewsFeed, NewsFeedCrawlType, NewsFeedType } from './types.ts';
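
/** Single item of a JSON Feed document (https://www.jsonfeed.org/version/1.1/), limited to the fields this module uses. */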
export interface JsonFeedItem {
  id: string;
  url: string;
  title: string;
  content_text?: string;
  content_html?: string;
  summary?: string;
  date_modified?: string;
  date_published: string;
}
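
/** Top-level JSON Feed document, again limited to the fields consumed here. */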
export interface JsonFeed {
  version: string;
  title: string;
  home_page_url?: string;
  description?: string;
  authors?: { name: string; url?: string }[];
  language?: string;
  items: JsonFeedItem[];
}
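
/**
 * Parses raw response text as a feed: JSON Feed first, falling back to
 * RSS/Atom parsing via `parseFeed` when the text is not valid JSON.
 */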
async function getFeedFromUrlContents(urlContents: string) {
  try {
    const jsonFeed = JSON.parse(urlContents) as JsonFeed;
    return jsonFeed;
  } catch (_error) {
    const feed = await parseFeed(urlContents);
    return feed;
  }
}
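
/** Fetches a feed URL directly and parses the response body. */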
export async function parseUrl(feedUrl: string) {
  const urlContents = await fetchUrl(feedUrl);
  const feed = await getFeedFromUrlContents(urlContents);
  return feed;
}
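
/** Fetches a feed URL using the Googlebot fetcher from ./utils/misc.ts and parses the response body. */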
export async function parseUrlAsGooglebot(feedUrl: string) {
  const urlContents = await fetchUrlAsGooglebot(feedUrl);
  const feed = await getFeedFromUrlContents(urlContents);
  return feed;
}
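
/** Fetches a feed URL through the proxy fetcher and parses the response body. */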
export async function parseUrlWithProxy(feedUrl: string) {
  const urlContents = await fetchUrlWithProxy(feedUrl);
  const feed = await getFeedFromUrlContents(urlContents);
  return feed;
}
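
/**
 * Parses a feed URL, escalating through fetch strategies: direct first, then
 * as Googlebot, then through the proxy. Reports which strategy succeeded.
 */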
async function parseUrlWithRetries(feedUrl: string): Promise<{ feed: JsonFeed | Feed; crawlType: NewsFeedCrawlType }> {
  try {
    const feed = await parseUrl(feedUrl);
    return { feed, crawlType: 'direct' };
  } catch (_error) {
    try {
      const feed = await parseUrlAsGooglebot(feedUrl);
      return { feed, crawlType: 'googlebot' };
    } catch (_error) {
      const feed = await parseUrlWithProxy(feedUrl);
      return { feed, crawlType: 'proxy' };
    }
  }
}
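
/**
 * Checks whether `feedUrl` points at a parsable feed that has at least a
 * title, items, links, or a description. With `fastFail` set, only the
 * direct fetch is attempted, skipping the Googlebot/proxy retries.
 *
 * Example (placeholder URL): `await isValid('https://example.com/feed.xml', true)`.
 */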
export async function isValid(feedUrl: string, fastFail = false) {
  try {
    console.log('Checking if URL is a valid feed URL', feedUrl);
    const { feed } = fastFail ? { feed: await parseUrl(feedUrl) } : await parseUrlWithRetries(feedUrl);
    return Boolean(
      (feed as Feed).title?.value || (feed as JsonFeed).title || (feed as JsonFeed).items?.length ||
        ((feed as Feed).links?.length ?? 0) > 0 || feed.description,
    );
  } catch (error) {
    console.log('Failed parsing feed to check validity', feedUrl);
    console.log(error);
  }

  return false;
}
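
/**
 * Fetches and parses `feedUrl` to derive its title, feed type ('json',
 * 'atom', or 'rss'), and the crawl strategy that worked. Returns an empty
 * object when the feed cannot be parsed.
 */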
export async function getFeedInfo(feedUrl: string, fastFail = false): Promise<NewsFeed['extra']> {
  try {
    console.log('Getting Feed URL info', feedUrl);

    const { feed, crawlType } = fastFail
      ? { feed: await parseUrl(feedUrl), crawlType: 'direct' as const }
      : await parseUrlWithRetries(feedUrl);
    let feedType: NewsFeedType = 'rss';

    if ((feed as JsonFeed).version) {
      feedType = 'json';
    } else if ((feed as Feed).type === 'ATOM') {
      feedType = 'atom';
    }

    return {
      title: (feed as Feed).title?.value || (feed as JsonFeed).title || '',
      feed_type: feedType,
      crawl_type: crawlType,
    };
  } catch (error) {
    console.log('Failed parsing feed to get its info', feedUrl);
    console.log(error);
  }

  return {};
}
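
/**
 * Tries to discover a feed for a site URL by validating, in order: the URL
 * itself, any `<link>` elements advertising RSS/Atom alternates, and a list
 * of common feed paths. Returns the first candidate that validates, or null.
 *
 * Example (placeholder URL): `await findFeedInUrl('https://example.com')`.
 */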
export async function findFeedInUrl(url: string) {
  let urlContents = '';
  try {
    urlContents = await fetchUrl(url);
  } catch (error) {
    console.log('Failed to fetch URL to find feed', url);
    console.log(error);
    return null;
  }

  await initParser();

  try {
    const document = new DOMParser().parseFromString(urlContents, 'text/html');

    const urlOptions = [
      url,
      document!.querySelector('link[type="application/rss+xml"]')?.getAttribute('href'),
      document!.querySelector('link[type="application/atom+xml"]')?.getAttribute('href'),
      document!.querySelector('link[rel="alternate"]')?.getAttribute('href'),
      // Try some common URL paths
      'feed',
      'rss',
      'rss.xml',
      'feed.xml',
      'atom.xml',
      'atom',
      'feeds/posts/default',
    ].filter(Boolean);

    for (const urlOption of urlOptions) {
      const optionalSlash = urlOption!.startsWith('/') || url.endsWith('/') ? '' : '/';
      const potentialFeedUrl = urlOption!.startsWith('http') ? urlOption : `${url}${optionalSlash}${urlOption}`;

      try {
        const isValidFeed = await isValid(potentialFeedUrl!, true);

        if (isValidFeed) {
          return potentialFeedUrl;
        }
      } catch (_error) {
        // Do nothing.
      }
    }
  } catch (error) {
    // This error can happen for huge responses, but that usually means the URL works
    if ((error as Error).toString().includes('RangeError: Maximum call stack size exceeded')) {
      return url;
    } else {
      console.error(error);
    }
  }

  return null;
}
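
/**
 * Picks the article URL from a feed entry's links, preferring the
 * `rel="alternate"` link with an HTML type and falling back to the first
 * link. Returns an empty string when no usable link exists.
 */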
export function getArticleUrl(links: Feed['entries'][0]['links']) {
  try {
    for (const link of links) {
      if (link.rel === 'alternate' && link.type?.startsWith('text/html')) {
        return link.href || '';
      }
    }

    return links[0]?.href || '';
  } catch (_error) {
    return '';
  }
}
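
/**
 * Fetches a page and extracts its title plus HTML and plain-text bodies,
 * preferring the contents of `<main>`, then `<article>`, over the full
 * `<body>`. Returns null when the fetch fails or any part is missing.
 */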
export async function getUrlInfo(url: string): Promise<{ title: string; htmlBody: string; textBody: string } | null> {
  let urlContents = '';
  try {
    urlContents = await fetchUrlWithRetries(url);
  } catch (error) {
    console.log('Failed to fetch URL to get info', url);
    console.log(error);
    return null;
  }

  await initParser();

  const document = new DOMParser().parseFromString(urlContents, 'text/html');

  const title = document!.querySelector('title')?.textContent;
  let htmlBody = document!.querySelector('body')?.innerHTML;
  let textBody = document!.querySelector('body')?.textContent;

  const mainHtml = document!.querySelector('main')?.innerHTML;
  const mainText = document!.querySelector('main')?.textContent;

  const articleHtml = document!.querySelector('article')?.innerHTML;
  const articleText = document!.querySelector('article')?.textContent;

  if (mainHtml && mainText) {
    htmlBody = mainHtml;
    textBody = mainText;
  } else if (articleHtml && articleText) {
    htmlBody = articleHtml;
    textBody = articleText;
  }

  if (!title || !htmlBody || !textBody) {
    return null;
  }

  return { title, htmlBody, textBody };
}
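
/** Strips markup from an HTML string by parsing it with deno_dom and returning the document's text content. */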
export async function parseTextFromHtml(html: string): Promise<string> {
  let text = '';

  await initParser();

  const document = new DOMParser().parseFromString(html, 'text/html');

  text = document!.textContent;

  return text;
}