rspace-online/modules/rfeeds/lib/feed-parser.ts

163 lines
4.2 KiB
TypeScript

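/**
* RSS 2.0 + Atom 1.0 parser.
*
* Uses fast-xml-parser. Detects the format by root element (<rss> vs <feed>),
* with a fallback for RSS 1.0 documents rooted at <rdf:RDF>.
* Returns normalized FeedItem-shaped objects.
* Also parses OPML subscription lists into feed URLs with display names.
*/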
import { XMLParser } from 'fast-xml-parser';
/**
* One feed entry, normalized to the same shape regardless of source format.
*/
export interface ParsedFeedItem {
/**
* Identifier for the entry: the RSS <guid> / Atom <id> text when present,
* otherwise the entry link, otherwise a freshly generated random UUID.
*/
guid: string;
/** Entry title text; empty string when the feed provides none. */
title: string;
/** Link to the entry; empty string when the feed provides none. */
url: string;
/** Plain-text summary: tags stripped, whitespace collapsed, capped at 500 characters. */
summary: string;
/**
* Publication time as epoch milliseconds. Falls back to the time of parsing
* when the feed supplies no date or one that cannot be parsed.
*/
publishedAt: number;
/** Author name; empty string when the feed provides none. */
author: string;
}
/**
* A parsed feed: channel-level metadata plus its normalized entries.
*/
export interface ParsedFeed {
/** Feed title text; empty string when absent. */
title: string;
/** Feed description (RSS <description>, Atom <subtitle>); empty string when absent. */
description: string;
/** Entries in the order they appear in the parsed document. */
items: ParsedFeedItem[];
}
/**
 * Shared XML parser used for both feed documents and OPML files.
 *
 * - Attributes are kept and exposed under an `@_` prefix (e.g. `@_href`).
 * - Element text that sits next to attributes is exposed as `#text`.
 * - `item` and `entry` are always arrays, so a feed with a single entry has
 *   the same shape as one with many.
 */
const parser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: '@_',
  textNodeName: '#text',
  // Feed content is free text. With the library default (`true`) element text
  // that merely looks like a number or boolean is coerced, so "007" becomes 7,
  // "1.0" becomes 1 and "true" becomes a boolean, which corrupts titles and
  // GUIDs. Keep every tag value as the original string instead.
  parseTagValue: false,
  isArray: (name) => ['item', 'entry'].includes(name),
});
/** Named character references decoded by stripHtml (matched case-sensitively). */
const HTML_ENTITIES = new Map<string, string>([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', ' '],
]);

/**
 * Reduce an HTML fragment to plain text.
 *
 * Tags are removed first, then character references are decoded, then runs of
 * whitespace are collapsed to a single space and the result is trimmed.
 *
 * Entities are decoded in a single pass so that already-escaped text is only
 * unescaped once (`&amp;lt;` becomes the literal text `&lt;`, not `<`).
 * Decimal (`&#39;`) and hexadecimal (`&#x27;`) references are supported;
 * unknown names and out-of-range code points are left untouched.
 *
 * @param html HTML (or plain text) to clean up.
 * @returns The plain-text rendering of `html`.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]*>/g, '')
    .replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (match: string, body: string): string => {
      if (body.charAt(0) !== '#') {
        return HTML_ENTITIES.get(body) ?? match;
      }
      const isHex = body.charAt(1).toLowerCase() === 'x';
      const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
      // String.fromCodePoint throws on values outside the Unicode range, and
      // NUL / lone surrogates are never meaningful text: keep the source as is.
      if (!(codePoint > 0 && codePoint <= 0x10ffff) || isSurrogate) {
        return match;
      }
      return String.fromCodePoint(codePoint);
    })
    .replace(/\s+/g, ' ')
    .trim();
}
/**
 * Shorten `s` to at most `max` UTF-16 code units, marking the cut with a
 * trailing ellipsis character (U+2026).
 *
 * @param s Text to shorten.
 * @param max Maximum length of the result, ellipsis included.
 * @returns `s` unchanged when it already fits; otherwise a prefix of `s`
 *   followed by the ellipsis. Returns an empty string when `max` leaves no
 *   room for any output.
 */
function truncate(s: string, max: number): string {
  if (s.length <= max) return s;
  // Without this guard a non-positive `max` turns into a negative slice index,
  // which would keep almost the whole string instead of shortening it.
  if (max <= 0) return '';
  let end = max - 1;
  // Never cut between the two halves of a surrogate pair (emoji and other
  // astral characters); drop the dangling high surrogate instead.
  if (end > 0) {
    const lastUnit = s.charCodeAt(end - 1);
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  }
  return s.slice(0, end) + '\u2026';
}
/**
 * Convert a feed date value to epoch milliseconds.
 *
 * @param raw Date string or numeric timestamp as found in the feed.
 * @returns The timestamp `new Date(raw)` yields, or the current time when
 *   `raw` is falsy or cannot be interpreted as a date.
 */
function parseDate(raw: string | number | undefined): number {
  if (raw) {
    const millis = new Date(raw).getTime();
    if (!Number.isNaN(millis)) {
      return millis;
    }
  }
  return Date.now();
}
/**
 * Extract the text content of a parsed XML node.
 *
 * Handles the shapes the parser can hand back for an element:
 * - a primitive (string, or a number/boolean when the parser coerces values),
 * - an object whose text lives under `#text` (element with attributes),
 * - an array (element repeated), in which case the first non-empty text wins.
 *
 * `0` and `false` are real content and are stringified rather than dropped.
 *
 * @param node Parsed node, or `undefined`/`null` when the element is absent.
 * @returns The node's text, or an empty string when it has none.
 */
function textOf(node: unknown): string {
  if (node == null) return '';
  if (typeof node === 'string') return node;
  if (typeof node === 'number' || typeof node === 'boolean') return String(node);
  if (Array.isArray(node)) {
    for (const element of node) {
      const text = textOf(element);
      if (text) return text;
    }
    return '';
  }
  if (typeof node === 'object') {
    const text = (node as Record<string, unknown>)['#text'];
    // Only primitives are text; a nested object here has nothing to render.
    if (typeof text === 'string') return text;
    if (typeof text === 'number' || typeof text === 'boolean') return String(text);
  }
  return '';
}
/**
 * Normalize a parsed RSS 2.0 document.
 *
 * @param rss Parser output, either the whole document (`{ rss: { channel } }`)
 *   or an object that exposes `channel` directly.
 * @returns Channel title/description and one normalized item per `<item>`.
 *   A missing channel or item list yields empty strings / an empty array.
 */
function parseRSS(rss: any): ParsedFeed {
  const channel = rss.rss?.channel || rss.channel || {};
  const items: any[] = channel.item || [];
  return {
    title: textOf(channel.title),
    description: textOf(channel.description),
    items: items.map((item: any) => {
      // Normalize the link once so `guid` and `url` agree and the guid
      // fallback is always a string, even if <link> carries attributes.
      const link = textOf(item.link);
      const body = textOf(item.description) || textOf(item['content:encoded']);
      return {
        // No <guid> and no link: a random id keeps the item addressable, but
        // it will differ on every parse of the same feed.
        guid: textOf(item.guid) || link || crypto.randomUUID(),
        title: textOf(item.title),
        url: link,
        summary: truncate(stripHtml(body), 500),
        // Some RSS 2.0 feeds carry only the Dublin Core date.
        publishedAt: parseDate(item.pubDate || item['dc:date']),
        author: textOf(item.author) || textOf(item['dc:creator']),
      };
    }),
  };
}
/**
 * Normalize a parsed Atom 1.0 document.
 *
 * @param feed Parser output, either the whole document (`{ feed: {...} }`)
 *   or the `<feed>` element itself.
 * @returns Feed title/subtitle and one normalized item per `<entry>`.
 */
function parseAtom(feed: any): ParsedFeed {
  const root = feed.feed || feed;
  const entries: any[] = root.entry || [];

  // An element repeated in the XML is parsed as an array; use the first
  // <author> so multi-author entries still get a name.
  const authorName = (author: any): string =>
    textOf((Array.isArray(author) ? author[0] : author)?.name);

  // Per RFC 4287, entries without their own <author> inherit the feed's.
  const feedAuthor = authorName(root.author);

  return {
    title: textOf(root.title),
    description: textOf(root.subtitle),
    items: entries.map((entry: any) => {
      // With several <link> elements, pick the first one that is the
      // "alternate" representation (explicitly, or by omitting rel).
      const link = Array.isArray(entry.link)
        ? entry.link.find((l: any) => l['@_rel'] === 'alternate' || !l['@_rel'])
        : entry.link;
      const href: string = link?.['@_href'] || textOf(link);
      return {
        // No <id> and no link: a random id keeps the item addressable, but
        // it will differ on every parse of the same feed.
        guid: textOf(entry.id) || href || crypto.randomUUID(),
        title: textOf(entry.title),
        url: href,
        summary: truncate(stripHtml(textOf(entry.summary) || textOf(entry.content)), 500),
        publishedAt: parseDate(entry.published || entry.updated),
        author: authorName(entry.author) || feedAuthor,
      };
    }),
  };
}
/**
 * Normalize the `<rdf:RDF>` root of a parsed RSS 1.0 document.
 * Items are siblings of `<channel>` in this format, not children of it.
 */
function parseRDF(rdf: any): ParsedFeed {
  const items: any[] = rdf.item || [];
  return {
    title: textOf(rdf.channel?.title),
    description: textOf(rdf.channel?.description),
    items: items.map((item: any) => {
      const link = textOf(item.link);
      return {
        guid: link || crypto.randomUUID(),
        title: textOf(item.title),
        url: link,
        summary: truncate(stripHtml(textOf(item.description)), 500),
        publishedAt: parseDate(item['dc:date']),
        author: textOf(item['dc:creator']),
      };
    }),
  };
}

/**
 * Parse a feed document into a normalized ParsedFeed.
 *
 * The format is detected from the root element: `<rss>` (or a bare
 * `<channel>`) is RSS 2.0, `<feed>` is Atom, `<rdf:RDF>` is RSS 1.0.
 *
 * @param xml Raw feed XML.
 * @returns The normalized feed.
 * @throws Error with message 'Unrecognized feed format' when none of the
 *   known root elements is present.
 */
export function parseFeed(xml: string): ParsedFeed {
  const parsed = parser.parse(xml);

  if (parsed.rss || parsed.channel) {
    return parseRSS(parsed);
  }
  if (parsed.feed) {
    return parseAtom(parsed);
  }

  // RSS 1.0. Accept the document when it has a channel even if it currently
  // lists no items, so an empty feed parses to an empty item list instead of
  // being rejected as unrecognized.
  const rdf = parsed['rdf:RDF'];
  if (rdf && (rdf.item || rdf.channel)) {
    return parseRDF(rdf);
  }

  throw new Error('Unrecognized feed format');
}
/**
 * Extract feed subscriptions from an OPML document.
 *
 * Every `<outline>` carrying an `xmlUrl` (or lowercase `xmlurl`) attribute is
 * reported, at any nesting depth, in document order. The display name is the
 * outline's `title`, else its `text`, else the URL itself.
 *
 * @param xml Raw OPML XML.
 * @returns One `{ url, name }` record per feed outline; empty when the
 *   document has no body or no feed outlines.
 */
export function parseOPML(xml: string): { url: string; name: string }[] {
  const doc = parser.parse(xml);
  const feeds: { url: string; name: string }[] = [];

  // Depth-first visit; a lone outline is parsed as an object, siblings as an array.
  const visit = (outlines: any): void => {
    if (!outlines) {
      return;
    }
    const list = Array.isArray(outlines) ? outlines : [outlines];
    list.forEach((outline: any) => {
      const feedUrl = outline['@_xmlUrl'] || outline['@_xmlurl'];
      if (feedUrl) {
        const name = outline['@_title'] || outline['@_text'] || feedUrl;
        feeds.push({ url: feedUrl, name });
      }
      // Folders (and feeds) may nest further outlines.
      if (outline.outline) {
        visit(outline.outline);
      }
    });
  };

  const body = doc.opml?.body || doc.body;
  if (body?.outline) {
    visit(body.outline);
  }
  return feeds;
}