// rspace-online/modules/rfeeds/lib/feed-parser.ts

/**
 * RSS 2.0, RSS 1.0 (RDF) and Atom 1.0 feed parser, plus an OPML outline reader.
 *
 * Uses fast-xml-parser. The feed format is detected from the root element
 * (<rss>, <feed> or <rdf:RDF>).
 * Returns normalized FeedItem-shaped objects.
 */

import { XMLParser } from 'fast-xml-parser';

/** A single feed entry, normalized to one shape across the supported feed formats. */
export interface ParsedFeedItem {
	/** Identifier taken from the feed where available, otherwise the entry link or a random UUID. */
	guid: string;
	/** Entry title ('' when absent). */
	title: string;
	/** Link to the entry ('' when absent). */
	url: string;
	/** Entry description/content with HTML tags stripped, capped at 500 characters. */
	summary: string;
	/** Publication time in milliseconds since the Unix epoch; the time of parsing when the feed gives no parseable date. */
	publishedAt: number;
	/** Author name ('' when absent). */
	author: string;
}

/** Feed-level metadata together with its normalized entries. */
export interface ParsedFeed {
	/** Feed title ('' when absent). */
	title: string;
	/** Feed description (RSS <description>, Atom <subtitle>); '' when absent. */
	description: string;
	/** Entries in the order they were returned by the XML parser. */
	items: ParsedFeedItem[];
}

// Shared XML parser instance used for both feeds and OPML documents.
// - Attributes are kept and exposed under an '@_' prefix (e.g. '@_href').
// - Element text that sits next to attributes is exposed under '#text'.
// - <item> and <entry> are always arrays, even when a feed has a single one.
// NOTE(review): tag-value parsing is left at the library default — confirm
// whether numeric-looking text (e.g. all-digit guids) must stay a string.
const parser = new XMLParser({
	ignoreAttributes: false,
	attributeNamePrefix: '@_',
	textNodeName: '#text',
	isArray: (name) => name === 'item' || name === 'entry',
});

/**
 * Converts an HTML fragment into single-line plain text.
 *
 * Tags are removed first, then character references are decoded in a single
 * pass, and finally runs of whitespace are collapsed to one space.
 *
 * Decoding happens in one pass on purpose: replacing `&amp;` before the other
 * entities would turn `&amp;lt;` into `<` instead of the intended `&lt;`.
 *
 * @param html - HTML (or plain text) to clean up.
 * @returns The text content, trimmed, with no tags and no line breaks.
 */
function stripHtml(html: string): string {
	const namedEntities = new Map<string, string>([
		['amp', '&'],
		['lt', '<'],
		['gt', '>'],
		['quot', '"'],
		['apos', "'"],
		['nbsp', ' '],
	]);

	// Decodes one `&...;` reference; anything unknown or invalid is left untouched.
	const decodeEntity = (match: string, body: string): string => {
		if (body.startsWith('#')) {
			const isHex = body[1] === 'x' || body[1] === 'X';
			const code = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
			// Reject NUL, out-of-range values and lone surrogates: fromCodePoint
			// would throw or produce an invalid string for them.
			const isValid = code > 0 && code <= 0x10ffff && !(code >= 0xd800 && code <= 0xdfff);
			return isValid ? String.fromCodePoint(code) : match;
		}
		return namedEntities.get(body) ?? match;
	};

	return html
		.replace(/<[^>]*>/g, '')
		.replace(/&(#[xX][0-9a-fA-F]+|#[0-9]+|[a-zA-Z]+);/g, decodeEntity)
		.replace(/\s+/g, ' ')
		.trim();
}

/**
 * Shortens a string to at most `max` UTF-16 code units, ending it with an
 * ellipsis (U+2026) when anything was cut off.
 *
 * @param s - Text to shorten.
 * @param max - Maximum length of the result, ellipsis included.
 * @returns `s` unchanged when it already fits; otherwise a shortened copy.
 *   Returns '' when `max` is zero or negative.
 */
function truncate(s: string, max: number): string {
	if (s.length <= max) return s;
	// A non-positive budget leaves no room even for the ellipsis; without this
	// guard slice(0, max - 1) would count from the end of the string.
	if (max <= 0) return '';

	let end = max - 1;
	if (end > 0) {
		// Never cut between the two halves of a surrogate pair (e.g. an emoji),
		// which would leave an invalid lone surrogate in the output.
		const last = s.charCodeAt(end - 1);
		if (last >= 0xd800 && last <= 0xdbff) end -= 1;
	}
	return s.slice(0, end) + '\u2026';
}

/**
 * Converts a feed date value into epoch milliseconds.
 *
 * @param raw - Date string or numeric timestamp taken from the feed.
 * @returns The parsed time in milliseconds, or the current time when `raw`
 *   is empty/missing or cannot be parsed by `Date`.
 */
function parseDate(raw: string | number | undefined): number {
	if (raw) {
		const ms = new Date(raw).getTime();
		if (!Number.isNaN(ms)) return ms;
	}
	return Date.now();
}

/**
 * Extracts the text of a parsed XML node.
 *
 * The parser yields a primitive for elements without attributes and an object
 * carrying the text under '#text' for elements that have attributes. Both
 * shapes are accepted here.
 *
 * Values are tested against null/undefined rather than for truthiness so that
 * legitimate content such as `0` or `false` is not silently dropped.
 *
 * @param node - A value taken from the parsed XML tree.
 * @returns The node's text, or '' when it has none.
 */
function textOf(node: unknown): string {
	if (node == null) return '';
	if (typeof node === 'string') return node;
	if (typeof node === 'number' || typeof node === 'boolean') return String(node);
	if (typeof node === 'object') {
		const text = (node as Record<string, unknown>)['#text'];
		// Only primitives are stringified; a nested object would become "[object Object]".
		if (text != null && typeof text !== 'object') return String(text);
	}
	return '';
}

/**
 * Normalizes a parsed RSS 2.0 document.
 *
 * @param rss - Parser output whose channel is found at `rss.channel` or at
 *   the top-level `channel` key.
 * @returns The channel title/description and one normalized item per <item>.
 *   A document without a channel or without items yields an empty item list.
 */
function parseRSS(rss: any): ParsedFeed {
	const channel = rss.rss?.channel || rss.channel || {};
	const items = channel.item || [];

	return {
		title: textOf(channel.title),
		description: textOf(channel.description),
		items: items.map((item: any): ParsedFeedItem => {
			// Resolve the link once, through textOf, so the guid fallback is
			// always a string even when <link> was parsed into an object.
			const url = textOf(item.link);
			const body = textOf(item.description) || textOf(item['content:encoded']);

			return {
				// A random UUID is the last resort for items with neither guid nor link.
				guid: textOf(item.guid) || url || crypto.randomUUID(),
				title: textOf(item.title),
				url,
				summary: truncate(stripHtml(body), 500),
				// Feeds that omit <pubDate> may still carry a Dublin Core date.
				publishedAt: parseDate(item.pubDate || item['dc:date']),
				author: textOf(item.author) || textOf(item['dc:creator']),
			};
		}),
	};
}

/**
 * Normalizes a parsed Atom 1.0 document.
 *
 * @param feed - Parser output; the feed element is read from `feed.feed`,
 *   falling back to `feed` itself.
 * @returns The feed title/subtitle and one normalized item per <entry>.
 */
function parseAtom(feed: any): ParsedFeed {
	const root = feed.feed || feed;
	const entries = root.entry || [];

	// Collects <author><name> values. Repeated <author> elements arrive as an
	// array, a single one as an object; names are joined with ', '.
	const authorNames = (author: any): string => {
		const authors = Array.isArray(author) ? author : [author];
		return authors
			.map((a: any) => textOf(a?.name))
			.filter((name: string) => name !== '')
			.join(', ');
	};

	// Per RFC 4287, the feed's authors apply to entries that declare none.
	const feedAuthor = authorNames(root.author);

	return {
		title: textOf(root.title),
		description: textOf(root.subtitle),
		items: entries.map((entry: any): ParsedFeedItem => {
			// With several <link> elements, pick the alternate one; a link
			// without a rel attribute counts as alternate.
			const link = Array.isArray(entry.link)
				? entry.link.find((l: any) => l['@_rel'] === 'alternate' || !l['@_rel'])
				: entry.link;
			const href = link?.['@_href'] || textOf(link) || '';

			return {
				// A random UUID is the last resort for entries with neither id nor link.
				guid: textOf(entry.id) || href || crypto.randomUUID(),
				title: textOf(entry.title),
				url: href,
				summary: truncate(stripHtml(textOf(entry.summary) || textOf(entry.content)), 500),
				publishedAt: parseDate(entry.published || entry.updated),
				author: authorNames(entry.author) || feedAuthor,
			};
		}),
	};
}

/**
 * Parses a feed document and normalizes it.
 *
 * The format is chosen from the root element: <rss> (or a bare <channel>) is
 * handled as RSS 2.0, <feed> as Atom, and <rdf:RDF> as RSS 1.0.
 *
 * @param xml - Raw feed XML.
 * @returns The feed metadata and its normalized items.
 * @throws Error with message 'Unrecognized feed format' when none of the
 *   known root elements is present.
 */
export function parseFeed(xml: string): ParsedFeed {
	const parsed = parser.parse(xml);

	if (parsed.rss || parsed.channel) {
		return parseRSS(parsed);
	}
	if (parsed.feed) {
		return parseAtom(parsed);
	}

	// RSS 1.0: items are siblings of <channel> under <rdf:RDF>. Accept the
	// document when either is present so a feed that currently has no items
	// yields an empty list instead of being rejected.
	const rdf = parsed['rdf:RDF'];
	if (rdf && (rdf.channel || rdf.item)) {
		const items = rdf.item || [];
		return {
			title: textOf(rdf.channel?.title),
			description: textOf(rdf.channel?.description),
			items: items.map((item: any): ParsedFeedItem => {
				const url = textOf(item.link);
				const body = textOf(item.description) || textOf(item['content:encoded']);

				return {
					// A random UUID is the last resort for items without a link.
					guid: url || crypto.randomUUID(),
					title: textOf(item.title),
					url,
					summary: truncate(stripHtml(body), 500),
					publishedAt: parseDate(item['dc:date']),
					author: textOf(item['dc:creator']),
				};
			}),
		};
	}

	throw new Error('Unrecognized feed format');
}

/**
 * Extracts feed subscriptions from an OPML document.
 *
 * Outlines are visited depth-first in document order, so feeds nested inside
 * folder outlines are included. An outline counts as a feed when it has an
 * `xmlUrl` (or `xmlurl`) attribute; its name is the `title` attribute, then
 * `text`, then the URL itself.
 *
 * @param xml - Raw OPML XML.
 * @returns One `{ url, name }` pair per feed outline; empty when none exist.
 */
export function parseOPML(xml: string): { url: string; name: string }[] {
	const doc = parser.parse(xml);
	const feeds: { url: string; name: string }[] = [];

	// An outline child may be a single object or an array of them.
	const visit = (outlines: any): void => {
		if (!outlines) return;
		const list = Array.isArray(outlines) ? outlines : [outlines];
		list.forEach((outline: any) => {
			const feedUrl = outline['@_xmlUrl'] || outline['@_xmlurl'];
			if (feedUrl) {
				const label = outline['@_title'] || outline['@_text'] || feedUrl;
				feeds.push({ url: feedUrl, name: label });
			}
			if (outline.outline) visit(outline.outline);
		});
	};

	const body = doc.opml?.body || doc.body;
	if (body?.outline) visit(body.outline);
	return feeds;
}