/**
 * RSS 2.0 + Atom 1.0 parser.
 *
 * Uses fast-xml-parser. Detects the format by root element (`<rss>` vs
 * `<feed>`), with a fallback for RDF-style documents (`<rdf:RDF>`).
 * Returns normalized FeedItem-shaped objects.
 */
import { XMLParser } from 'fast-xml-parser';

/** One normalized entry of a feed, independent of the source format. */
export interface ParsedFeedItem {
  guid: string;
  title: string;
  url: string;
  summary: string;
  /** Milliseconds since the Unix epoch. */
  publishedAt: number;
  author: string;
}

/** A normalized feed: channel-level metadata plus its entries. */
export interface ParsedFeed {
  title: string;
  description: string;
  items: ParsedFeedItem[];
}

/** Maximum length (in UTF-16 code units) of a generated item summary. */
const SUMMARY_MAX_LENGTH = 500;

const parser = new XMLParser({
  ignoreAttributes: false,
  attributeNamePrefix: '@_',
  textNodeName: '#text',
  // Force arrays so a feed with a single item/entry has the same shape as
  // one with many.
  isArray: (name) => ['item', 'entry'].includes(name),
});

/**
 * Entity name (the part between `&` and `;`) to replacement character.
 * A non-breaking space becomes a plain space so it is collapsed together
 * with other whitespace afterwards.
 */
const HTML_ENTITIES: Record<string, string> = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
  '#39': "'",
  nbsp: ' ',
};

/**
 * Reduces an HTML fragment to plain text: removes tags, decodes a small set
 * of common entities, collapses whitespace runs to one space and trims.
 *
 * @param html - HTML (or plain text) fragment.
 * @returns The plain-text rendering of `html`.
 */
function stripHtml(html: string): string {
  return html
    .replace(/<[^>]*>/g, '')
    // Decode in a single pass: each entity is replaced exactly once, so an
    // escaped ampersand followed by "lt;" stays literal text instead of being
    // decoded a second time into "<".
    .replace(
      /&(amp|lt|gt|quot|apos|#39|nbsp);/g,
      (match: string, name: string) => HTML_ENTITIES[name] ?? match,
    )
    .replace(/\s+/g, ' ')
    .trim();
}

/**
 * Shortens `s` to at most `max` UTF-16 code units, ending with an ellipsis
 * character (U+2026) when something was cut off.
 *
 * @param s - Text to shorten.
 * @param max - Maximum length of the result, ellipsis included.
 * @returns `s` unchanged if it already fits, otherwise the truncated text.
 */
function truncate(s: string, max: number): string {
  if (s.length <= max) return s;
  let end = max - 1;
  // Do not cut between the two halves of a surrogate pair; a dangling high
  // surrogate would render as a replacement character.
  const lastUnit = s.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) end -= 1;
  return s.slice(0, end) + '\u2026';
}

/** Builds an item summary: plain text, capped at SUMMARY_MAX_LENGTH. */
function summarize(raw: string): string {
  return truncate(stripHtml(raw), SUMMARY_MAX_LENGTH);
}

/**
 * Converts a raw date value to epoch milliseconds.
 *
 * @param raw - Date string or number as found in the feed.
 * @returns The parsed timestamp, or the current time when `raw` is missing
 *   or cannot be parsed by `Date`.
 */
function parseDate(raw: string | number | undefined): number {
  if (!raw) return Date.now();
  const time = new Date(raw).getTime();
  return Number.isNaN(time) ? Date.now() : time;
}

/**
 * Extracts the text of a parsed XML node.
 *
 * Accepts a bare scalar, an object carrying its text under `#text` (the
 * configured `textNodeName`, used when the element also has attributes), or
 * an array of such nodes (first element wins).
 *
 * @param node - Parsed node of unknown shape.
 * @returns The node text, or `''` when there is none.
 */
function textOf(node: any): string {
  if (node == null) return '';
  if (typeof node === 'string') return node;
  // Scalars are checked by type rather than truthiness so that a literal 0
  // is kept. NOTE(review): booleans are handled on the assumption that the
  // parser may coerce "true"/"false" text - confirm against parser options.
  if (typeof node === 'number' || typeof node === 'boolean') return String(node);
  if (Array.isArray(node)) return node.length > 0 ? textOf(node[0]) : '';
  if (typeof node === 'object') {
    const text = node['#text'];
    if (text != null) return String(text);
  }
  return '';
}

/**
 * Normalizes a parsed RSS document.
 *
 * @param rss - Parser output; the channel is looked up at `rss.rss.channel`
 *   or `rss.channel`.
 * @returns The normalized feed. Items without a guid fall back to their link
 *   and finally to a random UUID.
 */
function parseRSS(rss: any): ParsedFeed {
  const channel = rss.rss?.channel || rss.channel || {};
  const items: any[] = channel.item || [];

  return {
    title: textOf(channel.title),
    description: textOf(channel.description),
    items: items.map((item: any): ParsedFeedItem => {
      const url = textOf(item.link);
      return {
        // The link goes through textOf as well so the guid is always a
        // string, even when the link element parsed to an object.
        guid: textOf(item.guid) || url || crypto.randomUUID(),
        title: textOf(item.title),
        url,
        summary: summarize(textOf(item.description) || textOf(item['content:encoded'])),
        publishedAt: parseDate(item.pubDate),
        author: textOf(item.author) || textOf(item['dc:creator']),
      };
    }),
  };
}

/**
 * Picks the link node of an Atom entry: for multiple links, the first one
 * whose `rel` is `alternate` or absent; a single link is returned as is.
 */
function pickAtomLink(link: any): any {
  if (!Array.isArray(link)) return link;
  return link.find((l: any) => l?.['@_rel'] === 'alternate' || !l?.['@_rel']);
}

/**
 * Normalizes a parsed Atom document.
 *
 * @param feed - Parser output; the root is `feed.feed`, or `feed` itself.
 * @returns The normalized feed. Entries without an id fall back to their
 *   link href and finally to a random UUID.
 */
function parseAtom(feed: any): ParsedFeed {
  const root = feed.feed || feed;
  const entries: any[] = root.entry || [];

  return {
    title: textOf(root.title),
    description: textOf(root.subtitle),
    items: entries.map((entry: any): ParsedFeedItem => {
      const link = pickAtomLink(entry.link);
      const href: string = link?.['@_href'] || textOf(link);
      // Several <author> elements parse to an array; use the first one.
      const author = Array.isArray(entry.author) ? entry.author[0] : entry.author;
      return {
        guid: textOf(entry.id) || href || crypto.randomUUID(),
        title: textOf(entry.title),
        url: href,
        summary: summarize(textOf(entry.summary) || textOf(entry.content)),
        publishedAt: parseDate(entry.published || entry.updated),
        author: textOf(author?.name),
      };
    }),
  };
}

/**
 * Normalizes the content of an `<rdf:RDF>` root element, where the items are
 * siblings of the channel rather than its children.
 *
 * @param rdf - The parsed `rdf:RDF` node.
 * @returns The normalized feed; the item link doubles as guid.
 */
function parseRDF(rdf: any): ParsedFeed {
  const items: any[] = rdf.item || [];

  return {
    title: textOf(rdf.channel?.title),
    description: textOf(rdf.channel?.description),
    items: items.map((item: any): ParsedFeedItem => {
      const url = textOf(item.link);
      return {
        guid: url || crypto.randomUUID(),
        title: textOf(item.title),
        url,
        summary: summarize(textOf(item.description)),
        publishedAt: parseDate(item['dc:date']),
        author: textOf(item['dc:creator']),
      };
    }),
  };
}

/**
 * Parses feed XML into a normalized structure.
 *
 * @param xml - Raw XML of an RSS, Atom or RDF feed.
 * @returns The normalized feed.
 * @throws Error with message `Unrecognized feed format` when none of the
 *   known root elements is present.
 */
export function parseFeed(xml: string): ParsedFeed {
  const parsed = parser.parse(xml);

  // Detect format by root element.
  if (parsed.rss || parsed.channel) return parseRSS(parsed);
  if (parsed.feed) return parseAtom(parsed);

  // Fallback: RSS-like structure under an RDF root.
  const rdf = parsed['rdf:RDF'];
  if (rdf?.item) return parseRDF(rdf);

  throw new Error('Unrecognized feed format');
}

/**
 * Parse OPML XML into a list of feed URLs with titles.
 *
 * Outlines are walked recursively; every outline with an `xmlUrl` (or
 * `xmlurl`) attribute produces one result, named after its `title`, then
 * `text`, then the URL itself.
 *
 * @param xml - Raw OPML document.
 * @returns Feeds in document order; empty when no outline has a feed URL.
 */
export function parseOPML(xml: string): { url: string; name: string }[] {
  const parsed = parser.parse(xml);
  const results: { url: string; name: string }[] = [];

  function walk(node: any): void {
    if (!node) return;
    const outlines: any[] = Array.isArray(node) ? node : [node];
    for (const outline of outlines) {
      // Skip anything that is not an element object (e.g. stray text).
      if (outline == null || typeof outline !== 'object') continue;
      const url = outline['@_xmlUrl'] || outline['@_xmlurl'];
      if (url) {
        results.push({
          url,
          name: outline['@_title'] || outline['@_text'] || url,
        });
      }
      if (outline.outline) walk(outline.outline);
    }
  }

  const body = parsed.opml?.body || parsed.body;
  if (body?.outline) walk(body.outline);
  return results;
}