rspace-online/modules/pubs/parse-document.ts

199 lines
4.6 KiB
TypeScript

/**
 * A run of content blocks, optionally introduced by a heading.
 */
export interface DocumentSection {
  /** Heading text; omitted for a section that has no heading of its own. */
  heading?: string;
  /** Heading depth that accompanies `heading` (number of leading `#` characters). */
  level?: number;
  /** The section's content, in document order. */
  blocks: Array<DocumentBlock>;
}
/**
 * One unit of section content, discriminated by its `type` tag.
 */
export type DocumentBlock =
  // Running text.
  | { type: "paragraph"; text: string }
  // Quoted text, with an optional source attribution.
  | { type: "quote"; text: string; attribution?: string }
  // A list of items; `ordered` distinguishes ordered from unordered lists.
  | { type: "list"; ordered: boolean; items: string[] }
  // Verbatim code, with an optional language label.
  | { type: "code"; language?: string; code: string }
  // A separator between blocks; carries no payload.
  | { type: "separator" };
/**
 * A whole document: its metadata plus the ordered list of sections.
 */
export interface ParsedDocument {
  /** Document title. */
  title: string;
  /** Optional subtitle. */
  subtitle?: string;
  /** Optional author name. */
  author?: string;
  /** Sections in document order. */
  sections: Array<DocumentSection>;
}
/**
 * Parse a Markdown string into a {@link ParsedDocument}.
 *
 * This is a small line-oriented parser, not a full CommonMark implementation.
 * It recognises:
 * - ATX headings of level 1-3 (`#`, `##`, `###`); each one starts a new section;
 * - fenced code blocks delimited by triple backticks, with an optional language;
 * - blockquotes (`>`), including unmarked continuation lines;
 * - flat ordered (`1.`) and unordered (`-`, `*`, `+`) lists with one-line items;
 * - horizontal rules made of three or more `-`, `_` or `*`;
 * - paragraphs: consecutive text lines are joined with a single space, and a
 *   blank line (or any other construct) ends the paragraph.
 *
 * Inline markup is not interpreted; text is returned as written.
 *
 * @param content - Markdown source. LF, CRLF and lone-CR line endings are accepted.
 * @param title - Explicit title. When omitted or empty, the first level-1
 *   heading is consumed as the title instead of opening a section.
 * @param author - Optional author; an empty string is returned as `undefined`.
 * @returns The parsed document. `title` falls back to `"Untitled"`.
 */
export function parseMarkdown(
  content: string,
  title?: string,
  author?: string,
): ParsedDocument {
  // Split on every common line ending. Splitting on "\n" alone leaves a
  // trailing "\r" on CRLF input, which `.` does not match, so the heading and
  // list patterns below would silently fail.
  const lines = content.split(/\r\n?|\n/);

  // Loop-invariant patterns, hoisted out of the per-line loop.
  const headingPattern = /^(#{1,3})\s+(.+)$/;
  const rulePattern = /^(-{3,}|_{3,}|\*{3,})\s*$/;
  const quoteMarkerPattern = /^\s*>[ \t]?/;
  const orderedItemPattern = /^\s*\d+\.\s+(.+)$/;
  const unorderedItemPattern = /^\s*[-*+]\s+(.+)$/;

  const sections: DocumentSection[] = [];
  let currentSection: DocumentSection = { blocks: [] };
  let detectedTitle = title;

  let inCodeBlock = false;
  let codeBlockLang = "";
  let codeLines: string[] = [];

  let inBlockquote = false;
  let quoteLines: string[] = [];

  let inList = false;
  let listItems: string[] = [];
  let listOrdered = false;

  // True only when the immediately preceding line was paragraph text. A text
  // line is appended to the previous paragraph only in that case, so blank
  // lines and other constructs act as paragraph boundaries.
  let previousLineWasText = false;

  /** Emit the pending blockquote, if any, and leave quote mode. */
  function flushQuote() {
    if (quoteLines.length > 0) {
      const text = quoteLines.join("\n").trim();
      currentSection.blocks.push({ type: "quote", text });
      quoteLines = [];
      inBlockquote = false;
    }
  }

  /** Emit the pending list, if any, and leave list mode. */
  function flushList() {
    if (listItems.length > 0) {
      currentSection.blocks.push({
        type: "list",
        ordered: listOrdered,
        items: listItems,
      });
      listItems = [];
      inList = false;
    }
  }

  /**
   * Leave code mode, emitting the collected lines as a code block.
   * A fence with no lines between its delimiters emits nothing, but the
   * state is always reset so that an empty fence cannot swallow later lines.
   */
  function flushCodeBlock() {
    if (codeLines.length > 0) {
      currentSection.blocks.push({
        type: "code",
        language: codeBlockLang || undefined,
        code: codeLines.join("\n"),
      });
    }
    codeLines = [];
    inCodeBlock = false;
    codeBlockLang = "";
  }

  for (const line of lines) {
    const continuesParagraph = previousLineWasText;
    previousLineWasText = false;

    // Code fence: toggles code mode; the fence line itself is never content.
    const leftTrimmed = line.trimStart();
    if (leftTrimmed.startsWith("```")) {
      if (inCodeBlock) {
        flushCodeBlock();
      } else {
        flushQuote();
        flushList();
        inCodeBlock = true;
        codeBlockLang = leftTrimmed.slice(3).trim();
      }
      continue;
    }
    if (inCodeBlock) {
      // Inside a fence every line is kept verbatim.
      codeLines.push(line);
      continue;
    }

    // Heading
    const headingMatch = headingPattern.exec(line);
    if (headingMatch) {
      flushQuote();
      flushList();
      const level = headingMatch[1].length;
      const headingText = headingMatch[2].trim();
      // Use the first h1 as the title if none was provided.
      if (level === 1 && !detectedTitle) {
        detectedTitle = headingText;
        continue;
      }
      // Close the running section (unless it is still empty) and open a new one.
      if (currentSection.blocks.length > 0 || currentSection.heading) {
        sections.push(currentSection);
      }
      currentSection = { heading: headingText, level, blocks: [] };
      continue;
    }

    // Horizontal rule (checked before lists so "---" is not read as an item).
    if (rulePattern.test(line.trim())) {
      flushQuote();
      flushList();
      currentSection.blocks.push({ type: "separator" });
      continue;
    }

    // Blockquote. The space after ">" is optional, so a bare ">" line
    // contributes an empty line to the quote rather than a literal ">".
    const quoteMarker = quoteMarkerPattern.exec(line);
    if (quoteMarker) {
      flushList();
      inBlockquote = true;
      quoteLines.push(line.slice(quoteMarker[0].length));
      continue;
    }
    if (inBlockquote) {
      // An unmarked non-blank line continues the quote; a blank line ends it.
      if (line.trim() === "") {
        flushQuote();
      } else {
        quoteLines.push(line);
      }
      continue;
    }

    // Ordered list item
    const orderedMatch = orderedItemPattern.exec(line);
    if (orderedMatch) {
      flushQuote();
      // Switching list kind starts a new list.
      if (inList && !listOrdered) {
        flushList();
      }
      inList = true;
      listOrdered = true;
      listItems.push(orderedMatch[1]);
      continue;
    }

    // Unordered list item
    const unorderedMatch = unorderedItemPattern.exec(line);
    if (unorderedMatch) {
      flushQuote();
      if (inList && listOrdered) {
        flushList();
      }
      inList = true;
      listOrdered = false;
      listItems.push(unorderedMatch[1]);
      continue;
    }

    // Blank line: ends any open quote, list or paragraph.
    if (line.trim() === "") {
      flushQuote();
      flushList();
      continue;
    }

    // Regular paragraph text
    flushQuote();
    flushList();
    const text = line.trim();
    const lastBlock = currentSection.blocks[currentSection.blocks.length - 1];
    if (continuesParagraph && lastBlock && lastBlock.type === "paragraph") {
      // Soft-wrapped line: join onto the paragraph begun on the previous line.
      lastBlock.text += " " + text;
    } else {
      currentSection.blocks.push({ type: "paragraph", text });
    }
    previousLineWasText = true;
  }

  // Flush whatever is still open at end of input (including an unclosed fence).
  flushQuote();
  flushList();
  flushCodeBlock();
  if (currentSection.blocks.length > 0 || currentSection.heading) {
    sections.push(currentSection);
  }

  return {
    title: detectedTitle || "Untitled",
    author: author || undefined,
    sections,
  };
}