rspace-online/modules/rdocs/converters/markitdown.ts

56 lines
1.6 KiB
TypeScript

/**
* MarkItDown converter — spawns Microsoft's markitdown CLI to convert
* office documents (PDF, DOCX, PPTX, XLSX) to Markdown.
*
* Follows the same Bun.spawn pattern as rpubs/typst-compile.ts.
*/
import { writeFile, readFile, mkdir, rm } from "node:fs/promises";
import { join } from "node:path";
import { randomUUID } from "node:crypto";
/** Lower-case file extensions (leading dot included) that markitdown is asked to handle. */
export const MARKITDOWN_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"] as const;

/**
 * Report whether `filename` ends in one of the {@link MARKITDOWN_EXTS}
 * extensions. The comparison ignores letter case.
 *
 * @param filename - File name (or path) to inspect.
 * @returns `true` when the name carries a supported extension.
 */
export function isMarkitdownFormat(filename: string): boolean {
  const lowered = filename.toLowerCase();
  // None of the supported extensions contains a second dot, so a suffix match
  // is the same as comparing everything from the last "." onwards.
  return MARKITDOWN_EXTS.some((supported) => lowered.endsWith(supported));
}
/**
 * Build the name under which the upload is stored inside the scratch directory.
 *
 * Only a purely alphanumeric extension is carried over from the caller-supplied
 * name; everything else (directories, "..", separators) is discarded so the
 * resulting path can never point outside the scratch directory.
 */
function markitdownInputName(filename: string): string {
  const dot = filename.lastIndexOf(".");
  const ext = dot >= 0 ? filename.slice(dot) : "";
  return /^\.[A-Za-z0-9]+$/.test(ext) ? `input${ext}` : "input";
}

/**
 * Convert an office file to Markdown via the markitdown CLI.
 *
 * The bytes are written to a per-call scratch directory under `/tmp`,
 * `markitdown <input> -o <output>` is run on them, and the generated Markdown
 * file is read back. The scratch directory is removed afterwards, whether the
 * conversion succeeded or not.
 *
 * @param filename - Original file name; only its extension is used, to name
 *   the scratch copy handed to the CLI.
 * @param data - Raw file contents.
 * @returns The Markdown written by markitdown.
 * @throws Error if markitdown exits with a non-zero status (the message
 *   includes the exit code and captured stderr). File-system errors from
 *   writing the input or reading the output propagate unchanged.
 */
export async function convertWithMarkitdown(
  filename: string,
  data: Uint8Array,
): Promise<string> {
  const jobId = randomUUID();
  const tmpDir = join("/tmp", `markitdown-${jobId}`);
  await mkdir(tmpDir, { recursive: true });

  // Never join the caller-supplied name directly: "../x.pdf" would escape
  // tmpDir and "sub/x.pdf" would target a directory that does not exist.
  const inputPath = join(tmpDir, markitdownInputName(filename));
  const outputPath = join(tmpDir, "output.md");

  try {
    await writeFile(inputPath, data);

    const proc = Bun.spawn(["markitdown", inputPath, "-o", outputPath], {
      // The result is delivered through the -o file, so stdout is not needed;
      // leaving it as an unread pipe could stall the child once it fills up.
      stdout: "ignore",
      stderr: "pipe",
    });

    // Drain stderr while waiting for the exit status rather than afterwards,
    // so a process that writes a lot of diagnostics cannot block on the pipe.
    const [exitCode, stderr] = await Promise.all([
      proc.exited,
      new Response(proc.stderr).text(),
    ]);

    if (exitCode !== 0) {
      throw new Error(`markitdown failed (exit ${exitCode}): ${stderr}`);
    }

    return await readFile(outputPath, "utf-8");
  } finally {
    // Best-effort cleanup: a failed removal must not mask the real outcome.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}