/**
 * MarkItDown converter — spawns Microsoft's markitdown CLI to convert
 * office documents (PDF, DOCX, PPTX, XLSX) to Markdown.
 *
 * Follows the same Bun.spawn pattern as rpubs/typst-compile.ts.
 */

import { writeFile, readFile, mkdir, rm } from "node:fs/promises";
import { basename, join } from "node:path";
import { randomUUID } from "node:crypto";

/** Lower-case file extensions (leading dot included) accepted for conversion. */
export const MARKITDOWN_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"] as const;

/**
 * Check if filename has an extension that markitdown can convert.
 *
 * The extension is everything from the last "." onwards, compared
 * case-insensitively against {@link MARKITDOWN_EXTS}.
 *
 * @param filename - File name (or path) to inspect.
 * @returns `true` when the extension is one of {@link MARKITDOWN_EXTS}.
 */
export function isMarkitdownFormat(filename: string): boolean {
  const dotIndex = filename.lastIndexOf(".");
  if (dotIndex === -1) {
    // No dot at all, so there is no extension to match.
    return false;
  }
  const ext = filename.substring(dotIndex).toLowerCase();
  return (MARKITDOWN_EXTS as readonly string[]).includes(ext);
}

/**
 * Reduce a caller-supplied file name to a single path segment that is safe to
 * join onto the scratch directory.
 */
function toSafeInputName(filename: string): string {
  const base = basename(filename);
  // "", "." and ".." would resolve to the scratch directory or its parent.
  return base === "" || base === "." || base === ".." ? "input" : base;
}

/**
 * Convert an office file to Markdown via the markitdown CLI.
 *
 * The bytes are written to a per-call scratch directory under `/tmp`,
 * `markitdown <input> -o <output>` is run on them, and the resulting Markdown
 * is read back. The scratch directory is removed afterwards whether or not
 * the conversion succeeded.
 *
 * @param filename - Original file name. Only its final path segment is used
 *   for the temporary input file, so directory components such as `../`
 *   cannot place the file outside the scratch directory.
 * @param data - Raw file contents.
 * @returns The Markdown text written by markitdown.
 * @throws Error when markitdown exits with a non-zero status; the message
 *   carries the exit code and the captured stderr. File-system and spawn
 *   errors propagate unchanged.
 */
export async function convertWithMarkitdown(
  filename: string,
  data: Uint8Array,
): Promise<string> {
  const jobId = randomUUID();
  const tmpDir = join("/tmp", `markitdown-${jobId}`);
  await mkdir(tmpDir, { recursive: true });

  const inputPath = join(tmpDir, toSafeInputName(filename));
  const outputPath = join(tmpDir, "output.md");

  try {
    await writeFile(inputPath, data);

    const proc = Bun.spawn(["markitdown", inputPath, "-o", outputPath], {
      // The result goes to the -o file; stdout is never read, so do not pipe it.
      stdout: "ignore",
      stderr: "pipe",
    });

    // Drain stderr while waiting for exit so a chatty child cannot stall on a
    // full pipe.
    const [exitCode, stderr] = await Promise.all([
      proc.exited,
      new Response(proc.stderr).text(),
    ]);

    if (exitCode !== 0) {
      throw new Error(`markitdown failed (exit ${exitCode}): ${stderr}`);
    }

    return await readFile(outputPath, "utf-8");
  } finally {
    // Best-effort cleanup: a failure here must not mask the conversion result.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}