56 lines
1.6 KiB
TypeScript
56 lines
1.6 KiB
TypeScript
/**
 * MarkItDown converter — spawns Microsoft's markitdown CLI to convert
 * office documents (PDF, DOCX, PPTX, XLSX) to Markdown.
 *
 * Follows the same Bun.spawn pattern as rpubs/typst-compile.ts.
 */
|
|
|
|
import { writeFile, readFile, mkdir, rm } from "node:fs/promises";
import { basename, join } from "node:path";
import { randomUUID } from "node:crypto";
|
|
|
|
/** Extensions (lowercase, with the leading dot) that this module treats as markitdown-convertible. */
export const MARKITDOWN_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"] as const;

/**
 * Report whether `filename` carries one of the {@link MARKITDOWN_EXTS} extensions.
 *
 * The extension is everything from the last "." onward, compared
 * case-insensitively. A name without any "." is compared as a whole and
 * therefore never matches, since every known extension starts with a dot.
 *
 * @param filename - File name (or path) to inspect.
 * @returns `true` when the trailing extension is in {@link MARKITDOWN_EXTS}.
 */
export function isMarkitdownFormat(filename: string): boolean {
  const dotIndex = filename.lastIndexOf(".");
  const rawSuffix = dotIndex < 0 ? filename : filename.slice(dotIndex);
  const suffix = rawSuffix.toLowerCase();
  return MARKITDOWN_EXTS.some((known) => known === suffix);
}
|
|
|
|
/**
 * Reduce a caller-supplied filename to a single path component.
 *
 * Directory parts are stripped so the name cannot point outside the scratch
 * directory; names that are empty or refer to a directory are rejected.
 */
function toSafeInputName(filename: string): string {
  const name = basename(filename);
  if (name === "" || name === "." || name === "..") {
    throw new Error(
      `markitdown: invalid input filename: ${JSON.stringify(filename)}`,
    );
  }
  return name;
}

/**
 * Convert an office file to Markdown via the markitdown CLI.
 *
 * The bytes are written to a per-call scratch directory under `/tmp`, the
 * CLI is run as `markitdown <input> -o <output>`, and the generated Markdown
 * file is read back. The scratch directory is removed on every exit path.
 *
 * @param filename - Original file name. Only its final path component is
 *   used for the on-disk input file, so its extension is preserved.
 * @param data - Raw contents of the file to convert.
 * @returns The Markdown written by markitdown.
 * @throws Error if `filename` has no usable final component, or if
 *   markitdown exits with a non-zero status (the message includes the exit
 *   code and captured stderr). File-system errors propagate unchanged.
 */
export async function convertWithMarkitdown(
  filename: string,
  data: Uint8Array,
): Promise<string> {
  const safeName = toSafeInputName(filename);
  const tmpDir = join("/tmp", `markitdown-${randomUUID()}`);
  // The input lives in its own subdirectory so a file that happens to be
  // named "output.md" cannot collide with the CLI's output path.
  const inputDir = join(tmpDir, "input");
  const inputPath = join(inputDir, safeName);
  const outputPath = join(tmpDir, "output.md");

  try {
    // Recursive mkdir creates tmpDir as well as inputDir.
    await mkdir(inputDir, { recursive: true });
    await writeFile(inputPath, data);

    const proc = Bun.spawn(["markitdown", inputPath, "-o", outputPath], {
      // Nothing consumes stdout; an unread pipe could fill up and stall the child.
      stdout: "ignore",
      stderr: "pipe",
    });

    // Drain stderr while waiting for exit so a verbose child cannot block
    // on a full pipe before it terminates.
    const [exitCode, stderr] = await Promise.all([
      proc.exited,
      new Response(proc.stderr).text(),
    ]);

    if (exitCode !== 0) {
      throw new Error(`markitdown failed (exit ${exitCode}): ${stderr}`);
    }

    return await readFile(outputPath, "utf-8");
  } finally {
    // Best-effort cleanup: a failure here must not mask the real result or error.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}
|