From 698a630b8b697e5ee6d76e0da1799582cebaf0aa Mon Sep 17 00:00:00 2001 From: Jeff Emmett Date: Mon, 13 Apr 2026 10:17:49 -0400 Subject: [PATCH] feat(rdocs+mi): add MarkItDown integration for PDF/DOCX/PPTX/XLSX conversion Office documents dropped onto canvas or imported via rDocs are now converted to Markdown using Microsoft's markitdown CLI. Canvas drops trigger triage; rDocs imports create rich notes with the original file kept as an attachment. Co-Authored-By: Claude Opus 4.6 --- Dockerfile | 15 ++++++- modules/rdocs/converters/file-import.ts | 21 +++++++++- modules/rdocs/converters/markitdown.ts | 55 +++++++++++++++++++++++++ modules/rdocs/mod.ts | 2 +- server/mi-routes.ts | 25 +++++++++++ website/canvas.html | 34 ++++++++++++++- 6 files changed, 145 insertions(+), 7 deletions(-) create mode 100644 modules/rdocs/converters/markitdown.ts diff --git a/Dockerfile b/Dockerfile index 85d92a71..2e2131ad 100644 --- a/Dockerfile +++ b/Dockerfile @@ -26,16 +26,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl xz-utils c && rm -rf /tmp/typst* \ && chmod +x /usr/local/bin/typst +# MarkItDown venv stage — Python + pip install markitdown with the pdf/docx/pptx/xlsx extras (the core package ships without those format converters) +FROM debian:bookworm-slim AS markitdown +RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-venv \ + && python3 -m venv /opt/markitdown \ + && /opt/markitdown/bin/pip install --no-cache-dir "markitdown[pdf,docx,pptx,xlsx]" \ + && rm -rf /var/lib/apt/lists/* + # Production stage FROM oven/bun:1-slim AS production WORKDIR /app -# Install CA certificates for outbound HTTPS (link-preview, etc.) 
-RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/* +# Install CA certificates + python3 runtime (for markitdown) +RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates python3 && rm -rf /var/lib/apt/lists/* # Install Typst binary (for rPubs PDF generation) COPY --from=typst /usr/local/bin/typst /usr/local/bin/typst +# Install MarkItDown venv (for office document conversion) +COPY --from=markitdown /opt/markitdown /opt/markitdown +ENV PATH="/opt/markitdown/bin:$PATH" + # Copy built assets and server COPY --from=build /app/dist ./dist COPY --from=build /app/server ./server diff --git a/modules/rdocs/converters/file-import.ts b/modules/rdocs/converters/file-import.ts index 0b9baf52..4c67d8e3 100644 --- a/modules/rdocs/converters/file-import.ts +++ b/modules/rdocs/converters/file-import.ts @@ -13,15 +13,16 @@ import TurndownService from 'turndown'; import { markdownToTiptap, extractPlainTextFromTiptap } from './markdown-tiptap'; import { hashContent } from './index'; import type { ConvertedNote } from './index'; +import { isMarkitdownFormat, convertWithMarkitdown } from './markitdown'; const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }); /** Dispatch file import by extension / MIME type. 
*/ -export function importFile( +export async function importFile( filename: string, data: Uint8Array, mimeType?: string, -): ConvertedNote { +): Promise<ConvertedNote> { const ext = filename.substring(filename.lastIndexOf('.')).toLowerCase(); const textContent = () => new TextDecoder().decode(data); @@ -37,6 +38,9 @@ export function importFile( if (['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp'].includes(ext)) { return importImageFile(filename, data, mimeType || guessMime(ext)); } + if (isMarkitdownFormat(filename)) { + return importOfficeFile(filename, data, mimeType); + } // Default: treat as text try { @@ -47,6 +51,19 @@ export function importFile( } } +/** Import an office file (PDF/DOCX/PPTX/XLSX) via markitdown conversion. Resolves to the note built from the converted Markdown, with the original bytes attached; rejects if the conversion rejects. */ +async function importOfficeFile( + filename: string, + data: Uint8Array, + mimeType?: string, +): Promise<ConvertedNote> { + const markdown = await convertWithMarkitdown(filename, data); + const note = importMarkdownFile(filename, markdown); + // Keep original file as attachment alongside the converted content + note.attachments = [{ filename, data, mimeType: mimeType || 'application/octet-stream' }]; + return note; +} + /** Import a markdown file. */ export function importMarkdownFile(filename: string, content: string): ConvertedNote { const title = titleFromFilename(filename); diff --git a/modules/rdocs/converters/markitdown.ts b/modules/rdocs/converters/markitdown.ts new file mode 100644 index 00000000..42c27390 --- /dev/null +++ b/modules/rdocs/converters/markitdown.ts @@ -0,0 +1,55 @@ +/** + * MarkItDown converter — spawns Microsoft's markitdown CLI to convert + * office documents (PDF, DOCX, PPTX, XLSX) to Markdown. + * + * Follows the same Bun.spawn pattern as rpubs/typst-compile.ts. 
+ */ + +import { writeFile, readFile, mkdir, rm } from "node:fs/promises"; +import { basename, join } from "node:path"; +import { randomUUID } from "node:crypto"; + +export const MARKITDOWN_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"] as const; + +/** Check if filename has an extension that markitdown can convert. */ +export function isMarkitdownFormat(filename: string): boolean { + const ext = filename.substring(filename.lastIndexOf(".")).toLowerCase(); + return (MARKITDOWN_EXTS as readonly string[]).includes(ext); +} + +/** Convert an office file to Markdown via the markitdown CLI. Writes `data` into a per-call temp directory under /tmp, runs `markitdown <input> -o output.md`, and resolves to the Markdown text; throws if the CLI exits non-zero. The temp directory is removed in every case. */ +export async function convertWithMarkitdown( + filename: string, + data: Uint8Array, +): Promise<string> { + const jobId = randomUUID(); + const tmpDir = join("/tmp", `markitdown-${jobId}`); + await mkdir(tmpDir, { recursive: true }); + /* filename is caller-supplied (e.g. an uploaded file's name): keep only its last path segment so the write cannot land outside tmpDir. */ + const inputPath = join(tmpDir, basename(filename)); + const outputPath = join(tmpDir, "output.md"); + + try { + await writeFile(inputPath, data); + + const proc = Bun.spawn( + ["markitdown", inputPath, "-o", outputPath], + { + stdout: "ignore", + stderr: "pipe", + }, + ); + + /* stdout is unused (the result goes to the -o file). Read stderr while waiting for exit: an undrained pipe can stall a child that writes a lot of diagnostics. */ + const [exitCode, stderr] = await Promise.all([proc.exited, new Response(proc.stderr).text()]); + + if (exitCode !== 0) { + throw new Error(`markitdown failed (exit ${exitCode}): ${stderr}`); + } + + const markdown = await readFile(outputPath, "utf-8"); + return markdown; + } finally { + await rm(tmpDir, { recursive: true, force: true }).catch(() => {}); + } +} diff --git a/modules/rdocs/mod.ts b/modules/rdocs/mod.ts index 13387505..da2a5e1e 100644 --- a/modules/rdocs/mod.ts +++ b/modules/rdocs/mod.ts @@ -742,7 +742,7 @@ routes.post("/api/import/files", async (c) => { for (const file of files) { try { const data = new Uint8Array(await file.arrayBuffer()); - const note = importFile(file.name, data, file.type || undefined); + const note = await importFile(file.name, data, file.type || undefined); convertedNotes.push(note); } catch (err) { warnings.push(`Failed to import ${file.name}: ${(err as Error).message}`); 
diff --git a/server/mi-routes.ts b/server/mi-routes.ts index 7918b092..4010e780 100644 --- a/server/mi-routes.ts +++ b/server/mi-routes.ts @@ -28,6 +28,7 @@ import { getRecentTasksForMI } from "../modules/rtasks/mod"; import { getRecentDocsForMI } from "../modules/rdocs/mod"; import { generateImage, generateVideoViaFal } from "./mi-media"; import { queryModuleContent } from "./mi-data-queries"; +import { convertWithMarkitdown, isMarkitdownFormat } from "../modules/rdocs/converters/markitdown"; const mi = new Hono(); @@ -561,6 +562,30 @@ mi.post("/execute-server-action", async (c) => { } }); +// ── POST /extract-text — convert office files to markdown via markitdown ── + +mi.post("/extract-text", async (c) => { + const formData = await c.req.formData(); + const file = formData.get("file"); + if (!file || typeof file === "string" || !("arrayBuffer" in file)) { + return c.json({ error: "file required (FormData)" }, 400); + } + + const filename = (file as File).name || "upload"; + if (!isMarkitdownFormat(filename)) { + return c.json({ error: `Unsupported format: ${filename}` }, 400); + } + + try { + const data = new Uint8Array(await (file as File).arrayBuffer()); + const markdown = await convertWithMarkitdown(filename, data); + return c.json({ markdown, filename }); + } catch (e: any) { + console.error("[mi/extract-text] Error:", e.message); + return c.json({ error: "Conversion failed: " + e.message }, 500); + } +}); + // ── POST /suggestions — dynamic data-driven suggestions ── mi.post("/suggestions", async (c) => { diff --git a/website/canvas.html b/website/canvas.html index e43108f5..88f29120 100644 --- a/website/canvas.html +++ b/website/canvas.html @@ -4450,6 +4450,24 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. 
Ask clarifying quest reader.readAsDataURL(file); } + async function handleOfficeFile(file) { + try { + const form = new FormData(); + form.append("file", file); + const res = await fetch("/api/mi/extract-text", { method: "POST", body: form }); + const data = await res.json(); + if (data.error) { + console.error("[canvas] office extract failed:", data.error); + return; + } + if (data.markdown) { + startTriage(data.markdown, "drop"); + } + } catch (err) { + console.error("[canvas] office file conversion failed:", err); + } + } + function handleUrl(url) { if (IMAGE_EXT_RE.test(url)) { window.__canvasApi.newShape("folk-image", { src: url }); @@ -4481,7 +4499,19 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest dragEnterCount = 0; overlay.classList.remove("active"); - // 1. Check for image files + // 1. Check for office files (PDF, DOCX, PPTX, XLSX) → extract text → triage + const OFFICE_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"]; + const officeFile = Array.from(e.dataTransfer?.files || []).find(f => { + const ext = f.name.substring(f.name.lastIndexOf(".")).toLowerCase(); + return OFFICE_EXTS.includes(ext); + }); + if (officeFile) { + e.preventDefault(); + handleOfficeFile(officeFile); + return; + } + + // 2. Check for image files const imageFile = Array.from(e.dataTransfer?.files || []).find(f => f.type.startsWith("image/")); if (imageFile) { e.preventDefault(); @@ -4489,7 +4519,7 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest return; } - // 2. Check for text/URL + // 3. Check for text/URL const text = (e.dataTransfer?.getData("text/plain") || e.dataTransfer?.getData("text/uri-list") || "").trim(); if (text) { e.preventDefault();