Merge branch 'dev'
CI/CD / deploy (push) Successful in 6m3s Details

This commit is contained in:
Jeff Emmett 2026-04-13 10:17:59 -04:00
commit 9a857c7bc2
6 changed files with 145 additions and 7 deletions

View File

@ -26,16 +26,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl xz-utils c
&& rm -rf /tmp/typst* \ && rm -rf /tmp/typst* \
&& chmod +x /usr/local/bin/typst && chmod +x /usr/local/bin/typst
# MarkItDown venv stage — Python + pip install markitdown
# The per-format extras are requested explicitly: recent markitdown releases ship
# the PDF/DOCX/PPTX/XLSX converters as optional dependency groups, so the bare
# package alone cannot convert the office formats the server hands to it.
FROM debian:bookworm-slim AS markitdown
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-venv \
    && python3 -m venv /opt/markitdown \
    && /opt/markitdown/bin/pip install --no-cache-dir "markitdown[pdf,docx,pptx,xlsx]" \
    && rm -rf /var/lib/apt/lists/*
# Production stage # Production stage
FROM oven/bun:1-slim AS production FROM oven/bun:1-slim AS production
WORKDIR /app WORKDIR /app
# Install CA certificates for outbound HTTPS (link-preview, etc.) # Install CA certificates + python3 runtime (for markitdown)
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates python3 && rm -rf /var/lib/apt/lists/*
# Install Typst binary (for rPubs PDF generation) # Install Typst binary (for rPubs PDF generation)
COPY --from=typst /usr/local/bin/typst /usr/local/bin/typst COPY --from=typst /usr/local/bin/typst /usr/local/bin/typst
# Install MarkItDown venv (for office document conversion)
COPY --from=markitdown /opt/markitdown /opt/markitdown
ENV PATH="/opt/markitdown/bin:$PATH"
# Copy built assets and server # Copy built assets and server
COPY --from=build /app/dist ./dist COPY --from=build /app/dist ./dist
COPY --from=build /app/server ./server COPY --from=build /app/server ./server

View File

@ -13,15 +13,16 @@ import TurndownService from 'turndown';
import { markdownToTiptap, extractPlainTextFromTiptap } from './markdown-tiptap'; import { markdownToTiptap, extractPlainTextFromTiptap } from './markdown-tiptap';
import { hashContent } from './index'; import { hashContent } from './index';
import type { ConvertedNote } from './index'; import type { ConvertedNote } from './index';
import { isMarkitdownFormat, convertWithMarkitdown } from './markitdown';
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' }); const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
/** Dispatch file import by extension / MIME type. */ /** Dispatch file import by extension / MIME type. */
export function importFile( export async function importFile(
filename: string, filename: string,
data: Uint8Array, data: Uint8Array,
mimeType?: string, mimeType?: string,
): ConvertedNote { ): Promise<ConvertedNote> {
const ext = filename.substring(filename.lastIndexOf('.')).toLowerCase(); const ext = filename.substring(filename.lastIndexOf('.')).toLowerCase();
const textContent = () => new TextDecoder().decode(data); const textContent = () => new TextDecoder().decode(data);
@ -37,6 +38,9 @@ export function importFile(
if (['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp'].includes(ext)) { if (['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp'].includes(ext)) {
return importImageFile(filename, data, mimeType || guessMime(ext)); return importImageFile(filename, data, mimeType || guessMime(ext));
} }
if (isMarkitdownFormat(filename)) {
return importOfficeFile(filename, data, mimeType);
}
// Default: treat as text // Default: treat as text
try { try {
@ -47,6 +51,19 @@ export function importFile(
} }
} }
/**
 * Convert an office document (PDF/DOCX/PPTX/XLSX) to Markdown with markitdown
 * and import the result as a note.
 *
 * The untouched upload is attached to the returned note so the source document
 * travels with the converted text.
 *
 * @param filename - Name of the uploaded file; passed to the converter and the Markdown importer.
 * @param data - Raw bytes of the uploaded file.
 * @param mimeType - MIME type reported for the upload; `application/octet-stream` when absent or empty.
 * @returns The note built from the converted Markdown, carrying the original file as its attachment.
 */
async function importOfficeFile(
  filename: string,
  data: Uint8Array,
  mimeType?: string,
): Promise<ConvertedNote> {
  const converted = importMarkdownFile(filename, await convertWithMarkitdown(filename, data));
  const attachmentMime = mimeType ? mimeType : 'application/octet-stream';
  converted.attachments = [{ filename, data, mimeType: attachmentMime }];
  return converted;
}
/** Import a markdown file. */ /** Import a markdown file. */
export function importMarkdownFile(filename: string, content: string): ConvertedNote { export function importMarkdownFile(filename: string, content: string): ConvertedNote {
const title = titleFromFilename(filename); const title = titleFromFilename(filename);

View File

@ -0,0 +1,55 @@
/**
* MarkItDown converter — spawns Microsoft's markitdown CLI to convert
* office documents (PDF, DOCX, PPTX, XLSX) to Markdown.
*
* Follows the same Bun.spawn pattern as rpubs/typst-compile.ts.
*/
import { writeFile, readFile, mkdir, rm } from "node:fs/promises";
import { join } from "node:path";
import { randomUUID } from "node:crypto";
/** Lower-case file extensions (leading dot included) that are routed to markitdown. */
export const MARKITDOWN_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"] as const;

/**
 * Report whether `filename` ends in one of the {@link MARKITDOWN_EXTS} extensions.
 *
 * The comparison is case-insensitive and only looks at the text from the last
 * dot onward; a name without any dot is compared as a whole and never matches.
 */
export function isMarkitdownFormat(filename: string): boolean {
  const lastDot = filename.lastIndexOf(".");
  // No dot at all: fall back to the whole name, which cannot equal a dotted extension.
  const suffix = filename.slice(Math.max(lastDot, 0)).toLowerCase();
  return MARKITDOWN_EXTS.some((known) => known === suffix);
}
/**
 * Convert an office file to Markdown via the markitdown CLI.
 *
 * The bytes are written into a fresh per-job directory under `/tmp`, the
 * `markitdown` executable is run against that copy, and the generated Markdown
 * is read back. The job directory is removed afterwards whether or not the
 * conversion succeeded.
 *
 * @param filename - Original upload name. Only its extension is used, to name the temp copy.
 * @param data - Raw bytes of the document to convert.
 * @returns The Markdown text written by markitdown.
 * @throws Error when markitdown exits with a non-zero status; the message carries its stderr output.
 */
export async function convertWithMarkitdown(
  filename: string,
  data: Uint8Array,
): Promise<string> {
  const jobId = randomUUID();
  const tmpDir = join("/tmp", `markitdown-${jobId}`);
  await mkdir(tmpDir, { recursive: true });

  // `filename` comes from the upload and may contain path separators or `..`.
  // Never join it into a filesystem path: keep only a plain alphanumeric
  // extension and give the temp copy a fixed base name.
  const dotIndex = filename.lastIndexOf(".");
  const rawExt = dotIndex >= 0 ? filename.substring(dotIndex).toLowerCase() : "";
  const safeExt = /^\.[a-z0-9]+$/.test(rawExt) ? rawExt : "";
  const inputPath = join(tmpDir, `input${safeExt}`);
  const outputPath = join(tmpDir, "output.md");

  try {
    await writeFile(inputPath, data);

    const proc = Bun.spawn(
      ["markitdown", inputPath, "-o", outputPath],
      {
        // The result is written to `outputPath`, so stdout is not needed; an
        // unread pipe could stall the child once its buffer fills.
        stdout: "ignore",
        stderr: "pipe",
      },
    );

    // Drain stderr while waiting for exit, for the same pipe-buffer reason.
    const [exitCode, stderr] = await Promise.all([
      proc.exited,
      new Response(proc.stderr).text(),
    ]);
    if (exitCode !== 0) {
      throw new Error(`markitdown failed (exit ${exitCode}): ${stderr}`);
    }

    return await readFile(outputPath, "utf-8");
  } finally {
    // Best-effort cleanup: a failed removal must not mask the conversion result.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}

View File

@ -742,7 +742,7 @@ routes.post("/api/import/files", async (c) => {
for (const file of files) { for (const file of files) {
try { try {
const data = new Uint8Array(await file.arrayBuffer()); const data = new Uint8Array(await file.arrayBuffer());
const note = importFile(file.name, data, file.type || undefined); const note = await importFile(file.name, data, file.type || undefined);
convertedNotes.push(note); convertedNotes.push(note);
} catch (err) { } catch (err) {
warnings.push(`Failed to import ${file.name}: ${(err as Error).message}`); warnings.push(`Failed to import ${file.name}: ${(err as Error).message}`);

View File

@ -28,6 +28,7 @@ import { getRecentTasksForMI } from "../modules/rtasks/mod";
import { getRecentDocsForMI } from "../modules/rdocs/mod"; import { getRecentDocsForMI } from "../modules/rdocs/mod";
import { generateImage, generateVideoViaFal } from "./mi-media"; import { generateImage, generateVideoViaFal } from "./mi-media";
import { queryModuleContent } from "./mi-data-queries"; import { queryModuleContent } from "./mi-data-queries";
import { convertWithMarkitdown, isMarkitdownFormat } from "../modules/rdocs/converters/markitdown";
const mi = new Hono(); const mi = new Hono();
@ -561,6 +562,30 @@ mi.post("/execute-server-action", async (c) => {
} }
}); });
// ── POST /extract-text — convert office files to markdown via markitdown ──
// Expects a form-data body with a `file` field holding a PDF/DOCX/PPTX/XLSX
// upload. Responds with `{ markdown, filename }` on success, `{ error }` with
// status 400 for a missing/unsupported upload, or 500 when conversion fails.
mi.post("/extract-text", async (c) => {
  // Body parsing rejects when the request is not valid form data; that is a
  // client error, so answer 400 instead of letting it escape the handler.
  let formData: FormData;
  try {
    formData = await c.req.formData();
  } catch {
    return c.json({ error: "file required (FormData)" }, 400);
  }

  const file = formData.get("file");
  if (!file || typeof file === "string" || !("arrayBuffer" in file)) {
    return c.json({ error: "file required (FormData)" }, 400);
  }
  // The guard above has ruled out the string and missing cases.
  const upload = file as File;

  const filename = upload.name || "upload";
  if (!isMarkitdownFormat(filename)) {
    return c.json({ error: `Unsupported format: ${filename}` }, 400);
  }

  try {
    const data = new Uint8Array(await upload.arrayBuffer());
    const markdown = await convertWithMarkitdown(filename, data);
    return c.json({ markdown, filename });
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    console.error("[mi/extract-text] Error:", message);
    return c.json({ error: "Conversion failed: " + message }, 500);
  }
});
// ── POST /suggestions — dynamic data-driven suggestions ── // ── POST /suggestions — dynamic data-driven suggestions ──
mi.post("/suggestions", async (c) => { mi.post("/suggestions", async (c) => {

View File

@ -4450,6 +4450,24 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest
reader.readAsDataURL(file); reader.readAsDataURL(file);
} }
/**
 * Upload a dropped office document to the extract-text endpoint and, when
 * Markdown comes back, hand it to triage as a "drop".
 *
 * Failures (error payload, network or parse errors) are logged to the console
 * and otherwise ignored; nothing is returned.
 */
async function handleOfficeFile(file) {
  try {
    const payload = new FormData();
    payload.append("file", file);

    const response = await fetch("/api/mi/extract-text", { method: "POST", body: payload });
    const result = await response.json();

    if (result.error) {
      console.error("[canvas] office extract failed:", result.error);
      return;
    }
    if (!result.markdown) {
      // Nothing was extracted, so there is nothing to triage.
      return;
    }
    startTriage(result.markdown, "drop");
  } catch (err) {
    console.error("[canvas] office file conversion failed:", err);
  }
}
function handleUrl(url) { function handleUrl(url) {
if (IMAGE_EXT_RE.test(url)) { if (IMAGE_EXT_RE.test(url)) {
window.__canvasApi.newShape("folk-image", { src: url }); window.__canvasApi.newShape("folk-image", { src: url });
@ -4481,7 +4499,19 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest
dragEnterCount = 0; dragEnterCount = 0;
overlay.classList.remove("active"); overlay.classList.remove("active");
// 1. Check for image files // 1. Check for office files (PDF, DOCX, PPTX, XLSX) → extract text → triage
const OFFICE_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"];
const officeFile = Array.from(e.dataTransfer?.files || []).find(f => {
const ext = f.name.substring(f.name.lastIndexOf(".")).toLowerCase();
return OFFICE_EXTS.includes(ext);
});
if (officeFile) {
e.preventDefault();
handleOfficeFile(officeFile);
return;
}
// 2. Check for image files
const imageFile = Array.from(e.dataTransfer?.files || []).find(f => f.type.startsWith("image/")); const imageFile = Array.from(e.dataTransfer?.files || []).find(f => f.type.startsWith("image/"));
if (imageFile) { if (imageFile) {
e.preventDefault(); e.preventDefault();
@ -4489,7 +4519,7 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest
return; return;
} }
// 2. Check for text/URL // 3. Check for text/URL
const text = (e.dataTransfer?.getData("text/plain") || e.dataTransfer?.getData("text/uri-list") || "").trim(); const text = (e.dataTransfer?.getData("text/plain") || e.dataTransfer?.getData("text/uri-list") || "").trim();
if (text) { if (text) {
e.preventDefault(); e.preventDefault();