Merge branch 'dev'
CI/CD / deploy (push) Successful in 6m3s
Details
CI/CD / deploy (push) Successful in 6m3s
Details
This commit is contained in:
commit
9a857c7bc2
15
Dockerfile
15
Dockerfile
|
|
@ -26,16 +26,27 @@ RUN apt-get update && apt-get install -y --no-install-recommends curl xz-utils c
|
|||
&& rm -rf /tmp/typst* \
|
||||
&& chmod +x /usr/local/bin/typst
|
||||
|
||||
# MarkItDown venv stage — Python + pip install markitdown
|
||||
FROM debian:bookworm-slim AS markitdown
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends python3 python3-venv \
|
||||
&& python3 -m venv /opt/markitdown \
|
||||
&& /opt/markitdown/bin/pip install --no-cache-dir markitdown \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Production stage
|
||||
FROM oven/bun:1-slim AS production
|
||||
WORKDIR /app
|
||||
|
||||
# Install CA certificates for outbound HTTPS (link-preview, etc.)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates && rm -rf /var/lib/apt/lists/*
|
||||
# Install CA certificates + python3 runtime (for markitdown)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates python3 && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Typst binary (for rPubs PDF generation)
|
||||
COPY --from=typst /usr/local/bin/typst /usr/local/bin/typst
|
||||
|
||||
# Install MarkItDown venv (for office document conversion)
|
||||
COPY --from=markitdown /opt/markitdown /opt/markitdown
|
||||
ENV PATH="/opt/markitdown/bin:$PATH"
|
||||
|
||||
# Copy built assets and server
|
||||
COPY --from=build /app/dist ./dist
|
||||
COPY --from=build /app/server ./server
|
||||
|
|
|
|||
|
|
@ -13,15 +13,16 @@ import TurndownService from 'turndown';
|
|||
import { markdownToTiptap, extractPlainTextFromTiptap } from './markdown-tiptap';
|
||||
import { hashContent } from './index';
|
||||
import type { ConvertedNote } from './index';
|
||||
import { isMarkitdownFormat, convertWithMarkitdown } from './markitdown';
|
||||
|
||||
const turndown = new TurndownService({ headingStyle: 'atx', codeBlockStyle: 'fenced' });
|
||||
|
||||
/** Dispatch file import by extension / MIME type. */
|
||||
export function importFile(
|
||||
export async function importFile(
|
||||
filename: string,
|
||||
data: Uint8Array,
|
||||
mimeType?: string,
|
||||
): ConvertedNote {
|
||||
): Promise<ConvertedNote> {
|
||||
const ext = filename.substring(filename.lastIndexOf('.')).toLowerCase();
|
||||
const textContent = () => new TextDecoder().decode(data);
|
||||
|
||||
|
|
@ -37,6 +38,9 @@ export function importFile(
|
|||
if (['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.bmp'].includes(ext)) {
|
||||
return importImageFile(filename, data, mimeType || guessMime(ext));
|
||||
}
|
||||
if (isMarkitdownFormat(filename)) {
|
||||
return importOfficeFile(filename, data, mimeType);
|
||||
}
|
||||
|
||||
// Default: treat as text
|
||||
try {
|
||||
|
|
@ -47,6 +51,19 @@ export function importFile(
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Import an office document (PDF/DOCX/PPTX/XLSX).
 *
 * The bytes are converted to Markdown with `convertWithMarkitdown`, the
 * Markdown is turned into a note by `importMarkdownFile`, and the untouched
 * upload is then stored as the note's only attachment.
 *
 * @param filename - Name of the uploaded file; passed to both helpers.
 * @param data - Raw bytes of the uploaded file.
 * @param mimeType - MIME type recorded on the attachment; when missing or
 *   empty, `application/octet-stream` is used.
 * @returns The converted note with the original file attached.
 */
async function importOfficeFile(
  filename: string,
  data: Uint8Array,
  mimeType?: string,
): Promise<ConvertedNote> {
  const fallbackMime = 'application/octet-stream';

  const converted = await convertWithMarkitdown(filename, data);
  const note = importMarkdownFile(filename, converted);

  // Replace the attachment list so the source document travels with the note.
  note.attachments = [{ filename, data, mimeType: mimeType || fallbackMime }];
  return note;
}
|
||||
|
||||
/** Import a markdown file. */
|
||||
export function importMarkdownFile(filename: string, content: string): ConvertedNote {
|
||||
const title = titleFromFilename(filename);
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
/**
|
||||
* MarkItDown converter — spawns Microsoft's markitdown CLI to convert
|
||||
* office documents (PDF, DOCX, PPTX, XLSX) to Markdown.
|
||||
*
|
||||
* Follows the same Bun.spawn pattern as rpubs/typst-compile.ts.
|
||||
*/
|
||||
|
||||
import { writeFile, readFile, mkdir, rm } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { randomUUID } from "node:crypto";
|
||||
|
||||
/** Lower-case file extensions (leading dot included) accepted for markitdown conversion. */
export const MARKITDOWN_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"] as const;

/**
 * Report whether `filename` ends in one of `MARKITDOWN_EXTS`.
 *
 * The comparison is case-insensitive and looks only at the text from the last
 * `.` onwards. A name without any dot is compared as a whole, so it never
 * matches (every known extension starts with a dot).
 *
 * @param filename - File name to inspect.
 * @returns `true` when the trailing extension is a known markitdown format.
 */
export function isMarkitdownFormat(filename: string): boolean {
  const lastDot = filename.lastIndexOf(".");
  // No dot (-1) → start at 0, i.e. take the whole name.
  const suffix = filename.slice(Math.max(lastDot, 0)).toLowerCase();
  return MARKITDOWN_EXTS.some((known) => known === suffix);
}
|
||||
|
||||
/**
 * Build the name used for the temporary input file.
 *
 * The caller-supplied file name is never used as a path component: only its
 * trailing extension is kept, and only when it is a short run of ASCII
 * letters/digits. This keeps names such as `../../x.pdf` from resolving
 * outside the per-job temp directory.
 */
function markitdownInputName(filename: string): string {
  const dot = filename.lastIndexOf(".");
  const ext = dot >= 0 ? filename.slice(dot).toLowerCase() : "";
  return /^\.[a-z0-9]{1,16}$/.test(ext) ? `input${ext}` : "input";
}

/**
 * Convert an office file to Markdown via the markitdown CLI.
 *
 * Writes `data` into a fresh `/tmp/markitdown-<uuid>` directory, runs
 * `markitdown <input> -o <output>`, and returns the contents of the output
 * file. The temp directory is removed afterwards whether or not the
 * conversion succeeded.
 *
 * @param filename - Original upload name. Only its extension is reused for the
 *   temp file (presumably markitdown relies on it to detect the format —
 *   confirm against the CLI docs).
 * @param data - Raw bytes of the document.
 * @returns The generated Markdown.
 * @throws Error when markitdown exits with a non-zero code; the message
 *   carries the exit code and the captured stderr text.
 */
export async function convertWithMarkitdown(
  filename: string,
  data: Uint8Array,
): Promise<string> {
  const jobId = randomUUID();
  const tmpDir = join("/tmp", `markitdown-${jobId}`);
  await mkdir(tmpDir, { recursive: true });

  // Fixed, sanitized name: `filename` comes from the upload and may contain
  // path separators or `..` segments.
  const inputPath = join(tmpDir, markitdownInputName(filename));
  const outputPath = join(tmpDir, "output.md");

  try {
    await writeFile(inputPath, data);

    const proc = Bun.spawn(["markitdown", inputPath, "-o", outputPath], {
      // The result is written to `outputPath`, so stdout is not needed;
      // leaving it as an unread pipe could stall a chatty child.
      stdout: "ignore",
      stderr: "pipe",
    });

    // Drain stderr while waiting for exit so a full pipe cannot block the child.
    const [exitCode, stderr] = await Promise.all([
      proc.exited,
      new Response(proc.stderr).text(),
    ]);

    if (exitCode !== 0) {
      throw new Error(`markitdown failed (exit ${exitCode}): ${stderr}`);
    }

    return await readFile(outputPath, "utf-8");
  } finally {
    // Best-effort cleanup; a failure to delete must not mask the real outcome.
    await rm(tmpDir, { recursive: true, force: true }).catch(() => {});
  }
}
|
||||
|
|
@ -742,7 +742,7 @@ routes.post("/api/import/files", async (c) => {
|
|||
for (const file of files) {
|
||||
try {
|
||||
const data = new Uint8Array(await file.arrayBuffer());
|
||||
const note = importFile(file.name, data, file.type || undefined);
|
||||
const note = await importFile(file.name, data, file.type || undefined);
|
||||
convertedNotes.push(note);
|
||||
} catch (err) {
|
||||
warnings.push(`Failed to import ${file.name}: ${(err as Error).message}`);
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ import { getRecentTasksForMI } from "../modules/rtasks/mod";
|
|||
import { getRecentDocsForMI } from "../modules/rdocs/mod";
|
||||
import { generateImage, generateVideoViaFal } from "./mi-media";
|
||||
import { queryModuleContent } from "./mi-data-queries";
|
||||
import { convertWithMarkitdown, isMarkitdownFormat } from "../modules/rdocs/converters/markitdown";
|
||||
|
||||
const mi = new Hono();
|
||||
|
||||
|
|
@ -561,6 +562,30 @@ mi.post("/execute-server-action", async (c) => {
|
|||
}
|
||||
});
|
||||
|
||||
// ── POST /extract-text — convert office files to markdown via markitdown ──

/**
 * Accepts a multipart form with a `file` field, converts it with
 * `convertWithMarkitdown`, and responds with `{ markdown, filename }`.
 *
 * Responses:
 * - 400 `{ error }` when the body is not parseable form data, the `file`
 *   field is missing or not a file, or the extension is not accepted by
 *   `isMarkitdownFormat`.
 * - 500 `{ error }` when reading the upload or the conversion throws.
 */
mi.post("/extract-text", async (c) => {
  // formData() rejects when the request body is not valid form data; treat
  // that as a client error rather than letting it escape the handler.
  let formData: FormData;
  try {
    formData = await c.req.formData();
  } catch {
    return c.json({ error: "file required (FormData)" }, 400);
  }

  const file = formData.get("file");
  if (!file || typeof file === "string" || !("arrayBuffer" in file)) {
    return c.json({ error: "file required (FormData)" }, 400);
  }

  const upload = file as File;
  const filename = upload.name || "upload";
  if (!isMarkitdownFormat(filename)) {
    return c.json({ error: `Unsupported format: ${filename}` }, 400);
  }

  try {
    const data = new Uint8Array(await upload.arrayBuffer());
    const markdown = await convertWithMarkitdown(filename, data);
    return c.json({ markdown, filename });
  } catch (e: unknown) {
    // Anything can be thrown; only Error instances are known to carry `.message`.
    const message = e instanceof Error ? e.message : String(e);
    console.error("[mi/extract-text] Error:", message);
    return c.json({ error: "Conversion failed: " + message }, 500);
  }
});
|
||||
|
||||
// ── POST /suggestions — dynamic data-driven suggestions ──
|
||||
|
||||
mi.post("/suggestions", async (c) => {
|
||||
|
|
|
|||
|
|
@ -4450,6 +4450,24 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest
|
|||
reader.readAsDataURL(file);
|
||||
}
|
||||
|
||||
/**
 * Upload a dropped office file to /api/mi/extract-text and, when the response
 * contains Markdown, pass it to startTriage with the "drop" source tag.
 *
 * Nothing is returned or thrown: an `error` field in the response, a network
 * failure, or a non-JSON body is only logged to the console.
 */
async function handleOfficeFile(file) {
  try {
    const payload = new FormData();
    payload.append("file", file);

    const response = await fetch("/api/mi/extract-text", { method: "POST", body: payload });
    const { error, markdown } = await response.json();

    if (error) {
      console.error("[canvas] office extract failed:", error);
      return;
    }

    // An empty or missing markdown field means there is nothing to triage.
    if (markdown) startTriage(markdown, "drop");
  } catch (err) {
    console.error("[canvas] office file conversion failed:", err);
  }
}
|
||||
|
||||
function handleUrl(url) {
|
||||
if (IMAGE_EXT_RE.test(url)) {
|
||||
window.__canvasApi.newShape("folk-image", { src: url });
|
||||
|
|
@ -4481,7 +4499,19 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest
|
|||
dragEnterCount = 0;
|
||||
overlay.classList.remove("active");
|
||||
|
||||
// 1. Check for image files
|
||||
// 1. Check for office files (PDF, DOCX, PPTX, XLSX) → extract text → triage
|
||||
const OFFICE_EXTS = [".pdf", ".docx", ".pptx", ".xlsx"];
|
||||
const officeFile = Array.from(e.dataTransfer?.files || []).find(f => {
|
||||
const ext = f.name.substring(f.name.lastIndexOf(".")).toLowerCase();
|
||||
return OFFICE_EXTS.includes(ext);
|
||||
});
|
||||
if (officeFile) {
|
||||
e.preventDefault();
|
||||
handleOfficeFile(officeFile);
|
||||
return;
|
||||
}
|
||||
|
||||
// 2. Check for image files
|
||||
const imageFile = Array.from(e.dataTransfer?.files || []).find(f => f.type.startsWith("image/"));
|
||||
if (imageFile) {
|
||||
e.preventDefault();
|
||||
|
|
@ -4489,7 +4519,7 @@ Use real coordinates, YYYY-MM-DD dates, ISO currency codes. Ask clarifying quest
|
|||
return;
|
||||
}
|
||||
|
||||
// 2. Check for text/URL
|
||||
// 3. Check for text/URL
|
||||
const text = (e.dataTransfer?.getData("text/plain") || e.dataTransfer?.getData("text/uri-list") || "").trim();
|
||||
if (text) {
|
||||
e.preventDefault();
|
||||
|
|
|
|||
Loading…
Reference in New Issue