From f88e1082f0051b3c67450d2da3f35f9afbaa2afe Mon Sep 17 00:00:00 2001
From: Jeff Emmett
Date: Sun, 14 Dec 2025 12:52:16 -0500
Subject: [PATCH] Fix ai-internal network name, add PDF OCR service
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Correct network name from ai-orchestrator_ai-internal to ai-internal
- Add pdf-ocr service for ocr.jeffemmett.com (Dockerfile, app.py, compose)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5
---
 docker-compose.yml         |   2 +-
 pdf-ocr/Dockerfile         |  29 ++++
 pdf-ocr/app.py             | 294 +++++++++++++++++++++++++++++++++++++
 pdf-ocr/docker-compose.yml |  20 +++
 4 files changed, 344 insertions(+), 1 deletion(-)
 create mode 100644 pdf-ocr/Dockerfile
 create mode 100644 pdf-ocr/app.py
 create mode 100644 pdf-ocr/docker-compose.yml

diff --git a/docker-compose.yml b/docker-compose.yml
index 63795b1..bdf469b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -33,4 +33,4 @@ networks:
     external: true
   ai-internal:
     external: true
-    name: ai-orchestrator_ai-internal
+    name: ai-internal
diff --git a/pdf-ocr/Dockerfile b/pdf-ocr/Dockerfile
new file mode 100644
index 0000000..fdea62d
--- /dev/null
+++ b/pdf-ocr/Dockerfile
@@ -0,0 +1,29 @@
+FROM python:3.11-slim
+
+# Install system dependencies for OCR
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    ocrmypdf \
+    tesseract-ocr \
+    tesseract-ocr-eng \
+    ghostscript \
+    unpaper \
+    pngquant \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+RUN pip install --no-cache-dir \
+    fastapi \
+    uvicorn[standard] \
+    python-multipart \
+    aiofiles
+
+WORKDIR /app
+
+# Create directories
+RUN mkdir -p /app/uploads /app/processed
+
+COPY app.py /app/
+
+EXPOSE 8000
+
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/pdf-ocr/app.py b/pdf-ocr/app.py
new file mode 100644
index 0000000..cfe0696
--- /dev/null
+++ b/pdf-ocr/app.py
@@ -0,0 +1,294 @@
+"""
+PDF OCR Service
+Converts image-based PDFs to searchable text PDFs using ocrmypdf
+"""
+import os
+import subprocess
+import tempfile
+import asyncio
+import shutil
+import uuid
+from pathlib import Path
+from typing import Optional
+from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
+from fastapi.responses import FileResponse, JSONResponse
+from fastapi.middleware.cors import CORSMiddleware
+import aiofiles
+
+app = FastAPI(
+    title="PDF OCR Service",
+    description="Convert scanned/image-based PDFs to searchable text PDFs",
+    version="1.0.0"
+)
+
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+
+UPLOAD_DIR = Path("/app/uploads")
+PROCESSED_DIR = Path("/app/processed")
+
+# Uploads are streamed to disk in chunks of this many bytes
+UPLOAD_CHUNK_SIZE = 1024 * 1024
+
+# Ensure directories exist
+UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
+PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
+
+
+def safe_pdf_name(filename: Optional[str]) -> str:
+    """
+    Return the bare file name of an uploaded PDF.
+
+    The name is supplied by the client, so any directory components are
+    dropped before it is used to build a path on disk. Raises a 400
+    HTTPException when the name is missing or does not end in .pdf.
+    """
+    # Treat backslashes as separators too so Windows-style paths are stripped
+    name = Path((filename or "").replace("\\", "/")).name
+    if not name.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported")
+    return name
+
+
+async def save_upload(file: UploadFile, destination: Path) -> None:
+    """Stream an uploaded file to disk without holding it all in memory"""
+    async with aiofiles.open(destination, 'wb') as f:
+        while chunk := await file.read(UPLOAD_CHUNK_SIZE):
+            await f.write(chunk)
+
+
+def run_ocr(input_path: Path, output_path: Path, language: str = "eng") -> dict:
+    """
+    Run ocrmypdf on a PDF file.
+
+    Returns a dict with "success" (bool) and "message" (str); failed runs of
+    ocrmypdf itself also carry its "returncode". Errors are reported through
+    the returned dict rather than raised.
+    """
+    try:
+        result = subprocess.run(
+            [
+                "ocrmypdf",
+                "--skip-text",  # Skip pages that already have text
+                "--optimize", "1",  # Light optimization
+                "--language", language,
+                "--output-type", "pdf",
+                "--jobs", "2",  # Parallel processing
+                str(input_path),
+                str(output_path)
+            ],
+            capture_output=True,
+            text=True,
+            timeout=600  # 10 minute timeout
+        )
+
+        if result.returncode == 0:
+            return {"success": True, "message": "OCR completed successfully"}
+        elif result.returncode == 6:
+            # Exit code 6 means the PDF already has text
+            # Copy the original file
+            shutil.copy(input_path, output_path)
+            return {"success": True, "message": "PDF already contains text, copied as-is"}
+        else:
+            return {
+                "success": False,
+                "message": f"OCR failed: {result.stderr}",
+                "returncode": result.returncode
+            }
+    except subprocess.TimeoutExpired:
+        return {"success": False, "message": "OCR timed out after 10 minutes"}
+    except Exception as e:
+        return {"success": False, "message": str(e)}
+
+
+def tool_version(command: str) -> str:
+    """
+    Return the first line that `command --version` prints to stdout,
+    or "not available" when the command cannot be run.
+    """
+    try:
+        result = subprocess.run(
+            [command, "--version"], capture_output=True, text=True, timeout=10
+        )
+    except (OSError, subprocess.SubprocessError):
+        return "not available"
+    return result.stdout.strip().split("\n")[0]
+
+
+async def read_job_status(job_id: str) -> str:
+    """
+    Return the contents of the status file for an async OCR job.
+
+    Raises a 404 HTTPException when no status file exists for job_id.
+    """
+    status_path = PROCESSED_DIR / f"{job_id}.status"
+    if not status_path.exists():
+        raise HTTPException(status_code=404, detail="Job not found")
+    async with aiofiles.open(status_path, 'r') as f:
+        return await f.read()
+
+
+@app.get("/")
+async def root():
+    """Describe the service and its main endpoints"""
+    return {
+        "service": "PDF OCR Service",
+        "version": "1.0.0",
+        "endpoints": {
+            "POST /ocr": "Upload PDF and get OCR'd version",
+            "GET /health": "Health check"
+        }
+    }
+
+
+@app.get("/health")
+async def health():
+    """Report service health and the versions of the OCR tools"""
+    # Run the version checks in worker threads so they don't block the event loop
+    ocrmypdf_version = await asyncio.to_thread(tool_version, "ocrmypdf")
+    tesseract_version = await asyncio.to_thread(tool_version, "tesseract")
+
+    return {
+        "status": "healthy",
+        "ocrmypdf": ocrmypdf_version,
+        "tesseract": tesseract_version
+    }
+
+
+@app.post("/ocr")
+async def ocr_pdf(
+    file: UploadFile = File(...),
+    language: str = "eng"
+):
+    """
+    Upload a PDF and receive an OCR'd searchable version.
+
+    - **file**: PDF file to process
+    - **language**: OCR language (default: eng)
+
+    Returns the processed PDF file.
+    """
+    filename = safe_pdf_name(file.filename)
+
+    # Prefix stored files with a unique ID so concurrent uploads that share
+    # a name cannot overwrite each other
+    request_id = uuid.uuid4().hex[:8]
+    input_path = UPLOAD_DIR / f"{request_id}_{filename}"
+    await save_upload(file, input_path)
+
+    # The client still receives the file under the plain ocr_<name>
+    output_filename = f"ocr_{filename}"
+    output_path = PROCESSED_DIR / f"ocr_{request_id}_{filename}"
+
+    # Run OCR in a worker thread; it can take minutes and would otherwise
+    # block the event loop for every other request
+    result = await asyncio.to_thread(run_ocr, input_path, output_path, language)
+
+    if not result["success"]:
+        # Cleanup
+        input_path.unlink(missing_ok=True)
+        raise HTTPException(status_code=500, detail=result["message"])
+
+    # Check if output was created
+    if not output_path.exists():
+        raise HTTPException(status_code=500, detail="OCR completed but output file not found")
+
+    # Return the processed file
+    return FileResponse(
+        path=output_path,
+        filename=output_filename,
+        media_type="application/pdf",
+        headers={
+            "X-OCR-Status": result["message"]
+        }
+    )
+
+
+@app.post("/ocr/async")
+async def ocr_pdf_async(
+    file: UploadFile = File(...),
+    language: str = "eng",
+    background_tasks: BackgroundTasks = None
+):
+    """
+    Upload a PDF for async OCR processing.
+    Returns a job ID to check status later.
+    """
+    filename = safe_pdf_name(file.filename)
+
+    job_id = uuid.uuid4().hex[:8]
+
+    # Save uploaded file with job ID
+    input_path = UPLOAD_DIR / f"{job_id}_{filename}"
+    await save_upload(file, input_path)
+
+    output_path = PROCESSED_DIR / f"ocr_{job_id}_{filename}"
+
+    # Create status file
+    status_path = PROCESSED_DIR / f"{job_id}.status"
+    async with aiofiles.open(status_path, 'w') as f:
+        await f.write("processing")
+
+    # Run OCR in background
+    def process_ocr():
+        result = run_ocr(input_path, output_path, language)
+        with open(status_path, 'w') as f:
+            if result["success"]:
+                f.write(f"completed:{output_path.name}")
+            else:
+                f.write(f"failed:{result['message']}")
+
+    background_tasks.add_task(process_ocr)
+
+    return {
+        "job_id": job_id,
+        "status": "processing",
+        "check_status": f"/ocr/status/{job_id}",
+        "download": f"/ocr/download/{job_id}"
+    }
+
+
+@app.get("/ocr/status/{job_id}")
+async def get_ocr_status(job_id: str):
+    """Check the status of an async OCR job"""
+    status = await read_job_status(job_id)
+
+    if status == "processing":
+        return {"job_id": job_id, "status": "processing"}
+    elif status.startswith("completed:"):
+        return {
+            "job_id": job_id,
+            "status": "completed",
+            "download": f"/ocr/download/{job_id}"
+        }
+    elif status.startswith("failed:"):
+        error = status.split(":", 1)[1]
+        return {"job_id": job_id, "status": "failed", "error": error}
+
+    return {"job_id": job_id, "status": "unknown"}
+
+
+@app.get("/ocr/download/{job_id}")
+async def download_ocr_result(job_id: str):
+    """Download the OCR'd PDF for a completed job"""
+    status = await read_job_status(job_id)
+
+    if not status.startswith("completed:"):
+        raise HTTPException(status_code=400, detail="Job not completed yet")
+
+    filename = status.split(":", 1)[1]
+    output_path = PROCESSED_DIR / filename
+
+    if not output_path.exists():
+        raise HTTPException(status_code=404, detail="Output file not found")
+
+    return FileResponse(
+        path=output_path,
+        filename=filename,
+        media_type="application/pdf"
+    )
diff --git a/pdf-ocr/docker-compose.yml b/pdf-ocr/docker-compose.yml
new file mode 100644
index 0000000..9a1bace
--- /dev/null
+++ b/pdf-ocr/docker-compose.yml
@@ -0,0 +1,20 @@
+services:
+  pdf-ocr:
+    build: .
+    container_name: pdf-ocr
+    restart: unless-stopped
+    volumes:
+      - ./uploads:/app/uploads
+      - ./processed:/app/processed
+    labels:
+      - "traefik.enable=true"
+      - "traefik.http.routers.pdf-ocr.rule=Host(`ocr.jeffemmett.com`)"
+      - "traefik.http.routers.pdf-ocr.entrypoints=web"
+      - "traefik.http.services.pdf-ocr.loadbalancer.server.port=8000"
+      - "traefik.docker.network=traefik-public"
+    networks:
+      - traefik-public
+
+networks:
+  traefik-public:
+    external: true