Fix ai-internal network name, add PDF OCR service

- Correct network name from ai-orchestrator_ai-internal to ai-internal - Add pdf-ocr service for ocr.jeffemmett.com (Dockerfile, app.py, compose) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-14 12:52:16 -05:00 · 2025-12-14 12:52:16 -05:00 · f88e1082f0
parent d33f3f68d6
commit f88e1082f0
4 changed files with 311 additions and 1 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -33,4 +33,4 @@ networks:
    external: true
  ai-internal:
    external: true
-    name: ai-orchestrator_ai-internal
+    name: ai-internal
--- a/pdf-ocr/Dockerfile
+++ b/pdf-ocr/Dockerfile
@ -0,0 +1,29 @@
 # Image for the PDF OCR service: a slim Python base plus the ocrmypdf/tesseract
 # toolchain from apt, serving app.py with uvicorn on port 8000.
 FROM python:3.11-slim
 # Install system dependencies for OCR (only the English tesseract language
 # pack is installed); the apt package lists are removed in the same layer.
 RUN apt-get update && apt-get install -y --no-install-recommends \
    ocrmypdf \
    tesseract-ocr \
    tesseract-ocr-eng \
    ghostscript \
    unpaper \
    pngquant \
    && rm -rf /var/lib/apt/lists/*
 # Install Python dependencies
 # NOTE(review): versions are not pinned, so rebuilds may pick up newer releases.
 RUN pip install --no-cache-dir \
    fastapi \
    uvicorn[standard] \
    python-multipart \
    aiofiles
 WORKDIR /app
 # Create the upload/output directories that app.py reads and writes
 RUN mkdir -p /app/uploads /app/processed
 COPY app.py /app/
 EXPOSE 8000
 # NOTE(review): no USER instruction is visible here, so the server presumably
 # runs as root inside the container - confirm this is intended.
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
--- a/pdf-ocr/app.py
+++ b/pdf-ocr/app.py
@ -0,0 +1,261 @@
 """
 PDF OCR Service
 Converts image-based PDFs to searchable text PDFs using ocrmypdf
 """
 import os
 import subprocess
 import tempfile
 import asyncio
 from pathlib import Path
 from typing import Optional
 from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
 from fastapi.responses import FileResponse, JSONResponse
 from fastapi.middleware.cors import CORSMiddleware
 import aiofiles
 # Application object; the metadata below is what the service advertises
 # about itself.
 app = FastAPI(
     title="PDF OCR Service",
     description="Convert scanned/image-based PDFs to searchable text PDFs",
     version="1.0.0"
 )
 # CORS: the service is callable from any origin. Credentialed requests are
 # disabled because wildcard origins must not be combined with
 # allow_credentials=True (see the FastAPI CORS documentation); the service
 # itself uses no cookies or authentication.
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_credentials=False,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 # Working directories: raw uploads and OCR output.
 UPLOAD_DIR = Path("/app/uploads")
 PROCESSED_DIR = Path("/app/processed")
 # Ensure directories exist; parents=True keeps module import from failing
 # when the /app parent directory has not been created yet.
 UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
 PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
 def run_ocr(input_path: Path, output_path: Path, language: str = "eng") -> dict:
     """Invoke the ``ocrmypdf`` command-line tool on a single PDF.

     Args:
         input_path: PDF to read.
         output_path: Where the processed PDF is written.
         language: Value passed to ocrmypdf's ``--language`` option.

     Returns:
         A dict with ``success`` (bool) and ``message`` (str). When ocrmypdf
         exits with an unexpected code the dict also carries ``returncode``.
         No exception escapes: timeouts and any other error are reported
         through the returned dict.
     """
     try:
         command = [
             "ocrmypdf",
             "--skip-text",  # pages that already have text are left alone
             "--optimize", "1",  # light optimization
             "--language", language,
             "--output-type", "pdf",
             "--jobs", "2",  # parallel processing
             str(input_path),
             str(output_path),
         ]
         # 600 seconds == the 10 minutes quoted in the timeout message below.
         completed = subprocess.run(
             command, capture_output=True, text=True, timeout=600
         )
         exit_code = completed.returncode

         if exit_code == 0:
             return {"success": True, "message": "OCR completed successfully"}

         if exit_code == 6:
             # Exit code 6: the PDF already has text, so the original file is
             # handed back unchanged.
             import shutil
             shutil.copy(input_path, output_path)
             return {"success": True, "message": "PDF already contains text, copied as-is"}

         return {
             "success": False,
             "message": f"OCR failed: {completed.stderr}",
             "returncode": exit_code
         }
     except subprocess.TimeoutExpired:
         return {"success": False, "message": "OCR timed out after 10 minutes"}
     except Exception as exc:
         return {"success": False, "message": str(exc)}
@app.get("/")
async def root():
    """Describe the service and list every public endpoint it exposes."""
    return {
        "service": "PDF OCR Service",
        "version": "1.0.0",
        "endpoints": {
            "POST /ocr": "Upload PDF and get OCR'd version",
            # The async workflow endpoints were missing from this listing.
            "POST /ocr/async": "Upload PDF for background OCR and get a job ID",
            "GET /ocr/status/{job_id}": "Check the status of an async OCR job",
            "GET /ocr/download/{job_id}": "Download the OCR'd PDF for a completed job",
            "GET /health": "Health check"
        }
    }
def _tool_version(command: list, first_line_only: bool = False) -> str:
    """Return the version text printed by *command*, or "not available".

    Only launch failures and timeouts are treated as "not available"; any
    other exception is allowed to propagate instead of being swallowed.
    """
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=15
        )
    except (OSError, subprocess.SubprocessError):
        return "not available"
    if first_line_only:
        return completed.stdout.split("\n")[0]
    return completed.stdout.strip()


@app.get("/health")
async def health():
    """Report whether the OCR command-line tools can be executed.

    Returns a dict with the ``ocrmypdf`` and ``tesseract`` version strings
    (or "not available") and a ``status`` of "healthy" when both tools ran,
    "degraded" otherwise.
    """
    # The probes are blocking subprocess calls, so they run in worker threads
    # to keep the event loop free for other requests.
    ocrmypdf_version, tesseract_version = await asyncio.gather(
        asyncio.to_thread(_tool_version, ["ocrmypdf", "--version"]),
        asyncio.to_thread(_tool_version, ["tesseract", "--version"], True),
    )
    tools_ok = "not available" not in (ocrmypdf_version, tesseract_version)
    return {
        "status": "healthy" if tools_ok else "degraded",
        "ocrmypdf": ocrmypdf_version,
        "tesseract": tesseract_version
    }
@app.post("/ocr")
async def ocr_pdf(
    file: UploadFile = File(...),
    language: str = "eng"
):
    """
    Upload a PDF and receive an OCR'd searchable version.
    - **file**: PDF file to process
    - **language**: OCR language (default: eng)
    Returns the processed PDF file.

    Responds with 400 when the upload is not named ``*.pdf`` and with 500
    when OCR fails or produces no output. The uploaded file is removed once
    OCR has run, and the processed file is removed after the response has
    been sent.
    """
    import uuid

    # The client controls the filename, so keep only its final path component;
    # otherwise a name such as "../../x.pdf" would escape UPLOAD_DIR.
    safe_name = Path((file.filename or "").replace("\\", "/")).name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # A per-request prefix stops concurrent uploads that share a filename
    # from overwriting each other's files.
    request_id = uuid.uuid4().hex
    input_path = UPLOAD_DIR / f"{request_id}_{safe_name}"
    output_filename = f"ocr_{safe_name}"
    output_path = PROCESSED_DIR / f"ocr_{request_id}_{safe_name}"

    try:
        # Save the upload in chunks rather than holding the whole PDF in memory.
        async with aiofiles.open(input_path, 'wb') as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                await f.write(chunk)
        # run_ocr blocks for up to 10 minutes; run it in a worker thread so
        # the event loop keeps serving other requests.
        result = await asyncio.to_thread(run_ocr, input_path, output_path, language)
    finally:
        # The upload is not needed once OCR has run (or failed to start).
        input_path.unlink(missing_ok=True)

    if not result["success"]:
        output_path.unlink(missing_ok=True)
        raise HTTPException(status_code=500, detail=result["message"])
    # Check if output was created
    if not output_path.exists():
        raise HTTPException(status_code=500, detail="OCR completed but output file not found")

    # Delete the processed file only after the response body has been sent.
    cleanup = BackgroundTasks()
    cleanup.add_task(output_path.unlink, missing_ok=True)
    return FileResponse(
        path=output_path,
        filename=output_filename,
        media_type="application/pdf",
        headers={
            "X-OCR-Status": result["message"]
        },
        background=cleanup
    )
@app.post("/ocr/async")
async def ocr_pdf_async(
    file: UploadFile = File(...),
    language: str = "eng",
    background_tasks: BackgroundTasks = None
):
    """
    Upload a PDF for async OCR processing.
    Returns a job ID to check status later.

    The job's state is kept in ``<job_id>.status`` inside PROCESSED_DIR and
    holds one of ``processing``, ``completed:<output name>`` or
    ``failed:<message>``. Responds with 400 when the upload is not named
    ``*.pdf``.
    """
    import uuid

    # The client controls the filename, so keep only its final path component;
    # otherwise a name such as "../../x.pdf" would escape UPLOAD_DIR.
    safe_name = Path((file.filename or "").replace("\\", "/")).name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # The job ID is the only thing protecting a result from other callers,
    # so use the full random UUID rather than a short, collision-prone prefix.
    job_id = uuid.uuid4().hex

    # Save uploaded file with job ID, in chunks to bound memory use
    input_path = UPLOAD_DIR / f"{job_id}_{safe_name}"
    try:
        async with aiofiles.open(input_path, 'wb') as f:
            while True:
                chunk = await file.read(1024 * 1024)
                if not chunk:
                    break
                await f.write(chunk)
    except Exception:
        # Do not leave a partial upload behind.
        input_path.unlink(missing_ok=True)
        raise

    output_path = PROCESSED_DIR / f"ocr_{job_id}_{safe_name}"

    # Create status file
    status_path = PROCESSED_DIR / f"{job_id}.status"
    async with aiofiles.open(status_path, 'w') as f:
        await f.write("processing")

    # Run OCR in background
    def process_ocr():
        try:
            result = run_ocr(input_path, output_path, language)
        finally:
            # The upload is no longer needed once OCR has run.
            input_path.unlink(missing_ok=True)
        if result["success"]:
            outcome = f"completed:{output_path.name}"
        else:
            outcome = f"failed:{result['message']}"
        # Write to a temporary file and rename it into place so readers never
        # observe a truncated, half-written status file.
        pending_path = status_path.with_name(status_path.name + ".tmp")
        with open(pending_path, 'w') as f:
            f.write(outcome)
        pending_path.replace(status_path)

    # FastAPI supplies the BackgroundTasks instance based on the annotation;
    # the None default is only a placeholder in the signature.
    background_tasks.add_task(process_ocr)
    return {
        "job_id": job_id,
        "status": "processing",
        "check_status": f"/ocr/status/{job_id}",
        "download": f"/ocr/download/{job_id}"
    }
@app.get("/ocr/status/{job_id}")
async def get_ocr_status(job_id: str):
    """Check the status of an async OCR job.

    Reads ``<job_id>.status`` from PROCESSED_DIR and returns a dict with
    ``job_id`` and ``status`` ("processing", "completed", "failed" or
    "unknown"), plus ``download`` for completed jobs and ``error`` for failed
    ones. Responds with 404 when no status file exists for the job.
    """
    status_path = PROCESSED_DIR / f"{job_id}.status"
    # Open directly instead of checking exists() first, so a file that
    # disappears between the check and the read still yields a clean 404.
    try:
        async with aiofiles.open(status_path, 'r') as f:
            status = await f.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Job not found")

    # An empty file means the status is being rewritten right now (the writer
    # truncates before writing), so the job is still in flight.
    if status in ("", "processing"):
        return {"job_id": job_id, "status": "processing"}
    if status.startswith("completed:"):
        return {
            "job_id": job_id,
            "status": "completed",
            "download": f"/ocr/download/{job_id}"
        }
    if status.startswith("failed:"):
        error = status.partition(":")[2]
        return {"job_id": job_id, "status": "failed", "error": error}
    return {"job_id": job_id, "status": "unknown"}
@app.get("/ocr/download/{job_id}")
async def download_ocr_result(job_id: str):
    """Download the OCR'd PDF for a completed job.

    Responds with 404 when the job or its output file does not exist, and
    with 400 when the job has failed or has not finished yet.
    """
    status_path = PROCESSED_DIR / f"{job_id}.status"
    # Open directly instead of checking exists() first, so a file that
    # disappears between the check and the read still yields a clean 404.
    try:
        async with aiofiles.open(status_path, 'r') as f:
            status = await f.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Job not found")

    # A failed job will never complete; say so instead of telling the client
    # to keep waiting.
    if status.startswith("failed:"):
        raise HTTPException(
            status_code=400,
            detail=f"Job failed: {status.partition(':')[2]}"
        )
    if not status.startswith("completed:"):
        raise HTTPException(status_code=400, detail="Job not completed yet")

    # Keep only the final path component so the status file's content can
    # never point outside PROCESSED_DIR.
    filename = Path(status.partition(":")[2]).name
    output_path = PROCESSED_DIR / filename
    if not output_path.exists():
        raise HTTPException(status_code=404, detail="Output file not found")
    return FileResponse(
        path=output_path,
        filename=filename,
        media_type="application/pdf"
    )
--- a/pdf-ocr/docker-compose.yml
+++ b/pdf-ocr/docker-compose.yml
@ -0,0 +1,20 @@
 # Compose stack for the PDF OCR service, published through Traefik.
 services:
  pdf-ocr:
    # Built from the Dockerfile in this directory.
    build: .
    container_name: pdf-ocr
    restart: unless-stopped
    # Host directories backing the upload and output paths used by app.py.
    volumes:
      - ./uploads:/app/uploads
      - ./processed:/app/processed
    # Traefik routing: requests for ocr.jeffemmett.com go to container port 8000.
    # NOTE(review): only the "web" entrypoint is configured - presumably TLS is
    # terminated upstream; confirm.
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.pdf-ocr.rule=Host(`ocr.jeffemmett.com`)"
      - "traefik.http.routers.pdf-ocr.entrypoints=web"
      - "traefik.http.services.pdf-ocr.loadbalancer.server.port=8000"
      - "traefik.docker.network=traefik-public"
    networks:
      - traefik-public
 # The network is created outside this file and must exist before "up".
 networks:
  traefik-public:
    external: true