Fix ai-internal network name, add PDF OCR service

- Correct network name from ai-orchestrator_ai-internal to ai-internal
- Add pdf-ocr service for ocr.jeffemmett.com (Dockerfile, app.py, compose)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2025-12-14 12:52:16 -05:00
parent d33f3f68d6
commit f88e1082f0
4 changed files with 311 additions and 1 deletions

View File

@ -33,4 +33,4 @@ networks:
external: true
ai-internal:
external: true
name: ai-orchestrator_ai-internal
name: ai-internal

29
pdf-ocr/Dockerfile Normal file
View File

@ -0,0 +1,29 @@
# Image for the PDF OCR service: a slim Python 3.11 base plus the ocrmypdf
# command-line toolchain and a FastAPI/uvicorn web server.
FROM python:3.11-slim
# Install system dependencies for OCR.
# --no-install-recommends keeps the layer small; the apt package lists are
# removed in the same RUN step so they never end up in the image layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
ocrmypdf \
tesseract-ocr \
tesseract-ocr-eng \
ghostscript \
unpaper \
pngquant \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies for the web layer (versions are not pinned).
# --no-cache-dir avoids storing pip's download cache in the image.
RUN pip install --no-cache-dir \
fastapi \
uvicorn[standard] \
python-multipart \
aiofiles
WORKDIR /app
# Create the upload and output directories used by app.py
# (/app/uploads and /app/processed).
RUN mkdir -p /app/uploads /app/processed
COPY app.py /app/
# uvicorn listens on port 8000 on all interfaces (see CMD below).
EXPOSE 8000
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

261
pdf-ocr/app.py Normal file
View File

@ -0,0 +1,261 @@
"""
PDF OCR Service
Converts image-based PDFs to searchable text PDFs using ocrmypdf
"""
import os
import subprocess
import tempfile
import asyncio
from pathlib import Path
from typing import Optional
from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
from fastapi.responses import FileResponse, JSONResponse
from fastapi.middleware.cors import CORSMiddleware
import aiofiles
# FastAPI application object; every route in this module is registered on it.
app = FastAPI(
    title="PDF OCR Service",
    description="Convert scanned/image-based PDFs to searchable text PDFs",
    version="1.0.0",
)

# CORS: the service is open to any origin. A wildcard origin must not be
# combined with credentialed requests (the CORS spec forbids it and the
# FastAPI docs require explicit origins when allow_credentials is True),
# so credentials are disabled. The API uses no cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Where uploads are stored before OCR and where results/status files go.
UPLOAD_DIR = Path("/app/uploads")
PROCESSED_DIR = Path("/app/processed")

# Ensure directories exist. parents=True keeps import from failing when the
# module is started outside the container image and /app is missing.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
def run_ocr(input_path: Path, output_path: Path, language: str = "eng") -> dict:
    """Run the ``ocrmypdf`` command-line tool on one PDF.

    Args:
        input_path: PDF to read.
        output_path: Where the resulting PDF is written.
        language: Value passed to ocrmypdf's ``--language`` option.

    Returns:
        A dict with ``success`` (bool) and ``message`` (str). When ocrmypdf
        exits with an unexpected code the dict also carries ``returncode``.
        The function never raises; every error is reported through the dict.
    """
    try:
        command = [
            "ocrmypdf",
            "--skip-text",          # leave pages that already have text alone
            "--optimize", "1",      # light optimization
            "--language", language,
            "--output-type", "pdf",
            "--jobs", "2",          # parallel processing
            str(input_path),
            str(output_path),
        ]
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=600,  # 10 minute timeout
        )
        exit_code = completed.returncode

        if exit_code == 0:
            return {"success": True, "message": "OCR completed successfully"}

        if exit_code == 6:
            # Exit code 6: the PDF already has text, so hand back the
            # original file unchanged.
            import shutil

            shutil.copy(input_path, output_path)
            return {"success": True, "message": "PDF already contains text, copied as-is"}

        return {
            "success": False,
            "message": f"OCR failed: {completed.stderr}",
            "returncode": exit_code,
        }
    except subprocess.TimeoutExpired:
        return {"success": False, "message": "OCR timed out after 10 minutes"}
    except Exception as exc:
        return {"success": False, "message": str(exc)}
@app.get("/")
async def root():
    """Describe the service and list its main endpoints."""
    endpoints = {
        "POST /ocr": "Upload PDF and get OCR'd version",
        "GET /health": "Health check",
    }
    return {
        "service": "PDF OCR Service",
        "version": "1.0.0",
        "endpoints": endpoints,
    }
def _tool_version(command: list, first_line_only: bool = False) -> str:
    """Return the version text printed on stdout by *command*.

    Returns "not available" when the executable cannot be started or does
    not finish within the timeout. Only process-launch failures are caught,
    so cancellation and programming errors are no longer swallowed.
    """
    try:
        result = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except (OSError, subprocess.SubprocessError):
        return "not available"
    if first_line_only:
        return result.stdout.split("\n")[0]
    return result.stdout.strip()


@app.get("/health")
async def health():
    """Report service status and the versions of the OCR tools it shells out to."""
    return {
        "status": "healthy",
        "ocrmypdf": _tool_version(["ocrmypdf", "--version"]),
        # tesseract prints a multi-line banner; keep only its first line.
        "tesseract": _tool_version(["tesseract", "--version"], first_line_only=True),
    }
@app.post("/ocr")
async def ocr_pdf(
    file: UploadFile = File(...),
    language: str = "eng"
):
    """
    Upload a PDF and receive an OCR'd searchable version.

    - **file**: PDF file to process
    - **language**: OCR language (default: eng)

    Returns the processed PDF file.
    """
    import uuid

    # filename may be missing on a malformed multipart part.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # The client controls the filename: keep only its last path component
    # (treating both "/" and "\" as separators) so it cannot escape the
    # upload/processed directories via "../" or an absolute path.
    safe_name = Path(file.filename.replace("\\", "/")).name

    # A per-request prefix stops concurrent uploads with the same name from
    # overwriting each other's input or output.
    request_id = uuid.uuid4().hex
    input_path = UPLOAD_DIR / f"{request_id}_{safe_name}"
    output_filename = f"ocr_{safe_name}"
    output_path = PROCESSED_DIR / f"ocr_{request_id}_{safe_name}"

    try:
        # Save uploaded file
        async with aiofiles.open(input_path, 'wb') as f:
            await f.write(await file.read())

        # run_ocr blocks for up to 10 minutes; run it in a worker thread so
        # the event loop keeps serving other requests.
        result = await asyncio.to_thread(run_ocr, input_path, output_path, language)
    finally:
        # The upload is not needed once OCR has finished or failed.
        input_path.unlink(missing_ok=True)

    if not result["success"]:
        raise HTTPException(status_code=500, detail=result["message"])

    # Check if output was created
    if not output_path.exists():
        raise HTTPException(status_code=500, detail="OCR completed but output file not found")

    # NOTE(review): the processed file stays on disk after it is sent;
    # consider a retention/cleanup policy for PROCESSED_DIR.
    return FileResponse(
        path=output_path,
        filename=output_filename,
        media_type="application/pdf",
        headers={
            "X-OCR-Status": result["message"]
        }
    )
@app.post("/ocr/async")
async def ocr_pdf_async(
    file: UploadFile = File(...),
    language: str = "eng",
    background_tasks: BackgroundTasks = None
):
    """
    Upload a PDF for async OCR processing.
    Returns a job ID to check status later.
    """
    import uuid

    # filename may be missing on a malformed multipart part.
    if not file.filename or not file.filename.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # The client controls the filename: keep only its last path component
    # (treating both "/" and "\" as separators) so it cannot escape the
    # upload/processed directories via "../" or an absolute path.
    safe_name = Path(file.filename.replace("\\", "/")).name

    # Use the full random UUID: the job ID is the only thing protecting the
    # status and download URLs, and a short prefix is guessable and can collide.
    job_id = uuid.uuid4().hex

    # Save uploaded file with job ID
    input_path = UPLOAD_DIR / f"{job_id}_{safe_name}"
    async with aiofiles.open(input_path, 'wb') as f:
        await f.write(await file.read())

    output_path = PROCESSED_DIR / f"ocr_{job_id}_{safe_name}"

    # Create status file; it is rewritten when the background job finishes.
    status_path = PROCESSED_DIR / f"{job_id}.status"
    async with aiofiles.open(status_path, 'w') as f:
        await f.write("processing")

    def process_ocr():
        """Run OCR, then record 'completed:<file>' or 'failed:<reason>'."""
        try:
            result = run_ocr(input_path, output_path, language)
        finally:
            # The upload is not needed once OCR has finished or failed.
            input_path.unlink(missing_ok=True)
        with open(status_path, 'w') as f:
            if result["success"]:
                f.write(f"completed:{output_path.name}")
            else:
                f.write(f"failed:{result['message']}")

    # Run OCR in background, after the response has been sent.
    background_tasks.add_task(process_ocr)

    return {
        "job_id": job_id,
        "status": "processing",
        "check_status": f"/ocr/status/{job_id}",
        "download": f"/ocr/download/{job_id}"
    }
@app.get("/ocr/status/{job_id}")
async def get_ocr_status(job_id: str):
    """Check the status of an async OCR job"""
    status_path = PROCESSED_DIR / f"{job_id}.status"
    if not status_path.exists():
        raise HTTPException(status_code=404, detail="Job not found")

    async with aiofiles.open(status_path, 'r') as f:
        status = await f.read()

    # The status file holds "processing", "completed:<filename>" or
    # "failed:<message>".
    if status == "processing":
        return {"job_id": job_id, "status": "processing"}

    if status.startswith("completed:"):
        # The output filename is not needed here; the download endpoint
        # resolves it from the same status file.
        return {
            "job_id": job_id,
            "status": "completed",
            "download": f"/ocr/download/{job_id}"
        }

    if status.startswith("failed:"):
        error = status.split(":", 1)[1]
        return {"job_id": job_id, "status": "failed", "error": error}

    return {"job_id": job_id, "status": "unknown"}
@app.get("/ocr/download/{job_id}")
async def download_ocr_result(job_id: str):
    """Download the OCR'd PDF for a completed job"""
    marker = PROCESSED_DIR / f"{job_id}.status"
    if not marker.exists():
        raise HTTPException(status_code=404, detail="Job not found")

    async with aiofiles.open(marker, 'r') as handle:
        state = await handle.read()

    # Only a "completed:<filename>" status carries a downloadable result.
    done_prefix = "completed:"
    if not state.startswith(done_prefix):
        raise HTTPException(status_code=400, detail="Job not completed yet")

    result_name = state[len(done_prefix):]
    pdf_path = PROCESSED_DIR / result_name
    if not pdf_path.exists():
        raise HTTPException(status_code=404, detail="Output file not found")

    return FileResponse(
        path=pdf_path,
        filename=result_name,
        media_type="application/pdf",
    )

View File

@ -0,0 +1,20 @@
# Compose definition for the PDF OCR service, published through Traefik.
services:
pdf-ocr:
# Image is built from the Dockerfile in this directory.
build: .
container_name: pdf-ocr
restart: unless-stopped
# Host directories bind-mounted over the app's upload and output folders.
volumes:
- ./uploads:/app/uploads
- ./processed:/app/processed
# Traefik routing: requests for ocr.jeffemmett.com on the "web" entrypoint
# are forwarded to port 8000 of this container over traefik-public.
labels:
- "traefik.enable=true"
- "traefik.http.routers.pdf-ocr.rule=Host(`ocr.jeffemmett.com`)"
- "traefik.http.routers.pdf-ocr.entrypoints=web"
- "traefik.http.services.pdf-ocr.loadbalancer.server.port=8000"
- "traefik.docker.network=traefik-public"
networks:
- traefik-public
# traefik-public is declared external, so it must already exist before
# this stack is started.
networks:
traefik-public:
external: true