Fix ai-internal network name, add PDF OCR service
- Correct network name from ai-orchestrator_ai-internal to ai-internal
- Add pdf-ocr service for ocr.jeffemmett.com (Dockerfile, app.py, compose)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d33f3f68d6
commit
f88e1082f0
|
|
@ -33,4 +33,4 @@ networks:
|
||||||
external: true
|
external: true
|
||||||
ai-internal:
|
ai-internal:
|
||||||
external: true
|
external: true
|
||||||
name: ai-orchestrator_ai-internal
|
name: ai-internal
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,29 @@
|
||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
# Install system dependencies for OCR
|
||||||
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
|
ocrmypdf \
|
||||||
|
tesseract-ocr \
|
||||||
|
tesseract-ocr-eng \
|
||||||
|
ghostscript \
|
||||||
|
unpaper \
|
||||||
|
pngquant \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
RUN pip install --no-cache-dir \
|
||||||
|
fastapi \
|
||||||
|
uvicorn[standard] \
|
||||||
|
python-multipart \
|
||||||
|
aiofiles
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Create directories
|
||||||
|
RUN mkdir -p /app/uploads /app/processed
|
||||||
|
|
||||||
|
COPY app.py /app/
|
||||||
|
|
||||||
|
EXPOSE 8000
|
||||||
|
|
||||||
|
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|
@ -0,0 +1,261 @@
|
||||||
|
"""
|
||||||
|
PDF OCR Service
|
||||||
|
Converts image-based PDFs to searchable text PDFs using ocrmypdf
|
||||||
|
"""
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
import asyncio
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
|
||||||
|
from fastapi.responses import FileResponse, JSONResponse
|
||||||
|
from fastapi.middleware.cors import CORSMiddleware
|
||||||
|
import aiofiles
|
||||||
|
|
||||||
|
# ASGI application object; the Dockerfile's CMD serves it as "app:app".
app = FastAPI(
    title="PDF OCR Service",
    description="Convert scanned/image-based PDFs to searchable text PDFs",
    version="1.0.0"
)

# CORS: every origin, method and header is accepted.
# NOTE(review): wildcard origins together with allow_credentials=True is a
# very permissive combination - confirm this is intended for a public host.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||||
|
|
||||||
|
# Where raw uploads are written before OCR runs.
UPLOAD_DIR = Path("/app/uploads")
# Where OCR output PDFs and per-job ".status" files are written.
PROCESSED_DIR = Path("/app/processed")

# Ensure directories exist. parents=True so that importing the module also
# works when /app itself has not been created yet (e.g. outside the image);
# exist_ok=True keeps this a no-op when the directories are already there.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
|
||||||
|
def run_ocr(input_path: Path, output_path: Path, language: str = "eng") -> dict:
    """Invoke the ``ocrmypdf`` command-line tool on one PDF.

    Args:
        input_path: PDF to read.
        output_path: Where the resulting PDF is written.
        language: Value passed to ocrmypdf's ``--language`` option.

    Returns:
        A dict with ``success`` (bool) and ``message`` (str). When ocrmypdf
        exits with an unexpected code the dict also carries ``returncode``.
        Errors are reported through the dict; nothing is raised to the caller.
    """
    try:
        command = [
            "ocrmypdf",
            "--skip-text",  # Skip pages that already have text
            "--optimize", "1",  # Light optimization
            "--language", language,
            "--output-type", "pdf",
            "--jobs", "2",  # Parallel processing
            str(input_path),
            str(output_path),
        ]
        # 10 minute timeout
        proc = subprocess.run(command, capture_output=True, text=True, timeout=600)
        exit_code = proc.returncode

        if exit_code == 0:
            return {"success": True, "message": "OCR completed successfully"}

        if exit_code == 6:
            # Exit code 6 means the PDF already has text, so the original
            # file is handed back unchanged.
            import shutil

            shutil.copy(input_path, output_path)
            return {"success": True, "message": "PDF already contains text, copied as-is"}

        # Any other exit code is a failure; surface stderr to the caller.
        return {
            "success": False,
            "message": f"OCR failed: {proc.stderr}",
            "returncode": exit_code,
        }
    except subprocess.TimeoutExpired:
        return {"success": False, "message": "OCR timed out after 10 minutes"}
    except Exception as exc:
        # Best-effort boundary: e.g. the ocrmypdf binary is not installed.
        return {"success": False, "message": str(exc)}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/")
async def root():
    # Landing endpoint: names the service and lists its main routes.
    endpoint_summary = {
        "POST /ocr": "Upload PDF and get OCR'd version",
        "GET /health": "Health check",
    }
    return {
        "service": "PDF OCR Service",
        "version": "1.0.0",
        "endpoints": endpoint_summary,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def _tool_version(command, first_line_only=False):
    # Run `command` and return what it printed on stdout, or "not available"
    # when the process cannot be started or does not finish within 10 seconds.
    try:
        completed = subprocess.run(
            command, capture_output=True, text=True, timeout=10
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing binary; SubprocessError covers the timeout.
        return "not available"
    if first_line_only:
        return completed.stdout.split("\n")[0]
    return completed.stdout.strip()


@app.get("/health")
async def health():
    # Probe the two external tools the service depends on. The probes run in
    # a worker thread so a slow binary does not stall the event loop.
    ocrmypdf_version = await asyncio.to_thread(
        _tool_version, ["ocrmypdf", "--version"]
    )
    tesseract_version = await asyncio.to_thread(
        _tool_version, ["tesseract", "--version"], True
    )

    # Report "degraded" instead of "healthy" when a tool could not be run,
    # so monitoring can tell that OCR requests would fail.
    tools_present = "not available" not in (ocrmypdf_version, tesseract_version)

    return {
        "status": "healthy" if tools_present else "degraded",
        "ocrmypdf": ocrmypdf_version,
        "tesseract": tesseract_version
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ocr")
async def ocr_pdf(
    file: UploadFile = File(...),
    language: str = "eng"
):
    """
    Upload a PDF and receive an OCR'd searchable version.

    - **file**: PDF file to process
    - **language**: OCR language (default: eng)

    Returns the processed PDF file.
    """
    import uuid

    # The filename is client-controlled: keep only its final path component so
    # values such as "../../x.pdf" cannot escape the upload/processed folders.
    # A missing filename becomes "" and is rejected by the extension check.
    safe_name = Path((file.filename or "").replace("\\", "/")).name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # A per-request prefix keeps concurrent uploads that share a filename from
    # overwriting each other's input or output on disk.
    request_id = uuid.uuid4().hex
    input_path = UPLOAD_DIR / f"{request_id}_{safe_name}"

    # Name offered to the client for the download (unchanged: "ocr_<name>").
    output_filename = f"ocr_{safe_name}"
    output_path = PROCESSED_DIR / f"ocr_{request_id}_{safe_name}"

    try:
        # Save uploaded file
        async with aiofiles.open(input_path, 'wb') as f:
            content = await file.read()
            await f.write(content)

        # run_ocr blocks for up to 10 minutes; run it in a worker thread so
        # the event loop keeps serving other requests meanwhile.
        result = await asyncio.to_thread(run_ocr, input_path, output_path, language)
    finally:
        # The upload is only needed while OCR runs; remove it on every path.
        input_path.unlink(missing_ok=True)

    if not result["success"]:
        raise HTTPException(status_code=500, detail=result["message"])

    # Check if output was created
    if not output_path.exists():
        raise HTTPException(status_code=500, detail="OCR completed but output file not found")

    # Return the processed file. NOTE: the output stays in PROCESSED_DIR after
    # the response, as before; pruning old results is left to the operator.
    return FileResponse(
        path=output_path,
        filename=output_filename,
        media_type="application/pdf",
        headers={
            "X-OCR-Status": result["message"]
        }
    )
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/ocr/async")
async def ocr_pdf_async(
    file: UploadFile = File(...),
    language: str = "eng",
    background_tasks: BackgroundTasks = None
):
    """
    Upload a PDF for async OCR processing.
    Returns a job ID to check status later.
    """
    import uuid

    # The filename is client-controlled: keep only its final path component so
    # it cannot introduce extra directories into the paths built below.
    # A missing filename becomes "" and is rejected by the extension check.
    safe_name = Path((file.filename or "").replace("\\", "/")).name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    job_id = str(uuid.uuid4())[:8]

    # Save uploaded file with job ID
    input_path = UPLOAD_DIR / f"{job_id}_{safe_name}"
    async with aiofiles.open(input_path, 'wb') as f:
        content = await file.read()
        await f.write(content)

    output_path = PROCESSED_DIR / f"ocr_{job_id}_{safe_name}"

    # Create status file; the status and download endpoints read it back.
    status_path = PROCESSED_DIR / f"{job_id}.status"
    async with aiofiles.open(status_path, 'w') as f:
        await f.write("processing")

    # Run OCR in background
    def process_ocr():
        result = run_ocr(input_path, output_path, language)
        # Record the outcome first so the job never stays "processing"
        # because of a cleanup problem.
        with open(status_path, 'w') as f:
            if result["success"]:
                f.write(f"completed:{output_path.name}")
            else:
                f.write(f"failed:{result['message']}")
        # The upload is no longer needed once OCR has finished.
        input_path.unlink(missing_ok=True)

    background_tasks.add_task(process_ocr)

    return {
        "job_id": job_id,
        "status": "processing",
        "check_status": f"/ocr/status/{job_id}",
        "download": f"/ocr/download/{job_id}"
    }
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/ocr/status/{job_id}")
async def get_ocr_status(job_id: str):
    """Check the status of an async OCR job"""
    status_path = PROCESSED_DIR / f"{job_id}.status"

    # Open directly and treat a missing file as "unknown job"; this avoids a
    # separate exists() check that could go stale before the read.
    try:
        async with aiofiles.open(status_path, 'r') as f:
            status = await f.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Job not found") from None

    if status == "processing":
        return {"job_id": job_id, "status": "processing"}

    if status.startswith("completed:"):
        # The stored output filename is not needed here; the download
        # endpoint resolves it from the same status file.
        return {
            "job_id": job_id,
            "status": "completed",
            "download": f"/ocr/download/{job_id}"
        }

    if status.startswith("failed:"):
        # Everything after the first ":" is the error message.
        error = status.split(":", 1)[1]
        return {"job_id": job_id, "status": "failed", "error": error}

    return {"job_id": job_id, "status": "unknown"}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/ocr/download/{job_id}")
async def download_ocr_result(job_id: str):
    """Download the OCR'd PDF for a completed job"""
    status_path = PROCESSED_DIR / f"{job_id}.status"

    # Open directly and treat a missing file as "unknown job"; this avoids a
    # separate exists() check that could go stale before the read.
    try:
        async with aiofiles.open(status_path, 'r') as f:
            status = await f.read()
    except FileNotFoundError:
        raise HTTPException(status_code=404, detail="Job not found") from None

    if status.startswith("failed:"):
        # A failed job will never complete; say so instead of implying the
        # client should keep waiting.
        error = status.split(":", 1)[1]
        raise HTTPException(status_code=400, detail=f"Job failed: {error}")

    if not status.startswith("completed:"):
        raise HTTPException(status_code=400, detail="Job not completed yet")

    # Everything after the first ":" is the output filename in PROCESSED_DIR.
    filename = status.split(":", 1)[1]
    output_path = PROCESSED_DIR / filename

    if not output_path.exists():
        raise HTTPException(status_code=404, detail="Output file not found")

    return FileResponse(
        path=output_path,
        filename=filename,
        media_type="application/pdf"
    )
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
services:
|
||||||
|
pdf-ocr:
|
||||||
|
build: .
|
||||||
|
container_name: pdf-ocr
|
||||||
|
restart: unless-stopped
|
||||||
|
volumes:
|
||||||
|
- ./uploads:/app/uploads
|
||||||
|
- ./processed:/app/processed
|
||||||
|
labels:
|
||||||
|
- "traefik.enable=true"
|
||||||
|
- "traefik.http.routers.pdf-ocr.rule=Host(`ocr.jeffemmett.com`)"
|
||||||
|
- "traefik.http.routers.pdf-ocr.entrypoints=web"
|
||||||
|
- "traefik.http.services.pdf-ocr.loadbalancer.server.port=8000"
|
||||||
|
- "traefik.docker.network=traefik-public"
|
||||||
|
networks:
|
||||||
|
- traefik-public
|
||||||
|
|
||||||
|
networks:
|
||||||
|
traefik-public:
|
||||||
|
external: true
|
||||||
Loading…
Reference in New Issue