Fix ai-internal network name, add PDF OCR service
- Correct network name from ai-orchestrator_ai-internal to ai-internal
- Add pdf-ocr service for ocr.jeffemmett.com (Dockerfile, app.py, compose)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
d33f3f68d6
commit
f88e1082f0
|
|
@ -33,4 +33,4 @@ networks:
|
|||
external: true
|
||||
ai-internal:
|
||||
external: true
|
||||
name: ai-orchestrator_ai-internal
|
||||
name: ai-internal
|
||||
|
|
|
|||
|
|
@ -0,0 +1,29 @@
|
|||
# PDF OCR service image: a slim Python base plus the ocrmypdf toolchain,
# serving the FastAPI application in app.py through uvicorn on port 8000.
FROM python:3.11-slim

# Install system dependencies for OCR. --no-install-recommends and removing
# the apt lists in the same layer keep the image small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ocrmypdf \
    tesseract-ocr \
    tesseract-ocr-eng \
    ghostscript \
    unpaper \
    pngquant \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies. The extras specifier is quoted so the shell
# never interprets "[standard]" as a glob character class.
RUN pip install --no-cache-dir \
    fastapi \
    "uvicorn[standard]" \
    python-multipart \
    aiofiles

WORKDIR /app

# Create the upload and output directories used by app.py
RUN mkdir -p /app/uploads /app/processed

COPY app.py /app/

EXPOSE 8000

CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
|
@ -0,0 +1,261 @@
|
|||
"""
|
||||
PDF OCR Service
|
||||
Converts image-based PDFs to searchable text PDFs using ocrmypdf
|
||||
"""
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from fastapi import FastAPI, UploadFile, File, HTTPException, BackgroundTasks
|
||||
from fastapi.responses import FileResponse, JSONResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import aiofiles
|
||||
|
||||
app = FastAPI(
    title="PDF OCR Service",
    description="Convert scanned/image-based PDFs to searchable text PDFs",
    version="1.0.0",
)

# CORS: the API is open to every origin. Credentials stay disabled because
# wildcard origins/methods/headers must not be combined with
# allow_credentials=True, and no handler in this file reads cookies or
# authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Working directories: raw uploads and OCR results.
UPLOAD_DIR = Path("/app/uploads")
PROCESSED_DIR = Path("/app/processed")

# Ensure directories exist. parents=True also creates /app when it is
# missing, so importing the module does not fail with FileNotFoundError.
UPLOAD_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
def run_ocr(
    input_path: Path,
    output_path: Path,
    language: str = "eng",
    *,
    jobs: int = 2,
    timeout: float = 600,
) -> dict:
    """Run ocrmypdf on a PDF file and report the outcome as a dict.

    Args:
        input_path: PDF to read.
        output_path: Where the processed PDF is written.
        language: Value passed to ocrmypdf's ``--language`` option.
        jobs: Value passed to ocrmypdf's ``--jobs`` option.
        timeout: Seconds to wait for ocrmypdf before giving up
            (default 600, i.e. 10 minutes).

    Returns:
        A dict with ``"success"`` (bool) and ``"message"`` (str). When
        ocrmypdf exits with an unexpected code, ``"returncode"`` is included
        as well. Exceptions are caught and reported through the dict.
    """
    import shutil

    command = [
        "ocrmypdf",
        "--skip-text",  # Skip pages that already have text
        "--optimize", "1",  # Light optimization
        "--language", language,
        "--output-type", "pdf",
        "--jobs", str(jobs),  # Parallel processing
        str(input_path),
        str(output_path),
    ]

    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

        if result.returncode == 0:
            return {"success": True, "message": "OCR completed successfully"}

        if result.returncode == 6:
            # Exit code 6 means the PDF already has text, so the original
            # file is delivered unchanged.
            shutil.copy(input_path, output_path)
            return {"success": True, "message": "PDF already contains text, copied as-is"}

        return {
            "success": False,
            "message": f"OCR failed: {result.stderr}",
            "returncode": result.returncode,
        }
    except subprocess.TimeoutExpired:
        # With the default of 600 seconds this reads "after 10 minutes".
        return {
            "success": False,
            "message": f"OCR timed out after {timeout / 60:g} minutes",
        }
    except Exception as e:
        # Deliberate catch-all: callers rely on a result dict rather than an
        # exception (e.g. when the ocrmypdf binary is missing).
        return {"success": False, "message": str(e)}
|
||||
|
||||
|
||||
@app.get("/")
async def root():
    """Describe the service and list the routes it exposes."""
    return {
        "service": "PDF OCR Service",
        "version": "1.0.0",
        "endpoints": {
            "POST /ocr": "Upload PDF and get OCR'd version",
            "POST /ocr/async": "Upload PDF for background OCR and get a job ID",
            "GET /ocr/status/{job_id}": "Check the status of an async OCR job",
            "GET /ocr/download/{job_id}": "Download the OCR'd PDF for a completed job",
            "GET /health": "Health check",
        },
    }
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Report service status together with the detected OCR tool versions.

    Each tool is probed by running it with ``--version``. A tool that cannot
    be launched, does not answer in time, or emits undecodable output is
    reported as "not available".
    """

    def probe(command):
        # Catch only the failures a probe can produce; a bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        try:
            completed = subprocess.run(
                command, capture_output=True, text=True, timeout=10
            )
        except (OSError, subprocess.SubprocessError, ValueError):
            return None
        return completed.stdout

    ocrmypdf_output = probe(["ocrmypdf", "--version"])
    tesseract_output = probe(["tesseract", "--version"])

    if ocrmypdf_output is None:
        ocrmypdf_version = "not available"
    else:
        ocrmypdf_version = ocrmypdf_output.strip()

    if tesseract_output is None:
        tesseract_version = "not available"
    else:
        # Only the first line of tesseract's output is reported.
        tesseract_version = tesseract_output.split("\n")[0]

    return {
        "status": "healthy",
        "ocrmypdf": ocrmypdf_version,
        "tesseract": tesseract_version,
    }
|
||||
|
||||
|
||||
@app.post("/ocr")
async def ocr_pdf(
    file: UploadFile = File(...),
    language: str = "eng"
):
    """
    Upload a PDF and receive an OCR'd searchable version.

    - **file**: PDF file to process
    - **language**: OCR language (default: eng)

    Returns the processed PDF file.
    """
    import uuid

    # The client controls file.filename: keep only its final path component
    # so a name such as "../../x.pdf" cannot escape the working directories.
    # A missing filename becomes "" and is rejected by the extension check.
    safe_name = Path(file.filename or "").name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    # A per-request token keeps simultaneous uploads that share a filename
    # from overwriting each other's input and output on disk.
    token = uuid.uuid4().hex

    # Save uploaded file
    input_path = UPLOAD_DIR / f"{token}_{safe_name}"
    async with aiofiles.open(input_path, 'wb') as f:
        content = await file.read()
        await f.write(content)

    # The client still receives the plain "ocr_<name>" download filename.
    output_filename = f"ocr_{safe_name}"
    output_path = PROCESSED_DIR / f"ocr_{token}_{safe_name}"

    # Run OCR in a worker thread: run_ocr blocks on a subprocess, and calling
    # it directly would stall the event loop for the whole duration.
    result = await asyncio.to_thread(run_ocr, input_path, output_path, language)

    if not result["success"]:
        # Cleanup
        input_path.unlink(missing_ok=True)
        raise HTTPException(status_code=500, detail=result["message"])

    # Check if output was created
    if not output_path.exists():
        raise HTTPException(status_code=500, detail="OCR completed but output file not found")

    # Return the processed file
    return FileResponse(
        path=output_path,
        filename=output_filename,
        media_type="application/pdf",
        headers={
            "X-OCR-Status": result["message"]
        },
    )
|
||||
|
||||
|
||||
@app.post("/ocr/async")
async def ocr_pdf_async(
    file: UploadFile = File(...),
    language: str = "eng",
    background_tasks: BackgroundTasks = None
):
    """
    Upload a PDF for async OCR processing.
    Returns a job ID to check status later.
    """
    import uuid

    # The client controls file.filename: keep only its final path component
    # so the upload cannot be written outside UPLOAD_DIR, and so the name
    # recorded in the status file matches the file actually produced.
    # A missing filename becomes "" and is rejected by the extension check.
    safe_name = Path(file.filename or "").name
    if not safe_name.lower().endswith('.pdf'):
        raise HTTPException(status_code=400, detail="Only PDF files are supported")

    job_id = str(uuid.uuid4())[:8]

    # Save uploaded file with job ID
    input_path = UPLOAD_DIR / f"{job_id}_{safe_name}"
    async with aiofiles.open(input_path, 'wb') as f:
        content = await file.read()
        await f.write(content)

    output_path = PROCESSED_DIR / f"ocr_{job_id}_{safe_name}"

    # Create status file; it holds "processing", "completed:<output name>"
    # or "failed:<message>".
    status_path = PROCESSED_DIR / f"{job_id}.status"
    async with aiofiles.open(status_path, 'w') as f:
        await f.write("processing")

    # Run OCR in background
    def process_ocr():
        result = run_ocr(input_path, output_path, language)
        with open(status_path, 'w') as f:
            if result["success"]:
                f.write(f"completed:{output_path.name}")
            else:
                f.write(f"failed:{result['message']}")

    background_tasks.add_task(process_ocr)

    return {
        "job_id": job_id,
        "status": "processing",
        "check_status": f"/ocr/status/{job_id}",
        "download": f"/ocr/download/{job_id}"
    }
|
||||
|
||||
|
||||
@app.get("/ocr/status/{job_id}")
async def get_ocr_status(job_id: str):
    """Check the status of an async OCR job"""
    status_path = PROCESSED_DIR / f"{job_id}.status"

    if not status_path.exists():
        raise HTTPException(status_code=404, detail="Job not found")

    async with aiofiles.open(status_path, 'r') as f:
        status = await f.read()

    # The status file holds "processing", "completed:<output name>" or
    # "failed:<message>".
    if status == "processing":
        return {"job_id": job_id, "status": "processing"}

    if status.startswith("completed:"):
        # The output name is not needed here: the download route looks the
        # file up again from the job ID.
        return {
            "job_id": job_id,
            "status": "completed",
            "download": f"/ocr/download/{job_id}"
        }

    if status.startswith("failed:"):
        error = status.split(":", 1)[1]
        return {"job_id": job_id, "status": "failed", "error": error}

    return {"job_id": job_id, "status": "unknown"}
|
||||
|
||||
|
||||
@app.get("/ocr/download/{job_id}")
async def download_ocr_result(job_id: str):
    """Download the OCR'd PDF for a completed job"""
    status_path = PROCESSED_DIR / f"{job_id}.status"

    if not status_path.exists():
        raise HTTPException(status_code=404, detail="Job not found")

    async with aiofiles.open(status_path, 'r') as f:
        status = await f.read()

    # A failed job will never complete; say so instead of inviting the
    # client to keep polling.
    if status.startswith("failed:"):
        error = status.split(":", 1)[1]
        raise HTTPException(status_code=400, detail=f"Job failed: {error}")

    if not status.startswith("completed:"):
        raise HTTPException(status_code=400, detail="Job not completed yet")

    # Keep only the final path component of the recorded name so the lookup
    # cannot leave PROCESSED_DIR.
    filename = Path(status.split(":", 1)[1]).name
    output_path = PROCESSED_DIR / filename

    # is_file() also rejects an empty name, which would resolve to the
    # directory itself.
    if not output_path.is_file():
        raise HTTPException(status_code=404, detail="Output file not found")

    return FileResponse(
        path=output_path,
        filename=filename,
        media_type="application/pdf"
    )
|
||||
|
|
@ -0,0 +1,20 @@
|
|||
# Compose definition for the PDF OCR service, published through Traefik.
services:
  pdf-ocr:
    build: .
    container_name: pdf-ocr
    restart: unless-stopped
    volumes:
      # Host directories bind-mounted over the container's working directories
      - ./uploads:/app/uploads
      - ./processed:/app/processed
    labels:
      # Routing labels in mapping form; values stay strings
      traefik.enable: "true"
      traefik.http.routers.pdf-ocr.rule: "Host(`ocr.jeffemmett.com`)"
      traefik.http.routers.pdf-ocr.entrypoints: "web"
      traefik.http.services.pdf-ocr.loadbalancer.server.port: "8000"
      traefik.docker.network: "traefik-public"
    networks:
      - traefik-public

networks:
  traefik-public:
    # Defined outside this file; Compose attaches to it instead of creating it
    external: true
|
||||
Loading…
Reference in New Issue