Initial docling-service: document extraction for AI stack
- FastAPI service using IBM Docling for document extraction
- Supports PDF, DOCX, PPTX, XLSX, HTML, images with OCR
- Integrates with AI Orchestrator (Ollama) for summarization
- Routes audio to RunPod Whisper for transcription
- Optional indexing to Semantic Search service
- Docker + Traefik configuration for RS 8000 deployment
- Python client library included

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 4ed909dbc4
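For orientation before the diff: a minimal sketch of calling the service through the bundled client library (client.py, below). The host is an assumption taken from the Traefik labels in docker-compose.yml; adjust it to wherever the service is actually deployed.

import asyncio

from client import DoclingClient  # shipped in this commit (client.py)

async def main():
    # Assumed deployment host, per the Traefik route in docker-compose.yml
    client = DoclingClient("http://docs.jeffemmett.com")
    result = await client.extract_url(
        "https://example.com/doc.pdf",  # placeholder document URL
        summarize=True,
        summarize_style="bullet_points",
    )
    print(result.get("summary"))

asyncio.run(main())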
.env.example
@@ -0,0 +1,4 @@
RUNPOD_API_KEY=your_runpod_api_key_here
AI_ORCHESTRATOR_URL=http://ai-orchestrator:8080
SEMANTIC_SEARCH_URL=http://semantic-search:8000
RUNPOD_WHISPER_ENDPOINT=lrtisuv8ixbtub
.gitignore
@@ -0,0 +1,12 @@
__pycache__/
*.pyc
.env
.venv/
venv/
*.egg-info/
dist/
build/
.pytest_cache/
.coverage
htmlcov/
*.log
Dockerfile
@@ -0,0 +1,40 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies for Docling and OCR
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    poppler-utils \
    tesseract-ocr \
    libtesseract-dev \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for layer caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download Docling models at build time (optional, reduces first-run latency)
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true

# Copy application code
COPY server.py .

# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

EXPOSE 8081

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')" || exit 1

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8081"]
client.py
@@ -0,0 +1,207 @@
"""
Docling Service Client - Use this to integrate with other services

Example usage:
    from client import DoclingClient

    client = DoclingClient("http://docs.jeffemmett.com")

    # Extract from URL
    result = await client.extract_url("https://example.com/doc.pdf")

    # Extract with summarization
    result = await client.extract_url(
        "https://example.com/doc.pdf",
        summarize=True,
        summarize_style="bullet_points"
    )

    # Transcribe audio
    result = await client.transcribe_url("https://example.com/audio.mp3")
"""

import httpx
import base64
from pathlib import Path
from typing import Optional, Dict, Any, Literal


OutputFormat = Literal["markdown", "json", "text", "html"]
SummarizeStyle = Literal["concise", "detailed", "bullet_points", "technical", "eli5"]


class DoclingClient:
    """Async client for Docling Service"""

    def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    async def health(self) -> dict:
        """Check service health"""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}/health")
            return resp.json()

    async def stats(self) -> dict:
        """Get processing statistics"""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}/stats")
            return resp.json()

    async def extract_url(
        self,
        url: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a URL"""
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "url": url,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def extract_file(
        self,
        file_path: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a local file"""
        path = Path(file_path)
        content = base64.b64encode(path.read_bytes()).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "base64_content": content,
                    "filename": path.name,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def extract_bytes(
        self,
        content: bytes,
        filename: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from bytes"""
        b64_content = base64.b64encode(content).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "base64_content": b64_content,
                    "filename": filename,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def transcribe_url(
        self,
        url: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from URL"""
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/transcribe",
                json={
                    "url": url,
                    "language": language,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                },
            )
            return resp.json()

    async def transcribe_file(
        self,
        file_path: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from local file"""
        path = Path(file_path)
        content = base64.b64encode(path.read_bytes()).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/transcribe",
                json={
                    "base64_content": content,
                    "language": language,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                },
            )
            return resp.json()

    async def preview_url(self, url: str) -> dict:
        """Quick preview of URL content"""
        async with httpx.AsyncClient(timeout=60) as client:
            resp = await client.post(
                f"{self.base_url}/url/preview",
                json=url,
            )
            return resp.json()


# Sync wrapper for convenience
class DoclingClientSync:
    """Synchronous client wrapper"""

    def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def extract_url(self, url: str, **kwargs) -> dict:
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                f"{self.base_url}/extract",
                json={"url": url, **kwargs},
            )
            return resp.json()

    def transcribe_url(self, url: str, **kwargs) -> dict:
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                f"{self.base_url}/transcribe",
                json={"url": url, **kwargs},
            )
            return resp.json()
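A usage note on the sync wrapper above, as a minimal sketch: extra keyword arguments are passed straight through into the /extract JSON body, so they must match the ExtractRequest fields defined in server.py below.

from client import DoclingClientSync

client = DoclingClientSync("http://localhost:8081")  # constructor default host
result = client.extract_url(
    "https://example.com/report.pdf",  # placeholder URL
    summarize=True,
    summarize_style="concise",
)
print(result.get("content", "")[:200])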
docker-compose.yml
@@ -0,0 +1,58 @@
services:
  docling-service:
    build:
      context: .
      dockerfile: Dockerfile
    image: docling-service:latest
    container_name: docling-service
    restart: unless-stopped
    environment:
      # Connect to AI orchestrator for summarization (Ollama)
      - AI_ORCHESTRATOR_URL=http://ai-orchestrator:8080
      # Connect to semantic search for indexing
      - SEMANTIC_SEARCH_URL=http://semantic-search:8000
      # RunPod for Whisper transcription
      - RUNPOD_API_KEY=${RUNPOD_API_KEY}
      - RUNPOD_WHISPER_ENDPOINT=lrtisuv8ixbtub
    labels:
      # Traefik auto-discovery
      - "traefik.enable=true"
      # HTTP router
      - "traefik.http.routers.docling.rule=Host(`docs.jeffemmett.com`)"
      - "traefik.http.routers.docling.entrypoints=web"
      - "traefik.http.services.docling.loadbalancer.server.port=8081"
      # HTTPS router
      - "traefik.http.routers.docling-secure.rule=Host(`docs.jeffemmett.com`)"
      - "traefik.http.routers.docling-secure.entrypoints=websecure"
      - "traefik.http.routers.docling-secure.tls=true"
      # Health check for Traefik
      - "traefik.http.services.docling.loadbalancer.healthcheck.path=/health"
      - "traefik.http.services.docling.loadbalancer.healthcheck.interval=30s"
    networks:
      - traefik-public
      - ai-internal
    volumes:
      # Cache for Docling models (persists across restarts)
      - docling-cache:/home/appuser/.cache
    deploy:
      resources:
        limits:
          memory: 8G
        reservations:
          memory: 2G
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

volumes:
  docling-cache:
    driver: local

networks:
  traefik-public:
    external: true
  ai-internal:
    external: true
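Both networks in the compose file above are marked external, so they must exist before the stack is brought up. A hedged sketch of pre-creating them with the docker SDK for Python (plain `docker network create` on the host does the same thing; the SDK is host-side tooling, not part of this commit):

import docker  # pip install docker; an assumption, not in requirements.txt

client = docker.from_env()
for name in ("traefik-public", "ai-internal"):
    # Create each network only if it does not already exist
    if not client.networks.list(names=[name]):
        client.networks.create(name, driver="bridge")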
requirements.txt
@@ -0,0 +1,15 @@
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
httpx>=0.26.0
pydantic>=2.0.0
python-multipart>=0.0.6

# Docling and dependencies
docling>=2.0.0
docling-core>=2.0.0

# OCR support (optional, for enhanced PDF/image processing)
easyocr>=1.7.0

# For audio file handling
pydub>=0.25.1
server.py
@@ -0,0 +1,618 @@
"""
Docling Service - Document extraction and processing for the AI stack

Integrates with:
- AI Orchestrator (Ollama) for summarization
- RunPod Whisper for audio transcription
- Semantic Search for indexing extracted content
"""

import os
import asyncio
import tempfile
import base64
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Dict, Any, Literal
from enum import Enum

import httpx
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel, HttpUrl

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import PdfFormatOption

# Config from environment
AI_ORCHESTRATOR_URL = os.getenv("AI_ORCHESTRATOR_URL", "http://ai-orchestrator:8080")
SEMANTIC_SEARCH_URL = os.getenv("SEMANTIC_SEARCH_URL", "http://semantic-search:8000")
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY", "")
RUNPOD_WHISPER_ENDPOINT = os.getenv("RUNPOD_WHISPER_ENDPOINT", "lrtisuv8ixbtub")

# Supported formats
DOCUMENT_FORMATS = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".md", ".txt", ".epub"}
IMAGE_FORMATS = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"}
AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"}

app = FastAPI(
    title="Docling Service",
    description="Document extraction and processing service using Docling",
    version="1.0.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize document converter with optimized settings
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Track processing stats
stats = {
    "documents_processed": 0,
    "pages_extracted": 0,
    "audio_transcribed": 0,
    "urls_fetched": 0,
    "errors": 0,
}


class OutputFormat(str, Enum):
    MARKDOWN = "markdown"
    JSON = "json"
    TEXT = "text"
    HTML = "html"


class SummarizeStyle(str, Enum):
    CONCISE = "concise"
    DETAILED = "detailed"
    BULLET_POINTS = "bullet_points"
    TECHNICAL = "technical"
    ELI5 = "eli5"  # Explain like I'm 5


class ExtractRequest(BaseModel):
    """Request to extract content from a URL or base64-encoded file"""
    url: Optional[HttpUrl] = None
    base64_content: Optional[str] = None
    filename: Optional[str] = None
    output_format: OutputFormat = OutputFormat.MARKDOWN
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE
    index_to_search: bool = False
    metadata: Optional[Dict[str, Any]] = None


class TranscribeRequest(BaseModel):
    """Request to transcribe audio"""
    url: Optional[HttpUrl] = None
    base64_content: Optional[str] = None
    language: Optional[str] = None  # Auto-detect if not specified
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE


class BatchExtractRequest(BaseModel):
    """Batch extraction request"""
    items: List[ExtractRequest]


class ExtractionResult(BaseModel):
    """Result of document extraction"""
    success: bool
    source: str
    content: Optional[str] = None
    format: OutputFormat
    metadata: Dict[str, Any] = {}
    summary: Optional[str] = None
    indexed: bool = False
    error: Optional[str] = None


# ============== Helper Functions ==============

def get_file_extension(filename: str) -> str:
    """Get lowercase file extension"""
    return Path(filename).suffix.lower()


def generate_doc_id(source: str, content: str) -> str:
    """Generate a unique document ID"""
    hash_input = f"{source}:{content[:1000]}"
    return hashlib.sha256(hash_input.encode()).hexdigest()[:16]


async def fetch_url_content(url: str) -> tuple[bytes, str]:
    """Fetch content from URL, return bytes and detected filename"""
    async with httpx.AsyncClient(follow_redirects=True, timeout=60) as client:
        resp = await client.get(url)
        resp.raise_for_status()

        # Try to get filename from headers or URL
        content_disposition = resp.headers.get("content-disposition", "")
        if "filename=" in content_disposition:
            filename = content_disposition.split("filename=")[1].strip('"\'')
        else:
            filename = url.split("/")[-1].split("?")[0] or "document"

        return resp.content, filename


async def transcribe_audio_runpod(audio_data: bytes, language: Optional[str] = None) -> dict:
    """Transcribe audio using RunPod Whisper endpoint"""
    if not RUNPOD_API_KEY:
        raise HTTPException(status_code=500, detail="RunPod API key not configured")

    # Convert audio to base64
    audio_base64 = base64.b64encode(audio_data).decode()

    payload = {
        "input": {
            "audio_base64": audio_base64,
        }
    }
    if language:
        payload["input"]["language"] = language

    async with httpx.AsyncClient(timeout=300) as client:
        # Submit job
        resp = await client.post(
            f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}/run",
            headers={
                "Authorization": f"Bearer {RUNPOD_API_KEY}",
                "Content-Type": "application/json",
            },
            json=payload,
        )
        result = resp.json()

        if "error" in result:
            raise HTTPException(status_code=500, detail=f"RunPod error: {result['error']}")

        job_id = result.get("id")
        if not job_id:
            raise HTTPException(status_code=500, detail="No job ID returned from RunPod")

        # Poll for completion
        for _ in range(120):  # Max 10 minutes
            await asyncio.sleep(5)
            status_resp = await client.get(
                f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}/status/{job_id}",
                headers={"Authorization": f"Bearer {RUNPOD_API_KEY}"},
            )
            status_data = status_resp.json()

            if status_data.get("status") == "COMPLETED":
                return status_data.get("output", {})
            elif status_data.get("status") in ["FAILED", "CANCELLED"]:
                raise HTTPException(
                    status_code=500,
                    detail=f"Transcription failed: {status_data.get('error', 'Unknown error')}"
                )

        raise HTTPException(status_code=504, detail="Transcription timed out")


async def summarize_with_ollama(content: str, style: SummarizeStyle) -> str:
    """Summarize content using AI Orchestrator (Ollama)"""
    style_prompts = {
        SummarizeStyle.CONCISE: "Provide a concise 2-3 sentence summary of the following content:",
        SummarizeStyle.DETAILED: "Provide a detailed summary of the following content, covering all main points:",
        SummarizeStyle.BULLET_POINTS: "Summarize the following content as bullet points:",
        SummarizeStyle.TECHNICAL: "Provide a technical summary of the following content, focusing on key technical details:",
        SummarizeStyle.ELI5: "Explain the following content in simple terms that a child could understand:",
    }

    prompt = f"{style_prompts[style]}\n\n{content[:8000]}"  # Limit content for context window

    async with httpx.AsyncClient(timeout=120) as client:
        try:
            resp = await client.post(
                f"{AI_ORCHESTRATOR_URL}/api/generate/text",
                json={
                    "prompt": prompt,
                    "model": "llama3.2",
                    "max_tokens": 1024,
                    "priority": "low",  # Use free Ollama
                },
            )
            result = resp.json()
            return result.get("response", "")
        except Exception as e:
            return f"[Summarization failed: {str(e)}]"


async def index_to_semantic_search(
    doc_id: str,
    content: str,
    source: str,
    metadata: Dict[str, Any],
) -> bool:
    """Index document to semantic search service"""
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            resp = await client.post(
                f"{SEMANTIC_SEARCH_URL}/index",
                json={
                    "id": doc_id,
                    "content": content,
                    "metadata": {
                        "source": source,
                        "indexed_at": datetime.now().isoformat(),
                        **metadata,
                    },
                },
            )
            return resp.status_code == 200
        except Exception:
            return False


def extract_with_docling(file_path: Path, output_format: OutputFormat) -> tuple[str, dict]:
    """Extract content from document using Docling"""
    result = converter.convert(str(file_path))
    doc = result.document

    # Get metadata
    metadata = {
        "pages": len(doc.pages) if hasattr(doc, "pages") else 0,
        "tables": len(doc.tables) if hasattr(doc, "tables") else 0,
        "figures": len(doc.pictures) if hasattr(doc, "pictures") else 0,
    }

    # Export in requested format
    if output_format == OutputFormat.MARKDOWN:
        content = doc.export_to_markdown()
    elif output_format == OutputFormat.JSON:
        content = doc.export_to_dict()
    elif output_format == OutputFormat.HTML:
        content = doc.export_to_html()
    else:  # TEXT
        content = doc.export_to_markdown()  # Markdown is readable as plain text

    return content if isinstance(content, str) else str(content), metadata


# ============== API Endpoints ==============

@app.get("/")
async def root():
    """Service info and health check"""
    return {
        "service": "Docling Service",
        "version": "1.0.0",
        "status": "healthy",
        "supported_formats": {
            "documents": list(DOCUMENT_FORMATS),
            "images": list(IMAGE_FORMATS),
            "audio": list(AUDIO_FORMATS),
        },
        "integrations": {
            "ai_orchestrator": AI_ORCHESTRATOR_URL,
            "semantic_search": SEMANTIC_SEARCH_URL,
            "runpod_whisper": f"endpoint:{RUNPOD_WHISPER_ENDPOINT}",
        },
    }


@app.get("/health")
async def health():
    """Health check endpoint for Traefik"""
    return {"status": "healthy", "timestamp": datetime.now().isoformat()}


@app.get("/stats")
async def get_stats():
    """Get processing statistics"""
    return stats


@app.post("/extract", response_model=ExtractionResult)
async def extract_document(request: ExtractRequest):
    """
    Extract content from a document (URL or base64).

    Supports: PDF, DOCX, PPTX, XLSX, HTML, MD, TXT, EPUB, images (with OCR)
    """
    try:
        # Get content
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            filename = request.filename or "document"
            source = f"base64:{filename}"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        ext = get_file_extension(filename)

        # Handle audio separately
        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe endpoint for audio files"
            )

        # Write to temp file for Docling
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            # Extract content
            content, metadata = extract_with_docling(tmp_path, request.output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            # Summarize if requested
            summary = None
            if request.summarize:
                summary = await summarize_with_ollama(content, request.summarize_style)

            # Index if requested
            indexed = False
            if request.index_to_search:
                doc_id = generate_doc_id(source, content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=source,
                    metadata={**metadata, **(request.metadata or {})},
                )

            return ExtractionResult(
                success=True,
                source=source,
                content=content,
                format=request.output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=str(request.url or request.filename or "unknown"),
            format=request.output_format,
            error=str(e),
        )


@app.post("/extract/upload", response_model=ExtractionResult)
async def extract_uploaded_file(
    file: UploadFile = File(...),
    output_format: OutputFormat = Form(OutputFormat.MARKDOWN),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
    index_to_search: bool = Form(False),
):
    """Extract content from an uploaded file"""
    try:
        content_bytes = await file.read()
        ext = get_file_extension(file.filename or "document")

        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe/upload endpoint for audio files"
            )

        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            content, metadata = extract_with_docling(tmp_path, output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            summary = None
            if summarize:
                summary = await summarize_with_ollama(content, summarize_style)

            indexed = False
            if index_to_search:
                doc_id = generate_doc_id(file.filename or "upload", content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=f"upload:{file.filename}",
                    metadata=metadata,
                )

            return ExtractionResult(
                success=True,
                source=f"upload:{file.filename}",
                content=content,
                format=output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=f"upload:{file.filename}",
            format=output_format,
            error=str(e),
        )


@app.post("/transcribe")
async def transcribe_audio(request: TranscribeRequest):
    """
    Transcribe audio using RunPod Whisper.

    Supports: MP3, WAV, M4A, OGG, FLAC, WEBM
    """
    try:
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            source = "base64:audio"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        # Transcribe
        result = await transcribe_audio_runpod(content_bytes, request.language)
        stats["audio_transcribed"] += 1

        transcript = result.get("transcription", result.get("text", ""))

        # Summarize if requested
        summary = None
        if request.summarize and transcript:
            summary = await summarize_with_ollama(transcript, request.summarize_style)

        return {
            "success": True,
            "source": source,
            "transcript": transcript,
            "language": result.get("detected_language"),
            "duration": result.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return {
            "success": False,
            "source": str(request.url or "base64"),
            "error": str(e),
        }


@app.post("/transcribe/upload")
async def transcribe_uploaded_audio(
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
):
    """Transcribe uploaded audio file"""
    try:
        content_bytes = await file.read()

        result = await transcribe_audio_runpod(content_bytes, language)
        stats["audio_transcribed"] += 1

        transcript = result.get("transcription", result.get("text", ""))

        summary = None
        if summarize and transcript:
            summary = await summarize_with_ollama(transcript, summarize_style)

        return {
            "success": True,
            "source": f"upload:{file.filename}",
            "transcript": transcript,
            "language": result.get("detected_language"),
            "duration": result.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return {
            "success": False,
            "source": f"upload:{file.filename}",
            "error": str(e),
        }


@app.post("/batch")
async def batch_extract(request: BatchExtractRequest, background_tasks: BackgroundTasks):
    """
    Batch extract multiple documents.
    Currently processes synchronously and returns all results with a job ID;
    background/queued processing can be added later.
    """
    job_id = hashlib.sha256(str(datetime.now()).encode()).hexdigest()[:16]

    # For now, process synchronously (can be enhanced with Redis queue later)
    results = []
    for item in request.items:
        result = await extract_document(item)
        results.append(result)

    return {
        "job_id": job_id,
        "total": len(request.items),
        "results": results,
    }


@app.post("/url/preview")
async def preview_url(url: HttpUrl):
    """Quick preview of URL content (first 500 chars of markdown)"""
    try:
        content_bytes, filename = await fetch_url_content(str(url))
        stats["urls_fetched"] += 1

        ext = get_file_extension(filename)

        with tempfile.NamedTemporaryFile(suffix=ext or ".html", delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            content, metadata = extract_with_docling(tmp_path, OutputFormat.MARKDOWN)
            return {
                "success": True,
                "url": str(url),
                "preview": content[:500] + ("..." if len(content) > 500 else ""),
                "full_length": len(content),
                "metadata": metadata,
            }
        finally:
            tmp_path.unlink(missing_ok=True)

    except Exception as e:
        return {
            "success": False,
            "url": str(url),
            "error": str(e),
        }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8081)
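The upload endpoints above are not wrapped by client.py. A minimal sketch of calling /extract/upload directly with httpx (the form field names mirror the Form/File parameters above; the localhost URL assumes the default uvicorn port, and report.pdf is a placeholder):

import httpx

with open("report.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:8081/extract/upload",
        files={"file": ("report.pdf", f, "application/pdf")},
        data={"output_format": "markdown", "summarize": "true"},
        timeout=300,
    )

print(resp.json()["metadata"])  # pages/tables/figures counts from ExtractionResult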