Initial docling-service: document extraction for AI stack
- FastAPI service using IBM Docling for document extraction
- Supports PDF, DOCX, PPTX, XLSX, HTML, images with OCR
- Integrates with AI Orchestrator (Ollama) for summarization
- Routes audio to RunPod Whisper for transcription
- Optional indexing to Semantic Search service
- Docker + Traefik configuration for RS 8000 deployment
- Python client library included

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
commit 4ed909dbc4
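For orientation before the diff: a minimal sketch of calling the service through the bundled client library (client.py, below). The host is an assumption taken from the Traefik labels in docker-compose.yml; adjust it to wherever the service is actually deployed.

import asyncio

from client import DoclingClient  # shipped in this commit (client.py)

async def main():
    # Assumed deployment host, per the Traefik route in docker-compose.yml
    client = DoclingClient("http://docs.jeffemmett.com")
    result = await client.extract_url(
        "https://example.com/doc.pdf",  # placeholder document URL
        summarize=True,
        summarize_style="bullet_points",
    )
    print(result.get("summary"))

asyncio.run(main())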
.env.example
@@ -0,0 +1,4 @@
RUNPOD_API_KEY=your_runpod_api_key_here
AI_ORCHESTRATOR_URL=http://ai-orchestrator:8080
SEMANTIC_SEARCH_URL=http://semantic-search:8000
RUNPOD_WHISPER_ENDPOINT=lrtisuv8ixbtub
.gitignore
@@ -0,0 +1,12 @@
__pycache__/
*.pyc
.env
.venv/
venv/
*.egg-info/
dist/
build/
.pytest_cache/
.coverage
htmlcov/
*.log
Dockerfile
@@ -0,0 +1,40 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies for Docling and OCR
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1-mesa-glx \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    poppler-utils \
    tesseract-ocr \
    libtesseract-dev \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for layer caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download Docling models at build time (optional, reduces first-run latency)
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true

# Copy application code
COPY server.py .

# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

EXPOSE 8081

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')" || exit 1

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8081"]
client.py
@@ -0,0 +1,207 @@
"""
Docling Service Client - Use this to integrate with other services

Example usage:
    from client import DoclingClient

    client = DoclingClient("http://docs.jeffemmett.com")

    # Extract from URL
    result = await client.extract_url("https://example.com/doc.pdf")

    # Extract with summarization
    result = await client.extract_url(
        "https://example.com/doc.pdf",
        summarize=True,
        summarize_style="bullet_points"
    )

    # Transcribe audio
    result = await client.transcribe_url("https://example.com/audio.mp3")
"""

import httpx
import base64
from pathlib import Path
from typing import Optional, Dict, Any, Literal


OutputFormat = Literal["markdown", "json", "text", "html"]
SummarizeStyle = Literal["concise", "detailed", "bullet_points", "technical", "eli5"]


class DoclingClient:
    """Async client for Docling Service"""

    def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    async def health(self) -> dict:
        """Check service health"""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}/health")
            return resp.json()

    async def stats(self) -> dict:
        """Get processing statistics"""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}/stats")
            return resp.json()

    async def extract_url(
        self,
        url: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a URL"""
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "url": url,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def extract_file(
        self,
        file_path: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a local file"""
        path = Path(file_path)
        content = base64.b64encode(path.read_bytes()).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "base64_content": content,
                    "filename": path.name,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def extract_bytes(
        self,
        content: bytes,
        filename: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from bytes"""
        b64_content = base64.b64encode(content).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "base64_content": b64_content,
                    "filename": filename,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def transcribe_url(
        self,
        url: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from URL"""
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/transcribe",
                json={
                    "url": url,
                    "language": language,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                },
            )
            return resp.json()

    async def transcribe_file(
        self,
        file_path: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from local file"""
        path = Path(file_path)
        content = base64.b64encode(path.read_bytes()).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/transcribe",
                json={
                    "base64_content": content,
                    "language": language,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                },
            )
            return resp.json()

    async def preview_url(self, url: str) -> dict:
        """Quick preview of URL content"""
        async with httpx.AsyncClient(timeout=60) as client:
            resp = await client.post(
                f"{self.base_url}/url/preview",
                json=url,
            )
            return resp.json()


# Sync wrapper for convenience
class DoclingClientSync:
    """Synchronous client wrapper"""

    def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def extract_url(self, url: str, **kwargs) -> dict:
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                f"{self.base_url}/extract",
                json={"url": url, **kwargs},
            )
            return resp.json()

    def transcribe_url(self, url: str, **kwargs) -> dict:
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                f"{self.base_url}/transcribe",
                json={"url": url, **kwargs},
            )
            return resp.json()
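A usage note on the sync wrapper above, as a minimal sketch: extra keyword arguments are passed straight through into the /extract JSON body, so they must match the ExtractRequest fields defined in server.py below.

from client import DoclingClientSync

client = DoclingClientSync("http://localhost:8081")  # constructor default host
result = client.extract_url(
    "https://example.com/report.pdf",  # placeholder URL
    summarize=True,
    summarize_style="concise",
)
print(result.get("content", "")[:200])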
docker-compose.yml
@@ -0,0 +1,58 @@
services:
  docling-service:
    build:
      context: .
      dockerfile: Dockerfile
    image: docling-service:latest
    container_name: docling-service
    restart: unless-stopped
    environment:
      # Connect to AI orchestrator for summarization (Ollama)
      - AI_ORCHESTRATOR_URL=http://ai-orchestrator:8080
      # Connect to semantic search for indexing
      - SEMANTIC_SEARCH_URL=http://semantic-search:8000
      # RunPod for Whisper transcription
      - RUNPOD_API_KEY=${RUNPOD_API_KEY}
      - RUNPOD_WHISPER_ENDPOINT=lrtisuv8ixbtub
    labels:
      # Traefik auto-discovery
      - "traefik.enable=true"
      # HTTP router
      - "traefik.http.routers.docling.rule=Host(`docs.jeffemmett.com`)"
      - "traefik.http.routers.docling.entrypoints=web"
      - "traefik.http.services.docling.loadbalancer.server.port=8081"
      # HTTPS router
      - "traefik.http.routers.docling-secure.rule=Host(`docs.jeffemmett.com`)"
      - "traefik.http.routers.docling-secure.entrypoints=websecure"
      - "traefik.http.routers.docling-secure.tls=true"
      # Health check for Traefik
      - "traefik.http.services.docling.loadbalancer.healthcheck.path=/health"
      - "traefik.http.services.docling.loadbalancer.healthcheck.interval=30s"
    networks:
      - traefik-public
      - ai-internal
    volumes:
      # Cache for Docling models (persists across restarts)
      - docling-cache:/home/appuser/.cache
    deploy:
      resources:
        limits:
          memory: 8G
        reservations:
          memory: 2G
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

volumes:
  docling-cache:
    driver: local

networks:
  traefik-public:
    external: true
  ai-internal:
    external: true
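Both networks in the compose file above are marked external, so they must exist before the stack is brought up. A hedged sketch of pre-creating them with the docker SDK for Python (plain `docker network create` on the host does the same thing; the SDK is host-side tooling, not part of this commit):

import docker  # pip install docker; an assumption, not in requirements.txt

client = docker.from_env()
for name in ("traefik-public", "ai-internal"):
    # Create each network only if it does not already exist
    if not client.networks.list(names=[name]):
        client.networks.create(name, driver="bridge")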
requirements.txt
@@ -0,0 +1,15 @@
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
httpx>=0.26.0
pydantic>=2.0.0
python-multipart>=0.0.6

# Docling and dependencies
docling>=2.0.0
docling-core>=2.0.0

# OCR support (optional, for enhanced PDF/image processing)
easyocr>=1.7.0

# For audio file handling
pydub>=0.25.1
server.py
@@ -0,0 +1,618 @@
"""
Docling Service - Document extraction and processing for the AI stack

Integrates with:
- AI Orchestrator (Ollama) for summarization
- RunPod Whisper for audio transcription
- Semantic Search for indexing extracted content
"""

import os
import asyncio
import tempfile
import base64
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Dict, Any, Literal
from enum import Enum

import httpx
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel, HttpUrl

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import PdfFormatOption

# Config from environment
AI_ORCHESTRATOR_URL = os.getenv("AI_ORCHESTRATOR_URL", "http://ai-orchestrator:8080")
SEMANTIC_SEARCH_URL = os.getenv("SEMANTIC_SEARCH_URL", "http://semantic-search:8000")
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY", "")
RUNPOD_WHISPER_ENDPOINT = os.getenv("RUNPOD_WHISPER_ENDPOINT", "lrtisuv8ixbtub")

# Supported formats
DOCUMENT_FORMATS = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".md", ".txt", ".epub"}
IMAGE_FORMATS = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"}
AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"}

app = FastAPI(
    title="Docling Service",
    description="Document extraction and processing service using Docling",
    version="1.0.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize document converter with optimized settings
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Track processing stats
stats = {
    "documents_processed": 0,
    "pages_extracted": 0,
    "audio_transcribed": 0,
    "urls_fetched": 0,
    "errors": 0,
}


class OutputFormat(str, Enum):
    MARKDOWN = "markdown"
    JSON = "json"
    TEXT = "text"
    HTML = "html"


class SummarizeStyle(str, Enum):
    CONCISE = "concise"
    DETAILED = "detailed"
    BULLET_POINTS = "bullet_points"
    TECHNICAL = "technical"
    ELI5 = "eli5"  # Explain like I'm 5


class ExtractRequest(BaseModel):
    """Request to extract content from a URL or base64-encoded file"""
    url: Optional[HttpUrl] = None
    base64_content: Optional[str] = None
    filename: Optional[str] = None
    output_format: OutputFormat = OutputFormat.MARKDOWN
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE
    index_to_search: bool = False
    metadata: Optional[Dict[str, Any]] = None


class TranscribeRequest(BaseModel):
    """Request to transcribe audio"""
    url: Optional[HttpUrl] = None
    base64_content: Optional[str] = None
    language: Optional[str] = None  # Auto-detect if not specified
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE


class BatchExtractRequest(BaseModel):
    """Batch extraction request"""
    items: List[ExtractRequest]


class ExtractionResult(BaseModel):
    """Result of document extraction"""
    success: bool
    source: str
    content: Optional[str] = None
    format: OutputFormat
    metadata: Dict[str, Any] = {}
    summary: Optional[str] = None
    indexed: bool = False
    error: Optional[str] = None


# ============== Helper Functions ==============

def get_file_extension(filename: str) -> str:
    """Get lowercase file extension"""
    return Path(filename).suffix.lower()


def generate_doc_id(source: str, content: str) -> str:
    """Generate a unique document ID"""
    hash_input = f"{source}:{content[:1000]}"
    return hashlib.sha256(hash_input.encode()).hexdigest()[:16]


async def fetch_url_content(url: str) -> tuple[bytes, str]:
    """Fetch content from URL, return bytes and detected filename"""
    async with httpx.AsyncClient(follow_redirects=True, timeout=60) as client:
        resp = await client.get(url)
        resp.raise_for_status()

        # Try to get filename from headers or URL
        content_disposition = resp.headers.get("content-disposition", "")
        if "filename=" in content_disposition:
            filename = content_disposition.split("filename=")[1].strip('"\'')
        else:
            filename = url.split("/")[-1].split("?")[0] or "document"

        return resp.content, filename


async def transcribe_audio_runpod(audio_data: bytes, language: Optional[str] = None) -> dict:
    """Transcribe audio using RunPod Whisper endpoint"""
    if not RUNPOD_API_KEY:
        raise HTTPException(status_code=500, detail="RunPod API key not configured")

    # Convert audio to base64
    audio_base64 = base64.b64encode(audio_data).decode()

    payload = {
        "input": {
            "audio_base64": audio_base64,
        }
    }
    if language:
        payload["input"]["language"] = language

    async with httpx.AsyncClient(timeout=300) as client:
        # Submit job
        resp = await client.post(
            f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}/run",
            headers={
                "Authorization": f"Bearer {RUNPOD_API_KEY}",
                "Content-Type": "application/json",
            },
            json=payload,
        )
        result = resp.json()

        if "error" in result:
            raise HTTPException(status_code=500, detail=f"RunPod error: {result['error']}")

        job_id = result.get("id")
        if not job_id:
            raise HTTPException(status_code=500, detail="No job ID returned from RunPod")

        # Poll for completion
        for _ in range(120):  # Max 10 minutes
            await asyncio.sleep(5)
            status_resp = await client.get(
                f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}/status/{job_id}",
                headers={"Authorization": f"Bearer {RUNPOD_API_KEY}"},
            )
            status_data = status_resp.json()

            if status_data.get("status") == "COMPLETED":
                return status_data.get("output", {})
            elif status_data.get("status") in ["FAILED", "CANCELLED"]:
                raise HTTPException(
                    status_code=500,
                    detail=f"Transcription failed: {status_data.get('error', 'Unknown error')}"
                )

        raise HTTPException(status_code=504, detail="Transcription timed out")


async def summarize_with_ollama(content: str, style: SummarizeStyle) -> str:
    """Summarize content using AI Orchestrator (Ollama)"""
    style_prompts = {
        SummarizeStyle.CONCISE: "Provide a concise 2-3 sentence summary of the following content:",
        SummarizeStyle.DETAILED: "Provide a detailed summary of the following content, covering all main points:",
        SummarizeStyle.BULLET_POINTS: "Summarize the following content as bullet points:",
        SummarizeStyle.TECHNICAL: "Provide a technical summary of the following content, focusing on key technical details:",
        SummarizeStyle.ELI5: "Explain the following content in simple terms that a child could understand:",
    }

    prompt = f"{style_prompts[style]}\n\n{content[:8000]}"  # Limit content for context window

    async with httpx.AsyncClient(timeout=120) as client:
        try:
            resp = await client.post(
                f"{AI_ORCHESTRATOR_URL}/api/generate/text",
                json={
                    "prompt": prompt,
                    "model": "llama3.2",
                    "max_tokens": 1024,
                    "priority": "low",  # Use free Ollama
                },
            )
            result = resp.json()
            return result.get("response", "")
        except Exception as e:
            return f"[Summarization failed: {str(e)}]"


async def index_to_semantic_search(
    doc_id: str,
    content: str,
    source: str,
    metadata: Dict[str, Any],
) -> bool:
    """Index document to semantic search service"""
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            resp = await client.post(
                f"{SEMANTIC_SEARCH_URL}/index",
                json={
                    "id": doc_id,
                    "content": content,
                    "metadata": {
                        "source": source,
                        "indexed_at": datetime.now().isoformat(),
                        **metadata,
                    },
                },
            )
            return resp.status_code == 200
        except Exception:
            return False


def extract_with_docling(file_path: Path, output_format: OutputFormat) -> tuple[str, dict]:
    """Extract content from document using Docling"""
    result = converter.convert(str(file_path))
    doc = result.document

    # Get metadata
    metadata = {
        "pages": len(doc.pages) if hasattr(doc, "pages") else 0,
        "tables": len(doc.tables) if hasattr(doc, "tables") else 0,
        "figures": len(doc.pictures) if hasattr(doc, "pictures") else 0,
    }

    # Export in requested format
    if output_format == OutputFormat.MARKDOWN:
        content = doc.export_to_markdown()
    elif output_format == OutputFormat.JSON:
        content = doc.export_to_dict()
    elif output_format == OutputFormat.HTML:
        content = doc.export_to_html()
    else:  # TEXT
        content = doc.export_to_markdown()  # Markdown is readable as plain text

    return content if isinstance(content, str) else str(content), metadata


# ============== API Endpoints ==============

@app.get("/")
async def root():
    """Service info and health check"""
    return {
        "service": "Docling Service",
        "version": "1.0.0",
        "status": "healthy",
        "supported_formats": {
            "documents": list(DOCUMENT_FORMATS),
            "images": list(IMAGE_FORMATS),
            "audio": list(AUDIO_FORMATS),
        },
        "integrations": {
            "ai_orchestrator": AI_ORCHESTRATOR_URL,
            "semantic_search": SEMANTIC_SEARCH_URL,
            "runpod_whisper": f"endpoint:{RUNPOD_WHISPER_ENDPOINT}",
        },
    }


@app.get("/health")
async def health():
    """Health check endpoint for Traefik"""
    return {"status": "healthy", "timestamp": datetime.now().isoformat()}


@app.get("/stats")
async def get_stats():
    """Get processing statistics"""
    return stats


@app.post("/extract", response_model=ExtractionResult)
async def extract_document(request: ExtractRequest):
    """
    Extract content from a document (URL or base64).

    Supports: PDF, DOCX, PPTX, XLSX, HTML, MD, TXT, EPUB, images (with OCR)
    """
    try:
        # Get content
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            filename = request.filename or "document"
            source = f"base64:{filename}"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        ext = get_file_extension(filename)

        # Handle audio separately
        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe endpoint for audio files"
            )

        # Write to temp file for Docling
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            # Extract content
            content, metadata = extract_with_docling(tmp_path, request.output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            # Summarize if requested
            summary = None
            if request.summarize:
                summary = await summarize_with_ollama(content, request.summarize_style)

            # Index if requested
            indexed = False
            if request.index_to_search:
                doc_id = generate_doc_id(source, content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=source,
                    metadata={**metadata, **(request.metadata or {})},
                )

            return ExtractionResult(
                success=True,
                source=source,
                content=content,
                format=request.output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=str(request.url or request.filename or "unknown"),
            format=request.output_format,
            error=str(e),
        )


@app.post("/extract/upload", response_model=ExtractionResult)
async def extract_uploaded_file(
    file: UploadFile = File(...),
    output_format: OutputFormat = Form(OutputFormat.MARKDOWN),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
    index_to_search: bool = Form(False),
):
    """Extract content from an uploaded file"""
    try:
        content_bytes = await file.read()
        ext = get_file_extension(file.filename or "document")

        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe/upload endpoint for audio files"
            )

        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            content, metadata = extract_with_docling(tmp_path, output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            summary = None
            if summarize:
                summary = await summarize_with_ollama(content, summarize_style)

            indexed = False
            if index_to_search:
                doc_id = generate_doc_id(file.filename or "upload", content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=f"upload:{file.filename}",
                    metadata=metadata,
                )

            return ExtractionResult(
                success=True,
                source=f"upload:{file.filename}",
                content=content,
                format=output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=f"upload:{file.filename}",
            format=output_format,
            error=str(e),
        )


@app.post("/transcribe")
async def transcribe_audio(request: TranscribeRequest):
    """
    Transcribe audio using RunPod Whisper.

    Supports: MP3, WAV, M4A, OGG, FLAC, WEBM
    """
    try:
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            source = "base64:audio"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        # Transcribe
        result = await transcribe_audio_runpod(content_bytes, request.language)
        stats["audio_transcribed"] += 1

        transcript = result.get("transcription", result.get("text", ""))

        # Summarize if requested
        summary = None
        if request.summarize and transcript:
            summary = await summarize_with_ollama(transcript, request.summarize_style)

        return {
            "success": True,
            "source": source,
            "transcript": transcript,
            "language": result.get("detected_language"),
            "duration": result.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return {
            "success": False,
            "source": str(request.url or "base64"),
            "error": str(e),
        }


@app.post("/transcribe/upload")
async def transcribe_uploaded_audio(
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
):
    """Transcribe uploaded audio file"""
    try:
        content_bytes = await file.read()

        result = await transcribe_audio_runpod(content_bytes, language)
        stats["audio_transcribed"] += 1

        transcript = result.get("transcription", result.get("text", ""))

        summary = None
        if summarize and transcript:
            summary = await summarize_with_ollama(transcript, summarize_style)

        return {
            "success": True,
            "source": f"upload:{file.filename}",
            "transcript": transcript,
            "language": result.get("detected_language"),
            "duration": result.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return {
            "success": False,
            "source": f"upload:{file.filename}",
            "error": str(e),
        }


@app.post("/batch")
async def batch_extract(request: BatchExtractRequest, background_tasks: BackgroundTasks):
    """
    Batch extract multiple documents.
    Currently processes synchronously and returns all results with a job ID;
    background/queued processing can be added later.
    """
    job_id = hashlib.sha256(str(datetime.now()).encode()).hexdigest()[:16]

    # For now, process synchronously (can be enhanced with Redis queue later)
    results = []
    for item in request.items:
        result = await extract_document(item)
        results.append(result)

    return {
        "job_id": job_id,
        "total": len(request.items),
        "results": results,
    }


@app.post("/url/preview")
async def preview_url(url: HttpUrl):
    """Quick preview of URL content (first 500 chars of markdown)"""
    try:
        content_bytes, filename = await fetch_url_content(str(url))
        stats["urls_fetched"] += 1

        ext = get_file_extension(filename)

        with tempfile.NamedTemporaryFile(suffix=ext or ".html", delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            content, metadata = extract_with_docling(tmp_path, OutputFormat.MARKDOWN)
            return {
                "success": True,
                "url": str(url),
                "preview": content[:500] + ("..." if len(content) > 500 else ""),
                "full_length": len(content),
                "metadata": metadata,
            }
        finally:
            tmp_path.unlink(missing_ok=True)

    except Exception as e:
        return {
            "success": False,
            "url": str(url),
            "error": str(e),
        }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8081)
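The upload endpoints above are not wrapped by client.py. A minimal sketch of calling /extract/upload directly with httpx (the form field names mirror the Form/File parameters above; the localhost URL assumes the default uvicorn port, and report.pdf is a placeholder):

import httpx

with open("report.pdf", "rb") as f:
    resp = httpx.post(
        "http://localhost:8081/extract/upload",
        files={"file": ("report.pdf", f, "application/pdf")},
        data={"output_format": "markdown", "summarize": "true"},
        timeout=300,
    )

print(resp.json()["metadata"])  # pages/tables/figures counts from ExtractionResult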