Initial docling-service: document extraction for AI stack
- FastAPI service using IBM Docling for document extraction
- Supports PDF, DOCX, PPTX, XLSX, HTML, and images with OCR
- Integrates with AI Orchestrator (Ollama) for summarization
- Routes audio to RunPod Whisper for transcription
- Optional indexing to Semantic Search service
- Docker + Traefik configuration for RS 8000 deployment
- Python client library included

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
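A minimal end-to-end sketch using the bundled Python client (API names taken from client.py below; assumes a local instance on the default port 8081):

    import asyncio
    from client import DoclingClient

    async def main():
        client = DoclingClient("http://localhost:8081")
        result = await client.extract_url(
            "https://example.com/doc.pdf",
            summarize=True,
            summarize_style="concise",
        )
        print(result["summary"])

    asyncio.run(main())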
commit 4ed909dbc4
@@ -0,0 +1,4 @@
RUNPOD_API_KEY=your_runpod_api_key_here
AI_ORCHESTRATOR_URL=http://ai-orchestrator:8080
SEMANTIC_SEARCH_URL=http://semantic-search:8000
RUNPOD_WHISPER_ENDPOINT=lrtisuv8ixbtub
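# Usage (sketch): copy this file to .env and fill in real values; the compose
# file passes RUNPOD_API_KEY through via ${RUNPOD_API_KEY}.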
@@ -0,0 +1,12 @@
__pycache__/
*.pyc
.env
.venv/
venv/
*.egg-info/
dist/
build/
.pytest_cache/
.coverage
htmlcov/
*.log
@@ -0,0 +1,40 @@
FROM python:3.11-slim

WORKDIR /app

# Install system dependencies for Docling and OCR
# (libgl1 rather than the transitional libgl1-mesa-glx, which is absent on
# Debian bookworm, the base of current python:3.11-slim images)
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    libgl1 \
    libglib2.0-0 \
    libsm6 \
    libxext6 \
    libxrender-dev \
    libgomp1 \
    poppler-utils \
    tesseract-ocr \
    libtesseract-dev \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for layer caching
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY server.py .

# Create non-root user
RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
USER appuser

# Download Docling models at build time (optional, reduces first-run latency).
# Run as appuser so the cache lands in /home/appuser/.cache, the path that
# docker-compose persists in a volume; as root it would end up in /root/.cache
# and be invisible at runtime.
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" || true

EXPOSE 8081

# Health check
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python -c "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')" || exit 1

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8081"]
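# Example local build and run (a sketch; assumes a populated .env alongside):
#   docker build -t docling-service .
#   docker run --rm -p 8081:8081 --env-file .env docling-service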
@@ -0,0 +1,207 @@
"""
Docling Service Client - Use this to integrate with other services

Example usage:
    from client import DoclingClient

    client = DoclingClient("http://docs.jeffemmett.com")

    # Extract from URL
    result = await client.extract_url("https://example.com/doc.pdf")

    # Extract with summarization
    result = await client.extract_url(
        "https://example.com/doc.pdf",
        summarize=True,
        summarize_style="bullet_points"
    )

    # Transcribe audio
    result = await client.transcribe_url("https://example.com/audio.mp3")
"""

import httpx
import base64
from pathlib import Path
from typing import Optional, Dict, Any, Literal


OutputFormat = Literal["markdown", "json", "text", "html"]
SummarizeStyle = Literal["concise", "detailed", "bullet_points", "technical", "eli5"]


class DoclingClient:
    """Async client for Docling Service"""

    def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    async def health(self) -> dict:
        """Check service health"""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}/health")
            return resp.json()

    async def stats(self) -> dict:
        """Get processing statistics"""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}/stats")
            return resp.json()

    async def extract_url(
        self,
        url: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a URL"""
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "url": url,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def extract_file(
        self,
        file_path: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a local file"""
        path = Path(file_path)
        content = base64.b64encode(path.read_bytes()).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "base64_content": content,
                    "filename": path.name,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def extract_bytes(
        self,
        content: bytes,
        filename: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from bytes"""
        b64_content = base64.b64encode(content).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/extract",
                json={
                    "base64_content": b64_content,
                    "filename": filename,
                    "output_format": output_format,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                    "index_to_search": index_to_search,
                    "metadata": metadata,
                },
            )
            return resp.json()

    async def transcribe_url(
        self,
        url: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from URL"""
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/transcribe",
                json={
                    "url": url,
                    "language": language,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                },
            )
            return resp.json()

    async def transcribe_file(
        self,
        file_path: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from local file"""
        path = Path(file_path)
        content = base64.b64encode(path.read_bytes()).decode()

        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/transcribe",
                json={
                    "base64_content": content,
                    "language": language,
                    "summarize": summarize,
                    "summarize_style": summarize_style,
                },
            )
            return resp.json()

    async def preview_url(self, url: str) -> dict:
        """Quick preview of URL content"""
        async with httpx.AsyncClient(timeout=60) as client:
            resp = await client.post(
                f"{self.base_url}/url/preview",
                json=url,
            )
            return resp.json()


# Sync wrapper for convenience
class DoclingClientSync:
    """Synchronous client wrapper"""

    def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout

    def extract_url(self, url: str, **kwargs) -> dict:
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                f"{self.base_url}/extract",
                json={"url": url, **kwargs},
            )
            return resp.json()

    def transcribe_url(self, url: str, **kwargs) -> dict:
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(
                f"{self.base_url}/transcribe",
                json={"url": url, **kwargs},
            )
            return resp.json()
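# Example (sketch): synchronous usage of the wrapper above.
#   client = DoclingClientSync("http://localhost:8081")
#   result = client.extract_url("https://example.com/doc.pdf", summarize=True)
#   print(result.get("summary"))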
@@ -0,0 +1,58 @@
services:
  docling-service:
    build:
      context: .
      dockerfile: Dockerfile
    image: docling-service:latest
    container_name: docling-service
    restart: unless-stopped
    environment:
      # Connect to AI orchestrator for summarization (Ollama)
      - AI_ORCHESTRATOR_URL=http://ai-orchestrator:8080
      # Connect to semantic search for indexing
      - SEMANTIC_SEARCH_URL=http://semantic-search:8000
      # RunPod for Whisper transcription
      - RUNPOD_API_KEY=${RUNPOD_API_KEY}
      - RUNPOD_WHISPER_ENDPOINT=lrtisuv8ixbtub
    labels:
      # Traefik auto-discovery
      - "traefik.enable=true"
      # HTTP router
      - "traefik.http.routers.docling.rule=Host(`docs.jeffemmett.com`)"
      - "traefik.http.routers.docling.entrypoints=web"
      - "traefik.http.services.docling.loadbalancer.server.port=8081"
      # HTTPS router
      - "traefik.http.routers.docling-secure.rule=Host(`docs.jeffemmett.com`)"
      - "traefik.http.routers.docling-secure.entrypoints=websecure"
      - "traefik.http.routers.docling-secure.tls=true"
      # Health check for Traefik
      - "traefik.http.services.docling.loadbalancer.healthcheck.path=/health"
      - "traefik.http.services.docling.loadbalancer.healthcheck.interval=30s"
    networks:
      - traefik-public
      - ai-internal
    volumes:
      # Cache for Docling models (persists across restarts)
      - docling-cache:/home/appuser/.cache
    deploy:
      resources:
        limits:
          memory: 8G
        reservations:
          memory: 2G
    healthcheck:
      test: ["CMD", "python", "-c", "import urllib.request; urllib.request.urlopen('http://localhost:8081/health')"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 60s

volumes:
  docling-cache:
    driver: local

networks:
  traefik-public:
    external: true
  ai-internal:
    external: true
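# Deploy sketch (assumes the external traefik-public and ai-internal networks
# already exist, created by the Traefik and AI stacks):
#   docker compose up -d --build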
@@ -0,0 +1,15 @@
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
httpx>=0.26.0
pydantic>=2.0.0
python-multipart>=0.0.6

# Docling and dependencies
docling>=2.0.0
docling-core>=2.0.0

# OCR support (optional, for enhanced PDF/image processing)
easyocr>=1.7.0

# For audio file handling
pydub>=0.25.1
@@ -0,0 +1,618 @@
"""
Docling Service - Document extraction and processing for the AI stack

Integrates with:
- AI Orchestrator (Ollama) for summarization
- RunPod Whisper for audio transcription
- Semantic Search for indexing extracted content
"""
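# Example (sketch): calling the /extract endpoint once the service is running,
# with field names taken from the ExtractRequest model below.
#
#   import httpx
#   resp = httpx.post(
#       "http://localhost:8081/extract",
#       json={"url": "https://example.com/doc.pdf", "summarize": True},
#       timeout=300,
#   )
#   print(resp.json()["summary"])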
import os
import json
import asyncio
import tempfile
import base64
import hashlib
from pathlib import Path
from datetime import datetime
from typing import Optional, List, Dict, Any, Literal
from enum import Enum

import httpx
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, Body, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from pydantic import BaseModel, HttpUrl

# Docling imports
from docling.document_converter import DocumentConverter
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import PdfFormatOption

# Config from environment
AI_ORCHESTRATOR_URL = os.getenv("AI_ORCHESTRATOR_URL", "http://ai-orchestrator:8080")
SEMANTIC_SEARCH_URL = os.getenv("SEMANTIC_SEARCH_URL", "http://semantic-search:8000")
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY", "")
RUNPOD_WHISPER_ENDPOINT = os.getenv("RUNPOD_WHISPER_ENDPOINT", "lrtisuv8ixbtub")

# Supported formats
DOCUMENT_FORMATS = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".md", ".txt", ".epub"}
IMAGE_FORMATS = {".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".gif"}
AUDIO_FORMATS = {".mp3", ".wav", ".m4a", ".ogg", ".flac", ".webm"}

app = FastAPI(
    title="Docling Service",
    description="Document extraction and processing service using Docling",
    version="1.0.0",
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize document converter with optimized settings
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True
pipeline_options.do_table_structure = True

converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)

# Track processing stats
stats = {
    "documents_processed": 0,
    "pages_extracted": 0,
    "audio_transcribed": 0,
    "urls_fetched": 0,
    "errors": 0,
}


class OutputFormat(str, Enum):
    MARKDOWN = "markdown"
    JSON = "json"
    TEXT = "text"
    HTML = "html"


class SummarizeStyle(str, Enum):
    CONCISE = "concise"
    DETAILED = "detailed"
    BULLET_POINTS = "bullet_points"
    TECHNICAL = "technical"
    ELI5 = "eli5"  # Explain like I'm 5


class ExtractRequest(BaseModel):
    """Request to extract content from a URL or base64-encoded file"""
    url: Optional[HttpUrl] = None
    base64_content: Optional[str] = None
    filename: Optional[str] = None
    output_format: OutputFormat = OutputFormat.MARKDOWN
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE
    index_to_search: bool = False
    metadata: Optional[Dict[str, Any]] = None


class TranscribeRequest(BaseModel):
    """Request to transcribe audio"""
    url: Optional[HttpUrl] = None
    base64_content: Optional[str] = None
    language: Optional[str] = None  # Auto-detect if not specified
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE


class BatchExtractRequest(BaseModel):
    """Batch extraction request"""
    items: List[ExtractRequest]


class ExtractionResult(BaseModel):
    """Result of document extraction"""
    success: bool
    source: str
    content: Optional[str] = None
    format: OutputFormat
    metadata: Dict[str, Any] = {}
    summary: Optional[str] = None
    indexed: bool = False
    error: Optional[str] = None


# ============== Helper Functions ==============

def get_file_extension(filename: str) -> str:
    """Get lowercase file extension"""
    return Path(filename).suffix.lower()


def generate_doc_id(source: str, content: str) -> str:
    """Generate a unique document ID"""
    hash_input = f"{source}:{content[:1000]}"
    return hashlib.sha256(hash_input.encode()).hexdigest()[:16]
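# Example (sketch): generate_doc_id("https://example.com/a.pdf", content) returns
# a stable 16-hex-char ID derived from the source plus the first 1000 characters.
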
async def fetch_url_content(url: str) -> tuple[bytes, str]:
    """Fetch content from URL, return bytes and detected filename"""
    async with httpx.AsyncClient(follow_redirects=True, timeout=60) as client:
        resp = await client.get(url)
        resp.raise_for_status()

        # Try to get filename from headers or URL
        content_disposition = resp.headers.get("content-disposition", "")
        if "filename=" in content_disposition:
            filename = content_disposition.split("filename=")[1].strip('"\'')
        else:
            filename = url.split("/")[-1].split("?")[0] or "document"

        return resp.content, filename


async def transcribe_audio_runpod(audio_data: bytes, language: Optional[str] = None) -> dict:
    """Transcribe audio using RunPod Whisper endpoint"""
    if not RUNPOD_API_KEY:
        raise HTTPException(status_code=500, detail="RunPod API key not configured")

    # Convert audio to base64
    audio_base64 = base64.b64encode(audio_data).decode()

    payload = {
        "input": {
            "audio_base64": audio_base64,
        }
    }
    if language:
        payload["input"]["language"] = language

    async with httpx.AsyncClient(timeout=300) as client:
        # Submit job
        resp = await client.post(
            f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}/run",
            headers={
                "Authorization": f"Bearer {RUNPOD_API_KEY}",
                "Content-Type": "application/json",
            },
            json=payload,
        )
        result = resp.json()

        if "error" in result:
            raise HTTPException(status_code=500, detail=f"RunPod error: {result['error']}")

        job_id = result.get("id")
        if not job_id:
            raise HTTPException(status_code=500, detail="No job ID returned from RunPod")

        # Poll for completion
        for _ in range(120):  # Max 10 minutes
            await asyncio.sleep(5)
            status_resp = await client.get(
                f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}/status/{job_id}",
                headers={"Authorization": f"Bearer {RUNPOD_API_KEY}"},
            )
            status_data = status_resp.json()

            if status_data.get("status") == "COMPLETED":
                return status_data.get("output", {})
            elif status_data.get("status") in ["FAILED", "CANCELLED"]:
                raise HTTPException(
                    status_code=500,
                    detail=f"Transcription failed: {status_data.get('error', 'Unknown error')}"
                )

        raise HTTPException(status_code=504, detail="Transcription timed out")


async def summarize_with_ollama(content: str, style: SummarizeStyle) -> str:
    """Summarize content using AI Orchestrator (Ollama)"""
    style_prompts = {
        SummarizeStyle.CONCISE: "Provide a concise 2-3 sentence summary of the following content:",
        SummarizeStyle.DETAILED: "Provide a detailed summary of the following content, covering all main points:",
        SummarizeStyle.BULLET_POINTS: "Summarize the following content as bullet points:",
        SummarizeStyle.TECHNICAL: "Provide a technical summary of the following content, focusing on key technical details:",
        SummarizeStyle.ELI5: "Explain the following content in simple terms that a child could understand:",
    }

    prompt = f"{style_prompts[style]}\n\n{content[:8000]}"  # Limit content for context window

    async with httpx.AsyncClient(timeout=120) as client:
        try:
            resp = await client.post(
                f"{AI_ORCHESTRATOR_URL}/api/generate/text",
                json={
                    "prompt": prompt,
                    "model": "llama3.2",
                    "max_tokens": 1024,
                    "priority": "low",  # Use free Ollama
                },
            )
            result = resp.json()
            return result.get("response", "")
        except Exception as e:
            return f"[Summarization failed: {str(e)}]"


async def index_to_semantic_search(
    doc_id: str,
    content: str,
    source: str,
    metadata: Dict[str, Any],
) -> bool:
    """Index document to semantic search service"""
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            resp = await client.post(
                f"{SEMANTIC_SEARCH_URL}/index",
                json={
                    "id": doc_id,
                    "content": content,
                    "metadata": {
                        "source": source,
                        "indexed_at": datetime.now().isoformat(),
                        **metadata,
                    },
                },
            )
            return resp.status_code == 200
        except Exception:
            return False


def extract_with_docling(file_path: Path, output_format: OutputFormat) -> tuple[str, dict]:
    """Extract content from document using Docling"""
    result = converter.convert(str(file_path))
    doc = result.document

    # Get metadata
    metadata = {
        "pages": len(doc.pages) if hasattr(doc, "pages") else 0,
        "tables": len(doc.tables) if hasattr(doc, "tables") else 0,
        "figures": len(doc.pictures) if hasattr(doc, "pictures") else 0,
    }

    # Export in requested format
    if output_format == OutputFormat.MARKDOWN:
        content = doc.export_to_markdown()
    elif output_format == OutputFormat.JSON:
        content = json.dumps(doc.export_to_dict())  # serialize to a real JSON string
    elif output_format == OutputFormat.HTML:
        content = doc.export_to_html()
    else:  # TEXT
        content = doc.export_to_markdown()  # Markdown is readable as plain text

    return content if isinstance(content, str) else str(content), metadata


# ============== API Endpoints ==============

@app.get("/")
async def root():
    """Service info and health check"""
    return {
        "service": "Docling Service",
        "version": "1.0.0",
        "status": "healthy",
        "supported_formats": {
            "documents": list(DOCUMENT_FORMATS),
            "images": list(IMAGE_FORMATS),
            "audio": list(AUDIO_FORMATS),
        },
        "integrations": {
            "ai_orchestrator": AI_ORCHESTRATOR_URL,
            "semantic_search": SEMANTIC_SEARCH_URL,
            "runpod_whisper": f"endpoint:{RUNPOD_WHISPER_ENDPOINT}",
        },
    }


@app.get("/health")
async def health():
    """Health check endpoint for Traefik"""
    return {"status": "healthy", "timestamp": datetime.now().isoformat()}


@app.get("/stats")
async def get_stats():
    """Get processing statistics"""
    return stats


@app.post("/extract", response_model=ExtractionResult)
async def extract_document(request: ExtractRequest):
    """
    Extract content from a document (URL or base64).

    Supports: PDF, DOCX, PPTX, XLSX, HTML, MD, TXT, EPUB, images (with OCR)
    """
    try:
        # Get content
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            filename = request.filename or "document"
            source = f"base64:{filename}"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        ext = get_file_extension(filename)

        # Handle audio separately
        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe endpoint for audio files"
            )

        # Write to temp file for Docling
        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            # Extract content
            content, metadata = extract_with_docling(tmp_path, request.output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            # Summarize if requested
            summary = None
            if request.summarize:
                summary = await summarize_with_ollama(content, request.summarize_style)

            # Index if requested
            indexed = False
            if request.index_to_search:
                doc_id = generate_doc_id(source, content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=source,
                    metadata={**metadata, **(request.metadata or {})},
                )

            return ExtractionResult(
                success=True,
                source=source,
                content=content,
                format=request.output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=str(request.url or request.filename or "unknown"),
            format=request.output_format,
            error=str(e),
        )


@app.post("/extract/upload", response_model=ExtractionResult)
async def extract_uploaded_file(
    file: UploadFile = File(...),
    output_format: OutputFormat = Form(OutputFormat.MARKDOWN),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
    index_to_search: bool = Form(False),
):
    """Extract content from an uploaded file"""
    try:
        content_bytes = await file.read()
        ext = get_file_extension(file.filename or "document")

        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe/upload endpoint for audio files"
            )

        with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            content, metadata = extract_with_docling(tmp_path, output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            summary = None
            if summarize:
                summary = await summarize_with_ollama(content, summarize_style)

            indexed = False
            if index_to_search:
                doc_id = generate_doc_id(file.filename or "upload", content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=f"upload:{file.filename}",
                    metadata=metadata,
                )

            return ExtractionResult(
                success=True,
                source=f"upload:{file.filename}",
                content=content,
                format=output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=f"upload:{file.filename}",
            format=output_format,
            error=str(e),
        )


@app.post("/transcribe")
async def transcribe_audio(request: TranscribeRequest):
    """
    Transcribe audio using RunPod Whisper.

    Supports: MP3, WAV, M4A, OGG, FLAC, WEBM
    """
    try:
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            source = "base64:audio"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        # Transcribe
        result = await transcribe_audio_runpod(content_bytes, request.language)
        stats["audio_transcribed"] += 1

        transcript = result.get("transcription", result.get("text", ""))

        # Summarize if requested
        summary = None
        if request.summarize and transcript:
            summary = await summarize_with_ollama(transcript, request.summarize_style)

        return {
            "success": True,
            "source": source,
            "transcript": transcript,
            "language": result.get("detected_language"),
            "duration": result.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return {
            "success": False,
            "source": str(request.url or "base64"),
            "error": str(e),
        }


@app.post("/transcribe/upload")
async def transcribe_uploaded_audio(
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
):
    """Transcribe uploaded audio file"""
    try:
        content_bytes = await file.read()

        result = await transcribe_audio_runpod(content_bytes, language)
        stats["audio_transcribed"] += 1

        transcript = result.get("transcription", result.get("text", ""))

        summary = None
        if summarize and transcript:
            summary = await summarize_with_ollama(transcript, summarize_style)

        return {
            "success": True,
            "source": f"upload:{file.filename}",
            "transcript": transcript,
            "language": result.get("detected_language"),
            "duration": result.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as e:
        stats["errors"] += 1
        return {
            "success": False,
            "source": f"upload:{file.filename}",
            "error": str(e),
        }


@app.post("/batch")
async def batch_extract(request: BatchExtractRequest, background_tasks: BackgroundTasks):
    """
    Batch extract multiple documents.

    Currently processes items inline and returns all results along with a
    job ID; true background processing can be added later (e.g. a Redis queue).
    """
    job_id = hashlib.sha256(str(datetime.now()).encode()).hexdigest()[:16]

    # For now, process synchronously (can be enhanced with a Redis queue later)
    results = []
    for item in request.items:
        result = await extract_document(item)
        results.append(result)

    return {
        "job_id": job_id,
        "total": len(request.items),
        "results": results,
    }


@app.post("/url/preview")
async def preview_url(url: HttpUrl = Body(...)):
    """Quick preview of URL content (first 500 chars of markdown)"""
    # Body(...) reads the bare URL string from the JSON request body, matching
    # the client, which POSTs the URL itself rather than a {"url": ...} object;
    # without it FastAPI would treat a scalar parameter as a query parameter.
    try:
        content_bytes, filename = await fetch_url_content(str(url))
        stats["urls_fetched"] += 1

        ext = get_file_extension(filename)

        with tempfile.NamedTemporaryFile(suffix=ext or ".html", delete=False) as tmp:
            tmp.write(content_bytes)
            tmp_path = Path(tmp.name)

        try:
            content, metadata = extract_with_docling(tmp_path, OutputFormat.MARKDOWN)
            return {
                "success": True,
                "url": str(url),
                "preview": content[:500] + ("..." if len(content) > 500 else ""),
                "full_length": len(content),
                "metadata": metadata,
            }
        finally:
            tmp_path.unlink(missing_ok=True)

    except Exception as e:
        return {
            "success": False,
            "url": str(url),
            "error": str(e),
        }


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8081)