1276 lines
44 KiB
Python
1276 lines
44 KiB
Python
"""
|
|
Docling Service - Document extraction and processing for the AI stack
|
|
|
|
Integrates with:
|
|
- AI Orchestrator (Ollama) for summarization
|
|
- RunPod Whisper for audio transcription
|
|
- Semantic Search for indexing extracted content
|
|
"""
|
|
|
|
import os
|
|
import asyncio
|
|
import tempfile
|
|
import base64
|
|
import hashlib
|
|
from pathlib import Path
|
|
from datetime import datetime
|
|
from typing import Optional, List, Dict, Any, Literal
|
|
from enum import Enum
|
|
|
|
import httpx
|
|
from fastapi import FastAPI, HTTPException, UploadFile, File, Form, BackgroundTasks
|
|
from fastapi.middleware.cors import CORSMiddleware
|
|
from fastapi.responses import JSONResponse, HTMLResponse
|
|
from pydantic import BaseModel, HttpUrl
|
|
|
|
# Docling imports
|
|
from docling.document_converter import DocumentConverter
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.document_converter import PdfFormatOption
|
|
|
|
# Endpoints and credentials for the collaborating services.
# Every value can be overridden through an environment variable of the same name.
AI_ORCHESTRATOR_URL = os.environ.get("AI_ORCHESTRATOR_URL", "http://ai-orchestrator:8080")
SEMANTIC_SEARCH_URL = os.environ.get("SEMANTIC_SEARCH_URL", "http://semantic-search:8000")
# An empty key makes transcribe_audio_runpod() refuse to run.
RUNPOD_API_KEY = os.environ.get("RUNPOD_API_KEY", "")
RUNPOD_WHISPER_ENDPOINT = os.environ.get("RUNPOD_WHISPER_ENDPOINT", "lrtisuv8ixbtub")
|
|
|
|
# Lower-case file extensions (leading dot included), grouped by media type
DOCUMENT_FORMATS = {
    ".pdf",
    ".docx",
    ".pptx",
    ".xlsx",
    ".html",
    ".md",
    ".txt",
    ".epub",
}
IMAGE_FORMATS = {
    ".png",
    ".jpg",
    ".jpeg",
    ".tiff",
    ".bmp",
    ".gif",
}
AUDIO_FORMATS = {
    ".mp3",
    ".wav",
    ".m4a",
    ".ogg",
    ".flac",
    ".webm",
}
|
|
|
|
# ASGI application object; the route decorators below register on it.
app = FastAPI(
    title="Docling Service",
    description="Document extraction and processing service using Docling",
    version="1.0.0",
)

# Cross-origin policy: any origin may call the API, but without credentials.
# The FastAPI CORS documentation states that wildcard origins/methods/headers
# cannot be combined with allow_credentials=True, so credentials are disabled.
# None of the handlers visible in this file read cookies or auth headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
|
|
# Initialize document converter with optimized settings.
# One converter is built at import time and reused by extract_with_docling().
pipeline_options = PdfPipelineOptions()
pipeline_options.do_ocr = True  # turn on the pipeline's OCR flag
pipeline_options.do_table_structure = True  # turn on the table-structure flag

converter = DocumentConverter(
    format_options={
        # Only PDF input is given custom pipeline options here
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    }
)
|
|
|
|
# In-process counters, all starting at zero. They live in module memory only,
# are mutated by the endpoints and are served as-is by GET /stats.
stats = dict.fromkeys(
    (
        "documents_processed",
        "pages_extracted",
        "audio_transcribed",
        "urls_fetched",
        "errors",
    ),
    0,
)
|
|
|
|
|
|
class OutputFormat(str, Enum):
    """Serialisation formats a caller can request for extracted content.

    Members are plain strings, so they can be sent directly as JSON or form
    values.
    """

    MARKDOWN = "markdown"
    JSON = "json"
    # extract_with_docling() currently serves TEXT from the Markdown export
    TEXT = "text"
    HTML = "html"
|
|
|
|
|
|
class SummarizeStyle(str, Enum):
    """Summary flavours; each one selects a prompt in summarize_with_ollama()."""

    CONCISE = "concise"
    DETAILED = "detailed"
    BULLET_POINTS = "bullet_points"
    TECHNICAL = "technical"
    ELI5 = "eli5"  # Explain like I'm 5
|
|
|
|
|
|
class ExtractRequest(BaseModel):
    """Request to extract content from a URL or base64-encoded file.

    The /extract endpoint uses ``url`` when it is set, otherwise
    ``base64_content``, and answers HTTP 400 when neither is provided.
    """

    # Remote document to download
    url: Optional[HttpUrl] = None
    # Base64-encoded file bytes; only consulted when url is not set
    base64_content: Optional[str] = None
    # Name for base64 content; its suffix becomes the temp-file extension
    filename: Optional[str] = None
    # Serialisation requested for the extracted content
    output_format: OutputFormat = OutputFormat.MARKDOWN
    # Whether to also produce a summary, and in which style
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE
    # Whether to push the extracted content to the semantic search service
    index_to_search: bool = False
    # Extra metadata merged over the extraction metadata when indexing
    metadata: Optional[Dict[str, Any]] = None
|
|
|
|
|
|
class TranscribeRequest(BaseModel):
    """Request to transcribe audio supplied as a URL or as base64 content."""

    # Remote audio file
    url: Optional[HttpUrl] = None
    # Base64-encoded audio bytes
    # NOTE(review): precedence between url and base64_content is decided by
    # the /transcribe handler, which is not fully visible here.
    base64_content: Optional[str] = None
    language: Optional[str] = None  # Auto-detect if not specified
    # Whether to also produce a summary, and in which style
    summarize: bool = False
    summarize_style: SummarizeStyle = SummarizeStyle.CONCISE
|
|
|
|
|
|
class BatchExtractRequest(BaseModel):
    """Batch extraction request: a list of individual extraction requests."""

    # Each item carries its own source and processing options
    items: List[ExtractRequest]
|
|
|
|
|
|
class ExtractionResult(BaseModel):
    """Result of document extraction.

    Failures are reported in-band: ``success`` is False and ``error`` holds
    the exception text, while ``content`` keeps its default.
    """

    # False when extraction raised a non-HTTP exception
    success: bool
    # URL, "base64:<filename>" or "upload:<filename>" depending on the endpoint
    source: str
    # Extracted document serialised in ``format``
    content: Optional[str] = None
    format: OutputFormat
    # Counts gathered by extract_with_docling(): pages, tables, figures
    metadata: Dict[str, Any] = {}
    # Present only when summarisation was requested
    summary: Optional[str] = None
    # True when the semantic search service accepted the document
    indexed: bool = False
    # Exception text when success is False
    error: Optional[str] = None
|
|
|
|
|
|
# ============== Helper Functions ==============
|
|
|
|
def get_file_extension(filename: str) -> str:
    """Return the final suffix of *filename* in lower case.

    The leading dot is included (``".pdf"``); a name without a suffix
    yields an empty string.
    """
    suffix = Path(filename).suffix
    return suffix.lower()
|
|
|
|
|
|
def generate_doc_id(source: str, content: str) -> str:
    """Derive a 16-hex-character document ID from the source and content.

    Only the first 1000 characters of *content* take part in the hash, so
    two documents from the same source that share that prefix get the same
    ID.
    """
    digest = hashlib.sha256()
    digest.update(f"{source}:".encode())
    digest.update(content[:1000].encode())
    return digest.hexdigest()[:16]
|
|
|
|
|
|
def _filename_from_response(url: str, content_disposition: str) -> str:
    """Pick a filename from a Content-Disposition header, else from the URL path.

    Args:
        url: URL the content was served from.
        content_disposition: Raw header value; may be empty.

    Returns:
        The ``filename`` parameter of the header when present and non-empty,
        otherwise the percent-decoded last segment of the URL path, otherwise
        ``"document"``.
    """
    from urllib.parse import unquote, urlsplit

    # Split the header into parameters so trailing ones ("; size=12") do not
    # end up glued to the filename.
    for param in content_disposition.split(";"):
        key, sep, value = param.strip().partition("=")
        if sep and key.strip().lower() == "filename":
            # Drop surrounding quotes and any directory components.
            name = value.strip().strip("\"'").replace("\\", "/").rsplit("/", 1)[-1]
            if name:
                return name

    # urlsplit() separates query string and fragment from the path.
    last_segment = urlsplit(url).path.rsplit("/", 1)[-1]
    return unquote(last_segment) or "document"


async def fetch_url_content(url: str) -> tuple[bytes, str]:
    """Fetch content from URL, return bytes and detected filename.

    Redirects are followed and the request times out after 60 seconds.

    Args:
        url: Address to download.

    Returns:
        ``(body, filename)``; the filename is taken from the response's
        Content-Disposition header or from the final (post-redirect) URL.

    Raises:
        httpx.HTTPStatusError: The server answered with a 4xx/5xx status.
        httpx.HTTPError: Any transport-level failure.
    """
    # NOTE(review): the URL is caller-supplied and fetched without any host
    # allow-list, so this can reach internal services (SSRF) - confirm the
    # deployment restricts egress.
    async with httpx.AsyncClient(follow_redirects=True, timeout=60) as client:
        resp = await client.get(url)
        resp.raise_for_status()

        filename = _filename_from_response(
            str(resp.url),
            resp.headers.get("content-disposition", ""),
        )
        return resp.content, filename
|
|
|
|
|
|
async def transcribe_audio_runpod(audio_data: bytes, language: Optional[str] = None) -> dict:
    """Run a Whisper transcription job on RunPod and wait for its output.

    The audio is submitted base64-encoded to the configured endpoint, then
    the job status is polled every 5 seconds, at most 120 times.

    Args:
        audio_data: Raw audio bytes.
        language: Optional language hint; left out of the job input when falsy.

    Returns:
        The ``output`` object of the completed job (``{}`` when absent).

    Raises:
        HTTPException: 500 when the API key is missing, RunPod reports an
            error, no job ID is returned, or the job fails or is cancelled;
            504 when the job has not completed after the last poll.
    """
    if not RUNPOD_API_KEY:
        raise HTTPException(status_code=500, detail="RunPod API key not configured")

    job_input = {"audio_base64": base64.b64encode(audio_data).decode()}
    if language:
        job_input["language"] = language

    endpoint_url = f"https://api.runpod.ai/v2/{RUNPOD_WHISPER_ENDPOINT}"
    bearer = f"Bearer {RUNPOD_API_KEY}"

    async with httpx.AsyncClient(timeout=300) as client:
        # Queue the job
        submit_resp = await client.post(
            f"{endpoint_url}/run",
            headers={"Authorization": bearer, "Content-Type": "application/json"},
            json={"input": job_input},
        )
        submitted = submit_resp.json()

        if "error" in submitted:
            raise HTTPException(status_code=500, detail=f"RunPod error: {submitted['error']}")

        job_id = submitted.get("id")
        if not job_id:
            raise HTTPException(status_code=500, detail="No job ID returned from RunPod")

        # Wait for a terminal state: 120 polls x 5 s = 10 minutes at most
        status_url = f"{endpoint_url}/status/{job_id}"
        polls_left = 120
        while polls_left > 0:
            polls_left -= 1
            await asyncio.sleep(5)
            poll_resp = await client.get(status_url, headers={"Authorization": bearer})
            job = poll_resp.json()
            state = job.get("status")

            if state == "COMPLETED":
                return job.get("output", {})
            if state in ["FAILED", "CANCELLED"]:
                raise HTTPException(
                    status_code=500,
                    detail=f"Transcription failed: {job.get('error', 'Unknown error')}"
                )

        raise HTTPException(status_code=504, detail="Transcription timed out")
|
|
|
|
|
|
async def summarize_with_ollama(content: str, style: SummarizeStyle) -> str:
    """Summarize content using AI Orchestrator (Ollama).

    This is best-effort: it never raises for request failures. Any error,
    including a non-2xx answer from the orchestrator, is returned as a
    ``"[Summarization failed: ...]"`` marker string.

    Args:
        content: Text to summarise; only the first 8000 characters are sent.
        style: Selects the instruction placed in front of the content.

    Returns:
        The ``response`` field of the orchestrator reply (``""`` when the
        field is absent), or the failure marker described above.
    """
    style_prompts = {
        SummarizeStyle.CONCISE: "Provide a concise 2-3 sentence summary of the following content:",
        SummarizeStyle.DETAILED: "Provide a detailed summary of the following content, covering all main points:",
        SummarizeStyle.BULLET_POINTS: "Summarize the following content as bullet points:",
        SummarizeStyle.TECHNICAL: "Provide a technical summary of the following content, focusing on key technical details:",
        SummarizeStyle.ELI5: "Explain the following content in simple terms that a child could understand:",
    }

    prompt = f"{style_prompts[style]}\n\n{content[:8000]}"  # Limit content for context window

    async with httpx.AsyncClient(timeout=120) as client:
        try:
            resp = await client.post(
                f"{AI_ORCHESTRATOR_URL}/api/generate/text",
                json={
                    "prompt": prompt,
                    "model": "llama3.2",
                    "max_tokens": 1024,
                    "priority": "low",  # Use free Ollama
                },
            )
            # Without this check an error reply whose JSON lacks "response"
            # came back as an empty summary with no sign of failure.
            resp.raise_for_status()
            result = resp.json()
            return result.get("response", "")
        except Exception as e:
            return f"[Summarization failed: {str(e)}]"
|
|
|
|
|
|
async def index_to_semantic_search(
    doc_id: str,
    content: str,
    source: str,
    metadata: Dict[str, Any],
) -> bool:
    """Index document to semantic search service.

    This is best-effort: request failures are swallowed and reported as
    ``False`` so that a failed index never fails the extraction itself.

    Args:
        doc_id: Identifier stored with the document.
        content: Text to index.
        source: Origin of the document, stored in the metadata.
        metadata: Extra metadata; its keys are applied last, so they override
            the generated ``source`` and ``indexed_at`` entries.

    Returns:
        True when the service answered with any 2xx status, else False.
    """
    async with httpx.AsyncClient(timeout=30) as client:
        try:
            resp = await client.post(
                f"{SEMANTIC_SEARCH_URL}/index",
                json={
                    "id": doc_id,
                    "content": content,
                    "metadata": {
                        "source": source,
                        "indexed_at": datetime.now().isoformat(),
                        **metadata,
                    },
                },
            )
            # Accept every 2xx: an indexing endpoint may answer 201/202/204,
            # which the previous "== 200" check reported as a failure.
            return resp.is_success
        except Exception:
            return False
|
|
|
|
|
|
def extract_with_docling(file_path: Path, output_format: OutputFormat) -> tuple[str, dict]:
    """Extract content from document using Docling.

    Args:
        file_path: Document on disk; its suffix is what the endpoints derive
            from the original filename.
        output_format: Serialisation to return.

    Returns:
        ``(content, metadata)`` where content is a string in the requested
        format and metadata holds the ``pages``, ``tables`` and ``figures``
        counts (0 when the document lacks the attribute).

    Note:
        The conversion is synchronous; the async endpoints call it directly.
    """
    import json

    result = converter.convert(str(file_path))
    doc = result.document

    # Structural counts, tolerant of documents without these attributes
    metadata = {
        "pages": len(doc.pages) if hasattr(doc, "pages") else 0,
        "tables": len(doc.tables) if hasattr(doc, "tables") else 0,
        "figures": len(doc.pictures) if hasattr(doc, "pictures") else 0,
    }

    # Export in requested format
    if output_format == OutputFormat.MARKDOWN:
        content = doc.export_to_markdown()
    elif output_format == OutputFormat.JSON:
        # Serialise with json.dumps: str() on the dict would yield a Python
        # repr (single quotes, True/None), which is not parseable JSON.
        content = json.dumps(doc.export_to_dict(), ensure_ascii=False)
    elif output_format == OutputFormat.HTML:
        content = doc.export_to_html()
    else:  # TEXT
        content = doc.export_to_markdown()  # Markdown is readable as plain text

    return content if isinstance(content, str) else str(content), metadata
|
|
|
|
|
|
# ============== API Endpoints ==============
|
|
|
|
@app.get("/", response_class=HTMLResponse)
async def dashboard():
    """Web interface for document processing.

    Returns a single self-contained HTML page (inline CSS and JavaScript).
    The three counters in the stats bar are rendered from ``stats`` when the
    page is served and refreshed by the page itself through GET /stats.

    The page script posts files to /extract/upload or /transcribe/upload and
    URLs to /extract or /transcribe, choosing the transcription endpoints
    when the name ends in an audio extension. Failed requests show the
    ``error`` field of the JSON reply, falling back to the ``detail`` field
    that HTTPException responses carry.
    """
    html = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Document Intelligence - Extract, Summarize, Index</title>
<style>
* { box-sizing: border-box; margin: 0; padding: 0; }
body {
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
min-height: 100vh;
color: #e0e0e0;
line-height: 1.6;
}
.container {
max-width: 1200px;
margin: 0 auto;
padding: 20px;
}
header {
text-align: center;
padding: 40px 0 30px;
}
h1 {
font-size: 2.5rem;
background: linear-gradient(90deg, #00d9ff, #00ff88);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
background-clip: text;
margin-bottom: 10px;
}
.subtitle {
color: #888;
font-size: 1.1rem;
}
.main-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 30px;
margin-top: 30px;
}
@media (max-width: 900px) {
.main-grid { grid-template-columns: 1fr; }
}
.card {
background: rgba(255,255,255,0.03);
border: 1px solid rgba(255,255,255,0.1);
border-radius: 16px;
padding: 25px;
backdrop-filter: blur(10px);
}
.card h2 {
color: #00d9ff;
margin-bottom: 20px;
font-size: 1.3rem;
display: flex;
align-items: center;
gap: 10px;
}
.input-group {
margin-bottom: 20px;
}
label {
display: block;
margin-bottom: 8px;
color: #aaa;
font-size: 0.9rem;
}
input[type="text"], input[type="url"], select, textarea {
width: 100%;
padding: 12px 16px;
border: 1px solid rgba(255,255,255,0.1);
border-radius: 8px;
background: rgba(0,0,0,0.3);
color: #fff;
font-size: 1rem;
transition: border-color 0.2s;
}
input:focus, select:focus, textarea:focus {
outline: none;
border-color: #00d9ff;
}
.drop-zone {
border: 2px dashed rgba(0,217,255,0.3);
border-radius: 12px;
padding: 40px;
text-align: center;
cursor: pointer;
transition: all 0.3s;
background: rgba(0,217,255,0.02);
}
.drop-zone:hover, .drop-zone.dragover {
border-color: #00d9ff;
background: rgba(0,217,255,0.08);
}
.drop-zone-icon {
font-size: 3rem;
margin-bottom: 15px;
}
.drop-zone p {
color: #888;
}
.drop-zone .formats {
font-size: 0.8rem;
color: #666;
margin-top: 10px;
}
.file-info {
display: none;
padding: 15px;
background: rgba(0,255,136,0.1);
border-radius: 8px;
margin-top: 15px;
}
.file-info.visible { display: block; }
.options-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 15px;
}
.checkbox-group {
display: flex;
align-items: center;
gap: 10px;
padding: 12px;
background: rgba(0,0,0,0.2);
border-radius: 8px;
cursor: pointer;
}
.checkbox-group:hover {
background: rgba(0,0,0,0.3);
}
input[type="checkbox"] {
width: 18px;
height: 18px;
accent-color: #00d9ff;
}
.btn {
width: 100%;
padding: 14px 24px;
border: none;
border-radius: 8px;
font-size: 1rem;
font-weight: 600;
cursor: pointer;
transition: all 0.2s;
}
.btn-primary {
background: linear-gradient(90deg, #00d9ff, #00b8d4);
color: #000;
}
.btn-primary:hover:not(:disabled) {
transform: translateY(-2px);
box-shadow: 0 8px 25px rgba(0,217,255,0.3);
}
.btn-primary:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.btn-secondary {
background: rgba(255,255,255,0.1);
color: #fff;
margin-top: 10px;
}
.btn-secondary:hover {
background: rgba(255,255,255,0.15);
}
.results-card {
grid-column: 1 / -1;
}
.results-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 20px;
}
.status {
display: inline-flex;
align-items: center;
gap: 8px;
padding: 6px 14px;
border-radius: 20px;
font-size: 0.85rem;
}
.status.processing {
background: rgba(255,193,7,0.2);
color: #ffc107;
}
.status.success {
background: rgba(0,255,136,0.2);
color: #00ff88;
}
.status.error {
background: rgba(255,82,82,0.2);
color: #ff5252;
}
.spinner {
width: 16px;
height: 16px;
border: 2px solid transparent;
border-top-color: currentColor;
border-radius: 50%;
animation: spin 0.8s linear infinite;
}
@keyframes spin {
to { transform: rotate(360deg); }
}
.tabs {
display: flex;
gap: 5px;
margin-bottom: 20px;
border-bottom: 1px solid rgba(255,255,255,0.1);
padding-bottom: 5px;
}
.tab {
padding: 10px 20px;
background: transparent;
border: none;
color: #888;
cursor: pointer;
border-radius: 8px 8px 0 0;
transition: all 0.2s;
}
.tab:hover { color: #fff; }
.tab.active {
background: rgba(0,217,255,0.1);
color: #00d9ff;
}
.tab-content {
display: none;
max-height: 500px;
overflow-y: auto;
}
.tab-content.active { display: block; }
.content-display {
background: rgba(0,0,0,0.3);
border-radius: 8px;
padding: 20px;
white-space: pre-wrap;
font-family: 'Monaco', 'Menlo', monospace;
font-size: 0.9rem;
line-height: 1.7;
}
.summary-display {
background: linear-gradient(135deg, rgba(0,217,255,0.1), rgba(0,255,136,0.1));
border-left: 3px solid #00d9ff;
padding: 20px;
border-radius: 0 8px 8px 0;
}
.meta-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 15px;
}
.meta-item {
background: rgba(0,0,0,0.2);
padding: 15px;
border-radius: 8px;
text-align: center;
}
.meta-value {
font-size: 1.8rem;
font-weight: 700;
color: #00d9ff;
}
.meta-label {
font-size: 0.8rem;
color: #888;
margin-top: 5px;
}
.indexed-badge {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 8px 16px;
background: rgba(0,255,136,0.2);
color: #00ff88;
border-radius: 20px;
font-size: 0.9rem;
}
.stats-bar {
display: flex;
gap: 20px;
justify-content: center;
flex-wrap: wrap;
margin-top: 30px;
padding: 20px;
background: rgba(0,0,0,0.2);
border-radius: 12px;
}
.stat {
text-align: center;
}
.stat-value {
font-size: 1.5rem;
font-weight: 700;
color: #00d9ff;
}
.stat-label {
font-size: 0.75rem;
color: #666;
}
#results { display: none; }
#results.visible { display: block; }
.hidden { display: none !important; }
</style>
</head>
<body>
<div class="container">
<header>
<h1>Document Intelligence</h1>
<p class="subtitle">Extract, summarize, and index documents into your knowledge base</p>
</header>

<div class="main-grid">
<!-- Input Card -->
<div class="card">
<h2><span>📄</span> Input Source</h2>

<div class="input-group">
<label for="url-input">URL (webpage, PDF, document)</label>
<input type="url" id="url-input" placeholder="https://example.com/document.pdf">
</div>

<div style="text-align: center; color: #666; margin: 15px 0;">— or —</div>

<div class="drop-zone" id="drop-zone">
<div class="drop-zone-icon">📁</div>
<p>Drop a file here or click to browse</p>
<p class="formats">PDF, DOCX, PPTX, XLSX, HTML, MD, TXT, EPUB, images, audio</p>
<input type="file" id="file-input" hidden accept=".pdf,.docx,.pptx,.xlsx,.html,.md,.txt,.epub,.png,.jpg,.jpeg,.tiff,.bmp,.gif,.mp3,.wav,.m4a,.ogg,.flac,.webm">
</div>
<div class="file-info" id="file-info">
<strong id="file-name"></strong>
<span id="file-size" style="color: #888; margin-left: 10px;"></span>
</div>
</div>

<!-- Options Card -->
<div class="card">
<h2><span>⚙️</span> Processing Options</h2>

<div class="input-group">
<label for="summary-style">Summary Style</label>
<select id="summary-style">
<option value="concise">Concise (2-3 sentences)</option>
<option value="bullet_points">Bullet Points</option>
<option value="detailed">Detailed</option>
<option value="technical">Technical</option>
<option value="eli5">ELI5 (Simple explanation)</option>
</select>
</div>

<div class="input-group">
<label for="output-format">Output Format</label>
<select id="output-format">
<option value="markdown">Markdown</option>
<option value="text">Plain Text</option>
<option value="html">HTML</option>
<option value="json">JSON</option>
</select>
</div>

<div class="input-group">
<label>Actions</label>
<div class="options-grid">
<label class="checkbox-group">
<input type="checkbox" id="do-summarize" checked>
<span>Generate Summary</span>
</label>
<label class="checkbox-group">
<input type="checkbox" id="do-index">
<span>Index to Knowledge Base</span>
</label>
</div>
</div>

<button class="btn btn-primary" id="process-btn" onclick="processDocument()">
Process Document
</button>
<button class="btn btn-secondary" onclick="clearAll()">Clear</button>
</div>

<!-- Results Card -->
<div class="card results-card" id="results">
<div class="results-header">
<h2><span>📊</span> Results</h2>
<div>
<span class="status" id="status">
<span class="spinner"></span>
Processing...
</span>
<span class="indexed-badge hidden" id="indexed-badge">
✓ Indexed to Knowledge Base
</span>
</div>
</div>

<div class="meta-grid" id="meta-grid" style="margin-bottom: 20px;">
<div class="meta-item">
<div class="meta-value" id="meta-pages">-</div>
<div class="meta-label">Pages</div>
</div>
<div class="meta-item">
<div class="meta-value" id="meta-tables">-</div>
<div class="meta-label">Tables</div>
</div>
<div class="meta-item">
<div class="meta-value" id="meta-figures">-</div>
<div class="meta-label">Figures</div>
</div>
<div class="meta-item">
<div class="meta-value" id="meta-chars">-</div>
<div class="meta-label">Characters</div>
</div>
</div>

<div class="tabs">
<button class="tab active" onclick="switchTab('summary')">Summary</button>
<button class="tab" onclick="switchTab('content')">Full Content</button>
<button class="tab" id="transcript-tab" style="display:none;" onclick="switchTab('transcript')">Transcript</button>
</div>

<div class="tab-content active" id="tab-summary">
<div class="summary-display" id="summary-content">
Processing...
</div>
</div>
<div class="tab-content" id="tab-content">
<div class="content-display" id="full-content">
Loading content...
</div>
</div>
<div class="tab-content" id="tab-transcript">
<div class="content-display" id="transcript-content">
</div>
</div>
</div>
</div>

<div class="stats-bar">
<div class="stat">
<div class="stat-value" id="stat-docs">""" + str(stats["documents_processed"]) + """</div>
<div class="stat-label">Documents Processed</div>
</div>
<div class="stat">
<div class="stat-value" id="stat-pages">""" + str(stats["pages_extracted"]) + """</div>
<div class="stat-label">Pages Extracted</div>
</div>
<div class="stat">
<div class="stat-value" id="stat-audio">""" + str(stats["audio_transcribed"]) + """</div>
<div class="stat-label">Audio Transcribed</div>
</div>
</div>
</div>

<script>
const dropZone = document.getElementById('drop-zone');
const fileInput = document.getElementById('file-input');
const fileInfo = document.getElementById('file-info');
let selectedFile = null;

// Drag and drop
dropZone.addEventListener('click', () => fileInput.click());
dropZone.addEventListener('dragover', (e) => {
e.preventDefault();
dropZone.classList.add('dragover');
});
dropZone.addEventListener('dragleave', () => dropZone.classList.remove('dragover'));
dropZone.addEventListener('drop', (e) => {
e.preventDefault();
dropZone.classList.remove('dragover');
if (e.dataTransfer.files.length) {
handleFile(e.dataTransfer.files[0]);
}
});
fileInput.addEventListener('change', (e) => {
if (e.target.files.length) {
handleFile(e.target.files[0]);
}
});

function handleFile(file) {
selectedFile = file;
document.getElementById('file-name').textContent = file.name;
document.getElementById('file-size').textContent = formatBytes(file.size);
fileInfo.classList.add('visible');
document.getElementById('url-input').value = '';
}

function formatBytes(bytes) {
if (bytes === 0) return '0 Bytes';
const k = 1024;
const sizes = ['Bytes', 'KB', 'MB', 'GB'];
const i = Math.floor(Math.log(bytes) / Math.log(k));
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
}

function switchTab(tab) {
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
event.target.classList.add('active');
document.getElementById('tab-' + tab).classList.add('active');
}

function clearAll() {
document.getElementById('url-input').value = '';
selectedFile = null;
fileInfo.classList.remove('visible');
document.getElementById('results').classList.remove('visible');
}

async function processDocument() {
const url = document.getElementById('url-input').value.trim();
const summarize = document.getElementById('do-summarize').checked;
const indexToSearch = document.getElementById('do-index').checked;
const summaryStyle = document.getElementById('summary-style').value;
const outputFormat = document.getElementById('output-format').value;

if (!url && !selectedFile) {
alert('Please provide a URL or upload a file');
return;
}

// Show results and set processing state
const results = document.getElementById('results');
const status = document.getElementById('status');
const processBtn = document.getElementById('process-btn');

results.classList.add('visible');
status.className = 'status processing';
status.innerHTML = '<span class="spinner"></span> Processing...';
processBtn.disabled = true;
document.getElementById('indexed-badge').classList.add('hidden');

try {
let response;
const isAudio = selectedFile && /\\.(mp3|wav|m4a|ogg|flac|webm)$/i.test(selectedFile.name);

if (selectedFile) {
const formData = new FormData();
formData.append('file', selectedFile);
formData.append('summarize', summarize);
formData.append('summarize_style', summaryStyle);
formData.append('index_to_search', indexToSearch);
if (!isAudio) {
formData.append('output_format', outputFormat);
}

const endpoint = isAudio ? '/transcribe/upload' : '/extract/upload';
response = await fetch(endpoint, {
method: 'POST',
body: formData
});
} else {
// Check if URL points to audio
const urlIsAudio = /\\.(mp3|wav|m4a|ogg|flac|webm)(\\?|$)/i.test(url);
const endpoint = urlIsAudio ? '/transcribe' : '/extract';

const payload = urlIsAudio ? {
url: url,
summarize: summarize,
summarize_style: summaryStyle
} : {
url: url,
output_format: outputFormat,
summarize: summarize,
summarize_style: summaryStyle,
index_to_search: indexToSearch
};

response = await fetch(endpoint, {
method: 'POST',
headers: { 'Content-Type': 'application/json' },
body: JSON.stringify(payload)
});
}

const data = await response.json();

if (data.success) {
status.className = 'status success';
status.textContent = '✓ Complete';

// Handle audio transcription
if (data.transcript !== undefined) {
document.getElementById('transcript-tab').style.display = 'block';
document.getElementById('transcript-content').textContent = data.transcript || 'No transcript available';
document.getElementById('full-content').textContent = data.transcript || '';
document.getElementById('meta-pages').textContent = '-';
document.getElementById('meta-tables').textContent = '-';
document.getElementById('meta-figures').textContent = '-';
document.getElementById('meta-chars').textContent = data.transcript ? data.transcript.length.toLocaleString() : '0';

// Switch to transcript tab
document.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
document.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
document.getElementById('transcript-tab').classList.add('active');
document.getElementById('tab-transcript').classList.add('active');
} else {
document.getElementById('transcript-tab').style.display = 'none';
document.getElementById('full-content').textContent = data.content || 'No content extracted';

const meta = data.metadata || {};
document.getElementById('meta-pages').textContent = meta.pages || '-';
document.getElementById('meta-tables').textContent = meta.tables || '-';
document.getElementById('meta-figures').textContent = meta.figures || '-';
document.getElementById('meta-chars').textContent = data.content ? data.content.length.toLocaleString() : '0';
}

document.getElementById('summary-content').textContent = data.summary || 'No summary generated';

if (data.indexed) {
document.getElementById('indexed-badge').classList.remove('hidden');
}

// Update stats
fetchStats();
} else {
status.className = 'status error';
status.textContent = '✗ Error';
// HTTP error replies carry their message in "detail" rather than "error"
const detail = typeof data.detail === 'string' ? data.detail : (data.detail ? JSON.stringify(data.detail) : '');
document.getElementById('summary-content').textContent = 'Error: ' + (data.error || detail || 'Unknown error');
document.getElementById('full-content').textContent = '';
}
} catch (err) {
status.className = 'status error';
status.textContent = '✗ Error';
document.getElementById('summary-content').textContent = 'Error: ' + err.message;
} finally {
processBtn.disabled = false;
}
}

async function fetchStats() {
try {
const resp = await fetch('/stats');
const stats = await resp.json();
document.getElementById('stat-docs').textContent = stats.documents_processed || 0;
document.getElementById('stat-pages').textContent = stats.pages_extracted || 0;
document.getElementById('stat-audio').textContent = stats.audio_transcribed || 0;
} catch (e) {}
}

// Initial stats load
fetchStats();
</script>
</body>
</html>
"""
    return HTMLResponse(content=html)
|
|
|
|
|
|
@app.get("/api/info")
async def api_info():
    """Service info (JSON API endpoint).

    Reports the service name and version, the supported file extensions per
    media type, and the configured integration targets. Extension lists are
    sorted so the response is identical across processes; the underlying
    sets have no stable iteration order.
    """
    return {
        "service": "Docling Service",
        "version": "1.0.0",
        "status": "healthy",
        "supported_formats": {
            "documents": sorted(DOCUMENT_FORMATS),
            "images": sorted(IMAGE_FORMATS),
            "audio": sorted(AUDIO_FORMATS),
        },
        "integrations": {
            "ai_orchestrator": AI_ORCHESTRATOR_URL,
            "semantic_search": SEMANTIC_SEARCH_URL,
            "runpod_whisper": f"endpoint:{RUNPOD_WHISPER_ENDPOINT}",
        },
    }
|
|
|
|
|
|
@app.get("/health")
async def health():
    """Liveness probe (used by Traefik): a fixed status plus the current local time."""
    now = datetime.now()
    return {
        "status": "healthy",
        "timestamp": now.isoformat(),
    }
|
|
|
|
|
|
@app.get("/stats")
async def get_stats():
    """Get processing statistics.

    Returns the module-level ``stats`` dictionary itself, i.e. the counters
    accumulated by this process.
    """
    return stats
|
|
|
|
|
|
@app.post("/extract", response_model=ExtractionResult)
async def extract_document(request: ExtractRequest):
    """
    Extract content from a document (URL or base64).

    Supports: PDF, DOCX, PPTX, XLSX, HTML, MD, TXT, EPUB, images (with OCR)

    The content is written to a temporary file, converted with Docling and
    then optionally summarised and indexed. The temporary file is removed in
    every case, including a failed write.

    Returns:
        An ExtractionResult. Unexpected failures are reported in-band with
        ``success=False`` and the exception text in ``error``.

    Raises:
        HTTPException: 400 when neither ``url`` nor ``base64_content`` is
            given, or when the file has an audio extension.
    """
    try:
        # Resolve the payload and a filename to derive the extension from
        if request.url:
            content_bytes, filename = await fetch_url_content(str(request.url))
            source = str(request.url)
        elif request.base64_content:
            content_bytes = base64.b64decode(request.base64_content)
            filename = request.filename or "document"
            source = f"base64:{filename}"
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        ext = get_file_extension(filename)

        # Audio is handled by the transcription endpoints
        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe endpoint for audio files"
            )

        # Docling reads from disk. The path is recorded before writing so the
        # finally clause below also removes the file when the write fails.
        tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
        tmp_path = Path(tmp.name)

        try:
            with tmp:
                tmp.write(content_bytes)

            # NOTE(review): the conversion is synchronous and runs on the
            # event loop; consider offloading once the converter's thread
            # safety is confirmed.
            content, metadata = extract_with_docling(tmp_path, request.output_format)
            stats["documents_processed"] += 1
            stats["pages_extracted"] += metadata.get("pages", 1)

            # Summarize if requested
            summary = None
            if request.summarize:
                summary = await summarize_with_ollama(content, request.summarize_style)

            # Index if requested; caller metadata overrides extraction metadata
            indexed = False
            if request.index_to_search:
                doc_id = generate_doc_id(source, content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=source,
                    metadata={**metadata, **(request.metadata or {})},
                )

            return ExtractionResult(
                success=True,
                source=source,
                content=content,
                format=request.output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            tmp_path.unlink(missing_ok=True)

    except HTTPException:
        # Client errors keep their HTTP status
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=str(request.url or request.filename or "unknown"),
            format=request.output_format,
            error=str(e),
        )
|
|
|
|
|
|
@app.post("/extract/upload", response_model=ExtractionResult)
async def extract_uploaded_file(
    file: UploadFile = File(...),
    output_format: OutputFormat = Form(OutputFormat.MARKDOWN),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
    index_to_search: bool = Form(False),
):
    """Extract content from an uploaded file.

    The upload is written to a temporary file, passed to
    ``extract_with_docling`` and, when requested, summarized and indexed.
    Uploads whose extension is in ``AUDIO_FORMATS`` are rejected with a
    pointer to the transcription endpoint.

    Args:
        file: The uploaded file. Its filename (or "document" when absent) is
            passed to ``get_file_extension`` to pick the temp-file suffix.
        output_format: Output format handed to ``extract_with_docling``.
        summarize: When true, the extracted content is passed to
            ``summarize_with_ollama``.
        summarize_style: Style argument forwarded to the summarizer.
        index_to_search: When true, the extracted content is sent to
            ``index_to_semantic_search``.

    Returns:
        An ``ExtractionResult`` with ``success=True`` and the extracted
        content, or one with ``success=False`` and ``error`` set when an
        unexpected exception occurs (``stats["errors"]`` is incremented).

    Raises:
        HTTPException: 400 for audio uploads; any ``HTTPException`` raised
            while processing is propagated unchanged.
    """
    source = f"upload:{file.filename}"
    try:
        content_bytes = await file.read()
        ext = get_file_extension(file.filename or "document")

        if ext in AUDIO_FORMATS:
            raise HTTPException(
                status_code=400,
                detail="Use /transcribe/upload endpoint for audio files"
            )

        # The temp file is created with delete=False, so it must be removed
        # explicitly. The path is recorded before writing so that a failed
        # write is cleaned up as well.
        tmp_path: Optional[Path] = None
        try:
            with tempfile.NamedTemporaryFile(suffix=ext, delete=False) as tmp:
                tmp_path = Path(tmp.name)
                tmp.write(content_bytes)

            content, metadata = extract_with_docling(tmp_path, output_format)
            stats["documents_processed"] += 1
            # Documents without a reported page count are counted as one page.
            stats["pages_extracted"] += metadata.get("pages", 1)

            summary = None
            if summarize:
                summary = await summarize_with_ollama(content, summarize_style)

            indexed = False
            if index_to_search:
                doc_id = generate_doc_id(file.filename or "upload", content)
                indexed = await index_to_semantic_search(
                    doc_id=doc_id,
                    content=content,
                    source=source,
                    metadata=metadata,
                )

            return ExtractionResult(
                success=True,
                source=source,
                content=content,
                format=output_format,
                metadata=metadata,
                summary=summary,
                indexed=indexed,
            )
        finally:
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)

    except HTTPException:
        # Deliberate HTTP errors (e.g. the audio rejection above) pass through.
        raise
    except Exception as e:
        stats["errors"] += 1
        return ExtractionResult(
            success=False,
            source=source,
            format=output_format,
            error=str(e),
        )
|
|
|
|
|
|
@app.post("/transcribe")
async def transcribe_audio(request: TranscribeRequest):
    """
    Transcribe audio using RunPod Whisper, optionally summarizing the result.

    Supports: MP3, WAV, M4A, OGG, FLAC, WEBM
    (the format is not checked by this handler).

    The audio is taken from ``request.url`` when set, otherwise from
    ``request.base64_content``; if neither is provided an HTTP 400 is raised.
    Any other exception is reported as a ``{"success": False, ...}`` payload
    and counted in ``stats["errors"]``.
    """
    try:
        if request.url:
            source = str(request.url)
            # Only the downloaded bytes are needed; the filename is unused.
            audio_bytes, _ = await fetch_url_content(source)
        elif request.base64_content:
            source = "base64:audio"
            audio_bytes = base64.b64decode(request.base64_content)
        else:
            raise HTTPException(status_code=400, detail="Provide either url or base64_content")

        whisper_output = await transcribe_audio_runpod(audio_bytes, request.language)
        stats["audio_transcribed"] += 1

        # Prefer the "transcription" key, falling back to "text", then "".
        text_fallback = whisper_output.get("text", "")
        transcript = whisper_output.get("transcription", text_fallback)

        # Summaries are skipped for empty transcripts.
        if request.summarize and transcript:
            summary = await summarize_with_ollama(transcript, request.summarize_style)
        else:
            summary = None

        return {
            "success": True,
            "source": source,
            "transcript": transcript,
            "language": whisper_output.get("detected_language"),
            "duration": whisper_output.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as exc:
        stats["errors"] += 1
        failure = {
            "success": False,
            "source": str(request.url or "base64"),
            "error": str(exc),
        }
        return failure
|
|
|
|
|
|
@app.post("/transcribe/upload")
async def transcribe_uploaded_audio(
    file: UploadFile = File(...),
    language: Optional[str] = Form(None),
    summarize: bool = Form(False),
    summarize_style: SummarizeStyle = Form(SummarizeStyle.CONCISE),
):
    """Transcribe an uploaded audio file, optionally summarizing the transcript.

    The upload's bytes are sent to ``transcribe_audio_runpod`` together with
    ``language``. On success the response carries the transcript plus the
    ``detected_language`` and ``duration`` values reported by the transcriber;
    on an unexpected exception a ``{"success": False, ...}`` payload is
    returned and ``stats["errors"]`` is incremented.
    """
    source_label = f"upload:{file.filename}"
    try:
        audio_bytes = await file.read()

        whisper_output = await transcribe_audio_runpod(audio_bytes, language)
        stats["audio_transcribed"] += 1

        # Prefer the "transcription" key, falling back to "text", then "".
        text_fallback = whisper_output.get("text", "")
        transcript = whisper_output.get("transcription", text_fallback)

        # Summaries are skipped for empty transcripts.
        if summarize and transcript:
            summary = await summarize_with_ollama(transcript, summarize_style)
        else:
            summary = None

        return {
            "success": True,
            "source": source_label,
            "transcript": transcript,
            "language": whisper_output.get("detected_language"),
            "duration": whisper_output.get("duration"),
            "summary": summary,
        }

    except HTTPException:
        raise
    except Exception as exc:
        stats["errors"] += 1
        failure = {
            "success": False,
            "source": source_label,
            "error": str(exc),
        }
        return failure
|
|
|
|
|
|
@app.post("/batch")
async def batch_extract(request: BatchExtractRequest, background_tasks: BackgroundTasks):
    """
    Batch extract multiple documents.

    Items are currently processed one after another inside the request, so
    the response is only sent once every item has been handled;
    ``background_tasks`` is accepted but not used yet.

    An item that ``extract_document`` rejects with an ``HTTPException``
    (for example a request without url/base64 content, or an audio file) is
    reported as a failed ``ExtractionResult`` in its slot instead of aborting
    the whole batch, so results of the other items are still returned.

    Returns:
        A dict with ``job_id`` (16 hex characters derived from the current
        timestamp), ``total`` (number of submitted items) and ``results``
        (one entry per item, in submission order).
    """
    job_id = hashlib.sha256(str(datetime.now()).encode()).hexdigest()[:16]

    # For now, process synchronously (can be enhanced with Redis queue later)
    results = []
    for item in request.items:
        try:
            result = await extract_document(item)
        except HTTPException as exc:
            # Mirror the failure payload extract_document builds for
            # unexpected errors so every entry in `results` has one shape.
            result = ExtractionResult(
                success=False,
                source=str(item.url or item.filename or "unknown"),
                format=item.output_format,
                error=str(exc.detail),
            )
        results.append(result)

    return {
        "job_id": job_id,
        "total": len(request.items),
        "results": results,
    }
|
|
|
|
|
|
@app.post("/url/preview")
async def preview_url(url: HttpUrl):
    """Quick preview of URL content (first 500 chars of markdown).

    The URL is fetched with ``fetch_url_content``, written to a temporary
    file and converted to markdown via ``extract_with_docling``.

    Args:
        url: The URL to fetch and preview.

    Returns:
        On success, a dict with ``success=True``, the ``url``, a ``preview``
        of at most 500 characters (suffixed with "..." when truncated), the
        ``full_length`` of the extracted content and its ``metadata``.
        On any exception, a dict with ``success=False``, the ``url`` and the
        ``error`` message; the failure is counted in ``stats["errors"]``.
    """
    try:
        content_bytes, filename = await fetch_url_content(str(url))
        stats["urls_fetched"] += 1

        ext = get_file_extension(filename)

        # The temp file is created with delete=False, so it must be removed
        # explicitly. The path is recorded before writing so that a failed
        # write is cleaned up as well.
        tmp_path: Optional[Path] = None
        try:
            # Fall back to ".html" when no extension could be determined.
            with tempfile.NamedTemporaryFile(suffix=ext or ".html", delete=False) as tmp:
                tmp_path = Path(tmp.name)
                tmp.write(content_bytes)

            content, metadata = extract_with_docling(tmp_path, OutputFormat.MARKDOWN)
            return {
                "success": True,
                "url": str(url),
                "preview": content[:500] + ("..." if len(content) > 500 else ""),
                "full_length": len(content),
                "metadata": metadata,
            }
        finally:
            if tmp_path is not None:
                tmp_path.unlink(missing_ok=True)

    except Exception as e:
        # Count the failure like the other endpoints in this service do.
        stats["errors"] += 1
        return {
            "success": False,
            "url": str(url),
            "error": str(e),
        }
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: serve the FastAPI app with uvicorn on all
    # interfaces, port 8081. uvicorn is imported here so it is only needed
    # when the module is run directly.
    import uvicorn

    bind_options = {"host": "0.0.0.0", "port": 8081}
    uvicorn.run(app, **bind_options)
|