docling-service/client.py

208 lines
6.8 KiB
Python

"""
Docling Service Client - Use this to integrate with other services
Example usage:
from client import DoclingClient
client = DoclingClient("http://docs.jeffemmett.com")
# Extract from URL
result = await client.extract_url("https://example.com/doc.pdf")
# Extract with summarization
result = await client.extract_url(
"https://example.com/doc.pdf",
summarize=True,
summarize_style="bullet_points"
)
# Transcribe audio
result = await client.transcribe_url("https://example.com/audio.mp3")
"""
import asyncio
import base64
from pathlib import Path
from typing import Optional, Dict, Any, Literal

import httpx
# Type-hint-only set of values for the ``output_format`` request field.
OutputFormat = Literal[
    "markdown",
    "json",
    "text",
    "html",
]

# Type-hint-only set of values for the ``summarize_style`` request field.
SummarizeStyle = Literal[
    "concise",
    "detailed",
    "bullet_points",
    "technical",
    "eli5",
]
class DoclingClient:
    """Async client for Docling Service.

    Every public method opens a short-lived ``httpx.AsyncClient``, sends a
    single request and returns the JSON-decoded response body.

    Args:
        base_url: Root URL of the service; a trailing ``/`` is stripped.
        timeout: Timeout in seconds used for extract and transcribe requests.
            ``health``/``stats`` always use 10 seconds and ``preview_url``
            uses 60 seconds.
        raise_for_status: When true, a 4xx/5xx response raises
            ``httpx.HTTPStatusError`` instead of having its body returned as
            if it were a result. Defaults to ``False`` so existing callers
            keep receiving the decoded body for every status code.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8081",
        timeout: float = 300,
        raise_for_status: bool = False,
    ):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.raise_for_status = raise_for_status

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _decode(self, resp: "httpx.Response") -> dict:
        """Optionally check the HTTP status, then JSON-decode the body."""
        if self.raise_for_status:
            resp.raise_for_status()
        return resp.json()

    async def _get(self, path: str) -> dict:
        """GET ``base_url + path`` with the fixed 10 second timeout."""
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{self.base_url}{path}")
            return self._decode(resp)

    async def _post(
        self,
        path: str,
        payload: Any,
        timeout: Optional[float] = None,
    ) -> dict:
        """POST *payload* as JSON to ``base_url + path``.

        ``timeout=None`` means "use ``self.timeout``".
        """
        effective_timeout = self.timeout if timeout is None else timeout
        async with httpx.AsyncClient(timeout=effective_timeout) as client:
            resp = await client.post(f"{self.base_url}{path}", json=payload)
            return self._decode(resp)

    @staticmethod
    async def _read_file(path: Path) -> bytes:
        """Read *path* in the default executor.

        ``Path.read_bytes`` is blocking; running it directly inside a
        coroutine would stall the event loop while a large file is read.
        """
        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, path.read_bytes)

    @staticmethod
    def _extract_options(
        output_format: str,
        summarize: bool,
        summarize_style: str,
        index_to_search: bool,
        metadata: Optional[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Build the option fields shared by every ``/extract`` request."""
        return {
            "output_format": output_format,
            "summarize": summarize,
            "summarize_style": summarize_style,
            "index_to_search": index_to_search,
            "metadata": metadata,
        }

    # ------------------------------------------------------------------
    # Service status
    # ------------------------------------------------------------------

    async def health(self) -> dict:
        """Check service health (``GET /health``)."""
        return await self._get("/health")

    async def stats(self) -> dict:
        """Get processing statistics (``GET /stats``)."""
        return await self._get("/stats")

    # ------------------------------------------------------------------
    # Extraction
    # ------------------------------------------------------------------

    async def extract_url(
        self,
        url: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a URL (``POST /extract``).

        Args:
            url: Sent as the ``url`` field of the request.
            output_format: Sent as ``output_format``.
            summarize: Sent as ``summarize``.
            summarize_style: Sent as ``summarize_style``.
            index_to_search: Sent as ``index_to_search``.
            metadata: Sent as ``metadata`` (``None`` becomes JSON ``null``).

        Returns:
            The JSON-decoded response body.
        """
        options = self._extract_options(
            output_format, summarize, summarize_style, index_to_search, metadata
        )
        return await self._post("/extract", {"url": url, **options})

    async def extract_file(
        self,
        file_path: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from a local file (``POST /extract``).

        The file is read off the event loop and then submitted exactly like
        ``extract_bytes``, with the file's base name as ``filename``.

        Raises:
            OSError: If the file cannot be read (e.g. ``FileNotFoundError``).
        """
        path = Path(file_path)
        raw = await self._read_file(path)
        return await self.extract_bytes(
            raw,
            path.name,
            output_format=output_format,
            summarize=summarize,
            summarize_style=summarize_style,
            index_to_search=index_to_search,
            metadata=metadata,
        )

    async def extract_bytes(
        self,
        content: bytes,
        filename: str,
        output_format: OutputFormat = "markdown",
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
        index_to_search: bool = False,
        metadata: Optional[Dict[str, Any]] = None,
    ) -> dict:
        """Extract content from bytes (``POST /extract``).

        *content* is base64-encoded and sent as ``base64_content`` together
        with ``filename`` and the same option fields as ``extract_url``.
        """
        options = self._extract_options(
            output_format, summarize, summarize_style, index_to_search, metadata
        )
        payload = {
            "base64_content": base64.b64encode(content).decode(),
            "filename": filename,
            **options,
        }
        return await self._post("/extract", payload)

    # ------------------------------------------------------------------
    # Transcription
    # ------------------------------------------------------------------

    async def transcribe_url(
        self,
        url: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from URL (``POST /transcribe``).

        ``language``, ``summarize`` and ``summarize_style`` are forwarded
        unchanged as request fields of the same names.
        """
        payload = {
            "url": url,
            "language": language,
            "summarize": summarize,
            "summarize_style": summarize_style,
        }
        return await self._post("/transcribe", payload)

    async def transcribe_file(
        self,
        file_path: str,
        language: Optional[str] = None,
        summarize: bool = False,
        summarize_style: SummarizeStyle = "concise",
    ) -> dict:
        """Transcribe audio from local file (``POST /transcribe``).

        The file is read off the event loop, base64-encoded and sent as
        ``base64_content``. Unlike ``extract_file`` no ``filename`` field is
        sent.

        Raises:
            OSError: If the file cannot be read (e.g. ``FileNotFoundError``).
        """
        raw = await self._read_file(Path(file_path))
        payload = {
            "base64_content": base64.b64encode(raw).decode(),
            "language": language,
            "summarize": summarize,
            "summarize_style": summarize_style,
        }
        return await self._post("/transcribe", payload)

    # ------------------------------------------------------------------
    # Preview
    # ------------------------------------------------------------------

    async def preview_url(self, url: str) -> dict:
        """Quick preview of URL content (``POST /url/preview``, 60 s timeout).

        The URL is sent as a bare JSON string body, not wrapped in an object.
        """
        return await self._post("/url/preview", url, timeout=60)
# Sync wrapper for convenience
class DoclingClientSync:
    """Synchronous client wrapper.

    Only the URL-based extract and transcribe calls are provided. Extra
    keyword arguments are merged into the JSON request body unchanged.

    Args:
        base_url: Root URL of the service; a trailing ``/`` is stripped.
        timeout: Timeout in seconds applied to every request.
        raise_for_status: When true, a 4xx/5xx response raises
            ``httpx.HTTPStatusError`` instead of having its body returned as
            if it were a result. Defaults to ``False`` so existing callers
            keep receiving the decoded body for every status code.
    """

    def __init__(
        self,
        base_url: str = "http://localhost:8081",
        timeout: float = 300,
        raise_for_status: bool = False,
    ):
        self.base_url = base_url.rstrip("/")
        self.timeout = timeout
        self.raise_for_status = raise_for_status

    def _post(self, path: str, payload: dict) -> dict:
        """POST *payload* as JSON to ``base_url + path`` and decode the reply."""
        with httpx.Client(timeout=self.timeout) as client:
            resp = client.post(f"{self.base_url}{path}", json=payload)
            if self.raise_for_status:
                resp.raise_for_status()
            return resp.json()

    def extract_url(self, url: str, **kwargs) -> dict:
        """POST ``{"url": url, **kwargs}`` to ``/extract``; return the JSON body."""
        return self._post("/extract", {"url": url, **kwargs})

    def transcribe_url(self, url: str, **kwargs) -> dict:
        """POST ``{"url": url, **kwargs}`` to ``/transcribe``; return the JSON body."""
        return self._post("/transcribe", {"url": url, **kwargs})