208 lines
6.8 KiB
Python
208 lines
6.8 KiB
Python
"""
|
|
Docling Service Client - Use this to integrate with other services
|
|
|
|
Example usage:
|
|
from client import DoclingClient
|
|
|
|
client = DoclingClient("http://docs.jeffemmett.com")
|
|
|
|
# Extract from URL
|
|
result = await client.extract_url("https://example.com/doc.pdf")
|
|
|
|
# Extract with summarization
|
|
result = await client.extract_url(
|
|
"https://example.com/doc.pdf",
|
|
summarize=True,
|
|
summarize_style="bullet_points"
|
|
)
|
|
|
|
# Transcribe audio
|
|
result = await client.transcribe_url("https://example.com/audio.mp3")
|
|
"""
|
|
|
|
import httpx
|
|
import base64
|
|
from pathlib import Path
|
|
from typing import Optional, Dict, Any, Literal
|
|
|
|
|
|
OutputFormat = Literal["markdown", "json", "text", "html"]
|
|
SummarizeStyle = Literal["concise", "detailed", "bullet_points", "technical", "eli5"]
|
|
|
|
|
|
class DoclingClient:
|
|
"""Async client for Docling Service"""
|
|
|
|
def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
|
|
self.base_url = base_url.rstrip("/")
|
|
self.timeout = timeout
|
|
|
|
async def health(self) -> dict:
|
|
"""Check service health"""
|
|
async with httpx.AsyncClient(timeout=10) as client:
|
|
resp = await client.get(f"{self.base_url}/health")
|
|
return resp.json()
|
|
|
|
async def stats(self) -> dict:
|
|
"""Get processing statistics"""
|
|
async with httpx.AsyncClient(timeout=10) as client:
|
|
resp = await client.get(f"{self.base_url}/stats")
|
|
return resp.json()
|
|
|
|
async def extract_url(
|
|
self,
|
|
url: str,
|
|
output_format: OutputFormat = "markdown",
|
|
summarize: bool = False,
|
|
summarize_style: SummarizeStyle = "concise",
|
|
index_to_search: bool = False,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
) -> dict:
|
|
"""Extract content from a URL"""
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
resp = await client.post(
|
|
f"{self.base_url}/extract",
|
|
json={
|
|
"url": url,
|
|
"output_format": output_format,
|
|
"summarize": summarize,
|
|
"summarize_style": summarize_style,
|
|
"index_to_search": index_to_search,
|
|
"metadata": metadata,
|
|
},
|
|
)
|
|
return resp.json()
|
|
|
|
async def extract_file(
|
|
self,
|
|
file_path: str,
|
|
output_format: OutputFormat = "markdown",
|
|
summarize: bool = False,
|
|
summarize_style: SummarizeStyle = "concise",
|
|
index_to_search: bool = False,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
) -> dict:
|
|
"""Extract content from a local file"""
|
|
path = Path(file_path)
|
|
content = base64.b64encode(path.read_bytes()).decode()
|
|
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
resp = await client.post(
|
|
f"{self.base_url}/extract",
|
|
json={
|
|
"base64_content": content,
|
|
"filename": path.name,
|
|
"output_format": output_format,
|
|
"summarize": summarize,
|
|
"summarize_style": summarize_style,
|
|
"index_to_search": index_to_search,
|
|
"metadata": metadata,
|
|
},
|
|
)
|
|
return resp.json()
|
|
|
|
async def extract_bytes(
|
|
self,
|
|
content: bytes,
|
|
filename: str,
|
|
output_format: OutputFormat = "markdown",
|
|
summarize: bool = False,
|
|
summarize_style: SummarizeStyle = "concise",
|
|
index_to_search: bool = False,
|
|
metadata: Optional[Dict[str, Any]] = None,
|
|
) -> dict:
|
|
"""Extract content from bytes"""
|
|
b64_content = base64.b64encode(content).decode()
|
|
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
resp = await client.post(
|
|
f"{self.base_url}/extract",
|
|
json={
|
|
"base64_content": b64_content,
|
|
"filename": filename,
|
|
"output_format": output_format,
|
|
"summarize": summarize,
|
|
"summarize_style": summarize_style,
|
|
"index_to_search": index_to_search,
|
|
"metadata": metadata,
|
|
},
|
|
)
|
|
return resp.json()
|
|
|
|
async def transcribe_url(
|
|
self,
|
|
url: str,
|
|
language: Optional[str] = None,
|
|
summarize: bool = False,
|
|
summarize_style: SummarizeStyle = "concise",
|
|
) -> dict:
|
|
"""Transcribe audio from URL"""
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
resp = await client.post(
|
|
f"{self.base_url}/transcribe",
|
|
json={
|
|
"url": url,
|
|
"language": language,
|
|
"summarize": summarize,
|
|
"summarize_style": summarize_style,
|
|
},
|
|
)
|
|
return resp.json()
|
|
|
|
async def transcribe_file(
|
|
self,
|
|
file_path: str,
|
|
language: Optional[str] = None,
|
|
summarize: bool = False,
|
|
summarize_style: SummarizeStyle = "concise",
|
|
) -> dict:
|
|
"""Transcribe audio from local file"""
|
|
path = Path(file_path)
|
|
content = base64.b64encode(path.read_bytes()).decode()
|
|
|
|
async with httpx.AsyncClient(timeout=self.timeout) as client:
|
|
resp = await client.post(
|
|
f"{self.base_url}/transcribe",
|
|
json={
|
|
"base64_content": content,
|
|
"language": language,
|
|
"summarize": summarize,
|
|
"summarize_style": summarize_style,
|
|
},
|
|
)
|
|
return resp.json()
|
|
|
|
async def preview_url(self, url: str) -> dict:
|
|
"""Quick preview of URL content"""
|
|
async with httpx.AsyncClient(timeout=60) as client:
|
|
resp = await client.post(
|
|
f"{self.base_url}/url/preview",
|
|
json=url,
|
|
)
|
|
return resp.json()
|
|
|
|
|
|
# Sync wrapper for convenience
|
|
class DoclingClientSync:
|
|
"""Synchronous client wrapper"""
|
|
|
|
def __init__(self, base_url: str = "http://localhost:8081", timeout: float = 300):
|
|
self.base_url = base_url.rstrip("/")
|
|
self.timeout = timeout
|
|
|
|
def extract_url(self, url: str, **kwargs) -> dict:
|
|
with httpx.Client(timeout=self.timeout) as client:
|
|
resp = client.post(
|
|
f"{self.base_url}/extract",
|
|
json={"url": url, **kwargs},
|
|
)
|
|
return resp.json()
|
|
|
|
def transcribe_url(self, url: str, **kwargs) -> dict:
|
|
with httpx.Client(timeout=self.timeout) as client:
|
|
resp = client.post(
|
|
f"{self.base_url}/transcribe",
|
|
json={"url": url, **kwargs},
|
|
)
|
|
return resp.json()
|