"""LLM client with hybrid routing between Ollama and Claude."""
|
|
|
|
from typing import AsyncIterator, Optional
|
|
import httpx
|
|
from anthropic import Anthropic
|
|
from tenacity import retry, stop_after_attempt, wait_exponential
|
|
|
|
from .config import settings
|
|
|
|
|
|
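# A sketch of the settings attributes this module relies on (inferred from the
# usages below; the actual Settings class in .config may define more fields, and
# the example value is illustrative, not taken from the real config):
#
#   settings.ollama_base_url        e.g. "http://localhost:11434"
#   settings.ollama_model           model name sent to Ollama's /api/chat
#   settings.anthropic_api_key      optional; enables the Claude path when set
#   settings.claude_model           Claude model passed to messages.create
#   settings.use_claude_for_drafts  bool; routes generate_draft() to Claude
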
class LLMClient:
    """Unified LLM client with hybrid routing."""

    def __init__(self):
        self.ollama_url = settings.ollama_base_url
        self.ollama_model = settings.ollama_model

        # Initialize the Claude client only if an API key is configured
        # (async client, so Claude calls don't block the event loop).
        self.claude_client = None
        if settings.anthropic_api_key:
            self.claude_client = AsyncAnthropic(api_key=settings.anthropic_api_key)

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    async def _call_ollama(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> str:
        """Call the Ollama chat API and return the assistant's reply text."""
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})

        async with httpx.AsyncClient(timeout=300.0) as client:  # 5 min for large content
            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json={
                    "model": self.ollama_model,
                    "messages": messages,
                    "stream": False,
                    "options": {
                        "temperature": temperature,
                        "num_predict": max_tokens,
                    },
                },
            )
            response.raise_for_status()
            data = response.json()
            # Non-streaming /api/chat responses carry the reply under message.content.
            return data["message"]["content"]

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    async def _call_claude(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 4096,
    ) -> str:
        """Call the Claude Messages API and return the reply text."""
        if not self.claude_client:
            raise ValueError("Claude API key not configured")

        message = await self.claude_client.messages.create(
            model=settings.claude_model,
            max_tokens=max_tokens,
            system=system or "",
            messages=[{"role": "user", "content": prompt}],
            temperature=temperature,
        )
        return message.content[0].text

    async def chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        use_claude: bool = False,
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> str:
        """
        Chat with an LLM using hybrid routing.

        Args:
            prompt: User prompt
            system: System prompt
            use_claude: Prefer the Claude API; falls back to Ollama if no Claude
                client is configured (Ollama is the default either way)
            temperature: Sampling temperature
            max_tokens: Max response tokens

        Returns:
            LLM response text
        """
        if use_claude and self.claude_client:
            return await self._call_claude(prompt, system, temperature, max_tokens)
        else:
            return await self._call_ollama(prompt, system, temperature, max_tokens)

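    # Routing at a glance (illustrative calls, not part of the API surface;
    # the outcome depends on whether settings.anthropic_api_key is set):
    #
    #   await llm_client.chat("hi")                   -> Ollama
    #   await llm_client.chat("hi", use_claude=True)  -> Claude if configured, else Ollama
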
    async def generate_draft(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.5,
    ) -> str:
        """
        Generate article draft - uses Claude for higher quality.

        Args:
            prompt: Prompt describing what to generate
            system: System prompt for context
            temperature: Lower for more factual output

        Returns:
            Generated draft text
        """
        # Use Claude for drafts if configured, otherwise fall back to Ollama
        use_claude = settings.use_claude_for_drafts and self.claude_client is not None
        return await self.chat(
            prompt, system, use_claude=use_claude, temperature=temperature, max_tokens=4096
        )

    async def analyze(
        self,
        content: str,
        task: str,
        temperature: float = 0.3,
    ) -> str:
        """
        Analyze content for a specific task - uses Claude for complex analysis.

        Args:
            content: Content to analyze
            task: Description of analysis task
            temperature: Lower for more deterministic output

        Returns:
            Analysis result
        """
        prompt = f"""Task: {task}

Content to analyze:
{content}

Provide your analysis:"""

        use_claude = self.claude_client is not None
        return await self.chat(prompt, use_claude=use_claude, temperature=temperature)


# Singleton instance
llm_client = LLMClient()
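

# Minimal usage sketch (illustrative only; `_example` is not part of the module's
# public surface). It assumes the Ollama server at settings.ollama_base_url is
# reachable; the Claude path additionally needs settings.anthropic_api_key.
# Because this module uses a relative import, run it as a module, e.g.
# `python -m <package>.<this_module>` (the package path depends on your layout).
async def _example() -> None:
    reply = await llm_client.chat(
        "Summarize what this client does in one sentence.",
        system="You are a concise assistant.",
    )
    print(reply)


if __name__ == "__main__":
    import asyncio

    asyncio.run(_example())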