"""LLM client with hybrid routing between Ollama and Claude.""" from typing import AsyncIterator, Optional import httpx from anthropic import Anthropic from tenacity import retry, stop_after_attempt, wait_exponential from .config import settings class LLMClient: """Unified LLM client with hybrid routing.""" def __init__(self): self.ollama_url = settings.ollama_base_url self.ollama_model = settings.ollama_model # Initialize Claude client if API key is set self.claude_client = None if settings.anthropic_api_key: self.claude_client = Anthropic(api_key=settings.anthropic_api_key) @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10)) async def _call_ollama( self, prompt: str, system: Optional[str] = None, temperature: float = 0.7, max_tokens: int = 2048, ) -> str: """Call Ollama API.""" messages = [] if system: messages.append({"role": "system", "content": system}) messages.append({"role": "user", "content": prompt}) async with httpx.AsyncClient(timeout=300.0) as client: # 5 min for large content response = await client.post( f"{self.ollama_url}/api/chat", json={ "model": self.ollama_model, "messages": messages, "stream": False, "options": { "temperature": temperature, "num_predict": max_tokens, }, }, ) response.raise_for_status() data = response.json() return data["message"]["content"] @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10)) async def _call_claude( self, prompt: str, system: Optional[str] = None, temperature: float = 0.7, max_tokens: int = 4096, ) -> str: """Call Claude API.""" if not self.claude_client: raise ValueError("Claude API key not configured") message = self.claude_client.messages.create( model=settings.claude_model, max_tokens=max_tokens, system=system or "", messages=[{"role": "user", "content": prompt}], temperature=temperature, ) return message.content[0].text async def chat( self, prompt: str, system: Optional[str] = None, use_claude: bool = False, temperature: float = 0.7, max_tokens: int = 2048, ) -> str: """ Chat with LLM using hybrid routing. Args: prompt: User prompt system: System prompt use_claude: Force Claude API (otherwise uses Ollama by default) temperature: Sampling temperature max_tokens: Max response tokens Returns: LLM response text """ if use_claude and self.claude_client: return await self._call_claude(prompt, system, temperature, max_tokens) else: return await self._call_ollama(prompt, system, temperature, max_tokens) async def generate_draft( self, prompt: str, system: Optional[str] = None, temperature: float = 0.5, ) -> str: """ Generate article draft - uses Claude for higher quality. Args: prompt: Prompt describing what to generate system: System prompt for context temperature: Lower for more factual output Returns: Generated draft text """ # Use Claude for drafts if configured, otherwise fall back to Ollama use_claude = settings.use_claude_for_drafts and self.claude_client is not None return await self.chat( prompt, system, use_claude=use_claude, temperature=temperature, max_tokens=4096 ) async def analyze( self, content: str, task: str, temperature: float = 0.3, ) -> str: """ Analyze content for a specific task - uses Claude for complex analysis. Args: content: Content to analyze task: Description of analysis task temperature: Lower for more deterministic output Returns: Analysis result """ prompt = f"""Task: {task} Content to analyze: {content} Provide your analysis:""" use_claude = self.claude_client is not None return await self.chat(prompt, use_claude=use_claude, temperature=temperature) # Singleton instance llm_client = LLMClient()