# p2pwiki-ai/src/llm.py
"""LLM client with hybrid routing between Ollama and Claude."""
import asyncio
from typing import AsyncIterator, Optional

import httpx
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

from .config import settings
class LLMClient:
    """Unified LLM client with hybrid routing between local Ollama and Claude.

    Ollama is the always-available local default; Claude is used only when an
    API key is configured and a caller (or routing policy) opts in.
    """

    def __init__(self):
        # Local Ollama backend — always configured.
        self.ollama_url = settings.ollama_base_url
        self.ollama_model = settings.ollama_model
        # Claude backend is optional: enabled only when an API key is set.
        self.claude_client = None
        if settings.anthropic_api_key:
            self.claude_client = Anthropic(api_key=settings.anthropic_api_key)

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    async def _call_ollama(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> str:
        """Call the Ollama chat API and return the assistant's reply text.

        Args:
            prompt: User message content.
            system: Optional system message prepended to the conversation.
            temperature: Sampling temperature.
            max_tokens: Cap on generated tokens (Ollama ``num_predict``).

        Returns:
            The assistant message content.

        Raises:
            httpx.HTTPStatusError: Non-2xx response, after retry attempts
                are exhausted (3 tries, exponential backoff).
        """
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        async with httpx.AsyncClient(timeout=300.0) as client:  # 5 min for large content
            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json={
                    "model": self.ollama_model,
                    "messages": messages,
                    "stream": False,
                    "options": {
                        "temperature": temperature,
                        "num_predict": max_tokens,
                    },
                },
            )
            response.raise_for_status()
            data = response.json()
            return data["message"]["content"]

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    async def _call_claude(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 4096,
    ) -> str:
        """Call the Claude Messages API and return the reply text.

        Args:
            prompt: User message content.
            system: Optional system prompt (empty string when absent).
            temperature: Sampling temperature.
            max_tokens: Cap on generated tokens.

        Returns:
            Text of the first content block of the response.

        Raises:
            ValueError: If no Anthropic API key was configured.
        """
        if not self.claude_client:
            raise ValueError("Claude API key not configured")

        # BUGFIX: ``Anthropic`` is the synchronous SDK client; calling it
        # inline from a coroutine blocks the whole event loop for the
        # duration of the HTTP request. Run it in a worker thread instead.
        def _create():
            return self.claude_client.messages.create(
                model=settings.claude_model,
                max_tokens=max_tokens,
                system=system or "",
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )

        message = await asyncio.to_thread(_create)
        return message.content[0].text

    async def chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        use_claude: bool = False,
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> str:
        """
        Chat with LLM using hybrid routing.

        Args:
            prompt: User prompt
            system: System prompt
            use_claude: Force Claude API (otherwise uses Ollama by default)
            temperature: Sampling temperature
            max_tokens: Max response tokens

        Returns:
            LLM response text
        """
        # Silently fall back to Ollama when Claude was requested but is not
        # configured — callers never need to know which backend answered.
        if use_claude and self.claude_client:
            return await self._call_claude(prompt, system, temperature, max_tokens)
        else:
            return await self._call_ollama(prompt, system, temperature, max_tokens)

    async def generate_draft(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.5,
    ) -> str:
        """
        Generate article draft - uses Claude for higher quality.

        Args:
            prompt: Prompt describing what to generate
            system: System prompt for context
            temperature: Lower for more factual output

        Returns:
            Generated draft text
        """
        # Use Claude for drafts if configured, otherwise fall back to Ollama
        use_claude = settings.use_claude_for_drafts and self.claude_client is not None
        return await self.chat(
            prompt, system, use_claude=use_claude, temperature=temperature, max_tokens=4096
        )

    async def analyze(
        self,
        content: str,
        task: str,
        temperature: float = 0.3,
    ) -> str:
        """
        Analyze content for a specific task - uses Claude for complex analysis.

        Args:
            content: Content to analyze
            task: Description of analysis task
            temperature: Lower for more deterministic output

        Returns:
            Analysis result
        """
        prompt = f"""Task: {task}
Content to analyze:
{content}
Provide your analysis:"""
        # Prefer Claude whenever it is configured; analysis is quality-sensitive.
        use_claude = self.claude_client is not None
        return await self.chat(prompt, use_claude=use_claude, temperature=temperature)
# Singleton instance
# NOTE: constructed at import time — importing this module reads settings and,
# when an API key is present, instantiates the Anthropic client.
llm_client = LLMClient()