# p2pwiki-ai/src/llm.py
"""LLM client with hybrid routing between Ollama and Claude."""
import asyncio
from typing import AsyncIterator, Optional

import httpx
from anthropic import Anthropic
from tenacity import retry, stop_after_attempt, wait_exponential

from .config import settings
class LLMClient:
    """Unified LLM client with hybrid routing between local Ollama and Claude.

    Ollama is the always-available local default; Claude is used only when an
    API key is configured and a caller (or routing policy) opts in.
    """

    def __init__(self):
        # Local Ollama backend — always configured.
        self.ollama_url = settings.ollama_base_url
        self.ollama_model = settings.ollama_model
        # Claude backend is optional: enabled only when an API key is set.
        self.claude_client = None
        if settings.anthropic_api_key:
            self.claude_client = Anthropic(api_key=settings.anthropic_api_key)

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    async def _call_ollama(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> str:
        """Call the Ollama chat API and return the assistant's reply text.

        Args:
            prompt: User message content.
            system: Optional system message prepended to the conversation.
            temperature: Sampling temperature.
            max_tokens: Cap on generated tokens (Ollama ``num_predict``).

        Returns:
            The assistant message content.

        Raises:
            httpx.HTTPStatusError: Non-2xx response, after retry attempts
                are exhausted (3 tries, exponential backoff).
        """
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        async with httpx.AsyncClient(timeout=300.0) as client:  # 5 min for large content
            response = await client.post(
                f"{self.ollama_url}/api/chat",
                json={
                    "model": self.ollama_model,
                    "messages": messages,
                    "stream": False,
                    "options": {
                        "temperature": temperature,
                        "num_predict": max_tokens,
                    },
                },
            )
            response.raise_for_status()
            data = response.json()
            return data["message"]["content"]

    @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=1, max=10))
    async def _call_claude(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.7,
        max_tokens: int = 4096,
    ) -> str:
        """Call the Claude Messages API and return the reply text.

        Args:
            prompt: User message content.
            system: Optional system prompt (empty string when absent).
            temperature: Sampling temperature.
            max_tokens: Cap on generated tokens.

        Returns:
            Text of the first content block of the response.

        Raises:
            ValueError: If no Anthropic API key was configured.
        """
        if not self.claude_client:
            raise ValueError("Claude API key not configured")

        # BUGFIX: ``Anthropic`` is the synchronous SDK client; calling it
        # inline from a coroutine blocks the whole event loop for the
        # duration of the HTTP request. Run it in a worker thread instead.
        def _create():
            return self.claude_client.messages.create(
                model=settings.claude_model,
                max_tokens=max_tokens,
                system=system or "",
                messages=[{"role": "user", "content": prompt}],
                temperature=temperature,
            )

        message = await asyncio.to_thread(_create)
        return message.content[0].text

    async def chat(
        self,
        prompt: str,
        system: Optional[str] = None,
        use_claude: bool = False,
        temperature: float = 0.7,
        max_tokens: int = 2048,
    ) -> str:
        """
        Chat with LLM using hybrid routing.

        Args:
            prompt: User prompt
            system: System prompt
            use_claude: Force Claude API (otherwise uses Ollama by default)
            temperature: Sampling temperature
            max_tokens: Max response tokens

        Returns:
            LLM response text
        """
        # Silently fall back to Ollama when Claude was requested but is not
        # configured — callers never need to know which backend answered.
        if use_claude and self.claude_client:
            return await self._call_claude(prompt, system, temperature, max_tokens)
        else:
            return await self._call_ollama(prompt, system, temperature, max_tokens)

    async def generate_draft(
        self,
        prompt: str,
        system: Optional[str] = None,
        temperature: float = 0.5,
    ) -> str:
        """
        Generate article draft - uses Claude for higher quality.

        Args:
            prompt: Prompt describing what to generate
            system: System prompt for context
            temperature: Lower for more factual output

        Returns:
            Generated draft text
        """
        # Use Claude for drafts if configured, otherwise fall back to Ollama
        use_claude = settings.use_claude_for_drafts and self.claude_client is not None
        return await self.chat(
            prompt, system, use_claude=use_claude, temperature=temperature, max_tokens=4096
        )

    async def analyze(
        self,
        content: str,
        task: str,
        temperature: float = 0.3,
    ) -> str:
        """
        Analyze content for a specific task - uses Claude for complex analysis.

        Args:
            content: Content to analyze
            task: Description of analysis task
            temperature: Lower for more deterministic output

        Returns:
            Analysis result
        """
        prompt = f"""Task: {task}
Content to analyze:
{content}
Provide your analysis:"""
        # Prefer Claude whenever it is configured; analysis is quality-sensitive.
        use_claude = self.claude_client is not None
        return await self.chat(prompt, use_claude=use_claude, temperature=temperature)
# Singleton instance
# NOTE: constructed at import time — importing this module reads settings and,
# when an API key is present, instantiates the Anthropic client.
llm_client = LLMClient()