feat(mi-api): add configurable timeout + AI orchestrator GPU fallback

Increase Ollama timeout from hardcoded 120s to configurable 600s default.
Add optional AI Orchestrator integration for RunPod GPU acceleration with
automatic fallback to direct Ollama when orchestrator is unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-03-22 20:08:28 -07:00
parent aa85248a22
commit 76d44975cb
2 changed files with 114 additions and 24 deletions

View File

@ -18,6 +18,11 @@ class Settings(BaseSettings):
# Ollama (for AI summaries)
ollama_url: str = "http://localhost:11434"
ollama_model: str = "llama3.2"
ollama_timeout: int = 600 # seconds (up from hardcoded 120)
# AI Orchestrator (optional GPU fallback via RunPod)
ai_orchestrator_url: str = "" # empty = disabled, e.g. "http://ai-orchestrator:8080"
ai_orchestrator_priority: str = "normal" # low|normal|high
# File paths
recordings_path: str = "/recordings"

View File

@ -2,6 +2,7 @@
AI Summary routes.
"""
import asyncio
import json
from typing import Optional, List
@ -149,8 +150,8 @@ async def generate_summary(
# Format transcript for LLM
transcript_text = _format_transcript(segments)
# Generate summary using Ollama
summary_data = await _generate_summary_with_ollama(transcript_text)
# Generate summary — try orchestrator first if configured, fall back to direct Ollama
summary_data, model_used = await _generate_summary(transcript_text)
# Save summary
await db.save_summary(
@ -161,7 +162,7 @@ async def generate_summary(
decisions=summary_data["decisions"],
topics=summary_data["topics"],
sentiment=summary_data["sentiment"],
model_used=settings.ollama_model
model_used=model_used
)
# Update meeting status
@ -204,11 +205,112 @@ def _format_transcript(segments: list) -> str:
return "\n".join(lines)
async def _generate_summary_with_ollama(transcript: str) -> dict:
"""Generate summary using Ollama."""
async def _generate_summary(transcript: str) -> tuple[dict, str]:
    """Generate a meeting summary, preferring the AI Orchestrator when configured.

    Tries the orchestrator (RunPod GPU path) first; on any failure falls back
    to direct Ollama with the configurable timeout.

    Args:
        transcript: Formatted transcript text.

    Returns:
        (summary_data, model_used) tuple, where model_used identifies which
        backend produced the summary.
    """
    # Truncate to limit the prompt to the model's context window.
    prompt = SUMMARY_PROMPT.format(transcript=transcript[:15000])

    # Try AI Orchestrator first if configured (empty URL = disabled).
    if settings.ai_orchestrator_url:
        try:
            result = await _generate_summary_with_orchestrator(prompt)
            # Pop the internal bookkeeping key so it doesn't leak into the
            # summary payload saved to the database.
            provider = result.pop("_provider", "unknown")
            return result, f"orchestrator/{provider}"
        except Exception as e:
            # Best-effort: any orchestrator failure degrades to direct Ollama.
            log.warning("Orchestrator failed, falling back to direct Ollama", error=str(e))

    # Fallback: direct Ollama with configurable timeout.
    # NOTE: removed a vestigial `async with httpx.AsyncClient(timeout=120.0)`
    # left over from the old hard-coded-timeout path — the helpers below each
    # manage their own client.
    return await _generate_summary_with_ollama(prompt), settings.ollama_model
async def _generate_summary_with_orchestrator(prompt: str) -> dict:
    """Generate summary via AI Orchestrator (supports RunPod GPU fallback)."""
    timeout = settings.ollama_timeout
    request_body = {
        "prompt": prompt,
        "system": "You are a meeting analysis assistant. Respond only with valid JSON.",
        "model": settings.ollama_model,
        "max_tokens": 2048,
        "temperature": 0.3,
        "priority": settings.ai_orchestrator_priority,
    }

    async with httpx.AsyncClient(timeout=float(timeout)) as client:
        # Submit the generation request to the orchestrator.
        response = await client.post(
            f"{settings.ai_orchestrator_url}/api/generate/text",
            json=request_body,
        )
        response.raise_for_status()
        result = response.json()

        provider = result.get("provider", "unknown")
        log.info("Orchestrator responded", provider=provider, cost=result.get("cost"))

        if provider == "runpod":
            # RunPod jobs are asynchronous — poll the orchestrator until done.
            response_text = await _poll_runpod_job(client, result["job_id"], timeout)
        else:
            # Synchronous Ollama path: the text is embedded in the response.
            response_text = result.get("response", "")

    summary_data = _parse_summary_json(response_text)
    # Tag the result so the caller can report which backend produced it.
    summary_data["_provider"] = provider
    return summary_data
async def _poll_runpod_job(client: httpx.AsyncClient, job_id: str, max_wait: int) -> str:
    """Poll AI Orchestrator for RunPod job completion."""
    poll_url = f"{settings.ai_orchestrator_url}/api/status/llm/{job_id}"
    interval = 5

    # Same schedule as a while-loop counter: first poll after one interval,
    # last poll happens while the pre-sleep elapsed time is still < max_wait.
    for elapsed in range(interval, max_wait + interval, interval):
        await asyncio.sleep(interval)

        resp = await client.get(poll_url)
        resp.raise_for_status()
        status_data = resp.json()
        status = status_data.get("status", "")
        log.debug("RunPod job poll", job_id=job_id, status=status, elapsed=elapsed)

        if status == "COMPLETED":
            output = status_data.get("output", {})
            # vLLM output may be nested; extract text
            if isinstance(output, dict):
                return output.get("text", output.get("response", json.dumps(output)))
            return str(output)
        if status == "FAILED":
            error = status_data.get("error", "Unknown RunPod error")
            raise RuntimeError(f"RunPod job failed: {error}")

    raise TimeoutError(f"RunPod job {job_id} did not complete within {max_wait}s")
def _parse_summary_json(response_text: str) -> dict:
"""Parse and validate summary JSON from LLM response."""
try:
summary_data = json.loads(response_text)
except json.JSONDecodeError as e:
log.error("Failed to parse AI response as JSON", error=str(e), text=response_text[:500])
raise HTTPException(status_code=500, detail="Failed to parse AI response")
return {
"summary": summary_data.get("summary", "No summary generated"),
"key_points": summary_data.get("key_points", []),
"action_items": summary_data.get("action_items", []),
"decisions": summary_data.get("decisions", []),
"topics": summary_data.get("topics", []),
"sentiment": summary_data.get("sentiment", "neutral"),
}
async def _generate_summary_with_ollama(prompt: str) -> dict:
"""Generate summary using direct Ollama with configurable timeout."""
async with httpx.AsyncClient(timeout=float(settings.ollama_timeout)) as client:
try:
response = await client.post(
f"{settings.ollama_url}/api/generate",
@ -224,18 +326,7 @@ async def _generate_summary_with_ollama(transcript: str) -> dict:
result = response.json()
response_text = result.get("response", "")
# Parse JSON from response
summary_data = json.loads(response_text)
# Validate required fields
return {
"summary": summary_data.get("summary", "No summary generated"),
"key_points": summary_data.get("key_points", []),
"action_items": summary_data.get("action_items", []),
"decisions": summary_data.get("decisions", []),
"topics": summary_data.get("topics", []),
"sentiment": summary_data.get("sentiment", "neutral")
}
return _parse_summary_json(response_text)
except httpx.HTTPError as e:
log.error("Ollama request failed", error=str(e))
@ -243,9 +334,3 @@ async def _generate_summary_with_ollama(transcript: str) -> dict:
status_code=503,
detail=f"AI service unavailable: {str(e)}"
)
except json.JSONDecodeError as e:
log.error("Failed to parse Ollama response", error=str(e))
raise HTTPException(
status_code=500,
detail="Failed to parse AI response"
)