feat(mi-api): add configurable timeout + AI orchestrator GPU fallback
Increase Ollama timeout from hardcoded 120s to configurable 600s default. Add optional AI Orchestrator integration for RunPod GPU acceleration with automatic fallback to direct Ollama when orchestrator is unavailable. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
aa85248a22
commit
76d44975cb
|
|
@ -18,6 +18,11 @@ class Settings(BaseSettings):
|
|||
# Ollama (for AI summaries)
ollama_url: str = "http://localhost:11434"
ollama_model: str = "llama3.2"
# Request timeout for summary generation; large models on CPU can need
# several minutes, hence the generous default.
ollama_timeout: int = 600  # seconds (up from hardcoded 120)

# AI Orchestrator (optional GPU fallback via RunPod)
# When set, summary generation is first attempted through the orchestrator
# and falls back to direct Ollama on failure.
ai_orchestrator_url: str = ""  # empty = disabled, e.g. "http://ai-orchestrator:8080"
ai_orchestrator_priority: str = "normal"  # low|normal|high

# File paths
recordings_path: str = "/recordings"
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@
|
|||
AI Summary routes.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
from typing import Optional, List
|
||||
|
||||
|
|
@ -149,8 +150,8 @@ async def generate_summary(
|
|||
# Format transcript for LLM
|
||||
transcript_text = _format_transcript(segments)
|
||||
|
||||
# Generate summary using Ollama
|
||||
summary_data = await _generate_summary_with_ollama(transcript_text)
|
||||
# Generate summary — try orchestrator first if configured, fall back to direct Ollama
|
||||
summary_data, model_used = await _generate_summary(transcript_text)
|
||||
|
||||
# Save summary
|
||||
await db.save_summary(
|
||||
|
|
@ -161,7 +162,7 @@ async def generate_summary(
|
|||
decisions=summary_data["decisions"],
|
||||
topics=summary_data["topics"],
|
||||
sentiment=summary_data["sentiment"],
|
||||
model_used=settings.ollama_model
|
||||
model_used=model_used
|
||||
)
|
||||
|
||||
# Update meeting status
|
||||
|
|
@ -204,11 +205,112 @@ def _format_transcript(segments: list) -> str:
|
|||
return "\n".join(lines)
|
||||
|
||||
|
||||
async def _generate_summary_with_ollama(transcript: str) -> dict:
|
||||
"""Generate summary using Ollama."""
|
||||
async def _generate_summary(transcript: str) -> tuple[dict, str]:
    """Generate summary via orchestrator (if configured) with Ollama fallback.

    Args:
        transcript: Formatted transcript text to summarize.

    Returns:
        (summary_data, model_used) tuple; ``model_used`` identifies which
        backend produced the summary.
    """
    # Limit context so the prompt stays within the model's window.
    prompt = SUMMARY_PROMPT.format(transcript=transcript[:15000])

    # BUGFIX: previously this wrapped everything in
    # `async with httpx.AsyncClient(timeout=120.0) as client:` — that client
    # was never used (both helpers open their own clients), and its hardcoded
    # 120s timeout contradicted the configurable-timeout change. Dropped.

    # Try AI Orchestrator first if configured.
    if settings.ai_orchestrator_url:
        try:
            result = await _generate_summary_with_orchestrator(prompt)
            return result, f"orchestrator/{result.get('_provider', 'unknown')}"
        except Exception as e:
            # Orchestrator is best-effort: log and continue to direct Ollama.
            log.warning("Orchestrator failed, falling back to direct Ollama", error=str(e))

    # Fallback: direct Ollama with configurable timeout.
    return await _generate_summary_with_ollama(prompt), settings.ollama_model
|
||||
|
||||
|
||||
async def _generate_summary_with_orchestrator(prompt: str) -> dict:
    """Generate summary via AI Orchestrator (supports RunPod GPU fallback).

    Submits a text-generation request; when the orchestrator routes the job
    to RunPod it returns a job id which we poll, otherwise the generated
    text is already in the response body.
    """
    timeout = settings.ollama_timeout

    async with httpx.AsyncClient(timeout=float(timeout)) as client:
        # Submit the generation request.
        request_body = {
            "prompt": prompt,
            "system": "You are a meeting analysis assistant. Respond only with valid JSON.",
            "model": settings.ollama_model,
            "max_tokens": 2048,
            "temperature": 0.3,
            "priority": settings.ai_orchestrator_priority,
        }
        response = await client.post(
            f"{settings.ai_orchestrator_url}/api/generate/text",
            json=request_body,
        )
        response.raise_for_status()

        result = response.json()
        provider = result.get("provider", "unknown")
        log.info("Orchestrator responded", provider=provider, cost=result.get("cost"))

        if provider != "runpod":
            # Synchronous (Ollama) path: generated text is already present.
            response_text = result.get("response", "")
        else:
            # Asynchronous RunPod job — poll until it completes.
            response_text = await _poll_runpod_job(client, result["job_id"], timeout)

        summary_data = _parse_summary_json(response_text)
        summary_data["_provider"] = provider
        return summary_data
|
||||
|
||||
|
||||
async def _poll_runpod_job(client: httpx.AsyncClient, job_id: str, max_wait: int) -> str:
    """Poll AI Orchestrator for RunPod job completion.

    Args:
        client: Open HTTP client to reuse for polling requests.
        job_id: RunPod job identifier returned by the orchestrator.
        max_wait: Maximum total seconds to wait before giving up.

    Returns:
        The generated text extracted from the completed job's output.

    Raises:
        RuntimeError: if the orchestrator reports the job as FAILED.
        TimeoutError: if the job does not complete within max_wait seconds.
    """
    status_endpoint = f"{settings.ai_orchestrator_url}/api/status/llm/{job_id}"
    poll_every = 5
    waited = 0

    # NOTE(review): only COMPLETED/FAILED are handled as terminal; other
    # statuses (e.g. CANCELLED) would spin until timeout — confirm the
    # orchestrator's status vocabulary.
    while waited < max_wait:
        await asyncio.sleep(poll_every)
        waited += poll_every

        reply = await client.get(status_endpoint)
        reply.raise_for_status()
        payload = reply.json()
        state = payload.get("status", "")

        log.debug("RunPod job poll", job_id=job_id, status=state, elapsed=waited)

        if state == "FAILED":
            error = payload.get("error", "Unknown RunPod error")
            raise RuntimeError(f"RunPod job failed: {error}")
        if state == "COMPLETED":
            output = payload.get("output", {})
            # vLLM output may be nested; extract the text field when present.
            if isinstance(output, dict):
                return output.get("text", output.get("response", json.dumps(output)))
            return str(output)

    raise TimeoutError(f"RunPod job {job_id} did not complete within {max_wait}s")
|
||||
|
||||
|
||||
def _parse_summary_json(response_text: str) -> dict:
|
||||
"""Parse and validate summary JSON from LLM response."""
|
||||
try:
|
||||
summary_data = json.loads(response_text)
|
||||
except json.JSONDecodeError as e:
|
||||
log.error("Failed to parse AI response as JSON", error=str(e), text=response_text[:500])
|
||||
raise HTTPException(status_code=500, detail="Failed to parse AI response")
|
||||
|
||||
return {
|
||||
"summary": summary_data.get("summary", "No summary generated"),
|
||||
"key_points": summary_data.get("key_points", []),
|
||||
"action_items": summary_data.get("action_items", []),
|
||||
"decisions": summary_data.get("decisions", []),
|
||||
"topics": summary_data.get("topics", []),
|
||||
"sentiment": summary_data.get("sentiment", "neutral"),
|
||||
}
|
||||
|
||||
|
||||
async def _generate_summary_with_ollama(prompt: str) -> dict:
|
||||
"""Generate summary using direct Ollama with configurable timeout."""
|
||||
async with httpx.AsyncClient(timeout=float(settings.ollama_timeout)) as client:
|
||||
try:
|
||||
response = await client.post(
|
||||
f"{settings.ollama_url}/api/generate",
|
||||
|
|
@ -224,18 +326,7 @@ async def _generate_summary_with_ollama(transcript: str) -> dict:
|
|||
result = response.json()
|
||||
response_text = result.get("response", "")
|
||||
|
||||
# Parse JSON from response
|
||||
summary_data = json.loads(response_text)
|
||||
|
||||
# Validate required fields
|
||||
return {
|
||||
"summary": summary_data.get("summary", "No summary generated"),
|
||||
"key_points": summary_data.get("key_points", []),
|
||||
"action_items": summary_data.get("action_items", []),
|
||||
"decisions": summary_data.get("decisions", []),
|
||||
"topics": summary_data.get("topics", []),
|
||||
"sentiment": summary_data.get("sentiment", "neutral")
|
||||
}
|
||||
return _parse_summary_json(response_text)
|
||||
|
||||
except httpx.HTTPError as e:
|
||||
log.error("Ollama request failed", error=str(e))
|
||||
|
|
@ -243,9 +334,3 @@ async def _generate_summary_with_ollama(transcript: str) -> dict:
|
|||
status_code=503,
|
||||
detail=f"AI service unavailable: {str(e)}"
|
||||
)
|
||||
except json.JSONDecodeError as e:
|
||||
log.error("Failed to parse Ollama response", error=str(e))
|
||||
raise HTTPException(
|
||||
status_code=500,
|
||||
detail="Failed to parse AI response"
|
||||
)
|
||||
|
|
|
|||
Loading…
Reference in New Issue