@@ -0,0 +1,757 @@
"""
AI Orchestrator - Smart routing between local Ollama (free) and RunPod (GPU)

Routes:
- Text/Code: Ollama (free, local CPU) or RunPod vLLM (paid, fast GPU)
- Images: RunPod Automatic1111/ComfyUI
- Video: RunPod Wan2.2
- Audio: RunPod WhisperX
"""

import os
import asyncio
import httpx
from datetime import datetime
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, StreamingResponse
from pydantic import BaseModel
from typing import Optional, List, Dict, Any, Literal
from enum import Enum

# Config
# Read the RunPod key from the environment; never commit a real key to source control.
RUNPOD_API_KEY = os.getenv("RUNPOD_API_KEY", "")
RUNPOD_API_BASE = "https://api.runpod.ai/v2"
OLLAMA_HOST = os.getenv("OLLAMA_HOST", "http://localhost:11434")

# RunPod endpoints (paid GPU)
ENDPOINTS = {
    "video": {"id": "4jql4l7l0yw0f3", "name": "Wan2.2 Video", "type": "video"},
    "image": {"id": "tzf1j3sc3zufsy", "name": "Automatic1111 SD", "type": "image"},
    "comfyui": {"id": "5zurj845tbf8he", "name": "ComfyUI", "type": "image"},
    "whisper": {"id": "lrtisuv8ixbtub", "name": "WhisperX", "type": "audio"},
    "llm": {"id": "03g5hz3hlo8gr2", "name": "vLLM", "type": "text"},
}

# Ollama models (free local CPU)
OLLAMA_MODELS = {
    "llama3.2": {"name": "Llama 3.2 3B", "context": 128000, "size": "3B"},
    "llama3.2:1b": {"name": "Llama 3.2 1B", "context": 128000, "size": "1B"},
    "qwen2.5-coder:7b": {"name": "Qwen 2.5 Coder 7B", "context": 32000, "size": "7B"},
    "mistral": {"name": "Mistral 7B", "context": 32000, "size": "7B"},
    "phi3": {"name": "Phi-3 Mini", "context": 128000, "size": "3.8B"},
}
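# Note: OLLAMA_MODELS lists recommendations only; a model must be pulled into the local
# Ollama instance (e.g. via POST /api/ollama/pull/{model} below) before it can serve requests.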

app = FastAPI(
    title="AI Orchestrator",
    description="Smart routing between local Ollama (free) and RunPod (GPU)",
    version="1.0.0",
)

# CORS middleware for web access
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Store recent jobs
recent_jobs = []

# Track cost savings
cost_tracker = {
    "ollama_requests": 0,
    "runpod_requests": 0,
    "estimated_savings": 0.0,  # USD saved by using Ollama
}
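# Both recent_jobs and cost_tracker are plain in-memory structures: they are shared across
# requests within a single process and reset whenever the server restarts.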


# ============== Ollama Functions (FREE local inference) ==============

async def ollama_health() -> dict:
    """Check Ollama service health"""
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(f"{OLLAMA_HOST}/api/tags", timeout=5)
            if resp.status_code == 200:
                data = resp.json()
                return {
                    "status": "healthy",
                    "models": [m["name"] for m in data.get("models", [])],
                }
            return {"status": "unhealthy", "error": f"Status {resp.status_code}"}
        except Exception as e:
            return {"status": "unavailable", "error": str(e)}


async def ollama_generate(
    prompt: str,
    model: str = "llama3.2",
    system: Optional[str] = None,
    stream: bool = False,
    options: Optional[dict] = None,
) -> dict:
    """Generate text using local Ollama"""
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": stream,
    }
    if system:
        payload["system"] = system
    if options:
        payload["options"] = options

    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(
                f"{OLLAMA_HOST}/api/generate",
                json=payload,
                timeout=300,  # 5 min timeout for long generations
            )
            result = resp.json()
            # Track usage
            cost_tracker["ollama_requests"] += 1
            cost_tracker["estimated_savings"] += 0.001  # ~$0.001 saved per request vs RunPod
            return result
        except Exception as e:
            return {"error": str(e)}


async def ollama_chat(
    messages: List[dict],
    model: str = "llama3.2",
    stream: bool = False,
    options: Optional[dict] = None,
) -> dict:
    """Chat completion using local Ollama"""
    payload = {
        "model": model,
        "messages": messages,
        "stream": stream,
    }
    if options:
        payload["options"] = options

    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(
                f"{OLLAMA_HOST}/api/chat",
                json=payload,
                timeout=300,
            )
            result = resp.json()
            cost_tracker["ollama_requests"] += 1
            cost_tracker["estimated_savings"] += 0.001
            return result
        except Exception as e:
            return {"error": str(e)}


async def ollama_pull_model(model: str) -> dict:
    """Pull/download a model to Ollama"""
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(
                f"{OLLAMA_HOST}/api/pull",
                json={"name": model},
                timeout=600,  # Models can take a while to download
            )
            return resp.json()
        except Exception as e:
            return {"error": str(e)}


def build_comfyui_workflow(
    prompt: str,
    negative_prompt: str = "",
    seed: int = 42,
    steps: int = 20,
    cfg: float = 1.0,  # Flux uses low CFG (1.0)
    width: int = 1024,
    height: int = 1024,
    sampler: str = "euler",
    scheduler: str = "simple",
    denoise: float = 1.0,
    model: str = "flux1-dev-fp8.safetensors",
) -> dict:
    """Build a ComfyUI Flux txt2img workflow in API format"""
    return {
        "4": {
            "class_type": "CheckpointLoaderSimple",
            "inputs": {
                "ckpt_name": model
            }
        },
        "5": {
            "class_type": "EmptyLatentImage",
            "inputs": {
                "batch_size": 1,
                "height": height,
                "width": width
            }
        },
        "6": {
            "class_type": "CLIPTextEncode",
            "inputs": {
                "clip": ["4", 1],
                "text": prompt
            }
        },
        "7": {
            "class_type": "CLIPTextEncode",
            "inputs": {
                "clip": ["4", 1],
                "text": negative_prompt
            }
        },
        "3": {
            "class_type": "KSampler",
            "inputs": {
                "cfg": cfg,
                "denoise": denoise,
                "latent_image": ["5", 0],
                "model": ["4", 0],
                "negative": ["7", 0],
                "positive": ["6", 0],
                "sampler_name": sampler,
                "scheduler": scheduler,
                "seed": seed,
                "steps": steps
            }
        },
        "8": {
            "class_type": "VAEDecode",
            "inputs": {
                "samples": ["3", 0],
                "vae": ["4", 2]
            }
        },
        "9": {
            "class_type": "SaveImage",
            "inputs": {
                "filename_prefix": "ComfyUI",
                "images": ["8", 0]
            }
        }
    }
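

# ============== RunPod Endpoint Helpers (paid GPU) ==============
# Thin wrappers around the RunPod serverless REST API: /health, /status/{job_id}, and /run.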


async def get_endpoint_health(endpoint_id: str) -> dict:
    """Get health status of a RunPod endpoint"""
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(
                f"{RUNPOD_API_BASE}/{endpoint_id}/health",
                headers={"Authorization": f"Bearer {RUNPOD_API_KEY}"},
                timeout=10,
            )
            return resp.json()
        except Exception as e:
            return {"error": str(e)}


async def get_job_status(endpoint_id: str, job_id: str) -> dict:
    """Get status of a specific job"""
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.get(
                f"{RUNPOD_API_BASE}/{endpoint_id}/status/{job_id}",
                headers={"Authorization": f"Bearer {RUNPOD_API_KEY}"},
                timeout=10,
            )
            return resp.json()
        except Exception as e:
            return {"error": str(e)}


async def submit_job(endpoint_id: str, payload: dict) -> dict:
    """Submit a job to a RunPod endpoint"""
    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(
                f"{RUNPOD_API_BASE}/{endpoint_id}/run",
                headers={
                    "Authorization": f"Bearer {RUNPOD_API_KEY}",
                    "Content-Type": "application/json",
                },
                json={"input": payload},
                timeout=30,
            )
            return resp.json()
        except Exception as e:
            return {"error": str(e)}


@app.get("/", response_class=HTMLResponse)
|
|
|
|
|
async def dashboard():
|
|
|
|
|
"""Main dashboard showing all endpoint statuses"""
|
|
|
|
|
|
|
|
|
|
# Fetch all endpoint health in parallel
|
|
|
|
|
health_tasks = {
|
|
|
|
|
name: get_endpoint_health(ep["id"])
|
|
|
|
|
for name, ep in ENDPOINTS.items()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
health_results = {}
|
|
|
|
|
for name, task in health_tasks.items():
|
|
|
|
|
health_results[name] = await task
|
|
|
|
|
|
|
|
|
|
# Get Ollama status
|
|
|
|
|
ollama_status = await ollama_health()
|
|
|
|
|
|
|
|
|
|
    # Build HTML
    html = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>AI Orchestrator Dashboard</title>
        <meta http-equiv="refresh" content="10">
        <style>
            body { font-family: -apple-system, sans-serif; background: #1a1a2e; color: #eee; padding: 20px; }
            h1 { color: #00d9ff; }
            .grid { display: grid; grid-template-columns: repeat(auto-fit, minmax(300px, 1fr)); gap: 20px; }
            .card { background: #16213e; border-radius: 10px; padding: 20px; }
            .card h3 { margin-top: 0; color: #00d9ff; }
            .status { display: inline-block; padding: 4px 12px; border-radius: 20px; font-size: 12px; }
            .status.ready { background: #00c853; color: #000; }
            .status.throttled { background: #ff9800; color: #000; }
            .status.idle { background: #2196f3; color: #fff; }
            .status.error { background: #f44336; color: #fff; }
            .metric { display: flex; justify-content: space-between; padding: 8px 0; border-bottom: 1px solid #333; }
            .metric:last-child { border-bottom: none; }
            .metric-value { font-weight: bold; color: #00d9ff; }
            .timestamp { color: #666; font-size: 12px; margin-top: 20px; }
            .test-btn { background: #00d9ff; color: #000; border: none; padding: 10px 20px; border-radius: 5px; cursor: pointer; margin-top: 10px; }
            .test-btn:hover { background: #00b8d4; }
        </style>
    </head>
    <body>
        <h1>🤖 AI Orchestrator Dashboard</h1>
        <p class="timestamp">Last updated: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S") + """ (auto-refreshes every 10s)</p>

        <div class="stats-bar" style="display: flex; gap: 20px; margin-bottom: 20px; flex-wrap: wrap;">
            <div class="stat-card" style="background: #16213e; padding: 15px 25px; border-radius: 10px;">
                <span style="color: #888;">Ollama Status</span>
                <span class="metric-value" style="display: block; font-size: 24px; color: """ + ("#00c853" if ollama_status.get("status") == "healthy" else "#f44336") + """;">""" + ollama_status.get("status", "unknown").upper() + """</span>
            </div>
            <div class="stat-card" style="background: #16213e; padding: 15px 25px; border-radius: 10px;">
                <span style="color: #888;">Free Requests (Ollama)</span>
                <span class="metric-value" style="display: block; font-size: 24px;">""" + str(cost_tracker["ollama_requests"]) + """</span>
            </div>
            <div class="stat-card" style="background: #16213e; padding: 15px 25px; border-radius: 10px;">
                <span style="color: #888;">Paid Requests (RunPod)</span>
                <span class="metric-value" style="display: block; font-size: 24px;">""" + str(cost_tracker["runpod_requests"]) + """</span>
            </div>
            <div class="stat-card" style="background: #16213e; padding: 15px 25px; border-radius: 10px;">
                <span style="color: #888;">Est. Savings</span>
                <span class="metric-value" style="display: block; font-size: 24px; color: #00c853;">$""" + str(round(cost_tracker["estimated_savings"], 2)) + """</span>
            </div>
        </div>

        <div class="grid">
    """

    for name, ep in ENDPOINTS.items():
        health = health_results.get(name, {})
        workers = health.get("workers", {})
        jobs = health.get("jobs", {})

        # Determine status
        if "error" in health:
            status_class = "error"
            status_text = "Error"
        elif workers.get("ready", 0) > 0 or workers.get("running", 0) > 0:
            status_class = "ready"
            status_text = "Ready"
        elif workers.get("throttled", 0) > 0:
            status_class = "throttled"
            status_text = "Throttled (waiting for GPU)"
        elif workers.get("idle", 0) > 0:
            status_class = "idle"
            status_text = "Idle"
        else:
            status_class = "idle"
            status_text = "Standby"

        html += f"""
        <div class="card">
            <h3>{ep['name']}</h3>
            <span class="status {status_class}">{status_text}</span>
            <p style="color: #888; font-size: 12px;">Type: {ep['type']} | ID: {ep['id'][:8]}...</p>

            <div class="metric"><span>Workers Ready</span><span class="metric-value">{workers.get('ready', 0)}</span></div>
            <div class="metric"><span>Workers Running</span><span class="metric-value">{workers.get('running', 0)}</span></div>
            <div class="metric"><span>Workers Initializing</span><span class="metric-value">{workers.get('initializing', 0)}</span></div>
            <div class="metric"><span>Workers Throttled</span><span class="metric-value">{workers.get('throttled', 0)}</span></div>
            <div class="metric"><span>Jobs In Queue</span><span class="metric-value">{jobs.get('inQueue', 0)}</span></div>
            <div class="metric"><span>Jobs In Progress</span><span class="metric-value">{jobs.get('inProgress', 0)}</span></div>
            <div class="metric"><span>Jobs Completed</span><span class="metric-value">{jobs.get('completed', 0)}</span></div>
            <div class="metric"><span>Jobs Failed</span><span class="metric-value">{jobs.get('failed', 0)}</span></div>

            <a href="/test/{name}"><button class="test-btn">Test Endpoint</button></a>
        </div>
        """

html += """
|
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<h2 style="margin-top: 40px;">Recent Jobs</h2>
|
|
|
|
|
<div class="card">
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
if recent_jobs:
|
|
|
|
|
for job in recent_jobs[-10:][::-1]:
|
|
|
|
|
html += f"""<div class="metric">
|
|
|
|
|
<span>{job['endpoint']} - {job['id'][:16]}...</span>
|
|
|
|
|
<span class="metric-value">{job['status']}</span>
|
|
|
|
|
</div>"""
|
|
|
|
|
else:
|
|
|
|
|
html += "<p style='color: #666;'>No jobs submitted yet</p>"
|
|
|
|
|
|
|
|
|
|
html += """
|
|
|
|
|
</div>
|
|
|
|
|
</body>
|
|
|
|
|
</html>
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
return HTMLResponse(content=html)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/api/health")
|
|
|
|
|
async def api_health():
|
|
|
|
|
"""API endpoint to get all endpoint health"""
|
|
|
|
|
results = {}
|
|
|
|
|
for name, ep in ENDPOINTS.items():
|
|
|
|
|
results[name] = await get_endpoint_health(ep["id"])
|
|
|
|
|
return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/api/status/{endpoint}/{job_id}")
|
|
|
|
|
async def api_job_status(endpoint: str, job_id: str):
|
|
|
|
|
"""Get status of a specific job"""
|
|
|
|
|
if endpoint not in ENDPOINTS:
|
|
|
|
|
raise HTTPException(status_code=404, detail="Endpoint not found")
|
|
|
|
|
return await get_job_status(ENDPOINTS[endpoint]["id"], job_id)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/test/{endpoint}")
|
|
|
|
|
async def test_endpoint(endpoint: str):
|
|
|
|
|
"""Submit a test job to an endpoint"""
|
|
|
|
|
if endpoint not in ENDPOINTS:
|
|
|
|
|
raise HTTPException(status_code=404, detail="Endpoint not found")
|
|
|
|
|
|
|
|
|
|
ep = ENDPOINTS[endpoint]
|
|
|
|
|
|
|
|
|
|
# Different test payloads for different endpoint types
|
|
|
|
|
if ep["type"] == "video":
|
|
|
|
|
payload = {
|
|
|
|
|
"prompt": "A cat walking through a garden, cinematic lighting, high quality",
|
|
|
|
|
"negative_prompt": "blurry, low quality, distorted",
|
|
|
|
|
"seed": 42,
|
|
|
|
|
"cfg": 4.0,
|
|
|
|
|
"steps": 20,
|
|
|
|
|
"width": 832,
|
|
|
|
|
"height": 480,
|
|
|
|
|
"num_frames": 81,
|
|
|
|
|
"length": 81
|
|
|
|
|
}
|
|
|
|
|
elif ep["type"] == "image":
|
|
|
|
|
if endpoint == "comfyui":
|
|
|
|
|
# ComfyUI needs a workflow JSON
|
|
|
|
|
payload = {
|
|
|
|
|
"workflow": build_comfyui_workflow(
|
|
|
|
|
prompt="A beautiful sunset over mountains, photorealistic, 8k",
|
|
|
|
|
negative_prompt="blurry, low quality, distorted",
|
|
|
|
|
seed=42,
|
|
|
|
|
steps=20,
|
|
|
|
|
cfg=7.0,
|
|
|
|
|
width=512,
|
|
|
|
|
height=512
|
|
|
|
|
)
|
|
|
|
|
}
|
|
|
|
|
else:
|
|
|
|
|
payload = {"prompt": "A beautiful sunset over mountains, photorealistic"}
|
|
|
|
|
elif ep["type"] == "audio":
|
|
|
|
|
payload = {"audio_url": "https://example.com/test.mp3"}
|
|
|
|
|
elif ep["type"] == "text":
|
|
|
|
|
payload = {"prompt": "Hello, how are you?", "max_tokens": 50}
|
|
|
|
|
else:
|
|
|
|
|
payload = {"test": True}
|
|
|
|
|
|
|
|
|
|
result = await submit_job(ep["id"], payload)
|
|
|
|
|
|
|
|
|
|
# Track job
|
|
|
|
|
if "id" in result:
|
|
|
|
|
recent_jobs.append({
|
|
|
|
|
"endpoint": endpoint,
|
|
|
|
|
"id": result["id"],
|
|
|
|
|
"status": result.get("status", "SUBMITTED"),
|
|
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
|
|
})
|
|
|
|
|
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class VideoRequest(BaseModel):
    prompt: str
    negative_prompt: Optional[str] = "blurry, low quality, distorted"
    seed: Optional[int] = None  # Random if not set
    cfg: Optional[float] = 4.0  # Classifier-free guidance
    steps: Optional[int] = 20
    width: Optional[int] = 832
    height: Optional[int] = 480
    num_frames: Optional[int] = 81
    length: Optional[int] = 81  # Same as num_frames for Wan2.2


class ImageRequest(BaseModel):
    prompt: str
    negative_prompt: Optional[str] = "blurry, low quality"
    steps: Optional[int] = 20
    width: Optional[int] = 512
    height: Optional[int] = 512


@app.post("/api/generate/video")
|
|
|
|
|
async def generate_video(request: VideoRequest):
|
|
|
|
|
"""Generate video using Wan2.2"""
|
|
|
|
|
import random
|
|
|
|
|
payload = request.dict()
|
|
|
|
|
# Generate random seed if not provided
|
|
|
|
|
if payload.get("seed") is None:
|
|
|
|
|
payload["seed"] = random.randint(1, 2147483647)
|
|
|
|
|
|
|
|
|
|
result = await submit_job(ENDPOINTS["video"]["id"], payload)
|
|
|
|
|
if "id" in result:
|
|
|
|
|
recent_jobs.append({
|
|
|
|
|
"endpoint": "video",
|
|
|
|
|
"id": result["id"],
|
|
|
|
|
"status": result.get("status", "SUBMITTED"),
|
|
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
|
|
})
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/api/generate/image")
|
|
|
|
|
async def generate_image(request: ImageRequest):
|
|
|
|
|
"""Generate image using Automatic1111"""
|
|
|
|
|
result = await submit_job(ENDPOINTS["image"]["id"], request.dict())
|
|
|
|
|
if "id" in result:
|
|
|
|
|
recent_jobs.append({
|
|
|
|
|
"endpoint": "image",
|
|
|
|
|
"id": result["id"],
|
|
|
|
|
"status": result.get("status", "SUBMITTED"),
|
|
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
|
|
})
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class ComfyUIRequest(BaseModel):
    prompt: str
    negative_prompt: Optional[str] = ""
    seed: Optional[int] = None
    steps: Optional[int] = 20
    cfg: Optional[float] = 7.0
    width: Optional[int] = 512
    height: Optional[int] = 512
    sampler: Optional[str] = "euler"
    scheduler: Optional[str] = "normal"
    workflow: Optional[Dict[str, Any]] = None  # Custom workflow override


@app.post("/api/generate/comfyui")
|
|
|
|
|
async def generate_comfyui(request: ComfyUIRequest):
|
|
|
|
|
"""Generate image using ComfyUI with workflow"""
|
|
|
|
|
import random
|
|
|
|
|
|
|
|
|
|
# Use custom workflow if provided, otherwise build default txt2img
|
|
|
|
|
if request.workflow:
|
|
|
|
|
workflow = request.workflow
|
|
|
|
|
else:
|
|
|
|
|
seed = request.seed if request.seed is not None else random.randint(1, 2147483647)
|
|
|
|
|
workflow = build_comfyui_workflow(
|
|
|
|
|
prompt=request.prompt,
|
|
|
|
|
negative_prompt=request.negative_prompt,
|
|
|
|
|
seed=seed,
|
|
|
|
|
steps=request.steps,
|
|
|
|
|
cfg=request.cfg,
|
|
|
|
|
width=request.width,
|
|
|
|
|
height=request.height,
|
|
|
|
|
sampler=request.sampler,
|
|
|
|
|
scheduler=request.scheduler,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
payload = {"workflow": workflow}
|
|
|
|
|
result = await submit_job(ENDPOINTS["comfyui"]["id"], payload)
|
|
|
|
|
|
|
|
|
|
if "id" in result:
|
|
|
|
|
recent_jobs.append({
|
|
|
|
|
"endpoint": "comfyui",
|
|
|
|
|
"id": result["id"],
|
|
|
|
|
"status": result.get("status", "SUBMITTED"),
|
|
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
|
|
})
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ============== Text Generation Endpoints (Smart Routing) ==============

class Priority(str, Enum):
    LOW = "low"        # Prefer free Ollama (falls back to RunPod if Ollama is down)
    NORMAL = "normal"  # Ollama if available, else RunPod
    HIGH = "high"      # RunPod for speed


class TextRequest(BaseModel):
    prompt: str
    system: Optional[str] = None
    model: Optional[str] = "llama3.2"  # Ollama model name
    max_tokens: Optional[int] = 2048
    temperature: Optional[float] = 0.7
    priority: Optional[Priority] = Priority.NORMAL


class ChatRequest(BaseModel):
    messages: List[Dict[str, str]]  # [{"role": "user", "content": "..."}]
    model: Optional[str] = "llama3.2"
    max_tokens: Optional[int] = 2048
    temperature: Optional[float] = 0.7
    priority: Optional[Priority] = Priority.NORMAL


@app.post("/api/generate/text")
|
|
|
|
|
async def generate_text(request: TextRequest):
|
|
|
|
|
"""
|
|
|
|
|
Generate text with smart routing:
|
|
|
|
|
- LOW priority: Always Ollama (free)
|
|
|
|
|
- NORMAL priority: Ollama if healthy, else RunPod
|
|
|
|
|
- HIGH priority: RunPod vLLM (fast GPU)
|
|
|
|
|
"""
|
|
|
|
|
# Check Ollama health for routing decision
|
|
|
|
|
ollama_status = await ollama_health()
|
|
|
|
|
use_ollama = False
|
|
|
|
|
|
|
|
|
|
if request.priority == Priority.LOW:
|
|
|
|
|
use_ollama = True
|
|
|
|
|
elif request.priority == Priority.NORMAL:
|
|
|
|
|
use_ollama = ollama_status.get("status") == "healthy"
|
|
|
|
|
# HIGH priority always uses RunPod
|
|
|
|
|
|
|
|
|
|
if use_ollama and ollama_status.get("status") == "healthy":
|
|
|
|
|
# Use free local Ollama
|
|
|
|
|
result = await ollama_generate(
|
|
|
|
|
prompt=request.prompt,
|
|
|
|
|
model=request.model,
|
|
|
|
|
system=request.system,
|
|
|
|
|
options={
|
|
|
|
|
"num_predict": request.max_tokens,
|
|
|
|
|
"temperature": request.temperature,
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
return {
|
|
|
|
|
"provider": "ollama",
|
|
|
|
|
"model": request.model,
|
|
|
|
|
"cost": 0.0,
|
|
|
|
|
"response": result.get("response", ""),
|
|
|
|
|
"tokens": result.get("eval_count", 0),
|
|
|
|
|
}
|
|
|
|
|
else:
|
|
|
|
|
# Use RunPod vLLM (paid)
|
|
|
|
|
cost_tracker["runpod_requests"] += 1
|
|
|
|
|
payload = {
|
|
|
|
|
"prompt": request.prompt,
|
|
|
|
|
"max_tokens": request.max_tokens,
|
|
|
|
|
"temperature": request.temperature,
|
|
|
|
|
}
|
|
|
|
|
result = await submit_job(ENDPOINTS["llm"]["id"], payload)
|
|
|
|
|
if "id" in result:
|
|
|
|
|
recent_jobs.append({
|
|
|
|
|
"endpoint": "llm",
|
|
|
|
|
"id": result["id"],
|
|
|
|
|
"status": result.get("status", "SUBMITTED"),
|
|
|
|
|
"timestamp": datetime.now().isoformat(),
|
|
|
|
|
})
|
|
|
|
|
return {
|
|
|
|
|
"provider": "runpod",
|
|
|
|
|
"model": "vLLM",
|
|
|
|
|
"cost": 0.001, # Estimated per request
|
|
|
|
|
"job_id": result.get("id"),
|
|
|
|
|
"status": result.get("status"),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/api/chat")
|
|
|
|
|
async def chat_completion(request: ChatRequest):
|
|
|
|
|
"""
|
|
|
|
|
Chat completion with smart routing
|
|
|
|
|
"""
|
|
|
|
|
ollama_status = await ollama_health()
|
|
|
|
|
use_ollama = request.priority != Priority.HIGH and ollama_status.get("status") == "healthy"
|
|
|
|
|
|
|
|
|
|
if use_ollama:
|
|
|
|
|
result = await ollama_chat(
|
|
|
|
|
messages=request.messages,
|
|
|
|
|
model=request.model,
|
|
|
|
|
options={
|
|
|
|
|
"num_predict": request.max_tokens,
|
|
|
|
|
"temperature": request.temperature,
|
|
|
|
|
},
|
|
|
|
|
)
|
|
|
|
|
return {
|
|
|
|
|
"provider": "ollama",
|
|
|
|
|
"model": request.model,
|
|
|
|
|
"cost": 0.0,
|
|
|
|
|
"message": result.get("message", {}),
|
|
|
|
|
"tokens": result.get("eval_count", 0),
|
|
|
|
|
}
|
|
|
|
|
else:
|
|
|
|
|
# Fallback to RunPod
|
|
|
|
|
cost_tracker["runpod_requests"] += 1
|
|
|
|
|
# Convert chat format to prompt for vLLM
|
|
|
|
|
prompt = "\n".join([f"{m['role']}: {m['content']}" for m in request.messages])
|
|
|
|
|
result = await submit_job(ENDPOINTS["llm"]["id"], {"prompt": prompt})
|
|
|
|
|
return {
|
|
|
|
|
"provider": "runpod",
|
|
|
|
|
"job_id": result.get("id"),
|
|
|
|
|
"status": result.get("status"),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/api/ollama/models")
|
|
|
|
|
async def list_ollama_models():
|
|
|
|
|
"""List available Ollama models"""
|
|
|
|
|
status = await ollama_health()
|
|
|
|
|
return {
|
|
|
|
|
"available": status.get("models", []),
|
|
|
|
|
"recommended": list(OLLAMA_MODELS.keys()),
|
|
|
|
|
"status": status.get("status"),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.post("/api/ollama/pull/{model}")
|
|
|
|
|
async def pull_ollama_model(model: str):
|
|
|
|
|
"""Pull/download a model to Ollama"""
|
|
|
|
|
result = await ollama_pull_model(model)
|
|
|
|
|
return result
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@app.get("/api/stats")
|
|
|
|
|
async def get_stats():
|
|
|
|
|
"""Get usage statistics and cost savings"""
|
|
|
|
|
return {
|
|
|
|
|
"ollama_requests": cost_tracker["ollama_requests"],
|
|
|
|
|
"runpod_requests": cost_tracker["runpod_requests"],
|
|
|
|
|
"estimated_savings_usd": round(cost_tracker["estimated_savings"], 4),
|
|
|
|
|
"total_requests": cost_tracker["ollama_requests"] + cost_tracker["runpod_requests"],
|
|
|
|
|
"ollama_percentage": round(
|
|
|
|
|
cost_tracker["ollama_requests"] / max(1, cost_tracker["ollama_requests"] + cost_tracker["runpod_requests"]) * 100, 1
|
|
|
|
|
),
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
|
|
|
|
|
import uvicorn
|
|
|
|
|
uvicorn.run(app, host="0.0.0.0", port=8080)
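
# Example invocations (illustrative; assumes the server is running locally on port 8080):
#   curl http://localhost:8080/api/stats
#   curl -X POST http://localhost:8080/api/generate/text \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Write a haiku about GPUs", "priority": "low"}'
#   curl -X POST http://localhost:8080/api/chat \
#        -H "Content-Type: application/json" \
#        -d '{"messages": [{"role": "user", "content": "Hello"}]}'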