From cc2d06bbfb38110d50df6ade4b45e3877fbd0689 Mon Sep 17 00:00:00 2001
From: Jeff Emmett
Date: Mon, 23 Mar 2026 10:15:12 -0700
Subject: [PATCH] fix: use sampling_params for RunPod vLLM max_tokens + add API key env

vLLM ignores top-level max_tokens, only reads from sampling_params.
Also adds RUNPOD_API_KEY to compose for explicit env injection.

Co-Authored-By: Claude Opus 4.6
---
 docker-compose.yml | 1 +
 server.py          | 6 ++++--
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index 77987e1..84a5af5 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -8,6 +8,7 @@ services:
     restart: unless-stopped
     environment:
       - OLLAMA_HOST=http://ollama:11434
+      - RUNPOD_API_KEY=${RUNPOD_API_KEY}
       - INFISICAL_CLIENT_ID=${INFISICAL_CLIENT_ID}
       - INFISICAL_CLIENT_SECRET=${INFISICAL_CLIENT_SECRET}
       - INFISICAL_PROJECT_SLUG=ai-orchestrator
diff --git a/server.py b/server.py
index 3dbbb46..ea732fe 100644
--- a/server.py
+++ b/server.py
@@ -663,8 +663,10 @@ async def generate_text(request: TextRequest):
         cost_tracker["runpod_requests"] += 1
         payload = {
             "prompt": request.prompt,
-            "max_tokens": request.max_tokens,
-            "temperature": request.temperature,
+            "sampling_params": {
+                "max_tokens": request.max_tokens,
+                "temperature": request.temperature,
+            },
         }
         result = await submit_job(ENDPOINTS["llm"]["id"], payload)
         if "id" in result: