fix: use sampling_params for RunPod vLLM max_tokens + add API key env

The RunPod vLLM worker ignores top-level max_tokens and temperature; it
only reads them from the nested sampling_params object. Also adds
RUNPOD_API_KEY to the compose file so the key is injected explicitly
into the container environment.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Jeff Emmett 2026-03-23 10:15:12 -07:00
parent 514f9d8601
commit cc2d06bbfb
2 changed files with 5 additions and 2 deletions
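
Note: the RunPod vLLM worker reads generation parameters only from the
nested sampling_params object in the job input; top-level keys are
silently dropped. A minimal sketch of the corrected payload shape (the
prompt text and values are placeholders, not from this repo):

    payload = {
        "prompt": "Write a haiku about container orchestration.",
        "sampling_params": {
            "max_tokens": 256,      # honored here; ignored at top level
            "temperature": 0.7,
        },
    }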

@@ -8,6 +8,7 @@ services:
     restart: unless-stopped
     environment:
       - OLLAMA_HOST=http://ollama:11434
+      - RUNPOD_API_KEY=${RUNPOD_API_KEY}
       - INFISICAL_CLIENT_ID=${INFISICAL_CLIENT_ID}
       - INFISICAL_CLIENT_SECRET=${INFISICAL_CLIENT_SECRET}
       - INFISICAL_PROJECT_SLUG=ai-orchestrator

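With the compose change above, the key is available in the container
environment. A minimal sketch of reading it at startup, assuming a
fail-fast check (the check itself is an assumption, not part of this
diff):

    import os

    # Injected via docker-compose: RUNPOD_API_KEY=${RUNPOD_API_KEY}
    RUNPOD_API_KEY = os.environ.get("RUNPOD_API_KEY")
    if not RUNPOD_API_KEY:
        raise RuntimeError("RUNPOD_API_KEY is not set in the environment")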

@@ -663,8 +663,10 @@ async def generate_text(request: TextRequest):
     cost_tracker["runpod_requests"] += 1
     payload = {
         "prompt": request.prompt,
-        "max_tokens": request.max_tokens,
-        "temperature": request.temperature,
+        "sampling_params": {
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+        },
     }
     result = await submit_job(ENDPOINTS["llm"]["id"], payload)
     if "id" in result: