fix: use sampling_params for RunPod vLLM max_tokens + add API key env
vLLM ignores a top-level max_tokens and only reads it from sampling_params. Also adds RUNPOD_API_KEY to the compose environment for explicit env injection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
parent 514f9d8601
commit cc2d06bbfb
@@ -8,6 +8,7 @@ services:
     restart: unless-stopped
     environment:
       - OLLAMA_HOST=http://ollama:11434
+      - RUNPOD_API_KEY=${RUNPOD_API_KEY}
       - INFISICAL_CLIENT_ID=${INFISICAL_CLIENT_ID}
       - INFISICAL_CLIENT_SECRET=${INFISICAL_CLIENT_SECRET}
       - INFISICAL_PROJECT_SLUG=ai-orchestrator
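With RUNPOD_API_KEY now injected through the compose environment block, the service can pick it up at startup. A minimal sketch, assuming the orchestrator is a Python service that should fail fast on a missing key:

import os

# Assumption: the key is provided via the compose "environment" entry above.
RUNPOD_API_KEY = os.environ.get("RUNPOD_API_KEY")
if not RUNPOD_API_KEY:
    raise RuntimeError("RUNPOD_API_KEY is not set; check the compose environment block")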
@@ -663,8 +663,10 @@ async def generate_text(request: TextRequest):
     cost_tracker["runpod_requests"] += 1
     payload = {
         "prompt": request.prompt,
-        "max_tokens": request.max_tokens,
-        "temperature": request.temperature,
+        "sampling_params": {
+            "max_tokens": request.max_tokens,
+            "temperature": request.temperature,
+        },
     }
     result = await submit_job(ENDPOINTS["llm"]["id"], payload)
     if "id" in result:
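For context, a sketch of what submit_job might do (hypothetical implementation; the real helper lives elsewhere in this file). RunPod's serverless /run endpoint expects the payload wrapped in an "input" object and returns a job "id" for polling:

import os
import httpx

async def submit_job(endpoint_id: str, payload: dict) -> dict:
    # Hedged sketch: POST the payload to RunPod's async /run endpoint,
    # authenticated with the RUNPOD_API_KEY injected via compose.
    url = f"https://api.runpod.ai/v2/{endpoint_id}/run"
    headers = {"Authorization": f"Bearer {os.environ['RUNPOD_API_KEY']}"}
    async with httpx.AsyncClient(timeout=30.0) as client:
        resp = await client.post(url, headers=headers, json={"input": payload})
        resp.raise_for_status()
        return resp.json()  # contains "id" while the job is queued/running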