Drastically reduce prompt size for CPU inference speed

- Cut context to 512 tokens, max output to 128
- Only 2 retrieval chunks of 150 chars each (no headers)
- Keep only last 2 conversation messages
- Minimized system prompt overhead

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-02-17 01:47:06 -07:00
parent 08be7716f9
commit 80b398643e
3 changed files with 10 additions and 20 deletions

View File

@@ -24,7 +24,7 @@ class Settings(BaseSettings):
# RAG settings # RAG settings
chunk_size: int = 500 # tokens per chunk chunk_size: int = 500 # tokens per chunk
chunk_overlap: int = 50 # token overlap between chunks chunk_overlap: int = 50 # token overlap between chunks
retrieval_top_k: int = 3 # number of chunks to retrieve retrieval_top_k: int = 2 # number of chunks to retrieve
class Config: class Config:
env_file = ".env" env_file = ".env"

View File

@@ -24,9 +24,9 @@ async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerato
"stream": True, "stream": True,
"keep_alive": "24h", "keep_alive": "24h",
"options": { "options": {
"num_ctx": 1024, "num_ctx": 512,
"num_predict": 256, "num_predict": 128,
"num_thread": 16, "num_thread": 12,
"temperature": 0.7, "temperature": 0.7,
}, },
} }

View File

@@ -65,19 +65,9 @@ def build_context_prompt(chunks: list[dict]) -> str:
return "\n[No relevant documents found in the database.]\n" return "\n[No relevant documents found in the database.]\n"
context_parts = [] context_parts = []
for i, chunk in enumerate(chunks, 1): for chunk in chunks:
source_label = chunk["source_type"].title() content = chunk["content"][:150]
metadata = chunk["metadata"] context_parts.append(content)
header = f"--- Source {i} ({source_label})"
if "title" in metadata:
header += f" | {metadata['title']}"
if "substance" in metadata:
header += f" | Substance: {metadata['substance']}"
header += " ---"
content = chunk["content"][:300]
context_parts.append(f"{header}\n{content}")
return "\n\n".join(context_parts) return "\n\n".join(context_parts)
@@ -92,12 +82,12 @@ async def chat_stream(
# Build the context-augmented system prompt # Build the context-augmented system prompt
context_text = build_context_prompt(chunks) context_text = build_context_prompt(chunks)
full_system = f"{SYSTEM_PROMPT}\n\nContext:\n{context_text}" full_system = f"{SYSTEM_PROMPT}\n{context_text}"
# Build message history (keep minimal for speed) # Keep only last exchange for speed
messages = [] messages = []
if conversation_history: if conversation_history:
messages = conversation_history[-4:] messages = conversation_history[-2:]
messages.append({"role": "user", "content": user_message}) messages.append({"role": "user", "content": user_message})