Drastically reduce prompt size for CPU inference speed
- Cut context to 512 tokens, max output to 128
- Only 2 retrieval chunks of 150 chars each (no headers)
- Keep only last 2 conversation messages
- Minimized system prompt overhead

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
08be7716f9
commit
80b398643e
|
|
@@ -24,7 +24,7 @@ class Settings(BaseSettings):
|
||||||
# RAG settings
|
# RAG settings
|
||||||
chunk_size: int = 500 # tokens per chunk
|
chunk_size: int = 500 # tokens per chunk
|
||||||
chunk_overlap: int = 50 # token overlap between chunks
|
chunk_overlap: int = 50 # token overlap between chunks
|
||||||
retrieval_top_k: int = 3 # number of chunks to retrieve
|
retrieval_top_k: int = 2 # number of chunks to retrieve
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
env_file = ".env"
|
env_file = ".env"
|
||||||
|
|
|
||||||
|
|
@@ -24,9 +24,9 @@ async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerato
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"keep_alive": "24h",
|
"keep_alive": "24h",
|
||||||
"options": {
|
"options": {
|
||||||
"num_ctx": 1024,
|
"num_ctx": 512,
|
||||||
"num_predict": 256,
|
"num_predict": 128,
|
||||||
"num_thread": 16,
|
"num_thread": 12,
|
||||||
"temperature": 0.7,
|
"temperature": 0.7,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
||||||
22
app/rag.py
22
app/rag.py
|
|
@@ -65,19 +65,9 @@ def build_context_prompt(chunks: list[dict]) -> str:
|
||||||
return "\n[No relevant documents found in the database.]\n"
|
return "\n[No relevant documents found in the database.]\n"
|
||||||
|
|
||||||
context_parts = []
|
context_parts = []
|
||||||
for i, chunk in enumerate(chunks, 1):
|
for chunk in chunks:
|
||||||
source_label = chunk["source_type"].title()
|
content = chunk["content"][:150]
|
||||||
metadata = chunk["metadata"]
|
context_parts.append(content)
|
||||||
|
|
||||||
header = f"--- Source {i} ({source_label})"
|
|
||||||
if "title" in metadata:
|
|
||||||
header += f" | {metadata['title']}"
|
|
||||||
if "substance" in metadata:
|
|
||||||
header += f" | Substance: {metadata['substance']}"
|
|
||||||
header += " ---"
|
|
||||||
|
|
||||||
content = chunk["content"][:300]
|
|
||||||
context_parts.append(f"{header}\n{content}")
|
|
||||||
|
|
||||||
return "\n\n".join(context_parts)
|
return "\n\n".join(context_parts)
|
||||||
|
|
||||||
|
|
@@ -92,12 +82,12 @@ async def chat_stream(
|
||||||
|
|
||||||
# Build the context-augmented system prompt
|
# Build the context-augmented system prompt
|
||||||
context_text = build_context_prompt(chunks)
|
context_text = build_context_prompt(chunks)
|
||||||
full_system = f"{SYSTEM_PROMPT}\n\nContext:\n{context_text}"
|
full_system = f"{SYSTEM_PROMPT}\n{context_text}"
|
||||||
|
|
||||||
# Build message history (keep minimal for speed)
|
# Keep only last exchange for speed
|
||||||
messages = []
|
messages = []
|
||||||
if conversation_history:
|
if conversation_history:
|
||||||
messages = conversation_history[-4:]
|
messages = conversation_history[-2:]
|
||||||
|
|
||||||
messages.append({"role": "user", "content": user_message})
|
messages.append({"role": "user", "content": user_message})
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue