Drastically reduce prompt size for CPU inference speed
- Cut context to 512 tokens, max output to 128
- Only 2 retrieval chunks of 150 chars each (no headers)
- Keep only last 2 conversation messages
- Minimized system prompt overhead

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
08be7716f9
commit
80b398643e
|
|
@ -24,7 +24,7 @@ class Settings(BaseSettings):
|
|||
# RAG settings
|
||||
chunk_size: int = 500 # tokens per chunk
|
||||
chunk_overlap: int = 50 # token overlap between chunks
|
||||
retrieval_top_k: int = 3 # number of chunks to retrieve
|
||||
retrieval_top_k: int = 2 # number of chunks to retrieve
|
||||
|
||||
class Config:
|
||||
env_file = ".env"
|
||||
|
|
|
|||
|
|
@ -24,9 +24,9 @@ async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerato
|
|||
"stream": True,
|
||||
"keep_alive": "24h",
|
||||
"options": {
|
||||
"num_ctx": 1024,
|
||||
"num_predict": 256,
|
||||
"num_thread": 16,
|
||||
"num_ctx": 512,
|
||||
"num_predict": 128,
|
||||
"num_thread": 12,
|
||||
"temperature": 0.7,
|
||||
},
|
||||
}
|
||||
|
|
|
|||
22
app/rag.py
22
app/rag.py
|
|
@ -65,19 +65,9 @@ def build_context_prompt(chunks: list[dict]) -> str:
|
|||
return "\n[No relevant documents found in the database.]\n"
|
||||
|
||||
context_parts = []
|
||||
for i, chunk in enumerate(chunks, 1):
|
||||
source_label = chunk["source_type"].title()
|
||||
metadata = chunk["metadata"]
|
||||
|
||||
header = f"--- Source {i} ({source_label})"
|
||||
if "title" in metadata:
|
||||
header += f" | {metadata['title']}"
|
||||
if "substance" in metadata:
|
||||
header += f" | Substance: {metadata['substance']}"
|
||||
header += " ---"
|
||||
|
||||
content = chunk["content"][:300]
|
||||
context_parts.append(f"{header}\n{content}")
|
||||
for chunk in chunks:
|
||||
content = chunk["content"][:150]
|
||||
context_parts.append(content)
|
||||
|
||||
return "\n\n".join(context_parts)
|
||||
|
||||
|
|
@ -92,12 +82,12 @@ async def chat_stream(
|
|||
|
||||
# Build the context-augmented system prompt
|
||||
context_text = build_context_prompt(chunks)
|
||||
full_system = f"{SYSTEM_PROMPT}\n\nContext:\n{context_text}"
|
||||
full_system = f"{SYSTEM_PROMPT}\n{context_text}"
|
||||
|
||||
# Build message history (keep minimal for speed)
|
||||
# Keep only last exchange for speed
|
||||
messages = []
|
||||
if conversation_history:
|
||||
messages = conversation_history[-4:]
|
||||
messages = conversation_history[-2:]
|
||||
|
||||
messages.append({"role": "user", "content": user_message})
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue