Drastically reduce prompt size for CPU inference speed
- Cut context to 512 tokens, max output to 128
- Only 2 retrieval chunks of 150 chars each (no headers)
- Keep only last 2 conversation messages
- Minimized system prompt overhead

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
08be7716f9
commit
80b398643e
|
|
@@ -24,7 +24,7 @@ class Settings(BaseSettings):
|
||||||
# RAG settings
|
# RAG settings
|
||||||
chunk_size: int = 500 # tokens per chunk
|
chunk_size: int = 500 # tokens per chunk
|
||||||
chunk_overlap: int = 50 # token overlap between chunks
|
chunk_overlap: int = 50 # token overlap between chunks
|
||||||
retrieval_top_k: int = 3 # number of chunks to retrieve
|
retrieval_top_k: int = 2 # number of chunks to retrieve
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
env_file = ".env"
|
env_file = ".env"
|
||||||
|
|
|
||||||
|
|
@@ -24,9 +24,9 @@ async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerato
|
||||||
"stream": True,
|
"stream": True,
|
||||||
"keep_alive": "24h",
|
"keep_alive": "24h",
|
||||||
"options": {
|
"options": {
|
||||||
"num_ctx": 1024,
|
"num_ctx": 512,
|
||||||
"num_predict": 256,
|
"num_predict": 128,
|
||||||
"num_thread": 16,
|
"num_thread": 12,
|
||||||
"temperature": 0.7,
|
"temperature": 0.7,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
||||||
22
app/rag.py
22
app/rag.py
|
|
@@ -65,19 +65,9 @@ def build_context_prompt(chunks: list[dict]) -> str:
|
||||||
return "\n[No relevant documents found in the database.]\n"
|
return "\n[No relevant documents found in the database.]\n"
|
||||||
|
|
||||||
context_parts = []
|
context_parts = []
|
||||||
for i, chunk in enumerate(chunks, 1):
|
for chunk in chunks:
|
||||||
source_label = chunk["source_type"].title()
|
content = chunk["content"][:150]
|
||||||
metadata = chunk["metadata"]
|
context_parts.append(content)
|
||||||
|
|
||||||
header = f"--- Source {i} ({source_label})"
|
|
||||||
if "title" in metadata:
|
|
||||||
header += f" | {metadata['title']}"
|
|
||||||
if "substance" in metadata:
|
|
||||||
header += f" | Substance: {metadata['substance']}"
|
|
||||||
header += " ---"
|
|
||||||
|
|
||||||
content = chunk["content"][:300]
|
|
||||||
context_parts.append(f"{header}\n{content}")
|
|
||||||
|
|
||||||
return "\n\n".join(context_parts)
|
return "\n\n".join(context_parts)
|
||||||
|
|
||||||
|
|
@@ -92,12 +82,12 @@ async def chat_stream(
|
||||||
|
|
||||||
# Build the context-augmented system prompt
|
# Build the context-augmented system prompt
|
||||||
context_text = build_context_prompt(chunks)
|
context_text = build_context_prompt(chunks)
|
||||||
full_system = f"{SYSTEM_PROMPT}\n\nContext:\n{context_text}"
|
full_system = f"{SYSTEM_PROMPT}\n{context_text}"
|
||||||
|
|
||||||
# Build message history (keep minimal for speed)
|
# Keep only last exchange for speed
|
||||||
messages = []
|
messages = []
|
||||||
if conversation_history:
|
if conversation_history:
|
||||||
messages = conversation_history[-4:]
|
messages = conversation_history[-2:]
|
||||||
|
|
||||||
messages.append({"role": "user", "content": user_message})
|
messages.append({"role": "user", "content": user_message})
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue