Initial commit: Erowid conversational bot

RAG-powered chatbot that indexes Erowid's experience reports and substance
info, making them searchable via natural conversation. Built with FastAPI,
PostgreSQL+pgvector, Ollama embeddings, and streaming LLM responses.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-02-17 01:19:49 +00:00
commit d09d065d08
19 changed files with 1752 additions and 0 deletions

24
.env.example Normal file
View File

@ -0,0 +1,24 @@
# Database
DATABASE_URL=postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid
DATABASE_URL_SYNC=postgresql://erowid:erowid@erowid-db:5432/erowid
POSTGRES_USER=erowid
POSTGRES_PASSWORD=erowid
POSTGRES_DB=erowid
# LLM Provider: ollama | claude | openai
LLM_PROVIDER=ollama
# Ollama (local, free)
OLLAMA_BASE_URL=http://ollama:11434
OLLAMA_EMBED_MODEL=nomic-embed-text
# NOTE(review): app/config.py defaults to llama3.1:8b — confirm which chat model is intended
OLLAMA_CHAT_MODEL=llama3.2:3b
# Claude API (optional)
ANTHROPIC_API_KEY=
# OpenAI API (optional)
OPENAI_API_KEY=
# App
APP_HOST=0.0.0.0
APP_PORT=8000

10
.gitignore vendored Normal file
View File

@ -0,0 +1,10 @@
# Python bytecode and build artifacts
__pycache__/
*.pyc
*.pyo
# Local secrets (see .env.example for the template)
.env
*.egg-info/
dist/
build/
# Virtual environments
.venv/
venv/
# Local planning notes, intentionally untracked
PLAN.md

16
Dockerfile Normal file
View File

@ -0,0 +1,16 @@
FROM python:3.12-slim
WORKDIR /app
# build-essential + libpq-dev: presumably required to build Postgres client
# packages from requirements.txt — confirm once requirements are pinned.
# Clearing the apt lists keeps the image layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential libpq-dev \
    && rm -rf /var/lib/apt/lists/*
# Copy requirements first so dependency layers cache across app-code edits
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY app/ ./app/
EXPOSE 8000
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

0
app/__init__.py Normal file
View File

34
app/config.py Normal file
View File

@ -0,0 +1,34 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application configuration loaded from environment variables / .env."""
    # Async URL used by the app's SQLAlchemy engine; sync URL for external tools
    database_url: str = "postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid"
    database_url_sync: str = "postgresql://erowid:erowid@erowid-db:5432/erowid"
    llm_provider: str = "ollama"  # ollama | claude | openai
    ollama_base_url: str = "http://ollama:11434"
    ollama_embed_model: str = "nomic-embed-text"
    # NOTE(review): .env.example ships llama3.2:3b but this default is
    # llama3.1:8b — confirm which chat model is intended.
    ollama_chat_model: str = "llama3.1:8b"
    anthropic_api_key: str = ""
    openai_api_key: str = ""
    app_host: str = "0.0.0.0"
    app_port: int = 8000
    # Scraper settings
    scrape_delay: float = 3.0  # seconds between requests (be polite to Erowid)
    scrape_batch_size: int = 50  # rows per DB commit while scraping
    # RAG settings
    chunk_size: int = 500  # tokens per chunk
    chunk_overlap: int = 50  # token overlap between chunks
    retrieval_top_k: int = 4  # number of chunks to retrieve

    class Config:
        env_file = ".env"
        extra = "ignore"  # tolerate unrelated vars in .env


# Shared settings instance imported across the app.
settings = Settings()

23
app/database.py Normal file
View File

@ -0,0 +1,23 @@
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.orm import DeclarativeBase
from app.config import settings
# Async engine and session factory shared by the whole app.
engine = create_async_engine(settings.database_url, echo=False)
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)


class Base(DeclarativeBase):
    """Declarative base class for all ORM models."""
    pass


async def get_db():
    """Yield a session scoped to one use (dependency-style helper)."""
    async with async_session() as session:
        yield session
async def init_db():
    """Create the pgvector extension (if missing) and all ORM tables.

    The extension must exist before ``create_all`` because
    document_chunks.embedding is a ``vector`` column.
    """
    # Local import replaces the original obfuscated
    # `__import__("sqlalchemy").text(...)` without touching module imports.
    from sqlalchemy import text

    async with engine.begin() as conn:
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        await conn.run_sync(Base.metadata.create_all)

199
app/embeddings.py Normal file
View File

@ -0,0 +1,199 @@
import json
import logging
from typing import AsyncGenerator
import httpx
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import async_session
from app.models import Experience, Substance, DocumentChunk
logger = logging.getLogger(__name__)
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by approximate token count.

    Uses the rough heuristic 1 token ~ 0.75 words, so a ``chunk_size`` of
    500 tokens maps to 375 words per chunk.

    Args:
        text: Text to split (whitespace-tokenized).
        chunk_size: Approximate tokens per chunk.
        overlap: Approximate tokens shared between consecutive chunks.

    Returns:
        List of chunk strings; a single-element list when the text fits
        in one chunk (including the empty string).
    """
    words = text.split()
    # Approximate: 1 token ~ 0.75 words. Guard against a zero-word chunk.
    words_per_chunk = max(1, int(chunk_size * 0.75))
    # Clamp overlap strictly below the chunk width: if overlap >= chunk,
    # `start` would never advance and the loop below would never end.
    words_overlap = max(0, min(int(overlap * 0.75), words_per_chunk - 1))
    if len(words) <= words_per_chunk:
        return [text]
    chunks = []
    start = 0
    while start < len(words):
        end = start + words_per_chunk
        chunks.append(" ".join(words[start:end]))
        if end >= len(words):
            # The tail is fully covered; stepping back by the overlap here
            # would emit a final chunk containing no new words.
            break
        start = end - words_overlap
    return chunks
async def get_embedding(text: str, client: httpx.AsyncClient | None = None) -> list[float]:
    """Return the embedding vector for *text* from the Ollama embeddings API.

    A caller-supplied client is reused as-is; otherwise a short-lived
    client is opened and closed around the single request.
    """
    url = f"{settings.ollama_base_url}/api/embeddings"
    payload = {
        "model": settings.ollama_embed_model,
        "prompt": text,
    }
    if client is not None:
        resp = await client.post(url, json=payload)
        resp.raise_for_status()
        return resp.json()["embedding"]
    async with httpx.AsyncClient(timeout=60) as owned_client:
        resp = await owned_client.post(url, json=payload)
        resp.raise_for_status()
        return resp.json()["embedding"]
async def get_embeddings_batch(texts: list[str], client: httpx.AsyncClient) -> list[list[float]]:
    """Embed each text one at a time, preserving input order."""
    return [await get_embedding(item, client) for item in texts]
async def embed_experiences(batch_size: int = 20):
    """Chunk and embed all experience reports that have no chunks yet.

    ``batch_size`` is the number of *experiences* processed between DB
    commits — embeddings themselves are requested one chunk at a time.
    Returns the number of chunks created.
    """
    async with async_session() as db:
        # Find experiences that don't have chunks yet
        subq = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "experience"
        ).distinct()
        result = await db.execute(
            select(Experience).where(Experience.id.not_in(subq))
        )
        experiences = result.scalars().all()
        logger.info(f"Found {len(experiences)} experiences to embed")
        async with httpx.AsyncClient(timeout=60) as client:
            total_chunks = 0
            for i, exp in enumerate(experiences):
                # Build a rich text representation so title/substance/demographics
                # are embedded alongside the body and become searchable
                header = f"Experience Report: {exp.title}\n"
                header += f"Substance: {exp.substance}\n"
                if exp.category:
                    header += f"Category: {exp.category}\n"
                if exp.gender:
                    header += f"Gender: {exp.gender}\n"
                if exp.age:
                    header += f"Age: {exp.age}\n"
                header += "\n"
                full_text = header + exp.body
                chunks = chunk_text(full_text, settings.chunk_size, settings.chunk_overlap)
                for idx, chunk_text_content in enumerate(chunks):
                    embedding = await get_embedding(chunk_text_content, client)
                    metadata = json.dumps({
                        "title": exp.title,
                        "substance": exp.substance,
                        "category": exp.category,
                        "erowid_id": exp.erowid_id,
                    })
                    doc_chunk = DocumentChunk(
                        source_type="experience",
                        source_id=exp.id,
                        chunk_index=idx,
                        content=chunk_text_content,
                        metadata_json=metadata,
                        embedding=embedding,
                    )
                    db.add(doc_chunk)
                    total_chunks += 1
                # Commit periodically so a crash doesn't lose all progress
                if (i + 1) % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {i + 1} experiences ({total_chunks} chunks)")
            await db.commit()
            logger.info(f"Done! Created {total_chunks} chunks from {len(experiences)} experiences")
            return total_chunks
async def embed_substances(batch_size: int = 10):
    """Chunk and embed all substance info pages that have no chunks yet.

    ``batch_size`` is the number of substances processed between DB commits.
    Returns the number of chunks created.
    """
    async with async_session() as db:
        # Substances with no chunks yet
        subq = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "substance"
        ).distinct()
        result = await db.execute(
            select(Substance).where(Substance.id.not_in(subq))
        )
        substances = result.scalars().all()
        logger.info(f"Found {len(substances)} substances to embed")
        async with httpx.AsyncClient(timeout=60) as client:
            total_chunks = 0
            for i, sub in enumerate(substances):
                # Build rich text representation: name plus each populated section
                sections = []
                sections.append(f"Substance Information: {sub.name}")
                if sub.category:
                    sections.append(f"Category: {sub.category}")
                if sub.description:
                    sections.append(f"\nOverview:\n{sub.description}")
                if sub.effects:
                    sections.append(f"\nEffects:\n{sub.effects}")
                if sub.dosage:
                    sections.append(f"\nDosage:\n{sub.dosage}")
                if sub.duration:
                    sections.append(f"\nDuration:\n{sub.duration}")
                if sub.chemistry:
                    sections.append(f"\nChemistry:\n{sub.chemistry}")
                if sub.health:
                    sections.append(f"\nHealth & Safety:\n{sub.health}")
                if sub.law:
                    sections.append(f"\nLegal Status:\n{sub.law}")
                full_text = "\n".join(sections)
                chunks = chunk_text(full_text, settings.chunk_size, settings.chunk_overlap)
                for idx, chunk_text_content in enumerate(chunks):
                    embedding = await get_embedding(chunk_text_content, client)
                    metadata = json.dumps({
                        "substance": sub.name,
                        "category": sub.category,
                    })
                    doc_chunk = DocumentChunk(
                        source_type="substance",
                        source_id=sub.id,
                        chunk_index=idx,
                        content=chunk_text_content,
                        metadata_json=metadata,
                        embedding=embedding,
                    )
                    db.add(doc_chunk)
                    total_chunks += 1
                # Commit periodically so progress survives a crash
                if (i + 1) % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {i + 1} substances ({total_chunks} chunks)")
            await db.commit()
            logger.info(f"Done! Created {total_chunks} chunks from {len(substances)} substances")
            return total_chunks
async def embed_all():
    """Run both embedding pipelines (experiences, then substances) and
    report the chunk counts per source type."""
    return {
        "experience_chunks": await embed_experiences(),
        "substance_chunks": await embed_substances(),
    }

119
app/llm.py Normal file
View File

@ -0,0 +1,119 @@
import json
import logging
from typing import AsyncGenerator
import httpx
from app.config import settings
logger = logging.getLogger(__name__)
async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream assistant tokens from the Ollama /api/chat NDJSON endpoint."""
    chat_messages = ([{"role": "system", "content": system}] if system else []) + messages
    request_body = {
        "model": settings.ollama_chat_model,
        "messages": chat_messages,
        "stream": True,
    }
    # Long read timeout: token generation can take minutes on small hardware.
    timeout = httpx.Timeout(connect=30, read=600, write=30, pool=30)
    async with httpx.AsyncClient(timeout=timeout) as client:
        async with client.stream(
            "POST",
            f"{settings.ollama_base_url}/api/chat",
            json=request_body,
        ) as resp:
            resp.raise_for_status()
            pending = b""
            async for raw in resp.aiter_bytes():
                pending += raw
                # Each complete newline-terminated line is one JSON object.
                while b"\n" in pending:
                    line, pending = pending.split(b"\n", 1)
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        parsed = json.loads(line)
                    except json.JSONDecodeError:
                        # Partial/garbled line — skip and keep streaming.
                        continue
                    message = parsed.get("message")
                    if message and "content" in message:
                        piece = message["content"]
                        if piece:
                            yield piece
                    if parsed.get("done"):
                        return
async def stream_claude(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from the Anthropic Claude API."""
    # Imported lazily so the app runs without the package when unused.
    try:
        from anthropic import AsyncAnthropic
    except ImportError:
        raise RuntimeError("anthropic package not installed")
    anthropic_client = AsyncAnthropic(api_key=settings.anthropic_api_key)
    stream_ctx = anthropic_client.messages.stream(
        model="claude-sonnet-4-5-20250929",
        max_tokens=2048,
        system=system,
        messages=messages,
    )
    async with stream_ctx as event_stream:
        async for piece in event_stream.text_stream:
            yield piece
async def stream_openai(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from the OpenAI API."""
    # Imported lazily so the app runs without the package when unused.
    try:
        from openai import AsyncOpenAI
    except ImportError:
        raise RuntimeError("openai package not installed")
    openai_client = AsyncOpenAI(api_key=settings.openai_api_key)
    chat_messages = ([{"role": "system", "content": system}] if system else []) + messages
    stream = await openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        max_tokens=2048,
        stream=True,
    )
    async for event in stream:
        delta = event.choices[0].delta.content if event.choices else None
        if delta:
            yield delta
async def stream_chat(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Dispatch to the LLM backend selected by settings.llm_provider.

    Configuration problems (unknown provider, missing API key) are
    reported as a single yielded error string rather than an exception.
    """
    provider = settings.llm_provider.lower()
    backends = {
        "ollama": stream_ollama,
        "claude": stream_claude,
        "openai": stream_openai,
    }
    if provider not in backends:
        yield f"Error: Unknown LLM_PROVIDER '{provider}'. Use ollama, claude, or openai."
        return
    if provider == "claude" and not settings.anthropic_api_key:
        yield "Error: ANTHROPIC_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
        return
    if provider == "openai" and not settings.openai_api_key:
        yield "Error: OPENAI_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
        return
    async for token in backends[provider](messages, system):
        yield token

140
app/main.py Normal file
View File

@ -0,0 +1,140 @@
import asyncio
import json
import logging
import uuid
from contextlib import asynccontextmanager
from pathlib import Path
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
from fastapi.staticfiles import StaticFiles
from app.config import settings
from app.database import init_db, async_session
from app.rag import chat_stream
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
# In-memory session store (conversation history per session).
# NOTE(review): per-process and unbounded across session ids — histories are
# lost on restart and never evicted; confirm acceptable for deployment.
sessions: dict[str, list[dict]] = {}
@asynccontextmanager
async def lifespan(app: FastAPI):
    """App lifespan hook: create the pgvector extension and tables before serving."""
    logger.info("Initializing database...")
    await init_db()
    logger.info("Database ready.")
    yield
app = FastAPI(title="Erowid Bot", lifespan=lifespan)
# Serve the chat UI's static assets from app/static
static_dir = Path(__file__).parent / "static"
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the chat UI's single-page HTML shell."""
    index_page = static_dir / "index.html"
    return index_page.read_text()
@app.get("/health")
async def health():
    """Liveness probe: always reports ok when the process is serving."""
    return dict(status="ok")
@app.post("/chat")
async def chat(request: Request):
    """Chat endpoint with streaming SSE response.

    Body: {"message": str, "session_id": str (optional)}. Streams
    `data: {"token": ...}` events followed by a terminal
    `data: {"done": true, "session_id": ...}` event.
    """
    body = await request.json()
    message = body.get("message", "").strip()
    session_id = body.get("session_id", "")
    if not message:
        return JSONResponse({"error": "Empty message"}, status_code=400)
    if not session_id:
        # First message of a conversation: mint a fresh id for the client to reuse
        session_id = str(uuid.uuid4())
    # Get or create conversation history (in-memory; see `sessions` above)
    history = sessions.get(session_id, [])

    async def generate():
        full_response = ""
        try:
            async for token in chat_stream(message, history):
                full_response += token
                yield f"data: {json.dumps({'token': token})}\n\n"
        except Exception as e:
            logger.error(f"Chat error: {e}")
            yield f"data: {json.dumps({'error': str(e)})}\n\n"
        # Save to history.
        # NOTE(review): this also runs after an error above, recording a
        # partial or empty assistant turn — confirm that's intended.
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": full_response})
        # Keep history bounded to the last 20 turns
        if len(history) > 20:
            history[:] = history[-20:]
        sessions[session_id] = history
        yield f"data: {json.dumps({'done': True, 'session_id': session_id})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",  # disable proxy buffering so tokens stream
        },
    )
@app.get("/stats")
async def stats():
    """Report row counts for experiences, substances, and embedded chunks."""
    from sqlalchemy import func, select
    from app.models import Experience, Substance, DocumentChunk
    tables = (
        ("experiences", Experience),
        ("substances", Substance),
        ("chunks", DocumentChunk),
    )
    counts = {}
    async with async_session() as db:
        for label, model in tables:
            result = await db.execute(select(func.count(model.id)))
            counts[label] = result.scalar() or 0
    return counts
@app.post("/admin/scrape/experiences")
async def trigger_scrape_experiences(request: Request):
    """Trigger experience scraping as a background task (admin endpoint).

    Accepts an optional JSON body {"limit": N} capping how many reports
    to scrape.
    """
    # startswith, not ==: clients commonly send
    # "application/json; charset=utf-8", which the exact match rejected.
    content_type = request.headers.get("content-type", "")
    body = await request.json() if content_type.startswith("application/json") else {}
    limit = body.get("limit")
    from app.scraper.experiences import scrape_all_experiences
    # NOTE(review): fire-and-forget task — no reference is kept and failures
    # are only visible in logs.
    asyncio.create_task(scrape_all_experiences(limit=limit))
    return {"status": "started", "message": "Experience scraping started in background"}
@app.post("/admin/scrape/substances")
async def trigger_scrape_substances(request: Request):
    """Trigger substance scraping as a background task (admin endpoint).

    Accepts an optional JSON body {"limit": N} capping how many vaults
    to scrape.
    """
    # startswith, not ==: clients commonly send
    # "application/json; charset=utf-8", which the exact match rejected.
    content_type = request.headers.get("content-type", "")
    body = await request.json() if content_type.startswith("application/json") else {}
    limit = body.get("limit")
    from app.scraper.substances import scrape_all_substances
    asyncio.create_task(scrape_all_substances(limit=limit))
    return {"status": "started", "message": "Substance scraping started in background"}
@app.post("/admin/embed")
async def trigger_embedding():
    """Kick off the chunk-and-embed pipeline as a background task (admin endpoint)."""
    from app.embeddings import embed_all
    asyncio.create_task(embed_all())
    return {
        "status": "started",
        "message": "Embedding pipeline started in background",
    }

56
app/models.py Normal file
View File

@ -0,0 +1,56 @@
from sqlalchemy import Column, Integer, String, Text, Boolean, Float, ForeignKey
from sqlalchemy.dialects.postgresql import ARRAY, TIMESTAMP
from pgvector.sqlalchemy import Vector
from datetime import datetime, timezone
from app.database import Base
class Substance(Base):
    """A scraped Erowid substance vault page plus its per-section texts."""
    __tablename__ = "substances"
    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(255), unique=True, nullable=False, index=True)
    url = Column(String(1024))
    category = Column(String(255))  # e.g. "Psychedelics", "Stimulants"
    description = Column(Text)  # main vault page text
    effects = Column(Text)
    dosage = Column(Text)
    duration = Column(Text)
    chemistry = Column(Text)
    health = Column(Text)
    law = Column(Text)
    raw_html = Column(Text)  # original page HTML, kept for re-parsing
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
class Experience(Base):
    """A scraped Erowid experience report."""
    __tablename__ = "experiences"
    id = Column(Integer, primary_key=True, autoincrement=True)
    erowid_id = Column(Integer, unique=True, index=True)  # Erowid's exp.php?ID= number
    title = Column(String(512))
    author = Column(String(255))
    substance = Column(String(512))  # may list multiple substances
    substance_list = Column(ARRAY(String))  # parsed list
    body = Column(Text, nullable=False)  # report text (scraper prepends a dose summary)
    category = Column(String(255))  # e.g. "General", "First Times", "Bad Trips"
    gender = Column(String(50))
    age = Column(String(50))  # stored as text, as scraped
    year = Column(Integer)
    url = Column(String(1024))
    intensity = Column(String(100))
    raw_html = Column(Text)  # original page HTML, kept for re-parsing
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
class DocumentChunk(Base):
    """An embedded text chunk derived from an Experience or Substance row."""
    __tablename__ = "document_chunks"
    id = Column(Integer, primary_key=True, autoincrement=True)
    source_type = Column(String(50), nullable=False, index=True)  # "experience" or "substance"
    source_id = Column(Integer, nullable=False, index=True)  # id in the source table (no FK constraint)
    chunk_index = Column(Integer, nullable=False)  # position of this chunk within its document
    content = Column(Text, nullable=False)
    # JSON string with extra metadata — presumably named *_json because
    # "metadata" is reserved on SQLAlchemy declarative classes.
    metadata_json = Column(Text)
    embedding = Column(Vector(768))  # nomic-embed-text dimension
    created_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))

108
app/rag.py Normal file
View File

@ -0,0 +1,108 @@
import json
import logging
from typing import AsyncGenerator
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import async_session
from app.embeddings import get_embedding
from app.llm import stream_chat
from app.models import DocumentChunk
logger = logging.getLogger(__name__)
# Base system prompt; per-request RAG context is appended in chat_stream().
SYSTEM_PROMPT = """You are the Erowid Knowledge Assistant focused on harm reduction. Provide accurate, non-judgmental substance info from the Erowid database. Prioritize safety. Never encourage drug use. Cite sources when possible. Say when info is limited."""
async def retrieve_context(query: str, top_k: int | None = None) -> list[dict]:
    """Retrieve the most relevant document chunks for a query.

    Embeds the query via Ollama, then runs a pgvector cosine-distance
    (``<=>``) nearest-neighbour search over document_chunks. Returns a
    list of dicts with chunk fields, parsed metadata, and the raw
    distance (smaller = more similar).
    """
    if top_k is None:
        top_k = settings.retrieval_top_k
    # Get query embedding
    query_embedding = await get_embedding(query)
    async with async_session() as db:
        # Use pgvector cosine distance for similarity search.
        # NOTE(review): the embedding is bound as its str() form
        # ("[0.1, 0.2, ...]") and relies on Postgres coercing the text to
        # vector — confirm this works with asyncpg's parameter typing.
        result = await db.execute(
            text("""
                SELECT id, source_type, source_id, chunk_index, content, metadata_json,
                       embedding <=> :query_embedding AS distance
                FROM document_chunks
                ORDER BY embedding <=> :query_embedding
                LIMIT :top_k
            """),
            {"query_embedding": str(query_embedding), "top_k": top_k},
        )
        chunks = []
        for row in result.fetchall():
            metadata = {}
            if row[5]:
                try:
                    metadata = json.loads(row[5])
                except json.JSONDecodeError:
                    # Malformed metadata is non-fatal; fall back to {}
                    pass
            chunks.append({
                "id": row[0],
                "source_type": row[1],
                "source_id": row[2],
                "chunk_index": row[3],
                "content": row[4],
                "metadata": metadata,
                "distance": row[6],
            })
        return chunks
def build_context_prompt(chunks: list[dict]) -> str:
"""Build a context string from retrieved chunks."""
if not chunks:
return "\n[No relevant documents found in the database.]\n"
context_parts = []
for i, chunk in enumerate(chunks, 1):
source_label = chunk["source_type"].title()
metadata = chunk["metadata"]
header = f"--- Source {i} ({source_label})"
if "title" in metadata:
header += f" | {metadata['title']}"
if "substance" in metadata:
header += f" | Substance: {metadata['substance']}"
header += " ---"
# Limit each chunk to avoid overwhelming the LLM
content = chunk["content"][:800]
context_parts.append(f"{header}\n{content}")
return "\n\n".join(context_parts)
async def chat_stream(
    user_message: str,
    conversation_history: list[dict] | None = None,
) -> AsyncGenerator[str, None]:
    """Full RAG pipeline: retrieve context, augment the system prompt,
    then stream the LLM's answer token by token."""
    retrieved = await retrieve_context(user_message)
    context_block = build_context_prompt(retrieved)
    full_system = (
        f"{SYSTEM_PROMPT}\n\n--- RELEVANT EROWID DATA ---\n"
        f"{context_block}\n--- END EROWID DATA ---"
    )
    # Only the 10 most recent turns are forwarded, bounding prompt size.
    # Slicing copies, so the caller's history list is never mutated here.
    messages = conversation_history[-10:] if conversation_history else []
    messages.append({"role": "user", "content": user_message})
    async for token in stream_chat(messages, system=full_system):
        yield token

0
app/scraper/__init__.py Normal file
View File

244
app/scraper/experiences.py Normal file
View File

@ -0,0 +1,244 @@
import asyncio
import re
import logging
from datetime import datetime, timezone
import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import async_session
from app.models import Experience
logger = logging.getLogger(__name__)
BASE_URL = "https://erowid.org"
# Master index listing every substance's experience pages
EXP_LIST_URL = "https://erowid.org/experiences/exp_list.shtml"
REPORT_URL = "https://erowid.org/experiences/exp.php?ID={id}"
# Identify the bot honestly to Erowid's servers
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}
async def get_all_substance_pages(client: httpx.AsyncClient) -> list[dict]:
    """Get main substance experience listing pages from the master index.

    Only fetches top-level substance pages (e.g. exp_LSD.shtml), not
    category sub-pages (e.g. exp_LSD_General.shtml) since the main page
    already contains all report IDs for that substance.

    Returns a list of {"name": ..., "url": ...} dicts.
    """
    resp = await client.get(EXP_LIST_URL, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    pages = []
    seen_substances = set()  # full URLs already collected (dedupe)
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        name = a.get_text(strip=True)
        if not href.startswith("subs/exp_") or not href.endswith(".shtml") or not name:
            continue
        full_url = f"https://erowid.org/experiences/{href}"
        # Extract the base substance name from the URL
        # e.g. subs/exp_LSD.shtml -> LSD, subs/exp_LSD_General.shtml -> LSD
        filename = href.replace("subs/exp_", "").replace(".shtml", "")
        # Skip category sub-pages — they contain subsets of the main page.
        # Category sub-pages have suffixes like _General, _First_Times, _Bad_Trips, etc.
        known_categories = [
            "_General", "_First_Times", "_Combinations", "_Retrospective",
            "_Preparation", "_Difficult_Experiences", "_Bad_Trips",
            "_Health_Problems", "_Train_Wrecks", "_Glowing_Experiences",
            "_Mystical_Experiences", "_Health_Benefits", "_What_Was_in_That",
            "_Medical_Use", "_Performance_Enhancement", "_Addiction",
        ]
        is_category = any(filename.endswith(cat) for cat in known_categories)
        if is_category:
            continue
        if full_url not in seen_substances:
            seen_substances.add(full_url)
            pages.append({"name": name, "url": full_url})
    logger.info(f"Found {len(pages)} main substance experience pages (filtered from category sub-pages)")
    return pages
async def get_experience_ids_from_page(client: httpx.AsyncClient, url: str) -> list[int]:
    """Extract all experience report IDs from a substance listing page.

    Returns an empty list on any HTTP failure rather than raising.
    """
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch {url}: {e}")
        return []
    # Report links look like exp.php?ID=12345; pages repeat them, so dedupe.
    unique_ids = {int(match) for match in re.findall(r"exp\.php\?ID=(\d+)", resp.text)}
    return list(unique_ids)
async def get_all_experience_ids(client: httpx.AsyncClient) -> list[int]:
    """Collect all unique experience IDs from all substance pages.

    Fetches pages concurrently in batches of 5 for speed.
    Returns the IDs sorted ascending.
    """
    pages = await get_all_substance_pages(client)
    all_ids = set()
    batch_size = 5
    for i in range(0, len(pages), batch_size):
        batch = pages[i : i + batch_size]
        tasks = [get_experience_ids_from_page(client, p["url"]) for p in batch]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            # With return_exceptions=True, failures arrive as exception
            # objects — only successful (list) results are merged
            if isinstance(result, list):
                all_ids.update(result)
        logger.info(f"Scanned {min(i + batch_size, len(pages))}/{len(pages)} pages, {len(all_ids)} unique IDs")
        await asyncio.sleep(0.5)  # brief pause between batches to be polite
    logger.info(f"Found {len(all_ids)} unique experience IDs total")
    return sorted(all_ids)
async def scrape_experience_report(client: httpx.AsyncClient, erowid_id: int) -> dict | None:
    """Scrape a single experience report.

    Returns a dict of Experience column values, or None when the page
    can't be fetched, has no report body, or the body is trivially short.
    """
    url = REPORT_URL.format(id=erowid_id)
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch experience {erowid_id}: {e}")
        return None
    soup = BeautifulSoup(resp.text, "lxml")
    # Extract the main report body
    body_div = soup.select_one("div.report-text-surround")
    if not body_div:
        logger.warning(f"No report body found for {erowid_id}")
        return None
    # Remove tables from body text to avoid duplicating the dosechart,
    # which is re-added in summarized form below
    body_text_parts = []
    for el in body_div.children:
        if hasattr(el, "name") and el.name == "table":
            continue  # skip dosechart table
        text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip()
        if text:
            body_text_parts.append(text)
    body = "\n\n".join(body_text_parts)
    # Discard empty or trivially short reports
    if not body or len(body) < 50:
        return None
    # Extract metadata
    title = ""
    title_el = soup.select_one("div.title")
    if title_el:
        title = title_el.get_text(strip=True)
    substance = ""
    sub_el = soup.select_one("div.substance")
    if sub_el:
        substance = sub_el.get_text(strip=True)
    # Split "LSD & Cannabis" / "MDMA, Alcohol" style strings into a list
    substance_list = [s.strip() for s in re.split(r"[,&]", substance) if s.strip()]
    author = ""
    author_el = soup.select_one("div.author")
    if author_el:
        author = author_el.get_text(strip=True).replace("by ", "")
    # Dosage info: flatten the dosechart rows into "cell cell; cell cell" text
    dose_table = soup.select_one("table.dosechart")
    dose_text = ""
    if dose_table:
        rows = dose_table.select("tr")
        dose_parts = []
        for row in rows:
            cells = row.select("td")
            row_text = " ".join(c.get_text(strip=True) for c in cells if c.get_text(strip=True))
            if row_text:
                dose_parts.append(row_text)
        dose_text = "; ".join(dose_parts)
    if dose_text:
        # Prepend the dose summary so it survives chunking with the body
        body = f"Dosage: {dose_text}\n\n{body}"
    # Gender/age from the bodyweight table inside report-text-surround
    gender = ""
    age = ""
    weight_el = soup.select_one("table.bodyweight")
    if weight_el:
        wt = weight_el.get_text(strip=True)
        age_match = re.search(r"(\d+)\s*yr", wt, re.IGNORECASE)
        if age_match:
            age = age_match.group(1)
        # BUG FIX: check "female" first — "female" contains "male", so the
        # original `if "male" in ...` branch misclassified female reports.
        wt_lower = wt.lower()
        if "female" in wt_lower:
            gender = "Female"
        elif "male" in wt_lower:
            gender = "Male"
    # Try to extract category from the page footer
    category = ""
    cat_el = soup.select_one("div.foot-eroid-cat")
    if cat_el:
        category = cat_el.get_text(strip=True)
    return {
        "erowid_id": erowid_id,
        "title": title,
        "author": author,
        "substance": substance,
        "substance_list": substance_list,
        "body": body,
        "category": category,
        "gender": gender,
        "age": age,
        "url": url,
        "raw_html": resp.text,
    }
async def scrape_all_experiences(limit: int | None = None):
    """Main scraper entry point. Scrapes all experience reports into the database.

    Skips reports already present (matched on erowid_id), commits every
    settings.scrape_batch_size successful scrapes, and sleeps
    settings.scrape_delay seconds after every request. Returns the number
    of newly scraped reports.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        all_ids = await get_all_experience_ids(client)
        if limit:
            all_ids = all_ids[:limit]
        async with async_session() as db:
            result = await db.execute(select(Experience.erowid_id))
            existing_ids = {row[0] for row in result.fetchall()}
            logger.info(f"Already have {len(existing_ids)} experiences in DB")
            to_scrape = [eid for eid in all_ids if eid not in existing_ids]
            logger.info(f"Need to scrape {len(to_scrape)} new experiences")
            scraped = 0
            errors = 0
            for eid in to_scrape:
                data = await scrape_experience_report(client, eid)
                if data:
                    exp = Experience(**data)
                    db.add(exp)
                    scraped += 1
                    # Periodic commit so a crash doesn't lose a long run
                    if scraped % settings.scrape_batch_size == 0:
                        await db.commit()
                        logger.info(f"Committed batch: {scraped}/{len(to_scrape)} scraped ({errors} errors)")
                else:
                    errors += 1
                # Politeness delay between every request, success or failure
                await asyncio.sleep(settings.scrape_delay)
            await db.commit()
            logger.info(f"Done! Scraped {scraped} new experiences ({errors} errors)")
            return scraped

171
app/scraper/substances.py Normal file
View File

@ -0,0 +1,171 @@
import asyncio
import re
import logging
from datetime import datetime, timezone
import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import async_session
from app.models import Substance
logger = logging.getLogger(__name__)
BASE_URL = "https://erowid.org"
VAULT_INDEX = "https://erowid.org/chemicals/"
# Identify the bot honestly to Erowid's servers
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}
# Known substance categories on Erowid.
# NOTE(review): not referenced anywhere in this module — confirm intended use.
CATEGORIES = [
    "Psychedelics", "Empathogens", "Stimulants", "Depressants",
    "Dissociatives", "Cannabis", "Opioids", "Nootropics",
    "Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
]
async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Get all substance vault URLs from the chemicals and plants indexes.

    Returns dicts of {"name", "url"} (plants entries also carry
    "category": "Plants & Herbs").
    """
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    substances = []
    # The chemicals index lists substances with links to their vaults.
    for link in soup.select("a[href]"):
        href = link.get("href", "")
        text = link.get_text(strip=True)
        # Filter for substance vault links (e.g., /chemicals/lsd/, /chemicals/psilocybin/).
        # NOTE(review): the bare `^[\w_-]+/?$` alternative matches any relative
        # single-segment link, so navigation links may slip through — verify
        # against the live index.
        if re.match(r"^/chemicals/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
            if text and len(text) > 1 and not text.startswith("["):
                full_url = href if href.startswith("http") else f"{BASE_URL}{href}" if href.startswith("/") else f"{VAULT_INDEX}{href}"
                substances.append({
                    "name": text,
                    "url": full_url,
                })
    # Also scrape the plants/herbs section
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
        soup2 = BeautifulSoup(resp2.text, "lxml")
        for link in soup2.select("a[href]"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            if re.match(r"^/plants/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
                if text and len(text) > 1 and not text.startswith("["):
                    full_url = href if href.startswith("http") else f"{BASE_URL}{href}" if href.startswith("/") else f"{plants_url}{href}"
                    substances.append({
                        "name": text,
                        "url": full_url,
                        "category": "Plants & Herbs",
                    })
    except httpx.HTTPError:
        # Plants index is best-effort; chemicals alone is still useful
        logger.warning("Failed to fetch plants index")
    logger.info(f"Found {len(substances)} substance URLs")
    return substances
async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault page for key information.

    Fetches the vault's main page, then probes well-known sub-page paths
    (effects, dose, duration, chemistry, health, law), keeping the first
    one that responds per section. Returns a dict of Substance column
    values (minus category), or None when the main page can't be fetched.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch substance {name}: {e}")
        return None
    soup = BeautifulSoup(resp.text, "lxml")
    raw_html = resp.text
    # Extract text content from the vault main page (capped at 5000 chars)
    description = ""
    main_content = soup.select_one("div.sum-content") or soup.select_one("td.content") or soup.select_one("body")
    if main_content:
        description = main_content.get_text(separator="\n", strip=True)[:5000]
    # Try to find sub-pages: effects, dose, duration, health, law.
    # Candidate paths are tried in order; the first 200 with parseable
    # content wins for that section.
    sections = {}
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }
    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
                if sub_resp.status_code == 200:
                    sub_soup = BeautifulSoup(sub_resp.text, "lxml")
                    content_el = sub_soup.select_one("div.sum-content") or sub_soup.select_one("td.content")
                    if content_el:
                        sections[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
                        break
                # Politeness delay between sub-page probes
                await asyncio.sleep(0.5)
            except httpx.HTTPError:
                continue
    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }
async def scrape_all_substances(limit: int | None = None):
    """Scrape every substance vault not yet present in the database.

    Args:
        limit: optional cap on how many entries from the substance index to
            consider (applied before filtering out already-scraped names).

    Returns:
        The number of newly scraped substances committed to the database.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        index = await get_substance_urls(client)
        if limit:
            index = index[:limit]
        async with async_session() as db:
            # Build a lowercase name set of what we already hold so reruns
            # only fetch substances we have never seen.
            rows = await db.execute(select(Substance.name))
            known = {row[0].lower() for row in rows.fetchall()}
            logger.info(f"Already have {len(known)} substances in DB")
            pending = [entry for entry in index if entry["name"].lower() not in known]
            logger.info(f"Need to scrape {len(pending)} new substances")
            count = 0
            for entry in pending:
                record = await scrape_substance_vault(client, entry["name"], entry["url"])
                if record:
                    record["category"] = entry.get("category", "")
                    db.add(Substance(**record))
                    count += 1
                    # Commit in batches of 10 so progress survives a crash.
                    if count % 10 == 0:
                        await db.commit()
                        logger.info(f"Committed: {count} substances scraped")
                # Throttle between vaults regardless of scrape outcome.
                await asyncio.sleep(settings.scrape_delay)
            await db.commit()
            logger.info(f"Done! Scraped {count} new substances")
            return count

157
app/static/app.js Normal file
View File

@ -0,0 +1,157 @@
// Cached DOM references for the chat UI.
const chatContainer = document.getElementById("chat-container");
const messageInput = document.getElementById("message-input");
const sendBtn = document.getElementById("send-btn");
const welcomeEl = document.getElementById("welcome");
const statsEl = document.getElementById("stats");
// Session id persisted across page loads so the server can keep chat history.
let sessionId = localStorage.getItem("erowid_session") || "";
// Guards against submitting a new message while a response is still streaming.
let isStreaming = false;
// Fetch corpus counts from the backend and show them in the header badge;
// fall back to a "connecting..." placeholder on any failure.
async function loadStats() {
  try {
    const resp = await fetch("/stats");
    const data = await resp.json();
    statsEl.textContent = `${data.experiences} reports | ${data.substances} substances | ${data.chunks} chunks`;
  } catch {
    statsEl.textContent = "connecting...";
  }
}
loadStats();
// Grow the textarea with its content, capped at 120px tall.
messageInput.addEventListener("input", function () {
  messageInput.style.height = "auto";
  messageInput.style.height = `${Math.min(messageInput.scrollHeight, 120)}px`;
});
// Enter submits; Shift+Enter inserts a newline.
messageInput.addEventListener("keydown", function (e) {
  if (e.key !== "Enter" || e.shiftKey) return;
  e.preventDefault();
  sendMessage();
});
sendBtn.addEventListener("click", sendMessage);
// Clicking a suggestion chip sends its label as the message.
for (const el of document.querySelectorAll(".suggestion")) {
  el.addEventListener("click", function () {
    messageInput.value = el.textContent;
    sendMessage();
  });
}
// Append a chat bubble for `role` ("user" | "assistant") holding `content`.
// Hides the welcome panel on first use and returns the content element so
// streaming code can keep writing text into it.
function addMessage(role, content) {
  if (welcomeEl) welcomeEl.style.display = "none";
  const bubble = document.createElement("div");
  bubble.className = `message ${role}`;
  const avatar = document.createElement("div");
  avatar.className = "message-avatar";
  avatar.textContent = role === "user" ? "You" : "E";
  const body = document.createElement("div");
  body.className = "message-content";
  body.textContent = content;
  bubble.appendChild(avatar);
  bubble.appendChild(body);
  chatContainer.appendChild(bubble);
  chatContainer.scrollTop = chatContainer.scrollHeight;
  return body;
}
// Show an animated three-dot typing indicator as a temporary assistant
// bubble (removed later via its "typing-indicator" id).
function addTypingIndicator() {
  const bubble = document.createElement("div");
  bubble.className = "message assistant";
  bubble.id = "typing-indicator";
  const avatar = document.createElement("div");
  avatar.className = "message-avatar";
  avatar.textContent = "E";
  const body = document.createElement("div");
  body.className = "message-content";
  body.innerHTML = '<div class="typing"><span></span><span></span><span></span></div>';
  bubble.appendChild(avatar);
  bubble.appendChild(body);
  chatContainer.appendChild(bubble);
  chatContainer.scrollTop = chatContainer.scrollHeight;
}
// Remove the typing indicator bubble, if one is currently showing.
function removeTypingIndicator() {
  const indicator = document.getElementById("typing-indicator");
  if (indicator !== null) {
    indicator.remove();
  }
}
// Send the textarea contents to POST /chat and stream the SSE reply into a
// new assistant bubble. Re-entrant calls are blocked while a stream is
// active; input state is always restored via `finally`.
async function sendMessage() {
  const text = messageInput.value.trim();
  if (!text || isStreaming) return;
  isStreaming = true;
  sendBtn.disabled = true;
  messageInput.value = "";
  messageInput.style.height = "auto";
  addMessage("user", text);
  addTypingIndicator();
  try {
    const resp = await fetch("/chat", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ message: text, session_id: sessionId }),
    });
    removeTypingIndicator();
    // Fail fast on HTTP errors instead of trying to parse an error page
    // as an SSE stream (the original streamed any response body).
    if (!resp.ok || !resp.body) {
      addMessage("assistant", `Error: server returned ${resp.status}`);
      return;
    }
    const contentEl = addMessage("assistant", "");
    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    let fullResponse = "";

    // Parse one "data: {...}" SSE line and apply it to the UI.
    const handleLine = (line) => {
      if (!line.startsWith("data: ")) return;
      const jsonStr = line.slice(6).trim();
      if (!jsonStr) return;
      try {
        const data = JSON.parse(jsonStr);
        if (data.token) {
          fullResponse += data.token;
          contentEl.textContent = fullResponse;
          chatContainer.scrollTop = chatContainer.scrollHeight;
        }
        if (data.session_id) {
          sessionId = data.session_id;
          localStorage.setItem("erowid_session", sessionId);
        }
        if (data.error) {
          contentEl.textContent = `Error: ${data.error}`;
        }
      } catch {
        // Ignore malformed or partial JSON lines.
      }
    };

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      buffer = lines.pop() || "";
      for (const line of lines) handleLine(line);
    }
    // Flush the decoder and process any final line left in the buffer when
    // the stream ends without a trailing newline (the original dropped it).
    buffer += decoder.decode();
    if (buffer) handleLine(buffer);
  } catch (err) {
    removeTypingIndicator();
    addMessage("assistant", `Connection error: ${err.message}`);
  } finally {
    // Always unlock the input, even if an unexpected error is thrown.
    isStreaming = false;
    sendBtn.disabled = false;
    messageInput.focus();
  }
}

59
app/static/index.html Normal file
View File

@ -0,0 +1,59 @@
<!DOCTYPE html>
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Erowid Bot</title>
  <link rel="stylesheet" href="/static/style.css">
  <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
</head>
<body>
  <!-- Header: branding on the left, live corpus counts (filled by app.js loadStats) on the right -->
  <div class="header">
    <div class="header-left">
      <div class="logo">Erowid Bot</div>
      <div class="tagline">Harm Reduction Knowledge Assistant</div>
    </div>
    <div class="stats-badge" id="stats">loading...</div>
  </div>
  <!-- Chat transcript area; the welcome panel is hidden after the first message -->
  <div class="chat-container" id="chat-container">
    <div class="welcome" id="welcome">
      <h2>Explore the Erowid Database</h2>
      <p>
        Ask questions about substances, experience reports, dosage information,
        effects, safety, and more. All information is sourced from the Erowid vault
        and experience reports. This bot prioritizes harm reduction and safety.
      </p>
      <!-- Clickable suggestion chips; app.js sends the chip text as a message -->
      <div class="suggestions">
        <button class="suggestion">What are the effects of psilocybin?</button>
        <button class="suggestion">Tell me about safe MDMA dosing</button>
        <button class="suggestion">What do people report about DMT experiences?</button>
        <button class="suggestion">What are dangerous drug combinations?</button>
        <button class="suggestion">Compare LSD and psilocybin experiences</button>
      </div>
    </div>
  </div>
  <!-- Composer: auto-resizing textarea plus send button (paper-plane icon) -->
  <div class="input-container">
    <div class="input-wrapper">
      <textarea
        id="message-input"
        placeholder="Ask about substances, experiences, safety..."
        rows="1"
      ></textarea>
      <button id="send-btn">
        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
          <line x1="22" y1="2" x2="11" y2="13"></line>
          <polygon points="22 2 15 22 11 13 2 9 22 2"></polygon>
        </svg>
      </button>
    </div>
    <div class="disclaimer">
      Information sourced from Erowid.org. Not medical advice. Always practice harm reduction.
    </div>
  </div>
  <script src="/static/app.js"></script>
</body>
</html>

326
app/static/style.css Normal file
View File

@ -0,0 +1,326 @@
/* Global reset: border-box sizing everywhere */
* {
  margin: 0;
  padding: 0;
  box-sizing: border-box;
}
/* Dark theme palette and shared tokens */
:root {
  --bg-primary: #0a0a0f;
  --bg-secondary: #12121a;
  --bg-tertiary: #1a1a2e;
  --text-primary: #e0e0e8;
  --text-secondary: #8888a0;
  --accent: #6c5ce7;
  --accent-glow: rgba(108, 92, 231, 0.3);
  --user-bg: #2d2d44;
  --bot-bg: #1a1a2e;
  --border: #2a2a3e;
  --danger: #e74c3c;
  --success: #2ecc71;
}
/* Full-viewport column layout: header / scrollable chat / input bar */
body {
  font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
  background: var(--bg-primary);
  color: var(--text-primary);
  height: 100vh;
  display: flex;
  flex-direction: column;
  overflow: hidden;
}
/* Header */
.header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 12px 20px;
  background: var(--bg-secondary);
  border-bottom: 1px solid var(--border);
  flex-shrink: 0;
}
.header-left {
  display: flex;
  align-items: center;
  gap: 12px;
}
/* Gradient text logo via background-clip */
.logo {
  font-size: 24px;
  font-weight: 700;
  background: linear-gradient(135deg, var(--accent), #a29bfe);
  -webkit-background-clip: text;
  -webkit-text-fill-color: transparent;
  background-clip: text;
}
.tagline {
  font-size: 12px;
  color: var(--text-secondary);
  letter-spacing: 0.5px;
}
/* Pill badge showing corpus counts (filled by app.js) */
.stats-badge {
  font-size: 11px;
  color: var(--text-secondary);
  background: var(--bg-tertiary);
  padding: 4px 10px;
  border-radius: 12px;
  border: 1px solid var(--border);
}
/* Chat area */
.chat-container {
  flex: 1;
  overflow-y: auto;
  padding: 20px;
  scroll-behavior: smooth;
}
/* Slim custom scrollbar (WebKit only) */
.chat-container::-webkit-scrollbar {
  width: 6px;
}
.chat-container::-webkit-scrollbar-track {
  background: transparent;
}
.chat-container::-webkit-scrollbar-thumb {
  background: var(--border);
  border-radius: 3px;
}
/* Welcome panel shown before the first message */
.welcome {
  text-align: center;
  padding: 60px 20px;
  max-width: 600px;
  margin: 0 auto;
}
.welcome h2 {
  font-size: 22px;
  margin-bottom: 12px;
  color: var(--text-primary);
}
.welcome p {
  color: var(--text-secondary);
  line-height: 1.6;
  margin-bottom: 20px;
  font-size: 14px;
}
.suggestions {
  display: flex;
  flex-wrap: wrap;
  gap: 8px;
  justify-content: center;
}
.suggestion {
  background: var(--bg-tertiary);
  border: 1px solid var(--border);
  color: var(--text-secondary);
  padding: 8px 14px;
  border-radius: 20px;
  font-size: 13px;
  cursor: pointer;
  transition: all 0.2s;
}
.suggestion:hover {
  border-color: var(--accent);
  color: var(--text-primary);
  background: rgba(108, 92, 231, 0.1);
}
/* Messages */
.message {
  display: flex;
  gap: 12px;
  margin-bottom: 16px;
  max-width: 800px;
  margin-left: auto;
  margin-right: auto;
  animation: fadeIn 0.3s ease;
}
@keyframes fadeIn {
  from { opacity: 0; transform: translateY(8px); }
  to { opacity: 1; transform: translateY(0); }
}
/* User messages are right-aligned by reversing the row */
.message.user {
  flex-direction: row-reverse;
}
.message-avatar {
  width: 32px;
  height: 32px;
  border-radius: 8px;
  display: flex;
  align-items: center;
  justify-content: center;
  font-size: 14px;
  flex-shrink: 0;
}
.message.user .message-avatar {
  background: var(--user-bg);
}
.message.assistant .message-avatar {
  background: var(--accent);
}
/* pre-wrap preserves newlines from streamed LLM output */
.message-content {
  padding: 10px 16px;
  border-radius: 12px;
  max-width: 75%;
  line-height: 1.6;
  font-size: 14px;
  white-space: pre-wrap;
  word-wrap: break-word;
}
.message.user .message-content {
  background: var(--user-bg);
  border-bottom-right-radius: 4px;
}
.message.assistant .message-content {
  background: var(--bot-bg);
  border: 1px solid var(--border);
  border-bottom-left-radius: 4px;
}
.message-content p {
  margin-bottom: 8px;
}
.message-content p:last-child {
  margin-bottom: 0;
}
.message-content strong {
  color: #a29bfe;
}
.message-content code {
  background: rgba(108, 92, 231, 0.15);
  padding: 1px 5px;
  border-radius: 3px;
  font-size: 13px;
}
/* Typing indicator */
.typing {
  display: flex;
  gap: 4px;
  padding: 4px 0;
}
/* Three staggered bouncing dots */
.typing span {
  width: 6px;
  height: 6px;
  background: var(--text-secondary);
  border-radius: 50%;
  animation: bounce 1.4s infinite;
}
.typing span:nth-child(2) { animation-delay: 0.2s; }
.typing span:nth-child(3) { animation-delay: 0.4s; }
@keyframes bounce {
  0%, 60%, 100% { transform: translateY(0); }
  30% { transform: translateY(-6px); }
}
/* Input area */
.input-container {
  padding: 16px 20px;
  background: var(--bg-secondary);
  border-top: 1px solid var(--border);
  flex-shrink: 0;
}
.input-wrapper {
  display: flex;
  gap: 10px;
  max-width: 800px;
  margin: 0 auto;
}
/* Auto-resizing textarea; JS caps height at max-height */
#message-input {
  flex: 1;
  background: var(--bg-tertiary);
  border: 1px solid var(--border);
  color: var(--text-primary);
  padding: 12px 16px;
  border-radius: 12px;
  font-size: 14px;
  font-family: inherit;
  resize: none;
  outline: none;
  min-height: 44px;
  max-height: 120px;
  transition: border-color 0.2s;
}
#message-input:focus {
  border-color: var(--accent);
  box-shadow: 0 0 0 2px var(--accent-glow);
}
#message-input::placeholder {
  color: var(--text-secondary);
}
#send-btn {
  background: var(--accent);
  color: white;
  border: none;
  width: 44px;
  height: 44px;
  border-radius: 12px;
  cursor: pointer;
  display: flex;
  align-items: center;
  justify-content: center;
  transition: all 0.2s;
  flex-shrink: 0;
}
#send-btn:hover {
  background: #5b4bd5;
  transform: scale(1.05);
}
/* Disabled while a response is streaming (set by app.js) */
#send-btn:disabled {
  opacity: 0.5;
  cursor: not-allowed;
  transform: none;
}
#send-btn svg {
  width: 18px;
  height: 18px;
}
.disclaimer {
  text-align: center;
  font-size: 11px;
  color: var(--text-secondary);
  margin-top: 8px;
  max-width: 800px;
  margin-left: auto;
  margin-right: auto;
}
/* Mobile */
@media (max-width: 640px) {
  .header { padding: 10px 14px; }
  .chat-container { padding: 12px; }
  .message-content { max-width: 85%; font-size: 13px; }
  .input-container { padding: 10px 14px; }
  .suggestions { gap: 6px; }
  .suggestion { font-size: 12px; padding: 6px 12px; }
}

50
docker-compose.yml Normal file
View File

@ -0,0 +1,50 @@
services:
  # FastAPI app: built from the local Dockerfile, exposed directly on 8421
  # and via Traefik at erowid.jeffemmett.com.
  erowid-bot:
    build: .
    container_name: erowid-bot
    restart: unless-stopped
    env_file: .env
    ports:
      - "8421:8000"
    depends_on:
      erowid-db:
        condition: service_healthy  # wait for pg_isready before starting
    networks:
      - default
      - traefik-public
      - ai-internal
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.erowid-bot.rule=Host(`erowid.jeffemmett.com`)"
      - "traefik.http.routers.erowid-bot.entrypoints=websecure"
      - "traefik.http.routers.erowid-bot.tls.certresolver=letsencrypt"
      - "traefik.http.services.erowid-bot.loadbalancer.server.port=8000"
    volumes:
      # Bind-mount app source for live code reload during development.
      - ./app:/app/app
  # Postgres 16 with the pgvector extension for embedding similarity search.
  erowid-db:
    image: pgvector/pgvector:pg16
    container_name: erowid-db
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-erowid}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-erowid}
      POSTGRES_DB: ${POSTGRES_DB:-erowid}
    volumes:
      - erowid-db-data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U erowid"]
      interval: 5s
      timeout: 5s
      retries: 5
    networks:
      - default
volumes:
  erowid-db-data:
networks:
  # Externally managed networks shared with the reverse proxy and AI services.
  traefik-public:
    external: true
  ai-internal:
    external: true

16
requirements.txt Normal file
View File

@ -0,0 +1,16 @@
fastapi==0.115.6
uvicorn[standard]==0.34.0
sqlalchemy==2.0.36
asyncpg==0.30.0
pgvector==0.3.6
psycopg2-binary==2.9.10
httpx==0.28.1
python-dotenv==1.0.1
beautifulsoup4==4.12.3
lxml==5.3.0
pydantic==2.10.3
pydantic-settings==2.7.0
sse-starlette==2.2.1
tiktoken==0.8.0
anthropic==0.40.0
openai==1.58.1