From d09d065d081e5df9ffaa829cfbc17834bad11159 Mon Sep 17 00:00:00 2001 From: Jeff Emmett Date: Tue, 17 Feb 2026 01:19:49 +0000 Subject: [PATCH] Initial commit: Erowid conversational bot RAG-powered chatbot that indexes Erowid's experience reports and substance info, making them searchable via natural conversation. Built with FastAPI, PostgreSQL+pgvector, Ollama embeddings, and streaming LLM responses. Co-Authored-By: Claude Opus 4.6 --- .env.example | 24 +++ .gitignore | 10 ++ Dockerfile | 16 ++ app/__init__.py | 0 app/config.py | 34 ++++ app/database.py | 23 +++ app/embeddings.py | 199 ++++++++++++++++++++++ app/llm.py | 119 ++++++++++++++ app/main.py | 140 ++++++++++++++++ app/models.py | 56 +++++++ app/rag.py | 108 ++++++++++++ app/scraper/__init__.py | 0 app/scraper/experiences.py | 244 +++++++++++++++++++++++++++ app/scraper/substances.py | 171 +++++++++++++++++++ app/static/app.js | 157 ++++++++++++++++++ app/static/index.html | 59 +++++++ app/static/style.css | 326 +++++++++++++++++++++++++++++++++++++ docker-compose.yml | 50 ++++++ requirements.txt | 16 ++ 19 files changed, 1752 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 app/__init__.py create mode 100644 app/config.py create mode 100644 app/database.py create mode 100644 app/embeddings.py create mode 100644 app/llm.py create mode 100644 app/main.py create mode 100644 app/models.py create mode 100644 app/rag.py create mode 100644 app/scraper/__init__.py create mode 100644 app/scraper/experiences.py create mode 100644 app/scraper/substances.py create mode 100644 app/static/app.js create mode 100644 app/static/index.html create mode 100644 app/static/style.css create mode 100644 docker-compose.yml create mode 100644 requirements.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c6dd195 --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# Database 
+DATABASE_URL=postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid +DATABASE_URL_SYNC=postgresql://erowid:erowid@erowid-db:5432/erowid +POSTGRES_USER=erowid +POSTGRES_PASSWORD=erowid +POSTGRES_DB=erowid + +# LLM Provider: ollama | claude | openai +LLM_PROVIDER=ollama + +# Ollama (local, free) +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_EMBED_MODEL=nomic-embed-text +OLLAMA_CHAT_MODEL=llama3.2:3b + +# Claude API (optional) +ANTHROPIC_API_KEY= + +# OpenAI API (optional) +OPENAI_API_KEY= + +# App +APP_HOST=0.0.0.0 +APP_PORT=8000 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..20f4e8a --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +__pycache__/ +*.pyc +*.pyo +.env +*.egg-info/ +dist/ +build/ +.venv/ +venv/ +PLAN.md diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5f5edda --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . 
RUN pip install --no-cache-dir -r requirements.txt

COPY app/ ./app/

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

# --- app/config.py ---
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env."""

    # Async DSN for the app, sync DSN for scripts/migrations.
    database_url: str = "postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid"
    database_url_sync: str = "postgresql://erowid:erowid@erowid-db:5432/erowid"

    llm_provider: str = "ollama"  # ollama | claude | openai

    ollama_base_url: str = "http://ollama:11434"
    ollama_embed_model: str = "nomic-embed-text"
    # NOTE(review): .env.example ships OLLAMA_CHAT_MODEL=llama3.2:3b but this
    # fallback is llama3.1:8b — confirm which model is intended when unset.
    ollama_chat_model: str = "llama3.1:8b"

    anthropic_api_key: str = ""
    openai_api_key: str = ""

    app_host: str = "0.0.0.0"
    app_port: int = 8000

    # Scraper settings
    scrape_delay: float = 3.0  # seconds between requests (be polite to Erowid)
    scrape_batch_size: int = 50

    # RAG settings
    chunk_size: int = 500  # tokens per chunk
    chunk_overlap: int = 50  # token overlap between chunks
    retrieval_top_k: int = 4  # number of chunks to retrieve

    class Config:
        env_file = ".env"
        extra = "ignore"


settings = Settings()

# --- app/database.py ---
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.orm import DeclarativeBase
from app.config import settings

engine = create_async_engine(settings.database_url, echo=False)
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)


class Base(DeclarativeBase):
    """Declarative base shared by all ORM models."""
    pass


async def get_db():
    """FastAPI dependency: yield an AsyncSession scoped to one request."""
    async with async_session() as session:
        yield session


async def init_db():
    """Create the pgvector extension and all ORM tables at startup."""
    async with engine.begin() as conn:
        # __import__ avoids a top-level sqlalchemy import for this single
        # raw-SQL statement.
        await conn.execute(
            __import__("sqlalchemy").text("CREATE EXTENSION IF NOT EXISTS vector")
        )
        await conn.run_sync(Base.metadata.create_all)
# --- app/embeddings.py ---
import json
import logging

logger = logging.getLogger(__name__)


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping chunks by approximate token count.

    Uses the heuristic 1 token ~= 0.75 words. Degenerate parameters are
    guarded: a chunk size that rounds to zero words, or an overlap >= the
    chunk size, previously made ``start = end - words_overlap`` fail to
    advance and the loop spun forever.
    """
    words = text.split()
    # Approximate: 1 token ~ 0.75 words; clamp so chunks are never empty.
    words_per_chunk = max(1, int(chunk_size * 0.75))
    words_overlap = int(overlap * 0.75)

    if len(words) <= words_per_chunk:
        return [text]

    # The window must advance at least one word per iteration.
    step = max(1, words_per_chunk - words_overlap)

    chunks: list[str] = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + words_per_chunk]))
        start += step
    return chunks


async def get_embedding(text: str, client: "httpx.AsyncClient | None" = None) -> list[float]:
    """Get an embedding vector for *text* from Ollama.

    A shared *client* may be passed for connection reuse; otherwise a
    temporary client is created and closed here.
    """
    import httpx  # function-local so the module imports without httpx

    from app.config import settings

    owns_client = client is None
    if owns_client:
        client = httpx.AsyncClient(timeout=60)
    try:
        resp = await client.post(
            f"{settings.ollama_base_url}/api/embeddings",
            json={"model": settings.ollama_embed_model, "prompt": text},
        )
        resp.raise_for_status()
        return resp.json()["embedding"]
    finally:
        if owns_client:
            await client.aclose()


async def get_embeddings_batch(texts: list[str], client: "httpx.AsyncClient") -> list[list[float]]:
    """Get embeddings for multiple texts, one request per text."""
    return [await get_embedding(t, client) for t in texts]


async def embed_experiences(batch_size: int = 20):
    """Chunk and embed every experience report that has no chunks yet.

    Commits every *batch_size* experiences so progress survives an
    interruption; returns the number of DocumentChunk rows created.
    """
    import httpx
    from sqlalchemy import select

    from app.config import settings
    from app.database import async_session
    from app.models import DocumentChunk, Experience

    async with async_session() as db:
        # Experiences that already have chunks are skipped (idempotent re-runs).
        subq = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "experience"
        ).distinct()
        result = await db.execute(select(Experience).where(Experience.id.not_in(subq)))
        experiences = result.scalars().all()
        logger.info(f"Found {len(experiences)} experiences to embed")

        async with httpx.AsyncClient(timeout=60) as client:
            total_chunks = 0
            for i, exp in enumerate(experiences):
                # Prepend searchable metadata so it gets embedded with the text.
                header = f"Experience Report: {exp.title}\n"
                header += f"Substance: {exp.substance}\n"
                if exp.category:
                    header += f"Category: {exp.category}\n"
                if exp.gender:
                    header += f"Gender: {exp.gender}\n"
                if exp.age:
                    header += f"Age: {exp.age}\n"
                header += "\n"

                full_text = header + exp.body
                chunks = chunk_text(full_text, settings.chunk_size, settings.chunk_overlap)

                for idx, chunk_content in enumerate(chunks):
                    embedding = await get_embedding(chunk_content, client)
                    metadata = json.dumps({
                        "title": exp.title,
                        "substance": exp.substance,
                        "category": exp.category,
                        "erowid_id": exp.erowid_id,
                    })
                    db.add(DocumentChunk(
                        source_type="experience",
                        source_id=exp.id,
                        chunk_index=idx,
                        content=chunk_content,
                        metadata_json=metadata,
                        embedding=embedding,
                    ))
                    total_chunks += 1

                if (i + 1) % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {i + 1} experiences ({total_chunks} chunks)")

            await db.commit()
            logger.info(f"Done! Created {total_chunks} chunks from {len(experiences)} experiences")
            return total_chunks
async def embed_substances(batch_size: int = 10):
    """Chunk and embed all substance info pages that lack chunks.

    Commits every *batch_size* substances; returns the number of
    DocumentChunk rows created.
    """
    import httpx
    from sqlalchemy import select

    from app.config import settings
    from app.database import async_session
    from app.models import DocumentChunk, Substance

    async with async_session() as db:
        # Substances that already have chunks are excluded (idempotent re-runs).
        already_done = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "substance"
        ).distinct()
        rows = await db.execute(select(Substance).where(Substance.id.not_in(already_done)))
        pending = rows.scalars().all()
        logger.info(f"Found {len(pending)} substances to embed")

        async with httpx.AsyncClient(timeout=60) as client:
            created = 0
            for count, sub in enumerate(pending, start=1):
                # Assemble one labelled text document from the vault fields,
                # skipping sections the scraper did not find.
                parts = [f"Substance Information: {sub.name}"]
                if sub.category:
                    parts.append(f"Category: {sub.category}")
                for label, value in (
                    ("Overview", sub.description),
                    ("Effects", sub.effects),
                    ("Dosage", sub.dosage),
                    ("Duration", sub.duration),
                    ("Chemistry", sub.chemistry),
                    ("Health & Safety", sub.health),
                    ("Legal Status", sub.law),
                ):
                    if value:
                        parts.append(f"\n{label}:\n{value}")

                document = "\n".join(parts)
                pieces = chunk_text(document, settings.chunk_size, settings.chunk_overlap)

                for position, piece in enumerate(pieces):
                    vector = await get_embedding(piece, client)
                    db.add(DocumentChunk(
                        source_type="substance",
                        source_id=sub.id,
                        chunk_index=position,
                        content=piece,
                        metadata_json=json.dumps({
                            "substance": sub.name,
                            "category": sub.category,
                        }),
                        embedding=vector,
                    ))
                    created += 1

                if count % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {count} substances ({created} chunks)")

            await db.commit()
            logger.info(f"Done! Created {created} chunks from {len(pending)} substances")
            return created
async def embed_all():
    """Embed everything that hasn't been embedded yet."""
    # Dict literal evaluates left-to-right: experiences first, then substances.
    return {
        "experience_chunks": await embed_experiences(),
        "substance_chunks": await embed_substances(),
    }

# --- app/llm.py ---
import json
import logging
from typing import AsyncGenerator

import httpx

from app.config import settings

logger = logging.getLogger(__name__)


async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from Ollama, yielding text fragments."""
    # Prepend the system prompt, if any, to the outgoing message list.
    chat_messages = ([{"role": "system", "content": system}] if system else []) + messages

    request_body = {
        "model": settings.ollama_chat_model,
        "messages": chat_messages,
        "stream": True,
    }

    # Long read timeout: a local model can pause a while between tokens.
    limits = httpx.Timeout(connect=30, read=600, write=30, pool=30)
    async with httpx.AsyncClient(timeout=limits) as client:
        async with client.stream(
            "POST",
            f"{settings.ollama_base_url}/api/chat",
            json=request_body,
        ) as resp:
            resp.raise_for_status()
            pending = b""
            async for part in resp.aiter_bytes():
                pending += part
                # Ollama emits one JSON object per newline-terminated line.
                while b"\n" in pending:
                    raw, pending = pending.split(b"\n", 1)
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        event = json.loads(raw)
                    except json.JSONDecodeError:
                        continue
                    if "message" in event and "content" in event["message"]:
                        piece = event["message"]["content"]
                        if piece:
                            yield piece
                    if event.get("done"):
                        return


async def stream_claude(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from the Claude API."""
    try:
        from anthropic import AsyncAnthropic
    except ImportError:
        raise RuntimeError("anthropic package not installed")

    client = AsyncAnthropic(api_key=settings.anthropic_api_key)

    async with client.messages.stream(
        model="claude-sonnet-4-5-20250929",
        max_tokens=2048,
        system=system,
        messages=messages,
    ) as event_stream:
        async for delta in event_stream.text_stream:
            yield delta
async def stream_openai(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from the OpenAI API."""
    try:
        from openai import AsyncOpenAI
    except ImportError:
        raise RuntimeError("openai package not installed")

    client = AsyncOpenAI(api_key=settings.openai_api_key)

    # Prepend the system prompt, if any, to the outgoing message list.
    outgoing = ([{"role": "system", "content": system}] if system else []) + messages

    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=outgoing,
        max_tokens=2048,
        stream=True,
    )

    async for event in response:
        if event.choices and event.choices[0].delta.content:
            yield event.choices[0].delta.content


async def stream_chat(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Dispatch to the configured LLM provider, yielding response tokens.

    Misconfiguration (missing key, unknown provider) is reported as a
    single yielded error string rather than an exception, so the SSE
    stream still delivers something readable to the UI.
    """
    provider = settings.llm_provider.lower()

    if provider == "ollama":
        source = stream_ollama(messages, system)
    elif provider == "claude":
        if not settings.anthropic_api_key:
            yield "Error: ANTHROPIC_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
            return
        source = stream_claude(messages, system)
    elif provider == "openai":
        if not settings.openai_api_key:
            yield "Error: OPENAI_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
            return
        source = stream_openai(messages, system)
    else:
        yield f"Error: Unknown LLM_PROVIDER '{provider}'. Use ollama, claude, or openai."
        return

    async for token in source:
        yield token
# --- app/main.py ---
import asyncio
import json
import logging
import uuid
from contextlib import asynccontextmanager
from pathlib import Path

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
from fastapi.staticfiles import StaticFiles

from app.config import settings
from app.database import init_db, async_session
from app.rag import chat_stream

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# In-memory session store (conversation history per session).
# NOTE(review): unbounded across sessions — fine for a single-process demo,
# but a TTL store is needed if this runs long-lived.
sessions: dict[str, list[dict]] = {}


async def _optional_json_body(request: Request) -> dict:
    """Parse the request body as JSON when the Content-Type says JSON, else {}.

    Uses startswith() because an exact ``== "application/json"`` comparison
    rejected legitimate values like "application/json; charset=utf-8".
    """
    content_type = request.headers.get("content-type", "")
    if content_type.startswith("application/json"):
        return await request.json()
    return {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the pgvector extension and tables before serving requests."""
    logger.info("Initializing database...")
    await init_db()
    logger.info("Database ready.")
    yield


app = FastAPI(title="Erowid Bot", lifespan=lifespan)

# Serve static files
static_dir = Path(__file__).parent / "static"
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")


@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the single-page chat UI."""
    return (static_dir / "index.html").read_text()


@app.get("/health")
async def health():
    """Liveness probe."""
    return {"status": "ok"}


@app.post("/chat")
async def chat(request: Request):
    """Chat endpoint with a streaming SSE response.

    Body: {"message": str, "session_id": str (optional)}. Streams
    ``data: {"token": ...}`` events followed by a final
    ``{"done": true, "session_id": ...}`` event.
    """
    body = await request.json()
    message = body.get("message", "").strip()
    session_id = body.get("session_id", "")

    if not message:
        return JSONResponse({"error": "Empty message"}, status_code=400)

    if not session_id:
        session_id = str(uuid.uuid4())

    # Get or create conversation history
    history = sessions.get(session_id, [])

    async def generate():
        full_response = ""
        try:
            async for token in chat_stream(message, history):
                full_response += token
                yield f"data: {json.dumps({'token': token})}\n\n"
        except Exception as e:
            logger.error(f"Chat error: {e}")
            yield f"data: {json.dumps({'error': str(e)})}\n\n"

        # Save to history (even after an error, so the exchange is visible to
        # follow-up turns); keep it bounded to the last 20 messages.
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": full_response})
        if len(history) > 20:
            history[:] = history[-20:]
        sessions[session_id] = history

        yield f"data: {json.dumps({'done': True, 'session_id': session_id})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",  # disable proxy buffering for SSE
        },
    )


@app.get("/stats")
async def stats():
    """Row counts for experiences, substances, and embedded chunks."""
    from sqlalchemy import func, select
    from app.models import Experience, Substance, DocumentChunk

    async with async_session() as db:
        exp_count = (await db.execute(select(func.count(Experience.id)))).scalar() or 0
        sub_count = (await db.execute(select(func.count(Substance.id)))).scalar() or 0
        chunk_count = (await db.execute(select(func.count(DocumentChunk.id)))).scalar() or 0

    return {
        "experiences": exp_count,
        "substances": sub_count,
        "chunks": chunk_count,
    }


@app.post("/admin/scrape/experiences")
async def trigger_scrape_experiences(request: Request):
    """Start experience scraping as a background task (admin endpoint)."""
    body = await _optional_json_body(request)
    limit = body.get("limit")

    from app.scraper.experiences import scrape_all_experiences
    asyncio.create_task(scrape_all_experiences(limit=limit))
    return {"status": "started", "message": "Experience scraping started in background"}


@app.post("/admin/scrape/substances")
async def trigger_scrape_substances(request: Request):
    """Start substance scraping as a background task (admin endpoint)."""
    body = await _optional_json_body(request)
    limit = body.get("limit")

    from app.scraper.substances import scrape_all_substances
    asyncio.create_task(scrape_all_substances(limit=limit))
    return {"status": "started", "message": "Substance scraping started in background"}
@app.post("/admin/embed")
async def trigger_embedding():
    """Trigger embedding pipeline (admin endpoint).

    Fire-and-forget: the pipeline runs as a background task and commits its
    own progress; poll /stats to watch chunk counts grow.
    """
    from app.embeddings import embed_all
    asyncio.create_task(embed_all())
    return {"status": "started", "message": "Embedding pipeline started in background"}

# --- app/models.py ---
from sqlalchemy import Column, Integer, String, Text, Boolean, Float, ForeignKey
from sqlalchemy.dialects.postgresql import ARRAY, TIMESTAMP
from pgvector.sqlalchemy import Vector
from datetime import datetime, timezone
from app.database import Base


class Substance(Base):
    """One scraped Erowid substance vault page."""

    __tablename__ = "substances"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(255), unique=True, nullable=False, index=True)
    url = Column(String(1024))
    category = Column(String(255))  # e.g. "Psychedelics", "Stimulants"
    # Vault sections as plain text; NULL when the section is missing on site.
    description = Column(Text)
    effects = Column(Text)
    dosage = Column(Text)
    duration = Column(Text)
    chemistry = Column(Text)
    health = Column(Text)
    law = Column(Text)
    raw_html = Column(Text)  # original page, kept so fields can be re-parsed
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))


class Experience(Base):
    """One scraped Erowid experience report (exp.php?ID=...)."""

    __tablename__ = "experiences"

    id = Column(Integer, primary_key=True, autoincrement=True)
    erowid_id = Column(Integer, unique=True, index=True)  # Erowid's own report ID
    title = Column(String(512))
    author = Column(String(255))
    substance = Column(String(512))  # may list multiple substances
    substance_list = Column(ARRAY(String))  # parsed list
    body = Column(Text, nullable=False)
    category = Column(String(255))  # e.g. "General", "First Times", "Bad Trips"
    gender = Column(String(50))
    age = Column(String(50))  # free-form scraped text, hence String not Integer
    year = Column(Integer)
    url = Column(String(1024))
    intensity = Column(String(100))
    raw_html = Column(Text)  # original page, kept so fields can be re-parsed
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))


class DocumentChunk(Base):
    """An embedded text chunk used for RAG retrieval (pgvector-backed)."""

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)
    source_type = Column(String(50), nullable=False, index=True)  # "experience" or "substance"
    source_id = Column(Integer, nullable=False, index=True)  # id of the source row
    chunk_index = Column(Integer, nullable=False)  # position of the chunk in its document
    content = Column(Text, nullable=False)
    metadata_json = Column(Text)  # JSON string with extra metadata
    embedding = Column(Vector(768))  # nomic-embed-text dimension
    created_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
Say when info is limited.""" + + +async def retrieve_context(query: str, top_k: int | None = None) -> list[dict]: + """Retrieve the most relevant document chunks for a query.""" + if top_k is None: + top_k = settings.retrieval_top_k + + # Get query embedding + query_embedding = await get_embedding(query) + + async with async_session() as db: + # Use pgvector cosine distance for similarity search + result = await db.execute( + text(""" + SELECT id, source_type, source_id, chunk_index, content, metadata_json, + embedding <=> :query_embedding AS distance + FROM document_chunks + ORDER BY embedding <=> :query_embedding + LIMIT :top_k + """), + {"query_embedding": str(query_embedding), "top_k": top_k}, + ) + + chunks = [] + for row in result.fetchall(): + metadata = {} + if row[5]: + try: + metadata = json.loads(row[5]) + except json.JSONDecodeError: + pass + + chunks.append({ + "id": row[0], + "source_type": row[1], + "source_id": row[2], + "chunk_index": row[3], + "content": row[4], + "metadata": metadata, + "distance": row[6], + }) + + return chunks + + +def build_context_prompt(chunks: list[dict]) -> str: + """Build a context string from retrieved chunks.""" + if not chunks: + return "\n[No relevant documents found in the database.]\n" + + context_parts = [] + for i, chunk in enumerate(chunks, 1): + source_label = chunk["source_type"].title() + metadata = chunk["metadata"] + + header = f"--- Source {i} ({source_label})" + if "title" in metadata: + header += f" | {metadata['title']}" + if "substance" in metadata: + header += f" | Substance: {metadata['substance']}" + header += " ---" + + # Limit each chunk to avoid overwhelming the LLM + content = chunk["content"][:800] + context_parts.append(f"{header}\n{content}") + + return "\n\n".join(context_parts) + + +async def chat_stream( + user_message: str, + conversation_history: list[dict] | None = None, +) -> AsyncGenerator[str, None]: + """Full RAG pipeline: retrieve context, build prompt, stream LLM response.""" + # 
Retrieve relevant chunks + chunks = await retrieve_context(user_message) + + # Build the context-augmented system prompt + context_text = build_context_prompt(chunks) + full_system = f"{SYSTEM_PROMPT}\n\n--- RELEVANT EROWID DATA ---\n{context_text}\n--- END EROWID DATA ---" + + # Build message history + messages = [] + if conversation_history: + # Keep last 10 messages for context + messages = conversation_history[-10:] + + messages.append({"role": "user", "content": user_message}) + + # Stream from LLM + async for token in stream_chat(messages, system=full_system): + yield token diff --git a/app/scraper/__init__.py b/app/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/scraper/experiences.py b/app/scraper/experiences.py new file mode 100644 index 0000000..2f3f880 --- /dev/null +++ b/app/scraper/experiences.py @@ -0,0 +1,244 @@ +import asyncio +import re +import logging +from datetime import datetime, timezone + +import httpx +from bs4 import BeautifulSoup +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import settings +from app.database import async_session +from app.models import Experience + +logger = logging.getLogger(__name__) + +BASE_URL = "https://erowid.org" +EXP_LIST_URL = "https://erowid.org/experiences/exp_list.shtml" +REPORT_URL = "https://erowid.org/experiences/exp.php?ID={id}" + +HEADERS = { + "User-Agent": "ErowidResearchBot/1.0 (educational research project)", + "Accept": "text/html,application/xhtml+xml", +} + + +async def get_all_substance_pages(client: httpx.AsyncClient) -> list[dict]: + """Get main substance experience listing pages from the master index. + + Only fetches top-level substance pages (e.g. exp_LSD.shtml), not + category sub-pages (e.g. exp_LSD_General.shtml) since the main page + already contains all report IDs for that substance. 
+ """ + resp = await client.get(EXP_LIST_URL, headers=HEADERS, timeout=60) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "lxml") + + pages = [] + seen_substances = set() + for a in soup.select("a[href]"): + href = a.get("href", "") + name = a.get_text(strip=True) + if not href.startswith("subs/exp_") or not href.endswith(".shtml") or not name: + continue + + full_url = f"https://erowid.org/experiences/{href}" + + # Extract the base substance name from the URL + # e.g. subs/exp_LSD.shtml -> LSD, subs/exp_LSD_General.shtml -> LSD + filename = href.replace("subs/exp_", "").replace(".shtml", "") + + # Skip category sub-pages — they contain subsets of the main page + # Category sub-pages have suffixes like _General, _First_Times, _Bad_Trips, etc. + known_categories = [ + "_General", "_First_Times", "_Combinations", "_Retrospective", + "_Preparation", "_Difficult_Experiences", "_Bad_Trips", + "_Health_Problems", "_Train_Wrecks", "_Glowing_Experiences", + "_Mystical_Experiences", "_Health_Benefits", "_What_Was_in_That", + "_Medical_Use", "_Performance_Enhancement", "_Addiction", + ] + is_category = any(filename.endswith(cat) for cat in known_categories) + if is_category: + continue + + if full_url not in seen_substances: + seen_substances.add(full_url) + pages.append({"name": name, "url": full_url}) + + logger.info(f"Found {len(pages)} main substance experience pages (filtered from category sub-pages)") + return pages + + +async def get_experience_ids_from_page(client: httpx.AsyncClient, url: str) -> list[int]: + """Extract all experience report IDs from a substance listing page.""" + try: + resp = await client.get(url, headers=HEADERS, timeout=30) + resp.raise_for_status() + except httpx.HTTPError as e: + logger.warning(f"Failed to fetch {url}: {e}") + return [] + + ids = [int(x) for x in re.findall(r"exp\.php\?ID=(\d+)", resp.text)] + return list(set(ids)) # dedupe + + +async def get_all_experience_ids(client: httpx.AsyncClient) -> list[int]: + """Collect 
all unique experience IDs from all substance pages. + + Fetches pages concurrently in batches of 5 for speed. + """ + pages = await get_all_substance_pages(client) + + all_ids = set() + batch_size = 5 + for i in range(0, len(pages), batch_size): + batch = pages[i : i + batch_size] + tasks = [get_experience_ids_from_page(client, p["url"]) for p in batch] + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, list): + all_ids.update(result) + logger.info(f"Scanned {min(i + batch_size, len(pages))}/{len(pages)} pages, {len(all_ids)} unique IDs") + await asyncio.sleep(0.5) + + logger.info(f"Found {len(all_ids)} unique experience IDs total") + return sorted(all_ids) + + +async def scrape_experience_report(client: httpx.AsyncClient, erowid_id: int) -> dict | None: + """Scrape a single experience report.""" + url = REPORT_URL.format(id=erowid_id) + try: + resp = await client.get(url, headers=HEADERS, timeout=30) + resp.raise_for_status() + except httpx.HTTPError as e: + logger.warning(f"Failed to fetch experience {erowid_id}: {e}") + return None + + soup = BeautifulSoup(resp.text, "lxml") + + # Extract the main report body + body_div = soup.select_one("div.report-text-surround") + if not body_div: + logger.warning(f"No report body found for {erowid_id}") + return None + + # Remove the dosechart from body text to avoid duplication + body_text_parts = [] + for el in body_div.children: + if hasattr(el, "name") and el.name == "table": + continue # skip dosechart table + text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip() + if text: + body_text_parts.append(text) + + body = "\n\n".join(body_text_parts) + if not body or len(body) < 50: + return None + + # Extract metadata + title = "" + title_el = soup.select_one("div.title") + if title_el: + title = title_el.get_text(strip=True) + + substance = "" + sub_el = soup.select_one("div.substance") + if sub_el: + substance = 
async def scrape_experience_report(client: "httpx.AsyncClient", erowid_id: int) -> dict | None:
    """Scrape a single experience report into a dict of Experience fields.

    Returns None when the page cannot be fetched, has no report body, or
    the body is too short (< 50 chars) to be worth indexing.

    Fixes over the original:
    - gender detection checked ``"male" in wt`` before "female", so every
      female report was tagged "Male" ("female" contains "male"); the
      "female" test now runs first.
    - the author line used ``.replace("by ", "")``, which mangled any
      pseudonym containing "by " (e.g. "Baby Driver" -> "BaDriver");
      ``removeprefix`` strips only the leading "by ".
    """
    import httpx
    from bs4 import BeautifulSoup

    url = REPORT_URL.format(id=erowid_id)
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch experience {erowid_id}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the main report body
    body_div = soup.select_one("div.report-text-surround")
    if not body_div:
        logger.warning(f"No report body found for {erowid_id}")
        return None

    # Collect text, skipping <table> children — the dosechart is captured
    # separately below, so including it here would duplicate it.
    body_text_parts = []
    for el in body_div.children:
        if hasattr(el, "name") and el.name == "table":
            continue
        text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip()
        if text:
            body_text_parts.append(text)

    body = "\n\n".join(body_text_parts)
    if not body or len(body) < 50:
        return None

    # Extract metadata
    title = ""
    title_el = soup.select_one("div.title")
    if title_el:
        title = title_el.get_text(strip=True)

    substance = ""
    sub_el = soup.select_one("div.substance")
    if sub_el:
        substance = sub_el.get_text(strip=True)

    substance_list = [s.strip() for s in re.split(r"[,&]", substance) if s.strip()]

    author = ""
    author_el = soup.select_one("div.author")
    if author_el:
        author = author_el.get_text(strip=True).removeprefix("by ")

    # Dosage table -> "cell cell; cell cell; ..." summary
    dose_table = soup.select_one("table.dosechart")
    dose_text = ""
    if dose_table:
        dose_parts = []
        for row in dose_table.select("tr"):
            cells = row.select("td")
            row_text = " ".join(c.get_text(strip=True) for c in cells if c.get_text(strip=True))
            if row_text:
                dose_parts.append(row_text)
        dose_text = "; ".join(dose_parts)

    if dose_text:
        body = f"Dosage: {dose_text}\n\n{body}"

    # Age / gender from the bodyweight line inside report-text-surround
    gender = ""
    age = ""
    weight_el = soup.select_one("table.bodyweight")
    if weight_el:
        wt = weight_el.get_text(strip=True)
        age_match = re.search(r"(\d+)\s*yr", wt, re.IGNORECASE)
        if age_match:
            age = age_match.group(1)
        lowered = wt.lower()
        # "female" must be tested first: the substring "male" occurs in it.
        if "female" in lowered:
            gender = "Female"
        elif "male" in lowered:
            gender = "Male"

    # Try to extract category from the page
    category = ""
    cat_el = soup.select_one("div.foot-eroid-cat")
    if cat_el:
        category = cat_el.get_text(strip=True)

    return {
        "erowid_id": erowid_id,
        "title": title,
        "author": author,
        "substance": substance,
        "substance_list": substance_list,
        "body": body,
        "category": category,
        "gender": gender,
        "age": age,
        "url": url,
        "raw_html": resp.text,
    }


async def scrape_all_experiences(limit: int | None = None):
    """Main scraper entry point. Scrapes all experience reports into the database.

    Skips IDs already stored, commits every ``scrape_batch_size`` reports,
    and sleeps ``scrape_delay`` seconds between requests (be polite to
    Erowid). Returns the number of newly scraped reports.
    """
    import httpx
    from sqlalchemy import select

    from app.config import settings
    from app.database import async_session
    from app.models import Experience

    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        all_ids = await get_all_experience_ids(client)

        if limit:
            all_ids = all_ids[:limit]

        async with async_session() as db:
            result = await db.execute(select(Experience.erowid_id))
            existing_ids = {row[0] for row in result.fetchall()}
            logger.info(f"Already have {len(existing_ids)} experiences in DB")

            to_scrape = [eid for eid in all_ids if eid not in existing_ids]
            logger.info(f"Need to scrape {len(to_scrape)} new experiences")

            scraped = 0
            errors = 0
            for eid in to_scrape:
                data = await scrape_experience_report(client, eid)
                if data:
                    db.add(Experience(**data))
                    scraped += 1
                    if scraped % settings.scrape_batch_size == 0:
                        await db.commit()
                        logger.info(f"Committed batch: {scraped}/{len(to_scrape)} scraped ({errors} errors)")
                else:
                    errors += 1

                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new experiences ({errors} errors)")
            return scraped
import asyncio
import re
import logging
from datetime import datetime, timezone

import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database import async_session
from app.models import Substance

logger = logging.getLogger(__name__)

BASE_URL = "https://erowid.org"
VAULT_INDEX = "https://erowid.org/chemicals/"

HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}

# Known substance categories on Erowid
CATEGORIES = [
    "Psychedelics", "Empathogens", "Stimulants", "Depressants",
    "Dissociatives", "Cannabis", "Opioids", "Nootropics",
    "Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
]


async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Get all substance vault URLs from the chemicals and plants indexes.

    Returns a list of ``{"name", "url"[, "category"]}`` dicts, deduplicated
    by URL (the index pages link the same vault from several places).
    """
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    substances: list[dict] = []
    seen_urls: set[str] = set()

    def _add(entry: dict) -> None:
        # Dedupe on the resolved URL so repeated index links don't produce
        # duplicate rows (and duplicate scrapes) downstream.
        if entry["url"] not in seen_urls:
            seen_urls.add(entry["url"])
            substances.append(entry)

    # The chemicals index lists substances with links to their vaults
    for link in soup.select("a[href]"):
        href = link.get("href", "")
        text = link.get_text(strip=True)
        # Filter for substance vault links (e.g., /chemicals/lsd/)
        if re.match(r"^/chemicals/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
            if text and len(text) > 1 and not text.startswith("["):
                full_url = href if href.startswith("http") else f"{BASE_URL}{href}" if href.startswith("/") else f"{VAULT_INDEX}{href}"
                _add({"name": text, "url": full_url})

    # Also scrape the plants/herbs section
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
        soup2 = BeautifulSoup(resp2.text, "lxml")
        for link in soup2.select("a[href]"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            if re.match(r"^/plants/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
                if text and len(text) > 1 and not text.startswith("["):
                    full_url = href if href.startswith("http") else f"{BASE_URL}{href}" if href.startswith("/") else f"{plants_url}{href}"
                    _add({"name": text, "url": full_url, "category": "Plants & Herbs"})
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")

    logger.info(f"Found {len(substances)} substance URLs")
    return substances


async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault page (and known sub-pages) for key information.

    Returns a dict shaped for the Substance model, or None when the main
    vault page cannot be fetched.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch substance {name}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")
    raw_html = resp.text

    # Extract text content from the vault main page, falling back through
    # the content containers Erowid has used over the years.
    description = ""
    main_content = (
        soup.select_one("div.sum-content")
        or soup.select_one("td.content")
        or soup.select_one("body")
    )
    if main_content:
        description = main_content.get_text(separator="\n", strip=True)[:5000]

    # Try to find sub-pages: effects, dose, duration, health, law.
    # Several candidate paths per section; the first that responds wins.
    sections: dict[str, str] = {}
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }

    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            # Throttle *every* probe. The original slept only after misses
            # (the sleep sat below the success `break`), so successful
            # sub-page fetches went out with no delay at all.
            await asyncio.sleep(0.5)
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
            except httpx.HTTPError:
                continue
            if sub_resp.status_code != 200:
                continue
            sub_soup = BeautifulSoup(sub_resp.text, "lxml")
            content_el = sub_soup.select_one("div.sum-content") or sub_soup.select_one("td.content")
            if content_el:
                sections[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
            break

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }


async def scrape_all_substances(limit: int | None = None) -> int:
    """Main entry point: scrape all substance vaults not already stored.

    Commits every 10 new substances so an interrupted run keeps most of
    its progress.

    Args:
        limit: Optional cap on how many index entries to consider.

    Returns:
        Number of newly scraped substances.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)

        if limit:
            substance_list = substance_list[:limit]

        async with async_session() as db:
            # Get already-scraped substances (case-insensitive on name).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info(f"Already have {len(existing)} substances in DB")

            to_scrape = [s for s in substance_list if s["name"].lower() not in existing]
            logger.info(f"Need to scrape {len(to_scrape)} new substances")

            scraped = 0
            seen_names: set[str] = set()
            for sub_meta in to_scrape:
                # Guard against duplicate names within this run (index pages
                # may still yield the same vault under two URLs) to avoid
                # inserting duplicate rows.
                name_key = sub_meta["name"].lower()
                if name_key in seen_names:
                    continue
                seen_names.add(name_key)

                # One bad vault page must not abort the whole run.
                try:
                    data = await scrape_substance_vault(
                        client, sub_meta["name"], sub_meta["url"]
                    )
                except Exception:
                    logger.exception(f"Unexpected error scraping {sub_meta['name']}")
                    data = None

                if data:
                    data["category"] = sub_meta.get("category", "")
                    sub = Substance(**data)
                    db.add(sub)
                    scraped += 1

                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info(f"Committed: {scraped} substances scraped")

                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new substances")
            return scraped
const chatContainer = document.getElementById("chat-container");
const messageInput = document.getElementById("message-input");
const sendBtn = document.getElementById("send-btn");
const welcomeEl = document.getElementById("welcome");
const statsEl = document.getElementById("stats");

let sessionId = localStorage.getItem("erowid_session") || "";
let isStreaming = false;

// Fetch corpus counts for the header badge; show a placeholder while the
// backend is still starting up.
async function loadStats() {
  try {
    const resp = await fetch("/stats");
    const data = await resp.json();
    statsEl.textContent = `${data.experiences} reports | ${data.substances} substances | ${data.chunks} chunks`;
  } catch {
    statsEl.textContent = "connecting...";
  }
}
loadStats();

// Auto-resize textarea (capped at 120px, matching the CSS max-height)
messageInput.addEventListener("input", () => {
  messageInput.style.height = "auto";
  messageInput.style.height = Math.min(messageInput.scrollHeight, 120) + "px";
});

// Send on Enter (Shift+Enter for newline)
messageInput.addEventListener("keydown", (e) => {
  if (e.key === "Enter" && !e.shiftKey) {
    e.preventDefault();
    sendMessage();
  }
});

sendBtn.addEventListener("click", sendMessage);

// Suggestion chips pre-fill and immediately send
document.querySelectorAll(".suggestion").forEach((el) => {
  el.addEventListener("click", () => {
    messageInput.value = el.textContent;
    sendMessage();
  });
});

// Append a chat bubble; returns the content element so callers can stream
// tokens into it.
function addMessage(role, content) {
  if (welcomeEl) welcomeEl.style.display = "none";

  const msg = document.createElement("div");
  msg.className = `message ${role}`;

  const avatar = document.createElement("div");
  avatar.className = "message-avatar";
  avatar.textContent = role === "user" ? "You" : "E";

  const contentEl = document.createElement("div");
  contentEl.className = "message-content";
  contentEl.textContent = content;

  msg.appendChild(avatar);
  msg.appendChild(contentEl);
  chatContainer.appendChild(msg);
  chatContainer.scrollTop = chatContainer.scrollHeight;

  return contentEl;
}

function addTypingIndicator() {
  const msg = document.createElement("div");
  msg.className = "message assistant";
  msg.id = "typing-indicator";

  const avatar = document.createElement("div");
  avatar.className = "message-avatar";
  avatar.textContent = "E";

  const contentEl = document.createElement("div");
  contentEl.className = "message-content";
  // Three bouncing dots — markup must match the `.typing span` animation
  // rules in style.css (the original string here had been emptied out).
  contentEl.innerHTML = '<div class="typing"><span></span><span></span><span></span></div>';

  msg.appendChild(avatar);
  msg.appendChild(contentEl);
  chatContainer.appendChild(msg);
  chatContainer.scrollTop = chatContainer.scrollHeight;
}

function removeTypingIndicator() {
  const el = document.getElementById("typing-indicator");
  if (el) el.remove();
}

// POST the message and stream the SSE response token-by-token into a new
// assistant bubble.
async function sendMessage() {
  const text = messageInput.value.trim();
  if (!text || isStreaming) return;

  isStreaming = true;
  sendBtn.disabled = true;
  messageInput.value = "";
  messageInput.style.height = "auto";

  addMessage("user", text);
  addTypingIndicator();

  try {
    const resp = await fetch("/chat", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ message: text, session_id: sessionId }),
    });
    // Surface HTTP errors instead of silently streaming nothing.
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);

    removeTypingIndicator();
    const contentEl = addMessage("assistant", "");

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    let fullResponse = "";

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      buffer = lines.pop() || "";

      for (const line of lines) {
        if (!line.startsWith("data: ")) continue;
        const jsonStr = line.slice(6).trim();
        if (!jsonStr) continue;

        try {
          const data = JSON.parse(jsonStr);
          if (data.token) {
            fullResponse += data.token;
            contentEl.textContent = fullResponse;
            chatContainer.scrollTop = chatContainer.scrollHeight;
          }
          if (data.session_id) {
            sessionId = data.session_id;
            localStorage.setItem("erowid_session", sessionId);
          }
          if (data.error) {
            contentEl.textContent = `Error: ${data.error}`;
          }
        } catch {} // tolerate partial/malformed SSE frames
      }
    }
  } catch (err) {
    removeTypingIndicator();
    addMessage("assistant", `Connection error: ${err.message}`);
  } finally {
    // Always re-enable input, even if the streaming loop throws — the
    // original reset these outside a finally, so an error could lock the UI.
    isStreaming = false;
    sendBtn.disabled = false;
    messageInput.focus();
  }
}
<!DOCTYPE html>
<!-- NOTE(review): the markup in this capture was stripped of tags; this
     structure is reconstructed from the element ids queried by app.js
     (chat-container, message-input, send-btn, welcome, stats) and the
     class names styled in style.css. Verify against the original file. -->
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Erowid Bot</title>
  <link rel="stylesheet" href="/static/style.css">
</head>
<body>
  <header class="header">
    <div class="header-left">
      <div class="logo">Erowid Bot</div>
      <div class="tagline">Harm Reduction Knowledge Assistant</div>
    </div>
    <div class="stats-badge" id="stats">loading...</div>
  </header>

  <main class="chat-container" id="chat-container">
    <div class="welcome" id="welcome">
      <h2>Explore the Erowid Database</h2>
      <p>
        Ask questions about substances, experience reports, dosage information,
        effects, safety, and more. All information is sourced from the Erowid vault
        and experience reports. This bot prioritizes harm reduction and safety.
      </p>
      <!-- NOTE(review): original suggestion texts were lost in this capture;
           these are placeholders — restore the originals. -->
      <div class="suggestions">
        <button class="suggestion">What are the effects of psilocybin?</button>
        <button class="suggestion">Tell me about LSD dosage</button>
        <button class="suggestion">What should I know about MDMA safety?</button>
        <button class="suggestion">Summarize ketamine experience reports</button>
        <button class="suggestion">How long does DMT last?</button>
      </div>
    </div>
  </main>

  <div class="input-container">
    <div class="input-wrapper">
      <textarea id="message-input" rows="1"
                placeholder="Ask about substances, experiences, safety..."></textarea>
      <button id="send-btn" aria-label="Send">
        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor"
             stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
          <line x1="22" y1="2" x2="11" y2="13"></line>
          <polygon points="22 2 15 22 11 13 2 9 22 2"></polygon>
        </svg>
      </button>
    </div>
    <div class="disclaimer">
      Information sourced from Erowid.org. Not medical advice. Always practice harm reduction.
    </div>
  </div>

  <script src="/static/app.js"></script>
</body>
</html>
+ + + + diff --git a/app/static/style.css b/app/static/style.css new file mode 100644 index 0000000..eacfe3e --- /dev/null +++ b/app/static/style.css @@ -0,0 +1,326 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +:root { + --bg-primary: #0a0a0f; + --bg-secondary: #12121a; + --bg-tertiary: #1a1a2e; + --text-primary: #e0e0e8; + --text-secondary: #8888a0; + --accent: #6c5ce7; + --accent-glow: rgba(108, 92, 231, 0.3); + --user-bg: #2d2d44; + --bot-bg: #1a1a2e; + --border: #2a2a3e; + --danger: #e74c3c; + --success: #2ecc71; +} + +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; + background: var(--bg-primary); + color: var(--text-primary); + height: 100vh; + display: flex; + flex-direction: column; + overflow: hidden; +} + +/* Header */ +.header { + display: flex; + align-items: center; + justify-content: space-between; + padding: 12px 20px; + background: var(--bg-secondary); + border-bottom: 1px solid var(--border); + flex-shrink: 0; +} + +.header-left { + display: flex; + align-items: center; + gap: 12px; +} + +.logo { + font-size: 24px; + font-weight: 700; + background: linear-gradient(135deg, var(--accent), #a29bfe); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +.tagline { + font-size: 12px; + color: var(--text-secondary); + letter-spacing: 0.5px; +} + +.stats-badge { + font-size: 11px; + color: var(--text-secondary); + background: var(--bg-tertiary); + padding: 4px 10px; + border-radius: 12px; + border: 1px solid var(--border); +} + +/* Chat area */ +.chat-container { + flex: 1; + overflow-y: auto; + padding: 20px; + scroll-behavior: smooth; +} + +.chat-container::-webkit-scrollbar { + width: 6px; +} +.chat-container::-webkit-scrollbar-track { + background: transparent; +} +.chat-container::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 3px; +} + +.welcome { + text-align: center; + padding: 60px 20px; + max-width: 600px; + 
margin: 0 auto; +} + +.welcome h2 { + font-size: 22px; + margin-bottom: 12px; + color: var(--text-primary); +} + +.welcome p { + color: var(--text-secondary); + line-height: 1.6; + margin-bottom: 20px; + font-size: 14px; +} + +.suggestions { + display: flex; + flex-wrap: wrap; + gap: 8px; + justify-content: center; +} + +.suggestion { + background: var(--bg-tertiary); + border: 1px solid var(--border); + color: var(--text-secondary); + padding: 8px 14px; + border-radius: 20px; + font-size: 13px; + cursor: pointer; + transition: all 0.2s; +} + +.suggestion:hover { + border-color: var(--accent); + color: var(--text-primary); + background: rgba(108, 92, 231, 0.1); +} + +/* Messages */ +.message { + display: flex; + gap: 12px; + margin-bottom: 16px; + max-width: 800px; + margin-left: auto; + margin-right: auto; + animation: fadeIn 0.3s ease; +} + +@keyframes fadeIn { + from { opacity: 0; transform: translateY(8px); } + to { opacity: 1; transform: translateY(0); } +} + +.message.user { + flex-direction: row-reverse; +} + +.message-avatar { + width: 32px; + height: 32px; + border-radius: 8px; + display: flex; + align-items: center; + justify-content: center; + font-size: 14px; + flex-shrink: 0; +} + +.message.user .message-avatar { + background: var(--user-bg); +} + +.message.assistant .message-avatar { + background: var(--accent); +} + +.message-content { + padding: 10px 16px; + border-radius: 12px; + max-width: 75%; + line-height: 1.6; + font-size: 14px; + white-space: pre-wrap; + word-wrap: break-word; +} + +.message.user .message-content { + background: var(--user-bg); + border-bottom-right-radius: 4px; +} + +.message.assistant .message-content { + background: var(--bot-bg); + border: 1px solid var(--border); + border-bottom-left-radius: 4px; +} + +.message-content p { + margin-bottom: 8px; +} +.message-content p:last-child { + margin-bottom: 0; +} + +.message-content strong { + color: #a29bfe; +} + +.message-content code { + background: rgba(108, 92, 231, 0.15); + 
padding: 1px 5px; + border-radius: 3px; + font-size: 13px; +} + +/* Typing indicator */ +.typing { + display: flex; + gap: 4px; + padding: 4px 0; +} + +.typing span { + width: 6px; + height: 6px; + background: var(--text-secondary); + border-radius: 50%; + animation: bounce 1.4s infinite; +} + +.typing span:nth-child(2) { animation-delay: 0.2s; } +.typing span:nth-child(3) { animation-delay: 0.4s; } + +@keyframes bounce { + 0%, 60%, 100% { transform: translateY(0); } + 30% { transform: translateY(-6px); } +} + +/* Input area */ +.input-container { + padding: 16px 20px; + background: var(--bg-secondary); + border-top: 1px solid var(--border); + flex-shrink: 0; +} + +.input-wrapper { + display: flex; + gap: 10px; + max-width: 800px; + margin: 0 auto; +} + +#message-input { + flex: 1; + background: var(--bg-tertiary); + border: 1px solid var(--border); + color: var(--text-primary); + padding: 12px 16px; + border-radius: 12px; + font-size: 14px; + font-family: inherit; + resize: none; + outline: none; + min-height: 44px; + max-height: 120px; + transition: border-color 0.2s; +} + +#message-input:focus { + border-color: var(--accent); + box-shadow: 0 0 0 2px var(--accent-glow); +} + +#message-input::placeholder { + color: var(--text-secondary); +} + +#send-btn { + background: var(--accent); + color: white; + border: none; + width: 44px; + height: 44px; + border-radius: 12px; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + transition: all 0.2s; + flex-shrink: 0; +} + +#send-btn:hover { + background: #5b4bd5; + transform: scale(1.05); +} + +#send-btn:disabled { + opacity: 0.5; + cursor: not-allowed; + transform: none; +} + +#send-btn svg { + width: 18px; + height: 18px; +} + +.disclaimer { + text-align: center; + font-size: 11px; + color: var(--text-secondary); + margin-top: 8px; + max-width: 800px; + margin-left: auto; + margin-right: auto; +} + +/* Mobile */ +@media (max-width: 640px) { + .header { padding: 10px 14px; } + 
.chat-container { padding: 12px; } + .message-content { max-width: 85%; font-size: 13px; } + .input-container { padding: 10px 14px; } + .suggestions { gap: 6px; } + .suggestion { font-size: 12px; padding: 6px 12px; } +} diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..6e873b6 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +services: + erowid-bot: + build: . + container_name: erowid-bot + restart: unless-stopped + env_file: .env + ports: + - "8421:8000" + depends_on: + erowid-db: + condition: service_healthy + networks: + - default + - traefik-public + - ai-internal + labels: + - "traefik.enable=true" + - "traefik.http.routers.erowid-bot.rule=Host(`erowid.jeffemmett.com`)" + - "traefik.http.routers.erowid-bot.entrypoints=websecure" + - "traefik.http.routers.erowid-bot.tls.certresolver=letsencrypt" + - "traefik.http.services.erowid-bot.loadbalancer.server.port=8000" + volumes: + - ./app:/app/app + + erowid-db: + image: pgvector/pgvector:pg16 + container_name: erowid-db + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER:-erowid} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-erowid} + POSTGRES_DB: ${POSTGRES_DB:-erowid} + volumes: + - erowid-db-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U erowid"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - default + +volumes: + erowid-db-data: + +networks: + traefik-public: + external: true + ai-internal: + external: true diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..da37b51 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +fastapi==0.115.6 +uvicorn[standard]==0.34.0 +sqlalchemy==2.0.36 +asyncpg==0.30.0 +pgvector==0.3.6 +psycopg2-binary==2.9.10 +httpx==0.28.1 +python-dotenv==1.0.1 +beautifulsoup4==4.12.3 +lxml==5.3.0 +pydantic==2.10.3 +pydantic-settings==2.7.0 +sse-starlette==2.2.1 +tiktoken==0.8.0 +anthropic==0.40.0 +openai==1.58.1