Initial commit: Erowid conversational bot
RAG-powered chatbot that indexes Erowid's experience reports and substance info, making them searchable via natural conversation. Built with FastAPI, PostgreSQL+pgvector, Ollama embeddings, and streaming LLM responses. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
commit
d09d065d08
|
|
@ -0,0 +1,24 @@
|
|||
# Database
|
||||
DATABASE_URL=postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid
|
||||
DATABASE_URL_SYNC=postgresql://erowid:erowid@erowid-db:5432/erowid
|
||||
POSTGRES_USER=erowid
|
||||
POSTGRES_PASSWORD=erowid
|
||||
POSTGRES_DB=erowid
|
||||
|
||||
# LLM Provider: ollama | claude | openai
|
||||
LLM_PROVIDER=ollama
|
||||
|
||||
# Ollama (local, free)
|
||||
OLLAMA_BASE_URL=http://ollama:11434
|
||||
OLLAMA_EMBED_MODEL=nomic-embed-text
|
||||
OLLAMA_CHAT_MODEL=llama3.2:3b
|
||||
|
||||
# Claude API (optional)
|
||||
ANTHROPIC_API_KEY=
|
||||
|
||||
# OpenAI API (optional)
|
||||
OPENAI_API_KEY=
|
||||
|
||||
# App
|
||||
APP_HOST=0.0.0.0
|
||||
APP_PORT=8000
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
__pycache__/
|
||||
*.pyc
|
||||
*.pyo
|
||||
.env
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
.venv/
|
||||
venv/
|
||||
PLAN.md
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
# Runtime image for the Erowid bot FastAPI app.
FROM python:3.12-slim

WORKDIR /app

# build-essential + libpq-dev: needed to compile Postgres client wheels
# (psycopg2 et al.) on the slim base image; apt lists removed to keep the
# layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential libpq-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python deps first so this layer is cached across code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Application code only (no tests/config) — keeps the image minimal.
COPY app/ ./app/

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application configuration, loaded from environment / .env via pydantic-settings.

    Each field's default is the value used when the corresponding env var
    (upper-cased field name) is absent.
    """

    # Async URL used by the app (asyncpg driver) and sync URL for tooling.
    database_url: str = "postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid"
    database_url_sync: str = "postgresql://erowid:erowid@erowid-db:5432/erowid"

    llm_provider: str = "ollama"  # ollama | claude | openai

    ollama_base_url: str = "http://ollama:11434"
    ollama_embed_model: str = "nomic-embed-text"
    # NOTE(review): this default (llama3.1:8b) disagrees with the .env
    # example (llama3.2:3b) — confirm which model is intended.
    ollama_chat_model: str = "llama3.1:8b"

    # API keys for the optional hosted providers; empty means "not configured".
    anthropic_api_key: str = ""
    openai_api_key: str = ""

    app_host: str = "0.0.0.0"
    app_port: int = 8000

    # Scraper settings
    scrape_delay: float = 3.0  # seconds between requests (be polite to Erowid)
    scrape_batch_size: int = 50

    # RAG settings
    chunk_size: int = 500  # tokens per chunk
    chunk_overlap: int = 50  # token overlap between chunks
    retrieval_top_k: int = 4  # number of chunks to retrieve

    class Config:
        # Read overrides from .env; silently ignore unknown variables so the
        # same .env can configure other services (e.g. POSTGRES_*).
        env_file = ".env"
        extra = "ignore"


# Module-level singleton imported throughout the app.
settings = Settings()
|
||||
|
|
@ -0,0 +1,23 @@
|
|||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
from app.config import settings
|
||||
|
||||
engine = create_async_engine(settings.database_url, echo=False)
|
||||
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
||||
|
||||
|
||||
class Base(DeclarativeBase):
    """Declarative base shared by all ORM models in app.models."""
    pass
|
||||
|
||||
|
||||
async def get_db():
    """FastAPI dependency: yield one AsyncSession per request.

    The session is committed/rolled back by callers and closed automatically
    when the context manager exits.
    """
    async with async_session() as session:
        yield session
|
||||
|
||||
|
||||
async def init_db():
    """Create the pgvector extension and all ORM tables.

    Called once at application startup (see the FastAPI lifespan handler).
    Idempotent: CREATE EXTENSION IF NOT EXISTS and create_all both no-op
    when the objects already exist.
    """
    # Local import instead of the original `__import__("sqlalchemy").text(...)`
    # hack — same module, readable and debuggable.
    from sqlalchemy import text

    async with engine.begin() as conn:
        # pgvector must exist before tables with Vector columns are created.
        await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector"))
        await conn.run_sync(Base.metadata.create_all)
|
||||
|
|
@ -0,0 +1,199 @@
|
|||
import json
|
||||
import logging
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import select, func
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import settings
|
||||
from app.database import async_session
|
||||
from app.models import Experience, Substance, DocumentChunk
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by approximate token count.

    Uses the rough heuristic 1 token ~ 0.75 words to convert the token
    budgets into word counts.

    Args:
        text: Text to split.
        chunk_size: Approximate tokens per chunk.
        overlap: Approximate token overlap between consecutive chunks.

    Returns:
        List of chunk strings; the whole text as a single chunk when it fits.
    """
    words = text.split()
    # Approximate: 1 token ~ 0.75 words. Clamp to >= 1 so tiny chunk sizes
    # cannot produce empty chunks.
    words_per_chunk = max(1, int(chunk_size * 0.75))
    words_overlap = int(overlap * 0.75)

    if len(words) <= words_per_chunk:
        return [text]

    # Advance by at least one word per iteration: the original stepped by
    # (chunk - overlap), which loops forever when overlap >= chunk_size.
    step = max(1, words_per_chunk - words_overlap)

    chunks = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + words_per_chunk]))
        start += step

    return chunks
|
||||
|
||||
|
||||
async def get_embedding(text: str, client: httpx.AsyncClient | None = None) -> list[float]:
    """Get embedding vector for text using Ollama.

    A caller-supplied client is reused; otherwise a temporary one is
    created and always closed before returning.
    """
    owns_client = client is None
    if owns_client:
        client = httpx.AsyncClient(timeout=60)

    try:
        response = await client.post(
            f"{settings.ollama_base_url}/api/embeddings",
            json={"model": settings.ollama_embed_model, "prompt": text},
        )
        response.raise_for_status()
        return response.json()["embedding"]
    finally:
        # Only close clients we created ourselves.
        if owns_client:
            await client.aclose()
|
||||
|
||||
|
||||
async def get_embeddings_batch(texts: list[str], client: httpx.AsyncClient) -> list[list[float]]:
    """Get embeddings for multiple texts sequentially (one request per text)."""
    return [await get_embedding(item, client) for item in texts]
|
||||
|
||||
|
||||
async def embed_experiences(batch_size: int = 20):
    """Chunk and embed all un-embedded experience reports.

    Finds Experience rows without any DocumentChunk of source_type
    "experience", chunks each report's text, embeds every chunk via
    Ollama, and inserts DocumentChunk rows. Commits every `batch_size`
    experiences and once more at the end.

    Args:
        batch_size: Number of experiences processed between commits.

    Returns:
        Total number of chunks created.
    """
    async with async_session() as db:
        # Find experiences that don't have chunks yet
        subq = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "experience"
        ).distinct()
        result = await db.execute(
            select(Experience).where(Experience.id.not_in(subq))
        )
        experiences = result.scalars().all()
        logger.info(f"Found {len(experiences)} experiences to embed")

        # One shared HTTP client for all embedding calls.
        async with httpx.AsyncClient(timeout=60) as client:
            total_chunks = 0
            for i, exp in enumerate(experiences):
                # Build a rich text representation: metadata header + body,
                # so the embedding also captures title/substance context.
                header = f"Experience Report: {exp.title}\n"
                header += f"Substance: {exp.substance}\n"
                if exp.category:
                    header += f"Category: {exp.category}\n"
                if exp.gender:
                    header += f"Gender: {exp.gender}\n"
                if exp.age:
                    header += f"Age: {exp.age}\n"
                header += "\n"

                full_text = header + exp.body
                chunks = chunk_text(full_text, settings.chunk_size, settings.chunk_overlap)

                for idx, chunk_text_content in enumerate(chunks):
                    embedding = await get_embedding(chunk_text_content, client)

                    # Stored as a JSON string in DocumentChunk.metadata_json.
                    metadata = json.dumps({
                        "title": exp.title,
                        "substance": exp.substance,
                        "category": exp.category,
                        "erowid_id": exp.erowid_id,
                    })

                    doc_chunk = DocumentChunk(
                        source_type="experience",
                        source_id=exp.id,
                        chunk_index=idx,
                        content=chunk_text_content,
                        metadata_json=metadata,
                        embedding=embedding,
                    )
                    db.add(doc_chunk)
                    total_chunks += 1

                # Periodic commit keeps the transaction small and makes
                # progress durable if the job is interrupted.
                if (i + 1) % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {i + 1} experiences ({total_chunks} chunks)")

            # Final commit for the remainder of the last batch.
            await db.commit()
            logger.info(f"Done! Created {total_chunks} chunks from {len(experiences)} experiences")
            return total_chunks
|
||||
|
||||
|
||||
async def embed_substances(batch_size: int = 10):
    """Chunk and embed all un-embedded substance info pages.

    Mirrors embed_experiences(): selects Substance rows without any
    DocumentChunk of source_type "substance", builds a sectioned text
    document per substance, embeds each chunk, and commits every
    `batch_size` substances plus a final commit.

    Args:
        batch_size: Number of substances processed between commits.

    Returns:
        Total number of chunks created.
    """
    async with async_session() as db:
        # Substances that have no chunks yet.
        subq = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "substance"
        ).distinct()
        result = await db.execute(
            select(Substance).where(Substance.id.not_in(subq))
        )
        substances = result.scalars().all()
        logger.info(f"Found {len(substances)} substances to embed")

        async with httpx.AsyncClient(timeout=60) as client:
            total_chunks = 0
            for i, sub in enumerate(substances):
                # Build rich text representation: only include sections the
                # scraped page actually had.
                sections = []
                sections.append(f"Substance Information: {sub.name}")
                if sub.category:
                    sections.append(f"Category: {sub.category}")
                if sub.description:
                    sections.append(f"\nOverview:\n{sub.description}")
                if sub.effects:
                    sections.append(f"\nEffects:\n{sub.effects}")
                if sub.dosage:
                    sections.append(f"\nDosage:\n{sub.dosage}")
                if sub.duration:
                    sections.append(f"\nDuration:\n{sub.duration}")
                if sub.chemistry:
                    sections.append(f"\nChemistry:\n{sub.chemistry}")
                if sub.health:
                    sections.append(f"\nHealth & Safety:\n{sub.health}")
                if sub.law:
                    sections.append(f"\nLegal Status:\n{sub.law}")

                full_text = "\n".join(sections)
                chunks = chunk_text(full_text, settings.chunk_size, settings.chunk_overlap)

                for idx, chunk_text_content in enumerate(chunks):
                    embedding = await get_embedding(chunk_text_content, client)

                    metadata = json.dumps({
                        "substance": sub.name,
                        "category": sub.category,
                    })

                    doc_chunk = DocumentChunk(
                        source_type="substance",
                        source_id=sub.id,
                        chunk_index=idx,
                        content=chunk_text_content,
                        metadata_json=metadata,
                        embedding=embedding,
                    )
                    db.add(doc_chunk)
                    total_chunks += 1

                # Commit per batch so progress survives interruptions.
                if (i + 1) % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {i + 1} substances ({total_chunks} chunks)")

            await db.commit()
            logger.info(f"Done! Created {total_chunks} chunks from {len(substances)} substances")
            return total_chunks
|
||||
|
||||
|
||||
async def embed_all():
    """Embed everything that hasn't been embedded yet.

    Returns a dict with per-source chunk counts.
    """
    return {
        "experience_chunks": await embed_experiences(),
        "substance_chunks": await embed_substances(),
    }
|
||||
|
|
@ -0,0 +1,119 @@
|
|||
import json
|
||||
import logging
|
||||
from typing import AsyncGenerator
|
||||
|
||||
import httpx
|
||||
|
||||
from app.config import settings
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from Ollama.

    Yields content tokens as they arrive from Ollama's NDJSON streaming
    /api/chat endpoint. A non-empty `system` prompt is prepended as the
    first message.
    """
    # Prepend the system prompt, then the conversation as given.
    all_messages = []
    if system:
        all_messages.append({"role": "system", "content": system})
    all_messages.extend(messages)

    payload = {
        "model": settings.ollama_chat_model,
        "messages": all_messages,
        "stream": True,
    }

    # Generous read timeout: a local model can take minutes to finish.
    timeout = httpx.Timeout(connect=30, read=600, write=30, pool=30)
    async with httpx.AsyncClient(timeout=timeout) as client:
        async with client.stream(
            "POST",
            f"{settings.ollama_base_url}/api/chat",
            json=payload,
        ) as resp:
            resp.raise_for_status()
            # Ollama streams newline-delimited JSON objects; byte chunks may
            # split a JSON object, so accumulate and only parse full lines.
            buffer = b""
            async for chunk in resp.aiter_bytes():
                buffer += chunk
                # Process complete JSON lines
                while b"\n" in buffer:
                    line, buffer = buffer.split(b"\n", 1)
                    line = line.strip()
                    if not line:
                        continue
                    try:
                        data = json.loads(line)
                        if "message" in data and "content" in data["message"]:
                            content = data["message"]["content"]
                            if content:
                                yield content
                        # Final object carries done=True; stop streaming.
                        if data.get("done"):
                            return
                    except json.JSONDecodeError:
                        continue
            # NOTE(review): a final line without a trailing newline would be
            # left in `buffer` unparsed — presumably Ollama always terminates
            # lines with "\n"; confirm.
|
||||
|
||||
|
||||
async def stream_claude(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from Claude API.

    Raises RuntimeError if the anthropic SDK is not installed.
    """
    try:
        from anthropic import AsyncAnthropic
    except ImportError:
        raise RuntimeError("anthropic package not installed")

    api_client = AsyncAnthropic(api_key=settings.anthropic_api_key)
    request_kwargs = dict(
        model="claude-sonnet-4-5-20250929",
        max_tokens=2048,
        system=system,
        messages=messages,
    )

    async with api_client.messages.stream(**request_kwargs) as stream:
        async for piece in stream.text_stream:
            yield piece
|
||||
|
||||
|
||||
async def stream_openai(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from OpenAI API.

    Raises RuntimeError if the openai SDK is not installed.
    """
    try:
        from openai import AsyncOpenAI
    except ImportError:
        raise RuntimeError("openai package not installed")

    api_client = AsyncOpenAI(api_key=settings.openai_api_key)

    # OpenAI takes the system prompt as a leading message.
    prompt_messages = ([{"role": "system", "content": system}] if system else [])
    prompt_messages += list(messages)

    stream = await api_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=prompt_messages,
        max_tokens=2048,
        stream=True,
    )

    async for event in stream:
        if event.choices and event.choices[0].delta.content:
            yield event.choices[0].delta.content
|
||||
|
||||
|
||||
async def stream_chat(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Route to the configured LLM provider and stream its tokens.

    Configuration problems (missing API key, unknown provider) are reported
    as a single yielded error string rather than an exception, so they reach
    the chat UI.
    """
    provider = settings.llm_provider.lower()

    if provider == "ollama":
        source = stream_ollama(messages, system)
    elif provider == "claude":
        if not settings.anthropic_api_key:
            yield "Error: ANTHROPIC_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
            return
        source = stream_claude(messages, system)
    elif provider == "openai":
        if not settings.openai_api_key:
            yield "Error: OPENAI_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
            return
        source = stream_openai(messages, system)
    else:
        yield f"Error: Unknown LLM_PROVIDER '{provider}'. Use ollama, claude, or openai."
        return

    async for token in source:
        yield token
|
||||
|
|
@ -0,0 +1,140 @@
|
|||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import uuid
|
||||
from contextlib import asynccontextmanager
|
||||
from pathlib import Path
|
||||
|
||||
from fastapi import FastAPI, Request
|
||||
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from app.config import settings
|
||||
from app.database import init_db, async_session
|
||||
from app.rag import chat_stream
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# In-memory session store (conversation history per session)
|
||||
sessions: dict[str, list[dict]] = {}
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """FastAPI lifespan: initialize the database before serving requests."""
    logger.info("Initializing database...")
    # Creates the pgvector extension and all ORM tables (idempotent).
    await init_db()
    logger.info("Database ready.")
    yield
|
||||
|
||||
|
||||
app = FastAPI(title="Erowid Bot", lifespan=lifespan)
|
||||
|
||||
# Serve static files
|
||||
static_dir = Path(__file__).parent / "static"
|
||||
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")
|
||||
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the single-page chat UI."""
    page = static_dir / "index.html"
    return page.read_text()
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Liveness probe: returns ok whenever the process is serving."""
    return {"status": "ok"}
|
||||
|
||||
|
||||
@app.post("/chat")
async def chat(request: Request):
    """Chat endpoint with streaming SSE response.

    Request body: {"message": str, "session_id": str (optional)}.
    Emits SSE "data:" events: {"token": ...} per token, {"error": ...} on
    failure, and a final {"done": true, "session_id": ...}.
    """
    body = await request.json()
    message = body.get("message", "").strip()
    session_id = body.get("session_id", "")

    if not message:
        return JSONResponse({"error": "Empty message"}, status_code=400)

    # First message of a conversation: mint a session id the client echoes back.
    if not session_id:
        session_id = str(uuid.uuid4())

    # Get or create conversation history.
    # NOTE(review): `sessions` is an in-memory dict that grows without bound
    # across sessions and is lost on restart — acceptable for a demo; confirm.
    history = sessions.get(session_id, [])

    async def generate():
        full_response = ""
        try:
            # Stream the RAG pipeline's tokens straight to the client.
            async for token in chat_stream(message, history):
                full_response += token
                yield f"data: {json.dumps({'token': token})}\n\n"
        except Exception as e:
            logger.error(f"Chat error: {e}")
            yield f"data: {json.dumps({'error': str(e)})}\n\n"

        # Save to history (even after an error, the partial response is kept).
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": full_response})
        # Keep history bounded to the last 20 messages (10 exchanges).
        if len(history) > 20:
            history[:] = history[-20:]
        sessions[session_id] = history

        yield f"data: {json.dumps({'done': True, 'session_id': session_id})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            # Disable proxy/nginx buffering so tokens flush immediately.
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",
        },
    )
|
||||
|
||||
|
||||
@app.get("/stats")
async def stats():
    """Get database stats: row counts for each content table."""
    from sqlalchemy import func, select
    from app.models import Experience, Substance, DocumentChunk

    async def _count(db, model):
        # COUNT(id) with a None-safe fallback for an empty result.
        return (await db.execute(select(func.count(model.id)))).scalar() or 0

    async with async_session() as db:
        return {
            "experiences": await _count(db, Experience),
            "substances": await _count(db, Substance),
            "chunks": await _count(db, DocumentChunk),
        }
|
||||
|
||||
|
||||
@app.post("/admin/scrape/experiences")
async def trigger_scrape_experiences(request: Request):
    """Trigger experience scraping (admin endpoint).

    Optional JSON body {"limit": N} caps the number of reports scraped.
    Fire-and-forget: the scrape runs as a background task.
    """
    # Prefix match instead of equality: the original `== "application/json"`
    # rejected valid headers like "application/json; charset=utf-8" and
    # silently ignored the caller's limit.
    content_type = request.headers.get("content-type", "")
    body = await request.json() if content_type.startswith("application/json") else {}
    limit = body.get("limit")

    from app.scraper.experiences import scrape_all_experiences
    # NOTE(review): asyncio keeps only a weak reference to tasks; consider
    # storing this reference so a long scrape cannot be garbage-collected.
    asyncio.create_task(scrape_all_experiences(limit=limit))
    return {"status": "started", "message": "Experience scraping started in background"}
|
||||
|
||||
|
||||
@app.post("/admin/scrape/substances")
async def trigger_scrape_substances(request: Request):
    """Trigger substance scraping (admin endpoint).

    Optional JSON body {"limit": N} caps the number of substances scraped.
    Fire-and-forget: the scrape runs as a background task.
    """
    # Prefix match: "application/json; charset=utf-8" must also parse the
    # body (the original equality check dropped the limit in that case).
    content_type = request.headers.get("content-type", "")
    body = await request.json() if content_type.startswith("application/json") else {}
    limit = body.get("limit")

    from app.scraper.substances import scrape_all_substances
    # NOTE(review): task reference is not retained — see experiences endpoint.
    asyncio.create_task(scrape_all_substances(limit=limit))
    return {"status": "started", "message": "Substance scraping started in background"}
|
||||
|
||||
|
||||
@app.post("/admin/embed")
async def trigger_embedding():
    """Trigger embedding pipeline (admin endpoint).

    Fire-and-forget: returns immediately while embed_all() runs on the
    event loop in the background.
    """
    from app.embeddings import embed_all
    # NOTE(review): asyncio holds only a weak reference to tasks; a long
    # embed job could in principle be garbage-collected — confirm/keep a ref.
    asyncio.create_task(embed_all())
    return {"status": "started", "message": "Embedding pipeline started in background"}
|
||||
|
|
@ -0,0 +1,56 @@
|
|||
from sqlalchemy import Column, Integer, String, Text, Boolean, Float, ForeignKey
|
||||
from sqlalchemy.dialects.postgresql import ARRAY, TIMESTAMP
|
||||
from pgvector.sqlalchemy import Vector
|
||||
from datetime import datetime, timezone
|
||||
from app.database import Base
|
||||
|
||||
|
||||
class Substance(Base):
    """Scraped Erowid substance vault page, one row per substance."""

    __tablename__ = "substances"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(255), unique=True, nullable=False, index=True)
    url = Column(String(1024))
    category = Column(String(255))  # e.g. "Psychedelics", "Stimulants"
    # Sectioned page content; any section may be NULL if missing on the page.
    description = Column(Text)
    effects = Column(Text)
    dosage = Column(Text)
    duration = Column(Text)
    chemistry = Column(Text)
    health = Column(Text)
    law = Column(Text)
    # Original page HTML kept for re-parsing without re-fetching.
    raw_html = Column(Text)
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class Experience(Base):
    """Scraped Erowid experience report, keyed by Erowid's own report ID."""

    __tablename__ = "experiences"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Erowid's exp.php?ID=... identifier; used for dedup on re-scrape.
    erowid_id = Column(Integer, unique=True, index=True)
    title = Column(String(512))
    author = Column(String(255))
    substance = Column(String(512))  # may list multiple substances
    substance_list = Column(ARRAY(String))  # parsed list
    body = Column(Text, nullable=False)
    category = Column(String(255))  # e.g. "General", "First Times", "Bad Trips"
    gender = Column(String(50))
    age = Column(String(50))
    year = Column(Integer)
    url = Column(String(1024))
    intensity = Column(String(100))
    # Original page HTML kept for re-parsing without re-fetching.
    raw_html = Column(Text)
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||
|
||||
|
||||
class DocumentChunk(Base):
    """Embedded text chunk for RAG retrieval, sourced from either table."""

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)
    source_type = Column(String(50), nullable=False, index=True)  # "experience" or "substance"
    # FK-by-convention into experiences.id or substances.id depending on source_type.
    source_id = Column(Integer, nullable=False, index=True)
    chunk_index = Column(Integer, nullable=False)
    content = Column(Text, nullable=False)
    metadata_json = Column(Text)  # JSON string with extra metadata
    embedding = Column(Vector(768))  # nomic-embed-text dimension
    created_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
|
||||
|
|
@ -0,0 +1,108 @@
|
|||
import json
|
||||
import logging
|
||||
from typing import AsyncGenerator
|
||||
|
||||
from sqlalchemy import select, text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import settings
|
||||
from app.database import async_session
|
||||
from app.embeddings import get_embedding
|
||||
from app.llm import stream_chat
|
||||
from app.models import DocumentChunk
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
SYSTEM_PROMPT = """You are the Erowid Knowledge Assistant focused on harm reduction. Provide accurate, non-judgmental substance info from the Erowid database. Prioritize safety. Never encourage drug use. Cite sources when possible. Say when info is limited."""
|
||||
|
||||
|
||||
async def retrieve_context(query: str, top_k: int | None = None) -> list[dict]:
    """Retrieve the most relevant document chunks for a query.

    Embeds the query with the same Ollama model used at index time, then
    runs a pgvector cosine-distance search over document_chunks.

    Args:
        query: Natural-language user query.
        top_k: Chunks to return; defaults to settings.retrieval_top_k.

    Returns:
        Chunks ordered by ascending cosine distance, each a dict with
        id/source_type/source_id/chunk_index/content/metadata/distance.
    """
    if top_k is None:
        top_k = settings.retrieval_top_k

    # Get query embedding
    query_embedding = await get_embedding(query)

    async with async_session() as db:
        # Use pgvector cosine distance for similarity search. The bind
        # parameter is cast explicitly: asyncpg has no codec for the pgvector
        # type, so letting Postgres infer :query_embedding as `vector`
        # fails — sending it as text and casting in SQL is the standard fix.
        result = await db.execute(
            text("""
                SELECT id, source_type, source_id, chunk_index, content, metadata_json,
                       embedding <=> CAST(:query_embedding AS vector) AS distance
                FROM document_chunks
                ORDER BY embedding <=> CAST(:query_embedding AS vector)
                LIMIT :top_k
            """),
            {"query_embedding": str(query_embedding), "top_k": top_k},
        )

        chunks = []
        for row in result.fetchall():
            metadata = {}
            if row[5]:
                try:
                    metadata = json.loads(row[5])
                except json.JSONDecodeError:
                    pass  # tolerate malformed metadata; keep the chunk

            chunks.append({
                "id": row[0],
                "source_type": row[1],
                "source_id": row[2],
                "chunk_index": row[3],
                "content": row[4],
                "metadata": metadata,
                "distance": row[6],
            })

        return chunks
|
||||
|
||||
|
||||
def build_context_prompt(chunks: list[dict]) -> str:
    """Render retrieved chunks as a numbered context section for the LLM.

    Returns a placeholder notice when no chunks were retrieved.
    """
    if not chunks:
        return "\n[No relevant documents found in the database.]\n"

    rendered = []
    for position, chunk in enumerate(chunks, 1):
        meta = chunk["metadata"]

        label = f"--- Source {position} ({chunk['source_type'].title()})"
        if "title" in meta:
            label += f" | {meta['title']}"
        if "substance" in meta:
            label += f" | Substance: {meta['substance']}"
        label += " ---"

        # Cap each chunk so a handful of sources stays within prompt budget.
        excerpt = chunk["content"][:800]
        rendered.append(label + "\n" + excerpt)

    return "\n\n".join(rendered)
|
||||
|
||||
|
||||
async def chat_stream(
    user_message: str,
    conversation_history: list[dict] | None = None,
) -> AsyncGenerator[str, None]:
    """Full RAG pipeline: retrieve context, build prompt, stream LLM response.

    Args:
        user_message: The latest user message to answer.
        conversation_history: Prior messages as {"role", "content"} dicts;
            only the last 10 are forwarded to the model.

    Yields:
        Response tokens from the configured LLM provider.
    """
    # Retrieve relevant chunks via pgvector similarity search.
    chunks = await retrieve_context(user_message)

    # Build the context-augmented system prompt: retrieved data is injected
    # into the system message, not the user message.
    context_text = build_context_prompt(chunks)
    full_system = f"{SYSTEM_PROMPT}\n\n--- RELEVANT EROWID DATA ---\n{context_text}\n--- END EROWID DATA ---"

    # Build message history
    messages = []
    if conversation_history:
        # Keep last 10 messages for context
        messages = conversation_history[-10:]

    messages.append({"role": "user", "content": user_message})

    # Stream from LLM
    async for token in stream_chat(messages, system=full_system):
        yield token
|
||||
|
|
@ -0,0 +1,244 @@
|
|||
import asyncio
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import settings
|
||||
from app.database import async_session
|
||||
from app.models import Experience
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BASE_URL = "https://erowid.org"
|
||||
EXP_LIST_URL = "https://erowid.org/experiences/exp_list.shtml"
|
||||
REPORT_URL = "https://erowid.org/experiences/exp.php?ID={id}"
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "ErowidResearchBot/1.0 (educational research project)",
|
||||
"Accept": "text/html,application/xhtml+xml",
|
||||
}
|
||||
|
||||
|
||||
async def get_all_substance_pages(client: httpx.AsyncClient) -> list[dict]:
    """Get main substance experience listing pages from the master index.

    Only fetches top-level substance pages (e.g. exp_LSD.shtml), not
    category sub-pages (e.g. exp_LSD_General.shtml) since the main page
    already contains all report IDs for that substance.

    Returns:
        List of {"name": link text, "url": absolute URL} dicts, deduplicated
        by URL.
    """
    # Category sub-pages have suffixes like _General, _First_Times, etc.;
    # they contain subsets of the main page, so they are skipped.
    # Hoisted out of the link loop — the list is invariant per call.
    known_categories = [
        "_General", "_First_Times", "_Combinations", "_Retrospective",
        "_Preparation", "_Difficult_Experiences", "_Bad_Trips",
        "_Health_Problems", "_Train_Wrecks", "_Glowing_Experiences",
        "_Mystical_Experiences", "_Health_Benefits", "_What_Was_in_That",
        "_Medical_Use", "_Performance_Enhancement", "_Addiction",
    ]

    resp = await client.get(EXP_LIST_URL, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    pages = []
    seen_substances = set()
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        name = a.get_text(strip=True)
        if not href.startswith("subs/exp_") or not href.endswith(".shtml") or not name:
            continue

        full_url = f"https://erowid.org/experiences/{href}"

        # Extract the base substance name from the URL
        # e.g. subs/exp_LSD.shtml -> LSD, subs/exp_LSD_General.shtml -> LSD
        filename = href.replace("subs/exp_", "").replace(".shtml", "")

        is_category = any(filename.endswith(cat) for cat in known_categories)
        if is_category:
            continue

        if full_url not in seen_substances:
            seen_substances.add(full_url)
            pages.append({"name": name, "url": full_url})

    logger.info(f"Found {len(pages)} main substance experience pages (filtered from category sub-pages)")
    return pages
|
||||
|
||||
|
||||
async def get_experience_ids_from_page(client: httpx.AsyncClient, url: str) -> list[int]:
    """Extract all experience report IDs from a substance listing page.

    Returns an empty list on any HTTP failure (logged, not raised).
    """
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch {url}: {e}")
        return []

    # exp.php?ID=NNN links carry the report IDs; dedupe via a set.
    unique_ids = {int(m) for m in re.findall(r"exp\.php\?ID=(\d+)", resp.text)}
    return list(unique_ids)
|
||||
|
||||
|
||||
async def get_all_experience_ids(client: httpx.AsyncClient) -> list[int]:
    """Collect all unique experience IDs from all substance pages.

    Fetches pages concurrently in batches of 5 for speed.

    Returns:
        Sorted list of unique Erowid report IDs.
    """
    pages = await get_all_substance_pages(client)

    all_ids = set()
    batch_size = 5
    for i in range(0, len(pages), batch_size):
        batch = pages[i : i + batch_size]
        tasks = [get_experience_ids_from_page(client, p["url"]) for p in batch]
        # return_exceptions=True: one failed page must not abort the batch.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            # Exceptions are silently dropped here; per-page failures were
            # already logged inside get_experience_ids_from_page.
            if isinstance(result, list):
                all_ids.update(result)
        logger.info(f"Scanned {min(i + batch_size, len(pages))}/{len(pages)} pages, {len(all_ids)} unique IDs")
        # Small pause between batches to stay polite to the server.
        await asyncio.sleep(0.5)

    logger.info(f"Found {len(all_ids)} unique experience IDs total")
    return sorted(all_ids)
|
||||
|
||||
|
||||
async def scrape_experience_report(client: httpx.AsyncClient, erowid_id: int) -> dict | None:
    """Scrape a single experience report.

    Returns a dict of Experience column values, or None when the fetch
    fails, the page has no report body, or the body is too short to be a
    real report.
    """
    url = REPORT_URL.format(id=erowid_id)
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch experience {erowid_id}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the main report body
    body_div = soup.select_one("div.report-text-surround")
    if not body_div:
        logger.warning(f"No report body found for {erowid_id}")
        return None

    # Remove the dosechart from body text to avoid duplication
    body_text_parts = []
    for el in body_div.children:
        if hasattr(el, "name") and el.name == "table":
            continue  # skip dosechart table
        text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip()
        if text:
            body_text_parts.append(text)

    body = "\n\n".join(body_text_parts)
    # Reject stub pages: a genuine report is always longer than 50 chars.
    if not body or len(body) < 50:
        return None

    # Extract metadata
    title = ""
    title_el = soup.select_one("div.title")
    if title_el:
        title = title_el.get_text(strip=True)

    substance = ""
    sub_el = soup.select_one("div.substance")
    if sub_el:
        substance = sub_el.get_text(strip=True)

    # "LSD & MDMA" / "LSD, Cannabis" -> ["LSD", "MDMA"] etc.
    substance_list = [s.strip() for s in re.split(r"[,&]", substance) if s.strip()]

    author = ""
    author_el = soup.select_one("div.author")
    if author_el:
        # removeprefix instead of the original replace("by ", ""): replace
        # also mangled names merely containing "by " (e.g. "Toby Smith").
        author = author_el.get_text(strip=True).removeprefix("by ")

    # Dosage info
    dose_table = soup.select_one("table.dosechart")
    dose_text = ""
    if dose_table:
        rows = dose_table.select("tr")
        dose_parts = []
        for row in rows:
            cells = row.select("td")
            row_text = " ".join(c.get_text(strip=True) for c in cells if c.get_text(strip=True))
            if row_text:
                dose_parts.append(row_text)
        dose_text = "; ".join(dose_parts)

    if dose_text:
        body = f"Dosage: {dose_text}\n\n{body}"

    # Weight/gender from the body weight line inside report-text-surround
    gender = ""
    age = ""
    weight_el = soup.select_one("table.bodyweight")
    if weight_el:
        wt = weight_el.get_text(strip=True)
        age_match = re.search(r"(\d+)\s*yr", wt, re.IGNORECASE)
        if age_match:
            age = age_match.group(1)
        # Check "female" FIRST: the original tested "male" first, and since
        # "female" contains "male" as a substring, every female report was
        # mis-labelled "Male".
        if "female" in wt.lower():
            gender = "Female"
        elif "male" in wt.lower():
            gender = "Male"

    # Try to extract category from the page
    category = ""
    cat_el = soup.select_one("div.foot-eroid-cat")
    if cat_el:
        category = cat_el.get_text(strip=True)

    return {
        "erowid_id": erowid_id,
        "title": title,
        "author": author,
        "substance": substance,
        "substance_list": substance_list,
        "body": body,
        "category": category,
        "gender": gender,
        "age": age,
        "url": url,
        "raw_html": resp.text,
    }
|
||||
|
||||
|
||||
async def scrape_all_experiences(limit: int | None = None) -> int:
    """Main scraper entry point. Scrapes all experience reports into the database.

    Fetches every known experience ID, skips the ones already stored, then
    scrapes the remainder with a polite delay between requests and periodic
    batch commits so progress survives interruptions.

    Args:
        limit: If given, scrape at most this many IDs. An explicit 0 scrapes
            nothing (the original ``if limit:`` silently ignored 0).

    Returns:
        Number of newly scraped experiences.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        all_ids = await get_all_experience_ids(client)

        # `is not None` so an explicit limit of 0 is honored rather than
        # being treated as "no limit".
        if limit is not None:
            all_ids = all_ids[:limit]

        async with async_session() as db:
            # Skip reports we already have in the database.
            result = await db.execute(select(Experience.erowid_id))
            existing_ids = {row[0] for row in result.fetchall()}
            logger.info(f"Already have {len(existing_ids)} experiences in DB")

            to_scrape = [eid for eid in all_ids if eid not in existing_ids]
            logger.info(f"Need to scrape {len(to_scrape)} new experiences")

            scraped = 0
            errors = 0
            for eid in to_scrape:
                data = await scrape_experience_report(client, eid)
                if data:
                    db.add(Experience(**data))
                    scraped += 1

                    # Commit in batches so a crash loses at most one batch.
                    if scraped % settings.scrape_batch_size == 0:
                        await db.commit()
                        logger.info(f"Committed batch: {scraped}/{len(to_scrape)} scraped ({errors} errors)")
                else:
                    errors += 1

                # Rate-limit requests to be polite to the server.
                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new experiences ({errors} errors)")
            return scraped
|
||||
|
|
@ -0,0 +1,171 @@
|
|||
import asyncio
|
||||
import re
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
|
||||
import httpx
|
||||
from bs4 import BeautifulSoup
|
||||
from sqlalchemy import select
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.config import settings
|
||||
from app.database import async_session
|
||||
from app.models import Substance
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
BASE_URL = "https://erowid.org"
|
||||
VAULT_INDEX = "https://erowid.org/chemicals/"
|
||||
|
||||
HEADERS = {
|
||||
"User-Agent": "ErowidResearchBot/1.0 (educational research project)",
|
||||
"Accept": "text/html,application/xhtml+xml",
|
||||
}
|
||||
|
||||
# Known substance categories on Erowid
|
||||
CATEGORIES = [
|
||||
"Psychedelics", "Empathogens", "Stimulants", "Depressants",
|
||||
"Dissociatives", "Cannabis", "Opioids", "Nootropics",
|
||||
"Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
|
||||
]
|
||||
|
||||
|
||||
async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Get all substance vault URLs from the chemicals and plants indexes.

    Returns a list of dicts with ``name``, ``url`` and (for plants entries)
    ``category``. Index pages repeat the same link several times, so results
    are de-duplicated by URL, keeping the first occurrence.
    """
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    substances: list[dict] = []
    seen_urls: set[str] = set()

    def _add(entry: dict) -> None:
        # De-duplicate: vault indexes list the same target more than once.
        if entry["url"] not in seen_urls:
            seen_urls.add(entry["url"])
            substances.append(entry)

    # The chemicals index lists substances with links to their vaults
    for link in soup.select("a[href]"):
        href = link.get("href", "")
        text = link.get_text(strip=True)
        # Filter for substance vault links (e.g., /chemicals/lsd/, /chemicals/psilocybin/)
        if re.match(r"^/chemicals/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
            if text and len(text) > 1 and not text.startswith("["):
                full_url = (
                    href if href.startswith("http")
                    else f"{BASE_URL}{href}" if href.startswith("/")
                    else f"{VAULT_INDEX}{href}"
                )
                _add({"name": text, "url": full_url})

    # Also scrape the plants/herbs section
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
        soup2 = BeautifulSoup(resp2.text, "lxml")
        for link in soup2.select("a[href]"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            if re.match(r"^/plants/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
                if text and len(text) > 1 and not text.startswith("["):
                    full_url = (
                        href if href.startswith("http")
                        else f"{BASE_URL}{href}" if href.startswith("/")
                        else f"{plants_url}{href}"
                    )
                    _add({"name": text, "url": full_url, "category": "Plants & Herbs"})
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")

    logger.info(f"Found {len(substances)} substance URLs")
    return substances
|
||||
|
||||
|
||||
async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault page for key information.

    Fetches the vault's main page and probes well-known sub-pages
    (effects, dose, duration, chemistry, health, law), returning a dict of
    extracted text sections plus the raw HTML. Returns None when the main
    page cannot be fetched.
    """
    try:
        response = await client.get(url, headers=HEADERS)
        response.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch substance {name}: {e}")
        return None

    raw_html = response.text
    page = BeautifulSoup(raw_html, "lxml")

    # Main-page text: prefer the summary container, fall back to the whole body.
    main_el = (
        page.select_one("div.sum-content")
        or page.select_one("td.content")
        or page.select_one("body")
    )
    description = main_el.get_text(separator="\n", strip=True)[:5000] if main_el else ""

    # Candidate relative paths for each sub-page section.
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }

    found: dict[str, str] = {}
    for section_name, candidates in sub_pages.items():
        for path in candidates:
            probe_url = f"{url.rstrip('/')}/{path}"
            try:
                probe = await client.get(probe_url, headers=HEADERS)
                if probe.status_code == 200:
                    probe_soup = BeautifulSoup(probe.text, "lxml")
                    content_el = probe_soup.select_one("div.sum-content") or probe_soup.select_one("td.content")
                    if content_el:
                        found[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
                        break
                # Small pause between probes to stay polite to the server.
                await asyncio.sleep(0.5)
            except httpx.HTTPError:
                continue

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": found.get("effects", ""),
        "dosage": found.get("dosage", ""),
        "duration": found.get("duration", ""),
        "chemistry": found.get("chemistry", ""),
        "health": found.get("health", ""),
        "law": found.get("law", ""),
        "raw_html": raw_html,
    }
|
||||
|
||||
|
||||
async def scrape_all_substances(limit: int | None = None) -> int:
    """Main entry point: scrape all substance vaults into the database.

    Discovers substance vault URLs, skips substances already stored
    (matched case-insensitively by name), then scrapes the rest with a
    polite delay and periodic commits.

    Args:
        limit: If given, scrape at most this many substances. An explicit 0
            scrapes nothing (the original ``if limit:`` silently ignored 0).

    Returns:
        Number of newly scraped substances.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)

        # `is not None` so an explicit limit of 0 is honored.
        if limit is not None:
            substance_list = substance_list[:limit]

        async with async_session() as db:
            # Get already-scraped substances (case-insensitive name match).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info(f"Already have {len(existing)} substances in DB")

            to_scrape = [s for s in substance_list if s["name"].lower() not in existing]
            logger.info(f"Need to scrape {len(to_scrape)} new substances")

            scraped = 0
            for sub_meta in to_scrape:
                data = await scrape_substance_vault(
                    client, sub_meta["name"], sub_meta["url"]
                )
                if data:
                    data["category"] = sub_meta.get("category", "")
                    db.add(Substance(**data))
                    scraped += 1

                    # Commit periodically so progress survives interruptions.
                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info(f"Committed: {scraped} substances scraped")

                # Rate-limit requests to be polite to the server.
                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new substances")
            return scraped
|
||||
|
|
@ -0,0 +1,157 @@
|
|||
const chatContainer = document.getElementById("chat-container");
|
||||
const messageInput = document.getElementById("message-input");
|
||||
const sendBtn = document.getElementById("send-btn");
|
||||
const welcomeEl = document.getElementById("welcome");
|
||||
const statsEl = document.getElementById("stats");
|
||||
|
||||
let sessionId = localStorage.getItem("erowid_session") || "";
|
||||
let isStreaming = false;
|
||||
|
||||
// Load stats
|
||||
// Fetch corpus statistics from the backend and show them in the header badge.
// Falls back to a "connecting..." placeholder if the request fails.
async function loadStats() {
  try {
    const response = await fetch("/stats");
    const stats = await response.json();
    statsEl.textContent =
      `${stats.experiences} reports | ${stats.substances} substances | ${stats.chunks} chunks`;
  } catch {
    statsEl.textContent = "connecting...";
  }
}
loadStats();
|
||||
|
||||
// Auto-resize textarea
|
||||
// Auto-resize the textarea as the user types, capped at 120px tall.
messageInput.addEventListener("input", () => {
  messageInput.style.height = "auto";
  messageInput.style.height = `${Math.min(messageInput.scrollHeight, 120)}px`;
});

// Enter sends the message; Shift+Enter inserts a newline.
messageInput.addEventListener("keydown", (e) => {
  if (e.key !== "Enter" || e.shiftKey) return;
  e.preventDefault();
  sendMessage();
});

sendBtn.addEventListener("click", sendMessage);

// Clicking a suggestion chip submits its text as a message.
for (const chip of document.querySelectorAll(".suggestion")) {
  chip.addEventListener("click", () => {
    messageInput.value = chip.textContent;
    sendMessage();
  });
}
|
||||
|
||||
// Append a chat bubble for `role` ("user" | "assistant") containing `content`.
// Hides the welcome panel on first use and keeps the view scrolled to the
// bottom. Returns the content element so streaming code can append to it.
function addMessage(role, content) {
  if (welcomeEl) welcomeEl.style.display = "none";

  const row = document.createElement("div");
  row.className = `message ${role}`;

  const badge = document.createElement("div");
  badge.className = "message-avatar";
  badge.textContent = role === "user" ? "You" : "E";

  const body = document.createElement("div");
  body.className = "message-content";
  body.textContent = content;

  row.append(badge, body);
  chatContainer.appendChild(row);
  chatContainer.scrollTop = chatContainer.scrollHeight;

  return body;
}
|
||||
|
||||
// Show an animated "assistant is typing" bubble at the end of the chat.
// Tagged with id "typing-indicator" so removeTypingIndicator can find it.
function addTypingIndicator() {
  const row = document.createElement("div");
  row.className = "message assistant";
  row.id = "typing-indicator";

  const badge = document.createElement("div");
  badge.className = "message-avatar";
  badge.textContent = "E";

  const body = document.createElement("div");
  body.className = "message-content";
  body.innerHTML = '<div class="typing"><span></span><span></span><span></span></div>';

  row.append(badge, body);
  chatContainer.appendChild(row);
  chatContainer.scrollTop = chatContainer.scrollHeight;
}
|
||||
|
||||
// Remove the animated typing indicator, if one is currently showing.
function removeTypingIndicator() {
  document.getElementById("typing-indicator")?.remove();
}
|
||||
|
||||
// Send the current input to POST /chat and stream the SSE reply into a new
// assistant bubble. Tracks the server-issued session id in localStorage so
// conversation context persists across page loads.
async function sendMessage() {
  const text = messageInput.value.trim();
  if (!text || isStreaming) return;

  isStreaming = true;
  sendBtn.disabled = true;
  messageInput.value = "";
  messageInput.style.height = "auto";

  addMessage("user", text);
  addTypingIndicator();

  try {
    const resp = await fetch("/chat", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ message: text, session_id: sessionId }),
    });

    // Fix: surface HTTP errors instead of silently reading an error body
    // as if it were an SSE stream (which rendered an empty bubble).
    if (!resp.ok) {
      throw new Error(`HTTP ${resp.status}`);
    }

    removeTypingIndicator();
    const contentEl = addMessage("assistant", "");

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    let fullResponse = "";

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      // SSE frames are newline-delimited; keep any partial line buffered.
      const lines = buffer.split("\n");
      buffer = lines.pop() || "";

      for (const line of lines) {
        if (!line.startsWith("data: ")) continue;
        const jsonStr = line.slice(6).trim();
        if (!jsonStr) continue;

        try {
          const data = JSON.parse(jsonStr);
          if (data.token) {
            fullResponse += data.token;
            contentEl.textContent = fullResponse;
            chatContainer.scrollTop = chatContainer.scrollHeight;
          }
          if (data.session_id) {
            sessionId = data.session_id;
            localStorage.setItem("erowid_session", sessionId);
          }
          if (data.error) {
            contentEl.textContent = `Error: ${data.error}`;
          }
        } catch {
          // Ignore malformed SSE frames (e.g. keep-alive comments).
        }
      }
    }
  } catch (err) {
    removeTypingIndicator();
    addMessage("assistant", `Connection error: ${err.message}`);
  } finally {
    // Always re-enable input, even if the stream loop throws mid-read.
    isStreaming = false;
    sendBtn.disabled = false;
    messageInput.focus();
  }
}
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Erowid Bot</title>
|
||||
<link rel="stylesheet" href="/static/style.css">
|
||||
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap" rel="stylesheet">
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<div class="header">
|
||||
<div class="header-left">
|
||||
<div class="logo">Erowid Bot</div>
|
||||
<div class="tagline">Harm Reduction Knowledge Assistant</div>
|
||||
</div>
|
||||
<div class="stats-badge" id="stats">loading...</div>
|
||||
</div>
|
||||
|
||||
<div class="chat-container" id="chat-container">
|
||||
<div class="welcome" id="welcome">
|
||||
<h2>Explore the Erowid Database</h2>
|
||||
<p>
|
||||
Ask questions about substances, experience reports, dosage information,
|
||||
effects, safety, and more. All information is sourced from the Erowid vault
|
||||
and experience reports. This bot prioritizes harm reduction and safety.
|
||||
</p>
|
||||
<div class="suggestions">
|
||||
<button class="suggestion">What are the effects of psilocybin?</button>
|
||||
<button class="suggestion">Tell me about safe MDMA dosing</button>
|
||||
<button class="suggestion">What do people report about DMT experiences?</button>
|
||||
<button class="suggestion">What are dangerous drug combinations?</button>
|
||||
<button class="suggestion">Compare LSD and psilocybin experiences</button>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="input-container">
|
||||
<div class="input-wrapper">
|
||||
<textarea
|
||||
id="message-input"
|
||||
placeholder="Ask about substances, experiences, safety..."
|
||||
rows="1"
|
||||
></textarea>
|
||||
<button id="send-btn">
|
||||
<svg viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<line x1="22" y1="2" x2="11" y2="13"></line>
|
||||
<polygon points="22 2 15 22 11 13 2 9 22 2"></polygon>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
<div class="disclaimer">
|
||||
Information sourced from Erowid.org. Not medical advice. Always practice harm reduction.
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script src="/static/app.js"></script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,326 @@
|
|||
* {
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
:root {
|
||||
--bg-primary: #0a0a0f;
|
||||
--bg-secondary: #12121a;
|
||||
--bg-tertiary: #1a1a2e;
|
||||
--text-primary: #e0e0e8;
|
||||
--text-secondary: #8888a0;
|
||||
--accent: #6c5ce7;
|
||||
--accent-glow: rgba(108, 92, 231, 0.3);
|
||||
--user-bg: #2d2d44;
|
||||
--bot-bg: #1a1a2e;
|
||||
--border: #2a2a3e;
|
||||
--danger: #e74c3c;
|
||||
--success: #2ecc71;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif;
|
||||
background: var(--bg-primary);
|
||||
color: var(--text-primary);
|
||||
height: 100vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
/* Header */
|
||||
.header {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: space-between;
|
||||
padding: 12px 20px;
|
||||
background: var(--bg-secondary);
|
||||
border-bottom: 1px solid var(--border);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.header-left {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.logo {
|
||||
font-size: 24px;
|
||||
font-weight: 700;
|
||||
background: linear-gradient(135deg, var(--accent), #a29bfe);
|
||||
-webkit-background-clip: text;
|
||||
-webkit-text-fill-color: transparent;
|
||||
background-clip: text;
|
||||
}
|
||||
|
||||
.tagline {
|
||||
font-size: 12px;
|
||||
color: var(--text-secondary);
|
||||
letter-spacing: 0.5px;
|
||||
}
|
||||
|
||||
.stats-badge {
|
||||
font-size: 11px;
|
||||
color: var(--text-secondary);
|
||||
background: var(--bg-tertiary);
|
||||
padding: 4px 10px;
|
||||
border-radius: 12px;
|
||||
border: 1px solid var(--border);
|
||||
}
|
||||
|
||||
/* Chat area */
|
||||
.chat-container {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
padding: 20px;
|
||||
scroll-behavior: smooth;
|
||||
}
|
||||
|
||||
.chat-container::-webkit-scrollbar {
|
||||
width: 6px;
|
||||
}
|
||||
.chat-container::-webkit-scrollbar-track {
|
||||
background: transparent;
|
||||
}
|
||||
.chat-container::-webkit-scrollbar-thumb {
|
||||
background: var(--border);
|
||||
border-radius: 3px;
|
||||
}
|
||||
|
||||
.welcome {
|
||||
text-align: center;
|
||||
padding: 60px 20px;
|
||||
max-width: 600px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
.welcome h2 {
|
||||
font-size: 22px;
|
||||
margin-bottom: 12px;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.welcome p {
|
||||
color: var(--text-secondary);
|
||||
line-height: 1.6;
|
||||
margin-bottom: 20px;
|
||||
font-size: 14px;
|
||||
}
|
||||
|
||||
.suggestions {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 8px;
|
||||
justify-content: center;
|
||||
}
|
||||
|
||||
.suggestion {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border);
|
||||
color: var(--text-secondary);
|
||||
padding: 8px 14px;
|
||||
border-radius: 20px;
|
||||
font-size: 13px;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s;
|
||||
}
|
||||
|
||||
.suggestion:hover {
|
||||
border-color: var(--accent);
|
||||
color: var(--text-primary);
|
||||
background: rgba(108, 92, 231, 0.1);
|
||||
}
|
||||
|
||||
/* Messages */
|
||||
.message {
|
||||
display: flex;
|
||||
gap: 12px;
|
||||
margin-bottom: 16px;
|
||||
max-width: 800px;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
animation: fadeIn 0.3s ease;
|
||||
}
|
||||
|
||||
@keyframes fadeIn {
|
||||
from { opacity: 0; transform: translateY(8px); }
|
||||
to { opacity: 1; transform: translateY(0); }
|
||||
}
|
||||
|
||||
.message.user {
|
||||
flex-direction: row-reverse;
|
||||
}
|
||||
|
||||
.message-avatar {
|
||||
width: 32px;
|
||||
height: 32px;
|
||||
border-radius: 8px;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-size: 14px;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.message.user .message-avatar {
|
||||
background: var(--user-bg);
|
||||
}
|
||||
|
||||
.message.assistant .message-avatar {
|
||||
background: var(--accent);
|
||||
}
|
||||
|
||||
.message-content {
|
||||
padding: 10px 16px;
|
||||
border-radius: 12px;
|
||||
max-width: 75%;
|
||||
line-height: 1.6;
|
||||
font-size: 14px;
|
||||
white-space: pre-wrap;
|
||||
word-wrap: break-word;
|
||||
}
|
||||
|
||||
.message.user .message-content {
|
||||
background: var(--user-bg);
|
||||
border-bottom-right-radius: 4px;
|
||||
}
|
||||
|
||||
.message.assistant .message-content {
|
||||
background: var(--bot-bg);
|
||||
border: 1px solid var(--border);
|
||||
border-bottom-left-radius: 4px;
|
||||
}
|
||||
|
||||
.message-content p {
|
||||
margin-bottom: 8px;
|
||||
}
|
||||
.message-content p:last-child {
|
||||
margin-bottom: 0;
|
||||
}
|
||||
|
||||
.message-content strong {
|
||||
color: #a29bfe;
|
||||
}
|
||||
|
||||
.message-content code {
|
||||
background: rgba(108, 92, 231, 0.15);
|
||||
padding: 1px 5px;
|
||||
border-radius: 3px;
|
||||
font-size: 13px;
|
||||
}
|
||||
|
||||
/* Typing indicator */
|
||||
.typing {
|
||||
display: flex;
|
||||
gap: 4px;
|
||||
padding: 4px 0;
|
||||
}
|
||||
|
||||
.typing span {
|
||||
width: 6px;
|
||||
height: 6px;
|
||||
background: var(--text-secondary);
|
||||
border-radius: 50%;
|
||||
animation: bounce 1.4s infinite;
|
||||
}
|
||||
|
||||
.typing span:nth-child(2) { animation-delay: 0.2s; }
|
||||
.typing span:nth-child(3) { animation-delay: 0.4s; }
|
||||
|
||||
@keyframes bounce {
|
||||
0%, 60%, 100% { transform: translateY(0); }
|
||||
30% { transform: translateY(-6px); }
|
||||
}
|
||||
|
||||
/* Input area */
|
||||
.input-container {
|
||||
padding: 16px 20px;
|
||||
background: var(--bg-secondary);
|
||||
border-top: 1px solid var(--border);
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.input-wrapper {
|
||||
display: flex;
|
||||
gap: 10px;
|
||||
max-width: 800px;
|
||||
margin: 0 auto;
|
||||
}
|
||||
|
||||
#message-input {
|
||||
flex: 1;
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border);
|
||||
color: var(--text-primary);
|
||||
padding: 12px 16px;
|
||||
border-radius: 12px;
|
||||
font-size: 14px;
|
||||
font-family: inherit;
|
||||
resize: none;
|
||||
outline: none;
|
||||
min-height: 44px;
|
||||
max-height: 120px;
|
||||
transition: border-color 0.2s;
|
||||
}
|
||||
|
||||
#message-input:focus {
|
||||
border-color: var(--accent);
|
||||
box-shadow: 0 0 0 2px var(--accent-glow);
|
||||
}
|
||||
|
||||
#message-input::placeholder {
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
#send-btn {
|
||||
background: var(--accent);
|
||||
color: white;
|
||||
border: none;
|
||||
width: 44px;
|
||||
height: 44px;
|
||||
border-radius: 12px;
|
||||
cursor: pointer;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
transition: all 0.2s;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
#send-btn:hover {
|
||||
background: #5b4bd5;
|
||||
transform: scale(1.05);
|
||||
}
|
||||
|
||||
#send-btn:disabled {
|
||||
opacity: 0.5;
|
||||
cursor: not-allowed;
|
||||
transform: none;
|
||||
}
|
||||
|
||||
#send-btn svg {
|
||||
width: 18px;
|
||||
height: 18px;
|
||||
}
|
||||
|
||||
.disclaimer {
|
||||
text-align: center;
|
||||
font-size: 11px;
|
||||
color: var(--text-secondary);
|
||||
margin-top: 8px;
|
||||
max-width: 800px;
|
||||
margin-left: auto;
|
||||
margin-right: auto;
|
||||
}
|
||||
|
||||
/* Mobile */
|
||||
@media (max-width: 640px) {
|
||||
.header { padding: 10px 14px; }
|
||||
.chat-container { padding: 12px; }
|
||||
.message-content { max-width: 85%; font-size: 13px; }
|
||||
.input-container { padding: 10px 14px; }
|
||||
.suggestions { gap: 6px; }
|
||||
.suggestion { font-size: 12px; padding: 6px 12px; }
|
||||
}
|
||||
|
|
@ -0,0 +1,50 @@
|
|||
services:
|
||||
erowid-bot:
|
||||
build: .
|
||||
container_name: erowid-bot
|
||||
restart: unless-stopped
|
||||
env_file: .env
|
||||
ports:
|
||||
- "8421:8000"
|
||||
depends_on:
|
||||
erowid-db:
|
||||
condition: service_healthy
|
||||
networks:
|
||||
- default
|
||||
- traefik-public
|
||||
- ai-internal
|
||||
labels:
|
||||
- "traefik.enable=true"
|
||||
- "traefik.http.routers.erowid-bot.rule=Host(`erowid.jeffemmett.com`)"
|
||||
- "traefik.http.routers.erowid-bot.entrypoints=websecure"
|
||||
- "traefik.http.routers.erowid-bot.tls.certresolver=letsencrypt"
|
||||
- "traefik.http.services.erowid-bot.loadbalancer.server.port=8000"
|
||||
volumes:
|
||||
- ./app:/app/app
|
||||
|
||||
erowid-db:
|
||||
image: pgvector/pgvector:pg16
|
||||
container_name: erowid-db
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
POSTGRES_USER: ${POSTGRES_USER:-erowid}
|
||||
POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-erowid}
|
||||
POSTGRES_DB: ${POSTGRES_DB:-erowid}
|
||||
volumes:
|
||||
- erowid-db-data:/var/lib/postgresql/data
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U erowid"]
|
||||
interval: 5s
|
||||
timeout: 5s
|
||||
retries: 5
|
||||
networks:
|
||||
- default
|
||||
|
||||
volumes:
|
||||
erowid-db-data:
|
||||
|
||||
networks:
|
||||
traefik-public:
|
||||
external: true
|
||||
ai-internal:
|
||||
external: true
|
||||
|
|
@ -0,0 +1,16 @@
|
|||
fastapi==0.115.6
|
||||
uvicorn[standard]==0.34.0
|
||||
sqlalchemy==2.0.36
|
||||
asyncpg==0.30.0
|
||||
pgvector==0.3.6
|
||||
psycopg2-binary==2.9.10
|
||||
httpx==0.28.1
|
||||
python-dotenv==1.0.1
|
||||
beautifulsoup4==4.12.3
|
||||
lxml==5.3.0
|
||||
pydantic==2.10.3
|
||||
pydantic-settings==2.7.0
|
||||
sse-starlette==2.2.1
|
||||
tiktoken==0.8.0
|
||||
anthropic==0.40.0
|
||||
openai==1.58.1
|
||||
Loading…
Reference in New Issue