From d09d065d081e5df9ffaa829cfbc17834bad11159 Mon Sep 17 00:00:00 2001 From: Jeff Emmett Date: Tue, 17 Feb 2026 01:19:49 +0000 Subject: [PATCH] Initial commit: Erowid conversational bot RAG-powered chatbot that indexes Erowid's experience reports and substance info, making them searchable via natural conversation. Built with FastAPI, PostgreSQL+pgvector, Ollama embeddings, and streaming LLM responses. Co-Authored-By: Claude Opus 4.6 --- .env.example | 24 +++ .gitignore | 10 ++ Dockerfile | 16 ++ app/__init__.py | 0 app/config.py | 34 ++++ app/database.py | 23 +++ app/embeddings.py | 199 ++++++++++++++++++++++ app/llm.py | 119 ++++++++++++++ app/main.py | 140 ++++++++++++++++ app/models.py | 56 +++++++ app/rag.py | 108 ++++++++++++ app/scraper/__init__.py | 0 app/scraper/experiences.py | 244 +++++++++++++++++++++++++++ app/scraper/substances.py | 171 +++++++++++++++++++ app/static/app.js | 157 ++++++++++++++++++ app/static/index.html | 59 +++++++ app/static/style.css | 326 +++++++++++++++++++++++++++++++++++++ docker-compose.yml | 50 ++++++ requirements.txt | 16 ++ 19 files changed, 1752 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 Dockerfile create mode 100644 app/__init__.py create mode 100644 app/config.py create mode 100644 app/database.py create mode 100644 app/embeddings.py create mode 100644 app/llm.py create mode 100644 app/main.py create mode 100644 app/models.py create mode 100644 app/rag.py create mode 100644 app/scraper/__init__.py create mode 100644 app/scraper/experiences.py create mode 100644 app/scraper/substances.py create mode 100644 app/static/app.js create mode 100644 app/static/index.html create mode 100644 app/static/style.css create mode 100644 docker-compose.yml create mode 100644 requirements.txt diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..c6dd195 --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# Database 
+DATABASE_URL=postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid +DATABASE_URL_SYNC=postgresql://erowid:erowid@erowid-db:5432/erowid +POSTGRES_USER=erowid +POSTGRES_PASSWORD=erowid +POSTGRES_DB=erowid + +# LLM Provider: ollama | claude | openai +LLM_PROVIDER=ollama + +# Ollama (local, free) +OLLAMA_BASE_URL=http://ollama:11434 +OLLAMA_EMBED_MODEL=nomic-embed-text +OLLAMA_CHAT_MODEL=llama3.2:3b + +# Claude API (optional) +ANTHROPIC_API_KEY= + +# OpenAI API (optional) +OPENAI_API_KEY= + +# App +APP_HOST=0.0.0.0 +APP_PORT=8000 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..20f4e8a --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +__pycache__/ +*.pyc +*.pyo +.env +*.egg-info/ +dist/ +build/ +.venv/ +venv/ +PLAN.md diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..5f5edda --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.12-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential libpq-dev \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . 
RUN pip install --no-cache-dir -r requirements.txt

COPY app/ ./app/

EXPOSE 8000

CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

# --- app/config.py ---
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """Application configuration, loaded from environment variables / .env."""

    # Async DSN for the app, sync DSN for scripts/migrations.
    database_url: str = "postgresql+asyncpg://erowid:erowid@erowid-db:5432/erowid"
    database_url_sync: str = "postgresql://erowid:erowid@erowid-db:5432/erowid"

    llm_provider: str = "ollama"  # ollama | claude | openai

    ollama_base_url: str = "http://ollama:11434"
    ollama_embed_model: str = "nomic-embed-text"
    # NOTE(review): .env.example ships OLLAMA_CHAT_MODEL=llama3.2:3b but this
    # fallback is llama3.1:8b — confirm which model is intended when unset.
    ollama_chat_model: str = "llama3.1:8b"

    anthropic_api_key: str = ""
    openai_api_key: str = ""

    app_host: str = "0.0.0.0"
    app_port: int = 8000

    # Scraper settings
    scrape_delay: float = 3.0  # seconds between requests (be polite to Erowid)
    scrape_batch_size: int = 50

    # RAG settings
    chunk_size: int = 500  # tokens per chunk
    chunk_overlap: int = 50  # token overlap between chunks
    retrieval_top_k: int = 4  # number of chunks to retrieve

    class Config:
        env_file = ".env"
        extra = "ignore"


settings = Settings()

# --- app/database.py ---
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from sqlalchemy.orm import DeclarativeBase
from app.config import settings

engine = create_async_engine(settings.database_url, echo=False)
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)


class Base(DeclarativeBase):
    """Declarative base shared by all ORM models."""
    pass


async def get_db():
    """FastAPI dependency: yield an AsyncSession scoped to one request."""
    async with async_session() as session:
        yield session


async def init_db():
    """Create the pgvector extension and all ORM tables at startup."""
    async with engine.begin() as conn:
        # __import__ avoids a top-level sqlalchemy import for this single
        # raw-SQL statement.
        await conn.execute(
            __import__("sqlalchemy").text("CREATE EXTENSION IF NOT EXISTS vector")
        )
        await conn.run_sync(Base.metadata.create_all)
# --- app/embeddings.py ---
import json
import logging

logger = logging.getLogger(__name__)


def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
    """Split *text* into overlapping chunks by approximate token count.

    Uses the heuristic 1 token ~= 0.75 words. Degenerate parameters are
    guarded: a chunk size that rounds to zero words, or an overlap >= the
    chunk size, previously made ``start = end - words_overlap`` fail to
    advance and the loop spun forever.
    """
    words = text.split()
    # Approximate: 1 token ~ 0.75 words; clamp so chunks are never empty.
    words_per_chunk = max(1, int(chunk_size * 0.75))
    words_overlap = int(overlap * 0.75)

    if len(words) <= words_per_chunk:
        return [text]

    # The window must advance at least one word per iteration.
    step = max(1, words_per_chunk - words_overlap)

    chunks: list[str] = []
    start = 0
    while start < len(words):
        chunks.append(" ".join(words[start:start + words_per_chunk]))
        start += step
    return chunks


async def get_embedding(text: str, client: "httpx.AsyncClient | None" = None) -> list[float]:
    """Get an embedding vector for *text* from Ollama.

    A shared *client* may be passed for connection reuse; otherwise a
    temporary client is created and closed here.
    """
    import httpx  # function-local so the module imports without httpx

    from app.config import settings

    owns_client = client is None
    if owns_client:
        client = httpx.AsyncClient(timeout=60)
    try:
        resp = await client.post(
            f"{settings.ollama_base_url}/api/embeddings",
            json={"model": settings.ollama_embed_model, "prompt": text},
        )
        resp.raise_for_status()
        return resp.json()["embedding"]
    finally:
        if owns_client:
            await client.aclose()


async def get_embeddings_batch(texts: list[str], client: "httpx.AsyncClient") -> list[list[float]]:
    """Get embeddings for multiple texts, one request per text."""
    return [await get_embedding(t, client) for t in texts]


async def embed_experiences(batch_size: int = 20):
    """Chunk and embed every experience report that has no chunks yet.

    Commits every *batch_size* experiences so progress survives an
    interruption; returns the number of DocumentChunk rows created.
    """
    import httpx
    from sqlalchemy import select

    from app.config import settings
    from app.database import async_session
    from app.models import DocumentChunk, Experience

    async with async_session() as db:
        # Experiences that already have chunks are skipped (idempotent re-runs).
        subq = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "experience"
        ).distinct()
        result = await db.execute(select(Experience).where(Experience.id.not_in(subq)))
        experiences = result.scalars().all()
        logger.info(f"Found {len(experiences)} experiences to embed")

        async with httpx.AsyncClient(timeout=60) as client:
            total_chunks = 0
            for i, exp in enumerate(experiences):
                # Prepend searchable metadata so it gets embedded with the text.
                header = f"Experience Report: {exp.title}\n"
                header += f"Substance: {exp.substance}\n"
                if exp.category:
                    header += f"Category: {exp.category}\n"
                if exp.gender:
                    header += f"Gender: {exp.gender}\n"
                if exp.age:
                    header += f"Age: {exp.age}\n"
                header += "\n"

                full_text = header + exp.body
                chunks = chunk_text(full_text, settings.chunk_size, settings.chunk_overlap)

                for idx, chunk_content in enumerate(chunks):
                    embedding = await get_embedding(chunk_content, client)
                    metadata = json.dumps({
                        "title": exp.title,
                        "substance": exp.substance,
                        "category": exp.category,
                        "erowid_id": exp.erowid_id,
                    })
                    db.add(DocumentChunk(
                        source_type="experience",
                        source_id=exp.id,
                        chunk_index=idx,
                        content=chunk_content,
                        metadata_json=metadata,
                        embedding=embedding,
                    ))
                    total_chunks += 1

                if (i + 1) % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {i + 1} experiences ({total_chunks} chunks)")

            await db.commit()
            logger.info(f"Done! Created {total_chunks} chunks from {len(experiences)} experiences")
            return total_chunks
async def embed_substances(batch_size: int = 10):
    """Chunk and embed all substance info pages that lack chunks.

    Commits every *batch_size* substances; returns the number of
    DocumentChunk rows created.
    """
    import httpx
    from sqlalchemy import select

    from app.config import settings
    from app.database import async_session
    from app.models import DocumentChunk, Substance

    async with async_session() as db:
        # Substances that already have chunks are excluded (idempotent re-runs).
        already_done = select(DocumentChunk.source_id).where(
            DocumentChunk.source_type == "substance"
        ).distinct()
        rows = await db.execute(select(Substance).where(Substance.id.not_in(already_done)))
        pending = rows.scalars().all()
        logger.info(f"Found {len(pending)} substances to embed")

        async with httpx.AsyncClient(timeout=60) as client:
            created = 0
            for count, sub in enumerate(pending, start=1):
                # Assemble one labelled text document from the vault fields,
                # skipping sections the scraper did not find.
                parts = [f"Substance Information: {sub.name}"]
                if sub.category:
                    parts.append(f"Category: {sub.category}")
                for label, value in (
                    ("Overview", sub.description),
                    ("Effects", sub.effects),
                    ("Dosage", sub.dosage),
                    ("Duration", sub.duration),
                    ("Chemistry", sub.chemistry),
                    ("Health & Safety", sub.health),
                    ("Legal Status", sub.law),
                ):
                    if value:
                        parts.append(f"\n{label}:\n{value}")

                document = "\n".join(parts)
                pieces = chunk_text(document, settings.chunk_size, settings.chunk_overlap)

                for position, piece in enumerate(pieces):
                    vector = await get_embedding(piece, client)
                    db.add(DocumentChunk(
                        source_type="substance",
                        source_id=sub.id,
                        chunk_index=position,
                        content=piece,
                        metadata_json=json.dumps({
                            "substance": sub.name,
                            "category": sub.category,
                        }),
                        embedding=vector,
                    ))
                    created += 1

                if count % batch_size == 0:
                    await db.commit()
                    logger.info(f"Embedded {count} substances ({created} chunks)")

            await db.commit()
            logger.info(f"Done! Created {created} chunks from {len(pending)} substances")
            return created
async def embed_all():
    """Embed everything that hasn't been embedded yet."""
    # Dict literal evaluates left-to-right: experiences first, then substances.
    return {
        "experience_chunks": await embed_experiences(),
        "substance_chunks": await embed_substances(),
    }

# --- app/llm.py ---
import json
import logging
from typing import AsyncGenerator

import httpx

from app.config import settings

logger = logging.getLogger(__name__)


async def stream_ollama(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from Ollama, yielding text fragments."""
    # Prepend the system prompt, if any, to the outgoing message list.
    chat_messages = ([{"role": "system", "content": system}] if system else []) + messages

    request_body = {
        "model": settings.ollama_chat_model,
        "messages": chat_messages,
        "stream": True,
    }

    # Long read timeout: a local model can pause a while between tokens.
    limits = httpx.Timeout(connect=30, read=600, write=30, pool=30)
    async with httpx.AsyncClient(timeout=limits) as client:
        async with client.stream(
            "POST",
            f"{settings.ollama_base_url}/api/chat",
            json=request_body,
        ) as resp:
            resp.raise_for_status()
            pending = b""
            async for part in resp.aiter_bytes():
                pending += part
                # Ollama emits one JSON object per newline-terminated line.
                while b"\n" in pending:
                    raw, pending = pending.split(b"\n", 1)
                    raw = raw.strip()
                    if not raw:
                        continue
                    try:
                        event = json.loads(raw)
                    except json.JSONDecodeError:
                        continue
                    if "message" in event and "content" in event["message"]:
                        piece = event["message"]["content"]
                        if piece:
                            yield piece
                    if event.get("done"):
                        return


async def stream_claude(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from the Claude API."""
    try:
        from anthropic import AsyncAnthropic
    except ImportError:
        raise RuntimeError("anthropic package not installed")

    client = AsyncAnthropic(api_key=settings.anthropic_api_key)

    async with client.messages.stream(
        model="claude-sonnet-4-5-20250929",
        max_tokens=2048,
        system=system,
        messages=messages,
    ) as event_stream:
        async for delta in event_stream.text_stream:
            yield delta
async def stream_openai(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Stream a chat completion from the OpenAI API."""
    try:
        from openai import AsyncOpenAI
    except ImportError:
        raise RuntimeError("openai package not installed")

    client = AsyncOpenAI(api_key=settings.openai_api_key)

    # Prepend the system prompt, if any, to the outgoing message list.
    outgoing = ([{"role": "system", "content": system}] if system else []) + messages

    response = await client.chat.completions.create(
        model="gpt-4o-mini",
        messages=outgoing,
        max_tokens=2048,
        stream=True,
    )

    async for event in response:
        if event.choices and event.choices[0].delta.content:
            yield event.choices[0].delta.content


async def stream_chat(messages: list[dict], system: str = "") -> AsyncGenerator[str, None]:
    """Dispatch to the configured LLM provider, yielding response tokens.

    Misconfiguration (missing key, unknown provider) is reported as a
    single yielded error string rather than an exception, so the SSE
    stream still delivers something readable to the UI.
    """
    provider = settings.llm_provider.lower()

    if provider == "ollama":
        source = stream_ollama(messages, system)
    elif provider == "claude":
        if not settings.anthropic_api_key:
            yield "Error: ANTHROPIC_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
            return
        source = stream_claude(messages, system)
    elif provider == "openai":
        if not settings.openai_api_key:
            yield "Error: OPENAI_API_KEY not configured. Set it in .env or switch LLM_PROVIDER to ollama."
            return
        source = stream_openai(messages, system)
    else:
        yield f"Error: Unknown LLM_PROVIDER '{provider}'. Use ollama, claude, or openai."
        return

    async for token in source:
        yield token
# --- app/main.py ---
import asyncio
import json
import logging
import uuid
from contextlib import asynccontextmanager
from pathlib import Path

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse, StreamingResponse, JSONResponse
from fastapi.staticfiles import StaticFiles

from app.config import settings
from app.database import init_db, async_session
from app.rag import chat_stream

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(name)s] %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)

# In-memory session store (conversation history per session).
# NOTE(review): unbounded across sessions — fine for a single-process demo,
# but a TTL store is needed if this runs long-lived.
sessions: dict[str, list[dict]] = {}


async def _optional_json_body(request: Request) -> dict:
    """Parse the request body as JSON when the Content-Type says JSON, else {}.

    Uses startswith() because an exact ``== "application/json"`` comparison
    rejected legitimate values like "application/json; charset=utf-8".
    """
    content_type = request.headers.get("content-type", "")
    if content_type.startswith("application/json"):
        return await request.json()
    return {}


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Create the pgvector extension and tables before serving requests."""
    logger.info("Initializing database...")
    await init_db()
    logger.info("Database ready.")
    yield


app = FastAPI(title="Erowid Bot", lifespan=lifespan)

# Serve static files
static_dir = Path(__file__).parent / "static"
app.mount("/static", StaticFiles(directory=str(static_dir)), name="static")


@app.get("/", response_class=HTMLResponse)
async def index():
    """Serve the single-page chat UI."""
    return (static_dir / "index.html").read_text()


@app.get("/health")
async def health():
    """Liveness probe."""
    return {"status": "ok"}


@app.post("/chat")
async def chat(request: Request):
    """Chat endpoint with a streaming SSE response.

    Body: {"message": str, "session_id": str (optional)}. Streams
    ``data: {"token": ...}`` events followed by a final
    ``{"done": true, "session_id": ...}`` event.
    """
    body = await request.json()
    message = body.get("message", "").strip()
    session_id = body.get("session_id", "")

    if not message:
        return JSONResponse({"error": "Empty message"}, status_code=400)

    if not session_id:
        session_id = str(uuid.uuid4())

    # Get or create conversation history
    history = sessions.get(session_id, [])

    async def generate():
        full_response = ""
        try:
            async for token in chat_stream(message, history):
                full_response += token
                yield f"data: {json.dumps({'token': token})}\n\n"
        except Exception as e:
            logger.error(f"Chat error: {e}")
            yield f"data: {json.dumps({'error': str(e)})}\n\n"

        # Save to history (even after an error, so the exchange is visible to
        # follow-up turns); keep it bounded to the last 20 messages.
        history.append({"role": "user", "content": message})
        history.append({"role": "assistant", "content": full_response})
        if len(history) > 20:
            history[:] = history[-20:]
        sessions[session_id] = history

        yield f"data: {json.dumps({'done': True, 'session_id': session_id})}\n\n"

    return StreamingResponse(
        generate(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",  # disable proxy buffering for SSE
        },
    )


@app.get("/stats")
async def stats():
    """Row counts for experiences, substances, and embedded chunks."""
    from sqlalchemy import func, select
    from app.models import Experience, Substance, DocumentChunk

    async with async_session() as db:
        exp_count = (await db.execute(select(func.count(Experience.id)))).scalar() or 0
        sub_count = (await db.execute(select(func.count(Substance.id)))).scalar() or 0
        chunk_count = (await db.execute(select(func.count(DocumentChunk.id)))).scalar() or 0

    return {
        "experiences": exp_count,
        "substances": sub_count,
        "chunks": chunk_count,
    }


@app.post("/admin/scrape/experiences")
async def trigger_scrape_experiences(request: Request):
    """Start experience scraping as a background task (admin endpoint)."""
    body = await _optional_json_body(request)
    limit = body.get("limit")

    from app.scraper.experiences import scrape_all_experiences
    asyncio.create_task(scrape_all_experiences(limit=limit))
    return {"status": "started", "message": "Experience scraping started in background"}


@app.post("/admin/scrape/substances")
async def trigger_scrape_substances(request: Request):
    """Start substance scraping as a background task (admin endpoint)."""
    body = await _optional_json_body(request)
    limit = body.get("limit")

    from app.scraper.substances import scrape_all_substances
    asyncio.create_task(scrape_all_substances(limit=limit))
    return {"status": "started", "message": "Substance scraping started in background"}
@app.post("/admin/embed")
async def trigger_embedding():
    """Trigger embedding pipeline (admin endpoint).

    Fire-and-forget: the pipeline runs as a background task and commits its
    own progress; poll /stats to watch chunk counts grow.
    """
    from app.embeddings import embed_all
    asyncio.create_task(embed_all())
    return {"status": "started", "message": "Embedding pipeline started in background"}

# --- app/models.py ---
from sqlalchemy import Column, Integer, String, Text, Boolean, Float, ForeignKey
from sqlalchemy.dialects.postgresql import ARRAY, TIMESTAMP
from pgvector.sqlalchemy import Vector
from datetime import datetime, timezone
from app.database import Base


class Substance(Base):
    """One scraped Erowid substance vault page."""

    __tablename__ = "substances"

    id = Column(Integer, primary_key=True, autoincrement=True)
    name = Column(String(255), unique=True, nullable=False, index=True)
    url = Column(String(1024))
    category = Column(String(255))  # e.g. "Psychedelics", "Stimulants"
    # Vault sections as plain text; NULL when the section is missing on site.
    description = Column(Text)
    effects = Column(Text)
    dosage = Column(Text)
    duration = Column(Text)
    chemistry = Column(Text)
    health = Column(Text)
    law = Column(Text)
    raw_html = Column(Text)  # original page, kept so fields can be re-parsed
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))


class Experience(Base):
    """One scraped Erowid experience report (exp.php?ID=...)."""

    __tablename__ = "experiences"

    id = Column(Integer, primary_key=True, autoincrement=True)
    erowid_id = Column(Integer, unique=True, index=True)  # Erowid's own report ID
    title = Column(String(512))
    author = Column(String(255))
    substance = Column(String(512))  # may list multiple substances
    substance_list = Column(ARRAY(String))  # parsed list
    body = Column(Text, nullable=False)
    category = Column(String(255))  # e.g. "General", "First Times", "Bad Trips"
    gender = Column(String(50))
    age = Column(String(50))  # free-form scraped text, hence String not Integer
    year = Column(Integer)
    url = Column(String(1024))
    intensity = Column(String(100))
    raw_html = Column(Text)  # original page, kept so fields can be re-parsed
    scraped_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))


class DocumentChunk(Base):
    """An embedded text chunk used for RAG retrieval (pgvector-backed)."""

    __tablename__ = "document_chunks"

    id = Column(Integer, primary_key=True, autoincrement=True)
    source_type = Column(String(50), nullable=False, index=True)  # "experience" or "substance"
    source_id = Column(Integer, nullable=False, index=True)  # id of the source row
    chunk_index = Column(Integer, nullable=False)  # position of the chunk in its document
    content = Column(Text, nullable=False)
    metadata_json = Column(Text)  # JSON string with extra metadata
    embedding = Column(Vector(768))  # nomic-embed-text dimension
    created_at = Column(TIMESTAMP(timezone=True), default=lambda: datetime.now(timezone.utc))
Say when info is limited.""" + + +async def retrieve_context(query: str, top_k: int | None = None) -> list[dict]: + """Retrieve the most relevant document chunks for a query.""" + if top_k is None: + top_k = settings.retrieval_top_k + + # Get query embedding + query_embedding = await get_embedding(query) + + async with async_session() as db: + # Use pgvector cosine distance for similarity search + result = await db.execute( + text(""" + SELECT id, source_type, source_id, chunk_index, content, metadata_json, + embedding <=> :query_embedding AS distance + FROM document_chunks + ORDER BY embedding <=> :query_embedding + LIMIT :top_k + """), + {"query_embedding": str(query_embedding), "top_k": top_k}, + ) + + chunks = [] + for row in result.fetchall(): + metadata = {} + if row[5]: + try: + metadata = json.loads(row[5]) + except json.JSONDecodeError: + pass + + chunks.append({ + "id": row[0], + "source_type": row[1], + "source_id": row[2], + "chunk_index": row[3], + "content": row[4], + "metadata": metadata, + "distance": row[6], + }) + + return chunks + + +def build_context_prompt(chunks: list[dict]) -> str: + """Build a context string from retrieved chunks.""" + if not chunks: + return "\n[No relevant documents found in the database.]\n" + + context_parts = [] + for i, chunk in enumerate(chunks, 1): + source_label = chunk["source_type"].title() + metadata = chunk["metadata"] + + header = f"--- Source {i} ({source_label})" + if "title" in metadata: + header += f" | {metadata['title']}" + if "substance" in metadata: + header += f" | Substance: {metadata['substance']}" + header += " ---" + + # Limit each chunk to avoid overwhelming the LLM + content = chunk["content"][:800] + context_parts.append(f"{header}\n{content}") + + return "\n\n".join(context_parts) + + +async def chat_stream( + user_message: str, + conversation_history: list[dict] | None = None, +) -> AsyncGenerator[str, None]: + """Full RAG pipeline: retrieve context, build prompt, stream LLM response.""" + # 
Retrieve relevant chunks + chunks = await retrieve_context(user_message) + + # Build the context-augmented system prompt + context_text = build_context_prompt(chunks) + full_system = f"{SYSTEM_PROMPT}\n\n--- RELEVANT EROWID DATA ---\n{context_text}\n--- END EROWID DATA ---" + + # Build message history + messages = [] + if conversation_history: + # Keep last 10 messages for context + messages = conversation_history[-10:] + + messages.append({"role": "user", "content": user_message}) + + # Stream from LLM + async for token in stream_chat(messages, system=full_system): + yield token diff --git a/app/scraper/__init__.py b/app/scraper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/scraper/experiences.py b/app/scraper/experiences.py new file mode 100644 index 0000000..2f3f880 --- /dev/null +++ b/app/scraper/experiences.py @@ -0,0 +1,244 @@ +import asyncio +import re +import logging +from datetime import datetime, timezone + +import httpx +from bs4 import BeautifulSoup +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import settings +from app.database import async_session +from app.models import Experience + +logger = logging.getLogger(__name__) + +BASE_URL = "https://erowid.org" +EXP_LIST_URL = "https://erowid.org/experiences/exp_list.shtml" +REPORT_URL = "https://erowid.org/experiences/exp.php?ID={id}" + +HEADERS = { + "User-Agent": "ErowidResearchBot/1.0 (educational research project)", + "Accept": "text/html,application/xhtml+xml", +} + + +async def get_all_substance_pages(client: httpx.AsyncClient) -> list[dict]: + """Get main substance experience listing pages from the master index. + + Only fetches top-level substance pages (e.g. exp_LSD.shtml), not + category sub-pages (e.g. exp_LSD_General.shtml) since the main page + already contains all report IDs for that substance. 
+ """ + resp = await client.get(EXP_LIST_URL, headers=HEADERS, timeout=60) + resp.raise_for_status() + soup = BeautifulSoup(resp.text, "lxml") + + pages = [] + seen_substances = set() + for a in soup.select("a[href]"): + href = a.get("href", "") + name = a.get_text(strip=True) + if not href.startswith("subs/exp_") or not href.endswith(".shtml") or not name: + continue + + full_url = f"https://erowid.org/experiences/{href}" + + # Extract the base substance name from the URL + # e.g. subs/exp_LSD.shtml -> LSD, subs/exp_LSD_General.shtml -> LSD + filename = href.replace("subs/exp_", "").replace(".shtml", "") + + # Skip category sub-pages — they contain subsets of the main page + # Category sub-pages have suffixes like _General, _First_Times, _Bad_Trips, etc. + known_categories = [ + "_General", "_First_Times", "_Combinations", "_Retrospective", + "_Preparation", "_Difficult_Experiences", "_Bad_Trips", + "_Health_Problems", "_Train_Wrecks", "_Glowing_Experiences", + "_Mystical_Experiences", "_Health_Benefits", "_What_Was_in_That", + "_Medical_Use", "_Performance_Enhancement", "_Addiction", + ] + is_category = any(filename.endswith(cat) for cat in known_categories) + if is_category: + continue + + if full_url not in seen_substances: + seen_substances.add(full_url) + pages.append({"name": name, "url": full_url}) + + logger.info(f"Found {len(pages)} main substance experience pages (filtered from category sub-pages)") + return pages + + +async def get_experience_ids_from_page(client: httpx.AsyncClient, url: str) -> list[int]: + """Extract all experience report IDs from a substance listing page.""" + try: + resp = await client.get(url, headers=HEADERS, timeout=30) + resp.raise_for_status() + except httpx.HTTPError as e: + logger.warning(f"Failed to fetch {url}: {e}") + return [] + + ids = [int(x) for x in re.findall(r"exp\.php\?ID=(\d+)", resp.text)] + return list(set(ids)) # dedupe + + +async def get_all_experience_ids(client: httpx.AsyncClient) -> list[int]: + """Collect 
all unique experience IDs from all substance pages. + + Fetches pages concurrently in batches of 5 for speed. + """ + pages = await get_all_substance_pages(client) + + all_ids = set() + batch_size = 5 + for i in range(0, len(pages), batch_size): + batch = pages[i : i + batch_size] + tasks = [get_experience_ids_from_page(client, p["url"]) for p in batch] + results = await asyncio.gather(*tasks, return_exceptions=True) + for result in results: + if isinstance(result, list): + all_ids.update(result) + logger.info(f"Scanned {min(i + batch_size, len(pages))}/{len(pages)} pages, {len(all_ids)} unique IDs") + await asyncio.sleep(0.5) + + logger.info(f"Found {len(all_ids)} unique experience IDs total") + return sorted(all_ids) + + +async def scrape_experience_report(client: httpx.AsyncClient, erowid_id: int) -> dict | None: + """Scrape a single experience report.""" + url = REPORT_URL.format(id=erowid_id) + try: + resp = await client.get(url, headers=HEADERS, timeout=30) + resp.raise_for_status() + except httpx.HTTPError as e: + logger.warning(f"Failed to fetch experience {erowid_id}: {e}") + return None + + soup = BeautifulSoup(resp.text, "lxml") + + # Extract the main report body + body_div = soup.select_one("div.report-text-surround") + if not body_div: + logger.warning(f"No report body found for {erowid_id}") + return None + + # Remove the dosechart from body text to avoid duplication + body_text_parts = [] + for el in body_div.children: + if hasattr(el, "name") and el.name == "table": + continue # skip dosechart table + text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip() + if text: + body_text_parts.append(text) + + body = "\n\n".join(body_text_parts) + if not body or len(body) < 50: + return None + + # Extract metadata + title = "" + title_el = soup.select_one("div.title") + if title_el: + title = title_el.get_text(strip=True) + + substance = "" + sub_el = soup.select_one("div.substance") + if sub_el: + substance = 
async def scrape_experience_report(client: "httpx.AsyncClient", erowid_id: int) -> dict | None:
    """Scrape a single experience report into a dict of Experience fields.

    Returns None when the page cannot be fetched, has no report body, or
    the body is too short (< 50 chars) to be worth indexing.

    Fixes over the original:
    - gender detection checked ``"male" in wt`` before "female", so every
      female report was tagged "Male" ("female" contains "male"); the
      "female" test now runs first.
    - the author line used ``.replace("by ", "")``, which mangled any
      pseudonym containing "by " (e.g. "Baby Driver" -> "BaDriver");
      ``removeprefix`` strips only the leading "by ".
    """
    import httpx
    from bs4 import BeautifulSoup

    url = REPORT_URL.format(id=erowid_id)
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch experience {erowid_id}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the main report body
    body_div = soup.select_one("div.report-text-surround")
    if not body_div:
        logger.warning(f"No report body found for {erowid_id}")
        return None

    # Collect text, skipping <table> children — the dosechart is captured
    # separately below, so including it here would duplicate it.
    body_text_parts = []
    for el in body_div.children:
        if hasattr(el, "name") and el.name == "table":
            continue
        text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip()
        if text:
            body_text_parts.append(text)

    body = "\n\n".join(body_text_parts)
    if not body or len(body) < 50:
        return None

    # Extract metadata
    title = ""
    title_el = soup.select_one("div.title")
    if title_el:
        title = title_el.get_text(strip=True)

    substance = ""
    sub_el = soup.select_one("div.substance")
    if sub_el:
        substance = sub_el.get_text(strip=True)

    substance_list = [s.strip() for s in re.split(r"[,&]", substance) if s.strip()]

    author = ""
    author_el = soup.select_one("div.author")
    if author_el:
        author = author_el.get_text(strip=True).removeprefix("by ")

    # Dosage table -> "cell cell; cell cell; ..." summary
    dose_table = soup.select_one("table.dosechart")
    dose_text = ""
    if dose_table:
        dose_parts = []
        for row in dose_table.select("tr"):
            cells = row.select("td")
            row_text = " ".join(c.get_text(strip=True) for c in cells if c.get_text(strip=True))
            if row_text:
                dose_parts.append(row_text)
        dose_text = "; ".join(dose_parts)

    if dose_text:
        body = f"Dosage: {dose_text}\n\n{body}"

    # Age / gender from the bodyweight line inside report-text-surround
    gender = ""
    age = ""
    weight_el = soup.select_one("table.bodyweight")
    if weight_el:
        wt = weight_el.get_text(strip=True)
        age_match = re.search(r"(\d+)\s*yr", wt, re.IGNORECASE)
        if age_match:
            age = age_match.group(1)
        lowered = wt.lower()
        # "female" must be tested first: the substring "male" occurs in it.
        if "female" in lowered:
            gender = "Female"
        elif "male" in lowered:
            gender = "Male"

    # Try to extract category from the page
    category = ""
    cat_el = soup.select_one("div.foot-eroid-cat")
    if cat_el:
        category = cat_el.get_text(strip=True)

    return {
        "erowid_id": erowid_id,
        "title": title,
        "author": author,
        "substance": substance,
        "substance_list": substance_list,
        "body": body,
        "category": category,
        "gender": gender,
        "age": age,
        "url": url,
        "raw_html": resp.text,
    }


async def scrape_all_experiences(limit: int | None = None):
    """Main scraper entry point. Scrapes all experience reports into the database.

    Skips IDs already stored, commits every ``scrape_batch_size`` reports,
    and sleeps ``scrape_delay`` seconds between requests (be polite to
    Erowid). Returns the number of newly scraped reports.
    """
    import httpx
    from sqlalchemy import select

    from app.config import settings
    from app.database import async_session
    from app.models import Experience

    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        all_ids = await get_all_experience_ids(client)

        if limit:
            all_ids = all_ids[:limit]

        async with async_session() as db:
            result = await db.execute(select(Experience.erowid_id))
            existing_ids = {row[0] for row in result.fetchall()}
            logger.info(f"Already have {len(existing_ids)} experiences in DB")

            to_scrape = [eid for eid in all_ids if eid not in existing_ids]
            logger.info(f"Need to scrape {len(to_scrape)} new experiences")

            scraped = 0
            errors = 0
            for eid in to_scrape:
                data = await scrape_experience_report(client, eid)
                if data:
                    db.add(Experience(**data))
                    scraped += 1
                    if scraped % settings.scrape_batch_size == 0:
                        await db.commit()
                        logger.info(f"Committed batch: {scraped}/{len(to_scrape)} scraped ({errors} errors)")
                else:
                    errors += 1

                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new experiences ({errors} errors)")
            return scraped
import asyncio
import re
import logging
from datetime import datetime, timezone

import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database import async_session
from app.models import Substance

logger = logging.getLogger(__name__)

BASE_URL = "https://erowid.org"
VAULT_INDEX = "https://erowid.org/chemicals/"

HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}

# Known substance categories on Erowid
CATEGORIES = [
    "Psychedelics", "Empathogens", "Stimulants", "Depressants",
    "Dissociatives", "Cannabis", "Opioids", "Nootropics",
    "Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
]


async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Get all substance vault URLs from the chemicals and plants indexes.

    Returns a list of ``{"name", "url"[, "category"]}`` dicts, deduplicated
    by URL (the index pages link the same vault from several places).
    """
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    substances: list[dict] = []
    seen_urls: set[str] = set()

    def _add(entry: dict) -> None:
        # Dedupe on the resolved URL so repeated index links don't produce
        # duplicate rows (and duplicate scrapes) downstream.
        if entry["url"] not in seen_urls:
            seen_urls.add(entry["url"])
            substances.append(entry)

    # The chemicals index lists substances with links to their vaults
    for link in soup.select("a[href]"):
        href = link.get("href", "")
        text = link.get_text(strip=True)
        # Filter for substance vault links (e.g., /chemicals/lsd/)
        if re.match(r"^/chemicals/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
            if text and len(text) > 1 and not text.startswith("["):
                full_url = href if href.startswith("http") else f"{BASE_URL}{href}" if href.startswith("/") else f"{VAULT_INDEX}{href}"
                _add({"name": text, "url": full_url})

    # Also scrape the plants/herbs section
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
        soup2 = BeautifulSoup(resp2.text, "lxml")
        for link in soup2.select("a[href]"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            if re.match(r"^/plants/[\w_-]+/?$", href) or re.match(r"^[\w_-]+/?$", href):
                if text and len(text) > 1 and not text.startswith("["):
                    full_url = href if href.startswith("http") else f"{BASE_URL}{href}" if href.startswith("/") else f"{plants_url}{href}"
                    _add({"name": text, "url": full_url, "category": "Plants & Herbs"})
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")

    logger.info(f"Found {len(substances)} substance URLs")
    return substances


async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault page (and known sub-pages) for key information.

    Returns a dict shaped for the Substance model, or None when the main
    vault page cannot be fetched.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch substance {name}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")
    raw_html = resp.text

    # Extract text content from the vault main page, falling back through
    # the content containers Erowid has used over the years.
    description = ""
    main_content = (
        soup.select_one("div.sum-content")
        or soup.select_one("td.content")
        or soup.select_one("body")
    )
    if main_content:
        description = main_content.get_text(separator="\n", strip=True)[:5000]

    # Try to find sub-pages: effects, dose, duration, health, law.
    # Several candidate paths per section; the first that responds wins.
    sections: dict[str, str] = {}
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }

    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            # Throttle *every* probe. The original slept only after misses
            # (the sleep sat below the success `break`), so successful
            # sub-page fetches went out with no delay at all.
            await asyncio.sleep(0.5)
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
            except httpx.HTTPError:
                continue
            if sub_resp.status_code != 200:
                continue
            sub_soup = BeautifulSoup(sub_resp.text, "lxml")
            content_el = sub_soup.select_one("div.sum-content") or sub_soup.select_one("td.content")
            if content_el:
                sections[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
            break

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }


async def scrape_all_substances(limit: int | None = None) -> int:
    """Main entry point: scrape all substance vaults not already stored.

    Commits every 10 new substances so an interrupted run keeps most of
    its progress.

    Args:
        limit: Optional cap on how many index entries to consider.

    Returns:
        Number of newly scraped substances.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)

        if limit:
            substance_list = substance_list[:limit]

        async with async_session() as db:
            # Get already-scraped substances (case-insensitive on name).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info(f"Already have {len(existing)} substances in DB")

            to_scrape = [s for s in substance_list if s["name"].lower() not in existing]
            logger.info(f"Need to scrape {len(to_scrape)} new substances")

            scraped = 0
            seen_names: set[str] = set()
            for sub_meta in to_scrape:
                # Guard against duplicate names within this run (index pages
                # may still yield the same vault under two URLs) to avoid
                # inserting duplicate rows.
                name_key = sub_meta["name"].lower()
                if name_key in seen_names:
                    continue
                seen_names.add(name_key)

                # One bad vault page must not abort the whole run.
                try:
                    data = await scrape_substance_vault(
                        client, sub_meta["name"], sub_meta["url"]
                    )
                except Exception:
                    logger.exception(f"Unexpected error scraping {sub_meta['name']}")
                    data = None

                if data:
                    data["category"] = sub_meta.get("category", "")
                    sub = Substance(**data)
                    db.add(sub)
                    scraped += 1

                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info(f"Committed: {scraped} substances scraped")

                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new substances")
            return scraped
const chatContainer = document.getElementById("chat-container");
const messageInput = document.getElementById("message-input");
const sendBtn = document.getElementById("send-btn");
const welcomeEl = document.getElementById("welcome");
const statsEl = document.getElementById("stats");

let sessionId = localStorage.getItem("erowid_session") || "";
let isStreaming = false;

// Fetch corpus counts for the header badge; show a placeholder while the
// backend is still starting up.
async function loadStats() {
  try {
    const resp = await fetch("/stats");
    const data = await resp.json();
    statsEl.textContent = `${data.experiences} reports | ${data.substances} substances | ${data.chunks} chunks`;
  } catch {
    statsEl.textContent = "connecting...";
  }
}
loadStats();

// Auto-resize textarea (capped at 120px, matching the CSS max-height)
messageInput.addEventListener("input", () => {
  messageInput.style.height = "auto";
  messageInput.style.height = Math.min(messageInput.scrollHeight, 120) + "px";
});

// Send on Enter (Shift+Enter for newline)
messageInput.addEventListener("keydown", (e) => {
  if (e.key === "Enter" && !e.shiftKey) {
    e.preventDefault();
    sendMessage();
  }
});

sendBtn.addEventListener("click", sendMessage);

// Suggestion chips pre-fill and immediately send
document.querySelectorAll(".suggestion").forEach((el) => {
  el.addEventListener("click", () => {
    messageInput.value = el.textContent;
    sendMessage();
  });
});

// Append a chat bubble; returns the content element so callers can stream
// tokens into it.
function addMessage(role, content) {
  if (welcomeEl) welcomeEl.style.display = "none";

  const msg = document.createElement("div");
  msg.className = `message ${role}`;

  const avatar = document.createElement("div");
  avatar.className = "message-avatar";
  avatar.textContent = role === "user" ? "You" : "E";

  const contentEl = document.createElement("div");
  contentEl.className = "message-content";
  contentEl.textContent = content;

  msg.appendChild(avatar);
  msg.appendChild(contentEl);
  chatContainer.appendChild(msg);
  chatContainer.scrollTop = chatContainer.scrollHeight;

  return contentEl;
}

function addTypingIndicator() {
  const msg = document.createElement("div");
  msg.className = "message assistant";
  msg.id = "typing-indicator";

  const avatar = document.createElement("div");
  avatar.className = "message-avatar";
  avatar.textContent = "E";

  const contentEl = document.createElement("div");
  contentEl.className = "message-content";
  // Three bouncing dots — markup must match the `.typing span` animation
  // rules in style.css (the original string here had been emptied out).
  contentEl.innerHTML = '<div class="typing"><span></span><span></span><span></span></div>';

  msg.appendChild(avatar);
  msg.appendChild(contentEl);
  chatContainer.appendChild(msg);
  chatContainer.scrollTop = chatContainer.scrollHeight;
}

function removeTypingIndicator() {
  const el = document.getElementById("typing-indicator");
  if (el) el.remove();
}

// POST the message and stream the SSE response token-by-token into a new
// assistant bubble.
async function sendMessage() {
  const text = messageInput.value.trim();
  if (!text || isStreaming) return;

  isStreaming = true;
  sendBtn.disabled = true;
  messageInput.value = "";
  messageInput.style.height = "auto";

  addMessage("user", text);
  addTypingIndicator();

  try {
    const resp = await fetch("/chat", {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ message: text, session_id: sessionId }),
    });
    // Surface HTTP errors instead of silently streaming nothing.
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);

    removeTypingIndicator();
    const contentEl = addMessage("assistant", "");

    const reader = resp.body.getReader();
    const decoder = new TextDecoder();
    let buffer = "";
    let fullResponse = "";

    while (true) {
      const { done, value } = await reader.read();
      if (done) break;

      buffer += decoder.decode(value, { stream: true });
      const lines = buffer.split("\n");
      buffer = lines.pop() || "";

      for (const line of lines) {
        if (!line.startsWith("data: ")) continue;
        const jsonStr = line.slice(6).trim();
        if (!jsonStr) continue;

        try {
          const data = JSON.parse(jsonStr);
          if (data.token) {
            fullResponse += data.token;
            contentEl.textContent = fullResponse;
            chatContainer.scrollTop = chatContainer.scrollHeight;
          }
          if (data.session_id) {
            sessionId = data.session_id;
            localStorage.setItem("erowid_session", sessionId);
          }
          if (data.error) {
            contentEl.textContent = `Error: ${data.error}`;
          }
        } catch {} // tolerate partial/malformed SSE frames
      }
    }
  } catch (err) {
    removeTypingIndicator();
    addMessage("assistant", `Connection error: ${err.message}`);
  } finally {
    // Always re-enable input, even if the streaming loop throws — the
    // original reset these outside a finally, so an error could lock the UI.
    isStreaming = false;
    sendBtn.disabled = false;
    messageInput.focus();
  }
}
<!DOCTYPE html>
<!-- NOTE(review): the markup in this capture was stripped of tags; this
     structure is reconstructed from the element ids queried by app.js
     (chat-container, message-input, send-btn, welcome, stats) and the
     class names styled in style.css. Verify against the original file. -->
<html lang="en">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Erowid Bot</title>
  <link rel="stylesheet" href="/static/style.css">
</head>
<body>
  <header class="header">
    <div class="header-left">
      <div class="logo">Erowid Bot</div>
      <div class="tagline">Harm Reduction Knowledge Assistant</div>
    </div>
    <div class="stats-badge" id="stats">loading...</div>
  </header>

  <main class="chat-container" id="chat-container">
    <div class="welcome" id="welcome">
      <h2>Explore the Erowid Database</h2>
      <p>
        Ask questions about substances, experience reports, dosage information,
        effects, safety, and more. All information is sourced from the Erowid vault
        and experience reports. This bot prioritizes harm reduction and safety.
      </p>
      <!-- NOTE(review): original suggestion texts were lost in this capture;
           these are placeholders — restore the originals. -->
      <div class="suggestions">
        <button class="suggestion">What are the effects of psilocybin?</button>
        <button class="suggestion">Tell me about LSD dosage</button>
        <button class="suggestion">What should I know about MDMA safety?</button>
        <button class="suggestion">Summarize ketamine experience reports</button>
        <button class="suggestion">How long does DMT last?</button>
      </div>
    </div>
  </main>

  <div class="input-container">
    <div class="input-wrapper">
      <textarea id="message-input" rows="1"
                placeholder="Ask about substances, experiences, safety..."></textarea>
      <button id="send-btn" aria-label="Send">
        <svg viewBox="0 0 24 24" fill="none" stroke="currentColor"
             stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
          <line x1="22" y1="2" x2="11" y2="13"></line>
          <polygon points="22 2 15 22 11 13 2 9 22 2"></polygon>
        </svg>
      </button>
    </div>
    <div class="disclaimer">
      Information sourced from Erowid.org. Not medical advice. Always practice harm reduction.
    </div>
  </div>

  <script src="/static/app.js"></script>
</body>
</html>
+ + + + diff --git a/app/static/style.css b/app/static/style.css new file mode 100644 index 0000000..eacfe3e --- /dev/null +++ b/app/static/style.css @@ -0,0 +1,326 @@ +* { + margin: 0; + padding: 0; + box-sizing: border-box; +} + +:root { + --bg-primary: #0a0a0f; + --bg-secondary: #12121a; + --bg-tertiary: #1a1a2e; + --text-primary: #e0e0e8; + --text-secondary: #8888a0; + --accent: #6c5ce7; + --accent-glow: rgba(108, 92, 231, 0.3); + --user-bg: #2d2d44; + --bot-bg: #1a1a2e; + --border: #2a2a3e; + --danger: #e74c3c; + --success: #2ecc71; +} + +body { + font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif; + background: var(--bg-primary); + color: var(--text-primary); + height: 100vh; + display: flex; + flex-direction: column; + overflow: hidden; +} + +/* Header */ +.header { + display: flex; + align-items: center; + justify-content: space-between; + padding: 12px 20px; + background: var(--bg-secondary); + border-bottom: 1px solid var(--border); + flex-shrink: 0; +} + +.header-left { + display: flex; + align-items: center; + gap: 12px; +} + +.logo { + font-size: 24px; + font-weight: 700; + background: linear-gradient(135deg, var(--accent), #a29bfe); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; + background-clip: text; +} + +.tagline { + font-size: 12px; + color: var(--text-secondary); + letter-spacing: 0.5px; +} + +.stats-badge { + font-size: 11px; + color: var(--text-secondary); + background: var(--bg-tertiary); + padding: 4px 10px; + border-radius: 12px; + border: 1px solid var(--border); +} + +/* Chat area */ +.chat-container { + flex: 1; + overflow-y: auto; + padding: 20px; + scroll-behavior: smooth; +} + +.chat-container::-webkit-scrollbar { + width: 6px; +} +.chat-container::-webkit-scrollbar-track { + background: transparent; +} +.chat-container::-webkit-scrollbar-thumb { + background: var(--border); + border-radius: 3px; +} + +.welcome { + text-align: center; + padding: 60px 20px; + max-width: 600px; + 
margin: 0 auto; +} + +.welcome h2 { + font-size: 22px; + margin-bottom: 12px; + color: var(--text-primary); +} + +.welcome p { + color: var(--text-secondary); + line-height: 1.6; + margin-bottom: 20px; + font-size: 14px; +} + +.suggestions { + display: flex; + flex-wrap: wrap; + gap: 8px; + justify-content: center; +} + +.suggestion { + background: var(--bg-tertiary); + border: 1px solid var(--border); + color: var(--text-secondary); + padding: 8px 14px; + border-radius: 20px; + font-size: 13px; + cursor: pointer; + transition: all 0.2s; +} + +.suggestion:hover { + border-color: var(--accent); + color: var(--text-primary); + background: rgba(108, 92, 231, 0.1); +} + +/* Messages */ +.message { + display: flex; + gap: 12px; + margin-bottom: 16px; + max-width: 800px; + margin-left: auto; + margin-right: auto; + animation: fadeIn 0.3s ease; +} + +@keyframes fadeIn { + from { opacity: 0; transform: translateY(8px); } + to { opacity: 1; transform: translateY(0); } +} + +.message.user { + flex-direction: row-reverse; +} + +.message-avatar { + width: 32px; + height: 32px; + border-radius: 8px; + display: flex; + align-items: center; + justify-content: center; + font-size: 14px; + flex-shrink: 0; +} + +.message.user .message-avatar { + background: var(--user-bg); +} + +.message.assistant .message-avatar { + background: var(--accent); +} + +.message-content { + padding: 10px 16px; + border-radius: 12px; + max-width: 75%; + line-height: 1.6; + font-size: 14px; + white-space: pre-wrap; + word-wrap: break-word; +} + +.message.user .message-content { + background: var(--user-bg); + border-bottom-right-radius: 4px; +} + +.message.assistant .message-content { + background: var(--bot-bg); + border: 1px solid var(--border); + border-bottom-left-radius: 4px; +} + +.message-content p { + margin-bottom: 8px; +} +.message-content p:last-child { + margin-bottom: 0; +} + +.message-content strong { + color: #a29bfe; +} + +.message-content code { + background: rgba(108, 92, 231, 0.15); + 
padding: 1px 5px; + border-radius: 3px; + font-size: 13px; +} + +/* Typing indicator */ +.typing { + display: flex; + gap: 4px; + padding: 4px 0; +} + +.typing span { + width: 6px; + height: 6px; + background: var(--text-secondary); + border-radius: 50%; + animation: bounce 1.4s infinite; +} + +.typing span:nth-child(2) { animation-delay: 0.2s; } +.typing span:nth-child(3) { animation-delay: 0.4s; } + +@keyframes bounce { + 0%, 60%, 100% { transform: translateY(0); } + 30% { transform: translateY(-6px); } +} + +/* Input area */ +.input-container { + padding: 16px 20px; + background: var(--bg-secondary); + border-top: 1px solid var(--border); + flex-shrink: 0; +} + +.input-wrapper { + display: flex; + gap: 10px; + max-width: 800px; + margin: 0 auto; +} + +#message-input { + flex: 1; + background: var(--bg-tertiary); + border: 1px solid var(--border); + color: var(--text-primary); + padding: 12px 16px; + border-radius: 12px; + font-size: 14px; + font-family: inherit; + resize: none; + outline: none; + min-height: 44px; + max-height: 120px; + transition: border-color 0.2s; +} + +#message-input:focus { + border-color: var(--accent); + box-shadow: 0 0 0 2px var(--accent-glow); +} + +#message-input::placeholder { + color: var(--text-secondary); +} + +#send-btn { + background: var(--accent); + color: white; + border: none; + width: 44px; + height: 44px; + border-radius: 12px; + cursor: pointer; + display: flex; + align-items: center; + justify-content: center; + transition: all 0.2s; + flex-shrink: 0; +} + +#send-btn:hover { + background: #5b4bd5; + transform: scale(1.05); +} + +#send-btn:disabled { + opacity: 0.5; + cursor: not-allowed; + transform: none; +} + +#send-btn svg { + width: 18px; + height: 18px; +} + +.disclaimer { + text-align: center; + font-size: 11px; + color: var(--text-secondary); + margin-top: 8px; + max-width: 800px; + margin-left: auto; + margin-right: auto; +} + +/* Mobile */ +@media (max-width: 640px) { + .header { padding: 10px 14px; } + 
.chat-container { padding: 12px; } + .message-content { max-width: 85%; font-size: 13px; } + .input-container { padding: 10px 14px; } + .suggestions { gap: 6px; } + .suggestion { font-size: 12px; padding: 6px 12px; } +} diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..6e873b6 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,50 @@ +services: + erowid-bot: + build: . + container_name: erowid-bot + restart: unless-stopped + env_file: .env + ports: + - "8421:8000" + depends_on: + erowid-db: + condition: service_healthy + networks: + - default + - traefik-public + - ai-internal + labels: + - "traefik.enable=true" + - "traefik.http.routers.erowid-bot.rule=Host(`erowid.jeffemmett.com`)" + - "traefik.http.routers.erowid-bot.entrypoints=websecure" + - "traefik.http.routers.erowid-bot.tls.certresolver=letsencrypt" + - "traefik.http.services.erowid-bot.loadbalancer.server.port=8000" + volumes: + - ./app:/app/app + + erowid-db: + image: pgvector/pgvector:pg16 + container_name: erowid-db + restart: unless-stopped + environment: + POSTGRES_USER: ${POSTGRES_USER:-erowid} + POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-erowid} + POSTGRES_DB: ${POSTGRES_DB:-erowid} + volumes: + - erowid-db-data:/var/lib/postgresql/data + healthcheck: + test: ["CMD-SHELL", "pg_isready -U erowid"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - default + +volumes: + erowid-db-data: + +networks: + traefik-public: + external: true + ai-internal: + external: true diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..da37b51 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,16 @@ +fastapi==0.115.6 +uvicorn[standard]==0.34.0 +sqlalchemy==2.0.36 +asyncpg==0.30.0 +pgvector==0.3.6 +psycopg2-binary==2.9.10 +httpx==0.28.1 +python-dotenv==1.0.1 +beautifulsoup4==4.12.3 +lxml==5.3.0 +pydantic==2.10.3 +pydantic-settings==2.7.0 +sse-starlette==2.2.1 +tiktoken==0.8.0 +anthropic==0.40.0 +openai==1.58.1