"""Erowid substance-vault scraper.

Collects substance vault URLs from the chemicals and plants indexes,
scrapes each vault's main page and known sub-pages, and stores the
results as `Substance` rows via the async SQLAlchemy session.
"""
import asyncio
|
|
import re
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.config import settings
|
|
from app.database import async_session
|
|
from app.models import Substance
|
|
|
|
# Module-level logger; callers configure handlers/levels elsewhere.
logger = logging.getLogger(__name__)

# Site root; root-relative hrefs are joined against this.
BASE_URL = "https://erowid.org"
# Index page listing all chemical substance vaults.
VAULT_INDEX = "https://erowid.org/chemicals/"

# Sent with every request: identify the bot and accept HTML responses.
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}

# Known substance categories on Erowid
# NOTE(review): not referenced by the code visible in this file — presumably
# used by other modules importing this one; confirm before removing.
CATEGORIES = [
    "Psychedelics", "Empathogens", "Stimulants", "Depressants",
    "Dissociatives", "Cannabis", "Opioids", "Nootropics",
    "Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
]
|
# Bare relative hrefs like "lsd/" also point at vaults on the index pages.
_RELATIVE_VAULT_HREF = re.compile(r"^[\w_-]+/?$")


def _resolve_url(href: str, index_url: str) -> str:
    """Resolve a scraped href to an absolute URL against *index_url*."""
    if href.startswith("http"):
        return href
    if href.startswith("/"):
        return f"{BASE_URL}{href}"
    return f"{index_url}{href}"


def _extract_vault_links(soup, section: str, index_url: str,
                         category: str | None = None) -> list[dict]:
    """Collect substance-vault links from a parsed index page.

    Keeps anchors whose href looks like ``/<section>/<slug>/`` or a bare
    relative ``<slug>/``, skipping anchors with empty/one-character text
    or "["-prefixed navigation labels.  Each result dict has ``name`` and
    ``url`` keys, plus ``category`` when one is given.
    """
    absolute_pat = re.compile(rf"^/{section}/[\w_-]+/?$")
    entries = []
    for anchor in soup.select("a[href]"):
        href = anchor.get("href", "")
        text = anchor.get_text(strip=True)
        if not (absolute_pat.match(href) or _RELATIVE_VAULT_HREF.match(href)):
            continue
        if not text or len(text) <= 1 or text.startswith("["):
            continue
        entry = {"name": text, "url": _resolve_url(href, index_url)}
        if category is not None:
            entry["category"] = category
        entries.append(entry)
    return entries


async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Get all substance vault URLs from the chemicals and plants indexes.

    Returns a list of dicts with keys ``name``, ``url`` and — for plants
    only — ``category``.  Index pages link the same vault several times,
    so results are de-duplicated by case-insensitive name (first
    occurrence wins), preventing duplicate DB inserts downstream.

    Raises:
        httpx.HTTPError: if the chemicals index cannot be fetched.
        (A failed plants fetch is logged and skipped instead.)
    """
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    substances = _extract_vault_links(soup, "chemicals", VAULT_INDEX)

    # Also scrape the plants/herbs section (best-effort).
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")
    else:
        soup2 = BeautifulSoup(resp2.text, "lxml")
        substances.extend(
            _extract_vault_links(soup2, "plants", plants_url,
                                 category="Plants & Herbs")
        )

    # De-duplicate: the same substance is commonly linked more than once.
    seen: set[str] = set()
    unique: list[dict] = []
    for sub in substances:
        key = sub["name"].lower()
        if key not in seen:
            seen.add(key)
            unique.append(sub)

    logger.info(f"Found {len(unique)} substance URLs")
    return unique
|
|
|
|
|
|
async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault page for key information.

    Fetches the vault main page at *url*, then probes a fixed set of
    known sub-page paths (effects, dose, duration, chemistry, health,
    law) beneath it, keeping the first path per section that returns
    HTTP 200 with recognizable content.

    Args:
        client: Shared async HTTP client.
        name: Substance display name (used only for logging).
        url: Absolute URL of the vault main page.

    Returns:
        A dict with keys ``name``, ``url``, ``description``, ``effects``,
        ``dosage``, ``duration``, ``chemistry``, ``health``, ``law`` and
        ``raw_html``; section values are "" when no sub-page was found.
        Returns None if the main page fetch fails.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch substance {name}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")
    # Keep the unparsed page too — stored verbatim in the result.
    raw_html = resp.text

    # Extract text content from the vault main page
    description = ""
    # Fall back through progressively broader containers; "body" matches
    # any well-formed page, so description is nearly always populated.
    main_content = soup.select_one("div.sum-content") or soup.select_one("td.content") or soup.select_one("body")
    if main_content:
        # Truncated to 5000 chars — presumably to bound DB row size; confirm.
        description = main_content.get_text(separator="\n", strip=True)[:5000]

    # Try to find sub-pages: effects, dose, duration, health, law
    # Each section maps to candidate paths tried in order; the first hit wins.
    sections = {}
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }

    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
                if sub_resp.status_code == 200:
                    sub_soup = BeautifulSoup(sub_resp.text, "lxml")
                    # No "body" fallback here: a 200 without a known content
                    # container (e.g. a soft-404) is treated as no content.
                    content_el = sub_soup.select_one("div.sum-content") or sub_soup.select_one("td.content")
                    if content_el:
                        sections[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
                        # Stop trying alternate paths for this section.
                        break
                # Politeness delay between sub-page probes.
                await asyncio.sleep(0.5)
            except httpx.HTTPError:
                # Transport error on this candidate path — try the next one.
                continue

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }
|
|
|
|
|
|
async def scrape_all_substances(limit: int | None = None):
    """Main entry point: scrape all substance vaults into the database.

    Args:
        limit: Maximum number of index entries to consider; ``None``
            means no limit.  An explicit ``0`` now scrapes nothing
            (previously ``0`` was treated as "no limit" via truthiness).

    Returns:
        The number of newly scraped substances committed to the DB.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)

        # `is not None` rather than truthiness so limit=0 is honored.
        if limit is not None:
            substance_list = substance_list[:limit]

        async with async_session() as db:
            # Skip substances already stored (case-insensitive name match).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info(f"Already have {len(existing)} substances in DB")

            # Filter out known names AND de-duplicate within this run:
            # the index can list the same substance more than once, and
            # the DB check alone would let both copies through.
            seen = set(existing)
            to_scrape = []
            for sub_meta in substance_list:
                key = sub_meta["name"].lower()
                if key not in seen:
                    seen.add(key)
                    to_scrape.append(sub_meta)
            logger.info(f"Need to scrape {len(to_scrape)} new substances")

            scraped = 0
            for sub_meta in to_scrape:
                data = await scrape_substance_vault(
                    client, sub_meta["name"], sub_meta["url"]
                )
                if data:
                    # Category is only present for plants entries; default "".
                    data["category"] = sub_meta.get("category", "")
                    db.add(Substance(**data))
                    scraped += 1

                    # Commit in batches of 10 so a crash loses little work.
                    # Checked only after a successful add, so it cannot
                    # fire repeatedly while scraped sits at a multiple of 10.
                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info(f"Committed: {scraped} substances scraped")

                # Politeness delay between vault fetches (even after failures).
                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new substances")
            return scraped