245 lines
8.4 KiB
Python
245 lines
8.4 KiB
Python
import asyncio
|
|
import re
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.config import settings
|
|
from app.database import async_session
|
|
from app.models import Experience
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Site root (kept for building absolute links where needed).
BASE_URL = "https://erowid.org"

# Master index page listing every substance's experience pages.
EXP_LIST_URL = "https://erowid.org/experiences/exp_list.shtml"

# Template for a single experience report; format with the numeric report ID.
REPORT_URL = "https://erowid.org/experiences/exp.php?ID={id}"

# Sent with every request so the site can identify this scraper and its purpose.
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}
|
|
|
|
|
|
async def get_all_substance_pages(client: httpx.AsyncClient) -> list[dict]:
    """Get main substance experience listing pages from the master index.

    Only fetches top-level substance pages (e.g. exp_LSD.shtml), not
    category sub-pages (e.g. exp_LSD_General.shtml) since the main page
    already contains all report IDs for that substance.

    Args:
        client: Shared async HTTP client used for the request.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts, one per substance,
        deduplicated by URL.
    """
    resp = await client.get(EXP_LIST_URL, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    # Hoisted out of the loop (previously rebuilt every iteration):
    # suffixes identifying category sub-pages, which contain subsets of
    # the main substance page's report IDs and are therefore skipped.
    category_suffixes = (
        "_General", "_First_Times", "_Combinations", "_Retrospective",
        "_Preparation", "_Difficult_Experiences", "_Bad_Trips",
        "_Health_Problems", "_Train_Wrecks", "_Glowing_Experiences",
        "_Mystical_Experiences", "_Health_Benefits", "_What_Was_in_That",
        "_Medical_Use", "_Performance_Enhancement", "_Addiction",
    )

    pages = []
    seen_urls = set()
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        name = a.get_text(strip=True)
        if not href.startswith("subs/exp_") or not href.endswith(".shtml") or not name:
            continue

        # Extract the base substance name from the URL, e.g.
        # subs/exp_LSD.shtml -> LSD, subs/exp_LSD_General.shtml -> LSD_General.
        # removeprefix/removesuffix only trim the ends, unlike replace()
        # which would also alter matches in the middle of the name.
        filename = href.removeprefix("subs/exp_").removesuffix(".shtml")

        # str.endswith accepts a tuple of suffixes — one C-level call
        # instead of a Python-level any() over the list.
        if filename.endswith(category_suffixes):
            continue

        full_url = f"https://erowid.org/experiences/{href}"
        if full_url not in seen_urls:
            seen_urls.add(full_url)
            pages.append({"name": name, "url": full_url})

    logger.info(
        "Found %d main substance experience pages (filtered from category sub-pages)",
        len(pages),
    )
    return pages
|
|
|
|
|
|
async def get_experience_ids_from_page(client: httpx.AsyncClient, url: str) -> list[int]:
    """Extract all experience report IDs from a substance listing page."""
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch {url}: {e}")
        return []

    # Report links look like "exp.php?ID=12345" — pull out every numeric ID
    # and collapse duplicates via a set before returning.
    matches = re.findall(r"exp\.php\?ID=(\d+)", resp.text)
    unique_ids = {int(m) for m in matches}
    return list(unique_ids)
|
|
|
|
|
|
async def get_all_experience_ids(client: httpx.AsyncClient) -> list[int]:
    """Collect all unique experience IDs from all substance pages.

    Fetches pages concurrently in batches of 5 for speed, sleeping briefly
    between batches to stay polite to the server.

    Args:
        client: Shared async HTTP client used for all requests.

    Returns:
        Sorted list of unique report IDs across every substance page.
    """
    pages = await get_all_substance_pages(client)

    all_ids: set[int] = set()
    batch_size = 5
    for i in range(0, len(pages), batch_size):
        batch = pages[i : i + batch_size]
        tasks = [get_experience_ids_from_page(client, p["url"]) for p in batch]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for page, result in zip(batch, results):
            if isinstance(result, list):
                all_ids.update(result)
            else:
                # Fix: non-HTTP exceptions returned by gather() were
                # previously dropped silently; log them so failures are
                # visible. (HTTP errors are already handled inside
                # get_experience_ids_from_page and come back as [].)
                logger.warning("Error scanning %s: %r", page["url"], result)
        logger.info(
            "Scanned %d/%d pages, %d unique IDs",
            min(i + batch_size, len(pages)), len(pages), len(all_ids),
        )
        await asyncio.sleep(0.5)

    logger.info("Found %d unique experience IDs total", len(all_ids))
    return sorted(all_ids)
|
|
|
|
|
|
def _select_text(soup: BeautifulSoup, selector: str) -> str:
    """Return the stripped text of the first element matching *selector*, or ""."""
    el = soup.select_one(selector)
    return el.get_text(strip=True) if el else ""


def _extract_body_text(body_div) -> str:
    """Join the report body's child texts, skipping <table> elements (dosechart)."""
    parts = []
    for el in body_div.children:
        if hasattr(el, "name") and el.name == "table":
            continue  # skip dosechart table to avoid duplicating it in the body
        text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip()
        if text:
            parts.append(text)
    return "\n\n".join(parts)


def _extract_dose_text(soup: BeautifulSoup) -> str:
    """Flatten the dosechart table into one "; "-separated line, "" if absent."""
    dose_table = soup.select_one("table.dosechart")
    if not dose_table:
        return ""
    dose_parts = []
    for row in dose_table.select("tr"):
        cells = row.select("td")
        row_text = " ".join(c.get_text(strip=True) for c in cells if c.get_text(strip=True))
        if row_text:
            dose_parts.append(row_text)
    return "; ".join(dose_parts)


def _extract_demographics(soup: BeautifulSoup) -> tuple[str, str]:
    """Extract (gender, age) from the bodyweight table; "" for missing values."""
    gender = ""
    age = ""
    weight_el = soup.select_one("table.bodyweight")
    if weight_el:
        wt = weight_el.get_text(strip=True)
        age_match = re.search(r"(\d+)\s*yr", wt, re.IGNORECASE)
        if age_match:
            age = age_match.group(1)
        lowered = wt.lower()
        # Fix: check "female" FIRST — "female" contains the substring
        # "male", so the original `if "male" in ...` branch matched female
        # reports too and gender came out as "Male" for everyone.
        if "female" in lowered:
            gender = "Female"
        elif "male" in lowered:
            gender = "Male"
    return gender, age


async def scrape_experience_report(client: httpx.AsyncClient, erowid_id: int) -> dict | None:
    """Scrape a single experience report.

    Args:
        client: Shared async HTTP client.
        erowid_id: Numeric report ID to fetch.

    Returns:
        A dict of report fields (matching the ``Experience`` model columns),
        or ``None`` when the page cannot be fetched or has no usable body.
    """
    url = REPORT_URL.format(id=erowid_id)
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch experience {erowid_id}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the main report body.
    body_div = soup.select_one("div.report-text-surround")
    if not body_div:
        logger.warning(f"No report body found for {erowid_id}")
        return None

    body = _extract_body_text(body_div)
    if not body or len(body) < 50:
        # Too short to be a real report (stub / removed page).
        return None

    # Metadata fields.
    title = _select_text(soup, "div.title")
    substance = _select_text(soup, "div.substance")
    substance_list = [s.strip() for s in re.split(r"[,&]", substance) if s.strip()]

    # Fix: removeprefix only strips a LEADING "by ", whereas the original
    # replace() also mangled author names containing "by " internally.
    author = _select_text(soup, "div.author").removeprefix("by ")

    # Prepend dosage info (kept out of the body text above) when present.
    dose_text = _extract_dose_text(soup)
    if dose_text:
        body = f"Dosage: {dose_text}\n\n{body}"

    gender, age = _extract_demographics(soup)
    category = _select_text(soup, "div.foot-eroid-cat")

    return {
        "erowid_id": erowid_id,
        "title": title,
        "author": author,
        "substance": substance,
        "substance_list": substance_list,
        "body": body,
        "category": category,
        "gender": gender,
        "age": age,
        "url": url,
        "raw_html": resp.text,
    }
|
|
|
|
|
|
async def scrape_all_experiences(limit: int | None = None):
    """Main scraper entry point. Scrapes all experience reports into the database.

    Args:
        limit: If given, scrape at most this many IDs (``0`` scrapes
            nothing). ``None`` means no limit.

    Returns:
        The number of newly scraped experiences committed to the database.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        all_ids = await get_all_experience_ids(client)

        # Fix: explicit None check so limit=0 means "scrape nothing"
        # instead of being silently treated as "no limit" by truthiness.
        if limit is not None:
            all_ids = all_ids[:limit]

        async with async_session() as db:
            # Skip reports already stored so reruns only fetch new IDs.
            result = await db.execute(select(Experience.erowid_id))
            existing_ids = {row[0] for row in result.fetchall()}
            logger.info("Already have %d experiences in DB", len(existing_ids))

            to_scrape = [eid for eid in all_ids if eid not in existing_ids]
            logger.info("Need to scrape %d new experiences", len(to_scrape))

            scraped = 0
            errors = 0
            for eid in to_scrape:
                data = await scrape_experience_report(client, eid)
                if data:
                    db.add(Experience(**data))
                    scraped += 1
                    # Commit periodically so progress survives a crash.
                    if scraped % settings.scrape_batch_size == 0:
                        await db.commit()
                        logger.info(
                            "Committed batch: %d/%d scraped (%d errors)",
                            scraped, len(to_scrape), errors,
                        )
                else:
                    errors += 1

                # Be polite to the server between individual report fetches.
                await asyncio.sleep(settings.scrape_delay)

            # Final commit for the last partial batch.
            await db.commit()
            logger.info("Done! Scraped %d new experiences (%d errors)", scraped, errors)
            return scraped