import asyncio
import re
import logging
from datetime import datetime, timezone

import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database import async_session
from app.models import Experience

logger = logging.getLogger(__name__)

BASE_URL = "https://erowid.org"
EXP_LIST_URL = "https://erowid.org/experiences/exp_list.shtml"
REPORT_URL = "https://erowid.org/experiences/exp.php?ID={id}"

HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}

# Suffixes identifying category sub-pages (e.g. exp_LSD_General.shtml), which
# contain subsets of the substance's main page. Hoisted to module level so the
# membership test is not rebuilt for every link on the master index.
_CATEGORY_SUFFIXES = (
    "_General",
    "_First_Times",
    "_Combinations",
    "_Retrospective",
    "_Preparation",
    "_Difficult_Experiences",
    "_Bad_Trips",
    "_Health_Problems",
    "_Train_Wrecks",
    "_Glowing_Experiences",
    "_Mystical_Experiences",
    "_Health_Benefits",
    "_What_Was_in_That",
    "_Medical_Use",
    "_Performance_Enhancement",
    "_Addiction",
)


async def get_all_substance_pages(client: httpx.AsyncClient) -> list[dict]:
    """Get main substance experience listing pages from the master index.

    Only fetches top-level substance pages (e.g. exp_LSD.shtml), not category
    sub-pages (e.g. exp_LSD_General.shtml) since the main page already
    contains all report IDs for that substance.

    Returns:
        A list of ``{"name": ..., "url": ...}`` dicts, deduplicated by URL.

    Raises:
        httpx.HTTPStatusError: if the master index itself cannot be fetched.
    """
    resp = await client.get(EXP_LIST_URL, headers=HEADERS, timeout=60)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")

    pages: list[dict] = []
    seen_urls: set[str] = set()
    for a in soup.select("a[href]"):
        href = a.get("href", "")
        name = a.get_text(strip=True)
        if not href.startswith("subs/exp_") or not href.endswith(".shtml") or not name:
            continue
        full_url = f"{BASE_URL}/experiences/{href}"
        # Extract the base page name from the URL,
        # e.g. subs/exp_LSD.shtml -> LSD, subs/exp_LSD_General.shtml -> LSD_General
        filename = href.replace("subs/exp_", "").replace(".shtml", "")
        # Skip category sub-pages -- they contain subsets of the main page.
        # str.endswith accepts a tuple, so one call covers every suffix.
        if filename.endswith(_CATEGORY_SUFFIXES):
            continue
        if full_url not in seen_urls:
            seen_urls.add(full_url)
            pages.append({"name": name, "url": full_url})

    logger.info(
        "Found %d main substance experience pages (filtered from category sub-pages)",
        len(pages),
    )
    return pages


async def get_experience_ids_from_page(client: httpx.AsyncClient, url: str) -> list[int]:
    """Extract all experience report IDs from a substance listing page.

    Returns a deduplicated (unordered) list of IDs; on any HTTP failure logs
    a warning and returns an empty list so one bad page cannot abort a crawl.
    """
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning("Failed to fetch %s: %s", url, e)
        return []
    # IDs appear in links of the form exp.php?ID=12345; a set dedupes them.
    return list({int(x) for x in re.findall(r"exp\.php\?ID=(\d+)", resp.text)})


async def get_all_experience_ids(client: httpx.AsyncClient) -> list[int]:
    """Collect all unique experience IDs from all substance pages.

    Fetches pages concurrently in batches of 5 for speed, sleeping briefly
    between batches to stay polite to the server.

    Returns:
        A sorted list of unique experience IDs.
    """
    pages = await get_all_substance_pages(client)
    all_ids: set[int] = set()
    batch_size = 5
    for i in range(0, len(pages), batch_size):
        batch = pages[i : i + batch_size]
        tasks = [get_experience_ids_from_page(client, p["url"]) for p in batch]
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            # Non-list results are exceptions surfaced by gather(); the
            # per-page fetcher already logs HTTP failures, so just skip.
            if isinstance(result, list):
                all_ids.update(result)
        logger.info(
            "Scanned %d/%d pages, %d unique IDs",
            min(i + batch_size, len(pages)),
            len(pages),
            len(all_ids),
        )
        await asyncio.sleep(0.5)
    logger.info("Found %d unique experience IDs total", len(all_ids))
    return sorted(all_ids)


def _extract_body_text(body_div) -> str:
    """Join the report body's text parts, skipping any <table> children.

    The dosechart table is a direct child of the body div; skipping tables
    here avoids duplicating dose info that is prepended to the body later.
    """
    parts = []
    for el in body_div.children:
        if hasattr(el, "name") and el.name == "table":
            continue  # skip dosechart table
        # NavigableStrings have no get_text(); fall back to plain str().
        text = el.get_text(separator="\n", strip=True) if hasattr(el, "get_text") else str(el).strip()
        if text:
            parts.append(text)
    return "\n\n".join(parts)


def _extract_dose_text(soup) -> str:
    """Flatten the dosechart table into a single 'row; row; ...' string."""
    dose_table = soup.select_one("table.dosechart")
    if not dose_table:
        return ""
    dose_parts = []
    for row in dose_table.select("tr"):
        cells = row.select("td")
        row_text = " ".join(c.get_text(strip=True) for c in cells if c.get_text(strip=True))
        if row_text:
            dose_parts.append(row_text)
    return "; ".join(dose_parts)


def _extract_demographics(soup) -> tuple[str, str]:
    """Return (gender, age) parsed from the bodyweight table, '' when absent."""
    gender = ""
    age = ""
    weight_el = soup.select_one("table.bodyweight")
    if weight_el:
        wt = weight_el.get_text(strip=True)
        age_match = re.search(r"(\d+)\s*yr", wt, re.IGNORECASE)
        if age_match:
            age = age_match.group(1)
        lowered = wt.lower()
        # BUGFIX: "female" contains "male", so it must be tested first;
        # checking "male" first misclassified every female report as Male.
        if "female" in lowered:
            gender = "Female"
        elif "male" in lowered:
            gender = "Male"
    return gender, age


async def scrape_experience_report(client: httpx.AsyncClient, erowid_id: int) -> dict | None:
    """Scrape a single experience report.

    Returns:
        A dict of fields matching the Experience model, or None when the
        fetch fails, the page has no report body, or the body is too short
        (< 50 chars) to be a real report.
    """
    url = REPORT_URL.format(id=erowid_id)
    try:
        resp = await client.get(url, headers=HEADERS, timeout=30)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning("Failed to fetch experience %s: %s", erowid_id, e)
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Extract the main report body; bail out early on junk pages.
    body_div = soup.select_one("div.report-text-surround")
    if not body_div:
        logger.warning("No report body found for %s", erowid_id)
        return None
    body = _extract_body_text(body_div)
    if not body or len(body) < 50:
        return None

    # Metadata fields -- each is optional on the page, defaulting to "".
    title = ""
    title_el = soup.select_one("div.title")
    if title_el:
        title = title_el.get_text(strip=True)

    substance = ""
    sub_el = soup.select_one("div.substance")
    if sub_el:
        substance = sub_el.get_text(strip=True)
    substance_list = [s.strip() for s in re.split(r"[,&]", substance) if s.strip()]

    author = ""
    author_el = soup.select_one("div.author")
    if author_el:
        # BUGFIX: strip only a LEADING "by " -- replace("by ", "") removed
        # every occurrence and mangled names containing "by " mid-string.
        author = re.sub(r"^by\s+", "", author_el.get_text(strip=True))

    # Dosage info is prepended to the body so it survives as plain text.
    dose_text = _extract_dose_text(soup)
    if dose_text:
        body = f"Dosage: {dose_text}\n\n{body}"

    gender, age = _extract_demographics(soup)

    category = ""
    cat_el = soup.select_one("div.foot-eroid-cat")
    if cat_el:
        category = cat_el.get_text(strip=True)

    return {
        "erowid_id": erowid_id,
        "title": title,
        "author": author,
        "substance": substance,
        "substance_list": substance_list,
        "body": body,
        "category": category,
        "gender": gender,
        "age": age,
        "url": url,
        "raw_html": resp.text,
    }


async def scrape_all_experiences(limit: int | None = None):
    """Main scraper entry point. Scrapes all experience reports into the database.

    Skips IDs already present in the database, commits in batches of
    ``settings.scrape_batch_size``, and sleeps ``settings.scrape_delay``
    between reports.

    Args:
        limit: if truthy, cap the number of candidate IDs considered.

    Returns:
        The number of newly scraped experiences.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        all_ids = await get_all_experience_ids(client)
        if limit:
            all_ids = all_ids[:limit]

        async with async_session() as db:
            # Resume support: only scrape IDs not already stored.
            result = await db.execute(select(Experience.erowid_id))
            existing_ids = {row[0] for row in result.fetchall()}
            logger.info("Already have %d experiences in DB", len(existing_ids))

            to_scrape = [eid for eid in all_ids if eid not in existing_ids]
            logger.info("Need to scrape %d new experiences", len(to_scrape))

            scraped = 0
            errors = 0
            for eid in to_scrape:
                data = await scrape_experience_report(client, eid)
                if data:
                    db.add(Experience(**data))
                    scraped += 1
                    # Commit in batches so progress survives a crash.
                    if scraped % settings.scrape_batch_size == 0:
                        await db.commit()
                        logger.info(
                            "Committed batch: %d/%d scraped (%d errors)",
                            scraped,
                            len(to_scrape),
                            errors,
                        )
                else:
                    errors += 1
                await asyncio.sleep(settings.scrape_delay)

            # Final commit picks up any partial last batch.
            await db.commit()
            logger.info("Done! Scraped %d new experiences (%d errors)", scraped, errors)
            return scraped