import asyncio
import re
import logging
from datetime import datetime, timezone

import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database import async_session
from app.models import Substance

logger = logging.getLogger(__name__)

BASE_URL = "https://erowid.org"
VAULT_INDEX = "https://erowid.org/chemicals/"
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}

# Known substance categories on Erowid
CATEGORIES = [
    "Psychedelics",
    "Empathogens",
    "Stimulants",
    "Depressants",
    "Dissociatives",
    "Cannabis",
    "Opioids",
    "Nootropics",
    "Plants & Herbs",
    "Pharmaceuticals",
    "Research Chemicals",
]

# Link patterns, compiled once instead of on every <a> tag in the loops.
# Vault links are either absolute ("/chemicals/lsd/") or relative ("lsd/").
_CHEMICALS_LINK_RE = re.compile(r"^/chemicals/[\w_-]+/?$")
_PLANTS_LINK_RE = re.compile(r"^/plants/[\w_-]+/?$")
_RELATIVE_LINK_RE = re.compile(r"^[\w_-]+/?$")


async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Collect substance vault URLs from the chemicals and plants indexes.

    Returns a list of dicts with ``name`` and ``url`` keys; entries found
    on the plants index additionally carry ``category``.  Results are
    de-duplicated by lowercased name: index pages commonly link the same
    vault several times, and the downstream "already scraped" check in
    scrape_all_substances is also name-based, so duplicates here would
    become duplicate DB rows within a single run.
    """
    substances: list[dict] = []
    seen_names: set[str] = set()

    def _add(name: str, url: str, category: str | None = None) -> None:
        # Skip names we have already collected (case-insensitive).
        key = name.lower()
        if key in seen_names:
            return
        seen_names.add(key)
        entry = {"name": name, "url": url}
        if category is not None:
            entry["category"] = category
        substances.append(entry)

    def _harvest(
        page_soup: BeautifulSoup,
        abs_pattern: re.Pattern,
        index_url: str,
        category: str | None = None,
    ) -> None:
        # Shared harvesting logic for both index pages (previously
        # copy-pasted for chemicals and plants).
        for link in page_soup.select("a[href]"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            # Filter for substance vault links (e.g., /chemicals/lsd/).
            if abs_pattern.match(href) or _RELATIVE_LINK_RE.match(href):
                # Drop empty/one-char anchors and bracketed nav links ("[top]").
                if text and len(text) > 1 and not text.startswith("["):
                    if href.startswith("http"):
                        full_url = href
                    elif href.startswith("/"):
                        full_url = f"{BASE_URL}{href}"
                    else:
                        full_url = f"{index_url}{href}"
                    _add(text, full_url, category)

    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    _harvest(BeautifulSoup(resp.text, "lxml"), _CHEMICALS_LINK_RE, VAULT_INDEX)

    # Also scrape the plants/herbs section; failure here is best-effort
    # and must not abort the chemicals results.
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")
    else:
        _harvest(
            BeautifulSoup(resp2.text, "lxml"),
            _PLANTS_LINK_RE,
            plants_url,
            "Plants & Herbs",
        )

    logger.info("Found %d substance URLs", len(substances))
    return substances


async def scrape_substance_vault(
    client: httpx.AsyncClient, name: str, url: str
) -> dict | None:
    """Scrape a substance vault page and its standard sub-pages.

    Returns None when the main vault page cannot be fetched; otherwise a
    dict of truncated text fields (description, effects, dosage, duration,
    chemistry, health, law) plus the raw HTML of the main page.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning("Failed to fetch substance %s: %s", name, e)
        return None

    soup = BeautifulSoup(resp.text, "lxml")
    raw_html = resp.text

    # Extract text content from the vault main page.  Fall back through
    # progressively broader selectors — Erowid templates vary by vault age.
    description = ""
    main_content = (
        soup.select_one("div.sum-content")
        or soup.select_one("td.content")
        or soup.select_one("body")
    )
    if main_content:
        description = main_content.get_text(separator="\n", strip=True)[:5000]

    # Try to find sub-pages: effects, dose, duration, health, law.
    # For each section, the candidate paths are tried in order until one
    # yields parseable content.
    sections: dict[str, str] = {}
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }
    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
                if sub_resp.status_code == 200:
                    sub_soup = BeautifulSoup(sub_resp.text, "lxml")
                    content_el = sub_soup.select_one(
                        "div.sum-content"
                    ) or sub_soup.select_one("td.content")
                    if content_el:
                        sections[section_name] = content_el.get_text(
                            separator="\n", strip=True
                        )[:3000]
                        break
                # Be polite to the server between candidate probes.
                await asyncio.sleep(0.5)
            except httpx.HTTPError:
                continue

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }


async def scrape_all_substances(limit: int | None = None):
    """Main entry point: scrape all substance vaults.

    Args:
        limit: optional cap on how many index entries to consider
            (useful for testing).  ``None`` (the default) means no cap;
            ``0`` means scrape nothing.

    Returns:
        The number of newly scraped substances committed to the DB.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)
        # Explicit None-check so limit=0 is honored as "scrape nothing"
        # rather than silently meaning "no limit".
        if limit is not None:
            substance_list = substance_list[:limit]

        async with async_session() as db:
            # Skip substances already present in the DB (by name,
            # case-insensitive).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info("Already have %d substances in DB", len(existing))

            to_scrape = [
                s for s in substance_list if s["name"].lower() not in existing
            ]
            logger.info("Need to scrape %d new substances", len(to_scrape))

            scraped = 0
            for sub_meta in to_scrape:
                data = await scrape_substance_vault(
                    client, sub_meta["name"], sub_meta["url"]
                )
                if data:
                    # Chemicals-index entries carry no category; default "".
                    data["category"] = sub_meta.get("category", "")
                    db.add(Substance(**data))
                    scraped += 1
                    # Commit in batches of 10 so a mid-run crash loses
                    # at most a few substances' worth of work.
                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info("Committed: %d substances scraped", scraped)

                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info("Done! Scraped %d new substances", scraped)
            return scraped