# app/scraper/substances.py
# Erowid substance-vault scraper: collects vault URLs from the chemicals and
# plants indexes and stores each substance's page content in the database.
import asyncio
import re
import logging
from datetime import datetime, timezone
import httpx
from bs4 import BeautifulSoup
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import settings
from app.database import async_session
from app.models import Substance
logger = logging.getLogger(__name__)
# Site roots used to resolve relative links found on the index pages.
BASE_URL = "https://erowid.org"
VAULT_INDEX = "https://erowid.org/chemicals/"
# Sent with every request; identifies the bot per polite-scraping practice.
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}
# Known substance categories on Erowid
CATEGORIES = [
    "Psychedelics", "Empathogens", "Stimulants", "Depressants",
    "Dissociatives", "Cannabis", "Opioids", "Nootropics",
    "Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
]
async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Collect substance vault URLs from the chemicals and plants indexes.

    Args:
        client: Open ``httpx.AsyncClient`` used for all requests.

    Returns:
        A list of dicts with keys ``name``, ``url`` and — for plants only —
        ``category``. Entries are de-duplicated by resolved URL (the
        original appended one entry per occurrence of a link).

    Raises:
        httpx.HTTPError: If the main chemicals index cannot be fetched.
            A failure on the plants index is logged and tolerated.
    """
    substances: list[dict] = []
    seen_urls: set[str] = set()

    def _collect(soup: BeautifulSoup, section: str, index_url: str,
                 category: str | None = None) -> None:
        """Append de-duplicated vault links found in *soup*."""
        # Absolute-path vault links, e.g. /chemicals/lsd/ or /plants/salvia/.
        abs_pat = re.compile(rf"^/{section}/[\w_-]+/?$")
        # Index-relative links, e.g. lsd/.
        rel_pat = re.compile(r"^[\w_-]+/?$")
        for link in soup.select("a[href]"):
            href = link.get("href", "")
            text = link.get_text(strip=True)
            if not (abs_pat.match(href) or rel_pat.match(href)):
                continue
            # Skip empty/one-char labels and bracketed nav links like "[edit]".
            if not text or len(text) <= 1 or text.startswith("["):
                continue
            if href.startswith("http"):
                full_url = href
            elif href.startswith("/"):
                full_url = f"{BASE_URL}{href}"
            else:
                full_url = f"{index_url}{href}"
            # De-duplicate: index pages often link the same vault repeatedly.
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            entry = {"name": text, "url": full_url}
            if category:
                entry["category"] = category
            substances.append(entry)

    # The chemicals index is mandatory; let HTTP errors propagate.
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    _collect(BeautifulSoup(resp.text, "lxml"), "chemicals", VAULT_INDEX)

    # The plants/herbs index is best-effort: a failure must not abort the run.
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")
    else:
        _collect(BeautifulSoup(resp2.text, "lxml"), "plants", plants_url,
                 category="Plants & Herbs")

    logger.info("Found %d substance URLs", len(substances))
    return substances
async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault main page plus its known sub-pages.

    Args:
        client: Open ``httpx.AsyncClient`` used for all requests.
        name: Display name of the substance (stored verbatim).
        url: Root URL of the substance vault.

    Returns:
        A dict with keys ``name``, ``url``, ``description``, one key per
        section (``effects``/``dosage``/``duration``/``chemistry``/
        ``health``/``law``, empty string when not found) and ``raw_html``,
        or ``None`` if the main page cannot be fetched.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning("Failed to fetch substance %s: %s", name, e)
        return None
    soup = BeautifulSoup(resp.text, "lxml")
    raw_html = resp.text

    # Description from the vault main page, capped to keep DB rows bounded.
    description = ""
    main_content = (
        soup.select_one("div.sum-content")
        or soup.select_one("td.content")
        or soup.select_one("body")
    )
    if main_content:
        description = main_content.get_text(separator="\n", strip=True)[:5000]

    # Candidate sub-page paths per section; the first usable hit wins.
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }
    sections: dict[str, str] = {}
    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
            except httpx.HTTPError:
                continue
            finally:
                # BUG FIX: the original slept only on iterations that did NOT
                # break, so successful fetches fired back-to-back requests.
                # Throttle after every sub-page request, success or failure.
                await asyncio.sleep(0.5)
            if sub_resp.status_code == 200:
                sub_soup = BeautifulSoup(sub_resp.text, "lxml")
                content_el = (
                    sub_soup.select_one("div.sum-content")
                    or sub_soup.select_one("td.content")
                )
                if content_el:
                    sections[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
                    break

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }
async def scrape_all_substances(limit: int | None = None):
    """Main entry point: scrape all substance vaults into the database.

    Args:
        limit: Optional cap on how many index entries to consider.

    Returns:
        The number of newly scraped substances committed this run.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)
        if limit:
            substance_list = substance_list[:limit]
        async with async_session() as db:
            # Skip substances already in the DB (case-insensitive by name).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info("Already have %d substances in DB", len(existing))

            # BUG FIX: also de-duplicate within this run. The index pages can
            # list the same name more than once, and the original inserted one
            # row per occurrence because `existing` was never extended.
            seen_names = set(existing)
            to_scrape = []
            for meta in substance_list:
                key = meta["name"].lower()
                if key not in seen_names:
                    seen_names.add(key)
                    to_scrape.append(meta)
            logger.info("Need to scrape %d new substances", len(to_scrape))

            scraped = 0
            for sub_meta in to_scrape:
                data = await scrape_substance_vault(
                    client, sub_meta["name"], sub_meta["url"]
                )
                if data:
                    data["category"] = sub_meta.get("category", "")
                    db.add(Substance(**data))
                    scraped += 1
                    # Periodic commits keep transactions small.
                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info("Committed: %d substances scraped", scraped)
                # Politeness delay between vault scrapes, even on failures.
                await asyncio.sleep(settings.scrape_delay)
            await db.commit()
            logger.info("Done! Scraped %d new substances", scraped)
            return scraped