"""Erowid substance-vault scraper.

Collects substance vault URLs from the chemicals and plants indexes,
scrapes each vault's main page and known sub-pages, and stores the
results as `Substance` rows via the async SQLAlchemy session.
"""
import asyncio
|
|
import re
|
|
import logging
|
|
from datetime import datetime, timezone
|
|
|
|
import httpx
|
|
from bs4 import BeautifulSoup
|
|
from sqlalchemy import select
|
|
from sqlalchemy.ext.asyncio import AsyncSession
|
|
|
|
from app.config import settings
|
|
from app.database import async_session
|
|
from app.models import Substance
|
|
|
|
# Module-level logger; callers configure handlers/levels elsewhere.
logger = logging.getLogger(__name__)

# Site root; root-relative hrefs are joined against this.
BASE_URL = "https://erowid.org"
# Index page listing all chemical substance vaults.
VAULT_INDEX = "https://erowid.org/chemicals/"

# Sent with every request: identify the bot and accept HTML responses.
HEADERS = {
    "User-Agent": "ErowidResearchBot/1.0 (educational research project)",
    "Accept": "text/html,application/xhtml+xml",
}

# Known substance categories on Erowid
# NOTE(review): not referenced by the code visible in this file — presumably
# used by other modules importing this one; confirm before removing.
CATEGORIES = [
    "Psychedelics", "Empathogens", "Stimulants", "Depressants",
    "Dissociatives", "Cannabis", "Opioids", "Nootropics",
    "Plants & Herbs", "Pharmaceuticals", "Research Chemicals",
]
|
# Bare relative hrefs like "lsd/" also point at vaults on the index pages.
_RELATIVE_VAULT_HREF = re.compile(r"^[\w_-]+/?$")


def _resolve_url(href: str, index_url: str) -> str:
    """Resolve a scraped href to an absolute URL against *index_url*."""
    if href.startswith("http"):
        return href
    if href.startswith("/"):
        return f"{BASE_URL}{href}"
    return f"{index_url}{href}"


def _extract_vault_links(soup, section: str, index_url: str,
                         category: str | None = None) -> list[dict]:
    """Collect substance-vault links from a parsed index page.

    Keeps anchors whose href looks like ``/<section>/<slug>/`` or a bare
    relative ``<slug>/``, skipping anchors with empty/one-character text
    or "["-prefixed navigation labels.  Each result dict has ``name`` and
    ``url`` keys, plus ``category`` when one is given.
    """
    absolute_pat = re.compile(rf"^/{section}/[\w_-]+/?$")
    entries = []
    for anchor in soup.select("a[href]"):
        href = anchor.get("href", "")
        text = anchor.get_text(strip=True)
        if not (absolute_pat.match(href) or _RELATIVE_VAULT_HREF.match(href)):
            continue
        if not text or len(text) <= 1 or text.startswith("["):
            continue
        entry = {"name": text, "url": _resolve_url(href, index_url)}
        if category is not None:
            entry["category"] = category
        entries.append(entry)
    return entries


async def get_substance_urls(client: httpx.AsyncClient) -> list[dict]:
    """Get all substance vault URLs from the chemicals and plants indexes.

    Returns a list of dicts with keys ``name``, ``url`` and — for plants
    only — ``category``.  Index pages link the same vault several times,
    so results are de-duplicated by case-insensitive name (first
    occurrence wins), preventing duplicate DB inserts downstream.

    Raises:
        httpx.HTTPError: if the chemicals index cannot be fetched.
        (A failed plants fetch is logged and skipped instead.)
    """
    resp = await client.get(VAULT_INDEX, headers=HEADERS)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "lxml")
    substances = _extract_vault_links(soup, "chemicals", VAULT_INDEX)

    # Also scrape the plants/herbs section (best-effort).
    plants_url = f"{BASE_URL}/plants/"
    try:
        resp2 = await client.get(plants_url, headers=HEADERS)
        resp2.raise_for_status()
    except httpx.HTTPError:
        logger.warning("Failed to fetch plants index")
    else:
        soup2 = BeautifulSoup(resp2.text, "lxml")
        substances.extend(
            _extract_vault_links(soup2, "plants", plants_url,
                                 category="Plants & Herbs")
        )

    # De-duplicate: the same substance is commonly linked more than once.
    seen: set[str] = set()
    unique: list[dict] = []
    for sub in substances:
        key = sub["name"].lower()
        if key not in seen:
            seen.add(key)
            unique.append(sub)

    logger.info(f"Found {len(unique)} substance URLs")
    return unique
|
|
|
|
|
|
async def scrape_substance_vault(client: httpx.AsyncClient, name: str, url: str) -> dict | None:
    """Scrape a substance vault page for key information.

    Fetches the vault main page at *url*, then probes a fixed set of
    known sub-page paths (effects, dose, duration, chemistry, health,
    law) beneath it, keeping the first path per section that returns
    HTTP 200 with recognizable content.

    Args:
        client: Shared async HTTP client.
        name: Substance display name (used only for logging).
        url: Absolute URL of the vault main page.

    Returns:
        A dict with keys ``name``, ``url``, ``description``, ``effects``,
        ``dosage``, ``duration``, ``chemistry``, ``health``, ``law`` and
        ``raw_html``; section values are "" when no sub-page was found.
        Returns None if the main page fetch fails.
    """
    try:
        resp = await client.get(url, headers=HEADERS)
        resp.raise_for_status()
    except httpx.HTTPError as e:
        logger.warning(f"Failed to fetch substance {name}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")
    # Keep the unparsed page too — stored verbatim in the result.
    raw_html = resp.text

    # Extract text content from the vault main page
    description = ""
    # Fall back through progressively broader containers; "body" matches
    # any well-formed page, so description is nearly always populated.
    main_content = soup.select_one("div.sum-content") or soup.select_one("td.content") or soup.select_one("body")
    if main_content:
        # Truncated to 5000 chars — presumably to bound DB row size; confirm.
        description = main_content.get_text(separator="\n", strip=True)[:5000]

    # Try to find sub-pages: effects, dose, duration, health, law
    # Each section maps to candidate paths tried in order; the first hit wins.
    sections = {}
    sub_pages = {
        "effects": ["effects", "effects.shtml"],
        "dosage": ["dose", "dose.shtml", "dosage.shtml"],
        "duration": ["duration", "duration.shtml", "timeline.shtml"],
        "chemistry": ["chemistry", "chemistry.shtml"],
        "health": ["health", "health.shtml", "warnings.shtml"],
        "law": ["law", "law.shtml", "legal.shtml"],
    }

    for section_name, paths in sub_pages.items():
        for path in paths:
            sub_url = f"{url.rstrip('/')}/{path}"
            try:
                sub_resp = await client.get(sub_url, headers=HEADERS)
                if sub_resp.status_code == 200:
                    sub_soup = BeautifulSoup(sub_resp.text, "lxml")
                    # No "body" fallback here: a 200 without a known content
                    # container (e.g. a soft-404) is treated as no content.
                    content_el = sub_soup.select_one("div.sum-content") or sub_soup.select_one("td.content")
                    if content_el:
                        sections[section_name] = content_el.get_text(separator="\n", strip=True)[:3000]
                        # Stop trying alternate paths for this section.
                        break
                # Politeness delay between sub-page probes.
                await asyncio.sleep(0.5)
            except httpx.HTTPError:
                # Transport error on this candidate path — try the next one.
                continue

    return {
        "name": name,
        "url": url,
        "description": description,
        "effects": sections.get("effects", ""),
        "dosage": sections.get("dosage", ""),
        "duration": sections.get("duration", ""),
        "chemistry": sections.get("chemistry", ""),
        "health": sections.get("health", ""),
        "law": sections.get("law", ""),
        "raw_html": raw_html,
    }
|
|
|
|
|
|
async def scrape_all_substances(limit: int | None = None):
    """Main entry point: scrape all substance vaults into the database.

    Args:
        limit: Maximum number of index entries to consider; ``None``
            means no limit.  An explicit ``0`` now scrapes nothing
            (previously ``0`` was treated as "no limit" via truthiness).

    Returns:
        The number of newly scraped substances committed to the DB.
    """
    async with httpx.AsyncClient(timeout=30, follow_redirects=True) as client:
        substance_list = await get_substance_urls(client)

        # `is not None` rather than truthiness so limit=0 is honored.
        if limit is not None:
            substance_list = substance_list[:limit]

        async with async_session() as db:
            # Skip substances already stored (case-insensitive name match).
            result = await db.execute(select(Substance.name))
            existing = {row[0].lower() for row in result.fetchall()}
            logger.info(f"Already have {len(existing)} substances in DB")

            # Filter out known names AND de-duplicate within this run:
            # the index can list the same substance more than once, and
            # the DB check alone would let both copies through.
            seen = set(existing)
            to_scrape = []
            for sub_meta in substance_list:
                key = sub_meta["name"].lower()
                if key not in seen:
                    seen.add(key)
                    to_scrape.append(sub_meta)
            logger.info(f"Need to scrape {len(to_scrape)} new substances")

            scraped = 0
            for sub_meta in to_scrape:
                data = await scrape_substance_vault(
                    client, sub_meta["name"], sub_meta["url"]
                )
                if data:
                    # Category is only present for plants entries; default "".
                    data["category"] = sub_meta.get("category", "")
                    db.add(Substance(**data))
                    scraped += 1

                    # Commit in batches of 10 so a crash loses little work.
                    # Checked only after a successful add, so it cannot
                    # fire repeatedly while scraped sits at a multiple of 10.
                    if scraped % 10 == 0:
                        await db.commit()
                        logger.info(f"Committed: {scraped} substances scraped")

                # Politeness delay between vault fetches (even after failures).
                await asyncio.sleep(settings.scrape_delay)

            await db.commit()
            logger.info(f"Done! Scraped {scraped} new substances")
            return scraped