"""MediaWiki XML dump parser - converts to structured JSON.""" import json import re from dataclasses import dataclass, field, asdict from pathlib import Path from typing import Iterator from lxml import etree from rich.progress import Progress, TaskID from rich.console import Console from .config import settings console = Console() # MediaWiki namespace MW_NS = {"mw": "http://www.mediawiki.org/xml/export-0.6/"} @dataclass class WikiArticle: """Represents a parsed wiki article.""" id: int title: str content: str # Raw wikitext plain_text: str # Cleaned plain text for embedding categories: list[str] = field(default_factory=list) links: list[str] = field(default_factory=list) # Internal wiki links external_links: list[str] = field(default_factory=list) timestamp: str = "" contributor: str = "" def to_dict(self) -> dict: return asdict(self) def clean_wikitext(text: str) -> str: """Convert MediaWiki markup to plain text for embedding.""" if not text: return "" # Remove templates {{...}} text = re.sub(r"\{\{[^}]+\}\}", "", text) # Remove categories [[Category:...]] text = re.sub(r"\[\[Category:[^\]]+\]\]", "", text, flags=re.IGNORECASE) # Convert wiki links [[Page|Display]] or [[Page]] to just the display text text = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", text) text = re.sub(r"\[\[([^\]]+)\]\]", r"\1", text) # Remove external links [url text] -> text text = re.sub(r"\[https?://[^\s\]]+ ([^\]]+)\]", r"\1", text) text = re.sub(r"\[https?://[^\]]+\]", "", text) # Remove wiki formatting text = re.sub(r"'''?([^']+)'''?", r"\1", text) # Bold/italic text = re.sub(r"={2,}([^=]+)={2,}", r"\1", text) # Headers text = re.sub(r"^[*#:;]+", "", text, flags=re.MULTILINE) # List markers # Remove HTML tags text = re.sub(r"<[^>]+>", "", text) # Clean up whitespace text = re.sub(r"\n{3,}", "\n\n", text) text = re.sub(r" {2,}", " ", text) return text.strip() def extract_categories(text: str) -> list[str]: """Extract category names from wikitext.""" pattern = r"\[\[Category:([^\]|]+)" return list(set(re.findall(pattern, text, re.IGNORECASE))) def extract_wiki_links(text: str) -> list[str]: """Extract internal wiki links from wikitext.""" # Match [[Page]] or [[Page|Display]] pattern = r"\[\[([^|\]]+)" links = re.findall(pattern, text) # Filter out categories and files return list( set( link.strip() for link in links if not link.lower().startswith(("category:", "file:", "image:")) ) ) def extract_external_links(text: str) -> list[str]: """Extract external URLs from wikitext.""" pattern = r"https?://[^\s\]\)\"']+" return list(set(re.findall(pattern, text))) def parse_xml_file(xml_path: Path) -> Iterator[WikiArticle]: """Parse a MediaWiki XML dump file and yield articles.""" context = etree.iterparse( str(xml_path), events=("end",), tag="{http://www.mediawiki.org/xml/export-0.6/}page" ) for event, page in context: # Get basic info title_elem = page.find("mw:title", MW_NS) id_elem = page.find("mw:id", MW_NS) ns_elem = page.find("mw:ns", MW_NS) # Skip non-main namespace pages (talk, user, etc.) if ns_elem is not None and ns_elem.text != "0": page.clear() continue title = title_elem.text if title_elem is not None else "" page_id = int(id_elem.text) if id_elem is not None else 0 # Get latest revision revision = page.find("mw:revision", MW_NS) if revision is None: page.clear() continue text_elem = revision.find("mw:text", MW_NS) timestamp_elem = revision.find("mw:timestamp", MW_NS) contributor = revision.find("mw:contributor", MW_NS) content = text_elem.text if text_elem is not None else "" timestamp = timestamp_elem.text if timestamp_elem is not None else "" contributor_name = "" if contributor is not None: username = contributor.find("mw:username", MW_NS) if username is not None: contributor_name = username.text or "" # Skip redirects and empty pages if not content or content.lower().startswith("#redirect"): page.clear() continue article = WikiArticle( id=page_id, title=title, content=content, plain_text=clean_wikitext(content), categories=extract_categories(content), links=extract_wiki_links(content), external_links=extract_external_links(content), timestamp=timestamp, contributor=contributor_name, ) # Clear element to free memory page.clear() yield article def parse_all_dumps(output_path: Path | None = None) -> list[WikiArticle]: """Parse all XML dump files and optionally save to JSON.""" xml_files = sorted(settings.xmldump_dir.glob("*.xml")) if not xml_files: console.print(f"[red]No XML files found in {settings.xmldump_dir}[/red]") return [] console.print(f"[green]Found {len(xml_files)} XML files to parse[/green]") all_articles = [] with Progress() as progress: task = progress.add_task("[cyan]Parsing XML files...", total=len(xml_files)) for xml_file in xml_files: progress.update(task, description=f"[cyan]Parsing {xml_file.name}...") for article in parse_xml_file(xml_file): all_articles.append(article) progress.advance(task) console.print(f"[green]Parsed {len(all_articles)} articles[/green]") if output_path: console.print(f"[cyan]Saving to {output_path}...[/cyan]") with open(output_path, "w", encoding="utf-8") as f: json.dump([a.to_dict() for a in all_articles], f, ensure_ascii=False, indent=2) console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]") return all_articles def parse_mediawiki_files(articles_dir: Path, output_path: Path | None = None) -> list[WikiArticle]: """Parse individual .mediawiki files from a directory (Codeberg format).""" mediawiki_files = list(articles_dir.glob("*.mediawiki")) if not mediawiki_files: console.print(f"[red]No .mediawiki files found in {articles_dir}[/red]") return [] console.print(f"[green]Found {len(mediawiki_files)} .mediawiki files to parse[/green]") all_articles = [] with Progress() as progress: task = progress.add_task("[cyan]Parsing files...", total=len(mediawiki_files)) for i, filepath in enumerate(mediawiki_files): # Title is the filename without extension title = filepath.stem try: content = filepath.read_text(encoding="utf-8", errors="replace") except Exception as e: console.print(f"[yellow]Warning: Could not read {filepath}: {e}[/yellow]") progress.advance(task) continue # Skip redirects and empty files if not content or content.strip().lower().startswith("#redirect"): progress.advance(task) continue article = WikiArticle( id=i, title=title, content=content, plain_text=clean_wikitext(content), categories=extract_categories(content), links=extract_wiki_links(content), external_links=extract_external_links(content), timestamp="", contributor="", ) all_articles.append(article) progress.advance(task) console.print(f"[green]Parsed {len(all_articles)} articles[/green]") if output_path: console.print(f"[cyan]Saving to {output_path}...[/cyan]") with open(output_path, "w", encoding="utf-8") as f: json.dump([a.to_dict() for a in all_articles], f, ensure_ascii=False, indent=2) console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]") return all_articles def main(): """CLI entry point for parsing wiki content.""" output_path = settings.data_dir / "articles.json" # Check for articles directory first (newer, more complete than XML dumps) if settings.articles_dir.exists(): console.print(f"[cyan]Found articles directory at {settings.articles_dir}, using that...[/cyan]") parse_mediawiki_files(settings.articles_dir, output_path) else: # Fall back to XML dumps parse_all_dumps(output_path) if __name__ == "__main__": main()