"""MediaWiki XML dump parser - converts to structured JSON."""
|
|
|
|
import json
|
|
import re
|
|
from dataclasses import dataclass, field, asdict
|
|
from pathlib import Path
|
|
from typing import Iterator
|
|
from lxml import etree
|
|
from rich.progress import Progress, TaskID
|
|
from rich.console import Console
|
|
|
|
from .config import settings
|
|
|
|
console = Console()
|
|
|
|
# MediaWiki namespace
|
|
MW_NS = {"mw": "http://www.mediawiki.org/xml/export-0.6/"}
|
|
|
|
|
|
@dataclass
class WikiArticle:
    """Represents a parsed wiki article.

    Holds the raw wikitext plus everything derived from it by the helper
    functions in this module (plain text, categories, link lists).
    """

    id: int  # page id from the XML dump, or a sequential index for .mediawiki files
    title: str
    content: str  # Raw wikitext
    plain_text: str  # Cleaned plain text for embedding
    categories: list[str] = field(default_factory=list)
    links: list[str] = field(default_factory=list)  # Internal wiki links
    external_links: list[str] = field(default_factory=list)
    timestamp: str = ""  # latest-revision timestamp; "" when unknown (.mediawiki files)
    contributor: str = ""  # latest-revision username; "" when unknown

    def to_dict(self) -> dict:
        """Return the article as a plain dict (via dataclasses.asdict), ready for json.dump."""
        return asdict(self)
|
|
|
|
|
|
def clean_wikitext(text: str) -> str:
    """Convert MediaWiki markup to plain text for embedding.

    Strips templates, category tags, wiki/external link markup, bold/italic
    and header decoration, list markers, and HTML tags, then normalizes
    whitespace. Returns "" for empty input.
    """
    if not text:
        return ""

    # Remove templates {{...}}. Strip innermost templates repeatedly so that
    # nested templates ({{outer|{{inner}}}}) are removed completely; the old
    # single-pass r"\{\{[^}]+\}\}" pattern left a stray "}}" behind.
    prev = None
    while prev != text:
        prev = text
        text = re.sub(r"\{\{[^{}]*\}\}", "", text)

    # Remove categories [[Category:...]]
    text = re.sub(r"\[\[Category:[^\]]+\]\]", "", text, flags=re.IGNORECASE)

    # Convert wiki links [[Page|Display]] or [[Page]] to just the display text
    text = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", text)
    text = re.sub(r"\[\[([^\]]+)\]\]", r"\1", text)

    # Remove external links [url text] -> text
    text = re.sub(r"\[https?://[^\s\]]+ ([^\]]+)\]", r"\1", text)
    text = re.sub(r"\[https?://[^\]]+\]", "", text)

    # Remove wiki formatting
    text = re.sub(r"'''?([^']+)'''?", r"\1", text)  # Bold/italic
    text = re.sub(r"={2,}([^=]+)={2,}", r"\1", text)  # Headers
    text = re.sub(r"^[*#:;]+", "", text, flags=re.MULTILINE)  # List markers

    # Remove HTML tags
    text = re.sub(r"<[^>]+>", "", text)

    # Clean up whitespace
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)

    return text.strip()
|
|
|
|
|
|
def extract_categories(text: str) -> list[str]:
    """Return the unique category names referenced by [[Category:...]] tags."""
    matches = re.findall(r"\[\[Category:([^\]|]+)", text, re.IGNORECASE)
    unique_names = set(matches)
    return list(unique_names)
|
|
|
|
|
|
def extract_wiki_links(text: str) -> list[str]:
    """Extract internal wiki link targets from wikitext.

    Captures the target of [[Page]] / [[Page|Display]] links, dropping
    category, file, and image links; returns unique, stripped targets.
    """
    excluded_prefixes = ("category:", "file:", "image:")
    targets: set[str] = set()
    for target in re.findall(r"\[\[([^|\]]+)", text):
        if target.lower().startswith(excluded_prefixes):
            continue
        targets.add(target.strip())
    return list(targets)
|
|
|
|
|
|
def extract_external_links(text: str) -> list[str]:
    """Return the unique external http(s) URLs found in wikitext."""
    url_pattern = r"https?://[^\s\]\)\"']+"
    return list({url for url in re.findall(url_pattern, text)})
|
|
|
|
|
|
def _child_text(parent, path: str) -> str:
    """Return the text of the first child matching *path* (mw namespace), or "" when absent/empty."""
    elem = parent.find(path, MW_NS)
    if elem is None or elem.text is None:
        return ""
    return elem.text


def _free_page(page) -> None:
    """Release a processed <page> element and its already-parsed preceding siblings.

    page.clear() alone empties the element but leaves it (and its siblings)
    attached to the tree, so memory would still grow with the dump size;
    deleting preceding siblings keeps iterparse memory bounded.
    """
    page.clear()
    while page.getprevious() is not None:
        del page.getparent()[0]


def parse_xml_file(xml_path: Path) -> Iterator[WikiArticle]:
    """Parse a MediaWiki XML dump file and yield main-namespace articles.

    Streams the dump with lxml iterparse (export-0.6 namespace) so large
    files are processed with bounded memory. Skips pages outside namespace 0,
    pages with no revision, redirects, and empty pages.
    """
    context = etree.iterparse(
        str(xml_path), events=("end",), tag="{http://www.mediawiki.org/xml/export-0.6/}page"
    )

    for _event, page in context:
        # Skip non-main namespace pages (talk, user, etc.)
        ns_elem = page.find("mw:ns", MW_NS)
        if ns_elem is not None and ns_elem.text != "0":
            _free_page(page)
            continue

        # Empty <title>/<id> elements have text=None; fall back to ""/0
        # instead of leaking None into the str-typed WikiArticle fields or
        # crashing in int().
        title = _child_text(page, "mw:title")
        id_text = _child_text(page, "mw:id")
        page_id = int(id_text) if id_text.isdigit() else 0

        # Get latest revision
        revision = page.find("mw:revision", MW_NS)
        if revision is None:
            _free_page(page)
            continue

        content = _child_text(revision, "mw:text")
        timestamp = _child_text(revision, "mw:timestamp")

        contributor_name = ""
        contributor = revision.find("mw:contributor", MW_NS)
        if contributor is not None:
            contributor_name = _child_text(contributor, "mw:username")

        # Skip redirects and empty pages
        if not content or content.lower().startswith("#redirect"):
            _free_page(page)
            continue

        article = WikiArticle(
            id=page_id,
            title=title,
            content=content,
            plain_text=clean_wikitext(content),
            categories=extract_categories(content),
            links=extract_wiki_links(content),
            external_links=extract_external_links(content),
            timestamp=timestamp,
            contributor=contributor_name,
        )

        # Clear element to free memory (article already holds plain strings)
        _free_page(page)

        yield article
|
|
|
|
|
|
def parse_all_dumps(output_path: Path | None = None) -> list[WikiArticle]:
    """Parse every XML dump under settings.xmldump_dir.

    Returns the parsed articles; when *output_path* is given the articles
    are also serialized to it as a JSON array.
    """
    dump_paths = sorted(settings.xmldump_dir.glob("*.xml"))

    if not dump_paths:
        console.print(f"[red]No XML files found in {settings.xmldump_dir}[/red]")
        return []

    console.print(f"[green]Found {len(dump_paths)} XML files to parse[/green]")

    articles: list[WikiArticle] = []

    with Progress() as progress:
        bar = progress.add_task("[cyan]Parsing XML files...", total=len(dump_paths))

        for dump_path in dump_paths:
            progress.update(bar, description=f"[cyan]Parsing {dump_path.name}...")
            articles.extend(parse_xml_file(dump_path))
            progress.advance(bar)

    console.print(f"[green]Parsed {len(articles)} articles[/green]")

    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([a.to_dict() for a in articles], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(articles)} articles to {output_path}[/green]")

    return articles
|
|
|
|
|
|
def parse_mediawiki_files(articles_dir: Path, output_path: Path | None = None) -> list[WikiArticle]:
    """Parse individual .mediawiki files from a directory (Codeberg format).

    Each file becomes one WikiArticle whose title is the file stem and whose
    id is the file's index in the sorted directory listing. Redirect and
    empty files are skipped. When *output_path* is given the articles are
    also written there as a JSON array.
    """
    # Sort for a deterministic file order (and therefore stable article ids
    # across runs) — consistent with parse_all_dumps, which sorts its inputs.
    # Path.glob returns entries in arbitrary, filesystem-dependent order.
    mediawiki_files = sorted(articles_dir.glob("*.mediawiki"))

    if not mediawiki_files:
        console.print(f"[red]No .mediawiki files found in {articles_dir}[/red]")
        return []

    console.print(f"[green]Found {len(mediawiki_files)} .mediawiki files to parse[/green]")

    all_articles = []

    with Progress() as progress:
        task = progress.add_task("[cyan]Parsing files...", total=len(mediawiki_files))

        for i, filepath in enumerate(mediawiki_files):
            # Title is the filename without extension
            title = filepath.stem

            try:
                # errors="replace" already absorbs decoding problems, so only
                # OS-level failures (permissions, vanished file, ...) raise here;
                # catching OSError instead of Exception avoids hiding real bugs.
                content = filepath.read_text(encoding="utf-8", errors="replace")
            except OSError as e:
                console.print(f"[yellow]Warning: Could not read {filepath}: {e}[/yellow]")
                progress.advance(task)
                continue

            # Skip redirects and empty files
            if not content or content.strip().lower().startswith("#redirect"):
                progress.advance(task)
                continue

            article = WikiArticle(
                id=i,
                title=title,
                content=content,
                plain_text=clean_wikitext(content),
                categories=extract_categories(content),
                links=extract_wiki_links(content),
                external_links=extract_external_links(content),
                timestamp="",
                contributor="",
            )

            all_articles.append(article)
            progress.advance(task)

    console.print(f"[green]Parsed {len(all_articles)} articles[/green]")

    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([a.to_dict() for a in all_articles], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]")

    return all_articles
|
|
|
|
|
|
def main():
    """CLI entry point for parsing wiki content."""
    output_path = settings.data_dir / "articles.json"

    # Check for articles directory first (newer, more complete than XML dumps)
    if settings.articles_dir.exists():
        console.print(f"[cyan]Found articles directory at {settings.articles_dir}, using that...[/cyan]")
        parse_mediawiki_files(settings.articles_dir, output_path)
        return

    # Fall back to XML dumps
    parse_all_dumps(output_path)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|