# Source: p2pwiki-ai/src/parser.py (267 lines, 8.7 KiB, Python)

"""MediaWiki XML dump parser - converts to structured JSON."""
import json
import re
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Iterator
from lxml import etree
from rich.progress import Progress, TaskID
from rich.console import Console
from .config import settings
console = Console()
# MediaWiki namespace
MW_NS = {"mw": "http://www.mediawiki.org/xml/export-0.6/"}
@dataclass
class WikiArticle:
    """Represents a parsed wiki article.

    Instances are produced by the parsing functions below and serialized
    to JSON via :meth:`to_dict`.
    """
    id: int                # Page id from the dump, or file index for .mediawiki inputs
    title: str
    content: str           # Raw wikitext
    plain_text: str        # Cleaned plain text for embedding
    categories: list[str] = field(default_factory=list)
    links: list[str] = field(default_factory=list)  # Internal wiki links
    external_links: list[str] = field(default_factory=list)
    timestamp: str = ""    # ISO timestamp of the revision; "" when unknown
    contributor: str = ""  # Username of the last contributor; "" when unknown

    def to_dict(self) -> dict:
        """Return a plain-dict representation (recursive, via dataclasses.asdict)."""
        return asdict(self)
def clean_wikitext(text: str) -> str:
    """Convert MediaWiki markup to plain text for embedding.

    Strips templates (including nested ones), category tags, link markup,
    bold/italic quotes, headers, list markers, and HTML tags, then
    normalizes whitespace.

    Args:
        text: Raw wikitext (may be empty or falsy).

    Returns:
        Cleaned plain text; "" for empty input.
    """
    if not text:
        return ""
    # Remove templates {{...}}. Match only brace-free (innermost) bodies and
    # loop until stable, so nested templates like {{foo|{{bar}}}} are removed
    # completely instead of leaving a trailing "}}" behind.
    prev = None
    while prev != text:
        prev = text
        text = re.sub(r"\{\{[^{}]*\}\}", "", text)
    # Remove categories [[Category:...]]
    text = re.sub(r"\[\[Category:[^\]]+\]\]", "", text, flags=re.IGNORECASE)
    # Convert wiki links [[Page|Display]] or [[Page]] to just the display text
    text = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", text)
    text = re.sub(r"\[\[([^\]]+)\]\]", r"\1", text)
    # Remove external links [url text] -> text
    text = re.sub(r"\[https?://[^\s\]]+ ([^\]]+)\]", r"\1", text)
    text = re.sub(r"\[https?://[^\]]+\]", "", text)
    # Remove wiki formatting
    text = re.sub(r"'''?([^']+)'''?", r"\1", text)  # Bold/italic
    text = re.sub(r"={2,}([^=]+)={2,}", r"\1", text)  # Headers
    text = re.sub(r"^[*#:;]+", "", text, flags=re.MULTILINE)  # List markers
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", "", text)
    # Clean up whitespace
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip()
def extract_categories(text: str) -> list[str]:
    """Return the distinct category names tagged in *text* (case-insensitive)."""
    matches = re.findall(r"\[\[Category:([^\]|]+)", text, flags=re.IGNORECASE)
    return list(set(matches))
def extract_wiki_links(text: str) -> list[str]:
    """Return the distinct internal link targets in *text*.

    Matches both [[Page]] and [[Page|Display]] forms; category, file, and
    image links are excluded.
    """
    targets: set[str] = set()
    for raw in re.findall(r"\[\[([^|\]]+)", text):
        # Filter on the raw match (before stripping), then store it stripped.
        if raw.lower().startswith(("category:", "file:", "image:")):
            continue
        targets.add(raw.strip())
    return list(targets)
def extract_external_links(text: str) -> list[str]:
    """Return the distinct external http(s) URLs found in *text*."""
    url_re = re.compile(r"https?://[^\s\]\)\"']+")
    return list({url for url in url_re.findall(text)})
def parse_xml_file(xml_path: Path) -> Iterator[WikiArticle]:
    """Parse a MediaWiki XML dump file and yield main-namespace articles.

    Skips talk/user/etc. namespaces, pages without a revision, redirects,
    and empty pages.  Processed elements are released eagerly so large
    dumps can be streamed in roughly constant memory.

    NOTE(review): the export-0.6 schema URI is hard-coded; dumps from newer
    MediaWiki versions use a different namespace URI and would yield no
    pages -- confirm against the actual dump files.

    Args:
        xml_path: Path to the XML dump file.

    Yields:
        WikiArticle instances for each kept page.
    """
    def _release(elem) -> None:
        # clear() alone is not enough: iterparse keeps already-processed
        # siblings attached to the root, so the tree would still grow with
        # the whole file.  Drop finished preceding siblings as well.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]

    context = etree.iterparse(
        str(xml_path), events=("end",), tag="{http://www.mediawiki.org/xml/export-0.6/}page"
    )
    for _event, page in context:
        # Get basic info
        title_elem = page.find("mw:title", MW_NS)
        id_elem = page.find("mw:id", MW_NS)
        ns_elem = page.find("mw:ns", MW_NS)
        # Skip non-main namespace pages (talk, user, etc.)
        if ns_elem is not None and ns_elem.text != "0":
            _release(page)
            continue
        # .text can be None for an empty element; normalize to "".
        title = (title_elem.text or "") if title_elem is not None else ""
        page_id = int(id_elem.text) if id_elem is not None else 0
        # Get latest revision
        revision = page.find("mw:revision", MW_NS)
        if revision is None:
            _release(page)
            continue
        text_elem = revision.find("mw:text", MW_NS)
        timestamp_elem = revision.find("mw:timestamp", MW_NS)
        contributor = revision.find("mw:contributor", MW_NS)
        content = (text_elem.text or "") if text_elem is not None else ""
        timestamp = (timestamp_elem.text or "") if timestamp_elem is not None else ""
        contributor_name = ""
        if contributor is not None:
            username = contributor.find("mw:username", MW_NS)
            if username is not None:
                contributor_name = username.text or ""
        # Skip redirects and empty pages.  Strip first so redirects with
        # leading whitespace are caught (same check as parse_mediawiki_files).
        if not content or content.strip().lower().startswith("#redirect"):
            _release(page)
            continue
        article = WikiArticle(
            id=page_id,
            title=title,
            content=content,
            plain_text=clean_wikitext(content),
            categories=extract_categories(content),
            links=extract_wiki_links(content),
            external_links=extract_external_links(content),
            timestamp=timestamp,
            contributor=contributor_name,
        )
        # Clear element to free memory
        _release(page)
        yield article
def parse_all_dumps(output_path: Path | None = None) -> list[WikiArticle]:
    """Parse every XML dump in settings.xmldump_dir; optionally save to JSON.

    Args:
        output_path: When given, the parsed articles are also written to
            this path as a JSON array.

    Returns:
        All parsed WikiArticle objects ([] when no dump files exist).
    """
    xml_files = sorted(settings.xmldump_dir.glob("*.xml"))
    if not xml_files:
        console.print(f"[red]No XML files found in {settings.xmldump_dir}[/red]")
        return []
    console.print(f"[green]Found {len(xml_files)} XML files to parse[/green]")

    all_articles: list[WikiArticle] = []
    with Progress() as progress:
        task = progress.add_task("[cyan]Parsing XML files...", total=len(xml_files))
        for xml_file in xml_files:
            progress.update(task, description=f"[cyan]Parsing {xml_file.name}...")
            all_articles.extend(parse_xml_file(xml_file))
            progress.advance(task)

    console.print(f"[green]Parsed {len(all_articles)} articles[/green]")
    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([article.to_dict() for article in all_articles], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]")
    return all_articles
def parse_mediawiki_files(articles_dir: Path, output_path: Path | None = None) -> list[WikiArticle]:
    """Parse individual .mediawiki files from a directory (Codeberg format).

    Each file becomes one article; the article title is the file stem and
    the id is the file's index in the directory listing.  Unreadable files,
    redirects, and empty files are skipped.

    Args:
        articles_dir: Directory containing *.mediawiki files.
        output_path: When given, the parsed articles are also written to
            this path as a JSON array.

    Returns:
        All parsed WikiArticle objects ([] when no files exist).
    """
    mediawiki_files = list(articles_dir.glob("*.mediawiki"))
    if not mediawiki_files:
        console.print(f"[red]No .mediawiki files found in {articles_dir}[/red]")
        return []
    console.print(f"[green]Found {len(mediawiki_files)} .mediawiki files to parse[/green]")

    all_articles: list[WikiArticle] = []
    with Progress() as progress:
        task = progress.add_task("[cyan]Parsing files...", total=len(mediawiki_files))
        for index, path in enumerate(mediawiki_files):
            try:
                raw = path.read_text(encoding="utf-8", errors="replace")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not read {path}: {e}[/yellow]")
                progress.advance(task)
                continue
            # Skip redirects and empty files
            if not raw or raw.strip().lower().startswith("#redirect"):
                progress.advance(task)
                continue
            all_articles.append(
                WikiArticle(
                    id=index,
                    # Title is the filename without extension
                    title=path.stem,
                    content=raw,
                    plain_text=clean_wikitext(raw),
                    categories=extract_categories(raw),
                    links=extract_wiki_links(raw),
                    external_links=extract_external_links(raw),
                    timestamp="",
                    contributor="",
                )
            )
            progress.advance(task)

    console.print(f"[green]Parsed {len(all_articles)} articles[/green]")
    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([article.to_dict() for article in all_articles], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]")
    return all_articles
def main():
    """CLI entry point for parsing wiki content."""
    output_path = settings.data_dir / "articles.json"
    if settings.articles_dir.exists():
        # Prefer the articles directory: newer and more complete than XML dumps.
        console.print(f"[cyan]Found articles directory at {settings.articles_dir}, using that...[/cyan]")
        parse_mediawiki_files(settings.articles_dir, output_path)
    else:
        # Fall back to XML dumps
        parse_all_dumps(output_path)


if __name__ == "__main__":
    main()