# Source: p2pwiki-ai/src/parser.py (267 lines, 8.7 KiB, Python)

"""MediaWiki XML dump parser - converts to structured JSON."""
import json
import re
from dataclasses import dataclass, field, asdict
from pathlib import Path
from typing import Iterator
from lxml import etree
from rich.progress import Progress, TaskID
from rich.console import Console
from .config import settings
console = Console()
# MediaWiki namespace
MW_NS = {"mw": "http://www.mediawiki.org/xml/export-0.6/"}
@dataclass
class WikiArticle:
    """Represents a parsed wiki article.

    Instances are produced by the parsing functions below and serialized
    to JSON via :meth:`to_dict`.
    """
    id: int                # Page id from the dump, or file index for .mediawiki inputs
    title: str
    content: str           # Raw wikitext
    plain_text: str        # Cleaned plain text for embedding
    categories: list[str] = field(default_factory=list)
    links: list[str] = field(default_factory=list)  # Internal wiki links
    external_links: list[str] = field(default_factory=list)
    timestamp: str = ""    # ISO timestamp of the revision; "" when unknown
    contributor: str = ""  # Username of the last contributor; "" when unknown

    def to_dict(self) -> dict:
        """Return a plain-dict representation (recursive, via dataclasses.asdict)."""
        return asdict(self)
def clean_wikitext(text: str) -> str:
    """Convert MediaWiki markup to plain text for embedding.

    Strips templates (including nested ones), category tags, link markup,
    bold/italic quotes, headers, list markers, and HTML tags, then
    normalizes whitespace.

    Args:
        text: Raw wikitext (may be empty or falsy).

    Returns:
        Cleaned plain text; "" for empty input.
    """
    if not text:
        return ""
    # Remove templates {{...}}. Match only brace-free (innermost) bodies and
    # loop until stable, so nested templates like {{foo|{{bar}}}} are removed
    # completely instead of leaving a trailing "}}" behind.
    prev = None
    while prev != text:
        prev = text
        text = re.sub(r"\{\{[^{}]*\}\}", "", text)
    # Remove categories [[Category:...]]
    text = re.sub(r"\[\[Category:[^\]]+\]\]", "", text, flags=re.IGNORECASE)
    # Convert wiki links [[Page|Display]] or [[Page]] to just the display text
    text = re.sub(r"\[\[([^|\]]+)\|([^\]]+)\]\]", r"\2", text)
    text = re.sub(r"\[\[([^\]]+)\]\]", r"\1", text)
    # Remove external links [url text] -> text
    text = re.sub(r"\[https?://[^\s\]]+ ([^\]]+)\]", r"\1", text)
    text = re.sub(r"\[https?://[^\]]+\]", "", text)
    # Remove wiki formatting
    text = re.sub(r"'''?([^']+)'''?", r"\1", text)  # Bold/italic
    text = re.sub(r"={2,}([^=]+)={2,}", r"\1", text)  # Headers
    text = re.sub(r"^[*#:;]+", "", text, flags=re.MULTILINE)  # List markers
    # Remove HTML tags
    text = re.sub(r"<[^>]+>", "", text)
    # Clean up whitespace
    text = re.sub(r"\n{3,}", "\n\n", text)
    text = re.sub(r" {2,}", " ", text)
    return text.strip()
def extract_categories(text: str) -> list[str]:
    """Return the distinct category names tagged in *text* (case-insensitive)."""
    matches = re.findall(r"\[\[Category:([^\]|]+)", text, flags=re.IGNORECASE)
    return list(set(matches))
def extract_wiki_links(text: str) -> list[str]:
    """Return the distinct internal link targets in *text*.

    Matches both [[Page]] and [[Page|Display]] forms; category, file, and
    image links are excluded.
    """
    targets: set[str] = set()
    for raw in re.findall(r"\[\[([^|\]]+)", text):
        # Filter on the raw match (before stripping), then store it stripped.
        if raw.lower().startswith(("category:", "file:", "image:")):
            continue
        targets.add(raw.strip())
    return list(targets)
def extract_external_links(text: str) -> list[str]:
    """Return the distinct external http(s) URLs found in *text*."""
    url_re = re.compile(r"https?://[^\s\]\)\"']+")
    return list({url for url in url_re.findall(text)})
def parse_xml_file(xml_path: Path) -> Iterator[WikiArticle]:
    """Parse a MediaWiki XML dump file and yield main-namespace articles.

    Skips talk/user/etc. namespaces, pages without a revision, redirects,
    and empty pages.  Processed elements are released eagerly so large
    dumps can be streamed in roughly constant memory.

    NOTE(review): the export-0.6 schema URI is hard-coded; dumps from newer
    MediaWiki versions use a different namespace URI and would yield no
    pages -- confirm against the actual dump files.

    Args:
        xml_path: Path to the XML dump file.

    Yields:
        WikiArticle instances for each kept page.
    """
    def _release(elem) -> None:
        # clear() alone is not enough: iterparse keeps already-processed
        # siblings attached to the root, so the tree would still grow with
        # the whole file.  Drop finished preceding siblings as well.
        elem.clear()
        parent = elem.getparent()
        if parent is not None:
            while elem.getprevious() is not None:
                del parent[0]

    context = etree.iterparse(
        str(xml_path), events=("end",), tag="{http://www.mediawiki.org/xml/export-0.6/}page"
    )
    for _event, page in context:
        # Get basic info
        title_elem = page.find("mw:title", MW_NS)
        id_elem = page.find("mw:id", MW_NS)
        ns_elem = page.find("mw:ns", MW_NS)
        # Skip non-main namespace pages (talk, user, etc.)
        if ns_elem is not None and ns_elem.text != "0":
            _release(page)
            continue
        # .text can be None for an empty element; normalize to "".
        title = (title_elem.text or "") if title_elem is not None else ""
        page_id = int(id_elem.text) if id_elem is not None else 0
        # Get latest revision
        revision = page.find("mw:revision", MW_NS)
        if revision is None:
            _release(page)
            continue
        text_elem = revision.find("mw:text", MW_NS)
        timestamp_elem = revision.find("mw:timestamp", MW_NS)
        contributor = revision.find("mw:contributor", MW_NS)
        content = (text_elem.text or "") if text_elem is not None else ""
        timestamp = (timestamp_elem.text or "") if timestamp_elem is not None else ""
        contributor_name = ""
        if contributor is not None:
            username = contributor.find("mw:username", MW_NS)
            if username is not None:
                contributor_name = username.text or ""
        # Skip redirects and empty pages.  Strip first so redirects with
        # leading whitespace are caught (same check as parse_mediawiki_files).
        if not content or content.strip().lower().startswith("#redirect"):
            _release(page)
            continue
        article = WikiArticle(
            id=page_id,
            title=title,
            content=content,
            plain_text=clean_wikitext(content),
            categories=extract_categories(content),
            links=extract_wiki_links(content),
            external_links=extract_external_links(content),
            timestamp=timestamp,
            contributor=contributor_name,
        )
        # Clear element to free memory
        _release(page)
        yield article
def parse_all_dumps(output_path: Path | None = None) -> list[WikiArticle]:
    """Parse every XML dump in settings.xmldump_dir; optionally save to JSON.

    Args:
        output_path: When given, the parsed articles are also written to
            this path as a JSON array.

    Returns:
        All parsed WikiArticle objects ([] when no dump files exist).
    """
    xml_files = sorted(settings.xmldump_dir.glob("*.xml"))
    if not xml_files:
        console.print(f"[red]No XML files found in {settings.xmldump_dir}[/red]")
        return []
    console.print(f"[green]Found {len(xml_files)} XML files to parse[/green]")

    all_articles: list[WikiArticle] = []
    with Progress() as progress:
        task = progress.add_task("[cyan]Parsing XML files...", total=len(xml_files))
        for xml_file in xml_files:
            progress.update(task, description=f"[cyan]Parsing {xml_file.name}...")
            all_articles.extend(parse_xml_file(xml_file))
            progress.advance(task)

    console.print(f"[green]Parsed {len(all_articles)} articles[/green]")
    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([article.to_dict() for article in all_articles], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]")
    return all_articles
def parse_mediawiki_files(articles_dir: Path, output_path: Path | None = None) -> list[WikiArticle]:
    """Parse individual .mediawiki files from a directory (Codeberg format).

    Each file becomes one article; the article title is the file stem and
    the id is the file's index in the directory listing.  Unreadable files,
    redirects, and empty files are skipped.

    Args:
        articles_dir: Directory containing *.mediawiki files.
        output_path: When given, the parsed articles are also written to
            this path as a JSON array.

    Returns:
        All parsed WikiArticle objects ([] when no files exist).
    """
    mediawiki_files = list(articles_dir.glob("*.mediawiki"))
    if not mediawiki_files:
        console.print(f"[red]No .mediawiki files found in {articles_dir}[/red]")
        return []
    console.print(f"[green]Found {len(mediawiki_files)} .mediawiki files to parse[/green]")

    all_articles: list[WikiArticle] = []
    with Progress() as progress:
        task = progress.add_task("[cyan]Parsing files...", total=len(mediawiki_files))
        for index, path in enumerate(mediawiki_files):
            try:
                raw = path.read_text(encoding="utf-8", errors="replace")
            except Exception as e:
                console.print(f"[yellow]Warning: Could not read {path}: {e}[/yellow]")
                progress.advance(task)
                continue
            # Skip redirects and empty files
            if not raw or raw.strip().lower().startswith("#redirect"):
                progress.advance(task)
                continue
            all_articles.append(
                WikiArticle(
                    id=index,
                    # Title is the filename without extension
                    title=path.stem,
                    content=raw,
                    plain_text=clean_wikitext(raw),
                    categories=extract_categories(raw),
                    links=extract_wiki_links(raw),
                    external_links=extract_external_links(raw),
                    timestamp="",
                    contributor="",
                )
            )
            progress.advance(task)

    console.print(f"[green]Parsed {len(all_articles)} articles[/green]")
    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([article.to_dict() for article in all_articles], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(all_articles)} articles to {output_path}[/green]")
    return all_articles
def main():
    """CLI entry point for parsing wiki content."""
    output_path = settings.data_dir / "articles.json"
    if settings.articles_dir.exists():
        # Prefer the articles directory: newer and more complete than XML dumps.
        console.print(f"[cyan]Found articles directory at {settings.articles_dir}, using that...[/cyan]")
        parse_mediawiki_files(settings.articles_dir, output_path)
    else:
        # Fall back to XML dumps
        parse_all_dumps(output_path)


if __name__ == "__main__":
    main()