273 lines
9.0 KiB
Python
273 lines
9.0 KiB
Python
"""Blog HTML parser - extracts blog posts from cached WordPress HTML files."""
|
|
|
|
import json
|
|
import re
|
|
import zipfile
|
|
from dataclasses import dataclass, field, asdict
|
|
from pathlib import Path
|
|
from typing import Iterator, Optional
|
|
from bs4 import BeautifulSoup
|
|
from rich.progress import Progress
|
|
from rich.console import Console
|
|
|
|
from .config import settings
|
|
|
|
# Shared Rich console for all user-facing status output in this module.
console = Console()
|
|
|
|
|
|
@dataclass
class BlogPost:
    """Represents a single blog post parsed from cached WordPress HTML.

    IDs are assigned by the caller (``parse_blog_zip`` starts at 100000
    to avoid colliding with wiki article IDs).
    """

    id: int  # Synthetic numeric ID assigned during parsing
    title: str  # Post title with the " | P2P Foundation" suffix stripped
    content: str  # Raw HTML of the post's <article> element
    plain_text: str  # Cleaned plain text of the article, used for embedding
    categories: list[str] = field(default_factory=list)  # Category + tag names
    links: list[str] = field(default_factory=list)  # Links into blog./wiki.p2pfoundation.net
    external_links: list[str] = field(default_factory=list)  # Other absolute http(s) links
    timestamp: str = ""  # Value of article:published_time meta tag, if present
    contributor: str = ""  # Author from the author meta tag or schema.org graph
    url: str = ""  # Canonical post URL reconstructed from the slug
    description: str = ""  # Meta description, if present

    def to_dict(self) -> dict:
        """Return the post as a plain dict (recursively, via dataclasses.asdict)."""
        return asdict(self)
|
|
|
|
|
|
def clean_html_text(soup_element) -> str:
    """Return the plain text of *soup_element* with whitespace normalized.

    Runs of three or more newlines collapse to a single blank line, and
    runs of multiple spaces collapse to one space. A None/empty element
    yields "".
    """
    if not soup_element:
        return ""

    # Newline separator keeps block boundaries visible in the plain text.
    raw = soup_element.get_text(separator="\n", strip=True)

    collapsed_newlines = re.sub(r"\n{3,}", "\n\n", raw)
    collapsed_spaces = re.sub(r" {2,}", " ", collapsed_newlines)

    return collapsed_spaces.strip()
|
|
|
|
|
|
def extract_links(soup_element) -> tuple[list[str], list[str]]:
    """Extract internal and external links from an HTML element.

    Internal links point at blog.p2pfoundation.net or
    wiki.p2pfoundation.net; any other absolute http(s) URL is external.
    Relative links are ignored.

    Args:
        soup_element: BeautifulSoup element to scan; None yields empty lists.

    Returns:
        (internal, external) lists, deduplicated with first-seen order
        preserved. (The previous ``list(set(...))`` dedupe produced an
        arbitrary order, which made output files non-reproducible.)
    """
    internal: list[str] = []
    external: list[str] = []

    if not soup_element:
        return internal, external

    for a in soup_element.find_all("a", href=True):
        href = a["href"]
        if "blog.p2pfoundation.net" in href or "wiki.p2pfoundation.net" in href:
            internal.append(href)
        elif href.startswith("http"):
            external.append(href)

    # dict.fromkeys dedupes while preserving insertion order (unlike set()).
    return list(dict.fromkeys(internal)), list(dict.fromkeys(external))
|
|
|
|
|
|
def parse_blog_html(html_content: str, post_id: int, slug: str) -> Optional[BlogPost]:
    """Parse one cached WordPress post page into a BlogPost.

    Args:
        html_content: Full HTML of the cached page.
        post_id: Numeric ID to assign to the resulting post.
        slug: URL slug of the post; used for the title fallback and URL.

    Returns:
        A BlogPost, or None when the page has no <article> element or its
        plain text is shorter than 100 characters (treated as not a real
        post).
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Title: <title> tag with the site suffix stripped; fall back to a
    # title-cased version of the slug when the tag is missing.
    title_tag = soup.find("title")
    title = title_tag.text.replace(" | P2P Foundation", "").strip() if title_tag else slug.replace("-", " ").title()

    # Meta description (empty string when absent or empty).
    desc_meta = soup.find("meta", attrs={"name": "description"})
    description = desc_meta["content"] if desc_meta and desc_meta.get("content") else ""

    # Published time from the Open Graph article meta tag.
    pub_meta = soup.find("meta", attrs={"property": "article:published_time"})
    timestamp = pub_meta["content"] if pub_meta and pub_meta.get("content") else ""

    # Author: prefer the <meta name="author"> tag; otherwise fall back to
    # the first Person node in the Yoast schema.org JSON-LD graph.
    author = ""
    author_meta = soup.find("meta", attrs={"name": "author"})
    if author_meta and author_meta.get("content"):
        author = author_meta["content"]
    else:
        schema = soup.find("script", class_="yoast-schema-graph")
        if schema:
            try:
                # schema.string may be None, which json.loads rejects with
                # TypeError — both failure modes are swallowed below.
                schema_data = json.loads(schema.string)
                for item in schema_data.get("@graph", []):
                    if item.get("@type") == "Person":
                        author = item.get("name", "")
                        break
            except (json.JSONDecodeError, TypeError):
                pass

    # The post body lives in the first <article> element; without one this
    # page is not a post.
    article = soup.find("article")
    if not article:
        return None

    # Keep both the raw HTML and a cleaned plain-text rendering.
    content = str(article)
    plain_text = clean_html_text(article)

    # Skip pages that are too short to be a real post.
    if len(plain_text) < 100:
        return None

    internal_links, external_links = extract_links(article)

    # Categories: rel="category tag" anchors first, then plain rel="tag"
    # anchors, deduplicated while preserving order.
    categories = []
    cat_links = soup.find_all("a", rel="category tag")
    for cat in cat_links:
        categories.append(cat.text.strip())

    tag_links = soup.find_all("a", rel="tag")
    for tag in tag_links:
        if tag.text.strip() not in categories:
            categories.append(tag.text.strip())

    return BlogPost(
        id=post_id,
        title=title,
        content=content,
        plain_text=plain_text,
        categories=categories,
        links=internal_links,
        external_links=external_links,
        timestamp=timestamp,
        contributor=author,
        url=f"https://blog.p2pfoundation.net/{slug}/",
        description=description,
    )
|
|
|
|
|
|
def parse_blog_zip(zip_path: Path, output_path: Optional[Path] = None) -> list[BlogPost]:
    """Parse all blog posts out of a WordPress page-cache zip archive.

    Args:
        zip_path: Zip file containing the wp-content page_enhanced cache.
        output_path: Optional JSON file to also write the parsed posts to.

    Returns:
        List of successfully parsed BlogPost objects.
    """
    console.print(f"[cyan]Parsing blog posts from {zip_path}...[/cyan]")

    posts = []
    seen_slugs = set()
    post_id = 100000  # Start high to avoid conflicts with wiki article IDs

    # Only top-level per-slug cached pages (_index_ssl.html) match; feeds,
    # embeds and date archives live at deeper paths and are excluded.
    post_pattern = re.compile(
        r"blog\.p2pfoundation\.net/public_html/wp-content/cache/page_enhanced/blog\.p2pfoundation\.net/([^/]+)/_index_ssl\.html$"
    )

    with zipfile.ZipFile(zip_path, "r") as zf:
        # Collect candidate post pages, excluding feed/embed variants.
        html_files = [
            name for name in zf.namelist()
            if post_pattern.search(name) and "/feed/" not in name and "/embed/" not in name
        ]

        console.print(f"[green]Found {len(html_files)} blog post files[/green]")

        with Progress() as progress:
            task = progress.add_task("[cyan]Parsing blog posts...", total=len(html_files))

            for filepath in html_files:
                match = post_pattern.search(filepath)
                if not match:
                    progress.advance(task)
                    continue

                slug = match.group(1)

                # Each slug is processed once even if cached multiple times.
                if slug in seen_slugs:
                    progress.advance(task)
                    continue

                # Heuristic: skip archive/listing/system pages by slug
                # prefix. NOTE(review): this also skips any real post whose
                # slug happens to start with e.g. "page" or "tag" — confirm
                # that is acceptable.
                skip_patterns = ["page", "category", "tag", "author", "feed", "wp-", "uploads"]
                if any(slug.startswith(p) for p in skip_patterns):
                    progress.advance(task)
                    continue

                seen_slugs.add(slug)

                try:
                    with zf.open(filepath) as f:
                        # errors="replace": cached pages occasionally hold
                        # bytes that are not valid UTF-8.
                        html_content = f.read().decode("utf-8", errors="replace")

                    post = parse_blog_html(html_content, post_id, slug)
                    if post:
                        posts.append(post)
                        post_id += 1
                except Exception as e:
                    # Best-effort: one bad page should not abort the run.
                    console.print(f"[yellow]Warning: Could not parse {slug}: {e}[/yellow]")

                progress.advance(task)

    console.print(f"[green]Parsed {len(posts)} blog posts[/green]")

    if output_path:
        console.print(f"[cyan]Saving to {output_path}...[/cyan]")
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump([p.to_dict() for p in posts], f, ensure_ascii=False, indent=2)
        console.print(f"[green]Saved {len(posts)} blog posts to {output_path}[/green]")

    return posts
|
|
|
|
|
|
def merge_with_wiki_articles(blog_posts: list[BlogPost], wiki_articles_path: Path, output_path: Path):
    """Append blog posts, converted to wiki-article dict form, to the wiki set.

    Loads the existing wiki articles JSON, converts each BlogPost into the
    same dict shape used by wiki articles (title prefixed with "[Blog] "),
    and writes the combined list to *output_path*.
    """
    console.print(f"[cyan]Loading existing wiki articles from {wiki_articles_path}...[/cyan]")

    with open(wiki_articles_path, "r", encoding="utf-8") as src:
        wiki_articles = json.load(src)

    console.print(f"[green]Loaded {len(wiki_articles)} wiki articles[/green]")

    # Blog posts adopt the wiki-article dict shape; the title prefix marks
    # their origin so they remain distinguishable after the merge.
    wiki_articles.extend(
        {
            "id": post.id,
            "title": f"[Blog] {post.title}",
            "content": post.content,
            "plain_text": post.plain_text,
            "categories": post.categories,
            "links": post.links,
            "external_links": post.external_links,
            "timestamp": post.timestamp,
            "contributor": post.contributor,
        }
        for post in blog_posts
    )

    console.print(f"[cyan]Saving merged articles to {output_path}...[/cyan]")
    with open(output_path, "w", encoding="utf-8") as dst:
        json.dump(wiki_articles, dst, ensure_ascii=False, indent=2)

    console.print(f"[green]Saved {len(wiki_articles)} total articles[/green]")
|
|
|
|
|
|
def main(blog_zip: Optional[Path] = None):
    """CLI entry point: parse cached blog posts and merge with wiki articles.

    Args:
        blog_zip: Path to the WordPress cache zip. Defaults to the
            previously hard-coded download location for backward
            compatibility; pass a path to point at a different archive.
    """
    # The zip location used to be a hard-coded constant; keep it as the
    # default but allow callers to override it.
    if blog_zip is None:
        blog_zip = Path("/mnt/c/Users/jeffe/Downloads/blog.p2pfoundation.net.zip")

    if not blog_zip.exists():
        console.print(f"[red]Blog zip not found at {blog_zip}[/red]")
        return

    # Parse blog posts and persist them to their own JSON file.
    blog_output = settings.data_dir / "blog_posts.json"
    posts = parse_blog_zip(blog_zip, blog_output)

    # Merge with the wiki articles when that dataset exists.
    wiki_articles = settings.data_dir / "articles.json"
    if wiki_articles.exists():
        merged_output = settings.data_dir / "articles_with_blog.json"
        merge_with_wiki_articles(posts, wiki_articles, merged_output)
        console.print(f"[green]Merged articles saved to {merged_output}[/green]")
        console.print("[yellow]To use merged articles, rename articles_with_blog.json to articles.json and re-run embeddings[/yellow]")
|
|
|
|
|
|
# Allow running this module directly as a script.
if __name__ == "__main__":
    main()
|