katheryn-website/frontend/scripts/parse-squarespace-xml.js

#!/usr/bin/env node
/**
 * Parse Squarespace WordPress XML export and extract blog posts
 * Outputs JSON files for use in Next.js static generation
 */

const fs = require('fs');
const path = require('path');

// Location of the Squarespace export. A path may be supplied as the first
// command-line argument (`node parse-squarespace-xml.js <export.xml>`);
// when it is omitted the original default (Windows path via WSL) is used.
const xmlPath =
  process.argv[2] ||
  '/mnt/c/Users/jeffe/Downloads/Squarespace-Wordpress-Export-02-03-2026.xml';

// Generated JSON and the index module are written to ../src/data relative to this script
const outputDir = path.join(__dirname, '..', 'src', 'data');

// Ensure output directory exists. With `recursive: true`, mkdirSync does not
// fail when the directory is already present, so no existsSync pre-check is needed.
fs.mkdirSync(outputDir, { recursive: true });

// Load the whole export into memory as UTF-8 text
console.log('Reading XML file...');
const xmlContent = fs.readFileSync(xmlPath, 'utf-8');

// Collect the inner markup of every <item>...</item> element (posts and pages).
// The regex is global, so each exec() call resumes after the previous match.
const itemRegex = /<item>([\s\S]*?)<\/item>/g;
const items = [];
let match = itemRegex.exec(xmlContent);
while (match !== null) {
  items.push(match[1]);
  match = itemRegex.exec(xmlContent);
}

console.log(`Found ${items.length} items`);

// Helper functions to extract data
/**
 * Decode the basic entities (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`,
 * `&nbsp;`) found in the export.
 *
 * The rules are applied twice so that double-encoded input such as
 * `&amp;lt;` ends up fully decoded. `&nbsp;` is only handled by the first pass.
 *
 * @param {string} text Raw text taken from the XML.
 * @returns {string} The decoded text.
 */
function decodeHTMLEntities(text) {
  // Order matters: `&amp;` is decoded first within each pass.
  const firstPass = [
    [/&amp;/g, '&'],
    [/&lt;/g, '<'],
    [/&gt;/g, '>'],
    [/&quot;/g, '"'],
    [/&#39;/g, "'"],
    [/&nbsp;/g, ' '],
  ];
  // Second pass for double-encoded entities: same rules minus `&nbsp;`
  const secondPass = firstPass.slice(0, -1);

  const applyRules = (input, rules) =>
    rules.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), input);

  return applyRules(applyRules(text, firstPass), secondPass);
}

/**
 * Return the entity-decoded, trimmed text of the first `<tag>...</tag>`
 * element in `content`. Only a bare opening tag (no attributes) is matched.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name, inserted into the pattern as-is.
 * @returns {string} The decoded text, or '' when the element is absent.
 */
function extractTag(content, tag) {
  const found = content.match(new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`));
  if (!found) {
    return '';
  }
  const inner = found[1].trim();
  return decodeHTMLEntities(inner);
}

/**
 * Return the entity-decoded, trimmed payload of the first
 * `<tag><![CDATA[...]]></tag>` element in `content`. The CDATA section must
 * directly follow the opening tag and directly precede the closing tag.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name, inserted into the pattern as-is.
 * @returns {string} The decoded payload, or '' when no such element exists.
 */
function extractCDATA(content, tag) {
  const found = content.match(
    new RegExp(`<${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`)
  );
  if (!found) {
    return '';
  }
  const payload = found[1].trim();
  return decodeHTMLEntities(payload);
}

/**
 * Return the trimmed text of the first `<wp:tag>...</wp:tag>` element in
 * `content`. Unlike extractTag, the value is NOT entity-decoded.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name without the `wp:` prefix.
 * @returns {string} The raw trimmed text, or '' when the element is absent.
 */
function extractWPTag(content, tag) {
  const found = content.match(new RegExp(`<wp:${tag}>([\\s\\S]*?)<\\/wp:${tag}>`));
  if (!found) {
    return '';
  }
  return found[1].trim();
}

/**
 * Return the trimmed payload of the first `<wp:tag><![CDATA[...]]></wp:tag>`
 * element in `content`. The value is NOT entity-decoded.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name without the `wp:` prefix.
 * @returns {string} The raw trimmed payload, or '' when no such element exists.
 */
function extractWPCDATA(content, tag) {
  const found = content.match(
    new RegExp(`<wp:${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/wp:${tag}>`)
  );
  if (!found) {
    return '';
  }
  return found[1].trim();
}

/**
 * Build a URL slug for an item.
 *
 * The slug is the last non-empty path segment of `postName`
 * (`blog/my-post/` -> `my-post`). When `postName` is missing or contains no
 * usable segment (for example `/`), the slug is derived from `title` instead:
 * lower-cased, with every run of characters outside `a-z0-9` collapsed to a
 * single `-` and leading/trailing dashes removed.
 *
 * @param {string} postName Value of the item's post name, possibly a path.
 * @param {string} title Item title used as a fallback.
 * @returns {string} The slug, or '' when neither input yields one.
 */
function createSlug(postName, title) {
  if (postName) {
    // Ignore empty segments produced by leading, trailing or doubled slashes
    const segments = postName.split('/').filter(Boolean);
    if (segments.length > 0) {
      return segments[segments.length - 1];
    }
  }
  return (title || '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
}

/**
 * Collect the image URLs referenced by `<img>` tags in `content`, in document
 * order.
 *
 * Note: the pattern looks for any attribute text ending in `src=` and the
 * prefix match is greedy, so for a tag carrying several such attributes
 * (e.g. both `src` and `data-src`) the LAST one in the tag is captured.
 *
 * @param {string} content HTML to scan.
 * @returns {string[]} Captured URLs; empty when there are no matches.
 */
function extractImages(content) {
  const imageRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/g;
  // String(...) mirrors the implicit string coercion performed by RegExp#exec
  return Array.from(String(content).matchAll(imageRegex), (hit) => hit[1]);
}

/**
 * Convert WordPress `[caption ...]...[/caption]` shortcodes into
 * `<figure class="wp-caption">` markup.
 *
 * Inside each shortcode the first `<img>` is located. If that image is wrapped
 * in a link (`<a ...><img ...></a>`), the whole link is kept as the media;
 * otherwise just the `<img>` tag is. Everything after the media becomes the
 * caption (whitespace trimmed and collapsed) and is emitted as `<figcaption>`;
 * text before the media is discarded. A shortcode without any `<img>` is
 * replaced by its trimmed inner content.
 *
 * A link is only treated as the media when it actually contains the image, so
 * a plain text link inside the caption (`<img ...> Photo by <a ...>X</a>`)
 * stays part of the caption and the image is preserved.
 *
 * @param {string} html HTML possibly containing caption shortcodes.
 * @returns {string} HTML with every caption shortcode replaced.
 */
function convertCaptionShortcodes(html) {
  return html.replace(
    /\[caption[^\]]*\]([\s\S]*?)\[\/caption\]/gi,
    (_whole, content) => {
      const bareImg = content.match(/<img\b[^>]*>/i);
      if (!bareImg) {
        // No image found, just return the content without shortcode
        return content.trim();
      }

      // First anchor that contains an <img> before its own closing </a>.
      // `\b` keeps tags such as <abbr> or <article> from matching.
      const linkedImg = content.match(
        /<a\b[^>]*>(?:(?!<\/a>)[\s\S])*?<img\b[^>]*>[\s\S]*?<\/a>/i
      );

      // Use the link only when it starts at or before the first image,
      // i.e. when it is the wrapper of that image.
      const media = linkedImg && linkedImg.index <= bareImg.index ? linkedImg : bareImg;
      const mediaTag = media[0];

      // Caption text is everything after the image/link, with whitespace normalized
      const captionText = content
        .substring(media.index + mediaTag.length)
        .trim()
        .replace(/\s+/g, ' ');

      return captionText
        ? `<figure class="wp-caption">${mediaTag}<figcaption>${captionText}</figcaption></figure>`
        : `<figure class="wp-caption">${mediaTag}</figure>`;
    }
  );
}

/**
 * Simplify exported HTML: convert caption shortcodes to figures and strip
 * Squarespace-specific attributes and classes.
 *
 * @param {string} html Raw post/page HTML.
 * @returns {string} Cleaned HTML, or '' for empty/missing input.
 */
function cleanContent(html) {
  if (!html) return '';

  // First convert WordPress caption shortcodes to proper HTML
  let clean = convertCaptionShortcodes(html);

  // Whole-tag rewrites run first. They match on the original class attribute,
  // so they must be applied before the generic class stripping below removes
  // the text they look for.
  clean = clean
    .replace(/<div class="sqs-html-content"[^>]*>/g, '<div>')
    .replace(/<figure class="block-animation-none">/g, '<figure>')
    .replace(/<p class="" >/g, '<p>')
    .replace(/<p class="">/g, '<p>');

  // Remove Squarespace-specific attributes and classes
  clean = clean
    // The attribute name is limited to name characters so the match cannot run
    // past the end of the tag to some later `="..."` and swallow markup.
    .replace(/data-sqsp[^\s=<>"'\/]*="[^"]*"/g, '')
    .replace(/class="[^"]*sqs[^"]*"/g, '')
    .replace(/style="white-space:pre-wrap;"/g, '')
    .replace(/class=""/g, '')
    .replace(/data-rte-preserve-empty="true"/g, '')
    .replace(/data-animation-role="quote"/g, '');

  return clean;
}

/**
 * Decode the named and numeric entities commonly found in post bodies so that
 * excerpts contain plain characters. Ellipsis entities become three dots.
 *
 * @param {string} text Text (or HTML) to decode.
 * @returns {string} The decoded text.
 */
function decodeExcerptEntities(text) {
  // Applied strictly in this order. `&amp;` sits before `&lt;`, `&gt;`,
  // `&quot;` and the numeric entries, so e.g. `&amp;lt;` ends up as `<`.
  const replacements = [
    // Common HTML entities
    [/&mdash;/g, '—'],
    [/&ndash;/g, '–'],
    [/&hellip;/g, '...'],
    [/&rsquo;/g, "'"],
    [/&lsquo;/g, "'"],
    [/&rdquo;/g, '"'],
    [/&ldquo;/g, '"'],
    [/&nbsp;/g, ' '],
    [/&amp;/g, '&'],
    [/&lt;/g, '<'],
    [/&gt;/g, '>'],
    [/&quot;/g, '"'],
    // Numeric character references
    [/&#39;/g, "'"],
    [/&#8217;/g, "'"],
    [/&#8216;/g, "'"],
    [/&#8220;/g, '"'],
    [/&#8221;/g, '"'],
    [/&#8212;/g, '—'],
    [/&#8211;/g, '–'],
    [/&#8230;/g, '...'],
  ];

  let decoded = text;
  for (const [pattern, replacement] of replacements) {
    decoded = decoded.replace(pattern, replacement);
  }
  return decoded;
}

/**
 * Strip WordPress-style shortcodes from `text`.
 *
 * Single-line `[caption ...]...[/caption]` pairs are unwrapped to their inner
 * content first; afterwards every remaining `[name ...]` / `[/name]` token
 * whose name consists of letters or underscores is deleted.
 *
 * @param {string} text Text possibly containing shortcodes.
 * @returns {string} Text with the shortcodes removed.
 */
function removeShortcodes(text) {
  const captionPair = /\[caption[^\]]*\](.*?)\[\/caption\]/gi;
  const anyShortcode = /\[\/?[a-z_]+[^\]]*\]/gi;

  const unwrapped = text.replace(captionPair, '$1');
  return unwrapped.replace(anyShortcode, '');
}

/**
 * Remove the image at the very start of `content` when it is the featured
 * image, so the page does not show it twice.
 *
 * The content is trimmed and then checked, in order, for a leading
 * `<figure>...</figure>`, a leading linked image (`<a ...><img ...></a>`) or a
 * leading bare `<img>`. The leading element is removed only if its markup
 * contains the featured image URL with any query string stripped.
 *
 * @param {string} content Cleaned HTML.
 * @param {string|null} featuredImageUrl URL of the featured image.
 * @returns {string} `content` unchanged when either argument is empty;
 *   otherwise the trimmed content, without the leading image if it matched.
 */
function removeLeadingImage(content, featuredImageUrl) {
  if (!featuredImageUrl || !content) return content;

  const trimmed = content.trim();

  // Compare without the query string so size/format variants of the URL still match
  const baseUrl = featuredImageUrl.split('?')[0];
  // An empty needle would match every element via String#includes
  if (!baseUrl) return trimmed;

  const leadingPatterns = [
    // Pattern 1: leading <figure ...>...</figure>
    /^\s*<figure\b[^>]*>[\s\S]*?<\/figure>\s*/i,
    // Pattern 2: leading <a><img></a>. The image must appear before this
    // anchor's own </a>, so unrelated content between two links is never removed.
    /^\s*<a\b[^>]*>(?:(?!<\/a>)[\s\S])*?<img\b[^>]*>[\s\S]*?<\/a>\s*/i,
    // Pattern 3: leading standalone <img>
    /^\s*<img\b[^>]*>\s*/i,
  ];

  for (const pattern of leadingPatterns) {
    const found = trimmed.match(pattern);
    if (found && found[0].includes(baseUrl)) {
      return trimmed.substring(found[0].length).trim();
    }
  }

  return trimmed;
}

/**
 * Produce a plain-text excerpt from HTML content.
 *
 * Entities are decoded, shortcodes and HTML tags removed, and whitespace
 * collapsed. Text longer than `maxLength` is cut at `maxLength` characters,
 * the trailing (possibly partial) word is dropped, and '...' is appended.
 *
 * @param {string} content HTML content.
 * @param {number} [maxLength=200] Maximum excerpt length before the ellipsis.
 * @returns {string} The excerpt.
 */
function extractExcerpt(content, maxLength = 200) {
  const plain = removeShortcodes(decodeExcerptEntities(content))
    .replace(/<[^>]+>/g, ' ') // tags become spaces so adjacent words stay separated
    .replace(/\s+/g, ' ')
    .trim();

  if (plain.length > maxLength) {
    const clipped = plain.substring(0, maxLength).replace(/\s+\S*$/, '');
    return clipped + '...';
  }
  return plain;
}

// Parse items: published posts go to `posts`, published pages to `pages`;
// every other item is ignored.
const posts = [];
const pages = [];

items.forEach((item, index) => {
  const status = extractWPTag(item, 'status');
  const postType = extractWPTag(item, 'post_type');

  // Only published posts and pages end up in the output, so bail out before
  // doing any content processing for anything else.
  if (status !== 'publish') return;
  if (postType !== 'post' && postType !== 'page') return;

  // Try the CDATA form first: for a CDATA-wrapped title extractTag would
  // return the raw `<![CDATA[...]]>` wrapper and the fallback would never run.
  const title = extractCDATA(item, 'title') || extractTag(item, 'title');
  if (!title) return;

  const link = extractTag(item, 'link');
  const content = extractCDATA(item, 'content:encoded');
  const postName = extractWPTag(item, 'post_name');
  const postId = extractWPTag(item, 'post_id');
  const pubDate = extractTag(item, 'pubDate');
  const postDate = extractWPTag(item, 'post_date');

  const slug = createSlug(postName, title);
  const images = extractImages(content);
  // The first image in the content doubles as the featured image
  const featuredImage = images[0] || null;
  // Remove the leading image from content if it matches the featured image (to avoid duplication)
  const cleanedContent = removeLeadingImage(cleanContent(content), featuredImage);
  const excerpt = extractExcerpt(cleanedContent);

  const data = {
    // Fall back to the item's position in the export when there is no usable numeric id
    id: Number.parseInt(postId, 10) || index,
    title,
    slug,
    link,
    content: cleanedContent,
    excerpt,
    featuredImage,
    images,
    pubDate,
    postDate,
    status,
  };

  if (postType === 'post') {
    posts.push(data);
  } else {
    pages.push(data);
  }
});

// Order posts newest-first by their post date
posts.sort(
  (left, right) => new Date(right.postDate).getTime() - new Date(left.postDate).getTime()
);

console.log(`Parsed ${posts.length} blog posts`);
console.log(`Parsed ${pages.length} pages`);

// Output locations for the two JSON files
const postsPath = path.join(outputDir, 'blog-posts.json');
const pagesPath = path.join(outputDir, 'pages.json');

// Blog posts first, then pages; both pretty-printed with a 2-space indent
const postsJson = JSON.stringify(posts, null, 2);
fs.writeFileSync(postsPath, postsJson);
console.log(`Blog posts written to: ${postsPath}`);

const pagesJson = JSON.stringify(pages, null, 2);
fs.writeFileSync(pagesPath, pagesJson);
console.log(`Pages written to: ${pagesPath}`);

// Create index file for easier importing.
// `indexContent` is the full source text of a TypeScript module that imports
// the two generated JSON files, declares the `BlogPost` and `Page` interfaces
// (same fields as the objects serialized into the JSON), and exports typed
// arrays plus small lookup helpers. Everything between the backticks is
// written to disk verbatim, so any edit inside the template changes the
// generated file.
const indexContent = `// Auto-generated blog data exports
import blogPostsData from './blog-posts.json';
import pagesData from './pages.json';

export interface BlogPost {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export interface Page {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export const blogPosts: BlogPost[] = blogPostsData as BlogPost[];
export const pages: Page[] = pagesData as Page[];

export function getBlogPostBySlug(slug: string): BlogPost | undefined {
  return blogPosts.find(post => post.slug === slug);
}

export function getPageBySlug(slug: string): Page | undefined {
  return pages.find(page => page.slug === slug);
}

export function getAllBlogSlugs(): string[] {
  return blogPosts.map(post => post.slug);
}

export function getRecentPosts(limit: number = 10): BlogPost[] {
  return blogPosts.slice(0, limit);
}
`;

// Write the module next to the JSON files; an existing index.ts is overwritten
const indexPath = path.join(outputDir, 'index.ts');
fs.writeFileSync(indexPath, indexContent);
console.log(`Index file written to: ${indexPath}`);

// Show the first five posts (in their current array order) as a quick sanity check
console.log('\nFirst 5 blog posts:');
let rank = 0;
for (const { title, slug, postDate } of posts.slice(0, 5)) {
  rank += 1;
  console.log(`  ${rank}. ${title} (${slug}) - ${postDate}`);
}