#!/usr/bin/env node
/**
 * Parse Squarespace WordPress XML export and extract blog posts
 * Outputs JSON files for use in Next.js static generation
 *
 * Pipeline: read the XML export, split it into <item> elements, pull the
 * fields of every published post/page out with regular expressions, clean the
 * HTML body, then write blog-posts.json, pages.json and a typed index.ts into
 * src/data.
 */

const fs = require('fs');
const path = require('path');

// XML file path (Windows path via WSL)
const xmlPath = '/mnt/c/Users/jeffe/Downloads/Squarespace-Wordpress-Export-02-03-2026.xml';
const outputDir = path.join(__dirname, '..', 'src', 'data');

// Ensure output directory exists
if (!fs.existsSync(outputDir)) {
  fs.mkdirSync(outputDir, { recursive: true });
}

// Read XML file
console.log('Reading XML file...');
const xmlContent = fs.readFileSync(xmlPath, 'utf-8');

// Extract all items (posts and pages)
const itemRegex = /<item>([\s\S]*?)<\/item>/g;
const items = [];
let match;
while ((match = itemRegex.exec(xmlContent)) !== null) {
  items.push(match[1]);
}

console.log(`Found ${items.length} items`);

// Helper functions to extract data

/**
 * Decode the basic XML/HTML entities in `text`.
 * Runs two passes so double-encoded input (e.g. `&amp;lt;`) is fully decoded.
 *
 * @param {string} text - Raw text taken from the XML export.
 * @returns {string} The decoded text.
 */
function decodeHTMLEntities(text) {
  // First pass (also turns non-breaking spaces into plain spaces)
  let decoded = text
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    .replace(/&nbsp;/g, ' ');

  // Second pass for double-encoded entities
  decoded = decoded
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'");

  return decoded;
}

/**
 * Return the entity-decoded, trimmed text of the first `<tag>…</tag>` element.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name (interpolated into a RegExp as-is).
 * @returns {string} The element text, or '' when the element is absent.
 */
function extractTag(content, tag) {
  const regex = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`);
  const found = content.match(regex);
  return found ? decodeHTMLEntities(found[1].trim()) : '';
}

/**
 * Return the entity-decoded, trimmed payload of the first
 * `<tag><![CDATA[…]]></tag>` element.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name (interpolated into a RegExp as-is).
 * @returns {string} The CDATA payload, or '' when no such element exists.
 */
function extractCDATA(content, tag) {
  const regex = new RegExp(`<${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`);
  const found = content.match(regex);
  return found ? decodeHTMLEntities(found[1].trim()) : '';
}

/**
 * Return the trimmed text of the first `<wp:tag>…</wp:tag>` element.
 * Unlike extractTag, entities are NOT decoded.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The element text, or '' when the element is absent.
 */
function extractWPTag(content, tag) {
  const regex = new RegExp(`<wp:${tag}>([\\s\\S]*?)<\\/wp:${tag}>`);
  const found = content.match(regex);
  return found ? found[1].trim() : '';
}

/**
 * Return the trimmed payload of the first `<wp:tag><![CDATA[…]]></wp:tag>`
 * element. Entities are NOT decoded.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The CDATA payload, or '' when no such element exists.
 */
function extractWPCDATA(content, tag) {
  const regex = new RegExp(`<wp:${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/wp:${tag}>`);
  const found = content.match(regex);
  return found ? found[1].trim() : '';
}

/**
 * Create slug from post name or title.
 *
 * @param {string} postName - Post name from the export; may be a path.
 * @param {string} title - Post title, used only when postName is empty.
 * @returns {string} The slug.
 */
function createSlug(postName, title) {
  if (postName) {
    // Extract last part of path for slug; fall back to the previous segment
    // when the path ends with a trailing slash.
    const parts = postName.split('/');
    return parts[parts.length - 1] || parts[parts.length - 2] || '';
  }
  return title
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
}

/**
 * Extract image URLs from content.
 *
 * @param {string} content - HTML to scan.
 * @returns {string[]} The `src` value of every <img> tag, in document order.
 */
function extractImages(content) {
  const imageRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/g;
  const images = [];
  let imgMatch;
  while ((imgMatch = imageRegex.exec(content)) !== null) {
    images.push(imgMatch[1]);
  }
  return images;
}

/**
 * Convert WordPress [caption] shortcodes to HTML figure/figcaption.
 *
 * Pattern: [caption id="" align="..." width="..."]<a ...><img ...></a> or
 * <img ...> Caption text [/caption]. The caption text is whatever follows the
 * img/a tag and precedes [/caption].
 *
 * @param {string} html - HTML that may contain caption shortcodes.
 * @returns {string} HTML with every caption shortcode replaced.
 */
function convertCaptionShortcodes(html) {
  return html.replace(
    /\[caption[^\]]*\]([\s\S]*?)\[\/caption\]/gi,
    (shortcode, content) => {
      // Try to match either <a ...>...</a> or just <img ...>
      let imgMatch = content.match(/(<a[^>]*>[\s\S]*?<\/a>)/i);
      if (!imgMatch) {
        // No link wrapping, try just img tag
        imgMatch = content.match(/(<img[^>]*\/?>)/i);
      }
      if (!imgMatch) {
        // No image found, just return the content without shortcode
        return content.trim();
      }

      const imgTag = imgMatch[1];
      // Caption text is everything after the image/link tag
      const afterImg = content.substring(content.indexOf(imgTag) + imgTag.length);
      const captionText = afterImg
        .trim()
        .replace(/\s+/g, ' '); // Normalize internal whitespace

      if (captionText) {
        return `<figure>${imgTag}<figcaption>${captionText}</figcaption></figure>`;
      }
      return `<figure>${imgTag}</figure>`;
    }
  );
}

/**
 * Clean HTML content - convert to simpler format.
 *
 * @param {string} html - Raw post HTML from the export.
 * @returns {string} HTML with caption shortcodes converted and
 *   Squarespace-specific attributes removed; '' for empty input.
 */
function cleanContent(html) {
  if (!html) return '';

  // First convert WordPress caption shortcodes to proper HTML
  let clean = convertCaptionShortcodes(html);

  // Remove Squarespace-specific attributes and classes
  clean = clean
    .replace(/data-sqsp[^=]*="[^"]*"/g, '')
    .replace(/class="[^"]*sqs[^"]*"/g, '')
    .replace(/style="white-space:pre-wrap;"/g, '')
    .replace(/class=""/g, '')
    .replace(/data-rte-preserve-empty="true"/g, '')
    .replace(/data-animation-role="quote"/g, '')
    // NOTE(review): the reviewed copy of this script had its literal HTML tags
    // stripped, so the original per-tag normalisation patterns could not be
    // recovered. This generic rule collapses the whitespace left inside an
    // opening tag once its attributes are gone (e.g. `<p  >` -> `<p>`);
    // confirm against version control.
    .replace(/<([a-zA-Z][a-zA-Z0-9]*)\s+>/g, '<$1>');

  return clean;
}

/**
 * Decode HTML entities for excerpts (typographic entities become plain
 * punctuation where the replacement is ASCII).
 *
 * @param {string} text - HTML/text to decode.
 * @returns {string} The decoded text.
 */
function decodeExcerptEntities(text) {
  return text
    // Common named HTML entities
    .replace(/&mdash;/g, '—')
    .replace(/&ndash;/g, '–')
    .replace(/&hellip;/g, '...')
    .replace(/&rsquo;/g, "'")
    .replace(/&lsquo;/g, "'")
    .replace(/&rdquo;/g, '"')
    .replace(/&ldquo;/g, '"')
    .replace(/&nbsp;/g, ' ')
    .replace(/&amp;/g, '&')
    .replace(/&lt;/g, '<')
    .replace(/&gt;/g, '>')
    .replace(/&quot;/g, '"')
    .replace(/&#39;/g, "'")
    // Numeric forms of the same typographic characters
    .replace(/&#8217;/g, "'")
    .replace(/&#8216;/g, "'")
    .replace(/&#8220;/g, '"')
    .replace(/&#8221;/g, '"')
    .replace(/&#8212;/g, '—')
    .replace(/&#8211;/g, '–')
    .replace(/&#8230;/g, '...');
}

/**
 * Remove WordPress-style shortcodes.
 *
 * @param {string} text - Text that may contain `[shortcode]` markers.
 * @returns {string} The text with shortcode markers removed; the inner
 *   content of [caption] blocks is kept.
 */
function removeShortcodes(text) {
  // Remove [caption]...[/caption] shortcodes, keeping what they wrap
  let clean = text.replace(/\[caption[^\]]*\](.*?)\[\/caption\]/gi, '$1');
  // Remove other common shortcodes
  clean = clean.replace(/\[\/?[a-z_]+[^\]]*\]/gi, '');
  return clean;
}

/**
 * Remove the first/leading image from content to avoid duplication with the
 * featured image.
 *
 * @param {string} content - Cleaned post HTML.
 * @param {string|null} featuredImageUrl - URL of the featured image.
 * @returns {string} The content without its leading image when that image is
 *   the featured one (compared without query string); otherwise the trimmed
 *   content. Falsy inputs return `content` untouched.
 */
function removeLeadingImage(content, featuredImageUrl) {
  if (!featuredImageUrl || !content) return content;

  const result = content.trim();
  const baseUrl = featuredImageUrl.split('?')[0];

  const leadingPatterns = [
    // Pattern 1: leading <figure>...</figure> containing the featured image
    /^(\s*<figure[^>]*>[\s\S]*?<\/figure>\s*)/i,
    // Pattern 2: leading <a><img></a> containing the featured image. The
    // negative lookaheads keep the match inside ONE anchor, so a leading text
    // link followed later by a linked image is not swallowed together with
    // everything in between.
    /^(\s*<a[^>]*>(?:(?!<\/a>)[\s\S])*?<img[^>]*>(?:(?!<\/a>)[\s\S])*?<\/a>\s*)/i,
    // Pattern 3: leading standalone <img> containing the featured image
    /^(\s*<img[^>]*\/?>\s*)/i,
  ];

  for (const pattern of leadingPatterns) {
    const leading = result.match(pattern);
    if (leading && leading[1].includes(baseUrl)) {
      return result.substring(leading[1].length).trim();
    }
  }

  return result;
}

/**
 * Extract the start of the content as a plain-text excerpt.
 *
 * @param {string} content - Post HTML.
 * @param {number} [maxLength=200] - Maximum excerpt length before the ellipsis.
 * @returns {string} Plain text, cut at a word boundary and suffixed with
 *   '...' when it exceeds maxLength.
 */
function extractExcerpt(content, maxLength = 200) {
  // First decode HTML entities in the raw content
  let text = decodeExcerptEntities(content);
  // Remove WordPress shortcodes
  text = removeShortcodes(text);
  // Remove HTML tags
  text = text.replace(/<[^>]+>/g, ' ');
  // Clean up whitespace
  text = text.replace(/\s+/g, ' ').trim();

  if (text.length <= maxLength) return text;
  // Drop the trailing partial word before appending the ellipsis
  return text.substring(0, maxLength).replace(/\s+\S*$/, '') + '...';
}

// Parse items
const posts = [];
const pages = [];

items.forEach((item, index) => {
  // Try the CDATA form first: extractTag would otherwise return the literal
  // `<![CDATA[...]]>` wrapper as part of the title.
  const title = extractCDATA(item, 'title') || extractTag(item, 'title');
  const link = extractTag(item, 'link');
  const content = extractCDATA(item, 'content:encoded');
  const postName = extractWPTag(item, 'post_name');
  const postType = extractWPTag(item, 'post_type');
  const postId = extractWPTag(item, 'post_id');
  const status = extractWPTag(item, 'status');
  const pubDate = extractTag(item, 'pubDate');
  const postDate = extractWPTag(item, 'post_date');

  // Only published, titled items are exported
  if (!title || status !== 'publish') return;

  const slug = createSlug(postName, title);
  const images = extractImages(content);
  const featuredImage = images[0] || null;

  let cleanedContent = cleanContent(content);
  // Remove the leading image from content if it matches the featured image (to avoid duplication)
  cleanedContent = removeLeadingImage(cleanedContent, featuredImage);

  const excerpt = extractExcerpt(cleanedContent);

  const data = {
    // Fall back to the item's position when the export has no usable post id
    id: Number.parseInt(postId, 10) || index,
    title,
    slug,
    link,
    content: cleanedContent,
    excerpt,
    featuredImage,
    images,
    pubDate,
    postDate,
    status,
  };

  if (postType === 'post') {
    posts.push(data);
  } else if (postType === 'page') {
    pages.push(data);
  }
});

// Sort posts by date (newest first)
posts.sort((a, b) => new Date(b.postDate).getTime() - new Date(a.postDate).getTime());

console.log(`Parsed ${posts.length} blog posts`);
console.log(`Parsed ${pages.length} pages`);

// Write blog posts JSON
const postsPath = path.join(outputDir, 'blog-posts.json');
fs.writeFileSync(postsPath, JSON.stringify(posts, null, 2));
console.log(`Blog posts written to: ${postsPath}`);

// Write pages JSON
const pagesPath = path.join(outputDir, 'pages.json');
fs.writeFileSync(pagesPath, JSON.stringify(pages, null, 2));
console.log(`Pages written to: ${pagesPath}`);

// Create index file for easier importing
const indexContent = `// Auto-generated blog data exports
import blogPostsData from './blog-posts.json';
import pagesData from './pages.json';

export interface BlogPost {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export interface Page {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export const blogPosts: BlogPost[] = blogPostsData as BlogPost[];
export const pages: Page[] = pagesData as Page[];

export function getBlogPostBySlug(slug: string): BlogPost | undefined {
  return blogPosts.find(post => post.slug === slug);
}

export function getPageBySlug(slug: string): Page | undefined {
  return pages.find(page => page.slug === slug);
}

export function getAllBlogSlugs(): string[] {
  return blogPosts.map(post => post.slug);
}

export function getRecentPosts(limit: number = 10): BlogPost[] {
  return blogPosts.slice(0, limit);
}
`;

const indexPath = path.join(outputDir, 'index.ts');
fs.writeFileSync(indexPath, indexContent);
console.log(`Index file written to: ${indexPath}`);

// Print sample of first 5 posts
console.log('\nFirst 5 blog posts:');
posts.slice(0, 5).forEach((post, i) => {
  console.log(`  ${i + 1}. ${post.title} (${post.slug}) - ${post.postDate}`);
});