katheryn-website/frontend/scripts/parse-squarespace-xml.js

367 lines
11 KiB
JavaScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env node
/**
* Parse Squarespace WordPress XML export and extract blog posts
* Outputs JSON files for use in Next.js static generation
*/
const fs = require('fs');
const path = require('path');
// Input export: the first command-line argument when one is given, otherwise
// the original default location (a Windows download folder reached via WSL).
const xmlPath =
  process.argv[2] ||
  '/mnt/c/Users/jeffe/Downloads/Squarespace-Wordpress-Export-02-03-2026.xml';
// Generated data lands in <script dir>/../src/data
const outputDir = path.join(__dirname, '..', 'src', 'data');
// Ensure output directory exists
if (!fs.existsSync(outputDir)) {
  fs.mkdirSync(outputDir, { recursive: true });
}
// Read the whole XML export into memory as UTF-8 text
console.log('Reading XML file...');
const xmlContent = fs.readFileSync(xmlPath, 'utf-8');
// Collect the inner text of every <item>...</item> element (posts and pages).
// matchAll iterates over a private copy of the regex, so no lastIndex state
// is left behind on itemRegex.
const itemRegex = /<item>([\s\S]*?)<\/item>/g;
const items = Array.from(xmlContent.matchAll(itemRegex), (found) => found[1]);
console.log(`Found ${items.length} items`);
// Helper functions to extract data
/**
 * Decode a small fixed set of HTML entities in `text`.
 *
 * Two passes are applied so that double-encoded input (for example
 * `&amp;lt;`) ends up fully decoded. The first pass additionally turns
 * `&nbsp;` into a plain space; the second pass does not.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The decoded text.
 */
function decodeHTMLEntities(text) {
  // Order matters: `&amp;` is decoded first, exactly as the passes are defined.
  const coreEntities = [
    [/&amp;/g, '&'],
    [/&lt;/g, '<'],
    [/&gt;/g, '>'],
    [/&quot;/g, '"'],
    [/&#39;/g, "'"],
  ];
  const applyTable = (input, table) =>
    table.reduce((acc, [pattern, replacement]) => acc.replace(pattern, replacement), input);

  const firstPass = applyTable(text, [...coreEntities, [/&nbsp;/g, ' ']]);
  // Second pass for double-encoded entities
  return applyTable(firstPass, coreEntities);
}
/**
 * Return the text of the first `<tag>...</tag>` element found in `content`.
 *
 * The captured text is trimmed and passed through decodeHTMLEntities.
 * The opening tag must match exactly (no attributes).
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name, inserted into the pattern as-is.
 * @returns {string} The decoded text, or '' when the element is absent.
 */
function extractTag(content, tag) {
  const pattern = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return decodeHTMLEntities(found[1].trim());
}
/**
 * Return the CDATA payload of the first `<tag><![CDATA[...]]></tag>` element
 * found in `content`.
 *
 * The payload is trimmed and passed through decodeHTMLEntities. The CDATA
 * section must directly follow the opening tag and directly precede the
 * closing tag.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name, inserted into the pattern as-is.
 * @returns {string} The decoded payload, or '' when no such element exists.
 */
function extractCDATA(content, tag) {
  const pattern = new RegExp(`<${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return decodeHTMLEntities(found[1].trim());
}
/**
 * Return the trimmed text of the first `<wp:tag>...</wp:tag>` element in
 * `content`. No entity decoding is applied.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The trimmed text, or '' when the element is absent.
 */
function extractWPTag(content, tag) {
  const pattern = new RegExp(`<wp:${tag}>([\\s\\S]*?)<\\/wp:${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return found[1].trim();
}
/**
 * Return the trimmed CDATA payload of the first
 * `<wp:tag><![CDATA[...]]></wp:tag>` element in `content`. No entity decoding
 * is applied.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The trimmed payload, or '' when no such element exists.
 */
function extractWPCDATA(content, tag) {
  const pattern = new RegExp(`<wp:${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/wp:${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return found[1].trim();
}
/**
 * Build a URL slug for an item.
 *
 * When `postName` is non-empty, the slug is its last `/`-separated segment,
 * or the segment before it when the last one is empty (trailing slash).
 * Otherwise the slug is derived from `title`: lower-cased, with every run of
 * characters outside a-z / 0-9 collapsed to a single hyphen and one leading
 * or trailing hyphen stripped.
 *
 * @param {string} postName - Post name / path from the export; may be empty.
 * @param {string} title - Item title, used only when `postName` is empty.
 * @returns {string} The slug (may be '' when nothing usable is found).
 */
function createSlug(postName, title) {
  if (!postName) {
    const lowered = title.toLowerCase();
    const hyphenated = lowered.replace(/[^a-z0-9]+/g, '-');
    return hyphenated.replace(/^-|-$/g, '');
  }
  // Extract last part of path for slug
  const segments = postName.split('/');
  const lastSegment = segments[segments.length - 1];
  const previousSegment = segments[segments.length - 2];
  return lastSegment || previousSegment || '';
}
/**
 * Collect the `src` value of every `<img>` tag in `content`, in document
 * order. Both single- and double-quoted attributes are recognised.
 *
 * @param {string} content - HTML to scan.
 * @returns {string[]} The image URLs found (empty when there are none).
 */
function extractImages(content) {
  const imgSrcPattern = /<img[^>]+src=["']([^"']+)["'][^>]*>/g;
  // String(...) mirrors the implicit string coercion performed by RegExp#exec.
  return Array.from(String(content).matchAll(imgSrcPattern), (hit) => hit[1]);
}
/**
 * Convert WordPress `[caption ...]...[/caption]` shortcodes into
 * `<figure class="wp-caption">` markup.
 *
 * Inside each shortcode the media element is the first `<img>`, together with
 * its wrapping `<a>...</a>` when the image sits inside a link. Everything
 * after the media element becomes the `<figcaption>` (whitespace collapsed);
 * text before it is dropped. A shortcode without any `<img>` is replaced by
 * its trimmed inner content.
 *
 * The media element is chosen by locating the image first, so a link that
 * only appears in the caption text (e.g. a photo credit) is kept as caption
 * text instead of being mistaken for the image wrapper.
 *
 * @param {string} html - HTML that may contain caption shortcodes.
 * @returns {string} The HTML with every caption shortcode converted.
 */
function convertCaptionShortcodes(html) {
  // An anchor that really wraps an <img>: the negative lookahead prevents the
  // match from running past one link's </a> to reach an image further along.
  // `(?:\s[^>]*)?` keeps tags such as <abbr> from being read as <a>.
  const linkedImagePattern = /<a(?:\s[^>]*)?>(?:(?!<\/a>)[\s\S])*?<img\b[^>]*>[\s\S]*?<\/a>/i;
  const bareImagePattern = /<img\b[^>]*>/i;

  return html.replace(
    /\[caption[^\]]*\]([\s\S]*?)\[\/caption\]/gi,
    (_shortcode, content) => {
      const bareImage = content.match(bareImagePattern);
      if (!bareImage) {
        // No image found, just return the content without shortcode
        return content.trim();
      }
      // Use the link wrapper only when it contains the first image; if a bare
      // image precedes a later linked image, the bare one is the media element.
      const linkedImage = content.match(linkedImagePattern);
      const media =
        linkedImage && linkedImage.index <= bareImage.index ? linkedImage : bareImage;
      const mediaHtml = media[0];

      // Caption text is everything after the image/link tag, whitespace-normalised
      const captionText = content
        .slice(media.index + mediaHtml.length)
        .trim()
        .replace(/\s+/g, ' ');

      return captionText
        ? `<figure class="wp-caption">${mediaHtml}<figcaption>${captionText}</figcaption></figure>`
        : `<figure class="wp-caption">${mediaHtml}</figure>`;
    }
  );
}
/**
 * Simplify exported post HTML.
 *
 * Caption shortcodes are first converted to figure markup via
 * convertCaptionShortcodes, then a fixed, ordered list of Squarespace-specific
 * attributes and classes is stripped or simplified.
 *
 * @param {string} html - Raw post HTML; falsy input yields ''.
 * @returns {string} The cleaned HTML.
 */
function cleanContent(html) {
  if (!html) return '';

  // Applied top to bottom; later rules see the output of earlier ones.
  const cleanupRules = [
    [/data-sqsp[^=]*="[^"]*"/g, ''],
    [/class="[^"]*sqs[^"]*"/g, ''],
    [/style="white-space:pre-wrap;"/g, ''],
    [/class=""/g, ''],
    [/<p class="" >/g, '<p>'],
    [/<p class="">/g, '<p>'],
    [/data-rte-preserve-empty="true"/g, ''],
    [/<div class="sqs-html-content"[^>]*>/g, '<div>'],
    [/<figure class="block-animation-none">/g, '<figure>'],
    [/data-animation-role="quote"/g, ''],
  ];

  // First convert WordPress caption shortcodes to proper HTML
  let cleaned = convertCaptionShortcodes(html);
  for (const [pattern, replacement] of cleanupRules) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return cleaned;
}
/**
 * Decode the HTML entities commonly found in post text so excerpts read as
 * plain text.
 *
 * Typographic quotes are flattened to straight quotes and ellipses to three
 * dots; em and en dashes are kept as the corresponding Unicode characters.
 * Replacements run in the listed order, so `&amp;` is decoded before the
 * numeric entities that follow it.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The decoded text.
 */
function decodeExcerptEntities(text) {
  const EM_DASH = '\u2014';
  // Written as an escape so the character cannot be lost or confused with a
  // hyphen; an empty replacement here would glue ranges like "1990-2000"
  // together as "19902000".
  const EN_DASH = '\u2013';

  const replacements = [
    // Common named entities
    [/&mdash;/g, EM_DASH],
    [/&ndash;/g, EN_DASH],
    [/&hellip;/g, '...'],
    [/&rsquo;/g, "'"],
    [/&lsquo;/g, "'"],
    [/&rdquo;/g, '"'],
    [/&ldquo;/g, '"'],
    [/&nbsp;/g, ' '],
    [/&amp;/g, '&'],
    [/&lt;/g, '<'],
    [/&gt;/g, '>'],
    [/&quot;/g, '"'],
    // Numeric entities
    [/&#39;/g, "'"],
    [/&#8217;/g, "'"],
    [/&#8216;/g, "'"],
    [/&#8220;/g, '"'],
    [/&#8221;/g, '"'],
    [/&#8212;/g, EM_DASH],
    [/&#8211;/g, EN_DASH],
    [/&#8230;/g, '...'],
  ];

  return replacements.reduce(
    (decoded, [pattern, replacement]) => decoded.replace(pattern, replacement),
    text
  );
}
/**
 * Strip WordPress-style shortcodes from `text`.
 *
 * Single-line `[caption ...]...[/caption]` pairs are unwrapped to their inner
 * content first; afterwards any remaining bracketed tag that starts with a
 * letter or underscore (optionally preceded by `/`) is removed.
 *
 * @param {string} text - Text that may contain shortcodes.
 * @returns {string} The text without shortcode tags.
 */
function removeShortcodes(text) {
  const captionPair = /\[caption[^\]]*\](.*?)\[\/caption\]/gi;
  const shortcodeTag = /\[\/?[a-z_]+[^\]]*\]/gi;
  return text.replace(captionPair, '$1').replace(shortcodeTag, '');
}
/**
 * Remove the image at the very start of `content` when it is the featured
 * image, so the featured image is not shown twice.
 *
 * The content is trimmed, then the leading element is tested against three
 * shapes in order: a `<figure>...</figure>`, an `<a>` wrapping an `<img>`, and
 * a standalone `<img>`. The first shape that matches AND contains the
 * featured image URL (query string ignored) is removed. If nothing qualifies
 * the trimmed content is returned unchanged.
 *
 * @param {string} content - Cleaned post HTML.
 * @param {string|null} featuredImageUrl - URL of the featured image.
 * @returns {string} The content without the duplicated leading image; the
 *   input is returned untouched when either argument is falsy.
 */
function removeLeadingImage(content, featuredImageUrl) {
  if (!featuredImageUrl || !content) return content;

  const trimmed = content.trim();
  // Compare without the query string (e.g. size/format parameters).
  const featuredBase = featuredImageUrl.split('?')[0];
  // An empty base would be "contained" in every string and strip any image.
  if (!featuredBase) return trimmed;

  const leadingPatterns = [
    // Pattern 1: leading <figure ...>...</figure>
    /^\s*<figure[^>]*>[\s\S]*?<\/figure>\s*/i,
    // Pattern 2: leading <a><img></a>. The lookahead keeps the match inside
    // the first link, so a leading text link followed later by a linked image
    // cannot swallow everything in between.
    /^\s*<a(?:\s[^>]*)?>(?:(?!<\/a>)[\s\S])*?<img\b[^>]*>[\s\S]*?<\/a>\s*/i,
    // Pattern 3: leading standalone <img>, plus trailing whitespace only
    /^\s*<img\b[^>]*>\s*/i,
  ];

  for (const pattern of leadingPatterns) {
    const lead = trimmed.match(pattern);
    if (lead && lead[0].includes(featuredBase)) {
      return trimmed.slice(lead[0].length).trim();
    }
  }
  return trimmed;
}
/**
 * Produce a plain-text excerpt from post HTML.
 *
 * Steps, in order: decode entities (decodeExcerptEntities), strip shortcodes
 * (removeShortcodes), replace HTML tags with spaces, collapse whitespace.
 * Text longer than `maxLength` is cut at `maxLength` characters, the trailing
 * partial word is dropped, and '...' is appended.
 *
 * @param {string} content - Post HTML.
 * @param {number} [maxLength=200] - Maximum excerpt length before the '...'.
 * @returns {string} The excerpt.
 */
function extractExcerpt(content, maxLength = 200) {
  const plainText = removeShortcodes(decodeExcerptEntities(content))
    .replace(/<[^>]+>/g, ' ') // Remove HTML tags
    .replace(/\s+/g, ' ') // Clean up whitespace
    .trim();

  if (plainText.length > maxLength) {
    const clipped = plainText.substring(0, maxLength);
    return clipped.replace(/\s+\S*$/, '') + '...';
  }
  return plainText;
}
// Parse items: keep only published entries that have a title, build one data
// record per entry, and bucket the records by WordPress post type.
const posts = [];
const pages = [];
for (const [index, item] of items.entries()) {
  // Try the CDATA form first. The plain-tag pattern also matches a
  // CDATA-wrapped title and would return it with the `<![CDATA[ ... ]]>`
  // wrapper still attached, so it is only safe as the fallback.
  const title = extractCDATA(item, 'title') || extractTag(item, 'title');
  const status = extractWPTag(item, 'status');
  if (!title || status !== 'publish') continue;

  const link = extractTag(item, 'link');
  const content = extractCDATA(item, 'content:encoded');
  const postName = extractWPTag(item, 'post_name');
  const postType = extractWPTag(item, 'post_type');
  const postId = extractWPTag(item, 'post_id');
  const pubDate = extractTag(item, 'pubDate');
  const postDate = extractWPTag(item, 'post_date');

  const slug = createSlug(postName, title);
  // The first image in the raw content doubles as the featured image.
  const images = extractImages(content);
  const featuredImage = images[0] || null;
  // Remove the leading image from content if it matches the featured image (to avoid duplication)
  const cleanedContent = removeLeadingImage(cleanContent(content), featuredImage);
  const excerpt = extractExcerpt(cleanedContent);

  const data = {
    // Fall back to the item's position when the id is missing, non-numeric or 0.
    id: parseInt(postId, 10) || index,
    title,
    slug,
    link,
    content: cleanedContent,
    excerpt,
    featuredImage,
    images,
    pubDate,
    postDate,
    status,
  };

  // Any other post type (e.g. attachments) is ignored.
  if (postType === 'post') {
    posts.push(data);
  } else if (postType === 'page') {
    pages.push(data);
  }
}
// Order posts newest-first by their post date
posts.sort((left, right) => Date.parse(right.postDate) - Date.parse(left.postDate));
console.log(`Parsed ${posts.length} blog posts`);
console.log(`Parsed ${pages.length} pages`);

// Both collections are written as 2-space-indented JSON into the output directory
const toPrettyJson = (value) => JSON.stringify(value, null, 2);
const postsPath = path.join(outputDir, 'blog-posts.json');
const pagesPath = path.join(outputDir, 'pages.json');

// Blog posts
fs.writeFileSync(postsPath, toPrettyJson(posts));
console.log(`Blog posts written to: ${postsPath}`);

// Pages
fs.writeFileSync(pagesPath, toPrettyJson(pages));
console.log(`Pages written to: ${pagesPath}`);
// Create index file for easier importing.
// The template below is written verbatim to index.ts in the output directory.
// It is TypeScript source that imports the two JSON files written by this
// script, declares the BlogPost and Page interfaces (same fields as the data
// records built above), and exports typed arrays plus lookup helpers.
// NOTE: everything between the backticks is emitted as-is, so any edit inside
// the template changes the generated file.
const indexContent = `// Auto-generated blog data exports
import blogPostsData from './blog-posts.json';
import pagesData from './pages.json';
export interface BlogPost {
id: number;
title: string;
slug: string;
link: string;
content: string;
excerpt: string;
featuredImage: string | null;
images: string[];
pubDate: string;
postDate: string;
status: string;
}
export interface Page {
id: number;
title: string;
slug: string;
link: string;
content: string;
excerpt: string;
featuredImage: string | null;
images: string[];
pubDate: string;
postDate: string;
status: string;
}
export const blogPosts: BlogPost[] = blogPostsData as BlogPost[];
export const pages: Page[] = pagesData as Page[];
export function getBlogPostBySlug(slug: string): BlogPost | undefined {
return blogPosts.find(post => post.slug === slug);
}
export function getPageBySlug(slug: string): Page | undefined {
return pages.find(page => page.slug === slug);
}
export function getAllBlogSlugs(): string[] {
return blogPosts.map(post => post.slug);
}
export function getRecentPosts(limit: number = 10): BlogPost[] {
return blogPosts.slice(0, limit);
}
`;
// Write the generated module next to the JSON files it imports
const indexPath = path.join(outputDir, 'index.ts');
fs.writeFileSync(indexPath, indexContent);
console.log(`Index file written to: ${indexPath}`);
// Show the first five posts (after sorting) as a quick sanity check
console.log('\nFirst 5 blog posts:');
let position = 0;
for (const post of posts.slice(0, 5)) {
  position += 1;
  console.log(` ${position}. ${post.title} (${post.slug}) - ${post.postDate}`);
}