katheryn-website/frontend/scripts/parse-squarespace-xml.js

252 lines
7.0 KiB
JavaScript

#!/usr/bin/env node
/**
* Parse Squarespace WordPress XML export and extract blog posts
* Outputs JSON files for use in Next.js static generation
*/
const fs = require('fs');
const path = require('path');
// Location of the Squarespace WordPress XML export.
// The default is the path the script was originally written against (a Windows
// download folder seen through WSL). To run on another machine, pass the file
// as the first CLI argument or set the SQUARESPACE_XML environment variable:
//   node parse-squarespace-xml.js ./export.xml
const DEFAULT_XML_PATH = '/mnt/c/Users/jeffe/Downloads/Squarespace-Wordpress-Export-02-03-2026.xml';
const xmlPath = process.argv[2] || process.env.SQUARESPACE_XML || DEFAULT_XML_PATH;
const outputDir = path.join(__dirname, '..', 'src', 'data');

// Fail early with a readable message instead of an ENOENT stack trace.
if (!fs.existsSync(xmlPath)) {
  console.error(`XML export not found: ${xmlPath}`);
  console.error('Usage: node parse-squarespace-xml.js [path/to/export.xml]');
  process.exit(1);
}

// Ensure output directory exists. With `recursive: true` this is a no-op when
// the directory is already there, so no separate existence check is needed.
fs.mkdirSync(outputDir, { recursive: true });

// Read XML file
console.log('Reading XML file...');
const xmlContent = fs.readFileSync(xmlPath, 'utf-8');

// Extract all items (posts and pages). Each entry of `items` is the raw text
// found between one <item> and its closing </item>.
const itemRegex = /<item>([\s\S]*?)<\/item>/g;
const items = [];
let match;
while ((match = itemRegex.exec(xmlContent)) !== null) {
  items.push(match[1]);
}
console.log(`Found ${items.length} items`);
// Helper functions to extract data
/**
 * Decode the HTML entities found in the export into plain characters.
 *
 * Handles `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&#39;`, `&apos;`, `&nbsp;` and
 * decimal / hexadecimal numeric character references (e.g. `&#8217;`,
 * `&#x201C;`). The whole decoder is applied twice so that double-encoded
 * input such as `&amp;lt;` or `&amp;#8217;` is fully resolved.
 *
 * @param {string} text - Text that may contain HTML entities.
 * @returns {string} The decoded text.
 */
function decodeHTMLEntities(text) {
  // Turn a numeric reference into its character. References that do not name
  // a usable code point (zero, surrogates, beyond U+10FFFF) are left as-is
  // rather than throwing or producing a broken string.
  const fromCodePoint = (original, codePoint) => {
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    if (!(codePoint > 0 && codePoint <= 0x10ffff) || isSurrogate) {
      return original;
    }
    // Keep numeric non-breaking spaces consistent with `&nbsp;`, which this
    // function maps to a regular space.
    if (codePoint === 0xa0) {
      return ' ';
    }
    return String.fromCodePoint(codePoint);
  };

  // `&amp;` is decoded first so that entities it was hiding (e.g. `&amp;lt;`)
  // are picked up by the replacements that follow in the same pass.
  const decodeOnce = (input) =>
    input
      .replace(/&amp;/g, '&')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#39;/g, "'")
      .replace(/&apos;/g, "'")
      .replace(/&#(\d+);/g, (whole, digits) => fromCodePoint(whole, parseInt(digits, 10)))
      .replace(/&#[xX]([0-9a-fA-F]+);/g, (whole, hex) => fromCodePoint(whole, parseInt(hex, 16)))
      .replace(/&nbsp;/g, ' ');

  // Run twice to handle double-encoded entities.
  return decodeOnce(decodeOnce(text));
}
/**
 * Return the entity-decoded, trimmed text of the first `<tag>...</tag>`
 * element found in an XML fragment.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name, inserted into the pattern as-is.
 * @returns {string} The element text, or '' when the element is absent.
 */
function extractTag(content, tag) {
  const pattern = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  const inner = found[1].trim();
  return decodeHTMLEntities(inner);
}
/**
 * Return the entity-decoded, trimmed body of the first
 * `<tag><![CDATA[...]]></tag>` element found in an XML fragment.
 * The CDATA section must directly follow the opening tag and directly
 * precede the closing tag for the element to match.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name, inserted into the pattern as-is.
 * @returns {string} The CDATA body, or '' when no such element is found.
 */
function extractCDATA(content, tag) {
  const pattern = new RegExp(`<${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  const inner = found[1].trim();
  return decodeHTMLEntities(inner);
}
/**
 * Return the trimmed text of the first `<wp:tag>...</wp:tag>` element in an
 * XML fragment. Unlike extractTag, the text is NOT entity-decoded.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The element text, or '' when the element is absent.
 */
function extractWPTag(content, tag) {
  const pattern = new RegExp(`<wp:${tag}>([\\s\\S]*?)<\\/wp:${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return found[1].trim();
}
/**
 * Return the trimmed body of the first `<wp:tag><![CDATA[...]]></wp:tag>`
 * element in an XML fragment. The body is NOT entity-decoded.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The CDATA body, or '' when no such element is found.
 */
function extractWPCDATA(content, tag) {
  const pattern = new RegExp(`<wp:${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/wp:${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return found[1].trim();
}
/**
 * Build a URL slug for an item.
 *
 * When a post name is available, the slug is its last `/`-separated segment
 * (or the one before it when the name ends with a slash). Otherwise the slug
 * is derived from the title: lower-cased, every run of characters outside
 * a-z / 0-9 replaced by a single dash, and one leading / trailing dash removed.
 *
 * @param {string} postName - Post name / path from the export; may be empty.
 * @param {string} title - Item title, used only when postName is empty.
 * @returns {string} The slug; may be '' if nothing usable is found.
 */
function createSlug(postName, title) {
  if (!postName) {
    const lowered = title.toLowerCase();
    const dashed = lowered.replace(/[^a-z0-9]+/g, '-');
    return dashed.replace(/^-|-$/g, '');
  }

  const segments = postName.split('/');
  const last = segments[segments.length - 1];
  if (last) {
    return last;
  }
  // Trailing slash: fall back to the segment before the empty last one.
  const previous = segments[segments.length - 2];
  return previous || '';
}
/**
 * Collect the `src` value of every `<img>` tag in an HTML string, in
 * document order.
 *
 * @param {string} content - HTML to scan.
 * @returns {string[]} The image URLs; empty when no `<img src=...>` is found.
 */
function extractImages(content) {
  const imgSrcPattern = /<img[^>]+src=["']([^"']+)["'][^>]*>/g;
  const urls = [];
  for (let hit = imgSrcPattern.exec(content); hit !== null; hit = imgSrcPattern.exec(content)) {
    urls.push(hit[1]);
  }
  return urls;
}
/**
 * Strip Squarespace-specific attributes and classes from an HTML string.
 *
 * The substitutions run in the order listed; later entries see the output of
 * earlier ones, so the order is part of the behavior.
 *
 * @param {string} html - Raw HTML from the export; may be empty or missing.
 * @returns {string} The simplified HTML, or '' for falsy input.
 */
function cleanContent(html) {
  if (!html) {
    return '';
  }

  const substitutions = [
    { pattern: /data-sqsp[^=]*="[^"]*"/g, replacement: '' },
    { pattern: /class="[^"]*sqs[^"]*"/g, replacement: '' },
    { pattern: /style="white-space:pre-wrap;"/g, replacement: '' },
    { pattern: /class=""/g, replacement: '' },
    { pattern: /<p class="" >/g, replacement: '<p>' },
    { pattern: /<p class="">/g, replacement: '<p>' },
    { pattern: /data-rte-preserve-empty="true"/g, replacement: '' },
    { pattern: /<div class="sqs-html-content"[^>]*>/g, replacement: '<div>' },
    { pattern: /<figure class="block-animation-none">/g, replacement: '<figure>' },
    { pattern: /data-animation-role="quote"/g, replacement: '' },
  ];

  let result = html;
  for (const { pattern, replacement } of substitutions) {
    result = result.replace(pattern, replacement);
  }
  return result;
}
/**
 * Produce a plain-text excerpt from HTML content.
 *
 * Tags are replaced by spaces and whitespace is collapsed. Text longer than
 * `maxLength` is cut at `maxLength` characters, the trailing partial word is
 * dropped (when the cut text contains whitespace), and '...' is appended.
 *
 * @param {string} content - HTML to summarize.
 * @param {number} [maxLength=200] - Maximum length before truncation.
 * @returns {string} The excerpt text.
 */
function extractExcerpt(content, maxLength = 200) {
  const withoutTags = content.replace(/<[^>]+>/g, ' ');
  const text = withoutTags.replace(/\s+/g, ' ').trim();

  if (text.length > maxLength) {
    const truncated = text.substring(0, maxLength);
    return `${truncated.replace(/\s+\S*$/, '')}...`;
  }
  return text;
}
// Parse items: turn each raw <item> fragment into a record and sort it into
// `posts` or `pages` by its wp:post_type. Items without a title, items whose
// status is not 'publish', and items of any other post type are skipped.
const posts = [];
const pages = [];
items.forEach((item, index) => {
  // Try the CDATA form first. extractTag() also matches a CDATA-wrapped
  // <title> and would return the raw `<![CDATA[...]]>` markup as a truthy
  // value, so with the opposite order the CDATA branch could never be reached.
  const title = extractCDATA(item, 'title') || extractTag(item, 'title');
  const link = extractTag(item, 'link');
  const content = extractCDATA(item, 'content:encoded');
  const postName = extractWPTag(item, 'post_name');
  const postType = extractWPTag(item, 'post_type');
  const postId = extractWPTag(item, 'post_id');
  const status = extractWPTag(item, 'status');
  const pubDate = extractTag(item, 'pubDate');
  const postDate = extractWPTag(item, 'post_date');

  if (!title || status !== 'publish') return;

  const slug = createSlug(postName, title);
  // Images are collected from the raw content; the first one doubles as the
  // featured image.
  const images = extractImages(content);
  const featuredImage = images[0] || null;
  const cleanedContent = cleanContent(content);
  const excerpt = extractExcerpt(cleanedContent);

  const data = {
    // Fall back to the item's position when wp:post_id is missing or not a
    // number (parseInt yields NaN, which is falsy).
    id: parseInt(postId, 10) || index,
    title,
    slug,
    link,
    content: cleanedContent,
    excerpt,
    featuredImage,
    images,
    pubDate,
    postDate,
    status,
  };

  if (postType === 'post') {
    posts.push(data);
  } else if (postType === 'page') {
    pages.push(data);
  }
});
// Order posts newest-first by their wp:post_date before anything is written.
posts.sort((first, second) => {
  const firstTime = new Date(first.postDate).getTime();
  const secondTime = new Date(second.postDate).getTime();
  return secondTime - firstTime;
});
console.log(`Parsed ${posts.length} blog posts`);
console.log(`Parsed ${pages.length} pages`);

// Both JSON files live next to each other in the output directory.
const postsPath = path.join(outputDir, 'blog-posts.json');
const pagesPath = path.join(outputDir, 'pages.json');

// Write blog posts JSON (pretty-printed with a 2-space indent).
fs.writeFileSync(postsPath, JSON.stringify(posts, null, 2));
console.log(`Blog posts written to: ${postsPath}`);

// Write pages JSON in the same format.
fs.writeFileSync(pagesPath, JSON.stringify(pages, null, 2));
console.log(`Pages written to: ${pagesPath}`);
// Create index file for easier importing.
// `indexContent` is the literal TypeScript source written to `index.ts` in the
// output directory. It imports the two JSON files written by this script,
// declares the BlogPost and Page interfaces (same fields as the `data` records
// built while parsing), and exports the typed arrays plus lookup helpers.
// Everything between the backticks is emitted verbatim, so any edit inside the
// template changes the generated file.
// Note: the generated getRecentPosts() just slices the start of the array, so
// it depends on blog-posts.json having been written newest-first.
const indexContent = `// Auto-generated blog data exports
import blogPostsData from './blog-posts.json';
import pagesData from './pages.json';
export interface BlogPost {
id: number;
title: string;
slug: string;
link: string;
content: string;
excerpt: string;
featuredImage: string | null;
images: string[];
pubDate: string;
postDate: string;
status: string;
}
export interface Page {
id: number;
title: string;
slug: string;
link: string;
content: string;
excerpt: string;
featuredImage: string | null;
images: string[];
pubDate: string;
postDate: string;
status: string;
}
export const blogPosts: BlogPost[] = blogPostsData as BlogPost[];
export const pages: Page[] = pagesData as Page[];
export function getBlogPostBySlug(slug: string): BlogPost | undefined {
return blogPosts.find(post => post.slug === slug);
}
export function getPageBySlug(slug: string): Page | undefined {
return pages.find(page => page.slug === slug);
}
export function getAllBlogSlugs(): string[] {
return blogPosts.map(post => post.slug);
}
export function getRecentPosts(limit: number = 10): BlogPost[] {
return blogPosts.slice(0, limit);
}
`;
// Write the generated module alongside the JSON files it imports.
const indexPath = path.join(outputDir, 'index.ts');
fs.writeFileSync(indexPath, indexContent);
console.log(`Index file written to: ${indexPath}`);
// Show up to five posts from the front of the (already sorted) list as a
// quick sanity check of the parse.
console.log('\nFirst 5 blog posts:');
for (const [position, post] of posts.slice(0, 5).entries()) {
  const { title, slug, postDate } = post;
  console.log(` ${position + 1}. ${title} (${slug}) - ${postDate}`);
}