#!/usr/bin/env node

/**
 * Parse Squarespace WordPress XML export and extract blog posts.
 * Outputs JSON files for use in Next.js static generation.
 */

const fs = require('fs');
const path = require('path');

// XML file path. Defaults to the original download location (Windows path
// via WSL); a different export can be passed as the first CLI argument.
const xmlPath =
  process.argv[2] ||
  '/mnt/c/Users/jeffe/Downloads/Squarespace-Wordpress-Export-02-03-2026.xml';
const outputDir = path.join(__dirname, '..', 'src', 'data');

// Ensure output directory exists
if (!fs.existsSync(outputDir)) {
  fs.mkdirSync(outputDir, { recursive: true });
}

// Read XML file
console.log('Reading XML file...');
const xmlContent = fs.readFileSync(xmlPath, 'utf-8');

// Extract all items (posts and pages).
// Each entry of `items` is the raw text between one <item> and its </item>.
const itemRegex = /<item>([\s\S]*?)<\/item>/g;
const items = [];
let match;

while ((match = itemRegex.exec(xmlContent)) !== null) {
  items.push(match[1]);
}

console.log(`Found ${items.length} items`);

// Helper functions to extract data

/**
 * Decode the basic XML/HTML entities (amp, lt, gt, quot, apostrophe, nbsp).
 *
 * The replacements are applied in order and the whole sequence is run twice,
 * so a double-encoded entity (an "amp" entity followed by "lt;") ends up as
 * the plain character.
 *
 * @param {string} text Text that may contain entities.
 * @returns {string} The decoded text.
 */
function decodeHTMLEntities(text) {
  // Entity name (without the surrounding "&" and ";") -> replacement.
  // "amp" is first so that an unwrapped entity can be decoded by a later row.
  const entities = [
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['#39', "'"],
    ['#039', "'"],
    ['apos', "'"],
    ['nbsp', ' '],
  ];

  // split/join replaces every occurrence without any regex or "$" escaping.
  const decodeOnce = (input) =>
    entities.reduce(
      (acc, [name, value]) => acc.split(`&${name};`).join(value),
      input
    );

  // Run twice to handle double-encoded entities
  return decodeOnce(decodeOnce(text));
}

/**
 * Return the trimmed, entity-decoded text of the first `<tag>...</tag>`
 * element found in `content`, or '' when the element is absent.
 *
 * If the whole value is wrapped in a CDATA section, the wrapper is removed
 * so the caller receives the text rather than the CDATA markup.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name; interpolated into a RegExp unescaped.
 * @returns {string}
 */
function extractTag(content, tag) {
  const regex = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`);
  const match = content.match(regex);
  if (!match) return '';

  const raw = match[1].trim();
  // Unwrap <![CDATA[ ... ]]> when it spans the entire value.
  const cdata = /^<!\[CDATA\[([\s\S]*?)\]\]>$/.exec(raw);
  const value = cdata ? cdata[1].trim() : raw;
  return decodeHTMLEntities(value);
}

/**
 * Return the trimmed, entity-decoded contents of the first
 * `<tag><![CDATA[...]]></tag>` element in `content`, or '' when there is no
 * such element. The CDATA section must start immediately after the opening
 * tag and end immediately before the closing tag.
 *
 * NOTE(review): entities inside the CDATA payload are decoded as well;
 * confirm this is wanted for HTML bodies that contain escaped markup.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name; interpolated into a RegExp unescaped.
 * @returns {string}
 */
function extractCDATA(content, tag) {
  const pattern = new RegExp(
    `<${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/${tag}>`
  );
  const found = content.match(pattern);
  return found ? decodeHTMLEntities(found[1].trim()) : '';
}

/**
 * Return the trimmed text of the first `<wp:tag>...</wp:tag>` element in
 * `content`, or '' when the element is absent. No entity decoding is done.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name without the `wp:` prefix.
 * @returns {string}
 */
function extractWPTag(content, tag) {
  const pattern = new RegExp(`<wp:${tag}>([\\s\\S]*?)<\\/wp:${tag}>`);
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return found[1].trim();
}

/**
 * Return the trimmed contents of the first
 * `<wp:tag><![CDATA[...]]></wp:tag>` element in `content`, or '' when there
 * is no such element. No entity decoding is done.
 *
 * @param {string} content XML fragment to search.
 * @param {string} tag Element name without the `wp:` prefix.
 * @returns {string}
 */
function extractWPCDATA(content, tag) {
  const pattern = new RegExp(
    `<wp:${tag}><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/wp:${tag}>`
  );
  const found = content.match(pattern);
  if (!found) {
    return '';
  }
  return found[1].trim();
}

// Create slug from post name or title
/**
 * Build a URL slug for an item.
 *
 * The last non-empty `/`-separated segment of `postName` is used when there
 * is one. Otherwise the slug is derived from `title`: lower-cased, every run
 * of characters outside a-z/0-9 replaced by a single '-', and a leading or
 * trailing '-' removed.
 *
 * @param {string} postName Post name or path from the export; may be empty.
 * @param {string} title Item title, used as the fallback.
 * @returns {string}
 */
function createSlug(postName, title) {
  // Dropping empty segments means trailing or doubled slashes cannot
  // produce an empty slug.
  const segments = (postName || '').split('/').filter(Boolean);
  if (segments.length > 0) {
    return segments[segments.length - 1];
  }

  return (title || '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
}

// Extract image URLs from content
/**
 * Collect the quoted `src=` value of every `<img ...>` tag in `content`,
 * in document order. Duplicates are kept.
 *
 * @param {string} content HTML to scan.
 * @returns {string[]} The captured URLs; empty when no tag matches.
 */
function extractImages(content) {
  // Created per call: a /g regex keeps lastIndex state between exec() calls.
  const imageRegex = /<img[^>]+src=["']([^"']+)["'][^>]*>/g;
  const images = [];

  for (
    let found = imageRegex.exec(content);
    found !== null;
    found = imageRegex.exec(content)
  ) {
    images.push(found[1]);
  }

  return images;
}

// Clean HTML content - convert to simpler format
/**
 * Strip Squarespace-specific attributes and classes from an HTML string.
 *
 * Order matters: the rewrites that match a whole tag run before the
 * attribute removals that would alter those tags, and the `<p class="">`
 * rewrites run after the removals that can leave a paragraph in that shape.
 *
 * @param {string} html Raw HTML; falsy input yields ''.
 * @returns {string} The cleaned HTML.
 */
function cleanContent(html) {
  if (!html) return '';

  return html
    // Whole-tag rewrites that depend on attributes removed further down.
    .replace(/<div class="sqs-html-content"[^>]*>/g, '<div>')
    .replace(/<figure class="block-animation-none">/g, '<figure>')
    // Remove Squarespace-specific attributes and classes.
    .replace(/data-sqsp[^=]*="[^"]*"/g, '')
    .replace(/class="[^"]*sqs[^"]*"/g, '')
    .replace(/style="white-space:pre-wrap;"/g, '')
    .replace(/data-rte-preserve-empty="true"/g, '')
    .replace(/data-animation-role="quote"/g, '')
    // Normalise paragraphs left with an empty class, then drop any other
    // empty class attribute.
    .replace(/<p class="" >/g, '<p>')
    .replace(/<p class="">/g, '<p>')
    .replace(/class=""/g, '');
}

// Decode HTML entities for excerpts
/**
 * Decode the entities commonly found in post bodies into plain text for an
 * excerpt. Curly quotes become straight quotes and an ellipsis becomes
 * three dots, whether they arrive as named entities, numeric entities or
 * literal characters.
 *
 * Replacements are applied in table order, so an "amp"-wrapped numeric
 * entity is decoded by the numeric rows that follow.
 *
 * @param {string} text Text or HTML that may contain entities.
 * @returns {string}
 */
function decodeExcerptEntities(text) {
  // Entity name (without the surrounding "&" and ";") -> replacement.
  const entities = [
    // Common named entities
    ['mdash', '\u2014'],
    ['ndash', '\u2013'],
    ['hellip', '...'],
    ['rsquo', "'"],
    ['lsquo', "'"],
    ['rdquo', '"'],
    ['ldquo', '"'],
    ['nbsp', ' '],
    ['amp', '&'],
    ['lt', '<'],
    ['gt', '>'],
    ['quot', '"'],
    ['#39', "'"],
    ['#039', "'"],
    ['apos', "'"],
    // Numeric forms of the typographic characters
    ['#8217', "'"],
    ['#8216', "'"],
    ['#8220', '"'],
    ['#8221', '"'],
    ['#8212', '\u2014'],
    ['#8211', '\u2013'],
    ['#8230', '...'],
  ];

  let decoded = text;
  for (const [name, value] of entities) {
    decoded = decoded.split(`&${name};`).join(value);
  }

  // Literal typographic characters get the same ASCII forms as their entities.
  return decoded
    .replace(/[\u2018\u2019]/g, "'")
    .replace(/[\u201C\u201D]/g, '"')
    .replace(/\u2026/g, '...')
    .replace(/\u00A0/g, ' ');
}

// Remove WordPress-style shortcodes
/**
 * Strip WordPress-style shortcodes from `text`.
 *
 * A single-line `[caption ...]inner[/caption]` pair is replaced by its inner
 * text; after that every `[name ...]` or `[/name]` token whose name starts
 * with a letter or underscore is removed. Bracketed text that does not
 * start with a letter or underscore (for example `[1]`) is left alone.
 *
 * @param {string} text
 * @returns {string}
 */
function removeShortcodes(text) {
  return text
    // Remove [caption]...[/caption] shortcodes, keeping the caption body
    .replace(/\[caption[^\]]*\](.*?)\[\/caption\]/gi, '$1')
    // Remove other common shortcodes
    .replace(/\[\/?[a-z_]+[^\]]*\]/gi, '');
}

// Extract first paragraph as excerpt
/**
 * Produce a plain-text excerpt of at most roughly `maxLength` characters.
 *
 * The content is passed through decodeExcerptEntities and removeShortcodes,
 * HTML tags are replaced by spaces, and whitespace is collapsed. Text longer
 * than `maxLength` is cut at `maxLength`, the trailing whitespace and
 * (possibly partial) last word are dropped, and '...' is appended.
 *
 * @param {string} content HTML content of the item.
 * @param {number} [maxLength=200] Length at which truncation kicks in.
 * @returns {string}
 */
function extractExcerpt(content, maxLength = 200) {
  const text = removeShortcodes(decodeExcerptEntities(content))
    // Remove HTML tags
    .replace(/<[^>]+>/g, ' ')
    // Clean up whitespace
    .replace(/\s+/g, ' ')
    .trim();

  if (text.length <= maxLength) return text;

  const truncated = text.substring(0, maxLength).replace(/\s+\S*$/, '');
  return truncated + '...';
}

// Parse items
// Every published item with a title becomes a record in `posts` or `pages`
// according to its wp:post_type; other post types are ignored.
const posts = [];
const pages = [];

items.forEach((item, index) => {
  const title = extractTag(item, 'title') || extractCDATA(item, 'title');
  const link = extractTag(item, 'link');
  const content = extractCDATA(item, 'content:encoded');
  const postName = extractWPTag(item, 'post_name');
  const postType = extractWPTag(item, 'post_type');
  const postId = extractWPTag(item, 'post_id');
  const status = extractWPTag(item, 'status');
  const pubDate = extractTag(item, 'pubDate');
  const postDate = extractWPTag(item, 'post_date');

  if (!title || status !== 'publish') return;

  const slug = createSlug(postName, title);
  // Images are collected from the raw content, before cleaning.
  const images = extractImages(content);
  const featuredImage = images[0] || null;
  const cleanedContent = cleanContent(content);
  const excerpt = extractExcerpt(cleanedContent);

  const data = {
    // Fall back to the item's position when wp:post_id is missing or not numeric.
    id: parseInt(postId, 10) || index,
    title,
    slug,
    link,
    content: cleanedContent,
    excerpt,
    featuredImage,
    images,
    pubDate,
    postDate,
    status,
  };

  if (postType === 'post') {
    posts.push(data);
  } else if (postType === 'page') {
    pages.push(data);
  }
});

// Sort posts by date (newest first). An unparseable date is treated as 0 so
// the comparator never returns NaN.
posts.sort(
  (a, b) =>
    (new Date(b.postDate).getTime() || 0) -
    (new Date(a.postDate).getTime() || 0)
);

console.log(`Parsed ${posts.length} blog posts`);
console.log(`Parsed ${pages.length} pages`);

// Write blog posts JSON (pretty-printed with a 2-space indent)
const postsPath = path.join(outputDir, 'blog-posts.json');
fs.writeFileSync(postsPath, JSON.stringify(posts, null, 2));
console.log(`Blog posts written to: ${postsPath}`);

// Write pages JSON (pretty-printed with a 2-space indent)
const pagesPath = path.join(outputDir, 'pages.json');
fs.writeFileSync(pagesPath, JSON.stringify(pages, null, 2));
console.log(`Pages written to: ${pagesPath}`);

// Create index file for easier importing.
// The template below is written verbatim to index.ts in the output directory;
// it imports the two JSON files written by this script and exposes typed
// lookup helpers over them.
const indexContent = `// Auto-generated blog data exports
import blogPostsData from './blog-posts.json';
import pagesData from './pages.json';

export interface BlogPost {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export interface Page {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export const blogPosts: BlogPost[] = blogPostsData as BlogPost[];
export const pages: Page[] = pagesData as Page[];

export function getBlogPostBySlug(slug: string): BlogPost | undefined {
  return blogPosts.find(post => post.slug === slug);
}

export function getPageBySlug(slug: string): Page | undefined {
  return pages.find(page => page.slug === slug);
}

export function getAllBlogSlugs(): string[] {
  return blogPosts.map(post => post.slug);
}

export function getRecentPosts(limit: number = 10): BlogPost[] {
  return blogPosts.slice(0, limit);
}
`;

const indexPath = path.join(outputDir, 'index.ts');
fs.writeFileSync(indexPath, indexContent);
console.log(`Index file written to: ${indexPath}`);

// Print sample of first 5 posts (posts are already sorted newest first)
console.log('\nFirst 5 blog posts:');
posts.slice(0, 5).forEach((post, i) => {
  console.log(` ${i + 1}. ${post.title} (${post.slug}) - ${post.postDate}`);
});