#!/usr/bin/env node
/**
 * Parse Squarespace WordPress XML export and extract blog posts.
 * Outputs JSON files for use in Next.js static generation.
 */
|
|
|
|
const fs = require('fs');
|
|
const path = require('path');
|
|
|
|
// Location of the Squarespace WordPress XML export.
// The default is a Windows download folder as seen from WSL; pass a different
// path as the first command-line argument to override it, e.g.
//   node parse-export.js ./export.xml
const DEFAULT_XML_PATH = '/mnt/c/Users/jeffe/Downloads/Squarespace-Wordpress-Export-02-03-2026.xml';
const xmlPath = process.argv[2] || DEFAULT_XML_PATH;

// Generated JSON/TS files are written to <script dir>/../src/data
const outputDir = path.join(__dirname, '..', 'src', 'data');

// Ensure output directory exists. With `recursive: true` this is a no-op when
// the directory is already there, so no separate existence check is needed.
fs.mkdirSync(outputDir, { recursive: true });

// Read the whole XML file into memory
console.log('Reading XML file...');
const xmlContent = fs.readFileSync(xmlPath, 'utf-8');

// Extract the inner XML of every <item> element (posts, pages, and any other
// item types present in the export)
const itemRegex = /<item>([\s\S]*?)<\/item>/g;
const items = [];
let match;

while ((match = itemRegex.exec(xmlContent)) !== null) {
  items.push(match[1]);
}

console.log(`Found ${items.length} items`);
|
|
|
|
// Helper functions to extract data
|
|
/**
 * Decode the small set of HTML/XML character entities handled by this script:
 * `&amp;`, `&lt;`, `&gt;`, `&quot;`, the apostrophe (`&#39;`, `&#x27;`,
 * `&apos;`) and `&nbsp;` (turned into a regular space).
 *
 * Decoding is applied twice on purpose so that double-encoded input such as
 * `&amp;amp;` or `&amp;lt;` is fully resolved.
 *
 * @param {string} text - Text that may contain entities.
 * @returns {string} The decoded text, or '' when `text` is null/undefined.
 */
function decodeHTMLEntities(text) {
  if (text == null) return '';

  // `&amp;` is decoded first so that e.g. `&amp;lt;` becomes `&lt;` and is
  // then picked up by the following rule within the same pass.
  const decodeOnce = (input) =>
    input
      .replace(/&amp;/g, '&')
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&#0*39;|&#x0*27;|&apos;/gi, "'")
      .replace(/&nbsp;/g, ' ');

  // Second pass resolves whatever the first pass un-nested.
  return decodeOnce(decodeOnce(String(text)));
}
|
|
|
|
/**
 * Return the entity-decoded, trimmed text of the first `<tag>...</tag>`
 * element found in `content`.
 *
 * If the element body is wrapped in a CDATA section the wrapper is removed,
 * so callers never receive a literal `<![CDATA[...]]>` string.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name, inserted verbatim into a RegExp.
 * @returns {string} The element text, or '' when the element is not present.
 */
function extractTag(content, tag) {
  const regex = new RegExp(`<${tag}>([\\s\\S]*?)<\\/${tag}>`);
  const match = content.match(regex);
  if (!match) return '';

  const raw = match[1].trim();
  // Unwrap `<![CDATA[ ... ]]>` when it spans the whole element body.
  const cdata = /^<!\[CDATA\[([\s\S]*?)\]\]>$/.exec(raw);
  const inner = cdata ? cdata[1].trim() : raw;

  return decodeHTMLEntities(inner);
}
|
|
|
|
/**
 * Read the first `<tag><![CDATA[...]]></tag>` element in `content` and return
 * its CDATA payload, trimmed and entity-decoded.
 *
 * The CDATA section must directly follow the opening tag and directly precede
 * the closing tag (no surrounding whitespace is tolerated by the pattern).
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name, inserted verbatim into a RegExp.
 * @returns {string} The decoded payload, or '' when nothing matches.
 */
function extractCDATA(content, tag) {
  const open = '<' + tag + '><!\\[CDATA\\[';
  const close = '\\]\\]><\\/' + tag + '>';
  const found = content.match(new RegExp(open + '([\\s\\S]*?)' + close));

  if (!found) {
    return '';
  }
  return decodeHTMLEntities(found[1].trim());
}
|
|
|
|
/**
 * Return the trimmed raw text of the first `<wp:tag>...</wp:tag>` element in
 * `content`. No entity decoding is applied.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The element text, or '' when the element is not present.
 */
function extractWPTag(content, tag) {
  const qualified = 'wp:' + tag;
  const pattern = new RegExp('<' + qualified + '>([\\s\\S]*?)<\\/' + qualified + '>');
  const hit = content.match(pattern);

  if (!hit) {
    return '';
  }
  return hit[1].trim();
}
|
|
|
|
/**
 * Return the trimmed CDATA payload of the first
 * `<wp:tag><![CDATA[...]]></wp:tag>` element in `content`.
 * No entity decoding is applied.
 *
 * @param {string} content - XML fragment to search.
 * @param {string} tag - Element name without the `wp:` prefix.
 * @returns {string} The payload, or '' when nothing matches.
 */
function extractWPCDATA(content, tag) {
  const qualified = 'wp:' + tag;
  const pattern = new RegExp(
    '<' + qualified + '><!\\[CDATA\\[([\\s\\S]*?)\\]\\]><\\/' + qualified + '>'
  );
  const hit = content.match(pattern);

  if (!hit) {
    return '';
  }
  return hit[1].trim();
}
|
|
|
|
/**
 * Build a URL slug for an item.
 *
 * When `postName` is given, the slug is its last non-empty path segment
 * (e.g. `blog/2021/my-post/` -> `my-post`). If `postName` is empty or consists
 * only of slashes, the slug is derived from `title` instead: lower-cased, with
 * every run of characters outside `a-z0-9` replaced by a single `-`, and any
 * leading/trailing `-` removed.
 *
 * @param {string} postName - Post name / path taken from the export; may be ''.
 * @param {string} title - Item title, used as the fallback source.
 * @returns {string} The slug ('' only if both inputs produce nothing usable).
 */
function createSlug(postName, title) {
  if (postName) {
    // Ignore empty segments so trailing or doubled slashes cannot produce ''.
    const segments = postName.split('/').filter(Boolean);
    if (segments.length > 0) {
      return segments[segments.length - 1];
    }
  }

  return (title || '')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-|-$/g, '');
}
|
|
|
|
/**
 * Collect one URL per `<img ...>` tag found in `content`, in document order.
 *
 * Note: the pattern's greedy prefix means that when a tag carries several
 * attributes ending in `src=` (e.g. both `src` and `data-src`), the value of
 * the last one in the tag is the one captured.
 *
 * @param {string} content - HTML to scan.
 * @returns {string[]} Captured URLs; empty when there are no matching tags.
 */
function extractImages(content) {
  const pattern = /<img[^>]+src=["']([^"']+)["'][^>]*>/g;
  const urls = [];

  // A global regex keeps its position in lastIndex between exec() calls.
  for (let hit = pattern.exec(content); hit !== null; hit = pattern.exec(content)) {
    urls.push(hit[1]);
  }

  return urls;
}
|
|
|
|
/**
 * Strip Squarespace-specific markup noise from exported HTML.
 *
 * Steps, in order:
 *  1. Rewrite whole tags that have a dedicated rule (must run before the
 *     generic class stripping below, which would otherwise alter them first).
 *  2. Remove Squarespace attributes and classes, plus the inline
 *     `white-space:pre-wrap` style.
 *  3. Remove the empty `class=""` attributes left behind.
 *  4. Collapse opening tags that ended up with only whitespace after the tag
 *     name (e.g. `<p  >` -> `<p>`).
 *
 * @param {string} html - Raw HTML from the export.
 * @returns {string} Cleaned HTML, or '' for empty/missing input.
 */
function cleanContent(html) {
  if (!html) return '';

  return html
    // 1. Tag-level rewrites
    .replace(/<div class="sqs-html-content"[^>]*>/g, '<div>')
    .replace(/<figure class="block-animation-none">/g, '<figure>')
    // 2. Attribute-level removals
    .replace(/data-sqsp[^=]*="[^"]*"/g, '')
    .replace(/class="[^"]*sqs[^"]*"/g, '')
    .replace(/style="white-space:pre-wrap;"/g, '')
    .replace(/data-rte-preserve-empty="true"/g, '')
    .replace(/data-animation-role="quote"/g, '')
    // 3. Empty class attributes
    .replace(/class=""/g, '')
    // 4. Tags with nothing but whitespace after the name
    .replace(/<([a-zA-Z][a-zA-Z0-9]*)\s+>/g, '<$1>');
}
|
|
|
|
/**
 * Produce a plain-text excerpt from HTML.
 *
 * Tags are replaced by spaces, whitespace runs are collapsed to one space and
 * the result is trimmed. Text longer than `maxLength` is cut to `maxLength`
 * characters, the trailing partial word (everything from the last whitespace
 * on) is dropped, and '...' is appended.
 *
 * @param {string} content - HTML to summarise.
 * @param {number} [maxLength=200] - Maximum length before truncation.
 * @returns {string} The excerpt.
 */
function extractExcerpt(content, maxLength = 200) {
  const withoutTags = content.replace(/<[^>]+>/g, ' ');
  const text = withoutTags.replace(/\s+/g, ' ').trim();

  if (text.length > maxLength) {
    const head = text.substring(0, maxLength);
    // Drop the last (possibly cut-off) word; a no-op if there is no whitespace.
    return head.replace(/\s+\S*$/, '') + '...';
  }

  return text;
}
|
|
|
|
// Parse items: turn every published, titled <item> into a plain record and
// file it under `posts` or `pages` according to its wp:post_type. Items of any
// other type are built but not stored.
const posts = [];
const pages = [];

for (const [index, item] of items.entries()) {
  // Raw fields read straight from the item XML
  const title = extractTag(item, 'title') || extractCDATA(item, 'title');
  const status = extractWPTag(item, 'status');

  // Only published items that have a title are exported
  if (!title || status !== 'publish') {
    continue;
  }

  const link = extractTag(item, 'link');
  const content = extractCDATA(item, 'content:encoded');
  const postName = extractWPTag(item, 'post_name');
  const postType = extractWPTag(item, 'post_type');
  const postId = extractWPTag(item, 'post_id');
  const pubDate = extractTag(item, 'pubDate');
  const postDate = extractWPTag(item, 'post_date');

  // Derived fields
  const images = extractImages(content);
  const cleanedContent = cleanContent(content);

  const data = {
    // Falls back to the item's position when the id is missing or not numeric
    id: parseInt(postId) || index,
    title,
    slug: createSlug(postName, title),
    link,
    content: cleanedContent,
    excerpt: extractExcerpt(cleanedContent),
    // First image in the content doubles as the featured image
    featuredImage: images[0] || null,
    images,
    pubDate,
    postDate,
    status,
  };

  switch (postType) {
    case 'post':
      posts.push(data);
      break;
    case 'page':
      pages.push(data);
      break;
    default:
      // Other item types are ignored
      break;
  }
}
|
|
|
|
// Order posts newest-first by their wp:post_date value (sorts in place)
const byPostDateDescending = (a, b) => new Date(b.postDate) - new Date(a.postDate);
posts.sort(byPostDateDescending);

console.log(`Parsed ${posts.length} blog posts`);
console.log(`Parsed ${pages.length} pages`);

// Pretty-printed (2-space) JSON is what gets written to disk
const toPrettyJson = (value) => JSON.stringify(value, null, 2);

// Blog posts -> <outputDir>/blog-posts.json
const postsPath = path.join(outputDir, 'blog-posts.json');
fs.writeFileSync(postsPath, toPrettyJson(posts));
console.log(`Blog posts written to: ${postsPath}`);

// Pages -> <outputDir>/pages.json
const pagesPath = path.join(outputDir, 'pages.json');
fs.writeFileSync(pagesPath, toPrettyJson(pages));
console.log(`Pages written to: ${pagesPath}`);
|
|
|
|
// Create index file for easier importing.
// The template below is written verbatim to <outputDir>/index.ts: it imports
// the two JSON files produced by this script, declares the record shapes
// (field-for-field the objects built during item parsing) and exposes a few
// lookup helpers. It must stay valid TypeScript.
const indexContent = `// Auto-generated blog data exports
import blogPostsData from './blog-posts.json';
import pagesData from './pages.json';

export interface BlogPost {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export interface Page {
  id: number;
  title: string;
  slug: string;
  link: string;
  content: string;
  excerpt: string;
  featuredImage: string | null;
  images: string[];
  pubDate: string;
  postDate: string;
  status: string;
}

export const blogPosts: BlogPost[] = blogPostsData as BlogPost[];
export const pages: Page[] = pagesData as Page[];

export function getBlogPostBySlug(slug: string): BlogPost | undefined {
  return blogPosts.find(post => post.slug === slug);
}

export function getPageBySlug(slug: string): Page | undefined {
  return pages.find(page => page.slug === slug);
}

export function getAllBlogSlugs(): string[] {
  return blogPosts.map(post => post.slug);
}

export function getRecentPosts(limit: number = 10): BlogPost[] {
  return blogPosts.slice(0, limit);
}
`;

const indexPath = path.join(outputDir, 'index.ts');
fs.writeFileSync(indexPath, indexContent);
console.log(`Index file written to: ${indexPath}`);
|
|
|
|
// Show up to five posts from the front of the list as a quick sanity check
console.log('\nFirst 5 blog posts:');

let position = 0;
for (const post of posts.slice(0, 5)) {
  position += 1;
  console.log(` ${position}. ${post.title} (${post.slug}) - ${post.postDate}`);
}
|