rspace-online/modules/rcart/extract.ts

277 lines
8.2 KiB
TypeScript

/**
* Server-side product extraction from URLs.
*
* Ported from UniCart extension's content.ts ProductDetector,
* adapted for server-side HTML parsing (no DOM — regex-based).
* Reuses the fetch pattern from /api/link-preview in server/index.ts.
*/
/** Normalized product record returned by extractProductFromUrl. */
export interface ExtractedProduct {
/** Product name; falls back to the page <title>, then to the domain. */
name: string;
/** Numeric price, or null when no price could be extracted. */
price: number | null;
/** Currency code taken from the page; 'USD' when none was found. */
currency: string;
/** Short description text, or null when none was found. */
description: string | null;
/** Product image URL (relative URLs are resolved against the page URL), or null. */
imageUrl: string | null;
/** The URL that was passed to extractProductFromUrl. */
sourceUrl: string;
/** SKU (the ASIN for Amazon URLs), or null when none was found. */
sku: string | null;
/** Information about the site selling the product. */
vendor: {
/** Content of og:site_name when present, otherwise the domain. */
name: string;
/** Hostname of the source URL without a leading "www.". */
domain: string;
/** Detected platform ('amazon', 'etsy', 'shopify', 'woocommerce'), or null. */
platform: string | null;
};
}
/** Milliseconds after which an in-flight page fetch is aborted. */
const FETCH_TIMEOUT = 5000;
/** Desktop-Chrome User-Agent header value sent with page requests. */
const USER_AGENT =
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';
/**
 * Download the body of `url` as text, sending browser-like request headers.
 *
 * The request (including reading the body) is aborted once FETCH_TIMEOUT
 * milliseconds have elapsed. Redirects are followed.
 *
 * NOTE(review): the URL is fetched exactly as given; if callers pass
 * user-supplied URLs, confirm that host/scheme restrictions are applied upstream.
 *
 * @param url - Address of the page to download.
 * @returns The response body as a string.
 * @throws Error with message `HTTP <status>` when the response is not OK;
 *         fetch/abort rejections propagate unchanged.
 */
async function fetchHtml(url: string): Promise<string> {
  const aborter = new AbortController();
  const timeoutHandle = setTimeout(() => aborter.abort(), FETCH_TIMEOUT);

  const requestHeaders = {
    'User-Agent': USER_AGENT,
    Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en-US,en;q=0.5',
  };

  try {
    const response = await fetch(url, {
      signal: aborter.signal,
      headers: requestHeaders,
      redirect: 'follow',
    });
    if (response.ok) {
      return await response.text();
    }
    throw new Error(`HTTP ${response.status}`);
  } finally {
    // Always disarm the timer so it cannot fire after we are done.
    clearTimeout(timeoutHandle);
  }
}
/**
 * Return the hostname of `url` with a leading "www." removed.
 *
 * @param url - URL string to inspect.
 * @returns The hostname without "www.", or `url` itself when it cannot be parsed.
 */
function extractDomain(url: string): string {
  let hostname: string;
  try {
    hostname = new URL(url).hostname;
  } catch {
    // Not a parseable URL: hand the input back untouched.
    return url;
  }
  const prefix = 'www.';
  return hostname.startsWith(prefix) ? hostname.slice(prefix.length) : hostname;
}
/**
 * Identify the e-commerce platform behind a page.
 *
 * Hostname rules are checked first (Amazon, Etsy); if none apply, the HTML is
 * searched for Shopify and WooCommerce marker strings, in that order.
 *
 * @param url - Page URL; only its domain is inspected.
 * @param html - Page HTML, searched for platform marker strings.
 * @returns 'amazon', 'etsy', 'shopify', 'woocommerce', or null when nothing matches.
 */
function detectPlatform(url: string, html: string): string | null {
  const host = extractDomain(url);

  const hostRules: ReadonlyArray<readonly [string, string]> = [
    ['amazon.', 'amazon'],
    ['etsy.com', 'etsy'],
  ];
  for (const [needle, platform] of hostRules) {
    if (host.includes(needle)) return platform;
  }

  const markupRules: ReadonlyArray<readonly [readonly string[], string]> = [
    [['Shopify.', 'cdn.shopify.com'], 'shopify'],
    [['woocommerce'], 'woocommerce'],
  ];
  for (const [markers, platform] of markupRules) {
    if (markers.some((marker) => html.includes(marker))) return platform;
  }

  return null;
}
// ── Extractors ──
/**
 * Extract product data from JSON-LD structured data.
 *
 * Every `<script type="application/ld+json">` element is parsed in document
 * order and the first node typed `Product` wins. A product node is looked for
 * at the top level, inside a top-level array, and inside an `@graph` array;
 * `@type` may be a single string or an array of type names.
 *
 * Only fields that were actually found are set on the result. Missing fields
 * are omitted (rather than set to null or a default) so that spreading this
 * result over lower-priority layers cannot erase their values.
 *
 * @param html - Raw page HTML.
 * @returns The fields found on the first Product node, or null when no
 *          Product node exists in any parseable JSON-LD script.
 */
function extractJsonLd(html: string): Partial<ExtractedProduct> | null {
  type JsonObject = Record<string, unknown>;

  const scriptRegex = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  const asObject = (value: unknown): JsonObject | null =>
    value !== null && typeof value === 'object' && !Array.isArray(value)
      ? (value as JsonObject)
      : null;

  const nonEmptyString = (value: unknown): string | null =>
    typeof value === 'string' && value !== '' ? value : null;

  const isProduct = (node: JsonObject): boolean => {
    const type = node['@type'];
    return Array.isArray(type) ? type.includes('Product') : type === 'Product';
  };

  // Search a parsed JSON-LD document: each root itself first, then its @graph.
  const findProduct = (data: unknown): JsonObject | null => {
    const roots: unknown[] = Array.isArray(data) ? data : [data];
    for (const root of roots) {
      const rootNode = asObject(root);
      if (!rootNode) continue;
      if (isProduct(rootNode)) return rootNode;
      const graph = rootNode['@graph'];
      if (!Array.isArray(graph)) continue;
      for (const entry of graph) {
        const node = asObject(entry);
        if (node && isProduct(node)) return node;
      }
    }
    return null;
  };

  // `image` may be a URL string, an ImageObject ({ url }), or an array of either.
  const firstImageUrl = (image: unknown): string | null => {
    const candidate: unknown = Array.isArray(image) ? image[0] : image;
    return nonEmptyString(candidate) ?? nonEmptyString(asObject(candidate)?.url);
  };

  let match: RegExpExecArray | null;
  while ((match = scriptRegex.exec(html)) !== null) {
    let data: unknown;
    try {
      data = JSON.parse(match[1]);
    } catch {
      // Invalid JSON in this script; try the next one.
      continue;
    }

    const product = findProduct(data);
    if (!product) continue;

    const offers = product.offers;
    const offer = asObject(Array.isArray(offers) ? offers[0] : offers);

    // `||` (not `??`) on purpose: a zero/empty price falls through to lowPrice.
    const rawPrice = offer?.price || offer?.lowPrice;
    const price =
      typeof rawPrice === 'number'
        ? rawPrice
        : typeof rawPrice === 'string'
          ? parseFloat(rawPrice)
          : NaN;

    const result: Partial<ExtractedProduct> = {};

    const name = nonEmptyString(product.name);
    if (name !== null) result.name = name;

    if (price > 0) result.price = price;

    const currency = nonEmptyString(offer?.priceCurrency);
    if (currency !== null) result.currency = currency;

    const description = nonEmptyString(product.description);
    if (description !== null) result.description = description;

    const imageUrl = firstImageUrl(product.image);
    if (imageUrl !== null) result.imageUrl = imageUrl;

    // SKUs are sometimes emitted as bare numbers; normalise to string.
    const sku =
      typeof product.sku === 'number' ? String(product.sku) : nonEmptyString(product.sku);
    if (sku !== null) result.sku = sku;

    return result;
  }
  return null;
}
/**
 * Extract product data from Open Graph / product meta tags.
 *
 * Reads `og:title`, `og:image`, `og:description`, the price from
 * `product:price:amount` (falling back to `og:price:amount`) and the currency
 * from `product:price:currency` (falling back to `og:price:currency`).
 * Attribute values are matched against their own delimiter, so a
 * double-quoted value may contain apostrophes and vice versa.
 *
 * @param html - Raw page HTML.
 * @returns An object holding only the fields that were found (possibly empty).
 */
function extractMetaTags(html: string): Partial<ExtractedProduct> {
  const result: Partial<ExtractedProduct> = {};

  // Content of the first <meta> whose `property` equals the given name,
  // accepting either attribute order; null when there is no such tag.
  const getMetaContent = (property: string): string | null => {
    const escaped = property.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const propertyAttr = `property=["']${escaped}["']`;
    // Two alternatives so the value ends only at the quote that opened it.
    const contentAttr = `content=(?:"([^"]*)"|'([^']*)')`;
    const propertyFirst = new RegExp(`<meta[^>]*${propertyAttr}[^>]*${contentAttr}`, 'i');
    const contentFirst = new RegExp(`<meta[^>]*${contentAttr}[^>]*${propertyAttr}`, 'i');
    const m = html.match(propertyFirst) || html.match(contentFirst);
    if (!m) return null;
    return m[1] ?? m[2] ?? null;
  };

  const title = getMetaContent('og:title');
  if (title) result.name = title;

  const priceAmount = getMetaContent('product:price:amount') || getMetaContent('og:price:amount');
  if (priceAmount) {
    const p = parseFloat(priceAmount);
    if (p > 0) result.price = p;
  }

  // Mirror the price lookup: sites that emit og:price:amount pair it with
  // og:price:currency rather than product:price:currency.
  const priceCurrency =
    getMetaContent('product:price:currency') || getMetaContent('og:price:currency');
  if (priceCurrency) result.currency = priceCurrency;

  const image = getMetaContent('og:image');
  if (image) result.imageUrl = image;

  const description = getMetaContent('og:description');
  if (description) result.description = description;

  return result;
}
/**
 * Amazon-specific extraction via regex on HTML.
 *
 * Reads the `productTitle` element (required), the first positive price among
 * several known price markup patterns, the ASIN from the URL path, the
 * `landingImage` source, and up to three non-blank `a-list-item` texts joined
 * with " • " as the description.
 *
 * @param html - Raw page HTML.
 * @param url - Page URL; used only to read the ASIN from `/dp/…` or `/gp/product/…`.
 * @returns The fields found, or null when no product title is present.
 */
function extractAmazon(html: string, url: string): Partial<ExtractedProduct> | null {
  const result: Partial<ExtractedProduct> = {};

  // Title
  const titleMatch = html.match(/id=["']productTitle["'][^>]*>([^<]+)</i);
  if (titleMatch) result.name = titleMatch[1].trim();
  if (!result.name) return null;

  // Price — look for common Amazon price patterns
  const pricePatterns = [
    /class=["']a-offscreen["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
    /id=["']priceblock_ourprice["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
    /id=["']priceblock_dealprice["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
    /"price":\s*"?([\d.]+)"?/,
  ];
  for (const re of pricePatterns) {
    const m = html.match(re);
    if (!m) continue;
    // The markup patterns capture (symbol, amount); the JSON pattern only (amount).
    const priceStr = m[2] || m[1];
    const price = parseFloat(priceStr.replace(/,/g, ''));
    if (price > 0) {
      result.price = price;
      break;
    }
  }

  // ASIN from URL
  const asinMatch = url.match(/\/(?:dp|gp\/product)\/([A-Z0-9]{10})/);
  if (asinMatch) result.sku = asinMatch[1];

  // Image
  const imgMatch = html.match(/id=["']landingImage["'][^>]*src=["']([^"']+)/i);
  if (imgMatch) result.imageUrl = imgMatch[1];

  // Description — first three feature bullets that contain actual text.
  // Blank items are dropped BEFORE the limit is applied, so leading
  // whitespace-only list items cannot produce an empty description.
  const bullets: string[] = [];
  for (const m of html.matchAll(/class=["']a-list-item["'][^>]*>([^<]+)/g)) {
    const text = m[1].trim();
    if (text) bullets.push(text);
    if (bullets.length === 3) break;
  }
  if (bullets.length > 0) result.description = bullets.join(' • ');

  return result;
}
/**
 * Shopify extraction — look for `var meta = {...};` in scripts.
 *
 * Only fields that are actually present on `meta.product` are set on the
 * result. Missing fields are omitted (rather than set to undefined/null) and
 * no currency is assumed, so spreading this result over meta-tag data cannot
 * erase a name or currency found there.
 *
 * @param html - Raw page HTML.
 * @returns Fields read from `meta.product`, or null when the `var meta`
 *          assignment is absent, is not valid JSON, or has no product object.
 */
function extractShopify(html: string): Partial<ExtractedProduct> | null {
  const metaMatch = html.match(/var\s+meta\s*=\s*(\{[\s\S]*?\});/);
  if (!metaMatch) return null;

  let meta: any;
  try {
    meta = JSON.parse(metaMatch[1]);
  } catch {
    // The captured text was not valid JSON.
    return null;
  }

  const product = meta?.product;
  if (product === null || typeof product !== 'object') return null;

  const result: Partial<ExtractedProduct> = {};

  if (typeof product.title === 'string' && product.title !== '') {
    result.name = product.title;
  }

  // The value is divided by 100, i.e. treated as an amount in minor units.
  if (typeof product.price === 'number') {
    result.price = product.price / 100;
  }

  if (typeof product.description === 'string' && product.description !== '') {
    result.description = product.description;
  }

  const sku = product.variants?.[0]?.sku;
  if (typeof sku === 'string' && sku !== '') {
    result.sku = sku;
  }

  return result;
}
/**
 * Last-resort name lookup: the text of the page's `<title>` element.
 *
 * @param html - Raw page HTML.
 * @returns The trimmed title text, or null when no non-empty `<title>` is found.
 */
function extractTitle(html: string): string | null {
  const found = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
  if (found === null) {
    return null;
  }
  const [, rawTitle] = found;
  return rawTitle.trim();
}
// ── Public API ──
/**
 * Extract product information from a URL.
 *
 * Cascading strategy: JSON-LD → platform-specific → meta tags → fallback.
 * The layers are merged field by field: for each field the value from the
 * highest-priority layer that actually has one is used. A layer that lacks a
 * field (null, undefined or empty string) never erases a value supplied by a
 * lower-priority layer.
 *
 * @param url - Product page URL to fetch and analyse.
 * @returns The normalized product. `name` falls back to the page title and
 *          then to the domain; `currency` falls back to 'USD'; other fields
 *          that could not be found are null.
 * @throws Propagates errors from fetching the page (network failure,
 *         timeout abort, non-OK HTTP status).
 */
export async function extractProductFromUrl(url: string): Promise<ExtractedProduct> {
  const html = await fetchHtml(url);
  const domain = extractDomain(url);
  const platform = detectPlatform(url, html);

  // Layer 1: JSON-LD (most reliable)
  const jsonLd = extractJsonLd(html);

  // Layer 2: Platform-specific
  let platformData: Partial<ExtractedProduct> | null = null;
  if (platform === 'amazon') platformData = extractAmazon(html, url);
  else if (platform === 'shopify') platformData = extractShopify(html);
  // Etsy and WooCommerce rely on JSON-LD / meta tags

  // Layer 3: Meta tags
  const metaTags = extractMetaTags(html);

  // Merge from lowest to highest priority so later layers win, copying only
  // fields that carry a value.
  const merged: Partial<ExtractedProduct> = {};
  for (const layer of [metaTags, platformData, jsonLd]) {
    if (!layer) continue;
    const present = Object.entries(layer).filter(
      ([, value]) => value != null && value !== '',
    );
    Object.assign(merged, Object.fromEntries(present));
  }

  // Fallback name from <title>, then the domain.
  const name = merged.name || extractTitle(html) || domain;

  // Resolve relative image URLs. Structured data is untyped at runtime, so
  // anything that is not a string is dropped instead of crashing here.
  let imageUrl: string | null = null;
  const rawImage: unknown = merged.imageUrl;
  if (typeof rawImage === 'string') {
    imageUrl = rawImage;
    if (!/^https?:\/\//i.test(rawImage)) {
      try {
        imageUrl = new URL(rawImage, url).href;
      } catch {
        /* leave as-is */
      }
    }
  }

  // Vendor name: og:site_name when present and non-blank, else the domain.
  // The value is matched against its own delimiter so names containing an
  // apostrophe are not cut short.
  const siteNameMatch =
    html.match(/<meta[^>]*property=["']og:site_name["'][^>]*content=(?:"([^"]*)"|'([^']*)')/i) ||
    html.match(/<meta[^>]*content=(?:"([^"]*)"|'([^']*)')[^>]*property=["']og:site_name["']/i);
  const siteName = (siteNameMatch?.[1] ?? siteNameMatch?.[2] ?? '').trim();
  const vendorName = siteName || domain;

  return {
    name,
    price: merged.price ?? null,
    currency: merged.currency || 'USD',
    description: merged.description ?? null,
    imageUrl,
    sourceUrl: url,
    sku: merged.sku ?? null,
    vendor: {
      name: vendorName,
      domain,
      platform,
    },
  };
}