// Listing metadata (file-viewer residue): 277 lines, 8.2 KiB, TypeScript.
/**
 * Server-side product extraction from URLs.
 *
 * Ported from UniCart extension's content.ts ProductDetector,
 * adapted for server-side HTML parsing (no DOM — regex-based).
 * Reuses the fetch pattern from /api/link-preview in server/index.ts.
 */

/** Normalized product record returned by `extractProductFromUrl`. */
export interface ExtractedProduct {
  /** Product display name; falls back to the page `<title>` or the domain. */
  name: string;
  /** Parsed price, or `null` when no positive price could be found. */
  price: number | null;
  /** Currency reported by the page; `'USD'` when none was found. */
  currency: string;
  /** Short description, or `null` when the page exposes none. */
  description: string | null;
  /** Product image URL (relative URLs are resolved against the source URL), or `null`. */
  imageUrl: string | null;
  /** The URL the product was extracted from, exactly as supplied by the caller. */
  sourceUrl: string;
  /** Merchant SKU (the ASIN for Amazon product URLs), or `null`. */
  sku: string | null;
  /** Information about the site selling the product. */
  vendor: {
    /** Site name from `og:site_name`, falling back to the domain. */
    name: string;
    /** Hostname of the source URL without a leading `www.`. */
    domain: string;
    /** Detected storefront platform (`amazon`, `etsy`, `shopify`, `woocommerce`) or `null`. */
    platform: string | null;
  };
}

/** Time budget, in milliseconds, after which an in-flight page fetch is aborted. */
const FETCH_TIMEOUT = 5000;

/** Desktop Chrome User-Agent string sent with every page request. */
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/**
 * Fetch a page's HTML with browser-like headers and a hard timeout.
 *
 * @param url - Absolute http(s) URL of the page to download.
 * @returns The response body decoded as text.
 * @throws TypeError when `url` cannot be parsed as an absolute URL.
 * @throws Error when the URL scheme is not http(s), when the server replies
 *   with a non-2xx status (`HTTP <status>`), or when the request is aborted
 *   after `FETCH_TIMEOUT` milliseconds.
 */
async function fetchHtml(url: string): Promise<string> {
  // The URL comes from the caller; only plain web schemes are fetched so that
  // other schemes the runtime's fetch may understand (data:, blob:, ...) are
  // rejected up front.
  const { protocol } = new URL(url);
  if (protocol !== 'http:' && protocol !== 'https:') {
    throw new Error(`Unsupported URL protocol: ${protocol}`);
  }

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT);
  try {
    const resp = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': USER_AGENT,
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
      redirect: 'follow',
    });
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    // The body is read inside the try block so the timer also bounds a slow download.
    return await resp.text();
  } finally {
    clearTimeout(timer);
  }
}

/**
 * Extract the hostname from a URL, without a leading `www.`.
 *
 * @param url - URL string to parse.
 * @returns The hostname (e.g. `example.com`), or `url` unchanged when it
 *   cannot be parsed as an absolute URL.
 */
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    // Not a parseable absolute URL: hand the raw input back as a best effort.
    return url;
  }
}

/**
 * Detect the storefront platform behind a page.
 *
 * Amazon and Etsy are recognised from the URL's hostname; Shopify and
 * WooCommerce are recognised from marker strings in the page HTML.
 *
 * @param url - URL the page was fetched from.
 * @param html - Raw HTML of the page.
 * @returns `'amazon'`, `'etsy'`, `'shopify'`, `'woocommerce'`, or `null` when
 *   nothing matches.
 */
function detectPlatform(url: string, html: string): string | null {
  const domain = extractDomain(url).toLowerCase();

  // Match on hostname label boundaries so look-alike hosts such as
  // "amazon.evil.com" or "betsy.com" are not misclassified. The Amazon pattern
  // accepts amazon.<tld> and amazon.<sld>.<cc> (amazon.com, amazon.de,
  // amazon.co.uk, amazon.com.au, ...), optionally behind a subdomain.
  if (/(?:^|\.)amazon\.(?:com|[a-z]{2})(?:\.[a-z]{2})?$/.test(domain)) return 'amazon';
  if (domain === 'etsy.com' || domain.endsWith('.etsy.com')) return 'etsy';

  if (html.includes('Shopify.') || html.includes('cdn.shopify.com')) return 'shopify';
  if (html.includes('woocommerce')) return 'woocommerce';
  return null;
}

// ── Extractors ──

/**
 * Extract product data from JSON-LD structured data.
 *
 * Scans every `<script type="application/ld+json">` block and returns the
 * fields of the first node typed `Product`. A block may hold a single object,
 * an array of objects, or an object with an `@graph` array.
 *
 * Only fields that are actually present are set on the result, so a missing
 * value never overrides data obtained from another extraction layer.
 *
 * @param html - Raw HTML of the page.
 * @returns The fields found on the first Product node, or `null` when no
 *   Product node exists.
 */
function extractJsonLd(html: string): Partial<ExtractedProduct> | null {
  type JsonObject = Record<string, unknown>;

  const isObject = (value: unknown): value is JsonObject =>
    typeof value === 'object' && value !== null && !Array.isArray(value);

  /** First element of an array, or the value itself when it is not an array. */
  const firstOf = (value: unknown): unknown => (Array.isArray(value) ? value[0] : value);

  /** Non-empty trimmed string (numbers are stringified); `undefined` otherwise. */
  const asText = (value: unknown): string | undefined => {
    if (typeof value === 'number' && Number.isFinite(value)) return String(value);
    if (typeof value !== 'string') return undefined;
    const text = value.trim();
    return text === '' ? undefined : text;
  };

  /** `image` may be a URL string, an ImageObject, or an array of either. */
  const asImageUrl = (value: unknown): string | undefined => {
    const image = firstOf(value);
    if (isObject(image)) return asText(image['url']) ?? asText(image['contentUrl']);
    return asText(image);
  };

  /** `@type` may be a single string or an array of type names. */
  const isProduct = (node: JsonObject): boolean => {
    const type = node['@type'];
    return Array.isArray(type) ? type.includes('Product') : type === 'Product';
  };

  /** Flatten a parsed block into candidate nodes: each root, then its `@graph` items. */
  const collectNodes = (data: unknown): JsonObject[] => {
    const nodes: JsonObject[] = [];
    for (const root of Array.isArray(data) ? data : [data]) {
      if (!isObject(root)) continue;
      nodes.push(root);
      const graph = root['@graph'];
      if (Array.isArray(graph)) nodes.push(...graph.filter(isObject));
    }
    return nodes;
  };

  const scriptRegex = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (const match of html.matchAll(scriptRegex)) {
    let data: unknown;
    try {
      data = JSON.parse(match[1] ?? '');
    } catch {
      // Invalid JSON in this block: try the next one.
      continue;
    }

    const product = collectNodes(data).find(isProduct);
    if (!product) continue;

    const offer = firstOf(product['offers']);
    const offerFields: JsonObject = isObject(offer) ? offer : {};
    // `price` wins; `lowPrice` covers AggregateOffer nodes that only carry a range.
    const rawPrice = offerFields['price'] || offerFields['lowPrice'];
    const price = parseFloat(String(rawPrice));

    const result: Partial<ExtractedProduct> = {};
    const name = asText(product['name']);
    if (name) result.name = name;
    if (price > 0) result.price = price;
    const currency = asText(offerFields['priceCurrency']);
    if (currency) result.currency = currency;
    const description = asText(product['description']);
    if (description) result.description = description;
    const imageUrl = asImageUrl(product['image']);
    if (imageUrl) result.imageUrl = imageUrl;
    const sku = asText(product['sku']);
    if (sku) result.sku = sku;
    return result;
  }
  return null;
}

/**
 * Extract product data from Open Graph / product `<meta>` tags.
 *
 * Tags are matched on their `property` attribute, or on `name` when
 * `property` is absent. Attribute order and quote style do not matter, and
 * HTML entities in the content are decoded. When a key appears more than once
 * the first tag in document order wins.
 *
 * @param html - Raw HTML of the page.
 * @returns The fields that were found; an empty object when none were.
 */
function extractMetaTags(html: string): Partial<ExtractedProduct> {
  const namedEntities: Record<string, string> = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    nbsp: '\u00a0',
  };

  /** Decode numeric entities and the handful of named ones common in meta content. */
  const decodeEntities = (text: string): string =>
    text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (entity: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const code = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
        return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : entity;
      }
      return namedEntities[body.toLowerCase()] ?? entity;
    });

  // Each value is delimited by its own opening quote, so an apostrophe inside
  // a double-quoted value (content="Men's Shoes") is kept intact.
  const attrRegex = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>=`]+))/g;

  const metaContent = new Map<string, string>();
  for (const tag of html.match(/<meta\b[^>]*>/gi) ?? []) {
    const attrs = new Map<string, string>();
    for (const attr of tag.matchAll(attrRegex)) {
      const attrName = (attr[1] ?? '').toLowerCase();
      if (!attrs.has(attrName)) attrs.set(attrName, attr[2] ?? attr[3] ?? attr[4] ?? '');
    }
    const key = (attrs.get('property') ?? attrs.get('name'))?.toLowerCase();
    const content = attrs.get('content');
    if (key === undefined || content === undefined) continue;
    if (!metaContent.has(key)) metaContent.set(key, decodeEntities(content));
  }

  const getMetaContent = (property: string): string | null =>
    metaContent.get(property.toLowerCase()) || null;

  const result: Partial<ExtractedProduct> = {};

  const title = getMetaContent('og:title');
  if (title) result.name = title;

  const priceAmount = getMetaContent('product:price:amount') || getMetaContent('og:price:amount');
  if (priceAmount) {
    const p = parseFloat(priceAmount);
    if (p > 0) result.price = p;
  }

  const priceCurrency = getMetaContent('product:price:currency');
  if (priceCurrency) result.currency = priceCurrency;

  const image = getMetaContent('og:image');
  if (image) result.imageUrl = image;

  const description = getMetaContent('og:description');
  if (description) result.description = description;

  return result;
}

/**
 * Amazon-specific extraction via regex on the product page HTML.
 *
 * @param html - Raw HTML of the Amazon product page.
 * @param url - URL of the page; the ASIN is read from its path.
 * @returns The fields that were found, or `null` when the page has no
 *   `productTitle` element (i.e. it does not look like a product page).
 */
function extractAmazon(html: string, url: string): Partial<ExtractedProduct> | null {
  // Title
  const titleMatch = html.match(/id=["']productTitle["'][^>]*>([^<]+)</i);
  const name = (titleMatch?.[1] ?? '').trim();
  if (!name) return null;

  const result: Partial<ExtractedProduct> = { name };

  /**
   * Parse a displayed amount in either "1,299.50" or "1.299,50" notation.
   * With both separators present the right-most one is the decimal mark; a
   * lone comma followed by one or two digits ("12,99") is a decimal comma,
   * otherwise commas are thousands separators.
   */
  const parseAmount = (raw: string): number => {
    const digits = raw.replace(/[.,]+$/, '');
    const lastComma = digits.lastIndexOf(',');
    const lastDot = digits.lastIndexOf('.');
    if (lastComma >= 0 && lastDot >= 0) {
      return parseFloat(
        lastComma > lastDot
          ? digits.replace(/\./g, '').replace(',', '.')
          : digits.replace(/,/g, ''),
      );
    }
    if (lastComma >= 0) {
      const isDecimalComma = digits.indexOf(',') === lastComma && /,\d{1,2}$/.test(digits);
      return parseFloat(isDecimalComma ? digits.replace(',', '.') : digits.replace(/,/g, ''));
    }
    return parseFloat(digits);
  };

  // "$" is deliberately absent: it is shared by several currencies, so the
  // caller's default / other extraction layers decide in that case.
  const currencyBySymbol: Record<string, string> = { '£': 'GBP', '€': 'EUR', '₹': 'INR', '¥': 'JPY' };

  // Price — common Amazon price elements; the symbol may precede or follow the amount.
  const displayedPricePatterns = [
    /class=["']a-offscreen["'][^>]*>([£€$¥₹]?)\s*(\d[\d.,]*)\s*([£€$¥₹]?)/,
    /id=["']priceblock_ourprice["'][^>]*>([£€$¥₹]?)\s*(\d[\d.,]*)\s*([£€$¥₹]?)/,
    /id=["']priceblock_dealprice["'][^>]*>([£€$¥₹]?)\s*(\d[\d.,]*)\s*([£€$¥₹]?)/,
  ];
  for (const re of displayedPricePatterns) {
    const m = html.match(re);
    if (!m) continue;
    const price = parseAmount(m[2] ?? '');
    if (price > 0) {
      result.price = price;
      const currency = currencyBySymbol[m[1] || m[3] || ''];
      if (currency) result.currency = currency;
      break;
    }
  }
  if (result.price === undefined) {
    // Last resort: a `"price": 12.34` pair embedded in inline JSON.
    const m = html.match(/"price":\s*"?([\d.]+)"?/);
    const price = m ? parseFloat(m[1] ?? '') : NaN;
    if (price > 0) result.price = price;
  }

  // ASIN from URL: exactly 10 characters after /dp/, /gp/product/ or /gp/aw/d/.
  const asinMatch = url.match(/\/(?:dp|gp\/product|gp\/aw\/d)\/([A-Z0-9]{10})(?![A-Za-z0-9])/);
  if (asinMatch) result.sku = asinMatch[1];

  // Image
  const imgMatch = html.match(/id=["']landingImage["'][^>]*src=["']([^"']+)/i);
  if (imgMatch) result.imageUrl = imgMatch[1];

  // Description — first three non-empty feature bullets. The search starts at
  // the feature-bullets container when the page has one, so list items that
  // appear earlier in the page are not picked up instead.
  const bulletsStart = html.search(/id=["']feature-bullets["']/i);
  const bulletScope = bulletsStart >= 0 ? html.slice(bulletsStart) : html;
  const bullets: string[] = [];
  for (const m of bulletScope.matchAll(/class=["']a-list-item["'][^>]*>([^<]+)/g)) {
    const text = (m[1] ?? '').trim();
    if (text) bullets.push(text);
    if (bullets.length === 3) break;
  }
  if (bullets.length > 0) result.description = bullets.join(' • ');

  return result;
}

/**
 * Shopify extraction — reads the `var meta = {...};` object embedded in a
 * storefront page's inline scripts.
 *
 * Product-level `title` / `price` are used when present, with the first
 * variant's `name` / `price` as a fallback; prices are stored in minor units
 * and are divided by 100. The currency is taken from an inline
 * `Shopify.currency = {"active": "XXX", ...}` assignment when the page has one.
 *
 * Only fields that are actually found are set, so missing values never
 * override data obtained from another extraction layer.
 *
 * @param html - Raw HTML of the page.
 * @returns The fields that were found, or `null` when the page has no
 *   parseable `meta.product` object.
 */
function extractShopify(html: string): Partial<ExtractedProduct> | null {
  const metaMatch = html.match(/var\s+meta\s*=\s*(\{[\s\S]*?\});/);
  if (!metaMatch) return null;

  let meta: unknown;
  try {
    meta = JSON.parse(metaMatch[1] ?? '');
  } catch {
    // The captured text is not valid JSON (e.g. a JS object literal).
    return null;
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null;

  /** First argument that is a non-blank string, trimmed. */
  const firstText = (...values: unknown[]): string | undefined => {
    for (const value of values) {
      if (typeof value === 'string' && value.trim() !== '') return value.trim();
    }
    return undefined;
  };

  if (!isRecord(meta) || !isRecord(meta['product'])) return null;
  const product = meta['product'];
  const variants = product['variants'];
  const firstVariant: Record<string, unknown> =
    Array.isArray(variants) && isRecord(variants[0]) ? variants[0] : {};

  const result: Partial<ExtractedProduct> = {};

  const name = firstText(product['title'], firstVariant['name']);
  if (name) result.name = name;

  const cents = [product['price'], firstVariant['price']].find(
    (value): value is number => typeof value === 'number',
  );
  if (cents !== undefined && cents > 0) result.price = cents / 100;

  const currencyMatch = html.match(/Shopify\.currency\s*=\s*\{[^}]*"active"\s*:\s*"([A-Za-z]{3})"/);
  if (currencyMatch) result.currency = (currencyMatch[1] ?? '').toUpperCase();

  const description = firstText(product['description']);
  if (description) result.description = description;

  const sku = firstText(firstVariant['sku']);
  if (sku) result.sku = sku;

  return result;
}

/**
 * Get the HTML `<title>` text as a last-resort name fallback.
 *
 * @param html - Raw HTML of the page.
 * @returns The title with surrounding whitespace removed and internal
 *   whitespace runs collapsed to single spaces, or `null` when the page has
 *   no `<title>` or the title is blank.
 */
function extractTitle(html: string): string | null {
  // `\b` keeps the pattern from matching other tag names that start with "title".
  const m = html.match(/<title\b[^>]*>([^<]+)<\/title>/i);
  const title = (m?.[1] ?? '').replace(/\s+/g, ' ').trim();
  return title === '' ? null : title;
}

// ── Public API ──

/**
 * Extract product information from a URL.
 *
 * Cascading strategy: JSON-LD → platform-specific → meta tags → fallback.
 * Each layer only contributes the fields it actually found; for any given
 * field the highest-priority layer that has a usable value wins.
 *
 * @param url - Absolute URL of the product page.
 * @returns The extracted product. `name` falls back to the page `<title>` and
 *   then to the domain; `currency` falls back to `'USD'`; every other field
 *   that could not be found is `null`.
 * @throws Whatever `fetchHtml` throws when the page cannot be downloaded.
 */
export async function extractProductFromUrl(url: string): Promise<ExtractedProduct> {
  const html = await fetchHtml(url);
  const domain = extractDomain(url);
  const platform = detectPlatform(url, html);

  // Layer 1: JSON-LD (most reliable)
  const jsonLd = extractJsonLd(html);

  // Layer 2: Platform-specific
  let platformData: Partial<ExtractedProduct> | null = null;
  if (platform === 'amazon') platformData = extractAmazon(html, url);
  else if (platform === 'shopify') platformData = extractShopify(html);
  // Etsy and WooCommerce rely on JSON-LD / meta tags

  // Layer 3: Meta tags
  const metaTags = extractMetaTags(html);

  // Merge from lowest to highest priority. Unusable values (undefined, empty
  // string, null or any other object) are skipped rather than copied, so a
  // higher-priority layer that lacks a field cannot erase a lower layer's value.
  const merged: Partial<ExtractedProduct> = {};
  for (const layer of [metaTags, platformData, jsonLd]) {
    if (!layer) continue;
    for (const [key, value] of Object.entries(layer)) {
      if (value === undefined || value === '' || typeof value === 'object') continue;
      Object.assign(merged, { [key]: value });
    }
  }

  // Fallback name from <title>, then the domain
  const name = merged.name || extractTitle(html) || domain;

  // Resolve relative (and protocol-relative) image URLs against the page URL
  let imageUrl: string | null = typeof merged.imageUrl === 'string' ? merged.imageUrl : null;
  if (imageUrl && !/^https?:\/\//i.test(imageUrl)) {
    try {
      imageUrl = new URL(imageUrl, url).href;
    } catch {
      /* leave as-is */
    }
  }

  // Vendor name: og:site_name when present, otherwise the domain. The value is
  // delimited by its own opening quote so names such as "Macy's" stay intact.
  const siteNameMatch =
    html.match(/<meta[^>]*property=["']og:site_name["'][^>]*content=(?:"([^"]*)"|'([^']*)')/i) ||
    html.match(/<meta[^>]*content=(?:"([^"]*)"|'([^']*)')[^>]*property=["']og:site_name["']/i);
  const vendorName = (siteNameMatch?.[1] ?? siteNameMatch?.[2] ?? '').trim() || domain;

  return {
    name,
    price: merged.price ?? null,
    currency: merged.currency || 'USD',
    description: merged.description ?? null,
    imageUrl,
    sourceUrl: url,
    sku: merged.sku ?? null,
    vendor: {
      name: vendorName,
      domain,
      platform,
    },
  };
}