/**
 * Server-side product extraction from URLs.
 *
 * Ported from UniCart extension's content.ts ProductDetector,
 * adapted for server-side HTML parsing (no DOM — regex-based).
 * Reuses the fetch pattern from /api/link-preview in server/index.ts.
 */

export interface ExtractedProduct {
  name: string;
  price: number | null;
  currency: string;
  description: string | null;
  imageUrl: string | null;
  sourceUrl: string;
  sku: string | null;
  vendor: {
    name: string;
    domain: string;
    platform: string | null;
  };
}

/**
 * Product fields as produced by a single extraction layer: every field is
 * optional and "not found" may also be spelled `null`.
 */
type LooseProductFields = { [K in keyof ExtractedProduct]?: ExtractedProduct[K] | null };

const FETCH_TIMEOUT = 5000;
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/** Upper bound on `};` terminators tried while locating the end of Shopify's `var meta` literal. */
const SHOPIFY_META_MAX_ATTEMPTS = 10;

/** Currency symbols that identify exactly one currency (`$` and `¥` are ambiguous and omitted). */
const AMAZON_CURRENCY_BY_SYMBOL = new Map([
  ['£', 'GBP'],
  ['€', 'EUR'],
  ['₹', 'INR'],
]);

/** The handful of named HTML entities that commonly appear in titles and meta content. */
const NAMED_ENTITIES = new Map([
  ['amp', '&'],
  ['lt', '<'],
  ['gt', '>'],
  ['quot', '"'],
  ['apos', "'"],
  ['nbsp', '\u00a0'],
]);

// ── Shared helpers ──

/** True for plain (non-null, non-array) objects. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null && !Array.isArray(value);
}

/** Escape a literal string for safe interpolation into a `RegExp` source. */
function escapeRegExp(text: string): string {
  return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Decode numeric and the most common named HTML entities; unknown entities are left untouched. */
function decodeEntities(text: string): string {
  return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi, (whole: string, body: string) => {
    if (body.startsWith('#')) {
      const isHex = body[1] === 'x' || body[1] === 'X';
      const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
      // String.fromCodePoint throws outside this range, so keep the raw text instead.
      return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
    }
    return NAMED_ENTITIES.get(body.toLowerCase()) ?? whole;
  });
}

/** Entity-decode and trim a value; returns `undefined` for non-strings and blank strings. */
function cleanText(value: unknown): string | undefined {
  if (typeof value !== 'string') return undefined;
  const text = decodeEntities(value).trim();
  return text === '' ? undefined : text;
}

/**
 * Parse a price from a number or a display string.
 *
 * Takes the first numeric token of the string and copes with thousands
 * separators and decimal commas (`"1,299.00"`, `"1.299,50"`, `"12,99"`).
 *
 * @returns the price, or `null` when it is missing, unparsable, negative or zero.
 */
function parsePrice(value: unknown): number | null {
  if (typeof value === 'number') {
    return Number.isFinite(value) && value > 0 ? value : null;
  }
  if (typeof value !== 'string') return null;

  const tokenMatch = /(-)?(\d[\d.,]*)/.exec(value);
  const rawToken = tokenMatch?.[2];
  if (rawToken === undefined || tokenMatch?.[1] !== undefined) return null;

  const token = rawToken.replace(/[.,]+$/, '');
  const lastDot = token.lastIndexOf('.');
  const lastComma = token.lastIndexOf(',');

  let normalized = token;
  if (lastDot !== -1 && lastComma !== -1) {
    // Both separators present: whichever comes last is the decimal mark.
    normalized =
      lastDot > lastComma ? token.replace(/,/g, '') : token.replace(/\./g, '').replace(',', '.');
  } else if (lastComma !== -1) {
    // Comma only: a single comma followed by 1–2 digits is a decimal comma,
    // anything else ("1,299", "1,299,000") is a thousands separator.
    normalized = /^\d+,\d{1,2}$/.test(token) ? token.replace(',', '.') : token.replace(/,/g, '');
  }

  const parsed = parseFloat(normalized);
  return Number.isFinite(parsed) && parsed > 0 ? parsed : null;
}

/**
 * Merge extraction layers left to right: a later layer overrides an earlier
 * one, but only with values that were actually found. `null`, `undefined`
 * and `''` never erase a value supplied by another layer.
 */
function mergeLayers(...layers: Array<LooseProductFields | null>): Partial<ExtractedProduct> {
  const merged: Partial<ExtractedProduct> = {};
  for (const layer of layers) {
    if (!layer) continue;
    for (const [key, value] of Object.entries(layer)) {
      if (value != null && value !== '') Object.assign(merged, { [key]: value });
    }
  }
  return merged;
}

/**
 * Fetch URL HTML with browser-like headers and timeout.
 *
 * @throws Error for non-http(s) URLs and non-2xx responses; the underlying
 *   `URL` / `fetch` errors (invalid URL, network failure, abort on timeout)
 *   propagate unchanged.
 */
async function fetchHtml(url: string): Promise<string> {
  // NOTE(review): `url` is fetched from the server. If it can come from
  // untrusted clients, loopback/private hosts should also be rejected
  // (SSRF) — that policy is not visible in this file.
  const { protocol } = new URL(url);
  if (protocol !== 'http:' && protocol !== 'https:') {
    throw new Error(`Unsupported URL protocol: ${protocol}`);
  }

  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT);
  try {
    const resp = await fetch(url, {
      signal: controller.signal,
      headers: {
        'User-Agent': USER_AGENT,
        Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
      },
      redirect: 'follow',
    });
    if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
    // Read the body inside the try so the timeout also covers a stalled download.
    return await resp.text();
  } finally {
    clearTimeout(timer);
  }
}

/** Extract domain from URL, stripping www. Returns the input unchanged if it is not a valid URL. */
function extractDomain(url: string): string {
  try {
    return new URL(url).hostname.replace(/^www\./, '');
  } catch {
    return url;
  }
}

/**
 * Detect the e-commerce platform: Amazon and Etsy from the hostname,
 * Shopify and WooCommerce from markers in the page HTML.
 */
function detectPlatform(url: string, html: string): string | null {
  const domain = extractDomain(url).toLowerCase();
  // Anchor on a label boundary so "notamazon.example.com" is not mistaken for Amazon.
  if (/(^|\.)amazon\.[a-z]{2,3}(\.[a-z]{2})?$/.test(domain)) return 'amazon';
  if (domain === 'etsy.com' || domain.endsWith('.etsy.com')) return 'etsy';
  if (html.includes('Shopify.') || html.includes('cdn.shopify.com')) return 'shopify';
  if (html.includes('woocommerce')) return 'woocommerce';
  return null;
}

// ── Extractors ──

/** True when a JSON-LD node's `@type` is, or includes, `Product`. */
function isProductNode(node: unknown): node is Record<string, unknown> {
  if (!isRecord(node)) return false;
  const type = node['@type'];
  return Array.isArray(type) ? type.includes('Product') : type === 'Product';
}

/** Find the first Product node in a JSON-LD document: top level, top-level array, or `@graph`. */
function findJsonLdProduct(data: unknown): Record<string, unknown> | null {
  const candidates: unknown[] = Array.isArray(data) ? data : [data];
  for (const candidate of candidates) {
    if (isProductNode(candidate)) return candidate;
    if (isRecord(candidate)) {
      const graph = candidate['@graph'];
      if (Array.isArray(graph)) {
        const found = graph.find(isProductNode);
        if (found) return found;
      }
    }
  }
  return null;
}

/** Resolve JSON-LD `image`, which may be a URL string, an ImageObject, or an array of either. */
function jsonLdImageUrl(image: unknown): string | undefined {
  const first: unknown = Array.isArray(image) ? image[0] : image;
  return isRecord(first) ? cleanText(first['url']) : cleanText(first);
}

/**
 * Extract product data from JSON-LD structured data.
 *
 * @returns the fields found on the first Product node (fields that are
 *   absent are omitted, not set to `null`), or `null` if no script block
 *   contains a Product.
 */
function extractJsonLd(html: string): Partial<ExtractedProduct> | null {
  const scriptRegex =
    /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

  for (const match of html.matchAll(scriptRegex)) {
    let data: unknown;
    try {
      data = JSON.parse(match[1] ?? '');
    } catch {
      continue; // Invalid JSON in this block — try the next one.
    }

    const product = findJsonLdProduct(data);
    if (!product) continue;

    const offers = product['offers'];
    const firstOffer: unknown = Array.isArray(offers) ? offers[0] : offers;
    const offer = isRecord(firstOffer) ? firstOffer : {};
    const sku = product['sku'];

    return mergeLayers({
      name: cleanText(product['name']),
      price: parsePrice(offer['price']) ?? parsePrice(offer['lowPrice']),
      currency: cleanText(offer['priceCurrency']),
      description: cleanText(product['description']),
      imageUrl: jsonLdImageUrl(product['image']),
      sku: typeof sku === 'number' ? String(sku) : cleanText(sku),
    });
  }
  return null;
}

/**
 * Read the `content` of the first `<meta property="…">` tag with the given
 * property, in either attribute order.
 *
 * The content value may contain the quote character it is *not* delimited
 * by (e.g. `content="Joe's Mug"`). HTML entities are decoded.
 *
 * @returns the decoded, trimmed content, or `null` if absent or blank.
 */
function getMetaContent(html: string, property: string): string | null {
  const prop = escapeRegExp(property);
  // Group 1 = double-quoted content, group 2 = single-quoted content (same in both patterns).
  const content = `\\scontent=(?:"([^"]*)"|'([^']*)')`;
  const propFirst = new RegExp(`<meta\\b[^>]*\\sproperty=["']${prop}["'][^>]*?${content}`, 'i');
  const contentFirst = new RegExp(`<meta\\b[^>]*${content}[^>]*?\\sproperty=["']${prop}["']`, 'i');

  const m = propFirst.exec(html) ?? contentFirst.exec(html);
  return cleanText(m?.[1] ?? m?.[2]) ?? null;
}

/** Extract product data from Open Graph / product meta tags. Fields not present are omitted. */
function extractMetaTags(html: string): Partial<ExtractedProduct> {
  return mergeLayers({
    name: getMetaContent(html, 'og:title'),
    price: parsePrice(
      getMetaContent(html, 'product:price:amount') ?? getMetaContent(html, 'og:price:amount'),
    ),
    currency:
      getMetaContent(html, 'product:price:currency') ?? getMetaContent(html, 'og:price:currency'),
    imageUrl: getMetaContent(html, 'og:image'),
    description: getMetaContent(html, 'og:description'),
  });
}

/**
 * Amazon-specific extraction via regex on HTML.
 *
 * Reads the product title, the first positive price among several known
 * price containers, the ASIN from the URL, the landing image, and up to
 * three non-empty feature bullets as the description.
 */
function extractAmazon(html: string, url: string): Partial<ExtractedProduct> | null {
  const fields: LooseProductFields = {};

  // Title
  fields.name = cleanText(html.match(/id=["']productTitle["'][^>]*>([^<]+)</i)?.[1]);

  // Price — symbol patterns capture (symbol, amount); the JSON pattern captures (amount) only.
  const pricePatterns = [
    // NOTE(review): the selector of this first pattern was lost in the source
    // as received; `a-offscreen` is an assumption — confirm against the original.
    /class=["']a-offscreen["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
    /id=["']priceblock_ourprice["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
    /id=["']priceblock_dealprice["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
    /"price":\s*"?([\d.]+)"?/,
  ];
  for (const pattern of pricePatterns) {
    const m = html.match(pattern);
    if (!m) continue;
    const price = parsePrice(m[2] ?? m[1]);
    if (price === null) continue;
    fields.price = price;
    // Only an unambiguous symbol sets the currency; otherwise leave it to other layers.
    if (m[2] !== undefined) fields.currency = AMAZON_CURRENCY_BY_SYMBOL.get(m[1] ?? '');
    break;
  }

  // ASIN from URL
  fields.sku = url.match(/\/(?:dp|gp\/product)\/([A-Z0-9]{10})/)?.[1];

  // Image
  fields.imageUrl = html.match(/id=["']landingImage["'][^>]*src=["']([^"']+)/i)?.[1];

  // Description — feature bullets. Drop whitespace-only items *before* taking three.
  const bullets = [...html.matchAll(/class=["']a-list-item["'][^>]*>([^<]+)/g)]
    .map((m) => cleanText(m[1]))
    .filter((text): text is string => text !== undefined)
    .slice(0, 3);
  if (bullets.length > 0) fields.description = bullets.join(' • ');

  return mergeLayers(fields);
}

/**
 * Locate and parse the object literal assigned by `var meta = {...};`.
 *
 * A string value inside the literal may itself contain `};`, so successive
 * `};` terminators are tried until one yields valid JSON.
 */
function parseShopifyMeta(html: string): unknown {
  const start = /var\s+meta\s*=\s*(?=\{)/.exec(html);
  if (!start) return null;

  const from = start.index + start[0].length;
  let end = html.indexOf('};', from);
  for (let attempt = 0; end !== -1 && attempt < SHOPIFY_META_MAX_ATTEMPTS; attempt++) {
    try {
      return JSON.parse(html.slice(from, end + 1));
    } catch {
      end = html.indexOf('};', end + 1);
    }
  }
  return null;
}

/**
 * Shopify extraction — look for `var meta = {...}` in scripts.
 *
 * Prices in that object are treated as minor units (divided by 100), as in
 * the original implementation. No currency is reported; it is left to the
 * other layers.
 *
 * @returns the fields found, or `null` when there is no parsable
 *   `meta.product`.
 */
function extractShopify(html: string): Partial<ExtractedProduct> | null {
  const meta = parseShopifyMeta(html);
  if (!isRecord(meta)) return null;
  const product = meta['product'];
  if (!isRecord(product)) return null;

  const variants = product['variants'];
  const firstVariant: unknown = Array.isArray(variants) ? variants[0] : undefined;
  const variant = isRecord(firstVariant) ? firstVariant : {};

  // Prefer a product-level price; fall back to the first variant's price.
  const minorUnits =
    typeof product['price'] === 'number' ? product['price'] : variant['price'];
  const price = typeof minorUnits === 'number' ? parsePrice(minorUnits / 100) : null;

  return mergeLayers({
    name: cleanText(product['title']),
    price,
    description: cleanText(product['description']),
    sku: cleanText(variant['sku']),
  });
}

/** Get HTML <title> as last-resort name fallback. */
function extractTitle(html: string): string | null {
  const m = html.match(/<title[^>]*>([^<]+)<\/title>/i);
  return cleanText(m?.[1])?.replace(/\s+/g, ' ') ?? null;
}

/**
 * Build the final product record from already-fetched HTML.
 * Pure (no I/O) so it can be exercised without the network.
 */
function buildProductFromHtml(html: string, url: string): ExtractedProduct {
  const domain = extractDomain(url);
  const platform = detectPlatform(url, html);

  // Layer 1: JSON-LD (most reliable)
  const jsonLd = extractJsonLd(html);

  // Layer 2: Platform-specific. Etsy and WooCommerce rely on JSON-LD / meta tags.
  let platformData: Partial<ExtractedProduct> | null = null;
  if (platform === 'amazon') platformData = extractAmazon(html, url);
  else if (platform === 'shopify') platformData = extractShopify(html);

  // Layer 3: Meta tags
  const metaTags = extractMetaTags(html);

  // Lowest priority first; a higher layer only overrides with values it actually found.
  const merged = mergeLayers(metaTags, platformData, jsonLd);

  // Resolve relative (and protocol-relative) image URLs against the page URL.
  let imageUrl = merged.imageUrl ?? null;
  if (imageUrl !== null && !/^https?:\/\//i.test(imageUrl)) {
    try {
      imageUrl = new URL(imageUrl, url).href;
    } catch {
      /* leave as-is */
    }
  }

  return {
    // Fallback name from <title>, then the bare domain.
    name: merged.name ?? extractTitle(html) ?? domain,
    price: merged.price ?? null,
    currency: merged.currency ?? 'USD',
    description: merged.description ?? null,
    imageUrl,
    sourceUrl: url,
    sku: merged.sku ?? null,
    vendor: {
      // Vendor name: og:site_name when present, otherwise the domain.
      name: getMetaContent(html, 'og:site_name') ?? domain,
      domain,
      platform,
    },
  };
}

// ── Public API ──

/**
 * Extract product information from a URL.
 * Cascading strategy: JSON-LD → platform-specific → meta tags → fallback.
 *
 * @param url - absolute http(s) URL of the product page.
 * @returns the extracted product; fields that could not be found are `null`
 *   (`currency` defaults to `'USD'`, `name` falls back to the page title or domain).
 * @throws if the URL is invalid or not http(s), the request fails or times
 *   out, or the server responds with a non-2xx status.
 */
export async function extractProductFromUrl(url: string): Promise<ExtractedProduct> {
  const html = await fetchHtml(url);
  return buildProductFromHtml(html, url);
}