// rspace-online/modules/rcart/extract.ts

/**
 * Server-side product extraction from URLs.
 *
 * Ported from UniCart extension's content.ts ProductDetector,
 * adapted for server-side HTML parsing (no DOM — regex-based).
 * Reuses the fetch pattern from /api/link-preview in server/index.ts.
 */

/** Normalised product record returned by extractProductFromUrl. */
export interface ExtractedProduct {
	/** Product name; extractProductFromUrl falls back to the page <title>, then the domain, when no extractor finds one. */
	name: string;
	/** Numeric price, or null when no price was found. */
	price: number | null;
	/** Currency code as reported by the page; 'USD' is used when the page reports none. */
	currency: string;
	/** Product description, or null when none was found. */
	description: string | null;
	/** Product image URL (relative paths are resolved against sourceUrl), or null when none was found. */
	imageUrl: string | null;
	/** The URL that was passed to extractProductFromUrl. */
	sourceUrl: string;
	/** Product identifier (e.g. the ASIN parsed from an Amazon URL), or null when none was found. */
	sku: string | null;
	/** Information about the site the product was found on. */
	vendor: {
		/** The page's og:site_name when declared, otherwise the domain. */
		name: string;
		/** Hostname of sourceUrl without a leading "www.". */
		domain: string;
		/** Value returned by detectPlatform: 'amazon', 'etsy', 'shopify', 'woocommerce', or null when unrecognised. */
		platform: string | null;
	};
}

/** Delay in milliseconds after which fetchHtml aborts the in-flight request. */
const FETCH_TIMEOUT = 5000;
/**
 * User-Agent header sent by fetchHtml (a desktop Chrome on Windows string).
 * NOTE(review): presumably chosen so shops serve the same HTML a browser would get — confirm.
 */
const USER_AGENT =
	'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36';

/**
 * Fetch a page's HTML with browser-like headers and a hard timeout.
 *
 * Only `http:` and `https:` URLs are accepted: the URL is fetched from the
 * server, and some runtimes' `fetch` also resolves `file:`, `data:` or `blob:`
 * URLs, which must never be reachable from a caller-supplied address.
 * NOTE(review): this does not block requests to private/internal hosts; add an
 * allow/deny list at the call site if the URL comes from untrusted users.
 *
 * @param url Absolute URL of the page to download.
 * @returns The response body decoded as text.
 * @throws TypeError if `url` is not a valid absolute URL.
 * @throws Error if the protocol is not http(s) or the response status is not OK
 *   (`HTTP <status>`). The promise also rejects when the request is aborted
 *   after FETCH_TIMEOUT milliseconds.
 */
async function fetchHtml(url: string): Promise<string> {
	const { protocol } = new URL(url);
	if (protocol !== 'http:' && protocol !== 'https:') {
		throw new Error(`Unsupported URL protocol: ${protocol}`);
	}

	const controller = new AbortController();
	// The timer stays armed while the body is read, so slow bodies are aborted too.
	const timer = setTimeout(() => controller.abort(), FETCH_TIMEOUT);
	try {
		const resp = await fetch(url, {
			signal: controller.signal,
			headers: {
				'User-Agent': USER_AGENT,
				Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
				'Accept-Language': 'en-US,en;q=0.5',
			},
			redirect: 'follow',
		});
		if (!resp.ok) throw new Error(`HTTP ${resp.status}`);
		return await resp.text();
	} finally {
		clearTimeout(timer);
	}
}

/**
 * Return the hostname of `url` with a single leading "www." removed.
 * When `url` cannot be parsed as a URL, the input string is returned unchanged.
 */
function extractDomain(url: string): string {
	let hostname: string;
	try {
		hostname = new URL(url).hostname;
	} catch {
		// Not a parseable URL — hand the raw input back.
		return url;
	}
	const prefix = 'www.';
	return hostname.startsWith(prefix) ? hostname.slice(prefix.length) : hostname;
}

/**
 * Identify the e-commerce platform behind a product page.
 *
 * Amazon and Etsy are recognised from the hostname; Shopify and WooCommerce
 * from marker strings in the page HTML.
 *
 * @param url  URL of the page (only its hostname is inspected).
 * @param html Raw HTML of the page.
 * @returns 'amazon', 'etsy', 'shopify', 'woocommerce', or null when nothing matches.
 */
function detectPlatform(url: string, html: string): string | null {
	const domain = extractDomain(url);
	// Match whole hostname labels so look-alikes such as "betsy.com" or
	// "amazon.evil.example" are not misclassified. Amazon storefronts live under
	// amazon.<tld> or amazon.<sld>.<cc> (amazon.de, amazon.co.uk, amazon.com.au).
	if (/(?:^|\.)amazon\.[a-z]{2,3}(?:\.[a-z]{2})?$/.test(domain)) return 'amazon';
	if (domain === 'etsy.com' || domain.endsWith('.etsy.com')) return 'etsy';
	if (html.includes('Shopify.') || html.includes('cdn.shopify.com')) return 'shopify';
	if (html.includes('woocommerce')) return 'woocommerce';
	return null;
}

// ── Extractors ──

/**
 * Extract product data from JSON-LD structured data.
 *
 * Scans every `<script type="application/ld+json">` block and uses the first
 * node whose `@type` is (or includes) "Product". A node may be the document
 * root, an element of a top-level array, or an entry of an `@graph` array.
 *
 * Only fields actually present on the page are set on the result, so that
 * spreading it over lower-priority data never replaces a real value with a
 * placeholder.
 *
 * @param html Raw page HTML.
 * @returns The fields found on the first Product node, or null when no block
 *   contains a Product.
 */
function extractJsonLd(html: string): Partial<ExtractedProduct> | null {
	const scriptRegex = /<script[^>]*type=["']application\/ld\+json["'][^>]*>([\s\S]*?)<\/script>/gi;

	const isRecord = (value: unknown): value is Record<string, unknown> =>
		typeof value === 'object' && value !== null && !Array.isArray(value);
	// schema.org properties may hold a single value or a list; take the first.
	const firstOf = (value: unknown): unknown => (Array.isArray(value) ? value[0] : value);
	const textOf = (value: unknown): string | undefined => {
		if (typeof value !== 'string') return undefined;
		const trimmed = value.trim();
		return trimmed === '' ? undefined : trimmed;
	};
	const isProduct = (node: unknown): node is Record<string, unknown> => {
		if (!isRecord(node)) return false;
		const type = node['@type'];
		return Array.isArray(type) ? type.includes('Product') : type === 'Product';
	};
	// A root node followed by the entries of its @graph, if it has one.
	const withGraph = (root: unknown): unknown[] => {
		const graph = isRecord(root) ? root['@graph'] : undefined;
		return Array.isArray(graph) ? [root, ...graph] : [root];
	};

	let match: RegExpExecArray | null;
	while ((match = scriptRegex.exec(html)) !== null) {
		let data: unknown;
		try {
			data = JSON.parse(match[1]);
		} catch {
			continue; // Malformed JSON — try the next script block.
		}

		const roots: unknown[] = Array.isArray(data) ? data : [data];
		const product = roots.flatMap(withGraph).find(isProduct);
		if (!product) continue;

		const offer = firstOf(product['offers']);
		const offerFields: Record<string, unknown> = isRecord(offer) ? offer : {};
		const rawPrice = offerFields['price'] || offerFields['lowPrice'];
		const price = typeof rawPrice === 'number' ? rawPrice : parseFloat(String(rawPrice ?? ''));

		// `image` may be a URL string or an ImageObject carrying the URL.
		const image = firstOf(product['image']);
		const imageUrl = isRecord(image)
			? (textOf(image['url']) ?? textOf(image['contentUrl']))
			: textOf(image);

		const rawSku = product['sku'];
		const sku = typeof rawSku === 'number' ? String(rawSku) : textOf(rawSku);

		const result: Partial<ExtractedProduct> = {};
		const name = textOf(product['name']);
		if (name) result.name = name;
		if (price > 0) result.price = price; // NaN and non-positive values are ignored
		const currency = textOf(offerFields['priceCurrency']);
		if (currency) result.currency = currency;
		const description = textOf(product['description']);
		if (description) result.description = description;
		if (imageUrl) result.imageUrl = imageUrl;
		if (sku) result.sku = sku;
		return result;
	}
	return null;
}

/**
 * Extract product data from Open Graph / product `<meta property="…">` tags.
 *
 * Reads og:title, og:description, og:image, the price amount
 * (product:price:amount, then og:price:amount) and the price currency
 * (product:price:currency, then og:price:currency). Attribute values are
 * matched up to their own closing quote and HTML entities are decoded.
 *
 * @param html Raw page HTML.
 * @returns An object holding only the fields that were found (possibly empty).
 */
function extractMetaTags(html: string): Partial<ExtractedProduct> {
	const result: Partial<ExtractedProduct> = {};

	const namedEntities = new Map<string, string>([
		['amp', '&'],
		['lt', '<'],
		['gt', '>'],
		['quot', '"'],
		['apos', "'"],
	]);
	// Decode the predefined named entities and numeric character references;
	// anything unrecognised is left untouched.
	const decodeEntities = (text: string): string =>
		text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (whole: string, body: string) => {
			if (body.startsWith('#')) {
				const isHex = body[1] === 'x' || body[1] === 'X';
				const code = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
				return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
			}
			return namedEntities.get(body) ?? whole;
		});

	const getMetaContent = (property: string): string | null => {
		const escaped = property.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
		// Group 1 is the opening quote, group 2 the value up to the SAME quote,
		// so content="Men's" is not cut short at the apostrophe.
		const content = `content=(["'])((?:(?!\\1)[\\s\\S])*)\\1`;
		const propertyFirst = new RegExp(`<meta[^>]*property=["']${escaped}["'][^>]*${content}`, 'i');
		const contentFirst = new RegExp(`<meta[^>]*${content}[^>]*property=["']${escaped}["']`, 'i');
		const m = html.match(propertyFirst) || html.match(contentFirst);
		return m ? decodeEntities(m[2]) : null;
	};

	const title = getMetaContent('og:title');
	if (title) result.name = title;

	const priceAmount = getMetaContent('product:price:amount') || getMetaContent('og:price:amount');
	if (priceAmount) {
		const p = parseFloat(priceAmount);
		if (p > 0) result.price = p;
	}

	const priceCurrency =
		getMetaContent('product:price:currency') || getMetaContent('og:price:currency');
	if (priceCurrency) result.currency = priceCurrency;

	const image = getMetaContent('og:image');
	if (image) result.imageUrl = image;

	const description = getMetaContent('og:description');
	if (description) result.description = description;

	return result;
}

/**
 * Amazon-specific extraction via regex on HTML.
 *
 * Reads the `#productTitle` text, the first positive price among the known
 * price containers, the ASIN from the URL path, the `#landingImage` source and
 * up to three non-blank `a-list-item` texts as the description.
 *
 * @param html Raw page HTML.
 * @param url  URL of the product page (used only for the ASIN).
 * @returns The fields that were found, or null when the page has no product title.
 */
function extractAmazon(html: string, url: string): Partial<ExtractedProduct> | null {
	const result: Partial<ExtractedProduct> = {};

	// Title — without one the page is not treated as a product page.
	const titleMatch = html.match(/id=["']productTitle["'][^>]*>([^<]+)</i);
	const name = titleMatch?.[1]?.trim();
	if (!name) return null;
	result.name = name;

	// Price — look for common Amazon price patterns
	const pricePatterns = [
		/class=["']a-offscreen["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
		/id=["']priceblock_ourprice["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
		/id=["']priceblock_dealprice["'][^>]*>([£€$¥₹]?)\s*([\d,]+\.?\d*)/,
		/"price":\s*"?([\d.]+)"?/,
	];
	// Only symbols that identify a single currency; "$" and "¥" are shared by
	// several currencies and are left for other layers / the caller's default.
	const currencyBySymbol = new Map<string, string>([
		['£', 'GBP'],
		['€', 'EUR'],
		['₹', 'INR'],
	]);
	for (const re of pricePatterns) {
		const m = html.match(re);
		if (!m) continue;
		// Symbol patterns capture (symbol, amount); the JSON pattern captures (amount) only.
		const [, first, second] = m;
		const priceStr = second || first;
		const price = parseFloat(priceStr.replace(/,/g, ''));
		if (!(price > 0)) continue;
		result.price = price;
		const code = second ? currencyBySymbol.get(first) : undefined;
		if (code) result.currency = code;
		break;
	}

	// ASIN from URL
	const asinMatch = url.match(/\/(?:dp|gp\/product)\/([A-Z0-9]{10})/);
	if (asinMatch) result.sku = asinMatch[1];

	// Image
	const imgMatch = html.match(/id=["']landingImage["'][^>]*src=["']([^"']+)/i);
	if (imgMatch) result.imageUrl = imgMatch[1];

	// Description — first three list items that actually contain text. Blank
	// items are skipped before counting so they cannot use up the three slots.
	const bulletRegex = /class=["']a-list-item["'][^>]*>([^<]+)/g;
	const bullets: string[] = [];
	let bullet: RegExpExecArray | null;
	while (bullets.length < 3 && (bullet = bulletRegex.exec(html)) !== null) {
		const text = bullet[1].trim();
		if (text) bullets.push(text);
	}
	if (bullets.length > 0) result.description = bullets.join(' • ');

	return result;
}

/**
 * Shopify extraction — look for `var meta = {...}` in scripts.
 *
 * Reads title, description and price from `meta.product`, falling back to the
 * first variant for the price, and takes the SKU from the first variant.
 * Prices are stored in minor units and are divided by 100. Only fields that
 * are present are set, and no currency is assumed, so lower-priority data
 * (e.g. meta tags) is not overwritten with placeholders.
 *
 * @param html Raw page HTML.
 * @returns The fields that were found, or null when there is no parseable
 *   `meta.product` object.
 */
function extractShopify(html: string): Partial<ExtractedProduct> | null {
	const metaMatch = html.match(/var\s+meta\s*=\s*(\{[\s\S]*?\});/);
	if (!metaMatch) return null;

	const isRecord = (value: unknown): value is Record<string, unknown> =>
		typeof value === 'object' && value !== null && !Array.isArray(value);

	let meta: unknown;
	try {
		meta = JSON.parse(metaMatch[1]);
	} catch {
		return null; // Not valid JSON — nothing to extract.
	}
	const product = isRecord(meta) ? meta['product'] : undefined;
	if (!isRecord(product)) return null;

	const variants = product['variants'];
	const firstVariant: unknown = Array.isArray(variants) ? variants[0] : undefined;
	const variant: Record<string, unknown> = isRecord(firstVariant) ? firstVariant : {};

	const result: Partial<ExtractedProduct> = {};

	const title = product['title'];
	if (typeof title === 'string' && title !== '') result.name = title;

	// Product-level price first, then the first variant's price.
	const cents = [product['price'], variant['price']].find(
		(value): value is number => typeof value === 'number',
	);
	if (cents !== undefined && cents > 0) result.price = cents / 100;

	// NOTE(review): assumes the storefront script assigns
	// `ShopifyAnalytics.meta.currency = '<CODE>'`; when it does not, the currency
	// is simply left unset — confirm against live storefront markup.
	const currencyMatch = html.match(/ShopifyAnalytics\.meta\.currency\s*=\s*["']([A-Z]{3})["']/);
	if (currencyMatch) result.currency = currencyMatch[1];

	const description = product['description'];
	if (typeof description === 'string' && description !== '') result.description = description;

	const sku = variant['sku'];
	if (typeof sku === 'string' && sku !== '') result.sku = sku;

	return result;
}

/**
 * Last-resort name source: the trimmed text of the first `<title>` element
 * whose content is plain text, or null when the page has none.
 */
function extractTitle(html: string): string | null {
	const found = /<title[^>]*>([^<]+)<\/title>/i.exec(html);
	if (found === null) return null;
	return found[1].trim();
}

// ── Public API ──

/**
 * Overlay extraction layers from lowest to highest priority.
 * Fields that are null, undefined or an empty string are skipped, so a value
 * missing from a higher-priority layer never hides one found by a lower layer.
 */
function mergeProductLayers(
	...layers: Array<Partial<ExtractedProduct> | null>
): Partial<ExtractedProduct> {
	const merged: Record<string, unknown> = {};
	for (const layer of layers) {
		if (!layer) continue;
		for (const [key, value] of Object.entries(layer)) {
			if (value !== null && value !== undefined && value !== '') merged[key] = value;
		}
	}
	// Every key/value pair was copied from a Partial<ExtractedProduct>.
	return merged as Partial<ExtractedProduct>;
}

/**
 * Extract product information from a URL.
 *
 * Cascading strategy, highest priority first: JSON-LD → platform-specific
 * (Amazon, Shopify) → Open Graph meta tags → `<title>` / domain as the name
 * fallback. Each field comes from the highest-priority layer that actually
 * found a value for it.
 *
 * @param url Absolute URL of the product page.
 * @returns The extracted product; fields that could not be found are null
 *   (currency falls back to 'USD', name to the page title or domain).
 * @throws Whatever fetchHtml throws (invalid URL, non-OK status, timeout).
 */
export async function extractProductFromUrl(url: string): Promise<ExtractedProduct> {
	const html = await fetchHtml(url);
	const domain = extractDomain(url);
	const platform = detectPlatform(url, html);

	// Layer 1: JSON-LD (most reliable)
	const jsonLd = extractJsonLd(html);

	// Layer 2: Platform-specific
	let platformData: Partial<ExtractedProduct> | null = null;
	if (platform === 'amazon') platformData = extractAmazon(html, url);
	else if (platform === 'shopify') platformData = extractShopify(html);
	// Etsy and WooCommerce rely on JSON-LD / meta tags

	// Layer 3: Meta tags
	const metaTags = extractMetaTags(html);

	// Merge layers, lowest priority first; later arguments win field by field.
	const merged = mergeProductLayers(metaTags, platformData, jsonLd);

	// Fallback name from <title>, then the bare domain.
	const name = merged.name || extractTitle(html) || domain;

	// Resolve relative and protocol-relative image URLs against the page URL.
	// Structured data may carry a non-string image; treat that as "no image".
	let imageUrl = typeof merged.imageUrl === 'string' ? merged.imageUrl : null;
	if (imageUrl && !/^https?:\/\//i.test(imageUrl)) {
		try {
			imageUrl = new URL(imageUrl, url).href;
		} catch { /* leave as-is */ }
	}

	// Vendor name: og:site_name when present and non-empty, otherwise the domain.
	// The value is matched up to its own closing quote so names such as
	// "Macy's" are not cut at the apostrophe.
	const siteNameMatch =
		html.match(/<meta[^>]*property=["']og:site_name["'][^>]*content=(["'])((?:(?!\1)[\s\S])*)\1/i) ||
		html.match(/<meta[^>]*content=(["'])((?:(?!\1)[\s\S])*)\1[^>]*property=["']og:site_name["']/i);
	const vendorName = siteNameMatch?.[2]?.trim() || domain;

	return {
		name,
		price: merged.price ?? null,
		currency: merged.currency || 'USD',
		description: merged.description ?? null,
		imageUrl,
		sourceUrl: url,
		sku: merged.sku ?? null,
		vendor: {
			name: vendorName,
			domain,
			platform,
		},
	};
}