26 lines
763 B
TypeScript
26 lines
763 B
TypeScript
/**
 * Trigram + Jaccard similarity utilities for MI knowledge ranking.
 * Pure functions, zero dependencies.
 */
|
|
|
|
/**
 * Extract the set of distinct character trigrams from `text`.
 *
 * The text is lowercased, then every character that is not a letter, a
 * combining mark, a digit, an underscore or whitespace is removed before the
 * 3-character windows are taken. Whitespace is kept, so trigrams can span
 * word boundaries.
 *
 * @param text - Input string to split into trigrams.
 * @returns The distinct 3-character windows; empty when fewer than three
 *   characters remain after normalisation.
 */
export function trigrams(text: string): Set<string> {
  // Unicode-aware filter: the ASCII-only `\w` class would delete every
  // non-ASCII letter, leaving accented or non-Latin text with few or no
  // trigrams. For pure-ASCII input this keeps exactly what `\w\s` kept.
  const cleaned = text.toLowerCase().replace(/[^\p{L}\p{M}\p{N}_\s]/gu, "");

  // Window over code points rather than UTF-16 units so letters outside the
  // Basic Multilingual Plane are never split into lone surrogates.
  const chars = Array.from(cleaned);

  const grams = new Set<string>();
  for (let i = 0; i + 3 <= chars.length; i++) {
    grams.add(chars.slice(i, i + 3).join(""));
  }
  return grams;
}
|
|
|
|
/**
 * Jaccard index of two trigram sets: |a ∩ b| / |a ∪ b|.
 *
 * @param a - First trigram set.
 * @param b - Second trigram set.
 * @returns A value in the range 0..1; 0 when both sets are empty.
 */
export function jaccardSimilarity(a: Set<string>, b: Set<string>): number {
  let shared = 0;
  a.forEach((gram) => {
    if (b.has(gram)) shared += 1;
  });

  // Inclusion–exclusion: |a ∪ b| = |a| + |b| − |a ∩ b|.
  const unionSize = a.size + b.size - shared;
  if (unionSize === 0) {
    // Only reachable when both sets are empty.
    return 0;
  }
  return shared / unionSize;
}
|