Replace Whisper-tiny offline fallback with Parakeet.js (NVIDIA 0.6B v2)
Swap @xenova/transformers (whisper-tiny, ~45MB) for parakeet.js (Parakeet TDT 0.6B v2, ~634MB) loaded from CDN at runtime. Much higher transcription accuracy at the cost of larger initial model download. Uses indirect dynamic import to avoid Next.js/webpack bundling issues. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
d1bdb126af
commit
fbbe8d38d1
|
|
@ -2,8 +2,8 @@
|
||||||
const nextConfig = {
|
const nextConfig = {
|
||||||
output: 'standalone',
|
output: 'standalone',
|
||||||
webpack: (config, { isServer, webpack }) => {
|
webpack: (config, { isServer, webpack }) => {
|
||||||
// @xenova/transformers depends on onnxruntime-node (native .node binaries)
|
// Ignore onnxruntime-node if any dependency pulls it in.
|
||||||
// which can't be bundled by webpack. We only use the web ONNX runtime.
|
// We only use the browser ONNX runtime (loaded from CDN at runtime).
|
||||||
config.plugins.push(
|
config.plugins.push(
|
||||||
new webpack.IgnorePlugin({
|
new webpack.IgnorePlugin({
|
||||||
resourceRegExp: /onnxruntime-node/,
|
resourceRegExp: /onnxruntime-node/,
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -23,7 +23,6 @@
|
||||||
"@tiptap/pm": "^3.19.0",
|
"@tiptap/pm": "^3.19.0",
|
||||||
"@tiptap/react": "^3.19.0",
|
"@tiptap/react": "^3.19.0",
|
||||||
"@tiptap/starter-kit": "^3.19.0",
|
"@tiptap/starter-kit": "^3.19.0",
|
||||||
"@xenova/transformers": "^2.17.2",
|
|
||||||
"dompurify": "^3.2.0",
|
"dompurify": "^3.2.0",
|
||||||
"lowlight": "^3.3.0",
|
"lowlight": "^3.3.0",
|
||||||
"marked": "^15.0.0",
|
"marked": "^15.0.0",
|
||||||
|
|
|
||||||
|
|
@ -338,8 +338,8 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
if (!transcript) {
|
if (!transcript) {
|
||||||
// Fallback 2: offline Whisper via Transformers.js in browser
|
// Fallback 2: offline Parakeet via parakeet.js in browser
|
||||||
try {
|
try {
|
||||||
setOfflineProgress({ status: 'loading', message: 'Loading offline model...' });
|
setOfflineProgress({ status: 'loading', message: 'Loading Parakeet model...' });
|
||||||
const { transcribeOffline } = await import('@/lib/whisperOffline');
|
const { transcribeOffline } = await import('@/lib/parakeetOffline');
|
||||||
transcript = await transcribeOffline(blob, (p) => setOfflineProgress(p));
|
transcript = await transcribeOffline(blob, (p) => setOfflineProgress(p));
|
||||||
setOfflineProgress(null);
|
setOfflineProgress(null);
|
||||||
} catch (offlineErr) {
|
} catch (offlineErr) {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,157 @@
|
||||||
|
/**
|
||||||
|
* Offline transcription using parakeet.js (NVIDIA Parakeet TDT 0.6B v2).
|
||||||
|
* Loaded at runtime from CDN to avoid Next.js/webpack bundling issues
|
||||||
|
* with onnxruntime-web's node-specific files.
|
||||||
|
* Model is ~634 MB (int8) on first download, cached in IndexedDB after.
|
||||||
|
* Much higher accuracy than Whisper-tiny at the cost of larger model size.
|
||||||
|
*/
|
||||||
|
|
||||||
|
// localStorage key for the best-effort "model was downloaded before" flag.
const CACHE_KEY = 'parakeet-offline-cached';
|
||||||
|
|
||||||
|
/**
 * Progress update reported while the offline model is loaded and used.
 * The name is kept from the earlier Whisper-based implementation.
 */
export interface WhisperProgress {
  /** Phase the offline transcription flow is currently in. */
  status:
    | 'checking'
    | 'downloading'
    | 'loading'
    | 'transcribing'
    | 'done'
    | 'error';
  /** Download percentage (0-100); only sent with `downloading` updates. */
  progress?: number;
  /** File the most recent download update refers to. */
  file?: string;
  /** Human-readable description of the current state. */
  message?: string;
}

/** Receives every progress update emitted by this module. */
type ProgressCallback = (progress: WhisperProgress) => void;
|
||||||
|
|
||||||
|
// Singleton model — don't reload on subsequent calls
|
||||||
|
// The loaded model; stays null until a load has completed.
let cachedModel: any = null;
|
||||||
|
// Promise of the load currently in progress, so concurrent callers share it.
let loadingPromise: Promise<any> | null = null;
|
||||||
|
|
||||||
|
/**
 * Check if the Parakeet model has been downloaded before.
 *
 * Best-effort check via a localStorage flag; the actual model cache lives in
 * IndexedDB, so the flag can be stale in either direction.
 *
 * @returns `true` only when the flag is readable and set to `'true'`;
 *          `false` outside the browser or when storage cannot be read.
 */
export function isModelCached(): boolean {
  if (typeof window === 'undefined') return false;
  try {
    return localStorage.getItem(CACHE_KEY) === 'true';
  } catch {
    // Reading storage can throw (e.g. SecurityError when storage is blocked).
    // This is only a hint, so report "not cached" instead of crashing.
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Detect WebGPU availability in the current browser.
 *
 * @returns `true` only when `navigator.gpu` exists and `requestAdapter()`
 *          resolves to a truthy adapter; `false` when there is no navigator,
 *          no `gpu` entry point, no adapter, or the request throws.
 */
async function detectWebGPU(): Promise<boolean> {
  if (typeof navigator === 'undefined') return false;

  // `gpu` may be absent from the DOM typings in use, so describe only the
  // part that is needed instead of casting to `any`.
  const { gpu } = navigator as Navigator & {
    gpu?: { requestAdapter(): Promise<unknown> };
  };
  if (!gpu) return false;

  try {
    return Boolean(await gpu.requestAdapter());
  } catch {
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Get or create the Parakeet model singleton.
 *
 * Returns the already-loaded model when there is one, joins a load that is
 * in progress, and otherwise starts a new load. A failed load is not
 * remembered, so the next call starts over.
 *
 * @param onProgress Optional progress sink. Only the callback of the call
 *                   that starts a load is notified; callers that join a load
 *                   in progress receive no updates.
 * @returns The loaded model object.
 * @throws Whatever the CDN import or `fromHub` rejects with.
 */
async function getModel(onProgress?: ProgressCallback): Promise<any> {
  if (cachedModel) return cachedModel;
  if (loadingPromise) return loadingPromise;

  const pending = (async () => {
    onProgress?.({ status: 'loading', message: 'Loading Parakeet model...' });

    // Load from CDN at runtime — avoids webpack/Terser issues with onnxruntime-web.
    // Use indirect dynamic import so webpack can't statically analyze the URL.
    const importModule = new Function('url', 'return import(url)');
    const { fromHub } = await importModule('https://esm.sh/parakeet.js@1.1.2');

    const backend = (await detectWebGPU()) ? 'webgpu' : 'wasm';

    // Latest byte counts per file, so one overall percentage can be reported.
    const fileProgress: Record<string, { loaded: number; total: number }> = {};

    const model = await fromHub('parakeet-tdt-0.6b-v2', {
      backend,
      progress: ({ file, loaded, total }: { file: string; loaded: number; total: number }) => {
        fileProgress[file] = { loaded, total };

        let totalBytes = 0;
        let loadedBytes = 0;
        for (const fp of Object.values(fileProgress)) {
          totalBytes += fp.total || 0;
          loadedBytes += fp.loaded || 0;
        }

        if (totalBytes > 0) {
          // Clamp in case a file reports more loaded bytes than its total.
          const pct = Math.min(100, Math.round((loadedBytes / totalBytes) * 100));
          onProgress?.({
            status: 'downloading',
            progress: pct,
            file,
            message: `Downloading Parakeet model... ${pct}%`,
          });
        }
      },
    });

    try {
      localStorage.setItem(CACHE_KEY, 'true');
    } catch {
      // The flag is only a hint; a storage failure must not fail a good load.
    }
    onProgress?.({ status: 'loading', message: 'Model loaded' });

    cachedModel = model;
    return model;
  })();

  loadingPromise = pending;
  try {
    return await pending;
  } finally {
    // Clear on success AND failure. Leaving a rejected promise here would
    // make every later call fail without retrying the download.
    if (loadingPromise === pending) loadingPromise = null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Decode an audio Blob to Float32Array at 16 kHz mono.
 *
 * @param blob Encoded audio.
 * @returns Mono samples at 16 kHz; an empty array when the decoded audio
 *          contains no frames.
 * @throws Whatever `decodeAudioData` or the offline rendering rejects with.
 *         The temporary AudioContext is closed in every case.
 */
async function decodeAudioBlob(blob: Blob): Promise<Float32Array> {
  const encoded = await blob.arrayBuffer();
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const decoded = await audioCtx.decodeAudioData(encoded);

    // Nothing to resample. Without this guard a zero-length multi-channel
    // buffer would reach `new OfflineAudioContext(1, 0, ...)`, which throws,
    // while the mono path below already returns an empty array.
    if (decoded.length === 0) {
      return new Float32Array(0);
    }

    // Already 16 kHz mono — return directly
    if (decoded.sampleRate === 16000 && decoded.numberOfChannels === 1) {
      return decoded.getChannelData(0);
    }

    // Resample via OfflineAudioContext with a single output channel
    const frameCount = Math.ceil(decoded.duration * 16000);
    const offlineCtx = new OfflineAudioContext(1, frameCount, 16000);
    const source = offlineCtx.createBufferSource();
    source.buffer = decoded;
    source.connect(offlineCtx.destination);
    source.start();
    const rendered = await offlineCtx.startRendering();
    return rendered.getChannelData(0);
  } finally {
    await audioCtx.close();
  }
}
|
||||||
|
|
||||||
|
/**
 * Run speech-to-text on an audio Blob entirely in the browser with Parakeet.
 *
 * The first call has to download the model (~634 MB); later calls reuse the
 * model that is already loaded.
 *
 * @param audioBlob  Recorded audio to transcribe.
 * @param onProgress Optional sink for loading / transcribing / done / error updates.
 * @returns The trimmed transcript, or an empty string when the model returns no text.
 * @throws Rethrows any failure after reporting it with status `error`.
 */
export async function transcribeOffline(
  audioBlob: Blob,
  onProgress?: ProgressCallback
): Promise<string> {
  const report = (update: WhisperProgress): void => {
    onProgress?.(update);
  };

  try {
    const parakeet = await getModel(onProgress);
    report({ status: 'transcribing', message: 'Transcribing audio...' });

    const samples = await decodeAudioBlob(audioBlob);
    const output = await parakeet.transcribe(samples, 16000, {
      returnTimestamps: false,
      enableProfiling: false,
    });

    const transcript = output.utterance_text?.trim() || '';
    report({ status: 'done', message: 'Transcription complete' });
    return transcript;
  } catch (err) {
    report({
      status: 'error',
      message: err instanceof Error ? err.message : 'Transcription failed',
    });
    throw err;
  }
}
|
||||||
|
|
@ -1,120 +0,0 @@
|
||||||
/**
|
|
||||||
* Offline Whisper transcription using @xenova/transformers (Transformers.js v2).
|
|
||||||
* Dynamically imports the library to avoid SSR issues.
|
|
||||||
* Uses Xenova/whisper-tiny with quantized weights (~45MB download).
|
|
||||||
* Model is cached by the browser after first download.
|
|
||||||
*/
|
|
||||||
|
|
||||||
// Model identifier handed to the Transformers.js `pipeline()` call.
const MODEL_ID = 'Xenova/whisper-tiny';
|
|
||||||
// localStorage key for the best-effort "model was downloaded before" flag.
const CACHE_KEY = 'whisper-offline-cached';
|
|
||||||
|
|
||||||
/** Progress update reported while the offline Whisper model is loaded and used. */
export interface WhisperProgress {
  /** Phase the offline transcription flow is currently in. */
  status:
    | 'checking'
    | 'downloading'
    | 'loading'
    | 'transcribing'
    | 'done'
    | 'error';
  /** 0-100 for download progress. */
  progress?: number;
  /** File the most recent download update refers to. */
  file?: string;
  /** Human-readable description of the current state. */
  message?: string;
}

/** Receives every progress update emitted by this module. */
type ProgressCallback = (progress: WhisperProgress) => void;
|
|
||||||
|
|
||||||
// Keep a singleton pipeline so we don't reload on subsequent calls
|
|
||||||
// The loaded pipeline; stays null until a load has completed.
let cachedPipeline: any = null;
|
|
||||||
// Promise of the load currently in progress, so concurrent callers share it.
let loadingPromise: Promise<any> | null = null;
|
|
||||||
|
|
||||||
/**
 * Check if the Whisper model has been downloaded before.
 *
 * Note: this is a best-effort check via a localStorage flag. The actual
 * model cache is managed by Transformers.js via the Cache API.
 *
 * @returns `true` only when the flag is readable and set to `'true'`;
 *          `false` outside the browser or when storage cannot be read.
 */
export function isModelCached(): boolean {
  if (typeof window === 'undefined') return false;
  try {
    return localStorage.getItem(CACHE_KEY) === 'true';
  } catch {
    // Reading storage can throw (e.g. SecurityError when storage is blocked).
    // This is only a hint, so report "not cached" instead of crashing.
    return false;
  }
}
|
|
||||||
|
|
||||||
/**
 * Get or create the Whisper pipeline singleton.
 *
 * Returns the already-loaded pipeline when there is one, joins a load that
 * is in progress, and otherwise starts a new load. A failed load is not
 * remembered, so the next call starts over.
 *
 * @param onProgress Optional progress sink. Only the callback of the call
 *                   that starts a load is notified.
 * @returns The automatic-speech-recognition pipeline.
 * @throws Whatever the dynamic import or `pipeline()` rejects with.
 */
async function getPipeline(onProgress?: ProgressCallback): Promise<any> {
  if (cachedPipeline) return cachedPipeline;

  // Prevent multiple concurrent loads
  if (loadingPromise) return loadingPromise;

  const pending = (async () => {
    onProgress?.({ status: 'loading', message: 'Loading Whisper model...' });

    const { pipeline, env } = await import('@xenova/transformers');

    // Disable local model checks — always use browser cache / HF Hub
    env.allowLocalModels = false;

    const pipe = await pipeline('automatic-speech-recognition', MODEL_ID, {
      quantized: true,
      progress_callback: (p: any) => {
        if (p.status === 'progress' && p.progress !== undefined) {
          const pct = Math.round(p.progress);
          onProgress?.({
            status: 'downloading',
            progress: pct,
            file: p.file,
            message: `Downloading model... ${pct}%`,
          });
        } else if (p.status === 'ready') {
          try {
            localStorage.setItem(CACHE_KEY, 'true');
          } catch {
            // The flag is only a hint; a storage failure must not fail the load.
          }
          onProgress?.({ status: 'loading', message: 'Model loaded' });
        }
      },
    });

    cachedPipeline = pipe;
    return pipe;
  })();

  loadingPromise = pending;
  try {
    return await pending;
  } finally {
    // Clear on success AND failure. Leaving a rejected promise here would
    // make every later call fail without retrying the download.
    if (loadingPromise === pending) loadingPromise = null;
  }
}
|
|
||||||
|
|
||||||
/**
 * Decode an audio Blob to Float32Array at 16kHz mono.
 *
 * The blob is decoded through an AudioContext created with a 16 kHz sample
 * rate. Multi-channel audio is averaged into one channel so that no channel
 * is silently dropped.
 *
 * @param blob Encoded audio.
 * @returns Mono samples.
 * @throws Whatever `decodeAudioData` rejects with. The temporary
 *         AudioContext is closed in every case.
 */
async function decodeAudioBlob(blob: Blob): Promise<Float32Array> {
  const encoded = await blob.arrayBuffer();
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const decoded = await audioCtx.decodeAudioData(encoded);
    const channelCount = decoded.numberOfChannels;

    // Mono input needs no mixing.
    if (channelCount === 1) {
      return decoded.getChannelData(0);
    }

    // Average all channels instead of keeping only the first one.
    const mono = new Float32Array(decoded.length);
    for (let ch = 0; ch < channelCount; ch++) {
      const samples = decoded.getChannelData(ch);
      for (let i = 0; i < mono.length; i++) {
        mono[i] += samples[i] / channelCount;
      }
    }
    return mono;
  } finally {
    await audioCtx.close();
  }
}
|
|
||||||
|
|
||||||
/**
 * Run speech-to-text on an audio Blob entirely in the browser with Whisper.
 *
 * The first call has to download the model (~45MB); later calls reuse the
 * pipeline that is already loaded.
 *
 * @param audioBlob  Recorded audio to transcribe.
 * @param onProgress Optional sink for loading / transcribing / done / error updates.
 * @returns The trimmed transcript, or an empty string when the pipeline returns no text.
 * @throws Rethrows any failure after reporting it with status `error`.
 */
export async function transcribeOffline(
  audioBlob: Blob,
  onProgress?: ProgressCallback
): Promise<string> {
  const report = (update: WhisperProgress): void => {
    onProgress?.(update);
  };

  try {
    const whisper = await getPipeline(onProgress);
    report({ status: 'transcribing', message: 'Transcribing audio...' });

    const samples = await decodeAudioBlob(audioBlob);
    const output = await whisper(samples, {
      language: 'en',
      return_timestamps: false,
    });

    const transcript = (output as any).text?.trim() || '';
    report({ status: 'done', message: 'Transcription complete' });
    return transcript;
  } catch (err) {
    report({
      status: 'error',
      message: err instanceof Error ? err.message : 'Transcription failed',
    });
    throw err;
  }
}
|
|
||||||
Loading…
Reference in New Issue