feat: add offline Whisper transcription fallback via Transformers.js
When both WebSocket streaming and the server batch API are unavailable, transcription falls back to in-browser Whisper (Xenova/whisper-tiny, ~45MB, cached after the first download). Shows a download progress bar and transcription status during processing. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e6fb53bf15
commit
441403fd14
|
|
@ -1,6 +1,24 @@
|
||||||
/** @type {import('next').NextConfig} */
|
/** @type {import('next').NextConfig} */
|
||||||
const nextConfig = {
|
const nextConfig = {
|
||||||
output: 'standalone',
|
output: 'standalone',
|
||||||
|
webpack: (config, { isServer, webpack }) => {
|
||||||
|
// @xenova/transformers depends on onnxruntime-node (native .node binaries)
|
||||||
|
// which can't be bundled by webpack. We only use the web ONNX runtime.
|
||||||
|
config.plugins.push(
|
||||||
|
new webpack.IgnorePlugin({
|
||||||
|
resourceRegExp: /onnxruntime-node/,
|
||||||
|
})
|
||||||
|
);
|
||||||
|
if (!isServer) {
|
||||||
|
config.resolve.fallback = {
|
||||||
|
...config.resolve.fallback,
|
||||||
|
fs: false,
|
||||||
|
path: false,
|
||||||
|
os: false,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
return config;
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
export default nextConfig;
|
export default nextConfig;
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -12,6 +12,7 @@
|
||||||
"db:studio": "npx prisma studio"
|
"db:studio": "npx prisma studio"
|
||||||
},
|
},
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
|
"@encryptid/sdk": "file:../encryptid-sdk",
|
||||||
"@prisma/client": "^6.19.2",
|
"@prisma/client": "^6.19.2",
|
||||||
"@tiptap/extension-code-block-lowlight": "^3.19.0",
|
"@tiptap/extension-code-block-lowlight": "^3.19.0",
|
||||||
"@tiptap/extension-image": "^3.19.0",
|
"@tiptap/extension-image": "^3.19.0",
|
||||||
|
|
@ -22,7 +23,7 @@
|
||||||
"@tiptap/pm": "^3.19.0",
|
"@tiptap/pm": "^3.19.0",
|
||||||
"@tiptap/react": "^3.19.0",
|
"@tiptap/react": "^3.19.0",
|
||||||
"@tiptap/starter-kit": "^3.19.0",
|
"@tiptap/starter-kit": "^3.19.0",
|
||||||
"@encryptid/sdk": "file:../encryptid-sdk",
|
"@xenova/transformers": "^2.17.2",
|
||||||
"dompurify": "^3.2.0",
|
"dompurify": "^3.2.0",
|
||||||
"lowlight": "^3.3.0",
|
"lowlight": "^3.3.0",
|
||||||
"marked": "^15.0.0",
|
"marked": "^15.0.0",
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,13 @@
|
||||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||||
import { authFetch } from '@/lib/authFetch';
|
import { authFetch } from '@/lib/authFetch';
|
||||||
|
|
||||||
|
interface WhisperProgress {
|
||||||
|
status: 'checking' | 'downloading' | 'loading' | 'transcribing' | 'done' | 'error';
|
||||||
|
progress?: number;
|
||||||
|
file?: string;
|
||||||
|
message?: string;
|
||||||
|
}
|
||||||
|
|
||||||
interface Segment {
|
interface Segment {
|
||||||
id: number;
|
id: number;
|
||||||
text: string;
|
text: string;
|
||||||
|
|
@ -36,6 +43,7 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
const [error, setError] = useState<string | null>(null);
|
const [error, setError] = useState<string | null>(null);
|
||||||
const [audioUrl, setAudioUrl] = useState<string | null>(null);
|
const [audioUrl, setAudioUrl] = useState<string | null>(null);
|
||||||
const [streaming, setStreaming] = useState(false);
|
const [streaming, setStreaming] = useState(false);
|
||||||
|
const [offlineProgress, setOfflineProgress] = useState<WhisperProgress | null>(null);
|
||||||
|
|
||||||
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
||||||
const audioContextRef = useRef<AudioContext | null>(null);
|
const audioContextRef = useRef<AudioContext | null>(null);
|
||||||
|
|
@ -308,7 +316,7 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!transcript) {
|
if (!transcript) {
|
||||||
// Fallback: batch transcription via API proxy
|
// Fallback 1: batch transcription via API proxy
|
||||||
try {
|
try {
|
||||||
const transcribeForm = new FormData();
|
const transcribeForm = new FormData();
|
||||||
transcribeForm.append('audio', blob, 'recording.webm');
|
transcribeForm.append('audio', blob, 'recording.webm');
|
||||||
|
|
@ -323,7 +331,20 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
transcript = transcribeResult.text || '';
|
transcript = transcribeResult.text || '';
|
||||||
}
|
}
|
||||||
} catch {
|
} catch {
|
||||||
console.warn('Batch transcription also failed');
|
console.warn('Batch transcription failed, trying offline...');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!transcript) {
|
||||||
|
// Fallback 2: offline Whisper via Transformers.js in browser
|
||||||
|
try {
|
||||||
|
setOfflineProgress({ status: 'loading', message: 'Loading offline model...' });
|
||||||
|
const { transcribeOffline } = await import('@/lib/whisperOffline');
|
||||||
|
transcript = await transcribeOffline(blob, (p) => setOfflineProgress(p));
|
||||||
|
setOfflineProgress(null);
|
||||||
|
} catch (offlineErr) {
|
||||||
|
console.warn('Offline transcription failed:', offlineErr);
|
||||||
|
setOfflineProgress(null);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -391,10 +412,14 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
<span className="text-2xl font-mono text-white">
|
<span className="text-2xl font-mono text-white">
|
||||||
{formatTime(elapsed)}
|
{formatTime(elapsed)}
|
||||||
</span>
|
</span>
|
||||||
{streaming && (
|
{streaming ? (
|
||||||
<span className="text-xs text-green-400/70 font-medium tracking-wider">
|
<span className="text-xs text-green-400/70 font-medium tracking-wider">
|
||||||
LIVE
|
LIVE
|
||||||
</span>
|
</span>
|
||||||
|
) : (
|
||||||
|
<span className="text-xs text-slate-500 font-medium tracking-wider">
|
||||||
|
OFFLINE
|
||||||
|
</span>
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
<button
|
<button
|
||||||
|
|
@ -430,8 +455,17 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
/>
|
/>
|
||||||
</svg>
|
</svg>
|
||||||
<p className="text-sm text-slate-400">
|
<p className="text-sm text-slate-400">
|
||||||
Finalizing transcription...
|
{offlineProgress?.message || 'Finalizing transcription...'}
|
||||||
</p>
|
</p>
|
||||||
|
{offlineProgress?.status === 'downloading' &&
|
||||||
|
offlineProgress.progress !== undefined && (
|
||||||
|
<div className="w-48 h-1.5 bg-slate-700 rounded-full overflow-hidden">
|
||||||
|
<div
|
||||||
|
className="h-full bg-amber-400 rounded-full transition-all duration-300"
|
||||||
|
style={{ width: `${offlineProgress.progress}%` }}
|
||||||
|
/>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,120 @@
|
||||||
|
/**
|
||||||
|
* Offline Whisper transcription using @xenova/transformers (Transformers.js v2).
|
||||||
|
* Dynamically imports the library to avoid SSR issues.
|
||||||
|
* Uses Xenova/whisper-tiny with quantized weights (~45MB download).
|
||||||
|
* Model is cached by the browser after first download.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const MODEL_ID = 'Xenova/whisper-tiny';
|
||||||
|
const CACHE_KEY = 'whisper-offline-cached';
|
||||||
|
|
||||||
|
export interface WhisperProgress {
|
||||||
|
status: 'checking' | 'downloading' | 'loading' | 'transcribing' | 'done' | 'error';
|
||||||
|
progress?: number; // 0-100 for download progress
|
||||||
|
file?: string;
|
||||||
|
message?: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
type ProgressCallback = (progress: WhisperProgress) => void;
|
||||||
|
|
||||||
|
// Keep a singleton pipeline so we don't reload on subsequent calls
|
||||||
|
let cachedPipeline: any = null;
|
||||||
|
let loadingPromise: Promise<any> | null = null;
|
||||||
|
|
||||||
|
/**
 * Report whether the Whisper model has been downloaded before.
 *
 * This is a best-effort check: it only reads the `CACHE_KEY` flag from
 * localStorage (set when the pipeline reports `ready`). The actual model
 * cache is managed by Transformers.js via the Cache API, so the flag can be
 * stale in either direction.
 *
 * @returns `true` only when the flag is present and equals `'true'`;
 *   `false` during SSR or when storage cannot be read.
 */
export function isModelCached(): boolean {
  if (typeof window === 'undefined') return false;
  try {
    return localStorage.getItem(CACHE_KEY) === 'true';
  } catch {
    // Accessing localStorage can throw (e.g. storage blocked by browser
    // privacy settings). A best-effort check must not crash the caller.
    return false;
  }
}
|
||||||
|
|
||||||
|
/**
 * Get or create the Whisper pipeline singleton.
 *
 * - Returns the already-built pipeline when one exists.
 * - While a load is in flight, every caller shares the same promise; only the
 *   caller that started the load receives `onProgress` events.
 * - If the load fails, the in-flight promise is cleared so that a later call
 *   can retry instead of receiving the same cached rejection forever.
 *
 * @param onProgress - Optional callback for loading/download progress.
 * @returns The automatic-speech-recognition pipeline.
 * @throws Whatever the dynamic import or pipeline construction rejects with.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any -- pipeline type comes from the lazily imported library
async function getPipeline(onProgress?: ProgressCallback): Promise<any> {
  if (cachedPipeline) return cachedPipeline;

  // Prevent multiple concurrent loads
  if (loadingPromise) return loadingPromise;

  const load = (async () => {
    onProgress?.({ status: 'loading', message: 'Loading Whisper model...' });

    const { pipeline, env } = await import('@xenova/transformers');

    // Disable local model checks — always use browser cache / HF Hub
    env.allowLocalModels = false;

    const pipe = await pipeline('automatic-speech-recognition', MODEL_ID, {
      quantized: true,
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- untyped library event
      progress_callback: (p: any) => {
        if (p.status === 'progress' && p.progress !== undefined) {
          const percent = Math.round(p.progress);
          onProgress?.({
            status: 'downloading',
            progress: percent,
            file: p.file,
            message: `Downloading model... ${percent}%`,
          });
        } else if (p.status === 'ready') {
          try {
            localStorage.setItem(CACHE_KEY, 'true');
          } catch {
            // The flag is only a hint; a blocked or full localStorage must
            // not abort an otherwise successful model load.
          }
          onProgress?.({ status: 'loading', message: 'Model loaded' });
        }
      },
    });

    cachedPipeline = pipe;
    return pipe;
  })();

  loadingPromise = load;
  try {
    return await load;
  } finally {
    // Clear on success AND failure: a rejected promise left in place would
    // make every subsequent call fail without ever retrying the load.
    if (loadingPromise === load) loadingPromise = null;
  }
}
|
||||||
|
|
||||||
|
/**
 * Decode an audio Blob to a mono Float32Array at 16kHz.
 *
 * The blob is decoded through a temporary AudioContext created with
 * `sampleRate: 16000`, which is always closed afterwards. Single-channel
 * audio is returned as decoded; multi-channel audio is averaged across all
 * channels so that no channel is silently dropped.
 *
 * @param blob - Encoded audio in any format `decodeAudioData` accepts.
 * @returns Mono PCM samples.
 * @throws Whatever AudioContext construction or `decodeAudioData` rejects
 *   with (e.g. undecodable or empty audio).
 */
async function decodeAudioBlob(blob: Blob): Promise<Float32Array> {
  const arrayBuffer = await blob.arrayBuffer();
  const audioCtx = new AudioContext({ sampleRate: 16000 });
  try {
    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
    const channelCount = audioBuffer.numberOfChannels;

    // Already mono: hand back the decoded samples without copying.
    if (channelCount <= 1) return audioBuffer.getChannelData(0);

    // Downmix by averaging every channel into a fresh buffer.
    const mono = new Float32Array(audioBuffer.length);
    for (let c = 0; c < channelCount; c++) {
      const channel = audioBuffer.getChannelData(c);
      for (let i = 0; i < mono.length; i++) {
        mono[i] = (mono[i] ?? 0) + (channel[i] ?? 0) / channelCount;
      }
    }
    return mono;
  } finally {
    await audioCtx.close();
  }
}
|
||||||
|
|
||||||
|
/**
 * Transcribe an audio Blob offline using Whisper in the browser.
 *
 * The first call downloads the model (~45MB); subsequent calls reuse the
 * cached pipeline. Recordings longer than Whisper's 30-second window are
 * processed in overlapping chunks so that the whole recording is
 * transcribed rather than only its beginning.
 *
 * Progress is reported through `onProgress`: model loading/download events,
 * then `transcribing`, then `done`. On failure an `error` event carrying the
 * error message is emitted before the error is rethrown.
 *
 * @param audioBlob - Recorded audio to transcribe.
 * @param onProgress - Optional progress callback.
 * @returns The trimmed transcript, or `''` when the model returns no text.
 * @throws Rethrows any error from model loading, audio decoding or inference.
 */
export async function transcribeOffline(
  audioBlob: Blob,
  onProgress?: ProgressCallback
): Promise<string> {
  try {
    const pipe = await getPipeline(onProgress);

    onProgress?.({ status: 'transcribing', message: 'Transcribing audio...' });

    const audioData = await decodeAudioBlob(audioBlob);

    const result = await pipe(audioData, {
      language: 'en',
      return_timestamps: false,
      // Without chunking only the first 30s window is transcribed; overlap
      // the chunks so words on a chunk boundary are not cut.
      chunk_length_s: 30,
      stride_length_s: 5,
    });

    const text: string = result?.text?.trim() || '';
    onProgress?.({ status: 'done', message: 'Transcription complete' });
    return text;
  } catch (err: unknown) {
    const message = err instanceof Error ? err.message : 'Transcription failed';
    onProgress?.({ status: 'error', message });
    throw err;
  }
}
|
||||||
Loading…
Reference in New Issue