feat: add offline Whisper transcription fallback via Transformers.js

When both WebSocket streaming and the server batch API are unavailable, the voice recorder falls back to in-browser Whisper (Xenova/whisper-tiny, ~45MB, cached after the first download). A download progress bar and the transcription status are shown during processing.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 13:38:15 -07:00 · 2026-02-15 13:38:15 -07:00 · 441403fd14
parent e6fb53bf15
commit 441403fd14
5 changed files with 1046 additions and 8 deletions
--- a/next.config.mjs
+++ b/next.config.mjs
@ -1,6 +1,24 @@
 /** @type {import('next').NextConfig} */
 const nextConfig = {
  output: 'standalone',
+  webpack: (config, { isServer, webpack }) => {
+    // @xenova/transformers depends on onnxruntime-node (native .node binaries)
+    // which can't be bundled by webpack. We only use the web ONNX runtime, so the import is ignored in every build (this rule sits outside the isServer check).
+    config.plugins.push(
+      new webpack.IgnorePlugin({
+        resourceRegExp: /onnxruntime-node/,
+      })
+    );
+    if (!isServer) { // non-server build only: turn off resolution of the Node core modules listed below
+      config.resolve.fallback = {
+        ...config.resolve.fallback, // keep any fallbacks that were already configured
+        fs: false,
+        path: false,
+        os: false,
+      };
+    }
+    return config;
+  },
 };

 export default nextConfig;
--- a/package-lock.json
+++ b/package-lock.json
--- a/package.json
+++ b/package.json
@ -12,6 +12,7 @@
    "db:studio": "npx prisma studio"
  },
  "dependencies": {
+    "@encryptid/sdk": "file:../encryptid-sdk",
    "@prisma/client": "^6.19.2",
    "@tiptap/extension-code-block-lowlight": "^3.19.0",
    "@tiptap/extension-image": "^3.19.0",
@ -22,7 +23,7 @@
    "@tiptap/pm": "^3.19.0",
    "@tiptap/react": "^3.19.0",
    "@tiptap/starter-kit": "^3.19.0",
-    "@encryptid/sdk": "file:../encryptid-sdk",
+    "@xenova/transformers": "^2.17.2",
    "dompurify": "^3.2.0",
    "lowlight": "^3.3.0",
    "marked": "^15.0.0",
--- a/src/components/VoiceRecorder.tsx
+++ b/src/components/VoiceRecorder.tsx
@ -3,6 +3,13 @@
 import { useState, useRef, useCallback, useEffect } from 'react';
 import { authFetch } from '@/lib/authFetch';

+// Progress payload reported by the offline Whisper fallback. Re-using the
+// exported definition keeps this component and the library from drifting
+// apart. The import is type-only, so it is erased at compile time and the
+// module itself is still loaded lazily through the dynamic import() below.
+import type { WhisperProgress } from '@/lib/whisperOffline';
+
 interface Segment {
  id: number;
  text: string;
@ -36,6 +43,7 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
  const [error, setError] = useState<string | null>(null);
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
  const [streaming, setStreaming] = useState(false);
+  const [offlineProgress, setOfflineProgress] = useState<WhisperProgress | null>(null);

  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
  const audioContextRef = useRef<AudioContext | null>(null);
@ -308,7 +316,7 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
      }

      if (!transcript) {
-        // Fallback: batch transcription via API proxy
+        // Fallback 1: batch transcription via API proxy
        try {
          const transcribeForm = new FormData();
          transcribeForm.append('audio', blob, 'recording.webm');
@ -323,7 +331,20 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
            transcript = transcribeResult.text || '';
          }
        } catch {
-          console.warn('Batch transcription also failed');
+          console.warn('Batch transcription failed, trying offline...');
+        }
+      }
+
+      if (!transcript) {
+        // Fallback 2: offline Whisper via Transformers.js in browser
+        try {
+          setOfflineProgress({ status: 'loading', message: 'Loading offline model...' });
+          const { transcribeOffline } = await import('@/lib/whisperOffline');
+          transcript = await transcribeOffline(blob, (p) => setOfflineProgress(p));
+          setOfflineProgress(null);
+        } catch (offlineErr) {
+          console.warn('Offline transcription failed:', offlineErr);
+          setOfflineProgress(null);
        }
      }

@ -391,10 +412,14 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
                <span className="text-2xl font-mono text-white">
                  {formatTime(elapsed)}
                </span>
-                {streaming && (
+                {streaming ? (
                  <span className="text-xs text-green-400/70 font-medium tracking-wider">
                    LIVE
                  </span>
+                ) : (
+                  <span className="text-xs text-slate-500 font-medium tracking-wider">
+                    OFFLINE
+                  </span>
                )}
              </div>
              <button
@ -430,8 +455,17 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
                />
              </svg>
              <p className="text-sm text-slate-400">
-                Finalizing transcription...
+                {offlineProgress?.message || 'Finalizing transcription...'}
              </p>
+              {offlineProgress?.status === 'downloading' &&
+                offlineProgress.progress !== undefined && (
+                  <div className="w-48 h-1.5 bg-slate-700 rounded-full overflow-hidden">
+                    <div
+                      className="h-full bg-amber-400 rounded-full transition-all duration-300"
+                      style={{ width: `${offlineProgress.progress}%` }}
+                    />
+                  </div>
+                )}
            </div>
          )}

--- a/src/lib/whisperOffline.ts
+++ b/src/lib/whisperOffline.ts
@ -0,0 +1,120 @@
+/**
+ * Offline Whisper transcription using @xenova/transformers (Transformers.js v2).
+ * Dynamically imports the library to avoid SSR issues.
+ * Uses Xenova/whisper-tiny with quantized weights (~45MB download).
+ * Model is cached by the browser after first download.
+ */
+
+const MODEL_ID = 'Xenova/whisper-tiny'; // model identifier handed to pipeline()
+const CACHE_KEY = 'whisper-offline-cached'; // localStorage key for the "model was loaded once" flag
+
+export interface WhisperProgress {
+  status: 'checking' | 'downloading' | 'loading' | 'transcribing' | 'done' | 'error';
+  progress?: number; // 0-100 for download progress (rounded percentage)
+  file?: string; // file name forwarded from the library's download progress event
+  message?: string; // human-readable status text, or the error message when status is 'error'
+}
+
+type ProgressCallback = (progress: WhisperProgress) => void;
+
+// Keep a singleton pipeline so we don't reload on subsequent calls; loadingPromise is the in-flight load that concurrent callers share
+let cachedPipeline: any = null;
+let loadingPromise: Promise<any> | null = null;
+
+/**
+ * Report whether the Whisper model has been downloaded before.
+ *
+ * Best-effort only: this reads a localStorage flag that is written once the
+ * pipeline reports it is ready. The actual model cache is managed by
+ * Transformers.js via the Cache API, so the flag may not reflect the real
+ * cache state.
+ *
+ * @returns `true` only when the flag is present and readable; `false` when
+ *   there is no `window` (SSR) or when localStorage cannot be read.
+ */
+export function isModelCached(): boolean {
+  if (typeof window === 'undefined') return false;
+  try {
+    return localStorage.getItem(CACHE_KEY) === 'true';
+  } catch {
+    // Storage access can throw (for example when it is blocked by browser
+    // privacy settings). Treat that as "not cached" rather than crashing.
+    return false;
+  }
+}
+
+/**
+ * Get or create the Whisper pipeline singleton.
+ *
+ * A successfully loaded pipeline is memoised in `cachedPipeline`. While a load
+ * is in flight, concurrent callers share the same promise; only the caller
+ * that started the load receives progress events. A failed load is NOT
+ * memoised, so a later call starts a fresh attempt (e.g. once the network is
+ * back).
+ *
+ * @param onProgress Optional callback for load/download status updates.
+ * @returns The speech-recognition pipeline function.
+ * @throws Whatever the dynamic import or `pipeline()` rejects with.
+ */
+async function getPipeline(onProgress?: ProgressCallback): Promise<any> {
+  if (cachedPipeline) return cachedPipeline;
+
+  // Prevent multiple concurrent loads
+  if (loadingPromise) return loadingPromise;
+
+  const load: Promise<any> = (async () => {
+    onProgress?.({ status: 'loading', message: 'Loading Whisper model...' });
+
+    const { pipeline, env } = await import('@xenova/transformers');
+
+    // Disable local model checks — always use browser cache / HF Hub
+    env.allowLocalModels = false;
+
+    const pipe = await pipeline('automatic-speech-recognition', MODEL_ID, {
+      quantized: true,
+      progress_callback: (p: any) => {
+        if (p.status === 'progress' && p.progress !== undefined) {
+          const percent = Math.round(p.progress);
+          onProgress?.({
+            status: 'downloading',
+            progress: percent,
+            file: p.file,
+            message: `Downloading model... ${percent}%`,
+          });
+        } else if (p.status === 'ready') {
+          try {
+            localStorage.setItem(CACHE_KEY, 'true');
+          } catch {
+            // The flag is only a hint for isModelCached(); a storage failure
+            // must not abort an otherwise successful model load.
+          }
+          onProgress?.({ status: 'loading', message: 'Model loaded' });
+        }
+      },
+    });
+
+    cachedPipeline = pipe;
+    return pipe;
+  })();
+  loadingPromise = load;
+
+  try {
+    return await load;
+  } finally {
+    // Clear the in-flight marker on success AND on failure. Leaving a rejected
+    // promise here would make every future call fail without retrying.
+    if (loadingPromise === load) loadingPromise = null;
+  }
+}
+
+/**
+ * Decode an audio Blob to a mono Float32Array at 16kHz.
+ *
+ * The blob is decoded through an AudioContext created with a 16 kHz sample
+ * rate. Single-channel audio is returned as-is; multi-channel audio is mixed
+ * down by averaging all channels so no channel's content is discarded.
+ *
+ * @param blob Encoded audio (e.g. the recorder's output blob).
+ * @returns PCM samples for a single channel.
+ * @throws If the browser cannot decode the audio data.
+ */
+async function decodeAudioBlob(blob: Blob): Promise<Float32Array> {
+  const arrayBuffer = await blob.arrayBuffer();
+  const audioCtx = new AudioContext({ sampleRate: 16000 });
+  try {
+    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
+    const { numberOfChannels, length } = audioBuffer;
+
+    // Common case: already mono, no copy needed.
+    if (numberOfChannels === 1) return audioBuffer.getChannelData(0);
+
+    // Average the channels sample by sample into a fresh mono buffer.
+    const mono = new Float32Array(length);
+    const scale = 1 / numberOfChannels;
+    for (let channel = 0; channel < numberOfChannels; channel++) {
+      const samples = audioBuffer.getChannelData(channel);
+      for (let i = 0; i < length; i++) {
+        mono[i] += samples[i] * scale;
+      }
+    }
+    return mono;
+  } finally {
+    // Always release the context, even when decoding fails.
+    await audioCtx.close();
+  }
+}
+
+/**
+ * Transcribe an audio Blob offline using Whisper in the browser.
+ *
+ * First call will download the model (~45MB). Subsequent calls use the cached
+ * model. Recordings longer than Whisper's 30-second window are processed in
+ * overlapping chunks so the whole recording is transcribed, not just its
+ * beginning.
+ *
+ * @param audioBlob Encoded audio to transcribe.
+ * @param onProgress Optional callback receiving status updates, including a
+ *   final `done` or `error` event.
+ * @returns The trimmed transcript, or an empty string when no text is produced.
+ * @throws Re-throws any load, decode or inference error after reporting it
+ *   through `onProgress`.
+ */
+export async function transcribeOffline(
+  audioBlob: Blob,
+  onProgress?: ProgressCallback
+): Promise<string> {
+  try {
+    const pipe = await getPipeline(onProgress);
+
+    onProgress?.({ status: 'transcribing', message: 'Transcribing audio...' });
+
+    const audioData = await decodeAudioBlob(audioBlob);
+
+    const result = await pipe(audioData, {
+      language: 'en',
+      return_timestamps: false,
+      // Without chunking only the first 30 s window is transcribed. The
+      // stride overlaps neighbouring chunks so words at the seams survive.
+      chunk_length_s: 30,
+      stride_length_s: 5,
+    });
+
+    const text = (result as any).text?.trim() ?? '';
+    onProgress?.({ status: 'done', message: 'Transcription complete' });
+    return text;
+  } catch (err) {
+    const message = err instanceof Error ? err.message : 'Transcription failed';
+    onProgress?.({ status: 'error', message });
+    throw err;
+  }
+}