feat: add rVoice popup recorder with 3-tier transcription to browser extension

Adds a standalone voice recording popup (voice.html) accessible via the extension popup button or the Ctrl+Shift+V hotkey. Records audio, uploads it to rNotes, and transcribes it with a 3-tier fallback cascade, tried in this order: server Whisper API, live Web Speech API (real-time text while recording), and offline Parakeet.js (NVIDIA 0.6B, ~634 MB, cached in IndexedDB). Saves recordings as AUDIO notes with an editable transcript and notebook selection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-24 16:43:57 -08:00 · 2026-02-24 16:43:57 -08:00 · 5ff6c9d832
parent 3d77eae16b
commit 5ff6c9d832
4 changed files with 439 additions and 21 deletions
--- a/browser-extension/manifest.json
+++ b/browser-extension/manifest.json
@ -35,6 +35,9 @@
    "page": "options.html",
    "open_in_tab": false
  },
+  "content_security_policy": {
+    "extension_pages": "script-src 'self' https://esm.sh; object-src 'self'"
+  },
  "commands": {
    "open-voice-recorder": {
      "suggested_key": {
--- a/browser-extension/parakeet-offline.js
+++ b/browser-extension/parakeet-offline.js
@ -0,0 +1,147 @@
+/**
+ * Offline transcription using parakeet.js (NVIDIA Parakeet TDT 0.6B v2).
+ * Loaded at runtime from CDN. Model ~634 MB (int8) on first download,
+ * cached in IndexedDB after. Works fully offline after first download.
+ *
+ * Port of src/lib/parakeetOffline.ts for the browser extension.
+ */
+
+const CACHE_KEY = 'parakeet-offline-cached';
+
+// Module-level singleton state: cachedModel holds the loaded model so later calls skip reloading; loadingPromise is the in-flight load that concurrent getModel() callers share.
+let cachedModel = null;
+let loadingPromise = null;
+
+/**
+ * Check if the Parakeet model has been downloaded before.
+ */
+function isModelCached() {
+  try {
+    return localStorage.getItem(CACHE_KEY) === 'true';
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Detect WebGPU availability.
+ */
+async function detectWebGPU() {
+  if (!navigator.gpu) return false;
+  try {
+    const adapter = await navigator.gpu.requestAdapter();
+    return !!adapter;
+  } catch {
+    return false;
+  }
+}
+
+/**
+ * Get or create the Parakeet model singleton.
+ * @param {function} onProgress - callback({ status, progress, file, message })
+ */
+async function getModel(onProgress) {
+  if (cachedModel) return cachedModel;
+  if (loadingPromise) return loadingPromise;
+
+  loadingPromise = (async () => {
+    onProgress?.({ status: 'loading', message: 'Loading Parakeet model...' });
+
+    // Dynamic import from CDN at runtime
+    const { fromHub } = await import('https://esm.sh/parakeet.js@1.1.2');
+
+    const backend = (await detectWebGPU()) ? 'webgpu' : 'wasm';
+    const fileProgress = {};
+
+    const model = await fromHub('parakeet-tdt-0.6b-v2', {
+      backend,
+      progress: ({ file, loaded, total }) => {
+        fileProgress[file] = { loaded, total };
+
+        let totalBytes = 0;
+        let loadedBytes = 0;
+        for (const fp of Object.values(fileProgress)) {
+          totalBytes += fp.total || 0;
+          loadedBytes += fp.loaded || 0;
+        }
+
+        if (totalBytes > 0) {
+          const pct = Math.round((loadedBytes / totalBytes) * 100);
+          onProgress?.({
+            status: 'downloading',
+            progress: pct,
+            file,
+            message: `Downloading model... ${pct}%`,
+          });
+        }
+      },
+    });
+
+    localStorage.setItem(CACHE_KEY, 'true');
+    onProgress?.({ status: 'loading', message: 'Model loaded' });
+
+    cachedModel = model;
+    loadingPromise = null;
+    return model;
+  })();
+
+  return loadingPromise;
+}
+
+/**
+ * Decode an audio Blob to Float32Array at 16 kHz mono.
+ */
+async function decodeAudioBlob(blob) {
+  const arrayBuffer = await blob.arrayBuffer();
+  const audioCtx = new AudioContext({ sampleRate: 16000 });
+  try {
+    const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer);
+
+    if (audioBuffer.sampleRate === 16000 && audioBuffer.numberOfChannels === 1) {
+      return audioBuffer.getChannelData(0);
+    }
+
+    // Resample via OfflineAudioContext
+    const numSamples = Math.ceil(audioBuffer.duration * 16000);
+    const offlineCtx = new OfflineAudioContext(1, numSamples, 16000);
+    const source = offlineCtx.createBufferSource();
+    source.buffer = audioBuffer;
+    source.connect(offlineCtx.destination);
+    source.start();
+    const resampled = await offlineCtx.startRendering();
+    return resampled.getChannelData(0);
+  } finally {
+    await audioCtx.close();
+  }
+}
+
+/**
+ * Transcribe an audio Blob offline using Parakeet in the browser.
+ * First call downloads the model (~634 MB). Subsequent calls use cached.
+ *
+ * @param {Blob} audioBlob
+ * @param {function} onProgress - callback({ status, progress, file, message })
+ * @returns {Promise<string>} transcribed text
+ */
+async function transcribeOffline(audioBlob, onProgress) {
+  const model = await getModel(onProgress);
+
+  onProgress?.({ status: 'transcribing', message: 'Transcribing audio...' });
+
+  const audioData = await decodeAudioBlob(audioBlob);
+
+  const result = await model.transcribe(audioData, 16000, {
+    returnTimestamps: false,
+    enableProfiling: false,
+  });
+
+  const text = result.utterance_text?.trim() || '';
+  onProgress?.({ status: 'done', message: 'Transcription complete' });
+  return text;
+}
+
+// Expose the public API on window: this file is loaded as an ES module, so voice.js (a classic script) reaches it through window.ParakeetOffline.
+window.ParakeetOffline = {
+  isModelCached,
+  transcribeOffline,
+};
--- a/browser-extension/voice.html
+++ b/browser-extension/voice.html
@ -175,6 +175,13 @@
      color: #525252;
      font-style: italic;
    }
+    .transcript-text .final-text {
+      color: #d4d4d4;
+    }
+    .transcript-text .interim-text {
+      color: #737373;
+      font-style: italic;
+    }

    /* Controls row */
    .controls {
@ -255,6 +262,61 @@
    .status-bar.error { color: #fca5a5; background: #450a0a; border-top-color: #991b1b; }
    .status-bar.loading { color: #93c5fd; background: #172554; border-top-color: #1e40af; }

+    /* Live indicator */
+    .live-indicator {
+      display: none;
+      align-items: center;
+      gap: 5px;
+      font-size: 10px;
+      font-weight: 700;
+      text-transform: uppercase;
+      letter-spacing: 1.5px;
+      color: #4ade80;
+    }
+    .live-indicator.visible {
+      display: flex;
+    }
+    .live-indicator .dot {
+      width: 6px;
+      height: 6px;
+      border-radius: 50%;
+      background: #4ade80;
+      animation: pulse-dot 1s infinite;
+    }
+    @keyframes pulse-dot {
+      0%, 100% { opacity: 1; }
+      50% { opacity: 0.3; }
+    }
+
+    /* Progress bar (for model download) */
+    .progress-area {
+      width: 100%;
+      padding: 0 14px 8px;
+      display: none;
+    }
+    .progress-area.visible {
+      display: block;
+    }
+    .progress-label {
+      font-size: 11px;
+      color: #a3a3a3;
+      margin-bottom: 4px;
+    }
+    .progress-bar {
+      width: 100%;
+      height: 6px;
+      background: #262626;
+      border-radius: 3px;
+      overflow: hidden;
+    }
+    .progress-bar .fill {
+      height: 100%;
+      background: #f59e0b;
+      border-radius: 3px;
+      transition: width 0.3s;
+      width: 0%;
+    }
+
    /* Audio preview */
    .audio-preview {
      width: 100%;
@ -305,6 +367,15 @@
      <div class="inner"></div>
    </button>
    <div class="timer" id="timer">00:00</div>
+    <div class="live-indicator" id="liveIndicator">
+      <span class="dot"></span>
+      Live transcribe
+    </div>
+  </div>
+
+  <div class="progress-area" id="progressArea">
+    <div class="progress-label" id="progressLabel">Loading model...</div>
+    <div class="progress-bar"><div class="fill" id="progressFill"></div></div>
  </div>

  <div class="audio-preview" id="audioPreview">
@ -334,9 +405,10 @@
  <div class="status-bar" id="statusBar"></div>

  <div class="kbd-hint">
-    <kbd>Space</kbd> to record &middot; <kbd>Esc</kbd> to close
+    <kbd>Space</kbd> to record &middot; <kbd>Esc</kbd> to close &middot; Offline ready
  </div>

+  <script src="parakeet-offline.js" type="module"></script>
  <script src="voice.js"></script>
 </body>
 </html>
--- a/browser-extension/voice.js
+++ b/browser-extension/voice.js
@ -9,17 +9,23 @@ let startTime = 0;
 let audioBlob = null;
 let audioUrl = null;
 let transcript = '';
+let liveTranscript = ''; // accumulated from Web Speech API
 let uploadedFileUrl = '';
 let uploadedMimeType = '';
 let uploadedFileSize = 0;
 let duration = 0;

+// Web Speech API
+let recognition = null;
+let speechSupported = !!(window.SpeechRecognition || window.webkitSpeechRecognition);
+
 // --- DOM refs ---
 const recBtn = document.getElementById('recBtn');
 const timerEl = document.getElementById('timer');
 const statusLabel = document.getElementById('statusLabel');
 const transcriptArea = document.getElementById('transcriptArea');
 const transcriptText = document.getElementById('transcriptText');
+const liveIndicator = document.getElementById('liveIndicator');
 const audioPreview = document.getElementById('audioPreview');
 const audioPlayer = document.getElementById('audioPlayer');
 const notebookSelect = document.getElementById('notebook');
@ -70,6 +76,36 @@ function showStatusBar(message, type) {
  }
 }

+// --- Parakeet progress UI: DOM refs for the progress area, its label, and the bar fill ---
+
+const progressArea = document.getElementById('progressArea');
+const progressLabel = document.getElementById('progressLabel');
+const progressFill = document.getElementById('progressFill');
+
+function showParakeetProgress(p) {
+  if (!progressArea) return;
+  progressArea.classList.add('visible');
+
+  if (p.message) {
+    progressLabel.textContent = p.message;
+  }
+
+  if (p.status === 'downloading' && p.progress !== undefined) {
+    progressFill.style.width = `${p.progress}%`;
+  } else if (p.status === 'transcribing') {
+    progressFill.style.width = '100%';
+  } else if (p.status === 'loading') {
+    progressFill.style.width = '0%';
+  }
+}
+
+function hideParakeetProgress() {
+  if (progressArea) {
+    progressArea.classList.remove('visible');
+    progressFill.style.width = '0%';
+  }
+}
+
 // --- Notebook loader ---

 async function loadNotebooks() {
@ -103,6 +139,97 @@ notebookSelect.addEventListener('change', (e) => {
  chrome.storage.local.set({ lastNotebookId: e.target.value });
 });

+// --- Live transcription (Web Speech API) ---
+
+function startLiveTranscription() {
+  if (!speechSupported) return;
+
+  const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
+  recognition = new SpeechRecognition();
+  recognition.continuous = true;
+  recognition.interimResults = true;
+  recognition.lang = 'en-US';
+
+  let finalizedText = '';
+
+  recognition.onresult = (event) => {
+    let interimText = '';
+    // Rebuild finalized text from all final results
+    finalizedText = '';
+    for (let i = 0; i < event.results.length; i++) {
+      const result = event.results[i];
+      if (result.isFinal) {
+        finalizedText += result[0].transcript.trim() + ' ';
+      } else {
+        interimText += result[0].transcript;
+      }
+    }
+
+    liveTranscript = finalizedText.trim();
+
+    // Update the live transcript display
+    updateLiveDisplay(finalizedText.trim(), interimText.trim());
+  };
+
+  recognition.onerror = (event) => {
+    if (event.error !== 'aborted' && event.error !== 'no-speech') {
+      console.warn('Speech recognition error:', event.error);
+    }
+  };
+
+  // Auto-restart on end (Chrome stops after ~60s of silence)
+  recognition.onend = () => {
+    if (state === 'recording' && recognition) {
+      try { recognition.start(); } catch {}
+    }
+  };
+
+  try {
+    recognition.start();
+    if (liveIndicator) liveIndicator.classList.add('visible');
+  } catch (err) {
+    console.warn('Could not start speech recognition:', err);
+    speechSupported = false;
+  }
+}
+
+function stopLiveTranscription() {
+  if (recognition) {
+    const ref = recognition;
+    recognition = null;
+    try { ref.stop(); } catch {}
+  }
+  if (liveIndicator) liveIndicator.classList.remove('visible');
+}
+
+function updateLiveDisplay(finalText, interimText) {
+  if (state !== 'recording') return;
+
+  // Show transcript area while recording
+  transcriptArea.classList.add('visible');
+
+  let html = '';
+  if (finalText) {
+    html += `<span class="final-text">${escapeHtml(finalText)}</span>`;
+  }
+  if (interimText) {
+    html += `<span class="interim-text">${escapeHtml(interimText)}</span>`;
+  }
+  if (!finalText && !interimText) {
+    html = '<span class="placeholder">Listening...</span>';
+  }
+  transcriptText.innerHTML = html;
+
+  // Auto-scroll
+  transcriptText.scrollTop = transcriptText.scrollHeight;
+}
+
+function escapeHtml(text) {
+  const div = document.createElement('div');
+  div.textContent = text;
+  return div.innerHTML;
+}
+
 // --- Recording ---

 async function startRecording() {
@ -115,6 +242,7 @@ async function startRecording() {

    mediaRecorder = new MediaRecorder(stream, { mimeType });
    audioChunks = [];
+    liveTranscript = '';

    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size > 0) audioChunks.push(e.data);
@ -130,14 +258,24 @@ async function startRecording() {
    setStatusLabel('Recording', 'recording');
    postActions.style.display = 'none';
    audioPreview.classList.remove('visible');
-    transcriptArea.classList.remove('visible');
    statusBar.className = 'status-bar';

+    // Show transcript area with listening placeholder
+    if (speechSupported) {
+      transcriptArea.classList.add('visible');
+      transcriptText.innerHTML = '<span class="placeholder">Listening...</span>';
+    } else {
+      transcriptArea.classList.remove('visible');
+    }
+
    timerInterval = setInterval(() => {
      const elapsed = Math.floor((Date.now() - startTime) / 1000);
      timerEl.textContent = formatTime(elapsed);
    }, 1000);

+    // Start live transcription alongside recording
+    startLiveTranscription();
+
  } catch (err) {
    showStatusBar(err.message || 'Microphone access denied', 'error');
  }
@ -150,6 +288,12 @@ async function stopRecording() {
  timerInterval = null;
  duration = Math.floor((Date.now() - startTime) / 1000);

+  // Capture live transcript before stopping recognition
+  const capturedLiveTranscript = liveTranscript;
+
+  // Stop live transcription
+  stopLiveTranscription();
+
  state = 'processing';
  recBtn.classList.remove('recording');
  timerEl.classList.remove('recording');
@ -170,17 +314,21 @@ async function stopRecording() {
  audioPlayer.src = audioUrl;
  audioPreview.classList.add('visible');

-  // Show transcript area with placeholder
+  // Show live transcript while we process (if we have one)
  transcriptArea.classList.add('visible');
-  transcriptText.innerHTML = '<span class="placeholder">Transcribing...</span>';
+  if (capturedLiveTranscript) {
+    transcriptText.textContent = capturedLiveTranscript;
+    showStatusBar('Improving transcript...', 'loading');
+  } else {
+    transcriptText.innerHTML = '<span class="placeholder">Transcribing...</span>';
+    showStatusBar('Uploading & transcribing...', 'loading');
+  }

  // Upload audio file
  const token = await getToken();
  const settings = await getSettings();

  try {
-    showStatusBar('Uploading recording...', 'loading');
-
    const uploadForm = new FormData();
    uploadForm.append('file', audioBlob, 'voice-note.webm');

@ -197,26 +345,50 @@ async function stopRecording() {
    uploadedMimeType = uploadResult.mimeType;
    uploadedFileSize = uploadResult.size;

-    // Transcribe via batch API
-    showStatusBar('Transcribing...', 'loading');
+    // --- Three-tier transcription cascade ---

-    const transcribeForm = new FormData();
-    transcribeForm.append('audio', audioBlob, 'voice-note.webm');
+    // Tier 1: Batch API (Whisper on server — highest quality)
+    let bestTranscript = '';
+    try {
+      showStatusBar('Transcribing via server...', 'loading');
+      const transcribeForm = new FormData();
+      transcribeForm.append('audio', audioBlob, 'voice-note.webm');

-    const transcribeRes = await fetch(`${settings.host}/api/voice/transcribe`, {
-      method: 'POST',
-      headers: { 'Authorization': `Bearer ${token}` },
-      body: transcribeForm,
-    });
+      const transcribeRes = await fetch(`${settings.host}/api/voice/transcribe`, {
+        method: 'POST',
+        headers: { 'Authorization': `Bearer ${token}` },
+        body: transcribeForm,
+      });

-    if (transcribeRes.ok) {
-      const transcribeResult = await transcribeRes.json();
-      transcript = transcribeResult.text || '';
-    } else {
-      transcript = '';
-      console.warn('Transcription failed, saving without transcript');
+      if (transcribeRes.ok) {
+        const transcribeResult = await transcribeRes.json();
+        bestTranscript = transcribeResult.text || '';
+      }
+    } catch {
+      console.warn('Tier 1 (batch API) unavailable');
    }

+    // Tier 2: Live transcript from Web Speech API (already captured)
+    if (!bestTranscript && capturedLiveTranscript) {
+      bestTranscript = capturedLiveTranscript;
+    }
+
+    // Tier 3: Offline Parakeet.js (NVIDIA, runs in browser)
+    if (!bestTranscript && window.ParakeetOffline) {
+      try {
+        showStatusBar('Transcribing offline (Parakeet)...', 'loading');
+        bestTranscript = await window.ParakeetOffline.transcribeOffline(audioBlob, (p) => {
+          showParakeetProgress(p);
+        });
+        hideParakeetProgress();
+      } catch (offlineErr) {
+        console.warn('Tier 3 (Parakeet offline) failed:', offlineErr);
+        hideParakeetProgress();
+      }
+    }
+
+    transcript = bestTranscript;
+
    // Show transcript (editable)
    if (transcript) {
      transcriptText.textContent = transcript;
@ -230,6 +402,26 @@ async function stopRecording() {
    statusBar.className = 'status-bar';

  } catch (err) {
+    // On upload error, try offline transcription directly
+    let fallbackTranscript = capturedLiveTranscript || '';
+
+    if (!fallbackTranscript && window.ParakeetOffline) {
+      try {
+        showStatusBar('Upload failed, transcribing offline...', 'loading');
+        fallbackTranscript = await window.ParakeetOffline.transcribeOffline(audioBlob, (p) => {
+          showParakeetProgress(p);
+        });
+        hideParakeetProgress();
+      } catch {
+        hideParakeetProgress();
+      }
+    }
+
+    transcript = fallbackTranscript;
+    if (transcript) {
+      transcriptText.textContent = transcript;
+    }
+
    showStatusBar(`Error: ${err.message}`, 'error');
    state = 'done';
    setStatusLabel('Error', 'idle');
@ -341,11 +533,14 @@ function resetState() {
  audioChunks = [];
  audioBlob = null;
  transcript = '';
+  liveTranscript = '';
  uploadedFileUrl = '';
  uploadedMimeType = '';
  uploadedFileSize = 0;
  duration = 0;

+  stopLiveTranscription();
+
  if (audioUrl) {
    URL.revokeObjectURL(audioUrl);
    audioUrl = null;
@ -358,6 +553,7 @@ function resetState() {
  postActions.style.display = 'none';
  audioPreview.classList.remove('visible');
  transcriptArea.classList.remove('visible');
+  hideParakeetProgress();
  statusBar.className = 'status-bar';
 }