diff --git a/browser-extension/manifest.json b/browser-extension/manifest.json index 0f615d9..7317a84 100644 --- a/browser-extension/manifest.json +++ b/browser-extension/manifest.json @@ -35,6 +35,9 @@ "page": "options.html", "open_in_tab": false }, + "content_security_policy": { + "extension_pages": "script-src 'self' https://esm.sh; object-src 'self'" + }, "commands": { "open-voice-recorder": { "suggested_key": { diff --git a/browser-extension/parakeet-offline.js b/browser-extension/parakeet-offline.js new file mode 100644 index 0000000..2aa4443 --- /dev/null +++ b/browser-extension/parakeet-offline.js @@ -0,0 +1,147 @@ +/** + * Offline transcription using parakeet.js (NVIDIA Parakeet TDT 0.6B v2). + * Loaded at runtime from CDN. Model ~634 MB (int8) on first download, + * cached in IndexedDB after. Works fully offline after first download. + * + * Port of src/lib/parakeetOffline.ts for the browser extension. + */ + +const CACHE_KEY = 'parakeet-offline-cached'; + +// Singleton model — don't reload on subsequent calls +let cachedModel = null; +let loadingPromise = null; + +/** + * Check if the Parakeet model has been downloaded before. + */ +function isModelCached() { + try { + return localStorage.getItem(CACHE_KEY) === 'true'; + } catch { + return false; + } +} + +/** + * Detect WebGPU availability. + */ +async function detectWebGPU() { + if (!navigator.gpu) return false; + try { + const adapter = await navigator.gpu.requestAdapter(); + return !!adapter; + } catch { + return false; + } +} + +/** + * Get or create the Parakeet model singleton. + * @param {function} onProgress - callback({ status, progress, file, message }) + */ +async function getModel(onProgress) { + if (cachedModel) return cachedModel; + if (loadingPromise) return loadingPromise; + + loadingPromise = (async () => { + onProgress?.({ status: 'loading', message: 'Loading Parakeet model...' 
}); + + // Dynamic import from CDN at runtime + const { fromHub } = await import('https://esm.sh/parakeet.js@1.1.2'); + + const backend = (await detectWebGPU()) ? 'webgpu' : 'wasm'; + const fileProgress = {}; + + const model = await fromHub('parakeet-tdt-0.6b-v2', { + backend, + progress: ({ file, loaded, total }) => { + fileProgress[file] = { loaded, total }; + + let totalBytes = 0; + let loadedBytes = 0; + for (const fp of Object.values(fileProgress)) { + totalBytes += fp.total || 0; + loadedBytes += fp.loaded || 0; + } + + if (totalBytes > 0) { + const pct = Math.round((loadedBytes / totalBytes) * 100); + onProgress?.({ + status: 'downloading', + progress: pct, + file, + message: `Downloading model... ${pct}%`, + }); + } + }, + }); + + localStorage.setItem(CACHE_KEY, 'true'); + onProgress?.({ status: 'loading', message: 'Model loaded' }); + + cachedModel = model; + loadingPromise = null; + return model; + })(); + + return loadingPromise; +} + +/** + * Decode an audio Blob to Float32Array at 16 kHz mono. + */ +async function decodeAudioBlob(blob) { + const arrayBuffer = await blob.arrayBuffer(); + const audioCtx = new AudioContext({ sampleRate: 16000 }); + try { + const audioBuffer = await audioCtx.decodeAudioData(arrayBuffer); + + if (audioBuffer.sampleRate === 16000 && audioBuffer.numberOfChannels === 1) { + return audioBuffer.getChannelData(0); + } + + // Resample via OfflineAudioContext + const numSamples = Math.ceil(audioBuffer.duration * 16000); + const offlineCtx = new OfflineAudioContext(1, numSamples, 16000); + const source = offlineCtx.createBufferSource(); + source.buffer = audioBuffer; + source.connect(offlineCtx.destination); + source.start(); + const resampled = await offlineCtx.startRendering(); + return resampled.getChannelData(0); + } finally { + await audioCtx.close(); + } +} + +/** + * Transcribe an audio Blob offline using Parakeet in the browser. + * First call downloads the model (~634 MB). Subsequent calls use cached. 
+ * + * @param {Blob} audioBlob + * @param {function} onProgress - callback({ status, progress, file, message }) + * @returns {Promise} transcribed text + */ +async function transcribeOffline(audioBlob, onProgress) { + const model = await getModel(onProgress); + + onProgress?.({ status: 'transcribing', message: 'Transcribing audio...' }); + + const audioData = await decodeAudioBlob(audioBlob); + + const result = await model.transcribe(audioData, 16000, { + returnTimestamps: false, + enableProfiling: false, + }); + + const text = result.utterance_text?.trim() || ''; + onProgress?.({ status: 'done', message: 'Transcription complete' }); + return text; +} + +// Export for use in voice.js (loaded as ES module) +window.ParakeetOffline = { + isModelCached, + transcribeOffline, +}; diff --git a/browser-extension/voice.html b/browser-extension/voice.html index ecacc3f..0da0f25 100644 --- a/browser-extension/voice.html +++ b/browser-extension/voice.html @@ -175,6 +175,13 @@ color: #525252; font-style: italic; } + .transcript-text .final-text { + color: #d4d4d4; + } + .transcript-text .interim-text { + color: #737373; + font-style: italic; + } /* Controls row */ .controls { @@ -255,6 +262,61 @@ .status-bar.error { color: #fca5a5; background: #450a0a; border-top-color: #991b1b; } .status-bar.loading { color: #93c5fd; background: #172554; border-top-color: #1e40af; } + /* Live indicator */ + .live-indicator { + display: none; + align-items: center; + gap: 5px; + font-size: 10px; + font-weight: 700; + text-transform: uppercase; + letter-spacing: 1.5px; + color: #4ade80; + } + .live-indicator.visible { + display: flex; + } + .live-indicator .dot { + width: 6px; + height: 6px; + border-radius: 50%; + background: #4ade80; + animation: pulse-dot 1s infinite; + } + @keyframes pulse-dot { + 0%, 100% { opacity: 1; } + 50% { opacity: 0.3; } + } + + /* Progress bar (for model download) */ + .progress-area { + width: 100%; + padding: 0 14px 8px; + display: none; + } + 
.progress-area.visible { + display: block; + } + .progress-label { + font-size: 11px; + color: #a3a3a3; + margin-bottom: 4px; + } + .progress-bar { + width: 100%; + height: 6px; + background: #262626; + border-radius: 3px; + overflow: hidden; + } + .progress-bar .fill { + height: 100%; + background: #f59e0b; + border-radius: 3px; + transition: width 0.3s; + width: 0%; + } + /* Audio preview */ .audio-preview { width: 100%; @@ -305,6 +367,15 @@
       <div class="timer" id="timer">00:00</div>
+
+      <div class="live-indicator" id="liveIndicator">
+        <span class="dot"></span> Live transcribe
+      </div>
+
+    <div class="progress-area" id="progressArea">
+      <div class="progress-label" id="progressLabel">Loading model...</div>
+      <div class="progress-bar"><div class="fill" id="progressFill"></div></div>
+    </div>
@@ -334,9 +405,10 @@
-      Space to record · Esc to close
+      Space to record · Esc to close · Offline ready
+  <script type="module" src="parakeet-offline.js"></script>
diff --git a/browser-extension/voice.js b/browser-extension/voice.js index 8dbe6c5..9c94767 100644 --- a/browser-extension/voice.js +++ b/browser-extension/voice.js @@ -9,17 +9,23 @@ let startTime = 0; let audioBlob = null; let audioUrl = null; let transcript = ''; +let liveTranscript = ''; // accumulated from Web Speech API let uploadedFileUrl = ''; let uploadedMimeType = ''; let uploadedFileSize = 0; let duration = 0; +// Web Speech API +let recognition = null; +let speechSupported = !!(window.SpeechRecognition || window.webkitSpeechRecognition); + // --- DOM refs --- const recBtn = document.getElementById('recBtn'); const timerEl = document.getElementById('timer'); const statusLabel = document.getElementById('statusLabel'); const transcriptArea = document.getElementById('transcriptArea'); const transcriptText = document.getElementById('transcriptText'); +const liveIndicator = document.getElementById('liveIndicator'); const audioPreview = document.getElementById('audioPreview'); const audioPlayer = document.getElementById('audioPlayer'); const notebookSelect = document.getElementById('notebook'); @@ -70,6 +76,36 @@ function showStatusBar(message, type) { } } +// --- Parakeet progress UI --- + +const progressArea = document.getElementById('progressArea'); +const progressLabel = document.getElementById('progressLabel'); +const progressFill = document.getElementById('progressFill'); + +function showParakeetProgress(p) { + if (!progressArea) return; + progressArea.classList.add('visible'); + + if (p.message) { + progressLabel.textContent = p.message; + } + + if (p.status === 'downloading' && p.progress !== undefined) { + progressFill.style.width = `${p.progress}%`; + } else if (p.status === 'transcribing') { + progressFill.style.width = '100%'; + } else if (p.status === 'loading') { + progressFill.style.width = '0%'; + } +} + +function hideParakeetProgress() { + if (progressArea) { + progressArea.classList.remove('visible'); + progressFill.style.width = '0%'; + } 
+} + // --- Notebook loader --- async function loadNotebooks() { @@ -103,6 +139,97 @@ notebookSelect.addEventListener('change', (e) => { chrome.storage.local.set({ lastNotebookId: e.target.value }); }); +// --- Live transcription (Web Speech API) --- + +function startLiveTranscription() { + if (!speechSupported) return; + + const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition; + recognition = new SpeechRecognition(); + recognition.continuous = true; + recognition.interimResults = true; + recognition.lang = 'en-US'; + + let finalizedText = ''; + + recognition.onresult = (event) => { + let interimText = ''; + // Rebuild finalized text from all final results + finalizedText = ''; + for (let i = 0; i < event.results.length; i++) { + const result = event.results[i]; + if (result.isFinal) { + finalizedText += result[0].transcript.trim() + ' '; + } else { + interimText += result[0].transcript; + } + } + + liveTranscript = finalizedText.trim(); + + // Update the live transcript display + updateLiveDisplay(finalizedText.trim(), interimText.trim()); + }; + + recognition.onerror = (event) => { + if (event.error !== 'aborted' && event.error !== 'no-speech') { + console.warn('Speech recognition error:', event.error); + } + }; + + // Auto-restart on end (Chrome stops after ~60s of silence) + recognition.onend = () => { + if (state === 'recording' && recognition) { + try { recognition.start(); } catch {} + } + }; + + try { + recognition.start(); + if (liveIndicator) liveIndicator.classList.add('visible'); + } catch (err) { + console.warn('Could not start speech recognition:', err); + speechSupported = false; + } +} + +function stopLiveTranscription() { + if (recognition) { + const ref = recognition; + recognition = null; + try { ref.stop(); } catch {} + } + if (liveIndicator) liveIndicator.classList.remove('visible'); +} + +function updateLiveDisplay(finalText, interimText) { + if (state !== 'recording') return; + + // Show transcript area while 
recording + transcriptArea.classList.add('visible'); + + let html = ''; + if (finalText) { + html += `<span class="final-text">${escapeHtml(finalText)}</span>`; + } + if (interimText) { + html += `<span class="interim-text">${escapeHtml(interimText)}</span>`; + } + if (!finalText && !interimText) { + html = 'Listening...'; + } + transcriptText.innerHTML = html; + + // Auto-scroll + transcriptText.scrollTop = transcriptText.scrollHeight; +} + +function escapeHtml(text) { + const div = document.createElement('div'); + div.textContent = text; + return div.innerHTML; +} + // --- Recording --- async function startRecording() { @@ -115,6 +242,7 @@ async function startRecording() { mediaRecorder = new MediaRecorder(stream, { mimeType }); audioChunks = []; + liveTranscript = ''; mediaRecorder.ondataavailable = (e) => { if (e.data.size > 0) audioChunks.push(e.data); @@ -130,14 +258,24 @@ setStatusLabel('Recording', 'recording'); postActions.style.display = 'none'; audioPreview.classList.remove('visible'); - transcriptArea.classList.remove('visible'); statusBar.className = 'status-bar'; + // Show transcript area with listening placeholder + if (speechSupported) { + transcriptArea.classList.add('visible'); + transcriptText.innerHTML = 'Listening...'; + } else { + transcriptArea.classList.remove('visible'); + } + timerInterval = setInterval(() => { const elapsed = Math.floor((Date.now() - startTime) / 1000); timerEl.textContent = formatTime(elapsed); }, 1000); + // Start live transcription alongside recording + startLiveTranscription(); + } catch (err) { showStatusBar(err.message || 'Microphone access denied', 'error'); } @@ -150,6 +288,12 @@ async function stopRecording() { timerInterval = null; duration = Math.floor((Date.now() - startTime) / 1000); + // Capture live transcript before stopping recognition + const capturedLiveTranscript = liveTranscript; + + // Stop live transcription + stopLiveTranscription(); + state = 'processing'; recBtn.classList.remove('recording'); 
timerEl.classList.remove('recording'); @@ -170,17 +314,21 @@ async function stopRecording() { audioPlayer.src = audioUrl; audioPreview.classList.add('visible'); - // Show transcript area with placeholder + // Show live transcript while we process (if we have one) transcriptArea.classList.add('visible'); - transcriptText.innerHTML = 'Transcribing...'; + if (capturedLiveTranscript) { + transcriptText.textContent = capturedLiveTranscript; + showStatusBar('Improving transcript...', 'loading'); + } else { + transcriptText.innerHTML = 'Transcribing...'; + showStatusBar('Uploading & transcribing...', 'loading'); + } // Upload audio file const token = await getToken(); const settings = await getSettings(); try { - showStatusBar('Uploading recording...', 'loading'); - const uploadForm = new FormData(); uploadForm.append('file', audioBlob, 'voice-note.webm'); @@ -197,26 +345,50 @@ async function stopRecording() { uploadedMimeType = uploadResult.mimeType; uploadedFileSize = uploadResult.size; - // Transcribe via batch API - showStatusBar('Transcribing...', 'loading'); + // --- Three-tier transcription cascade --- - const transcribeForm = new FormData(); - transcribeForm.append('audio', audioBlob, 'voice-note.webm'); + // Tier 1: Batch API (Whisper on server — highest quality) + let bestTranscript = ''; + try { + showStatusBar('Transcribing via server...', 'loading'); + const transcribeForm = new FormData(); + transcribeForm.append('audio', audioBlob, 'voice-note.webm'); - const transcribeRes = await fetch(`${settings.host}/api/voice/transcribe`, { - method: 'POST', - headers: { 'Authorization': `Bearer ${token}` }, - body: transcribeForm, - }); + const transcribeRes = await fetch(`${settings.host}/api/voice/transcribe`, { + method: 'POST', + headers: { 'Authorization': `Bearer ${token}` }, + body: transcribeForm, + }); - if (transcribeRes.ok) { - const transcribeResult = await transcribeRes.json(); - transcript = transcribeResult.text || ''; - } else { - transcript = ''; - 
console.warn('Transcription failed, saving without transcript'); + if (transcribeRes.ok) { + const transcribeResult = await transcribeRes.json(); + bestTranscript = transcribeResult.text || ''; + } + } catch { + console.warn('Tier 1 (batch API) unavailable'); } + // Tier 2: Live transcript from Web Speech API (already captured) + if (!bestTranscript && capturedLiveTranscript) { + bestTranscript = capturedLiveTranscript; + } + + // Tier 3: Offline Parakeet.js (NVIDIA, runs in browser) + if (!bestTranscript && window.ParakeetOffline) { + try { + showStatusBar('Transcribing offline (Parakeet)...', 'loading'); + bestTranscript = await window.ParakeetOffline.transcribeOffline(audioBlob, (p) => { + showParakeetProgress(p); + }); + hideParakeetProgress(); + } catch (offlineErr) { + console.warn('Tier 3 (Parakeet offline) failed:', offlineErr); + hideParakeetProgress(); + } + } + + transcript = bestTranscript; + // Show transcript (editable) if (transcript) { transcriptText.textContent = transcript; @@ -230,6 +402,26 @@ async function stopRecording() { statusBar.className = 'status-bar'; } catch (err) { + // On upload error, try offline transcription directly + let fallbackTranscript = capturedLiveTranscript || ''; + + if (!fallbackTranscript && window.ParakeetOffline) { + try { + showStatusBar('Upload failed, transcribing offline...', 'loading'); + fallbackTranscript = await window.ParakeetOffline.transcribeOffline(audioBlob, (p) => { + showParakeetProgress(p); + }); + hideParakeetProgress(); + } catch { + hideParakeetProgress(); + } + } + + transcript = fallbackTranscript; + if (transcript) { + transcriptText.textContent = transcript; + } + showStatusBar(`Error: ${err.message}`, 'error'); state = 'done'; setStatusLabel('Error', 'idle'); @@ -341,11 +533,14 @@ function resetState() { audioChunks = []; audioBlob = null; transcript = ''; + liveTranscript = ''; uploadedFileUrl = ''; uploadedMimeType = ''; uploadedFileSize = 0; duration = 0; + stopLiveTranscription(); + if 
(audioUrl) { URL.revokeObjectURL(audioUrl); audioUrl = null; @@ -358,6 +553,7 @@ function resetState() { postActions.style.display = 'none'; audioPreview.classList.remove('visible'); transcriptArea.classList.remove('visible'); + hideParakeetProgress(); statusBar.className = 'status-bar'; }