feat: add rVoice popup recorder with 3-tier transcription to browser extension

Adds a standalone voice recording popup (voice.html) accessible via the
extension popup button or Ctrl+Shift+V hotkey. Records audio, uploads to
rNotes, and transcribes with a 3-tier cascade: server Whisper API, live
Web Speech API (real-time text while recording), and offline Parakeet.js
(NVIDIA 0.6B, ~634MB cached in IndexedDB). Saves as AUDIO notes with
editable transcript and notebook selection.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-02-24 16:43:57 -08:00
parent 3d77eae16b
commit 5ff6c9d832
4 changed files with 439 additions and 21 deletions

View File

@ -35,6 +35,9 @@
"page": "options.html",
"open_in_tab": false
},
"content_security_policy": {
"extension_pages": "script-src 'self' https://esm.sh; object-src 'self'"
},
"commands": {
"open-voice-recorder": {
"suggested_key": {

View File

@ -0,0 +1,147 @@
/**
* Offline transcription using parakeet.js (NVIDIA Parakeet TDT 0.6B v2).
* Loaded at runtime from CDN. Model ~634 MB (int8) on first download,
* cached in IndexedDB after. Works fully offline after first download.
*
* Port of src/lib/parakeetOffline.ts for the browser extension.
*/
// localStorage key for the flag recording that the model has loaded successfully before.
const CACHE_KEY = 'parakeet-offline-cached';
// Singleton model — don't reload on subsequent calls
let cachedModel = null;
// Promise for the load started by getModel(); callers that arrive while it is
// set reuse it instead of starting another load.
let loadingPromise = null;
/**
 * Report whether a previous session finished loading the Parakeet model.
 *
 * Only reads the marker flag stored under CACHE_KEY in localStorage; it does
 * not inspect the model files themselves. Any storage failure counts as
 * "not cached".
 *
 * @returns {boolean} true only when the stored flag is exactly 'true'
 */
function isModelCached() {
  let storedFlag;
  try {
    storedFlag = localStorage.getItem(CACHE_KEY);
  } catch {
    // Storage may be unavailable or blocked; treat that as "never downloaded".
    return false;
  }
  return storedFlag === 'true';
}
/**
 * Probe for a usable WebGPU adapter.
 *
 * @returns {Promise<boolean>} true when navigator.gpu exists and
 *   requestAdapter() yields an adapter; false when the API is missing, no
 *   adapter is returned, or the request throws.
 */
async function detectWebGPU() {
  const gpu = navigator.gpu;
  if (!gpu) return false;

  try {
    const adapter = await gpu.requestAdapter();
    return Boolean(adapter);
  } catch {
    // A failing adapter request just means we fall back to another backend.
    return false;
  }
}
/**
 * Get or create the Parakeet model singleton.
 *
 * The first successful call imports parakeet.js from the CDN, loads the model
 * (WebGPU backend when an adapter is available, otherwise WASM) and caches it
 * in `cachedModel`. Calls made while a load is in flight share the same
 * promise. If the load fails, the in-flight promise is cleared so that a later
 * call can retry instead of receiving the stale rejection forever.
 *
 * NOTE(review): Manifest V3 generally rejects remote origins in the
 * `extension_pages` CSP `script-src` — confirm this CDN import is actually
 * permitted at runtime; bundling parakeet.js with the extension may be needed.
 *
 * @param {function} [onProgress] - callback({ status, progress, file, message })
 * @returns {Promise<object>} the loaded model
 * @throws whatever the CDN import or model load rejects with
 */
async function getModel(onProgress) {
  if (cachedModel) return cachedModel;
  if (loadingPromise) return loadingPromise;

  const pending = (async () => {
    onProgress?.({ status: 'loading', message: 'Loading Parakeet model...' });

    // Dynamic import from CDN at runtime
    const { fromHub } = await import('https://esm.sh/parakeet.js@1.1.2');
    const backend = (await detectWebGPU()) ? 'webgpu' : 'wasm';

    // Per-file byte counts, combined into one overall percentage.
    const fileProgress = {};
    const model = await fromHub('parakeet-tdt-0.6b-v2', {
      backend,
      progress: ({ file, loaded, total }) => {
        fileProgress[file] = { loaded, total };
        let totalBytes = 0;
        let loadedBytes = 0;
        for (const fp of Object.values(fileProgress)) {
          totalBytes += fp.total || 0;
          loadedBytes += fp.loaded || 0;
        }
        if (totalBytes > 0) {
          const pct = Math.round((loadedBytes / totalBytes) * 100);
          onProgress?.({
            status: 'downloading',
            progress: pct,
            file,
            message: `Downloading model... ${pct}%`,
          });
        }
      },
    });

    // The flag is only a hint for isModelCached(); a storage failure must not
    // discard a model that loaded successfully.
    try {
      localStorage.setItem(CACHE_KEY, 'true');
    } catch (storageErr) {
      console.warn('Could not persist Parakeet cache flag:', storageErr);
    }

    onProgress?.({ status: 'loading', message: 'Model loaded' });
    cachedModel = model;
    return model;
  })();

  loadingPromise = pending;
  try {
    return await pending;
  } finally {
    // Clear on success AND failure so a failed load can be retried.
    loadingPromise = null;
  }
}
/**
 * Decode an audio Blob into mono Float32Array samples at 16 kHz.
 *
 * The blob is decoded with a 16 kHz AudioContext. If the decoded buffer is
 * already 16 kHz mono its first channel is returned as-is; otherwise it is
 * rendered through a one-channel 16 kHz OfflineAudioContext. The decoding
 * context is always closed before returning.
 *
 * @param {Blob} blob - encoded audio
 * @returns {Promise<Float32Array>} channel 0 of the 16 kHz mono result
 */
async function decodeAudioBlob(blob) {
  const encodedBytes = await blob.arrayBuffer();
  const decodeCtx = new AudioContext({ sampleRate: 16000 });

  try {
    const decoded = await decodeCtx.decodeAudioData(encodedBytes);

    const needsRender = !(decoded.sampleRate === 16000 && decoded.numberOfChannels === 1);
    if (!needsRender) {
      return decoded.getChannelData(0);
    }

    // Resample via OfflineAudioContext
    const frameCount = Math.ceil(decoded.duration * 16000);
    const renderCtx = new OfflineAudioContext(1, frameCount, 16000);
    const bufferNode = renderCtx.createBufferSource();
    bufferNode.buffer = decoded;
    bufferNode.connect(renderCtx.destination);
    bufferNode.start();

    const rendered = await renderCtx.startRendering();
    return rendered.getChannelData(0);
  } finally {
    await decodeCtx.close();
  }
}
/**
 * Transcribe an audio Blob in the browser with the Parakeet model.
 *
 * Obtains the model via getModel() (which downloads it on first use, ~634 MB),
 * decodes the blob to 16 kHz samples, runs the model, and returns the trimmed
 * utterance text. Progress is reported as 'transcribing' once the model is
 * ready and 'done' after the text has been produced.
 *
 * @param {Blob} audioBlob
 * @param {function} onProgress - callback({ status, progress, file, message })
 * @returns {Promise<string>} transcribed text ('' when the model returns none)
 */
async function transcribeOffline(audioBlob, onProgress) {
  const report = (status, message) => onProgress?.({ status, message });

  const parakeet = await getModel(onProgress);
  report('transcribing', 'Transcribing audio...');

  const samples = await decodeAudioBlob(audioBlob);
  const options = { returnTimestamps: false, enableProfiling: false };
  const output = await parakeet.transcribe(samples, 16000, options);

  const trimmed = output.utterance_text?.trim();
  const text = trimmed ? trimmed : '';

  report('done', 'Transcription complete');
  return text;
}
// Publish the public API on window: this file is loaded as an ES module, and
// voice.js looks the API up as window.ParakeetOffline when it needs it.
const parakeetOfflineApi = { isModelCached, transcribeOffline };
window.ParakeetOffline = parakeetOfflineApi;

View File

@ -175,6 +175,13 @@
color: #525252;
font-style: italic;
}
.transcript-text .final-text {
color: #d4d4d4;
}
.transcript-text .interim-text {
color: #737373;
font-style: italic;
}
/* Controls row */
.controls {
@ -255,6 +262,61 @@
.status-bar.error { color: #fca5a5; background: #450a0a; border-top-color: #991b1b; }
.status-bar.loading { color: #93c5fd; background: #172554; border-top-color: #1e40af; }
/* Live indicator */
.live-indicator {
display: none;
align-items: center;
gap: 5px;
font-size: 10px;
font-weight: 700;
text-transform: uppercase;
letter-spacing: 1.5px;
color: #4ade80;
}
.live-indicator.visible {
display: flex;
}
.live-indicator .dot {
width: 6px;
height: 6px;
border-radius: 50%;
background: #4ade80;
animation: pulse-dot 1s infinite;
}
@keyframes pulse-dot {
0%, 100% { opacity: 1; }
50% { opacity: 0.3; }
}
/* Progress bar (for model download) */
.progress-area {
width: 100%;
padding: 0 14px 8px;
display: none;
}
.progress-area.visible {
display: block;
}
.progress-label {
font-size: 11px;
color: #a3a3a3;
margin-bottom: 4px;
}
.progress-bar {
width: 100%;
height: 6px;
background: #262626;
border-radius: 3px;
overflow: hidden;
}
.progress-bar .fill {
height: 100%;
background: #f59e0b;
border-radius: 3px;
transition: width 0.3s;
width: 0%;
}
/* Audio preview */
.audio-preview {
width: 100%;
@ -305,6 +367,15 @@
<div class="inner"></div>
</button>
<div class="timer" id="timer">00:00</div>
<div class="live-indicator" id="liveIndicator">
<span class="dot"></span>
Live transcribe
</div>
</div>
<div class="progress-area" id="progressArea">
<div class="progress-label" id="progressLabel">Loading model...</div>
<div class="progress-bar"><div class="fill" id="progressFill"></div></div>
</div>
<div class="audio-preview" id="audioPreview">
@ -334,9 +405,10 @@
<div class="status-bar" id="statusBar"></div>
<div class="kbd-hint">
<kbd>Space</kbd> to record &middot; <kbd>Esc</kbd> to close
<kbd>Space</kbd> to record &middot; <kbd>Esc</kbd> to close &middot; Offline ready
</div>
<script src="parakeet-offline.js" type="module"></script>
<script src="voice.js"></script>
</body>
</html>

View File

@ -9,17 +9,23 @@ let startTime = 0;
// Recorded audio and the object URL handed to the audio player for preview.
let audioBlob = null;
let audioUrl = null;
// Transcript chosen after recording stops (shown to the user as editable text).
let transcript = '';
let liveTranscript = ''; // accumulated from Web Speech API
// Details of the uploaded recording, taken from the upload response.
let uploadedFileUrl = '';
let uploadedMimeType = '';
let uploadedFileSize = 0;
// Recording length in whole seconds, computed when recording stops.
let duration = 0;
// Web Speech API
// Active SpeechRecognition instance, or null when live transcription is not running.
let recognition = null;
// Feature-detected at load; also switched off if starting recognition throws.
let speechSupported = !!(window.SpeechRecognition || window.webkitSpeechRecognition);
// --- DOM refs ---
// Looked up once when the script runs; getElementById yields null for any id
// that is missing from the page.
const recBtn = document.getElementById('recBtn');
const timerEl = document.getElementById('timer');
const statusLabel = document.getElementById('statusLabel');
const transcriptArea = document.getElementById('transcriptArea');
const transcriptText = document.getElementById('transcriptText');
// "Live transcribe" badge toggled via its 'visible' class.
const liveIndicator = document.getElementById('liveIndicator');
const audioPreview = document.getElementById('audioPreview');
const audioPlayer = document.getElementById('audioPlayer');
const notebookSelect = document.getElementById('notebook');
@ -70,6 +76,36 @@ function showStatusBar(message, type) {
}
}
// --- Parakeet progress UI ---
// Container (toggled via its 'visible' class), text label, and the bar whose
// width shows the model download percentage.
const progressArea = document.getElementById('progressArea');
const progressLabel = document.getElementById('progressLabel');
const progressFill = document.getElementById('progressFill');
// Reveal the progress area and reflect a Parakeet progress event in it.
// `p` is the object passed to the transcribeOffline progress callback:
// { status, progress, file, message }. The label follows `p.message`; the bar
// tracks the download percentage, is full while transcribing, and is reset
// while loading. Other statuses leave the bar untouched.
function showParakeetProgress(p) {
  if (!progressArea) return;

  progressArea.classList.add('visible');

  if (p.message) {
    progressLabel.textContent = p.message;
  }

  switch (p.status) {
    case 'downloading':
      if (p.progress !== undefined) {
        progressFill.style.width = `${p.progress}%`;
      }
      break;
    case 'transcribing':
      progressFill.style.width = '100%';
      break;
    case 'loading':
      progressFill.style.width = '0%';
      break;
    default:
      break;
  }
}
// Hide the model progress area and reset its bar to empty.
// Does nothing when the progress container is not on the page.
function hideParakeetProgress() {
  if (!progressArea) return;

  progressArea.classList.remove('visible');
  progressFill.style.width = '0%';
}
// --- Notebook loader ---
async function loadNotebooks() {
@ -103,6 +139,97 @@ notebookSelect.addEventListener('change', (e) => {
chrome.storage.local.set({ lastNotebookId: e.target.value });
});
// --- Live transcription (Web Speech API) ---
// Start Web Speech API recognition alongside the recording.
//
// Keeps `liveTranscript` up to date with all finalized speech and pushes
// final + interim text to updateLiveDisplay(). The browser may end a
// recognition session on its own; while we are still recording, the same
// instance is restarted and text finalized in earlier sessions is kept.
// No-op when the API is unsupported; if the initial start() throws, live
// transcription is disabled via `speechSupported`.
function startLiveTranscription() {
  if (!speechSupported) return;

  const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  const instance = new SpeechRecognition();
  recognition = instance;
  instance.continuous = true;
  instance.interimResults = true;
  instance.lang = 'en-US';

  // event.results only covers the CURRENT session and starts empty again after
  // every restart, so text finalized in earlier sessions is kept separately.
  let committedText = '';
  let sessionText = '';
  // Set on errors that a restart cannot fix, to avoid an error/end/start loop.
  let fatalError = false;

  const joinText = (...parts) => parts.filter(Boolean).join(' ');

  instance.onresult = (event) => {
    let interimText = '';

    // Rebuild this session's finalized text from all final results
    let finalizedText = '';
    for (let i = 0; i < event.results.length; i++) {
      const result = event.results[i];
      if (result.isFinal) {
        finalizedText += `${result[0].transcript.trim()} `;
      } else {
        interimText += result[0].transcript;
      }
    }
    sessionText = finalizedText.trim();

    liveTranscript = joinText(committedText, sessionText);

    // Update the live transcript display
    updateLiveDisplay(liveTranscript, interimText.trim());
  };

  instance.onerror = (event) => {
    if (event.error === 'not-allowed' || event.error === 'service-not-allowed') {
      // Permission/service denial repeats on every start(); stop retrying.
      fatalError = true;
      if (liveIndicator) liveIndicator.classList.remove('visible');
    }
    if (event.error !== 'aborted' && event.error !== 'no-speech') {
      console.warn('Speech recognition error:', event.error);
    }
  };

  // Auto-restart on end (Chrome stops after ~60s of silence)
  instance.onend = () => {
    // Only restart while recording, and only if this instance is still the
    // active one (stopLiveTranscription clears `recognition` before stop()).
    if (fatalError || state !== 'recording' || recognition !== instance) return;

    // Carry the finished session's text over before results reset.
    committedText = joinText(committedText, sessionText);
    sessionText = '';

    try {
      instance.start();
    } catch (err) {
      console.warn('Could not restart speech recognition:', err);
    }
  };

  try {
    instance.start();
    if (liveIndicator) liveIndicator.classList.add('visible');
  } catch (err) {
    console.warn('Could not start speech recognition:', err);
    speechSupported = false;
    recognition = null;
  }
}
// Stop the active speech recognition (if any) and hide the live indicator.
// `recognition` is cleared BEFORE stop() is called so the instance's onend
// handler sees no active recognition and does not auto-restart.
function stopLiveTranscription() {
  const active = recognition;
  if (active) {
    recognition = null;
    try {
      active.stop();
    } catch {}
  }

  if (!liveIndicator) return;
  liveIndicator.classList.remove('visible');
}
// Render the live transcript while recording.
// `finalText` is the finalized speech so far and `interimText` the current
// in-progress guess; each is HTML-escaped and wrapped in its own styled span.
// Callers pass both strings trimmed, so the spans are joined with a space —
// otherwise the interim words would be glued onto the last final word.
// Shows a "Listening..." placeholder when both are empty. Ignored unless the
// recorder is in the 'recording' state.
function updateLiveDisplay(finalText, interimText) {
  if (state !== 'recording') return;

  // Show transcript area while recording
  transcriptArea.classList.add('visible');

  const spans = [];
  if (finalText) {
    spans.push(`<span class="final-text">${escapeHtml(finalText)}</span>`);
  }
  if (interimText) {
    spans.push(`<span class="interim-text">${escapeHtml(interimText)}</span>`);
  }

  transcriptText.innerHTML = spans.length > 0
    ? spans.join(' ')
    : '<span class="placeholder">Listening...</span>';

  // Auto-scroll
  transcriptText.scrollTop = transcriptText.scrollHeight;
}
// Escape text for insertion into HTML by letting the DOM serialize it:
// the text is assigned as a detached element's textContent and read back
// through innerHTML.
function escapeHtml(text) {
  const scratch = Object.assign(document.createElement('div'), { textContent: text });
  return scratch.innerHTML;
}
// --- Recording ---
async function startRecording() {
@ -115,6 +242,7 @@ async function startRecording() {
mediaRecorder = new MediaRecorder(stream, { mimeType });
audioChunks = [];
liveTranscript = '';
mediaRecorder.ondataavailable = (e) => {
if (e.data.size > 0) audioChunks.push(e.data);
@ -130,14 +258,24 @@ async function startRecording() {
setStatusLabel('Recording', 'recording');
postActions.style.display = 'none';
audioPreview.classList.remove('visible');
transcriptArea.classList.remove('visible');
statusBar.className = 'status-bar';
// Show transcript area with listening placeholder
if (speechSupported) {
transcriptArea.classList.add('visible');
transcriptText.innerHTML = '<span class="placeholder">Listening...</span>';
} else {
transcriptArea.classList.remove('visible');
}
timerInterval = setInterval(() => {
const elapsed = Math.floor((Date.now() - startTime) / 1000);
timerEl.textContent = formatTime(elapsed);
}, 1000);
// Start live transcription alongside recording
startLiveTranscription();
} catch (err) {
showStatusBar(err.message || 'Microphone access denied', 'error');
}
@ -150,6 +288,12 @@ async function stopRecording() {
timerInterval = null;
duration = Math.floor((Date.now() - startTime) / 1000);
// Capture live transcript before stopping recognition
const capturedLiveTranscript = liveTranscript;
// Stop live transcription
stopLiveTranscription();
state = 'processing';
recBtn.classList.remove('recording');
timerEl.classList.remove('recording');
@ -170,17 +314,21 @@ async function stopRecording() {
audioPlayer.src = audioUrl;
audioPreview.classList.add('visible');
// Show transcript area with placeholder
// Show live transcript while we process (if we have one)
transcriptArea.classList.add('visible');
transcriptText.innerHTML = '<span class="placeholder">Transcribing...</span>';
if (capturedLiveTranscript) {
transcriptText.textContent = capturedLiveTranscript;
showStatusBar('Improving transcript...', 'loading');
} else {
transcriptText.innerHTML = '<span class="placeholder">Transcribing...</span>';
showStatusBar('Uploading & transcribing...', 'loading');
}
// Upload audio file
const token = await getToken();
const settings = await getSettings();
try {
showStatusBar('Uploading recording...', 'loading');
const uploadForm = new FormData();
uploadForm.append('file', audioBlob, 'voice-note.webm');
@ -197,26 +345,50 @@ async function stopRecording() {
uploadedMimeType = uploadResult.mimeType;
uploadedFileSize = uploadResult.size;
// Transcribe via batch API
showStatusBar('Transcribing...', 'loading');
// --- Three-tier transcription cascade ---
const transcribeForm = new FormData();
transcribeForm.append('audio', audioBlob, 'voice-note.webm');
// Tier 1: Batch API (Whisper on server — highest quality)
let bestTranscript = '';
try {
showStatusBar('Transcribing via server...', 'loading');
const transcribeForm = new FormData();
transcribeForm.append('audio', audioBlob, 'voice-note.webm');
const transcribeRes = await fetch(`${settings.host}/api/voice/transcribe`, {
method: 'POST',
headers: { 'Authorization': `Bearer ${token}` },
body: transcribeForm,
});
const transcribeRes = await fetch(`${settings.host}/api/voice/transcribe`, {
method: 'POST',
headers: { 'Authorization': `Bearer ${token}` },
body: transcribeForm,
});
if (transcribeRes.ok) {
const transcribeResult = await transcribeRes.json();
transcript = transcribeResult.text || '';
} else {
transcript = '';
console.warn('Transcription failed, saving without transcript');
if (transcribeRes.ok) {
const transcribeResult = await transcribeRes.json();
bestTranscript = transcribeResult.text || '';
}
} catch {
console.warn('Tier 1 (batch API) unavailable');
}
// Tier 2: Live transcript from Web Speech API (already captured)
if (!bestTranscript && capturedLiveTranscript) {
bestTranscript = capturedLiveTranscript;
}
// Tier 3: Offline Parakeet.js (NVIDIA, runs in browser)
if (!bestTranscript && window.ParakeetOffline) {
try {
showStatusBar('Transcribing offline (Parakeet)...', 'loading');
bestTranscript = await window.ParakeetOffline.transcribeOffline(audioBlob, (p) => {
showParakeetProgress(p);
});
hideParakeetProgress();
} catch (offlineErr) {
console.warn('Tier 3 (Parakeet offline) failed:', offlineErr);
hideParakeetProgress();
}
}
transcript = bestTranscript;
// Show transcript (editable)
if (transcript) {
transcriptText.textContent = transcript;
@ -230,6 +402,26 @@ async function stopRecording() {
statusBar.className = 'status-bar';
} catch (err) {
// On upload error, try offline transcription directly
let fallbackTranscript = capturedLiveTranscript || '';
if (!fallbackTranscript && window.ParakeetOffline) {
try {
showStatusBar('Upload failed, transcribing offline...', 'loading');
fallbackTranscript = await window.ParakeetOffline.transcribeOffline(audioBlob, (p) => {
showParakeetProgress(p);
});
hideParakeetProgress();
} catch {
hideParakeetProgress();
}
}
transcript = fallbackTranscript;
if (transcript) {
transcriptText.textContent = transcript;
}
showStatusBar(`Error: ${err.message}`, 'error');
state = 'done';
setStatusLabel('Error', 'idle');
@ -341,11 +533,14 @@ function resetState() {
audioChunks = [];
audioBlob = null;
transcript = '';
liveTranscript = '';
uploadedFileUrl = '';
uploadedMimeType = '';
uploadedFileSize = 0;
duration = 0;
stopLiveTranscription();
if (audioUrl) {
URL.revokeObjectURL(audioUrl);
audioUrl = null;
@ -358,6 +553,7 @@ function resetState() {
postActions.style.display = 'none';
audioPreview.classList.remove('visible');
transcriptArea.classList.remove('visible');
hideParakeetProgress();
statusBar.className = 'status-bar';
}