feat: live streaming transcription via WebSocket + diarization UI

Add AudioWorklet-based PCM16 streaming to VoiceRecorder over a WebSocket connection for near-real-time transcription. Segments appear as finalized text that never shifts. Add a speaker diarization button on audio notes with color-coded speaker labels. Fall back gracefully to batch transcription when the WebSocket is unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-15 09:48:40 -07:00 · 2026-02-15 09:48:40 -07:00 · 560dceec0f
parent 30f3383d1b
commit 560dceec0f
5 changed files with 430 additions and 51 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -12,6 +12,7 @@ services:
      - NEXT_PUBLIC_ENCRYPTID_SERVER_URL=${NEXT_PUBLIC_ENCRYPTID_SERVER_URL:-https://encryptid.jeffemmett.com}
      - RSPACE_INTERNAL_KEY=${RSPACE_INTERNAL_KEY}
      - VOICE_API_URL=${VOICE_API_URL:-http://voice-command-api:8000}
+      - NEXT_PUBLIC_VOICE_WS_URL=${NEXT_PUBLIC_VOICE_WS_URL:-wss://voice.jeffemmett.com}
    volumes:
      - uploads_data:/app/uploads
    labels:
--- a/public/pcm-processor.js
+++ b/public/pcm-processor.js
@ -0,0 +1,22 @@
+/**
+ * AudioWorklet processor that captures raw PCM16 audio for WebSocket streaming.
+ * Runs in a separate thread, sends Int16 buffers to the main thread.
+ */
+class PCMProcessor extends AudioWorkletProcessor {
+  process(inputs) {
+    const input = inputs[0];
+    if (input.length > 0) {
+      const channelData = input[0]; // mono channel
+      // Convert float32 [-1, 1] to int16 [-32768, 32767]
+      const pcm16 = new Int16Array(channelData.length);
+      for (let i = 0; i < channelData.length; i++) {
+        const s = Math.max(-1, Math.min(1, channelData[i]));
+        pcm16[i] = s < 0 ? s * 32768 : s * 32767;
+      }
+      this.port.postMessage(pcm16.buffer, [pcm16.buffer]);
+    }
+    return true;
+  }
+}
+
+registerProcessor('pcm-processor', PCMProcessor);
--- a/src/app/api/voice/diarize/route.ts
+++ b/src/app/api/voice/diarize/route.ts
@ -0,0 +1,42 @@
+import { NextRequest, NextResponse } from 'next/server';
+import { requireAuth, isAuthed } from '@/lib/auth';
+
+// Base URL of the voice-command API that this route forwards requests to.
+// Taken from the VOICE_API_URL environment variable, with a default used when
+// the variable is unset or empty.
+const VOICE_API_URL = process.env.VOICE_API_URL || 'http://voice-command-api:8000';
+
+/**
+ * Proxies a diarization request to the voice-command API.
+ *
+ * Reads the `audio` file field from the multipart form body, forwards it to
+ * `${VOICE_API_URL}/api/voice/diarize` and relays the upstream JSON body.
+ *
+ * @param request Incoming request carrying the multipart form data.
+ * @returns The upstream JSON on success. Responds with `{ error }` and
+ *   status 400 when no audio file is supplied, with the upstream status when
+ *   the upstream call is not ok, and with 500 on an unexpected error. When
+ *   `isAuthed` rejects the result of `requireAuth`, that result is returned
+ *   unchanged.
+ */
+export async function POST(request: NextRequest) {
+  try {
+    const auth = await requireAuth(request);
+    if (!isAuthed(auth)) return auth;
+
+    const formData = await request.formData();
+    const audio = formData.get('audio');
+
+    // FormData.get() returns a string for plain text fields. Only an uploaded
+    // file can be forwarded, so a string value is rejected as a client error
+    // instead of failing later in append() and surfacing as a 500.
+    if (!audio || typeof audio === 'string') {
+      return NextResponse.json({ error: 'No audio file provided' }, { status: 400 });
+    }
+
+    // Forward to voice-command API diarization endpoint
+    const proxyForm = new FormData();
+    proxyForm.append('audio', audio, audio.name || 'recording.webm');
+
+    const res = await fetch(`${VOICE_API_URL}/api/voice/diarize`, {
+      method: 'POST',
+      body: proxyForm,
+    });
+
+    if (!res.ok) {
+      // Log the upstream body for debugging but return only a generic message.
+      const err = await res.text();
+      console.error('Diarization API error:', res.status, err);
+      return NextResponse.json(
+        { error: 'Diarization failed' },
+        { status: res.status }
+      );
+    }
+
+    const result: unknown = await res.json();
+    return NextResponse.json(result);
+  } catch (error) {
+    console.error('Diarize proxy error:', error);
+    return NextResponse.json({ error: 'Diarization failed' }, { status: 500 });
+  }
+}
--- a/src/app/notes/[id]/page.tsx
+++ b/src/app/notes/[id]/page.tsx
@ -47,6 +47,8 @@ export default function NoteDetailPage() {
  const [editTitle, setEditTitle] = useState('');
  const [editContent, setEditContent] = useState('');
  const [saving, setSaving] = useState(false);
+  const [diarizing, setDiarizing] = useState(false);
+  const [speakers, setSpeakers] = useState<{ speaker: string; start: number; end: number }[] | null>(null);

  useEffect(() => {
    fetch(`/api/notes/${params.id}`)
@ -104,6 +106,35 @@ export default function NoteDetailPage() {
    }
  };

+  // Requests speaker diarization for this note's audio: downloads the stored
+  // file, posts it to /api/voice/diarize and stores the returned speakers in
+  // state. Does nothing when the note has no fileUrl or a request is already
+  // running. Failures are logged to the console; `diarizing` is always reset.
+  const handleDiarize = async () => {
+    if (!note?.fileUrl || diarizing) return;
+    setDiarizing(true);
+    try {
+      // Fetch the audio file from the server
+      const audioRes = await fetch(note.fileUrl);
+      if (!audioRes.ok) {
+        // fetch() resolves on HTTP errors too; without this check the error
+        // response body would be uploaded as if it were the recording.
+        throw new Error(`Failed to fetch audio file (status ${audioRes.status})`);
+      }
+      const audioBlob = await audioRes.blob();
+
+      const form = new FormData();
+      form.append('audio', audioBlob, 'recording.webm');
+
+      const res = await authFetch('/api/voice/diarize', {
+        method: 'POST',
+        body: form,
+      });
+
+      if (res.ok) {
+        const result = await res.json();
+        // A response without a `speakers` field is treated as "no speakers".
+        setSpeakers(result.speakers || []);
+      } else {
+        console.error('Diarization failed');
+      }
+    } catch (error) {
+      console.error('Diarization error:', error);
+    } finally {
+      setDiarizing(false);
+    }
+  };
+
  if (loading) {
    return (
      <div className="min-h-screen bg-[#0a0a0a] flex items-center justify-center">
@ -257,7 +288,40 @@ export default function NoteDetailPage() {
              {note.duration != null && <span>{Math.floor(note.duration / 60)}:{(note.duration % 60).toString().padStart(2, '0')}</span>}
              {note.mimeType && <span>{note.mimeType}</span>}
              {note.fileSize && <span>{(note.fileSize / 1024).toFixed(1)} KB</span>}
+              {!speakers && (
+                <button
+                  onClick={handleDiarize}
+                  disabled={diarizing}
+                  className="ml-auto px-3 py-1 text-xs rounded-lg border border-slate-600 text-slate-400 hover:text-white hover:border-slate-500 transition-colors disabled:opacity-50"
+                >
+                  {diarizing ? 'Identifying speakers...' : 'Identify speakers'}
+                </button>
+              )}
            </div>
+            {speakers && speakers.length > 0 && (
+              <div className="pt-2 border-t border-slate-700 space-y-1.5">
+                <div className="text-xs text-slate-500 uppercase tracking-wider mb-2">Speakers</div>
+                {speakers.map((s, i) => {
+                  const colors: Record<string, string> = {
+                    SPEAKER_00: 'border-blue-500/50 text-blue-300',
+                    SPEAKER_01: 'border-green-500/50 text-green-300',
+                    SPEAKER_02: 'border-purple-500/50 text-purple-300',
+                    SPEAKER_03: 'border-orange-500/50 text-orange-300',
+                  };
+                  const color = colors[s.speaker] || 'border-slate-500/50 text-slate-300';
+                  return (
+                    <div key={i} className={`text-xs px-2 py-1.5 rounded border-l-2 bg-slate-800/50 ${color}`}>
+                      <span className="font-medium">{s.speaker.replace('SPEAKER_', 'Speaker ')}</span>
+                      <span className="text-slate-500 ml-2">
+                        {Math.floor(s.start / 60)}:{Math.floor(s.start % 60).toString().padStart(2, '0')}
+                        &ndash;
+                        {Math.floor(s.end / 60)}:{Math.floor(s.end % 60).toString().padStart(2, '0')}
+                      </span>
+                    </div>
+                  );
+                })}
+              </div>
+            )}
          </div>
        )}

--- a/src/components/VoiceRecorder.tsx
+++ b/src/components/VoiceRecorder.tsx
@ -3,6 +3,13 @@
 import { useState, useRef, useCallback, useEffect } from 'react';
 import { authFetch } from '@/lib/authFetch';

+/** One transcript segment, built from a WebSocket "segment" message. */
+interface Segment {
+  id: number; // used as the React key when the transcript list is rendered
+  text: string; // transcript text displayed for this segment
+  start: number; // presumably the segment's start time — TODO confirm units
+  end: number; // presumably the segment's end time — TODO confirm units
+}
+
 interface VoiceRecorderResult {
  fileUrl: string;
  mimeType: string;
@ -16,18 +23,30 @@ interface VoiceRecorderProps {
  className?: string;
 }

+// Base URL of the voice WebSocket server; "/api/voice/stream" is appended
+// when a recording starts. Falls back to the default below when the
+// environment variable is unset or empty.
+// NOTE(review): Next.js inlines NEXT_PUBLIC_* variables at build time, so if
+// this module is bundled for the browser a value set only in the container's
+// runtime environment may not take effect — confirm the build receives it.
+const VOICE_WS_URL =
+  process.env.NEXT_PUBLIC_VOICE_WS_URL || 'wss://voice.jeffemmett.com';
+
 export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
-  const [recording, setRecording] = useState(false);
-  const [processing, setProcessing] = useState(false);
-  const [processingStep, setProcessingStep] = useState('');
+  const [status, setStatus] = useState<'idle' | 'recording' | 'processing'>(
+    'idle'
+  );
  const [elapsed, setElapsed] = useState(0);
+  const [segments, setSegments] = useState<Segment[]>([]);
+  const [isListening, setIsListening] = useState(false);
  const [error, setError] = useState<string | null>(null);
  const [audioUrl, setAudioUrl] = useState<string | null>(null);
+  const [streaming, setStreaming] = useState(false);

  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const workletNodeRef = useRef<AudioWorkletNode | null>(null);
+  const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
+  const wsRef = useRef<WebSocket | null>(null);
  const chunksRef = useRef<Blob[]>([]);
+  const segmentsRef = useRef<Segment[]>([]);
  const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
  const startTimeRef = useRef<number>(0);
+  const transcriptScrollRef = useRef<HTMLDivElement | null>(null);

  useEffect(() => {
    return () => {
@ -36,44 +55,159 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
    };
  }, [audioUrl]);

+  // Auto-scroll transcript to bottom when new segments arrive
+  useEffect(() => {
+    if (transcriptScrollRef.current) {
+      transcriptScrollRef.current.scrollTop =
+        transcriptScrollRef.current.scrollHeight;
+    }
+  }, [segments]);
+
+  // Appends a finalized segment to both the ref (read synchronously when the
+  // final transcript is assembled) and the rendered state.
+  //
+  // A segment whose id is already present is ignored. While stopRecording
+  // waits for the "done" message, both its temporary "message" listener and
+  // the ws.onmessage handler installed by startRecording receive the same
+  // "segment" message, so each would otherwise be appended twice.
+  // Assumes ids are unique within one recording, which the transcript list
+  // already relies on by using seg.id as its React key.
+  const addSegment = useCallback((seg: Segment) => {
+    if (segmentsRef.current.some((existing) => existing.id === seg.id)) {
+      return;
+    }
+    // Build a new array rather than mutating, so state sees a new reference.
+    const next = [...segmentsRef.current, seg];
+    segmentsRef.current = next;
+    setSegments(next);
+  }, []);
+
+  // Releases the live-streaming resources held in refs: the worklet node, the
+  // media stream source node, the AudioContext and the WebSocket.
+  const cleanup = useCallback(() => {
+    workletNodeRef.current?.disconnect();
+    workletNodeRef.current = null;
+
+    sourceNodeRef.current?.disconnect();
+    sourceNodeRef.current = null;
+
+    const audioCtx = audioContextRef.current;
+    if (audioCtx && audioCtx.state !== 'closed') {
+      // close() returns a promise; a rejection here is deliberately ignored.
+      audioCtx.close().catch(() => {});
+      audioContextRef.current = null;
+    }
+
+    const socket = wsRef.current;
+    if (socket) {
+      // Only an open socket is closed explicitly; the ref is dropped anyway.
+      if (socket.readyState === WebSocket.OPEN) socket.close();
+      wsRef.current = null;
+    }
+  }, []);
+
  const formatTime = (seconds: number) => {
-    const m = Math.floor(seconds / 60).toString().padStart(2, '0');
+    const m = Math.floor(seconds / 60)
+      .toString()
+      .padStart(2, '0');
    const s = (seconds % 60).toString().padStart(2, '0');
    return `${m}:${s}`;
  };

  const startRecording = useCallback(async () => {
    setError(null);
+    setSegments([]);
+    segmentsRef.current = [];
+    setIsListening(false);
+    setStreaming(false);
+
    try {
      const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+
+      // Start MediaRecorder for the audio file
      const mediaRecorder = new MediaRecorder(stream, {
        mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
          ? 'audio/webm;codecs=opus'
          : 'audio/webm',
      });
-
      chunksRef.current = [];
      mediaRecorder.ondataavailable = (e) => {
        if (e.data.size > 0) chunksRef.current.push(e.data);
      };
-
-      mediaRecorder.onstop = () => {
-        stream.getTracks().forEach((t) => t.stop());
-      };
-
      mediaRecorder.start(1000);
      mediaRecorderRef.current = mediaRecorder;
-      startTimeRef.current = Date.now();
-      setRecording(true);
-      setElapsed(0);

+      // Try to set up WebSocket streaming for live transcription
+      try {
+        const ws = new WebSocket(`${VOICE_WS_URL}/api/voice/stream`);
+        wsRef.current = ws;
+
+        await new Promise<void>((resolve, reject) => {
+          const timeout = setTimeout(() => {
+            ws.close();
+            reject(new Error('WebSocket connection timeout'));
+          }, 5000);
+
+          ws.onopen = () => {
+            clearTimeout(timeout);
+            resolve();
+          };
+          ws.onerror = () => {
+            clearTimeout(timeout);
+            reject(new Error('WebSocket connection failed'));
+          };
+        });
+
+        // WebSocket message handler
+        ws.onmessage = (event) => {
+          try {
+            const data = JSON.parse(event.data);
+            if (data.type === 'listening') {
+              setIsListening(true);
+              setTimeout(() => setIsListening(false), 600);
+            } else if (data.type === 'segment') {
+              addSegment({
+                id: data.id,
+                text: data.text,
+                start: data.start,
+                end: data.end,
+              });
+            }
+          } catch {
+            // Ignore parse errors
+          }
+        };
+
+        // Set up AudioContext at 16kHz and AudioWorklet for PCM16 streaming
+        const audioCtx = new AudioContext({ sampleRate: 16000 });
+        audioContextRef.current = audioCtx;
+        const source = audioCtx.createMediaStreamSource(stream);
+        sourceNodeRef.current = source;
+
+        await audioCtx.audioWorklet.addModule('/pcm-processor.js');
+        const workletNode = new AudioWorkletNode(audioCtx, 'pcm-processor');
+        workletNodeRef.current = workletNode;
+
+        workletNode.port.onmessage = (e) => {
+          if (ws.readyState === WebSocket.OPEN) {
+            ws.send(e.data as ArrayBuffer);
+          }
+        };
+
+        source.connect(workletNode);
+        // Don't connect to destination — we don't want to hear ourselves
+        setStreaming(true);
+      } catch (wsErr) {
+        console.warn(
+          'WebSocket streaming unavailable, will batch transcribe:',
+          wsErr
+        );
+        setStreaming(false);
+        if (wsRef.current) {
+          wsRef.current.close();
+          wsRef.current = null;
+        }
+      }
+
+      // Start timer
+      startTimeRef.current = Date.now();
+      setStatus('recording');
+      setElapsed(0);
      timerRef.current = setInterval(() => {
        setElapsed(Math.floor((Date.now() - startTimeRef.current) / 1000));
      }, 1000);
    } catch (err) {
-      setError(err instanceof Error ? err.message : 'Microphone access denied');
+      setError(
+        err instanceof Error ? err.message : 'Microphone access denied'
+      );
    }
-  }, []);
+  }, [addSegment]);

  const stopRecording = useCallback(async () => {
    const mediaRecorder = mediaRecorderRef.current;
@ -84,26 +218,73 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
      timerRef.current = null;
    }

-    const duration = Math.floor((Date.now() - startTimeRef.current) / 1000);
-    setRecording(false);
-    setProcessing(true);
+    const duration = Math.floor(
+      (Date.now() - startTimeRef.current) / 1000
+    );
+    setStatus('processing');

-    // Wait for final data
+    // Stop AudioWorklet streaming
+    if (workletNodeRef.current) {
+      workletNodeRef.current.disconnect();
+      workletNodeRef.current = null;
+    }
+
+    // Send "end" to WebSocket and wait for final segments
+    let wsFullText = '';
+    if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
+      try {
+        const ws = wsRef.current;
+        wsFullText = await new Promise<string>((resolve) => {
+          const timeout = setTimeout(() => resolve(''), 5000);
+
+          const handler = (event: MessageEvent) => {
+            try {
+              const data = JSON.parse(event.data);
+              if (data.type === 'segment') {
+                addSegment({
+                  id: data.id,
+                  text: data.text,
+                  start: data.start,
+                  end: data.end,
+                });
+              }
+              if (data.type === 'done') {
+                clearTimeout(timeout);
+                ws.removeEventListener('message', handler);
+                resolve(data.fullText || '');
+              }
+            } catch {
+              // Ignore
+            }
+          };
+
+          ws.addEventListener('message', handler);
+          ws.send(JSON.stringify({ type: 'end' }));
+        });
+      } catch {
+        // Timeout or error — use accumulated segments
+      }
+    }
+
+    // Close WebSocket and AudioContext
+    cleanup();
+
+    // Stop MediaRecorder and collect the audio blob
    const blob = await new Promise<Blob>((resolve) => {
      mediaRecorder.onstop = () => {
        mediaRecorder.stream.getTracks().forEach((t) => t.stop());
-        resolve(new Blob(chunksRef.current, { type: mediaRecorder.mimeType }));
+        resolve(
+          new Blob(chunksRef.current, { type: mediaRecorder.mimeType })
+        );
      };
      mediaRecorder.stop();
    });

-    // Preview URL
    const previewUrl = URL.createObjectURL(blob);
    setAudioUrl(previewUrl);

    try {
      // Upload audio file
-      setProcessingStep('Uploading audio...');
      const uploadForm = new FormData();
      uploadForm.append('file', blob, 'recording.webm');

@ -119,22 +300,31 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {

      const uploadResult = await uploadRes.json();

-      // Transcribe
-      setProcessingStep('Transcribing...');
-      const transcribeForm = new FormData();
-      transcribeForm.append('audio', blob, 'recording.webm');
+      // Determine transcript: prefer WebSocket fullText, then assembled segments, then batch
+      let transcript = wsFullText;

-      const transcribeRes = await authFetch('/api/voice/transcribe', {
-        method: 'POST',
-        body: transcribeForm,
-      });
+      if (!transcript && segmentsRef.current.length > 0) {
+        transcript = segmentsRef.current.map((s) => s.text).join(' ');
+      }

-      let transcript = '';
-      if (transcribeRes.ok) {
-        const transcribeResult = await transcribeRes.json();
-        transcript = transcribeResult.text || '';
-      } else {
-        console.warn('Transcription failed, saving audio without transcript');
+      if (!transcript) {
+        // Fallback: batch transcription via API proxy
+        try {
+          const transcribeForm = new FormData();
+          transcribeForm.append('audio', blob, 'recording.webm');
+
+          const transcribeRes = await authFetch('/api/voice/transcribe', {
+            method: 'POST',
+            body: transcribeForm,
+          });
+
+          if (transcribeRes.ok) {
+            const transcribeResult = await transcribeRes.json();
+            transcript = transcribeResult.text || '';
+          }
+        } catch {
+          console.warn('Batch transcription also failed');
+        }
      }

      onResult({
@ -147,18 +337,20 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
    } catch (err) {
      setError(err instanceof Error ? err.message : 'Processing failed');
    } finally {
-      setProcessing(false);
-      setProcessingStep('');
+      setStatus('idle');
    }
-  }, [onResult]);
+  }, [onResult, addSegment, cleanup]);

  const discard = useCallback(() => {
    if (audioUrl) {
      URL.revokeObjectURL(audioUrl);
      setAudioUrl(null);
    }
+    setSegments([]);
+    segmentsRef.current = [];
    setElapsed(0);
    setError(null);
+    setStatus('idle');
  }, [audioUrl]);

  return (
@ -166,14 +358,18 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
      <div className="border border-slate-700 rounded-lg p-6 bg-slate-800/30">
        {/* Recording controls */}
        <div className="flex flex-col items-center gap-4">
-          {!recording && !processing && !audioUrl && (
+          {status === 'idle' && !audioUrl && (
            <>
              <button
                type="button"
                onClick={startRecording}
                className="w-20 h-20 rounded-full bg-red-500 hover:bg-red-400 transition-colors flex items-center justify-center"
              >
-                <svg className="w-8 h-8 text-white" fill="currentColor" viewBox="0 0 24 24">
+                <svg
+                  className="w-8 h-8 text-white"
+                  fill="currentColor"
+                  viewBox="0 0 24 24"
+                >
                  <path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm-1-9c0-.55.45-1 1-1s1 .45 1 1v6c0 .55-.45 1-1 1s-1-.45-1-1V5z" />
                  <path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z" />
                </svg>
@ -182,11 +378,24 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
            </>
          )}

-          {recording && (
+          {status === 'recording' && (
            <>
              <div className="flex items-center gap-3">
-                <span className="w-3 h-3 rounded-full bg-red-500 animate-pulse" />
-                <span className="text-2xl font-mono text-white">{formatTime(elapsed)}</span>
+                <span
+                  className={`w-3 h-3 rounded-full transition-colors ${
+                    isListening
+                      ? 'bg-green-400 animate-pulse'
+                      : 'bg-red-500 animate-pulse'
+                  }`}
+                />
+                <span className="text-2xl font-mono text-white">
+                  {formatTime(elapsed)}
+                </span>
+                {streaming && (
+                  <span className="text-xs text-green-400/70 font-medium tracking-wider">
+                    LIVE
+                  </span>
+                )}
              </div>
              <button
                type="button"
@ -199,21 +408,40 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
            </>
          )}

-          {processing && (
+          {status === 'processing' && (
            <div className="flex flex-col items-center gap-3 py-4">
-              <svg className="animate-spin h-8 w-8 text-amber-400" viewBox="0 0 24 24">
-                <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" fill="none" />
-                <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z" />
+              <svg
+                className="animate-spin h-8 w-8 text-amber-400"
+                viewBox="0 0 24 24"
+              >
+                <circle
+                  className="opacity-25"
+                  cx="12"
+                  cy="12"
+                  r="10"
+                  stroke="currentColor"
+                  strokeWidth="4"
+                  fill="none"
+                />
+                <path
+                  className="opacity-75"
+                  fill="currentColor"
+                  d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"
+                />
              </svg>
-              <p className="text-sm text-slate-400">{processingStep}</p>
+              <p className="text-sm text-slate-400">
+                Finalizing transcription...
+              </p>
            </div>
          )}

-          {audioUrl && !processing && (
+          {audioUrl && status === 'idle' && (
            <div className="w-full space-y-3">
              <audio controls src={audioUrl} className="w-full" />
              <div className="flex items-center justify-between">
-                <span className="text-sm text-slate-400">{formatTime(elapsed)} recorded</span>
+                <span className="text-sm text-slate-400">
+                  {formatTime(elapsed)} recorded
+                </span>
                <button
                  type="button"
                  onClick={discard}
@ -226,6 +454,28 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
          )}
        </div>

+        {/* Live transcript segments */}
+        {segments.length > 0 && (
+          <div className="mt-4">
+            <div className="text-xs text-slate-500 uppercase tracking-wider mb-2">
+              Live Transcript
+            </div>
+            <div
+              ref={transcriptScrollRef}
+              className="space-y-1.5 max-h-48 overflow-y-auto"
+            >
+              {segments.map((seg) => (
+                <div
+                  key={seg.id}
+                  className="text-sm text-slate-300 px-3 py-2 bg-slate-800/50 rounded border-l-2 border-amber-500/30 animate-in fade-in slide-in-from-bottom-1 duration-300"
+                >
+                  {seg.text}
+                </div>
+              ))}
+            </div>
+          </div>
+        )}
+
        {error && (
          <p className="text-red-400 text-sm mt-4 text-center">{error}</p>
        )}