diff --git a/docker-compose.yml b/docker-compose.yml
index df5cc57..428f244 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -12,6 +12,7 @@ services:
- NEXT_PUBLIC_ENCRYPTID_SERVER_URL=${NEXT_PUBLIC_ENCRYPTID_SERVER_URL:-https://encryptid.jeffemmett.com}
- RSPACE_INTERNAL_KEY=${RSPACE_INTERNAL_KEY}
- VOICE_API_URL=${VOICE_API_URL:-http://voice-command-api:8000}
+ - NEXT_PUBLIC_VOICE_WS_URL=${NEXT_PUBLIC_VOICE_WS_URL:-wss://voice.jeffemmett.com}
volumes:
- uploads_data:/app/uploads
labels:
diff --git a/public/pcm-processor.js b/public/pcm-processor.js
new file mode 100644
index 0000000..3302dde
--- /dev/null
+++ b/public/pcm-processor.js
@@ -0,0 +1,22 @@
+/**
+ * AudioWorklet processor that captures raw PCM16 audio for WebSocket streaming.
+ * Runs in a separate thread, sends Int16 buffers to the main thread.
+ */
+class PCMProcessor extends AudioWorkletProcessor {
+ process(inputs) {
+ const input = inputs[0];
+ if (input.length > 0) {
+ const channelData = input[0]; // mono channel
+ // Convert float32 [-1, 1] to int16 [-32768, 32767]
+ const pcm16 = new Int16Array(channelData.length);
+ for (let i = 0; i < channelData.length; i++) {
+ const s = Math.max(-1, Math.min(1, channelData[i]));
+ pcm16[i] = s < 0 ? s * 32768 : s * 32767;
+ }
+ this.port.postMessage(pcm16.buffer, [pcm16.buffer]);
+ }
+ return true;
+ }
+}
+
+registerProcessor('pcm-processor', PCMProcessor);
diff --git a/src/app/api/voice/diarize/route.ts b/src/app/api/voice/diarize/route.ts
new file mode 100644
index 0000000..d637bcf
--- /dev/null
+++ b/src/app/api/voice/diarize/route.ts
@@ -0,0 +1,42 @@
+import { NextRequest, NextResponse } from 'next/server';
+import { requireAuth, isAuthed } from '@/lib/auth';
+
+const VOICE_API_URL = process.env.VOICE_API_URL || 'http://voice-command-api:8000';
+
+export async function POST(request: NextRequest) {
+ try {
+ const auth = await requireAuth(request);
+ if (!isAuthed(auth)) return auth;
+
+ const formData = await request.formData();
+ const audio = formData.get('audio') as File | null;
+
+ if (!audio) {
+ return NextResponse.json({ error: 'No audio file provided' }, { status: 400 });
+ }
+
+ // Forward to voice-command API diarization endpoint
+ const proxyForm = new FormData();
+ proxyForm.append('audio', audio, audio.name || 'recording.webm');
+
+ const res = await fetch(`${VOICE_API_URL}/api/voice/diarize`, {
+ method: 'POST',
+ body: proxyForm,
+ });
+
+ if (!res.ok) {
+ const err = await res.text();
+ console.error('Diarization API error:', res.status, err);
+ return NextResponse.json(
+ { error: 'Diarization failed' },
+ { status: res.status }
+ );
+ }
+
+ const result = await res.json();
+ return NextResponse.json(result);
+ } catch (error) {
+ console.error('Diarize proxy error:', error);
+ return NextResponse.json({ error: 'Diarization failed' }, { status: 500 });
+ }
+}
diff --git a/src/app/notes/[id]/page.tsx b/src/app/notes/[id]/page.tsx
index 10685a8..9cf6de4 100644
--- a/src/app/notes/[id]/page.tsx
+++ b/src/app/notes/[id]/page.tsx
@@ -47,6 +47,8 @@ export default function NoteDetailPage() {
const [editTitle, setEditTitle] = useState('');
const [editContent, setEditContent] = useState('');
const [saving, setSaving] = useState(false);
+ const [diarizing, setDiarizing] = useState(false);
+ const [speakers, setSpeakers] = useState<{ speaker: string; start: number; end: number }[] | null>(null);
useEffect(() => {
fetch(`/api/notes/${params.id}`)
@@ -104,6 +106,35 @@ export default function NoteDetailPage() {
}
};
+ const handleDiarize = async () => {
+ if (!note?.fileUrl || diarizing) return;
+ setDiarizing(true);
+ try {
+ // Fetch the audio file from the server
+ const audioRes = await fetch(note.fileUrl);
+ const audioBlob = await audioRes.blob();
+
+ const form = new FormData();
+ form.append('audio', audioBlob, 'recording.webm');
+
+ const res = await authFetch('/api/voice/diarize', {
+ method: 'POST',
+ body: form,
+ });
+
+ if (res.ok) {
+ const result = await res.json();
+ setSpeakers(result.speakers || []);
+ } else {
+ console.error('Diarization failed');
+ }
+ } catch (error) {
+ console.error('Diarization error:', error);
+ } finally {
+ setDiarizing(false);
+ }
+ };
+
if (loading) {
return (
@@ -257,7 +288,40 @@ export default function NoteDetailPage() {
{note.duration != null && {Math.floor(note.duration / 60)}:{(note.duration % 60).toString().padStart(2, '0')}}
{note.mimeType && {note.mimeType}}
{note.fileSize && {(note.fileSize / 1024).toFixed(1)} KB}
+ {!speakers && (
+
+ )}
+ {speakers && speakers.length > 0 && (
+
+
Speakers
+ {speakers.map((s, i) => {
+ const colors: Record<string, string> = {
+ SPEAKER_00: 'border-blue-500/50 text-blue-300',
+ SPEAKER_01: 'border-green-500/50 text-green-300',
+ SPEAKER_02: 'border-purple-500/50 text-purple-300',
+ SPEAKER_03: 'border-orange-500/50 text-orange-300',
+ };
+ const color = colors[s.speaker] || 'border-slate-500/50 text-slate-300';
+ return (
+
+ {s.speaker.replace('SPEAKER_', 'Speaker ')}
+
+ {Math.floor(s.start / 60)}:{Math.floor(s.start % 60).toString().padStart(2, '0')}
+ –
+ {Math.floor(s.end / 60)}:{Math.floor(s.end % 60).toString().padStart(2, '0')}
+
+
+ );
+ })}
+
+ )}
)}
diff --git a/src/components/VoiceRecorder.tsx b/src/components/VoiceRecorder.tsx
index 8fa4673..4678e81 100644
--- a/src/components/VoiceRecorder.tsx
+++ b/src/components/VoiceRecorder.tsx
@@ -3,6 +3,13 @@
import { useState, useRef, useCallback, useEffect } from 'react';
import { authFetch } from '@/lib/authFetch';
+interface Segment {
+ id: number;
+ text: string;
+ start: number;
+ end: number;
+}
+
interface VoiceRecorderResult {
fileUrl: string;
mimeType: string;
@@ -16,18 +23,30 @@ interface VoiceRecorderProps {
className?: string;
}
+const VOICE_WS_URL =
+ process.env.NEXT_PUBLIC_VOICE_WS_URL || 'wss://voice.jeffemmett.com';
+
export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
- const [recording, setRecording] = useState(false);
- const [processing, setProcessing] = useState(false);
- const [processingStep, setProcessingStep] = useState('');
+ const [status, setStatus] = useState<'idle' | 'recording' | 'processing'>(
+ 'idle'
+ );
const [elapsed, setElapsed] = useState(0);
+ const [segments, setSegments] = useState<Segment[]>([]);
+ const [isListening, setIsListening] = useState(false);
const [error, setError] = useState<string | null>(null);
const [audioUrl, setAudioUrl] = useState<string | null>(null);
+ const [streaming, setStreaming] = useState(false);
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+ const audioContextRef = useRef<AudioContext | null>(null);
+ const workletNodeRef = useRef<AudioWorkletNode | null>(null);
+ const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
+ const wsRef = useRef<WebSocket | null>(null);
const chunksRef = useRef<Blob[]>([]);
+ const segmentsRef = useRef<Segment[]>([]);
const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
const startTimeRef = useRef(0);
+ const transcriptScrollRef = useRef<HTMLDivElement | null>(null);
useEffect(() => {
return () => {
@@ -36,44 +55,159 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
};
}, [audioUrl]);
+ // Auto-scroll transcript to bottom when new segments arrive
+ useEffect(() => {
+ if (transcriptScrollRef.current) {
+ transcriptScrollRef.current.scrollTop =
+ transcriptScrollRef.current.scrollHeight;
+ }
+ }, [segments]);
+
+ const addSegment = useCallback((seg: Segment) => {
+ segmentsRef.current = [...segmentsRef.current, seg];
+ setSegments([...segmentsRef.current]);
+ }, []);
+
+ const cleanup = useCallback(() => {
+ if (workletNodeRef.current) {
+ workletNodeRef.current.disconnect();
+ workletNodeRef.current = null;
+ }
+ if (sourceNodeRef.current) {
+ sourceNodeRef.current.disconnect();
+ sourceNodeRef.current = null;
+ }
+ if (
+ audioContextRef.current &&
+ audioContextRef.current.state !== 'closed'
+ ) {
+ audioContextRef.current.close().catch(() => {});
+ audioContextRef.current = null;
+ }
+ if (wsRef.current) {
+ if (wsRef.current.readyState === WebSocket.OPEN) {
+ wsRef.current.close();
+ }
+ wsRef.current = null;
+ }
+ }, []);
+
const formatTime = (seconds: number) => {
- const m = Math.floor(seconds / 60).toString().padStart(2, '0');
+ const m = Math.floor(seconds / 60)
+ .toString()
+ .padStart(2, '0');
const s = (seconds % 60).toString().padStart(2, '0');
return `${m}:${s}`;
};
const startRecording = useCallback(async () => {
setError(null);
+ setSegments([]);
+ segmentsRef.current = [];
+ setIsListening(false);
+ setStreaming(false);
+
try {
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
+
+ // Start MediaRecorder for the audio file
const mediaRecorder = new MediaRecorder(stream, {
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
? 'audio/webm;codecs=opus'
: 'audio/webm',
});
-
chunksRef.current = [];
mediaRecorder.ondataavailable = (e) => {
if (e.data.size > 0) chunksRef.current.push(e.data);
};
-
- mediaRecorder.onstop = () => {
- stream.getTracks().forEach((t) => t.stop());
- };
-
mediaRecorder.start(1000);
mediaRecorderRef.current = mediaRecorder;
- startTimeRef.current = Date.now();
- setRecording(true);
- setElapsed(0);
+ // Try to set up WebSocket streaming for live transcription
+ try {
+ const ws = new WebSocket(`${VOICE_WS_URL}/api/voice/stream`);
+ wsRef.current = ws;
+
+ await new Promise<void>((resolve, reject) => {
+ const timeout = setTimeout(() => {
+ ws.close();
+ reject(new Error('WebSocket connection timeout'));
+ }, 5000);
+
+ ws.onopen = () => {
+ clearTimeout(timeout);
+ resolve();
+ };
+ ws.onerror = () => {
+ clearTimeout(timeout);
+ reject(new Error('WebSocket connection failed'));
+ };
+ });
+
+ // WebSocket message handler
+ ws.onmessage = (event) => {
+ try {
+ const data = JSON.parse(event.data);
+ if (data.type === 'listening') {
+ setIsListening(true);
+ setTimeout(() => setIsListening(false), 600);
+ } else if (data.type === 'segment') {
+ addSegment({
+ id: data.id,
+ text: data.text,
+ start: data.start,
+ end: data.end,
+ });
+ }
+ } catch {
+ // Ignore parse errors
+ }
+ };
+
+ // Set up AudioContext at 16kHz and AudioWorklet for PCM16 streaming
+ const audioCtx = new AudioContext({ sampleRate: 16000 });
+ audioContextRef.current = audioCtx;
+ const source = audioCtx.createMediaStreamSource(stream);
+ sourceNodeRef.current = source;
+
+ await audioCtx.audioWorklet.addModule('/pcm-processor.js');
+ const workletNode = new AudioWorkletNode(audioCtx, 'pcm-processor');
+ workletNodeRef.current = workletNode;
+
+ workletNode.port.onmessage = (e) => {
+ if (ws.readyState === WebSocket.OPEN) {
+ ws.send(e.data as ArrayBuffer);
+ }
+ };
+
+ source.connect(workletNode);
+ // Don't connect to destination — we don't want to hear ourselves
+ setStreaming(true);
+ } catch (wsErr) {
+ console.warn(
+ 'WebSocket streaming unavailable, will batch transcribe:',
+ wsErr
+ );
+ setStreaming(false);
+ if (wsRef.current) {
+ wsRef.current.close();
+ wsRef.current = null;
+ }
+ }
+
+ // Start timer
+ startTimeRef.current = Date.now();
+ setStatus('recording');
+ setElapsed(0);
timerRef.current = setInterval(() => {
setElapsed(Math.floor((Date.now() - startTimeRef.current) / 1000));
}, 1000);
} catch (err) {
- setError(err instanceof Error ? err.message : 'Microphone access denied');
+ setError(
+ err instanceof Error ? err.message : 'Microphone access denied'
+ );
}
- }, []);
+ }, [addSegment]);
const stopRecording = useCallback(async () => {
const mediaRecorder = mediaRecorderRef.current;
@@ -84,26 +218,73 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
timerRef.current = null;
}
- const duration = Math.floor((Date.now() - startTimeRef.current) / 1000);
- setRecording(false);
- setProcessing(true);
+ const duration = Math.floor(
+ (Date.now() - startTimeRef.current) / 1000
+ );
+ setStatus('processing');
- // Wait for final data
+ // Stop AudioWorklet streaming
+ if (workletNodeRef.current) {
+ workletNodeRef.current.disconnect();
+ workletNodeRef.current = null;
+ }
+
+ // Send "end" to WebSocket and wait for final segments
+ let wsFullText = '';
+ if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
+ try {
+ const ws = wsRef.current;
+ wsFullText = await new Promise<string>((resolve) => {
+ const timeout = setTimeout(() => resolve(''), 5000);
+
+ const handler = (event: MessageEvent) => {
+ try {
+ const data = JSON.parse(event.data);
+ if (data.type === 'segment') {
+ addSegment({
+ id: data.id,
+ text: data.text,
+ start: data.start,
+ end: data.end,
+ });
+ }
+ if (data.type === 'done') {
+ clearTimeout(timeout);
+ ws.removeEventListener('message', handler);
+ resolve(data.fullText || '');
+ }
+ } catch {
+ // Ignore
+ }
+ };
+
+ ws.addEventListener('message', handler);
+ ws.send(JSON.stringify({ type: 'end' }));
+ });
+ } catch {
+ // Timeout or error — use accumulated segments
+ }
+ }
+
+ // Close WebSocket and AudioContext
+ cleanup();
+
+ // Stop MediaRecorder and collect the audio blob
const blob = await new Promise<Blob>((resolve) => {
mediaRecorder.onstop = () => {
mediaRecorder.stream.getTracks().forEach((t) => t.stop());
- resolve(new Blob(chunksRef.current, { type: mediaRecorder.mimeType }));
+ resolve(
+ new Blob(chunksRef.current, { type: mediaRecorder.mimeType })
+ );
};
mediaRecorder.stop();
});
- // Preview URL
const previewUrl = URL.createObjectURL(blob);
setAudioUrl(previewUrl);
try {
// Upload audio file
- setProcessingStep('Uploading audio...');
const uploadForm = new FormData();
uploadForm.append('file', blob, 'recording.webm');
@@ -119,22 +300,31 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
const uploadResult = await uploadRes.json();
- // Transcribe
- setProcessingStep('Transcribing...');
- const transcribeForm = new FormData();
- transcribeForm.append('audio', blob, 'recording.webm');
+ // Determine transcript: prefer WebSocket fullText, then assembled segments, then batch
+ let transcript = wsFullText;
- const transcribeRes = await authFetch('/api/voice/transcribe', {
- method: 'POST',
- body: transcribeForm,
- });
+ if (!transcript && segmentsRef.current.length > 0) {
+ transcript = segmentsRef.current.map((s) => s.text).join(' ');
+ }
- let transcript = '';
- if (transcribeRes.ok) {
- const transcribeResult = await transcribeRes.json();
- transcript = transcribeResult.text || '';
- } else {
- console.warn('Transcription failed, saving audio without transcript');
+ if (!transcript) {
+ // Fallback: batch transcription via API proxy
+ try {
+ const transcribeForm = new FormData();
+ transcribeForm.append('audio', blob, 'recording.webm');
+
+ const transcribeRes = await authFetch('/api/voice/transcribe', {
+ method: 'POST',
+ body: transcribeForm,
+ });
+
+ if (transcribeRes.ok) {
+ const transcribeResult = await transcribeRes.json();
+ transcript = transcribeResult.text || '';
+ }
+ } catch {
+ console.warn('Batch transcription also failed');
+ }
}
onResult({
@@ -147,18 +337,20 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
} catch (err) {
setError(err instanceof Error ? err.message : 'Processing failed');
} finally {
- setProcessing(false);
- setProcessingStep('');
+ setStatus('idle');
}
- }, [onResult]);
+ }, [onResult, addSegment, cleanup]);
const discard = useCallback(() => {
if (audioUrl) {
URL.revokeObjectURL(audioUrl);
setAudioUrl(null);
}
+ setSegments([]);
+ segmentsRef.current = [];
setElapsed(0);
setError(null);
+ setStatus('idle');
}, [audioUrl]);
return (
@@ -166,14 +358,18 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
{/* Recording controls */}
- {!recording && !processing && !audioUrl && (
+ {status === 'idle' && !audioUrl && (
<>