/**
 * AudioWorklet processor that converts incoming float samples to PCM16.
 *
 * On every `process()` call it reads the first channel of the first input,
 * converts the samples to 16-bit signed integers, and hands the resulting
 * ArrayBuffer to the other side of the processor's message port.
 */
class PCMProcessor extends AudioWorkletProcessor {
  /**
   * @param {Float32Array[][]} inputs Channel data grouped by input.
   * @returns {boolean} Always `true`.
   */
  process(inputs) {
    const [firstInput] = inputs;

    // The first input carries no channels: nothing to forward this call.
    if (firstInput.length === 0) {
      return true;
    }

    // Only the first channel is read; any further channels are ignored.
    const samples = firstInput[0];

    // Clamp to [-1, 1], then scale asymmetrically so that -1 maps to -32768
    // and +1 maps to 32767 (the full Int16 range).
    const frame = Int16Array.from(samples, (sample) => {
      const clamped = Math.max(-1, Math.min(1, sample));
      return clamped < 0 ? clamped * 32768 : clamped * 32767;
    });

    // Listing the buffer as a transferable moves it instead of copying it.
    this.port.postMessage(frame.buffer, [frame.buffer]);
    return true;
  }
}

registerProcessor('pcm-processor', PCMProcessor);
/**
 * Proxies an uploaded audio file to the voice-command API's diarization
 * endpoint and relays the JSON result to the caller.
 *
 * Responses:
 * - whatever `requireAuth` produced when `isAuthed` rejects it
 * - 400 when the `audio` form field is missing, is not a file, or is empty
 * - the upstream status with a generic error body when the upstream call is
 *   not OK (the upstream body is logged, never returned to the client)
 * - 500 when anything throws (unparseable form data, network failure,
 *   non-JSON upstream body)
 */
export async function POST(request: NextRequest) {
  try {
    const auth = await requireAuth(request);
    if (!isAuthed(auth)) return auth;

    const formData = await request.formData();
    const audio = formData.get('audio');

    // `FormData.get` returns a string for plain text fields. Appending a
    // string together with a filename throws, which previously surfaced as a
    // 500 instead of a client error. Empty uploads are rejected here as well
    // rather than being forwarded upstream.
    if (audio === null || typeof audio === 'string' || audio.size === 0) {
      return NextResponse.json({ error: 'No audio file provided' }, { status: 400 });
    }

    // Forward to voice-command API diarization endpoint
    const proxyForm = new FormData();
    proxyForm.append('audio', audio, audio.name || 'recording.webm');

    const res = await fetch(`${VOICE_API_URL}/api/voice/diarize`, {
      method: 'POST',
      body: proxyForm,
    });

    if (!res.ok) {
      const err = await res.text();
      console.error('Diarization API error:', res.status, err);
      return NextResponse.json(
        { error: 'Diarization failed' },
        { status: res.status }
      );
    }

    const result = await res.json();
    return NextResponse.json(result);
  } catch (error) {
    console.error('Diarize proxy error:', error);
    return NextResponse.json({ error: 'Diarization failed' }, { status: 500 });
  }
}
/**
 * Downloads the note's audio file, submits it to `/api/voice/diarize`, and
 * stores the returned speaker segments in state.
 *
 * Does nothing when the note has no `fileUrl` or a request is already in
 * flight. Failures are logged to the console; `diarizing` is always reset.
 */
const handleDiarize = async () => {
  if (!note?.fileUrl || diarizing) return;
  setDiarizing(true);
  try {
    // Fetch the audio file from the server
    const audioRes = await fetch(note.fileUrl);
    if (!audioRes.ok) {
      // Without this check an error page would be uploaded as if it were audio.
      throw new Error(`Failed to fetch audio (${audioRes.status})`);
    }
    const audioBlob = await audioRes.blob();

    const form = new FormData();
    form.append('audio', audioBlob, 'recording.webm');

    const res = await authFetch('/api/voice/diarize', {
      method: 'POST',
      body: form,
    });

    if (!res.ok) {
      console.error('Diarization failed', res.status);
      return;
    }

    const result = await res.json();
    // The render path calls `.map` on this value, so only accept an array.
    setSpeakers(Array.isArray(result?.speakers) ? result.speakers : []);
  } catch (error) {
    console.error('Diarization error:', error);
  } finally {
    setDiarizing(false);
  }
};
@@ -257,7 +288,40 @@ export default function NoteDetailPage() { {note.duration != null && {Math.floor(note.duration / 60)}:{(note.duration % 60).toString().padStart(2, '0')}} {note.mimeType && {note.mimeType}} {note.fileSize && {(note.fileSize / 1024).toFixed(1)} KB} + {!speakers && ( + + )}
+ {speakers && speakers.length > 0 && ( +
+
Speakers
+ {speakers.map((s, i) => { + const colors: Record = { + SPEAKER_00: 'border-blue-500/50 text-blue-300', + SPEAKER_01: 'border-green-500/50 text-green-300', + SPEAKER_02: 'border-purple-500/50 text-purple-300', + SPEAKER_03: 'border-orange-500/50 text-orange-300', + }; + const color = colors[s.speaker] || 'border-slate-500/50 text-slate-300'; + return ( +
+ {s.speaker.replace('SPEAKER_', 'Speaker ')} + + {Math.floor(s.start / 60)}:{Math.floor(s.start % 60).toString().padStart(2, '0')} + – + {Math.floor(s.end / 60)}:{Math.floor(s.end % 60).toString().padStart(2, '0')} + +
+ ); + })} +
+ )} )} diff --git a/src/components/VoiceRecorder.tsx b/src/components/VoiceRecorder.tsx index 8fa4673..4678e81 100644 --- a/src/components/VoiceRecorder.tsx +++ b/src/components/VoiceRecorder.tsx @@ -3,6 +3,13 @@ import { useState, useRef, useCallback, useEffect } from 'react'; import { authFetch } from '@/lib/authFetch'; +interface Segment { + id: number; + text: string; + start: number; + end: number; +} + interface VoiceRecorderResult { fileUrl: string; mimeType: string; @@ -16,18 +23,30 @@ interface VoiceRecorderProps { className?: string; } +const VOICE_WS_URL = + process.env.NEXT_PUBLIC_VOICE_WS_URL || 'wss://voice.jeffemmett.com'; + export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) { - const [recording, setRecording] = useState(false); - const [processing, setProcessing] = useState(false); - const [processingStep, setProcessingStep] = useState(''); + const [status, setStatus] = useState<'idle' | 'recording' | 'processing'>( + 'idle' + ); const [elapsed, setElapsed] = useState(0); + const [segments, setSegments] = useState([]); + const [isListening, setIsListening] = useState(false); const [error, setError] = useState(null); const [audioUrl, setAudioUrl] = useState(null); + const [streaming, setStreaming] = useState(false); const mediaRecorderRef = useRef(null); + const audioContextRef = useRef(null); + const workletNodeRef = useRef(null); + const sourceNodeRef = useRef(null); + const wsRef = useRef(null); const chunksRef = useRef([]); + const segmentsRef = useRef([]); const timerRef = useRef | null>(null); const startTimeRef = useRef(0); + const transcriptScrollRef = useRef(null); useEffect(() => { return () => { @@ -36,44 +55,159 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) { }; }, [audioUrl]); + // Auto-scroll transcript to bottom when new segments arrive + useEffect(() => { + if (transcriptScrollRef.current) { + transcriptScrollRef.current.scrollTop = + transcriptScrollRef.current.scrollHeight; + 
/**
 * Appends one transcript segment.
 *
 * The ref is updated alongside React state so that callbacks created earlier
 * can still read the latest list through `segmentsRef.current`.
 */
const addSegment = useCallback((seg: Segment) => {
  const next = [...segmentsRef.current, seg];
  segmentsRef.current = next;
  setSegments(next);
}, []);

/**
 * Releases the live-streaming resources: worklet node, media source node,
 * AudioContext and WebSocket. Safe to call repeatedly; every ref is cleared.
 */
const cleanup = useCallback(() => {
  workletNodeRef.current?.disconnect();
  workletNodeRef.current = null;

  sourceNodeRef.current?.disconnect();
  sourceNodeRef.current = null;

  // Always drop the reference, even if the context was already closed.
  const audioCtx = audioContextRef.current;
  audioContextRef.current = null;
  if (audioCtx && audioCtx.state !== 'closed') {
    audioCtx.close().catch(() => {});
  }

  // Close sockets that are still connecting too, so they cannot open later
  // with nothing holding a reference to them.
  const ws = wsRef.current;
  wsRef.current = null;
  if (
    ws &&
    (ws.readyState === WebSocket.OPEN ||
      ws.readyState === WebSocket.CONNECTING)
  ) {
    ws.close();
  }
}, []);

/** Formats a whole number of seconds as zero-padded `MM:SS`. */
const formatTime = (seconds: number) => {
  const m = Math.floor(seconds / 60)
    .toString()
    .padStart(2, '0');
  const s = (seconds % 60).toString().padStart(2, '0');
  return `${m}:${s}`;
};

/**
 * Starts a recording.
 *
 * 1. Requests the microphone and starts a MediaRecorder that collects the
 *    audio file in `chunksRef` (data requested in 1000 ms slices).
 * 2. Tries to open the streaming WebSocket and feed it PCM16 frames from the
 *    `pcm-processor` AudioWorklet for live transcript segments. If any step
 *    of that setup throws, the streaming resources are released and the
 *    recording continues without live transcription.
 * 3. Switches status to `recording` and starts the elapsed-time timer.
 *
 * If the microphone or MediaRecorder setup fails, the error message is put in
 * `error` state and any acquired microphone tracks are stopped.
 */
const startRecording = useCallback(async () => {
  setError(null);
  setSegments([]);
  segmentsRef.current = [];
  setIsListening(false);
  setStreaming(false);

  let stream: MediaStream | null = null;
  try {
    stream = await navigator.mediaDevices.getUserMedia({ audio: true });

    // Start MediaRecorder for the audio file
    const mediaRecorder = new MediaRecorder(stream, {
      mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
        ? 'audio/webm;codecs=opus'
        : 'audio/webm',
    });
    chunksRef.current = [];
    mediaRecorder.ondataavailable = (e) => {
      if (e.data.size > 0) chunksRef.current.push(e.data);
    };
    mediaRecorder.start(1000);
    mediaRecorderRef.current = mediaRecorder;

    // Try to set up WebSocket streaming for live transcription
    try {
      const ws = new WebSocket(`${VOICE_WS_URL}/api/voice/stream`);
      wsRef.current = ws;

      // Wait for the socket to open, giving up after 5 seconds.
      await new Promise<void>((resolve, reject) => {
        const timeout = setTimeout(() => {
          ws.close();
          reject(new Error('WebSocket connection timeout'));
        }, 5000);

        ws.onopen = () => {
          clearTimeout(timeout);
          resolve();
        };
        ws.onerror = () => {
          clearTimeout(timeout);
          reject(new Error('WebSocket connection failed'));
        };
      });

      // WebSocket message handler
      ws.onmessage = (event) => {
        try {
          const data = JSON.parse(event.data);
          if (data.type === 'listening') {
            // Brief pulse: the flag resets itself after 600 ms.
            setIsListening(true);
            setTimeout(() => setIsListening(false), 600);
          } else if (data.type === 'segment') {
            addSegment({
              id: data.id,
              text: data.text,
              start: data.start,
              end: data.end,
            });
          }
        } catch {
          // Ignore parse errors
        }
      };

      // Set up AudioContext at 16kHz and AudioWorklet for PCM16 streaming
      const audioCtx = new AudioContext({ sampleRate: 16000 });
      audioContextRef.current = audioCtx;
      const source = audioCtx.createMediaStreamSource(stream);
      sourceNodeRef.current = source;

      await audioCtx.audioWorklet.addModule('/pcm-processor.js');
      const workletNode = new AudioWorkletNode(audioCtx, 'pcm-processor');
      workletNodeRef.current = workletNode;

      workletNode.port.onmessage = (e) => {
        if (ws.readyState === WebSocket.OPEN) {
          ws.send(e.data as ArrayBuffer);
        }
      };

      source.connect(workletNode);
      // Don't connect to destination — we don't want to hear ourselves
      setStreaming(true);
    } catch (wsErr) {
      console.warn(
        'WebSocket streaming unavailable, will batch transcribe:',
        wsErr
      );
      setStreaming(false);
      if (wsRef.current) {
        wsRef.current.close();
        wsRef.current = null;
      }
      // The failure may have happened after the AudioContext and source node
      // were created; release them too instead of only closing the socket.
      cleanup();
    }

    // Start timer
    startTimeRef.current = Date.now();
    setStatus('recording');
    setElapsed(0);
    timerRef.current = setInterval(() => {
      setElapsed(Math.floor((Date.now() - startTimeRef.current) / 1000));
    }, 1000);
  } catch (err) {
    // The microphone may already be open if MediaRecorder setup failed;
    // stop the tracks so the capture does not keep running.
    stream?.getTracks().forEach((t) => t.stop());
    setError(
      err instanceof Error ? err.message : 'Microphone access denied'
    );
  }
}, [addSegment, cleanup]);
type: 'end' })); + }); + } catch { + // Timeout or error — use accumulated segments + } + } + + // Close WebSocket and AudioContext + cleanup(); + + // Stop MediaRecorder and collect the audio blob const blob = await new Promise((resolve) => { mediaRecorder.onstop = () => { mediaRecorder.stream.getTracks().forEach((t) => t.stop()); - resolve(new Blob(chunksRef.current, { type: mediaRecorder.mimeType })); + resolve( + new Blob(chunksRef.current, { type: mediaRecorder.mimeType }) + ); }; mediaRecorder.stop(); }); - // Preview URL const previewUrl = URL.createObjectURL(blob); setAudioUrl(previewUrl); try { // Upload audio file - setProcessingStep('Uploading audio...'); const uploadForm = new FormData(); uploadForm.append('file', blob, 'recording.webm'); @@ -119,22 +300,31 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) { const uploadResult = await uploadRes.json(); - // Transcribe - setProcessingStep('Transcribing...'); - const transcribeForm = new FormData(); - transcribeForm.append('audio', blob, 'recording.webm'); + // Determine transcript: prefer WebSocket fullText, then assembled segments, then batch + let transcript = wsFullText; - const transcribeRes = await authFetch('/api/voice/transcribe', { - method: 'POST', - body: transcribeForm, - }); + if (!transcript && segmentsRef.current.length > 0) { + transcript = segmentsRef.current.map((s) => s.text).join(' '); + } - let transcript = ''; - if (transcribeRes.ok) { - const transcribeResult = await transcribeRes.json(); - transcript = transcribeResult.text || ''; - } else { - console.warn('Transcription failed, saving audio without transcript'); + if (!transcript) { + // Fallback: batch transcription via API proxy + try { + const transcribeForm = new FormData(); + transcribeForm.append('audio', blob, 'recording.webm'); + + const transcribeRes = await authFetch('/api/voice/transcribe', { + method: 'POST', + body: transcribeForm, + }); + + if (transcribeRes.ok) { + const transcribeResult 
/**
 * Resets the recorder to its initial state: clears the transcript segments
 * (ref and state), the elapsed time, the error and the status, and revokes
 * the preview object URL if one exists.
 */
const discard = useCallback(() => {
  // Clear the transcript in both places it is tracked.
  segmentsRef.current = [];
  setSegments([]);

  setStatus('idle');
  setError(null);
  setElapsed(0);

  // No preview URL to release.
  if (!audioUrl) return;

  URL.revokeObjectURL(audioUrl);
  setAudioUrl(null);
}, [audioUrl]);
{/* Recording controls */}
- {!recording && !processing && !audioUrl && ( + {status === 'idle' && !audioUrl && ( <>