feat: live streaming transcription via WebSocket + diarization UI
Add AudioWorklet-based PCM16 streaming to VoiceRecorder with a WebSocket connection for near-real-time transcription. Segments appear as finalized text that never shifts. Add a speaker diarization button on audio notes with color-coded speaker labels. Graceful fallback to batch transcription when the WebSocket is unavailable.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
30f3383d1b
commit
560dceec0f
|
|
@ -12,6 +12,7 @@ services:
|
||||||
- NEXT_PUBLIC_ENCRYPTID_SERVER_URL=${NEXT_PUBLIC_ENCRYPTID_SERVER_URL:-https://encryptid.jeffemmett.com}
|
- NEXT_PUBLIC_ENCRYPTID_SERVER_URL=${NEXT_PUBLIC_ENCRYPTID_SERVER_URL:-https://encryptid.jeffemmett.com}
|
||||||
- RSPACE_INTERNAL_KEY=${RSPACE_INTERNAL_KEY}
|
- RSPACE_INTERNAL_KEY=${RSPACE_INTERNAL_KEY}
|
||||||
- VOICE_API_URL=${VOICE_API_URL:-http://voice-command-api:8000}
|
- VOICE_API_URL=${VOICE_API_URL:-http://voice-command-api:8000}
|
||||||
|
- NEXT_PUBLIC_VOICE_WS_URL=${NEXT_PUBLIC_VOICE_WS_URL:-wss://voice.jeffemmett.com}
|
||||||
volumes:
|
volumes:
|
||||||
- uploads_data:/app/uploads
|
- uploads_data:/app/uploads
|
||||||
labels:
|
labels:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,22 @@
|
||||||
|
/**
 * AudioWorklet processor that converts incoming Float32 audio frames to
 * PCM16 and ships each buffer to the main thread for WebSocket streaming.
 * Runs off the main thread; returning true from process() keeps the
 * processor alive for the lifetime of the node.
 */
class PCMProcessor extends AudioWorkletProcessor {
  process(inputs) {
    const firstInput = inputs[0];
    if (firstInput.length === 0) {
      // No channels delivered this render quantum — keep the node alive.
      return true;
    }

    const samples = firstInput[0]; // first (mono) channel
    const frameCount = samples.length;

    // Scale float32 samples in [-1, 1] to the asymmetric int16 range
    // [-32768, 32767], clamping out-of-range values first.
    const out = new Int16Array(frameCount);
    for (let i = 0; i < frameCount; i += 1) {
      const clamped = Math.min(1, Math.max(-1, samples[i]));
      out[i] = clamped < 0 ? clamped * 32768 : clamped * 32767;
    }

    // Transfer buffer ownership to the main thread to avoid a copy.
    this.port.postMessage(out.buffer, [out.buffer]);
    return true;
  }
}

registerProcessor('pcm-processor', PCMProcessor);
|
||||||
|
|
@ -0,0 +1,42 @@
|
||||||
|
import { NextRequest, NextResponse } from 'next/server';
|
||||||
|
import { requireAuth, isAuthed } from '@/lib/auth';
|
||||||
|
|
||||||
|
const VOICE_API_URL = process.env.VOICE_API_URL || 'http://voice-command-api:8000';
|
||||||
|
|
||||||
|
export async function POST(request: NextRequest) {
|
||||||
|
try {
|
||||||
|
const auth = await requireAuth(request);
|
||||||
|
if (!isAuthed(auth)) return auth;
|
||||||
|
|
||||||
|
const formData = await request.formData();
|
||||||
|
const audio = formData.get('audio') as File | null;
|
||||||
|
|
||||||
|
if (!audio) {
|
||||||
|
return NextResponse.json({ error: 'No audio file provided' }, { status: 400 });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Forward to voice-command API diarization endpoint
|
||||||
|
const proxyForm = new FormData();
|
||||||
|
proxyForm.append('audio', audio, audio.name || 'recording.webm');
|
||||||
|
|
||||||
|
const res = await fetch(`${VOICE_API_URL}/api/voice/diarize`, {
|
||||||
|
method: 'POST',
|
||||||
|
body: proxyForm,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!res.ok) {
|
||||||
|
const err = await res.text();
|
||||||
|
console.error('Diarization API error:', res.status, err);
|
||||||
|
return NextResponse.json(
|
||||||
|
{ error: 'Diarization failed' },
|
||||||
|
{ status: res.status }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
const result = await res.json();
|
||||||
|
return NextResponse.json(result);
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Diarize proxy error:', error);
|
||||||
|
return NextResponse.json({ error: 'Diarization failed' }, { status: 500 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -47,6 +47,8 @@ export default function NoteDetailPage() {
|
||||||
const [editTitle, setEditTitle] = useState('');
|
const [editTitle, setEditTitle] = useState('');
|
||||||
const [editContent, setEditContent] = useState('');
|
const [editContent, setEditContent] = useState('');
|
||||||
const [saving, setSaving] = useState(false);
|
const [saving, setSaving] = useState(false);
|
||||||
|
const [diarizing, setDiarizing] = useState(false);
|
||||||
|
const [speakers, setSpeakers] = useState<{ speaker: string; start: number; end: number }[] | null>(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
fetch(`/api/notes/${params.id}`)
|
fetch(`/api/notes/${params.id}`)
|
||||||
|
|
@ -104,6 +106,35 @@ export default function NoteDetailPage() {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const handleDiarize = async () => {
|
||||||
|
if (!note?.fileUrl || diarizing) return;
|
||||||
|
setDiarizing(true);
|
||||||
|
try {
|
||||||
|
// Fetch the audio file from the server
|
||||||
|
const audioRes = await fetch(note.fileUrl);
|
||||||
|
const audioBlob = await audioRes.blob();
|
||||||
|
|
||||||
|
const form = new FormData();
|
||||||
|
form.append('audio', audioBlob, 'recording.webm');
|
||||||
|
|
||||||
|
const res = await authFetch('/api/voice/diarize', {
|
||||||
|
method: 'POST',
|
||||||
|
body: form,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (res.ok) {
|
||||||
|
const result = await res.json();
|
||||||
|
setSpeakers(result.speakers || []);
|
||||||
|
} else {
|
||||||
|
console.error('Diarization failed');
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.error('Diarization error:', error);
|
||||||
|
} finally {
|
||||||
|
setDiarizing(false);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
if (loading) {
|
if (loading) {
|
||||||
return (
|
return (
|
||||||
<div className="min-h-screen bg-[#0a0a0a] flex items-center justify-center">
|
<div className="min-h-screen bg-[#0a0a0a] flex items-center justify-center">
|
||||||
|
|
@ -257,7 +288,40 @@ export default function NoteDetailPage() {
|
||||||
{note.duration != null && <span>{Math.floor(note.duration / 60)}:{(note.duration % 60).toString().padStart(2, '0')}</span>}
|
{note.duration != null && <span>{Math.floor(note.duration / 60)}:{(note.duration % 60).toString().padStart(2, '0')}</span>}
|
||||||
{note.mimeType && <span>{note.mimeType}</span>}
|
{note.mimeType && <span>{note.mimeType}</span>}
|
||||||
{note.fileSize && <span>{(note.fileSize / 1024).toFixed(1)} KB</span>}
|
{note.fileSize && <span>{(note.fileSize / 1024).toFixed(1)} KB</span>}
|
||||||
|
{!speakers && (
|
||||||
|
<button
|
||||||
|
onClick={handleDiarize}
|
||||||
|
disabled={diarizing}
|
||||||
|
className="ml-auto px-3 py-1 text-xs rounded-lg border border-slate-600 text-slate-400 hover:text-white hover:border-slate-500 transition-colors disabled:opacity-50"
|
||||||
|
>
|
||||||
|
{diarizing ? 'Identifying speakers...' : 'Identify speakers'}
|
||||||
|
</button>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
{speakers && speakers.length > 0 && (
|
||||||
|
<div className="pt-2 border-t border-slate-700 space-y-1.5">
|
||||||
|
<div className="text-xs text-slate-500 uppercase tracking-wider mb-2">Speakers</div>
|
||||||
|
{speakers.map((s, i) => {
|
||||||
|
const colors: Record<string, string> = {
|
||||||
|
SPEAKER_00: 'border-blue-500/50 text-blue-300',
|
||||||
|
SPEAKER_01: 'border-green-500/50 text-green-300',
|
||||||
|
SPEAKER_02: 'border-purple-500/50 text-purple-300',
|
||||||
|
SPEAKER_03: 'border-orange-500/50 text-orange-300',
|
||||||
|
};
|
||||||
|
const color = colors[s.speaker] || 'border-slate-500/50 text-slate-300';
|
||||||
|
return (
|
||||||
|
<div key={i} className={`text-xs px-2 py-1.5 rounded border-l-2 bg-slate-800/50 ${color}`}>
|
||||||
|
<span className="font-medium">{s.speaker.replace('SPEAKER_', 'Speaker ')}</span>
|
||||||
|
<span className="text-slate-500 ml-2">
|
||||||
|
{Math.floor(s.start / 60)}:{Math.floor(s.start % 60).toString().padStart(2, '0')}
|
||||||
|
–
|
||||||
|
{Math.floor(s.end / 60)}:{Math.floor(s.end % 60).toString().padStart(2, '0')}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
})}
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,13 @@
|
||||||
import { useState, useRef, useCallback, useEffect } from 'react';
|
import { useState, useRef, useCallback, useEffect } from 'react';
|
||||||
import { authFetch } from '@/lib/authFetch';
|
import { authFetch } from '@/lib/authFetch';
|
||||||
|
|
||||||
|
interface Segment {
|
||||||
|
id: number;
|
||||||
|
text: string;
|
||||||
|
start: number;
|
||||||
|
end: number;
|
||||||
|
}
|
||||||
|
|
||||||
interface VoiceRecorderResult {
|
interface VoiceRecorderResult {
|
||||||
fileUrl: string;
|
fileUrl: string;
|
||||||
mimeType: string;
|
mimeType: string;
|
||||||
|
|
@ -16,18 +23,30 @@ interface VoiceRecorderProps {
|
||||||
className?: string;
|
className?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const VOICE_WS_URL =
|
||||||
|
process.env.NEXT_PUBLIC_VOICE_WS_URL || 'wss://voice.jeffemmett.com';
|
||||||
|
|
||||||
export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
const [recording, setRecording] = useState(false);
|
const [status, setStatus] = useState<'idle' | 'recording' | 'processing'>(
|
||||||
const [processing, setProcessing] = useState(false);
|
'idle'
|
||||||
const [processingStep, setProcessingStep] = useState('');
|
);
|
||||||
const [elapsed, setElapsed] = useState(0);
|
const [elapsed, setElapsed] = useState(0);
|
||||||
|
const [segments, setSegments] = useState<Segment[]>([]);
|
||||||
|
const [isListening, setIsListening] = useState(false);
|
||||||
const [error, setError] = useState<string | null>(null);
|
const [error, setError] = useState<string | null>(null);
|
||||||
const [audioUrl, setAudioUrl] = useState<string | null>(null);
|
const [audioUrl, setAudioUrl] = useState<string | null>(null);
|
||||||
|
const [streaming, setStreaming] = useState(false);
|
||||||
|
|
||||||
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
const mediaRecorderRef = useRef<MediaRecorder | null>(null);
|
||||||
|
const audioContextRef = useRef<AudioContext | null>(null);
|
||||||
|
const workletNodeRef = useRef<AudioWorkletNode | null>(null);
|
||||||
|
const sourceNodeRef = useRef<MediaStreamAudioSourceNode | null>(null);
|
||||||
|
const wsRef = useRef<WebSocket | null>(null);
|
||||||
const chunksRef = useRef<Blob[]>([]);
|
const chunksRef = useRef<Blob[]>([]);
|
||||||
|
const segmentsRef = useRef<Segment[]>([]);
|
||||||
const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
const timerRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||||
const startTimeRef = useRef<number>(0);
|
const startTimeRef = useRef<number>(0);
|
||||||
|
const transcriptScrollRef = useRef<HTMLDivElement | null>(null);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
return () => {
|
return () => {
|
||||||
|
|
@ -36,44 +55,159 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
};
|
};
|
||||||
}, [audioUrl]);
|
}, [audioUrl]);
|
||||||
|
|
||||||
|
// Auto-scroll transcript to bottom when new segments arrive
|
||||||
|
useEffect(() => {
|
||||||
|
if (transcriptScrollRef.current) {
|
||||||
|
transcriptScrollRef.current.scrollTop =
|
||||||
|
transcriptScrollRef.current.scrollHeight;
|
||||||
|
}
|
||||||
|
}, [segments]);
|
||||||
|
|
||||||
|
const addSegment = useCallback((seg: Segment) => {
|
||||||
|
segmentsRef.current = [...segmentsRef.current, seg];
|
||||||
|
setSegments([...segmentsRef.current]);
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const cleanup = useCallback(() => {
|
||||||
|
if (workletNodeRef.current) {
|
||||||
|
workletNodeRef.current.disconnect();
|
||||||
|
workletNodeRef.current = null;
|
||||||
|
}
|
||||||
|
if (sourceNodeRef.current) {
|
||||||
|
sourceNodeRef.current.disconnect();
|
||||||
|
sourceNodeRef.current = null;
|
||||||
|
}
|
||||||
|
if (
|
||||||
|
audioContextRef.current &&
|
||||||
|
audioContextRef.current.state !== 'closed'
|
||||||
|
) {
|
||||||
|
audioContextRef.current.close().catch(() => {});
|
||||||
|
audioContextRef.current = null;
|
||||||
|
}
|
||||||
|
if (wsRef.current) {
|
||||||
|
if (wsRef.current.readyState === WebSocket.OPEN) {
|
||||||
|
wsRef.current.close();
|
||||||
|
}
|
||||||
|
wsRef.current = null;
|
||||||
|
}
|
||||||
|
}, []);
|
||||||
|
|
||||||
const formatTime = (seconds: number) => {
|
const formatTime = (seconds: number) => {
|
||||||
const m = Math.floor(seconds / 60).toString().padStart(2, '0');
|
const m = Math.floor(seconds / 60)
|
||||||
|
.toString()
|
||||||
|
.padStart(2, '0');
|
||||||
const s = (seconds % 60).toString().padStart(2, '0');
|
const s = (seconds % 60).toString().padStart(2, '0');
|
||||||
return `${m}:${s}`;
|
return `${m}:${s}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const startRecording = useCallback(async () => {
|
const startRecording = useCallback(async () => {
|
||||||
setError(null);
|
setError(null);
|
||||||
|
setSegments([]);
|
||||||
|
segmentsRef.current = [];
|
||||||
|
setIsListening(false);
|
||||||
|
setStreaming(false);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
|
||||||
|
|
||||||
|
// Start MediaRecorder for the audio file
|
||||||
const mediaRecorder = new MediaRecorder(stream, {
|
const mediaRecorder = new MediaRecorder(stream, {
|
||||||
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
mimeType: MediaRecorder.isTypeSupported('audio/webm;codecs=opus')
|
||||||
? 'audio/webm;codecs=opus'
|
? 'audio/webm;codecs=opus'
|
||||||
: 'audio/webm',
|
: 'audio/webm',
|
||||||
});
|
});
|
||||||
|
|
||||||
chunksRef.current = [];
|
chunksRef.current = [];
|
||||||
mediaRecorder.ondataavailable = (e) => {
|
mediaRecorder.ondataavailable = (e) => {
|
||||||
if (e.data.size > 0) chunksRef.current.push(e.data);
|
if (e.data.size > 0) chunksRef.current.push(e.data);
|
||||||
};
|
};
|
||||||
|
|
||||||
mediaRecorder.onstop = () => {
|
|
||||||
stream.getTracks().forEach((t) => t.stop());
|
|
||||||
};
|
|
||||||
|
|
||||||
mediaRecorder.start(1000);
|
mediaRecorder.start(1000);
|
||||||
mediaRecorderRef.current = mediaRecorder;
|
mediaRecorderRef.current = mediaRecorder;
|
||||||
startTimeRef.current = Date.now();
|
|
||||||
setRecording(true);
|
|
||||||
setElapsed(0);
|
|
||||||
|
|
||||||
|
// Try to set up WebSocket streaming for live transcription
|
||||||
|
try {
|
||||||
|
const ws = new WebSocket(`${VOICE_WS_URL}/api/voice/stream`);
|
||||||
|
wsRef.current = ws;
|
||||||
|
|
||||||
|
await new Promise<void>((resolve, reject) => {
|
||||||
|
const timeout = setTimeout(() => {
|
||||||
|
ws.close();
|
||||||
|
reject(new Error('WebSocket connection timeout'));
|
||||||
|
}, 5000);
|
||||||
|
|
||||||
|
ws.onopen = () => {
|
||||||
|
clearTimeout(timeout);
|
||||||
|
resolve();
|
||||||
|
};
|
||||||
|
ws.onerror = () => {
|
||||||
|
clearTimeout(timeout);
|
||||||
|
reject(new Error('WebSocket connection failed'));
|
||||||
|
};
|
||||||
|
});
|
||||||
|
|
||||||
|
// WebSocket message handler
|
||||||
|
ws.onmessage = (event) => {
|
||||||
|
try {
|
||||||
|
const data = JSON.parse(event.data);
|
||||||
|
if (data.type === 'listening') {
|
||||||
|
setIsListening(true);
|
||||||
|
setTimeout(() => setIsListening(false), 600);
|
||||||
|
} else if (data.type === 'segment') {
|
||||||
|
addSegment({
|
||||||
|
id: data.id,
|
||||||
|
text: data.text,
|
||||||
|
start: data.start,
|
||||||
|
end: data.end,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Ignore parse errors
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Set up AudioContext at 16kHz and AudioWorklet for PCM16 streaming
|
||||||
|
const audioCtx = new AudioContext({ sampleRate: 16000 });
|
||||||
|
audioContextRef.current = audioCtx;
|
||||||
|
const source = audioCtx.createMediaStreamSource(stream);
|
||||||
|
sourceNodeRef.current = source;
|
||||||
|
|
||||||
|
await audioCtx.audioWorklet.addModule('/pcm-processor.js');
|
||||||
|
const workletNode = new AudioWorkletNode(audioCtx, 'pcm-processor');
|
||||||
|
workletNodeRef.current = workletNode;
|
||||||
|
|
||||||
|
workletNode.port.onmessage = (e) => {
|
||||||
|
if (ws.readyState === WebSocket.OPEN) {
|
||||||
|
ws.send(e.data as ArrayBuffer);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
source.connect(workletNode);
|
||||||
|
// Don't connect to destination — we don't want to hear ourselves
|
||||||
|
setStreaming(true);
|
||||||
|
} catch (wsErr) {
|
||||||
|
console.warn(
|
||||||
|
'WebSocket streaming unavailable, will batch transcribe:',
|
||||||
|
wsErr
|
||||||
|
);
|
||||||
|
setStreaming(false);
|
||||||
|
if (wsRef.current) {
|
||||||
|
wsRef.current.close();
|
||||||
|
wsRef.current = null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start timer
|
||||||
|
startTimeRef.current = Date.now();
|
||||||
|
setStatus('recording');
|
||||||
|
setElapsed(0);
|
||||||
timerRef.current = setInterval(() => {
|
timerRef.current = setInterval(() => {
|
||||||
setElapsed(Math.floor((Date.now() - startTimeRef.current) / 1000));
|
setElapsed(Math.floor((Date.now() - startTimeRef.current) / 1000));
|
||||||
}, 1000);
|
}, 1000);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
setError(err instanceof Error ? err.message : 'Microphone access denied');
|
setError(
|
||||||
|
err instanceof Error ? err.message : 'Microphone access denied'
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}, []);
|
}, [addSegment]);
|
||||||
|
|
||||||
const stopRecording = useCallback(async () => {
|
const stopRecording = useCallback(async () => {
|
||||||
const mediaRecorder = mediaRecorderRef.current;
|
const mediaRecorder = mediaRecorderRef.current;
|
||||||
|
|
@ -84,26 +218,73 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
timerRef.current = null;
|
timerRef.current = null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const duration = Math.floor((Date.now() - startTimeRef.current) / 1000);
|
const duration = Math.floor(
|
||||||
setRecording(false);
|
(Date.now() - startTimeRef.current) / 1000
|
||||||
setProcessing(true);
|
);
|
||||||
|
setStatus('processing');
|
||||||
|
|
||||||
// Wait for final data
|
// Stop AudioWorklet streaming
|
||||||
|
if (workletNodeRef.current) {
|
||||||
|
workletNodeRef.current.disconnect();
|
||||||
|
workletNodeRef.current = null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Send "end" to WebSocket and wait for final segments
|
||||||
|
let wsFullText = '';
|
||||||
|
if (wsRef.current && wsRef.current.readyState === WebSocket.OPEN) {
|
||||||
|
try {
|
||||||
|
const ws = wsRef.current;
|
||||||
|
wsFullText = await new Promise<string>((resolve) => {
|
||||||
|
const timeout = setTimeout(() => resolve(''), 5000);
|
||||||
|
|
||||||
|
const handler = (event: MessageEvent) => {
|
||||||
|
try {
|
||||||
|
const data = JSON.parse(event.data);
|
||||||
|
if (data.type === 'segment') {
|
||||||
|
addSegment({
|
||||||
|
id: data.id,
|
||||||
|
text: data.text,
|
||||||
|
start: data.start,
|
||||||
|
end: data.end,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
if (data.type === 'done') {
|
||||||
|
clearTimeout(timeout);
|
||||||
|
ws.removeEventListener('message', handler);
|
||||||
|
resolve(data.fullText || '');
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
// Ignore
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
ws.addEventListener('message', handler);
|
||||||
|
ws.send(JSON.stringify({ type: 'end' }));
|
||||||
|
});
|
||||||
|
} catch {
|
||||||
|
// Timeout or error — use accumulated segments
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close WebSocket and AudioContext
|
||||||
|
cleanup();
|
||||||
|
|
||||||
|
// Stop MediaRecorder and collect the audio blob
|
||||||
const blob = await new Promise<Blob>((resolve) => {
|
const blob = await new Promise<Blob>((resolve) => {
|
||||||
mediaRecorder.onstop = () => {
|
mediaRecorder.onstop = () => {
|
||||||
mediaRecorder.stream.getTracks().forEach((t) => t.stop());
|
mediaRecorder.stream.getTracks().forEach((t) => t.stop());
|
||||||
resolve(new Blob(chunksRef.current, { type: mediaRecorder.mimeType }));
|
resolve(
|
||||||
|
new Blob(chunksRef.current, { type: mediaRecorder.mimeType })
|
||||||
|
);
|
||||||
};
|
};
|
||||||
mediaRecorder.stop();
|
mediaRecorder.stop();
|
||||||
});
|
});
|
||||||
|
|
||||||
// Preview URL
|
|
||||||
const previewUrl = URL.createObjectURL(blob);
|
const previewUrl = URL.createObjectURL(blob);
|
||||||
setAudioUrl(previewUrl);
|
setAudioUrl(previewUrl);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Upload audio file
|
// Upload audio file
|
||||||
setProcessingStep('Uploading audio...');
|
|
||||||
const uploadForm = new FormData();
|
const uploadForm = new FormData();
|
||||||
uploadForm.append('file', blob, 'recording.webm');
|
uploadForm.append('file', blob, 'recording.webm');
|
||||||
|
|
||||||
|
|
@ -119,22 +300,31 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
|
|
||||||
const uploadResult = await uploadRes.json();
|
const uploadResult = await uploadRes.json();
|
||||||
|
|
||||||
// Transcribe
|
// Determine transcript: prefer WebSocket fullText, then assembled segments, then batch
|
||||||
setProcessingStep('Transcribing...');
|
let transcript = wsFullText;
|
||||||
const transcribeForm = new FormData();
|
|
||||||
transcribeForm.append('audio', blob, 'recording.webm');
|
|
||||||
|
|
||||||
const transcribeRes = await authFetch('/api/voice/transcribe', {
|
if (!transcript && segmentsRef.current.length > 0) {
|
||||||
method: 'POST',
|
transcript = segmentsRef.current.map((s) => s.text).join(' ');
|
||||||
body: transcribeForm,
|
}
|
||||||
});
|
|
||||||
|
|
||||||
let transcript = '';
|
if (!transcript) {
|
||||||
if (transcribeRes.ok) {
|
// Fallback: batch transcription via API proxy
|
||||||
const transcribeResult = await transcribeRes.json();
|
try {
|
||||||
transcript = transcribeResult.text || '';
|
const transcribeForm = new FormData();
|
||||||
} else {
|
transcribeForm.append('audio', blob, 'recording.webm');
|
||||||
console.warn('Transcription failed, saving audio without transcript');
|
|
||||||
|
const transcribeRes = await authFetch('/api/voice/transcribe', {
|
||||||
|
method: 'POST',
|
||||||
|
body: transcribeForm,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (transcribeRes.ok) {
|
||||||
|
const transcribeResult = await transcribeRes.json();
|
||||||
|
transcript = transcribeResult.text || '';
|
||||||
|
}
|
||||||
|
} catch {
|
||||||
|
console.warn('Batch transcription also failed');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
onResult({
|
onResult({
|
||||||
|
|
@ -147,18 +337,20 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
setError(err instanceof Error ? err.message : 'Processing failed');
|
setError(err instanceof Error ? err.message : 'Processing failed');
|
||||||
} finally {
|
} finally {
|
||||||
setProcessing(false);
|
setStatus('idle');
|
||||||
setProcessingStep('');
|
|
||||||
}
|
}
|
||||||
}, [onResult]);
|
}, [onResult, addSegment, cleanup]);
|
||||||
|
|
||||||
const discard = useCallback(() => {
|
const discard = useCallback(() => {
|
||||||
if (audioUrl) {
|
if (audioUrl) {
|
||||||
URL.revokeObjectURL(audioUrl);
|
URL.revokeObjectURL(audioUrl);
|
||||||
setAudioUrl(null);
|
setAudioUrl(null);
|
||||||
}
|
}
|
||||||
|
setSegments([]);
|
||||||
|
segmentsRef.current = [];
|
||||||
setElapsed(0);
|
setElapsed(0);
|
||||||
setError(null);
|
setError(null);
|
||||||
|
setStatus('idle');
|
||||||
}, [audioUrl]);
|
}, [audioUrl]);
|
||||||
|
|
||||||
return (
|
return (
|
||||||
|
|
@ -166,14 +358,18 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
<div className="border border-slate-700 rounded-lg p-6 bg-slate-800/30">
|
<div className="border border-slate-700 rounded-lg p-6 bg-slate-800/30">
|
||||||
{/* Recording controls */}
|
{/* Recording controls */}
|
||||||
<div className="flex flex-col items-center gap-4">
|
<div className="flex flex-col items-center gap-4">
|
||||||
{!recording && !processing && !audioUrl && (
|
{status === 'idle' && !audioUrl && (
|
||||||
<>
|
<>
|
||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
onClick={startRecording}
|
onClick={startRecording}
|
||||||
className="w-20 h-20 rounded-full bg-red-500 hover:bg-red-400 transition-colors flex items-center justify-center"
|
className="w-20 h-20 rounded-full bg-red-500 hover:bg-red-400 transition-colors flex items-center justify-center"
|
||||||
>
|
>
|
||||||
<svg className="w-8 h-8 text-white" fill="currentColor" viewBox="0 0 24 24">
|
<svg
|
||||||
|
className="w-8 h-8 text-white"
|
||||||
|
fill="currentColor"
|
||||||
|
viewBox="0 0 24 24"
|
||||||
|
>
|
||||||
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm-1-9c0-.55.45-1 1-1s1 .45 1 1v6c0 .55-.45 1-1 1s-1-.45-1-1V5z" />
|
<path d="M12 14c1.66 0 3-1.34 3-3V5c0-1.66-1.34-3-3-3S9 3.34 9 5v6c0 1.66 1.34 3 3 3zm-1-9c0-.55.45-1 1-1s1 .45 1 1v6c0 .55-.45 1-1 1s-1-.45-1-1V5z" />
|
||||||
<path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z" />
|
<path d="M17 11c0 2.76-2.24 5-5 5s-5-2.24-5-5H5c0 3.53 2.61 6.43 6 6.92V21h2v-3.08c3.39-.49 6-3.39 6-6.92h-2z" />
|
||||||
</svg>
|
</svg>
|
||||||
|
|
@ -182,11 +378,24 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{recording && (
|
{status === 'recording' && (
|
||||||
<>
|
<>
|
||||||
<div className="flex items-center gap-3">
|
<div className="flex items-center gap-3">
|
||||||
<span className="w-3 h-3 rounded-full bg-red-500 animate-pulse" />
|
<span
|
||||||
<span className="text-2xl font-mono text-white">{formatTime(elapsed)}</span>
|
className={`w-3 h-3 rounded-full transition-colors ${
|
||||||
|
isListening
|
||||||
|
? 'bg-green-400 animate-pulse'
|
||||||
|
: 'bg-red-500 animate-pulse'
|
||||||
|
}`}
|
||||||
|
/>
|
||||||
|
<span className="text-2xl font-mono text-white">
|
||||||
|
{formatTime(elapsed)}
|
||||||
|
</span>
|
||||||
|
{streaming && (
|
||||||
|
<span className="text-xs text-green-400/70 font-medium tracking-wider">
|
||||||
|
LIVE
|
||||||
|
</span>
|
||||||
|
)}
|
||||||
</div>
|
</div>
|
||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
|
|
@ -199,21 +408,40 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
</>
|
</>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{processing && (
|
{status === 'processing' && (
|
||||||
<div className="flex flex-col items-center gap-3 py-4">
|
<div className="flex flex-col items-center gap-3 py-4">
|
||||||
<svg className="animate-spin h-8 w-8 text-amber-400" viewBox="0 0 24 24">
|
<svg
|
||||||
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" fill="none" />
|
className="animate-spin h-8 w-8 text-amber-400"
|
||||||
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z" />
|
viewBox="0 0 24 24"
|
||||||
|
>
|
||||||
|
<circle
|
||||||
|
className="opacity-25"
|
||||||
|
cx="12"
|
||||||
|
cy="12"
|
||||||
|
r="10"
|
||||||
|
stroke="currentColor"
|
||||||
|
strokeWidth="4"
|
||||||
|
fill="none"
|
||||||
|
/>
|
||||||
|
<path
|
||||||
|
className="opacity-75"
|
||||||
|
fill="currentColor"
|
||||||
|
d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"
|
||||||
|
/>
|
||||||
</svg>
|
</svg>
|
||||||
<p className="text-sm text-slate-400">{processingStep}</p>
|
<p className="text-sm text-slate-400">
|
||||||
|
Finalizing transcription...
|
||||||
|
</p>
|
||||||
</div>
|
</div>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
{audioUrl && !processing && (
|
{audioUrl && status === 'idle' && (
|
||||||
<div className="w-full space-y-3">
|
<div className="w-full space-y-3">
|
||||||
<audio controls src={audioUrl} className="w-full" />
|
<audio controls src={audioUrl} className="w-full" />
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<span className="text-sm text-slate-400">{formatTime(elapsed)} recorded</span>
|
<span className="text-sm text-slate-400">
|
||||||
|
{formatTime(elapsed)} recorded
|
||||||
|
</span>
|
||||||
<button
|
<button
|
||||||
type="button"
|
type="button"
|
||||||
onClick={discard}
|
onClick={discard}
|
||||||
|
|
@ -226,6 +454,28 @@ export function VoiceRecorder({ onResult, className }: VoiceRecorderProps) {
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
{/* Live transcript segments */}
|
||||||
|
{segments.length > 0 && (
|
||||||
|
<div className="mt-4">
|
||||||
|
<div className="text-xs text-slate-500 uppercase tracking-wider mb-2">
|
||||||
|
Live Transcript
|
||||||
|
</div>
|
||||||
|
<div
|
||||||
|
ref={transcriptScrollRef}
|
||||||
|
className="space-y-1.5 max-h-48 overflow-y-auto"
|
||||||
|
>
|
||||||
|
{segments.map((seg) => (
|
||||||
|
<div
|
||||||
|
key={seg.id}
|
||||||
|
className="text-sm text-slate-300 px-3 py-2 bg-slate-800/50 rounded border-l-2 border-amber-500/30 animate-in fade-in slide-in-from-bottom-1 duration-300"
|
||||||
|
>
|
||||||
|
{seg.text}
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
)}
|
||||||
|
|
||||||
{error && (
|
{error && (
|
||||||
<p className="text-red-400 text-sm mt-4 text-center">{error}</p>
|
<p className="text-red-400 text-sm mt-4 text-center">{error}</p>
|
||||||
)}
|
)}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue