fix: output unified transcript instead of echoed segments

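The Web Speech API hook was sending each recognition result as a
separate fragment via onTranscriptUpdate, and the transcription shape
appended every payload to its existing text. Because the pause handler
sent the full transcript through the same callback, the shape ended up
displaying a fragmented, echoed conversation. Now the hook accumulates
the full transcript internally and always sends the complete text, and
the shape replaces its text instead of appending, so it receives one
unified conversation.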

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-03-18 10:02:59 +00:00
parent 526a4c4b9d
commit 007a25d3da
2 changed files with 30 additions and 69 deletions

View File

@ -79,8 +79,6 @@ export const useWebSpeechTranscription = ({
const interimTranscriptRef = useRef('') const interimTranscriptRef = useRef('')
const lastSpeechTimeRef = useRef<number>(0) const lastSpeechTimeRef = useRef<number>(0)
const pauseTimeoutRef = useRef<NodeJS.Timeout | null>(null) const pauseTimeoutRef = useRef<NodeJS.Timeout | null>(null)
const lastConfidenceRef = useRef<number>(0)
const speakerChangeThreshold = 0.3 // Threshold for detecting speaker changes
// Function to add line breaks after pauses and improve punctuation // Function to add line breaks after pauses and improve punctuation
const processTranscript = useCallback((text: string, isFinal: boolean = false) => { const processTranscript = useCallback((text: string, isFinal: boolean = false) => {
@ -109,30 +107,6 @@ export const useWebSpeechTranscription = ({
return processedText return processedText
}, []) }, [])
// Function to detect speaker changes based on confidence and timing
const detectSpeakerChange = useCallback((confidence: number) => {
if (lastConfidenceRef.current === 0) {
lastConfidenceRef.current = confidence
return false
}
const confidenceDiff = Math.abs(confidence - lastConfidenceRef.current)
const now = Date.now()
const timeSinceLastSpeech = now - lastSpeechTimeRef.current
// Detect speaker change if confidence changes significantly and there's been a pause
const isSpeakerChange = confidenceDiff > speakerChangeThreshold && timeSinceLastSpeech > 1000
if (isSpeakerChange) {
// Reduced debug logging
lastConfidenceRef.current = confidence
return true
}
lastConfidenceRef.current = confidence
return false
}, [speakerChangeThreshold])
// Function to handle pause detection // Function to handle pause detection
const handlePauseDetection = useCallback(() => { const handlePauseDetection = useCallback(() => {
// Clear existing timeout // Clear existing timeout
@ -140,19 +114,21 @@ export const useWebSpeechTranscription = ({
clearTimeout(pauseTimeoutRef.current) clearTimeout(pauseTimeoutRef.current)
} }
// Set new timeout for pause detection // Set new timeout for pause detection — after a long silence,
// append a newline so the next utterance starts on a new line.
pauseTimeoutRef.current = setTimeout(() => { pauseTimeoutRef.current = setTimeout(() => {
const now = Date.now() const now = Date.now()
const timeSinceLastSpeech = now - lastSpeechTimeRef.current const timeSinceLastSpeech = now - lastSpeechTimeRef.current
// If more than 2 seconds of silence, add a line break to interim transcript
if (timeSinceLastSpeech > 2000 && lastSpeechTimeRef.current > 0) { if (timeSinceLastSpeech > 2000 && lastSpeechTimeRef.current > 0) {
const currentTranscript = finalTranscriptRef.current + '\n' // Only append the newline if the transcript doesn't already end with one
setTranscript(currentTranscript) if (finalTranscriptRef.current && !finalTranscriptRef.current.endsWith('\n')) {
onTranscriptUpdate?.(currentTranscript) finalTranscriptRef.current += '\n'
// Reduced debug logging setTranscript(finalTranscriptRef.current)
onTranscriptUpdate?.(finalTranscriptRef.current)
}
} }
}, 2000) // Check after 2 seconds of silence }, 2000)
}, [onTranscriptUpdate]) }, [onTranscriptUpdate])
// Check if Web Speech API is supported // Check if Web Speech API is supported
@ -202,24 +178,20 @@ export const useWebSpeechTranscription = ({
// Update final transcript with processing // Update final transcript with processing
if (finalTranscript) { if (finalTranscript) {
// Get confidence from the first result
const confidence = event.results[event.results.length - 1]?.[0]?.confidence || 0
// Detect speaker change
const isSpeakerChange = detectSpeakerChange(confidence)
// Add speaker indicator if change detected
let speakerPrefix = ''
if (isSpeakerChange) {
speakerPrefix = '\n[Speaker Change]\n'
}
const processedFinal = processTranscript(finalTranscript, true) const processedFinal = processTranscript(finalTranscript, true)
const newText = speakerPrefix + processedFinal
finalTranscriptRef.current += newText // Append to running transcript with a space separator
if (finalTranscriptRef.current) {
finalTranscriptRef.current += ' ' + processedFinal
} else {
finalTranscriptRef.current = processedFinal
}
// Always send the full accumulated transcript so the shape
// receives one unified conversation instead of fragments
setTranscript(finalTranscriptRef.current) setTranscript(finalTranscriptRef.current)
onTranscriptUpdate?.(newText) // Only send the new text portion onTranscriptUpdate?.(finalTranscriptRef.current)
// Trigger pause detection // Trigger pause detection
handlePauseDetection() handlePauseDetection()
} }
@ -262,7 +234,6 @@ export const useWebSpeechTranscription = ({
// setTranscript('') // setTranscript('')
// setInterimTranscript('') // setInterimTranscript('')
lastSpeechTimeRef.current = 0 lastSpeechTimeRef.current = 0
lastConfidenceRef.current = 0
// Clear any existing pause timeout // Clear any existing pause timeout
if (pauseTimeoutRef.current) { if (pauseTimeoutRef.current) {

View File

@ -191,35 +191,25 @@ export class TranscriptionShape extends BaseBoxShapeUtil<ITranscription> {
// Web Speech API hook for real-time transcription // Web Speech API hook for real-time transcription
const webSpeechOptions = useMemo(() => ({ const webSpeechOptions = useMemo(() => ({
onTranscriptUpdate: (newText: string) => { onTranscriptUpdate: (fullText: string) => {
// Always append to existing text for continuous transcription // The hook now sends the full accumulated transcript,
const currentText = shape.props.text || '' // so we replace rather than append
const updatedText = currentText + (currentText ? ' ' : '') + newText
if (!isLiveEditing) { if (!isLiveEditing) {
// Update shape text without changing height
this.editor.updateShape({ this.editor.updateShape({
id: shape.id, id: shape.id,
type: 'Transcription', type: 'Transcription',
props: { props: {
...shape.props, ...shape.props,
text: updatedText text: fullText
// Removed h: textHeight to prevent auto-resizing
} }
}) })
// Also update the editing content if it's empty or matches the old text
if (!editingContent || editingContent === shape.props.text) { if (!editingContent || editingContent === shape.props.text) {
setEditingContent(updatedText) setEditingContent(fullText)
} }
} else { } else {
// In live editing mode, append to the separate live edit transcript setLiveEditTranscript(fullText)
const currentLiveTranscript = liveEditTranscript || '' setEditingContent(fullText)
const updatedLiveTranscript = currentLiveTranscript + (currentLiveTranscript ? ' ' : '') + newText
setLiveEditTranscript(updatedLiveTranscript)
// Also update editing content to show the live transcript
setEditingContent(updatedLiveTranscript)
} }
}, },
onError: (error: Error) => { onError: (error: Error) => {