fix: output unified transcript instead of echoed segments

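The Web Speech API hook was sending each recognition result as a separate fragment via onTranscriptUpdate, which caused the transcription shape to display a fragmented, echoed conversation. The hook now accumulates the full transcript internally and always sends the complete text, so the shape receives one unified conversation. Accordingly, the shape's onTranscriptUpdate callback now replaces its text with the received transcript instead of appending to it. The confidence-based speaker-change detection and its "[Speaker Change]" markers are removed as part of this change.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>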
2026-03-18 10:02:59 +00:00 · 2026-03-18 10:02:59 +00:00 · 007a25d3da
parent 526a4c4b9d
commit 007a25d3da
2 changed files with 30 additions and 69 deletions
--- a/src/hooks/useWebSpeechTranscription.ts
+++ b/src/hooks/useWebSpeechTranscription.ts
@ -79,8 +79,6 @@ export const useWebSpeechTranscription = ({
  const interimTranscriptRef = useRef('')
  const lastSpeechTimeRef = useRef<number>(0)
  const pauseTimeoutRef = useRef<NodeJS.Timeout | null>(null)
-  const lastConfidenceRef = useRef<number>(0)
-  const speakerChangeThreshold = 0.3 // Threshold for detecting speaker changes

  // Function to add line breaks after pauses and improve punctuation
  const processTranscript = useCallback((text: string, isFinal: boolean = false) => {
@ -109,30 +107,6 @@ export const useWebSpeechTranscription = ({
    return processedText
  }, [])

-  // Function to detect speaker changes based on confidence and timing
-  const detectSpeakerChange = useCallback((confidence: number) => {
-    if (lastConfidenceRef.current === 0) {
-      lastConfidenceRef.current = confidence
-      return false
-    }
-    
-    const confidenceDiff = Math.abs(confidence - lastConfidenceRef.current)
-    const now = Date.now()
-    const timeSinceLastSpeech = now - lastSpeechTimeRef.current
-    
-    // Detect speaker change if confidence changes significantly and there's been a pause
-    const isSpeakerChange = confidenceDiff > speakerChangeThreshold && timeSinceLastSpeech > 1000
-    
-    if (isSpeakerChange) {
-        // Reduced debug logging
-      lastConfidenceRef.current = confidence
-      return true
-    }
-    
-    lastConfidenceRef.current = confidence
-    return false
-  }, [speakerChangeThreshold])
-
  // Function to handle pause detection
  const handlePauseDetection = useCallback(() => {
    // Clear existing timeout
@ -140,19 +114,21 @@ export const useWebSpeechTranscription = ({
      clearTimeout(pauseTimeoutRef.current)
    }
    
-    // Set new timeout for pause detection
+    // Set new timeout for pause detection — after a long silence,
+    // append a newline so the next utterance starts on a new line.
    pauseTimeoutRef.current = setTimeout(() => {
      const now = Date.now()
      const timeSinceLastSpeech = now - lastSpeechTimeRef.current
-      
-      // If more than 2 seconds of silence, add a line break to interim transcript
+
      if (timeSinceLastSpeech > 2000 && lastSpeechTimeRef.current > 0) {
-        const currentTranscript = finalTranscriptRef.current + '\n'
-        setTranscript(currentTranscript)
-        onTranscriptUpdate?.(currentTranscript)
-        // Reduced debug logging
+        // Only append the newline if the transcript doesn't already end with one
+        if (finalTranscriptRef.current && !finalTranscriptRef.current.endsWith('\n')) {
+          finalTranscriptRef.current += '\n'
+          setTranscript(finalTranscriptRef.current)
+          onTranscriptUpdate?.(finalTranscriptRef.current)
+        }
      }
-    }, 2000) // Check after 2 seconds of silence
+    }, 2000)
  }, [onTranscriptUpdate])

  // Check if Web Speech API is supported
@ -202,24 +178,20 @@ export const useWebSpeechTranscription = ({

      // Update final transcript with processing
      if (finalTranscript) {
-        // Get confidence from the first result
-        const confidence = event.results[event.results.length - 1]?.[0]?.confidence || 0
-        
-        // Detect speaker change
-        const isSpeakerChange = detectSpeakerChange(confidence)
-        
-        // Add speaker indicator if change detected
-        let speakerPrefix = ''
-        if (isSpeakerChange) {
-          speakerPrefix = '\n[Speaker Change]\n'
-        }
-        
        const processedFinal = processTranscript(finalTranscript, true)
-        const newText = speakerPrefix + processedFinal
-        finalTranscriptRef.current += newText
+
+        // Append to running transcript with a space separator
+        if (finalTranscriptRef.current) {
+          finalTranscriptRef.current += ' ' + processedFinal
+        } else {
+          finalTranscriptRef.current = processedFinal
+        }
+
+        // Always send the full accumulated transcript so the shape
+        // receives one unified conversation instead of fragments
        setTranscript(finalTranscriptRef.current)
-        onTranscriptUpdate?.(newText) // Only send the new text portion
-        
+        onTranscriptUpdate?.(finalTranscriptRef.current)
+
        // Trigger pause detection
        handlePauseDetection()
      }
@ -262,7 +234,6 @@ export const useWebSpeechTranscription = ({
      // setTranscript('')
      // setInterimTranscript('')
      lastSpeechTimeRef.current = 0
-      lastConfidenceRef.current = 0
      
      // Clear any existing pause timeout
      if (pauseTimeoutRef.current) {
--- a/src/shapes/TranscriptionShapeUtil.tsx
+++ b/src/shapes/TranscriptionShapeUtil.tsx
@ -191,35 +191,25 @@ export class TranscriptionShape extends BaseBoxShapeUtil<ITranscription> {

    // Web Speech API hook for real-time transcription
    const webSpeechOptions = useMemo(() => ({
-      onTranscriptUpdate: (newText: string) => {
-        // Always append to existing text for continuous transcription
-        const currentText = shape.props.text || ''
-        const updatedText = currentText + (currentText ? ' ' : '') + newText
-        
+      onTranscriptUpdate: (fullText: string) => {
+        // The hook now sends the full accumulated transcript,
+        // so we replace rather than append
        if (!isLiveEditing) {
-          // Update shape text without changing height
          this.editor.updateShape({
            id: shape.id,
            type: 'Transcription',
            props: {
              ...shape.props,
-              text: updatedText
-              // Removed h: textHeight to prevent auto-resizing
+              text: fullText
            }
          })
-          
-          // Also update the editing content if it's empty or matches the old text
+
          if (!editingContent || editingContent === shape.props.text) {
-            setEditingContent(updatedText)
+            setEditingContent(fullText)
          }
        } else {
-          // In live editing mode, append to the separate live edit transcript
-          const currentLiveTranscript = liveEditTranscript || ''
-          const updatedLiveTranscript = currentLiveTranscript + (currentLiveTranscript ? ' ' : '') + newText
-          setLiveEditTranscript(updatedLiveTranscript)
-          
-          // Also update editing content to show the live transcript
-          setEditingContent(updatedLiveTranscript)
+          setLiveEditTranscript(fullText)
+          setEditingContent(fullText)
        }
      },
      onError: (error: Error) => {