// canvas-website/src/hooks/useWebSpeechTranscription.ts

import { useState, useRef, useCallback, useEffect } from 'react'

// TypeScript declarations for the Web Speech API (not shipped in the default DOM lib)
declare global {
  interface Window {
    SpeechRecognition: typeof SpeechRecognition
    webkitSpeechRecognition: typeof SpeechRecognition
  }

  interface SpeechRecognition extends EventTarget {
    continuous: boolean
    interimResults: boolean
    lang: string
    maxAlternatives: number
    start(): void
    stop(): void
    onstart: ((this: SpeechRecognition, ev: Event) => any) | null
    onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
    onerror: ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => any) | null
    onend: ((this: SpeechRecognition, ev: Event) => any) | null
  }

  interface SpeechRecognitionEvent extends Event {
    resultIndex: number
    results: SpeechRecognitionResultList
  }

  interface SpeechRecognitionErrorEvent extends Event {
    error: string
  }

  interface SpeechRecognitionResultList {
    readonly length: number
    item(index: number): SpeechRecognitionResult
    [index: number]: SpeechRecognitionResult
  }

  interface SpeechRecognitionResult {
    readonly length: number
    item(index: number): SpeechRecognitionAlternative
    [index: number]: SpeechRecognitionAlternative
    readonly isFinal: boolean
  }

  interface SpeechRecognitionAlternative {
    readonly transcript: string
    readonly confidence: number
  }

  var SpeechRecognition: {
    prototype: SpeechRecognition
    new (): SpeechRecognition
  }
}
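
// Note: these declarations cover only the subset of the spec this hook uses.
// Chromium-based browsers and Safari expose the constructor under the
// `webkitSpeechRecognition` prefix; the unprefixed name is checked first below.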

interface UseWebSpeechTranscriptionOptions {
  onTranscriptUpdate?: (text: string) => void
  onError?: (error: Error) => void
  language?: string
  continuous?: boolean
  interimResults?: boolean
}
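
/**
 * React hook wrapping the Web Speech API for live transcription, with
 * pause-based line breaks and a confidence-based speaker-change heuristic.
 *
 * Minimal usage sketch (the surrounding component is hypothetical; only the
 * hook's own return values are real):
 *
 * @example
 * const { isSupported, isRecording, transcript, startRecording, stopRecording } =
 *   useWebSpeechTranscription({ onTranscriptUpdate: (text) => console.log(text) })
 * // <button disabled={!isSupported} onClick={isRecording ? stopRecording : startRecording}>
 * //   {isRecording ? 'Stop' : 'Start'}
 * // </button>
 */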
export const useWebSpeechTranscription = ({
  onTranscriptUpdate,
  onError,
  language = 'en-US',
  continuous = true,
  interimResults = true
}: UseWebSpeechTranscriptionOptions = {}) => {
  const [isRecording, setIsRecording] = useState(false)
  const [isTranscribing, setIsTranscribing] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [interimTranscript, setInterimTranscript] = useState('')
  const [isSupported, setIsSupported] = useState(false)

  const recognitionRef = useRef<SpeechRecognition | null>(null)
  const finalTranscriptRef = useRef('')
  const interimTranscriptRef = useRef('')
  const lastSpeechTimeRef = useRef<number>(0)
  // ReturnType<typeof setTimeout> works under both browser and Node typings,
  // unlike NodeJS.Timeout, which requires @types/node
  const pauseTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  const lastConfidenceRef = useRef<number>(0)
  const speakerChangeThreshold = 0.3 // Threshold for detecting speaker changes

  // Add line breaks after pauses and terminal punctuation to final results
  const processTranscript = useCallback((text: string, isFinal: boolean = false) => {
    if (!text.trim()) return text

    let processedText = text.trim()

    // Add punctuation if missing at the end
    if (isFinal && processedText && !/[.!?]$/.test(processedText)) {
      processedText += '.'
    }

    // Add a line break if there has been a pause (final results only)
    if (isFinal) {
      const now = Date.now()
      const timeSinceLastSpeech = now - lastSpeechTimeRef.current
      // If more than 3 seconds since the last speech, start a new line
      if (timeSinceLastSpeech > 3000 && lastSpeechTimeRef.current > 0) {
        processedText = '\n' + processedText
      }
      lastSpeechTimeRef.current = now
    }

    return processedText
  }, [])

  // Detect speaker changes based on confidence and timing
  const detectSpeakerChange = useCallback((confidence: number) => {
    if (lastConfidenceRef.current === 0) {
      lastConfidenceRef.current = confidence
      return false
    }

    const confidenceDiff = Math.abs(confidence - lastConfidenceRef.current)
    const timeSinceLastSpeech = Date.now() - lastSpeechTimeRef.current

    // Heuristic: a large confidence jump after a pause suggests a new speaker
    const isSpeakerChange = confidenceDiff > speakerChangeThreshold && timeSinceLastSpeech > 1000

    lastConfidenceRef.current = confidence
    return isSpeakerChange
  }, [speakerChangeThreshold])

  // Pause detection: after 2 seconds of silence, append a line break
  const handlePauseDetection = useCallback(() => {
    // Clear any existing timeout
    if (pauseTimeoutRef.current) {
      clearTimeout(pauseTimeoutRef.current)
    }

    // Re-arm the pause timer
    pauseTimeoutRef.current = setTimeout(() => {
      const timeSinceLastSpeech = Date.now() - lastSpeechTimeRef.current
      // If more than 2 seconds of silence, append a line break. The ref is
      // updated too, so the break survives the next final result, and only the
      // new text (the break) is forwarded, matching the onresult contract below.
      if (timeSinceLastSpeech > 2000 && lastSpeechTimeRef.current > 0) {
        finalTranscriptRef.current += '\n'
        setTranscript(finalTranscriptRef.current)
        onTranscriptUpdate?.('\n')
      }
    }, 2000) // Check after 2 seconds of silence
  }, [onTranscriptUpdate])

  // Check whether the Web Speech API is supported
  useEffect(() => {
    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
    if (SpeechRecognition) {
      setIsSupported(true)
    } else {
      setIsSupported(false)
      onError?.(new Error('Web Speech API is not supported in this browser'))
    }
  }, [onError])

  // Initialize speech recognition
  const initializeRecognition = useCallback(() => {
    if (!isSupported) return null

    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
    const recognition = new SpeechRecognition()

    recognition.continuous = continuous
    recognition.interimResults = interimResults
    recognition.lang = language
    recognition.maxAlternatives = 1

    recognition.onstart = () => {
      setIsRecording(true)
      setIsTranscribing(true)
    }

    recognition.onresult = (event) => {
      let interimTranscript = ''
      let finalTranscript = ''

      // Process all results in this event
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i]
        const transcript = result[0].transcript
        if (result.isFinal) {
          finalTranscript += transcript
        } else {
          interimTranscript += transcript
        }
      }

      // Update the final transcript with processing
      if (finalTranscript) {
        // Confidence of the top alternative of the most recent result
        const confidence = event.results[event.results.length - 1]?.[0]?.confidence || 0

        // Add a speaker indicator if a change is detected
        const speakerPrefix = detectSpeakerChange(confidence) ? '\n[Speaker Change]\n' : ''

        const processedFinal = processTranscript(finalTranscript, true)
        const newText = speakerPrefix + processedFinal
        finalTranscriptRef.current += newText
        setTranscript(finalTranscriptRef.current)
        onTranscriptUpdate?.(newText) // Only send the new text portion

        // Trigger pause detection
        handlePauseDetection()
      }

      // Update the interim transcript
      if (interimTranscript) {
        const processedInterim = processTranscript(interimTranscript, false)
        interimTranscriptRef.current = processedInterim
        setInterimTranscript(processedInterim)
      }
    }

    recognition.onerror = (event) => {
      console.error('❌ Web Speech API error:', event.error)
      setIsRecording(false)
      setIsTranscribing(false)
      onError?.(new Error(`Speech recognition error: ${event.error}`))
    }

    recognition.onend = () => {
      // Note: browsers may end even continuous recognition on their own after
      // prolonged silence; callers can restart via startRecording if needed
      setIsRecording(false)
      setIsTranscribing(false)
    }

    return recognition
  }, [
    isSupported,
    continuous,
    interimResults,
    language,
    onTranscriptUpdate,
    onError,
    detectSpeakerChange,
    processTranscript,
    handlePauseDetection
  ])

  // Start recording
  const startRecording = useCallback(() => {
    if (!isSupported) {
      onError?.(new Error('Web Speech API is not supported'))
      return
    }

    try {
      // Don't reset transcripts for continuous transcription - keep existing content
      // finalTranscriptRef.current = ''
      // interimTranscriptRef.current = ''
      // setTranscript('')
      // setInterimTranscript('')
      lastSpeechTimeRef.current = 0
      lastConfidenceRef.current = 0

      // Clear any existing pause timeout
      if (pauseTimeoutRef.current) {
        clearTimeout(pauseTimeoutRef.current)
        pauseTimeoutRef.current = null
      }

      // Initialize and start recognition
      const recognition = initializeRecognition()
      if (recognition) {
        recognitionRef.current = recognition
        recognition.start()
      }
    } catch (error) {
      console.error('❌ Error starting Web Speech API:', error)
      onError?.(error as Error)
    }
  }, [isSupported, initializeRecognition, onError])

  // Stop recording
  const stopRecording = useCallback(() => {
    if (recognitionRef.current) {
      recognitionRef.current.stop()
      recognitionRef.current = null
    }
  }, [])

  // Cleanup: stop recognition and cancel any pending pause timer
  const cleanup = useCallback(() => {
    if (recognitionRef.current) {
      recognitionRef.current.stop()
      recognitionRef.current = null
    }
    if (pauseTimeoutRef.current) {
      clearTimeout(pauseTimeoutRef.current)
      pauseTimeoutRef.current = null
    }
    setIsRecording(false)
    setIsTranscribing(false)
  }, [])

  // Cleanup on unmount
  useEffect(() => {
    return cleanup
  }, [cleanup])

  return {
    isRecording,
    isTranscribing,
    transcript,
    interimTranscript,
    isSupported,
    startRecording,
    stopRecording,
    cleanup
  }
}

// Export as default for compatibility
export default useWebSpeechTranscription
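
// Long-session note (a sketch, not part of this hook): Chromium tends to end
// even `continuous` sessions after prolonged silence or network hiccups. A
// caller that wants uninterrupted capture can re-arm recognition with an
// effect like the following, where `wantRecording` is a hypothetical flag:
//
// useEffect(() => {
//   if (wantRecording && !isRecording) startRecording()
// }, [wantRecording, isRecording, startRecording])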