// canvas-website/src/hooks/useWebSpeechTranscription.ts

import { useState, useRef, useCallback, useEffect } from 'react'

// TypeScript declarations for the Web Speech API (not shipped in the default DOM lib)
declare global {
  interface Window {
    SpeechRecognition: typeof SpeechRecognition
    webkitSpeechRecognition: typeof SpeechRecognition
  }

  interface SpeechRecognition extends EventTarget {
    continuous: boolean
    interimResults: boolean
    lang: string
    maxAlternatives: number
    start(): void
    stop(): void
    onstart: ((this: SpeechRecognition, ev: Event) => any) | null
    onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => any) | null
    onerror: ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => any) | null
    onend: ((this: SpeechRecognition, ev: Event) => any) | null
  }

  interface SpeechRecognitionEvent extends Event {
    resultIndex: number
    results: SpeechRecognitionResultList
  }

  interface SpeechRecognitionErrorEvent extends Event {
    error: string
  }

  interface SpeechRecognitionResultList {
    readonly length: number
    item(index: number): SpeechRecognitionResult
    [index: number]: SpeechRecognitionResult
  }

  interface SpeechRecognitionResult {
    readonly length: number
    item(index: number): SpeechRecognitionAlternative
    [index: number]: SpeechRecognitionAlternative
    readonly isFinal: boolean
  }

  interface SpeechRecognitionAlternative {
    readonly transcript: string
    readonly confidence: number
  }

  var SpeechRecognition: {
    prototype: SpeechRecognition
    new (): SpeechRecognition
  }
}
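
// Note: these declarations cover only the subset of the spec this hook uses.
// Chromium-based browsers and Safari expose the constructor under the
// `webkitSpeechRecognition` prefix; the unprefixed name is checked first below.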

interface UseWebSpeechTranscriptionOptions {
  onTranscriptUpdate?: (text: string) => void
  onError?: (error: Error) => void
  language?: string
  continuous?: boolean
  interimResults?: boolean
}
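
/**
 * React hook wrapping the Web Speech API for live transcription, with
 * pause-based line breaks and a confidence-based speaker-change heuristic.
 *
 * Minimal usage sketch (the surrounding component is hypothetical; only the
 * hook's own return values are real):
 *
 * @example
 * const { isSupported, isRecording, transcript, startRecording, stopRecording } =
 *   useWebSpeechTranscription({ onTranscriptUpdate: (text) => console.log(text) })
 * // <button disabled={!isSupported} onClick={isRecording ? stopRecording : startRecording}>
 * //   {isRecording ? 'Stop' : 'Start'}
 * // </button>
 */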
export const useWebSpeechTranscription = ({
  onTranscriptUpdate,
  onError,
  language = 'en-US',
  continuous = true,
  interimResults = true
}: UseWebSpeechTranscriptionOptions = {}) => {
  const [isRecording, setIsRecording] = useState(false)
  const [isTranscribing, setIsTranscribing] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [interimTranscript, setInterimTranscript] = useState('')
  const [isSupported, setIsSupported] = useState(false)

  const recognitionRef = useRef<SpeechRecognition | null>(null)
  const finalTranscriptRef = useRef('')
  const interimTranscriptRef = useRef('')
  const lastSpeechTimeRef = useRef<number>(0)
  // ReturnType<typeof setTimeout> works under both browser and Node typings,
  // unlike NodeJS.Timeout, which requires @types/node
  const pauseTimeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null)
  const lastConfidenceRef = useRef<number>(0)
  const speakerChangeThreshold = 0.3 // Threshold for detecting speaker changes

  // Add line breaks after pauses and terminal punctuation to final results
  const processTranscript = useCallback((text: string, isFinal: boolean = false) => {
    if (!text.trim()) return text

    let processedText = text.trim()

    // Add punctuation if missing at the end
    if (isFinal && processedText && !/[.!?]$/.test(processedText)) {
      processedText += '.'
    }

    // Add a line break if there has been a pause (final results only)
    if (isFinal) {
      const now = Date.now()
      const timeSinceLastSpeech = now - lastSpeechTimeRef.current
      // If more than 3 seconds since the last speech, start a new line
      if (timeSinceLastSpeech > 3000 && lastSpeechTimeRef.current > 0) {
        processedText = '\n' + processedText
      }
      lastSpeechTimeRef.current = now
    }

    return processedText
  }, [])

  // Detect speaker changes based on confidence and timing
  const detectSpeakerChange = useCallback((confidence: number) => {
    if (lastConfidenceRef.current === 0) {
      lastConfidenceRef.current = confidence
      return false
    }

    const confidenceDiff = Math.abs(confidence - lastConfidenceRef.current)
    const timeSinceLastSpeech = Date.now() - lastSpeechTimeRef.current

    // Heuristic: a large confidence jump after a pause suggests a new speaker
    const isSpeakerChange = confidenceDiff > speakerChangeThreshold && timeSinceLastSpeech > 1000

    lastConfidenceRef.current = confidence
    return isSpeakerChange
  }, [speakerChangeThreshold])

  // Pause detection: after 2 seconds of silence, append a line break
  const handlePauseDetection = useCallback(() => {
    // Clear any existing timeout
    if (pauseTimeoutRef.current) {
      clearTimeout(pauseTimeoutRef.current)
    }

    // Re-arm the pause timer
    pauseTimeoutRef.current = setTimeout(() => {
      const timeSinceLastSpeech = Date.now() - lastSpeechTimeRef.current
      // If more than 2 seconds of silence, append a line break. The ref is
      // updated too, so the break survives the next final result, and only the
      // new text (the break) is forwarded, matching the onresult contract below.
      if (timeSinceLastSpeech > 2000 && lastSpeechTimeRef.current > 0) {
        finalTranscriptRef.current += '\n'
        setTranscript(finalTranscriptRef.current)
        onTranscriptUpdate?.('\n')
      }
    }, 2000) // Check after 2 seconds of silence
  }, [onTranscriptUpdate])

  // Check whether the Web Speech API is supported
  useEffect(() => {
    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
    if (SpeechRecognition) {
      setIsSupported(true)
    } else {
      setIsSupported(false)
      onError?.(new Error('Web Speech API is not supported in this browser'))
    }
  }, [onError])

  // Initialize speech recognition
  const initializeRecognition = useCallback(() => {
    if (!isSupported) return null

    const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition
    const recognition = new SpeechRecognition()

    recognition.continuous = continuous
    recognition.interimResults = interimResults
    recognition.lang = language
    recognition.maxAlternatives = 1

    recognition.onstart = () => {
      setIsRecording(true)
      setIsTranscribing(true)
    }

    recognition.onresult = (event) => {
      let interimTranscript = ''
      let finalTranscript = ''

      // Process all results in this event
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i]
        const transcript = result[0].transcript
        if (result.isFinal) {
          finalTranscript += transcript
        } else {
          interimTranscript += transcript
        }
      }

      // Update the final transcript with processing
      if (finalTranscript) {
        // Confidence of the top alternative of the most recent result
        const confidence = event.results[event.results.length - 1]?.[0]?.confidence || 0

        // Add a speaker indicator if a change is detected
        const speakerPrefix = detectSpeakerChange(confidence) ? '\n[Speaker Change]\n' : ''

        const processedFinal = processTranscript(finalTranscript, true)
        const newText = speakerPrefix + processedFinal
        finalTranscriptRef.current += newText
        setTranscript(finalTranscriptRef.current)
        onTranscriptUpdate?.(newText) // Only send the new text portion

        // Trigger pause detection
        handlePauseDetection()
      }

      // Update the interim transcript
      if (interimTranscript) {
        const processedInterim = processTranscript(interimTranscript, false)
        interimTranscriptRef.current = processedInterim
        setInterimTranscript(processedInterim)
      }
    }

    recognition.onerror = (event) => {
      console.error('❌ Web Speech API error:', event.error)
      setIsRecording(false)
      setIsTranscribing(false)
      onError?.(new Error(`Speech recognition error: ${event.error}`))
    }

    recognition.onend = () => {
      // Note: browsers may end even continuous recognition on their own after
      // prolonged silence; callers can restart via startRecording if needed
      setIsRecording(false)
      setIsTranscribing(false)
    }

    return recognition
  }, [
    isSupported,
    continuous,
    interimResults,
    language,
    onTranscriptUpdate,
    onError,
    detectSpeakerChange,
    processTranscript,
    handlePauseDetection
  ])

  // Start recording
  const startRecording = useCallback(() => {
    if (!isSupported) {
      onError?.(new Error('Web Speech API is not supported'))
      return
    }

    try {
      // Don't reset transcripts for continuous transcription - keep existing content
      // finalTranscriptRef.current = ''
      // interimTranscriptRef.current = ''
      // setTranscript('')
      // setInterimTranscript('')
      lastSpeechTimeRef.current = 0
      lastConfidenceRef.current = 0

      // Clear any existing pause timeout
      if (pauseTimeoutRef.current) {
        clearTimeout(pauseTimeoutRef.current)
        pauseTimeoutRef.current = null
      }

      // Initialize and start recognition
      const recognition = initializeRecognition()
      if (recognition) {
        recognitionRef.current = recognition
        recognition.start()
      }
    } catch (error) {
      console.error('❌ Error starting Web Speech API:', error)
      onError?.(error as Error)
    }
  }, [isSupported, initializeRecognition, onError])

  // Stop recording
  const stopRecording = useCallback(() => {
    if (recognitionRef.current) {
      recognitionRef.current.stop()
      recognitionRef.current = null
    }
  }, [])

  // Cleanup: stop recognition and cancel any pending pause timer
  const cleanup = useCallback(() => {
    if (recognitionRef.current) {
      recognitionRef.current.stop()
      recognitionRef.current = null
    }
    if (pauseTimeoutRef.current) {
      clearTimeout(pauseTimeoutRef.current)
      pauseTimeoutRef.current = null
    }
    setIsRecording(false)
    setIsTranscribing(false)
  }, [])

  // Cleanup on unmount
  useEffect(() => {
    return cleanup
  }, [cleanup])

  return {
    isRecording,
    isTranscribing,
    transcript,
    interimTranscript,
    isSupported,
    startRecording,
    stopRecording,
    cleanup
  }
}

// Export as default for compatibility
export default useWebSpeechTranscription
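
// Long-session note (a sketch, not part of this hook): Chromium tends to end
// even `continuous` sessions after prolonged silence or network hiccups. A
// caller that wants uninterrupted capture can re-arm recognition with an
// effect like the following, where `wantRecording` is a hypothetical flag:
//
// useEffect(() => {
//   if (wantRecording && !isRecording) startRecording()
// }, [wantRecording, isRecording, startRecording])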