import { useCallback, useEffect, useRef, useState } from 'react'
import { pipeline, env } from '@xenova/transformers'
import { transcribeWithRunPod } from '../lib/runpodApi'
import { isRunPodConfigured } from '../lib/clientConfig'

// Configure the transformers library
env.allowRemoteModels = true
env.allowLocalModels = false
env.useBrowserCache = true
env.useCustomCache = false

// Helper function to detect audio format from blob
function detectAudioFormat(blob: Blob): Promise<string> {
  if (blob.type && blob.type !== 'application/octet-stream') {
    return Promise.resolve(blob.type)
  }

  // Try to detect from the first few bytes
  return new Promise((resolve) => {
    const reader = new FileReader()
    reader.onload = () => {
      try {
        const arrayBuffer = reader.result as ArrayBuffer
        if (!arrayBuffer || arrayBuffer.byteLength < 4) {
          resolve('audio/webm;codecs=opus') // Default fallback
          return
        }

        const uint8Array = new Uint8Array(arrayBuffer.slice(0, 12))

        // Check for common audio format signatures
        if (uint8Array[0] === 0x52 && uint8Array[1] === 0x49 && uint8Array[2] === 0x46 && uint8Array[3] === 0x46) {
          resolve('audio/wav')
        } else if (uint8Array[0] === 0x4F && uint8Array[1] === 0x67 && uint8Array[2] === 0x67 && uint8Array[3] === 0x53) {
          resolve('audio/ogg;codecs=opus')
        } else if (uint8Array[0] === 0x1A && uint8Array[1] === 0x45 && uint8Array[2] === 0xDF && uint8Array[3] === 0xA3) {
          resolve('audio/webm;codecs=opus')
        } else {
          resolve('audio/webm;codecs=opus') // Default fallback
        }
      } catch (error) {
        console.warn('⚠️ Error detecting audio format:', error)
        resolve('audio/webm;codecs=opus') // Default fallback
      }
    }
    reader.onerror = () => {
      resolve('audio/webm;codecs=opus') // Default fallback
    }
    reader.readAsArrayBuffer(blob.slice(0, 12))
  })
}

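// Note: detectAudioFormat is not called anywhere else in this file. A minimal usage
// sketch (hypothetical call site, shown only for illustration):
//
//   const blob = new Blob(audioChunksRef.current, { type: mediaRecorder.mimeType })
//   const format = await detectAudioFormat(blob)
//   // 'audio/wav' for a 'RIFF' header, 'audio/ogg;codecs=opus' for 'OggS',
//   // 'audio/webm;codecs=opus' for an EBML (WebM/Matroska) header
//   console.log('Detected container:', format)
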
// Convert Float32Array audio data to WAV blob
async function createWavBlob(audioData: Float32Array, sampleRate: number): Promise<Blob> {
  const length = audioData.length
  const buffer = new ArrayBuffer(44 + length * 2)
  const view = new DataView(buffer)

  // WAV header
  const writeString = (offset: number, string: string) => {
    for (let i = 0; i < string.length; i++) {
      view.setUint8(offset + i, string.charCodeAt(i))
    }
  }

  writeString(0, 'RIFF')
  view.setUint32(4, 36 + length * 2, true) // RIFF chunk size (file size minus 8)
  writeString(8, 'WAVE')
  writeString(12, 'fmt ')
  view.setUint32(16, 16, true)             // fmt chunk size (16 for PCM)
  view.setUint16(20, 1, true)              // audio format: 1 = PCM
  view.setUint16(22, 1, true)              // channels: mono
  view.setUint32(24, sampleRate, true)     // sample rate
  view.setUint32(28, sampleRate * 2, true) // byte rate (sample rate * block align)
  view.setUint16(32, 2, true)              // block align (2 bytes per mono 16-bit frame)
  view.setUint16(34, 16, true)             // bits per sample
  writeString(36, 'data')
  view.setUint32(40, length * 2, true)     // data chunk size

  // Convert float samples to 16-bit PCM
  let offset = 44
  for (let i = 0; i < length; i++) {
    const sample = Math.max(-1, Math.min(1, audioData[i]))
    view.setInt16(offset, sample < 0 ? sample * 0x8000 : sample * 0x7FFF, true)
    offset += 2
  }

  return new Blob([buffer], { type: 'audio/wav' })
}

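// Quick sanity-check example (sketch): one second of silence at 16 kHz yields a
// 44-byte header plus 16000 * 2 bytes of PCM data, i.e. a 32044-byte WAV blob.
//
//   const silence = new Float32Array(16000)
//   const wav = await createWavBlob(silence, 16000)
//   console.log(wav.size) // 32044
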
// Simple resampling function for audio data
function resampleAudio(audioData: Float32Array, fromSampleRate: number, toSampleRate: number): Float32Array {
  if (fromSampleRate === toSampleRate) {
    return audioData
  }

  // Validate input parameters
  if (!audioData || audioData.length === 0) {
    throw new Error('Invalid audio data for resampling')
  }

  if (fromSampleRate <= 0 || toSampleRate <= 0) {
    throw new Error('Invalid sample rates for resampling')
  }

  const ratio = fromSampleRate / toSampleRate
  const newLength = Math.floor(audioData.length / ratio)

  // Ensure we have a valid length
  if (newLength <= 0) {
    throw new Error('Invalid resampled length')
  }

  const resampled = new Float32Array(newLength)

  for (let i = 0; i < newLength; i++) {
    const sourceIndex = Math.floor(i * ratio)
    // Ensure sourceIndex is within bounds
    if (sourceIndex >= 0 && sourceIndex < audioData.length) {
      resampled[i] = audioData[sourceIndex]
    } else {
      resampled[i] = 0
    }
  }

  return resampled
}

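// This is nearest-neighbour (decimating) resampling rather than band-limited
// resampling, so some aliasing is possible. A small worked example:
//
//   // 48 kHz -> 16 kHz: ratio = 3, so every third sample is kept.
//   const input = new Float32Array(48000)            // 1 s at 48 kHz
//   const output = resampleAudio(input, 48000, 16000)
//   console.log(output.length)                       // 16000
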
interface ModelOption {
  name: string
  options: {
    quantized: boolean
    use_browser_cache: boolean
    use_custom_cache: boolean
  }
}

interface UseWhisperTranscriptionOptions {
  onTranscriptUpdate?: (text: string) => void
  onError?: (error: Error) => void
  language?: string
  enableStreaming?: boolean
  enableAdvancedErrorHandling?: boolean
  modelOptions?: ModelOption[]
  autoInitialize?: boolean // If false, model will only load when startRecording is called
  useRunPod?: boolean // If true, use RunPod WhisperX endpoint instead of local model (defaults to checking if RunPod is configured)
}

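// Example configuration (sketch): the values below are illustrative, not defaults
// enforced by this hook.
//
//   const transcription = useWhisperTranscription({
//     language: 'en',
//     enableStreaming: true,
//     autoInitialize: false, // defer model loading until recording starts
//     onTranscriptUpdate: (text) => console.log('partial:', text),
//     onError: (err) => console.error(err),
//   })
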
export const useWhisperTranscription = ({
  onTranscriptUpdate,
  onError,
  language = 'en',
  enableStreaming = false,
  enableAdvancedErrorHandling = false,
  modelOptions,
  autoInitialize = true, // Default to true for backward compatibility
  useRunPod = undefined // If undefined, auto-detect based on configuration
}: UseWhisperTranscriptionOptions = {}) => {
  // Auto-detect RunPod usage if not explicitly set
  const shouldUseRunPod = useRunPod !== undefined ? useRunPod : isRunPodConfigured()
  const [isRecording, setIsRecording] = useState(false)
  const [isTranscribing, setIsTranscribing] = useState(false)
  const [isSpeaking, setIsSpeaking] = useState(false) // Note: never updated in this hook; exposed for API compatibility
  const [transcript, setTranscript] = useState('')
  const [modelLoaded, setModelLoaded] = useState(false)

  const transcriberRef = useRef<any>(null)
  const streamRef = useRef<MediaStream | null>(null)
  const mediaRecorderRef = useRef<MediaRecorder | null>(null)
  const audioChunksRef = useRef<Blob[]>([])
  const isRecordingRef = useRef(false)
  const transcriptRef = useRef('')
  const streamingTranscriptRef = useRef('')
  const periodicTranscriptionRef = useRef<NodeJS.Timeout | null>(null)
  const lastTranscriptionTimeRef = useRef<number>(0)
  const lastSpeechTimeRef = useRef<number>(0)
  const previousTranscriptLengthRef = useRef<number>(0) // Track previous transcript length for continuous transcription

  // Function to process transcript with line breaks and punctuation
  const processTranscript = useCallback((text: string, isStreaming: boolean = false) => {
    if (!text.trim()) return text

    let processedText = text.trim()

    // Add punctuation if missing at the end
    if (!/[.!?]$/.test(processedText)) {
      processedText += '.'
    }

    // Add line break if there's been a pause (for streaming)
    if (isStreaming) {
      const now = Date.now()
      const timeSinceLastSpeech = now - lastSpeechTimeRef.current

      // If more than 3 seconds since last speech, add a line break
      if (timeSinceLastSpeech > 3000 && lastSpeechTimeRef.current > 0) {
        processedText = '\n' + processedText
      }

      lastSpeechTimeRef.current = now
    }

    return processedText
  }, [])

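  // Tiny worked example (derived from the code above): processTranscript('hello world')
  // returns 'hello world.' (trailing punctuation added). In streaming mode, a pause
  // longer than 3 s before the current chunk prefixes the text with a newline,
  // e.g. '\nhello world.'
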
  // Initialize transcriber with optional advanced error handling
  const initializeTranscriber = useCallback(async () => {
    // Skip model loading if using RunPod
    if (shouldUseRunPod) {
      console.log('🚀 Using RunPod WhisperX endpoint - skipping local model loading')
      setModelLoaded(true) // Mark as "loaded" since we don't need a local model
      return null
    }

    if (transcriberRef.current) return transcriberRef.current

    try {
      console.log('🤖 Loading Whisper model...')

      // Check if we're running in a CORS-restricted environment
      if (typeof window !== 'undefined' && window.location.protocol === 'file:') {
        console.warn('⚠️ Running from file:// protocol - CORS issues may occur')
        console.warn('💡 Consider running from a local development server for better compatibility')
      }

      if (enableAdvancedErrorHandling && modelOptions) {
        // Use advanced model loading with fallbacks
        let transcriber = null
        let lastError = null

        for (const modelOption of modelOptions) {
          try {
            console.log(`🔄 Trying model: ${modelOption.name}`)
            transcriber = await pipeline('automatic-speech-recognition', modelOption.name, {
              ...modelOption.options,
              progress_callback: (progress: any) => {
                if (progress.status === 'downloading') {
                  console.log(`📦 Downloading model: ${progress.file} (${Math.round(progress.progress * 100)}%)`)
                }
              }
            })
            console.log(`✅ Successfully loaded model: ${modelOption.name}`)
            break
          } catch (error) {
            console.warn(`⚠️ Failed to load model ${modelOption.name}:`, error)
            lastError = error
            continue
          }
        }

        if (!transcriber) {
          throw lastError || new Error('Failed to load any model')
        }

        transcriberRef.current = transcriber
        setModelLoaded(true)
        return transcriber
      } else {
        // Simple model loading (default behavior) with fallback.
        // Renamed from `modelOptions` to avoid shadowing the hook option of the same name.
        const fallbackModels = [
          'Xenova/whisper-tiny.en',
          'Xenova/whisper-tiny'
        ]

        let transcriber = null
        let lastError = null

        for (const modelName of fallbackModels) {
          try {
            // Reduced debug logging

            const loadPromise = pipeline('automatic-speech-recognition', modelName, {
              quantized: true,
              progress_callback: (progress: any) => {
                if (progress.status === 'downloading') {
                  console.log(`📦 Downloading model: ${progress.file} (${Math.round(progress.progress * 100)}%)`)
                } else if (progress.status === 'loading') {
                  console.log(`🔄 Loading model: ${progress.file}`)
                }
              }
            })

            const timeoutPromise = new Promise((_, reject) =>
              setTimeout(() => reject(new Error('Model loading timeout')), 60000) // 60 seconds timeout
            )

            transcriber = await Promise.race([loadPromise, timeoutPromise])

            transcriberRef.current = transcriber
            setModelLoaded(true)
            console.log(`✅ Whisper model loaded: ${modelName}`)

            return transcriber
          } catch (error) {
            // Reduced error logging - only show final error
            lastError = error
            continue
          }
        }

        // If all models failed, throw the last error
        throw lastError || new Error('Failed to load any Whisper model')
      }
    } catch (error) {
      console.error('❌ Failed to load model:', error)
      onError?.(error as Error)
      throw error
    }
  }, [onError, enableAdvancedErrorHandling, modelOptions, shouldUseRunPod])

  // Handle streaming transcript updates
  const handleStreamingTranscriptUpdate = useCallback((newText: string) => {
    if (newText.trim()) {
      const newTextTrimmed = newText.trim()
      const currentTranscript = streamingTranscriptRef.current.trim()

      if (currentTranscript === '') {
        streamingTranscriptRef.current = newTextTrimmed
      } else {
        // Check if the new text is already contained in the current transcript
        if (!currentTranscript.includes(newTextTrimmed)) {
          streamingTranscriptRef.current = currentTranscript + ' ' + newTextTrimmed
        } else {
          // Find the best overlap point to avoid duplicates
          const words = newTextTrimmed.split(' ')
          const currentWords = currentTranscript.split(' ')

          let overlapIndex = 0
          let maxOverlap = 0

          for (let i = 1; i <= Math.min(words.length, currentWords.length); i++) {
            const currentEnd = currentWords.slice(-i).join(' ')
            const newStart = words.slice(0, i).join(' ')

            if (currentEnd === newStart && i > maxOverlap) {
              maxOverlap = i
              overlapIndex = i
            }
          }

          if (overlapIndex > 0 && overlapIndex < words.length) {
            const newPart = words.slice(overlapIndex).join(' ')
            streamingTranscriptRef.current = currentTranscript + ' ' + newPart
          }
        }
      }

      const processedTranscript = processTranscript(streamingTranscriptRef.current, true)
      streamingTranscriptRef.current = processedTranscript
      setTranscript(processedTranscript)

      // Only send the new portion for continuous transcription
      const newTextPortion = processedTranscript.substring(previousTranscriptLengthRef.current)
      if (newTextPortion.trim()) {
        onTranscriptUpdate?.(newTextPortion)
        previousTranscriptLengthRef.current = processedTranscript.length
      }

      console.log(`📝 Real-time transcript updated: "${newTextTrimmed}" -> Total: "${processedTranscript}"`)
      console.log(`🔄 Streaming transcript state updated; new portion passed to onTranscriptUpdate: "${newTextPortion}"`)
    }
  }, [onTranscriptUpdate, processTranscript])

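  // Worked examples of the merge logic above (illustrative values):
  //   1. current = "the quick brown", new = "fox jumps": the new text is not contained
  //      in the current transcript, so it is appended -> "the quick brown fox jumps".
  //   2. current = "the quick brown fox", new = "brown fox": the new text is already
  //      contained; the suffix/prefix scan finds a full overlap, so nothing is re-appended.
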
  // Process accumulated audio chunks for streaming transcription
  const processAccumulatedAudioChunks = useCallback(async () => {
    try {
      // Throttle transcription requests
      const now = Date.now()
      if (now - (lastTranscriptionTimeRef.current || 0) < 800) {
        return // Skip if less than 0.8 seconds since last transcription
      }

      const chunks = audioChunksRef.current || []
      if (chunks.length < 2) {
        console.log(`⚠️ Not enough chunks for real-time processing: ${chunks.length}`)
        return
      }

      // Take the last few chunks for balanced processing (roughly 1-2 seconds of audio)
      const recentChunks = chunks.slice(-5)
      const validChunks = recentChunks.filter(chunk => chunk && chunk.size > 2000) // Filter out small chunks

      if (validChunks.length < 2) {
        console.log(`⚠️ Not enough valid chunks for real-time processing: ${validChunks.length}`)
        return
      }

      const totalSize = validChunks.reduce((sum, chunk) => sum + chunk.size, 0)
      if (totalSize < 20000) { // Require at least 20 KB for reliable decoding
        console.log(`⚠️ Not enough audio data for real-time processing: ${totalSize} bytes`)
        return
      }

      // Use the MIME type from the MediaRecorder, not individual chunks
      let mimeType = 'audio/webm;codecs=opus' // Default to WebM
      if (mediaRecorderRef.current && mediaRecorderRef.current.mimeType) {
        mimeType = mediaRecorderRef.current.mimeType
      }

      console.log(`🔄 Real-time processing ${validChunks.length} chunks, total size: ${totalSize} bytes, type: ${mimeType}`)
      console.log(`🔄 Chunk sizes:`, validChunks.map(c => c.size))
      console.log(`🔄 Chunk types:`, validChunks.map(c => c.type))

      // Create a more robust blob with proper headers
      const tempBlob = new Blob(validChunks, { type: mimeType })

      // Validate blob size
      if (tempBlob.size < 10000) {
        console.log(`⚠️ Blob too small for processing: ${tempBlob.size} bytes`)
        return
      }

      const audioBuffer = await tempBlob.arrayBuffer()

      // Validate audio buffer
      if (audioBuffer.byteLength < 10000) {
        console.log(`⚠️ Audio buffer too small: ${audioBuffer.byteLength} bytes`)
        return
      }

      const audioContext = new AudioContext()
      let audioBufferFromBlob: AudioBuffer

      try {
        // Try to decode the audio buffer
        audioBufferFromBlob = await audioContext.decodeAudioData(audioBuffer)
        console.log(`✅ Successfully decoded real-time audio buffer: ${audioBufferFromBlob.length} samples`)
      } catch (decodeError) {
        console.log('⚠️ Real-time chunk decode failed, trying alternative approach:', decodeError)

        // Try alternative approach: create a new blob with a different MIME type
        try {
          const alternativeBlob = new Blob(validChunks, { type: 'audio/webm' })
          const alternativeBuffer = await alternativeBlob.arrayBuffer()
          audioBufferFromBlob = await audioContext.decodeAudioData(alternativeBuffer)
          console.log(`✅ Successfully decoded with alternative approach: ${audioBufferFromBlob.length} samples`)
        } catch (altError) {
          console.log('⚠️ Alternative decode also failed, skipping:', altError)
          await audioContext.close()
          return
        }
      }

      await audioContext.close()

      const audioData = audioBufferFromBlob.getChannelData(0)
      if (!audioData || audioData.length === 0) {
        return
      }

      // Resample if necessary
      let processedAudioData: Float32Array = audioData
      if (audioBufferFromBlob.sampleRate !== 16000) {
        processedAudioData = resampleAudio(audioData as Float32Array, audioBufferFromBlob.sampleRate, 16000)
      }

      // Check for meaningful audio content.
      // Max/min absolute amplitude is computed with a loop instead of spreading the whole
      // Float32Array into Math.max/Math.min, which can exceed the argument limit on large buffers.
      const rms = Math.sqrt(processedAudioData.reduce((sum, val) => sum + val * val, 0) / processedAudioData.length)
      let maxAmplitude = 0
      let minAmplitude = Number.POSITIVE_INFINITY
      for (let i = 0; i < processedAudioData.length; i++) {
        const abs = Math.abs(processedAudioData[i])
        if (abs > maxAmplitude) maxAmplitude = abs
        if (abs < minAmplitude) minAmplitude = abs
      }
      const dynamicRange = maxAmplitude - minAmplitude

      console.log(`🔊 Real-time audio analysis: RMS=${rms.toFixed(6)}, Max=${maxAmplitude.toFixed(6)}, Range=${dynamicRange.toFixed(6)}`)

      if (rms < 0.001) {
        console.log('⚠️ Audio too quiet for transcription (RMS < 0.001)')
        return // Skip very quiet audio
      }

      if (dynamicRange < 0.01) {
        console.log('⚠️ Audio has very low dynamic range, may be mostly noise')
        return
      }

      // Ensure reasonable length for real-time processing (max 2 seconds for balanced speed)
      const maxRealtimeSamples = 32000 // 2 seconds at 16kHz
      if (processedAudioData.length > maxRealtimeSamples) {
        processedAudioData = processedAudioData.slice(-maxRealtimeSamples)
      }

      if (processedAudioData.length < 2000) { // Skip very short clips (2000 samples is ~0.125 s at 16 kHz)
        return
      }

      console.log(`🎵 Real-time audio: ${processedAudioData.length} samples (${(processedAudioData.length / 16000).toFixed(2)}s)`)

      let transcriptionText = ''

      // Use RunPod if configured, otherwise use local model
      if (shouldUseRunPod) {
        console.log('🚀 Using RunPod WhisperX API for real-time transcription...')
        // Convert processed audio data back to blob for RunPod
        const wavBlob = await createWavBlob(processedAudioData, 16000)
        transcriptionText = await transcribeWithRunPod(wavBlob, language)
      } else {
        // Use local Whisper model
        if (!transcriberRef.current) {
          console.log('⚠️ Transcriber not available for real-time processing')
          return
        }
        const result = await transcriberRef.current(processedAudioData, {
          language: language,
          task: 'transcribe',
          return_timestamps: false,
          chunk_length_s: 5, // Longer chunks for better context
          stride_length_s: 2, // Larger stride for better coverage
          no_speech_threshold: 0.3, // Higher threshold to reduce noise
          logprob_threshold: -0.8, // More sensitive detection
          compression_ratio_threshold: 2.0 // More permissive for real-time
        })

        transcriptionText = result?.text || ''
      }
      if (transcriptionText.trim()) {
        lastTranscriptionTimeRef.current = Date.now()
        console.log(`✅ Real-time transcript: "${transcriptionText.trim()}"`)
        console.log(`🔄 Calling handleStreamingTranscriptUpdate with: "${transcriptionText.trim()}"`)
        handleStreamingTranscriptUpdate(transcriptionText.trim())
      } else {
        console.log('⚠️ No real-time transcription text produced, trying fallback parameters...')

        // Try with more permissive parameters for real-time processing (only for local model)
        if (!shouldUseRunPod && transcriberRef.current) {
          try {
            const fallbackResult = await transcriberRef.current(processedAudioData, {
              task: 'transcribe',
              return_timestamps: false,
              chunk_length_s: 3, // Shorter chunks for fallback
              stride_length_s: 1, // Smaller stride for fallback
              no_speech_threshold: 0.1, // Very low threshold for fallback
              logprob_threshold: -1.2, // Very sensitive for fallback
              compression_ratio_threshold: 2.5 // Very permissive for fallback
            })

            const fallbackText = fallbackResult?.text || ''
            if (fallbackText.trim()) {
              console.log(`✅ Fallback real-time transcript: "${fallbackText.trim()}"`)
              lastTranscriptionTimeRef.current = Date.now()
              handleStreamingTranscriptUpdate(fallbackText.trim())
            } else {
              console.log('⚠️ Fallback transcription also produced no text')
            }
          } catch (fallbackError) {
            console.log('⚠️ Fallback transcription failed:', fallbackError)
          }
        }
      }

    } catch (error) {
      console.error('❌ Error processing accumulated audio chunks:', error)
    }
  }, [handleStreamingTranscriptUpdate, language, shouldUseRunPod])

  // Process recorded audio chunks (final processing)
  const processAudioChunks = useCallback(async () => {
    if (audioChunksRef.current.length === 0) {
      console.log('⚠️ No audio chunks to process')
      return
    }

    // For local model, ensure transcriber is loaded
    if (!shouldUseRunPod) {
      if (!transcriberRef.current) {
        console.log('⚠️ No transcriber available')
        return
      }

      // Ensure model is loaded
      if (!modelLoaded) {
        console.log('⚠️ Model not loaded yet, waiting...')
        try {
          await initializeTranscriber()
        } catch (error) {
          console.error('❌ Failed to initialize transcriber:', error)
          onError?.(error as Error)
          return
        }
      }
    }

    try {
      setIsTranscribing(true)
      console.log('🔄 Processing final audio chunks...')

      // Create a blob from all chunks with proper MIME type detection
      let mimeType = 'audio/webm;codecs=opus'
      if (audioChunksRef.current.length > 0 && audioChunksRef.current[0].type) {
        mimeType = audioChunksRef.current[0].type
      }

      // Filter out small chunks that might be corrupted
      const validChunks = audioChunksRef.current.filter(chunk => chunk && chunk.size > 1000)

      if (validChunks.length === 0) {
        console.log('⚠️ No valid audio chunks to process')
        return
      }

      console.log(`🔄 Processing ${validChunks.length} valid chunks out of ${audioChunksRef.current.length} total chunks`)

      const audioBlob = new Blob(validChunks, { type: mimeType })

      // Validate blob size
      if (audioBlob.size < 10000) {
        console.log(`⚠️ Audio blob too small for processing: ${audioBlob.size} bytes`)
        return
      }

      // Convert blob to array buffer
      const arrayBuffer = await audioBlob.arrayBuffer()

      // Validate array buffer
      if (arrayBuffer.byteLength < 10000) {
        console.log(`⚠️ Audio buffer too small: ${arrayBuffer.byteLength} bytes`)
        return
      }

      // Create audio context to convert to Float32Array
      const audioContext = new AudioContext()

      let audioBuffer: AudioBuffer
      try {
        audioBuffer = await audioContext.decodeAudioData(arrayBuffer)
        console.log(`✅ Successfully decoded final audio buffer: ${audioBuffer.length} samples`)
      } catch (decodeError) {
        console.error('❌ Failed to decode final audio buffer:', decodeError)

        // Try alternative approach with different MIME type
        try {
          console.log('🔄 Trying alternative MIME type for final processing...')
          const alternativeBlob = new Blob(validChunks, { type: 'audio/webm' })
          const alternativeBuffer = await alternativeBlob.arrayBuffer()
          audioBuffer = await audioContext.decodeAudioData(alternativeBuffer)
          console.log(`✅ Successfully decoded with alternative approach: ${audioBuffer.length} samples`)
        } catch (altError) {
          console.error('❌ Alternative decode also failed:', altError)
          await audioContext.close()
          throw new Error('Failed to decode audio data. The audio format may not be supported or the data may be corrupted.')
        }
      }

      await audioContext.close()

      // Get the first channel as Float32Array
      const audioData = audioBuffer.getChannelData(0)

      console.log(`🔍 Audio buffer info: sampleRate=${audioBuffer.sampleRate}, length=${audioBuffer.length}, duration=${audioBuffer.duration}s`)
      console.log(`🔍 Audio data: length=${audioData.length}, first 10 values:`, Array.from(audioData.slice(0, 10)))

      // Check for meaningful audio content
      const rms = Math.sqrt(audioData.reduce((sum, val) => sum + val * val, 0) / audioData.length)
      console.log(`🔊 Audio RMS level: ${rms.toFixed(6)}`)

      if (rms < 0.001) {
        console.log('⚠️ Audio appears to be mostly silence (RMS < 0.001)')
      }

      // Resample if necessary
      let processedAudioData: Float32Array = audioData
      if (audioBuffer.sampleRate !== 16000) {
        console.log(`🔄 Resampling from ${audioBuffer.sampleRate}Hz to 16000Hz`)
        processedAudioData = resampleAudio(audioData as Float32Array, audioBuffer.sampleRate, 16000)
      }

      console.log(`🎵 Processing audio: ${processedAudioData.length} samples (${(processedAudioData.length / 16000).toFixed(2)}s)`)

      console.log('🔄 Starting transcription...')

      let newText = ''

      // Use RunPod if configured, otherwise use local model
      if (shouldUseRunPod) {
        console.log('🚀 Using RunPod WhisperX API...')
        // Convert processed audio data back to a WAV blob for RunPod
        const wavBlob = await createWavBlob(processedAudioData, 16000)
        newText = await transcribeWithRunPod(wavBlob, language)
        console.log('✅ RunPod transcription result:', newText)
      } else {
        // Use local Whisper model
        if (!transcriberRef.current) {
          throw new Error('Transcriber not initialized')
        }
        const result = await transcriberRef.current(processedAudioData, {
          language: language,
          task: 'transcribe',
          return_timestamps: false
        })

        console.log('🔍 Transcription result:', result)
        newText = result?.text?.trim() || ''
      }
      if (newText) {
        const processedText = processTranscript(newText, enableStreaming)

        if (enableStreaming) {
          // For streaming mode, merge with existing streaming transcript
          handleStreamingTranscriptUpdate(processedText)
        } else {
          // For non-streaming mode, append to existing transcript
          const currentTranscript = transcriptRef.current
          const updatedTranscript = currentTranscript ? `${currentTranscript} ${processedText}` : processedText

          transcriptRef.current = updatedTranscript
          setTranscript(updatedTranscript)

          // Only send the new portion for continuous transcription
          const newTextPortion = updatedTranscript.substring(previousTranscriptLengthRef.current)
          if (newTextPortion.trim()) {
            onTranscriptUpdate?.(newTextPortion)
            previousTranscriptLengthRef.current = updatedTranscript.length
          }

          console.log(`✅ Transcription: "${processedText}" -> Total: "${updatedTranscript}"`)
        }
      } else {
        // Note: the raw result object is block-scoped to the local-model branch above,
        // so it cannot be logged here.
        console.log('⚠️ No transcription text produced')

        // Try alternative transcription parameters (only for local model)
        if (!shouldUseRunPod && transcriberRef.current) {
          console.log('🔄 Trying alternative transcription parameters...')
          try {
            const altResult = await transcriberRef.current(processedAudioData, {
              task: 'transcribe',
              return_timestamps: false
            })
            console.log('🔍 Alternative transcription result:', altResult)

            if (altResult?.text?.trim()) {
              const processedAltText = processTranscript(altResult.text, enableStreaming)
              console.log('✅ Alternative transcription successful:', processedAltText)
              const currentTranscript = transcriptRef.current
              const updatedTranscript = currentTranscript ? `${currentTranscript} ${processedAltText}` : processedAltText

              transcriptRef.current = updatedTranscript
              setTranscript(updatedTranscript)

              // Only send the new portion for continuous transcription
              const newTextPortion = updatedTranscript.substring(previousTranscriptLengthRef.current)
              if (newTextPortion.trim()) {
                onTranscriptUpdate?.(newTextPortion)
                previousTranscriptLengthRef.current = updatedTranscript.length
              }
            }
          } catch (altError) {
            console.log('⚠️ Alternative transcription also failed:', altError)
          }
        }
      }

      // Clear processed chunks
      audioChunksRef.current = []

    } catch (error) {
      console.error('❌ Error processing audio:', error)
      onError?.(error as Error)
    } finally {
      setIsTranscribing(false)
    }
  }, [transcriberRef, language, onTranscriptUpdate, onError, enableStreaming, handleStreamingTranscriptUpdate, modelLoaded, initializeTranscriber, shouldUseRunPod])

  // Start recording
  const startRecording = useCallback(async () => {
    try {
      console.log('🎤 Starting recording...')
      console.log('🔍 enableStreaming in startRecording:', enableStreaming)

      // Ensure model is loaded before starting (skip for RunPod)
      if (!shouldUseRunPod && !modelLoaded) {
        console.log('🔄 Model not loaded, initializing...')
        await initializeTranscriber()
      } else if (shouldUseRunPod) {
        // For RunPod, just mark as ready
        setModelLoaded(true)
      }

      // Don't reset transcripts for continuous transcription - keep existing content
      // transcriptRef.current = ''
      // streamingTranscriptRef.current = ''
      // setTranscript('')
      lastSpeechTimeRef.current = 0
      audioChunksRef.current = []
      lastTranscriptionTimeRef.current = 0

      // Clear any existing periodic transcription timer
      if (periodicTranscriptionRef.current) {
        clearInterval(periodicTranscriptionRef.current)
        periodicTranscriptionRef.current = null
      }

      // Get microphone access
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true,
          sampleRate: 44100,
          channelCount: 1
        }
      })

      streamRef.current = stream

      // Create MediaRecorder with fallback options
      let mediaRecorder: MediaRecorder | undefined
      const options = [
        { mimeType: 'audio/webm;codecs=opus' },
        { mimeType: 'audio/webm' },
        { mimeType: 'audio/ogg;codecs=opus' },
        { mimeType: 'audio/ogg' },
        { mimeType: 'audio/wav' },
        { mimeType: 'audio/mp4' }
      ]

      for (const option of options) {
        if (MediaRecorder.isTypeSupported(option.mimeType)) {
          console.log('🎵 Using MIME type:', option.mimeType)
          mediaRecorder = new MediaRecorder(stream, option)
          break
        }
      }

      if (!mediaRecorder) {
        throw new Error('No supported audio format found')
      }

      // Store the MIME type for later use
      const mimeType = mediaRecorder.mimeType
      console.log('🎵 Final MIME type:', mimeType)

      mediaRecorderRef.current = mediaRecorder

      // Handle data available
      mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 0) {
          // Validate chunk before adding
          if (event.data.size > 1000) { // Only add chunks with meaningful size
            audioChunksRef.current.push(event.data)
            console.log(`📦 Received chunk ${audioChunksRef.current.length}, size: ${event.data.size} bytes, type: ${event.data.type}`)

            // Limit the number of chunks to prevent memory issues
            if (audioChunksRef.current.length > 20) {
              audioChunksRef.current = audioChunksRef.current.slice(-15) // Keep last 15 chunks
            }
          } else {
            console.log(`⚠️ Skipping small chunk: ${event.data.size} bytes`)
          }
        }
      }

      // Handle recording stop
      mediaRecorder.onstop = () => {
        console.log('🛑 Recording stopped, processing audio...')
        processAudioChunks()
      }

      // Handle MediaRecorder state changes
      mediaRecorder.onstart = () => {
        console.log('🎤 MediaRecorder started')
        console.log('🔍 enableStreaming value:', enableStreaming)
        setIsRecording(true)
        isRecordingRef.current = true

        // Start periodic transcription processing for streaming mode
        if (enableStreaming) {
          console.log('🔄 Starting streaming transcription (every 0.8 seconds)')
          periodicTranscriptionRef.current = setInterval(() => {
            console.log('🔄 Interval triggered, isRecordingRef.current:', isRecordingRef.current)
            if (isRecordingRef.current) {
              console.log('🔄 Running periodic streaming transcription...')
              processAccumulatedAudioChunks()
            } else {
              console.log('⚠️ Not running transcription - recording stopped')
            }
          }, 800) // Update every 0.8 seconds for better responsiveness
        } else {
          console.log('ℹ️ Streaming transcription disabled - enableStreaming is false')
        }
      }

      // Start recording with appropriate timeslice
      const timeslice = enableStreaming ? 1000 : 2000 // Larger chunks for more stable processing
      console.log(`🎵 Starting recording with ${timeslice}ms timeslice`)
      mediaRecorder.start(timeslice)
      isRecordingRef.current = true
      setIsRecording(true)

      console.log('✅ Recording started - MediaRecorder state:', mediaRecorder.state)

    } catch (error) {
      console.error('❌ Error starting recording:', error)
      onError?.(error as Error)
    }
  }, [processAudioChunks, processAccumulatedAudioChunks, onError, enableStreaming, modelLoaded, initializeTranscriber, shouldUseRunPod])

  // Stop recording
  const stopRecording = useCallback(async () => {
    try {
      console.log('🛑 Stopping recording...')

      // Clear periodic transcription timer
      if (periodicTranscriptionRef.current) {
        clearInterval(periodicTranscriptionRef.current)
        periodicTranscriptionRef.current = null
      }

      if (mediaRecorderRef.current && isRecordingRef.current) {
        mediaRecorderRef.current.stop()
      }

      if (streamRef.current) {
        streamRef.current.getTracks().forEach(track => track.stop())
        streamRef.current = null
      }

      isRecordingRef.current = false
      setIsRecording(false)

      console.log('✅ Recording stopped')

    } catch (error) {
      console.error('❌ Error stopping recording:', error)
      onError?.(error as Error)
    }
  }, [onError])

  // Pause recording (placeholder for compatibility)
  const pauseRecording = useCallback(async () => {
    console.log('⏸️ Pause recording not implemented')
  }, [])

  // Cleanup function
  const cleanup = useCallback(() => {
    console.log('🧹 Cleaning up transcription resources...')

    // Stop recording if active
    if (isRecordingRef.current) {
      setIsRecording(false)
      isRecordingRef.current = false
    }

    // Clear periodic transcription timer
    if (periodicTranscriptionRef.current) {
      clearInterval(periodicTranscriptionRef.current)
      periodicTranscriptionRef.current = null
    }

    // Stop MediaRecorder if active
    if (mediaRecorderRef.current && mediaRecorderRef.current.state === 'recording') {
      mediaRecorderRef.current.stop()
    }

    // Stop audio stream
    if (streamRef.current) {
      streamRef.current.getTracks().forEach(track => track.stop())
      streamRef.current = null
    }

    // Clear chunks
    audioChunksRef.current = []

    console.log('✅ Cleanup completed')
  }, [])

  // Convenience functions for compatibility
  const startTranscription = useCallback(async () => {
    try {
      console.log('🎤 Starting transcription...')

      // Reset all transcription state for a clean start
      streamingTranscriptRef.current = ''
      setTranscript('')
      setIsRecording(false)
      isRecordingRef.current = false
      lastTranscriptionTimeRef.current = 0

      // Clear any existing timers
      if (periodicTranscriptionRef.current) {
        clearInterval(periodicTranscriptionRef.current)
        periodicTranscriptionRef.current = null
      }

      // Initialize the model if not already loaded (skip for RunPod)
      if (!shouldUseRunPod && !modelLoaded) {
        await initializeTranscriber()
      } else if (shouldUseRunPod) {
        setModelLoaded(true)
      }

      await startRecording()
      console.log('✅ Transcription started')

    } catch (error) {
      console.error('❌ Error starting transcription:', error)
      onError?.(error as Error)
    }
  }, [startRecording, onError, modelLoaded, initializeTranscriber, shouldUseRunPod])

  const stopTranscription = useCallback(async () => {
    try {
      console.log('🛑 Stopping transcription...')
      await stopRecording()
      console.log('✅ Transcription stopped')
    } catch (error) {
      console.error('❌ Error stopping transcription:', error)
      onError?.(error as Error)
    }
  }, [stopRecording, onError])

  const pauseTranscription = useCallback(async () => {
    try {
      console.log('⏸️ Pausing transcription...')
      await pauseRecording()
      console.log('✅ Transcription paused')
    } catch (error) {
      console.error('❌ Error pausing transcription:', error)
      onError?.(error as Error)
    }
  }, [pauseRecording, onError])

  // Initialize model on mount (only if autoInitialize is true)
  useEffect(() => {
    if (autoInitialize) {
      initializeTranscriber().catch(console.warn)
    }
  }, [initializeTranscriber, autoInitialize, shouldUseRunPod])

  // Cleanup on unmount
  useEffect(() => {
    return () => {
      cleanup()
    }
  }, [cleanup])

  return {
    // State
    isRecording,
    isSpeaking,
    isTranscribing,
    transcript,
    modelLoaded,

    // Actions
    startTranscription,
    stopTranscription,
    pauseTranscription,

    // Raw functions for advanced usage
    startRecording,
    stopRecording,
    pauseRecording,
    cleanup
  }
}

// Export both the new consolidated hook and the old name for backward compatibility
export const useWhisperTranscriptionSimple = useWhisperTranscription
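
// Minimal consumer sketch (illustrative only; the component, markup, and import path
// below are hypothetical and not part of this module):
//
//   import { useWhisperTranscription } from './useWhisperTranscription'
//
//   function RecorderButton() {
//     const { isRecording, transcript, startTranscription, stopTranscription } =
//       useWhisperTranscription({ enableStreaming: true })
//
//     return (
//       <div>
//         <button onClick={isRecording ? stopTranscription : startTranscription}>
//           {isRecording ? 'Stop' : 'Record'}
//         </button>
//         <pre>{transcript}</pre>
//       </div>
//     )
//   }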