// canvas-website/src/hooks/useAdvancedSpeakerDiarizati...

import { useState, useRef, useCallback, useEffect } from 'react'

interface SpeakerSegment {
  speaker: string
  text: string
  startTime: number
  endTime: number
  confidence: number
}

interface UseAdvancedSpeakerDiarizationOptions {
  onTranscriptUpdate?: (segments: SpeakerSegment[]) => void
  onError?: (error: Error) => void
  maxSpeakers?: number
  enableRealTime?: boolean
}

export const useAdvancedSpeakerDiarization = ({
  onTranscriptUpdate,
  onError,
  maxSpeakers = 4,
  enableRealTime = false
}: UseAdvancedSpeakerDiarizationOptions = {}) => {
  const [isProcessing, setIsProcessing] = useState(false)
  const [speakers, setSpeakers] = useState<string[]>([])
  const [segments, setSegments] = useState<SpeakerSegment[]>([])
  const [isSupported, setIsSupported] = useState(false)

  const audioContextRef = useRef<AudioContext | null>(null)
  const mediaStreamRef = useRef<MediaStream | null>(null)
  const processorRef = useRef<ScriptProcessorNode | null>(null)

  // Check if advanced features are supported
  useEffect(() => {
    // Check for Web Audio API and Media Devices support
    const hasWebAudio = !!(window.AudioContext || (window as any).webkitAudioContext)
    const hasMediaDevices = !!(navigator.mediaDevices && navigator.mediaDevices.getUserMedia)
    setIsSupported(hasWebAudio && hasMediaDevices)

    if (!hasWebAudio) {
      onError?.(new Error('Web Audio API is not supported'))
    }
    if (!hasMediaDevices) {
      onError?.(new Error('Media Devices API is not supported'))
    }
  }, [onError])

  // Heuristic speaker detection based on basic audio characteristics
  const detectSpeakerCharacteristics = useCallback((audioData: Float32Array) => {
    // Basic audio features: RMS energy, peak amplitude, zero-crossing count
    const rms = Math.sqrt(audioData.reduce((sum, val) => sum + val * val, 0) / audioData.length)
    const maxAmplitude = audioData.reduce((max, val) => Math.max(max, Math.abs(val)), 0)
    const zeroCrossings = audioData.slice(1).reduce((count, val, i) =>
      count + (Math.sign(val) !== Math.sign(audioData[i]) ? 1 : 0), 0
    )

    // Bucket the frame by energy into one of `maxSpeakers` labels.
    // This is a placeholder heuristic, not true speaker diarization.
    const speakerId = `Speaker_${(Math.floor(rms * 1000) % maxSpeakers) + 1}`

    return {
      speakerId,
      confidence: Math.min(rms * 10, 1), // Rough confidence derived from RMS energy
      features: {
        rms,
        maxAmplitude,
        zeroCrossings
      }
    }
  }, [maxSpeakers])

  // Process one frame of audio for speaker diarization
  const processAudioData = useCallback((audioData: Float32Array, timestamp: number) => {
    if (!enableRealTime) return

    const speakerInfo = detectSpeakerCharacteristics(audioData)

    // Create a simple segment for this frame
    const segment: SpeakerSegment = {
      speaker: speakerInfo.speakerId,
      text: '', // Would need transcription integration
      startTime: timestamp,
      endTime: timestamp + audioData.length / 16000, // Matches the 16 kHz AudioContext below
      confidence: speakerInfo.confidence
    }

    // Update segments
    setSegments(prev => [...prev, segment])

    // Update speakers list
    setSpeakers(prev => {
      if (!prev.includes(speakerInfo.speakerId)) {
        return [...prev, speakerInfo.speakerId]
      }
      return prev
    })

    // Note: only the newest segment is emitted here, not the full transcript
    onTranscriptUpdate?.([segment])
  }, [enableRealTime, detectSpeakerCharacteristics, onTranscriptUpdate])

  // Start audio capture and real-time processing
  const startProcessing = useCallback(async () => {
    if (!isSupported) {
      onError?.(new Error('Advanced speaker diarization not supported'))
      return
    }

    try {
      setIsProcessing(true)

      // Get a mono 16 kHz microphone stream
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          sampleRate: 16000,
          channelCount: 1,
          echoCancellation: true,
          noiseSuppression: true
        }
      })
      mediaStreamRef.current = stream

      // Create the audio context (webkit prefix for older Safari)
      const AudioContextCtor = window.AudioContext || (window as any).webkitAudioContext
      const audioContext = new AudioContextCtor({ sampleRate: 16000 })
      audioContextRef.current = audioContext

      // Create audio source
      const source = audioContext.createMediaStreamSource(stream)

      // Create processor for real-time analysis.
      // ScriptProcessorNode is deprecated in favor of AudioWorklet,
      // but it is still widely supported and keeps this hook simple.
      const processor = audioContext.createScriptProcessor(4096, 1, 1)
      processorRef.current = processor

      processor.onaudioprocess = (event) => {
        const audioData = event.inputBuffer.getChannelData(0)
        const timestamp = audioContext.currentTime
        processAudioData(audioData, timestamp)
      }

      // Connect source -> processor -> destination so the processor actually runs
      source.connect(processor)
      processor.connect(audioContext.destination)

      console.log('🎤 Advanced speaker diarization started')
    } catch (error) {
      console.error('❌ Error starting speaker diarization:', error)
      onError?.(error as Error)
      setIsProcessing(false)
    }
  }, [isSupported, processAudioData, onError])

  // Stop audio processing
  const stopProcessing = useCallback(() => {
    if (mediaStreamRef.current) {
      mediaStreamRef.current.getTracks().forEach(track => track.stop())
      mediaStreamRef.current = null
    }
    if (processorRef.current) {
      processorRef.current.disconnect()
      processorRef.current = null
    }
    if (audioContextRef.current) {
      audioContextRef.current.close()
      audioContextRef.current = null
    }
    setIsProcessing(false)
    console.log('🛑 Advanced speaker diarization stopped')
  }, [])

  // Cleanup on unmount
  useEffect(() => {
    return () => {
      stopProcessing()
    }
  }, [stopProcessing])

  // Format segments as readable text
  const formatSegmentsAsText = useCallback((segments: SpeakerSegment[]) => {
    return segments.map(segment =>
      `${segment.speaker}: ${segment.text}`
    ).join('\n')
  }, [])

  return {
    isProcessing,
    isSupported,
    speakers,
    segments,
    startProcessing,
    stopProcessing,
    formatSegmentsAsText
  }
}

export default useAdvancedSpeakerDiarization
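
/*
  Example usage: a minimal sketch of how a component might consume this hook.
  The `TranscriptPanel` component below is hypothetical and not part of this
  hook or repository; it only exercises the values the hook actually returns.
  (JSX shown for illustration; a real component would live in a .tsx file.)

  const TranscriptPanel = () => {
    const {
      isSupported,
      isProcessing,
      segments,
      startProcessing,
      stopProcessing,
      formatSegmentsAsText
    } = useAdvancedSpeakerDiarization({
      enableRealTime: true,
      maxSpeakers: 2,
      onError: (err) => console.error(err)
    })

    if (!isSupported) {
      return <p>Speaker diarization is not supported in this browser.</p>
    }

    return (
      <div>
        <button onClick={isProcessing ? stopProcessing : startProcessing}>
          {isProcessing ? 'Stop' : 'Start'}
        </button>
        <pre>{formatSegmentsAsText(segments)}</pre>
      </div>
    )
  }
*/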