// Audio analysis utilities for speaker identification and voice activity detection
|
|
|
|
// Per-frame acoustic feature vector extracted by AudioAnalyzer from the
// analyser node's time-domain and frequency-domain data.
export interface VoiceCharacteristics {
  pitch: number // autocorrelation-based fundamental frequency estimate in Hz (0 when undetectable)
  volume: number // mean absolute amplitude of the time-domain frame
  spectralCentroid: number // magnitude-weighted mean frequency of the spectrum, in Hz
  mfcc: number[] // Mel-frequency cepstral coefficients (simplified: 13 averaged frequency bands)
  zeroCrossingRate: number // fraction of adjacent samples whose sign differs
  energy: number // mean squared amplitude (power) of the frame
}
|
|
|
|
// Heuristic profile AudioAnalyzer maintains per detected speaker.
export interface SpeakerProfile {
  id: string // generated id of the form "speaker_<timestamp>_<random>"
  name: string // display name; defaults to "Speaker N", editable via updateSpeakerName()
  voiceCharacteristics: VoiceCharacteristics // running weighted average of observed features
  confidence: number // 0..1; bumped by 0.1 each time the speaker is re-identified
  lastSeen: number // epoch ms of the most recent match for this speaker
  totalSpeakingTime: number // NOTE(review): never updated in this file — confirm intended units (ms?) and writer
}
|
|
|
|
// A transcribed span of audio attributed to a single speaker.
// NOTE(review): not referenced elsewhere in this file — presumably populated
// by the speech-recognition pipeline; confirm against callers.
export interface AudioSegment {
  startTime: number // segment start — units (ms vs s) not established here; TODO confirm
  endTime: number // segment end, same time base as startTime
  speakerId: string // SpeakerProfile.id of the attributed speaker
  transcript: string // recognized text for this span
  confidence: number // recognition confidence — presumably 0..1; verify
  isFinal: boolean // whether the transcript is final (vs an interim hypothesis)
}
|
|
|
|
export class AudioAnalyzer {
|
|
private audioContext: AudioContext | null = null
|
|
private analyser: AnalyserNode | null = null
|
|
private microphone: MediaStreamAudioSourceNode | null = null
|
|
private dataArray: Float32Array | null = null
|
|
private speakers: Map<string, SpeakerProfile> = new Map()
|
|
private currentSpeakerId: string | null = null
|
|
private lastVoiceActivity: number = 0
|
|
private voiceActivityThreshold: number = 0.01
|
|
private silenceTimeout: number = 2000 // 2 seconds of silence before considering speaker change
|
|
|
|
constructor() {
|
|
this.initializeAudioContext()
|
|
}
|
|
|
|
private async initializeAudioContext() {
|
|
try {
|
|
this.audioContext = new (window.AudioContext || (window as any).webkitAudioContext)()
|
|
this.analyser = this.audioContext.createAnalyser()
|
|
this.analyser.fftSize = 2048
|
|
this.analyser.smoothingTimeConstant = 0.8
|
|
|
|
const bufferLength = this.analyser.frequencyBinCount
|
|
this.dataArray = new Float32Array(bufferLength)
|
|
} catch (error) {
|
|
console.error('Failed to initialize audio context:', error)
|
|
}
|
|
}
|
|
|
|
async connectMicrophone(stream: MediaStream): Promise<void> {
|
|
if (!this.audioContext || !this.analyser) {
|
|
await this.initializeAudioContext()
|
|
}
|
|
|
|
if (this.audioContext && this.analyser) {
|
|
this.microphone = this.audioContext.createMediaStreamSource(stream)
|
|
this.microphone.connect(this.analyser)
|
|
console.log('🎤 Microphone connected to audio analyzer')
|
|
}
|
|
}
|
|
|
|
analyzeVoiceCharacteristics(): VoiceCharacteristics | null {
|
|
if (!this.analyser || !this.dataArray) {
|
|
return null
|
|
}
|
|
|
|
this.analyser.getFloatTimeDomainData(this.dataArray as any)
|
|
|
|
// Calculate basic audio features
|
|
const pitch = this.calculatePitch()
|
|
const volume = this.calculateVolume()
|
|
const spectralCentroid = this.calculateSpectralCentroid()
|
|
const mfcc = this.calculateMFCC()
|
|
const zeroCrossingRate = this.calculateZeroCrossingRate()
|
|
const energy = this.calculateEnergy()
|
|
|
|
return {
|
|
pitch,
|
|
volume,
|
|
spectralCentroid,
|
|
mfcc,
|
|
zeroCrossingRate,
|
|
energy
|
|
}
|
|
}
|
|
|
|
private calculatePitch(): number {
|
|
if (!this.dataArray) return 0
|
|
|
|
// Simple autocorrelation-based pitch detection
|
|
const minPeriod = 20 // samples
|
|
const maxPeriod = 200 // samples
|
|
let bestPeriod = 0
|
|
let bestCorrelation = 0
|
|
|
|
for (let period = minPeriod; period < maxPeriod && period < this.dataArray.length / 2; period++) {
|
|
let correlation = 0
|
|
for (let i = 0; i < this.dataArray.length - period; i++) {
|
|
correlation += this.dataArray[i] * this.dataArray[i + period]
|
|
}
|
|
|
|
if (correlation > bestCorrelation) {
|
|
bestCorrelation = correlation
|
|
bestPeriod = period
|
|
}
|
|
}
|
|
|
|
// Convert period to frequency (assuming 44.1kHz sample rate)
|
|
return bestPeriod > 0 ? 44100 / bestPeriod : 0
|
|
}
|
|
|
|
private calculateVolume(): number {
|
|
if (!this.dataArray) return 0
|
|
|
|
let sum = 0
|
|
for (let i = 0; i < this.dataArray.length; i++) {
|
|
sum += Math.abs(this.dataArray[i])
|
|
}
|
|
return sum / this.dataArray.length
|
|
}
|
|
|
|
private calculateSpectralCentroid(): number {
|
|
if (!this.analyser || !this.dataArray) return 0
|
|
|
|
const frequencyData = new Uint8Array(this.analyser.frequencyBinCount)
|
|
this.analyser.getByteFrequencyData(frequencyData)
|
|
|
|
let weightedSum = 0
|
|
let magnitudeSum = 0
|
|
|
|
for (let i = 0; i < frequencyData.length; i++) {
|
|
const magnitude = frequencyData[i]
|
|
const frequency = (i * this.audioContext!.sampleRate) / (2 * frequencyData.length)
|
|
weightedSum += frequency * magnitude
|
|
magnitudeSum += magnitude
|
|
}
|
|
|
|
return magnitudeSum > 0 ? weightedSum / magnitudeSum : 0
|
|
}
|
|
|
|
private calculateMFCC(): number[] {
|
|
// Simplified MFCC calculation - in a real implementation, you'd use a proper FFT
|
|
// For now, return basic frequency domain features
|
|
if (!this.analyser) return []
|
|
|
|
const frequencyData = new Uint8Array(this.analyser.frequencyBinCount)
|
|
this.analyser.getByteFrequencyData(frequencyData)
|
|
|
|
// Extract 13 MFCC-like coefficients by averaging frequency bands
|
|
const mfcc = []
|
|
const bandSize = Math.floor(frequencyData.length / 13)
|
|
|
|
for (let i = 0; i < 13; i++) {
|
|
let sum = 0
|
|
const start = i * bandSize
|
|
const end = Math.min(start + bandSize, frequencyData.length)
|
|
|
|
for (let j = start; j < end; j++) {
|
|
sum += frequencyData[j]
|
|
}
|
|
mfcc.push(sum / (end - start))
|
|
}
|
|
|
|
return mfcc
|
|
}
|
|
|
|
private calculateZeroCrossingRate(): number {
|
|
if (!this.dataArray) return 0
|
|
|
|
let crossings = 0
|
|
for (let i = 1; i < this.dataArray.length; i++) {
|
|
if ((this.dataArray[i] >= 0) !== (this.dataArray[i - 1] >= 0)) {
|
|
crossings++
|
|
}
|
|
}
|
|
return crossings / this.dataArray.length
|
|
}
|
|
|
|
private calculateEnergy(): number {
|
|
if (!this.dataArray) return 0
|
|
|
|
let energy = 0
|
|
for (let i = 0; i < this.dataArray.length; i++) {
|
|
energy += this.dataArray[i] * this.dataArray[i]
|
|
}
|
|
return energy / this.dataArray.length
|
|
}
|
|
|
|
detectVoiceActivity(): boolean {
|
|
const volume = this.calculateVolume()
|
|
const isVoiceActive = volume > this.voiceActivityThreshold
|
|
|
|
if (isVoiceActive) {
|
|
this.lastVoiceActivity = Date.now()
|
|
}
|
|
|
|
return isVoiceActive
|
|
}
|
|
|
|
identifySpeaker(voiceCharacteristics: VoiceCharacteristics): string {
|
|
let bestMatch: string | null = null
|
|
let bestScore = 0
|
|
const threshold = 0.7 // Minimum similarity threshold
|
|
|
|
// Compare with existing speakers
|
|
for (const [speakerId, profile] of this.speakers) {
|
|
const similarity = this.calculateSimilarity(voiceCharacteristics, profile.voiceCharacteristics)
|
|
if (similarity > bestScore && similarity > threshold) {
|
|
bestScore = similarity
|
|
bestMatch = speakerId
|
|
}
|
|
}
|
|
|
|
// If no good match found, create new speaker
|
|
if (!bestMatch) {
|
|
const newSpeakerId = `speaker_${Date.now()}_${Math.random().toString(36).substr(2, 9)}`
|
|
const newSpeaker: SpeakerProfile = {
|
|
id: newSpeakerId,
|
|
name: `Speaker ${this.speakers.size + 1}`,
|
|
voiceCharacteristics,
|
|
confidence: 0.8,
|
|
lastSeen: Date.now(),
|
|
totalSpeakingTime: 0
|
|
}
|
|
this.speakers.set(newSpeakerId, newSpeaker)
|
|
bestMatch = newSpeakerId
|
|
console.log(`🎤 New speaker identified: ${newSpeaker.name} (${newSpeakerId})`)
|
|
} else {
|
|
// Update existing speaker profile
|
|
const speaker = this.speakers.get(bestMatch)!
|
|
speaker.lastSeen = Date.now()
|
|
speaker.confidence = Math.min(1.0, speaker.confidence + 0.1)
|
|
|
|
// Update voice characteristics with weighted average
|
|
const weight = 0.1
|
|
speaker.voiceCharacteristics = {
|
|
pitch: speaker.voiceCharacteristics.pitch * (1 - weight) + voiceCharacteristics.pitch * weight,
|
|
volume: speaker.voiceCharacteristics.volume * (1 - weight) + voiceCharacteristics.volume * weight,
|
|
spectralCentroid: speaker.voiceCharacteristics.spectralCentroid * (1 - weight) + voiceCharacteristics.spectralCentroid * weight,
|
|
mfcc: speaker.voiceCharacteristics.mfcc.map((val, i) =>
|
|
val * (1 - weight) + (voiceCharacteristics.mfcc[i] || 0) * weight
|
|
),
|
|
zeroCrossingRate: speaker.voiceCharacteristics.zeroCrossingRate * (1 - weight) + voiceCharacteristics.zeroCrossingRate * weight,
|
|
energy: speaker.voiceCharacteristics.energy * (1 - weight) + voiceCharacteristics.energy * weight
|
|
}
|
|
}
|
|
|
|
return bestMatch
|
|
}
|
|
|
|
private calculateSimilarity(voice1: VoiceCharacteristics, voice2: VoiceCharacteristics): number {
|
|
// Calculate weighted similarity between voice characteristics
|
|
const pitchSimilarity = 1 - Math.abs(voice1.pitch - voice2.pitch) / Math.max(voice1.pitch, voice2.pitch, 1)
|
|
const volumeSimilarity = 1 - Math.abs(voice1.volume - voice2.volume) / Math.max(voice1.volume, voice2.volume, 0.001)
|
|
const spectralSimilarity = 1 - Math.abs(voice1.spectralCentroid - voice2.spectralCentroid) / Math.max(voice1.spectralCentroid, voice2.spectralCentroid, 1)
|
|
const zcrSimilarity = 1 - Math.abs(voice1.zeroCrossingRate - voice2.zeroCrossingRate) / Math.max(voice1.zeroCrossingRate, voice2.zeroCrossingRate, 0.001)
|
|
const energySimilarity = 1 - Math.abs(voice1.energy - voice2.energy) / Math.max(voice1.energy, voice2.energy, 0.001)
|
|
|
|
// MFCC similarity (simplified)
|
|
let mfccSimilarity = 0
|
|
if (voice1.mfcc.length === voice2.mfcc.length) {
|
|
let sum = 0
|
|
for (let i = 0; i < voice1.mfcc.length; i++) {
|
|
sum += 1 - Math.abs(voice1.mfcc[i] - voice2.mfcc[i]) / Math.max(voice1.mfcc[i], voice2.mfcc[i], 1)
|
|
}
|
|
mfccSimilarity = sum / voice1.mfcc.length
|
|
}
|
|
|
|
// Weighted average of similarities
|
|
return (
|
|
pitchSimilarity * 0.2 +
|
|
volumeSimilarity * 0.15 +
|
|
spectralSimilarity * 0.2 +
|
|
zcrSimilarity * 0.15 +
|
|
energySimilarity * 0.15 +
|
|
mfccSimilarity * 0.15
|
|
)
|
|
}
|
|
|
|
detectSpeakerChange(): boolean {
|
|
const now = Date.now()
|
|
const timeSinceLastActivity = now - this.lastVoiceActivity
|
|
|
|
// If there's been silence for a while, consider it a potential speaker change
|
|
return timeSinceLastActivity > this.silenceTimeout
|
|
}
|
|
|
|
getCurrentSpeaker(): SpeakerProfile | null {
|
|
if (!this.currentSpeakerId) return null
|
|
return this.speakers.get(this.currentSpeakerId) || null
|
|
}
|
|
|
|
getAllSpeakers(): SpeakerProfile[] {
|
|
return Array.from(this.speakers.values())
|
|
}
|
|
|
|
updateSpeakerName(speakerId: string, name: string): void {
|
|
const speaker = this.speakers.get(speakerId)
|
|
if (speaker) {
|
|
speaker.name = name
|
|
console.log(`🎤 Updated speaker name: ${speakerId} -> ${name}`)
|
|
}
|
|
}
|
|
|
|
getSpeakerById(speakerId: string): SpeakerProfile | null {
|
|
return this.speakers.get(speakerId) || null
|
|
}
|
|
|
|
cleanup(): void {
|
|
if (this.microphone) {
|
|
this.microphone.disconnect()
|
|
this.microphone = null
|
|
}
|
|
if (this.audioContext) {
|
|
this.audioContext.close()
|
|
this.audioContext = null
|
|
}
|
|
this.analyser = null
|
|
this.dataArray = null
|
|
}
|
|
}
|
|
|
|
// Global audio analyzer instance. Shared module-level singleton; constructing
// it eagerly kicks off best-effort AudioContext initialization at import time
// (failures are caught and logged inside the constructor path).
export const audioAnalyzer = new AudioAnalyzer()
|