rspace-online/lib/mi-voice-bridge.ts

241 lines
6.2 KiB
TypeScript

/**
* MiVoiceBridge — TTS output via Edge TTS bridge + Web Speech Synthesis fallback.
*
* Connects to claude-voice.jeffemmett.com for high-quality neural voice synthesis.
* Falls back to browser speechSynthesis if the bridge is unavailable.
*/
/** UI-facing lifecycle states of the voice pipeline. */
export type VoiceState = "idle" | "listening" | "thinking" | "speaking";
/** Construction options for MiVoiceBridge. */
export interface MiVoiceBridgeOptions {
// Base HTTP(S) URL of the Edge TTS bridge; defaults to DEFAULT_BRIDGE.
bridgeUrl?: string;
// Edge TTS voice name; defaults to "en-US-AvaMultilingualNeural".
voice?: string;
// State-change callback. NOTE(review): stored by the class but not invoked
// anywhere in this file — presumably wired up elsewhere; confirm.
onStateChange?: (state: VoiceState) => void;
}
// Default bridge host serving both the WS audio channel and the TTS REST API.
const DEFAULT_BRIDGE = "https://claude-voice.jeffemmett.com";
// WebSocket path on which binary audio frames are pushed back to the client.
const WS_PATH = "/ws/audio";
// REST path accepting a JSON TTS synthesis request (POST).
const TTS_PATH = "/api/tts/speak";
export class MiVoiceBridge {
  #bridgeUrl: string;
  #voice: string;
  // Stored from options; not invoked in this class (kept for API compatibility).
  #onStateChange: ((s: VoiceState) => void) | null;
  #ws: WebSocket | null = null;
  #audioCtx: AudioContext | null = null;
  #currentSource: AudioBufferSourceNode | null = null;
  #speaking = false;
  #destroyed = false;
  // Resolver of the in-flight speak() promise; stop() invokes it to end early.
  #speakResolve: (() => void) | null = null;

  constructor(opts: MiVoiceBridgeOptions = {}) {
    this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE;
    this.#voice = opts.voice ?? "en-US-AvaMultilingualNeural";
    this.#onStateChange = opts.onStateChange ?? null;
  }

  /** True while a speak() call is in flight (bridge or browser fallback). */
  get isSpeaking(): boolean {
    return this.#speaking;
  }

  /** Switch the Edge TTS voice used for subsequent speak() calls. */
  setVoice(voice: string): void {
    this.#voice = voice;
  }

  /**
   * Speak `text` aloud. Tries the Edge TTS bridge first; on any bridge
   * failure falls back to the browser's speechSynthesis. Resolves when
   * playback finishes (or is cut short by stop()). No-op on empty text
   * or after destroy().
   */
  async speak(text: string): Promise<void> {
    if (this.#destroyed || !text.trim()) return;
    this.#speaking = true;
    try {
      await this.#speakViaBridge(text);
    } catch {
      // Bridge unavailable — fall back to browser TTS
      await this.#speakViaBrowser(text);
    } finally {
      this.#speaking = false;
    }
  }

  /** Immediately stop any ongoing playback and settle the pending speak(). */
  stop(): void {
    // Stop AudioContext playback (bridge path).
    if (this.#currentSource) {
      try { this.#currentSource.stop(); } catch { /* already stopped */ }
      this.#currentSource = null;
    }
    // Stop browser TTS. Guard `window` so stop() is safe outside a browser (SSR).
    if (typeof window !== "undefined" && window.speechSynthesis?.speaking) {
      window.speechSynthesis.cancel();
    }
    this.#speaking = false;
    if (this.#speakResolve) {
      this.#speakResolve();
      this.#speakResolve = null;
    }
  }

  /** Tear down: stop playback, close the WebSocket and the AudioContext. */
  destroy(): void {
    this.#destroyed = true;
    this.stop();
    if (this.#ws) {
      this.#ws.close();
      this.#ws = null;
    }
    if (this.#audioCtx) {
      // FIX: close() is async and rejects if the context is already closed;
      // swallow so destroy() never produces an unhandled rejection.
      void this.#audioCtx.close().catch(() => { /* already closed */ });
      this.#audioCtx = null;
    }
  }

  // ── Bridge TTS ──

  /** Lazily (re)create the AudioContext and attempt to resume it. */
  async #ensureAudioCtx(): Promise<AudioContext> {
    if (!this.#audioCtx || this.#audioCtx.state === "closed") {
      this.#audioCtx = new AudioContext();
    }
    if (this.#audioCtx.state === "suspended") {
      try { await this.#audioCtx.resume(); } catch { /* gesture may be required */ }
    }
    return this.#audioCtx;
  }

  /** Open (or reuse) the bridge WebSocket; rejects after a 5s connect timeout. */
  #connectWs(): Promise<WebSocket> {
    if (this.#ws && this.#ws.readyState === WebSocket.OPEN) {
      return Promise.resolve(this.#ws);
    }
    return new Promise((resolve, reject) => {
      const wsUrl = this.#bridgeUrl.replace(/^http/, "ws") + WS_PATH;
      const ws = new WebSocket(wsUrl);
      ws.binaryType = "arraybuffer"; // audio frames arrive as ArrayBuffer
      const timeout = setTimeout(() => {
        ws.close(); // closing prevents a late onopen from leaking the socket
        reject(new Error("WS connect timeout"));
      }, 5000);
      ws.onopen = () => {
        clearTimeout(timeout);
        this.#ws = ws;
        resolve(ws);
      };
      ws.onerror = () => {
        clearTimeout(timeout);
        reject(new Error("WS connection failed"));
      };
      ws.onclose = () => {
        if (this.#ws === ws) this.#ws = null;
      };
    });
  }

  /**
   * Decode and play one binary audio frame.
   * Frame format: [4B header_len big-endian][JSON header][MP3 bytes].
   * Resolves when playback ends (onended).
   */
  async #playFrame(buf: ArrayBuffer): Promise<void> {
    const view = new DataView(buf);
    const headerLen = view.getUint32(0, false); // big-endian
    const headerJson = new TextDecoder().decode(buf.slice(4, 4 + headerLen));
    let gainValue = 1;
    try {
      const header = JSON.parse(headerJson) as { volume?: unknown };
      if (typeof header.volume === "number") {
        gainValue = Math.max(0, Math.min(1, header.volume)); // clamp to [0, 1]
      }
    } catch { /* ignore bad header */ }
    // slice() already yields a fresh copy — needed because decodeAudioData
    // detaches the buffer it is given. (Original's extra .slice(0) was redundant.)
    const mp3Bytes = buf.slice(4 + headerLen);
    const ctx = await this.#ensureAudioCtx();
    const audioBuffer = await ctx.decodeAudioData(mp3Bytes);
    if (ctx.state === "suspended") {
      try { await ctx.resume(); } catch { /* ignore */ }
    }
    return new Promise<void>((resolve) => {
      const source = ctx.createBufferSource();
      source.buffer = audioBuffer;
      const gain = ctx.createGain();
      gain.gain.value = gainValue;
      source.connect(gain);
      gain.connect(ctx.destination);
      this.#currentSource = source;
      source.onended = () => {
        if (this.#currentSource === source) this.#currentSource = null;
        resolve();
      };
      source.start();
    });
  }

  /**
   * Request synthesis over HTTP and play the audio frame pushed back on the WS.
   * Rejects if the POST fails, decode fails, or no frame arrives within 15s.
   */
  async #speakViaBridge(text: string): Promise<void> {
    // Connect WS first so we're ready to receive audio
    const ws = await this.#connectWs();
    // FIX: plain (non-async) executor — an async executor swallows throws
    // instead of rejecting the promise.
    return new Promise<void>((resolve, reject) => {
      this.#speakResolve = resolve; // lets stop() settle this speak early
      let frameTimer: ReturnType<typeof setTimeout> | undefined;

      // Single settle point: detach listener and timer, clear state, settle once.
      const finish = (err?: unknown): void => {
        ws.removeEventListener("message", handler);
        if (frameTimer !== undefined) clearTimeout(frameTimer);
        if (this.#speakResolve === resolve) this.#speakResolve = null;
        if (err === undefined) resolve();
        else reject(err);
      };

      const handler = (ev: MessageEvent): void => {
        if (!(ev.data instanceof ArrayBuffer)) return;
        ws.removeEventListener("message", handler);
        // FIX: cancel the no-frame timeout the moment audio arrives. Previously
        // it kept running through playback, so any utterance playing past the
        // 15s mark was spuriously rejected and speak() then ALSO spoke the same
        // text via the browser fallback, over the still-playing bridge audio.
        if (frameTimer !== undefined) {
          clearTimeout(frameTimer);
          frameTimer = undefined;
        }
        this.#playFrame(ev.data).then(
          () => finish(),
          (e: unknown) => finish(e),
        );
      };
      ws.addEventListener("message", handler);

      // POST the TTS request; the audio itself comes back over the WS.
      fetch(`${this.#bridgeUrl}${TTS_PATH}`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ text, voice: this.#voice, volume: 0.55, rate: "-8%", pitch: "-2Hz" }),
      })
        .then((res) => {
          // FIX: settle exactly once on HTTP failure (original fell through
          // without returning, leaving the timeout armed after rejection).
          if (!res.ok) finish(new Error(`TTS POST failed: ${res.status}`));
        })
        .catch((err: unknown) => finish(err));

      // Timeout: if no audio frame in 15s, reject
      frameTimer = setTimeout(() => finish(new Error("TTS audio timeout")), 15000);
    });
  }

  // ── Browser fallback ──

  /** Speak via the browser's speechSynthesis; resolves on end OR error (best-effort). */
  async #speakViaBrowser(text: string): Promise<void> {
    // Guard `window` so this is a no-op outside a browser (SSR).
    if (typeof window === "undefined" || !window.speechSynthesis) return;
    return new Promise<void>((resolve) => {
      this.#speakResolve = resolve;
      const utterance = new SpeechSynthesisUtterance(text);
      utterance.rate = 0.95;   // slightly slower than default
      utterance.pitch = 0.9;   // slightly lower than default
      utterance.volume = 0.55; // matches the bridge request volume
      const done = (): void => {
        this.#speakResolve = null;
        resolve();
      };
      utterance.onend = done;
      utterance.onerror = done; // errors count as completion — fallback is best-effort
      window.speechSynthesis.speak(utterance);
    });
  }
}