rspace-online/lib/mi-voice-bridge.ts

241 lines
6.2 KiB
TypeScript

/**
* MiVoiceBridge — TTS output via Edge TTS bridge + Web Speech Synthesis fallback.
*
* Connects to claude-voice.jeffemmett.com for high-quality neural voice synthesis.
* Falls back to browser speechSynthesis if the bridge is unavailable.
*/
/** UI-facing lifecycle states of the voice pipeline. */
export type VoiceState = "idle" | "listening" | "thinking" | "speaking";
/** Construction options for MiVoiceBridge. */
export interface MiVoiceBridgeOptions {
// Base HTTP(S) URL of the Edge TTS bridge; defaults to DEFAULT_BRIDGE.
bridgeUrl?: string;
// Edge TTS voice name; defaults to "en-US-AvaMultilingualNeural".
voice?: string;
// State-change callback. NOTE(review): stored by the class but not invoked
// anywhere in this file — presumably wired up elsewhere; confirm.
onStateChange?: (state: VoiceState) => void;
}
// Default bridge host serving both the WS audio channel and the TTS REST API.
const DEFAULT_BRIDGE = "https://claude-voice.jeffemmett.com";
// WebSocket path on which binary audio frames are pushed back to the client.
const WS_PATH = "/ws/audio";
// REST path accepting a JSON TTS synthesis request (POST).
const TTS_PATH = "/api/tts/speak";
export class MiVoiceBridge {
  #bridgeUrl: string;
  #voice: string;
  // Stored from options; not invoked in this class (kept for API compatibility).
  #onStateChange: ((s: VoiceState) => void) | null;
  #ws: WebSocket | null = null;
  #audioCtx: AudioContext | null = null;
  #currentSource: AudioBufferSourceNode | null = null;
  #speaking = false;
  #destroyed = false;
  // Resolver of the in-flight speak() promise; stop() invokes it to end early.
  #speakResolve: (() => void) | null = null;

  constructor(opts: MiVoiceBridgeOptions = {}) {
    this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE;
    this.#voice = opts.voice ?? "en-US-AvaMultilingualNeural";
    this.#onStateChange = opts.onStateChange ?? null;
  }

  /** True while a speak() call is in flight (bridge or browser fallback). */
  get isSpeaking(): boolean {
    return this.#speaking;
  }

  /** Switch the Edge TTS voice used for subsequent speak() calls. */
  setVoice(voice: string): void {
    this.#voice = voice;
  }

  /**
   * Speak `text` aloud. Tries the Edge TTS bridge first; on any bridge
   * failure falls back to the browser's speechSynthesis. Resolves when
   * playback finishes (or is cut short by stop()). No-op on empty text
   * or after destroy().
   */
  async speak(text: string): Promise<void> {
    if (this.#destroyed || !text.trim()) return;
    this.#speaking = true;
    try {
      await this.#speakViaBridge(text);
    } catch {
      // Bridge unavailable — fall back to browser TTS
      await this.#speakViaBrowser(text);
    } finally {
      this.#speaking = false;
    }
  }

  /** Immediately stop any ongoing playback and settle the pending speak(). */
  stop(): void {
    // Stop AudioContext playback (bridge path).
    if (this.#currentSource) {
      try { this.#currentSource.stop(); } catch { /* already stopped */ }
      this.#currentSource = null;
    }
    // Stop browser TTS. Guard `window` so stop() is safe outside a browser (SSR).
    if (typeof window !== "undefined" && window.speechSynthesis?.speaking) {
      window.speechSynthesis.cancel();
    }
    this.#speaking = false;
    if (this.#speakResolve) {
      this.#speakResolve();
      this.#speakResolve = null;
    }
  }

  /** Tear down: stop playback, close the WebSocket and the AudioContext. */
  destroy(): void {
    this.#destroyed = true;
    this.stop();
    if (this.#ws) {
      this.#ws.close();
      this.#ws = null;
    }
    if (this.#audioCtx) {
      // FIX: close() is async and rejects if the context is already closed;
      // swallow so destroy() never produces an unhandled rejection.
      void this.#audioCtx.close().catch(() => { /* already closed */ });
      this.#audioCtx = null;
    }
  }

  // ── Bridge TTS ──

  /** Lazily (re)create the AudioContext and attempt to resume it. */
  async #ensureAudioCtx(): Promise<AudioContext> {
    if (!this.#audioCtx || this.#audioCtx.state === "closed") {
      this.#audioCtx = new AudioContext();
    }
    if (this.#audioCtx.state === "suspended") {
      try { await this.#audioCtx.resume(); } catch { /* gesture may be required */ }
    }
    return this.#audioCtx;
  }

  /** Open (or reuse) the bridge WebSocket; rejects after a 5s connect timeout. */
  #connectWs(): Promise<WebSocket> {
    if (this.#ws && this.#ws.readyState === WebSocket.OPEN) {
      return Promise.resolve(this.#ws);
    }
    return new Promise((resolve, reject) => {
      const wsUrl = this.#bridgeUrl.replace(/^http/, "ws") + WS_PATH;
      const ws = new WebSocket(wsUrl);
      ws.binaryType = "arraybuffer"; // audio frames arrive as ArrayBuffer
      const timeout = setTimeout(() => {
        ws.close(); // closing prevents a late onopen from leaking the socket
        reject(new Error("WS connect timeout"));
      }, 5000);
      ws.onopen = () => {
        clearTimeout(timeout);
        this.#ws = ws;
        resolve(ws);
      };
      ws.onerror = () => {
        clearTimeout(timeout);
        reject(new Error("WS connection failed"));
      };
      ws.onclose = () => {
        if (this.#ws === ws) this.#ws = null;
      };
    });
  }

  /**
   * Decode and play one binary audio frame.
   * Frame format: [4B header_len big-endian][JSON header][MP3 bytes].
   * Resolves when playback ends (onended).
   */
  async #playFrame(buf: ArrayBuffer): Promise<void> {
    const view = new DataView(buf);
    const headerLen = view.getUint32(0, false); // big-endian
    const headerJson = new TextDecoder().decode(buf.slice(4, 4 + headerLen));
    let gainValue = 1;
    try {
      const header = JSON.parse(headerJson) as { volume?: unknown };
      if (typeof header.volume === "number") {
        gainValue = Math.max(0, Math.min(1, header.volume)); // clamp to [0, 1]
      }
    } catch { /* ignore bad header */ }
    // slice() already yields a fresh copy — needed because decodeAudioData
    // detaches the buffer it is given. (Original's extra .slice(0) was redundant.)
    const mp3Bytes = buf.slice(4 + headerLen);
    const ctx = await this.#ensureAudioCtx();
    const audioBuffer = await ctx.decodeAudioData(mp3Bytes);
    if (ctx.state === "suspended") {
      try { await ctx.resume(); } catch { /* ignore */ }
    }
    return new Promise<void>((resolve) => {
      const source = ctx.createBufferSource();
      source.buffer = audioBuffer;
      const gain = ctx.createGain();
      gain.gain.value = gainValue;
      source.connect(gain);
      gain.connect(ctx.destination);
      this.#currentSource = source;
      source.onended = () => {
        if (this.#currentSource === source) this.#currentSource = null;
        resolve();
      };
      source.start();
    });
  }

  /**
   * Request synthesis over HTTP and play the audio frame pushed back on the WS.
   * Rejects if the POST fails, decode fails, or no frame arrives within 15s.
   */
  async #speakViaBridge(text: string): Promise<void> {
    // Connect WS first so we're ready to receive audio
    const ws = await this.#connectWs();
    // FIX: plain (non-async) executor — an async executor swallows throws
    // instead of rejecting the promise.
    return new Promise<void>((resolve, reject) => {
      this.#speakResolve = resolve; // lets stop() settle this speak early
      let frameTimer: ReturnType<typeof setTimeout> | undefined;

      // Single settle point: detach listener and timer, clear state, settle once.
      const finish = (err?: unknown): void => {
        ws.removeEventListener("message", handler);
        if (frameTimer !== undefined) clearTimeout(frameTimer);
        if (this.#speakResolve === resolve) this.#speakResolve = null;
        if (err === undefined) resolve();
        else reject(err);
      };

      const handler = (ev: MessageEvent): void => {
        if (!(ev.data instanceof ArrayBuffer)) return;
        ws.removeEventListener("message", handler);
        // FIX: cancel the no-frame timeout the moment audio arrives. Previously
        // it kept running through playback, so any utterance playing past the
        // 15s mark was spuriously rejected and speak() then ALSO spoke the same
        // text via the browser fallback, over the still-playing bridge audio.
        if (frameTimer !== undefined) {
          clearTimeout(frameTimer);
          frameTimer = undefined;
        }
        this.#playFrame(ev.data).then(
          () => finish(),
          (e: unknown) => finish(e),
        );
      };
      ws.addEventListener("message", handler);

      // POST the TTS request; the audio itself comes back over the WS.
      fetch(`${this.#bridgeUrl}${TTS_PATH}`, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({ text, voice: this.#voice, volume: 0.55, rate: "-8%", pitch: "-2Hz" }),
      })
        .then((res) => {
          // FIX: settle exactly once on HTTP failure (original fell through
          // without returning, leaving the timeout armed after rejection).
          if (!res.ok) finish(new Error(`TTS POST failed: ${res.status}`));
        })
        .catch((err: unknown) => finish(err));

      // Timeout: if no audio frame in 15s, reject
      frameTimer = setTimeout(() => finish(new Error("TTS audio timeout")), 15000);
    });
  }

  // ── Browser fallback ──

  /** Speak via the browser's speechSynthesis; resolves on end OR error (best-effort). */
  async #speakViaBrowser(text: string): Promise<void> {
    // Guard `window` so this is a no-op outside a browser (SSR).
    if (typeof window === "undefined" || !window.speechSynthesis) return;
    return new Promise<void>((resolve) => {
      this.#speakResolve = resolve;
      const utterance = new SpeechSynthesisUtterance(text);
      utterance.rate = 0.95;   // slightly slower than default
      utterance.pitch = 0.9;   // slightly lower than default
      utterance.volume = 0.55; // matches the bridge request volume
      const done = (): void => {
        this.#speakResolve = null;
        resolve();
      };
      utterance.onend = done;
      utterance.onerror = done; // errors count as completion — fallback is best-effort
      window.speechSynthesis.speak(utterance);
    });
  }
}