/**
 * MiVoiceBridge — TTS output via Edge TTS bridge + Web Speech Synthesis fallback.
 *
 * Connects to claude-voice.jeffemmett.com for high-quality neural voice synthesis.
 * Falls back to browser speechSynthesis if the bridge is unavailable.
 */
|
|
|
|
export type VoiceState = "idle" | "listening" | "thinking" | "speaking";
|
|
|
|
export interface MiVoiceBridgeOptions {
|
|
bridgeUrl?: string;
|
|
voice?: string;
|
|
onStateChange?: (state: VoiceState) => void;
|
|
}
|
|
|
|
const DEFAULT_BRIDGE = "https://claude-voice.jeffemmett.com";
|
|
const WS_PATH = "/ws/audio";
|
|
const TTS_PATH = "/api/tts/speak";
|
|
|
|
export class MiVoiceBridge {
|
|
#bridgeUrl: string;
|
|
#voice: string;
|
|
#onStateChange: ((s: VoiceState) => void) | null;
|
|
#ws: WebSocket | null = null;
|
|
#audioCtx: AudioContext | null = null;
|
|
#currentSource: AudioBufferSourceNode | null = null;
|
|
#speaking = false;
|
|
#destroyed = false;
|
|
#speakResolve: (() => void) | null = null;
|
|
|
|
constructor(opts: MiVoiceBridgeOptions = {}) {
|
|
this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE;
|
|
this.#voice = opts.voice ?? "en-US-EmmaMultilingualNeural";
|
|
this.#onStateChange = opts.onStateChange ?? null;
|
|
}
|
|
|
|
get isSpeaking(): boolean {
|
|
return this.#speaking;
|
|
}
|
|
|
|
setVoice(voice: string): void {
|
|
this.#voice = voice;
|
|
}
|
|
|
|
async speak(text: string): Promise<void> {
|
|
if (this.#destroyed || !text.trim()) return;
|
|
this.#speaking = true;
|
|
|
|
try {
|
|
await this.#speakViaBridge(text);
|
|
} catch {
|
|
// Bridge unavailable — fall back to browser TTS
|
|
await this.#speakViaBrowser(text);
|
|
} finally {
|
|
this.#speaking = false;
|
|
}
|
|
}
|
|
|
|
stop(): void {
|
|
// Stop AudioContext playback
|
|
if (this.#currentSource) {
|
|
try { this.#currentSource.stop(); } catch { /* already stopped */ }
|
|
this.#currentSource = null;
|
|
}
|
|
// Stop browser TTS
|
|
if (window.speechSynthesis?.speaking) {
|
|
window.speechSynthesis.cancel();
|
|
}
|
|
this.#speaking = false;
|
|
if (this.#speakResolve) {
|
|
this.#speakResolve();
|
|
this.#speakResolve = null;
|
|
}
|
|
}
|
|
|
|
destroy(): void {
|
|
this.#destroyed = true;
|
|
this.stop();
|
|
if (this.#ws) {
|
|
this.#ws.close();
|
|
this.#ws = null;
|
|
}
|
|
if (this.#audioCtx) {
|
|
this.#audioCtx.close();
|
|
this.#audioCtx = null;
|
|
}
|
|
}
|
|
|
|
// ── Bridge TTS ──
|
|
|
|
async #ensureAudioCtx(): Promise<AudioContext> {
|
|
if (!this.#audioCtx || this.#audioCtx.state === "closed") {
|
|
this.#audioCtx = new AudioContext();
|
|
}
|
|
if (this.#audioCtx.state === "suspended") {
|
|
try { await this.#audioCtx.resume(); } catch { /* gesture may be required */ }
|
|
}
|
|
return this.#audioCtx;
|
|
}
|
|
|
|
#connectWs(): Promise<WebSocket> {
|
|
if (this.#ws && this.#ws.readyState === WebSocket.OPEN) {
|
|
return Promise.resolve(this.#ws);
|
|
}
|
|
|
|
return new Promise((resolve, reject) => {
|
|
const wsUrl = this.#bridgeUrl.replace(/^http/, "ws") + WS_PATH;
|
|
const ws = new WebSocket(wsUrl);
|
|
ws.binaryType = "arraybuffer";
|
|
|
|
const timeout = setTimeout(() => {
|
|
ws.close();
|
|
reject(new Error("WS connect timeout"));
|
|
}, 5000);
|
|
|
|
ws.onopen = () => {
|
|
clearTimeout(timeout);
|
|
this.#ws = ws;
|
|
resolve(ws);
|
|
};
|
|
|
|
ws.onerror = () => {
|
|
clearTimeout(timeout);
|
|
reject(new Error("WS connection failed"));
|
|
};
|
|
|
|
ws.onclose = () => {
|
|
if (this.#ws === ws) this.#ws = null;
|
|
};
|
|
});
|
|
}
|
|
|
|
async #speakViaBridge(text: string): Promise<void> {
|
|
// Connect WS first so we're ready to receive audio
|
|
const ws = await this.#connectWs();
|
|
|
|
return new Promise<void>(async (resolve, reject) => {
|
|
this.#speakResolve = resolve;
|
|
|
|
// Listen for the audio frame
|
|
const handler = async (ev: MessageEvent) => {
|
|
if (!(ev.data instanceof ArrayBuffer)) return;
|
|
ws.removeEventListener("message", handler);
|
|
|
|
try {
|
|
const buf = ev.data as ArrayBuffer;
|
|
const view = new DataView(buf);
|
|
|
|
// Frame format: [4B header_len big-endian][JSON header][MP3 bytes]
|
|
const headerLen = view.getUint32(0, false);
|
|
const headerJson = new TextDecoder().decode(buf.slice(4, 4 + headerLen));
|
|
let gainValue = 1;
|
|
try {
|
|
const header = JSON.parse(headerJson);
|
|
if (typeof header.volume === "number") {
|
|
gainValue = Math.max(0, Math.min(1, header.volume));
|
|
}
|
|
} catch { /* ignore bad header */ }
|
|
const mp3Bytes = buf.slice(4 + headerLen);
|
|
|
|
const ctx = await this.#ensureAudioCtx();
|
|
const audioBuffer = await ctx.decodeAudioData(mp3Bytes.slice(0)); // slice to copy
|
|
const source = ctx.createBufferSource();
|
|
source.buffer = audioBuffer;
|
|
const gain = ctx.createGain();
|
|
gain.gain.value = gainValue;
|
|
source.connect(gain);
|
|
gain.connect(ctx.destination);
|
|
this.#currentSource = source;
|
|
if (ctx.state === "suspended") {
|
|
try { await ctx.resume(); } catch { /* ignore */ }
|
|
}
|
|
|
|
source.onended = () => {
|
|
this.#currentSource = null;
|
|
this.#speakResolve = null;
|
|
resolve();
|
|
};
|
|
|
|
source.start();
|
|
} catch (err) {
|
|
this.#speakResolve = null;
|
|
reject(err);
|
|
}
|
|
};
|
|
|
|
ws.addEventListener("message", handler);
|
|
|
|
// POST the TTS request
|
|
try {
|
|
const res = await fetch(`${this.#bridgeUrl}${TTS_PATH}`, {
|
|
method: "POST",
|
|
headers: { "Content-Type": "application/json" },
|
|
body: JSON.stringify({ text, voice: this.#voice, volume: 0.25, rate: "-8%", pitch: "+0Hz" }),
|
|
});
|
|
if (!res.ok) {
|
|
ws.removeEventListener("message", handler);
|
|
this.#speakResolve = null;
|
|
reject(new Error(`TTS POST failed: ${res.status}`));
|
|
}
|
|
} catch (err) {
|
|
ws.removeEventListener("message", handler);
|
|
this.#speakResolve = null;
|
|
reject(err);
|
|
}
|
|
|
|
// Timeout: if no audio frame in 15s, reject
|
|
setTimeout(() => {
|
|
ws.removeEventListener("message", handler);
|
|
if (this.#speakResolve === resolve) {
|
|
this.#speakResolve = null;
|
|
reject(new Error("TTS audio timeout"));
|
|
}
|
|
}, 15000);
|
|
});
|
|
}
|
|
|
|
// ── Browser fallback ──
|
|
|
|
async #speakViaBrowser(text: string): Promise<void> {
|
|
if (!window.speechSynthesis) return;
|
|
|
|
return new Promise<void>((resolve) => {
|
|
this.#speakResolve = resolve;
|
|
const utterance = new SpeechSynthesisUtterance(text);
|
|
utterance.rate = 0.95;
|
|
utterance.pitch = 1.0;
|
|
utterance.volume = 0.25;
|
|
utterance.onend = () => {
|
|
this.#speakResolve = null;
|
|
resolve();
|
|
};
|
|
utterance.onerror = () => {
|
|
this.#speakResolve = null;
|
|
resolve();
|
|
};
|
|
window.speechSynthesis.speak(utterance);
|
|
});
|
|
}
|
|
}
|