fix(mi-voice): big-endian header parse, apply gain, softer voice defaults

- Parse WS frame header length as big-endian (server uses struct.pack('>I')).
  Previous little-endian read always failed, silently forcing the browser
  Web Speech fallback.
- Apply header volume via GainNode so quieter actually plays quieter.
- Default voice en-US-AriaNeural -> en-US-AvaMultilingualNeural, with
  rate -8% / pitch -2Hz / volume 0.55 for a calmer, less grating output.
  Browser fallback gets matching rate/pitch/volume.
This commit is contained in:
Jeff Emmett 2026-04-16 15:18:40 -04:00
parent 9daeb60895
commit 1f084fa674
1 changed file with 18 additions and 6 deletions

View File

@@ -30,7 +30,7 @@ export class MiVoiceBridge {
constructor(opts: MiVoiceBridgeOptions = {}) { constructor(opts: MiVoiceBridgeOptions = {}) {
this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE; this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE;
this.#voice = opts.voice ?? "en-US-AriaNeural"; this.#voice = opts.voice ?? "en-US-AvaMultilingualNeural";
this.#onStateChange = opts.onStateChange ?? null; this.#onStateChange = opts.onStateChange ?? null;
} }
@@ -146,15 +146,25 @@ export class MiVoiceBridge {
const buf = ev.data as ArrayBuffer; const buf = ev.data as ArrayBuffer;
const view = new DataView(buf); const view = new DataView(buf);
// Frame format: [4B header_len][JSON header][MP3 bytes] // Frame format: [4B header_len big-endian][JSON header][MP3 bytes]
const headerLen = view.getUint32(0, true); const headerLen = view.getUint32(0, false);
const headerJson = new TextDecoder().decode(buf.slice(4, 4 + headerLen));
let gainValue = 1;
try {
const header = JSON.parse(headerJson);
if (typeof header.volume === "number") {
gainValue = Math.max(0, Math.min(1, header.volume));
}
} catch { /* ignore bad header */ }
const mp3Bytes = buf.slice(4 + headerLen); const mp3Bytes = buf.slice(4 + headerLen);
const ctx = this.#ensureAudioCtx(); const ctx = this.#ensureAudioCtx();
const audioBuffer = await ctx.decodeAudioData(mp3Bytes.slice(0)); // slice to copy const audioBuffer = await ctx.decodeAudioData(mp3Bytes.slice(0)); // slice to copy
const source = ctx.createBufferSource(); const source = ctx.createBufferSource();
source.buffer = audioBuffer; source.buffer = audioBuffer;
source.connect(ctx.destination); const gain = ctx.createGain();
gain.gain.value = gainValue;
source.connect(gain).connect(ctx.destination);
this.#currentSource = source; this.#currentSource = source;
source.onended = () => { source.onended = () => {
@@ -177,7 +187,7 @@ export class MiVoiceBridge {
const res = await fetch(`${this.#bridgeUrl}${TTS_PATH}`, { const res = await fetch(`${this.#bridgeUrl}${TTS_PATH}`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
body: JSON.stringify({ text, voice: this.#voice, volume: 100 }), body: JSON.stringify({ text, voice: this.#voice, volume: 0.55, rate: "-8%", pitch: "-2Hz" }),
}); });
if (!res.ok) { if (!res.ok) {
ws.removeEventListener("message", handler); ws.removeEventListener("message", handler);
@@ -209,7 +219,9 @@ export class MiVoiceBridge {
return new Promise<void>((resolve) => { return new Promise<void>((resolve) => {
this.#speakResolve = resolve; this.#speakResolve = resolve;
const utterance = new SpeechSynthesisUtterance(text); const utterance = new SpeechSynthesisUtterance(text);
utterance.rate = 1.05; utterance.rate = 0.95;
utterance.pitch = 0.9;
utterance.volume = 0.55;
utterance.onend = () => { utterance.onend = () => {
this.#speakResolve = null; this.#speakResolve = null;
resolve(); resolve();