fix(mi-voice): big-endian header parse, apply gain, softer voice defaults

- Parse WS frame header length as big-endian (server uses struct.pack('>I')).
  Previous little-endian read always failed, silently forcing the browser
  Web Speech fallback.
- Apply header volume via GainNode so quieter actually plays quieter.
- Default voice en-US-AriaNeural -> en-US-AvaMultilingualNeural, with
  rate -8% / pitch -2Hz / volume 0.55 for a calmer, less grating output.
  Browser fallback gets matching rate/pitch/volume.
This commit is contained in:
Jeff Emmett 2026-04-16 15:18:40 -04:00
parent 9daeb60895
commit 1f084fa674
1 changed file with 18 additions and 6 deletions

View File

@@ -30,7 +30,7 @@ export class MiVoiceBridge {
constructor(opts: MiVoiceBridgeOptions = {}) { constructor(opts: MiVoiceBridgeOptions = {}) {
this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE; this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE;
this.#voice = opts.voice ?? "en-US-AriaNeural"; this.#voice = opts.voice ?? "en-US-AvaMultilingualNeural";
this.#onStateChange = opts.onStateChange ?? null; this.#onStateChange = opts.onStateChange ?? null;
} }
@@ -146,15 +146,25 @@ export class MiVoiceBridge {
const buf = ev.data as ArrayBuffer; const buf = ev.data as ArrayBuffer;
const view = new DataView(buf); const view = new DataView(buf);
// Frame format: [4B header_len][JSON header][MP3 bytes] // Frame format: [4B header_len big-endian][JSON header][MP3 bytes]
const headerLen = view.getUint32(0, true); const headerLen = view.getUint32(0, false);
const headerJson = new TextDecoder().decode(buf.slice(4, 4 + headerLen));
let gainValue = 1;
try {
const header = JSON.parse(headerJson);
if (typeof header.volume === "number") {
gainValue = Math.max(0, Math.min(1, header.volume));
}
} catch { /* ignore bad header */ }
const mp3Bytes = buf.slice(4 + headerLen); const mp3Bytes = buf.slice(4 + headerLen);
const ctx = this.#ensureAudioCtx(); const ctx = this.#ensureAudioCtx();
const audioBuffer = await ctx.decodeAudioData(mp3Bytes.slice(0)); // slice to copy const audioBuffer = await ctx.decodeAudioData(mp3Bytes.slice(0)); // slice to copy
const source = ctx.createBufferSource(); const source = ctx.createBufferSource();
source.buffer = audioBuffer; source.buffer = audioBuffer;
source.connect(ctx.destination); const gain = ctx.createGain();
gain.gain.value = gainValue;
source.connect(gain).connect(ctx.destination);
this.#currentSource = source; this.#currentSource = source;
source.onended = () => { source.onended = () => {
@@ -177,7 +187,7 @@ export class MiVoiceBridge {
const res = await fetch(`${this.#bridgeUrl}${TTS_PATH}`, { const res = await fetch(`${this.#bridgeUrl}${TTS_PATH}`, {
method: "POST", method: "POST",
headers: { "Content-Type": "application/json" }, headers: { "Content-Type": "application/json" },
body: JSON.stringify({ text, voice: this.#voice, volume: 100 }), body: JSON.stringify({ text, voice: this.#voice, volume: 0.55, rate: "-8%", pitch: "-2Hz" }),
}); });
if (!res.ok) { if (!res.ok) {
ws.removeEventListener("message", handler); ws.removeEventListener("message", handler);
@@ -209,7 +219,9 @@ export class MiVoiceBridge {
return new Promise<void>((resolve) => { return new Promise<void>((resolve) => {
this.#speakResolve = resolve; this.#speakResolve = resolve;
const utterance = new SpeechSynthesisUtterance(text); const utterance = new SpeechSynthesisUtterance(text);
utterance.rate = 1.05; utterance.rate = 0.95;
utterance.pitch = 0.9;
utterance.volume = 0.55;
utterance.onend = () => { utterance.onend = () => {
this.#speakResolve = null; this.#speakResolve = null;
resolve(); resolve();