diff --git a/lib/mi-voice-bridge.ts b/lib/mi-voice-bridge.ts index 7d296219..a45e6d05 100644 --- a/lib/mi-voice-bridge.ts +++ b/lib/mi-voice-bridge.ts @@ -30,7 +30,7 @@ export class MiVoiceBridge { constructor(opts: MiVoiceBridgeOptions = {}) { this.#bridgeUrl = opts.bridgeUrl ?? DEFAULT_BRIDGE; - this.#voice = opts.voice ?? "en-US-AriaNeural"; + this.#voice = opts.voice ?? "en-US-AvaMultilingualNeural"; this.#onStateChange = opts.onStateChange ?? null; } @@ -146,15 +146,25 @@ export class MiVoiceBridge { const buf = ev.data as ArrayBuffer; const view = new DataView(buf); - // Frame format: [4B header_len][JSON header][MP3 bytes] - const headerLen = view.getUint32(0, true); + // Frame format: [4B header_len big-endian][JSON header][MP3 bytes] + const headerLen = view.getUint32(0, false); + const headerJson = new TextDecoder().decode(buf.slice(4, 4 + headerLen)); + let gainValue = 1; + try { + const header = JSON.parse(headerJson); + if (typeof header.volume === "number") { + gainValue = Math.max(0, Math.min(1, header.volume)); + } + } catch { /* ignore bad header */ } const mp3Bytes = buf.slice(4 + headerLen); const ctx = this.#ensureAudioCtx(); const audioBuffer = await ctx.decodeAudioData(mp3Bytes.slice(0)); // slice to copy const source = ctx.createBufferSource(); source.buffer = audioBuffer; - source.connect(ctx.destination); + const gain = ctx.createGain(); + gain.gain.value = gainValue; + source.connect(gain).connect(ctx.destination); this.#currentSource = source; source.onended = () => { @@ -177,7 +187,7 @@ export class MiVoiceBridge { const res = await fetch(`${this.#bridgeUrl}${TTS_PATH}`, { method: "POST", headers: { "Content-Type": "application/json" }, - body: JSON.stringify({ text, voice: this.#voice, volume: 100 }), + body: JSON.stringify({ text, voice: this.#voice, volume: 0.55, rate: "-8%", pitch: "-2Hz" }), }); if (!res.ok) { ws.removeEventListener("message", handler); @@ -209,7 +219,9 @@ export class MiVoiceBridge { return new Promise((resolve) => { this.#speakResolve = resolve; const utterance = new SpeechSynthesisUtterance(text); - utterance.rate = 1.05; + utterance.rate = 0.95; + utterance.pitch = 0.9; + utterance.volume = 0.55; utterance.onend = () => { this.#speakResolve = null; resolve();