feat: add voice dictation to MI bar, markdown, chat & prompt inputs

Extract Web Speech API logic from folk-transcription into a reusable
SpeechDictation utility, then wire mic buttons into all 4 text input
surfaces. Dictation fills inputs in real-time without auto-submitting.
Hidden gracefully in unsupported browsers.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-03-03 17:10:52 -08:00
parent 997ccc0bef
commit da2b21cd98
6 changed files with 441 additions and 172 deletions

View File

@ -1,5 +1,6 @@
import { FolkShape } from "./folk-shape";
import { css, html } from "./tags";
import { SpeechDictation } from "./speech-dictation";
const styles = css`
:host {
@ -123,6 +124,30 @@ const styles = css`
background: #ea580c;
}
.mic-btn {
background: transparent;
border: 1px solid #e2e8f0;
border-radius: 6px;
padding: 8px 10px;
cursor: pointer;
font-size: 14px;
transition: all 0.2s;
}
.mic-btn:hover {
border-color: #f97316;
}
.mic-btn.recording {
border-color: #ef4444;
animation: micPulse 1.5s infinite;
}
@keyframes micPulse {
0%, 100% { transform: scale(1); }
50% { transform: scale(1.15); }
}
.username-prompt {
padding: 12px;
text-align: center;
@ -228,6 +253,7 @@ export class FolkChat extends FolkShape {
<div class="messages"></div>
<div class="input-container">
<input type="text" class="message-input" placeholder="Type a message..." />
${SpeechDictation.isSupported() ? '<button class="mic-btn" title="Voice dictation">🎤</button>' : ''}
<button class="send-btn">Send</button>
</div>
</div>
@ -295,6 +321,41 @@ export class FolkChat extends FolkShape {
if (e.key === "Enter") sendMessage();
});
// Voice dictation
const micBtn = wrapper.querySelector(".mic-btn") as HTMLButtonElement | null;
if (micBtn) {
let baseText = "";
let interimText = "";
const dictation = new SpeechDictation({
onInterim: (text) => {
interimText = text;
messageInput.value = baseText + (baseText ? " " : "") + text;
},
onFinal: (text) => {
interimText = "";
baseText += (baseText ? " " : "") + text;
messageInput.value = baseText;
},
onStateChange: (recording) => {
micBtn.classList.toggle("recording", recording);
if (!recording) {
baseText = messageInput.value;
interimText = "";
}
},
onError: (err) => console.warn("Chat dictation:", err),
});
micBtn.addEventListener("click", (e) => {
e.stopPropagation();
if (!dictation.isRecording) {
baseText = messageInput.value;
}
dictation.toggle();
messageInput.focus();
});
}
// Close button
closeBtn.addEventListener("click", (e) => {
e.stopPropagation();

View File

@ -1,5 +1,6 @@
import { FolkShape } from "./folk-shape";
import { css, html } from "./tags";
import { SpeechDictation } from "./speech-dictation";
const styles = css`
:host {
@ -48,6 +49,15 @@ const styles = css`
background: rgba(255, 255, 255, 0.2);
}
.header-actions button.mic-recording {
animation: micPulse 1.5s infinite;
}
@keyframes micPulse {
0%, 100% { transform: scale(1); }
50% { transform: scale(1.2); }
}
.content {
padding: 12px;
height: calc(100% - 36px);
@ -164,6 +174,7 @@ export class FolkMarkdown extends FolkShape {
</span>
<div class="header-actions">
<button class="edit-btn" title="Toggle Edit"></button>
${SpeechDictation.isSupported() ? '<button class="mic-btn" title="Voice dictation">🎤</button>' : ''}
<button class="close-btn" title="Close">×</button>
</div>
</div>
@ -237,6 +248,36 @@ export class FolkMarkdown extends FolkShape {
this.dispatchEvent(new CustomEvent("close"));
});
// Voice dictation
const micBtn = wrapper.querySelector(".mic-btn") as HTMLButtonElement | null;
if (micBtn) {
const dictation = new SpeechDictation({
onInterim: (text) => {
// Show interim in editor (will be replaced by final)
},
onFinal: (text) => {
enterMarkdownEdit();
const pos = editor.selectionStart;
const before = editor.value.slice(0, pos);
const after = editor.value.slice(pos);
const sep = before && !before.endsWith(" ") && !before.endsWith("\n") ? " " : "";
editor.value = before + sep + text + after;
this.#content = editor.value;
editor.selectionStart = editor.selectionEnd = pos + sep.length + text.length;
},
onStateChange: (recording) => {
micBtn.classList.toggle("mic-recording", recording);
if (recording) enterMarkdownEdit();
},
onError: (err) => console.warn("Markdown dictation:", err),
});
micBtn.addEventListener("click", (e) => {
e.stopPropagation();
dictation.toggle();
});
}
// Editor input
editor.addEventListener("input", () => {
this.#content = editor.value;

View File

@ -1,5 +1,6 @@
import { FolkShape } from "./folk-shape";
import { css, html } from "./tags";
import { SpeechDictation } from "./speech-dictation";
const styles = css`
:host {
@ -175,6 +176,30 @@ const styles = css`
cursor: not-allowed;
}
.mic-btn {
padding: 10px 12px;
background: transparent;
border: 2px solid #e2e8f0;
border-radius: 8px;
font-size: 14px;
cursor: pointer;
transition: all 0.2s;
}
.mic-btn:hover {
border-color: #6366f1;
}
.mic-btn.recording {
border-color: #ef4444;
animation: micPulse 1.5s infinite;
}
@keyframes micPulse {
0%, 100% { transform: scale(1); }
50% { transform: scale(1.15); }
}
.error {
color: #ef4444;
padding: 12px;
@ -293,6 +318,7 @@ export class FolkPrompt extends FolkShape {
</select>
<div class="prompt-row">
<textarea class="prompt-input" placeholder="Type your message..." rows="2"></textarea>
${SpeechDictation.isSupported() ? '<button class="mic-btn" title="Voice dictation">🎤</button>' : ''}
<button class="send-btn"></button>
</div>
</div>
@ -341,6 +367,45 @@ export class FolkPrompt extends FolkShape {
// Prevent drag on inputs
this.#promptInput?.addEventListener("pointerdown", (e) => e.stopPropagation());
// Voice dictation
const micBtn = wrapper.querySelector(".mic-btn") as HTMLButtonElement | null;
if (micBtn) {
let baseText = "";
let interimText = "";
const dictation = new SpeechDictation({
onInterim: (text) => {
interimText = text;
if (this.#promptInput) {
this.#promptInput.value = baseText + (baseText ? " " : "") + text;
}
},
onFinal: (text) => {
interimText = "";
baseText += (baseText ? " " : "") + text;
if (this.#promptInput) {
this.#promptInput.value = baseText;
}
},
onStateChange: (recording) => {
micBtn.classList.toggle("recording", recording);
if (!recording) {
baseText = this.#promptInput?.value || "";
interimText = "";
}
},
onError: (err) => console.warn("Prompt dictation:", err),
});
micBtn.addEventListener("click", (e) => {
e.stopPropagation();
if (!dictation.isRecording) {
baseText = this.#promptInput?.value || "";
}
dictation.toggle();
this.#promptInput?.focus();
});
}
// Close button
closeBtn.addEventListener("click", (e) => {
e.stopPropagation();

View File

@ -1,56 +1,6 @@
import { FolkShape } from "./folk-shape";
import { css, html } from "./tags";
// Web Speech API types (not all browsers have these in their types)
interface SpeechRecognitionResult {
readonly length: number;
item(index: number): SpeechRecognitionAlternative;
[index: number]: SpeechRecognitionAlternative;
readonly isFinal: boolean;
}
interface SpeechRecognitionAlternative {
readonly transcript: string;
readonly confidence: number;
}
interface SpeechRecognitionResultList {
readonly length: number;
item(index: number): SpeechRecognitionResult;
[index: number]: SpeechRecognitionResult;
}
interface SpeechRecognitionEvent extends Event {
readonly resultIndex: number;
readonly results: SpeechRecognitionResultList;
}
interface SpeechRecognitionErrorEvent extends Event {
readonly error: string;
readonly message: string;
}
interface SpeechRecognition extends EventTarget {
continuous: boolean;
interimResults: boolean;
lang: string;
onresult: ((event: SpeechRecognitionEvent) => void) | null;
onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
onend: (() => void) | null;
start(): void;
stop(): void;
}
interface SpeechRecognitionConstructor {
new (): SpeechRecognition;
}
declare global {
interface Window {
SpeechRecognition?: SpeechRecognitionConstructor;
webkitSpeechRecognition?: SpeechRecognitionConstructor;
}
}
import { SpeechDictation } from "./speech-dictation";
const styles = css`
:host {
@ -292,7 +242,7 @@ export class FolkTranscription extends FolkShape {
#isRecording = false;
#duration = 0;
#durationInterval: ReturnType<typeof setInterval> | null = null;
#recognition: SpeechRecognition | null = null;
#dictation: SpeechDictation | null = null;
#error: string | null = null;
#recordBtn: HTMLElement | null = null;
@ -389,144 +339,95 @@ export class FolkTranscription extends FolkShape {
this.dispatchEvent(new CustomEvent("close"));
});
// Initialize speech recognition
this.#initSpeechRecognition();
// Initialize speech dictation
this.#initDictation();
return root;
}
#initSpeechRecognition() {
const SpeechRecognitionImpl = window.SpeechRecognition || window.webkitSpeechRecognition;
if (!SpeechRecognitionImpl) {
#initDictation() {
if (!SpeechDictation.isSupported()) {
this.#error = "Speech recognition not supported in this browser";
this.#renderError();
return;
}
this.#recognition = new SpeechRecognitionImpl();
this.#recognition.continuous = true;
this.#recognition.interimResults = true;
this.#recognition.lang = "en-US";
this.#recognition.onresult = (event) => {
for (let i = event.resultIndex; i < event.results.length; i++) {
const result = event.results[i];
const text = result[0].transcript;
if (result.isFinal) {
// Find and update interim segment or add new
const interimIdx = this.#segments.findIndex((s) => !s.isFinal);
if (interimIdx >= 0) {
this.#segments[interimIdx] = {
...this.#segments[interimIdx],
text,
isFinal: true,
};
} else {
this.#segments.push({
id: crypto.randomUUID(),
text,
timestamp: this.#duration,
isFinal: true,
});
}
this.#dictation = new SpeechDictation({
onInterim: (text) => {
const interimIdx = this.#segments.findIndex((s) => !s.isFinal);
if (interimIdx >= 0) {
this.#segments[interimIdx].text = text;
} else {
// Update or add interim
const interimIdx = this.#segments.findIndex((s) => !s.isFinal);
if (interimIdx >= 0) {
this.#segments[interimIdx].text = text;
} else {
this.#segments.push({
id: crypto.randomUUID(),
text,
timestamp: this.#duration,
isFinal: false,
});
}
this.#segments.push({
id: crypto.randomUUID(),
text,
timestamp: this.#duration,
isFinal: false,
});
}
}
this.#renderTranscript();
};
this.#recognition.onerror = (event) => {
console.error("Speech recognition error:", event.error);
if (event.error !== "no-speech") {
this.#error = `Recognition error: ${event.error}`;
this.#renderTranscript();
},
onFinal: (text) => {
const interimIdx = this.#segments.findIndex((s) => !s.isFinal);
if (interimIdx >= 0) {
this.#segments[interimIdx] = {
...this.#segments[interimIdx],
text,
isFinal: true,
};
} else {
this.#segments.push({
id: crypto.randomUUID(),
text,
timestamp: this.#duration,
isFinal: true,
});
}
this.#renderTranscript();
},
onError: (err) => {
console.error("Speech recognition error:", err);
this.#error = err;
this.#renderError();
}
};
this.#recognition.onend = () => {
// Restart if still supposed to be recording
if (this.#isRecording && this.#recognition) {
this.#recognition.start();
}
};
},
onStateChange: (recording) => {
this.#isRecording = recording;
if (recording) {
this.#error = null;
this.#recordBtn?.classList.add("recording");
if (this.#statusEl) {
this.#statusEl.textContent = "Recording...";
this.#statusEl.classList.add("recording");
}
this.#durationInterval = setInterval(() => {
this.#duration++;
this.#updateDuration();
}, 1000);
this.dispatchEvent(new CustomEvent("recording-start"));
} else {
this.#recordBtn?.classList.remove("recording");
if (this.#statusEl) {
this.#statusEl.textContent = "Stopped";
this.#statusEl.classList.remove("recording");
}
if (this.#durationInterval) {
clearInterval(this.#durationInterval);
this.#durationInterval = null;
}
this.#segments = this.#segments.filter((s) => s.isFinal);
this.#renderTranscript();
this.dispatchEvent(new CustomEvent("recording-stop", { detail: { transcript: this.transcript } }));
}
},
});
}
#toggleRecording() {
if (this.#isRecording) {
this.#stopRecording();
} else {
this.#startRecording();
}
}
#startRecording() {
if (!this.#recognition) {
this.#error = "Speech recognition not available";
this.#renderError();
return;
}
try {
this.#recognition.start();
this.#isRecording = true;
this.#error = null;
this.#recordBtn?.classList.add("recording");
if (this.#statusEl) {
this.#statusEl.textContent = "Recording...";
this.#statusEl.classList.add("recording");
}
// Start duration timer
this.#durationInterval = setInterval(() => {
this.#duration++;
this.#updateDuration();
}, 1000);
this.dispatchEvent(new CustomEvent("recording-start"));
} catch (error) {
this.#error = "Failed to start recording";
this.#renderError();
}
this.#dictation?.toggle();
}
#stopRecording() {
if (!this.#isRecording) return;
this.#recognition?.stop();
this.#isRecording = false;
this.#recordBtn?.classList.remove("recording");
if (this.#statusEl) {
this.#statusEl.textContent = "Stopped";
this.#statusEl.classList.remove("recording");
}
// Stop duration timer
if (this.#durationInterval) {
clearInterval(this.#durationInterval);
this.#durationInterval = null;
}
// Remove any interim segments
this.#segments = this.#segments.filter((s) => s.isFinal);
this.#renderTranscript();
this.dispatchEvent(new CustomEvent("recording-stop", { detail: { transcript: this.transcript } }));
this.#dictation?.stop();
}
#updateDuration() {

148
lib/speech-dictation.ts Normal file
View File

@ -0,0 +1,148 @@
// Web Speech API types (not all browsers have these in their types)
export interface SpeechRecognitionResult {
  readonly length: number;
  item(index: number): SpeechRecognitionAlternative;
  [index: number]: SpeechRecognitionAlternative;
  readonly isFinal: boolean;
}
export interface SpeechRecognitionAlternative {
  readonly transcript: string;
  readonly confidence: number;
}
export interface SpeechRecognitionResultList {
  readonly length: number;
  item(index: number): SpeechRecognitionResult;
  [index: number]: SpeechRecognitionResult;
}
export interface SpeechRecognitionEvent extends Event {
  readonly resultIndex: number;
  readonly results: SpeechRecognitionResultList;
}
export interface SpeechRecognitionErrorEvent extends Event {
  readonly error: string;
  readonly message: string;
}
export interface SpeechRecognitionInstance extends EventTarget {
  continuous: boolean;
  interimResults: boolean;
  lang: string;
  onresult: ((event: SpeechRecognitionEvent) => void) | null;
  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
  onend: (() => void) | null;
  start(): void;
  stop(): void;
}
interface SpeechRecognitionConstructor {
  new (): SpeechRecognitionInstance;
}
declare global {
  interface Window {
    SpeechRecognition?: SpeechRecognitionConstructor;
    webkitSpeechRecognition?: SpeechRecognitionConstructor;
  }
}
/** Callbacks and settings for a {@link SpeechDictation} session. */
export interface SpeechDictationOptions {
  /** Fired with in-progress (not yet final) transcript text. */
  onInterim?: (text: string) => void;
  /** Fired once per finalized transcript segment. */
  onFinal?: (text: string) => void;
  /** Fired with a human-readable error ("no-speech" errors are suppressed). */
  onError?: (error: string) => void;
  /** Fired whenever recording starts (true) or stops (false). */
  onStateChange?: (recording: boolean) => void;
  /** BCP-47 language tag for recognition; defaults to "en-US". */
  lang?: string;
}
/**
 * Thin wrapper around the Web Speech API that exposes continuous speech
 * recognition as simple interim/final text callbacks. Handles vendor
 * prefixing (webkitSpeechRecognition), auto-restarts after the browser's
 * silence timeout, and degrades to a reported error (rather than throwing)
 * when recognition is unavailable.
 */
export class SpeechDictation {
  #recognition: SpeechRecognitionInstance | null = null;
  #recording = false;
  #opts: SpeechDictationOptions;

  constructor(opts: SpeechDictationOptions) {
    this.#opts = opts;
    this.#init();
  }

  /**
   * True when the current environment exposes a SpeechRecognition
   * implementation. Safe to call under SSR / non-browser environments
   * (returns false instead of throwing on a missing `window`).
   */
  static isSupported(): boolean {
    return (
      typeof window !== "undefined" &&
      !!(window.SpeechRecognition || window.webkitSpeechRecognition)
    );
  }

  /** Whether a dictation session is currently active. */
  get isRecording(): boolean {
    return this.#recording;
  }

  /**
   * Begin listening. No-op if already recording. Reports via onError
   * (instead of silently returning) when recognition is unavailable,
   * and includes the underlying failure message when start() throws.
   */
  start(): void {
    if (this.#recording) return;
    if (!this.#recognition) {
      this.#opts.onError?.("Speech recognition not available");
      return;
    }
    try {
      this.#recognition.start();
      this.#recording = true;
      this.#opts.onStateChange?.(true);
    } catch (error) {
      // Surface the underlying reason (e.g. InvalidStateError) rather
      // than swallowing it.
      const detail = error instanceof Error ? `: ${error.message}` : "";
      this.#opts.onError?.(`Failed to start recording${detail}`);
    }
  }

  /** Stop listening. No-op if not recording. */
  stop(): void {
    if (!this.#recording) return;
    // Clear the flag first so the onend handler does not auto-restart.
    this.#recording = false;
    this.#recognition?.stop();
    this.#opts.onStateChange?.(false);
  }

  /** Start if stopped, stop if started. */
  toggle(): void {
    if (this.#recording) {
      this.stop();
    } else {
      this.start();
    }
  }

  /** Stop recording and release the underlying recognition instance. */
  destroy(): void {
    this.stop();
    if (this.#recognition) {
      this.#recognition.onresult = null;
      this.#recognition.onerror = null;
      this.#recognition.onend = null;
      this.#recognition = null;
    }
  }

  /** Create and configure the recognition instance, if the API exists. */
  #init(): void {
    // Guard `window` so construction is safe in non-browser environments.
    if (typeof window === "undefined") return;
    const Impl = window.SpeechRecognition || window.webkitSpeechRecognition;
    if (!Impl) return;
    this.#recognition = new Impl();
    this.#recognition.continuous = true;
    this.#recognition.interimResults = true;
    this.#recognition.lang = this.#opts.lang ?? "en-US";
    this.#recognition.onresult = (event) => {
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i];
        const text = result[0].transcript;
        if (result.isFinal) {
          this.#opts.onFinal?.(text);
        } else {
          this.#opts.onInterim?.(text);
        }
      }
    };
    this.#recognition.onerror = (event) => {
      // "no-speech" fires routinely during pauses; not worth reporting.
      if (event.error !== "no-speech") {
        this.#opts.onError?.(`Recognition error: ${event.error}`);
      }
    };
    this.#recognition.onend = () => {
      // Browsers end recognition after a silence timeout; restart while
      // the user still intends to record.
      if (this.#recording && this.#recognition) {
        try {
          this.#recognition.start();
        } catch {
          // Cannot restart (e.g. already started / permission revoked):
          // fall back to a clean stopped state instead of throwing.
          this.#recording = false;
          this.#opts.onStateChange?.(false);
        }
      }
    };
  }
}

View File

@ -10,6 +10,7 @@ import { getAccessToken } from "./rstack-identity";
import { parseMiActions, summariseActions } from "../../lib/mi-actions";
import { MiActionExecutor } from "../../lib/mi-action-executor";
import { suggestTools, type ToolHint } from "../../lib/mi-tool-schema";
import { SpeechDictation } from "../../lib/speech-dictation";
interface MiMessage {
role: "user" | "assistant";
@ -22,6 +23,8 @@ export class RStackMi extends HTMLElement {
#shadow: ShadowRoot;
#messages: MiMessage[] = [];
#abortController: AbortController | null = null;
#dictation: SpeechDictation | null = null;
#interimText = "";
constructor() {
super();
@ -40,6 +43,7 @@ export class RStackMi extends HTMLElement {
<span class="mi-icon">&#10023;</span>
<input class="mi-input" id="mi-input" type="text"
placeholder="Ask mi anything — setup, navigation, what's possible..." autocomplete="off" />
${SpeechDictation.isSupported() ? '<button class="mi-mic-btn" id="mi-mic" title="Voice dictation">🎤</button>' : ''}
</div>
<div class="mi-panel" id="mi-panel">
<div class="mi-messages" id="mi-messages">
@ -84,6 +88,40 @@ export class RStackMi extends HTMLElement {
// Stop clicks inside the panel from closing it
panel.addEventListener("click", (e) => e.stopPropagation());
bar.addEventListener("click", (e) => e.stopPropagation());
// Voice dictation
const micBtn = this.#shadow.getElementById("mi-mic") as HTMLButtonElement | null;
if (micBtn) {
let baseText = "";
this.#dictation = new SpeechDictation({
onInterim: (text) => {
this.#interimText = text;
input.value = baseText + (baseText ? " " : "") + text;
},
onFinal: (text) => {
this.#interimText = "";
baseText += (baseText ? " " : "") + text;
input.value = baseText;
},
onStateChange: (recording) => {
micBtn.classList.toggle("recording", recording);
if (!recording) {
baseText = input.value;
this.#interimText = "";
}
},
onError: (err) => console.warn("MI dictation:", err),
});
micBtn.addEventListener("click", (e) => {
e.stopPropagation();
if (!this.#dictation!.isRecording) {
baseText = input.value;
}
this.#dictation!.toggle();
input.focus();
});
}
}
/** Gather page context: open shapes, active module, tabs, canvas state. */
@ -328,6 +366,21 @@ const STYLES = `
}
.mi-input::placeholder { color: var(--rs-text-muted); }
.mi-mic-btn {
background: none; border: none; cursor: pointer; padding: 2px 4px;
font-size: 0.85rem; border-radius: 6px; transition: all 0.2s;
flex-shrink: 0; line-height: 1;
}
.mi-mic-btn:hover { background: var(--rs-bg-hover); }
.mi-mic-btn.recording {
animation: micPulse 1.5s infinite;
filter: saturate(2) brightness(1.1);
}
@keyframes micPulse {
0%, 100% { transform: scale(1); }
50% { transform: scale(1.15); }
}
.mi-panel {
position: absolute; top: calc(100% + 8px); left: 0; right: 0;
min-width: 360px; max-height: 420px;