From da2b21cd98776bd820f53ed32ae96100eb9f18f4 Mon Sep 17 00:00:00 2001 From: Jeff Emmett Date: Tue, 3 Mar 2026 17:10:52 -0800 Subject: [PATCH] feat: add voice dictation to MI bar, markdown, chat & prompt inputs Extract Web Speech API logic from folk-transcription into a reusable SpeechDictation utility, then wire mic buttons into all 4 text input surfaces. Dictation fills inputs in real-time without auto-submitting. Hidden gracefully in unsupported browsers. Co-Authored-By: Claude Opus 4.6 --- lib/folk-chat.ts | 61 ++++++++ lib/folk-markdown.ts | 41 ++++++ lib/folk-prompt.ts | 65 +++++++++ lib/folk-transcription.ts | 245 ++++++++++----------------------- lib/speech-dictation.ts | 148 ++++++++++++++++++++ shared/components/rstack-mi.ts | 53 +++++++ 6 files changed, 441 insertions(+), 172 deletions(-) create mode 100644 lib/speech-dictation.ts diff --git a/lib/folk-chat.ts b/lib/folk-chat.ts index 7aff580..1481624 100644 --- a/lib/folk-chat.ts +++ b/lib/folk-chat.ts @@ -1,5 +1,6 @@ import { FolkShape } from "./folk-shape"; import { css, html } from "./tags"; +import { SpeechDictation } from "./speech-dictation"; const styles = css` :host { @@ -123,6 +124,30 @@ const styles = css` background: #ea580c; } + .mic-btn { + background: transparent; + border: 1px solid #e2e8f0; + border-radius: 6px; + padding: 8px 10px; + cursor: pointer; + font-size: 14px; + transition: all 0.2s; + } + + .mic-btn:hover { + border-color: #f97316; + } + + .mic-btn.recording { + border-color: #ef4444; + animation: micPulse 1.5s infinite; + } + + @keyframes micPulse { + 0%, 100% { transform: scale(1); } + 50% { transform: scale(1.15); } + } + .username-prompt { padding: 12px; text-align: center; @@ -228,6 +253,7 @@ export class FolkChat extends FolkShape {
+ ${SpeechDictation.isSupported() ? '' : ''}
@@ -295,6 +321,41 @@ export class FolkChat extends FolkShape { if (e.key === "Enter") sendMessage(); }); + // Voice dictation + const micBtn = wrapper.querySelector(".mic-btn") as HTMLButtonElement | null; + if (micBtn) { + let baseText = ""; + let interimText = ""; + const dictation = new SpeechDictation({ + onInterim: (text) => { + interimText = text; + messageInput.value = baseText + (baseText ? " " : "") + text; + }, + onFinal: (text) => { + interimText = ""; + baseText += (baseText ? " " : "") + text; + messageInput.value = baseText; + }, + onStateChange: (recording) => { + micBtn.classList.toggle("recording", recording); + if (!recording) { + baseText = messageInput.value; + interimText = ""; + } + }, + onError: (err) => console.warn("Chat dictation:", err), + }); + + micBtn.addEventListener("click", (e) => { + e.stopPropagation(); + if (!dictation.isRecording) { + baseText = messageInput.value; + } + dictation.toggle(); + messageInput.focus(); + }); + } + // Close button closeBtn.addEventListener("click", (e) => { e.stopPropagation(); diff --git a/lib/folk-markdown.ts b/lib/folk-markdown.ts index 9fb3597..fe963f8 100644 --- a/lib/folk-markdown.ts +++ b/lib/folk-markdown.ts @@ -1,5 +1,6 @@ import { FolkShape } from "./folk-shape"; import { css, html } from "./tags"; +import { SpeechDictation } from "./speech-dictation"; const styles = css` :host { @@ -48,6 +49,15 @@ const styles = css` background: rgba(255, 255, 255, 0.2); } + .header-actions button.mic-recording { + animation: micPulse 1.5s infinite; + } + + @keyframes micPulse { + 0%, 100% { transform: scale(1); } + 50% { transform: scale(1.2); } + } + .content { padding: 12px; height: calc(100% - 36px); @@ -164,6 +174,7 @@ export class FolkMarkdown extends FolkShape {
+ ${SpeechDictation.isSupported() ? '' : ''}
@@ -237,6 +248,36 @@ export class FolkMarkdown extends FolkShape { this.dispatchEvent(new CustomEvent("close")); }); + // Voice dictation + const micBtn = wrapper.querySelector(".mic-btn") as HTMLButtonElement | null; + if (micBtn) { + const dictation = new SpeechDictation({ + onInterim: (text) => { + // Show interim in editor (will be replaced by final) + }, + onFinal: (text) => { + enterMarkdownEdit(); + const pos = editor.selectionStart; + const before = editor.value.slice(0, pos); + const after = editor.value.slice(pos); + const sep = before && !before.endsWith(" ") && !before.endsWith("\n") ? " " : ""; + editor.value = before + sep + text + after; + this.#content = editor.value; + editor.selectionStart = editor.selectionEnd = pos + sep.length + text.length; + }, + onStateChange: (recording) => { + micBtn.classList.toggle("mic-recording", recording); + if (recording) enterMarkdownEdit(); + }, + onError: (err) => console.warn("Markdown dictation:", err), + }); + + micBtn.addEventListener("click", (e) => { + e.stopPropagation(); + dictation.toggle(); + }); + } + // Editor input editor.addEventListener("input", () => { this.#content = editor.value; diff --git a/lib/folk-prompt.ts b/lib/folk-prompt.ts index f641642..73f2151 100644 --- a/lib/folk-prompt.ts +++ b/lib/folk-prompt.ts @@ -1,5 +1,6 @@ import { FolkShape } from "./folk-shape"; import { css, html } from "./tags"; +import { SpeechDictation } from "./speech-dictation"; const styles = css` :host { @@ -175,6 +176,30 @@ const styles = css` cursor: not-allowed; } + .mic-btn { + padding: 10px 12px; + background: transparent; + border: 2px solid #e2e8f0; + border-radius: 8px; + font-size: 14px; + cursor: pointer; + transition: all 0.2s; + } + + .mic-btn:hover { + border-color: #6366f1; + } + + .mic-btn.recording { + border-color: #ef4444; + animation: micPulse 1.5s infinite; + } + + @keyframes micPulse { + 0%, 100% { transform: scale(1); } + 50% { transform: scale(1.15); } + } + .error { color: #ef4444; padding: 
12px; @@ -293,6 +318,7 @@ export class FolkPrompt extends FolkShape {
+ ${SpeechDictation.isSupported() ? '' : ''}
@@ -341,6 +367,45 @@ export class FolkPrompt extends FolkShape { // Prevent drag on inputs this.#promptInput?.addEventListener("pointerdown", (e) => e.stopPropagation()); + // Voice dictation + const micBtn = wrapper.querySelector(".mic-btn") as HTMLButtonElement | null; + if (micBtn) { + let baseText = ""; + let interimText = ""; + const dictation = new SpeechDictation({ + onInterim: (text) => { + interimText = text; + if (this.#promptInput) { + this.#promptInput.value = baseText + (baseText ? " " : "") + text; + } + }, + onFinal: (text) => { + interimText = ""; + baseText += (baseText ? " " : "") + text; + if (this.#promptInput) { + this.#promptInput.value = baseText; + } + }, + onStateChange: (recording) => { + micBtn.classList.toggle("recording", recording); + if (!recording) { + baseText = this.#promptInput?.value || ""; + interimText = ""; + } + }, + onError: (err) => console.warn("Prompt dictation:", err), + }); + + micBtn.addEventListener("click", (e) => { + e.stopPropagation(); + if (!dictation.isRecording) { + baseText = this.#promptInput?.value || ""; + } + dictation.toggle(); + this.#promptInput?.focus(); + }); + } + // Close button closeBtn.addEventListener("click", (e) => { e.stopPropagation(); diff --git a/lib/folk-transcription.ts b/lib/folk-transcription.ts index ef404bc..b47dbec 100644 --- a/lib/folk-transcription.ts +++ b/lib/folk-transcription.ts @@ -1,56 +1,6 @@ import { FolkShape } from "./folk-shape"; import { css, html } from "./tags"; - -// Web Speech API types (not all browsers have these in their types) -interface SpeechRecognitionResult { - readonly length: number; - item(index: number): SpeechRecognitionAlternative; - [index: number]: SpeechRecognitionAlternative; - readonly isFinal: boolean; -} - -interface SpeechRecognitionAlternative { - readonly transcript: string; - readonly confidence: number; -} - -interface SpeechRecognitionResultList { - readonly length: number; - item(index: number): SpeechRecognitionResult; - [index: number]: 
SpeechRecognitionResult; -} - -interface SpeechRecognitionEvent extends Event { - readonly resultIndex: number; - readonly results: SpeechRecognitionResultList; -} - -interface SpeechRecognitionErrorEvent extends Event { - readonly error: string; - readonly message: string; -} - -interface SpeechRecognition extends EventTarget { - continuous: boolean; - interimResults: boolean; - lang: string; - onresult: ((event: SpeechRecognitionEvent) => void) | null; - onerror: ((event: SpeechRecognitionErrorEvent) => void) | null; - onend: (() => void) | null; - start(): void; - stop(): void; -} - -interface SpeechRecognitionConstructor { - new (): SpeechRecognition; -} - -declare global { - interface Window { - SpeechRecognition?: SpeechRecognitionConstructor; - webkitSpeechRecognition?: SpeechRecognitionConstructor; - } -} +import { SpeechDictation } from "./speech-dictation"; const styles = css` :host { @@ -292,7 +242,7 @@ export class FolkTranscription extends FolkShape { #isRecording = false; #duration = 0; #durationInterval: ReturnType | null = null; - #recognition: SpeechRecognition | null = null; + #dictation: SpeechDictation | null = null; #error: string | null = null; #recordBtn: HTMLElement | null = null; @@ -389,144 +339,95 @@ export class FolkTranscription extends FolkShape { this.dispatchEvent(new CustomEvent("close")); }); - // Initialize speech recognition - this.#initSpeechRecognition(); + // Initialize speech dictation + this.#initDictation(); return root; } - #initSpeechRecognition() { - const SpeechRecognitionImpl = window.SpeechRecognition || window.webkitSpeechRecognition; - - if (!SpeechRecognitionImpl) { + #initDictation() { + if (!SpeechDictation.isSupported()) { this.#error = "Speech recognition not supported in this browser"; this.#renderError(); return; } - this.#recognition = new SpeechRecognitionImpl(); - this.#recognition.continuous = true; - this.#recognition.interimResults = true; - this.#recognition.lang = "en-US"; - - 
this.#recognition.onresult = (event) => { - for (let i = event.resultIndex; i < event.results.length; i++) { - const result = event.results[i]; - const text = result[0].transcript; - - if (result.isFinal) { - // Find and update interim segment or add new - const interimIdx = this.#segments.findIndex((s) => !s.isFinal); - if (interimIdx >= 0) { - this.#segments[interimIdx] = { - ...this.#segments[interimIdx], - text, - isFinal: true, - }; - } else { - this.#segments.push({ - id: crypto.randomUUID(), - text, - timestamp: this.#duration, - isFinal: true, - }); - } + this.#dictation = new SpeechDictation({ + onInterim: (text) => { + const interimIdx = this.#segments.findIndex((s) => !s.isFinal); + if (interimIdx >= 0) { + this.#segments[interimIdx].text = text; } else { - // Update or add interim - const interimIdx = this.#segments.findIndex((s) => !s.isFinal); - if (interimIdx >= 0) { - this.#segments[interimIdx].text = text; - } else { - this.#segments.push({ - id: crypto.randomUUID(), - text, - timestamp: this.#duration, - isFinal: false, - }); - } + this.#segments.push({ + id: crypto.randomUUID(), + text, + timestamp: this.#duration, + isFinal: false, + }); } - } - this.#renderTranscript(); - }; - - this.#recognition.onerror = (event) => { - console.error("Speech recognition error:", event.error); - if (event.error !== "no-speech") { - this.#error = `Recognition error: ${event.error}`; + this.#renderTranscript(); + }, + onFinal: (text) => { + const interimIdx = this.#segments.findIndex((s) => !s.isFinal); + if (interimIdx >= 0) { + this.#segments[interimIdx] = { + ...this.#segments[interimIdx], + text, + isFinal: true, + }; + } else { + this.#segments.push({ + id: crypto.randomUUID(), + text, + timestamp: this.#duration, + isFinal: true, + }); + } + this.#renderTranscript(); + }, + onError: (err) => { + console.error("Speech recognition error:", err); + this.#error = err; this.#renderError(); - } - }; - - this.#recognition.onend = () => { - // Restart if still 
supposed to be recording - if (this.#isRecording && this.#recognition) { - this.#recognition.start(); - } - }; + }, + onStateChange: (recording) => { + this.#isRecording = recording; + if (recording) { + this.#error = null; + this.#recordBtn?.classList.add("recording"); + if (this.#statusEl) { + this.#statusEl.textContent = "Recording..."; + this.#statusEl.classList.add("recording"); + } + this.#durationInterval = setInterval(() => { + this.#duration++; + this.#updateDuration(); + }, 1000); + this.dispatchEvent(new CustomEvent("recording-start")); + } else { + this.#recordBtn?.classList.remove("recording"); + if (this.#statusEl) { + this.#statusEl.textContent = "Stopped"; + this.#statusEl.classList.remove("recording"); + } + if (this.#durationInterval) { + clearInterval(this.#durationInterval); + this.#durationInterval = null; + } + this.#segments = this.#segments.filter((s) => s.isFinal); + this.#renderTranscript(); + this.dispatchEvent(new CustomEvent("recording-stop", { detail: { transcript: this.transcript } })); + } + }, + }); } #toggleRecording() { - if (this.#isRecording) { - this.#stopRecording(); - } else { - this.#startRecording(); - } - } - - #startRecording() { - if (!this.#recognition) { - this.#error = "Speech recognition not available"; - this.#renderError(); - return; - } - - try { - this.#recognition.start(); - this.#isRecording = true; - this.#error = null; - - this.#recordBtn?.classList.add("recording"); - if (this.#statusEl) { - this.#statusEl.textContent = "Recording..."; - this.#statusEl.classList.add("recording"); - } - - // Start duration timer - this.#durationInterval = setInterval(() => { - this.#duration++; - this.#updateDuration(); - }, 1000); - - this.dispatchEvent(new CustomEvent("recording-start")); - } catch (error) { - this.#error = "Failed to start recording"; - this.#renderError(); - } + this.#dictation?.toggle(); } #stopRecording() { - if (!this.#isRecording) return; - - this.#recognition?.stop(); - this.#isRecording = false; - - 
// Web Speech API types (not all browsers have these in their types)
export interface SpeechRecognitionResult {
  readonly length: number;
  item(index: number): SpeechRecognitionAlternative;
  [index: number]: SpeechRecognitionAlternative;
  readonly isFinal: boolean;
}

export interface SpeechRecognitionAlternative {
  readonly transcript: string;
  readonly confidence: number;
}

export interface SpeechRecognitionResultList {
  readonly length: number;
  item(index: number): SpeechRecognitionResult;
  [index: number]: SpeechRecognitionResult;
}

export interface SpeechRecognitionEvent extends Event {
  readonly resultIndex: number;
  readonly results: SpeechRecognitionResultList;
}

export interface SpeechRecognitionErrorEvent extends Event {
  readonly error: string;
  readonly message: string;
}

export interface SpeechRecognitionInstance extends EventTarget {
  continuous: boolean;
  interimResults: boolean;
  lang: string;
  onresult: ((event: SpeechRecognitionEvent) => void) | null;
  onerror: ((event: SpeechRecognitionErrorEvent) => void) | null;
  onend: (() => void) | null;
  start(): void;
  stop(): void;
}

interface SpeechRecognitionConstructor {
  new (): SpeechRecognitionInstance;
}

declare global {
  interface Window {
    SpeechRecognition?: SpeechRecognitionConstructor;
    webkitSpeechRecognition?: SpeechRecognitionConstructor;
  }
}

/** Callbacks and settings accepted by {@link SpeechDictation}. All are optional. */
export interface SpeechDictationOptions {
  /** Called with the transcript of every non-final (in-progress) result. */
  onInterim?: (text: string) => void;
  /** Called with the transcript of every final result. */
  onFinal?: (text: string) => void;
  /** Called with a human-readable message when starting, restarting or recognition fails. */
  onError?: (error: string) => void;
  /** Called with `true` when recording begins and `false` when it ends for any reason. */
  onStateChange?: (recording: boolean) => void;
  /** Recognition language tag; defaults to "en-US". */
  lang?: string;
}

/**
 * Recognition error codes after which restarting the engine cannot succeed
 * (permission or hardware problems, unsupported language) or would fight
 * another session ("aborted"). Receiving one of these ends the recording
 * instead of triggering the auto-restart in `onend`.
 */
const TERMINAL_RECOGNITION_ERRORS: ReadonlySet<string> = new Set([
  "not-allowed",
  "service-not-allowed",
  "audio-capture",
  "language-not-supported",
  "aborted",
]);

/**
 * Looks up the (possibly vendor-prefixed) SpeechRecognition constructor.
 * Returns `undefined` when there is no `window` global at all, so callers
 * can run in non-browser environments without throwing.
 */
function resolveRecognitionImpl(): SpeechRecognitionConstructor | undefined {
  if (typeof window === "undefined") return undefined;
  return window.SpeechRecognition || window.webkitSpeechRecognition;
}

/**
 * Thin wrapper around the Web Speech API recognizer that runs in continuous,
 * interim-results mode and reports text through the callbacks in
 * {@link SpeechDictationOptions}.
 *
 * While recording, the recognizer is restarted whenever it ends on its own.
 * Recording stops (and `onStateChange(false)` fires) on `stop()`, on a
 * terminal recognition error, or when a restart attempt throws.
 */
export class SpeechDictation {
  #recognition: SpeechRecognitionInstance | null = null;
  #recording = false;
  #opts: SpeechDictationOptions;

  /**
   * @param opts Callbacks and language setting. If no recognizer
   *   implementation is available the instance is inert: `start()` does nothing.
   */
  constructor(opts: SpeechDictationOptions) {
    this.#opts = opts;
    this.#init();
  }

  /** @returns `true` when a SpeechRecognition implementation exists on `window`. */
  static isSupported(): boolean {
    return resolveRecognitionImpl() !== undefined;
  }

  /** Whether this instance currently considers itself recording. */
  get isRecording(): boolean {
    return this.#recording;
  }

  /**
   * Begins recognition. No-op when already recording or when no recognizer
   * exists. If the recognizer refuses to start, `onError` receives
   * "Failed to start recording" and the state stays "not recording".
   */
  start(): void {
    if (this.#recording || !this.#recognition) return;
    try {
      this.#recognition.start();
    } catch {
      this.#opts.onError?.("Failed to start recording");
      return;
    }
    // Flip state only after the engine accepted start(), and outside the try
    // so an exception thrown by the consumer callback is not misreported as a
    // start failure.
    this.#recording = true;
    this.#opts.onStateChange?.(true);
  }

  /** Ends recognition and notifies `onStateChange(false)`. No-op when not recording. */
  stop(): void {
    if (!this.#recording) return;
    // Clear the flag first so the onend handler does not auto-restart.
    this.#recording = false;
    this.#recognition?.stop();
    this.#opts.onStateChange?.(false);
  }

  /** Starts when idle, stops when recording. */
  toggle(): void {
    if (this.#recording) {
      this.stop();
    } else {
      this.start();
    }
  }

  /** Stops any active recording and detaches all handlers; the instance is inert afterwards. */
  destroy(): void {
    this.stop();
    if (this.#recognition) {
      this.#recognition.onresult = null;
      this.#recognition.onerror = null;
      this.#recognition.onend = null;
      this.#recognition = null;
    }
  }

  /** Marks the session as ended without calling into the recognizer (it already stopped itself). */
  #halt(): void {
    if (!this.#recording) return;
    this.#recording = false;
    this.#opts.onStateChange?.(false);
  }

  /** Creates and configures the recognizer, if the environment provides one. */
  #init(): void {
    const Impl = resolveRecognitionImpl();
    if (!Impl) return;

    const recognition = new Impl();
    recognition.continuous = true;
    recognition.interimResults = true;
    recognition.lang = this.#opts.lang ?? "en-US";

    recognition.onresult = (event) => {
      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i];
        const best = result?.[0];
        // Skip results that carry no alternative instead of throwing.
        if (!result || !best) continue;
        if (result.isFinal) {
          this.#opts.onFinal?.(best.transcript);
        } else {
          this.#opts.onInterim?.(best.transcript);
        }
      }
    };

    recognition.onerror = (event) => {
      // Silence is routine in continuous mode; the onend restart covers it.
      if (event.error === "no-speech") return;
      this.#opts.onError?.(`Recognition error: ${event.error}`);
      // The engine fires "end" right after "error"; without this the onend
      // handler would restart forever on errors that can never recover.
      if (TERMINAL_RECOGNITION_ERRORS.has(event.error)) {
        this.#halt();
      }
    };

    recognition.onend = () => {
      // Auto-restart while still recording
      if (!this.#recording || !this.#recognition) return;
      try {
        this.#recognition.start();
      } catch {
        // start() can throw (e.g. InvalidStateError); surface it and settle
        // into a consistent stopped state rather than leaking the exception.
        this.#opts.onError?.("Failed to restart recording");
        this.#halt();
      }
    };

    this.#recognition = recognition;
  }
}
@@ -84,6 +88,40 @@ export class RStackMi extends HTMLElement { // Stop clicks inside the panel from closing it panel.addEventListener("click", (e) => e.stopPropagation()); bar.addEventListener("click", (e) => e.stopPropagation()); + + // Voice dictation + const micBtn = this.#shadow.getElementById("mi-mic") as HTMLButtonElement | null; + if (micBtn) { + let baseText = ""; + this.#dictation = new SpeechDictation({ + onInterim: (text) => { + this.#interimText = text; + input.value = baseText + (baseText ? " " : "") + text; + }, + onFinal: (text) => { + this.#interimText = ""; + baseText += (baseText ? " " : "") + text; + input.value = baseText; + }, + onStateChange: (recording) => { + micBtn.classList.toggle("recording", recording); + if (!recording) { + baseText = input.value; + this.#interimText = ""; + } + }, + onError: (err) => console.warn("MI dictation:", err), + }); + + micBtn.addEventListener("click", (e) => { + e.stopPropagation(); + if (!this.#dictation!.isRecording) { + baseText = input.value; + } + this.#dictation!.toggle(); + input.focus(); + }); + } } /** Gather page context: open shapes, active module, tabs, canvas state. */ @@ -328,6 +366,21 @@ const STYLES = ` } .mi-input::placeholder { color: var(--rs-text-muted); } +.mi-mic-btn { + background: none; border: none; cursor: pointer; padding: 2px 4px; + font-size: 0.85rem; border-radius: 6px; transition: all 0.2s; + flex-shrink: 0; line-height: 1; +} +.mi-mic-btn:hover { background: var(--rs-bg-hover); } +.mi-mic-btn.recording { + animation: micPulse 1.5s infinite; + filter: saturate(2) brightness(1.1); +} +@keyframes micPulse { + 0%, 100% { transform: scale(1); } + 50% { transform: scale(1.15); } +} + .mi-panel { position: absolute; top: calc(100% + 8px); left: 0; right: 0; min-width: 360px; max-height: 420px;