Initial commit: Standalone transcription PWA

Extracted from canvas-website with:
- Web Speech API transcription (Chrome/Edge, fast)
- Local Whisper AI model (offline capable, ~75MB)
- PWA support with service worker
- Docker + Runtipi deployment configs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Jeff Emmett 2026-01-02 10:37:07 +01:00
commit 1a8774aa97
23 changed files with 8936 additions and 0 deletions

.gitignore (vendored, new file)

@@ -0,0 +1,27 @@
# Dependencies
node_modules/

# Build output
dist/

# Logs
*.log
npm-debug.log*

# Editor
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Environment
.env
.env.local
.env.*.local

# PWA dev
dev-dist/

Dockerfile (new file)

@@ -0,0 +1,34 @@
# Build stage
FROM node:20-alpine AS builder

WORKDIR /app

# Copy package files
COPY package*.json ./

# Install dependencies
RUN npm ci

# Copy source files
COPY . .

# Build the app
RUN npm run build

# Production stage
FROM nginx:alpine

# Copy custom nginx config
COPY nginx.conf /etc/nginx/conf.d/default.conf

# Copy built files from builder
COPY --from=builder /app/dist /usr/share/nginx/html

# Expose port
EXPOSE 80

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
    CMD wget --no-verbose --tries=1 --spider http://localhost:80/ || exit 1

CMD ["nginx", "-g", "daemon off;"]

README.md (new file)

@@ -0,0 +1,67 @@
# Transcribe

A standalone Progressive Web App (PWA) for voice transcription, extracted from canvas-website.

## Features

- **Two Transcription Modes** (see the mode-detection sketch below):
  - **Web Speech API** - Fast, real-time transcription using the browser's built-in speech recognition (Chrome/Edge)
  - **Local Whisper** - Offline-capable transcription using a Whisper AI model running entirely in the browser
- **PWA Support** - Install on any device, works offline
- **Simple UI** - Start/stop recording, copy/download transcripts
- **No Server Required** - All processing happens in the browser
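
Which mode is available comes down to a feature check: Chrome and Edge expose the Web Speech API, Firefox does not, and the local Whisper model works in all of them. A minimal sketch of that check, mirroring what the `useWebSpeechTranscription` hook does internally (the `detectDefaultMode` helper is illustrative only, not part of the app's API):

```ts
type TranscriptionMode = 'webspeech' | 'whisper'

// Hypothetical helper: pick Web Speech when the browser exposes it
// (standard or webkit-prefixed), otherwise use the local Whisper mode.
function detectDefaultMode(): TranscriptionMode {
  const SpeechRecognitionAPI =
    (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition
  return SpeechRecognitionAPI ? 'webspeech' : 'whisper'
}
```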

## Quick Start

```bash
# Install dependencies
npm install

# Run development server
npm run dev

# Build for production
npm run build
```

## Docker Deployment

```bash
# Build and run with Docker
docker compose up -d --build

# Or build manually
docker build -t transcribe-app .
docker run -p 3000:80 transcribe-app
```

## Runtipi Deployment

Copy the contents of the `runtipi/` folder to your Runtipi apps directory:

```bash
cp -r runtipi/* /path/to/runtipi/user-config/transcribe/
```

Or use the pre-built Docker image from GitHub Container Registry.

## Browser Compatibility

| Browser | Web Speech API | Whisper (Local) |
|---------|----------------|-----------------|
| Chrome  | Full           | Full            |
| Edge    | Full           | Full            |
| Safari  | Partial        | Full            |
| Firefox | No             | Full            |

## Tech Stack

- React 18 + TypeScript
- Vite with PWA plugin
- @xenova/transformers for local Whisper inference (see the sketch below)
- Web Speech API for browser-native transcription
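
The Whisper mode is plain `@xenova/transformers` running in the browser; a condensed sketch of what the `useWhisperTranscription` hook does (the `transcribe` helper is illustrative only; the real hook also caches the pipeline and resamples microphone audio to 16 kHz mono first):

```ts
import { pipeline, env } from '@xenova/transformers'

// Fetch model weights remotely on first use, then serve them from the browser cache.
env.allowRemoteModels = true
env.useBrowserCache = true

// Hypothetical helper: load the quantized whisper-tiny model and transcribe
// a Float32Array of 16 kHz mono audio samples.
async function transcribe(audio16k: Float32Array): Promise<string> {
  const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', {
    quantized: true
  })
  const result = await (transcriber as any)(audio16k, { task: 'transcribe', return_timestamps: false })
  return (result as { text?: string })?.text?.trim() ?? ''
}
```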

## License

MIT

docker-compose.yml (new file)

@@ -0,0 +1,20 @@
version: '3.8'

services:
  transcribe:
    build: .
    container_name: transcribe-app
    restart: unless-stopped
    ports:
      - "3000:80"
    labels:
      # Traefik labels for reverse proxy (if using Traefik)
      - "traefik.enable=true"
      - "traefik.http.routers.transcribe.rule=Host(`transcribe.jeffemmett.com`)"
      - "traefik.http.services.transcribe.loadbalancer.server.port=80"
    networks:
      - default

networks:
  default:
    name: transcribe-network

index.html (new file)

@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover" />
    <meta name="theme-color" content="#1a1a2e" />
    <meta name="description" content="Voice transcription PWA with local Whisper AI and Web Speech API" />
    <link rel="icon" type="image/svg+xml" href="/icons/icon.svg" />
    <link rel="apple-touch-icon" href="/icons/apple-touch-icon.png" />
    <title>Transcribe</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>

nginx.conf (new file)

@@ -0,0 +1,40 @@
server {
    listen 80;
    server_name localhost;
    root /usr/share/nginx/html;
    index index.html;

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_proxied any;
    gzip_types text/plain text/css text/xml text/javascript application/javascript application/json application/xml;

    # Security headers
    add_header X-Frame-Options "SAMEORIGIN" always;
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;

    # Cache static assets
    location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ {
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # Service worker - no cache
    location /sw.js {
        expires off;
        add_header Cache-Control "no-cache, no-store, must-revalidate";
    }

    # PWA manifest
    location /manifest.webmanifest {
        types { application/manifest+json webmanifest; }
    }

    # SPA fallback
    location / {
        try_files $uri $uri/ /index.html;
    }
}

package-lock.json (generated, new file, 7266 lines)

File diff suppressed because it is too large.

package.json (new file)

@@ -0,0 +1,29 @@
{
  "name": "transcribe-app",
  "version": "1.0.0",
  "description": "Standalone transcription PWA with Web Speech API and local Whisper",
  "type": "module",
  "scripts": {
    "dev": "vite --host 0.0.0.0 --port 3000",
    "build": "tsc && vite build",
    "preview": "vite preview",
    "docker:build": "docker build -t transcribe-app .",
    "docker:run": "docker run -p 3000:80 transcribe-app"
  },
  "dependencies": {
    "@xenova/transformers": "^2.17.2",
    "react": "^18.2.0",
    "react-dom": "^18.2.0"
  },
  "devDependencies": {
    "@types/react": "^18.2.0",
    "@types/react-dom": "^18.2.0",
    "@vitejs/plugin-react": "^4.2.1",
    "typescript": "^5.3.3",
    "vite": "^5.4.0",
    "vite-plugin-pwa": "^0.20.0"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}

New binary file (not shown): 348 KiB

public/icons/icon-192.png: new binary file (not shown), 341 KiB

public/icons/icon-512.png: new binary file (not shown), 348 KiB

public/icons/icon.svg (new file)

@@ -0,0 +1,8 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" fill="none">
<rect width="512" height="512" rx="96" fill="#1a1a2e"/>
<circle cx="256" cy="200" r="80" fill="#ff9500"/>
<rect x="216" y="280" width="80" height="120" rx="40" fill="#ff9500"/>
<path d="M176 320 C176 400, 256 440, 256 440 C256 440, 336 400, 336 320" stroke="#ff9500" stroke-width="24" stroke-linecap="round" fill="none"/>
<line x1="256" y1="440" x2="256" y2="480" stroke="#ff9500" stroke-width="24" stroke-linecap="round"/>
<line x1="200" y1="480" x2="312" y2="480" stroke="#ff9500" stroke-width="24" stroke-linecap="round"/>
</svg>

runtipi/config.json (new file)

@@ -0,0 +1,18 @@
{
  "$schema": "../schema.json",
  "name": "Transcribe",
  "id": "transcribe",
  "available": true,
  "short_desc": "Voice transcription PWA with local Whisper AI and Web Speech API",
  "author": "Jeff Emmett",
  "port": 3000,
  "categories": ["utilities", "media"],
  "description": "A progressive web app for voice transcription featuring both browser-native Web Speech API (fast, requires internet) and local Whisper AI model (works offline). Perfect for transcribing meetings, notes, and dictation.",
  "tipiVersion": 1,
  "version": "1.0.0",
  "source": "https://github.com/jeffemmett/transcribe-app",
  "website": "https://transcribe.jeffemmett.com",
  "exposable": true,
  "supported_architectures": ["arm64", "amd64"],
  "form_fields": []
}

runtipi/docker-compose.yml (new file)

@@ -0,0 +1,15 @@
version: "3.8"
services:
transcribe:
container_name: transcribe
image: ghcr.io/jeffemmett/transcribe-app:latest
restart: unless-stopped
ports:
- "${APP_PORT}:80"
networks:
- tipi_main_network
networks:
tipi_main_network:
external: true

src/App.css (new file)

@@ -0,0 +1,362 @@
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
:root {
--primary: #ff9500;
--primary-dark: #e68600;
--bg-dark: #1a1a2e;
--bg-card: #16213e;
--bg-input: #0f3460;
--text-primary: #ffffff;
--text-secondary: #a0a0b0;
--error: #ff4757;
--success: #2ecc71;
--border-radius: 12px;
}
html, body {
height: 100%;
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
background: var(--bg-dark);
color: var(--text-primary);
}
#root {
min-height: 100%;
}
.app {
min-height: 100vh;
display: flex;
flex-direction: column;
padding: 20px;
max-width: 800px;
margin: 0 auto;
}
/* Header */
.header {
text-align: center;
padding: 30px 0;
}
.header h1 {
font-size: 2.5rem;
font-weight: 700;
color: var(--primary);
margin-bottom: 8px;
}
.subtitle {
color: var(--text-secondary);
font-size: 1.1rem;
}
/* Main content */
.main {
flex: 1;
display: flex;
flex-direction: column;
gap: 24px;
}
/* Mode selector */
.mode-selector {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 12px;
}
.mode-btn {
display: flex;
flex-direction: column;
align-items: center;
gap: 4px;
padding: 16px;
background: var(--bg-card);
border: 2px solid transparent;
border-radius: var(--border-radius);
color: var(--text-primary);
font-size: 1rem;
font-weight: 600;
cursor: pointer;
transition: all 0.2s ease;
}
.mode-btn:hover:not(:disabled) {
border-color: var(--primary);
}
.mode-btn.active {
border-color: var(--primary);
background: rgba(255, 149, 0, 0.1);
}
.mode-btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.mode-desc {
font-size: 0.75rem;
font-weight: 400;
color: var(--text-secondary);
}
/* Model status */
.model-status {
background: var(--bg-card);
border-radius: var(--border-radius);
padding: 16px;
}
.progress-bar {
height: 8px;
background: var(--bg-input);
border-radius: 4px;
overflow: hidden;
margin-bottom: 8px;
}
.progress-fill {
height: 100%;
background: var(--primary);
transition: width 0.3s ease;
}
.progress-text {
font-size: 0.875rem;
color: var(--text-secondary);
text-align: center;
}
.model-info {
font-size: 0.875rem;
color: var(--text-secondary);
text-align: center;
}
/* Error banner */
.error-banner {
display: flex;
align-items: center;
justify-content: space-between;
background: rgba(255, 71, 87, 0.1);
border: 1px solid var(--error);
border-radius: var(--border-radius);
padding: 12px 16px;
color: var(--error);
}
.error-banner button {
background: none;
border: none;
color: var(--error);
font-size: 1.5rem;
cursor: pointer;
line-height: 1;
}
/* Controls */
.controls {
display: flex;
flex-direction: column;
align-items: center;
gap: 16px;
}
.record-btn {
display: flex;
align-items: center;
justify-content: center;
gap: 12px;
padding: 20px 40px;
border: none;
border-radius: 50px;
font-size: 1.125rem;
font-weight: 600;
cursor: pointer;
transition: all 0.2s ease;
}
.record-btn.start {
background: var(--primary);
color: var(--bg-dark);
}
.record-btn.start:hover:not(:disabled) {
background: var(--primary-dark);
transform: scale(1.02);
}
.record-btn.start:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.record-btn.stop {
background: var(--error);
color: white;
}
.record-btn.stop:hover {
background: #e84152;
transform: scale(1.02);
}
.record-btn svg {
flex-shrink: 0;
}
/* Recording indicator */
.recording-indicator {
display: flex;
align-items: center;
gap: 8px;
color: var(--error);
}
.pulse {
width: 12px;
height: 12px;
background: var(--error);
border-radius: 50%;
animation: pulse 1.5s ease-in-out infinite;
}
@keyframes pulse {
0%, 100% { opacity: 1; transform: scale(1); }
50% { opacity: 0.5; transform: scale(1.2); }
}
.time {
font-size: 1.25rem;
font-weight: 600;
font-variant-numeric: tabular-nums;
}
/* Transcript */
.transcript-container {
flex: 1;
display: flex;
flex-direction: column;
background: var(--bg-card);
border-radius: var(--border-radius);
overflow: hidden;
}
.transcript-header {
display: flex;
align-items: center;
justify-content: space-between;
padding: 16px 20px;
border-bottom: 1px solid rgba(255, 255, 255, 0.1);
}
.transcript-header h2 {
font-size: 1rem;
font-weight: 600;
}
.transcript-actions {
display: flex;
gap: 8px;
}
.action-btn {
padding: 6px 12px;
background: var(--bg-input);
border: none;
border-radius: 6px;
color: var(--text-primary);
font-size: 0.875rem;
cursor: pointer;
transition: background 0.2s ease;
}
.action-btn:hover {
background: rgba(255, 149, 0, 0.2);
}
.clear-btn:hover {
background: rgba(255, 71, 87, 0.2);
color: var(--error);
}
.transcript-box {
flex: 1;
min-height: 200px;
padding: 20px;
overflow-y: auto;
}
.transcript-text {
font-size: 1.125rem;
line-height: 1.7;
white-space: pre-wrap;
word-break: break-word;
}
.transcript-placeholder {
color: var(--text-secondary);
font-style: italic;
}
.interim-text {
margin-top: 12px;
padding-top: 12px;
border-top: 1px dashed rgba(255, 255, 255, 0.1);
color: var(--text-secondary);
font-style: italic;
}
.transcribing-indicator {
margin-top: 12px;
color: var(--primary);
font-size: 0.875rem;
}
/* Footer */
.footer {
text-align: center;
padding: 20px 0;
}
.footer p {
color: var(--text-secondary);
font-size: 0.875rem;
}
/* Responsive */
@media (max-width: 600px) {
.app {
padding: 16px;
}
.header h1 {
font-size: 2rem;
}
.record-btn {
padding: 16px 32px;
font-size: 1rem;
}
.transcript-actions {
flex-wrap: wrap;
}
}
/* PWA standalone mode */
@media (display-mode: standalone) {
.header {
padding-top: env(safe-area-inset-top, 20px);
}
.footer {
padding-bottom: env(safe-area-inset-bottom, 20px);
}
}

src/App.tsx (new file)

@@ -0,0 +1,266 @@
import { useState, useCallback } from 'react'
import { useWebSpeechTranscription } from './hooks/useWebSpeechTranscription'
import { useWhisperTranscription } from './hooks/useWhisperTranscription'
import './App.css'
type TranscriptionMode = 'webspeech' | 'whisper'
function App() {
const [mode, setMode] = useState<TranscriptionMode>('webspeech')
const [transcript, setTranscript] = useState('')
const [error, setError] = useState<string | null>(null)
const [modelProgress, setModelProgress] = useState<{ progress: number; status: string } | null>(null)
const [recordingTime, setRecordingTime] = useState(0)
const [timerInterval, setTimerInterval] = useState<NodeJS.Timeout | null>(null)
// Web Speech API hook
const webSpeech = useWebSpeechTranscription({
onTranscriptUpdate: (text) => {
setTranscript(prev => prev + (prev ? ' ' : '') + text)
},
onError: (err) => setError(err.message),
language: 'en-US'
})
// Whisper hook
const whisper = useWhisperTranscription({
onTranscriptUpdate: (text) => {
setTranscript(prev => prev + (prev ? ' ' : '') + text)
},
onError: (err) => setError(err.message),
onModelProgress: (progress, status) => setModelProgress({ progress, status }),
language: 'en'
})
const isRecording = mode === 'webspeech' ? webSpeech.isRecording : whisper.isRecording
const isSupported = mode === 'webspeech' ? webSpeech.isSupported : true
// Start timer
const startTimer = useCallback(() => {
setRecordingTime(0)
const interval = setInterval(() => {
setRecordingTime(prev => prev + 1)
}, 1000)
setTimerInterval(interval)
}, [])
// Stop timer
const stopTimer = useCallback(() => {
if (timerInterval) {
clearInterval(timerInterval)
setTimerInterval(null)
}
}, [timerInterval])
// Handle start recording
const handleStart = useCallback(async () => {
setError(null)
startTimer()
if (mode === 'webspeech') {
webSpeech.startRecording()
} else {
await whisper.startRecording()
}
}, [mode, webSpeech, whisper, startTimer])
// Handle stop recording
const handleStop = useCallback(async () => {
stopTimer()
if (mode === 'webspeech') {
webSpeech.stopRecording()
} else {
await whisper.stopRecording()
}
}, [mode, webSpeech, whisper, stopTimer])
// Handle clear
const handleClear = useCallback(() => {
setTranscript('')
setRecordingTime(0)
if (mode === 'webspeech') {
webSpeech.clearTranscript()
} else {
whisper.clearTranscript()
}
}, [mode, webSpeech, whisper])
// Copy to clipboard
const handleCopy = useCallback(async () => {
try {
await navigator.clipboard.writeText(transcript)
// Show brief feedback
const btn = document.querySelector('.copy-btn') as HTMLButtonElement
if (btn) {
const originalText = btn.textContent
btn.textContent = 'Copied!'
setTimeout(() => {
btn.textContent = originalText
}, 1500)
}
} catch (err) {
console.error('Failed to copy:', err)
}
}, [transcript])
// Download as text file
const handleDownload = useCallback(() => {
const blob = new Blob([transcript], { type: 'text/plain' })
const url = URL.createObjectURL(blob)
const a = document.createElement('a')
a.href = url
a.download = `transcript-${new Date().toISOString().slice(0, 10)}.txt`
document.body.appendChild(a)
a.click()
document.body.removeChild(a)
URL.revokeObjectURL(url)
}, [transcript])
// Format recording time
const formatTime = (seconds: number) => {
const mins = Math.floor(seconds / 60)
const secs = seconds % 60
return `${mins}:${secs.toString().padStart(2, '0')}`
}
return (
<div className="app">
<header className="header">
<h1>Transcribe</h1>
<p className="subtitle">Voice to text, right in your browser</p>
</header>
<main className="main">
{/* Mode selector */}
<div className="mode-selector">
<button
className={`mode-btn ${mode === 'webspeech' ? 'active' : ''}`}
onClick={() => setMode('webspeech')}
disabled={isRecording}
>
Web Speech
<span className="mode-desc">Fast, requires internet</span>
</button>
<button
className={`mode-btn ${mode === 'whisper' ? 'active' : ''}`}
onClick={() => setMode('whisper')}
disabled={isRecording}
>
Whisper
<span className="mode-desc">Local AI, works offline</span>
</button>
</div>
{/* Model loading status for Whisper */}
{mode === 'whisper' && !whisper.modelLoaded && (
<div className="model-status">
{whisper.modelLoading ? (
<>
<div className="progress-bar">
<div
className="progress-fill"
style={{ width: `${modelProgress?.progress || 0}%` }}
/>
</div>
<p className="progress-text">{modelProgress?.status || 'Loading model...'}</p>
</>
) : (
<p className="model-info">
Model will be downloaded on first use (~75MB)
</p>
)}
</div>
)}
{/* Error display */}
{error && (
<div className="error-banner">
<span>{error}</span>
<button onClick={() => setError(null)}>×</button>
</div>
)}
{/* Recording controls */}
<div className="controls">
{!isRecording ? (
<button
className="record-btn start"
onClick={handleStart}
disabled={!isSupported || (mode === 'whisper' && whisper.modelLoading)}
>
<svg viewBox="0 0 24 24" width="32" height="32" fill="currentColor">
<circle cx="12" cy="12" r="10" />
</svg>
<span>Start Recording</span>
</button>
) : (
<button className="record-btn stop" onClick={handleStop}>
<svg viewBox="0 0 24 24" width="32" height="32" fill="currentColor">
<rect x="6" y="6" width="12" height="12" rx="2" />
</svg>
<span>Stop Recording</span>
</button>
)}
{isRecording && (
<div className="recording-indicator">
<span className="pulse" />
<span className="time">{formatTime(recordingTime)}</span>
</div>
)}
</div>
{/* Transcript display */}
<div className="transcript-container">
<div className="transcript-header">
<h2>Transcript</h2>
{transcript && (
<div className="transcript-actions">
<button className="action-btn copy-btn" onClick={handleCopy}>
Copy
</button>
<button className="action-btn" onClick={handleDownload}>
Download
</button>
<button className="action-btn clear-btn" onClick={handleClear}>
Clear
</button>
</div>
)}
</div>
<div className="transcript-box">
{transcript ? (
<p className="transcript-text">{transcript}</p>
) : (
<p className="transcript-placeholder">
{isRecording ? 'Listening...' : 'Your transcript will appear here'}
</p>
)}
{/* Interim text for Web Speech */}
{mode === 'webspeech' && webSpeech.interimTranscript && (
<p className="interim-text">{webSpeech.interimTranscript}</p>
)}
{/* Transcribing indicator for Whisper */}
{mode === 'whisper' && whisper.isTranscribing && (
<p className="transcribing-indicator">Processing audio...</p>
)}
</div>
</div>
</main>
<footer className="footer">
<p>
{mode === 'webspeech'
? 'Using browser\'s built-in speech recognition'
: 'Using local Whisper AI model'}
</p>
</footer>
</div>
)
}
export default App

src/hooks/useWebSpeechTranscription.ts (new file)

@@ -0,0 +1,235 @@
import { useState, useRef, useCallback, useEffect } from 'react'
// TypeScript declarations for Web Speech API
declare global {
interface Window {
SpeechRecognition: typeof SpeechRecognition
webkitSpeechRecognition: typeof SpeechRecognition
}
interface SpeechRecognition extends EventTarget {
continuous: boolean
interimResults: boolean
lang: string
maxAlternatives: number
start(): void
stop(): void
onstart: ((this: SpeechRecognition, ev: Event) => void) | null
onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => void) | null
onerror: ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => void) | null
onend: ((this: SpeechRecognition, ev: Event) => void) | null
}
interface SpeechRecognitionEvent extends Event {
resultIndex: number
results: SpeechRecognitionResultList
}
interface SpeechRecognitionErrorEvent extends Event {
error: string
}
interface SpeechRecognitionResultList {
readonly length: number
item(index: number): SpeechRecognitionResult
[index: number]: SpeechRecognitionResult
}
interface SpeechRecognitionResult {
readonly length: number
item(index: number): SpeechRecognitionAlternative
[index: number]: SpeechRecognitionAlternative
readonly isFinal: boolean
}
interface SpeechRecognitionAlternative {
readonly transcript: string
readonly confidence: number
}
// eslint-disable-next-line no-var
var SpeechRecognition: {
prototype: SpeechRecognition
new(): SpeechRecognition
}
}
interface UseWebSpeechTranscriptionOptions {
onTranscriptUpdate?: (text: string) => void
onError?: (error: Error) => void
language?: string
continuous?: boolean
interimResults?: boolean
}
export const useWebSpeechTranscription = ({
onTranscriptUpdate,
onError,
language = 'en-US',
continuous = true,
interimResults = true
}: UseWebSpeechTranscriptionOptions = {}) => {
const [isRecording, setIsRecording] = useState(false)
const [transcript, setTranscript] = useState('')
const [interimTranscript, setInterimTranscript] = useState('')
const [isSupported, setIsSupported] = useState(false)
const recognitionRef = useRef<SpeechRecognition | null>(null)
const finalTranscriptRef = useRef('')
const lastSpeechTimeRef = useRef<number>(0)
// Process transcript with line breaks after pauses
const processTranscript = useCallback((text: string, isFinal: boolean = false) => {
if (!text.trim()) return text
let processedText = text.trim()
// Add punctuation if missing at the end
if (isFinal && processedText && !/[.!?]$/.test(processedText)) {
processedText += '.'
}
// Add line break if there's been a pause
if (isFinal) {
const now = Date.now()
const timeSinceLastSpeech = now - lastSpeechTimeRef.current
if (timeSinceLastSpeech > 3000 && lastSpeechTimeRef.current > 0) {
processedText = '\n\n' + processedText
}
lastSpeechTimeRef.current = now
}
return processedText
}, [])
// Check if Web Speech API is supported
useEffect(() => {
const SpeechRecognitionAPI = window.SpeechRecognition || window.webkitSpeechRecognition
if (SpeechRecognitionAPI) {
setIsSupported(true)
} else {
setIsSupported(false)
onError?.(new Error('Web Speech API is not supported in this browser. Try Chrome or Edge.'))
}
}, [onError])
// Initialize speech recognition
const initializeRecognition = useCallback(() => {
if (!isSupported) return null
const SpeechRecognitionAPI = window.SpeechRecognition || window.webkitSpeechRecognition
const recognition = new SpeechRecognitionAPI()
recognition.continuous = continuous
recognition.interimResults = interimResults
recognition.lang = language
recognition.maxAlternatives = 1
recognition.onstart = () => {
setIsRecording(true)
}
recognition.onresult = (event) => {
let interim = ''
let final = ''
for (let i = event.resultIndex; i < event.results.length; i++) {
const result = event.results[i]
const text = result[0].transcript
if (result.isFinal) {
final += text
} else {
interim += text
}
}
if (final) {
const processedFinal = processTranscript(final, true)
finalTranscriptRef.current += processedFinal + ' '
setTranscript(finalTranscriptRef.current)
onTranscriptUpdate?.(processedFinal)
}
if (interim) {
setInterimTranscript(processTranscript(interim, false))
}
}
recognition.onerror = (event) => {
console.error('Speech recognition error:', event.error)
setIsRecording(false)
if (event.error !== 'aborted') {
onError?.(new Error(`Speech recognition error: ${event.error}`))
}
}
recognition.onend = () => {
setIsRecording(false)
setInterimTranscript('')
}
return recognition
}, [isSupported, continuous, interimResults, language, onTranscriptUpdate, onError, processTranscript])
// Start recording
const startRecording = useCallback(() => {
if (!isSupported) {
onError?.(new Error('Web Speech API is not supported'))
return
}
try {
lastSpeechTimeRef.current = 0
const recognition = initializeRecognition()
if (recognition) {
recognitionRef.current = recognition
recognition.start()
}
} catch (error) {
console.error('Error starting speech recognition:', error)
onError?.(error as Error)
}
}, [isSupported, initializeRecognition, onError])
// Stop recording
const stopRecording = useCallback(() => {
if (recognitionRef.current) {
recognitionRef.current.stop()
recognitionRef.current = null
}
}, [])
// Clear transcript
const clearTranscript = useCallback(() => {
finalTranscriptRef.current = ''
setTranscript('')
setInterimTranscript('')
}, [])
// Cleanup on unmount
useEffect(() => {
return () => {
if (recognitionRef.current) {
recognitionRef.current.stop()
recognitionRef.current = null
}
}
}, [])
return {
isRecording,
transcript,
interimTranscript,
isSupported,
startRecording,
stopRecording,
clearTranscript
}
}
export default useWebSpeechTranscription

src/hooks/useWhisperTranscription.ts (new file)

@@ -0,0 +1,426 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import { pipeline, env } from '@xenova/transformers'
// Configure the transformers library
env.allowRemoteModels = true
env.allowLocalModels = false
env.useBrowserCache = true
// Resample audio to 16kHz for Whisper
function resampleAudio(audioData: Float32Array, fromSampleRate: number, toSampleRate: number): Float32Array {
if (fromSampleRate === toSampleRate) {
return audioData
}
if (!audioData || audioData.length === 0) {
throw new Error('Invalid audio data for resampling')
}
const ratio = fromSampleRate / toSampleRate
const newLength = Math.floor(audioData.length / ratio)
if (newLength <= 0) {
throw new Error('Invalid resampled length')
}
const resampled = new Float32Array(newLength)
for (let i = 0; i < newLength; i++) {
const sourceIndex = Math.floor(i * ratio)
if (sourceIndex >= 0 && sourceIndex < audioData.length) {
resampled[i] = audioData[sourceIndex]
} else {
resampled[i] = 0
}
}
return resampled
}
interface UseWhisperTranscriptionOptions {
onTranscriptUpdate?: (text: string) => void
onError?: (error: Error) => void
onModelProgress?: (progress: number, status: string) => void
language?: string
enableStreaming?: boolean
}
export const useWhisperTranscription = ({
onTranscriptUpdate,
onError,
onModelProgress,
language = 'en',
enableStreaming = true
}: UseWhisperTranscriptionOptions = {}) => {
const [isRecording, setIsRecording] = useState(false)
const [isTranscribing, setIsTranscribing] = useState(false)
const [transcript, setTranscript] = useState('')
const [modelLoaded, setModelLoaded] = useState(false)
const [modelLoading, setModelLoading] = useState(false)
const transcriberRef = useRef<ReturnType<typeof pipeline> extends Promise<infer T> ? T : never>(null!)
const streamRef = useRef<MediaStream | null>(null)
const mediaRecorderRef = useRef<MediaRecorder | null>(null)
const audioChunksRef = useRef<Blob[]>([])
const isRecordingRef = useRef(false)
const transcriptRef = useRef('')
const periodicTranscriptionRef = useRef<NodeJS.Timeout | null>(null)
// Initialize Whisper model
const initializeTranscriber = useCallback(async () => {
if (transcriberRef.current || modelLoading) return transcriberRef.current
setModelLoading(true)
try {
const modelNames = ['Xenova/whisper-tiny.en', 'Xenova/whisper-tiny']
let transcriber = null
let lastError = null
for (const modelName of modelNames) {
try {
transcriber = await pipeline('automatic-speech-recognition', modelName, {
quantized: true,
progress_callback: (progress: { status: string; file?: string; progress?: number }) => {
if (progress.status === 'downloading' && progress.progress) {
onModelProgress?.(progress.progress, `Downloading ${progress.file || 'model'}...`)
} else if (progress.status === 'loading') {
onModelProgress?.(100, 'Loading model...')
}
}
})
transcriberRef.current = transcriber
setModelLoaded(true)
setModelLoading(false)
return transcriber
} catch (error) {
console.warn(`Failed to load model ${modelName}:`, error)
lastError = error
continue
}
}
throw lastError || new Error('Failed to load any Whisper model')
} catch (error) {
console.error('Failed to load Whisper model:', error)
setModelLoading(false)
onError?.(error as Error)
throw error
}
}, [onError, onModelProgress, modelLoading])
// Process audio chunks for transcription
const processAudioChunks = useCallback(async () => {
if (audioChunksRef.current.length === 0) return
if (!transcriberRef.current) {
console.warn('Transcriber not initialized')
return
}
try {
setIsTranscribing(true)
// Get MIME type from MediaRecorder
let mimeType = 'audio/webm;codecs=opus'
if (mediaRecorderRef.current?.mimeType) {
mimeType = mediaRecorderRef.current.mimeType
}
const validChunks = audioChunksRef.current.filter(chunk => chunk && chunk.size > 1000)
if (validChunks.length === 0) return
const audioBlob = new Blob(validChunks, { type: mimeType })
if (audioBlob.size < 10000) return
const arrayBuffer = await audioBlob.arrayBuffer()
const audioContext = new AudioContext()
let audioBuffer: AudioBuffer
try {
audioBuffer = await audioContext.decodeAudioData(arrayBuffer)
} catch {
// Try alternative MIME type
try {
const altBlob = new Blob(validChunks, { type: 'audio/webm' })
const altBuffer = await altBlob.arrayBuffer()
audioBuffer = await audioContext.decodeAudioData(altBuffer)
} catch (altError) {
await audioContext.close()
throw new Error('Failed to decode audio. Format may not be supported.')
}
}
await audioContext.close()
const audioData = audioBuffer.getChannelData(0)
// Resample to 16kHz
let processedAudioData: Float32Array = audioData
if (audioBuffer.sampleRate !== 16000) {
processedAudioData = resampleAudio(audioData, audioBuffer.sampleRate, 16000)
}
// Skip if too quiet
const rms = Math.sqrt(processedAudioData.reduce((sum, val) => sum + val * val, 0) / processedAudioData.length)
if (rms < 0.001) return
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const result = await (transcriberRef.current as any)(processedAudioData, {
language,
task: 'transcribe',
return_timestamps: false
})
const newText = (result as { text?: string })?.text?.trim() || ''
if (newText) {
const processedText = newText.endsWith('.') || newText.endsWith('?') || newText.endsWith('!')
? newText
: newText + '.'
transcriptRef.current += (transcriptRef.current ? ' ' : '') + processedText
setTranscript(transcriptRef.current)
onTranscriptUpdate?.(processedText)
}
// Clear processed chunks
audioChunksRef.current = []
} catch (error) {
console.error('Error processing audio:', error)
onError?.(error as Error)
} finally {
setIsTranscribing(false)
}
}, [language, onTranscriptUpdate, onError])
// Process accumulated chunks for streaming
const processAccumulatedChunks = useCallback(async () => {
try {
const chunks = audioChunksRef.current
if (chunks.length < 3) return
// Get recent chunks (last 2 seconds worth)
const recentChunks = chunks.slice(-5)
const validChunks = recentChunks.filter(chunk => chunk && chunk.size > 2000)
if (validChunks.length < 2) return
let mimeType = 'audio/webm;codecs=opus'
if (mediaRecorderRef.current?.mimeType) {
mimeType = mediaRecorderRef.current.mimeType
}
const tempBlob = new Blob(validChunks, { type: mimeType })
if (tempBlob.size < 20000) return
const audioBuffer = await tempBlob.arrayBuffer()
const audioContext = new AudioContext()
let audioBufferFromBlob: AudioBuffer
try {
audioBufferFromBlob = await audioContext.decodeAudioData(audioBuffer)
} catch {
await audioContext.close()
return
}
await audioContext.close()
const audioData = audioBufferFromBlob.getChannelData(0)
if (!audioData || audioData.length === 0) return
let processedAudioData: Float32Array = audioData
if (audioBufferFromBlob.sampleRate !== 16000) {
processedAudioData = resampleAudio(audioData, audioBufferFromBlob.sampleRate, 16000)
}
// Check for meaningful audio
const rms = Math.sqrt(processedAudioData.reduce((sum, val) => sum + val * val, 0) / processedAudioData.length)
if (rms < 0.001) return
// Limit to 2 seconds
const maxSamples = 32000
if (processedAudioData.length > maxSamples) {
processedAudioData = processedAudioData.slice(-maxSamples)
}
if (!transcriberRef.current) return
// eslint-disable-next-line @typescript-eslint/no-explicit-any
const result = await (transcriberRef.current as any)(processedAudioData, {
language,
task: 'transcribe',
return_timestamps: false,
no_speech_threshold: 0.3
})
const text = (result as { text?: string })?.text?.trim() || ''
if (text && text.length > 2) {
const processedText = text.endsWith('.') || text.endsWith('?') || text.endsWith('!')
? text
: text + '.'
// Append if not duplicate
if (!transcriptRef.current.endsWith(processedText)) {
transcriptRef.current += (transcriptRef.current ? ' ' : '') + processedText
setTranscript(transcriptRef.current)
onTranscriptUpdate?.(processedText)
}
}
} catch (error) {
console.error('Streaming transcription error:', error)
}
}, [language, onTranscriptUpdate])
// Start recording
const startRecording = useCallback(async () => {
try {
// Initialize model if needed
if (!modelLoaded) {
await initializeTranscriber()
}
audioChunksRef.current = []
// Clear periodic timer
if (periodicTranscriptionRef.current) {
clearInterval(periodicTranscriptionRef.current)
periodicTranscriptionRef.current = null
}
// Get microphone access
const stream = await navigator.mediaDevices.getUserMedia({
audio: {
echoCancellation: true,
noiseSuppression: true,
autoGainControl: true,
sampleRate: 44100,
channelCount: 1
}
})
streamRef.current = stream
// Create MediaRecorder with fallback options
let mediaRecorder: MediaRecorder | null = null
const mimeOptions = [
{ mimeType: 'audio/webm;codecs=opus' },
{ mimeType: 'audio/webm' },
{ mimeType: 'audio/ogg;codecs=opus' },
{ mimeType: 'audio/mp4' }
]
for (const option of mimeOptions) {
if (MediaRecorder.isTypeSupported(option.mimeType)) {
mediaRecorder = new MediaRecorder(stream, option)
break
}
}
if (!mediaRecorder) {
throw new Error('No supported audio format found')
}
mediaRecorderRef.current = mediaRecorder
mediaRecorder.ondataavailable = (event) => {
if (event.data.size > 1000) {
audioChunksRef.current.push(event.data)
// Limit chunks to prevent memory issues
if (audioChunksRef.current.length > 20) {
audioChunksRef.current = audioChunksRef.current.slice(-15)
}
}
}
mediaRecorder.onstop = () => {
processAudioChunks()
}
mediaRecorder.onstart = () => {
setIsRecording(true)
isRecordingRef.current = true
// Start streaming transcription
if (enableStreaming) {
periodicTranscriptionRef.current = setInterval(() => {
if (isRecordingRef.current) {
processAccumulatedChunks()
}
}, 1000)
}
}
// Start with 1 second chunks
mediaRecorder.start(1000)
isRecordingRef.current = true
setIsRecording(true)
} catch (error) {
console.error('Error starting recording:', error)
onError?.(error as Error)
}
}, [processAudioChunks, processAccumulatedChunks, onError, enableStreaming, modelLoaded, initializeTranscriber])
// Stop recording
const stopRecording = useCallback(async () => {
try {
if (periodicTranscriptionRef.current) {
clearInterval(periodicTranscriptionRef.current)
periodicTranscriptionRef.current = null
}
if (mediaRecorderRef.current && isRecordingRef.current) {
mediaRecorderRef.current.stop()
}
if (streamRef.current) {
streamRef.current.getTracks().forEach(track => track.stop())
streamRef.current = null
}
isRecordingRef.current = false
setIsRecording(false)
} catch (error) {
console.error('Error stopping recording:', error)
onError?.(error as Error)
}
}, [onError])
// Clear transcript
const clearTranscript = useCallback(() => {
transcriptRef.current = ''
setTranscript('')
}, [])
// Cleanup on unmount
useEffect(() => {
return () => {
if (periodicTranscriptionRef.current) {
clearInterval(periodicTranscriptionRef.current)
}
if (mediaRecorderRef.current?.state === 'recording') {
mediaRecorderRef.current.stop()
}
if (streamRef.current) {
streamRef.current.getTracks().forEach(track => track.stop())
}
}
}, [])
return {
isRecording,
isTranscribing,
transcript,
modelLoaded,
modelLoading,
startRecording,
stopRecording,
clearTranscript,
initializeTranscriber
}
}
export default useWhisperTranscription

src/main.tsx (new file)

@@ -0,0 +1,9 @@
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App'

ReactDOM.createRoot(document.getElementById('root')!).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
)

src/vite-env.d.ts (vendored, new file)

@@ -0,0 +1 @@
/// <reference types="vite/client" />

tsconfig.json (new file)

@@ -0,0 +1,21 @@
{
  "compilerOptions": {
    "target": "ES2020",
    "useDefineForClassFields": true,
    "lib": ["ES2020", "DOM", "DOM.Iterable"],
    "module": "ESNext",
    "skipLibCheck": true,
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": true,
    "jsx": "react-jsx",
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noFallthroughCasesInSwitch": true
  },
  "include": ["src"],
  "references": [{ "path": "./tsconfig.node.json" }]
}

tsconfig.node.json (new file)

@@ -0,0 +1,11 @@
{
  "compilerOptions": {
    "composite": true,
    "skipLibCheck": true,
    "module": "ESNext",
    "moduleResolution": "bundler",
    "allowSyntheticDefaultImports": true,
    "strict": true
  },
  "include": ["vite.config.ts"]
}

vite.config.ts (new file)

@@ -0,0 +1,65 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
import { VitePWA } from 'vite-plugin-pwa'

export default defineConfig({
  plugins: [
    react(),
    VitePWA({
      registerType: 'autoUpdate',
      includeAssets: ['favicon.ico', 'apple-touch-icon.png', 'mask-icon.svg'],
      manifest: {
        name: 'Transcribe',
        short_name: 'Transcribe',
        description: 'Voice transcription app with local Whisper and Web Speech API',
        theme_color: '#ff9500',
        background_color: '#1a1a2e',
        display: 'standalone',
        orientation: 'portrait',
        scope: '/',
        start_url: '/',
        icons: [
          {
            src: '/icons/icon-192.png',
            sizes: '192x192',
            type: 'image/png'
          },
          {
            src: '/icons/icon-512.png',
            sizes: '512x512',
            type: 'image/png'
          },
          {
            src: '/icons/icon-512.png',
            sizes: '512x512',
            type: 'image/png',
            purpose: 'maskable'
          }
        ]
      },
      workbox: {
        globPatterns: ['**/*.{js,css,html,ico,png,svg,woff2}'],
        runtimeCaching: [
          {
            urlPattern: /^https:\/\/cdn\.jsdelivr\.net\/.*/i,
            handler: 'CacheFirst',
            options: {
              cacheName: 'cdn-cache',
              expiration: {
                maxEntries: 50,
                maxAgeSeconds: 60 * 60 * 24 * 30 // 30 days
              }
            }
          }
        ]
      }
    })
  ],
  build: {
    target: 'esnext',
    outDir: 'dist'
  },
  server: {
    port: 3000
  }
})