Initial commit: Standalone transcription PWA
Extracted from canvas-website with:

- Web Speech API transcription (Chrome/Edge, fast)
- Local Whisper AI model (offline capable, ~75MB)
- PWA support with service worker
- Docker + Runtipi deployment configs

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

commit 1a8774aa97

.gitignore
@@ -0,0 +1,27 @@
# Dependencies
node_modules/

# Build output
dist/

# Logs
*.log
npm-debug.log*

# Editor
.vscode/
.idea/
*.swp
*.swo

# OS
.DS_Store
Thumbs.db

# Environment
.env
.env.local
.env.*.local

# PWA dev
dev-dist/

Dockerfile
@@ -0,0 +1,34 @@
# Build stage
FROM node:20-alpine AS builder

WORKDIR /app

# Copy package files
COPY package*.json ./

# Install dependencies
RUN npm ci

# Copy source files
COPY . .

# Build the app
RUN npm run build

# Production stage
FROM nginx:alpine

# Copy custom nginx config
COPY nginx.conf /etc/nginx/conf.d/default.conf

# Copy built files from builder
COPY --from=builder /app/dist /usr/share/nginx/html

# Expose port
EXPOSE 80

# Health check
HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
  CMD wget --no-verbose --tries=1 --spider http://localhost:80/ || exit 1

CMD ["nginx", "-g", "daemon off;"]

README.md
@@ -0,0 +1,67 @@
# Transcribe

A standalone Progressive Web App (PWA) for voice transcription, extracted from canvas-website.

## Features

- **Two Transcription Modes:**
  - **Web Speech API** - Fast, real-time transcription using the browser's built-in speech recognition (Chrome/Edge)
  - **Local Whisper** - Offline-capable transcription using a Whisper AI model running entirely in the browser (see the sketch after this list)
- **PWA Support** - Install on any device, works offline
- **Simple UI** - Start/stop recording, copy/download transcripts
- **No Server Required** - All processing happens in the browser
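
To make the local Whisper mode concrete, the hook in `src/hooks/useWhisperTranscription.ts` (later in this commit) essentially boils down to the sketch below. It is an illustration only: the `transcribeClip` wrapper and its parameter are not part of the app, but the model name and pipeline options are the ones the hook actually uses.

```ts
import { pipeline } from '@xenova/transformers'

// Illustrative wrapper, not part of the app: load the quantized tiny Whisper model once
// (~75MB, cached by the browser) and transcribe a mono, 16 kHz Float32Array clip.
const transcriberPromise = pipeline('automatic-speech-recognition', 'Xenova/whisper-tiny.en', {
  quantized: true
})

export async function transcribeClip(audio16kHz: Float32Array): Promise<string> {
  const transcriber = await transcriberPromise
  const result = await (transcriber as any)(audio16kHz, {
    language: 'en',
    task: 'transcribe',
    return_timestamps: false
  })
  return (result as { text?: string })?.text?.trim() ?? ''
}
```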

## Quick Start

```bash
# Install dependencies
npm install

# Run development server
npm run dev

# Build for production
npm run build
```

## Docker Deployment

```bash
# Build and run with Docker
docker compose up -d --build

# Or build manually
docker build -t transcribe-app .
docker run -p 3000:80 transcribe-app
```

## Runtipi Deployment

Copy the contents of the `runtipi/` folder to your Runtipi apps directory:

```bash
cp -r runtipi/* /path/to/runtipi/user-config/transcribe/
```

Or use the pre-built Docker image from GitHub Container Registry.

## Browser Compatibility

| Browser | Web Speech API | Whisper (Local) |
|---------|----------------|-----------------|
| Chrome  | Full           | Full            |
| Edge    | Full           | Full            |
| Safari  | Partial        | Full            |
| Firefox | No             | Full            |
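
The "Web Speech API" column comes down to a simple runtime check. A minimal sketch of the detection performed by `useWebSpeechTranscription` (nothing here beyond what that hook already does):

```ts
// Feature-detect the Web Speech API; Firefox currently exposes neither constructor,
// so the UI falls back to the local Whisper mode there.
const SpeechRecognitionAPI =
  (window as any).SpeechRecognition || (window as any).webkitSpeechRecognition
const webSpeechSupported = Boolean(SpeechRecognitionAPI)
```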

## Tech Stack

- React 18 + TypeScript
- Vite with PWA plugin
- @xenova/transformers for local Whisper inference
- Web Speech API for browser-native transcription

## License

MIT

docker-compose.yml
@@ -0,0 +1,20 @@
version: '3.8'

services:
  transcribe:
    build: .
    container_name: transcribe-app
    restart: unless-stopped
    ports:
      - "3000:80"
    labels:
      # Traefik labels for reverse proxy (if using Traefik)
      - "traefik.enable=true"
      - "traefik.http.routers.transcribe.rule=Host(`transcribe.jeffemmett.com`)"
      - "traefik.http.services.transcribe.loadbalancer.server.port=80"
    networks:
      - default

networks:
  default:
    name: transcribe-network

index.html
@@ -0,0 +1,16 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover" />
    <meta name="theme-color" content="#1a1a2e" />
    <meta name="description" content="Voice transcription PWA with local Whisper AI and Web Speech API" />
    <link rel="icon" type="image/svg+xml" href="/icons/icon.svg" />
    <link rel="apple-touch-icon" href="/icons/apple-touch-icon.png" />
    <title>Transcribe</title>
  </head>
  <body>
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>

nginx.conf
@@ -0,0 +1,40 @@
server {
    listen 80;
    server_name localhost;
    root /usr/share/nginx/html;
    index index.html;

    # Gzip compression
    gzip on;
    gzip_vary on;
    gzip_min_length 1024;
    gzip_proxied any;
    gzip_types text/plain text/css text/xml text/javascript application/javascript application/json application/xml;

    # Security headers
    add_header X-Frame-Options "SAMEORIGIN" always;
    add_header X-Content-Type-Options "nosniff" always;
    add_header X-XSS-Protection "1; mode=block" always;

    # Cache static assets
    location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg|woff|woff2)$ {
        expires 1y;
        add_header Cache-Control "public, immutable";
    }

    # Service worker - no cache
    location /sw.js {
        expires off;
        add_header Cache-Control "no-cache, no-store, must-revalidate";
    }

    # PWA manifest
    location /manifest.webmanifest {
        types { application/manifest+json webmanifest; }
    }

    # SPA fallback
    location / {
        try_files $uri $uri/ /index.html;
    }
}
File diff suppressed because it is too large

package.json
@@ -0,0 +1,29 @@
{
  "name": "transcribe-app",
  "version": "1.0.0",
  "description": "Standalone transcription PWA with Web Speech API and local Whisper",
  "type": "module",
  "scripts": {
    "dev": "vite --host 0.0.0.0 --port 3000",
    "build": "tsc && vite build",
    "preview": "vite preview",
    "docker:build": "docker build -t transcribe-app .",
    "docker:run": "docker run -p 3000:80 transcribe-app"
  },
  "dependencies": {
    "@xenova/transformers": "^2.17.2",
    "react": "^18.2.0",
    "react-dom": "^18.2.0"
  },
  "devDependencies": {
    "@types/react": "^18.2.0",
    "@types/react-dom": "^18.2.0",
    "@vitejs/plugin-react": "^4.2.1",
    "typescript": "^5.3.3",
    "vite": "^5.4.0",
    "vite-plugin-pwa": "^0.20.0"
  },
  "engines": {
    "node": ">=18.0.0"
  }
}

Binary files not shown: three PNG icons (348 KiB, 341 KiB, 348 KiB).

public/icons/icon.svg
@@ -0,0 +1,8 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 512 512" fill="none">
  <rect width="512" height="512" rx="96" fill="#1a1a2e"/>
  <circle cx="256" cy="200" r="80" fill="#ff9500"/>
  <rect x="216" y="280" width="80" height="120" rx="40" fill="#ff9500"/>
  <path d="M176 320 C176 400, 256 440, 256 440 C256 440, 336 400, 336 320" stroke="#ff9500" stroke-width="24" stroke-linecap="round" fill="none"/>
  <line x1="256" y1="440" x2="256" y2="480" stroke="#ff9500" stroke-width="24" stroke-linecap="round"/>
  <line x1="200" y1="480" x2="312" y2="480" stroke="#ff9500" stroke-width="24" stroke-linecap="round"/>
</svg>

runtipi/config.json
@@ -0,0 +1,18 @@
{
  "$schema": "../schema.json",
  "name": "Transcribe",
  "id": "transcribe",
  "available": true,
  "short_desc": "Voice transcription PWA with local Whisper AI and Web Speech API",
  "author": "Jeff Emmett",
  "port": 3000,
  "categories": ["utilities", "media"],
  "description": "A progressive web app for voice transcription featuring both browser-native Web Speech API (fast, requires internet) and local Whisper AI model (works offline). Perfect for transcribing meetings, notes, and dictation.",
  "tipiVersion": 1,
  "version": "1.0.0",
  "source": "https://github.com/jeffemmett/transcribe-app",
  "website": "https://transcribe.jeffemmett.com",
  "exposable": true,
  "supported_architectures": ["arm64", "amd64"],
  "form_fields": []
}

runtipi/docker-compose.yml
@@ -0,0 +1,15 @@
version: "3.8"

services:
  transcribe:
    container_name: transcribe
    image: ghcr.io/jeffemmett/transcribe-app:latest
    restart: unless-stopped
    ports:
      - "${APP_PORT}:80"
    networks:
      - tipi_main_network

networks:
  tipi_main_network:
    external: true

src/App.css
@@ -0,0 +1,362 @@
* {
  box-sizing: border-box;
  margin: 0;
  padding: 0;
}

:root {
  --primary: #ff9500;
  --primary-dark: #e68600;
  --bg-dark: #1a1a2e;
  --bg-card: #16213e;
  --bg-input: #0f3460;
  --text-primary: #ffffff;
  --text-secondary: #a0a0b0;
  --error: #ff4757;
  --success: #2ecc71;
  --border-radius: 12px;
}

html, body {
  height: 100%;
  font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, Oxygen, Ubuntu, sans-serif;
  background: var(--bg-dark);
  color: var(--text-primary);
}

#root {
  min-height: 100%;
}

.app {
  min-height: 100vh;
  display: flex;
  flex-direction: column;
  padding: 20px;
  max-width: 800px;
  margin: 0 auto;
}

/* Header */
.header {
  text-align: center;
  padding: 30px 0;
}

.header h1 {
  font-size: 2.5rem;
  font-weight: 700;
  color: var(--primary);
  margin-bottom: 8px;
}

.subtitle {
  color: var(--text-secondary);
  font-size: 1.1rem;
}

/* Main content */
.main {
  flex: 1;
  display: flex;
  flex-direction: column;
  gap: 24px;
}

/* Mode selector */
.mode-selector {
  display: grid;
  grid-template-columns: 1fr 1fr;
  gap: 12px;
}

.mode-btn {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 4px;
  padding: 16px;
  background: var(--bg-card);
  border: 2px solid transparent;
  border-radius: var(--border-radius);
  color: var(--text-primary);
  font-size: 1rem;
  font-weight: 600;
  cursor: pointer;
  transition: all 0.2s ease;
}

.mode-btn:hover:not(:disabled) {
  border-color: var(--primary);
}

.mode-btn.active {
  border-color: var(--primary);
  background: rgba(255, 149, 0, 0.1);
}

.mode-btn:disabled {
  opacity: 0.5;
  cursor: not-allowed;
}

.mode-desc {
  font-size: 0.75rem;
  font-weight: 400;
  color: var(--text-secondary);
}

/* Model status */
.model-status {
  background: var(--bg-card);
  border-radius: var(--border-radius);
  padding: 16px;
}

.progress-bar {
  height: 8px;
  background: var(--bg-input);
  border-radius: 4px;
  overflow: hidden;
  margin-bottom: 8px;
}

.progress-fill {
  height: 100%;
  background: var(--primary);
  transition: width 0.3s ease;
}

.progress-text {
  font-size: 0.875rem;
  color: var(--text-secondary);
  text-align: center;
}

.model-info {
  font-size: 0.875rem;
  color: var(--text-secondary);
  text-align: center;
}

/* Error banner */
.error-banner {
  display: flex;
  align-items: center;
  justify-content: space-between;
  background: rgba(255, 71, 87, 0.1);
  border: 1px solid var(--error);
  border-radius: var(--border-radius);
  padding: 12px 16px;
  color: var(--error);
}

.error-banner button {
  background: none;
  border: none;
  color: var(--error);
  font-size: 1.5rem;
  cursor: pointer;
  line-height: 1;
}

/* Controls */
.controls {
  display: flex;
  flex-direction: column;
  align-items: center;
  gap: 16px;
}

.record-btn {
  display: flex;
  align-items: center;
  justify-content: center;
  gap: 12px;
  padding: 20px 40px;
  border: none;
  border-radius: 50px;
  font-size: 1.125rem;
  font-weight: 600;
  cursor: pointer;
  transition: all 0.2s ease;
}

.record-btn.start {
  background: var(--primary);
  color: var(--bg-dark);
}

.record-btn.start:hover:not(:disabled) {
  background: var(--primary-dark);
  transform: scale(1.02);
}

.record-btn.start:disabled {
  opacity: 0.5;
  cursor: not-allowed;
}

.record-btn.stop {
  background: var(--error);
  color: white;
}

.record-btn.stop:hover {
  background: #e84152;
  transform: scale(1.02);
}

.record-btn svg {
  flex-shrink: 0;
}

/* Recording indicator */
.recording-indicator {
  display: flex;
  align-items: center;
  gap: 8px;
  color: var(--error);
}

.pulse {
  width: 12px;
  height: 12px;
  background: var(--error);
  border-radius: 50%;
  animation: pulse 1.5s ease-in-out infinite;
}

@keyframes pulse {
  0%, 100% { opacity: 1; transform: scale(1); }
  50% { opacity: 0.5; transform: scale(1.2); }
}

.time {
  font-size: 1.25rem;
  font-weight: 600;
  font-variant-numeric: tabular-nums;
}

/* Transcript */
.transcript-container {
  flex: 1;
  display: flex;
  flex-direction: column;
  background: var(--bg-card);
  border-radius: var(--border-radius);
  overflow: hidden;
}

.transcript-header {
  display: flex;
  align-items: center;
  justify-content: space-between;
  padding: 16px 20px;
  border-bottom: 1px solid rgba(255, 255, 255, 0.1);
}

.transcript-header h2 {
  font-size: 1rem;
  font-weight: 600;
}

.transcript-actions {
  display: flex;
  gap: 8px;
}

.action-btn {
  padding: 6px 12px;
  background: var(--bg-input);
  border: none;
  border-radius: 6px;
  color: var(--text-primary);
  font-size: 0.875rem;
  cursor: pointer;
  transition: background 0.2s ease;
}

.action-btn:hover {
  background: rgba(255, 149, 0, 0.2);
}

.clear-btn:hover {
  background: rgba(255, 71, 87, 0.2);
  color: var(--error);
}

.transcript-box {
  flex: 1;
  min-height: 200px;
  padding: 20px;
  overflow-y: auto;
}

.transcript-text {
  font-size: 1.125rem;
  line-height: 1.7;
  white-space: pre-wrap;
  word-break: break-word;
}

.transcript-placeholder {
  color: var(--text-secondary);
  font-style: italic;
}

.interim-text {
  margin-top: 12px;
  padding-top: 12px;
  border-top: 1px dashed rgba(255, 255, 255, 0.1);
  color: var(--text-secondary);
  font-style: italic;
}

.transcribing-indicator {
  margin-top: 12px;
  color: var(--primary);
  font-size: 0.875rem;
}

/* Footer */
.footer {
  text-align: center;
  padding: 20px 0;
}

.footer p {
  color: var(--text-secondary);
  font-size: 0.875rem;
}

/* Responsive */
@media (max-width: 600px) {
  .app {
    padding: 16px;
  }

  .header h1 {
    font-size: 2rem;
  }

  .record-btn {
    padding: 16px 32px;
    font-size: 1rem;
  }

  .transcript-actions {
    flex-wrap: wrap;
  }
}

/* PWA standalone mode */
@media (display-mode: standalone) {
  .header {
    padding-top: env(safe-area-inset-top, 20px);
  }

  .footer {
    padding-bottom: env(safe-area-inset-bottom, 20px);
  }
}

src/App.tsx
@@ -0,0 +1,266 @@
import { useState, useCallback } from 'react'
import { useWebSpeechTranscription } from './hooks/useWebSpeechTranscription'
import { useWhisperTranscription } from './hooks/useWhisperTranscription'
import './App.css'

type TranscriptionMode = 'webspeech' | 'whisper'

function App() {
  const [mode, setMode] = useState<TranscriptionMode>('webspeech')
  const [transcript, setTranscript] = useState('')
  const [error, setError] = useState<string | null>(null)
  const [modelProgress, setModelProgress] = useState<{ progress: number; status: string } | null>(null)
  const [recordingTime, setRecordingTime] = useState(0)
  const [timerInterval, setTimerInterval] = useState<NodeJS.Timeout | null>(null)

  // Web Speech API hook
  const webSpeech = useWebSpeechTranscription({
    onTranscriptUpdate: (text) => {
      setTranscript(prev => prev + (prev ? ' ' : '') + text)
    },
    onError: (err) => setError(err.message),
    language: 'en-US'
  })

  // Whisper hook
  const whisper = useWhisperTranscription({
    onTranscriptUpdate: (text) => {
      setTranscript(prev => prev + (prev ? ' ' : '') + text)
    },
    onError: (err) => setError(err.message),
    onModelProgress: (progress, status) => setModelProgress({ progress, status }),
    language: 'en'
  })

  const isRecording = mode === 'webspeech' ? webSpeech.isRecording : whisper.isRecording
  const isSupported = mode === 'webspeech' ? webSpeech.isSupported : true

  // Start timer
  const startTimer = useCallback(() => {
    setRecordingTime(0)
    const interval = setInterval(() => {
      setRecordingTime(prev => prev + 1)
    }, 1000)
    setTimerInterval(interval)
  }, [])

  // Stop timer
  const stopTimer = useCallback(() => {
    if (timerInterval) {
      clearInterval(timerInterval)
      setTimerInterval(null)
    }
  }, [timerInterval])

  // Handle start recording
  const handleStart = useCallback(async () => {
    setError(null)
    startTimer()

    if (mode === 'webspeech') {
      webSpeech.startRecording()
    } else {
      await whisper.startRecording()
    }
  }, [mode, webSpeech, whisper, startTimer])

  // Handle stop recording
  const handleStop = useCallback(async () => {
    stopTimer()

    if (mode === 'webspeech') {
      webSpeech.stopRecording()
    } else {
      await whisper.stopRecording()
    }
  }, [mode, webSpeech, whisper, stopTimer])

  // Handle clear
  const handleClear = useCallback(() => {
    setTranscript('')
    setRecordingTime(0)
    if (mode === 'webspeech') {
      webSpeech.clearTranscript()
    } else {
      whisper.clearTranscript()
    }
  }, [mode, webSpeech, whisper])

  // Copy to clipboard
  const handleCopy = useCallback(async () => {
    try {
      await navigator.clipboard.writeText(transcript)
      // Show brief feedback
      const btn = document.querySelector('.copy-btn') as HTMLButtonElement
      if (btn) {
        const originalText = btn.textContent
        btn.textContent = 'Copied!'
        setTimeout(() => {
          btn.textContent = originalText
        }, 1500)
      }
    } catch (err) {
      console.error('Failed to copy:', err)
    }
  }, [transcript])

  // Download as text file
  const handleDownload = useCallback(() => {
    const blob = new Blob([transcript], { type: 'text/plain' })
    const url = URL.createObjectURL(blob)
    const a = document.createElement('a')
    a.href = url
    a.download = `transcript-${new Date().toISOString().slice(0, 10)}.txt`
    document.body.appendChild(a)
    a.click()
    document.body.removeChild(a)
    URL.revokeObjectURL(url)
  }, [transcript])

  // Format recording time
  const formatTime = (seconds: number) => {
    const mins = Math.floor(seconds / 60)
    const secs = seconds % 60
    return `${mins}:${secs.toString().padStart(2, '0')}`
  }

  return (
    <div className="app">
      <header className="header">
        <h1>Transcribe</h1>
        <p className="subtitle">Voice to text, right in your browser</p>
      </header>

      <main className="main">
        {/* Mode selector */}
        <div className="mode-selector">
          <button
            className={`mode-btn ${mode === 'webspeech' ? 'active' : ''}`}
            onClick={() => setMode('webspeech')}
            disabled={isRecording}
          >
            Web Speech
            <span className="mode-desc">Fast, requires internet</span>
          </button>
          <button
            className={`mode-btn ${mode === 'whisper' ? 'active' : ''}`}
            onClick={() => setMode('whisper')}
            disabled={isRecording}
          >
            Whisper
            <span className="mode-desc">Local AI, works offline</span>
          </button>
        </div>

        {/* Model loading status for Whisper */}
        {mode === 'whisper' && !whisper.modelLoaded && (
          <div className="model-status">
            {whisper.modelLoading ? (
              <>
                <div className="progress-bar">
                  <div
                    className="progress-fill"
                    style={{ width: `${modelProgress?.progress || 0}%` }}
                  />
                </div>
                <p className="progress-text">{modelProgress?.status || 'Loading model...'}</p>
              </>
            ) : (
              <p className="model-info">
                Model will be downloaded on first use (~75MB)
              </p>
            )}
          </div>
        )}

        {/* Error display */}
        {error && (
          <div className="error-banner">
            <span>{error}</span>
            <button onClick={() => setError(null)}>×</button>
          </div>
        )}

        {/* Recording controls */}
        <div className="controls">
          {!isRecording ? (
            <button
              className="record-btn start"
              onClick={handleStart}
              disabled={!isSupported || (mode === 'whisper' && whisper.modelLoading)}
            >
              <svg viewBox="0 0 24 24" width="32" height="32" fill="currentColor">
                <circle cx="12" cy="12" r="10" />
              </svg>
              <span>Start Recording</span>
            </button>
          ) : (
            <button className="record-btn stop" onClick={handleStop}>
              <svg viewBox="0 0 24 24" width="32" height="32" fill="currentColor">
                <rect x="6" y="6" width="12" height="12" rx="2" />
              </svg>
              <span>Stop Recording</span>
            </button>
          )}

          {isRecording && (
            <div className="recording-indicator">
              <span className="pulse" />
              <span className="time">{formatTime(recordingTime)}</span>
            </div>
          )}
        </div>

        {/* Transcript display */}
        <div className="transcript-container">
          <div className="transcript-header">
            <h2>Transcript</h2>
            {transcript && (
              <div className="transcript-actions">
                <button className="action-btn copy-btn" onClick={handleCopy}>
                  Copy
                </button>
                <button className="action-btn" onClick={handleDownload}>
                  Download
                </button>
                <button className="action-btn clear-btn" onClick={handleClear}>
                  Clear
                </button>
              </div>
            )}
          </div>

          <div className="transcript-box">
            {transcript ? (
              <p className="transcript-text">{transcript}</p>
            ) : (
              <p className="transcript-placeholder">
                {isRecording ? 'Listening...' : 'Your transcript will appear here'}
              </p>
            )}

            {/* Interim text for Web Speech */}
            {mode === 'webspeech' && webSpeech.interimTranscript && (
              <p className="interim-text">{webSpeech.interimTranscript}</p>
            )}

            {/* Transcribing indicator for Whisper */}
            {mode === 'whisper' && whisper.isTranscribing && (
              <p className="transcribing-indicator">Processing audio...</p>
            )}
          </div>
        </div>
      </main>

      <footer className="footer">
        <p>
          {mode === 'webspeech'
            ? 'Using browser\'s built-in speech recognition'
            : 'Using local Whisper AI model'}
        </p>
      </footer>
    </div>
  )
}

export default App

src/hooks/useWebSpeechTranscription.ts
@@ -0,0 +1,235 @@
import { useState, useRef, useCallback, useEffect } from 'react'

// TypeScript declarations for Web Speech API
declare global {
  interface Window {
    SpeechRecognition: typeof SpeechRecognition
    webkitSpeechRecognition: typeof SpeechRecognition
  }

  interface SpeechRecognition extends EventTarget {
    continuous: boolean
    interimResults: boolean
    lang: string
    maxAlternatives: number
    start(): void
    stop(): void
    onstart: ((this: SpeechRecognition, ev: Event) => void) | null
    onresult: ((this: SpeechRecognition, ev: SpeechRecognitionEvent) => void) | null
    onerror: ((this: SpeechRecognition, ev: SpeechRecognitionErrorEvent) => void) | null
    onend: ((this: SpeechRecognition, ev: Event) => void) | null
  }

  interface SpeechRecognitionEvent extends Event {
    resultIndex: number
    results: SpeechRecognitionResultList
  }

  interface SpeechRecognitionErrorEvent extends Event {
    error: string
  }

  interface SpeechRecognitionResultList {
    readonly length: number
    item(index: number): SpeechRecognitionResult
    [index: number]: SpeechRecognitionResult
  }

  interface SpeechRecognitionResult {
    readonly length: number
    item(index: number): SpeechRecognitionAlternative
    [index: number]: SpeechRecognitionAlternative
    readonly isFinal: boolean
  }

  interface SpeechRecognitionAlternative {
    readonly transcript: string
    readonly confidence: number
  }

  // eslint-disable-next-line no-var
  var SpeechRecognition: {
    prototype: SpeechRecognition
    new(): SpeechRecognition
  }
}

interface UseWebSpeechTranscriptionOptions {
  onTranscriptUpdate?: (text: string) => void
  onError?: (error: Error) => void
  language?: string
  continuous?: boolean
  interimResults?: boolean
}

export const useWebSpeechTranscription = ({
  onTranscriptUpdate,
  onError,
  language = 'en-US',
  continuous = true,
  interimResults = true
}: UseWebSpeechTranscriptionOptions = {}) => {
  const [isRecording, setIsRecording] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [interimTranscript, setInterimTranscript] = useState('')
  const [isSupported, setIsSupported] = useState(false)

  const recognitionRef = useRef<SpeechRecognition | null>(null)
  const finalTranscriptRef = useRef('')
  const lastSpeechTimeRef = useRef<number>(0)

  // Process transcript with line breaks after pauses
  const processTranscript = useCallback((text: string, isFinal: boolean = false) => {
    if (!text.trim()) return text

    let processedText = text.trim()

    // Add punctuation if missing at the end
    if (isFinal && processedText && !/[.!?]$/.test(processedText)) {
      processedText += '.'
    }

    // Add line break if there's been a pause
    if (isFinal) {
      const now = Date.now()
      const timeSinceLastSpeech = now - lastSpeechTimeRef.current

      if (timeSinceLastSpeech > 3000 && lastSpeechTimeRef.current > 0) {
        processedText = '\n\n' + processedText
      }

      lastSpeechTimeRef.current = now
    }

    return processedText
  }, [])

  // Check if Web Speech API is supported
  useEffect(() => {
    const SpeechRecognitionAPI = window.SpeechRecognition || window.webkitSpeechRecognition
    if (SpeechRecognitionAPI) {
      setIsSupported(true)
    } else {
      setIsSupported(false)
      onError?.(new Error('Web Speech API is not supported in this browser. Try Chrome or Edge.'))
    }
  }, [onError])

  // Initialize speech recognition
  const initializeRecognition = useCallback(() => {
    if (!isSupported) return null

    const SpeechRecognitionAPI = window.SpeechRecognition || window.webkitSpeechRecognition
    const recognition = new SpeechRecognitionAPI()

    recognition.continuous = continuous
    recognition.interimResults = interimResults
    recognition.lang = language
    recognition.maxAlternatives = 1

    recognition.onstart = () => {
      setIsRecording(true)
    }

    recognition.onresult = (event) => {
      let interim = ''
      let final = ''

      for (let i = event.resultIndex; i < event.results.length; i++) {
        const result = event.results[i]
        const text = result[0].transcript

        if (result.isFinal) {
          final += text
        } else {
          interim += text
        }
      }

      if (final) {
        const processedFinal = processTranscript(final, true)
        finalTranscriptRef.current += processedFinal + ' '
        setTranscript(finalTranscriptRef.current)
        onTranscriptUpdate?.(processedFinal)
      }

      if (interim) {
        setInterimTranscript(processTranscript(interim, false))
      }
    }

    recognition.onerror = (event) => {
      console.error('Speech recognition error:', event.error)
      setIsRecording(false)

      if (event.error !== 'aborted') {
        onError?.(new Error(`Speech recognition error: ${event.error}`))
      }
    }

    recognition.onend = () => {
      setIsRecording(false)
      setInterimTranscript('')
    }

    return recognition
  }, [isSupported, continuous, interimResults, language, onTranscriptUpdate, onError, processTranscript])

  // Start recording
  const startRecording = useCallback(() => {
    if (!isSupported) {
      onError?.(new Error('Web Speech API is not supported'))
      return
    }

    try {
      lastSpeechTimeRef.current = 0

      const recognition = initializeRecognition()
      if (recognition) {
        recognitionRef.current = recognition
        recognition.start()
      }
    } catch (error) {
      console.error('Error starting speech recognition:', error)
      onError?.(error as Error)
    }
  }, [isSupported, initializeRecognition, onError])

  // Stop recording
  const stopRecording = useCallback(() => {
    if (recognitionRef.current) {
      recognitionRef.current.stop()
      recognitionRef.current = null
    }
  }, [])

  // Clear transcript
  const clearTranscript = useCallback(() => {
    finalTranscriptRef.current = ''
    setTranscript('')
    setInterimTranscript('')
  }, [])

  // Cleanup on unmount
  useEffect(() => {
    return () => {
      if (recognitionRef.current) {
        recognitionRef.current.stop()
        recognitionRef.current = null
      }
    }
  }, [])

  return {
    isRecording,
    transcript,
    interimTranscript,
    isSupported,
    startRecording,
    stopRecording,
    clearTranscript
  }
}

export default useWebSpeechTranscription
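
For orientation, a condensed usage sketch of the hook above. The real wiring lives in src/App.tsx earlier in this commit; the `DictationDemo` component below is illustrative only and not part of the app.

```tsx
import { useWebSpeechTranscription } from './hooks/useWebSpeechTranscription'

// Illustrative component, not part of the app: start/stop dictation and show the result.
function DictationDemo() {
  const { isRecording, transcript, interimTranscript, isSupported, startRecording, stopRecording } =
    useWebSpeechTranscription({ language: 'en-US', onError: (err) => console.error(err) })

  if (!isSupported) return <p>Web Speech API is not available in this browser.</p>

  return (
    <div>
      <button onClick={isRecording ? stopRecording : startRecording}>
        {isRecording ? 'Stop' : 'Start'}
      </button>
      <p>{transcript}</p>
      <p><em>{interimTranscript}</em></p>
    </div>
  )
}

export default DictationDemo
```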

src/hooks/useWhisperTranscription.ts
@@ -0,0 +1,426 @@
import { useCallback, useEffect, useRef, useState } from 'react'
import { pipeline, env } from '@xenova/transformers'

// Configure the transformers library
env.allowRemoteModels = true
env.allowLocalModels = false
env.useBrowserCache = true

// Resample audio to 16kHz for Whisper
function resampleAudio(audioData: Float32Array, fromSampleRate: number, toSampleRate: number): Float32Array {
  if (fromSampleRate === toSampleRate) {
    return audioData
  }

  if (!audioData || audioData.length === 0) {
    throw new Error('Invalid audio data for resampling')
  }

  const ratio = fromSampleRate / toSampleRate
  const newLength = Math.floor(audioData.length / ratio)

  if (newLength <= 0) {
    throw new Error('Invalid resampled length')
  }

  const resampled = new Float32Array(newLength)

  for (let i = 0; i < newLength; i++) {
    const sourceIndex = Math.floor(i * ratio)
    if (sourceIndex >= 0 && sourceIndex < audioData.length) {
      resampled[i] = audioData[sourceIndex]
    } else {
      resampled[i] = 0
    }
  }

  return resampled
}

interface UseWhisperTranscriptionOptions {
  onTranscriptUpdate?: (text: string) => void
  onError?: (error: Error) => void
  onModelProgress?: (progress: number, status: string) => void
  language?: string
  enableStreaming?: boolean
}

export const useWhisperTranscription = ({
  onTranscriptUpdate,
  onError,
  onModelProgress,
  language = 'en',
  enableStreaming = true
}: UseWhisperTranscriptionOptions = {}) => {
  const [isRecording, setIsRecording] = useState(false)
  const [isTranscribing, setIsTranscribing] = useState(false)
  const [transcript, setTranscript] = useState('')
  const [modelLoaded, setModelLoaded] = useState(false)
  const [modelLoading, setModelLoading] = useState(false)

  const transcriberRef = useRef<ReturnType<typeof pipeline> extends Promise<infer T> ? T : never>(null!)
  const streamRef = useRef<MediaStream | null>(null)
  const mediaRecorderRef = useRef<MediaRecorder | null>(null)
  const audioChunksRef = useRef<Blob[]>([])
  const isRecordingRef = useRef(false)
  const transcriptRef = useRef('')
  const periodicTranscriptionRef = useRef<NodeJS.Timeout | null>(null)

  // Initialize Whisper model
  const initializeTranscriber = useCallback(async () => {
    if (transcriberRef.current || modelLoading) return transcriberRef.current

    setModelLoading(true)

    try {
      const modelNames = ['Xenova/whisper-tiny.en', 'Xenova/whisper-tiny']

      let transcriber = null
      let lastError = null

      for (const modelName of modelNames) {
        try {
          transcriber = await pipeline('automatic-speech-recognition', modelName, {
            quantized: true,
            progress_callback: (progress: { status: string; file?: string; progress?: number }) => {
              if (progress.status === 'downloading' && progress.progress) {
                onModelProgress?.(progress.progress, `Downloading ${progress.file || 'model'}...`)
              } else if (progress.status === 'loading') {
                onModelProgress?.(100, 'Loading model...')
              }
            }
          })

          transcriberRef.current = transcriber
          setModelLoaded(true)
          setModelLoading(false)
          return transcriber
        } catch (error) {
          console.warn(`Failed to load model ${modelName}:`, error)
          lastError = error
          continue
        }
      }

      throw lastError || new Error('Failed to load any Whisper model')
    } catch (error) {
      console.error('Failed to load Whisper model:', error)
      setModelLoading(false)
      onError?.(error as Error)
      throw error
    }
  }, [onError, onModelProgress, modelLoading])

  // Process audio chunks for transcription
  const processAudioChunks = useCallback(async () => {
    if (audioChunksRef.current.length === 0) return

    if (!transcriberRef.current) {
      console.warn('Transcriber not initialized')
      return
    }

    try {
      setIsTranscribing(true)

      // Get MIME type from MediaRecorder
      let mimeType = 'audio/webm;codecs=opus'
      if (mediaRecorderRef.current?.mimeType) {
        mimeType = mediaRecorderRef.current.mimeType
      }

      const validChunks = audioChunksRef.current.filter(chunk => chunk && chunk.size > 1000)
      if (validChunks.length === 0) return

      const audioBlob = new Blob(validChunks, { type: mimeType })
      if (audioBlob.size < 10000) return

      const arrayBuffer = await audioBlob.arrayBuffer()
      const audioContext = new AudioContext()

      let audioBuffer: AudioBuffer
      try {
        audioBuffer = await audioContext.decodeAudioData(arrayBuffer)
      } catch {
        // Try alternative MIME type
        try {
          const altBlob = new Blob(validChunks, { type: 'audio/webm' })
          const altBuffer = await altBlob.arrayBuffer()
          audioBuffer = await audioContext.decodeAudioData(altBuffer)
        } catch (altError) {
          await audioContext.close()
          throw new Error('Failed to decode audio. Format may not be supported.')
        }
      }

      await audioContext.close()

      const audioData = audioBuffer.getChannelData(0)

      // Resample to 16kHz
      let processedAudioData: Float32Array = audioData
      if (audioBuffer.sampleRate !== 16000) {
        processedAudioData = resampleAudio(audioData, audioBuffer.sampleRate, 16000)
      }

      // Skip if too quiet
      const rms = Math.sqrt(processedAudioData.reduce((sum, val) => sum + val * val, 0) / processedAudioData.length)
      if (rms < 0.001) return

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const result = await (transcriberRef.current as any)(processedAudioData, {
        language,
        task: 'transcribe',
        return_timestamps: false
      })

      const newText = (result as { text?: string })?.text?.trim() || ''

      if (newText) {
        const processedText = newText.endsWith('.') || newText.endsWith('?') || newText.endsWith('!')
          ? newText
          : newText + '.'

        transcriptRef.current += (transcriptRef.current ? ' ' : '') + processedText
        setTranscript(transcriptRef.current)
        onTranscriptUpdate?.(processedText)
      }

      // Clear processed chunks
      audioChunksRef.current = []
    } catch (error) {
      console.error('Error processing audio:', error)
      onError?.(error as Error)
    } finally {
      setIsTranscribing(false)
    }
  }, [language, onTranscriptUpdate, onError])

  // Process accumulated chunks for streaming
  const processAccumulatedChunks = useCallback(async () => {
    try {
      const chunks = audioChunksRef.current
      if (chunks.length < 3) return

      // Get recent chunks (last 2 seconds worth)
      const recentChunks = chunks.slice(-5)
      const validChunks = recentChunks.filter(chunk => chunk && chunk.size > 2000)
      if (validChunks.length < 2) return

      let mimeType = 'audio/webm;codecs=opus'
      if (mediaRecorderRef.current?.mimeType) {
        mimeType = mediaRecorderRef.current.mimeType
      }

      const tempBlob = new Blob(validChunks, { type: mimeType })
      if (tempBlob.size < 20000) return

      const audioBuffer = await tempBlob.arrayBuffer()
      const audioContext = new AudioContext()

      let audioBufferFromBlob: AudioBuffer
      try {
        audioBufferFromBlob = await audioContext.decodeAudioData(audioBuffer)
      } catch {
        await audioContext.close()
        return
      }

      await audioContext.close()

      const audioData = audioBufferFromBlob.getChannelData(0)
      if (!audioData || audioData.length === 0) return

      let processedAudioData: Float32Array = audioData
      if (audioBufferFromBlob.sampleRate !== 16000) {
        processedAudioData = resampleAudio(audioData, audioBufferFromBlob.sampleRate, 16000)
      }

      // Check for meaningful audio
      const rms = Math.sqrt(processedAudioData.reduce((sum, val) => sum + val * val, 0) / processedAudioData.length)
      if (rms < 0.001) return

      // Limit to 2 seconds
      const maxSamples = 32000
      if (processedAudioData.length > maxSamples) {
        processedAudioData = processedAudioData.slice(-maxSamples)
      }

      if (!transcriberRef.current) return

      // eslint-disable-next-line @typescript-eslint/no-explicit-any
      const result = await (transcriberRef.current as any)(processedAudioData, {
        language,
        task: 'transcribe',
        return_timestamps: false,
        no_speech_threshold: 0.3
      })

      const text = (result as { text?: string })?.text?.trim() || ''
      if (text && text.length > 2) {
        const processedText = text.endsWith('.') || text.endsWith('?') || text.endsWith('!')
          ? text
          : text + '.'

        // Append if not duplicate
        if (!transcriptRef.current.endsWith(processedText)) {
          transcriptRef.current += (transcriptRef.current ? ' ' : '') + processedText
          setTranscript(transcriptRef.current)
          onTranscriptUpdate?.(processedText)
        }
      }
    } catch (error) {
      console.error('Streaming transcription error:', error)
    }
  }, [language, onTranscriptUpdate])

  // Start recording
  const startRecording = useCallback(async () => {
    try {
      // Initialize model if needed
      if (!modelLoaded) {
        await initializeTranscriber()
      }

      audioChunksRef.current = []

      // Clear periodic timer
      if (periodicTranscriptionRef.current) {
        clearInterval(periodicTranscriptionRef.current)
        periodicTranscriptionRef.current = null
      }

      // Get microphone access
      const stream = await navigator.mediaDevices.getUserMedia({
        audio: {
          echoCancellation: true,
          noiseSuppression: true,
          autoGainControl: true,
          sampleRate: 44100,
          channelCount: 1
        }
      })

      streamRef.current = stream

      // Create MediaRecorder with fallback options
      let mediaRecorder: MediaRecorder | null = null
      const mimeOptions = [
        { mimeType: 'audio/webm;codecs=opus' },
        { mimeType: 'audio/webm' },
        { mimeType: 'audio/ogg;codecs=opus' },
        { mimeType: 'audio/mp4' }
      ]

      for (const option of mimeOptions) {
        if (MediaRecorder.isTypeSupported(option.mimeType)) {
          mediaRecorder = new MediaRecorder(stream, option)
          break
        }
      }

      if (!mediaRecorder) {
        throw new Error('No supported audio format found')
      }

      mediaRecorderRef.current = mediaRecorder

      mediaRecorder.ondataavailable = (event) => {
        if (event.data.size > 1000) {
          audioChunksRef.current.push(event.data)

          // Limit chunks to prevent memory issues
          if (audioChunksRef.current.length > 20) {
            audioChunksRef.current = audioChunksRef.current.slice(-15)
          }
        }
      }

      mediaRecorder.onstop = () => {
        processAudioChunks()
      }

      mediaRecorder.onstart = () => {
        setIsRecording(true)
        isRecordingRef.current = true

        // Start streaming transcription
        if (enableStreaming) {
          periodicTranscriptionRef.current = setInterval(() => {
            if (isRecordingRef.current) {
              processAccumulatedChunks()
            }
          }, 1000)
        }
      }

      // Start with 1 second chunks
      mediaRecorder.start(1000)
      isRecordingRef.current = true
      setIsRecording(true)
    } catch (error) {
      console.error('Error starting recording:', error)
      onError?.(error as Error)
    }
  }, [processAudioChunks, processAccumulatedChunks, onError, enableStreaming, modelLoaded, initializeTranscriber])

  // Stop recording
  const stopRecording = useCallback(async () => {
    try {
      if (periodicTranscriptionRef.current) {
        clearInterval(periodicTranscriptionRef.current)
        periodicTranscriptionRef.current = null
      }

      if (mediaRecorderRef.current && isRecordingRef.current) {
        mediaRecorderRef.current.stop()
      }

      if (streamRef.current) {
        streamRef.current.getTracks().forEach(track => track.stop())
        streamRef.current = null
      }

      isRecordingRef.current = false
      setIsRecording(false)
    } catch (error) {
      console.error('Error stopping recording:', error)
      onError?.(error as Error)
    }
  }, [onError])

  // Clear transcript
  const clearTranscript = useCallback(() => {
    transcriptRef.current = ''
    setTranscript('')
  }, [])

  // Cleanup on unmount
  useEffect(() => {
    return () => {
      if (periodicTranscriptionRef.current) {
        clearInterval(periodicTranscriptionRef.current)
      }
      if (mediaRecorderRef.current?.state === 'recording') {
        mediaRecorderRef.current.stop()
      }
      if (streamRef.current) {
        streamRef.current.getTracks().forEach(track => track.stop())
      }
    }
  }, [])

  return {
    isRecording,
    isTranscribing,
    transcript,
    modelLoaded,
    modelLoading,
    startRecording,
    stopRecording,
    clearTranscript,
    initializeTranscriber
  }
}

export default useWhisperTranscription
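
A quick, illustrative check of the nearest-neighbour `resampleAudio` helper above (it is module-private in the hook file; shown here as if callable directly, purely to illustrate the length arithmetic): one second of 44.1 kHz audio collapses to exactly 16,000 samples, the rate Whisper expects.

```ts
// Hypothetical standalone check, not part of the app.
const oneSecondAt44_1kHz = new Float32Array(44100).fill(0.25)
const resampled = resampleAudio(oneSecondAt44_1kHz, 44100, 16000)
console.log(resampled.length) // 16000 = Math.floor(44100 / (44100 / 16000))
```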

src/main.tsx
@@ -0,0 +1,9 @@
import React from 'react'
import ReactDOM from 'react-dom/client'
import App from './App'

ReactDOM.createRoot(document.getElementById('root')!).render(
  <React.StrictMode>
    <App />
  </React.StrictMode>
)

src/vite-env.d.ts
@@ -0,0 +1 @@
/// <reference types="vite/client" />

tsconfig.json
@@ -0,0 +1,21 @@
{
  "compilerOptions": {
    "target": "ES2020",
    "useDefineForClassFields": true,
    "lib": ["ES2020", "DOM", "DOM.Iterable"],
    "module": "ESNext",
    "skipLibCheck": true,
    "moduleResolution": "bundler",
    "allowImportingTsExtensions": true,
    "resolveJsonModule": true,
    "isolatedModules": true,
    "noEmit": true,
    "jsx": "react-jsx",
    "strict": true,
    "noUnusedLocals": true,
    "noUnusedParameters": true,
    "noFallthroughCasesInSwitch": true
  },
  "include": ["src"],
  "references": [{ "path": "./tsconfig.node.json" }]
}

tsconfig.node.json
@@ -0,0 +1,11 @@
{
  "compilerOptions": {
    "composite": true,
    "skipLibCheck": true,
    "module": "ESNext",
    "moduleResolution": "bundler",
    "allowSyntheticDefaultImports": true,
    "strict": true
  },
  "include": ["vite.config.ts"]
}

vite.config.ts
@@ -0,0 +1,65 @@
import { defineConfig } from 'vite'
import react from '@vitejs/plugin-react'
import { VitePWA } from 'vite-plugin-pwa'

export default defineConfig({
  plugins: [
    react(),
    VitePWA({
      registerType: 'autoUpdate',
      includeAssets: ['favicon.ico', 'apple-touch-icon.png', 'mask-icon.svg'],
      manifest: {
        name: 'Transcribe',
        short_name: 'Transcribe',
        description: 'Voice transcription app with local Whisper and Web Speech API',
        theme_color: '#ff9500',
        background_color: '#1a1a2e',
        display: 'standalone',
        orientation: 'portrait',
        scope: '/',
        start_url: '/',
        icons: [
          {
            src: '/icons/icon-192.png',
            sizes: '192x192',
            type: 'image/png'
          },
          {
            src: '/icons/icon-512.png',
            sizes: '512x512',
            type: 'image/png'
          },
          {
            src: '/icons/icon-512.png',
            sizes: '512x512',
            type: 'image/png',
            purpose: 'maskable'
          }
        ]
      },
      workbox: {
        globPatterns: ['**/*.{js,css,html,ico,png,svg,woff2}'],
        runtimeCaching: [
          {
            urlPattern: /^https:\/\/cdn\.jsdelivr\.net\/.*/i,
            handler: 'CacheFirst',
            options: {
              cacheName: 'cdn-cache',
              expiration: {
                maxEntries: 50,
                maxAgeSeconds: 60 * 60 * 24 * 30 // 30 days
              }
            }
          }
        ]
      }
    })
  ],
  build: {
    target: 'esnext',
    outDir: 'dist'
  },
  server: {
    port: 3000
  }
})