jeffsi-meet/deploy/meeting-intelligence/transcriber/app/processor.py

283 lines
9.2 KiB
Python

"""
Job Processor for the Transcription Service.
Handles the processing pipeline:
1. Audio extraction from video
2. Transcription
3. Speaker diarization
4. Database storage
"""
import asyncio
import os
import subprocess
from typing import Optional
import structlog
from .config import settings
from .transcriber import WhisperTranscriber, TranscriptionResult
from .diarizer import SpeakerDiarizer, SpeakerSegment
from .database import Database
log = structlog.get_logger()
class JobProcessor:
    """Processes transcription jobs from the queue."""

    def __init__(
        self,
        transcriber: WhisperTranscriber,
        diarizer: SpeakerDiarizer,
        db: Database,
        redis
    ):
        """Wire up the processor's collaborators.

        Args:
            transcriber: Whisper wrapper used for speech-to-text.
            diarizer: speaker-diarization engine.
            db: async database facade for jobs, meetings and transcripts.
            redis: Redis client handle. NOTE(review): only stored, never
                referenced elsewhere in this file -- confirm intended use.
        """
        self.transcriber = transcriber
        self.diarizer = diarizer
        self.db = db
        self.redis = redis
        # Count of jobs currently in flight across all workers.
        self.active_jobs: int = 0
        # Set True by process_jobs(); cleared by stop() to end worker loops.
        self._running: bool = False
        # asyncio.Task handles for the spawned worker loops.
        self._workers: list = []
async def process_jobs(self):
    """Run the job-processing loop until stopped.

    Spawns ``settings.num_workers`` concurrent worker tasks and blocks
    until every one of them has finished or been cancelled.
    """
    self._running = True
    log.info("Job processor started", num_workers=settings.num_workers)
    # Spawn one task per configured worker slot.
    self._workers.extend(
        asyncio.create_task(self._worker(idx))
        for idx in range(settings.num_workers)
    )
    # Block until every worker exits; return_exceptions keeps one
    # failing worker from tearing down the gather itself.
    await asyncio.gather(*self._workers, return_exceptions=True)
async def stop(self):
    """Signal shutdown and cancel every outstanding worker task."""
    self._running = False
    # Cancellation surfaces inside each worker as asyncio.CancelledError.
    for task in self._workers:
        task.cancel()
    log.info("Job processor stopped")
async def _worker(self, worker_id: int):
    """Single worker loop: repeatedly claim and process pending jobs.

    Polls the database for the next pending job, backing off briefly when
    the queue is empty.  A failure inside one job marks that job failed
    but keeps the worker alive; only task cancellation ends the loop.

    Args:
        worker_id: index of this worker, used only for log context.
    """
    log.info(f"Worker {worker_id} started")
    while self._running:
        try:
            job = await self.db.get_next_pending_job()
            if job is None:
                # Queue is empty -- wait a little before polling again.
                await asyncio.sleep(2)
                continue
            log.info(
                f"Worker {worker_id} processing job",
                job_id=job["id"],
                meeting_id=job["meeting_id"]
            )
            self.active_jobs += 1
            try:
                await self._process_job(job)
            except Exception as e:
                # Record the failure on the job row so it is visible
                # to clients, then keep serving the queue.
                log.error(
                    "Job processing failed",
                    job_id=job["id"],
                    error=str(e)
                )
                await self.db.update_job_status(
                    job["id"],
                    "failed",
                    error_message=str(e)
                )
            finally:
                self.active_jobs -= 1
        except asyncio.CancelledError:
            # stop() cancelled us -- exit the loop cleanly.
            break
        except Exception as e:
            # Unexpected infrastructure error (e.g. DB down): log and
            # back off longer before retrying.
            log.error(f"Worker {worker_id} error", error=str(e))
            await asyncio.sleep(5)
    log.info(f"Worker {worker_id} stopped")
async def _process_job(self, job: dict):
    """Process a single transcription job end to end.

    Pipeline (each stage reports progress to the jobs table):
      1. Extract audio from the source video when only video is present.
      2. Transcribe the audio (run in a thread executor since the
         transcriber is blocking and would stall the event loop).
      3. Optionally run speaker diarization over the same audio.
      4. Persist the transcript and mark the job completed.

    Args:
        job: job row as a dict; reads keys ``id``, ``meeting_id`` and
            optionally ``audio_path``, ``video_path``,
            ``enable_diarization``, ``language``.

    Raises:
        RuntimeError: if no audio file exists after extraction.
        Exception: failures from transcription/diarization/DB propagate
            to the caller (_worker), which marks the job failed.
    """
    job_id = job["id"]
    meeting_id = job["meeting_id"]
    audio_path = job.get("audio_path")
    video_path = job.get("video_path")
    # Diarization defaults on unless the job explicitly disables it.
    enable_diarization = job.get("enable_diarization", True)
    language = job.get("language")
    # Update status to processing
    await self.db.update_job_status(job_id, "processing")
    await self.db.update_meeting_status(meeting_id, "transcribing")
    # Step 1: Extract audio if we have video
    if video_path and not audio_path:
        log.info("Extracting audio from video", video_path=video_path)
        await self.db.update_job_status(job_id, "processing", progress=0.1)
        audio_path = await self._extract_audio(video_path, meeting_id)
        # Persist the derived path so a retried job can skip extraction.
        await self.db.update_job_audio_path(job_id, audio_path)
    if not audio_path or not os.path.exists(audio_path):
        raise RuntimeError(f"Audio file not found: {audio_path}")
    # Step 2: Transcribe
    log.info("Starting transcription", audio_path=audio_path)
    await self.db.update_job_status(job_id, "processing", progress=0.3)
    # Blocking call: run in the default executor, off the event loop.
    transcription = await asyncio.get_event_loop().run_in_executor(
        None,
        lambda: self.transcriber.transcribe(audio_path, language)
    )
    log.info(
        "Transcription complete",
        segments=len(transcription.segments),
        duration=transcription.duration
    )
    # Step 3: Speaker diarization (skipped if disabled or no speech found)
    speaker_segments = []
    if enable_diarization and len(transcription.segments) > 0:
        log.info("Starting speaker diarization")
        await self.db.update_job_status(job_id, "processing", progress=0.6)
        await self.db.update_meeting_status(meeting_id, "diarizing")
        # Convert transcript segments to dicts for diarizer
        transcript_dicts = [
            {"start": s.start, "end": s.end, "text": s.text}
            for s in transcription.segments
        ]
        # Diarization is also blocking; keep it off the event loop too.
        speaker_segments = await asyncio.get_event_loop().run_in_executor(
            None,
            lambda: self.diarizer.diarize(
                audio_path,
                transcript_segments=transcript_dicts
            )
        )
        log.info(
            "Diarization complete",
            num_segments=len(speaker_segments),
            num_speakers=len(set(s.speaker_id for s in speaker_segments))
        )
    # Step 4: Store results
    log.info("Storing transcript in database")
    await self.db.update_job_status(job_id, "processing", progress=0.9)
    await self._store_transcript(
        meeting_id,
        transcription,
        speaker_segments
    )
    # Mark job complete
    await self.db.update_job_status(
        job_id,
        "completed",
        result={
            "segments": len(transcription.segments),
            "duration": transcription.duration,
            "language": transcription.language,
            "speakers": len(set(s.speaker_id for s in speaker_segments)) if speaker_segments else 0
        }
    )
    # Update meeting status - ready for summarization
    await self.db.update_meeting_status(meeting_id, "summarizing")
    log.info("Job completed successfully", job_id=job_id)
async def _extract_audio(self, video_path: str, meeting_id: str) -> str:
    """Extract a PCM WAV audio track from *video_path* using ffmpeg.

    Output goes to ``<settings.audio_output_path>/<meeting_id>/audio.wav``
    at the configured sample rate and channel count.

    Args:
        video_path: path of the source video file.
        meeting_id: meeting identifier, used as the output subdirectory.

    Returns:
        Path of the extracted WAV file.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status.
    """
    output_dir = os.path.join(settings.audio_output_path, meeting_id)
    os.makedirs(output_dir, exist_ok=True)
    audio_path = os.path.join(output_dir, "audio.wav")
    cmd = [
        "ffmpeg",
        "-i", video_path,
        "-vn",  # No video
        "-acodec", "pcm_s16le",  # PCM 16-bit
        "-ar", str(settings.audio_sample_rate),  # Sample rate
        "-ac", str(settings.audio_channels),  # Mono
        "-y",  # Overwrite
        audio_path
    ]
    log.debug("Running ffmpeg", cmd=" ".join(cmd))
    process = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    _, stderr = await process.communicate()
    if process.returncode != 0:
        # ffmpeg's stderr is not guaranteed to be valid UTF-8 (it can echo
        # arbitrary filename/metadata bytes); decode with errors="replace"
        # so a UnicodeDecodeError cannot mask the real failure.
        raise RuntimeError(f"FFmpeg failed: {stderr.decode(errors='replace')}")
    log.info("Audio extracted", output=audio_path)
    return audio_path
async def _store_transcript(
    self,
    meeting_id: str,
    transcription: TranscriptionResult,
    speaker_segments: list
):
    """Store transcript segments in the database with speaker attribution.

    Each transcript segment is attributed to a diarization segment:
    first by full time containment, otherwise by the *largest* overlap.
    (The previous fallback stopped at the first overlapping segment in
    list order, which could attribute a line to a barely-overlapping
    speaker; it now picks the segment with the maximum overlap.)

    Args:
        meeting_id: meeting the transcript belongs to.
        transcription: Whisper output with ``segments`` and ``language``.
        speaker_segments: SpeakerSegment list from the diarizer; may be
            empty, in which case speaker fields are stored as None.
    """
    for i, segment in enumerate(transcription.segments):
        speaker_id = None
        speaker_label = None
        # Prefer a diarization segment that fully contains this one.
        for seg in speaker_segments:
            if segment.start >= seg.start and segment.end <= seg.end:
                speaker_id = seg.speaker_id
                speaker_label = seg.speaker_label
                break
        # No containing segment: fall back to the greatest time overlap.
        if speaker_id is None:
            best = None
            best_overlap = 0.0
            for seg in speaker_segments:
                overlap = min(segment.end, seg.end) - max(segment.start, seg.start)
                if overlap > best_overlap:
                    best_overlap = overlap
                    best = seg
            if best is not None:
                speaker_id = best.speaker_id
                speaker_label = best.speaker_label
        await self.db.insert_transcript_segment(
            meeting_id=meeting_id,
            segment_index=i,
            start_time=segment.start,
            end_time=segment.end,
            text=segment.text,
            speaker_id=speaker_id,
            speaker_label=speaker_label,
            confidence=segment.confidence,
            language=transcription.language
        )