# clip-forge/backend/app/workers/tasks.py

"""Pipeline orchestration tasks for ARQ worker."""
import json
import logging
import os
import uuid
import redis.asyncio as aioredis
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from app.config import settings
from app.models import Job, Clip
from app.services import download, transcription, ai_analysis, clip_extraction
logger = logging.getLogger(__name__)
# Module-wide engine/session factory, created lazily on first use.
# Creating a new AsyncEngine (and its connection pool) per task invocation
# leaks pooled connections because the engine is never disposed.
_engine = None
_session_factory = None


async def _get_session() -> AsyncSession:
    """Return a fresh AsyncSession bound to a shared, lazily-created engine.

    The engine and session factory are built once and reused, so every call
    gets its own session but all sessions share one connection pool.
    Callers are responsible for closing the returned session.
    """
    global _engine, _session_factory
    if _session_factory is None:
        _engine = create_async_engine(settings.database_url, echo=False)
        _session_factory = async_sessionmaker(
            _engine, class_=AsyncSession, expire_on_commit=False
        )
    return _session_factory()
async def _publish_progress(
    redis: aioredis.Redis,
    job_id: str,
    status: str,
    progress: float,
    stage_message: str,
):
    """Push one progress update onto the job's Redis pub/sub channel.

    The payload is a JSON object with ``status``, ``progress`` (rounded to
    two decimals) and ``stage_message``, published on ``job:{job_id}:progress``.
    """
    payload = json.dumps(
        {
            "status": status,
            "progress": round(progress, 2),
            "stage_message": stage_message,
        }
    )
    await redis.publish(f"job:{job_id}:progress", payload)
async def _update_job(
    db: AsyncSession,
    job: Job,
    status: str,
    progress: float,
    stage_message: str,
    **kwargs,
):
    """Persist a job's status, progress and message (plus any extra
    attributes passed as keyword args) and commit the session."""
    job.status = status
    job.progress = progress
    job.stage_message = stage_message
    for field, value in kwargs.items():
        setattr(job, field, value)
    await db.commit()
async def process_job(ctx: dict, job_id: str):
    """Main pipeline: download → transcribe → AI analysis → extract clips.

    Stage progress milestones:
      download    0.05–0.20, transcribe 0.25–0.50,
      analyze     0.55–0.70, extract    0.75–0.95, complete 1.0.

    Each stage persists progress on the Job row and mirrors it to the Redis
    pub/sub channel ``job:{job_id}:progress`` for streaming to clients.
    Any exception marks the job "failed" with the error message.
    """
    # Reuse the ARQ worker's shared Redis connection when present in ctx;
    # otherwise open our own and close it in ``finally`` (the original
    # leaked a connection on every call that fell back to from_url).
    r = ctx.get("redis")
    owns_redis = r is None
    if owns_redis:
        r = aioredis.from_url(settings.redis_url)
    db = await _get_session()
    job = None  # set once loaded; lets the failure handler know what exists
    try:
        job = await db.get(Job, uuid.UUID(job_id))
        if not job:
            logger.error(f"Job {job_id} not found")
            return
        logger.info(f"Processing job {job_id}: {job.source_type}")

        # === STAGE 1: DOWNLOAD ===
        await _update_job(db, job, "downloading", 0.05, "Downloading video...")
        await _publish_progress(r, job_id, "downloading", 0.05, "Downloading video...")
        job_media_dir = os.path.join(settings.media_dir, job_id)
        os.makedirs(job_media_dir, exist_ok=True)
        if job.source_type == "youtube":
            try:
                video_info = await download.download_video(job.source_url, job_media_dir)
            except Exception as dl_err:
                # yt-dlp reports bot detection as a generic error string;
                # translate it into an actionable message for the user.
                if "Sign in to confirm" in str(dl_err) or "bot" in str(dl_err):
                    raise ValueError(
                        "YouTube blocked this download (bot detection). "
                        "Try uploading the video file directly instead."
                    ) from dl_err
                raise
            job.title = video_info.title
            job.duration = video_info.duration
            job.media_path = video_info.video_path
        elif job.media_path:
            # Uploaded file - get duration
            duration = await clip_extraction.get_video_duration(job.media_path)
            job.duration = duration
            if not job.title:
                job.title = job.source_filename or "Uploaded Video"
        else:
            raise ValueError("No video source available")
        await db.commit()
        await _publish_progress(
            r, job_id, "downloading", 0.20,
            f"Downloaded: {job.title} ({job.duration:.0f}s)"
        )

        # === STAGE 2: TRANSCRIBE ===
        await _update_job(
            db, job, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )
        await _publish_progress(
            r, job_id, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )
        # Extract audio for transcription
        audio_path = os.path.join(job_media_dir, "audio.mp3")
        await download.extract_audio(job.media_path, audio_path)
        await _publish_progress(
            r, job_id, "transcribing", 0.30,
            "Transcribing with Whisper..."
        )
        transcript = await transcription.transcribe(audio_path)
        job.transcript = transcript
        await db.commit()
        word_count = len(transcript.get("words", []))
        await _publish_progress(
            r, job_id, "transcribing", 0.50,
            f"Transcription complete: {word_count} words"
        )

        # === STAGE 3: AI ANALYSIS ===
        await _update_job(
            db, job, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )
        await _publish_progress(
            r, job_id, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )
        clips_data = await ai_analysis.analyze_transcript(
            transcript=transcript,
            video_title=job.title or "",
            video_duration=job.duration or 0,
        )
        if not clips_data:
            raise ValueError("AI analysis returned no clips")
        await _publish_progress(
            r, job_id, "analyzing", 0.70,
            f"Found {len(clips_data)} potential clips"
        )

        # === STAGE 4: EXTRACT CLIPS ===
        await _update_job(
            db, job, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )
        await _publish_progress(
            r, job_id, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )
        clips_dir = os.path.join(settings.clips_dir, job_id)
        os.makedirs(clips_dir, exist_ok=True)
        for i, cd in enumerate(clips_data):
            clip_filename = f"clip_{i:02d}.mp4"
            clip_path = os.path.join(clips_dir, clip_filename)
            thumb_path = os.path.join(clips_dir, f"thumb_{i:02d}.jpg")
            # Extract the clip
            await clip_extraction.extract_clip(
                video_path=job.media_path,
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                output_path=clip_path,
            )
            # Extract thumbnail at 25% into the clip
            thumb_time = cd["start_time"] + (cd["end_time"] - cd["start_time"]) * 0.25
            try:
                await clip_extraction.extract_thumbnail(
                    video_path=job.media_path,
                    timestamp=thumb_time,
                    output_path=thumb_path,
                )
            except Exception:
                # Thumbnails are best-effort; a clip without one is still valid.
                thumb_path = None
            # Get transcript segment for this clip
            segment_text = transcription.get_transcript_segment(
                transcript.get("words", []),
                cd["start_time"],
                cd["end_time"],
            )
            # Save clip to database
            clip = Clip(
                job_id=job.id,
                title=cd["title"],
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                virality_score=cd["virality_score"],
                category=cd["category"],
                reasoning=cd["reasoning"],
                transcript_segment=segment_text,
                thumbnail_path=thumb_path,
                raw_clip_path=clip_path,
            )
            db.add(clip)
            # Extraction spans the 0.75–0.95 progress window, split evenly.
            progress = 0.75 + (0.20 * (i + 1) / len(clips_data))
            await _publish_progress(
                r, job_id, "extracting", progress,
                f"Extracted clip {i + 1}/{len(clips_data)}: {cd['title']}"
            )
        await db.commit()

        # === COMPLETE ===
        await _update_job(
            db, job, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )
        await _publish_progress(
            r, job_id, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )
        # Clean up audio file (no longer needed once transcribed)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        logger.info(f"Job {job_id} complete: {len(clips_data)} clips")
    except Exception as e:
        logger.exception(f"Job {job_id} failed: {e}")
        try:
            # ``job`` is None when the failure happened before the row was
            # loaded (e.g. a malformed job_id); skip the DB update then but
            # still publish the failure so subscribers are unblocked.
            failed_progress = job.progress if job is not None else 0.0
            if job is not None:
                await _update_job(
                    db, job, "failed", job.progress,
                    str(e), error_message=str(e),
                )
            await _publish_progress(
                r, job_id, "failed", failed_progress, f"Error: {e}"
            )
        except Exception:
            # Best-effort failure reporting: log (don't silently swallow)
            # so a broken DB/Redis in the error path is still visible.
            logger.exception(f"Failed to record failure for job {job_id}")
    finally:
        await db.close()
        if owns_redis:
            # Only close a Redis connection we opened ourselves.
            await r.close()
async def render_clip(ctx: dict, render_id: str):
    """Render a clip with subtitles and aspect ratio conversion.

    (Phase 3 - stub for now, copies raw clip.) Marks the RenderRequest row
    "rendering" → "complete" (or "failed" with an error message).
    """
    import shutil

    from app.models import RenderRequest

    db = await _get_session()
    render = None  # set once loaded; lets the failure handler know what exists
    try:
        render = await db.get(RenderRequest, uuid.UUID(render_id))
        if not render:
            return
        render.status = "rendering"
        render.progress = 0.5
        await db.commit()
        clip = await db.get(Clip, render.clip_id)
        if not clip or not clip.raw_clip_path:
            render.status = "failed"
            render.error_message = "Clip not found or not extracted"
            await db.commit()
            return
        # Phase 1: just copy the raw clip as-is.
        # Phase 3 will add subtitle rendering + aspect ratio conversion.
        renders_dir = os.path.join(settings.renders_dir, str(render.clip_id))
        os.makedirs(renders_dir, exist_ok=True)
        output = os.path.join(
            renders_dir,
            f"render_{render.aspect_ratio.replace(':', 'x')}.mp4"
        )
        shutil.copy2(clip.raw_clip_path, output)
        render.output_path = output
        render.status = "complete"
        render.progress = 1.0
        await db.commit()
        logger.info(f"Render {render_id} complete: {output}")
    except Exception as e:
        logger.exception(f"Render {render_id} failed: {e}")
        try:
            # ``render`` is None when the failure happened before the row
            # was loaded (e.g. a malformed render_id) — nothing to update
            # then. The original referenced an unbound name here, raising a
            # NameError that masked the real error and escaped the task.
            if render is not None:
                render.status = "failed"
                render.error_message = str(e)
                await db.commit()
        except Exception:
            logger.exception(f"Failed to record failure for render {render_id}")
    finally:
        await db.close()