"""Pipeline orchestration tasks for ARQ worker."""
import json
import logging
import os
import uuid
import redis.asyncio as aioredis
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from app.config import settings
from app.models import Job, Clip
from app.services import download, transcription, ai_analysis, clip_extraction
logger = logging.getLogger(__name__)


# Process-wide engine and session factory, created lazily on first use.
# The original implementation built a new engine (and its connection pool)
# on every call and never disposed it, leaking database connections.
_engine = None
_session_factory = None


async def _get_session() -> AsyncSession:
    """Return a fresh AsyncSession from a shared, lazily-created factory.

    The engine and factory are created once per process and reused, so each
    call hands out a new session backed by the same connection pool.
    """
    global _engine, _session_factory
    if _session_factory is None:
        _engine = create_async_engine(settings.database_url, echo=False)
        _session_factory = async_sessionmaker(
            _engine, class_=AsyncSession, expire_on_commit=False
        )
    return _session_factory()


async def _publish_progress(
    redis: aioredis.Redis,
    job_id: str,
    status: str,
    progress: float,
    stage_message: str,
):
    """Broadcast a progress update on the job's Redis pub/sub channel.

    Subscribers listening on ``job:<job_id>:progress`` receive a JSON payload
    with the current status, progress (rounded to 2 decimals), and message.
    """
    payload = json.dumps(
        {
            "status": status,
            "progress": round(progress, 2),
            "stage_message": stage_message,
        }
    )
    await redis.publish(f"job:{job_id}:progress", payload)


async def _update_job(
    db: AsyncSession,
    job: Job,
    status: str,
    progress: float,
    stage_message: str,
    **kwargs,
):
    """Persist a status/progress change (plus any extra fields) to the job row.

    Extra keyword arguments are applied as attributes on the Job before the
    commit (e.g. ``error_message="..."``).
    """
    updates = {
        "status": status,
        "progress": progress,
        "stage_message": stage_message,
        **kwargs,
    }
    for field, value in updates.items():
        setattr(job, field, value)
    await db.commit()


async def process_job(ctx: dict, job_id: str):
    """Main pipeline: download → transcribe → AI analysis → extract clips.

    Each stage persists status/progress on the Job row and mirrors it to the
    job's Redis pub/sub channel so clients can follow along. On any failure
    the job is marked "failed" with the error message (best-effort).
    """
    # Reuse the ARQ worker's shared Redis connection when provided; otherwise
    # open our own and close it in the finally block (previously it leaked).
    r = ctx.get("redis")
    owns_redis = r is None
    if owns_redis:
        r = aioredis.from_url(settings.redis_url)
    db = await _get_session()
    # Initialized up front so the except handler can reference it safely even
    # if uuid parsing or the DB lookup raises before assignment.
    job = None

    try:
        job = await db.get(Job, uuid.UUID(job_id))
        if not job:
            logger.error(f"Job {job_id} not found")
            return

        logger.info(f"Processing job {job_id}: {job.source_type}")

        # === STAGE 1: DOWNLOAD ===
        await _update_job(db, job, "downloading", 0.05, "Downloading video...")
        await _publish_progress(r, job_id, "downloading", 0.05, "Downloading video...")

        job_media_dir = os.path.join(settings.media_dir, job_id)
        os.makedirs(job_media_dir, exist_ok=True)

        if job.source_type == "youtube":
            try:
                video_info = await download.download_video(job.source_url, job_media_dir)
            except Exception as dl_err:
                if "Sign in to confirm" in str(dl_err) or "bot" in str(dl_err):
                    # Chain the original error so the traceback keeps the cause.
                    raise ValueError(
                        "YouTube blocked this download (bot detection). "
                        "Try uploading the video file directly instead."
                    ) from dl_err
                raise
            job.title = video_info.title
            job.duration = video_info.duration
            job.media_path = video_info.video_path
        elif job.media_path:
            # Uploaded file - get duration
            duration = await clip_extraction.get_video_duration(job.media_path)
            job.duration = duration
            if not job.title:
                job.title = job.source_filename or "Uploaded Video"
        else:
            raise ValueError("No video source available")

        await db.commit()
        await _publish_progress(
            r, job_id, "downloading", 0.20,
            f"Downloaded: {job.title} ({job.duration:.0f}s)"
        )

        # === STAGE 2: TRANSCRIBE ===
        await _update_job(
            db, job, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )
        await _publish_progress(
            r, job_id, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )

        # Extract audio for transcription
        audio_path = os.path.join(job_media_dir, "audio.mp3")
        await download.extract_audio(job.media_path, audio_path)

        await _publish_progress(
            r, job_id, "transcribing", 0.30,
            "Transcribing with Whisper..."
        )

        transcript = await transcription.transcribe(audio_path)
        job.transcript = transcript
        await db.commit()

        word_count = len(transcript.get("words", []))
        await _publish_progress(
            r, job_id, "transcribing", 0.50,
            f"Transcription complete: {word_count} words"
        )

        # === STAGE 3: AI ANALYSIS ===
        await _update_job(
            db, job, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )
        await _publish_progress(
            r, job_id, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )

        clips_data = await ai_analysis.analyze_transcript(
            transcript=transcript,
            video_title=job.title or "",
            video_duration=job.duration or 0,
        )

        if not clips_data:
            raise ValueError("AI analysis returned no clips")

        await _publish_progress(
            r, job_id, "analyzing", 0.70,
            f"Found {len(clips_data)} potential clips"
        )

        # === STAGE 4: EXTRACT CLIPS ===
        await _update_job(
            db, job, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )
        await _publish_progress(
            r, job_id, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )

        clips_dir = os.path.join(settings.clips_dir, job_id)
        os.makedirs(clips_dir, exist_ok=True)

        for i, cd in enumerate(clips_data):
            clip_filename = f"clip_{i:02d}.mp4"
            clip_path = os.path.join(clips_dir, clip_filename)
            thumb_path = os.path.join(clips_dir, f"thumb_{i:02d}.jpg")

            # Extract the clip
            await clip_extraction.extract_clip(
                video_path=job.media_path,
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                output_path=clip_path,
            )

            # Extract thumbnail at 25% into the clip
            thumb_time = cd["start_time"] + (cd["end_time"] - cd["start_time"]) * 0.25
            try:
                await clip_extraction.extract_thumbnail(
                    video_path=job.media_path,
                    timestamp=thumb_time,
                    output_path=thumb_path,
                )
            except Exception:
                # Thumbnails are best-effort; a clip without one is still valid.
                thumb_path = None

            # Get transcript segment for this clip
            segment_text = transcription.get_transcript_segment(
                transcript.get("words", []),
                cd["start_time"],
                cd["end_time"],
            )

            # Save clip to database
            clip = Clip(
                job_id=job.id,
                title=cd["title"],
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                virality_score=cd["virality_score"],
                category=cd["category"],
                reasoning=cd["reasoning"],
                transcript_segment=segment_text,
                thumbnail_path=thumb_path,
                raw_clip_path=clip_path,
            )
            db.add(clip)

            # Extraction occupies the 0.75–0.95 progress window, split evenly.
            progress = 0.75 + (0.20 * (i + 1) / len(clips_data))
            await _publish_progress(
                r, job_id, "extracting", progress,
                f"Extracted clip {i + 1}/{len(clips_data)}: {cd['title']}"
            )

        await db.commit()

        # === COMPLETE ===
        await _update_job(
            db, job, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )
        await _publish_progress(
            r, job_id, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )

        # Clean up audio file
        if os.path.exists(audio_path):
            os.remove(audio_path)

        logger.info(f"Job {job_id} complete: {len(clips_data)} clips")

    except Exception as e:
        logger.exception(f"Job {job_id} failed: {e}")
        # Best-effort failure reporting; never let it mask the original error.
        try:
            if job is not None:
                await _update_job(
                    db, job, "failed", job.progress,
                    str(e), error_message=str(e),
                )
            await _publish_progress(
                r, job_id, "failed",
                job.progress if job is not None else 0.0,
                f"Error: {e}",
            )
        except Exception:
            pass
    finally:
        await db.close()
        if owns_redis:
            # redis-py >= 5 prefers aclose(); fall back for older versions.
            closer = getattr(r, "aclose", None) or r.close
            await closer()


async def render_clip(ctx: dict, render_id: str):
    """Render a clip with subtitles and aspect ratio conversion.

    (Phase 3 - stub for now, copies raw clip)
    """
    from app.models import RenderRequest

    db = await _get_session()
    # Initialized up front so the except handler can reference it safely even
    # if uuid parsing or the DB lookup raises before assignment (the original
    # raised NameError on `render.status` in that case).
    render = None
    try:
        render = await db.get(RenderRequest, uuid.UUID(render_id))
        if not render:
            return

        render.status = "rendering"
        render.progress = 0.5
        await db.commit()

        clip = await db.get(Clip, render.clip_id)
        if not clip or not clip.raw_clip_path:
            render.status = "failed"
            render.error_message = "Clip not found or not extracted"
            await db.commit()
            return

        # Phase 1: just copy the raw clip as-is
        # Phase 3 will add subtitle rendering + aspect ratio conversion
        import shutil
        renders_dir = os.path.join(settings.renders_dir, str(render.clip_id))
        os.makedirs(renders_dir, exist_ok=True)
        # e.g. "9:16" -> "render_9x16.mp4" (":" is not filename-safe)
        output = os.path.join(
            renders_dir,
            f"render_{render.aspect_ratio.replace(':', 'x')}.mp4"
        )
        shutil.copy2(clip.raw_clip_path, output)

        render.output_path = output
        render.status = "complete"
        render.progress = 1.0
        await db.commit()

        logger.info(f"Render {render_id} complete: {output}")
    except Exception as e:
        logger.exception(f"Render {render_id} failed: {e}")
        if render is not None:
            # Best-effort failure marking; the commit itself may fail too.
            try:
                render.status = "failed"
                render.error_message = str(e)
                await db.commit()
            except Exception:
                pass
    finally:
        await db.close()