# clip-forge/backend/app/workers/tasks.py

"""Pipeline orchestration tasks for ARQ worker."""
import json
import logging
import os
import uuid
import redis.asyncio as aioredis
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker
from app.config import settings
from app.models import Job, Clip
from app.services import download, transcription, ai_analysis, clip_extraction
logger = logging.getLogger(__name__)
# Module-wide engine/session factory, created lazily on first use.
# Creating a new AsyncEngine (and its connection pool) per task invocation
# leaks pooled connections because the engine is never disposed.
_engine = None
_session_factory = None


async def _get_session() -> AsyncSession:
    """Return a fresh AsyncSession bound to a shared, lazily-created engine.

    The engine and session factory are built once and reused, so every call
    gets its own session but all sessions share one connection pool.
    Callers are responsible for closing the returned session.
    """
    global _engine, _session_factory
    if _session_factory is None:
        _engine = create_async_engine(settings.database_url, echo=False)
        _session_factory = async_sessionmaker(
            _engine, class_=AsyncSession, expire_on_commit=False
        )
    return _session_factory()
async def _publish_progress(
    redis: aioredis.Redis,
    job_id: str,
    status: str,
    progress: float,
    stage_message: str,
):
    """Push one progress update onto the job's Redis pub/sub channel.

    The payload is a JSON object with ``status``, ``progress`` (rounded to
    two decimals) and ``stage_message``, published on ``job:{job_id}:progress``.
    """
    payload = json.dumps(
        {
            "status": status,
            "progress": round(progress, 2),
            "stage_message": stage_message,
        }
    )
    await redis.publish(f"job:{job_id}:progress", payload)
async def _update_job(
    db: AsyncSession,
    job: Job,
    status: str,
    progress: float,
    stage_message: str,
    **kwargs,
):
    """Persist a job's status, progress and message (plus any extra
    attributes passed as keyword args) and commit the session."""
    job.status = status
    job.progress = progress
    job.stage_message = stage_message
    for field, value in kwargs.items():
        setattr(job, field, value)
    await db.commit()
async def process_job(ctx: dict, job_id: str):
    """Main pipeline: download → transcribe → AI analysis → extract clips.

    Stage progress milestones:
      download    0.05–0.20, transcribe 0.25–0.50,
      analyze     0.55–0.70, extract    0.75–0.95, complete 1.0.

    Each stage persists progress on the Job row and mirrors it to the Redis
    pub/sub channel ``job:{job_id}:progress`` for streaming to clients.
    Any exception marks the job "failed" with the error message.
    """
    # Reuse the ARQ worker's shared Redis connection when present in ctx;
    # otherwise open our own and close it in ``finally`` (the original
    # leaked a connection on every call that fell back to from_url).
    r = ctx.get("redis")
    owns_redis = r is None
    if owns_redis:
        r = aioredis.from_url(settings.redis_url)
    db = await _get_session()
    job = None  # set once loaded; lets the failure handler know what exists
    try:
        job = await db.get(Job, uuid.UUID(job_id))
        if not job:
            logger.error(f"Job {job_id} not found")
            return
        logger.info(f"Processing job {job_id}: {job.source_type}")

        # === STAGE 1: DOWNLOAD ===
        await _update_job(db, job, "downloading", 0.05, "Downloading video...")
        await _publish_progress(r, job_id, "downloading", 0.05, "Downloading video...")
        job_media_dir = os.path.join(settings.media_dir, job_id)
        os.makedirs(job_media_dir, exist_ok=True)
        if job.source_type == "youtube":
            try:
                video_info = await download.download_video(job.source_url, job_media_dir)
            except Exception as dl_err:
                # yt-dlp reports bot detection as a generic error string;
                # translate it into an actionable message for the user.
                if "Sign in to confirm" in str(dl_err) or "bot" in str(dl_err):
                    raise ValueError(
                        "YouTube blocked this download (bot detection). "
                        "Try uploading the video file directly instead."
                    ) from dl_err
                raise
            job.title = video_info.title
            job.duration = video_info.duration
            job.media_path = video_info.video_path
        elif job.media_path:
            # Uploaded file - get duration
            duration = await clip_extraction.get_video_duration(job.media_path)
            job.duration = duration
            if not job.title:
                job.title = job.source_filename or "Uploaded Video"
        else:
            raise ValueError("No video source available")
        await db.commit()
        await _publish_progress(
            r, job_id, "downloading", 0.20,
            f"Downloaded: {job.title} ({job.duration:.0f}s)"
        )

        # === STAGE 2: TRANSCRIBE ===
        await _update_job(
            db, job, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )
        await _publish_progress(
            r, job_id, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )
        # Extract audio for transcription
        audio_path = os.path.join(job_media_dir, "audio.mp3")
        await download.extract_audio(job.media_path, audio_path)
        await _publish_progress(
            r, job_id, "transcribing", 0.30,
            "Transcribing with Whisper..."
        )
        transcript = await transcription.transcribe(audio_path)
        job.transcript = transcript
        await db.commit()
        word_count = len(transcript.get("words", []))
        await _publish_progress(
            r, job_id, "transcribing", 0.50,
            f"Transcription complete: {word_count} words"
        )

        # === STAGE 3: AI ANALYSIS ===
        await _update_job(
            db, job, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )
        await _publish_progress(
            r, job_id, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )
        clips_data = await ai_analysis.analyze_transcript(
            transcript=transcript,
            video_title=job.title or "",
            video_duration=job.duration or 0,
        )
        if not clips_data:
            raise ValueError("AI analysis returned no clips")
        await _publish_progress(
            r, job_id, "analyzing", 0.70,
            f"Found {len(clips_data)} potential clips"
        )

        # === STAGE 4: EXTRACT CLIPS ===
        await _update_job(
            db, job, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )
        await _publish_progress(
            r, job_id, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )
        clips_dir = os.path.join(settings.clips_dir, job_id)
        os.makedirs(clips_dir, exist_ok=True)
        for i, cd in enumerate(clips_data):
            clip_filename = f"clip_{i:02d}.mp4"
            clip_path = os.path.join(clips_dir, clip_filename)
            thumb_path = os.path.join(clips_dir, f"thumb_{i:02d}.jpg")
            # Extract the clip
            await clip_extraction.extract_clip(
                video_path=job.media_path,
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                output_path=clip_path,
            )
            # Extract thumbnail at 25% into the clip
            thumb_time = cd["start_time"] + (cd["end_time"] - cd["start_time"]) * 0.25
            try:
                await clip_extraction.extract_thumbnail(
                    video_path=job.media_path,
                    timestamp=thumb_time,
                    output_path=thumb_path,
                )
            except Exception:
                # Thumbnails are best-effort; a clip without one is still valid.
                thumb_path = None
            # Get transcript segment for this clip
            segment_text = transcription.get_transcript_segment(
                transcript.get("words", []),
                cd["start_time"],
                cd["end_time"],
            )
            # Save clip to database
            clip = Clip(
                job_id=job.id,
                title=cd["title"],
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                virality_score=cd["virality_score"],
                category=cd["category"],
                reasoning=cd["reasoning"],
                transcript_segment=segment_text,
                thumbnail_path=thumb_path,
                raw_clip_path=clip_path,
            )
            db.add(clip)
            # Extraction spans the 0.75–0.95 progress window, split evenly.
            progress = 0.75 + (0.20 * (i + 1) / len(clips_data))
            await _publish_progress(
                r, job_id, "extracting", progress,
                f"Extracted clip {i + 1}/{len(clips_data)}: {cd['title']}"
            )
        await db.commit()

        # === COMPLETE ===
        await _update_job(
            db, job, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )
        await _publish_progress(
            r, job_id, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )
        # Clean up audio file (no longer needed once transcribed)
        if os.path.exists(audio_path):
            os.remove(audio_path)
        logger.info(f"Job {job_id} complete: {len(clips_data)} clips")
    except Exception as e:
        logger.exception(f"Job {job_id} failed: {e}")
        try:
            # ``job`` is None when the failure happened before the row was
            # loaded (e.g. a malformed job_id); skip the DB update then but
            # still publish the failure so subscribers are unblocked.
            failed_progress = job.progress if job is not None else 0.0
            if job is not None:
                await _update_job(
                    db, job, "failed", job.progress,
                    str(e), error_message=str(e),
                )
            await _publish_progress(
                r, job_id, "failed", failed_progress, f"Error: {e}"
            )
        except Exception:
            # Best-effort failure reporting: log (don't silently swallow)
            # so a broken DB/Redis in the error path is still visible.
            logger.exception(f"Failed to record failure for job {job_id}")
    finally:
        await db.close()
        if owns_redis:
            # Only close a Redis connection we opened ourselves.
            await r.close()
async def render_clip(ctx: dict, render_id: str):
    """Render a clip with subtitles and aspect ratio conversion.

    (Phase 3 - stub for now, copies raw clip.) Marks the RenderRequest row
    "rendering" → "complete" (or "failed" with an error message).
    """
    import shutil

    from app.models import RenderRequest

    db = await _get_session()
    render = None  # set once loaded; lets the failure handler know what exists
    try:
        render = await db.get(RenderRequest, uuid.UUID(render_id))
        if not render:
            return
        render.status = "rendering"
        render.progress = 0.5
        await db.commit()
        clip = await db.get(Clip, render.clip_id)
        if not clip or not clip.raw_clip_path:
            render.status = "failed"
            render.error_message = "Clip not found or not extracted"
            await db.commit()
            return
        # Phase 1: just copy the raw clip as-is.
        # Phase 3 will add subtitle rendering + aspect ratio conversion.
        renders_dir = os.path.join(settings.renders_dir, str(render.clip_id))
        os.makedirs(renders_dir, exist_ok=True)
        output = os.path.join(
            renders_dir,
            f"render_{render.aspect_ratio.replace(':', 'x')}.mp4"
        )
        shutil.copy2(clip.raw_clip_path, output)
        render.output_path = output
        render.status = "complete"
        render.progress = 1.0
        await db.commit()
        logger.info(f"Render {render_id} complete: {output}")
    except Exception as e:
        logger.exception(f"Render {render_id} failed: {e}")
        try:
            # ``render`` is None when the failure happened before the row
            # was loaded (e.g. a malformed render_id) — nothing to update
            # then. The original referenced an unbound name here, raising a
            # NameError that masked the real error and escaped the task.
            if render is not None:
                render.status = "failed"
                render.error_message = str(e)
                await db.commit()
        except Exception:
            logger.exception(f"Failed to record failure for render {render_id}")
    finally:
        await db.close()