From 6aa8a676ec64dc142e789abc4e332b99a64ad369 Mon Sep 17 00:00:00 2001 From: Jeff Emmett Date: Sun, 8 Feb 2026 12:27:43 +0000 Subject: [PATCH] feat: ClipForge Phase 1 - core pipeline MVP Self-hosted AI video clipper (Opus Clip alternative). Pipeline: YouTube URL -> yt-dlp download -> Whisper transcription -> Ollama AI clip selection -> FFmpeg extraction. - FastAPI backend with PostgreSQL + Redis + ARQ worker - 7-stage processing pipeline with SSE progress tracking - Services: download (yt-dlp), transcription (whisper.jeffemmett.com), AI analysis (Ollama), clip extraction (FFmpeg stream copy) - API: create jobs, track progress, list clips, render, download - Docker Compose with Traefik labels for clip.jeffemmett.com Cost: $0/video using existing infrastructure. Co-Authored-By: Claude Opus 4.6 --- .env.example | 33 +++ .gitignore | 9 + backend/Dockerfile | 20 ++ backend/app/__init__.py | 0 backend/app/api/__init__.py | 0 backend/app/api/routes/__init__.py | 0 backend/app/api/routes/clips.py | 34 +++ backend/app/api/routes/jobs.py | 167 ++++++++++++++ backend/app/api/routes/renders.py | 111 +++++++++ backend/app/config.py | 37 +++ backend/app/database.py | 11 + backend/app/main.py | 36 +++ backend/app/models.py | 110 +++++++++ backend/app/schemas.py | 83 +++++++ backend/app/services/__init__.py | 0 backend/app/services/ai_analysis.py | 169 ++++++++++++++ backend/app/services/clip_extraction.py | 117 ++++++++++ backend/app/services/download.py | 117 ++++++++++ backend/app/services/transcription.py | 82 +++++++ backend/app/worker.py | 33 +++ backend/app/workers/__init__.py | 0 backend/app/workers/tasks.py | 295 ++++++++++++++++++++++++ backend/requirements.txt | 13 ++ database/init.sql | 113 +++++++++ docker-compose.yml | 91 ++++++++ frontend/Dockerfile | 6 + 26 files changed, 1687 insertions(+) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 backend/Dockerfile create mode 100644 backend/app/__init__.py create mode 100644 
backend/app/api/__init__.py create mode 100644 backend/app/api/routes/__init__.py create mode 100644 backend/app/api/routes/clips.py create mode 100644 backend/app/api/routes/jobs.py create mode 100644 backend/app/api/routes/renders.py create mode 100644 backend/app/config.py create mode 100644 backend/app/database.py create mode 100644 backend/app/main.py create mode 100644 backend/app/models.py create mode 100644 backend/app/schemas.py create mode 100644 backend/app/services/__init__.py create mode 100644 backend/app/services/ai_analysis.py create mode 100644 backend/app/services/clip_extraction.py create mode 100644 backend/app/services/download.py create mode 100644 backend/app/services/transcription.py create mode 100644 backend/app/worker.py create mode 100644 backend/app/workers/__init__.py create mode 100644 backend/app/workers/tasks.py create mode 100644 backend/requirements.txt create mode 100644 database/init.sql create mode 100644 docker-compose.yml create mode 100644 frontend/Dockerfile diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..a56b4c3 --- /dev/null +++ b/.env.example @@ -0,0 +1,33 @@ +# ClipForge Environment Configuration + +# Database +POSTGRES_USER=clipforge +POSTGRES_PASSWORD=changeme_clipforge_2025 +POSTGRES_DB=clipforge +DATABASE_URL=postgresql+asyncpg://clipforge:changeme_clipforge_2025@postgres:5432/clipforge + +# Redis +REDIS_URL=redis://redis:6379/0 + +# Whisper (self-hosted) +WHISPER_API_URL=https://whisper.jeffemmett.com +WHISPER_MODEL=deepdml/faster-whisper-large-v3-turbo-ct2 + +# Ollama (local) +OLLAMA_URL=http://host.docker.internal:11434 +OLLAMA_MODEL=llama3.1:8b + +# Storage paths (inside container) +MEDIA_DIR=/data/media +CLIPS_DIR=/data/clips +RENDERS_DIR=/data/renders + +# yt-dlp +YTDLP_COOKIES_FILE= +MAX_VIDEO_DURATION=7200 + +# Processing +MAX_CONCURRENT_JOBS=2 +CLIP_MIN_DURATION=15 +CLIP_MAX_DURATION=90 +TARGET_CLIPS=5 diff --git a/.gitignore b/.gitignore new file mode 100644 index 
# --- .gitignore (reconstructed from patch; non-Python artifact) ---
# .env
# __pycache__/
# *.pyc
# .venv/
# node_modules/
# dist/
# data/
# *.egg-info/
# .DS_Store
#
# --- backend/Dockerfile (reconstructed from patch; non-Python artifact) ---
# FROM python:3.12-slim
# RUN apt-get update && apt-get install -y --no-install-recommends ffmpeg \
#     && rm -rf /var/lib/apt/lists/*
# WORKDIR /app
# COPY requirements.txt .
# RUN pip install --no-cache-dir -r requirements.txt
# COPY . .
# EXPOSE 8000
# CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]

# --- backend/app/api/routes/clips.py ---
"""Read and preview endpoints for extracted clips."""

from uuid import UUID

from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
from sqlalchemy.ext.asyncio import AsyncSession

from app.database import get_db
from app.models import Clip

router = APIRouter()


@router.get("/clips/{clip_id}")
async def get_clip(clip_id: UUID, db: AsyncSession = Depends(get_db)):
    """Return one clip by id; 404 if unknown."""
    clip = await db.get(Clip, clip_id)
    if not clip:
        raise HTTPException(404, "Clip not found")
    # `duration` is computed here for the response payload; it is not a
    # mapped ORM column (presumably a generated column in the DB schema —
    # TODO confirm against database/init.sql).
    clip.duration = clip.end_time - clip.start_time
    return clip


@router.get("/clips/{clip_id}/preview")
async def preview_clip(clip_id: UUID, db: AsyncSession = Depends(get_db)):
    """Serve the raw extracted clip file; 404 until extraction has run."""
    clip = await db.get(Clip, clip_id)
    if not clip:
        raise HTTPException(404, "Clip not found")
    if not clip.raw_clip_path:
        raise HTTPException(404, "Clip not yet extracted")

    return FileResponse(
        clip.raw_clip_path,
        media_type="video/mp4",
        filename=f"{clip.title}.mp4",
    )


# --- backend/app/api/routes/jobs.py ---
"""Job creation, listing, and SSE progress endpoints."""

import asyncio
import json
from uuid import UUID

from arq import create_pool
from arq.connections import RedisSettings
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from sse_starlette.sse import EventSourceResponse

from app.config import settings
from app.database import get_db
from app.models import Job, Clip
from app.schemas import JobCreate, JobResponse, ClipResponse

router = APIRouter()


def _redis_settings() -> RedisSettings:
    # arq can parse the DSN directly; replaces the hand-rolled urlparse
    # logic that was duplicated here and in renders.py.
    return RedisSettings.from_dsn(settings.redis_url)


async def _enqueue_processing(job_id) -> None:
    """Enqueue the processing pipeline for a job on a short-lived arq pool."""
    pool = await create_pool(_redis_settings())
    try:
        await pool.enqueue_job("process_job", str(job_id))
    finally:
        # Always release the pool, even if enqueueing raises.
        await pool.close()


@router.post("/jobs", response_model=JobResponse, status_code=201)
async def create_job(job_in: JobCreate, db: AsyncSession = Depends(get_db)):
    """Create a job from a YouTube URL and enqueue the pipeline."""
    if job_in.source_type == "youtube" and not job_in.source_url:
        raise HTTPException(400, "source_url required for youtube source")

    job = Job(
        source_type=job_in.source_type,
        source_url=job_in.source_url,
        status="pending",
    )
    db.add(job)
    await db.commit()
    await db.refresh(job)

    await _enqueue_processing(job.id)
    return job


@router.post("/jobs/upload", response_model=JobResponse, status_code=201)
async def create_job_upload(
    file: UploadFile = File(...),
    db: AsyncSession = Depends(get_db),
):
    """Accept a direct video upload, persist it under media_dir, enqueue a job."""
    import os
    import aiofiles

    os.makedirs(settings.media_dir, exist_ok=True)
    # basename strips client-supplied directory components (path traversal);
    # the replaces cover separators/sequences basename leaves alone.
    raw_name = os.path.basename(file.filename or "upload.bin")
    safe_name = raw_name.replace("/", "_").replace("\\", "_").replace("..", "_")
    dest = os.path.join(settings.media_dir, f"upload_{safe_name}")

    async with aiofiles.open(dest, "wb") as f:
        # Stream to disk in 1 MiB chunks to bound memory usage.
        while chunk := await file.read(1024 * 1024):
            await f.write(chunk)

    job = Job(
        source_type="upload",
        source_filename=safe_name,
        media_path=dest,
        status="pending",
    )
    db.add(job)
    await db.commit()
    await db.refresh(job)

    await _enqueue_processing(job.id)
    return job


@router.get("/jobs", response_model=list[JobResponse])
async def list_jobs(
    limit: int = 20,
    offset: int = 0,
    db: AsyncSession = Depends(get_db),
):
    """List jobs, newest first, with simple offset pagination."""
    result = await db.execute(
        select(Job).order_by(Job.created_at.desc()).offset(offset).limit(limit)
    )
    return result.scalars().all()


@router.get("/jobs/{job_id}", response_model=JobResponse)
async def get_job(job_id: UUID, db: AsyncSession = Depends(get_db)):
    """Return one job by id; 404 if unknown."""
    job = await db.get(Job, job_id)
    if not job:
        raise HTTPException(404, "Job not found")
    return job


@router.get("/jobs/{job_id}/clips", response_model=list[ClipResponse])
async def get_job_clips(job_id: UUID, db: AsyncSession = Depends(get_db)):
    """Return a job's clips, best virality score first."""
    job = await db.get(Job, job_id)
    if not job:
        raise HTTPException(404, "Job not found")

    result = await db.execute(
        select(Clip)
        .where(Clip.job_id == job_id)
        .order_by(Clip.virality_score.desc())
    )
    clips = result.scalars().all()
    # Compute duration manually since it's a generated column
    for clip in clips:
        clip.duration = clip.end_time - clip.start_time
    return clips


@router.get("/jobs/{job_id}/progress")
async def job_progress_sse(job_id: UUID, db: AsyncSession = Depends(get_db)):
    """Stream job progress as server-sent events until terminal state."""
    job = await db.get(Job, job_id)
    if not job:
        raise HTTPException(404, "Job not found")

    async def event_stream():
        import redis.asyncio as aioredis

        from app.database import async_session

        r = aioredis.from_url(settings.redis_url)
        pubsub = r.pubsub()
        # Subscribe before reading current state so no update is missed
        # between the snapshot and the listen loop.
        await pubsub.subscribe(f"job:{job_id}:progress")
        try:
            # Re-read job state with a generator-owned session: the
            # request-scoped session may already be closed by the time the
            # response starts streaming.
            async with async_session() as session:
                current = await session.get(Job, job_id)

            yield {
                "event": "progress",
                "data": json.dumps({
                    "status": current.status,
                    "progress": current.progress,
                    "stage_message": current.stage_message,
                }),
            }
            if current.status in ("complete", "failed"):
                return

            while True:
                msg = await pubsub.get_message(
                    ignore_subscribe_messages=True, timeout=1.0
                )
                if msg and msg["type"] == "message":
                    data = json.loads(msg["data"])
                    yield {"event": "progress", "data": json.dumps(data)}
                    if data.get("status") in ("complete", "failed"):
                        break
                await asyncio.sleep(0.5)
        finally:
            await pubsub.unsubscribe()
            # aclose() is the non-deprecated close method on redis.asyncio.
            await pubsub.aclose()
            await r.aclose()

    return EventSourceResponse(event_stream())
# --- backend/app/api/routes/renders.py ---
"""Render endpoints: queue renders of extracted clips and serve results."""

from uuid import UUID

from arq import create_pool
from arq.connections import RedisSettings
from fastapi import APIRouter, Depends, HTTPException
from fastapi.responses import FileResponse
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession

from app.config import settings
from app.database import get_db
from app.models import Clip, RenderRequest
from app.schemas import RenderCreate, RenderResponse, BulkRenderCreate

router = APIRouter()


def _redis_settings() -> RedisSettings:
    # arq parses the DSN itself; keeps this consistent with jobs.py.
    return RedisSettings.from_dsn(settings.redis_url)


@router.post("/clips/{clip_id}/render", response_model=RenderResponse, status_code=201)
async def render_clip(
    clip_id: UUID,
    render_in: RenderCreate,
    db: AsyncSession = Depends(get_db),
):
    """Queue a render of one clip; 400 if extraction hasn't produced a file."""
    clip = await db.get(Clip, clip_id)
    if not clip:
        raise HTTPException(404, "Clip not found")
    if not clip.raw_clip_path:
        raise HTTPException(400, "Clip not yet extracted")

    render = RenderRequest(
        clip_id=clip_id,
        aspect_ratio=render_in.aspect_ratio,
        subtitle_style=render_in.subtitle_style,
        status="pending",
    )
    db.add(render)
    await db.commit()
    await db.refresh(render)

    pool = await create_pool(_redis_settings())
    try:
        await pool.enqueue_job("render_clip", str(render.id))
    finally:
        await pool.close()

    return render


@router.post("/jobs/{job_id}/render-all", response_model=list[RenderResponse], status_code=201)
async def render_all_clips(
    job_id: UUID,
    bulk_in: BulkRenderCreate,
    db: AsyncSession = Depends(get_db),
):
    """Queue renders for a batch of clips; unknown/unextracted ids are skipped."""
    renders = []
    for clip_id in bulk_in.clip_ids:
        clip = await db.get(Clip, clip_id)
        if not clip or not clip.raw_clip_path:
            continue

        render = RenderRequest(
            clip_id=clip_id,
            aspect_ratio=bulk_in.aspect_ratio,
            subtitle_style=bulk_in.subtitle_style,
            status="pending",
        )
        db.add(render)
        renders.append(render)

    # One commit for the whole batch instead of one round-trip per clip.
    await db.commit()

    pool = await create_pool(_redis_settings())
    try:
        for render in renders:
            await db.refresh(render)
            await pool.enqueue_job("render_clip", str(render.id))
    finally:
        await pool.close()

    return renders


@router.get("/renders/{render_id}", response_model=RenderResponse)
async def get_render(render_id: UUID, db: AsyncSession = Depends(get_db)):
    """Return one render request by id; 404 if unknown."""
    render = await db.get(RenderRequest, render_id)
    if not render:
        raise HTTPException(404, "Render not found")
    return render


@router.get("/renders/{render_id}/download")
async def download_render(render_id: UUID, db: AsyncSession = Depends(get_db)):
    """Download a finished render as an mp4 attachment."""
    render = await db.get(RenderRequest, render_id)
    if not render:
        raise HTTPException(404, "Render not found")
    if render.status != "complete" or not render.output_path:
        raise HTTPException(400, "Render not complete")

    clip = await db.get(Clip, render.clip_id)
    # ':' is illegal in most filesystems/Content-Disposition filenames.
    filename = (
        f"{clip.title}_{render.aspect_ratio.replace(':', 'x')}.mp4"
        if clip
        else "clip.mp4"
    )

    return FileResponse(
        render.output_path,
        media_type="video/mp4",
        filename=filename,
    )


# --- backend/app/config.py ---
"""Application settings, loaded from environment / .env (pydantic-settings)."""

from pydantic_settings import BaseSettings


class Settings(BaseSettings):
    """All runtime configuration; defaults match docker-compose service names."""

    # Database
    database_url: str = "postgresql+asyncpg://clipforge:changeme_clipforge_2025@postgres:5432/clipforge"

    # Redis
    redis_url: str = "redis://redis:6379/0"

    # Whisper (self-hosted transcription endpoint)
    whisper_api_url: str = "https://whisper.jeffemmett.com"
    whisper_model: str = "deepdml/faster-whisper-large-v3-turbo-ct2"

    # Ollama (local LLM for clip selection)
    ollama_url: str = "http://host.docker.internal:11434"
    ollama_model: str = "llama3.1:8b"

    # Storage paths (inside the container)
    media_dir: str = "/data/media"
    clips_dir: str = "/data/clips"
    renders_dir: str = "/data/renders"

    # yt-dlp
    ytdlp_cookies_file: str = ""
    max_video_duration: int = 7200  # seconds; reject longer source videos

    # Processing
    max_concurrent_jobs: int = 2
    clip_min_duration: int = 15   # seconds
    clip_max_duration: int = 90   # seconds
    target_clips: int = 5         # clips requested from the LLM per video

    model_config = {"env_file": ".env", "extra": "ignore"}


settings = Settings()


# --- backend/app/database.py ---
"""Async SQLAlchemy engine/session setup and the FastAPI session dependency."""

from collections.abc import AsyncIterator

from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker

from app.config import settings

engine = create_async_engine(settings.database_url, echo=False)
# expire_on_commit=False keeps ORM objects readable after commit, which the
# route handlers rely on when serializing responses.
async_session = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)


async def get_db() -> AsyncIterator[AsyncSession]:
    """Yield a request-scoped session (async-generator dependency)."""
    async with async_session() as session:
        yield session


# --- backend/app/main.py ---
"""FastAPI application entry point."""

from contextlib import asynccontextmanager

from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from app.api.routes import jobs, clips, renders


@asynccontextmanager
async def lifespan(app: FastAPI):
    # No startup/shutdown work yet; placeholder for future resources.
    yield


app = FastAPI(
    title="ClipForge",
    description="Self-hosted AI video clipper",
    version="0.1.0",
    lifespan=lifespan,
)

# NOTE(review): wildcard origins combined with allow_credentials is rejected
# by the CORS spec for credentialed requests — confirm whether the frontend
# sends cookies; if so, list explicit origins instead of "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

app.include_router(jobs.router, prefix="/api")
app.include_router(clips.router, prefix="/api")
app.include_router(renders.router, prefix="/api")


@app.get("/health")
async def health():
    """Liveness probe for the reverse proxy / orchestrator."""
    return {"status": "ok", "service": "clipforge"}


# --- backend/app/models.py ---
"""SQLAlchemy ORM models: Job -> Clip -> RenderRequest."""

import uuid
from datetime import datetime

from sqlalchemy import (
    Column,
    DateTime,
    Enum,
    Float,
    ForeignKey,
    Index,
    String,
    Text,
    func,
)
from sqlalchemy.dialects.postgresql import JSONB, UUID
from sqlalchemy.orm import DeclarativeBase, relationship


class Base(DeclarativeBase):
    pass


class Job(Base):
    """One processing run: source video -> transcript -> selected clips."""

    __tablename__ = "jobs"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    source_type = Column(Enum("youtube", "upload", name="source_type"), nullable=False)
    source_url = Column(Text)        # set for youtube jobs
    source_filename = Column(Text)   # set for upload jobs
    title = Column(Text)
    duration = Column(Float)         # seconds
    # Mirrors the pipeline stages in workers/tasks.py.
    status = Column(
        Enum(
            "pending",
            "downloading",
            "transcribing",
            "analyzing",
            "extracting",
            "complete",
            "failed",
            name="job_status",
        ),
        nullable=False,
        default="pending",
    )
    progress = Column(Float, nullable=False, default=0.0)  # 0.0 - 1.0 (or %)
    stage_message = Column(Text)
    error_message = Column(Text)
    media_path = Column(Text)        # downloaded/uploaded source file
    transcript = Column(JSONB)       # {text, words, segments, ...}
    scene_boundaries = Column(JSONB)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now()
    )

    clips = relationship("Clip", back_populates="job", cascade="all, delete-orphan")


class Clip(Base):
    """An AI-selected segment of a job's source video."""

    __tablename__ = "clips"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    job_id = Column(
        UUID(as_uuid=True), ForeignKey("jobs.id", ondelete="CASCADE"), nullable=False
    )
    title = Column(Text, nullable=False)
    start_time = Column(Float, nullable=False)  # seconds into source video
    end_time = Column(Float, nullable=False)
    virality_score = Column(Float, nullable=False, default=0.0)  # 0-100 from LLM
    category = Column(Text)
    reasoning = Column(Text)
    transcript_segment = Column(Text)
    thumbnail_path = Column(Text)
    raw_clip_path = Column(Text)  # set once FFmpeg extraction has run
    created_at = Column(DateTime(timezone=True), server_default=func.now())

    job = relationship("Job", back_populates="clips")
    renders = relationship(
        "RenderRequest", back_populates="clip", cascade="all, delete-orphan"
    )


class RenderRequest(Base):
    """A request to render a clip into a final aspect-ratio/subtitle style."""

    __tablename__ = "render_requests"

    id = Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4)
    clip_id = Column(
        UUID(as_uuid=True), ForeignKey("clips.id", ondelete="CASCADE"), nullable=False
    )
    aspect_ratio = Column(
        Enum("16:9", "9:16", "1:1", "4:5", name="aspect_ratio"),
        nullable=False,
        default="9:16",
    )
    subtitle_style = Column(String, nullable=False, default="tiktok")
    status = Column(
        Enum("pending", "rendering", "complete", "failed", name="render_status"),
        nullable=False,
        default="pending",
    )
    progress = Column(Float, nullable=False, default=0.0)
    output_path = Column(Text)
    error_message = Column(Text)
    created_at = Column(DateTime(timezone=True), server_default=func.now())
    updated_at = Column(
        DateTime(timezone=True), server_default=func.now(), onupdate=func.now()
    )

    clip = relationship("Clip", back_populates="renders")
# --- backend/app/schemas.py ---
"""Pydantic request/response schemas for the ClipForge API."""

from datetime import datetime
from typing import Optional
from uuid import UUID

from pydantic import BaseModel, Field

# Allowed output aspect ratios, shared by both render schemas.
_ASPECT_PATTERN = "^(16:9|9:16|1:1|4:5)$"


# --- Job Schemas ---

class JobCreate(BaseModel):
    """Payload for creating a job from a URL or marking an upload."""

    source_type: str = Field(..., pattern="^(youtube|upload)$")
    source_url: Optional[str] = None


class JobResponse(BaseModel):
    """Full job state as returned by the jobs endpoints."""

    model_config = {"from_attributes": True}

    id: UUID
    source_type: str
    source_url: Optional[str]
    source_filename: Optional[str]
    title: Optional[str]
    duration: Optional[float]
    status: str
    progress: float
    stage_message: Optional[str]
    error_message: Optional[str]
    created_at: datetime
    updated_at: datetime


class JobProgress(BaseModel):
    """Single SSE progress event body."""

    status: str
    progress: float
    stage_message: Optional[str]


# --- Clip Schemas ---

class ClipResponse(BaseModel):
    """Clip as returned by the clips endpoints; duration is filled in by routes."""

    model_config = {"from_attributes": True}

    id: UUID
    job_id: UUID
    title: str
    start_time: float
    end_time: float
    duration: Optional[float] = None
    virality_score: float
    category: Optional[str]
    reasoning: Optional[str]
    transcript_segment: Optional[str]
    thumbnail_path: Optional[str]
    raw_clip_path: Optional[str]
    created_at: datetime


# --- Render Schemas ---

class RenderCreate(BaseModel):
    """Options for rendering one clip."""

    aspect_ratio: str = Field(default="9:16", pattern=_ASPECT_PATTERN)
    subtitle_style: str = Field(default="tiktok")


class RenderResponse(BaseModel):
    """Render request state as returned by the renders endpoints."""

    model_config = {"from_attributes": True}

    id: UUID
    clip_id: UUID
    aspect_ratio: str
    subtitle_style: str
    status: str
    progress: float
    output_path: Optional[str]
    error_message: Optional[str]
    created_at: datetime
    updated_at: datetime


class BulkRenderCreate(BaseModel):
    """Options for rendering several clips with one request."""

    clip_ids: list[UUID]
    aspect_ratio: str = Field(default="9:16", pattern=_ASPECT_PATTERN)
    subtitle_style: str = Field(default="tiktok")
# --- backend/app/services/ai_analysis.py ---
"""AI clip analysis using Ollama (local LLM)."""

import json
import logging
import re

import httpx

from app.config import settings

logger = logging.getLogger(__name__)

SYSTEM_PROMPT = """You are a viral video clip analyst. Given a video transcript with timestamps, identify the best short clips that would perform well on social media (TikTok, YouTube Shorts, Instagram Reels).

For each clip, provide:
- A catchy title (max 60 chars)
- Start and end timestamps (in seconds)
- Virality score (0-100)
- Category (one of: hook, story, insight, humor, emotional, controversial, educational)
- Brief reasoning for why this clip would go viral

Rules:
- Clips should be {min_dur}-{max_dur} seconds long
- Identify {target} clips, ranked by virality potential
- Clips should start and end at natural sentence boundaries
- Prefer clips with strong hooks in the first 3 seconds
- Look for emotional peaks, surprising statements, quotable moments
- Avoid clips that start mid-sentence or end abruptly

Respond ONLY with valid JSON in this exact format:
{{
  "clips": [
    {{
      "title": "Clip title here",
      "start_time": 12.5,
      "end_time": 45.2,
      "virality_score": 85,
      "category": "hook",
      "reasoning": "Why this clip would perform well"
    }}
  ]
}}"""


async def analyze_transcript(
    transcript: dict,
    video_title: str = "",
    video_duration: float = 0,
) -> list[dict]:
    """Use Ollama to identify the best clips from a transcript.

    Args:
        transcript: dict with 'text', 'words', 'segments' from transcription service
        video_title: original video title for context
        video_duration: total video duration in seconds

    Returns:
        List of clip dicts with title, start_time, end_time, virality_score,
        category, reasoning — sorted by virality score descending.
    """
    # Build a timestamped transcript so the LLM can cite real time ranges.
    text = transcript.get("text", "")
    segments = transcript.get("segments", [])

    if segments:
        timestamped = "\n".join(
            f"[{_fmt_time(s.get('start', 0))} - {_fmt_time(s.get('end', 0))}] "
            f"{s.get('text', '').strip()}"
            for s in segments
        )
    else:
        # Fall back to plain text when no segment timings are available.
        timestamped = text

    system = SYSTEM_PROMPT.format(
        min_dur=settings.clip_min_duration,
        max_dur=settings.clip_max_duration,
        target=settings.target_clips,
    )

    user_prompt = f"""Video Title: {video_title}
Video Duration: {_fmt_time(video_duration)}

Transcript:
{timestamped}

Identify the {settings.target_clips} best viral clips from this transcript."""

    logger.info(f"Sending transcript to Ollama ({settings.ollama_model})...")

    async with httpx.AsyncClient(timeout=300.0) as client:
        response = await client.post(
            f"{settings.ollama_url}/api/chat",
            json={
                "model": settings.ollama_model,
                "messages": [
                    {"role": "system", "content": system},
                    {"role": "user", "content": user_prompt},
                ],
                "stream": False,
                "options": {
                    # Low temperature for consistent, parseable JSON output.
                    "temperature": 0.3,
                    "num_predict": 4096,
                },
            },
        )
        response.raise_for_status()
        result = response.json()

    content = result.get("message", {}).get("content", "")
    clips = _parse_clips(content, video_duration)

    logger.info(f"AI identified {len(clips)} clips")
    return clips


def _parse_clips(content: str, video_duration: float) -> list[dict]:
    """Parse the LLM response into a clip list, tolerating imperfect JSON.

    Entries with non-numeric timestamps/scores or empty/inverted time ranges
    (including ranges emptied by clamping to the video bounds) are dropped.
    """
    # Extract the outermost JSON object from the response text.
    json_match = re.search(r"\{[\s\S]*\}", content)
    if not json_match:
        logger.error(f"No JSON found in LLM response: {content[:200]}")
        return []

    try:
        data = json.loads(json_match.group())
    except json.JSONDecodeError:
        # Strip trailing commas — the most common LLM JSON defect.
        fixed = re.sub(r",\s*}", "}", json_match.group())
        fixed = re.sub(r",\s*]", "]", fixed)
        try:
            data = json.loads(fixed)
        except json.JSONDecodeError:
            logger.error(f"Failed to parse LLM JSON: {content[:200]}")
            return []

    clips = []
    for c in data.get("clips", []):
        try:
            start = float(c.get("start_time", 0))
            end = float(c.get("end_time", 0))
            score = float(c.get("virality_score", 50))
        except (TypeError, ValueError):
            # Model emitted a non-numeric field; skip this entry rather
            # than crash the whole analysis.
            continue

        # Clamp to video bounds, then re-validate the range: clamping can
        # itself produce end <= start (e.g. both past the video end).
        start = max(start, 0.0)
        if video_duration > 0:
            end = min(end, video_duration)
        if end <= start:
            continue

        clips.append({
            "title": str(c.get("title", "Untitled"))[:100],
            "start_time": round(start, 2),
            "end_time": round(end, 2),
            "virality_score": max(0.0, min(100.0, score)),
            "category": str(c.get("category", "general")),
            "reasoning": str(c.get("reasoning", "")),
        })

    # Sort by virality score descending
    clips.sort(key=lambda x: x["virality_score"], reverse=True)
    return clips


def _fmt_time(seconds: float) -> str:
    """Format seconds as MM:SS, or H:MM:SS past the hour mark.

    Sources may run up to max_video_duration (2h), so hours must be handled.
    """
    m, s = divmod(int(seconds), 60)
    h, m = divmod(m, 60)
    if h:
        return f"{h}:{m:02d}:{s:02d}"
    return f"{m:02d}:{s:02d}"


# --- backend/app/services/clip_extraction.py ---
"""Clip extraction service using FFmpeg."""

import asyncio
import json
import logging
import os

from app.config import settings

logger = logging.getLogger(__name__)


def _ensure_parent_dir(path: str) -> None:
    """Create the parent directory of *path* if it has one.

    os.path.dirname returns "" for bare filenames, which os.makedirs rejects.
    """
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


async def extract_clip(
    video_path: str,
    start_time: float,
    end_time: float,
    output_path: str,
) -> str:
    """Extract a clip from video using FFmpeg stream copy (instant, no re-encode).

    Args:
        video_path: path to source video
        start_time: clip start in seconds
        end_time: clip end in seconds
        output_path: where to write the clip

    Returns:
        output_path

    Raises:
        RuntimeError: if FFmpeg exits non-zero.
    """
    _ensure_parent_dir(output_path)

    duration = end_time - start_time

    # Input-side -ss with stream copy is fast but snaps to the nearest
    # keyframe, so clip starts may be off by up to a GOP length.
    cmd = [
        "ffmpeg",
        "-ss", str(start_time),
        "-i", video_path,
        "-t", str(duration),
        "-c", "copy",
        "-avoid_negative_ts", "make_zero",
        "-y",
        output_path,
    ]

    logger.info(
        f"Extracting clip: {start_time:.1f}s - {end_time:.1f}s -> {output_path}"
    )

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        raise RuntimeError(f"FFmpeg clip extraction failed: {stderr.decode()}")

    size_mb = os.path.getsize(output_path) / (1024 * 1024)
    logger.info(f"Extracted clip: {output_path} ({size_mb:.1f} MB)")
    return output_path


async def extract_thumbnail(
    video_path: str,
    timestamp: float,
    output_path: str,
) -> str:
    """Extract a single frame at *timestamp* as a JPEG/PNG thumbnail."""
    _ensure_parent_dir(output_path)

    cmd = [
        "ffmpeg",
        "-ss", str(timestamp),
        "-i", video_path,
        "-vframes", "1",
        "-q:v", "2",
        "-y",
        output_path,
    ]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        raise RuntimeError(f"FFmpeg thumbnail extraction failed: {stderr.decode()}")

    return output_path


async def get_video_duration(video_path: str) -> float:
    """Get video duration in seconds using ffprobe; 0.0 on any failure."""
    cmd = [
        "ffprobe",
        "-v", "quiet",
        "-print_format", "json",
        "-show_format",
        video_path,
    ]

    proc = await asyncio.create_subprocess_exec(
        *cmd,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, _ = await proc.communicate()

    if proc.returncode != 0:
        return 0.0

    try:
        data = json.loads(stdout.decode())
        return float(data.get("format", {}).get("duration", 0))
    except (json.JSONDecodeError, TypeError, ValueError):
        # Malformed/missing probe output counts as "unknown", not an error.
        return 0.0


# --- backend/app/services/download.py ---
"""Video download service using yt-dlp."""

import asyncio
import os
import re
import logging
from dataclasses import dataclass
from typing import Optional

import yt_dlp

from app.config import settings

logger = logging.getLogger(__name__)

COOKIES_FILE = settings.ytdlp_cookies_file


@dataclass
class VideoInfo:
    """Summary of a completed download."""

    title: str
    duration: float       # seconds
    video_path: str       # local file path of the merged mp4
    video_id: str


def extract_video_id(url: str) -> Optional[str]:
    """Return the 11-char YouTube video id from a watch/share/shorts URL."""
    patterns = [
        r"(?:youtube\.com/watch\?v=|youtu\.be/|youtube\.com/embed/)([a-zA-Z0-9_-]{11})",
        r"youtube\.com/shorts/([a-zA-Z0-9_-]{11})",
    ]
    for pattern in patterns:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None


def _base_opts() -> dict:
    """Common yt-dlp options; attaches the cookies file when configured."""
    opts = {"quiet": True, "no_warnings": True}
    if COOKIES_FILE and os.path.exists(COOKIES_FILE):
        opts["cookiefile"] = COOKIES_FILE
    return opts


async def get_video_metadata(url: str) -> dict:
    """Get video metadata without downloading."""
    opts = _base_opts()
    opts["extract_flat"] = False

    def _probe() -> dict:
        # yt-dlp is blocking; keep it off the event loop.
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=False)

    info = await asyncio.to_thread(_probe)
    return {
        "title": info.get("title", "Unknown"),
        "duration": info.get("duration", 0),
        "video_id": info.get("id", ""),
    }


async def download_video(url: str, output_dir: str) -> VideoInfo:
    """Download video from YouTube URL. Downloads video+audio for clip extraction.

    Raises:
        ValueError: if the video exceeds settings.max_video_duration.
            Checked against metadata *before* downloading (and re-checked
            after, since metadata durations can be missing or approximate).
    """
    os.makedirs(output_dir, exist_ok=True)

    # Reject over-long videos before spending bandwidth on the download.
    meta = await get_video_metadata(url)
    if meta["duration"] and meta["duration"] > settings.max_video_duration:
        raise ValueError(
            f"Video is {meta['duration']}s, max is {settings.max_video_duration}s"
        )

    video_id = extract_video_id(url) or "video"
    output_template = os.path.join(output_dir, f"{video_id}.%(ext)s")

    opts = _base_opts()
    opts.update({
        # Download best video+audio merged to mp4
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "merge_output_format": "mp4",
        "outtmpl": output_template,
    })

    def _download() -> dict:
        # Blocking network + disk work; run in a worker thread so the arq
        # worker's event loop (heartbeats, other jobs) keeps running.
        with yt_dlp.YoutubeDL(opts) as ydl:
            return ydl.extract_info(url, download=True)

    info = await asyncio.to_thread(_download)

    video_path = os.path.join(output_dir, f"{video_id}.mp4")
    if not os.path.exists(video_path):
        # Find whatever file was downloaded (merge may pick another ext).
        for f in os.listdir(output_dir):
            if f.startswith(video_id) and not f.endswith(".part"):
                video_path = os.path.join(output_dir, f)
                break

    duration = info.get("duration", 0)
    if duration > settings.max_video_duration:
        raise ValueError(
            f"Video is {duration}s, max is {settings.max_video_duration}s"
        )

    logger.info(f"Downloaded: {info.get('title')} ({duration}s) -> {video_path}")
    return VideoInfo(
        title=info.get("title", "Unknown"),
        duration=duration,
        video_path=video_path,
        video_id=video_id,
    )


async def extract_audio(video_path: str, output_path: str) -> str:
    """Extract audio from video file for transcription (mp3, VBR q4)."""
    proc = await asyncio.create_subprocess_exec(
        "ffmpeg", "-i", video_path,
        "-vn", "-acodec", "libmp3lame", "-q:a", "4",
        "-y", output_path,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    _, stderr = await proc.communicate()

    if proc.returncode != 0:
        raise RuntimeError(f"FFmpeg audio extraction failed: {stderr.decode()}")

    logger.info(f"Extracted audio: {output_path}")
    return output_path
"""Transcription service using self-hosted faster-whisper-server."""

import logging
import os

logger = logging.getLogger(__name__)


async def transcribe(audio_path: str) -> dict:
    """Transcribe audio file using local Whisper API.

    Returns dict with:
    - text: full transcript text
    - words: list of {word, start, end} with word-level timestamps
    - segments: raw segment list as returned by the server
    - language: detected language
    - duration: audio duration
    """
    # Imported lazily so the pure helper below (get_transcript_segment)
    # remains importable without the HTTP client / settings stack.
    import httpx

    from app.config import settings

    url = f"{settings.whisper_api_url}/v1/audio/transcriptions"

    async with httpx.AsyncClient(timeout=900.0) as client:
        with open(audio_path, "rb") as f:
            files = {"file": (os.path.basename(audio_path), f, "audio/mpeg")}
            data = {
                "model": settings.whisper_model,
                "response_format": "verbose_json",
                # OpenAI-compatible flag asking for per-word timestamps.
                "timestamp_granularities[]": "word",
            }

            logger.info(f"Transcribing {audio_path} via {settings.whisper_api_url}")
            response = await client.post(url, files=files, data=data)
            response.raise_for_status()
            result = response.json()

    text = result.get("text", "").strip()
    words = result.get("words", [])
    segments = result.get("segments", [])

    # Build word-level timestamps
    word_timestamps = []
    if words:
        for w in words:
            word_timestamps.append({
                "word": w.get("word", ""),
                "start": w.get("start", 0.0),
                "end": w.get("end", 0.0),
            })
    elif segments:
        # Fall back to segment-level if word-level not available.
        # BUG FIX: the previous code only read seg["words"], which is
        # typically absent in exactly the case this branch handles, so the
        # fallback produced no timestamps at all. When a segment carries no
        # per-word entries, approximate with one entry spanning the segment.
        for seg in segments:
            seg_words = seg.get("words") or []
            if seg_words:
                for w in seg_words:
                    word_timestamps.append({
                        "word": w.get("word", ""),
                        "start": w.get("start", 0.0),
                        "end": w.get("end", 0.0),
                    })
            else:
                word_timestamps.append({
                    "word": seg.get("text", "").strip(),
                    "start": seg.get("start", 0.0),
                    "end": seg.get("end", 0.0),
                })

    logger.info(
        f"Transcription complete: {len(text)} chars, "
        f"{len(word_timestamps)} word timestamps"
    )

    return {
        "text": text,
        "words": word_timestamps,
        "segments": segments,
        "language": result.get("language", "en"),
        "duration": result.get("duration", 0.0),
    }


def get_transcript_segment(words: list[dict], start: float, end: float) -> str:
    """Extract transcript text for a given time range.

    Words whose timestamps fall within [start - 0.5, end + 0.5] are joined;
    the half-second slack tolerates rounding at clip boundaries.
    """
    segment_words = [
        w["word"]
        for w in words
        if w["start"] >= start - 0.5 and w["end"] <= end + 0.5
    ]
    return " ".join(segment_words).strip()
# --- backend/app/worker.py ---
"""ARQ worker entry point."""

import logging

from arq import cron  # noqa: F401  (kept for future scheduled-task wiring)
from arq.connections import RedisSettings

from app.config import settings
from app.workers.tasks import process_job, render_clip

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)


def _redis_settings() -> RedisSettings:
    """Build arq RedisSettings from the configured redis:// URL.

    BUG FIX: the previous hand-rolled urlparse kept only host/port/db and
    silently dropped any password (or TLS scheme) embedded in REDIS_URL;
    RedisSettings.from_dsn parses all of these.
    """
    return RedisSettings.from_dsn(settings.redis_url)


class WorkerSettings:
    """arq worker configuration (consumed by arq's worker runner)."""
    functions = [process_job, render_clip]
    redis_settings = _redis_settings()
    max_jobs = settings.max_concurrent_jobs  # concurrent pipeline jobs
    job_timeout = 3600  # 1 hour max per job
    keep_result = 3600
    health_check_interval = 30


if __name__ == "__main__":
    # BUG FIX: docker-compose starts this service with
    # `python -m app.worker`, which previously only imported the module and
    # exited. Actually run the worker under that invocation.
    from arq.worker import run_worker

    run_worker(WorkerSettings)


# --- backend/app/workers/tasks.py ---
"""Pipeline orchestration tasks for ARQ worker."""

import json
import logging
import os
import uuid

import redis.asyncio as aioredis
from sqlalchemy import select  # noqa: F401
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker

from app.config import settings
from app.models import Job, Clip
from app.services import download, transcription, ai_analysis, clip_extraction

logger = logging.getLogger(__name__)

# Process-wide engine/session factory, created lazily on first use.
# BUG FIX: the previous implementation created a brand-new AsyncEngine
# (i.e. a fresh connection pool) on every call and never disposed it,
# leaking database connections over the worker's lifetime.
_engine = None
_session_factory = None


async def _get_session() -> AsyncSession:
    """Return a new AsyncSession backed by the shared process-wide engine."""
    global _engine, _session_factory
    if _session_factory is None:
        _engine = create_async_engine(settings.database_url, echo=False)
        _session_factory = async_sessionmaker(
            _engine, class_=AsyncSession, expire_on_commit=False
        )
    return _session_factory()
async def _publish_progress(
    redis: aioredis.Redis,
    job_id: str,
    status: str,
    progress: float,
    stage_message: str,
):
    """Publish progress update via Redis pub/sub.

    Payload mirrors the job row's status fields; consumed by the API's
    SSE progress stream for channel "job:{job_id}:progress".
    """
    data = {
        "status": status,
        "progress": round(progress, 2),
        "stage_message": stage_message,
    }
    await redis.publish(f"job:{job_id}:progress", json.dumps(data))


async def _update_job(
    db: AsyncSession,
    job: Job,
    status: str,
    progress: float,
    stage_message: str,
    **kwargs,
):
    """Update job in database; extra kwargs are set as column values."""
    job.status = status
    job.progress = progress
    job.stage_message = stage_message
    for k, v in kwargs.items():
        setattr(job, k, v)
    await db.commit()


async def process_job(ctx: dict, job_id: str):
    """Main pipeline: download → transcribe → AI analysis → extract clips."""
    # Reuse the worker's pooled Redis connection when arq provides one;
    # otherwise open our own and close it in the finally block.
    # BUG FIX: the ad-hoc connection was previously never closed.
    r = ctx.get("redis")
    owns_redis = r is None
    if owns_redis:
        r = aioredis.from_url(settings.redis_url)

    db = await _get_session()
    job = None        # set once loaded; the except handler checks this
    audio_path = None  # temp file, removed in finally

    try:
        job = await db.get(Job, uuid.UUID(job_id))
        if not job:
            logger.error(f"Job {job_id} not found")
            return

        logger.info(f"Processing job {job_id}: {job.source_type}")

        # === STAGE 1: DOWNLOAD ===
        await _update_job(db, job, "downloading", 0.05, "Downloading video...")
        await _publish_progress(r, job_id, "downloading", 0.05, "Downloading video...")

        job_media_dir = os.path.join(settings.media_dir, job_id)
        os.makedirs(job_media_dir, exist_ok=True)

        if job.source_type == "youtube":
            video_info = await download.download_video(job.source_url, job_media_dir)
            job.title = video_info.title
            job.duration = video_info.duration
            job.media_path = video_info.video_path
        elif job.media_path:
            # Uploaded file - get duration
            duration = await clip_extraction.get_video_duration(job.media_path)
            job.duration = duration
            if not job.title:
                job.title = job.source_filename or "Uploaded Video"
        else:
            raise ValueError("No video source available")

        await db.commit()
        await _publish_progress(
            r, job_id, "downloading", 0.20,
            f"Downloaded: {job.title} ({job.duration:.0f}s)"
        )

        # === STAGE 2: TRANSCRIBE ===
        await _update_job(
            db, job, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )
        await _publish_progress(
            r, job_id, "transcribing", 0.25,
            "Extracting audio and transcribing..."
        )

        # Extract audio for transcription
        audio_path = os.path.join(job_media_dir, "audio.mp3")
        await download.extract_audio(job.media_path, audio_path)

        await _publish_progress(
            r, job_id, "transcribing", 0.30,
            "Transcribing with Whisper..."
        )

        transcript = await transcription.transcribe(audio_path)
        job.transcript = transcript
        await db.commit()

        word_count = len(transcript.get("words", []))
        await _publish_progress(
            r, job_id, "transcribing", 0.50,
            f"Transcription complete: {word_count} words"
        )

        # === STAGE 3: AI ANALYSIS ===
        await _update_job(
            db, job, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )
        await _publish_progress(
            r, job_id, "analyzing", 0.55,
            "AI analyzing transcript for viral clips..."
        )

        clips_data = await ai_analysis.analyze_transcript(
            transcript=transcript,
            video_title=job.title or "",
            video_duration=job.duration or 0,
        )

        if not clips_data:
            raise ValueError("AI analysis returned no clips")

        await _publish_progress(
            r, job_id, "analyzing", 0.70,
            f"Found {len(clips_data)} potential clips"
        )

        # === STAGE 4: EXTRACT CLIPS ===
        await _update_job(
            db, job, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )
        await _publish_progress(
            r, job_id, "extracting", 0.75,
            f"Extracting {len(clips_data)} clips..."
        )

        clips_dir = os.path.join(settings.clips_dir, job_id)
        os.makedirs(clips_dir, exist_ok=True)

        for i, cd in enumerate(clips_data):
            clip_filename = f"clip_{i:02d}.mp4"
            clip_path = os.path.join(clips_dir, clip_filename)
            thumb_path = os.path.join(clips_dir, f"thumb_{i:02d}.jpg")

            # Extract the clip
            await clip_extraction.extract_clip(
                video_path=job.media_path,
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                output_path=clip_path,
            )

            # Extract thumbnail at 25% into the clip; a missing thumbnail
            # is cosmetic, so failures are tolerated.
            thumb_time = cd["start_time"] + (cd["end_time"] - cd["start_time"]) * 0.25
            try:
                await clip_extraction.extract_thumbnail(
                    video_path=job.media_path,
                    timestamp=thumb_time,
                    output_path=thumb_path,
                )
            except Exception:
                thumb_path = None

            # Get transcript segment for this clip
            segment_text = transcription.get_transcript_segment(
                transcript.get("words", []),
                cd["start_time"],
                cd["end_time"],
            )

            # Save clip to database
            clip = Clip(
                job_id=job.id,
                title=cd["title"],
                start_time=cd["start_time"],
                end_time=cd["end_time"],
                virality_score=cd["virality_score"],
                category=cd["category"],
                reasoning=cd["reasoning"],
                transcript_segment=segment_text,
                thumbnail_path=thumb_path,
                raw_clip_path=clip_path,
            )
            db.add(clip)

            progress = 0.75 + (0.20 * (i + 1) / len(clips_data))
            await _publish_progress(
                r, job_id, "extracting", progress,
                f"Extracted clip {i + 1}/{len(clips_data)}: {cd['title']}"
            )

        await db.commit()

        # === COMPLETE ===
        await _update_job(
            db, job, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )
        await _publish_progress(
            r, job_id, "complete", 1.0,
            f"Done! {len(clips_data)} clips extracted"
        )

        logger.info(f"Job {job_id} complete: {len(clips_data)} clips")

    except Exception as e:
        logger.exception(f"Job {job_id} failed: {e}")
        try:
            # BUG FIX: `job` was referenced unconditionally here, raising
            # NameError when the initial lookup itself failed.
            if job is not None:
                await _update_job(
                    db, job, "failed", job.progress,
                    str(e), error_message=str(e),
                )
                await _publish_progress(
                    r, job_id, "failed", job.progress, f"Error: {e}"
                )
            else:
                await _publish_progress(r, job_id, "failed", 0.0, f"Error: {e}")
        except Exception:
            # Best-effort failure reporting; never mask the original error.
            pass
    finally:
        # Clean up the temp audio file on success AND failure (previously
        # failed jobs left audio.mp3 behind).
        if audio_path and os.path.exists(audio_path):
            try:
                os.remove(audio_path)
            except OSError:
                pass
        await db.close()
        if owns_redis:
            await r.aclose()


async def render_clip(ctx: dict, render_id: str):
    """Render a clip with subtitles and aspect ratio conversion.
    (Phase 3 - stub for now, copies raw clip)"""
    import shutil

    from app.models import RenderRequest

    db = await _get_session()
    render = None  # so the except handler knows whether the row was loaded
    try:
        render = await db.get(RenderRequest, uuid.UUID(render_id))
        if not render:
            return

        render.status = "rendering"
        render.progress = 0.5
        await db.commit()

        clip = await db.get(Clip, render.clip_id)
        if not clip or not clip.raw_clip_path:
            render.status = "failed"
            render.error_message = "Clip not found or not extracted"
            await db.commit()
            return

        # Phase 1: just copy the raw clip as-is
        # Phase 3 will add subtitle rendering + aspect ratio conversion
        renders_dir = os.path.join(settings.renders_dir, str(render.clip_id))
        os.makedirs(renders_dir, exist_ok=True)
        output = os.path.join(
            renders_dir,
            f"render_{render.aspect_ratio.replace(':', 'x')}.mp4"
        )
        shutil.copy2(clip.raw_clip_path, output)

        render.output_path = output
        render.status = "complete"
        render.progress = 1.0
        await db.commit()

        logger.info(f"Render {render_id} complete: {output}")
    except Exception as e:
        logger.exception(f"Render {render_id} failed: {e}")
        # BUG FIX: `render` was accessed unconditionally here; if the lookup
        # (or UUID parse) raised, that was a NameError inside the handler.
        if render is not None:
            try:
                render.status = "failed"
                render.error_message = str(e)
                await db.commit()
            except Exception:
                pass
    finally:
        await db.close()
# --- backend/requirements.txt ---
# Pinned runtime dependencies for the FastAPI backend and ARQ worker.
fastapi==0.115.6
uvicorn[standard]==0.34.0
sqlalchemy[asyncio]==2.0.36
asyncpg==0.30.0
pydantic==2.10.3
pydantic-settings==2.7.0
arq==0.26.1
redis==5.2.1
httpx==0.28.1
yt-dlp==2024.12.23
sse-starlette==2.2.1
python-multipart==0.0.20
aiofiles==24.1.0

-- --- database/init.sql ---
-- ClipForge Database Schema
-- NOTE: the enum values below must stay in sync with the string statuses
-- written by the Python worker (app/workers/tasks.py).

CREATE EXTENSION IF NOT EXISTS "uuid-ossp";

-- Job status enum
CREATE TYPE job_status AS ENUM (
    'pending',
    'downloading',
    'transcribing',
    'analyzing',
    'extracting',
    'complete',
    'failed'
);

-- Source type enum
CREATE TYPE source_type AS ENUM (
    'youtube',
    'upload'
);

-- Aspect ratio enum
CREATE TYPE aspect_ratio AS ENUM (
    '16:9',
    '9:16',
    '1:1',
    '4:5'
);

-- Render status enum
CREATE TYPE render_status AS ENUM (
    'pending',
    'rendering',
    'complete',
    'failed'
);

-- Jobs table: one row per submitted video (YouTube URL or upload).
CREATE TABLE jobs (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    source_type source_type NOT NULL,
    source_url TEXT,
    source_filename TEXT,
    title TEXT,
    duration FLOAT,
    status job_status NOT NULL DEFAULT 'pending',
    progress FLOAT NOT NULL DEFAULT 0.0,
    stage_message TEXT,
    error_message TEXT,
    media_path TEXT,
    transcript JSONB,
    scene_boundaries JSONB,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Clips table: AI-selected highlights extracted from a job's video.
-- duration is a generated column so it can never drift from start/end.
CREATE TABLE clips (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    job_id UUID NOT NULL REFERENCES jobs(id) ON DELETE CASCADE,
    title TEXT NOT NULL,
    start_time FLOAT NOT NULL,
    end_time FLOAT NOT NULL,
    duration FLOAT GENERATED ALWAYS AS (end_time - start_time) STORED,
    virality_score FLOAT NOT NULL DEFAULT 0.0,
    category TEXT,
    reasoning TEXT,
    transcript_segment TEXT,
    thumbnail_path TEXT,
    raw_clip_path TEXT,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Render requests table: per-clip render jobs (aspect ratio + subtitles).
CREATE TABLE render_requests (
    id UUID PRIMARY KEY DEFAULT uuid_generate_v4(),
    clip_id UUID NOT NULL REFERENCES clips(id) ON DELETE CASCADE,
    aspect_ratio aspect_ratio NOT NULL DEFAULT '9:16',
    subtitle_style TEXT NOT NULL DEFAULT 'tiktok',
    status render_status NOT NULL DEFAULT 'pending',
    progress FLOAT NOT NULL DEFAULT 0.0,
    output_path TEXT,
    error_message TEXT,
    created_at TIMESTAMP WITH TIME ZONE DEFAULT NOW(),
    updated_at TIMESTAMP WITH TIME ZONE DEFAULT NOW()
);

-- Indexes
CREATE INDEX idx_jobs_status ON jobs(status);
CREATE INDEX idx_jobs_created_at ON jobs(created_at DESC);
CREATE INDEX idx_clips_job_id ON clips(job_id);
CREATE INDEX idx_clips_virality ON clips(virality_score DESC);
CREATE INDEX idx_renders_clip_id ON render_requests(clip_id);
CREATE INDEX idx_renders_status ON render_requests(status);

-- Updated_at trigger: keeps updated_at current on every row update.
CREATE OR REPLACE FUNCTION update_updated_at()
RETURNS TRIGGER AS $$
BEGIN
    NEW.updated_at = NOW();
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

CREATE TRIGGER jobs_updated_at
    BEFORE UPDATE ON jobs
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

CREATE TRIGGER renders_updated_at
    BEFORE UPDATE ON render_requests
    FOR EACH ROW
    EXECUTE FUNCTION update_updated_at();

# --- docker-compose.yml ---
services:
  postgres:
    image: postgres:15-alpine
    restart: unless-stopped
    environment:
      POSTGRES_USER: ${POSTGRES_USER:-clipforge}
      POSTGRES_PASSWORD: ${POSTGRES_PASSWORD:-changeme_clipforge_2025}
      POSTGRES_DB: ${POSTGRES_DB:-clipforge}
    volumes:
      - postgres_data:/var/lib/postgresql/data
      # Schema applied only on first initialisation of the data volume.
      - ./database/init.sql:/docker-entrypoint-initdb.d/init.sql
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U ${POSTGRES_USER:-clipforge}"]
      interval: 5s
      timeout: 5s
      retries: 5

  redis:
    image: redis:7-alpine
    restart: unless-stopped
    command: redis-server --maxmemory 256mb --maxmemory-policy allkeys-lru
    volumes:
      - redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 5s
      timeout: 5s
      retries: 5

  backend:
    build:
      context: ./backend
      dockerfile: Dockerfile
    restart: unless-stopped
    env_file: .env
    environment:
      - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://clipforge:changeme_clipforge_2025@postgres:5432/clipforge}
      - REDIS_URL=${REDIS_URL:-redis://redis:6379/0}
    volumes:
      - media_data:/data
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    labels:
      # Exposed via an external Traefik instance at clip.jeffemmett.com.
      - "traefik.enable=true"
      - "traefik.http.routers.clipforge.rule=Host(`clip.jeffemmett.com`)"
      - "traefik.http.services.clipforge.loadbalancer.server.port=8000"
    networks:
      - default
      - traefik-public

  worker:
    build:
      context: ./backend
      dockerfile: Dockerfile
    restart: unless-stopped
    command: ["python", "-m", "app.worker"]
    env_file: .env
    environment:
      - DATABASE_URL=${DATABASE_URL:-postgresql+asyncpg://clipforge:changeme_clipforge_2025@postgres:5432/clipforge}
      - REDIS_URL=${REDIS_URL:-redis://redis:6379/0}
    volumes:
      - media_data:/data
    depends_on:
      postgres:
        condition: service_healthy
      redis:
        condition: service_healthy
    extra_hosts:
      # Lets the worker reach the host's Ollama at host.docker.internal:11434.
      - "host.docker.internal:host-gateway"

  frontend:
    build:
      context: ./frontend
      dockerfile: Dockerfile
    restart: unless-stopped
    labels:
      - "traefik.enable=false"
    networks:
      - default

volumes:
  postgres_data:
  redis_data:
  media_data:

networks:
  traefik-public:
    external: true

# --- frontend/Dockerfile ---
FROM nginx:alpine

# Placeholder frontend - Phase 4 will replace with React build
# NOTE(review): the echoed page markup appears stripped of HTML tags in this
# patch view; the string below is reproduced exactly as visible -- confirm
# against the original file before relying on it.
RUN echo 'ClipForge

ClipForge

Self-hosted AI video clipper

API: POST /api/jobs

Status: GET /api/jobs/{id}

Clips: GET /api/jobs/{id}/clips

Frontend coming in Phase 4

' > /usr/share/nginx/html/index.html

EXPOSE 80