"""Subtitle rendering service using FFmpeg ASS filter.

Generates word-by-word animated captions in various styles, then burns them
into the video with optional aspect ratio conversion.
"""

import asyncio
import contextlib
import logging
import os
import tempfile

logger = logging.getLogger(__name__)

# ASS script skeleton shared by every caption style.
# PlayResX/PlayResY are 1280x720 (matching the 1280x720 source).
# Both the styles and the events section need a "Format:" line naming their
# columns; "{styles}" and "{events}" are filled in by generate_ass().
ASS_HEADER = """[Script Info]
ScriptType: v4.00+
PlayResX: 1280
PlayResY: 720
WrapStyle: 0

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
{styles}

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
{events}"""

# Each style defines: font, colors (AABBGGRR format), border, alignment.
#
# "def" is the plain style; "highlight" (optional) is the style used for
# karaoke-animated lines. With \k/\kf tags a word is drawn in SecondaryColour
# until its fill starts and in PrimaryColour afterwards, so every Highlight
# style uses the Default primary colour as its SecondaryColour. That way words
# that have not been spoken yet look like the plain caption.
STYLES = {
    "tiktok": {
        "name": "TikTok",
        "def": "Style: Default,Arial,52,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,40,1",
        "highlight": "Style: Highlight,Arial,52,&H0000FFFF,&H00FFFFFF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,40,1",
        "words_per_group": 3,
    },
    "hormozi": {
        "name": "Hormozi",
        "def": "Style: Default,Impact,60,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,2,0,1,4,0,2,40,40,50,1",
        "highlight": "Style: Highlight,Impact,60,&H0000DDFF,&H00FFFFFF,&H00000000,&H80000000,-1,0,0,0,105,105,2,0,1,4,0,2,40,40,50,1",
        "words_per_group": 2,
    },
    "karaoke": {
        "name": "Karaoke",
        "def": "Style: Default,Arial,48,&H80FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,2,0,2,40,40,40,1",
        "highlight": "Style: Highlight,Arial,48,&H0000FF00,&H80FFFFFF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,2,0,2,40,40,40,1",
        "words_per_group": 4,
    },
    "minimal": {
        "name": "Minimal",
        "def": "Style: Default,Helvetica,40,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,0,2,40,40,30,1",
        "highlight": None,
        "words_per_group": 5,
    },
}

# Words that start/end within this many seconds outside the clip are kept.
_CLIP_EDGE_TOLERANCE = 0.3

# Output frame size (width, height) for each supported aspect ratio.
_RATIO_MAP = {
    "9:16": (720, 1280),
    "16:9": (1280, 720),
    "1:1": (720, 720),
    "4:5": (576, 720),
}


def _to_cs(seconds: float) -> int:
    """Convert seconds to whole centiseconds, clamping negatives to zero."""
    return max(0, int(round(seconds * 100)))


def _ts(seconds: float) -> str:
    """Format seconds as ASS timestamp H:MM:SS.cc.

    The value is rounded to centiseconds *before* being split into fields, so
    an input such as 59.999 becomes ``0:01:00.00`` and never ``0:00:60.00``.
    """
    total_seconds, cs = divmod(_to_cs(seconds), 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"


def _build_word_groups(words: list[dict], words_per_group: int) -> list[dict]:
    """Group consecutive words for display as caption lines.

    Args:
        words: list of {word, start, end} dicts, in display order.
        words_per_group: maximum number of words per caption line (>= 1).

    Returns:
        A list of {words, text, start, end} dicts, one per caption line.

    Raises:
        ValueError: if ``words_per_group`` is smaller than 1.
    """
    if words_per_group < 1:
        raise ValueError("words_per_group must be >= 1")

    groups = []
    for i in range(0, len(words), words_per_group):
        chunk = words[i : i + words_per_group]
        groups.append(
            {
                "words": chunk,
                "text": " ".join(w["word"].strip() for w in chunk),
                "start": chunk[0]["start"],
                "end": chunk[-1]["end"],
            }
        )
    return groups


def _karaoke_text(group: dict) -> str:
    """Build the karaoke-tagged text for one caption group.

    Karaoke tags are consumed back to back starting at the event start, so any
    silence before a word is emitted as an empty ``\\k`` syllable. Without it
    the highlight would run ahead of the audio after every pause. All timing is
    done in integer centiseconds to avoid accumulating truncation error.
    """
    cursor = _to_cs(group["start"])
    parts = []
    for w in group["words"]:
        # Clamp against the cursor so overlapping or unordered word timings
        # can never produce a negative duration.
        w_start = max(_to_cs(w["start"]), cursor)
        w_end = max(_to_cs(w["end"]), w_start)

        tags = ""
        gap = w_start - cursor
        if gap > 0:
            tags += f"{{\\k{gap}}}"
        # \kf (karaoke fill) sweeps the highlight smoothly across the word.
        tags += f"{{\\kf{w_end - w_start}}}"

        parts.append(tags + w["word"])
        cursor = w_end
    return " ".join(parts)


def generate_ass(
    words: list[dict],
    clip_start: float,
    clip_end: float,
    style_name: str,
) -> str:
    """Generate ASS subtitle content from word-level timestamps.

    Args:
        words: list of {word, start, end} from Whisper
        clip_start: clip start time in seconds (absolute, in source video)
        clip_end: clip end time in seconds
        style_name: one of tiktok, hormozi, karaoke, minimal; unknown names
            fall back to tiktok

    Returns:
        The complete ASS script, or an empty string when no word falls inside
        the clip range.
    """
    style = STYLES.get(style_name, STYLES["tiktok"])

    # Filter words to clip range and shift to clip-relative times.
    clip_words = [
        {
            "word": w["word"].strip(),
            "start": max(0, w["start"] - clip_start),
            "end": max(0, w["end"] - clip_start),
        }
        for w in words
        if w["end"] >= clip_start - _CLIP_EDGE_TOLERANCE
        and w["start"] <= clip_end + _CLIP_EDGE_TOLERANCE
    ]
    if not clip_words:
        return ""

    groups = _build_word_groups(clip_words, style["words_per_group"])

    highlight = style.get("highlight")
    styles_str = style["def"]
    if highlight:
        styles_str += "\n" + highlight

    events = []
    for group in groups:
        start = _ts(group["start"])
        end = _ts(group["end"])
        if highlight:
            # Word-by-word highlight using karaoke override tags.
            text = _karaoke_text(group)
            events.append(f"Dialogue: 0,{start},{end},Highlight,,0,0,0,,{text}")
        else:
            # Simple display, no highlight animation.
            events.append(
                f"Dialogue: 0,{start},{end},Default,,0,0,0,,{group['text']}"
            )

    return ASS_HEADER.format(styles=styles_str, events="\n".join(events))


async def render_with_subtitles(
    video_path: str,
    output_path: str,
    ass_content: str,
    aspect_ratio: str = "9:16",
) -> str:
    """Render video with burned-in ASS subtitles and aspect ratio conversion.

    Args:
        video_path: path to raw clip mp4
        output_path: where to write rendered output
        ass_content: ASS subtitle content string; when empty, only the aspect
            ratio conversion is applied
        aspect_ratio: target aspect ratio (9:16, 16:9, 1:1, 4:5); unknown
            values fall back to 9:16

    Returns:
        ``output_path``, once FFmpeg has finished successfully.

    Raises:
        RuntimeError: if FFmpeg exits with a non-zero status. The message
            carries the last 500 characters of FFmpeg's stderr.
    """
    # dirname() is "" for a bare filename, and makedirs("") would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Aspect ratio conversion: fit inside the target frame, then pad to it.
    w, h = _RATIO_MAP.get(aspect_ratio, _RATIO_MAP["9:16"])
    filters = [
        f"scale={w}:{h}:force_original_aspect_ratio=decrease",
        f"pad={w}:{h}:(ow-iw)/2:(oh-ih)/2:black",
    ]

    ass_path = None
    try:
        if ass_content:
            # FFmpeg reads the script by path, so it has to live on disk for
            # the duration of the render; it is removed in the finally block.
            with tempfile.NamedTemporaryFile(
                suffix=".ass", delete=False, mode="w", encoding="utf-8"
            ) as ass_file:
                ass_path = ass_file.name
                ass_file.write(ass_content)

            # Burn in subtitles (the path needs escaped colons on Windows,
            # but we're on Linux).
            ass_escaped = ass_path.replace(":", "\\:")
            filters.append(f"ass={ass_escaped}")

        cmd = [
            "ffmpeg",
            "-i", video_path,
            "-vf", ",".join(filters),
            "-c:v", "libx264",
            "-preset", "fast",
            "-crf", "23",
            "-c:a", "aac",
            "-b:a", "128k",
            "-movflags", "+faststart",
            "-y",
            output_path,
        ]

        logger.info(
            "Rendering: %s -> %s (%s, subs=%s)",
            video_path,
            output_path,
            aspect_ratio,
            bool(ass_content),
        )

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            # Keep FFmpeg from reading the parent's stdin.
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await proc.communicate()

        if proc.returncode != 0:
            # FFmpeg output is not guaranteed to be valid UTF-8; never let a
            # decode error hide the real failure.
            tail = stderr.decode(errors="replace")[-500:]
            raise RuntimeError(f"FFmpeg render failed: {tail}")

        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        logger.info("Rendered: %s (%.1f MB)", output_path, size_mb)
        return output_path
    finally:
        if ass_path is not None:
            with contextlib.suppress(FileNotFoundError):
                os.unlink(ass_path)