219 lines
7.2 KiB
Python
219 lines
7.2 KiB
Python
"""Subtitle rendering service using FFmpeg ASS filter.
|
|
|
|
Generates word-by-word animated captions in various styles, then burns
|
|
them into the video with optional aspect ratio conversion.
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
|
|
logger = logging.getLogger(__name__)

# ASS script template shared by every caption style.
# PlayResX/PlayResY are set to 1280x720 (matching the 1280x720 source).
# ``{styles}`` receives one or more "Style: ..." lines and ``{events}`` the
# "Dialogue: ..." lines; both are filled in with ``str.format``.
# The [V4+ Styles] section needs its own "Format:" line naming the fields, in
# the same way the [Events] section does.
ASS_HEADER = """[Script Info]
ScriptType: v4.00+
PlayResX: 1280
PlayResY: 720
WrapStyle: 0

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
{styles}

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
{events}"""
|
|
|
|
# Caption presets, keyed by the style name callers pass in.
#
# Every preset carries:
#   name            - display label
#   def             - the "Style: Default,..." line (colours are AABBGGRR)
#   highlight       - the "Style: Highlight,..." line, or None when the preset
#                     has no word-by-word highlight
#   words_per_group - how many words are shown together as one caption line
#
# NOTE(review): every Highlight line keeps SecondaryColour at &H000000FF; with
# \kf tags that is presumably the colour a word shows before its fill starts -
# confirm this is the intended look.
_TIKTOK_STYLE = {
    "name": "TikTok",
    "def": "Style: Default,Arial,52,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,40,1",
    "highlight": "Style: Highlight,Arial,52,&H0000FFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,3,0,2,40,40,40,1",
    "words_per_group": 3,
}

_HORMOZI_STYLE = {
    "name": "Hormozi",
    "def": "Style: Default,Impact,60,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,2,0,1,4,0,2,40,40,50,1",
    "highlight": "Style: Highlight,Impact,60,&H0000DDFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,105,105,2,0,1,4,0,2,40,40,50,1",
    "words_per_group": 2,
}

_KARAOKE_STYLE = {
    "name": "Karaoke",
    "def": "Style: Default,Arial,48,&H80FFFFFF,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,2,0,2,40,40,40,1",
    "highlight": "Style: Highlight,Arial,48,&H0000FF00,&H000000FF,&H00000000,&H80000000,-1,0,0,0,100,100,0,0,1,2,0,2,40,40,40,1",
    "words_per_group": 4,
}

# The only preset without a Highlight line: captions are shown statically.
_MINIMAL_STYLE = {
    "name": "Minimal",
    "def": "Style: Default,Helvetica,40,&H00FFFFFF,&H000000FF,&H00000000,&H80000000,0,0,0,0,100,100,0,0,1,2,0,2,40,40,30,1",
    "highlight": None,
    "words_per_group": 5,
}

STYLES = {
    "tiktok": _TIKTOK_STYLE,
    "hormozi": _HORMOZI_STYLE,
    "karaoke": _KARAOKE_STYLE,
    "minimal": _MINIMAL_STYLE,
}
|
|
|
|
|
|
def _ts(seconds: float) -> str:
|
|
"""Format seconds as ASS timestamp H:MM:SS.cc."""
|
|
h = int(seconds // 3600)
|
|
m = int((seconds % 3600) // 60)
|
|
s = seconds % 60
|
|
return f"{h}:{m:02d}:{s:05.2f}"
|
|
|
|
|
|
def _build_word_groups(words: list[dict], words_per_group: int) -> list[dict]:
|
|
"""Group consecutive words for display as caption lines."""
|
|
groups = []
|
|
for i in range(0, len(words), words_per_group):
|
|
chunk = words[i : i + words_per_group]
|
|
if not chunk:
|
|
continue
|
|
groups.append(
|
|
{
|
|
"words": chunk,
|
|
"text": " ".join(w["word"].strip() for w in chunk),
|
|
"start": chunk[0]["start"],
|
|
"end": chunk[-1]["end"],
|
|
}
|
|
)
|
|
return groups
|
|
|
|
|
|
def _karaoke_text(group: dict) -> str:
    r"""Build the ``\kf``-tagged text for one caption group.

    Karaoke durations in ASS are consumed back-to-back from the start of the
    Dialogue line, so any pause between two words has to be spent explicitly
    (with a ``\k<gap>`` tag) or every later highlight starts too early.
    Offsets are rounded to centiseconds against the group start rather than
    per word, so rounding error does not accumulate across the line.
    """
    pieces = []
    cursor_cs = 0  # karaoke time already consumed on this line, in centiseconds
    for w in group["words"]:
        start_cs = max(cursor_cs, int(round((w["start"] - group["start"]) * 100)))
        end_cs = max(start_cs, int(round((w["end"] - group["start"]) * 100)))
        gap_cs = start_cs - cursor_cs
        gap_tag = f"\\k{gap_cs}" if gap_cs > 0 else ""
        pieces.append(f"{{{gap_tag}\\kf{end_cs - start_cs}}}{w['word']}")
        cursor_cs = end_cs
    return " ".join(pieces)


def generate_ass(
    words: list[dict],
    clip_start: float,
    clip_end: float,
    style_name: str,
) -> str:
    """Generate ASS subtitle content from word-level timestamps.

    Args:
        words: list of {word, start, end} from Whisper
        clip_start: clip start time in seconds (absolute, in source video)
        clip_end: clip end time in seconds
        style_name: one of tiktok, hormozi, karaoke, minimal; any other value
            falls back to the tiktok style

    Returns:
        The complete ASS script, or an empty string when no word falls inside
        the clip range.
    """
    style = STYLES.get(style_name, STYLES["tiktok"])

    # Keep words that overlap the clip (with 0.3 s of tolerance on both sides)
    # and shift them to clip-relative times, clamped so nothing starts before 0.
    clip_words = [
        {
            "word": w["word"].strip(),
            "start": max(0, w["start"] - clip_start),
            "end": max(0, w["end"] - clip_start),
        }
        for w in words
        if w["end"] >= clip_start - 0.3 and w["start"] <= clip_end + 0.3
    ]
    if not clip_words:
        return ""

    groups = _build_word_groups(clip_words, style["words_per_group"])

    highlight = style.get("highlight")
    styles_str = style["def"]
    if highlight:
        styles_str += "\n" + highlight

    events = []
    for g in groups:
        start = _ts(g["start"])
        end = _ts(g["end"])
        if highlight:
            # Word-by-word highlight using \kf (karaoke fill) override tags.
            events.append(
                f"Dialogue: 0,{start},{end},Highlight,,0,0,0,,{_karaoke_text(g)}"
            )
        else:
            # Simple display, no highlight animation.
            events.append(
                f"Dialogue: 0,{start},{end},Default,,0,0,0,,{g['text']}"
            )

    return ASS_HEADER.format(styles=styles_str, events="\n".join(events))
|
|
|
|
|
|
async def render_with_subtitles(
    video_path: str,
    output_path: str,
    ass_content: str,
    aspect_ratio: str = "9:16",
) -> str:
    """Render video with burned-in ASS subtitles and aspect ratio conversion.

    The clip is scaled to fit the target frame, padded with black bars, and
    (when ``ass_content`` is non-empty) run through FFmpeg's ``ass`` filter.

    Args:
        video_path: path to raw clip mp4
        output_path: where to write rendered output
        ass_content: ASS subtitle content string; empty means no subtitles
        aspect_ratio: target aspect ratio (9:16, 16:9, 1:1, 4:5); any other
            value falls back to 9:16

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: if FFmpeg exits with a non-zero status; the message
            carries the last 500 characters of its stderr.
    """
    # dirname is "" for a bare filename, and os.makedirs("") raises.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Write ASS to a temp file that FFmpeg can open by path.
    ass_file = tempfile.NamedTemporaryFile(
        suffix=".ass", delete=False, mode="w", encoding="utf-8"
    )
    ass_path = ass_file.name

    try:
        # Writing inside the try block ensures the temp file is removed even
        # when the write itself fails.
        with ass_file:
            ass_file.write(ass_content)

        # Aspect ratio conversion with padding
        ratio_map = {
            "9:16": (720, 1280),
            "16:9": (1280, 720),
            "1:1": (720, 720),
            "4:5": (576, 720),
        }
        w, h = ratio_map.get(aspect_ratio, (720, 1280))
        filters = [
            f"scale={w}:{h}:force_original_aspect_ratio=decrease",
            f"pad={w}:{h}:(ow-iw)/2:(oh-ih)/2:black",
        ]

        # Burn in subtitles (ass_file path needs escaped colons on Windows,
        # but we're on Linux)
        if ass_content:
            ass_escaped = ass_path.replace(":", "\\:")
            filters.append(f"ass={ass_escaped}")

        cmd = [
            "ffmpeg",
            "-i", video_path,
            "-vf", ",".join(filters),
            "-c:v", "libx264", "-preset", "fast", "-crf", "23",
            "-c:a", "aac", "-b:a", "128k",
            "-movflags", "+faststart",
            "-y",
            output_path,
        ]

        logger.info(
            "Rendering: %s -> %s (%s, subs=%s)",
            video_path, output_path, aspect_ratio, bool(ass_content),
        )

        proc = await asyncio.create_subprocess_exec(
            *cmd,
            # FFmpeg must not read the parent's stdin, and its stdout is
            # unused here, so neither is piped.
            stdin=asyncio.subprocess.DEVNULL,
            stdout=asyncio.subprocess.DEVNULL,
            stderr=asyncio.subprocess.PIPE,
        )
        _, stderr = await proc.communicate()

        if proc.returncode != 0:
            # errors="replace": stderr may contain bytes that are not valid
            # UTF-8, and a decode error here would hide the real failure.
            tail = stderr.decode(errors="replace")[-500:]
            raise RuntimeError(f"FFmpeg render failed: {tail}")

        size_mb = os.path.getsize(output_path) / (1024 * 1024)
        logger.info("Rendered: %s (%.1f MB)", output_path, size_mb)
        return output_path

    finally:
        # Best-effort cleanup: a failed unlink must not mask the render result
        # or the exception already propagating.
        try:
            os.unlink(ass_path)
        except OSError as exc:
            logger.warning("Could not remove temp ASS file %s: %s", ass_path, exc)
|