fix: robust JSON parsing with regex fallback for LLM responses

Gemini sometimes produces JSON with unescaped characters that break standard parsing. Added multiple fallback strategies including regex extraction of individual clip objects. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-02-10 01:17:15 +00:00 · 2026-02-10 01:17:15 +00:00 · 3ce7945096
parent 1bac0b90a6
commit 3ce7945096
1 changed file with 58 additions and 14 deletions
--- a/backend/app/services/ai_analysis.py
+++ b/backend/app/services/ai_analysis.py
@ -259,6 +259,59 @@ async def _call_openai(system: str, user_prompt: str) -> str:
    return content
 def _try_parse_json(text: str) -> dict | None:
    """Try multiple strategies to parse JSON from LLM output."""
    # Attempt 1: direct parse
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass
    # Attempt 2: fix trailing commas
    fixed = re.sub(r",\s*}", "}", text)
    fixed = re.sub(r",\s*]", "]", fixed)
    try:
        return json.loads(fixed)
    except json.JSONDecodeError:
        pass
    # Attempt 3: fix unescaped control characters in string values
    # Replace literal newlines/tabs inside JSON strings
    fixed2 = re.sub(r'(?<=": ")(.*?)(?="[,\s}])', lambda m: m.group().replace('\n', '\\n').replace('\t', '\\t'), fixed, flags=re.DOTALL)
    try:
        return json.loads(fixed2)
    except json.JSONDecodeError:
        pass
    # Attempt 4: extract individual clip objects with a more lenient approach
    try:
        clips = []
        clip_pattern = re.compile(
            r'"title"\s*:\s*"([^"]*)".*?'
            r'"start_time"\s*:\s*([0-9.]+).*?'
            r'"end_time"\s*:\s*([0-9.]+).*?'
            r'"virality_score"\s*:\s*([0-9.]+).*?'
            r'"category"\s*:\s*"([^"]*)"',
            re.DOTALL
        )
        for m in clip_pattern.finditer(text):
            clips.append({
                "title": m.group(1),
                "start_time": float(m.group(2)),
                "end_time": float(m.group(3)),
                "virality_score": float(m.group(4)),
                "category": m.group(5),
                "reasoning": "",
            })
        if clips:
            logger.info(f"Regex fallback extracted {len(clips)} clips")
            return {"clips": clips}
    except Exception:
        pass
    return None
 def _parse_clips(content: str, video_duration: float) -> list[dict]:
    """Parse LLM response into clip list, handling imperfect JSON."""
    # Strip markdown code fences (e.g. ```json ... ```)
@ -271,21 +324,12 @@ def _parse_clips(content: str, video_duration: float) -> list[dict]:
        return []
    raw_json = json_match.group()
-    logger.debug(f"Extracted JSON ({len(raw_json)} chars): {raw_json[:500]}")
+    logger.info(f"Extracted JSON ({len(raw_json)} chars)")
-    try:
+    data = _try_parse_json(raw_json)
-        data = json.loads(raw_json)
+    if data is None:
-    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse LLM JSON. Raw content: {content[:1000]}")
-        logger.warning(f"JSON parse attempt 1 failed: {e}")
+        return []
        # Try to fix common JSON issues
        fixed = raw_json
        fixed = re.sub(r",\s*}", "}", fixed)
        fixed = re.sub(r",\s*]", "]", fixed)
        try:
            data = json.loads(fixed)
        except json.JSONDecodeError as e2:
            logger.error(f"Failed to parse LLM JSON ({e2}): {content[:500]}")
            return []
    raw_clips = data.get("clips", [])
    clips = []