From 3ce79450962ebda7ec7f724ba20d560b2ed23267 Mon Sep 17 00:00:00 2001
From: Jeff Emmett <jeffemmett@gmail.com>
Date: Tue, 10 Feb 2026 01:17:15 +0000
Subject: [PATCH] fix: robust JSON parsing with regex fallback for LLM
 responses

Gemini sometimes produces JSON with unescaped characters that break
standard parsing. Added multiple fallback strategies including regex
extraction of individual clip objects.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 backend/app/services/ai_analysis.py | 72 +++++++++++++++++++++++------
 1 file changed, 58 insertions(+), 14 deletions(-)

diff --git a/backend/app/services/ai_analysis.py b/backend/app/services/ai_analysis.py
index c81f4c3..3ee07b2 100644
--- a/backend/app/services/ai_analysis.py
+++ b/backend/app/services/ai_analysis.py
@@ -259,6 +259,59 @@ async def _call_openai(system: str, user_prompt: str) -> str:
     return content
 
 
+def _try_parse_json(text: str) -> dict | None:
+    """Parse a JSON object from LLM output, repairing common defects.
+
+    Tries the text verbatim, then with trailing commas removed; both
+    decodes tolerate raw control characters inside strings. If neither
+    yields a dict, clip fields are pulled out by regex instead.
+
+    Args:
+        text: Candidate JSON text taken from the model response.
+
+    Returns:
+        The decoded dict; or ``{"clips": [...]}`` rebuilt by the regex
+        fallback with empty ``reasoning``; or ``None`` if all fail.
+    """
+    no_trailing = re.sub(r",\s*]", "]", re.sub(r",\s*}", "}", text))
+    for candidate in (text, no_trailing):
+        try:
+            # strict=False accepts unescaped newlines/tabs/CRs in strings.
+            parsed = json.loads(candidate, strict=False)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(parsed, dict):  # callers rely on ``.get``
+            return parsed
+
+    clip_pattern = re.compile(
+        r'"title"\s*:\s*"([^"]*)".*?'
+        r'"start_time"\s*:\s*([0-9.]+).*?'
+        r'"end_time"\s*:\s*([0-9.]+).*?'
+        r'"virality_score"\s*:\s*([0-9.]+).*?'
+        r'"category"\s*:\s*"([^"]*)"',
+        re.DOTALL,
+    )
+    clips = []
+    for m in clip_pattern.finditer(text):
+        title, start, end, score, category = m.groups()
+        try:
+            clip = {
+                "title": title,
+                "start_time": float(start),
+                "end_time": float(end),
+                "virality_score": float(score),
+                "category": category,
+                "reasoning": "",
+            }
+        except ValueError:  # "1.2.3" matches [0-9.]+; skip that clip only
+            continue
+        clips.append(clip)
+    if clips:
+        logger.info("Regex fallback extracted %d clips", len(clips))
+        return {"clips": clips}
+    return None
+
+
 def _parse_clips(content: str, video_duration: float) -> list[dict]:
     """Parse LLM response into clip list, handling imperfect JSON."""
     # Strip markdown code fences (e.g. ```json ... ```)
@@ -271,21 +324,12 @@ def _parse_clips(content: str, video_duration: float) -> list[dict]:
         return []
 
     raw_json = json_match.group()
-    logger.debug(f"Extracted JSON ({len(raw_json)} chars): {raw_json[:500]}")
+    logger.info(f"Extracted JSON ({len(raw_json)} chars)")
 
-    try:
-        data = json.loads(raw_json)
-    except json.JSONDecodeError as e:
-        logger.warning(f"JSON parse attempt 1 failed: {e}")
-        # Try to fix common JSON issues
-        fixed = raw_json
-        fixed = re.sub(r",\s*}", "}", fixed)
-        fixed = re.sub(r",\s*]", "]", fixed)
-        try:
-            data = json.loads(fixed)
-        except json.JSONDecodeError as e2:
-            logger.error(f"Failed to parse LLM JSON ({e2}): {content[:500]}")
-            return []
+    data = _try_parse_json(raw_json)
+    if data is None:
+        logger.error(f"Failed to parse LLM JSON. Raw content: {content[:1000]}")
+        return []
 
     raw_clips = data.get("clips", [])
     clips = []