fix: robust JSON parsing with regex fallback for LLM responses

Gemini sometimes produces JSON with unescaped characters that break
standard parsing. Added multiple fallback strategies including regex
extraction of individual clip objects.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Jeff Emmett 2026-02-10 01:17:15 +00:00
parent 1bac0b90a6
commit 3ce7945096
1 changed files with 58 additions and 14 deletions

View File

@ -259,6 +259,59 @@ async def _call_openai(system: str, user_prompt: str) -> str:
return content return content
def _try_parse_json(text: str) -> dict | None:
"""Try multiple strategies to parse JSON from LLM output."""
# Attempt 1: direct parse
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Attempt 2: fix trailing commas
fixed = re.sub(r",\s*}", "}", text)
fixed = re.sub(r",\s*]", "]", fixed)
try:
return json.loads(fixed)
except json.JSONDecodeError:
pass
# Attempt 3: fix unescaped control characters in string values
# Replace literal newlines/tabs inside JSON strings
fixed2 = re.sub(r'(?<=": ")(.*?)(?="[,\s}])', lambda m: m.group().replace('\n', '\\n').replace('\t', '\\t'), fixed, flags=re.DOTALL)
try:
return json.loads(fixed2)
except json.JSONDecodeError:
pass
# Attempt 4: extract individual clip objects with a more lenient approach
try:
clips = []
clip_pattern = re.compile(
r'"title"\s*:\s*"([^"]*)".*?'
r'"start_time"\s*:\s*([0-9.]+).*?'
r'"end_time"\s*:\s*([0-9.]+).*?'
r'"virality_score"\s*:\s*([0-9.]+).*?'
r'"category"\s*:\s*"([^"]*)"',
re.DOTALL
)
for m in clip_pattern.finditer(text):
clips.append({
"title": m.group(1),
"start_time": float(m.group(2)),
"end_time": float(m.group(3)),
"virality_score": float(m.group(4)),
"category": m.group(5),
"reasoning": "",
})
if clips:
logger.info(f"Regex fallback extracted {len(clips)} clips")
return {"clips": clips}
except Exception:
pass
return None
def _parse_clips(content: str, video_duration: float) -> list[dict]: def _parse_clips(content: str, video_duration: float) -> list[dict]:
"""Parse LLM response into clip list, handling imperfect JSON.""" """Parse LLM response into clip list, handling imperfect JSON."""
# Strip markdown code fences (e.g. ```json ... ```) # Strip markdown code fences (e.g. ```json ... ```)
@ -271,21 +324,12 @@ def _parse_clips(content: str, video_duration: float) -> list[dict]:
return [] return []
raw_json = json_match.group() raw_json = json_match.group()
logger.debug(f"Extracted JSON ({len(raw_json)} chars): {raw_json[:500]}") logger.info(f"Extracted JSON ({len(raw_json)} chars)")
try: data = _try_parse_json(raw_json)
data = json.loads(raw_json) if data is None:
except json.JSONDecodeError as e: logger.error(f"Failed to parse LLM JSON. Raw content: {content[:1000]}")
logger.warning(f"JSON parse attempt 1 failed: {e}") return []
# Try to fix common JSON issues
fixed = raw_json
fixed = re.sub(r",\s*}", "}", fixed)
fixed = re.sub(r",\s*]", "]", fixed)
try:
data = json.loads(fixed)
except json.JSONDecodeError as e2:
logger.error(f"Failed to parse LLM JSON ({e2}): {content[:500]}")
return []
raw_clips = data.get("clips", []) raw_clips = data.get("clips", [])
clips = [] clips = []