fix: robust JSON parsing with regex fallback for LLM responses
Gemini sometimes produces JSON with unescaped characters that break standard parsing. Added multiple fallback strategies including regex extraction of individual clip objects. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
1bac0b90a6
commit
3ce7945096
|
|
@ -259,6 +259,59 @@ async def _call_openai(system: str, user_prompt: str) -> str:
|
||||||
return content
|
return content
|
||||||
|
|
||||||
|
|
||||||
|
def _try_parse_json(text: str) -> dict | None:
|
||||||
|
"""Try multiple strategies to parse JSON from LLM output."""
|
||||||
|
# Attempt 1: direct parse
|
||||||
|
try:
|
||||||
|
return json.loads(text)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Attempt 2: fix trailing commas
|
||||||
|
fixed = re.sub(r",\s*}", "}", text)
|
||||||
|
fixed = re.sub(r",\s*]", "]", fixed)
|
||||||
|
try:
|
||||||
|
return json.loads(fixed)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Attempt 3: fix unescaped control characters in string values
|
||||||
|
# Replace literal newlines/tabs inside JSON strings
|
||||||
|
fixed2 = re.sub(r'(?<=": ")(.*?)(?="[,\s}])', lambda m: m.group().replace('\n', '\\n').replace('\t', '\\t'), fixed, flags=re.DOTALL)
|
||||||
|
try:
|
||||||
|
return json.loads(fixed2)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Attempt 4: extract individual clip objects with a more lenient approach
|
||||||
|
try:
|
||||||
|
clips = []
|
||||||
|
clip_pattern = re.compile(
|
||||||
|
r'"title"\s*:\s*"([^"]*)".*?'
|
||||||
|
r'"start_time"\s*:\s*([0-9.]+).*?'
|
||||||
|
r'"end_time"\s*:\s*([0-9.]+).*?'
|
||||||
|
r'"virality_score"\s*:\s*([0-9.]+).*?'
|
||||||
|
r'"category"\s*:\s*"([^"]*)"',
|
||||||
|
re.DOTALL
|
||||||
|
)
|
||||||
|
for m in clip_pattern.finditer(text):
|
||||||
|
clips.append({
|
||||||
|
"title": m.group(1),
|
||||||
|
"start_time": float(m.group(2)),
|
||||||
|
"end_time": float(m.group(3)),
|
||||||
|
"virality_score": float(m.group(4)),
|
||||||
|
"category": m.group(5),
|
||||||
|
"reasoning": "",
|
||||||
|
})
|
||||||
|
if clips:
|
||||||
|
logger.info(f"Regex fallback extracted {len(clips)} clips")
|
||||||
|
return {"clips": clips}
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _parse_clips(content: str, video_duration: float) -> list[dict]:
|
def _parse_clips(content: str, video_duration: float) -> list[dict]:
|
||||||
"""Parse LLM response into clip list, handling imperfect JSON."""
|
"""Parse LLM response into clip list, handling imperfect JSON."""
|
||||||
# Strip markdown code fences (e.g. ```json ... ```)
|
# Strip markdown code fences (e.g. ```json ... ```)
|
||||||
|
|
@ -271,21 +324,12 @@ def _parse_clips(content: str, video_duration: float) -> list[dict]:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
raw_json = json_match.group()
|
raw_json = json_match.group()
|
||||||
logger.debug(f"Extracted JSON ({len(raw_json)} chars): {raw_json[:500]}")
|
logger.info(f"Extracted JSON ({len(raw_json)} chars)")
|
||||||
|
|
||||||
try:
|
data = _try_parse_json(raw_json)
|
||||||
data = json.loads(raw_json)
|
if data is None:
|
||||||
except json.JSONDecodeError as e:
|
logger.error(f"Failed to parse LLM JSON. Raw content: {content[:1000]}")
|
||||||
logger.warning(f"JSON parse attempt 1 failed: {e}")
|
return []
|
||||||
# Try to fix common JSON issues
|
|
||||||
fixed = raw_json
|
|
||||||
fixed = re.sub(r",\s*}", "}", fixed)
|
|
||||||
fixed = re.sub(r",\s*]", "]", fixed)
|
|
||||||
try:
|
|
||||||
data = json.loads(fixed)
|
|
||||||
except json.JSONDecodeError as e2:
|
|
||||||
logger.error(f"Failed to parse LLM JSON ({e2}): {content[:500]}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
raw_clips = data.get("clips", [])
|
raw_clips = data.get("clips", [])
|
||||||
clips = []
|
clips = []
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue