diff --git a/backend/app/services/ai_analysis.py b/backend/app/services/ai_analysis.py index c81f4c3..3ee07b2 100644 --- a/backend/app/services/ai_analysis.py +++ b/backend/app/services/ai_analysis.py @@ -259,6 +259,59 @@ async def _call_openai(system: str, user_prompt: str) -> str: return content +def _try_parse_json(text: str) -> dict | None: + """Try multiple strategies to parse JSON from LLM output.""" + # Attempt 1: direct parse + try: + return json.loads(text) + except json.JSONDecodeError: + pass + + # Attempt 2: fix trailing commas + fixed = re.sub(r",\s*}", "}", text) + fixed = re.sub(r",\s*]", "]", fixed) + try: + return json.loads(fixed) + except json.JSONDecodeError: + pass + + # Attempt 3: fix unescaped control characters in string values + # Replace literal newlines/tabs inside JSON strings + fixed2 = re.sub(r'(?<=": ")(.*?)(?="[,\s}])', lambda m: m.group().replace('\n', '\\n').replace('\t', '\\t'), fixed, flags=re.DOTALL) + try: + return json.loads(fixed2) + except json.JSONDecodeError: + pass + + # Attempt 4: extract individual clip objects with a more lenient approach + try: + clips = [] + clip_pattern = re.compile( + r'"title"\s*:\s*"([^"]*)".*?' + r'"start_time"\s*:\s*([0-9.]+).*?' + r'"end_time"\s*:\s*([0-9.]+).*?' + r'"virality_score"\s*:\s*([0-9.]+).*?' + r'"category"\s*:\s*"([^"]*)"', + re.DOTALL + ) + for m in clip_pattern.finditer(text): + clips.append({ + "title": m.group(1), + "start_time": float(m.group(2)), + "end_time": float(m.group(3)), + "virality_score": float(m.group(4)), + "category": m.group(5), + "reasoning": "", + }) + if clips: + logger.info(f"Regex fallback extracted {len(clips)} clips") + return {"clips": clips} + except Exception: + pass + + return None + + def _parse_clips(content: str, video_duration: float) -> list[dict]: """Parse LLM response into clip list, handling imperfect JSON.""" # Strip markdown code fences (e.g. ```json ... ```) @@ -271,21 +324,12 @@ def _parse_clips(content: str, video_duration: float) -> list[dict]: return [] raw_json = json_match.group() - logger.debug(f"Extracted JSON ({len(raw_json)} chars): {raw_json[:500]}") + logger.info(f"Extracted JSON ({len(raw_json)} chars)") - try: - data = json.loads(raw_json) - except json.JSONDecodeError as e: - logger.warning(f"JSON parse attempt 1 failed: {e}") - # Try to fix common JSON issues - fixed = raw_json - fixed = re.sub(r",\s*}", "}", fixed) - fixed = re.sub(r",\s*]", "]", fixed) - try: - data = json.loads(fixed) - except json.JSONDecodeError as e2: - logger.error(f"Failed to parse LLM JSON ({e2}): {content[:500]}") - return [] + data = _try_parse_json(raw_json) + if data is None: + logger.error(f"Failed to parse LLM JSON. Raw content: {content[:1000]}") + return [] raw_clips = data.get("clips", []) clips = []