Add function to repair partially broken JSON with multiple strategies

2026-01-12 03:48:45 +01:00 · 2026-01-12 03:48:45 +01:00 · c9b74634c5
commit c9b74634c5
parent 1c13e25b17
1 changed files with 117 additions and 6 deletions
--- a/twitch/management/commands/better_import_drops.py
+++ b/twitch/management/commands/better_import_drops.py
@ -260,6 +260,115 @@ def extract_operation_name_from_parsed(
    return None


+def repair_partially_broken_json(raw_text: str) -> str:  # noqa: PLR0915
+    """Attempt to repair partially broken JSON with multiple fallback strategies.
+
+    Handles "half-bad" JSON by:
+    1. First attempting json_repair on the whole content
+    2. If that fails, tries to extract valid JSON objects from the text
+    3. Falls back to wrapping content in an array if possible
+
+    Args:
+        raw_text: The potentially broken JSON string.
+
+    Returns:
+        A JSON-valid string, either repaired or best-effort fixed.
+    """
+    # Strategy 1: Direct repair attempt
+    try:
+        fixed: str = json_repair.repair_json(raw_text)
+        # Validate it produces valid JSON
+        parsed_data = json.loads(fixed)
+
+        # If it's a list, validate all items are GraphQL responses
+        if isinstance(parsed_data, list):
+            # Filter to only keep GraphQL responses
+            filtered = [
+                item for item in parsed_data if isinstance(item, dict) and ("data" in item or "extensions" in item)
+            ]
+            if filtered:
+                # If we filtered anything out, return the filtered version
+                if len(filtered) < len(parsed_data):
+                    return json.dumps(filtered)
+                # Otherwise return as-is
+                return fixed
+        # Single dict - check if it's a GraphQL response
+        elif isinstance(parsed_data, dict):
+            if "data" in parsed_data or "extensions" in parsed_data:
+                return fixed
+    except ValueError, TypeError, json.JSONDecodeError:
+        pass
+
+    # Strategy 2: Try wrapping in array brackets and validate the result
+    # Only use this if it produces valid GraphQL responses
+    try:
+        wrapped: str = f"[{raw_text}]"
+        wrapped_data = json.loads(wrapped)
+        # Validate that all items look like GraphQL responses
+        if isinstance(wrapped_data, list) and wrapped_data:  # noqa: SIM102
+            # Check if all items have "data" or "extensions" (GraphQL response structure)
+            if all(isinstance(item, dict) and ("data" in item or "extensions" in item) for item in wrapped_data):
+                return wrapped
+    except ValueError, json.JSONDecodeError:
+        pass
+
+    # Strategy 3: Try to extract individual valid GraphQL response objects
+    # Look for balanced braces and try to parse them, but only keep objects
+    # that look like GraphQL responses (have "data" or "extensions" fields)
+    valid_objects: list[dict[str, Any]] = []
+    depth: int = 0
+    current_obj: str = ""
+
+    for char in raw_text:
+        if char == "{":
+            if depth == 0:
+                current_obj = "{"
+            else:
+                current_obj += char
+            depth += 1
+        elif char == "}":
+            depth -= 1
+            current_obj += char
+            if depth == 0 and current_obj.strip():
+                try:
+                    obj: dict[str, Any] = json.loads(current_obj)
+                    # Only keep objects that look like GraphQL responses
+                    # (have "data" field) or extension metadata (have "extensions")
+                    if "data" in obj or "extensions" in obj:
+                        valid_objects.append(obj)
+                except ValueError, json.JSONDecodeError:
+                    pass
+                current_obj = ""
+        elif depth > 0:
+            current_obj += char
+
+    if valid_objects:
+        return json.dumps(valid_objects)
+
+    # Strategy 4: Last resort - attempt repair on each line
+    # Only keep lines that look like GraphQL responses
+    lines: list[str] = raw_text.split("\n")
+    valid_lines: list[dict[str, Any]] = []
+
+    for line in lines:
+        line: str = line.strip()  # noqa: PLW2901
+        if line and line.startswith("{"):
+            try:
+                fixed_line: str = json_repair.repair_json(line)
+                obj = json.loads(fixed_line)
+                # Only keep objects that look like GraphQL responses
+                if "data" in obj or "extensions" in obj:
+                    valid_lines.append(obj)
+            except ValueError, TypeError, json.JSONDecodeError:
+                pass
+
+    if valid_lines:
+        return json.dumps(valid_lines)
+
+    # Final fallback: return the original text and let downstream handle it
+    return raw_text
+
+
 class Command(BaseCommand):
    """Import Twitch drop campaign data from a JSON file or directory."""

@ -1081,9 +1190,10 @@ class Command(BaseCommand):
        try:
            raw_text: str = file_path.read_text(encoding="utf-8", errors="ignore")

-            # Parse JSON early to extract operation name for better directory organization
-            parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json_repair.loads(
-                raw_text,
+            # Repair potentially broken JSON with multiple fallback strategies
+            fixed_json_str: str = repair_partially_broken_json(raw_text)
+            parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json.loads(
+                fixed_json_str,
            )
            operation_name: str | None = extract_operation_name_from_parsed(parsed_json)

@ -1170,9 +1280,10 @@ class Command(BaseCommand):
            try:
                raw_text: str = file_path.read_text(encoding="utf-8", errors="ignore")

-                # Parse JSON early to extract operation name for better directory organization
-                parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json_repair.loads(
-                    raw_text,
+                # Repair potentially broken JSON with multiple fallback strategies
+                fixed_json_str: str = repair_partially_broken_json(raw_text)
+                parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json.loads(
+                    fixed_json_str,
                )
                operation_name: str | None = extract_operation_name_from_parsed(parsed_json)