Add function to repair partially broken JSON with multiple strategies
This commit is contained in:
parent
1c13e25b17
commit
c9b74634c5
1 changed file with 117 additions and 6 deletions
|
|
@ -260,6 +260,115 @@ def extract_operation_name_from_parsed(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def repair_partially_broken_json(raw_text: str) -> str:  # noqa: PLR0915
    """Attempt to repair partially broken JSON with multiple fallback strategies.

    Handles "half-bad" JSON by:
    1. First attempting json_repair on the whole content
    2. If that fails, tries to extract valid JSON objects from the text
    3. Falls back to wrapping content in an array if possible

    Args:
        raw_text: The potentially broken JSON string.

    Returns:
        A JSON-valid string, either repaired or best-effort fixed. If every
        strategy fails, the original text is returned unchanged so downstream
        code can handle (or report) it.
    """
    # Strategy 1: Direct repair attempt on the whole payload.
    try:
        fixed: str = json_repair.repair_json(raw_text)
        # Validate it produces valid JSON
        parsed_data = json.loads(fixed)

        # If it's a list, validate all items are GraphQL responses
        if isinstance(parsed_data, list):
            # Filter to only keep GraphQL responses (objects carrying a
            # "data" payload or "extensions" metadata).
            filtered = [
                item for item in parsed_data if isinstance(item, dict) and ("data" in item or "extensions" in item)
            ]
            if filtered:
                # If we filtered anything out, return the filtered version
                if len(filtered) < len(parsed_data):
                    return json.dumps(filtered)
                # Otherwise return as-is
                return fixed
        # Single dict - check if it's a GraphQL response
        elif isinstance(parsed_data, dict):
            if "data" in parsed_data or "extensions" in parsed_data:
                return fixed
    # Multiple exception types must be a parenthesized tuple in Python 3;
    # the comma form (`except A, B:`) is a SyntaxError. json.JSONDecodeError
    # is a ValueError subclass, so ValueError already covers it.
    except (ValueError, TypeError):
        pass

    # Strategy 2: Try wrapping in array brackets and validate the result.
    # Only use this if it produces valid GraphQL responses.
    try:
        wrapped: str = f"[{raw_text}]"
        wrapped_data = json.loads(wrapped)
        # Validate that all items look like GraphQL responses
        if isinstance(wrapped_data, list) and wrapped_data:  # noqa: SIM102
            # Check if all items have "data" or "extensions" (GraphQL response structure)
            if all(isinstance(item, dict) and ("data" in item or "extensions" in item) for item in wrapped_data):
                return wrapped
    except ValueError:
        pass

    # Strategy 3: Try to extract individual valid GraphQL response objects.
    # Look for balanced braces and try to parse them, but only keep objects
    # that look like GraphQL responses (have "data" or "extensions" fields).
    valid_objects: list[dict[str, Any]] = []
    depth: int = 0
    current_obj: str = ""

    for char in raw_text:
        if char == "{":
            if depth == 0:
                current_obj = "{"
            else:
                current_obj += char
            depth += 1
        elif char == "}":
            depth -= 1
            current_obj += char
            if depth == 0 and current_obj.strip():
                try:
                    obj: dict[str, Any] = json.loads(current_obj)
                    # Only keep objects that look like GraphQL responses
                    # (have "data" field) or extension metadata (have "extensions")
                    if "data" in obj or "extensions" in obj:
                        valid_objects.append(obj)
                except ValueError:
                    pass
                current_obj = ""
        elif depth > 0:
            current_obj += char

    if valid_objects:
        return json.dumps(valid_objects)

    # Strategy 4: Last resort - attempt repair on each line.
    # Only keep lines that look like GraphQL responses.
    lines: list[str] = raw_text.split("\n")
    valid_lines: list[dict[str, Any]] = []

    for line in lines:
        line: str = line.strip()  # noqa: PLW2901
        if line and line.startswith("{"):
            try:
                fixed_line: str = json_repair.repair_json(line)
                obj = json.loads(fixed_line)
                # Only keep dict objects that look like GraphQL responses.
                # The isinstance guard matters: repair of an arbitrary line can
                # yield a string or list, where `"data" in obj` would do
                # substring/element matching and wrongly match.
                if isinstance(obj, dict) and ("data" in obj or "extensions" in obj):
                    valid_lines.append(obj)
            except (ValueError, TypeError):
                pass

    if valid_lines:
        return json.dumps(valid_lines)

    # Final fallback: return the original text and let downstream handle it
    return raw_text
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
"""Import Twitch drop campaign data from a JSON file or directory."""
|
"""Import Twitch drop campaign data from a JSON file or directory."""
|
||||||
|
|
||||||
|
|
@ -1081,9 +1190,10 @@ class Command(BaseCommand):
|
||||||
try:
|
try:
|
||||||
raw_text: str = file_path.read_text(encoding="utf-8", errors="ignore")
|
raw_text: str = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||||
|
|
||||||
# Parse JSON early to extract operation name for better directory organization
|
# Repair potentially broken JSON with multiple fallback strategies
|
||||||
parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json_repair.loads(
|
fixed_json_str: str = repair_partially_broken_json(raw_text)
|
||||||
raw_text,
|
parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json.loads(
|
||||||
|
fixed_json_str,
|
||||||
)
|
)
|
||||||
operation_name: str | None = extract_operation_name_from_parsed(parsed_json)
|
operation_name: str | None = extract_operation_name_from_parsed(parsed_json)
|
||||||
|
|
||||||
|
|
@ -1170,9 +1280,10 @@ class Command(BaseCommand):
|
||||||
try:
|
try:
|
||||||
raw_text: str = file_path.read_text(encoding="utf-8", errors="ignore")
|
raw_text: str = file_path.read_text(encoding="utf-8", errors="ignore")
|
||||||
|
|
||||||
# Parse JSON early to extract operation name for better directory organization
|
# Repair potentially broken JSON with multiple fallback strategies
|
||||||
parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json_repair.loads(
|
fixed_json_str: str = repair_partially_broken_json(raw_text)
|
||||||
raw_text,
|
parsed_json: JSONReturnType | tuple[JSONReturnType, list[dict[str, str]]] | str = json.loads(
|
||||||
|
fixed_json_str,
|
||||||
)
|
)
|
||||||
operation_name: str | None = extract_operation_name_from_parsed(parsed_json)
|
operation_name: str | None = extract_operation_name_from_parsed(parsed_json)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue