Implement parallel processing for JSON file imports

This commit is contained in:
Joakim Hellsén 2025-09-24 02:48:42 +02:00
commit b58849d11e

View file

@ -1,5 +1,6 @@
from __future__ import annotations from __future__ import annotations
import concurrent.futures
import logging import logging
import shutil import shutil
import traceback import traceback
@ -160,18 +161,17 @@ class Command(BaseCommand):
"""Process all JSON files in a directory using parallel processing. """Process all JSON files in a directory using parallel processing.
Args: Args:
directory: Path to the directory. directory: Path to the directory containing JSON files.
processed_path: Name of subdirectory to move processed files to. processed_path: Path to the subdirectory where processed files will be moved.
continue_on_error: Continue processing if an error occurs. continue_on_error: Whether to continue processing remaining files if an error occurs.
Raises: Raises:
CommandError: If the file/directory doesn't exist, isn't a JSON file, CommandError: If the path is invalid or moving files fails.
or has an invalid JSON structure. ValueError: If a JSON file has an invalid structure.
ValueError: If the JSON file has an invalid structure. TypeError: If a JSON file has an invalid structure.
TypeError: If the JSON file has an invalid structure. AttributeError: If a JSON file has an invalid structure.
AttributeError: If the JSON file has an invalid structure. KeyError: If a JSON file has an invalid structure.
KeyError: If the JSON file has an invalid structure. IndexError: If a JSON file has an invalid structure.
IndexError: If the JSON file has an invalid structure.
""" """
json_files: list[Path] = list(directory.glob("*.json")) json_files: list[Path] = list(directory.glob("*.json"))
if not json_files: if not json_files:
@ -181,19 +181,24 @@ class Command(BaseCommand):
total_files: int = len(json_files) total_files: int = len(json_files)
self.stdout.write(f"Found {total_files} JSON files to process") self.stdout.write(f"Found {total_files} JSON files to process")
for json_file in json_files: with concurrent.futures.ThreadPoolExecutor() as executor:
self.stdout.write(f"Processing file {json_file.name}...") future_to_file: dict[concurrent.futures.Future[None], Path] = {
try: executor.submit(self._process_file, json_file, processed_path): json_file for json_file in json_files
self._process_file(json_file, processed_path) }
except CommandError as e: for future in concurrent.futures.as_completed(future_to_file):
if not continue_on_error: json_file: Path = future_to_file[future]
raise self.stdout.write(f"Processing file {json_file.name}...")
self.stdout.write(self.style.ERROR(f"Error processing {json_file}: {e}")) try:
except (ValueError, TypeError, AttributeError, KeyError, IndexError): future.result()
if not continue_on_error: except CommandError as e:
raise if not continue_on_error:
self.stdout.write(self.style.ERROR(f"Data error processing {json_file}")) raise
self.stdout.write(self.style.ERROR(traceback.format_exc())) self.stdout.write(self.style.ERROR(f"Error processing {json_file}: {e}"))
except (ValueError, TypeError, AttributeError, KeyError, IndexError):
if not continue_on_error:
raise
self.stdout.write(self.style.ERROR(f"Data error processing {json_file}"))
self.stdout.write(self.style.ERROR(traceback.format_exc()))
msg: str = f"Processed {total_files} JSON files in {directory}. Moved processed files to {processed_path}." msg: str = f"Processed {total_files} JSON files in {directory}. Moved processed files to {processed_path}."
self.stdout.write(self.style.SUCCESS(msg)) self.stdout.write(self.style.SUCCESS(msg))