WIP better import drops
This commit is contained in:
parent
0400fed26a
commit
69fa30748a
10 changed files with 399 additions and 40 deletions
194
twitch/management/commands/better_import_drops.py
Normal file
194
twitch/management/commands/better_import_drops.py
Normal file
|
|
@ -0,0 +1,194 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
import sys
|
||||
from concurrent.futures import ProcessPoolExecutor
|
||||
from concurrent.futures import as_completed
|
||||
from pathlib import Path
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.core.management.base import CommandError
|
||||
from django.core.management.base import CommandParser
|
||||
from pydantic import ValidationError
|
||||
|
||||
from twitch.models import Channel
|
||||
from twitch.models import DropBenefit
|
||||
from twitch.models import DropCampaign
|
||||
from twitch.models import Game
|
||||
from twitch.models import Organization
|
||||
from twitch.schemas import ViewerDropsDashboardPayload
|
||||
|
||||
|
||||
def move_failed_validation_file(file_path: Path) -> Path:
    """Relocate a file that failed validation into a 'broken' subdirectory.

    The 'broken' directory is created next to the file if it does not
    already exist, and the file is moved into it keeping its name.

    Args:
        file_path: Path to the file that failed validation.

    Returns:
        Path to the 'broken' directory the file was moved into.
    """
    destination_dir: Path = file_path.parent / "broken"
    destination_dir.mkdir(parents=True, exist_ok=True)
    # NOTE(review): rename() raises on Windows if the target already exists — confirm
    # duplicate file names cannot collide here.
    file_path.rename(destination_dir / file_path.name)
    return destination_dir
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Import Twitch drop campaign data from a JSON file or directory of JSON files."""

    help = "Import Twitch drop campaign data from a JSON file or directory"
    requires_migrations_checks = True

    def __init__(self, *args, **kwargs) -> None:
        """Initialize per-instance caches.

        The twitch_id -> model caches were previously mutable class attributes,
        which would be shared across every Command instance; they are created
        per instance here instead. They are populated by pre_fill_cache().
        """
        super().__init__(*args, **kwargs)
        self.game_cache: dict[str, Game] = {}
        self.organization_cache: dict[str, Organization] = {}
        self.drop_campaign_cache: dict[str, DropCampaign] = {}
        self.channel_cache: dict[str, Channel] = {}
        self.benefit_cache: dict[str, DropBenefit] = {}

    def add_arguments(self, parser: CommandParser) -> None:
        """Populate the command with arguments."""
        parser.add_argument("path", type=str, help="Path to JSON file or directory")
        parser.add_argument("--recursive", action="store_true", help="Recursively search directories for JSON files")
        parser.add_argument("--crash-on-error", action="store_true", help="Crash the command on first error instead of continuing")

    def pre_fill_cache(self) -> None:
        """Load all existing IDs from DB into memory to avoid N+1 queries.

        Each cache maps str(twitch_id) -> model instance; keys are stringified
        so lookups match the string IDs found in the JSON payloads.
        """
        self.stdout.write("Pre-filling caches...")
        self.game_cache = {str(g.twitch_id): g for g in Game.objects.all()}
        self.stdout.write(f"\tGames: {len(self.game_cache)}")

        self.organization_cache = {str(o.twitch_id): o for o in Organization.objects.all()}
        self.stdout.write(f"\tOrganizations: {len(self.organization_cache)}")

        self.drop_campaign_cache = {str(c.twitch_id): c for c in DropCampaign.objects.all()}
        self.stdout.write(f"\tDrop Campaigns: {len(self.drop_campaign_cache)}")

        self.channel_cache = {str(ch.twitch_id): ch for ch in Channel.objects.all()}
        self.stdout.write(f"\tChannels: {len(self.channel_cache)}")

        self.benefit_cache = {str(b.twitch_id): b for b in DropBenefit.objects.all()}
        self.stdout.write(f"\tBenefits: {len(self.benefit_cache)}")

    def handle(self, *args, **options) -> None:  # noqa: ARG002
        """Main entry point for the command.

        Resolves the given path, pre-fills the DB caches, then dispatches to
        single-file or directory processing. Ctrl-C exits with status 130
        (the conventional SIGINT exit code).

        Raises:
            CommandError: If the provided path does not exist.
        """
        input_path: Path = Path(options["path"]).resolve()

        self.pre_fill_cache()

        try:
            if input_path.is_file():
                self.process_file(file_path=input_path, options=options)
            elif input_path.is_dir():
                self.process_json_files(input_path=input_path, options=options)
            else:
                msg: str = f"Path does not exist: {input_path}"
                raise CommandError(msg)
        except KeyboardInterrupt:
            self.stdout.write(self.style.WARNING("\n\nInterrupted by user!"))
            self.stdout.write(self.style.WARNING("Shutting down gracefully..."))
            sys.exit(130)

    def process_json_files(self, input_path: Path, options: dict) -> None:
        """Process multiple JSON files in a directory, validated in parallel.

        Files are fanned out to a ProcessPoolExecutor (process_file_worker must
        stay picklable, hence the staticmethod) and progress is reported as
        each future completes.

        Args:
            input_path: Path to the directory containing JSON files
            options: Command options
        """
        json_files: list[Path] = self.collect_json_files(options, input_path)
        self.stdout.write(f"Found {len(json_files)} JSON files to process")

        completed_count = 0
        with ProcessPoolExecutor() as executor:
            futures = {executor.submit(self.process_file_worker, file_path, options): file_path for file_path in json_files}

            for future in as_completed(futures):
                file_path: Path = futures[future]
                try:
                    result: dict[str, bool | str] = future.result()
                    if result["success"]:
                        self.stdout.write(f"✓ {file_path}")
                    else:
                        self.stdout.write(f"✗ {file_path} -> {result['broken_dir']}/{file_path.name}")

                    completed_count += 1
                except (OSError, ValueError, KeyError) as e:
                    # Worker failures are reported but do not stop the batch;
                    # a ValidationError re-raised under --crash-on-error still
                    # propagates (it is not in this tuple).
                    self.stdout.write(f"✗ {file_path} (error: {e})")
                    completed_count += 1

                self.stdout.write(f"Progress: {completed_count}/{len(json_files)} files processed")
        self.stdout.write("")

    def collect_json_files(self, options: dict, input_path: Path) -> list[Path]:
        """Collect JSON files from the specified directory.

        Args:
            options: Command options
            input_path: Path to the directory

        Returns:
            List of JSON file paths
        """
        if options["recursive"]:
            # Path.rglob replaces the previous os.walk + endswith(".json") loop;
            # is_file() filters out any directory that happens to match the glob.
            return [p for p in input_path.rglob("*.json") if p.is_file()]
        return [f for f in input_path.iterdir() if f.is_file() and f.suffix == ".json"]

    @staticmethod
    def process_file_worker(file_path: Path, options: dict) -> dict[str, bool | str]:
        """Worker function for parallel processing of files.

        Runs in a child process, so it only validates the payload and moves
        broken files; it must not touch the parent's caches or stdout.

        Args:
            file_path: Path to the JSON file to process
            options: Command options

        Raises:
            ValidationError: If the JSON file fails validation

        Returns:
            Dict with success status and optional broken_dir path
        """
        try:
            ViewerDropsDashboardPayload.model_validate_json(file_path.read_text(encoding="utf-8"))
        except ValidationError:
            if options["crash_on_error"]:
                raise

            broken_dir: Path = move_failed_validation_file(file_path)
            return {"success": False, "broken_dir": str(broken_dir)}
        else:
            return {"success": True}

    def process_file(self, file_path: Path, options: dict) -> None:
        """Reads a JSON file and processes the campaign data.

        Single-file path used when the command is given a file rather than a
        directory. On validation failure the file is moved aside unless
        --crash-on-error is set.

        Args:
            file_path: Path to the JSON file
            options: Command options

        Raises:
            ValidationError: If the JSON file fails validation
        """
        self.stdout.write(f"Processing file: {file_path}")

        try:
            _: ViewerDropsDashboardPayload = ViewerDropsDashboardPayload.model_validate_json(file_path.read_text(encoding="utf-8"))
            self.stdout.write("\tProcessed drop campaigns")
        except ValidationError:
            if options["crash_on_error"]:
                raise

            broken_dir: Path = move_failed_validation_file(file_path)
            self.stdout.write(f"\tMoved to {broken_dir} (validation failed)")
|
||||
|
|
@ -187,15 +187,15 @@ class Command(BaseCommand):
|
|||
"""Load existing DB objects into in-memory caches to avoid repeated queries."""
|
||||
# These queries may be heavy if DB is huge — safe because optional via --no-preload
|
||||
with self._cache_locks["game"]:
|
||||
self._game_cache = {str(g.id): g for g in Game.objects.all()}
|
||||
self._game_cache = {str(g.twitch_id): g for g in Game.objects.all()}
|
||||
with self._cache_locks["org"]:
|
||||
self._organization_cache = {str(o.id): o for o in Organization.objects.all()}
|
||||
self._organization_cache = {str(o.twitch_id): o for o in Organization.objects.all()}
|
||||
with self._cache_locks["campaign"]:
|
||||
self._drop_campaign_cache = {str(c.id): c for c in DropCampaign.objects.all()}
|
||||
self._drop_campaign_cache = {str(c.twitch_id): c for c in DropCampaign.objects.all()}
|
||||
with self._cache_locks["channel"]:
|
||||
self._channel_cache = {str(ch.id): ch for ch in Channel.objects.all()}
|
||||
self._channel_cache = {str(ch.twitch_id): ch for ch in Channel.objects.all()}
|
||||
with self._cache_locks["benefit"]:
|
||||
self._benefit_cache = {str(b.id): b for b in DropBenefit.objects.all()}
|
||||
self._benefit_cache = {str(b.twitch_id): b for b in DropBenefit.objects.all()}
|
||||
|
||||
def process_drops(self, *, continue_on_error: bool, path: Path, processed_path: Path) -> None:
|
||||
"""Process drops from a file or directory.
|
||||
|
|
@ -397,8 +397,8 @@ class Command(BaseCommand):
|
|||
return
|
||||
|
||||
if isinstance(data, list):
|
||||
for _item in data:
|
||||
self.import_drop_campaign(_item, file_path=file_path)
|
||||
for item in data:
|
||||
self.import_drop_campaign(item, file_path=file_path)
|
||||
elif isinstance(data, dict):
|
||||
self.import_drop_campaign(data, file_path=file_path)
|
||||
else:
|
||||
|
|
@ -534,7 +534,7 @@ class Command(BaseCommand):
|
|||
|
||||
benefit_edges: list[dict[str, Any]] = drop_data.get("benefitEdges", [])
|
||||
if not benefit_edges:
|
||||
tqdm.write(self.style.WARNING(f"No benefit edges found for drop {time_based_drop.name} (ID: {time_based_drop.id})"))
|
||||
tqdm.write(self.style.WARNING(f"No benefit edges found for drop {time_based_drop.name} (ID: {time_based_drop.twitch_id})"))
|
||||
self.move_file(file_path, Path("no_benefit_edges") / file_path.name)
|
||||
return
|
||||
|
||||
|
|
@ -570,10 +570,10 @@ class Command(BaseCommand):
|
|||
if created:
|
||||
tqdm.write(f"Added {drop_benefit_edge}")
|
||||
except MultipleObjectsReturned as e:
|
||||
msg = f"Error: Multiple DropBenefitEdge objects found for drop {time_based_drop.id} and benefit {benefit.id}. Cannot update or create."
|
||||
msg = f"Error: Multiple DropBenefitEdge objects found for drop {time_based_drop.twitch_id} and benefit {benefit.twitch_id}. Cannot update or create." # noqa: E501
|
||||
raise CommandError(msg) from e
|
||||
except (IntegrityError, DatabaseError, TypeError, ValueError) as e:
|
||||
msg = f"Database or validation error creating DropBenefitEdge for drop {time_based_drop.id} and benefit {benefit.id}: {e}"
|
||||
msg = f"Database or validation error creating DropBenefitEdge for drop {time_based_drop.twitch_id} and benefit {benefit.twitch_id}: {e}"
|
||||
raise CommandError(msg) from e
|
||||
|
||||
def create_time_based_drop(self, drop_campaign: DropCampaign, drop_data: dict[str, Any]) -> TimeBasedDrop:
|
||||
|
|
@ -847,7 +847,7 @@ class Command(BaseCommand):
|
|||
|
||||
# Set the many-to-many relationship (save only if different)
|
||||
current_ids = set(drop_campaign.allow_channels.values_list("id", flat=True))
|
||||
new_ids = {ch.id for ch in channel_objects}
|
||||
new_ids = {ch.twitch_id for ch in channel_objects}
|
||||
if current_ids != new_ids:
|
||||
drop_campaign.allow_channels.set(channel_objects)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue