diff --git a/templates/twitch/dataset_backups.html b/templates/twitch/dataset_backups.html index c6caa80..0cb4064 100644 --- a/templates/twitch/dataset_backups.html +++ b/templates/twitch/dataset_backups.html @@ -5,6 +5,21 @@ {% block content %}

Dataset Backups

+
+

About this dataset

+

This site tracks and publishes open Twitch and Kick drop campaign data.

+

+ The exported datasets on this page are released under CC0 so you can reuse them freely. + The underlying source data is scraped from Twitch/Kick APIs and pages, so we do not control the + upstream content and cannot guarantee upstream accuracy or permanence. +

+

Note that some drops have missing or incomplete data due to Twitch API limitations.

+

+ Need a special format for your workflow or research pipeline? + Contact me via GitHub issues + and describe what you need. +

+
{% if datasets %} diff --git a/twitch/management/commands/backup_db.py b/twitch/management/commands/backup_db.py index b1b380d..88c7ece 100644 --- a/twitch/management/commands/backup_db.py +++ b/twitch/management/commands/backup_db.py @@ -87,7 +87,7 @@ class Command(BaseCommand): json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst" _write_json_dump(json_path, allowed_tables) - csv_paths: list[Path] = _write_csv_dumps( + csv_path: Path = _write_csv_dump( output_dir, prefix, timestamp, @@ -104,9 +104,7 @@ class Command(BaseCommand): ), ) self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}")) - self.stdout.write( - self.style.SUCCESS(f"CSV backups created: {len(csv_paths)} files"), - ) + self.stdout.write(self.style.SUCCESS(f"CSV backup created: {csv_path}")) self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}")) @@ -353,13 +351,13 @@ def _write_json_dump(output_path: Path, tables: list[str]) -> None: json.dump(data, handle, default=_json_default) -def _write_csv_dumps( +def _write_csv_dump( output_dir: Path, prefix: str, timestamp: str, tables: list[str], -) -> list[Path]: - """Write per-table CSV files into zstd-compressed files. +) -> Path: + """Write a combined CSV file containing rows from all tables. Args: output_dir: Directory where CSV files will be written. @@ -368,23 +366,29 @@ def _write_csv_dumps( tables: Table names to include. Returns: - List of created file paths. + Created file path. 
""" - paths: list[Path] = [] - with django_connection.cursor() as cursor: - for table in tables: - cursor.execute(f'SELECT * FROM "{table}"') # noqa: S608 - columns: list[str] = [col[0] for col in cursor.description] - rows: list[tuple] = cursor.fetchall() + output_path: Path = output_dir / f"{prefix}-{timestamp}.csv.zst" - output_path: Path = output_dir / f"{prefix}-{timestamp}-{table}.csv.zst" - with ( - output_path.open("wb") as raw_handle, - zstd.open(raw_handle, "w") as compressed, - io.TextIOWrapper(compressed, encoding="utf-8") as handle, - ): - writer: csv.Writer = csv.writer(handle) - writer.writerow(columns) - writer.writerows(rows) - paths.append(output_path) - return paths + with ( + output_path.open("wb") as raw_handle, + zstd.open(raw_handle, "w") as compressed, + io.TextIOWrapper(compressed, encoding="utf-8") as handle, + ): + writer: csv.Writer = csv.writer(handle) + writer.writerow(["table", "row_json"]) + + with django_connection.cursor() as cursor: + for table in tables: + cursor.execute(f'SELECT * FROM "{table}"') # noqa: S608 + columns: list[str] = [col[0] for col in cursor.description] + rows: list[tuple] = cursor.fetchall() + + for row in rows: + row_dict = dict(zip(columns, row, strict=False)) + writer.writerow([ + table, + json.dumps(row_dict, default=_json_default), + ]) + + return output_path diff --git a/twitch/tests/test_backup.py b/twitch/tests/test_backup.py index 40e88ca..0eb84a4 100644 --- a/twitch/tests/test_backup.py +++ b/twitch/tests/test_backup.py @@ -17,7 +17,7 @@ from django.urls import reverse from twitch.management.commands.backup_db import _get_allowed_tables from twitch.management.commands.backup_db import _json_default from twitch.management.commands.backup_db import _sql_literal -from twitch.management.commands.backup_db import _write_csv_dumps +from twitch.management.commands.backup_db import _write_csv_dump from twitch.management.commands.backup_db import _write_json_dump from twitch.management.commands.backup_db 
import _write_postgres_dump from twitch.management.commands.backup_db import _write_sqlite_dump @@ -198,8 +198,8 @@ class TestBackupCommand: row.get("name") == "Test Org JSON" for row in data["twitch_organization"] ) - def test_backup_creates_csv_files(self, tmp_path: Path) -> None: - """Test that backup command creates per-table CSV files alongside the SQL dump.""" + def test_backup_creates_single_csv_file(self, tmp_path: Path) -> None: + """Test that backup command creates a single CSV file alongside the SQL dump.""" _skip_if_pg_dump_missing() Organization.objects.create(twitch_id="test_csv", name="Test Org CSV") @@ -208,13 +208,11 @@ class TestBackupCommand: call_command("backup_db", output_dir=str(output_dir), prefix="test") - org_csv_files: list[Path] = list( - output_dir.glob("test-*-twitch_organization.csv.zst"), - ) - assert len(org_csv_files) == 1 + csv_files: list[Path] = list(output_dir.glob("test-*.csv.zst")) + assert len(csv_files) == 1 with ( - org_csv_files[0].open("rb") as raw_handle, + csv_files[0].open("rb") as raw_handle, zstd.open(raw_handle, "r") as compressed, io.TextIOWrapper(compressed, encoding="utf-8") as handle, ): @@ -222,8 +220,11 @@ class TestBackupCommand: rows: list[list[str]] = list(reader) assert len(rows) >= 2 # header + at least one data row - assert "name" in rows[0] - assert any("Test Org CSV" in row for row in rows[1:]) + assert rows[0] == ["table", "row_json"] + data_rows: list[list[str]] = [ + row for row in rows[1:] if row and row[0] == "twitch_organization" + ] + assert any("Test Org CSV" in row[1] for row in data_rows) @pytest.mark.django_db @@ -336,26 +337,23 @@ class TestBackupHelperFunctions: row.get("name") == "JSON Helper Org" for row in data["twitch_organization"] ) - def test_write_csv_dumps_creates_per_table_files(self, tmp_path: Path) -> None: - """Test _write_csv_dumps creates one compressed CSV file per table.""" + def test_write_csv_dump_creates_single_file(self, tmp_path: Path) -> None: + """Test 
_write_csv_dump creates one combined compressed CSV file.""" Organization.objects.create(twitch_id="test_csv_helper", name="CSV Helper Org") tables: list[str] = _get_allowed_tables("twitch_") - paths: list[Path] = _write_csv_dumps( + path: Path = _write_csv_dump( tmp_path, "test", "20260317-120000", tables, ) - assert len(paths) == len(tables) - assert all(p.exists() for p in paths) - - org_csv: Path = tmp_path / "test-20260317-120000-twitch_organization.csv.zst" - assert org_csv.exists() + assert path.exists() + assert path.name == "test-20260317-120000.csv.zst" with ( - org_csv.open("rb") as raw_handle, + path.open("rb") as raw_handle, zstd.open(raw_handle, "r") as compressed, io.TextIOWrapper(compressed, encoding="utf-8") as handle, ): @@ -363,8 +361,11 @@ class TestBackupHelperFunctions: rows: list[list[str]] = list(reader) assert len(rows) >= 2 # header + at least one data row - assert "name" in rows[0] - assert any("CSV Helper Org" in row for row in rows[1:]) + assert rows[0] == ["table", "row_json"] + data_rows: list[list[str]] = [ + row for row in rows[1:] if row and row[0] == "twitch_organization" + ] + assert any("CSV Helper Org" in row[1] for row in data_rows) def test_json_default_handles_bytes(self) -> None: """Test _json_default converts bytes to hex string.""" @@ -388,7 +389,7 @@ class TestDatasetBackupViews: Returns: Path to the created datasets directory. """ - datasets_dir = tmp_path / "datasets" + datasets_dir: Path = tmp_path / "datasets" datasets_dir.mkdir() return datasets_dir @@ -399,7 +400,7 @@ class TestDatasetBackupViews: Returns: Path to the created backup file. 
""" - backup_file = datasets_dir / "ttvdrops-20260210-120000.sql.zst" + backup_file: Path = datasets_dir / "ttvdrops-20260210-120000.sql.zst" with ( backup_file.open("wb") as raw_handle, zstd.open(raw_handle, "w") as compressed, @@ -452,8 +453,8 @@ class TestDatasetBackupViews: monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent) # Create multiple backup files with different timestamps - older_backup = datasets_dir / "ttvdrops-20260210-100000.sql.zst" - newer_backup = datasets_dir / "ttvdrops-20260210-140000.sql.zst" + older_backup: Path = datasets_dir / "ttvdrops-20260210-100000.sql.zst" + newer_backup: Path = datasets_dir / "ttvdrops-20260210-140000.sql.zst" for backup in [older_backup, newer_backup]: with ( @@ -473,9 +474,9 @@ class TestDatasetBackupViews: reverse("core:dataset_backups"), ) - content = response.content.decode() - newer_pos = content.find("20260210-140000") - older_pos = content.find("20260210-100000") + content: str = response.content.decode() + newer_pos: int = content.find("20260210-140000") + older_pos: int = content.find("20260210-100000") # Newer backup should appear first (sorted descending) assert 0 < newer_pos < older_pos @@ -512,7 +513,7 @@ class TestDatasetBackupViews: monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent) # Attempt path traversal - response = client.get( + response: _MonkeyPatchedWSGIResponse = client.get( reverse("core:dataset_backup_download", args=["../../../etc/passwd"]), ) assert response.status_code == 404 @@ -527,10 +528,10 @@ class TestDatasetBackupViews: monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent) # Create a file with invalid extension - invalid_file = datasets_dir / "malicious.exe" - invalid_file.write_text("not a backup") + invalid_file: Path = datasets_dir / "malicious.exe" + invalid_file.write_text("not a backup", encoding="utf-8") - response = client.get( + response: _MonkeyPatchedWSGIResponse = client.get( reverse("core:dataset_backup_download", 
args=["malicious.exe"]), ) assert response.status_code == 404 @@ -544,7 +545,7 @@ class TestDatasetBackupViews: """Test download returns 404 for non-existent file.""" monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent) - response = client.get( + response: _MonkeyPatchedWSGIResponse = client.get( reverse("core:dataset_backup_download", args=["nonexistent.sql.zst"]), ) assert response.status_code == 404 @@ -565,7 +566,7 @@ class TestDatasetBackupViews: assert response.status_code == 200 # Should contain size information (bytes, KB, MB, or GB) - content = response.content.decode() + content: str = response.content.decode() assert any(unit in content for unit in ["bytes", "KB", "MB", "GB"]) def test_dataset_list_ignores_non_zst_files( @@ -586,7 +587,7 @@ class TestDatasetBackupViews: reverse("core:dataset_backups"), ) - content = response.content.decode() + content: str = response.content.decode() assert "backup.sql.zst" in content assert "readme.txt" not in content assert "old_backup.gz" not in content @@ -601,9 +602,9 @@ class TestDatasetBackupViews: monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent) # Create subdirectory with backup - subdir = datasets_dir / "2026" / "02" + subdir: Path = datasets_dir / "2026" / "02" subdir.mkdir(parents=True) - backup_file = subdir / "backup.sql.zst" + backup_file: Path = subdir / "backup.sql.zst" with ( backup_file.open("wb") as raw_handle, zstd.open(raw_handle, "w") as compressed,