diff --git a/core/views.py b/core/views.py index 601ce11..352109d 100644 --- a/core/views.py +++ b/core/views.py @@ -644,56 +644,9 @@ def dataset_backups_view(request: HttpRequest) -> HttpResponse: datasets.sort(key=operator.itemgetter("updated_at"), reverse=True) - dataset_distributions: list[dict[str, str]] = [] - for dataset in datasets: - download_path: str | None = dataset.get("download_path") - if not download_path: - continue - dataset_distributions.append({ - "@type": "DataDownload", - "name": dataset["name"], - "contentUrl": request.build_absolute_uri( - reverse("core:dataset_backup_download", args=[download_path]), - ), - "encodingFormat": "application/zstd", - }) - - dataset_schema: dict[str, Any] = { - "@context": "https://schema.org", - "@type": "Dataset", - "name": "Historical archive of Twitch and Kick drop data", - "identifier": request.build_absolute_uri(reverse("core:dataset_backups")), - "temporalCoverage": "2024-07-17/..", - "url": request.build_absolute_uri(reverse("core:dataset_backups")), - "license": "https://creativecommons.org/publicdomain/zero/1.0/", - "isAccessibleForFree": True, - "description": ( - "Historical data on Twitch and Kick drops, campaigns, rewards, and more, available for download as compressed SQL files or JSON." 
- ), - "keywords": [ - "Twitch drops", - "Kick drops", - ], - "creator": { - "@type": "Person", - "givenName": "Joakim", - "familyName": "Hellsén", - "name": "Joakim Hellsén", - "sameAs": "https://orcid.org/0009-0006-7305-524X", - }, - "includedInDataCatalog": { - "@type": "DataCatalog", - "name": "ttvdrops.lovinator.space", - "url": request.build_absolute_uri(reverse("core:dataset_backups")), - }, - } - if dataset_distributions: - dataset_schema["distribution"] = dataset_distributions - seo_context: dict[str, Any] = _build_seo_context( page_title="Twitch/Kick drop data", page_description="Twitch/Kick datasets available for download, including historical drop campaign data and more.", - schema_data=dataset_schema, ) context: dict[str, Any] = { "datasets": datasets, diff --git a/templates/twitch/dataset_backups.html b/templates/twitch/dataset_backups.html index 5837bf6..0cb4064 100644 --- a/templates/twitch/dataset_backups.html +++ b/templates/twitch/dataset_backups.html @@ -31,7 +31,7 @@ {% for dataset in datasets %} - + {{ dataset.name }} diff --git a/twitch/management/commands/backup_db.py b/twitch/management/commands/backup_db.py index d651cfe..88c7ece 100644 --- a/twitch/management/commands/backup_db.py +++ b/twitch/management/commands/backup_db.py @@ -1,3 +1,4 @@ +import csv import io import json import os @@ -86,6 +87,13 @@ class Command(BaseCommand): json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst" _write_json_dump(json_path, allowed_tables) + csv_path: Path = _write_csv_dump( + output_dir, + prefix, + timestamp, + allowed_tables, + ) + created_at: datetime = datetime.fromtimestamp( output_path.stat().st_mtime, tz=timezone.get_current_timezone(), @@ -96,6 +104,7 @@ class Command(BaseCommand): ), ) self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}")) + self.stdout.write(self.style.SUCCESS(f"CSV backup created: {csv_path}")) self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}")) @@ -340,3 +349,46 
def _write_csv_dump(
    output_dir: Path,
    prefix: str,
    timestamp: str,
    tables: list[str],
) -> Path:
    """Write one zstd-compressed CSV file containing rows from all tables.

    The file starts with a ``table,row_json`` header. Every database row then
    becomes one CSV record: the table name, followed by the row serialised as
    a JSON object keyed by column name (values that ``json`` cannot encode
    natively go through ``_json_default``).

    Args:
        output_dir: Directory where the CSV file will be written.
        prefix: Filename prefix.
        timestamp: Timestamp string used in the filename.
        tables: Table names to include, dumped in the given order.

    Returns:
        Path of the created ``{prefix}-{timestamp}.csv.zst`` file.
    """
    output_path: Path = output_dir / f"{prefix}-{timestamp}.csv.zst"

    with (
        output_path.open("wb") as raw_handle,
        zstd.open(raw_handle, "w") as compressed,
        # newline="" leaves line endings to the csv module. With the default
        # translation, the writer's "\r\n" terminator becomes "\r\r\n" on
        # platforms whose native line separator is "\r\n".
        io.TextIOWrapper(compressed, encoding="utf-8", newline="") as handle,
    ):
        writer: csv.Writer = csv.writer(handle)
        writer.writerow(["table", "row_json"])

        with django_connection.cursor() as cursor:
            for table in tables:
                cursor.execute(f'SELECT * FROM "{table}"')  # noqa: S608
                columns: list[str] = [col[0] for col in cursor.description]

                # Iterate the cursor rather than calling fetchall(), so a large
                # table is not copied into an extra in-memory list first.
                for row in cursor:
                    # strict=True: a column/value count mismatch should fail
                    # loudly instead of silently dropping data from a backup.
                    row_dict = dict(zip(columns, row, strict=True))
                    writer.writerow([
                        table,
                        json.dumps(row_dict, default=_json_default),
                    ])

    return output_path
# Method of TestBackupCommand (per the diff hunk header).
def test_backup_creates_single_csv_file(self, tmp_path: Path) -> None:
    """Running ``backup_db`` yields exactly one combined ``.csv.zst`` file."""
    _skip_if_pg_dump_missing()
    Organization.objects.create(twitch_id="test_csv", name="Test Org CSV")

    backup_dir: Path = tmp_path / "backups"
    backup_dir.mkdir()

    call_command("backup_db", output_dir=str(backup_dir), prefix="test")

    matches: list[Path] = list(backup_dir.glob("test-*.csv.zst"))
    assert len(matches) == 1

    # Decompress and parse the whole file before making assertions on it.
    with (
        matches[0].open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        rows: list[list[str]] = list(csv.reader(handle))

    assert len(rows) >= 2  # header + at least one data row
    assert rows[0] == ["table", "row_json"]

    # JSON payloads of the organization table only; blank records are skipped.
    org_payloads = (
        record[1]
        for record in rows[1:]
        if record and record[0] == "twitch_organization"
    )
    assert any("Test Org CSV" in payload for payload in org_payloads)


# Method of TestBackupHelperFunctions (per the surrounding diff context).
def test_write_csv_dump_creates_single_file(self, tmp_path: Path) -> None:
    """``_write_csv_dump`` writes one combined compressed CSV and returns its path."""
    Organization.objects.create(twitch_id="test_csv_helper", name="CSV Helper Org")

    dumped: Path = _write_csv_dump(
        tmp_path,
        "test",
        "20260317-120000",
        _get_allowed_tables("twitch_"),
    )

    assert dumped.exists()
    assert dumped.name == "test-20260317-120000.csv.zst"

    # Decompress and parse the whole file before making assertions on it.
    with (
        dumped.open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        rows: list[list[str]] = list(csv.reader(handle))

    assert len(rows) >= 2  # header + at least one data row
    assert rows[0] == ["table", "row_json"]

    # JSON payloads of the organization table only; blank records are skipped.
    org_payloads = (
        record[1]
        for record in rows[1:]
        if record and record[0] == "twitch_organization"
    )
    assert any("CSV Helper Org" in payload for payload in org_payloads)