Compare commits

...

2 commits

Author SHA1 Message Date
9fd22ba8a8
Add JSON and CSV functionality to database backup command
All checks were successful
Deploy to Server / deploy (push) Successful in 40s
2026-03-17 00:38:01 +01:00
563266d8cc
Also add Kick to dataset 2026-03-17 00:07:22 +01:00
3 changed files with 238 additions and 12 deletions

View file

@ -645,8 +645,8 @@ def dataset_backups_view(request: HttpRequest) -> HttpResponse:
datasets.sort(key=operator.itemgetter("updated_at"), reverse=True) datasets.sort(key=operator.itemgetter("updated_at"), reverse=True)
seo_context: dict[str, Any] = _build_seo_context( seo_context: dict[str, Any] = _build_seo_context(
page_title="Twitch Dataset", page_title="Twitch/Kick drop data",
page_description="Database backups and datasets available for download.", page_description="Twitch/Kick datasets available for download, including historical drop campaign data and more.",
) )
context: dict[str, Any] = { context: dict[str, Any] = {
"datasets": datasets, "datasets": datasets,

View file

@ -1,4 +1,6 @@
import csv
import io import io
import json
import os import os
import shutil import shutil
import subprocess # noqa: S404 import subprocess # noqa: S404
@ -19,9 +21,9 @@ if TYPE_CHECKING:
class Command(BaseCommand): class Command(BaseCommand):
"""Create a compressed SQL dump of the Twitch dataset tables.""" """Create a compressed SQL dump of the Twitch and Kick dataset tables."""
help = "Create a compressed SQL dump of the Twitch dataset tables." help = "Create a compressed SQL dump of the Twitch and Kick dataset tables."
def add_arguments(self, parser: ArgumentParser) -> None: def add_arguments(self, parser: ArgumentParser) -> None:
"""Define arguments for the backup command.""" """Define arguments for the backup command."""
@ -59,9 +61,14 @@ class Command(BaseCommand):
timestamp: str = timezone.localtime(timezone.now()).strftime("%Y%m%d-%H%M%S") timestamp: str = timezone.localtime(timezone.now()).strftime("%Y%m%d-%H%M%S")
output_path: Path = output_dir / f"{prefix}-{timestamp}.sql.zst" output_path: Path = output_dir / f"{prefix}-{timestamp}.sql.zst"
allowed_tables = _get_allowed_tables("twitch_") allowed_tables = sorted({
*_get_allowed_tables("twitch_"),
*_get_allowed_tables("kick_"),
})
if not allowed_tables: if not allowed_tables:
self.stdout.write(self.style.WARNING("No twitch tables found to back up.")) self.stdout.write(
self.style.WARNING("No twitch or kick tables found to back up."),
)
return return
if django_connection.vendor == "postgresql": if django_connection.vendor == "postgresql":
@ -77,6 +84,16 @@ class Command(BaseCommand):
msg = f"Unsupported database backend: {django_connection.vendor}" msg = f"Unsupported database backend: {django_connection.vendor}"
raise CommandError(msg) raise CommandError(msg)
json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst"
_write_json_dump(json_path, allowed_tables)
csv_paths: list[Path] = _write_csv_dumps(
output_dir,
prefix,
timestamp,
allowed_tables,
)
created_at: datetime = datetime.fromtimestamp( created_at: datetime = datetime.fromtimestamp(
output_path.stat().st_mtime, output_path.stat().st_mtime,
tz=timezone.get_current_timezone(), tz=timezone.get_current_timezone(),
@ -86,6 +103,10 @@ class Command(BaseCommand):
f"Backup created: {output_path} (updated {created_at.isoformat()})", f"Backup created: {output_path} (updated {created_at.isoformat()})",
), ),
) )
self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}"))
self.stdout.write(
self.style.SUCCESS(f"CSV backups created: {len(csv_paths)} files"),
)
self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}")) self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}"))
@ -293,3 +314,77 @@ def _sql_literal(value: object) -> str:
if isinstance(value, bytes): if isinstance(value, bytes):
return "X'" + value.hex() + "'" return "X'" + value.hex() + "'"
return "'" + str(value).replace("'", "''") + "'" return "'" + str(value).replace("'", "''") + "'"
def _json_default(value: object) -> str:
"""Convert non-serializable values to JSON-compatible strings.
Args:
value: Value to convert.
Returns:
String representation.
"""
if isinstance(value, bytes):
return value.hex()
return str(value)
def _write_json_dump(output_path: Path, tables: list[str]) -> None:
    """Write a JSON dump of all tables into a zstd-compressed file.

    Args:
        output_path: Destination path for the zstd file.
        tables: Table names to include.
    """
    data: dict[str, list[dict]] = {}
    with django_connection.cursor() as cursor:
        for table in tables:
            # Table names come from introspection, not user input.
            cursor.execute(f'SELECT * FROM "{table}"')  # noqa: S608
            columns: list[str] = [col[0] for col in cursor.description]
            rows = cursor.fetchall()
            # strict=True: DB-API guarantees one value per described column,
            # so a length mismatch is a driver bug — fail loudly rather than
            # silently dropping trailing columns (strict=False would).
            data[table] = [dict(zip(columns, row, strict=True)) for row in rows]

    with (
        output_path.open("wb") as raw_handle,
        zstd.open(raw_handle, "w") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        # _json_default stringifies values json cannot serialize natively.
        json.dump(data, handle, default=_json_default)
def _write_csv_dumps(
    output_dir: Path,
    prefix: str,
    timestamp: str,
    tables: list[str],
) -> list[Path]:
    """Write per-table CSV files into zstd-compressed files.

    Args:
        output_dir: Directory where CSV files will be written.
        prefix: Filename prefix.
        timestamp: Timestamp string for filenames.
        tables: Table names to include.

    Returns:
        List of created file paths.
    """
    paths: list[Path] = []
    with django_connection.cursor() as cursor:
        for table in tables:
            # Table names come from introspection, not user input.
            cursor.execute(f'SELECT * FROM "{table}"')  # noqa: S608
            columns: list[str] = [col[0] for col in cursor.description]
            output_path: Path = output_dir / f"{prefix}-{timestamp}-{table}.csv.zst"
            with (
                output_path.open("wb") as raw_handle,
                zstd.open(raw_handle, "w") as compressed,
                io.TextIOWrapper(compressed, encoding="utf-8") as handle,
            ):
                writer = csv.writer(handle)
                writer.writerow(columns)
                # Stream rows straight from the cursor instead of fetchall()
                # so large tables never have to fit in memory at once.
                writer.writerows(cursor)
            paths.append(output_path)
    return paths

View file

@ -1,8 +1,11 @@
import csv
import io import io
import json
import math import math
import os import os
import shutil import shutil
from compression import zstd from compression import zstd
from datetime import datetime as dt
from typing import TYPE_CHECKING from typing import TYPE_CHECKING
import pytest import pytest
@ -12,13 +15,18 @@ from django.db import connection
from django.urls import reverse from django.urls import reverse
from twitch.management.commands.backup_db import _get_allowed_tables from twitch.management.commands.backup_db import _get_allowed_tables
from twitch.management.commands.backup_db import _json_default
from twitch.management.commands.backup_db import _sql_literal from twitch.management.commands.backup_db import _sql_literal
from twitch.management.commands.backup_db import _write_csv_dumps
from twitch.management.commands.backup_db import _write_json_dump
from twitch.management.commands.backup_db import _write_postgres_dump from twitch.management.commands.backup_db import _write_postgres_dump
from twitch.management.commands.backup_db import _write_sqlite_dump from twitch.management.commands.backup_db import _write_sqlite_dump
from twitch.models import Game from twitch.models import Game
from twitch.models import Organization from twitch.models import Organization
if TYPE_CHECKING: if TYPE_CHECKING:
from csv import Reader
from datetime import datetime
from pathlib import Path from pathlib import Path
from django.test import Client from django.test import Client
@ -84,34 +92,39 @@ class TestBackupCommand:
assert "twitch_game" in content assert "twitch_game" in content
assert "Test Org" in content assert "Test Org" in content
def test_backup_excludes_non_twitch_tables(self, tmp_path: Path) -> None: def test_backup_excludes_non_app_tables(self, tmp_path: Path) -> None:
"""Test that backup only includes twitch_ prefixed tables.""" """Test that backup includes app tables and excludes non-app tables."""
_skip_if_pg_dump_missing() _skip_if_pg_dump_missing()
# Create test data so tables exist # Create test data so tables exist
Organization.objects.create(twitch_id="test001", name="Test Org") Organization.objects.create(twitch_id="test001", name="Test Org")
output_dir = tmp_path / "backups" output_dir: Path = tmp_path / "backups"
output_dir.mkdir() output_dir.mkdir()
call_command("backup_db", output_dir=str(output_dir), prefix="test") call_command("backup_db", output_dir=str(output_dir), prefix="test")
backup_file = next(iter(output_dir.glob("test-*.sql.zst"))) backup_file: Path = next(iter(output_dir.glob("test-*.sql.zst")))
with ( with (
backup_file.open("rb") as raw_handle, backup_file.open("rb") as raw_handle,
zstd.open(raw_handle, "r") as compressed, zstd.open(raw_handle, "r") as compressed,
io.TextIOWrapper(compressed, encoding="utf-8") as handle, io.TextIOWrapper(compressed, encoding="utf-8") as handle,
): ):
content = handle.read() content: str = handle.read()
# Should NOT contain django admin, silk, or debug toolbar tables # Should NOT contain django admin, silk, or debug toolbar tables
assert "django_session" not in content assert "django_session" not in content
assert "django_migrations" not in content
assert "django_content_type" not in content
assert "silk_" not in content assert "silk_" not in content
assert "debug_toolbar_" not in content assert "debug_toolbar_" not in content
assert "django_admin_log" not in content assert "django_admin_log" not in content
assert "auth_" not in content
assert "youtube_" not in content
# Should contain twitch tables # Should contain twitch and kick tables
assert "twitch_" in content assert "twitch_" in content
assert "kick_" in content
def test_backup_with_custom_prefix(self, tmp_path: Path) -> None: def test_backup_with_custom_prefix(self, tmp_path: Path) -> None:
"""Test that custom prefix is used in filename.""" """Test that custom prefix is used in filename."""
@ -159,6 +172,59 @@ class TestBackupCommand:
backup_files = list(datasets_dir.glob("ttvdrops-*.sql.zst")) backup_files = list(datasets_dir.glob("ttvdrops-*.sql.zst"))
assert len(backup_files) >= 1 assert len(backup_files) >= 1
def test_backup_creates_json_file(self, tmp_path: Path) -> None:
    """Test that backup command creates a JSON file alongside the SQL dump."""
    _skip_if_pg_dump_missing()

    Organization.objects.create(twitch_id="test_json", name="Test Org JSON")
    backup_dir: Path = tmp_path / "backups"
    backup_dir.mkdir()

    call_command("backup_db", output_dir=str(backup_dir), prefix="test")

    found: list[Path] = list(backup_dir.glob("test-*.json.zst"))
    assert len(found) == 1

    with (
        found[0].open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        payload = json.load(handle)

    assert isinstance(payload, dict)
    assert "twitch_organization" in payload
    org_rows = payload["twitch_organization"]
    assert any(entry.get("name") == "Test Org JSON" for entry in org_rows)
def test_backup_creates_csv_files(self, tmp_path: Path) -> None:
    """Test that backup command creates per-table CSV files alongside the SQL dump."""
    _skip_if_pg_dump_missing()

    Organization.objects.create(twitch_id="test_csv", name="Test Org CSV")
    backup_dir: Path = tmp_path / "backups"
    backup_dir.mkdir()

    call_command("backup_db", output_dir=str(backup_dir), prefix="test")

    matches: list[Path] = list(
        backup_dir.glob("test-*-twitch_organization.csv.zst"),
    )
    assert len(matches) == 1

    with (
        matches[0].open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        parsed: list[list[str]] = list(csv.reader(handle))

    assert len(parsed) >= 2  # header + at least one data row
    assert "name" in parsed[0]
    assert any("Test Org CSV" in row for row in parsed[1:])
@pytest.mark.django_db @pytest.mark.django_db
class TestBackupHelperFunctions: class TestBackupHelperFunctions:
@ -245,6 +311,71 @@ class TestBackupHelperFunctions:
assert "INSERT INTO" in content assert "INSERT INTO" in content
assert "Write Test Org" in content assert "Write Test Org" in content
def test_write_json_dump_creates_valid_json(self, tmp_path: Path) -> None:
    """Test _write_json_dump creates valid compressed JSON with all tables."""
    Organization.objects.create(
        twitch_id="test_json_helper",
        name="JSON Helper Org",
    )

    table_names: list[str] = _get_allowed_tables("twitch_")
    dump_path: Path = tmp_path / "backup.json.zst"
    _write_json_dump(dump_path, table_names)

    with (
        dump_path.open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        payload = json.load(handle)

    assert isinstance(payload, dict)
    assert "twitch_organization" in payload
    assert all(name in payload for name in table_names)
    org_rows = payload["twitch_organization"]
    assert any(entry.get("name") == "JSON Helper Org" for entry in org_rows)
def test_write_csv_dumps_creates_per_table_files(self, tmp_path: Path) -> None:
    """Test _write_csv_dumps creates one compressed CSV file per table."""
    Organization.objects.create(twitch_id="test_csv_helper", name="CSV Helper Org")

    table_names: list[str] = _get_allowed_tables("twitch_")
    created: list[Path] = _write_csv_dumps(
        tmp_path,
        "test",
        "20260317-120000",
        table_names,
    )

    assert len(created) == len(table_names)
    assert all(path.exists() for path in created)

    org_csv_path: Path = tmp_path / "test-20260317-120000-twitch_organization.csv.zst"
    assert org_csv_path.exists()

    with (
        org_csv_path.open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        parsed: list[list[str]] = list(csv.reader(handle))

    assert len(parsed) >= 2  # header + at least one data row
    assert "name" in parsed[0]
    assert any("CSV Helper Org" in row for row in parsed[1:])
def test_json_default_handles_bytes(self) -> None:
    """Test _json_default converts bytes to hex string."""
    cases: dict[bytes, str] = {b"\x00\x01": "0001", b"hello": "68656c6c6f"}
    for raw, expected in cases.items():
        assert _json_default(raw) == expected
def test_json_default_handles_other_types(self) -> None:
    """Test _json_default falls back to str() for other types."""
    local_tz = dt.now().astimezone().tzinfo
    value: datetime = dt(2026, 3, 17, 12, 0, 0, tzinfo=local_tz)
    assert _json_default(value) == str(value)
@pytest.mark.django_db @pytest.mark.django_db
class TestDatasetBackupViews: class TestDatasetBackupViews: