Compare commits

..

No commits in common. "391097b8b832bc91707e938b33cd495500b7e18b" and "76b1cd70a5e7c0878042d728e0b2429da74ff3d5" have entirely different histories.

4 changed files with 114 additions and 48 deletions

View file

@ -644,56 +644,9 @@ def dataset_backups_view(request: HttpRequest) -> HttpResponse:
datasets.sort(key=operator.itemgetter("updated_at"), reverse=True) datasets.sort(key=operator.itemgetter("updated_at"), reverse=True)
dataset_distributions: list[dict[str, str]] = []
for dataset in datasets:
download_path: str | None = dataset.get("download_path")
if not download_path:
continue
dataset_distributions.append({
"@type": "DataDownload",
"name": dataset["name"],
"contentUrl": request.build_absolute_uri(
reverse("core:dataset_backup_download", args=[download_path]),
),
"encodingFormat": "application/zstd",
})
dataset_schema: dict[str, Any] = {
"@context": "https://schema.org",
"@type": "Dataset",
"name": "Historical archive of Twitch and Kick drop data",
"identifier": request.build_absolute_uri(reverse("core:dataset_backups")),
"temporalCoverage": "2024-07-17/..",
"url": request.build_absolute_uri(reverse("core:dataset_backups")),
"license": "https://creativecommons.org/publicdomain/zero/1.0/",
"isAccessibleForFree": True,
"description": (
"Historical data on Twitch and Kick drops, campaigns, rewards, and more, available for download as compressed SQL files or JSON."
),
"keywords": [
"Twitch drops",
"Kick drops",
],
"creator": {
"@type": "Person",
"givenName": "Joakim",
"familyName": "Hellsén",
"name": "Joakim Hellsén",
"sameAs": "https://orcid.org/0009-0006-7305-524X",
},
"includedInDataCatalog": {
"@type": "DataCatalog",
"name": "ttvdrops.lovinator.space",
"url": request.build_absolute_uri(reverse("core:dataset_backups")),
},
}
if dataset_distributions:
dataset_schema["distribution"] = dataset_distributions
seo_context: dict[str, Any] = _build_seo_context( seo_context: dict[str, Any] = _build_seo_context(
page_title="Twitch/Kick drop data", page_title="Twitch/Kick drop data",
page_description="Twitch/Kick datasets available for download, including historical drop campaign data and more.", page_description="Twitch/Kick datasets available for download, including historical drop campaign data and more.",
schema_data=dataset_schema,
) )
context: dict[str, Any] = { context: dict[str, Any] = {
"datasets": datasets, "datasets": datasets,

View file

@ -31,7 +31,7 @@
</thead> </thead>
<tbody> <tbody>
{% for dataset in datasets %} {% for dataset in datasets %}
<tr> <tr>
<td> <td>
<a href="{% url 'core:dataset_backup_download' dataset.download_path %}">{{ dataset.name }}</a> <a href="{% url 'core:dataset_backup_download' dataset.download_path %}">{{ dataset.name }}</a>
</td> </td>

View file

@ -1,3 +1,4 @@
import csv
import io import io
import json import json
import os import os
@ -86,6 +87,13 @@ class Command(BaseCommand):
json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst" json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst"
_write_json_dump(json_path, allowed_tables) _write_json_dump(json_path, allowed_tables)
csv_path: Path = _write_csv_dump(
output_dir,
prefix,
timestamp,
allowed_tables,
)
created_at: datetime = datetime.fromtimestamp( created_at: datetime = datetime.fromtimestamp(
output_path.stat().st_mtime, output_path.stat().st_mtime,
tz=timezone.get_current_timezone(), tz=timezone.get_current_timezone(),
@ -96,6 +104,7 @@ class Command(BaseCommand):
), ),
) )
self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}")) self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}"))
self.stdout.write(self.style.SUCCESS(f"CSV backup created: {csv_path}"))
self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}")) self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}"))
@ -340,3 +349,46 @@ def _write_json_dump(output_path: Path, tables: list[str]) -> None:
io.TextIOWrapper(compressed, encoding="utf-8") as handle, io.TextIOWrapper(compressed, encoding="utf-8") as handle,
): ):
json.dump(data, handle, default=_json_default) json.dump(data, handle, default=_json_default)
def _write_csv_dump(
    output_dir: Path,
    prefix: str,
    timestamp: str,
    tables: list[str],
) -> Path:
    """Write a combined, zstd-compressed CSV file containing rows from all tables.

    The file starts with a ``table,row_json`` header. Every following record
    holds the source table name and the row serialized as a JSON object keyed
    by column name.

    Args:
        output_dir: Directory where the CSV file will be written.
        prefix: Filename prefix.
        timestamp: Timestamp string for the filename.
        tables: Table names to include.

    Returns:
        Created file path.
    """
    output_path: Path = output_dir / f"{prefix}-{timestamp}.csv.zst"
    # Rows are pulled in bounded batches so a large table is never held in
    # memory as a single list.
    batch_size: int = 2000

    with (
        output_path.open("wb") as raw_handle,
        zstd.open(raw_handle, "w") as compressed,
        # newline="" is what the csv module requires of its file object:
        # without it the writer's "\r\n" terminator is translated again on
        # platforms whose os.linesep is not "\n".
        io.TextIOWrapper(compressed, encoding="utf-8", newline="") as handle,
    ):
        writer: csv.Writer = csv.writer(handle)
        writer.writerow(["table", "row_json"])

        with django_connection.cursor() as cursor:
            for table in tables:
                # Table names cannot be bound as query parameters, hence the
                # interpolation; callers pass an allow-listed set of names.
                cursor.execute(f'SELECT * FROM "{table}"')  # noqa: S608
                columns: list[str] = [col[0] for col in cursor.description]

                while batch := cursor.fetchmany(batch_size):
                    writer.writerows(
                        [
                            table,
                            json.dumps(
                                dict(zip(columns, row, strict=False)),
                                default=_json_default,
                            ),
                        ]
                        for row in batch
                    )

    return output_path

View file

@ -1,3 +1,4 @@
import csv
import io import io
import json import json
import math import math
@ -16,6 +17,7 @@ from django.urls import reverse
from twitch.management.commands.backup_db import _get_allowed_tables from twitch.management.commands.backup_db import _get_allowed_tables
from twitch.management.commands.backup_db import _json_default from twitch.management.commands.backup_db import _json_default
from twitch.management.commands.backup_db import _sql_literal from twitch.management.commands.backup_db import _sql_literal
from twitch.management.commands.backup_db import _write_csv_dump
from twitch.management.commands.backup_db import _write_json_dump from twitch.management.commands.backup_db import _write_json_dump
from twitch.management.commands.backup_db import _write_postgres_dump from twitch.management.commands.backup_db import _write_postgres_dump
from twitch.management.commands.backup_db import _write_sqlite_dump from twitch.management.commands.backup_db import _write_sqlite_dump
@ -23,6 +25,7 @@ from twitch.models import Game
from twitch.models import Organization from twitch.models import Organization
if TYPE_CHECKING: if TYPE_CHECKING:
from csv import Reader
from datetime import datetime from datetime import datetime
from pathlib import Path from pathlib import Path
@ -195,6 +198,34 @@ class TestBackupCommand:
row.get("name") == "Test Org JSON" for row in data["twitch_organization"] row.get("name") == "Test Org JSON" for row in data["twitch_organization"]
) )
def test_backup_creates_single_csv_file(self, tmp_path: Path) -> None:
    """Check that ``backup_db`` emits exactly one compressed CSV with the seeded row."""
    _skip_if_pg_dump_missing()

    # Seed a row whose name we can look for in the dump afterwards.
    Organization.objects.create(twitch_id="test_csv", name="Test Org CSV")

    backup_dir: Path = tmp_path / "backups"
    backup_dir.mkdir()
    call_command("backup_db", output_dir=str(backup_dir), prefix="test")

    matches: list[Path] = [*backup_dir.glob("test-*.csv.zst")]
    assert len(matches) == 1

    # Decompress and parse the whole file before asserting on its content.
    with (
        matches[0].open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        records: list[list[str]] = [*csv.reader(handle)]

    assert len(records) >= 2  # header + at least one data row
    assert records[0] == ["table", "row_json"]

    # Only organization rows are inspected; blank records are ignored.
    organization_records: list[list[str]] = []
    for record in records[1:]:
        if record and record[0] == "twitch_organization":
            organization_records.append(record)

    assert any("Test Org CSV" in record[1] for record in organization_records)
@pytest.mark.django_db @pytest.mark.django_db
class TestBackupHelperFunctions: class TestBackupHelperFunctions:
@ -306,6 +337,36 @@ class TestBackupHelperFunctions:
row.get("name") == "JSON Helper Org" for row in data["twitch_organization"] row.get("name") == "JSON Helper Org" for row in data["twitch_organization"]
) )
def test_write_csv_dump_creates_single_file(self, tmp_path: Path) -> None:
    """Check that ``_write_csv_dump`` produces one combined compressed CSV file."""
    # Seed a row whose name we can look for in the dump afterwards.
    Organization.objects.create(twitch_id="test_csv_helper", name="CSV Helper Org")

    dump_path: Path = _write_csv_dump(
        tmp_path,
        "test",
        "20260317-120000",
        _get_allowed_tables("twitch_"),
    )

    assert dump_path.exists()
    assert dump_path.name == "test-20260317-120000.csv.zst"

    # Decompress and parse the whole file before asserting on its content.
    with (
        dump_path.open("rb") as raw_handle,
        zstd.open(raw_handle, "r") as compressed,
        io.TextIOWrapper(compressed, encoding="utf-8") as handle,
    ):
        records: list[list[str]] = [*csv.reader(handle)]

    assert len(records) >= 2  # header + at least one data row
    assert records[0] == ["table", "row_json"]

    # Only organization rows are inspected; blank records are ignored.
    organization_records: list[list[str]] = []
    for record in records[1:]:
        if record and record[0] == "twitch_organization":
            organization_records.append(record)

    assert any("CSV Helper Org" in record[1] for record in organization_records)
def test_json_default_handles_bytes(self) -> None: def test_json_default_handles_bytes(self) -> None:
"""Test _json_default converts bytes to hex string.""" """Test _json_default converts bytes to hex string."""
assert _json_default(b"\x00\x01") == "0001" assert _json_default(b"\x00\x01") == "0001"