Enhance dataset documentation and refactor CSV backup functionality to create a single combined file instead of per-table files
All checks were successful
Deploy to Server / deploy (push) Successful in 11s
All checks were successful
Deploy to Server / deploy (push) Successful in 11s
This commit is contained in:
parent
9fd22ba8a8
commit
76b1cd70a5
3 changed files with 82 additions and 62 deletions
|
|
@ -5,6 +5,21 @@
|
||||||
{% block content %}
|
{% block content %}
|
||||||
<main>
|
<main>
|
||||||
<h1>Dataset Backups</h1>
|
<h1>Dataset Backups</h1>
|
||||||
|
<section>
|
||||||
|
<h2>About this dataset</h2>
|
||||||
|
<p>This site tracks and publishes open Twitch and Kick drop campaign data.</p>
|
||||||
|
<p>
|
||||||
|
The exported datasets on this page are released under <strong>CC0</strong> so you can reuse them freely.
|
||||||
|
The underlying source data is scraped from Twitch/Kick APIs and pages, so we do not control the
|
||||||
|
upstream content and cannot guarantee upstream accuracy or permanence.
|
||||||
|
</p>
|
||||||
|
<p>Note that some drops have missing or incomplete data due to Twitch API limitations.</p>
|
||||||
|
<p>
|
||||||
|
Need a special format for your workflow or research pipeline?
|
||||||
|
<a href="https://github.com/TheLovinator1/ttvdrops/issues">Contact me via GitHub issues</a>
|
||||||
|
and describe what you need.
|
||||||
|
</p>
|
||||||
|
</section>
|
||||||
{% if datasets %}
|
{% if datasets %}
|
||||||
<table>
|
<table>
|
||||||
<thead>
|
<thead>
|
||||||
|
|
|
||||||
|
|
@ -87,7 +87,7 @@ class Command(BaseCommand):
|
||||||
json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst"
|
json_path: Path = output_dir / f"{prefix}-{timestamp}.json.zst"
|
||||||
_write_json_dump(json_path, allowed_tables)
|
_write_json_dump(json_path, allowed_tables)
|
||||||
|
|
||||||
csv_paths: list[Path] = _write_csv_dumps(
|
csv_path: Path = _write_csv_dump(
|
||||||
output_dir,
|
output_dir,
|
||||||
prefix,
|
prefix,
|
||||||
timestamp,
|
timestamp,
|
||||||
|
|
@ -104,9 +104,7 @@ class Command(BaseCommand):
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}"))
|
self.stdout.write(self.style.SUCCESS(f"JSON backup created: {json_path}"))
|
||||||
self.stdout.write(
|
self.stdout.write(self.style.SUCCESS(f"CSV backup created: {csv_path}"))
|
||||||
self.style.SUCCESS(f"CSV backups created: {len(csv_paths)} files"),
|
|
||||||
)
|
|
||||||
self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}"))
|
self.stdout.write(self.style.SUCCESS(f"Included tables: {len(allowed_tables)}"))
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -353,13 +351,13 @@ def _write_json_dump(output_path: Path, tables: list[str]) -> None:
|
||||||
json.dump(data, handle, default=_json_default)
|
json.dump(data, handle, default=_json_default)
|
||||||
|
|
||||||
|
|
||||||
def _write_csv_dumps(
|
def _write_csv_dump(
|
||||||
output_dir: Path,
|
output_dir: Path,
|
||||||
prefix: str,
|
prefix: str,
|
||||||
timestamp: str,
|
timestamp: str,
|
||||||
tables: list[str],
|
tables: list[str],
|
||||||
) -> list[Path]:
|
) -> Path:
|
||||||
"""Write per-table CSV files into zstd-compressed files.
|
"""Write a combined CSV file containing rows from all tables.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
output_dir: Directory where CSV files will be written.
|
output_dir: Directory where CSV files will be written.
|
||||||
|
|
@ -368,23 +366,29 @@ def _write_csv_dumps(
|
||||||
tables: Table names to include.
|
tables: Table names to include.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of created file paths.
|
Created file path.
|
||||||
"""
|
"""
|
||||||
paths: list[Path] = []
|
output_path: Path = output_dir / f"{prefix}-{timestamp}.csv.zst"
|
||||||
with django_connection.cursor() as cursor:
|
|
||||||
for table in tables:
|
|
||||||
cursor.execute(f'SELECT * FROM "{table}"') # noqa: S608
|
|
||||||
columns: list[str] = [col[0] for col in cursor.description]
|
|
||||||
rows: list[tuple] = cursor.fetchall()
|
|
||||||
|
|
||||||
output_path: Path = output_dir / f"{prefix}-{timestamp}-{table}.csv.zst"
|
|
||||||
with (
|
with (
|
||||||
output_path.open("wb") as raw_handle,
|
output_path.open("wb") as raw_handle,
|
||||||
zstd.open(raw_handle, "w") as compressed,
|
zstd.open(raw_handle, "w") as compressed,
|
||||||
io.TextIOWrapper(compressed, encoding="utf-8") as handle,
|
io.TextIOWrapper(compressed, encoding="utf-8") as handle,
|
||||||
):
|
):
|
||||||
writer: csv.Writer = csv.writer(handle)
|
writer: csv.Writer = csv.writer(handle)
|
||||||
writer.writerow(columns)
|
writer.writerow(["table", "row_json"])
|
||||||
writer.writerows(rows)
|
|
||||||
paths.append(output_path)
|
with django_connection.cursor() as cursor:
|
||||||
return paths
|
for table in tables:
|
||||||
|
cursor.execute(f'SELECT * FROM "{table}"') # noqa: S608
|
||||||
|
columns: list[str] = [col[0] for col in cursor.description]
|
||||||
|
rows: list[tuple] = cursor.fetchall()
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
row_dict = dict(zip(columns, row, strict=False))
|
||||||
|
writer.writerow([
|
||||||
|
table,
|
||||||
|
json.dumps(row_dict, default=_json_default),
|
||||||
|
])
|
||||||
|
|
||||||
|
return output_path
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ from django.urls import reverse
|
||||||
from twitch.management.commands.backup_db import _get_allowed_tables
|
from twitch.management.commands.backup_db import _get_allowed_tables
|
||||||
from twitch.management.commands.backup_db import _json_default
|
from twitch.management.commands.backup_db import _json_default
|
||||||
from twitch.management.commands.backup_db import _sql_literal
|
from twitch.management.commands.backup_db import _sql_literal
|
||||||
from twitch.management.commands.backup_db import _write_csv_dumps
|
from twitch.management.commands.backup_db import _write_csv_dump
|
||||||
from twitch.management.commands.backup_db import _write_json_dump
|
from twitch.management.commands.backup_db import _write_json_dump
|
||||||
from twitch.management.commands.backup_db import _write_postgres_dump
|
from twitch.management.commands.backup_db import _write_postgres_dump
|
||||||
from twitch.management.commands.backup_db import _write_sqlite_dump
|
from twitch.management.commands.backup_db import _write_sqlite_dump
|
||||||
|
|
@ -198,8 +198,8 @@ class TestBackupCommand:
|
||||||
row.get("name") == "Test Org JSON" for row in data["twitch_organization"]
|
row.get("name") == "Test Org JSON" for row in data["twitch_organization"]
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_backup_creates_csv_files(self, tmp_path: Path) -> None:
|
def test_backup_creates_single_csv_file(self, tmp_path: Path) -> None:
|
||||||
"""Test that backup command creates per-table CSV files alongside the SQL dump."""
|
"""Test that backup command creates a single CSV file alongside the SQL dump."""
|
||||||
_skip_if_pg_dump_missing()
|
_skip_if_pg_dump_missing()
|
||||||
Organization.objects.create(twitch_id="test_csv", name="Test Org CSV")
|
Organization.objects.create(twitch_id="test_csv", name="Test Org CSV")
|
||||||
|
|
||||||
|
|
@ -208,13 +208,11 @@ class TestBackupCommand:
|
||||||
|
|
||||||
call_command("backup_db", output_dir=str(output_dir), prefix="test")
|
call_command("backup_db", output_dir=str(output_dir), prefix="test")
|
||||||
|
|
||||||
org_csv_files: list[Path] = list(
|
csv_files: list[Path] = list(output_dir.glob("test-*.csv.zst"))
|
||||||
output_dir.glob("test-*-twitch_organization.csv.zst"),
|
assert len(csv_files) == 1
|
||||||
)
|
|
||||||
assert len(org_csv_files) == 1
|
|
||||||
|
|
||||||
with (
|
with (
|
||||||
org_csv_files[0].open("rb") as raw_handle,
|
csv_files[0].open("rb") as raw_handle,
|
||||||
zstd.open(raw_handle, "r") as compressed,
|
zstd.open(raw_handle, "r") as compressed,
|
||||||
io.TextIOWrapper(compressed, encoding="utf-8") as handle,
|
io.TextIOWrapper(compressed, encoding="utf-8") as handle,
|
||||||
):
|
):
|
||||||
|
|
@ -222,8 +220,11 @@ class TestBackupCommand:
|
||||||
rows: list[list[str]] = list(reader)
|
rows: list[list[str]] = list(reader)
|
||||||
|
|
||||||
assert len(rows) >= 2 # header + at least one data row
|
assert len(rows) >= 2 # header + at least one data row
|
||||||
assert "name" in rows[0]
|
assert rows[0] == ["table", "row_json"]
|
||||||
assert any("Test Org CSV" in row for row in rows[1:])
|
data_rows: list[list[str]] = [
|
||||||
|
row for row in rows[1:] if row and row[0] == "twitch_organization"
|
||||||
|
]
|
||||||
|
assert any("Test Org CSV" in row[1] for row in data_rows)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.django_db
|
@pytest.mark.django_db
|
||||||
|
|
@ -336,26 +337,23 @@ class TestBackupHelperFunctions:
|
||||||
row.get("name") == "JSON Helper Org" for row in data["twitch_organization"]
|
row.get("name") == "JSON Helper Org" for row in data["twitch_organization"]
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_write_csv_dumps_creates_per_table_files(self, tmp_path: Path) -> None:
|
def test_write_csv_dump_creates_single_file(self, tmp_path: Path) -> None:
|
||||||
"""Test _write_csv_dumps creates one compressed CSV file per table."""
|
"""Test _write_csv_dump creates one combined compressed CSV file."""
|
||||||
Organization.objects.create(twitch_id="test_csv_helper", name="CSV Helper Org")
|
Organization.objects.create(twitch_id="test_csv_helper", name="CSV Helper Org")
|
||||||
|
|
||||||
tables: list[str] = _get_allowed_tables("twitch_")
|
tables: list[str] = _get_allowed_tables("twitch_")
|
||||||
paths: list[Path] = _write_csv_dumps(
|
path: Path = _write_csv_dump(
|
||||||
tmp_path,
|
tmp_path,
|
||||||
"test",
|
"test",
|
||||||
"20260317-120000",
|
"20260317-120000",
|
||||||
tables,
|
tables,
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(paths) == len(tables)
|
assert path.exists()
|
||||||
assert all(p.exists() for p in paths)
|
assert path.name == "test-20260317-120000.csv.zst"
|
||||||
|
|
||||||
org_csv: Path = tmp_path / "test-20260317-120000-twitch_organization.csv.zst"
|
|
||||||
assert org_csv.exists()
|
|
||||||
|
|
||||||
with (
|
with (
|
||||||
org_csv.open("rb") as raw_handle,
|
path.open("rb") as raw_handle,
|
||||||
zstd.open(raw_handle, "r") as compressed,
|
zstd.open(raw_handle, "r") as compressed,
|
||||||
io.TextIOWrapper(compressed, encoding="utf-8") as handle,
|
io.TextIOWrapper(compressed, encoding="utf-8") as handle,
|
||||||
):
|
):
|
||||||
|
|
@ -363,8 +361,11 @@ class TestBackupHelperFunctions:
|
||||||
rows: list[list[str]] = list(reader)
|
rows: list[list[str]] = list(reader)
|
||||||
|
|
||||||
assert len(rows) >= 2 # header + at least one data row
|
assert len(rows) >= 2 # header + at least one data row
|
||||||
assert "name" in rows[0]
|
assert rows[0] == ["table", "row_json"]
|
||||||
assert any("CSV Helper Org" in row for row in rows[1:])
|
data_rows: list[list[str]] = [
|
||||||
|
row for row in rows[1:] if row and row[0] == "twitch_organization"
|
||||||
|
]
|
||||||
|
assert any("CSV Helper Org" in row[1] for row in data_rows)
|
||||||
|
|
||||||
def test_json_default_handles_bytes(self) -> None:
|
def test_json_default_handles_bytes(self) -> None:
|
||||||
"""Test _json_default converts bytes to hex string."""
|
"""Test _json_default converts bytes to hex string."""
|
||||||
|
|
@ -388,7 +389,7 @@ class TestDatasetBackupViews:
|
||||||
Returns:
|
Returns:
|
||||||
Path to the created datasets directory.
|
Path to the created datasets directory.
|
||||||
"""
|
"""
|
||||||
datasets_dir = tmp_path / "datasets"
|
datasets_dir: Path = tmp_path / "datasets"
|
||||||
datasets_dir.mkdir()
|
datasets_dir.mkdir()
|
||||||
return datasets_dir
|
return datasets_dir
|
||||||
|
|
||||||
|
|
@ -399,7 +400,7 @@ class TestDatasetBackupViews:
|
||||||
Returns:
|
Returns:
|
||||||
Path to the created backup file.
|
Path to the created backup file.
|
||||||
"""
|
"""
|
||||||
backup_file = datasets_dir / "ttvdrops-20260210-120000.sql.zst"
|
backup_file: Path = datasets_dir / "ttvdrops-20260210-120000.sql.zst"
|
||||||
with (
|
with (
|
||||||
backup_file.open("wb") as raw_handle,
|
backup_file.open("wb") as raw_handle,
|
||||||
zstd.open(raw_handle, "w") as compressed,
|
zstd.open(raw_handle, "w") as compressed,
|
||||||
|
|
@ -452,8 +453,8 @@ class TestDatasetBackupViews:
|
||||||
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
||||||
|
|
||||||
# Create multiple backup files with different timestamps
|
# Create multiple backup files with different timestamps
|
||||||
older_backup = datasets_dir / "ttvdrops-20260210-100000.sql.zst"
|
older_backup: Path = datasets_dir / "ttvdrops-20260210-100000.sql.zst"
|
||||||
newer_backup = datasets_dir / "ttvdrops-20260210-140000.sql.zst"
|
newer_backup: Path = datasets_dir / "ttvdrops-20260210-140000.sql.zst"
|
||||||
|
|
||||||
for backup in [older_backup, newer_backup]:
|
for backup in [older_backup, newer_backup]:
|
||||||
with (
|
with (
|
||||||
|
|
@ -473,9 +474,9 @@ class TestDatasetBackupViews:
|
||||||
reverse("core:dataset_backups"),
|
reverse("core:dataset_backups"),
|
||||||
)
|
)
|
||||||
|
|
||||||
content = response.content.decode()
|
content: str = response.content.decode()
|
||||||
newer_pos = content.find("20260210-140000")
|
newer_pos: int = content.find("20260210-140000")
|
||||||
older_pos = content.find("20260210-100000")
|
older_pos: int = content.find("20260210-100000")
|
||||||
|
|
||||||
# Newer backup should appear first (sorted descending)
|
# Newer backup should appear first (sorted descending)
|
||||||
assert 0 < newer_pos < older_pos
|
assert 0 < newer_pos < older_pos
|
||||||
|
|
@ -512,7 +513,7 @@ class TestDatasetBackupViews:
|
||||||
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
||||||
|
|
||||||
# Attempt path traversal
|
# Attempt path traversal
|
||||||
response = client.get(
|
response: _MonkeyPatchedWSGIResponse = client.get(
|
||||||
reverse("core:dataset_backup_download", args=["../../../etc/passwd"]),
|
reverse("core:dataset_backup_download", args=["../../../etc/passwd"]),
|
||||||
)
|
)
|
||||||
assert response.status_code == 404
|
assert response.status_code == 404
|
||||||
|
|
@ -527,10 +528,10 @@ class TestDatasetBackupViews:
|
||||||
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
||||||
|
|
||||||
# Create a file with invalid extension
|
# Create a file with invalid extension
|
||||||
invalid_file = datasets_dir / "malicious.exe"
|
invalid_file: Path = datasets_dir / "malicious.exe"
|
||||||
invalid_file.write_text("not a backup")
|
invalid_file.write_text("not a backup", encoding="utf-8")
|
||||||
|
|
||||||
response = client.get(
|
response: _MonkeyPatchedWSGIResponse = client.get(
|
||||||
reverse("core:dataset_backup_download", args=["malicious.exe"]),
|
reverse("core:dataset_backup_download", args=["malicious.exe"]),
|
||||||
)
|
)
|
||||||
assert response.status_code == 404
|
assert response.status_code == 404
|
||||||
|
|
@ -544,7 +545,7 @@ class TestDatasetBackupViews:
|
||||||
"""Test download returns 404 for non-existent file."""
|
"""Test download returns 404 for non-existent file."""
|
||||||
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
||||||
|
|
||||||
response = client.get(
|
response: _MonkeyPatchedWSGIResponse = client.get(
|
||||||
reverse("core:dataset_backup_download", args=["nonexistent.sql.zst"]),
|
reverse("core:dataset_backup_download", args=["nonexistent.sql.zst"]),
|
||||||
)
|
)
|
||||||
assert response.status_code == 404
|
assert response.status_code == 404
|
||||||
|
|
@ -565,7 +566,7 @@ class TestDatasetBackupViews:
|
||||||
|
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
# Should contain size information (bytes, KB, MB, or GB)
|
# Should contain size information (bytes, KB, MB, or GB)
|
||||||
content = response.content.decode()
|
content: str = response.content.decode()
|
||||||
assert any(unit in content for unit in ["bytes", "KB", "MB", "GB"])
|
assert any(unit in content for unit in ["bytes", "KB", "MB", "GB"])
|
||||||
|
|
||||||
def test_dataset_list_ignores_non_zst_files(
|
def test_dataset_list_ignores_non_zst_files(
|
||||||
|
|
@ -586,7 +587,7 @@ class TestDatasetBackupViews:
|
||||||
reverse("core:dataset_backups"),
|
reverse("core:dataset_backups"),
|
||||||
)
|
)
|
||||||
|
|
||||||
content = response.content.decode()
|
content: str = response.content.decode()
|
||||||
assert "backup.sql.zst" in content
|
assert "backup.sql.zst" in content
|
||||||
assert "readme.txt" not in content
|
assert "readme.txt" not in content
|
||||||
assert "old_backup.gz" not in content
|
assert "old_backup.gz" not in content
|
||||||
|
|
@ -601,9 +602,9 @@ class TestDatasetBackupViews:
|
||||||
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
monkeypatch.setattr(settings, "DATA_DIR", datasets_dir.parent)
|
||||||
|
|
||||||
# Create subdirectory with backup
|
# Create subdirectory with backup
|
||||||
subdir = datasets_dir / "2026" / "02"
|
subdir: Path = datasets_dir / "2026" / "02"
|
||||||
subdir.mkdir(parents=True)
|
subdir.mkdir(parents=True)
|
||||||
backup_file = subdir / "backup.sql.zst"
|
backup_file: Path = subdir / "backup.sql.zst"
|
||||||
with (
|
with (
|
||||||
backup_file.open("wb") as raw_handle,
|
backup_file.open("wb") as raw_handle,
|
||||||
zstd.open(raw_handle, "w") as compressed,
|
zstd.open(raw_handle, "w") as compressed,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue