This commit is contained in:
Joakim Hellsén 2026-04-27 20:43:26 +02:00
commit a7a5b5c8ea
Signed by: Joakim Hellsén
SSH key fingerprint: SHA256:/9h/CsExpFp+PRhsfA0xznFx2CGfTT5R/kpuFfUgEQk
43 changed files with 5531 additions and 9 deletions

View file

@ -0,0 +1,254 @@
from __future__ import annotations
import json
from typing import TYPE_CHECKING
from urllib.error import HTTPError
from urllib.error import URLError
from urllib.request import urlopen
from django.utils import timezone
from control_plane.host_commands import HostCommandError
from control_plane.host_commands import run_host_command
from control_plane.local_test_runtime import build_test_django_container_names
from control_plane.local_test_runtime import build_test_django_local_url
from control_plane.local_test_runtime import build_test_django_project_root
from control_plane.models import Deployment
from control_plane.models import DeploymentStatus
if TYPE_CHECKING:
from collections.abc import Iterable
from pathlib import Path
from control_plane.models import RuntimeService
# Number of trailing log lines requested per container (the `--tail` value for `podman logs`).
MAX_DIAGNOSTIC_LOG_LINES = 200
# Default timeout, in seconds, used when probing a deployment's sentinel URL.
DEFAULT_SENTINEL_PROBE_TIMEOUT_SECONDS = 2.0
# Recursive aliases describing the JSON-serializable values used in diagnostics payloads.
type JsonPrimitive = bool | int | float | str | None
type JsonValue = JsonPrimitive | list[JsonValue] | dict[str, JsonValue]
def build_test_deployment_diagnostics_root(deployment: Deployment) -> Path:
    """Locate the directory that holds a deployment's persisted diagnostics.

    Returns:
        The "diagnostics" directory that sits beside the deployment's project root.
    """
    project_root = build_test_django_project_root(deployment)
    return project_root.parent / "diagnostics"
def build_test_deployment_diagnostics_snapshot_path(deployment: Deployment) -> Path:
    """Locate the JSON file holding a deployment's most recent diagnostics snapshot.

    Returns:
        Path to "snapshot.json" inside the deployment's diagnostics root.
    """
    diagnostics_root = build_test_deployment_diagnostics_root(deployment)
    return diagnostics_root / "snapshot.json"
def capture_test_deployment_diagnostics(deployment_id: str) -> None:
    """Snapshot one deployment's pod, container, and log state to disk.

    Looks the deployment up by primary key, builds the diagnostics payload, and
    writes it as indented UTF-8 JSON to the deployment's snapshot path, creating
    the parent directory first when it does not exist yet.
    """
    queryset = Deployment.objects.select_related("hosted_site__tenant").prefetch_related("runtime_services")
    deployment = queryset.get(pk=deployment_id)

    target = build_test_deployment_diagnostics_snapshot_path(deployment)
    target.parent.mkdir(parents=True, exist_ok=True)

    serialized = json.dumps(_build_diagnostics_snapshot(deployment), indent=2)
    target.write_text(serialized, encoding="utf-8")
def load_test_deployment_diagnostics(deployment: Deployment) -> dict[str, JsonValue] | None:
    """Load the latest persisted diagnostics snapshot for one deployment.

    A snapshot that exists but cannot be used (bytes that are not valid UTF-8,
    malformed JSON, or a JSON document that is not an object) is reported as a
    small payload carrying a ``capture_error`` message and ``captured_at`` set to
    None, rather than raising.

    Returns:
        Parsed diagnostics payload, or None when no snapshot has been captured yet.
    """
    snapshot_path = build_test_deployment_diagnostics_snapshot_path(deployment)
    try:
        # Read directly instead of checking exists() first: the file may disappear
        # between the check and the read.
        raw_snapshot = snapshot_path.read_text(encoding="utf-8")
        payload = json.loads(raw_snapshot)
    except (FileNotFoundError, NotADirectoryError):
        # Same cases the previous exists() check treated as "no snapshot yet".
        return None
    except (UnicodeDecodeError, json.JSONDecodeError) as error:
        # UnicodeDecodeError is not a JSONDecodeError, so it must be named
        # explicitly for a corrupted snapshot to be reported instead of raised.
        return {
            "capture_error": f"Unable to parse diagnostics snapshot: {error}",
            "captured_at": None,
        }
    if not isinstance(payload, dict):
        return {
            "capture_error": "Diagnostics snapshot is not a JSON object.",
            "captured_at": None,
        }
    return payload
def probe_test_deployment_health(
    deployment: Deployment,
    *,
    timeout_seconds: float = DEFAULT_SENTINEL_PROBE_TIMEOUT_SECONDS,
) -> dict[str, JsonValue]:
    """Probe the generated deployment sentinel endpoint and return structured status.

    The endpoint is only requested when the deployment status is RUNNING or
    BOOTING; otherwise the initial "not-running" result is returned unchanged.
    Any failure to fetch, decode, or parse the response is reported as
    "unreachable" with the error text, never raised.

    Args:
        deployment: Deployment whose sentinel URL is probed.
        timeout_seconds: Timeout passed to the HTTP request.

    Returns:
        JSON-serializable probe state describing current sentinel reachability and payload.
        ``status`` is one of "not-running", "healthy", "unexpected-payload", or "unreachable".
    """
    # Imported here so the block stays self-contained; HTTPException covers
    # protocol-level failures (bad status line, truncated body) that are not OSError.
    from http.client import HTTPException

    sentinel_url = build_test_django_local_url(deployment)
    result: dict[str, JsonValue] = {
        "checked_at": timezone.now().isoformat(),
        "deployment_id": str(deployment.id),
        "deployment_status": deployment.status,
        "sentinel_url": sentinel_url,
        "ok": False,
        "status": "not-running",
        "label": "Not Running",
        "payload": None,
        "error": "",
        "http_status": None,
    }
    if deployment.status not in {DeploymentStatus.RUNNING.value, DeploymentStatus.BOOTING.value}:
        return result

    try:
        with urlopen(sentinel_url, timeout=timeout_seconds) as response:  # noqa: S310
            payload = json.loads(response.read().decode("utf-8"))
            result["http_status"] = int(getattr(response, "status", 200))
    except (
        HTTPError,
        URLError,
        OSError,
        HTTPException,
        UnicodeDecodeError,
        json.JSONDecodeError,
    ) as error:
        # A body that is not UTF-8 or a malformed HTTP response is treated the
        # same as a connection failure instead of crashing the probe.
        result["status"] = "unreachable"
        result["label"] = "Unreachable"
        result["error"] = str(error)
        return result

    if not isinstance(payload, dict):
        # Wrap non-object JSON so the "payload" field is always an object or None.
        result["payload"] = {"value": str(payload)}
        result["status"] = "unexpected-payload"
        result["label"] = "Unexpected"
        return result

    result["payload"] = payload
    if payload.get("status") == "ok":
        result["ok"] = True
        result["status"] = "healthy"
        result["label"] = "Healthy"
    else:
        result["status"] = "unexpected-payload"
        result["label"] = "Unexpected"
    return result
def _build_diagnostics_snapshot(deployment: Deployment) -> dict[str, JsonValue]:
    """Assemble the full diagnostics payload for one deployment.

    Combines deployment fields with pod diagnostics, diagnostics for the Django
    server container, and one diagnostics entry per runtime service (ordered by
    kind). The pod name is taken from the first ordered runtime service, or is
    empty when the deployment has none.
    """
    services = tuple(_ordered_runtime_services(deployment.runtime_services.all()))
    django_container_name, _ = build_test_django_container_names(deployment)
    if services:
        pod_name = services[0].network_name
    else:
        pod_name = ""

    # Deployment-level fields come first so the timestamp is taken before any
    # host command runs.
    snapshot: dict[str, JsonValue] = {
        "captured_at": timezone.now().isoformat(),
        "deployment_id": str(deployment.id),
        "deployment_status": deployment.status,
        "tenant_slug": deployment.hosted_site.tenant.slug,
        "site_slug": deployment.hosted_site.slug,
        "guest_port": deployment.guest_port,
        "sentinel_url": build_test_django_local_url(deployment),
        "last_error": deployment.last_error,
    }
    snapshot["pod"] = _collect_pod_diagnostics(pod_name)
    snapshot["django"] = _collect_container_diagnostics(
        container_name=django_container_name,
        control_plane_status=deployment.status,
        label="django",
    )

    service_reports = []
    for service in services:
        service_reports.append(
            _collect_container_diagnostics(
                container_name=service.container_name,
                control_plane_status=service.status,
                label=service.kind,
            )
        )
    snapshot["runtime_services"] = service_reports
    return snapshot
def _ordered_runtime_services(runtime_services: Iterable[RuntimeService]) -> tuple[RuntimeService, ...]:
    """Return the given runtime services as a tuple sorted by their ``kind``."""
    ordered = list(runtime_services)
    ordered.sort(key=lambda service: service.kind)
    return tuple(ordered)
def _collect_pod_diagnostics(pod_name: str) -> dict[str, JsonValue]:
    """Report the state of one pod as seen by ``podman pod inspect``.

    Returns:
        A mapping with "name", "status", and "error". The status is "missing"
        when no pod name is given or the inspect command fails, the trimmed
        command output on success, or "unknown" when that output is empty.
    """
    if not pod_name:
        return {
            "name": "",
            "status": "missing",
            "error": "No runtime services are linked to this deployment yet.",
        }

    # Start from the failure shape and fill in whichever field the outcome decides.
    report: dict[str, JsonValue] = {"name": pod_name, "status": "missing", "error": ""}
    try:
        inspection = run_host_command(
            command=("podman", "pod", "inspect", "--format", "{{.State}}", pod_name),
            timeout_seconds=20.0,
        )
    except HostCommandError as failure:
        report["error"] = _format_host_command_error(failure)
    else:
        report["status"] = inspection.stdout.strip() or "unknown"
    return report
def _collect_container_diagnostics(
    *,
    container_name: str,
    control_plane_status: str,
    label: str,
) -> dict[str, JsonValue]:
    """Gather inspected status and recent logs for one container.

    Returns:
        A mapping holding the given label, container name, and control-plane
        status alongside the inspected container status, the log text, and the
        error text from the inspect and log commands (empty when they succeeded).
    """
    status_pair = _inspect_container_status(container_name)
    logs_pair = _read_container_logs(container_name)

    report: dict[str, JsonValue] = {
        "label": label,
        "container_name": container_name,
        "control_plane_status": control_plane_status,
    }
    report["container_status"] = status_pair[0]
    report["logs"] = logs_pair[0]
    report["inspect_error"] = status_pair[1]
    report["log_error"] = logs_pair[1]
    return report
def _inspect_container_status(container_name: str) -> tuple[str, str]:
    """Ask ``podman inspect`` for one container's status.

    The format template prints the health status when the container state has
    one, and the plain state status otherwise.

    Returns:
        A ``(status, error)`` pair: ``("missing", <error text>)`` when the command
        fails, otherwise the trimmed output (or "unknown" when empty) and "".
    """
    status_template = "{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}"
    inspect_command = ("podman", "inspect", "--format", status_template, container_name)
    try:
        inspection = run_host_command(command=inspect_command, timeout_seconds=20.0)
    except HostCommandError as failure:
        return "missing", _format_host_command_error(failure)
    else:
        reported_status = inspection.stdout.strip()
        return reported_status or "unknown", ""
def _read_container_logs(container_name: str) -> tuple[str, str]:
    """Fetch the tail of one container's logs via ``podman logs``.

    At most MAX_DIAGNOSTIC_LOG_LINES trailing lines are requested.

    Returns:
        A ``(logs, error)`` pair: ``("", <error text>)`` when the command fails,
        otherwise the trimmed log text and "". When the command produced output
        on both stdout and stderr, both are kept (stdout first, then stderr).
    """
    try:
        result = run_host_command(
            command=("podman", "logs", "--tail", str(MAX_DIAGNOSTIC_LOG_LINES), container_name),
            timeout_seconds=20.0,
        )
    except HostCommandError as error:
        return "", _format_host_command_error(error)
    # Keep both streams: falling back to stderr only when stdout is empty would
    # drop stderr output whenever the container also wrote to stdout.
    # NOTE(review): assumes run_host_command returns the two streams separately
    # as strings -- confirm against its implementation.
    streams = (result.stdout.strip(), result.stderr.strip())
    return "\n".join(stream for stream in streams if stream), ""
def _format_host_command_error(error: HostCommandError) -> str:
    """Pick the most useful text to describe a failed host command.

    Returns:
        The trimmed stderr when it is non-empty, else the trimmed stdout when it
        is non-empty, else the string form of the error itself.
    """
    return error.stderr.strip() or error.stdout.strip() or str(error)