254 lines
8.7 KiB
Python
254 lines
8.7 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from typing import TYPE_CHECKING
|
|
from urllib.error import HTTPError
|
|
from urllib.error import URLError
|
|
from urllib.request import urlopen
|
|
|
|
from django.utils import timezone
|
|
|
|
from control_plane.host_commands import HostCommandError
|
|
from control_plane.host_commands import run_host_command
|
|
from control_plane.local_test_runtime import build_test_django_container_names
|
|
from control_plane.local_test_runtime import build_test_django_local_url
|
|
from control_plane.local_test_runtime import build_test_django_project_root
|
|
from control_plane.models import Deployment
|
|
from control_plane.models import DeploymentStatus
|
|
|
|
if TYPE_CHECKING:
|
|
from collections.abc import Iterable
|
|
from pathlib import Path
|
|
|
|
from control_plane.models import RuntimeService
|
|
|
|
|
|
# Number of trailing log lines requested per container via `podman logs --tail`.
MAX_DIAGNOSTIC_LOG_LINES = 200
|
|
# Default timeout, in seconds, applied to the sentinel HTTP probe request.
DEFAULT_SENTINEL_PROBE_TIMEOUT_SECONDS = 2.0
|
|
|
|
# Scalar values that can appear in a JSON document.
type JsonPrimitive = bool | int | float | str | None
|
|
# Any JSON-serializable value; recursive through lists and string-keyed dicts.
type JsonValue = JsonPrimitive | list[JsonValue] | dict[str, JsonValue]
|
|
|
|
|
|
def build_test_deployment_diagnostics_root(deployment: Deployment) -> Path:
    """Locate the directory that holds a deployment's persisted diagnostics.

    The directory is named ``diagnostics`` and sits beside the path returned by
    ``build_test_django_project_root`` (i.e. inside that path's parent).
    """
    project_root = build_test_django_project_root(deployment)
    return project_root.parent / "diagnostics"
|
|
|
|
|
|
def build_test_deployment_diagnostics_snapshot_path(deployment: Deployment) -> Path:
    """Locate the ``snapshot.json`` file inside a deployment's diagnostics root."""
    diagnostics_root = build_test_deployment_diagnostics_root(deployment)
    return diagnostics_root / "snapshot.json"
|
|
|
|
|
|
def capture_test_deployment_diagnostics(deployment_id: str) -> None:
    """Capture current pod, container, and log state for one deployment.

    The deployment is loaded together with its hosted site, tenant, and runtime
    services, a diagnostics snapshot is built from it, and the snapshot is
    stored as indented UTF-8 JSON at the deployment's snapshot path, replacing
    any earlier snapshot.

    Args:
        deployment_id: Primary key of the deployment to inspect.

    Raises:
        Whatever the ``Deployment.objects...get()`` lookup raises when the
        deployment cannot be loaded (presumably ``Deployment.DoesNotExist``
        for a Django model), and ``OSError`` when the snapshot cannot be
        written.
    """
    deployment = (
        Deployment.objects
        .select_related("hosted_site__tenant")
        .prefetch_related("runtime_services")
        .get(pk=deployment_id)
    )
    snapshot_path = build_test_deployment_diagnostics_snapshot_path(deployment)
    snapshot_path.parent.mkdir(parents=True, exist_ok=True)
    serialized_snapshot = json.dumps(_build_diagnostics_snapshot(deployment), indent=2)

    # Stage the content in a sibling file and swap it into place, so a reader
    # loading the snapshot concurrently sees either the previous snapshot or
    # the new one, never a half-written file.
    staging_path = snapshot_path.with_name(f"{snapshot_path.name}.tmp")
    staging_path.write_text(serialized_snapshot, encoding="utf-8")
    staging_path.replace(snapshot_path)
|
|
|
|
|
|
def load_test_deployment_diagnostics(deployment: Deployment) -> dict[str, JsonValue] | None:
    """Load the latest persisted diagnostics snapshot for one deployment.

    Returns:
        Parsed diagnostics payload, or None when no snapshot has been captured yet.
        When a snapshot file exists but cannot be read, decoded, or parsed into
        a JSON object, a placeholder payload is returned instead, holding a
        ``capture_error`` message and ``captured_at`` set to None.
    """
    snapshot_path = build_test_deployment_diagnostics_snapshot_path(deployment)

    # Read directly rather than checking exists() first: the file may vanish
    # between the check and the read, and a missing file simply means "no
    # snapshot yet".
    try:
        raw_snapshot = snapshot_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        return None
    except (OSError, UnicodeDecodeError) as error:
        # Unreadable or non-UTF-8 content is reported like any other broken
        # snapshot instead of crashing the caller.
        return {
            "capture_error": f"Unable to read diagnostics snapshot: {error}",
            "captured_at": None,
        }

    try:
        payload = json.loads(raw_snapshot)
    except json.JSONDecodeError as error:
        return {
            "capture_error": f"Unable to parse diagnostics snapshot: {error}",
            "captured_at": None,
        }

    if not isinstance(payload, dict):
        return {
            "capture_error": "Diagnostics snapshot is not a JSON object.",
            "captured_at": None,
        }

    return payload
|
|
|
|
|
|
def probe_test_deployment_health(
    deployment: Deployment,
    *,
    timeout_seconds: float = DEFAULT_SENTINEL_PROBE_TIMEOUT_SECONDS,
) -> dict[str, JsonValue]:
    """Probe the generated deployment sentinel endpoint and return structured status.

    The endpoint is only contacted while the deployment status is RUNNING or
    BOOTING; otherwise the initial "not-running" result is returned untouched.

    Args:
        deployment: Deployment whose sentinel URL is probed.
        timeout_seconds: Timeout passed to the HTTP request.

    Returns:
        JSON-serializable probe state describing current sentinel reachability and payload.
        ``status`` is one of ``"not-running"``, ``"healthy"``,
        ``"unexpected-payload"``, or ``"unreachable"``; ``ok`` is True only for
        ``"healthy"``. ``http_status`` carries the HTTP status code whenever a
        response (including an HTTP error response) was received.
    """
    sentinel_url = build_test_django_local_url(deployment)
    result: dict[str, JsonValue] = {
        "checked_at": timezone.now().isoformat(),
        "deployment_id": str(deployment.id),
        "deployment_status": deployment.status,
        "sentinel_url": sentinel_url,
        "ok": False,
        "status": "not-running",
        "label": "Not Running",
        "payload": None,
        "error": "",
        "http_status": None,
    }
    if deployment.status not in {DeploymentStatus.RUNNING.value, DeploymentStatus.BOOTING.value}:
        return result

    try:
        with urlopen(sentinel_url, timeout=timeout_seconds) as response:  # noqa: S310
            # Record the status before touching the body so it is kept even
            # when the body turns out to be undecodable or invalid JSON.
            result["http_status"] = int(getattr(response, "status", 200))
            payload = json.loads(response.read().decode("utf-8"))
    except (HTTPError, URLError, OSError, UnicodeDecodeError, json.JSONDecodeError) as error:
        if isinstance(error, HTTPError):
            # The server answered, just not with a success status.
            result["http_status"] = error.code
        result["status"] = "unreachable"
        result["label"] = "Unreachable"
        result["error"] = str(error)
        return result

    if not isinstance(payload, dict):
        # Wrap non-object JSON so "payload" is always an object or None.
        result["payload"] = {"value": str(payload)}
        result["status"] = "unexpected-payload"
        result["label"] = "Unexpected"
        return result

    result["payload"] = payload
    if payload.get("status") == "ok":
        result["ok"] = True
        result["status"] = "healthy"
        result["label"] = "Healthy"
    else:
        result["status"] = "unexpected-payload"
        result["label"] = "Unexpected"

    return result
|
|
|
|
|
|
def _build_diagnostics_snapshot(deployment: Deployment) -> dict[str, JsonValue]:
    """Assemble the JSON-serializable diagnostics payload for one deployment.

    The payload combines control-plane fields read from the deployment with
    pod, Django container, and per-runtime-service container diagnostics.
    """
    services = _ordered_runtime_services(deployment.runtime_services.all())
    django_container_name, _ = build_test_django_container_names(deployment)

    # The pod name is taken from the first runtime service (ordered by kind).
    if services:
        pod_name = services[0].network_name
    else:
        pod_name = ""

    # Control-plane fields first; the timestamp is taken before any host
    # command runs.
    snapshot: dict[str, JsonValue] = {
        "captured_at": timezone.now().isoformat(),
        "deployment_id": str(deployment.id),
        "deployment_status": deployment.status,
        "tenant_slug": deployment.hosted_site.tenant.slug,
        "site_slug": deployment.hosted_site.slug,
        "guest_port": deployment.guest_port,
        "sentinel_url": build_test_django_local_url(deployment),
        "last_error": deployment.last_error,
    }

    snapshot["pod"] = _collect_pod_diagnostics(pod_name)
    snapshot["django"] = _collect_container_diagnostics(
        container_name=django_container_name,
        control_plane_status=deployment.status,
        label="django",
    )

    service_reports = []
    for service in services:
        service_reports.append(
            _collect_container_diagnostics(
                container_name=service.container_name,
                control_plane_status=service.status,
                label=service.kind,
            )
        )
    snapshot["runtime_services"] = service_reports

    return snapshot
|
|
|
|
|
|
def _ordered_runtime_services(runtime_services: Iterable[RuntimeService]) -> tuple[RuntimeService, ...]:
    """Return the given runtime services as a tuple sorted by their ``kind``."""
    ordered = list(runtime_services)
    ordered.sort(key=lambda service: service.kind)
    return tuple(ordered)
|
|
|
|
|
|
def _collect_pod_diagnostics(pod_name: str) -> dict[str, JsonValue]:
    """Report the state of one pod as seen by ``podman pod inspect``.

    Returns a dict with ``name``, ``status``, and ``error``. ``status`` is
    "missing" when no pod name is given or the inspect command fails, and
    "unknown" when the command succeeds with blank output.
    """
    report: dict[str, JsonValue] = {"name": pod_name, "status": "missing", "error": ""}

    if not pod_name:
        report["name"] = ""
        report["error"] = "No runtime services are linked to this deployment yet."
        return report

    inspect_command = ("podman", "pod", "inspect", "--format", "{{.State}}", pod_name)
    try:
        completed = run_host_command(command=inspect_command, timeout_seconds=20.0)
    except HostCommandError as error:
        report["error"] = _format_host_command_error(error)
        return report

    report["status"] = completed.stdout.strip() or "unknown"
    return report
|
|
|
|
|
|
def _collect_container_diagnostics(
    *,
    container_name: str,
    control_plane_status: str,
    label: str,
) -> dict[str, JsonValue]:
    """Gather inspected status and recent logs for one container.

    The container is inspected first and its logs are read second; any error
    text from either step is reported alongside the results rather than raised.
    """
    status_text, status_failure = _inspect_container_status(container_name)
    log_text, log_failure = _read_container_logs(container_name)

    report: dict[str, JsonValue] = {
        "label": label,
        "container_name": container_name,
        "control_plane_status": control_plane_status,
        "container_status": status_text,
        "logs": log_text,
        "inspect_error": status_failure,
        "log_error": log_failure,
    }
    return report
|
|
|
|
|
|
def _inspect_container_status(container_name: str) -> tuple[str, str]:
    """Return ``(status, error)`` for one container via ``podman inspect``.

    The format template prints the health status when the container has one
    and the plain state status otherwise. A failed command yields
    ``("missing", <error text>)``; blank output yields ``("unknown", "")``.
    """
    status_template = "{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}"
    inspect_command = ("podman", "inspect", "--format", status_template, container_name)

    try:
        completed = run_host_command(command=inspect_command, timeout_seconds=20.0)
    except HostCommandError as error:
        return "missing", _format_host_command_error(error)
    else:
        reported_status = completed.stdout.strip()
        if not reported_status:
            reported_status = "unknown"
        return reported_status, ""
|
|
|
|
|
|
def _read_container_logs(container_name: str) -> tuple[str, str]:
    """Return ``(logs, error)`` holding the tail of one container's logs.

    At most ``MAX_DIAGNOSTIC_LOG_LINES`` lines are requested from
    ``podman logs``. Stripped stdout is preferred; stripped stderr is used only
    when stdout is blank. A failed command yields ``("", <error text>)``.
    """
    tail_command = ("podman", "logs", "--tail", str(MAX_DIAGNOSTIC_LOG_LINES), container_name)

    try:
        completed = run_host_command(command=tail_command, timeout_seconds=20.0)
    except HostCommandError as error:
        return "", _format_host_command_error(error)

    captured = completed.stdout.strip()
    if not captured:
        captured = completed.stderr.strip()
    return captured, ""
|
|
|
|
|
|
def _format_host_command_error(error: HostCommandError) -> str:
    """Pick the most useful text to describe a failed host command.

    Preference order: stripped stderr, then stripped stdout, then the
    exception's own string form when both streams are blank.
    """
    return error.stderr.strip() or error.stdout.strip() or str(error)
|