WIP
This commit is contained in:
parent
e70a0584c9
commit
a7a5b5c8ea
43 changed files with 5531 additions and 9 deletions
254
control_plane/observability.py
Normal file
254
control_plane/observability.py
Normal file
|
|
@ -0,0 +1,254 @@
|
|||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from typing import TYPE_CHECKING
|
||||
from urllib.error import HTTPError
|
||||
from urllib.error import URLError
|
||||
from urllib.request import urlopen
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from control_plane.host_commands import HostCommandError
|
||||
from control_plane.host_commands import run_host_command
|
||||
from control_plane.local_test_runtime import build_test_django_container_names
|
||||
from control_plane.local_test_runtime import build_test_django_local_url
|
||||
from control_plane.local_test_runtime import build_test_django_project_root
|
||||
from control_plane.models import Deployment
|
||||
from control_plane.models import DeploymentStatus
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
|
||||
from control_plane.models import RuntimeService
|
||||
|
||||
|
||||
MAX_DIAGNOSTIC_LOG_LINES = 200
|
||||
DEFAULT_SENTINEL_PROBE_TIMEOUT_SECONDS = 2.0
|
||||
|
||||
type JsonPrimitive = bool | int | float | str | None
|
||||
type JsonValue = JsonPrimitive | list[JsonValue] | dict[str, JsonValue]
|
||||
|
||||
|
||||
def build_test_deployment_diagnostics_root(deployment: Deployment) -> Path:
    """Locate the directory that holds one deployment's persisted diagnostics.

    Returns:
        The ``diagnostics`` directory placed in the parent of the project root
        reported by ``build_test_django_project_root``.

    """
    project_root = build_test_django_project_root(deployment)
    return project_root.parent / "diagnostics"
|
||||
|
||||
|
||||
def build_test_deployment_diagnostics_snapshot_path(deployment: Deployment) -> Path:
    """Locate the JSON file holding one deployment's latest diagnostics snapshot.

    Returns:
        Path to ``snapshot.json`` inside the deployment's diagnostics root.

    """
    diagnostics_root = build_test_deployment_diagnostics_root(deployment)
    return diagnostics_root / "snapshot.json"
|
||||
|
||||
|
||||
def capture_test_deployment_diagnostics(deployment_id: str) -> None:
    """Collect a fresh diagnostics snapshot for one deployment and persist it as JSON.

    The deployment is looked up by primary key (with its hosted site's tenant and
    its runtime services loaded up front), the snapshot directory is created when
    missing, and the snapshot file is overwritten with the new payload.
    """
    deployment_query = Deployment.objects.select_related("hosted_site__tenant").prefetch_related(
        "runtime_services"
    )
    deployment = deployment_query.get(pk=deployment_id)

    target_path = build_test_deployment_diagnostics_snapshot_path(deployment)
    target_path.parent.mkdir(parents=True, exist_ok=True)

    # Build the payload only after the directory exists, then write it in one call.
    serialized_snapshot = json.dumps(_build_diagnostics_snapshot(deployment), indent=2)
    target_path.write_text(serialized_snapshot, encoding="utf-8")
|
||||
|
||||
|
||||
def load_test_deployment_diagnostics(deployment: Deployment) -> dict[str, JsonValue] | None:
    """Load the latest persisted diagnostics snapshot for one deployment.

    The snapshot file is read directly rather than checked for existence first, so
    a snapshot that disappears between a check and the read cannot raise.

    Returns:
        Parsed diagnostics payload, or None when no snapshot has been captured yet.
        When a snapshot exists but cannot be read, decoded as UTF-8, or parsed into
        a JSON object, a placeholder payload is returned with ``capture_error``
        describing the problem and ``captured_at`` set to None.

    """
    snapshot_path = build_test_deployment_diagnostics_snapshot_path(deployment)

    try:
        raw_snapshot = snapshot_path.read_text(encoding="utf-8")
    except (FileNotFoundError, NotADirectoryError):
        # Nothing has been captured yet (same outcome the old exists() check gave).
        return None
    except (OSError, UnicodeDecodeError) as error:
        # Unreadable or non-UTF-8 file: report it like any other corrupt snapshot.
        return {
            "capture_error": f"Unable to read diagnostics snapshot: {error}",
            "captured_at": None,
        }

    try:
        payload = json.loads(raw_snapshot)
    except json.JSONDecodeError as error:
        return {
            "capture_error": f"Unable to parse diagnostics snapshot: {error}",
            "captured_at": None,
        }

    if not isinstance(payload, dict):
        return {
            "capture_error": "Diagnostics snapshot is not a JSON object.",
            "captured_at": None,
        }

    return payload
|
||||
|
||||
|
||||
def probe_test_deployment_health(
    deployment: Deployment,
    *,
    timeout_seconds: float = DEFAULT_SENTINEL_PROBE_TIMEOUT_SECONDS,
) -> dict[str, JsonValue]:
    """Probe the generated deployment sentinel endpoint and return structured status.

    No request is made unless the deployment status is RUNNING or BOOTING; otherwise
    the default "not-running" result is returned unchanged. Transport, protocol,
    decoding, and JSON parsing failures are reported as "unreachable" instead of
    being raised.

    Args:
        deployment: Deployment whose sentinel URL is probed.
        timeout_seconds: Timeout handed to ``urlopen`` for the request.

    Returns:
        JSON-serializable probe state describing current sentinel reachability and payload.
        ``http_status`` is filled in whenever the server answered, including HTTP
        error responses and responses whose body is not valid UTF-8 JSON.

    """
    # Imported locally so this function does not rely on a new module-level import.
    from http.client import HTTPException

    sentinel_url = build_test_django_local_url(deployment)
    result: dict[str, JsonValue] = {
        "checked_at": timezone.now().isoformat(),
        "deployment_id": str(deployment.id),
        "deployment_status": deployment.status,
        "sentinel_url": sentinel_url,
        "ok": False,
        "status": "not-running",
        "label": "Not Running",
        "payload": None,
        "error": "",
        "http_status": None,
    }
    if deployment.status not in {DeploymentStatus.RUNNING.value, DeploymentStatus.BOOTING.value}:
        return result

    try:
        with urlopen(sentinel_url, timeout=timeout_seconds) as response:  # noqa: S310
            # Record the status before parsing so it survives a malformed body.
            result["http_status"] = int(getattr(response, "status", 200))
            payload = json.loads(response.read().decode("utf-8"))
    except (
        HTTPError,
        URLError,
        OSError,
        HTTPException,
        UnicodeDecodeError,
        json.JSONDecodeError,
    ) as error:
        result["status"] = "unreachable"
        result["label"] = "Unreachable"
        result["error"] = str(error)
        if isinstance(error, HTTPError):
            # The server did answer; keep its status code and release the response.
            result["http_status"] = error.code
            error.close()
        return result

    if isinstance(payload, dict):
        result["payload"] = payload
        if payload.get("status") == "ok":
            result["ok"] = True
            result["status"] = "healthy"
            result["label"] = "Healthy"
            return result
    else:
        # Non-object JSON is wrapped so "payload" is always an object or None.
        result["payload"] = {"value": str(payload)}

    result["status"] = "unexpected-payload"
    result["label"] = "Unexpected"
    return result
|
||||
|
||||
|
||||
def _build_diagnostics_snapshot(deployment: Deployment) -> dict[str, JsonValue]:
    """Assemble the JSON-serializable diagnostics payload for one deployment.

    The payload holds identifying fields from the deployment, followed by pod
    diagnostics, diagnostics for the Django server container, and one diagnostics
    entry per runtime service (ordered by service kind).
    """
    services = _ordered_runtime_services(deployment.runtime_services.all())
    django_container_name, _ = build_test_django_container_names(deployment)
    # The pod name comes from the first ordered service; no services means no pod name.
    pod_name = services[0].network_name if services else ""

    snapshot: dict[str, JsonValue] = {
        "captured_at": timezone.now().isoformat(),
        "deployment_id": str(deployment.id),
        "deployment_status": deployment.status,
        "tenant_slug": deployment.hosted_site.tenant.slug,
        "site_slug": deployment.hosted_site.slug,
        "guest_port": deployment.guest_port,
        "sentinel_url": build_test_django_local_url(deployment),
        "last_error": deployment.last_error,
    }

    # Host-side collection runs in the same order as the keys appear: pod, django, services.
    snapshot["pod"] = _collect_pod_diagnostics(pod_name)
    snapshot["django"] = _collect_container_diagnostics(
        container_name=django_container_name,
        control_plane_status=deployment.status,
        label="django",
    )
    service_reports = [
        _collect_container_diagnostics(
            container_name=service.container_name,
            control_plane_status=service.status,
            label=service.kind,
        )
        for service in services
    ]
    snapshot["runtime_services"] = service_reports
    return snapshot
|
||||
|
||||
|
||||
def _ordered_runtime_services(runtime_services: Iterable[RuntimeService]) -> tuple[RuntimeService, ...]:
    """Return the given runtime services as a tuple sorted ascending by ``kind``."""
    ordered = list(runtime_services)
    ordered.sort(key=lambda service: service.kind)
    return tuple(ordered)
|
||||
|
||||
|
||||
def _collect_pod_diagnostics(pod_name: str) -> dict[str, JsonValue]:
    """Report the state of a pod as seen by ``podman pod inspect``.

    Returns:
        Mapping with ``name``, ``status`` and ``error``. An empty pod name or a
        failed host command yields status "missing" with an explanatory error;
        empty command output yields status "unknown".

    """
    if not pod_name:
        return {
            "name": "",
            "status": "missing",
            "error": "No runtime services are linked to this deployment yet.",
        }

    inspect_command = ("podman", "pod", "inspect", "--format", "{{.State}}", pod_name)
    try:
        completed = run_host_command(command=inspect_command, timeout_seconds=20.0)
    except HostCommandError as error:
        failure_report: dict[str, JsonValue] = {
            "name": pod_name,
            "status": "missing",
            "error": _format_host_command_error(error),
        }
        return failure_report

    pod_state = completed.stdout.strip()
    return {
        "name": pod_name,
        "status": pod_state if pod_state else "unknown",
        "error": "",
    }
|
||||
|
||||
|
||||
def _collect_container_diagnostics(
    *,
    container_name: str,
    control_plane_status: str,
    label: str,
) -> dict[str, JsonValue]:
    """Gather inspected status and recent logs for one container.

    Args:
        container_name: Container passed to the status and log helpers.
        control_plane_status: Status value supplied by the caller, echoed back as-is.
        label: Caller-supplied label for the entry, echoed back as-is.

    Returns:
        Mapping combining the inputs with the inspected container status, the log
        text, and any error text produced by either lookup.

    """
    # Status is inspected first, then logs are read.
    status_and_error = _inspect_container_status(container_name)
    logs_and_error = _read_container_logs(container_name)

    report: dict[str, JsonValue] = {
        "label": label,
        "container_name": container_name,
        "control_plane_status": control_plane_status,
    }
    report["container_status"], inspect_error = status_and_error
    report["logs"], log_error = logs_and_error
    report["inspect_error"] = inspect_error
    report["log_error"] = log_error
    return report
|
||||
|
||||
|
||||
def _inspect_container_status(container_name: str) -> tuple[str, str]:
    """Ask ``podman inspect`` for a container's health status, or its state status.

    Returns:
        ``(status, error)``. A failed host command yields ``("missing", <error text>)``;
        otherwise the error is empty and the status is the trimmed command output,
        or "unknown" when that output is empty.

    """
    # The template prints the health status when health data is present, else the state status.
    status_template = "{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}"
    inspect_command = ("podman", "inspect", "--format", status_template, container_name)

    try:
        completed = run_host_command(command=inspect_command, timeout_seconds=20.0)
    except HostCommandError as error:
        return "missing", _format_host_command_error(error)

    reported_status = completed.stdout.strip()
    if not reported_status:
        reported_status = "unknown"
    return reported_status, ""
|
||||
|
||||
|
||||
def _read_container_logs(container_name: str) -> tuple[str, str]:
    """Read the tail of a container's logs via ``podman logs``.

    At most ``MAX_DIAGNOSTIC_LOG_LINES`` lines are requested.

    Returns:
        ``(logs, error)``. A failed host command yields ``("", <error text>)``;
        otherwise the error is empty and ``logs`` holds the trimmed stdout followed
        by the trimmed stderr, skipping whichever stream is empty.

    """
    try:
        result = run_host_command(
            command=("podman", "logs", "--tail", str(MAX_DIAGNOSTIC_LOG_LINES), container_name),
            timeout_seconds=20.0,
        )
    except HostCommandError as error:
        return "", _format_host_command_error(error)

    # Keep both streams: choosing stdout OR stderr dropped every stderr line as soon
    # as stdout had any content.
    # NOTE(review): presumably stderr carries the container's own stderr output — confirm.
    streams = (result.stdout.strip(), result.stderr.strip())
    output = "\n".join(stream for stream in streams if stream)
    return output, ""
|
||||
|
||||
|
||||
def _format_host_command_error(error: HostCommandError) -> str:
    """Pick the most useful text to show for a failed host command.

    Returns:
        The trimmed stderr when it is non-empty, otherwise the trimmed stdout when
        that is non-empty, otherwise ``str(error)``.

    """
    stderr_text = error.stderr.strip()
    if stderr_text:
        return stderr_text

    stdout_text = error.stdout.strip()
    return stdout_text if stdout_text else str(error)
|
||||
Loading…
Add table
Add a link
Reference in a new issue