Tussilago/control_plane/tasks.py
2026-04-27 20:43:26 +02:00

656 lines
24 KiB
Python

from __future__ import annotations
import json
import logging
import secrets
import time
from pathlib import Path
from typing import TYPE_CHECKING
from typing import NoReturn
from urllib.request import urlopen
from celery import shared_task
from django.conf import settings
from django.db import transaction
from django.utils import timezone
from control_plane.host_commands import HostCommandError
from control_plane.host_commands import run_host_command
from control_plane.local_test_runtime import TEST_DJANGO_CONTAINER_PORT
from control_plane.local_test_runtime import TEST_DJANGO_WORKDIR
from control_plane.local_test_runtime import build_test_django_container_context_path
from control_plane.local_test_runtime import build_test_django_container_labels
from control_plane.local_test_runtime import build_test_django_container_names
from control_plane.local_test_runtime import build_test_django_containerfile_path
from control_plane.local_test_runtime import build_test_django_environment
from control_plane.local_test_runtime import build_test_django_image_reference
from control_plane.local_test_runtime import build_test_django_local_url
from control_plane.local_test_runtime import build_test_django_secret_mounts
from control_plane.local_test_runtime import write_test_django_project
from control_plane.models import Deployment
from control_plane.models import DeploymentStatus
from control_plane.models import RuntimeService
from control_plane.models import RuntimeServiceKind
from control_plane.models import RuntimeServiceStatus
from control_plane.observability import capture_test_deployment_diagnostics
from control_plane.runtime_plans import DjangoApplicationLaunchConfig
from control_plane.runtime_plans import DjangoContainerImageBuildConfig
from control_plane.runtime_plans import DjangoContainerRuntimeConfig
from control_plane.runtime_plans import PostgresContainerConfig
from control_plane.runtime_plans import RedisContainerConfig
from control_plane.runtime_plans import build_django_container_image_command
from control_plane.runtime_plans import build_django_container_run_command
from control_plane.runtime_plans import build_django_migrate_command
from control_plane.runtime_plans import build_django_server_command
from control_plane.runtime_plans import build_postgres_container_command
from control_plane.runtime_plans import build_redis_container_command
if TYPE_CHECKING:
from celery.app.task import Task
type BoundControlPlaneTask = Task[..., str]
logger = logging.getLogger("tussilago.control_plane.tasks")
DEFAULT_HTTP_READY_TIMEOUT_SECONDS = 45.0
DEFAULT_CONTAINER_READY_TIMEOUT_SECONDS = 45.0
TERMINAL_DEPLOYMENT_STATES: frozenset[str] = frozenset(
{
DeploymentStatus.DESTROYED.value,
DeploymentStatus.FAILED.value,
},
)
TERMINAL_RUNTIME_SERVICE_STATES: frozenset[str] = frozenset(
{
RuntimeServiceStatus.DESTROYING.value,
RuntimeServiceStatus.DESTROYED.value,
},
)
def _runtime_service_root(runtime_service: RuntimeService) -> Path:
    """Locate the directory that holds one runtime service's local test artifacts.

    Returns:
        ``<DATA_DIR>/runtime-services/<deployment id>/<service kind>``.
    """
    deployment_segment = str(runtime_service.deployment_id)
    return Path(settings.DATA_DIR).joinpath("runtime-services", deployment_segment, runtime_service.kind)
def _mark_deployment_failed(*, deployment_id: str, message: str) -> None:
    """Record a deployment as failed together with the error that caused it.

    The row is fetched with ``select_for_update`` inside a transaction, so the
    status, error text, and finish timestamp are written together.

    Args:
        deployment_id: Primary key of the deployment to update.
        message: Error text stored in ``last_error``.
    """
    persisted_fields = ["status", "last_error", "finished_at", "updated_at"]
    with transaction.atomic():
        failed_deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        failed_deployment.status = DeploymentStatus.FAILED.value
        failed_deployment.last_error = message
        failed_deployment.finished_at = timezone.now()
        failed_deployment.save(update_fields=persisted_fields)
def _capture_test_deployment_diagnostics_snapshot(deployment_id: str) -> None:
    """Capture a diagnostics snapshot without letting known failures escape.

    ``OSError``, ``ValueError``, and ``Deployment.DoesNotExist`` raised by the
    capture are logged with their traceback and suppressed, so callers can use
    this on both success and failure paths. Any other exception propagates.
    """
    try:
        capture_test_deployment_diagnostics(deployment_id)
    except (OSError, ValueError, Deployment.DoesNotExist) as snapshot_error:
        # Checked in the same precedence as separate ``except`` clauses would be.
        if isinstance(snapshot_error, OSError):
            log_template = "Failed to write diagnostics snapshot deployment_id=%s"
        elif isinstance(snapshot_error, ValueError):
            log_template = "Invalid diagnostics snapshot state deployment_id=%s"
        else:
            log_template = "Diagnostics snapshot skipped for missing deployment_id=%s"
        logger.exception(log_template, deployment_id)
def _ensure_test_django_image_exists(image_reference: str) -> None:
    """Make sure the reusable Django test image is present, building it on demand.

    Args:
        image_reference: Image name checked with ``podman image exists`` and used for the build.

    Raises:
        HostCommandError: If the existence probe fails with a return code other
            than 1, or if the image build itself fails.
    """
    probe_command = ("podman", "image", "exists", image_reference)
    try:
        run_host_command(command=probe_command)
    except HostCommandError as probe_error:
        # Return code 1 is treated as "image missing"; anything else is a real failure.
        if probe_error.returncode != 1:
            raise
        build_config = DjangoContainerImageBuildConfig(
            image_reference=image_reference,
            containerfile_path=build_test_django_containerfile_path(),
            context_directory=build_test_django_container_context_path(),
        )
        run_host_command(
            command=build_django_container_image_command(build_config),
            timeout_seconds=300.0,
        )
def _read_container_logs(container_name: str) -> str:
    """Fetch a container's Podman logs for inclusion in failure reports.

    Returns:
        Stripped stdout of ``podman logs``; stripped stderr when stdout is
        blank; an empty string when the command fails.
    """
    try:
        logs_result = run_host_command(command=("podman", "logs", container_name))
    except HostCommandError:
        # Logs are a nice-to-have for error messages, never a reason to fail.
        return ""
    captured_stdout = logs_result.stdout.strip()
    if captured_stdout:
        return captured_stdout
    return logs_result.stderr.strip()
def _read_container_status(container_name: str) -> str:
    """Ask Podman for one container's health status, or its plain state without health data.

    Returns:
        Stripped stdout of the ``podman inspect`` call.

    Note:
        Errors from ``run_host_command`` are not caught here and propagate to the caller.
    """
    # Prefer the health-check status; fall back to the container state when
    # the container has no health information.
    status_template = "{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}"
    inspect_command = ("podman", "inspect", "--format", status_template, container_name)
    inspection = run_host_command(command=inspect_command)
    return inspection.stdout.strip()
def _wait_for_container_ready(
    runtime_service: RuntimeService,
    *,
    timeout_seconds: float = DEFAULT_CONTAINER_READY_TIMEOUT_SECONDS,
) -> None:
    """Block until Podman reports one runtime service container as healthy.

    The status is polled with a one-second pause between attempts until it
    reads ``healthy``, a failure status shows up, or the timeout elapses.

    Args:
        runtime_service: Service whose ``container_name`` is inspected.
        timeout_seconds: Maximum time to keep polling.

    Raises:
        RuntimeError: If the container reports ``exited``, ``dead``, ``stopped``,
            or ``unhealthy``; container logs are appended when available.
        TimeoutError: If the container is still not healthy when the timeout elapses.
    """
    failure_statuses = ("exited", "dead", "stopped", "unhealthy")
    give_up_at = time.monotonic() + timeout_seconds
    while time.monotonic() < give_up_at:
        current_status = _read_container_status(runtime_service.container_name)
        if current_status == "healthy":
            return
        if current_status in failure_statuses:
            container_logs = _read_container_logs(runtime_service.container_name)
            failure_parts = [f"Runtime service {runtime_service.kind} failed to become ready: {current_status}."]
            if container_logs:
                failure_parts.append(container_logs)
            raise RuntimeError("\n".join(failure_parts))
        time.sleep(1.0)
    timeout_message = f"Timed out waiting for runtime service {runtime_service.kind} to become healthy."
    raise TimeoutError(timeout_message)
def _wait_for_http_ready(
    url: str,
    *,
    timeout_seconds: float = DEFAULT_HTTP_READY_TIMEOUT_SECONDS,
) -> dict[str, str | int]:
    """Poll a sentinel endpoint until it answers with a JSON object whose ``status`` is ``"ok"``.

    Each attempt uses a two-second request timeout and is followed by a
    one-second pause. Connection errors, undecodable bodies, malformed JSON,
    and JSON payloads that are not objects all count as "not ready yet".

    Args:
        url: Sentinel endpoint to request.
        timeout_seconds: Maximum time to keep polling.

    Returns:
        Parsed JSON response from the sentinel endpoint.

    Raises:
        TimeoutError: If the endpoint does not become healthy before timeout;
            chained from the last request/decoding error when there was one.
    """
    deadline = time.monotonic() + timeout_seconds
    last_error: Exception | None = None
    while time.monotonic() < deadline:
        try:
            with urlopen(url, timeout=2) as response:  # noqa: S310
                payload = json.loads(response.read().decode("utf-8"))
        except (OSError, ValueError) as error:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError,
            # so a garbage body from a half-started server is retried, not fatal.
            last_error = error
        else:
            # Guard against valid JSON that is not an object (list, string, null, ...).
            if isinstance(payload, dict) and payload.get("status") == "ok":
                return payload
        time.sleep(1.0)
    msg = f"Timed out waiting for healthy Django sentinel endpoint at {url}"
    raise TimeoutError(msg) from last_error
def _build_django_runtime_services(deployment: Deployment) -> tuple[RuntimeService, ...]:
    """Load every runtime service attached to a deployment, ordered by ``kind``.

    Returns:
        Runtime services with their deployment, hosted site, and tenant
        relations selected in the same query.
    """
    services_for_deployment = (
        RuntimeService.objects
        .select_related("deployment__hosted_site__tenant")
        .filter(deployment=deployment)
        .order_by("kind")
    )
    return tuple(services_for_deployment)
def _get_ready_django_runtime_services(deployment: Deployment) -> tuple[RuntimeService, ...]:
    """Fetch a deployment's runtime services, insisting that every one is ready.

    Returns:
        All runtime services of the deployment, ordered by kind.

    Raises:
        ValueError: If the deployment has no runtime services, or any of them
            is not in the ready state.
    """
    runtime_services = _build_django_runtime_services(deployment)
    if runtime_services and all(
        candidate.status == RuntimeServiceStatus.READY.value for candidate in runtime_services
    ):
        return runtime_services
    msg = "All runtime services must be ready before provisioning the Django test runtime."
    raise ValueError(msg)
def _build_django_runtime_configs(
    deployment: Deployment,
    runtime_services: tuple[RuntimeService, ...],
    *,
    project_root: Path,
) -> tuple[str, DjangoContainerRuntimeConfig, DjangoContainerRuntimeConfig]:
    """Assemble the Podman runtime configs for the migrate and server containers.

    Both containers share image, environment, secret mounts, labels, and
    network; only the server config carries port settings.

    Args:
        deployment: Deployment whose test runtime is being provisioned.
        runtime_services: Backing services; the first entry supplies the network name.
        project_root: Directory of the generated Django project used as the application directory.

    Returns:
        Image reference plus migrate and server Podman runtime configs.
    """
    image_reference = build_test_django_image_reference()
    shared_environment = build_test_django_environment(deployment, runtime_services)
    shared_secret_mounts = build_test_django_secret_mounts(runtime_services)
    shared_labels = build_test_django_container_labels(deployment)
    server_name, migrate_name = build_test_django_container_names(deployment)
    # The network name doubles as the pod name for both containers.
    shared_network = runtime_services[0].network_name

    migrate_config = DjangoContainerRuntimeConfig(
        container_name=migrate_name,
        network_name=shared_network,
        hostname="django-migrate.internal",
        image_reference=image_reference,
        application_directory=project_root,
        pod_name=shared_network,
        working_directory=TEST_DJANGO_WORKDIR,
        environment=shared_environment,
        secret_mounts=shared_secret_mounts,
        labels=shared_labels,
    )
    server_config = DjangoContainerRuntimeConfig(
        container_name=server_name,
        network_name=shared_network,
        hostname="django.internal",
        image_reference=image_reference,
        application_directory=project_root,
        pod_name=shared_network,
        host_port=deployment.guest_port,
        guest_port=TEST_DJANGO_CONTAINER_PORT,
        working_directory=TEST_DJANGO_WORKDIR,
        environment=shared_environment,
        secret_mounts=shared_secret_mounts,
        labels=shared_labels,
    )
    return image_reference, migrate_config, server_config
def _launch_django_runtime(
    deployment: Deployment,
    *,
    image_reference: str,
    migrate_config: DjangoContainerRuntimeConfig,
    server_config: DjangoContainerRuntimeConfig,
) -> dict[str, str | int]:
    """Ensure the image, run migrations, start the server container, and wait for readiness.

    Args:
        deployment: Deployment supplying the WSGI module and the local URL to poll.
        image_reference: Image that must exist before any container starts.
        migrate_config: Runtime config for the one-shot migration container.
        server_config: Runtime config for the detached server container.

    Returns:
        Parsed JSON sentinel payload from the running Django test app.
    """
    container_python = Path("/usr/local/bin/python")

    _ensure_test_django_image_exists(image_reference)

    # Migrations run in the foreground in a container that is removed afterwards.
    migrate_run_command = build_django_container_run_command(
        migrate_config,
        command=build_django_migrate_command(python_executable=container_python),
        detach=False,
        remove=True,
    )
    run_host_command(command=migrate_run_command, timeout_seconds=120.0)

    launch_config = DjangoApplicationLaunchConfig(
        wsgi_module=deployment.hosted_site.wsgi_module,
        bind_host="0.0.0.0",  # noqa: S104
        port=TEST_DJANGO_CONTAINER_PORT,
        workers=1,
        python_executable=container_python,
    )
    server_run_command = build_django_container_run_command(
        server_config,
        command=build_django_server_command(launch_config),
        detach=True,
    )
    run_host_command(command=server_run_command, timeout_seconds=120.0)

    sentinel_url = build_test_django_local_url(deployment)
    return _wait_for_http_ready(sentinel_url)
def _retry_or_fail_django_runtime(
    self: BoundControlPlaneTask,
    *,
    deployment: Deployment,
    error: HostCommandError | TimeoutError,
) -> NoReturn:
    """Schedule another attempt, or record the failure once retries are used up.

    While retries remain, the task is retried with an exponential countdown
    capped at 300 seconds. Otherwise the deployment is marked failed (server
    container logs appended when available), diagnostics are captured, and the
    original error is re-raised.

    Args:
        self: Bound Celery task providing ``request.retries``, ``max_retries``, and ``retry``.
        deployment: Deployment whose Django runtime failed to provision.
        error: Failure that triggered this call.
    """
    attempts_so_far = getattr(self.request, "retries", 0)
    logger.warning(
        "Django runtime provisioning retry deployment_id=%s retries=%s error=%s",
        deployment.id,
        attempts_so_far,
        error,
    )
    if attempts_so_far < self.max_retries:
        backoff_seconds = min(300, 2 ** (attempts_so_far + 1))
        raise self.retry(exc=error, countdown=backoff_seconds) from error

    # Out of retries: persist the failure with whatever the server container logged.
    server_container_name, _ = build_test_django_container_names(deployment)
    server_logs = _read_container_logs(server_container_name)
    failure_message = f"{error}\n{server_logs}" if server_logs else str(error)
    _mark_deployment_failed(deployment_id=str(deployment.id), message=failure_message)
    _capture_test_deployment_diagnostics_snapshot(str(deployment.id))
    logger.error("Django runtime provisioning failed deployment_id=%s", deployment.id)
    raise error
def run_test_django_runtime_provisioning(deployment_id: str) -> str:
    """Run generated Django runtime provisioning inline for one deployment.

    Deployments that are already terminal or running are left untouched.
    Otherwise the test project is written, the runtime is launched, and the
    deployment is moved to running unless it became terminal in the meantime.

    Args:
        deployment_id: Primary key of the deployment to provision.

    Returns:
        Final deployment status for the processed deployment.

    Raises:
        ValueError: If the deployment's runtime services are missing or not ready.
        HostCommandError: If a host command fails while launching the runtime.
        TimeoutError: If the sentinel endpoint never reports ready.
    """
    deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id)
    if deployment.status in TERMINAL_DEPLOYMENT_STATES or deployment.status == DeploymentStatus.RUNNING.value:
        return deployment.status

    runtime_services = _get_ready_django_runtime_services(deployment)
    project_root = write_test_django_project(deployment, runtime_services)
    image_reference, migrate_config, server_config = _build_django_runtime_configs(
        deployment,
        runtime_services,
        project_root=project_root,
    )
    sentinel_payload = _launch_django_runtime(
        deployment,
        image_reference=image_reference,
        migrate_config=migrate_config,
        server_config=server_config,
    )

    with transaction.atomic():
        # Re-read under a row lock: the deployment may have been failed or
        # destroyed while the containers were starting.
        locked_deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        if locked_deployment.status in TERMINAL_DEPLOYMENT_STATES:
            return locked_deployment.status
        locked_deployment.status = DeploymentStatus.RUNNING.value
        locked_deployment.last_error = ""
        locked_deployment.started_at = timezone.now()
        locked_deployment.finished_at = None
        locked_deployment.save(update_fields=["status", "last_error", "started_at", "finished_at", "updated_at"])

    _capture_test_deployment_diagnostics_snapshot(deployment_id)

    # Log from the instance loaded with select_related; the locked row has no
    # related objects cached, so using it would cost extra lazy queries.
    hosted_site = deployment.hosted_site
    logger.info(
        "Django runtime ready deployment_id=%s tenant_slug=%s site_slug=%s postgres=%s redis=%s",
        deployment_id,
        hosted_site.tenant.slug,
        hosted_site.slug,
        sentinel_payload.get("postgres"),
        sentinel_payload.get("redis"),
    )
    return DeploymentStatus.RUNNING.value
def _ensure_secret_file(password_file: Path) -> None:
"""Write a reusable password file for a test container if one does not already exist."""
password_file.parent.mkdir(parents=True, exist_ok=True)
if password_file.exists():
return
password_file.write_text(f"{secrets.token_urlsafe(24)}\n", encoding="utf-8")
password_file.chmod(0o600)
def _ensure_podman_pod(*, pod_name: str, host_port: int) -> None:
    """Make sure a Podman pod with the given name exists, creating it when absent.

    A newly created pod publishes ``127.0.0.1:<host_port>`` to the Django test
    container port.

    Args:
        pod_name: Name of the pod to probe and, if needed, create.
        host_port: Loopback port on the host to publish.

    Raises:
        HostCommandError: If the existence probe fails with a return code other
            than 1, or if pod creation fails.
    """
    try:
        run_host_command(command=("podman", "pod", "exists", pod_name))
    except HostCommandError as probe_error:
        # Return code 1 is treated as "pod missing"; anything else is a real failure.
        if probe_error.returncode != 1:
            raise
        port_mapping = f"127.0.0.1:{host_port}:{TEST_DJANGO_CONTAINER_PORT}"
        create_command = (
            "podman",
            "pod",
            "create",
            "--replace",
            "--name",
            pod_name,
            "--publish",
            port_mapping,
        )
        run_host_command(command=create_command)
def _build_runtime_service_command(
runtime_service: RuntimeService,
*,
data_directory: Path,
password_file: Path,
) -> tuple[str, ...]:
"""Build a Podman command for one runtime service kind.
Returns:
Podman command arguments for the runtime service.
Raises:
ValueError: If the runtime service kind or configuration is unsupported.
"""
if runtime_service.kind == RuntimeServiceKind.POSTGRESQL.value:
if not runtime_service.connection_username or not runtime_service.connection_database:
msg = "PostgreSQL runtime service requires connection credentials."
raise ValueError(msg)
return build_postgres_container_command(
PostgresContainerConfig(
container_name=runtime_service.container_name,
network_name=runtime_service.network_name,
hostname=runtime_service.hostname,
username=runtime_service.connection_username,
database_name=runtime_service.connection_database,
data_directory=data_directory,
password_file=password_file,
pod_name=runtime_service.network_name,
image_reference=runtime_service.image_reference,
),
)
if runtime_service.kind == RuntimeServiceKind.REDIS.value:
return build_redis_container_command(
RedisContainerConfig(
container_name=runtime_service.container_name,
network_name=runtime_service.network_name,
hostname=runtime_service.hostname,
data_directory=data_directory,
password_file=password_file,
pod_name=runtime_service.network_name,
image_reference=runtime_service.image_reference,
),
)
msg = f"Unsupported runtime service kind: {runtime_service.kind}"
raise ValueError(msg)
def _provision_runtime_service_container(runtime_service: RuntimeService) -> None:
    """Prepare host state for one runtime service, start its container, and wait for readiness.

    Steps: create the data directory, ensure the password file, ensure the
    Podman pod, run the service's container command, then poll until healthy.
    """
    artifacts_root = _runtime_service_root(runtime_service)
    data_directory = artifacts_root.joinpath("data")
    password_file = artifacts_root.joinpath("secrets", "password")

    data_directory.mkdir(parents=True, exist_ok=True)
    _ensure_secret_file(password_file)
    _ensure_podman_pod(
        pod_name=runtime_service.network_name,
        host_port=runtime_service.deployment.guest_port,
    )
    run_host_command(
        command=_build_runtime_service_command(
            runtime_service,
            data_directory=data_directory,
            password_file=password_file,
        ),
    )
    _wait_for_container_ready(runtime_service)
@shared_task(
    bind=True,
    autoretry_for=(HostCommandError, TimeoutError),
    retry_backoff=True,
    retry_backoff_max=300,
    retry_jitter=True,
    max_retries=5,
)
def provision_test_runtime_services(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Seed and provision runtime service test containers for one deployment.

    Services that are already ready, destroying, or destroyed are skipped. Each
    remaining service is marked provisioning, started, and then marked ready;
    on failure it is marked failed and the error is re-raised.

    Returns:
        The deployment's status when it is already terminal, otherwise the
        ready runtime service status.

    Raises:
        HostCommandError: If Podman commands fail while provisioning backing services.
        RuntimeError: If a backing container exits or becomes unhealthy during startup.
        TimeoutError: If a backing container never becomes healthy.
        ValueError: If runtime service configuration is invalid.
    """
    del self
    status_fields = ["status", "updated_at"]
    runtime_failures = (HostCommandError, RuntimeError, TimeoutError)

    deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id)
    if deployment.status in TERMINAL_DEPLOYMENT_STATES:
        return deployment.status

    deployment.ensure_test_runtime_services()
    all_services = tuple(
        RuntimeService.objects
        .select_related("deployment__hosted_site__tenant")
        .filter(deployment=deployment)
        .order_by("kind"),
    )
    skipped_statuses = TERMINAL_RUNTIME_SERVICE_STATES | {RuntimeServiceStatus.READY.value}
    pending_services = tuple(service for service in all_services if service.status not in skipped_statuses)
    if not pending_services:
        return RuntimeServiceStatus.READY.value

    for service in pending_services:
        service.status = RuntimeServiceStatus.PROVISIONING.value
        service.save(update_fields=status_fields)
        try:
            _provision_runtime_service_container(service)
        except (*runtime_failures, ValueError) as provisioning_error:
            # Runtime failures take precedence over ValueError, mirroring the
            # order of two separate handlers; only they capture diagnostics.
            is_runtime_failure = isinstance(provisioning_error, runtime_failures)
            service.status = RuntimeServiceStatus.FAILED.value
            service.save(update_fields=status_fields)
            if is_runtime_failure:
                _capture_test_deployment_diagnostics_snapshot(deployment_id)
                log_template = (
                    "Runtime service provisioning failed deployment_id=%s runtime_service_id=%s kind=%s"
                )
            else:
                log_template = (
                    "Runtime service configuration invalid deployment_id=%s runtime_service_id=%s kind=%s"
                )
            logger.exception(log_template, deployment_id, service.id, service.kind)
            raise
        service.status = RuntimeServiceStatus.READY.value
        service.save(update_fields=status_fields)

    _capture_test_deployment_diagnostics_snapshot(deployment_id)
    return RuntimeServiceStatus.READY.value
@shared_task(
    bind=True,
    retry_backoff=True,
    retry_backoff_max=300,
    retry_jitter=True,
    max_retries=5,
)
def mark_deployment_provisioning(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Transition a deployment to provisioning, doing nothing when that is not applicable.

    Terminal deployments and deployments already provisioning are returned
    unchanged, so repeated deliveries are harmless. ``last_error`` is cleared
    as part of the transition.

    Returns:
        The deployment status after the transition attempt.
    """
    # NOTE(review): the retry_* options appear to apply to automatic retries only;
    # without autoretry_for, and with ``self`` discarded, nothing here retries — confirm intent.
    del self
    target_status = DeploymentStatus.PROVISIONING.value
    with transaction.atomic():
        deployment: Deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        needs_transition = (
            deployment.status not in TERMINAL_DEPLOYMENT_STATES and deployment.status != target_status
        )
        if needs_transition:
            deployment.status = target_status
            deployment.last_error = ""
            deployment.save(update_fields=["status", "last_error", "updated_at"])
        return deployment.status
@shared_task(
    bind=True,
    retry_backoff=True,
    retry_backoff_max=300,
    retry_jitter=True,
    max_retries=5,
)
def mark_deployment_booting(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Transition a deployment to booting, doing nothing when that is not applicable.

    Terminal deployments and deployments already booting are returned
    unchanged, so repeated deliveries are harmless.

    Returns:
        The deployment status after the transition attempt.
    """
    # NOTE(review): the retry_* options appear to apply to automatic retries only;
    # without autoretry_for, and with ``self`` discarded, nothing here retries — confirm intent.
    del self
    target_status = DeploymentStatus.BOOTING.value
    with transaction.atomic():
        deployment: Deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        needs_transition = (
            deployment.status not in TERMINAL_DEPLOYMENT_STATES and deployment.status != target_status
        )
        if needs_transition:
            deployment.status = target_status
            deployment.save(update_fields=["status", "updated_at"])
        return deployment.status
@shared_task(bind=True, max_retries=5)
def provision_test_django_runtime(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Build and run a generated Django test app against its ready backing services.

    Configuration errors fail the deployment immediately. Host command failures
    and timeouts are handed to the retry-or-fail helper, which either
    schedules another attempt or records the failure.

    Returns:
        Final deployment status for the processed deployment.

    Raises:
        ValueError: If required backing services are not ready.
    """
    try:
        final_status = run_test_django_runtime_provisioning(deployment_id)
    except ValueError as configuration_error:
        # Not retryable: persist the failure first, then log with traceback.
        _mark_deployment_failed(deployment_id=deployment_id, message=str(configuration_error))
        logger.exception("Django runtime configuration invalid deployment_id=%s", deployment_id)
        raise
    except (HostCommandError, TimeoutError) as transient_error:
        failed_deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id)
        # Never returns: raises either a Celery retry or the original error.
        _retry_or_fail_django_runtime(self, deployment=failed_deployment, error=transient_error)
    else:
        return final_status