from __future__ import annotations import json import logging import secrets import time from pathlib import Path from typing import TYPE_CHECKING from typing import NoReturn from urllib.request import urlopen from celery import shared_task from django.conf import settings from django.db import transaction from django.utils import timezone from control_plane.host_commands import HostCommandError from control_plane.host_commands import run_host_command from control_plane.local_test_runtime import TEST_DJANGO_CONTAINER_PORT from control_plane.local_test_runtime import TEST_DJANGO_WORKDIR from control_plane.local_test_runtime import build_test_django_container_context_path from control_plane.local_test_runtime import build_test_django_container_labels from control_plane.local_test_runtime import build_test_django_container_names from control_plane.local_test_runtime import build_test_django_containerfile_path from control_plane.local_test_runtime import build_test_django_environment from control_plane.local_test_runtime import build_test_django_image_reference from control_plane.local_test_runtime import build_test_django_local_url from control_plane.local_test_runtime import build_test_django_secret_mounts from control_plane.local_test_runtime import write_test_django_project from control_plane.models import Deployment from control_plane.models import DeploymentStatus from control_plane.models import RuntimeService from control_plane.models import RuntimeServiceKind from control_plane.models import RuntimeServiceStatus from control_plane.observability import capture_test_deployment_diagnostics from control_plane.runtime_plans import DjangoApplicationLaunchConfig from control_plane.runtime_plans import DjangoContainerImageBuildConfig from control_plane.runtime_plans import DjangoContainerRuntimeConfig from control_plane.runtime_plans import PostgresContainerConfig from control_plane.runtime_plans import RedisContainerConfig from control_plane.runtime_plans import build_django_container_image_command from control_plane.runtime_plans import build_django_container_run_command from control_plane.runtime_plans import build_django_migrate_command from control_plane.runtime_plans import build_django_server_command from control_plane.runtime_plans import build_postgres_container_command from control_plane.runtime_plans import build_redis_container_command if TYPE_CHECKING: from celery.app.task import Task type BoundControlPlaneTask = Task[..., str] logger = logging.getLogger("tussilago.control_plane.tasks") DEFAULT_HTTP_READY_TIMEOUT_SECONDS = 45.0 DEFAULT_CONTAINER_READY_TIMEOUT_SECONDS = 45.0 TERMINAL_DEPLOYMENT_STATES: frozenset[str] = frozenset( { DeploymentStatus.DESTROYED.value, DeploymentStatus.FAILED.value, }, ) TERMINAL_RUNTIME_SERVICE_STATES: frozenset[str] = frozenset( { RuntimeServiceStatus.DESTROYING.value, RuntimeServiceStatus.DESTROYED.value, }, ) def _runtime_service_root(runtime_service: RuntimeService) -> Path: """Return filesystem root for one runtime service's local test artifacts.""" return Path(settings.DATA_DIR) / "runtime-services" / str(runtime_service.deployment_id) / runtime_service.kind def _mark_deployment_failed(*, deployment_id: str, message: str) -> None: """Persist failed deployment state with the latest error details.""" with transaction.atomic(): deployment = Deployment.objects.select_for_update().get(pk=deployment_id) deployment.status = DeploymentStatus.FAILED.value deployment.last_error = message deployment.finished_at = timezone.now() deployment.save(update_fields=["status", "last_error", "finished_at", "updated_at"]) def _capture_test_deployment_diagnostics_snapshot(deployment_id: str) -> None: """Persist best-effort diagnostics without breaking deployment flow.""" try: capture_test_deployment_diagnostics(deployment_id) except OSError: logger.exception("Failed to write diagnostics snapshot deployment_id=%s", deployment_id) except ValueError: logger.exception("Invalid diagnostics snapshot state deployment_id=%s", deployment_id) except Deployment.DoesNotExist: logger.exception("Diagnostics snapshot skipped for missing deployment_id=%s", deployment_id) def _ensure_test_django_image_exists(image_reference: str) -> None: """Build the reusable Django test image if it is missing locally. Raises: HostCommandError: If Podman image inspection or build fails. """ try: run_host_command(command=("podman", "image", "exists", image_reference)) except HostCommandError as error: if error.returncode != 1: raise run_host_command( command=build_django_container_image_command( DjangoContainerImageBuildConfig( image_reference=image_reference, containerfile_path=build_test_django_containerfile_path(), context_directory=build_test_django_container_context_path(), ), ), timeout_seconds=300.0, ) def _read_container_logs(container_name: str) -> str: """Return captured container logs for failure reporting when available.""" try: result = run_host_command(command=("podman", "logs", container_name)) except HostCommandError: return "" return result.stdout.strip() or result.stderr.strip() def _read_container_status(container_name: str) -> str: """Return current Podman health status for one container when available.""" result = run_host_command( command=( "podman", "inspect", "--format", "{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}", container_name, ), ) return result.stdout.strip() def _wait_for_container_ready( runtime_service: RuntimeService, *, timeout_seconds: float = DEFAULT_CONTAINER_READY_TIMEOUT_SECONDS, ) -> None: """Poll Podman health state until one runtime service is ready. Raises: RuntimeError: If the runtime service exits or becomes unhealthy before it is ready. TimeoutError: If the runtime service does not become ready before timeout. """ deadline = time.monotonic() + timeout_seconds while time.monotonic() < deadline: status = _read_container_status(runtime_service.container_name) if status == "healthy": return if status in {"exited", "dead", "stopped", "unhealthy"}: logs = _read_container_logs(runtime_service.container_name) message = f"Runtime service {runtime_service.kind} failed to become ready: {status}." if logs: message = f"{message}\n{logs}" raise RuntimeError(message) time.sleep(1.0) msg = f"Timed out waiting for runtime service {runtime_service.kind} to become healthy." raise TimeoutError(msg) def _wait_for_http_ready( url: str, *, timeout_seconds: float = DEFAULT_HTTP_READY_TIMEOUT_SECONDS, ) -> dict[str, str | int]: """Poll a sentinel endpoint until it confirms PostgreSQL and Redis connectivity. Returns: Parsed JSON response from the sentinel endpoint. Raises: TimeoutError: If the endpoint does not become healthy before timeout. """ deadline = time.monotonic() + timeout_seconds last_error: Exception | None = None while time.monotonic() < deadline: try: with urlopen(url, timeout=2) as response: # noqa: S310 payload = json.loads(response.read().decode("utf-8")) if payload.get("status") == "ok": return payload except (OSError, json.JSONDecodeError) as error: last_error = error time.sleep(1.0) msg = f"Timed out waiting for healthy Django sentinel endpoint at {url}" raise TimeoutError(msg) from last_error def _build_django_runtime_services(deployment: Deployment) -> tuple[RuntimeService, ...]: return tuple( RuntimeService.objects .select_related("deployment__hosted_site__tenant") .filter(deployment=deployment) .order_by("kind"), ) def _get_ready_django_runtime_services(deployment: Deployment) -> tuple[RuntimeService, ...]: """Return ready runtime services required by the generated Django test app. Raises: ValueError: If PostgreSQL or Redis containers are not ready. """ runtime_services = _build_django_runtime_services(deployment) if not runtime_services or any( runtime_service.status != RuntimeServiceStatus.READY.value for runtime_service in runtime_services ): msg = "All runtime services must be ready before provisioning the Django test runtime." raise ValueError(msg) return runtime_services def _build_django_runtime_configs( deployment: Deployment, runtime_services: tuple[RuntimeService, ...], *, project_root: Path, ) -> tuple[str, DjangoContainerRuntimeConfig, DjangoContainerRuntimeConfig]: """Build image reference plus migrate and server configs for one deployment. Returns: Image reference plus migrate and server Podman runtime configs. """ image_reference = build_test_django_image_reference() environment = build_test_django_environment(deployment, runtime_services) secret_mounts = build_test_django_secret_mounts(runtime_services) labels = build_test_django_container_labels(deployment) server_container_name, migrate_container_name = build_test_django_container_names(deployment) network_name = runtime_services[0].network_name migrate_config = DjangoContainerRuntimeConfig( container_name=migrate_container_name, network_name=network_name, hostname="django-migrate.internal", image_reference=image_reference, application_directory=project_root, pod_name=network_name, working_directory=TEST_DJANGO_WORKDIR, environment=environment, secret_mounts=secret_mounts, labels=labels, ) server_config = DjangoContainerRuntimeConfig( container_name=server_container_name, network_name=network_name, hostname="django.internal", image_reference=image_reference, application_directory=project_root, pod_name=network_name, host_port=deployment.guest_port, guest_port=TEST_DJANGO_CONTAINER_PORT, working_directory=TEST_DJANGO_WORKDIR, environment=environment, secret_mounts=secret_mounts, labels=labels, ) return image_reference, migrate_config, server_config def _launch_django_runtime( deployment: Deployment, *, image_reference: str, migrate_config: DjangoContainerRuntimeConfig, server_config: DjangoContainerRuntimeConfig, ) -> dict[str, str | int]: """Build image, run migrations, launch the Django container, and wait for readiness. Returns: Parsed JSON sentinel payload from the running Django test app. """ _ensure_test_django_image_exists(image_reference) migrate_command = build_django_migrate_command(python_executable=Path("/usr/local/bin/python")) run_host_command( command=build_django_container_run_command( migrate_config, command=migrate_command, detach=False, remove=True, ), timeout_seconds=120.0, ) server_command = build_django_server_command( DjangoApplicationLaunchConfig( wsgi_module=deployment.hosted_site.wsgi_module, bind_host="0.0.0.0", # noqa: S104 port=TEST_DJANGO_CONTAINER_PORT, workers=1, python_executable=Path("/usr/local/bin/python"), ), ) run_host_command( command=build_django_container_run_command( server_config, command=server_command, detach=True, ), timeout_seconds=120.0, ) return _wait_for_http_ready(build_test_django_local_url(deployment)) def _retry_or_fail_django_runtime( self: BoundControlPlaneTask, *, deployment: Deployment, error: HostCommandError | TimeoutError, ) -> NoReturn: """Retry transient Django runtime failures, or mark deployment failed when retries are exhausted.""" retries = getattr(self.request, "retries", 0) logger.warning( "Django runtime provisioning retry deployment_id=%s retries=%s error=%s", deployment.id, retries, error, ) if retries >= self.max_retries: server_container_name, _ = build_test_django_container_names(deployment) logs = _read_container_logs(server_container_name) failure_message = str(error) if logs: failure_message = f"{failure_message}\n{logs}" _mark_deployment_failed(deployment_id=str(deployment.id), message=failure_message) _capture_test_deployment_diagnostics_snapshot(str(deployment.id)) logger.error("Django runtime provisioning failed deployment_id=%s", deployment.id) raise error countdown = min(300, 2 ** (retries + 1)) raise self.retry(exc=error, countdown=countdown) from error def run_test_django_runtime_provisioning(deployment_id: str) -> str: """Run generated Django runtime provisioning inline for one deployment. Returns: Final deployment status for the processed deployment. """ deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id) if deployment.status in TERMINAL_DEPLOYMENT_STATES or deployment.status == DeploymentStatus.RUNNING.value: return deployment.status runtime_services = _get_ready_django_runtime_services(deployment) project_root = write_test_django_project(deployment, runtime_services) image_reference, migrate_config, server_config = _build_django_runtime_configs( deployment, runtime_services, project_root=project_root, ) sentinel_payload = _launch_django_runtime( deployment, image_reference=image_reference, migrate_config=migrate_config, server_config=server_config, ) with transaction.atomic(): deployment = Deployment.objects.select_for_update().get(pk=deployment_id) if deployment.status in TERMINAL_DEPLOYMENT_STATES: return deployment.status deployment.status = DeploymentStatus.RUNNING.value deployment.last_error = "" deployment.started_at = timezone.now() deployment.finished_at = None deployment.save(update_fields=["status", "last_error", "started_at", "finished_at", "updated_at"]) _capture_test_deployment_diagnostics_snapshot(deployment_id) logger.info( "Django runtime ready deployment_id=%s tenant_slug=%s site_slug=%s postgres=%s redis=%s", deployment_id, deployment.hosted_site.tenant.slug, deployment.hosted_site.slug, sentinel_payload.get("postgres"), sentinel_payload.get("redis"), ) return DeploymentStatus.RUNNING.value def _ensure_secret_file(password_file: Path) -> None: """Write a reusable password file for a test container if one does not already exist.""" password_file.parent.mkdir(parents=True, exist_ok=True) if password_file.exists(): return password_file.write_text(f"{secrets.token_urlsafe(24)}\n", encoding="utf-8") password_file.chmod(0o600) def _ensure_podman_pod(*, pod_name: str, host_port: int) -> None: """Create a Podman pod if it is missing. Raises: HostCommandError: If Podman pod inspection or creation fails. """ try: run_host_command(command=("podman", "pod", "exists", pod_name)) except HostCommandError as error: if error.returncode != 1: raise run_host_command( command=( "podman", "pod", "create", "--replace", "--name", pod_name, "--publish", f"127.0.0.1:{host_port}:{TEST_DJANGO_CONTAINER_PORT}", ), ) def _build_runtime_service_command( runtime_service: RuntimeService, *, data_directory: Path, password_file: Path, ) -> tuple[str, ...]: """Build a Podman command for one runtime service kind. Returns: Podman command arguments for the runtime service. Raises: ValueError: If the runtime service kind or configuration is unsupported. """ if runtime_service.kind == RuntimeServiceKind.POSTGRESQL.value: if not runtime_service.connection_username or not runtime_service.connection_database: msg = "PostgreSQL runtime service requires connection credentials." raise ValueError(msg) return build_postgres_container_command( PostgresContainerConfig( container_name=runtime_service.container_name, network_name=runtime_service.network_name, hostname=runtime_service.hostname, username=runtime_service.connection_username, database_name=runtime_service.connection_database, data_directory=data_directory, password_file=password_file, pod_name=runtime_service.network_name, image_reference=runtime_service.image_reference, ), ) if runtime_service.kind == RuntimeServiceKind.REDIS.value: return build_redis_container_command( RedisContainerConfig( container_name=runtime_service.container_name, network_name=runtime_service.network_name, hostname=runtime_service.hostname, data_directory=data_directory, password_file=password_file, pod_name=runtime_service.network_name, image_reference=runtime_service.image_reference, ), ) msg = f"Unsupported runtime service kind: {runtime_service.kind}" raise ValueError(msg) def _provision_runtime_service_container(runtime_service: RuntimeService) -> None: """Create or replace a local test container for one runtime service.""" service_root = _runtime_service_root(runtime_service) data_directory = service_root / "data" password_file = service_root / "secrets" / "password" data_directory.mkdir(parents=True, exist_ok=True) _ensure_secret_file(password_file) _ensure_podman_pod( pod_name=runtime_service.network_name, host_port=runtime_service.deployment.guest_port, ) command = _build_runtime_service_command( runtime_service, data_directory=data_directory, password_file=password_file, ) run_host_command(command=command) _wait_for_container_ready(runtime_service) @shared_task( bind=True, autoretry_for=(HostCommandError, TimeoutError), retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=5, ) def provision_test_runtime_services(self: BoundControlPlaneTask, deployment_id: str) -> str: """Seed and provision runtime service test containers for one deployment. Returns: Final runtime service status for the processed deployment. Raises: HostCommandError: If Podman commands fail while provisioning backing services. RuntimeError: If a backing container exits or becomes unhealthy during startup. TimeoutError: If a backing container never becomes healthy. ValueError: If runtime service configuration is invalid. """ del self deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id) if deployment.status in TERMINAL_DEPLOYMENT_STATES: return deployment.status deployment.ensure_test_runtime_services() runtime_services = tuple( RuntimeService.objects .select_related("deployment__hosted_site__tenant") .filter(deployment=deployment) .order_by("kind"), ) pending_runtime_services = tuple( runtime_service for runtime_service in runtime_services if runtime_service.status not in TERMINAL_RUNTIME_SERVICE_STATES and runtime_service.status != RuntimeServiceStatus.READY.value ) if not pending_runtime_services: return RuntimeServiceStatus.READY.value for runtime_service in pending_runtime_services: runtime_service.status = RuntimeServiceStatus.PROVISIONING.value runtime_service.save(update_fields=["status", "updated_at"]) try: _provision_runtime_service_container(runtime_service) except HostCommandError, RuntimeError, TimeoutError: runtime_service.status = RuntimeServiceStatus.FAILED.value runtime_service.save(update_fields=["status", "updated_at"]) _capture_test_deployment_diagnostics_snapshot(deployment_id) logger.exception( "Runtime service provisioning failed deployment_id=%s runtime_service_id=%s kind=%s", deployment_id, runtime_service.id, runtime_service.kind, ) raise except ValueError: runtime_service.status = RuntimeServiceStatus.FAILED.value runtime_service.save(update_fields=["status", "updated_at"]) logger.exception( "Runtime service configuration invalid deployment_id=%s runtime_service_id=%s kind=%s", deployment_id, runtime_service.id, runtime_service.kind, ) raise runtime_service.status = RuntimeServiceStatus.READY.value runtime_service.save(update_fields=["status", "updated_at"]) _capture_test_deployment_diagnostics_snapshot(deployment_id) return RuntimeServiceStatus.READY.value @shared_task( bind=True, retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=5, ) def mark_deployment_provisioning(self: BoundControlPlaneTask, deployment_id: str) -> str: """Move a deployment into provisioning state in an idempotent way. Returns: The deployment status after the transition attempt. """ del self with transaction.atomic(): deployment: Deployment = Deployment.objects.select_for_update().get(pk=deployment_id) if deployment.status in TERMINAL_DEPLOYMENT_STATES: return deployment.status if deployment.status == DeploymentStatus.PROVISIONING.value: return deployment.status deployment.status = DeploymentStatus.PROVISIONING.value deployment.last_error = "" deployment.save(update_fields=["status", "last_error", "updated_at"]) return deployment.status @shared_task( bind=True, retry_backoff=True, retry_backoff_max=300, retry_jitter=True, max_retries=5, ) def mark_deployment_booting(self: BoundControlPlaneTask, deployment_id: str) -> str: """Move a deployment into booting state in an idempotent way. Returns: The deployment status after the transition attempt. """ del self with transaction.atomic(): deployment: Deployment = Deployment.objects.select_for_update().get(pk=deployment_id) if deployment.status in TERMINAL_DEPLOYMENT_STATES: return deployment.status if deployment.status == DeploymentStatus.BOOTING.value: return deployment.status deployment.status = DeploymentStatus.BOOTING.value deployment.save(update_fields=["status", "updated_at"]) return deployment.status @shared_task(bind=True, max_retries=5) def provision_test_django_runtime(self: BoundControlPlaneTask, deployment_id: str) -> str: """Build and run a generated Django test app against ready PostgreSQL and Redis containers. Returns: Final deployment status for the processed deployment. Raises: ValueError: If required backing services are not ready. """ try: return run_test_django_runtime_provisioning(deployment_id) except ValueError as error: _mark_deployment_failed(deployment_id=deployment_id, message=str(error)) logger.exception("Django runtime configuration invalid deployment_id=%s", deployment_id) raise except (HostCommandError, TimeoutError) as error: deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id) _retry_or_fail_django_runtime(self, deployment=deployment, error=error)