# Tussilago/control_plane/tasks.py

from __future__ import annotations

import json
import logging
import secrets
import time
from http.client import HTTPException
from pathlib import Path
from typing import TYPE_CHECKING
from typing import NoReturn
from urllib.request import urlopen

from celery import shared_task
from django.conf import settings
from django.db import transaction
from django.utils import timezone

from control_plane.host_commands import HostCommandError
from control_plane.host_commands import run_host_command
from control_plane.local_test_runtime import TEST_DJANGO_CONTAINER_PORT
from control_plane.local_test_runtime import TEST_DJANGO_WORKDIR
from control_plane.local_test_runtime import build_test_django_container_context_path
from control_plane.local_test_runtime import build_test_django_container_labels
from control_plane.local_test_runtime import build_test_django_container_names
from control_plane.local_test_runtime import build_test_django_containerfile_path
from control_plane.local_test_runtime import build_test_django_environment
from control_plane.local_test_runtime import build_test_django_image_reference
from control_plane.local_test_runtime import build_test_django_local_url
from control_plane.local_test_runtime import build_test_django_secret_mounts
from control_plane.local_test_runtime import write_test_django_project
from control_plane.models import Deployment
from control_plane.models import DeploymentStatus
from control_plane.models import RuntimeService
from control_plane.models import RuntimeServiceKind
from control_plane.models import RuntimeServiceStatus
from control_plane.observability import capture_test_deployment_diagnostics
from control_plane.runtime_plans import DjangoApplicationLaunchConfig
from control_plane.runtime_plans import DjangoContainerImageBuildConfig
from control_plane.runtime_plans import DjangoContainerRuntimeConfig
from control_plane.runtime_plans import PostgresContainerConfig
from control_plane.runtime_plans import RedisContainerConfig
from control_plane.runtime_plans import build_django_container_image_command
from control_plane.runtime_plans import build_django_container_run_command
from control_plane.runtime_plans import build_django_migrate_command
from control_plane.runtime_plans import build_django_server_command
from control_plane.runtime_plans import build_postgres_container_command
from control_plane.runtime_plans import build_redis_container_command

if TYPE_CHECKING:
    from celery.app.task import Task

    # Annotation-only alias for the bound Celery task instance that tasks
    # declared with ``bind=True`` receive as ``self``.
    type BoundControlPlaneTask = Task[..., str]


# Module logger shared by every task and helper in this file.
logger = logging.getLogger("tussilago.control_plane.tasks")

# Default polling budgets, in seconds, for the HTTP sentinel wait and the
# container health wait respectively.
DEFAULT_HTTP_READY_TIMEOUT_SECONDS = 45.0
DEFAULT_CONTAINER_READY_TIMEOUT_SECONDS = 45.0


# Deployment statuses that make the tasks in this module return early
# without touching the deployment again.
TERMINAL_DEPLOYMENT_STATES: frozenset[str] = frozenset(
    {
        DeploymentStatus.DESTROYED.value,
        DeploymentStatus.FAILED.value,
    },
)

# Runtime service statuses that ``provision_test_runtime_services`` skips
# instead of (re)provisioning the service.
TERMINAL_RUNTIME_SERVICE_STATES: frozenset[str] = frozenset(
    {
        RuntimeServiceStatus.DESTROYING.value,
        RuntimeServiceStatus.DESTROYED.value,
    },
)


def _runtime_service_root(runtime_service: RuntimeService) -> Path:
    """Locate the directory that holds one runtime service's local test artifacts.

    Returns:
        ``<DATA_DIR>/runtime-services/<deployment id>/<service kind>``.
    """
    deployment_segment = str(runtime_service.deployment_id)
    return Path(settings.DATA_DIR).joinpath("runtime-services", deployment_segment, runtime_service.kind)


def _mark_deployment_failed(*, deployment_id: str, message: str) -> None:
    """Record a failed outcome and its error text on one deployment row.

    The row is fetched with ``select_for_update`` inside a transaction and only
    the touched columns are written back.

    Args:
        deployment_id: Primary key of the deployment to update.
        message: Error details stored in ``last_error``.
    """
    with transaction.atomic():
        locked_deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        locked_deployment.last_error = message
        locked_deployment.status = DeploymentStatus.FAILED.value
        locked_deployment.finished_at = timezone.now()
        locked_deployment.save(
            update_fields=["status", "last_error", "finished_at", "updated_at"],
        )


def _capture_test_deployment_diagnostics_snapshot(deployment_id: str) -> None:
    """Capture a diagnostics snapshot, logging rather than raising on known failures.

    ``OSError``, ``ValueError`` and ``Deployment.DoesNotExist`` are logged with
    their traceback and swallowed; any other exception propagates.
    """
    try:
        capture_test_deployment_diagnostics(deployment_id)
    except (OSError, ValueError, Deployment.DoesNotExist) as snapshot_error:
        # The checks run in the same order as the handlers they replace, so an
        # exception matching more than one category gets the same message.
        if isinstance(snapshot_error, OSError):
            log_template = "Failed to write diagnostics snapshot deployment_id=%s"
        elif isinstance(snapshot_error, ValueError):
            log_template = "Invalid diagnostics snapshot state deployment_id=%s"
        else:
            log_template = "Diagnostics snapshot skipped for missing deployment_id=%s"
        logger.exception(log_template, deployment_id)


def _ensure_test_django_image_exists(image_reference: str) -> None:
    """Make sure the Django test image is present locally, building it on demand.

    ``podman image exists`` is used as the probe; a return code of 1 is taken to
    mean the image is missing and triggers a build with a five-minute timeout.

    Raises:
        HostCommandError: If the probe fails with any other return code, or if
            the build itself fails.
    """
    probe_command = ("podman", "image", "exists", image_reference)
    try:
        run_host_command(command=probe_command)
    except HostCommandError as probe_error:
        image_is_missing = probe_error.returncode == 1
        if not image_is_missing:
            raise

        build_config = DjangoContainerImageBuildConfig(
            image_reference=image_reference,
            containerfile_path=build_test_django_containerfile_path(),
            context_directory=build_test_django_container_context_path(),
        )
        build_command = build_django_container_image_command(build_config)
        run_host_command(command=build_command, timeout_seconds=300.0)


def _read_container_logs(container_name: str) -> str:
    """Fetch ``podman logs`` output for a container for use in failure messages.

    Returns:
        Stripped stdout when non-empty, otherwise stripped stderr. An empty
        string is returned when the logs command itself fails.
    """
    logs_command = ("podman", "logs", container_name)
    try:
        completed = run_host_command(command=logs_command)
    except HostCommandError:
        return ""

    captured_stdout = completed.stdout.strip()
    if captured_stdout:
        return captured_stdout
    return completed.stderr.strip()


def _read_container_status(container_name: str) -> str:
    """Ask ``podman inspect`` for a container's health status, or its plain state.

    The format template prints ``.State.Health.Status`` when the container has
    health information and falls back to ``.State.Status`` otherwise. Failures
    of the inspect command are not handled here and propagate to the caller.

    Returns:
        Stripped stdout of the inspect command.
    """
    status_template = "{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}"
    inspect_command = ("podman", "inspect", "--format", status_template, container_name)
    completed = run_host_command(command=inspect_command)
    return completed.stdout.strip()


def _wait_for_container_ready(
    runtime_service: RuntimeService,
    *,
    timeout_seconds: float = DEFAULT_CONTAINER_READY_TIMEOUT_SECONDS,
) -> None:
    """Block until Podman reports the runtime service container as healthy.

    The status is polled once per second. Only ``healthy`` counts as ready; any
    other non-failure status keeps the loop waiting.

    Args:
        runtime_service: Service whose container is inspected.
        timeout_seconds: Total polling budget in seconds.

    Raises:
        RuntimeError: If the container reports ``exited``, ``dead``, ``stopped``
            or ``unhealthy``; captured container logs are appended when present.
        TimeoutError: If the container is still not healthy when the budget ends.
    """
    failure_states = frozenset({"exited", "dead", "stopped", "unhealthy"})
    container_name = runtime_service.container_name
    give_up_at = time.monotonic() + timeout_seconds

    while time.monotonic() < give_up_at:
        observed_status = _read_container_status(container_name)
        if observed_status == "healthy":
            return

        if observed_status in failure_states:
            captured_logs = _read_container_logs(container_name)
            message_parts = [
                f"Runtime service {runtime_service.kind} failed to become ready: {observed_status}.",
            ]
            if captured_logs:
                message_parts.append(captured_logs)
            msg = "\n".join(message_parts)
            raise RuntimeError(msg)

        time.sleep(1.0)

    msg = f"Timed out waiting for runtime service {runtime_service.kind} to become healthy."
    raise TimeoutError(msg)


def _wait_for_http_ready(
    url: str,
    *,
    timeout_seconds: float = DEFAULT_HTTP_READY_TIMEOUT_SECONDS,
) -> dict[str, str | int]:
    """Poll a sentinel endpoint until it answers with a JSON object whose status is ``ok``.

    Each attempt issues one request with a two-second timeout and then sleeps
    for one second. Connection failures, HTTP protocol errors, undecodable
    bodies, invalid JSON and JSON documents that are not objects are all treated
    as "not ready yet", so a half-started server cannot abort the poll with an
    unexpected exception type.

    Args:
        url: Sentinel endpoint to poll.
        timeout_seconds: Total polling budget in seconds.

    Returns:
        Parsed JSON response from the sentinel endpoint.

    Raises:
        TimeoutError: If the endpoint does not become healthy before timeout.
            The most recent request or parsing error, if any, is the cause.
    """
    deadline = time.monotonic() + timeout_seconds
    last_error: Exception | None = None
    while time.monotonic() < deadline:
        try:
            with urlopen(url, timeout=2) as response:  # noqa: S310
                payload = json.loads(response.read().decode("utf-8"))
        except (OSError, HTTPException, ValueError) as error:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            last_error = error
        else:
            # Valid JSON may still be a list, string or number; only an object
            # can carry the status flag.
            if isinstance(payload, dict) and payload.get("status") == "ok":
                return payload

        time.sleep(1.0)

    msg = f"Timed out waiting for healthy Django sentinel endpoint at {url}"
    raise TimeoutError(msg) from last_error


def _build_django_runtime_services(deployment: Deployment) -> tuple[RuntimeService, ...]:
    """Load every runtime service row attached to one deployment, ordered by kind.

    Returns:
        Runtime services for the deployment, fetched with the deployment, hosted
        site and tenant relations joined in via ``select_related``.
    """
    service_rows = RuntimeService.objects.select_related("deployment__hosted_site__tenant")
    service_rows = service_rows.filter(deployment=deployment).order_by("kind")
    return tuple(service_rows)


def _get_ready_django_runtime_services(deployment: Deployment) -> tuple[RuntimeService, ...]:
    """Fetch the deployment's runtime services and insist that all of them are ready.

    Returns:
        The deployment's runtime services, ordered by kind.

    Raises:
        ValueError: If the deployment has no runtime services, or if any of them
            has a status other than ready.
    """
    runtime_services = _build_django_runtime_services(deployment)
    ready_status = RuntimeServiceStatus.READY.value
    if runtime_services and all(service.status == ready_status for service in runtime_services):
        return runtime_services

    msg = "All runtime services must be ready before provisioning the Django test runtime."
    raise ValueError(msg)


def _build_django_runtime_configs(
    deployment: Deployment,
    runtime_services: tuple[RuntimeService, ...],
    *,
    project_root: Path,
) -> tuple[str, DjangoContainerRuntimeConfig, DjangoContainerRuntimeConfig]:
    """Assemble the Podman runtime configs for the migrate and server containers.

    Both configs share the same image, application directory, environment,
    secret mounts and labels. Only the server config carries a host/guest port
    pair. The network name, which is also used as the pod name, is taken from
    the first runtime service, so ``runtime_services`` must not be empty.

    Args:
        deployment: Deployment the containers belong to.
        runtime_services: Backing services the Django app connects to.
        project_root: Application directory handed to both containers.

    Returns:
        Image reference, migrate config and server config, in that order.
    """
    image = build_test_django_image_reference()
    container_environment = build_test_django_environment(deployment, runtime_services)
    mounted_secrets = build_test_django_secret_mounts(runtime_services)
    container_labels = build_test_django_container_labels(deployment)
    server_name, migrate_name = build_test_django_container_names(deployment)
    shared_network = runtime_services[0].network_name

    # One-shot container that only applies migrations; no ports are set on it.
    migrate_config = DjangoContainerRuntimeConfig(
        container_name=migrate_name,
        hostname="django-migrate.internal",
        network_name=shared_network,
        pod_name=shared_network,
        image_reference=image,
        application_directory=project_root,
        working_directory=TEST_DJANGO_WORKDIR,
        environment=container_environment,
        secret_mounts=mounted_secrets,
        labels=container_labels,
    )
    # Long-running server container, given the deployment's port as host port.
    server_config = DjangoContainerRuntimeConfig(
        container_name=server_name,
        hostname="django.internal",
        network_name=shared_network,
        pod_name=shared_network,
        image_reference=image,
        application_directory=project_root,
        working_directory=TEST_DJANGO_WORKDIR,
        host_port=deployment.guest_port,
        guest_port=TEST_DJANGO_CONTAINER_PORT,
        environment=container_environment,
        secret_mounts=mounted_secrets,
        labels=container_labels,
    )
    return image, migrate_config, server_config


def _launch_django_runtime(
    deployment: Deployment,
    *,
    image_reference: str,
    migrate_config: DjangoContainerRuntimeConfig,
    server_config: DjangoContainerRuntimeConfig,
) -> dict[str, str | int]:
    """Bring up the Django test app: image, migrations, server, readiness wait.

    The steps run strictly in sequence: ensure the image exists, run migrations
    in a foreground container that is removed afterwards, start the server
    container detached, then poll the deployment's local URL.

    Returns:
        Parsed JSON sentinel payload from the running Django test app.
    """
    container_python = Path("/usr/local/bin/python")

    _ensure_test_django_image_exists(image_reference)

    # Migrations run in the foreground so a failure surfaces before the server starts.
    migrate_run_command = build_django_container_run_command(
        migrate_config,
        command=build_django_migrate_command(python_executable=container_python),
        detach=False,
        remove=True,
    )
    run_host_command(command=migrate_run_command, timeout_seconds=120.0)

    launch_config = DjangoApplicationLaunchConfig(
        wsgi_module=deployment.hosted_site.wsgi_module,
        bind_host="0.0.0.0",  # noqa: S104
        port=TEST_DJANGO_CONTAINER_PORT,
        workers=1,
        python_executable=container_python,
    )
    server_run_command = build_django_container_run_command(
        server_config,
        command=build_django_server_command(launch_config),
        detach=True,
    )
    run_host_command(command=server_run_command, timeout_seconds=120.0)

    sentinel_url = build_test_django_local_url(deployment)
    return _wait_for_http_ready(sentinel_url)


def _retry_or_fail_django_runtime(
    self: BoundControlPlaneTask,
    *,
    deployment: Deployment,
    error: HostCommandError | TimeoutError,
) -> NoReturn:
    """Schedule another attempt, or give up and persist the failure.

    While retries remain, the task is retried with a countdown of
    ``2 ** (retries + 1)`` seconds, capped at 300. Once the retry budget is used
    up, the deployment is marked failed with the error text plus any server
    container logs, a diagnostics snapshot is captured, and the original error
    is re-raised.

    Args:
        self: Bound Celery task providing ``request``, ``max_retries`` and ``retry``.
        deployment: Deployment being provisioned.
        error: Failure that interrupted provisioning.
    """
    attempts_so_far = getattr(self.request, "retries", 0)
    logger.warning(
        "Django runtime provisioning retry deployment_id=%s retries=%s error=%s",
        deployment.id,
        attempts_so_far,
        error,
    )

    if attempts_so_far < self.max_retries:
        backoff_seconds = min(300, 2 ** (attempts_so_far + 1))
        raise self.retry(exc=error, countdown=backoff_seconds) from error

    # Retry budget exhausted: persist the failure together with whatever the
    # server container logged.
    deployment_key = str(deployment.id)
    server_container_name, _ = build_test_django_container_names(deployment)
    server_logs = _read_container_logs(server_container_name)
    failure_message = f"{error}\n{server_logs}" if server_logs else str(error)
    _mark_deployment_failed(deployment_id=deployment_key, message=failure_message)
    _capture_test_deployment_diagnostics_snapshot(deployment_key)
    logger.error("Django runtime provisioning failed deployment_id=%s", deployment.id)
    raise error


def run_test_django_runtime_provisioning(deployment_id: str) -> str:
    """Provision the generated Django runtime for one deployment, synchronously.

    Deployments that are already running or in a terminal state are left alone.
    Otherwise the project is written, the containers are launched, and the
    deployment is moved to running under a row lock. If the deployment reached a
    terminal state while the containers were starting, that state is returned
    unchanged.

    Returns:
        Final deployment status for the processed deployment.
    """
    deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id)
    already_settled = (
        deployment.status in TERMINAL_DEPLOYMENT_STATES
        or deployment.status == DeploymentStatus.RUNNING.value
    )
    if already_settled:
        return deployment.status

    ready_services = _get_ready_django_runtime_services(deployment)
    generated_project_root = write_test_django_project(deployment, ready_services)
    image_reference, migrate_config, server_config = _build_django_runtime_configs(
        deployment,
        ready_services,
        project_root=generated_project_root,
    )
    sentinel_payload = _launch_django_runtime(
        deployment,
        image_reference=image_reference,
        migrate_config=migrate_config,
        server_config=server_config,
    )

    with transaction.atomic():
        # Re-read under lock: the status may have changed while containers started.
        deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        if deployment.status in TERMINAL_DEPLOYMENT_STATES:
            return deployment.status

        deployment.last_error = ""
        deployment.finished_at = None
        deployment.started_at = timezone.now()
        deployment.status = DeploymentStatus.RUNNING.value
        deployment.save(
            update_fields=["status", "last_error", "started_at", "finished_at", "updated_at"],
        )

    _capture_test_deployment_diagnostics_snapshot(deployment_id)
    hosted_site = deployment.hosted_site
    logger.info(
        "Django runtime ready deployment_id=%s tenant_slug=%s site_slug=%s postgres=%s redis=%s",
        deployment_id,
        hosted_site.tenant.slug,
        hosted_site.slug,
        sentinel_payload.get("postgres"),
        sentinel_payload.get("redis"),
    )
    return DeploymentStatus.RUNNING.value


def _ensure_secret_file(password_file: Path) -> None:
    """Write a reusable password file for a test container if one does not already exist.

    The file is created exclusively with owner-only permissions *before* the
    secret is written, so the password is never on disk in a file readable by
    other users, and a concurrent caller cannot overwrite a file that was
    created between an existence check and the write.

    Args:
        password_file: Destination path; missing parent directories are created.
    """
    password_file.parent.mkdir(parents=True, exist_ok=True)
    try:
        # Exclusive create: fails if the path already exists.
        password_file.touch(mode=0o600, exist_ok=False)
    except FileExistsError:
        return

    # touch() applies the process umask, which can only remove bits; chmod pins
    # the exact mode before any secret content is written.
    password_file.chmod(0o600)
    password_file.write_text(f"{secrets.token_urlsafe(24)}\n", encoding="utf-8")


def _ensure_podman_pod(*, pod_name: str, host_port: int) -> None:
    """Make sure the named Podman pod exists, creating it when the probe says it is missing.

    ``podman pod exists`` is the probe; a return code of 1 triggers creation of
    a pod that publishes ``127.0.0.1:<host_port>`` to the Django container port.

    Args:
        pod_name: Name of the pod to look up or create.
        host_port: Loopback port on the host published by the pod.

    Raises:
        HostCommandError: If the probe fails with any other return code, or if
            pod creation fails.
    """
    probe_command = ("podman", "pod", "exists", pod_name)
    try:
        run_host_command(command=probe_command)
    except HostCommandError as probe_error:
        pod_is_missing = probe_error.returncode == 1
        if not pod_is_missing:
            raise

        published_port = f"127.0.0.1:{host_port}:{TEST_DJANGO_CONTAINER_PORT}"
        create_command = (
            "podman",
            "pod",
            "create",
            "--replace",
            "--name",
            pod_name,
            "--publish",
            published_port,
        )
        run_host_command(command=create_command)


def _build_runtime_service_command(
    runtime_service: RuntimeService,
    *,
    data_directory: Path,
    password_file: Path,
) -> tuple[str, ...]:
    """Translate one runtime service row into the Podman command that starts it.

    Redis and PostgreSQL are the supported kinds. The service's network name is
    also used as the pod name.

    Args:
        runtime_service: Service describing container name, hostname and image.
        data_directory: Data directory handed to the container config.
        password_file: Password file handed to the container config.

    Returns:
        Podman command arguments for the runtime service.

    Raises:
        ValueError: If the kind is unsupported, or a PostgreSQL service lacks a
            connection username or database name.
    """
    service_kind = runtime_service.kind

    if service_kind == RuntimeServiceKind.REDIS.value:
        redis_config = RedisContainerConfig(
            container_name=runtime_service.container_name,
            network_name=runtime_service.network_name,
            hostname=runtime_service.hostname,
            data_directory=data_directory,
            password_file=password_file,
            pod_name=runtime_service.network_name,
            image_reference=runtime_service.image_reference,
        )
        return build_redis_container_command(redis_config)

    if service_kind != RuntimeServiceKind.POSTGRESQL.value:
        msg = f"Unsupported runtime service kind: {runtime_service.kind}"
        raise ValueError(msg)

    # Only PostgreSQL remains; it needs both a username and a database name.
    username = runtime_service.connection_username
    database_name = runtime_service.connection_database
    if not (username and database_name):
        msg = "PostgreSQL runtime service requires connection credentials."
        raise ValueError(msg)

    postgres_config = PostgresContainerConfig(
        container_name=runtime_service.container_name,
        network_name=runtime_service.network_name,
        hostname=runtime_service.hostname,
        username=username,
        database_name=database_name,
        data_directory=data_directory,
        password_file=password_file,
        pod_name=runtime_service.network_name,
        image_reference=runtime_service.image_reference,
    )
    return build_postgres_container_command(postgres_config)


def _provision_runtime_service_container(runtime_service: RuntimeService) -> None:
    """Start the local test container for one runtime service and wait until it is healthy.

    Prepares the service's data directory and password file under its artifact
    root, makes sure the pod exists, runs the generated Podman command and then
    blocks on the container's health status.
    """
    artifact_root = _runtime_service_root(runtime_service)
    data_directory = artifact_root / "data"
    password_file = artifact_root / "secrets" / "password"

    # Filesystem and pod prerequisites come first, in this order.
    data_directory.mkdir(parents=True, exist_ok=True)
    _ensure_secret_file(password_file)
    _ensure_podman_pod(
        pod_name=runtime_service.network_name,
        host_port=runtime_service.deployment.guest_port,
    )

    start_command = _build_runtime_service_command(
        runtime_service,
        data_directory=data_directory,
        password_file=password_file,
    )
    run_host_command(command=start_command)
    _wait_for_container_ready(runtime_service)


@shared_task(
    bind=True,
    autoretry_for=(HostCommandError, TimeoutError),
    retry_backoff=True,
    retry_backoff_max=300,
    retry_jitter=True,
    max_retries=5,
)
def provision_test_runtime_services(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Ensure runtime service rows exist for a deployment and start their test containers.

    Services that are already ready, destroying or destroyed are skipped. Each
    remaining service is marked provisioning, started, and then marked ready,
    or marked failed before the error is re-raised.

    Returns:
        The deployment's own status when it is already terminal, otherwise the
        ready runtime service status.

    Raises:
        HostCommandError: If Podman commands fail while provisioning backing services.
        RuntimeError: If a backing container exits or becomes unhealthy during startup.
        TimeoutError: If a backing container never becomes healthy.
        ValueError: If runtime service configuration is invalid.
    """
    del self
    deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id)
    if deployment.status in TERMINAL_DEPLOYMENT_STATES:
        return deployment.status

    deployment.ensure_test_runtime_services()
    service_rows = (
        RuntimeService.objects
        .select_related("deployment__hosted_site__tenant")
        .filter(deployment=deployment)
        .order_by("kind")
    )
    skipped_states = TERMINAL_RUNTIME_SERVICE_STATES | {RuntimeServiceStatus.READY.value}
    services_to_provision = [service for service in service_rows if service.status not in skipped_states]
    if not services_to_provision:
        return RuntimeServiceStatus.READY.value

    for service in services_to_provision:
        service.status = RuntimeServiceStatus.PROVISIONING.value
        service.save(update_fields=["status", "updated_at"])

        try:
            _provision_runtime_service_container(service)
        except (HostCommandError, RuntimeError, TimeoutError):
            # Host or container failure: record it and snapshot diagnostics.
            service.status = RuntimeServiceStatus.FAILED.value
            service.save(update_fields=["status", "updated_at"])
            _capture_test_deployment_diagnostics_snapshot(deployment_id)
            logger.exception(
                "Runtime service provisioning failed deployment_id=%s runtime_service_id=%s kind=%s",
                deployment_id,
                service.id,
                service.kind,
            )
            raise
        except ValueError:
            # Invalid configuration: recorded without a diagnostics snapshot.
            service.status = RuntimeServiceStatus.FAILED.value
            service.save(update_fields=["status", "updated_at"])
            logger.exception(
                "Runtime service configuration invalid deployment_id=%s runtime_service_id=%s kind=%s",
                deployment_id,
                service.id,
                service.kind,
            )
            raise

        service.status = RuntimeServiceStatus.READY.value
        service.save(update_fields=["status", "updated_at"])

    _capture_test_deployment_diagnostics_snapshot(deployment_id)
    return RuntimeServiceStatus.READY.value


@shared_task(
    bind=True,
    retry_backoff=True,
    retry_backoff_max=300,
    retry_jitter=True,
    max_retries=5,
)
def mark_deployment_provisioning(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Transition a deployment to provisioning unless it is terminal or already there.

    The row is locked for the duration of the check and update. On a real
    transition ``last_error`` is cleared as well.

    Returns:
        The deployment status after the transition attempt.
    """
    del self
    provisioning = DeploymentStatus.PROVISIONING.value
    with transaction.atomic():
        deployment: Deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        nothing_to_do = deployment.status in TERMINAL_DEPLOYMENT_STATES or deployment.status == provisioning
        if not nothing_to_do:
            deployment.status = provisioning
            deployment.last_error = ""
            deployment.save(update_fields=["status", "last_error", "updated_at"])
        return deployment.status


@shared_task(
    bind=True,
    retry_backoff=True,
    retry_backoff_max=300,
    retry_jitter=True,
    max_retries=5,
)
def mark_deployment_booting(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Transition a deployment to booting unless it is terminal or already there.

    The row is locked for the duration of the check and update.

    Returns:
        The deployment status after the transition attempt.
    """
    del self
    booting = DeploymentStatus.BOOTING.value
    with transaction.atomic():
        deployment: Deployment = Deployment.objects.select_for_update().get(pk=deployment_id)
        nothing_to_do = deployment.status in TERMINAL_DEPLOYMENT_STATES or deployment.status == booting
        if not nothing_to_do:
            deployment.status = booting
            deployment.save(update_fields=["status", "updated_at"])
        return deployment.status


@shared_task(bind=True, max_retries=5)
def provision_test_django_runtime(self: BoundControlPlaneTask, deployment_id: str) -> str:
    """Celery entry point that provisions the generated Django test app for one deployment.

    Configuration problems (``ValueError``) mark the deployment failed at once.
    Host command failures and timeouts go through the retry-or-fail helper,
    which either schedules a retry or re-raises after recording the failure.

    Returns:
        Final deployment status for the processed deployment.

    Raises:
        ValueError: If required backing services are not ready.
    """
    try:
        final_status = run_test_django_runtime_provisioning(deployment_id)
    except ValueError as config_error:
        _mark_deployment_failed(deployment_id=deployment_id, message=str(config_error))
        logger.exception("Django runtime configuration invalid deployment_id=%s", deployment_id)
        raise
    except (HostCommandError, TimeoutError) as transient_error:
        failed_deployment = Deployment.objects.select_related("hosted_site__tenant").get(pk=deployment_id)
        # The helper never returns: it raises a retry or the original error.
        _retry_or_fail_django_runtime(self, deployment=failed_deployment, error=transient_error)
    else:
        return final_status