From 4977e3ff4349bcfcb6730f7697b9e3ba5d104cc0 Mon Sep 17 00:00:00 2001 From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com> Date: Tue, 21 Apr 2026 14:40:39 +0530 Subject: [PATCH] k8s: manage Caddy ingress image via spec (so-p3p) (#749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes so-p3p: - New spec key `caddy-ingress-image`: on fresh install, deploys Caddy with this image; on subsequent `deployment start`, patches the running Caddy Deployment if the image differs. Defaults to the manifest's hardcoded image when absent - When the spec key is absent, SO does **not** touch a running Caddy — avoids silently reverting an image set out-of-band (ansible playbook, another deployment's spec) - `strategy: Recreate` on the Caddy Deployment manifest (required — hostPort 80/443 deadlocks rolling updates) - Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a k8s-API patch, not a cluster-lifecycle op) - Image template by container name rather than string match, so the spec override wins regardless of what the shipped manifest hardcodes - Cluster-scoped caveat documented: `caddy-system` is shared across deployments, so the last `deployment start` that sets the key wins for everyone --- .pebbles/events.jsonl | 2 + docs/deployment_patterns.md | 35 ++++++++ stack_orchestrator/constants.py | 2 + .../ingress/ingress-caddy-kind-deploy.yaml | 5 ++ stack_orchestrator/deploy/k8s/deploy_k8s.py | 18 ++++ stack_orchestrator/deploy/k8s/helpers.py | 88 ++++++++++++++++++- stack_orchestrator/deploy/spec.py | 18 ++++ 7 files changed, 166 insertions(+), 2 deletions(-) diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl index 0cf36fce..ece11c09 100644 --- a/.pebbles/events.jsonl +++ b/.pebbles/events.jsonl @@ -52,3 +52,5 @@ {"type":"status_update","timestamp":"2026-04-21T05:57:12.928842469Z","issue_id":"so-n1n","payload":{"status":"closed"}} {"type":"comment","timestamp":"2026-04-21T06:08:13.933886638Z","issue_id":"so-ad7","payload":{"body":"Fixed in PR #744 (cf8b7533). get_services() now includes the maintenance pod in the container-ports map so its per-pod Service is built and available for the Ingress swap."}} {"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}} +{"type":"update","timestamp":"2026-04-21T09:00:47.364859946Z","issue_id":"so-p3p","payload":{"description":"## Problem\n\nThe Caddy ingress controller image is hardcoded in `ingress-caddy-kind-deploy.yaml`, with no mechanism to update it short of cluster recreation or manual `kubectl patch`. laconic-so should: (1) allow spec.yml to specify a Caddy image, (2) support updating the Caddy image as part of `deployment start`, (3) set `strategy: Recreate` on the Caddy Deployment since hostPort pods can't rolling-update.\n\n## Resolution\n\n- New spec key `caddy-ingress-image`. Fresh install uses it (fallback: manifest default). 
On subsequent `deployment start`, if the spec key is set and the running Caddy image differs, SO patches the Deployment and waits for rollout.\n- Spec key absent =\u003e SO does **not** touch a running Caddy, to avoid silently reverting images set out-of-band (ansible playbook, another deployment's spec).\n- `strategy: Recreate` added to the Caddy Deployment manifest.\n- Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a plain k8s-API patch, not a cluster lifecycle op).\n- Image substitution locates the container by name instead of string-matching the shipped default, so the spec override wins regardless of what the manifest hardcodes.\n- Cluster-scoped caveat: `caddy-system` is shared across deployments; last `deployment start` that sets the key wins for everyone. Documented in `deployment_patterns.md`."}} +{"type":"status_update","timestamp":"2026-04-21T09:00:47.745675131Z","issue_id":"so-p3p","payload":{"status":"closed"}} diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md index 525622aa..0eb2548a 100644 --- a/docs/deployment_patterns.md +++ b/docs/deployment_patterns.md @@ -202,6 +202,41 @@ with a `DeployerException` pointing at the `namespace:` spec override. Catches operator-error cases where the same deployment dir is effectively registered twice. +### Caddy ingress image lifecycle + +The Caddy ingress controller lives in the cluster-scoped +`caddy-system` namespace and is installed on first `deployment start`. +Its image is configurable per deployment: + +```yaml +# spec.yml +caddy-ingress-image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3 +``` + +Two cases, intentionally different: + +- **Spec key set**: on first install the manifest is templated with + this image. On subsequent `deployment start`, if the running Caddy + Deployment's image differs, laconic-so patches it and waits for the + rollout. The Deployment uses `strategy: Recreate` (hostPort 80/443 + blocks rolling updates from ever completing), so expect ~10–30s of + ingress downtime while the old pod terminates and the new one + starts. +- **Spec key absent**: on first install the manifest's hardcoded + default (`ghcr.io/laconicnetwork/caddy-ingress:latest`) is used. + On subsequent `deployment start`, laconic-so does **not** touch the + running Caddy Deployment. This matters when the image was set + out-of-band (via an ansible playbook, or by another deployment's + spec that's since been removed) — a silent revert to the default + would be worse than doing nothing. If you want to go back to the + default image, set `caddy-ingress-image` to it explicitly. + +**Cluster-scoped caveat**: `caddy-system` is shared by every +deployment on the cluster. Setting `caddy-ingress-image` in any one +deployment's spec rolls the controller for all of them — last +`deployment start` wins. Treat it as a cluster-level knob; keep the +value consistent across the deployments sharing a cluster. 
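+
+To verify which image the shared controller is currently running
+(a quick check, assuming `kubectl` is pointed at the kind cluster):
+
+```sh
+kubectl -n caddy-system get deployment caddy-ingress-controller \
+  -o jsonpath='{.spec.template.spec.containers[0].image}'
+```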
+ ## Volume Persistence in k8s-kind k8s-kind has 3 storage layers: diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py index e5c83698..72c29837 100644 --- a/stack_orchestrator/constants.py +++ b/stack_orchestrator/constants.py @@ -48,5 +48,7 @@ high_memlock_runtime = "high-memlock" high_memlock_spec_filename = "high-memlock-spec.json" acme_email_key = "acme-email" kind_mount_root_key = "kind-mount-root" +caddy_ingress_image_key = "caddy-ingress-image" +default_caddy_ingress_image = "ghcr.io/laconicnetwork/caddy-ingress:latest" external_services_key = "external-services" ca_certificates_key = "ca-certificates" diff --git a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml index 88025837..b368c50e 100644 --- a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml +++ b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml @@ -160,6 +160,11 @@ metadata: app.kubernetes.io/component: controller spec: replicas: 1 + # Recreate is required: the pod binds hostPort 80/443, which a + # RollingUpdate would try to double-claim during cutover (new pod + # pending until old pod exits — never exits, rollout deadlocks). + strategy: + type: Recreate selector: matchLabels: app.kubernetes.io/name: caddy-ingress-controller diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 84318cde..688c5696 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -34,6 +34,7 @@ from stack_orchestrator.deploy.k8s.helpers import ( ) from stack_orchestrator.deploy.k8s.helpers import ( install_ingress_for_kind, + update_caddy_ingress_image, wait_for_ingress_in_kind, is_ingress_running, ) @@ -880,11 +881,16 @@ class K8sDeployer(Deployer): check_mounts_compatible(existing, kind_config) self.connect_api() self._ensure_namespace() + caddy_image = self.cluster_info.spec.get_caddy_ingress_image() + # Fresh-install path: gated on cluster lifecycle ownership + # because install_ingress_for_kind also seeds caddy-system + # (namespace, secrets restore, cert-backup CronJob). if self.is_kind() and not self.skip_cluster_management: if not is_ingress_running(): install_ingress_for_kind( self.cluster_info.spec.get_acme_email(), self.cluster_info.spec.get_kind_mount_root(), + caddy_image=caddy_image, ) wait_for_ingress_in_kind() if self.cluster_info.spec.get_unlimited_memlock(): @@ -892,6 +898,18 @@ class K8sDeployer(Deployer): constants.high_memlock_runtime, constants.high_memlock_runtime, ) + # Reconcile Caddy image whenever the operator explicitly set + # it in spec, regardless of cluster lifecycle ownership — + # --skip-cluster-management (the default) shouldn't prevent + # a routine k8s-API-level patch of a running Deployment. + # Spec absent => don't touch: the operator may have set the + # image out-of-band (ansible playbook, prior explicit spec on + # a different deployment) and a silent revert would be worse + # than doing nothing. caddy-system is cluster-scoped, so + # whichever deployment's spec sets the image last wins. 
+ if self.is_kind() and caddy_image is not None and is_ingress_running(): + if update_caddy_ingress_image(caddy_image): + wait_for_ingress_in_kind() def _create_ingress(self): """Create or update Ingress with TLS certificate lookup.""" diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 44d1e49c..4cbf3270 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -466,7 +466,9 @@ def wait_for_ingress_in_kind(): def install_ingress_for_kind( - acme_email: str = "", kind_mount_root: Optional[str] = None + acme_email: str = "", + kind_mount_root: Optional[str] = None, + caddy_image: Optional[str] = None, ): api_client = client.ApiClient() ingress_install = os.path.abspath( @@ -477,7 +479,7 @@ def install_ingress_for_kind( if opts.o.debug: print("Installing Caddy ingress controller in kind cluster") - # Template the YAML with email before applying + # Template the YAML with email and image before applying with open(ingress_install) as f: yaml_content = f.read() @@ -488,6 +490,27 @@ def install_ingress_for_kind( yaml_objects = list(yaml.safe_load_all(yaml_content)) + # Override the Caddy container's image when a spec value is set. + # Works regardless of what's hardcoded in the manifest — we locate + # the container by name and overwrite its image field, rather than + # relying on a string match of the default. + if caddy_image: + for obj in yaml_objects: + if not obj: + continue + if ( + obj.get("kind") == "Deployment" + and obj.get("metadata", {}).get("name") + == "caddy-ingress-controller" + ): + for c in ( + obj["spec"]["template"]["spec"].get("containers") or [] + ): + if c.get("name") == "caddy-ingress-controller": + c["image"] = caddy_image + if opts.o.debug: + print(f"Configured Caddy image: {caddy_image}") + # Split: apply everything except the Caddy controller Deployment first, # so the namespace + secrets exist before the pod can start and read its # secret_store. Race-free: Caddy has no way to see the cluster until @@ -530,6 +553,67 @@ def install_ingress_for_kind( _install_caddy_cert_backup(api_client, kind_mount_root) +def update_caddy_ingress_image(caddy_image: str) -> bool: + """Patch the running Caddy ingress Deployment to a new image. + + No-op if the live Deployment already runs the requested image. + Returns True if a patch was applied, False otherwise. + + Caddy lives in the cluster-scoped `caddy-system` namespace, so + this affects every deployment sharing the cluster. The + `strategy: Recreate` in the Deployment manifest handles the + hostPort-80/443 handoff; expect ~10-30s of ingress downtime while + the old pod terminates and the new one starts. 
+ """ + apps_api = client.AppsV1Api() + try: + dep = apps_api.read_namespaced_deployment( + name="caddy-ingress-controller", namespace="caddy-system" + ) + except ApiException as e: + if e.status == 404: + if opts.o.debug: + print( + "Caddy ingress Deployment not found; nothing to " + "update (install path handles fresh clusters)" + ) + return False + raise + + containers = dep.spec.template.spec.containers or [] + current = containers[0].image if containers else None + if current == caddy_image: + if opts.o.debug: + print(f"Caddy image already at {caddy_image}; no update needed") + return False + + print( + f"Updating Caddy ingress image: {current} -> {caddy_image} " + "(expect brief ingress downtime)" + ) + patch = { + "spec": { + "template": { + "spec": { + "containers": [ + { + "name": "caddy-ingress-controller", + "image": caddy_image, + "imagePullPolicy": "Always", + } + ] + } + } + } + } + apps_api.patch_namespaced_deployment( + name="caddy-ingress-controller", + namespace="caddy-system", + body=patch, + ) + return True + + def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): for image in image_set: result = _run_command( diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 5ac61662..fcdc9de8 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -304,6 +304,24 @@ class Spec: """ return self.obj.get(constants.kind_mount_root_key) + def get_caddy_ingress_image(self) -> typing.Optional[str]: + """Return the Caddy ingress controller image override, or None. + + Returns None (not the default image) when the spec key is + absent. That distinction matters: the install path falls back + to the hardcoded default so there's always *some* image to + deploy, while the update-on-reuse path treats None as "operator + didn't ask to touch Caddy" and skips the patch — avoiding + silent reverts of an image set out-of-band (e.g. via an + ansible playbook or a prior deployment's spec). + + Cluster-scoped: the Caddy ingress lives in the shared + `caddy-system` namespace, so setting this key in any + deployment's spec rolls the controller for every deployment + using the cluster. + """ + return self.obj.get(constants.caddy_ingress_image_key) + def get_maintenance_service(self) -> typing.Optional[str]: """Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None.