From a6d54c7bf8d760ea022866f2e5276a6b56efe437 Mon Sep 17 00:00:00 2001
From: Prathamesh Musale
Date: Tue, 21 Apr 2026 06:51:53 +0000
Subject: [PATCH] feat(k8s): manage Caddy ingress image lifecycle via spec
 (so-p3p)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The Caddy ingress image was hardcoded in the component manifest and had
no update path shy of cluster recreate or manual kubectl patch. That
forced woodburn to run an out-of-band ansible playbook to bump Caddy,
and broke the "spec.yml is source of truth" model.

Changes:
- spec.yml: new `caddy-ingress-image` key (default
  `ghcr.io/laconicnetwork/caddy-ingress:latest`).
- Deployment manifest: `strategy: Recreate` on the Caddy Deployment —
  required because the pod binds hostPort 80/443, which prevents any
  rolling update from completing (new pod hangs Pending forever
  waiting for old pod to release the ports).
- install_ingress_for_kind: accepts caddy_image and templates the
  manifest before applying, same pattern as the existing acme-email
  templating.
- update_caddy_ingress_image: patches the running Caddy Deployment
  when the spec image differs from the live image. No-op if they
  match. Returns True if a patch was applied so the caller can wait
  for the rollout.
- deploy_k8s._setup_cluster: on cluster reuse (ingress already up),
  reconcile the running image against the spec. Install path
  unchanged; only the "already running, maybe needs update" branch is
  new.

Cluster-scoped caveat: caddy-system is shared by every deployment on
the cluster, so the spec value in any one deployment rolls Caddy for
all of them — last `deployment start` wins. Documented in
deployment_patterns.md alongside the other cluster-scoped concerns
(kind-mount-root, namespace ownership).

Co-Authored-By: Claude Opus 4.7 (1M context)
---
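
[Reviewer note, not part of the commit: the reconcile rule described
above reduces to a small pure function. A minimal sketch of that rule;
`decide_caddy_action` and its arguments are hypothetical names that
appear nowhere in the diff:]

```python
from typing import Optional

DEFAULT = "ghcr.io/laconicnetwork/caddy-ingress:latest"

def decide_caddy_action(
    spec_image: Optional[str], live_image: Optional[str]
) -> str:
    wanted = spec_image or DEFAULT  # what Spec.get_caddy_ingress_image yields
    if live_image is None:
        return f"install {wanted}"  # fresh cluster: install path
    if live_image == wanted:
        return "no-op"              # already reconciled, no rollout
    return f"patch {live_image} -> {wanted}"  # cluster reuse: patch, then wait

# An unset spec key never rolls a controller already on the default image:
assert decide_caddy_action(None, DEFAULT) == "no-op"
```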
 .pebbles/events.jsonl                       |  2 +
 docs/deployment_patterns.md                 | 26 +++
 stack_orchestrator/constants.py             |  2 +
 .../ingress/ingress-caddy-kind-deploy.yaml  |  5 ++
 stack_orchestrator/deploy/k8s/deploy_k8s.py | 12 +++
 stack_orchestrator/deploy/k8s/helpers.py    | 74 ++++++++++++++++++-
 stack_orchestrator/deploy/spec.py           | 13 ++++
 7 files changed, 132 insertions(+), 2 deletions(-)

diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl
index 0cf36fce..b712e73b 100644
--- a/.pebbles/events.jsonl
+++ b/.pebbles/events.jsonl
@@ -52,3 +52,5 @@
 {"type":"status_update","timestamp":"2026-04-21T05:57:12.928842469Z","issue_id":"so-n1n","payload":{"status":"closed"}}
 {"type":"comment","timestamp":"2026-04-21T06:08:13.933886638Z","issue_id":"so-ad7","payload":{"body":"Fixed in PR #744 (cf8b7533). \nget_services() now includes the maintenance pod in the container-ports map so its per-pod Service is built and available for the Ingress swap."}}
 {"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}}
+{"type":"status_update","timestamp":"2026-04-21T06:51:38.213606012Z","issue_id":"so-p3p","payload":{"status":"closed"}}
+{"type":"comment","timestamp":"2026-04-21T06:51:38.749628156Z","issue_id":"so-p3p","payload":{"body":"Implemented on branch feat/so-p3p-caddy-image-lifecycle: spec key caddy-ingress-image, strategy: Recreate on the Caddy Deployment manifest, and image reconciliation on deployment start (patches if spec image differs from running image)."}}
diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md
index 525622aa..e716f2e0 100644
--- a/docs/deployment_patterns.md
+++ b/docs/deployment_patterns.md
@@ -202,6 +202,32 @@ with a `DeployerException` pointing at the `namespace:` spec override.
 Catches operator-error cases where the same deployment dir is
 effectively registered twice.
 
+### Caddy ingress image lifecycle
+
+The Caddy ingress controller lives in the cluster-scoped
+`caddy-system` namespace and is installed on first `deployment start`.
+Its image is configurable per deployment:
+
+```yaml
+# spec.yml
+caddy-ingress-image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3
+```
+
+Defaults to `ghcr.io/laconicnetwork/caddy-ingress:latest` when not set.
+
+On subsequent `deployment start`, if the running Caddy image differs
+from the spec value, laconic-so patches the Caddy Deployment to the
+new image. The Deployment uses `strategy: Recreate` (the hostPort
+80/443 binding blocks a rolling update from ever completing), so
+expect ~10–30s of ingress downtime while the old pod terminates and
+the new one starts.
+
+**Cluster-scoped caveat**: `caddy-system` is shared by every
+deployment on the cluster. Setting `caddy-ingress-image` in any one
+deployment's spec rolls the controller for all of them — last
+`deployment start` wins. Treat it as a cluster-level knob; keep the
+value consistent across the deployments sharing a cluster.
+
 ## Volume Persistence in k8s-kind
 
 k8s-kind has 3 storage layers:
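
[Reviewer note: to check which image the shared controller is actually
running (say, before deciding whether a `deployment start` will roll
it), the read below mirrors what `update_caddy_ingress_image` inspects.
A sketch using the kubernetes Python client; the Deployment and
namespace names are the ones used throughout this patch:]

```python
from kubernetes import client, config

config.load_kube_config()  # same kubeconfig/context the kind cluster uses
apps = client.AppsV1Api()
dep = apps.read_namespaced_deployment(
    name="caddy-ingress-controller", namespace="caddy-system"
)
print(dep.spec.template.spec.containers[0].image)
```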
diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py
index e5c83698..72c29837 100644
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@@ -48,5 +48,7 @@ high_memlock_runtime = "high-memlock"
 high_memlock_spec_filename = "high-memlock-spec.json"
 acme_email_key = "acme-email"
 kind_mount_root_key = "kind-mount-root"
+caddy_ingress_image_key = "caddy-ingress-image"
+default_caddy_ingress_image = "ghcr.io/laconicnetwork/caddy-ingress:latest"
 external_services_key = "external-services"
 ca_certificates_key = "ca-certificates"
diff --git a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml
index 88025837..b368c50e 100644
--- a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml
+++ b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml
@@ -160,6 +160,11 @@ metadata:
     app.kubernetes.io/component: controller
 spec:
   replicas: 1
+  # Recreate is required: the pod binds hostPort 80/443, which a
+  # RollingUpdate would try to double-claim during cutover (new pod
+  # pending until old pod exits — never exits, rollout deadlocks).
+  strategy:
+    type: Recreate
   selector:
     matchLabels:
       app.kubernetes.io/name: caddy-ingress-controller
diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py
index 84318cde..7912ff1c 100644
--- a/stack_orchestrator/deploy/k8s/deploy_k8s.py
+++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py
@@ -34,6 +34,7 @@ from stack_orchestrator.deploy.k8s.helpers import (
 )
 from stack_orchestrator.deploy.k8s.helpers import (
     install_ingress_for_kind,
+    update_caddy_ingress_image,
     wait_for_ingress_in_kind,
     is_ingress_running,
 )
@@ -881,12 +882,23 @@
         self.connect_api()
         self._ensure_namespace()
         if self.is_kind() and not self.skip_cluster_management:
+            caddy_image = self.cluster_info.spec.get_caddy_ingress_image()
             if not is_ingress_running():
                 install_ingress_for_kind(
                     self.cluster_info.spec.get_acme_email(),
                     self.cluster_info.spec.get_kind_mount_root(),
+                    caddy_image=caddy_image,
                 )
                 wait_for_ingress_in_kind()
+            else:
+                # Ingress is already up from a prior start — reconcile
+                # the running image against this deployment's spec.
+                # Patches only if they differ. Note: caddy-system is
+                # cluster-scoped, so every deployment sharing the
+                # cluster effectively votes on the image; last start
+                # wins. Documented in deployment_patterns.md.
+                if update_caddy_ingress_image(caddy_image):
+                    wait_for_ingress_in_kind()
             if self.cluster_info.spec.get_unlimited_memlock():
                 _create_runtime_class(
                     constants.high_memlock_runtime,
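
[Reviewer note: the else-branch above is what turns the image into a
cluster-level vote. A small illustration of the consequence; the spec
dicts and `resolved` helper are hypothetical, the default value is the
one added to constants.py:]

```python
DEFAULT = "ghcr.io/laconicnetwork/caddy-ingress:latest"

def resolved(spec: dict) -> str:
    return spec.get("caddy-ingress-image", DEFAULT)

live = "ghcr.io/laconicnetwork/caddy-ingress:v1.2.3"  # deployment A started last
spec_b = {}  # deployment B never set the key

# B's next `deployment start` hits the else-branch: resolved(spec_b) is
# the :latest default, which differs from the live v1.2.3 image, so B
# rolls the shared controller back to :latest. Last start wins, even
# when "winning" just means falling back to the default.
assert resolved(spec_b) != live
```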
+ """ + apps_api = client.AppsV1Api() + try: + dep = apps_api.read_namespaced_deployment( + name="caddy-ingress-controller", namespace="caddy-system" + ) + except ApiException as e: + if e.status == 404: + if opts.o.debug: + print( + "Caddy ingress Deployment not found; nothing to " + "update (install path handles fresh clusters)" + ) + return False + raise + + containers = dep.spec.template.spec.containers or [] + current = containers[0].image if containers else None + if current == caddy_image: + if opts.o.debug: + print(f"Caddy image already at {caddy_image}; no update needed") + return False + + print( + f"Updating Caddy ingress image: {current} -> {caddy_image} " + "(expect brief ingress downtime)" + ) + patch = { + "spec": { + "template": { + "spec": { + "containers": [ + { + "name": "caddy-ingress-controller", + "image": caddy_image, + "imagePullPolicy": "Always", + } + ] + } + } + } + } + apps_api.patch_namespaced_deployment( + name="caddy-ingress-controller", + namespace="caddy-system", + body=patch, + ) + return True + + def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): for image in image_set: result = _run_command( diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 5ac61662..a18c14bc 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -304,6 +304,19 @@ class Spec: """ return self.obj.get(constants.kind_mount_root_key) + def get_caddy_ingress_image(self) -> str: + """Return the Caddy ingress controller image to deploy/patch. + + Defaults to the upstream tag when not set in spec. Cluster- + scoped: the Caddy ingress lives in the shared `caddy-system` + namespace, so setting this key in any deployment's spec will + roll the controller for every deployment using the cluster. + """ + return self.obj.get( + constants.caddy_ingress_image_key, + constants.default_caddy_ingress_image, + ) + def get_maintenance_service(self) -> typing.Optional[str]: """Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None.