From 4977e3ff4349bcfcb6730f7697b9e3ba5d104cc0 Mon Sep 17 00:00:00 2001 From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com> Date: Tue, 21 Apr 2026 14:40:39 +0530 Subject: [PATCH] k8s: manage Caddy ingress image via spec (so-p3p) (#749) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes so-p3p: - New spec key `caddy-ingress-image`: on fresh install, deploys Caddy with this image; on subsequent `deployment start`, patches the running Caddy Deployment if the image differs. Defaults to the manifest's hardcoded image when absent - When the spec key is absent, SO does **not** touch a running Caddy — avoids silently reverting an image set out-of-band (ansible playbook, another deployment's spec) - `strategy: Recreate` on the Caddy Deployment manifest (required — hostPort 80/443 deadlocks rolling updates) - Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a k8s-API patch, not a cluster-lifecycle op) - Image template by container name rather than string match, so the spec override wins regardless of what the shipped manifest hardcodes - Cluster-scoped caveat documented: `caddy-system` is shared across deployments, so the last `deployment start` that sets the key wins for everyone --- .pebbles/events.jsonl | 2 + docs/deployment_patterns.md | 35 ++++++++ stack_orchestrator/constants.py | 2 + .../ingress/ingress-caddy-kind-deploy.yaml | 5 ++ stack_orchestrator/deploy/k8s/deploy_k8s.py | 18 ++++ stack_orchestrator/deploy/k8s/helpers.py | 88 ++++++++++++++++++- stack_orchestrator/deploy/spec.py | 18 ++++ 7 files changed, 166 insertions(+), 2 deletions(-) diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl index 0cf36fce..ece11c09 100644 --- a/.pebbles/events.jsonl +++ b/.pebbles/events.jsonl @@ -52,3 +52,5 @@ {"type":"status_update","timestamp":"2026-04-21T05:57:12.928842469Z","issue_id":"so-n1n","payload":{"status":"closed"}} {"type":"comment","timestamp":"2026-04-21T06:08:13.933886638Z","issue_id":"so-ad7","payload":{"body":"Fixed in PR #744 (cf8b7533). get_services() now includes the maintenance pod in the container-ports map so its per-pod Service is built and available for the Ingress swap."}} {"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}} +{"type":"update","timestamp":"2026-04-21T09:00:47.364859946Z","issue_id":"so-p3p","payload":{"description":"## Problem\n\nThe Caddy ingress controller image is hardcoded in `ingress-caddy-kind-deploy.yaml`, with no mechanism to update it short of cluster recreation or manual `kubectl patch`. laconic-so should: (1) allow spec.yml to specify a Caddy image, (2) support updating the Caddy image as part of `deployment start`, (3) set `strategy: Recreate` on the Caddy Deployment since hostPort pods can't rolling-update.\n\n## Resolution\n\n- New spec key `caddy-ingress-image`. Fresh install uses it (fallback: manifest default). 
On subsequent `deployment start`, if the spec key is set and the running Caddy image differs, SO patches the Deployment and waits for rollout.\n- Spec key absent =\u003e SO does **not** touch a running Caddy, to avoid silently reverting images set out-of-band (ansible playbook, another deployment's spec).\n- `strategy: Recreate` added to the Caddy Deployment manifest.\n- Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a plain k8s-API patch, not a cluster lifecycle op).\n- Image substitution locates the container by name instead of string-matching the shipped default, so the spec override wins regardless of what the manifest hardcodes.\n- Cluster-scoped caveat: `caddy-system` is shared across deployments; last `deployment start` that sets the key wins for everyone. Documented in `deployment_patterns.md`."}} +{"type":"status_update","timestamp":"2026-04-21T09:00:47.745675131Z","issue_id":"so-p3p","payload":{"status":"closed"}} diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md index 525622aa..0eb2548a 100644 --- a/docs/deployment_patterns.md +++ b/docs/deployment_patterns.md @@ -202,6 +202,41 @@ with a `DeployerException` pointing at the `namespace:` spec override. Catches operator-error cases where the same deployment dir is effectively registered twice. +### Caddy ingress image lifecycle + +The Caddy ingress controller lives in the cluster-scoped +`caddy-system` namespace and is installed on first `deployment start`. +Its image is configurable per deployment: + +```yaml +# spec.yml +caddy-ingress-image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3 +``` + +Two cases, intentionally different: + +- **Spec key set**: on first install the manifest is templated with + this image. On subsequent `deployment start`, if the running Caddy + Deployment's image differs, laconic-so patches it and waits for the + rollout. The Deployment uses `strategy: Recreate` (hostPort 80/443 + blocks rolling updates from ever completing), so expect ~10–30s of + ingress downtime while the old pod terminates and the new one + starts. +- **Spec key absent**: on first install the manifest's hardcoded + default (`ghcr.io/laconicnetwork/caddy-ingress:latest`) is used. + On subsequent `deployment start`, laconic-so does **not** touch the + running Caddy Deployment. This matters when the image was set + out-of-band (via an ansible playbook, or by another deployment's + spec that's since been removed) — a silent revert to the default + would be worse than doing nothing. If you want to go back to the + default image, set `caddy-ingress-image` to it explicitly. + +**Cluster-scoped caveat**: `caddy-system` is shared by every +deployment on the cluster. Setting `caddy-ingress-image` in any one +deployment's spec rolls the controller for all of them — last +`deployment start` wins. Treat it as a cluster-level knob; keep the +value consistent across the deployments sharing a cluster. 
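+
+To verify which image the shared controller is currently running
+(a quick check, assuming `kubectl` is pointed at the kind cluster):
+
+```sh
+kubectl -n caddy-system get deployment caddy-ingress-controller \
+  -o jsonpath='{.spec.template.spec.containers[0].image}'
+```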
+ ## Volume Persistence in k8s-kind k8s-kind has 3 storage layers: diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py index e5c83698..72c29837 100644 --- a/stack_orchestrator/constants.py +++ b/stack_orchestrator/constants.py @@ -48,5 +48,7 @@ high_memlock_runtime = "high-memlock" high_memlock_spec_filename = "high-memlock-spec.json" acme_email_key = "acme-email" kind_mount_root_key = "kind-mount-root" +caddy_ingress_image_key = "caddy-ingress-image" +default_caddy_ingress_image = "ghcr.io/laconicnetwork/caddy-ingress:latest" external_services_key = "external-services" ca_certificates_key = "ca-certificates" diff --git a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml index 88025837..b368c50e 100644 --- a/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml +++ b/stack_orchestrator/data/k8s/components/ingress/ingress-caddy-kind-deploy.yaml @@ -160,6 +160,11 @@ metadata: app.kubernetes.io/component: controller spec: replicas: 1 + # Recreate is required: the pod binds hostPort 80/443, which a + # RollingUpdate would try to double-claim during cutover (new pod + # pending until old pod exits — never exits, rollout deadlocks). + strategy: + type: Recreate selector: matchLabels: app.kubernetes.io/name: caddy-ingress-controller diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 84318cde..688c5696 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -34,6 +34,7 @@ from stack_orchestrator.deploy.k8s.helpers import ( ) from stack_orchestrator.deploy.k8s.helpers import ( install_ingress_for_kind, + update_caddy_ingress_image, wait_for_ingress_in_kind, is_ingress_running, ) @@ -880,11 +881,16 @@ class K8sDeployer(Deployer): check_mounts_compatible(existing, kind_config) self.connect_api() self._ensure_namespace() + caddy_image = self.cluster_info.spec.get_caddy_ingress_image() + # Fresh-install path: gated on cluster lifecycle ownership + # because install_ingress_for_kind also seeds caddy-system + # (namespace, secrets restore, cert-backup CronJob). if self.is_kind() and not self.skip_cluster_management: if not is_ingress_running(): install_ingress_for_kind( self.cluster_info.spec.get_acme_email(), self.cluster_info.spec.get_kind_mount_root(), + caddy_image=caddy_image, ) wait_for_ingress_in_kind() if self.cluster_info.spec.get_unlimited_memlock(): @@ -892,6 +898,18 @@ class K8sDeployer(Deployer): constants.high_memlock_runtime, constants.high_memlock_runtime, ) + # Reconcile Caddy image whenever the operator explicitly set + # it in spec, regardless of cluster lifecycle ownership — + # --skip-cluster-management (the default) shouldn't prevent + # a routine k8s-API-level patch of a running Deployment. + # Spec absent => don't touch: the operator may have set the + # image out-of-band (ansible playbook, prior explicit spec on + # a different deployment) and a silent revert would be worse + # than doing nothing. caddy-system is cluster-scoped, so + # whichever deployment's spec sets the image last wins. 
+ if self.is_kind() and caddy_image is not None and is_ingress_running(): + if update_caddy_ingress_image(caddy_image): + wait_for_ingress_in_kind() def _create_ingress(self): """Create or update Ingress with TLS certificate lookup.""" diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 44d1e49c..4cbf3270 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -466,7 +466,9 @@ def wait_for_ingress_in_kind(): def install_ingress_for_kind( - acme_email: str = "", kind_mount_root: Optional[str] = None + acme_email: str = "", + kind_mount_root: Optional[str] = None, + caddy_image: Optional[str] = None, ): api_client = client.ApiClient() ingress_install = os.path.abspath( @@ -477,7 +479,7 @@ def install_ingress_for_kind( if opts.o.debug: print("Installing Caddy ingress controller in kind cluster") - # Template the YAML with email before applying + # Template the YAML with email and image before applying with open(ingress_install) as f: yaml_content = f.read() @@ -488,6 +490,27 @@ def install_ingress_for_kind( yaml_objects = list(yaml.safe_load_all(yaml_content)) + # Override the Caddy container's image when a spec value is set. + # Works regardless of what's hardcoded in the manifest — we locate + # the container by name and overwrite its image field, rather than + # relying on a string match of the default. + if caddy_image: + for obj in yaml_objects: + if not obj: + continue + if ( + obj.get("kind") == "Deployment" + and obj.get("metadata", {}).get("name") + == "caddy-ingress-controller" + ): + for c in ( + obj["spec"]["template"]["spec"].get("containers") or [] + ): + if c.get("name") == "caddy-ingress-controller": + c["image"] = caddy_image + if opts.o.debug: + print(f"Configured Caddy image: {caddy_image}") + # Split: apply everything except the Caddy controller Deployment first, # so the namespace + secrets exist before the pod can start and read its # secret_store. Race-free: Caddy has no way to see the cluster until @@ -530,6 +553,67 @@ def install_ingress_for_kind( _install_caddy_cert_backup(api_client, kind_mount_root) +def update_caddy_ingress_image(caddy_image: str) -> bool: + """Patch the running Caddy ingress Deployment to a new image. + + No-op if the live Deployment already runs the requested image. + Returns True if a patch was applied, False otherwise. + + Caddy lives in the cluster-scoped `caddy-system` namespace, so + this affects every deployment sharing the cluster. The + `strategy: Recreate` in the Deployment manifest handles the + hostPort-80/443 handoff; expect ~10-30s of ingress downtime while + the old pod terminates and the new one starts. 
+ """ + apps_api = client.AppsV1Api() + try: + dep = apps_api.read_namespaced_deployment( + name="caddy-ingress-controller", namespace="caddy-system" + ) + except ApiException as e: + if e.status == 404: + if opts.o.debug: + print( + "Caddy ingress Deployment not found; nothing to " + "update (install path handles fresh clusters)" + ) + return False + raise + + containers = dep.spec.template.spec.containers or [] + current = containers[0].image if containers else None + if current == caddy_image: + if opts.o.debug: + print(f"Caddy image already at {caddy_image}; no update needed") + return False + + print( + f"Updating Caddy ingress image: {current} -> {caddy_image} " + "(expect brief ingress downtime)" + ) + patch = { + "spec": { + "template": { + "spec": { + "containers": [ + { + "name": "caddy-ingress-controller", + "image": caddy_image, + "imagePullPolicy": "Always", + } + ] + } + } + } + } + apps_api.patch_namespaced_deployment( + name="caddy-ingress-controller", + namespace="caddy-system", + body=patch, + ) + return True + + def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): for image in image_set: result = _run_command( diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 5ac61662..fcdc9de8 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -304,6 +304,24 @@ class Spec: """ return self.obj.get(constants.kind_mount_root_key) + def get_caddy_ingress_image(self) -> typing.Optional[str]: + """Return the Caddy ingress controller image override, or None. + + Returns None (not the default image) when the spec key is + absent. That distinction matters: the install path falls back + to the hardcoded default so there's always *some* image to + deploy, while the update-on-reuse path treats None as "operator + didn't ask to touch Caddy" and skips the patch — avoiding + silent reverts of an image set out-of-band (e.g. via an + ansible playbook or a prior deployment's spec). + + Cluster-scoped: the Caddy ingress lives in the shared + `caddy-system` namespace, so setting this key in any + deployment's spec rolls the controller for every deployment + using the cluster. + """ + return self.obj.get(constants.caddy_ingress_image_key) + def get_maintenance_service(self) -> typing.Optional[str]: """Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None.