From d65802f8ce6be3c2a70c2e563917f2da783bbd3d Mon Sep 17 00:00:00 2001 From: Prathamesh Musale Date: Tue, 21 Apr 2026 09:02:22 +0000 Subject: [PATCH] fix(k8s): reconcile Caddy image regardless of --skip-cluster-management MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The Caddy image reconcile was gated on `not self.skip_cluster_management` alongside the install path. With --skip-cluster-management being the default, that meant spec image changes never propagated on normal restarts — the operator had to pass --perform-cluster-management for the reconcile to run, which is surprising and conflates cluster lifecycle ownership with routine workload reconciliation. Split the two: install still requires cluster-management ownership (it seeds namespace + secrets + CronJob, which are cluster-creation concerns). The update-on-reuse patch is a plain k8s-API operation against a running Deployment — run it in both modes. Co-Authored-By: Claude Opus 4.7 (1M context) --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 30 +++++++++++---------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index a706b1c1..688c5696 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -881,33 +881,35 @@ class K8sDeployer(Deployer): check_mounts_compatible(existing, kind_config) self.connect_api() self._ensure_namespace() + caddy_image = self.cluster_info.spec.get_caddy_ingress_image() + # Fresh-install path: gated on cluster lifecycle ownership + # because install_ingress_for_kind also seeds caddy-system + # (namespace, secrets restore, cert-backup CronJob). if self.is_kind() and not self.skip_cluster_management: - caddy_image = self.cluster_info.spec.get_caddy_ingress_image() if not is_ingress_running(): - # Fresh install — always needs an image; use the spec - # value if set, else the hardcoded default. install_ingress_for_kind( self.cluster_info.spec.get_acme_email(), self.cluster_info.spec.get_kind_mount_root(), caddy_image=caddy_image, ) wait_for_ingress_in_kind() - elif caddy_image is not None: - # Ingress already up AND the operator explicitly set a - # caddy-ingress-image in spec — reconcile the running - # image. Spec absent => don't touch: the operator may - # have set the image out-of-band (ansible playbook, - # prior explicit spec on a different deployment) and a - # silent revert would be worse than doing nothing. - # Note: caddy-system is cluster-scoped, so whichever - # deployment's spec sets the image last, wins. - if update_caddy_ingress_image(caddy_image): - wait_for_ingress_in_kind() if self.cluster_info.spec.get_unlimited_memlock(): _create_runtime_class( constants.high_memlock_runtime, constants.high_memlock_runtime, ) + # Reconcile Caddy image whenever the operator explicitly set + # it in spec, regardless of cluster lifecycle ownership — + # --skip-cluster-management (the default) shouldn't prevent + # a routine k8s-API-level patch of a running Deployment. + # Spec absent => don't touch: the operator may have set the + # image out-of-band (ansible playbook, prior explicit spec on + # a different deployment) and a silent revert would be worse + # than doing nothing. caddy-system is cluster-scoped, so + # whichever deployment's spec sets the image last wins. + if self.is_kind() and caddy_image is not None and is_ingress_running(): + if update_caddy_ingress_image(caddy_image): + wait_for_ingress_in_kind() def _create_ingress(self): """Create or update Ingress with TLS certificate lookup."""