fix(k8s): only patch Caddy when operator explicitly set the spec key

Returning the hardcoded default from get_caddy_ingress_image() when
the spec key was absent meant every `deployment start` would patch a
running Caddy back to :latest — silently reverting any image set
out-of-band (ansible playbook, prior deployment's spec).

Make get_caddy_ingress_image() return Optional[str]. Install path
still falls back to the default (needs *some* image to install); the
update-on-reuse path treats None as "operator didn't ask, leave the
running Caddy alone".

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
pull/749/head
Prathamesh Musale 2026-04-21 08:12:15 +00:00
parent a6d54c7bf8
commit 616475ce2d
4 changed files with 44 additions and 24 deletions

View File

@@ -213,14 +213,23 @@ Its image is configurable per deployment:
caddy-ingress-image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3 caddy-ingress-image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3
``` ```
Defaults to `ghcr.io/laconicnetwork/caddy-ingress:latest` when not set. Two cases, intentionally different:
On subsequent `deployment start`, if the running Caddy image differs - **Spec key set**: on first install the manifest is templated with
from the spec value, laconic-so patches the Caddy Deployment to the this image. On subsequent `deployment start`, if the running Caddy
new image. The Deployment uses `strategy: Recreate` (the hostPort Deployment's image differs, laconic-so patches it and waits for the
80/443 binding blocks a rolling update from ever completing), so rollout. The Deployment uses `strategy: Recreate` (hostPort 80/443
expect ~10–30s of ingress downtime while the old pod terminates and blocks rolling updates from ever completing), so expect ~10–30s of
the new one starts. ingress downtime while the old pod terminates and the new one
starts.
- **Spec key absent**: on first install the manifest's hardcoded
default (`ghcr.io/laconicnetwork/caddy-ingress:latest`) is used.
On subsequent `deployment start`, laconic-so does **not** touch the
running Caddy Deployment. This matters when the image was set
out-of-band (via an ansible playbook, or by another deployment's
spec that's since been removed) — a silent revert to the default
would be worse than doing nothing. If you want to go back to the
default image, set `caddy-ingress-image` to it explicitly.
**Cluster-scoped caveat**: `caddy-system` is shared by every **Cluster-scoped caveat**: `caddy-system` is shared by every
deployment on the cluster. Setting `caddy-ingress-image` in any one deployment on the cluster. Setting `caddy-ingress-image` in any one

View File

@@ -884,19 +884,23 @@ class K8sDeployer(Deployer):
if self.is_kind() and not self.skip_cluster_management: if self.is_kind() and not self.skip_cluster_management:
caddy_image = self.cluster_info.spec.get_caddy_ingress_image() caddy_image = self.cluster_info.spec.get_caddy_ingress_image()
if not is_ingress_running(): if not is_ingress_running():
# Fresh install — always needs an image; use the spec
# value if set, else the hardcoded default.
install_ingress_for_kind( install_ingress_for_kind(
self.cluster_info.spec.get_acme_email(), self.cluster_info.spec.get_acme_email(),
self.cluster_info.spec.get_kind_mount_root(), self.cluster_info.spec.get_kind_mount_root(),
caddy_image=caddy_image, caddy_image=caddy_image,
) )
wait_for_ingress_in_kind() wait_for_ingress_in_kind()
else: elif caddy_image is not None:
# Ingress is already up from a prior start — reconcile # Ingress already up AND the operator explicitly set a
# the running image against this deployment's spec. # caddy-ingress-image in spec — reconcile the running
# Patches only if they differ. Note: caddy-system is # image. Spec absent => don't touch: the operator may
# cluster-scoped, so every deployment sharing the # have set the image out-of-band (ansible playbook,
# cluster effectively votes on the image; last start # prior explicit spec on a different deployment) and a
# wins. Documented in deployment_patterns.md. # silent revert would be worse than doing nothing.
# Note: caddy-system is cluster-scoped, so whichever
# deployment's spec sets the image last, wins.
if update_caddy_ingress_image(caddy_image): if update_caddy_ingress_image(caddy_image):
wait_for_ingress_in_kind() wait_for_ingress_in_kind()
if self.cluster_info.spec.get_unlimited_memlock(): if self.cluster_info.spec.get_unlimited_memlock():

View File

@@ -488,6 +488,8 @@ def install_ingress_for_kind(
if opts.o.debug: if opts.o.debug:
print(f"Configured Caddy with ACME email: {acme_email}") print(f"Configured Caddy with ACME email: {acme_email}")
# Substitute image only when an override is requested; otherwise
# leave the hardcoded default in the manifest.
if caddy_image and caddy_image != constants.default_caddy_ingress_image: if caddy_image and caddy_image != constants.default_caddy_ingress_image:
yaml_content = yaml_content.replace( yaml_content = yaml_content.replace(
constants.default_caddy_ingress_image, caddy_image constants.default_caddy_ingress_image, caddy_image

View File

@@ -304,18 +304,23 @@ class Spec:
""" """
return self.obj.get(constants.kind_mount_root_key) return self.obj.get(constants.kind_mount_root_key)
def get_caddy_ingress_image(self) -> str: def get_caddy_ingress_image(self) -> typing.Optional[str]:
"""Return the Caddy ingress controller image to deploy/patch. """Return the Caddy ingress controller image override, or None.
Defaults to the upstream tag when not set in spec. Cluster- Returns None (not the default image) when the spec key is
scoped: the Caddy ingress lives in the shared `caddy-system` absent. That distinction matters: the install path falls back
namespace, so setting this key in any deployment's spec will to the hardcoded default so there's always *some* image to
roll the controller for every deployment using the cluster. deploy, while the update-on-reuse path treats None as "operator
didn't ask to touch Caddy" and skips the patch — avoiding
silent reverts of an image set out-of-band (e.g. via an
ansible playbook or a prior deployment's spec).
Cluster-scoped: the Caddy ingress lives in the shared
`caddy-system` namespace, so setting this key in any
deployment's spec rolls the controller for every deployment
using the cluster.
""" """
return self.obj.get( return self.obj.get(constants.caddy_ingress_image_key)
constants.caddy_ingress_image_key,
constants.default_caddy_ingress_image,
)
def get_maintenance_service(self) -> typing.Optional[str]: def get_maintenance_service(self) -> typing.Optional[str]:
"""Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None. """Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None.