From 7f205732f2eadf01548ba2839a530d3cbce58ddb Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 17:56:13 +0000 Subject: [PATCH] fix(k8s): expand etcd cleanup whitelist to preserve core cluster services _clean_etcd_keeping_certs() only preserved /registry/secrets/caddy-system, deleting everything else including the kubernetes ClusterIP service in the default namespace. When kind recreated the cluster with the cleaned etcd, kube-apiserver saw existing data and skipped bootstrapping the service. kindnet panicked on KUBERNETES_SERVICE_HOST missing, blocking all pod networking. Expand the whitelist to also preserve: - /registry/services/specs/default/kubernetes - /registry/services/endpoints/default/kubernetes Loop over multiple prefixes instead of a single etcdctl get --prefix call. See docs/bug-laconic-so-etcd-cleanup.md in biscayne-agave-runbook. Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/helpers.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index ac4e8603..85f3d5f7 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -148,8 +148,16 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: etcd_image = "gcr.io/etcd-development/etcd:v3.5.9" temp_dir = "/tmp/laconic-etcd-cleanup" - # Whitelist: prefixes to KEEP - everything else gets deleted - keep_prefixes = "/registry/secrets/caddy-system" + # Whitelist: prefixes to KEEP - everything else gets deleted. + # Must include the kubernetes service spec and endpoints from the default + # namespace, or kindnet panics on restart — KUBERNETES_SERVICE_HOST is + # injected from the kubernetes ClusterIP service in default namespace. 
+ keep_prefixes = [ + "/registry/secrets/caddy-system", + "/registry/services/specs/default/kubernetes", + "/registry/services/endpoints/default/kubernetes", + ] + keep_prefixes_str = " ".join(keep_prefixes) # The etcd image is distroless (no shell). We extract the statically-linked # etcdctl binary and run it from alpine which has shell + jq support. @@ -195,13 +203,21 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: sleep 3 # Use alpine with extracted etcdctl to run commands (alpine has shell + jq) - # Export caddy secrets + # Export whitelisted keys (caddy TLS certs + core cluster services) docker run --rm \ -v {temp_dir}:/backup \ --network container:laconic-etcd-cleanup \ - $ALPINE_IMAGE sh -c \ - '/backup/etcdctl get --prefix "{keep_prefixes}" -w json \ - > /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json' + $ALPINE_IMAGE sh -c ' + apk add --no-cache jq >/dev/null 2>&1 + echo "[]" > /backup/all-kvs.json + for prefix in {keep_prefixes_str}; do + /backup/etcdctl get --prefix "$prefix" -w json 2>/dev/null \ + | jq ".kvs // []" >> /backup/all-kvs.json || true + done + jq -s "add" /backup/all-kvs.json \ + | jq "{{kvs: .}}" > /backup/kept.json 2>/dev/null \ + || echo "{{}}" > /backup/kept.json + ' # Delete ALL registry keys docker run --rm \