fix(k8s): expand etcd cleanup whitelist to preserve core cluster services

_clean_etcd_keeping_certs() only preserved /registry/secrets/caddy-system,
deleting everything else, including the kubernetes ClusterIP service in the
default namespace. When kind recreated the cluster with the cleaned etcd,
kube-apiserver saw existing data and skipped bootstrapping the service.
kindnet then panicked because KUBERNETES_SERVICE_HOST was missing, blocking
all pod networking.

Expand the whitelist to also preserve:
- /registry/services/specs/default/kubernetes
- /registry/services/endpoints/default/kubernetes

Loop over multiple prefixes instead of a single etcdctl get --prefix call.

See docs/bug-laconic-so-etcd-cleanup.md in biscayne-agave-runbook.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix/kind-mount-propagation
A. F. Dudley 2026-03-07 17:56:13 +00:00
parent a11d40f2f3
commit 7f205732f2
1 changed files with 22 additions and 6 deletions

View File

@ -148,8 +148,16 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
etcd_image = "gcr.io/etcd-development/etcd:v3.5.9"
temp_dir = "/tmp/laconic-etcd-cleanup"
# Whitelist: prefixes to KEEP - everything else gets deleted
keep_prefixes = "/registry/secrets/caddy-system"
# Whitelist: prefixes to KEEP - everything else gets deleted.
# Must include core cluster resources (kubernetes service, kube-system
# secrets) or kindnet panics on restart — KUBERNETES_SERVICE_HOST is
# injected from the kubernetes ClusterIP service in default namespace.
keep_prefixes = [
"/registry/secrets/caddy-system",
"/registry/services/specs/default/kubernetes",
"/registry/services/endpoints/default/kubernetes",
]
keep_prefixes_str = " ".join(keep_prefixes)
# The etcd image is distroless (no shell). We extract the statically-linked
# etcdctl binary and run it from alpine which has shell + jq support.
@ -195,13 +203,21 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
sleep 3
# Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
# Export caddy secrets
# Export whitelisted keys (caddy TLS certs + core cluster services)
docker run --rm \
-v {temp_dir}:/backup \
--network container:laconic-etcd-cleanup \
$ALPINE_IMAGE sh -c \
'/backup/etcdctl get --prefix "{keep_prefixes}" -w json \
> /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json'
$ALPINE_IMAGE sh -c '
apk add --no-cache jq >/dev/null 2>&1
echo "[]" > /backup/all-kvs.json
for prefix in {keep_prefixes_str}; do
/backup/etcdctl get --prefix "$prefix" -w json 2>/dev/null \
| jq ".kvs // []" >> /backup/all-kvs.json || true
done
jq -s "add" /backup/all-kvs.json \
| jq "{{kvs: .}}" > /backup/kept.json 2>/dev/null \
|| echo "{{}}" > /backup/kept.json
'
# Delete ALL registry keys
docker run --rm \