fix(k8s): expand etcd cleanup whitelist to preserve core cluster services
_clean_etcd_keeping_certs() only preserved /registry/secrets/caddy-system, deleting everything else, including the kubernetes ClusterIP service in the default namespace. When kind recreated the cluster with the cleaned etcd, kube-apiserver saw existing data and skipped bootstrapping the service. kindnet then panicked because KUBERNETES_SERVICE_HOST was missing, blocking all pod networking.

Expand the whitelist to also preserve:
- /registry/services/specs/default/kubernetes
- /registry/services/endpoints/default/kubernetes

Loop over multiple prefixes instead of a single etcdctl get --prefix call.

See docs/bug-laconic-so-etcd-cleanup.md in biscayne-agave-runbook.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

Branch: fix/kind-mount-propagation
parent
a11d40f2f3
commit
7f205732f2
|
|
@ -148,8 +148,16 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
|
||||||
etcd_image = "gcr.io/etcd-development/etcd:v3.5.9"
|
etcd_image = "gcr.io/etcd-development/etcd:v3.5.9"
|
||||||
temp_dir = "/tmp/laconic-etcd-cleanup"
|
temp_dir = "/tmp/laconic-etcd-cleanup"
|
||||||
|
|
||||||
# Whitelist: prefixes to KEEP - everything else gets deleted
|
# Whitelist: prefixes to KEEP - everything else gets deleted.
|
||||||
keep_prefixes = "/registry/secrets/caddy-system"
|
# Must include core cluster resources (kubernetes service, kube-system
|
||||||
|
# secrets) or kindnet panics on restart — KUBERNETES_SERVICE_HOST is
|
||||||
|
# injected from the kubernetes ClusterIP service in default namespace.
|
||||||
|
keep_prefixes = [
|
||||||
|
"/registry/secrets/caddy-system",
|
||||||
|
"/registry/services/specs/default/kubernetes",
|
||||||
|
"/registry/services/endpoints/default/kubernetes",
|
||||||
|
]
|
||||||
|
keep_prefixes_str = " ".join(keep_prefixes)
|
||||||
|
|
||||||
# The etcd image is distroless (no shell). We extract the statically-linked
|
# The etcd image is distroless (no shell). We extract the statically-linked
|
||||||
# etcdctl binary and run it from alpine which has shell + jq support.
|
# etcdctl binary and run it from alpine which has shell + jq support.
|
||||||
|
|
@ -195,13 +203,21 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
|
||||||
sleep 3
|
sleep 3
|
||||||
|
|
||||||
# Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
|
# Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
|
||||||
# Export caddy secrets
|
# Export whitelisted keys (caddy TLS certs + core cluster services)
|
||||||
docker run --rm \
|
docker run --rm \
|
||||||
-v {temp_dir}:/backup \
|
-v {temp_dir}:/backup \
|
||||||
--network container:laconic-etcd-cleanup \
|
--network container:laconic-etcd-cleanup \
|
||||||
$ALPINE_IMAGE sh -c \
|
$ALPINE_IMAGE sh -c '
|
||||||
'/backup/etcdctl get --prefix "{keep_prefixes}" -w json \
|
apk add --no-cache jq >/dev/null 2>&1
|
||||||
> /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json'
|
echo "[]" > /backup/all-kvs.json
|
||||||
|
for prefix in {keep_prefixes_str}; do
|
||||||
|
/backup/etcdctl get --prefix "$prefix" -w json 2>/dev/null \
|
||||||
|
| jq ".kvs // []" >> /backup/all-kvs.json || true
|
||||||
|
done
|
||||||
|
jq -s "add" /backup/all-kvs.json \
|
||||||
|
| jq "{{kvs: .}}" > /backup/kept.json 2>/dev/null \
|
||||||
|
|| echo "{{}}" > /backup/kept.json
|
||||||
|
'
|
||||||
|
|
||||||
# Delete ALL registry keys
|
# Delete ALL registry keys
|
||||||
docker run --rm \
|
docker run --rm \
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue