so-o2o: kubectl-level Caddy cert backup/restore (#746)
Publish / Gate: k8s deploy e2e (push) Failing after 3s
Publish / Build and publish (push) Has been skipped
K8s Deploy Test / Run deploy test suite on kind/k8s (push) Failing after 0s
Lint Checks / Run linter (push) Failing after 0s
Deploy Test / Run deploy test suite (push) Failing after 0s
Webapp Test / Run webapp test suite (push) Failing after 0s
Smoke Test / Run basic test suite (push) Failing after 0s
Replaces the etcd-surgery persistence approach with a CronJob that dumps `manager=caddy` Secrets to `{kind-mount-root}/caddy-cert-backup/` every 5 minutes, plus a restore step that applies the backup file before Caddy starts on a fresh cluster. Closes so-o2o.
Deletes `_clean_etcd_keeping_certs` and the etcd+PKI extraMounts. No new spec keys; the feature activates when `kind-mount-root` is set.
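A minimal sketch of how a deployment opts in, following what the smoke test below does (the paths are illustrative; `spec.yml` stands for whatever deployment spec file you are editing):

```bash
# Sketch: enable caddy cert backup/restore for a kind deployment (paths illustrative).
# kind-mount-root is cluster-level: pick a host directory that outlives `kind delete`.
KIND_MOUNT_ROOT=~/stack-orchestrator-test/kind-mount
mkdir -p "$KIND_MOUNT_ROOT"

# Append the existing spec key to the deployment spec (no new keys are introduced).
echo "kind-mount-root: $KIND_MOUNT_ROOT" >> spec.yml

# Once the cluster is running, the CronJob writes (as root, via the Kind bind mount):
#   $KIND_MOUNT_ROOT/caddy-cert-backup/caddy-secrets.yaml
```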
pull/747/head
v1.1.0-7f4b058-202604171011
parent 1334900407
commit 7f4b058066
@@ -0,0 +1,96 @@
# Caddy cert backup CronJob.
# Periodically exports Secrets labeled manager=caddy (the Let's Encrypt
# TLS certs issued by the Caddy ingress controller) to a hostPath under
# {kind-mount-root}/caddy-cert-backup/ so they survive `kind delete`.
# On subsequent cluster creates, SO restores this file before Caddy starts.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: caddy-cert-backup
  namespace: caddy-system
  labels:
    app.kubernetes.io/name: caddy-cert-backup
    app.kubernetes.io/instance: caddy-ingress
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: caddy-cert-backup
  namespace: caddy-system
  labels:
    app.kubernetes.io/name: caddy-cert-backup
    app.kubernetes.io/instance: caddy-ingress
rules:
  - apiGroups: [""]
    resources: ["secrets"]
    verbs: ["get", "list"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: caddy-cert-backup
  namespace: caddy-system
  labels:
    app.kubernetes.io/name: caddy-cert-backup
    app.kubernetes.io/instance: caddy-ingress
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: Role
  name: caddy-cert-backup
subjects:
  - kind: ServiceAccount
    name: caddy-cert-backup
    namespace: caddy-system
---
apiVersion: batch/v1
kind: CronJob
metadata:
  name: caddy-cert-backup
  namespace: caddy-system
  labels:
    app.kubernetes.io/name: caddy-cert-backup
    app.kubernetes.io/instance: caddy-ingress
spec:
  schedule: "*/5 * * * *"
  concurrencyPolicy: Replace
  successfulJobsHistoryLimit: 1
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        metadata:
          labels:
            app.kubernetes.io/name: caddy-cert-backup
            app.kubernetes.io/instance: caddy-ingress
        spec:
          serviceAccountName: caddy-cert-backup
          restartPolicy: OnFailure
          nodeSelector:
            ingress-ready: "true"
          tolerations:
            - effect: NoSchedule
              key: node-role.kubernetes.io/master
              operator: Equal
            - effect: NoSchedule
              key: node-role.kubernetes.io/control-plane
              operator: Equal
          containers:
            - name: backup
              image: alpine/kubectl:1.35.3
              command:
                - sh
                - -c
                - |
                  set -e
                  kubectl get secret -n caddy-system -l manager=caddy -o yaml \
                    > /backup/caddy-secrets.yaml.tmp
                  mv /backup/caddy-secrets.yaml.tmp /backup/caddy-secrets.yaml
                  echo "Wrote $(wc -l < /backup/caddy-secrets.yaml) lines to /backup/caddy-secrets.yaml"
              volumeMounts:
                - name: backup
                  mountPath: /backup
          volumes:
            - name: backup
              hostPath:
                path: /mnt/caddy-cert-backup
                type: DirectoryOrCreate
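The CronJob can also be driven out of band instead of waiting for the 5-minute schedule; the smoke test further below uses exactly this pattern. A minimal sketch (the job name is arbitrary, and the backup file is root-owned because the Job writes through the Kind bind mount as root):

```bash
# Sketch: force a backup run now rather than waiting for the schedule.
kubectl create job --from=cronjob/caddy-cert-backup caddy-cert-backup-manual -n caddy-system
kubectl wait --for=condition=complete job/caddy-cert-backup-manual -n caddy-system --timeout=120s

# Confirm the dump landed on the host (adjust KIND_MOUNT_ROOT to your setup).
sudo test -f "$KIND_MOUNT_ROOT/caddy-cert-backup/caddy-secrets.yaml" && echo "backup present"
```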
@@ -790,7 +790,10 @@ class K8sDeployer(Deployer):
         self._ensure_namespace()
         if self.is_kind() and not self.skip_cluster_management:
             if not is_ingress_running():
-                install_ingress_for_kind(self.cluster_info.spec.get_acme_email())
+                install_ingress_for_kind(
+                    self.cluster_info.spec.get_acme_email(),
+                    self.cluster_info.spec.get_kind_mount_root(),
+                )
                 wait_for_ingress_in_kind()
         if self.cluster_info.spec.get_unlimited_memlock():
             _create_runtime_class(
@@ -17,10 +17,8 @@ from kubernetes import client, utils, watch
from kubernetes.client.exceptions import ApiException
import os
from pathlib import Path
import shlex
import subprocess
import re
import time
from typing import Set, Mapping, List, Optional, cast
import yaml
@@ -100,255 +98,122 @@ def _run_command(command: str):
    return result


def _get_etcd_host_path_from_kind_config(config_file: str) -> Optional[str]:
    """Extract etcd host path from kind config extraMounts."""
    import yaml

    try:
        with open(config_file, "r") as f:
            config = yaml.safe_load(f)
    except Exception:
        return None

    nodes = config.get("nodes", [])
    for node in nodes:
        extra_mounts = node.get("extraMounts", [])
        for mount in extra_mounts:
            if mount.get("containerPath") == "/var/lib/etcd":
                return mount.get("hostPath")
    return None
def _caddy_cert_backup_file(kind_mount_root: str) -> Path:
    """Host path of the serialized caddy-system Secrets backup."""
    return Path(kind_mount_root) / "caddy-cert-backup" / "caddy-secrets.yaml"


def _etcd_image_ref_path(etcd_path: str) -> Path:
    """Location of the persisted etcd image reference file."""
    return Path(etcd_path).parent / "etcd-image.txt"
def _read_caddy_cert_backup(kind_mount_root: str) -> Optional[str]:
    """Read the caddy cert backup file.


def _capture_etcd_image(cluster_name: str, etcd_path: str) -> bool:
    """Persist the etcd image ref from a running Kind cluster.

    Kind runs etcd as a static pod via containerd inside the node container.
    We query crictl to discover which etcd image the current Kind version
    uses, then write it alongside the etcd backup so future
    ``_clean_etcd_keeping_certs`` calls use a matching version (avoiding
    on-disk format skew between etcd releases).
    The file is written by the in-cluster backup CronJob running as root
    (via kubectl image), so it lands on the host owned by root. Read it
    through an alpine container to sidestep permissions.
    """
    node_name = f"{cluster_name}-control-plane"
    query_cmd = (
        f"docker exec {node_name} crictl images 2>/dev/null "
        "| awk '/etcd/ {print $1\":\"$2; exit}'"
    )
    image_ref = ""
    for _ in range(15):
        result = subprocess.run(query_cmd, shell=True, capture_output=True, text=True)
        image_ref = result.stdout.strip()
        if image_ref:
            break
        time.sleep(1)

    if not image_ref:
        print(f"Warning: could not capture etcd image ref from {node_name}")
        return False

    image_file = _etcd_image_ref_path(etcd_path)
    write_cmd = (
        f"docker run --rm -v {image_file.parent}:/work alpine:3.19 "
        f"sh -c 'echo {shlex.quote(image_ref)} > /work/{image_file.name}'"
    )
    result = subprocess.run(write_cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Warning: failed to write {image_file}: {result.stderr}")
        return False

    if opts.o.debug:
        print(f"Captured etcd image: {image_ref} -> {image_file}")
    return True


def _read_etcd_image_ref(etcd_path: str) -> Optional[str]:
    """Read etcd image ref persisted by a prior cluster create."""
    image_file = _etcd_image_ref_path(etcd_path)
    backup_file = _caddy_cert_backup_file(kind_mount_root)
    read_cmd = (
        f"docker run --rm -v {image_file.parent}:/work:ro alpine:3.19 "
        f"cat /work/{image_file.name}"
        f"docker run --rm -v {backup_file.parent}:/work:ro alpine:3.19 "
        f"sh -c 'test -f /work/{backup_file.name} && cat /work/{backup_file.name} "
        "|| true'"
    )
    result = subprocess.run(read_cmd, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        print(f"Warning: could not read {backup_file}: {result.stderr.strip()}")
        return None
    ref = result.stdout.strip()
    return ref or None
    content = result.stdout
    return content if content.strip() else None


def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
    """Clean persisted etcd, keeping only TLS certificates.
def _restore_caddy_certs(kind_mount_root: Optional[str]) -> None:
    """Restore manager=caddy Secrets from a prior cluster's backup.

    When etcd is persisted and a cluster is recreated, kind tries to install
    resources fresh but they already exist. Instead of trying to delete
    specific stale resources (blacklist), we keep only the valuable data
    (caddy TLS certs) and delete everything else (whitelist approach).

    The etcd image is distroless (no shell), so we extract the statically-linked
    etcdctl binary and run it from alpine which has shell support.

    Returns True if cleanup succeeded, False if no action needed or failed.
    Runs BEFORE the Caddy ingress controller Deployment is applied. Caddy's
    secret_store driver reads existing certs at startup and skips ACME for
    any domain whose cert is already present — so restoring here avoids
    Let's Encrypt calls (and rate limits) on cluster recreate.
    """
    db_path = Path(etcd_path) / "member" / "snap" / "db"
    # Check existence using docker since etcd dir is root-owned
    check_cmd = (
        f"docker run --rm -v {etcd_path}:/etcd:ro alpine:3.19 "
        "test -f /etcd/member/snap/db"
    if not kind_mount_root:
        return
    content = _read_caddy_cert_backup(kind_mount_root)
    if not content:
        if opts.o.debug:
            print(
                f"No caddy cert backup at {_caddy_cert_backup_file(kind_mount_root)}, "
                "skipping restore"
            )
        return

    try:
        backup = yaml.safe_load(content)
    except yaml.YAMLError as e:
        print(f"Warning: invalid caddy cert backup yaml: {e}")
        return

    # `kubectl get -o yaml` wraps results in a List kind.
    items = backup.get("items", []) if isinstance(backup, dict) else []
    if not items:
        if opts.o.debug:
            print("Caddy cert backup has no items, skipping restore")
        return

    # Strip server-managed metadata so the objects apply cleanly on a
    # freshly-minted cluster.
    stale_fields = (
        "resourceVersion",
        "uid",
        "creationTimestamp",
        "selfLink",
        "generation",
        "managedFields",
    )
    check_result = subprocess.run(check_cmd, shell=True, capture_output=True)
    if check_result.returncode != 0:
    for item in items:
        meta = item.get("metadata", {})
        for field in stale_fields:
            meta.pop(field, None)

    core_api = client.CoreV1Api()
    restored = 0
    for item in items:
        name = item.get("metadata", {}).get("name", "<unnamed>")
        try:
            core_api.create_namespaced_secret(namespace="caddy-system", body=item)
            restored += 1
            if opts.o.debug:
                print(f" Restored secret: {name}")
        except ApiException as e:
            if e.status == 409:
                try:
                    core_api.replace_namespaced_secret(
                        name=name, namespace="caddy-system", body=item
                    )
                    restored += 1
                    if opts.o.debug:
                        print(f" Updated secret: {name}")
                except ApiException as e2:
                    print(f"Warning: failed to replace caddy secret {name}: {e2}")
            else:
                print(f"Warning: failed to restore caddy secret {name}: {e}")
    print(f"Restored {restored}/{len(items)} caddy cert secret(s)")


def _install_caddy_cert_backup(
    api_client: client.ApiClient, kind_mount_root: Optional[str]
) -> None:
    """Deploy the CronJob that snapshots manager=caddy secrets to disk."""
    if not kind_mount_root:
        if opts.o.debug:
            print(f"No etcd snapshot at {db_path}, skipping cleanup")
        return False

    etcd_image = _read_etcd_image_ref(etcd_path)
    if not etcd_image:
        print(
            f"Warning: etcd data at {etcd_path} but no image ref file "
            f"({_etcd_image_ref_path(etcd_path)}); skipping cleanup"
            print("No kind-mount-root configured; caddy cert backup disabled")
        return
    manifest = os.path.abspath(
        get_k8s_dir().joinpath(
            "components", "ingress", "caddy-cert-backup.yaml"
        )
        return False

    )
    with open(manifest) as f:
        objects = list(yaml.safe_load_all(f))
    utils.create_from_yaml(api_client, yaml_objects=objects)
    if opts.o.debug:
        print(
            f"Cleaning persisted etcd at {etcd_path} using {etcd_image}, "
            "keeping only TLS certs"
        )

    temp_dir = "/tmp/laconic-etcd-cleanup"

    # Whitelist: prefixes to KEEP - everything else gets deleted.
    # Must include core cluster resources (kubernetes service, kube-system
    # secrets) or kindnet panics on restart — KUBERNETES_SERVICE_HOST is
    # injected from the kubernetes ClusterIP service in default namespace.
    keep_prefixes = [
        "/registry/secrets/caddy-system",
        "/registry/services/specs/default/kubernetes",
        "/registry/services/endpoints/default/kubernetes",
    ]
    keep_prefixes_str = " ".join(keep_prefixes)

    # The etcd image is distroless (no shell). We extract the statically-linked
    # etcdctl binary and run it from alpine which has shell + jq support.
    cleanup_script = f"""
    set -e
    ALPINE_IMAGE="alpine:3.19"

    # Cleanup previous runs
    docker rm -f laconic-etcd-cleanup 2>/dev/null || true
    docker rm -f etcd-extract 2>/dev/null || true
    docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}

    # Create temp dir
    docker run --rm -v /tmp:/tmp $ALPINE_IMAGE mkdir -p {temp_dir}

    # Extract etcdctl binary (it's statically linked)
    docker create --name etcd-extract {etcd_image}
    docker cp etcd-extract:/usr/local/bin/etcdctl /tmp/etcdctl-bin
    docker rm etcd-extract
    docker run --rm -v /tmp/etcdctl-bin:/src:ro -v {temp_dir}:/dst $ALPINE_IMAGE \
        sh -c "cp /src /dst/etcdctl && chmod +x /dst/etcdctl"

    # Copy db to temp location
    docker run --rm \
        -v {etcd_path}:/etcd:ro \
        -v {temp_dir}:/tmp-work \
        $ALPINE_IMAGE cp /etcd/member/snap/db /tmp-work/etcd-snapshot.db

    # Restore snapshot
    docker run --rm -v {temp_dir}:/work {etcd_image} \
        etcdutl snapshot restore /work/etcd-snapshot.db \
        --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null

    # Start temp etcd (runs the etcd binary, no shell needed)
    docker run -d --name laconic-etcd-cleanup \
        -v {temp_dir}/etcd-data:/etcd-data \
        -v {temp_dir}:/backup \
        {etcd_image} etcd \
        --data-dir=/etcd-data \
        --listen-client-urls=http://0.0.0.0:2379 \
        --advertise-client-urls=http://localhost:2379

    sleep 3

    # Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
    # Export whitelisted keys (caddy TLS certs + core cluster services)
    docker run --rm \
        -v {temp_dir}:/backup \
        --network container:laconic-etcd-cleanup \
        $ALPINE_IMAGE sh -c '
        apk add --no-cache jq >/dev/null 2>&1
        echo "[]" > /backup/all-kvs.json
        for prefix in {keep_prefixes_str}; do
            /backup/etcdctl get --prefix "$prefix" -w json 2>/dev/null \
                | jq ".kvs // []" >> /backup/all-kvs.json || true
        done
        jq -s "add" /backup/all-kvs.json \
            | jq "{{kvs: .}}" > /backup/kept.json 2>/dev/null \
            || echo "{{}}" > /backup/kept.json
    '

    # Delete ALL registry keys
    docker run --rm \
        -v {temp_dir}:/backup \
        --network container:laconic-etcd-cleanup \
        $ALPINE_IMAGE /backup/etcdctl del --prefix /registry

    # Restore kept keys using jq
    docker run --rm \
        -v {temp_dir}:/backup \
        --network container:laconic-etcd-cleanup \
        $ALPINE_IMAGE sh -c '
        apk add --no-cache jq >/dev/null 2>&1
        jq -r ".kvs[] | @base64" /backup/kept.json 2>/dev/null | \
        while read encoded; do
            key=$(echo $encoded | base64 -d | jq -r ".key" | base64 -d)
            val=$(echo $encoded | base64 -d | jq -r ".value" | base64 -d)
            echo "$val" | /backup/etcdctl put "$key"
        done
    ' || true

    # Save cleaned snapshot
    docker exec laconic-etcd-cleanup \
        etcdctl snapshot save /etcd-data/cleaned-snapshot.db

    docker stop laconic-etcd-cleanup
    docker rm laconic-etcd-cleanup

    # Restore to temp location first to verify it works
    docker run --rm \
        -v {temp_dir}/etcd-data/cleaned-snapshot.db:/data/db:ro \
        -v {temp_dir}:/restore \
        {etcd_image} \
        etcdutl snapshot restore /data/db --data-dir=/restore/new-etcd \
        --skip-hash-check 2>/dev/null

    # Create timestamped backup of original (kept forever)
    TIMESTAMP=$(date +%Y%m%d-%H%M%S)
    docker run --rm -v {etcd_path}:/etcd $ALPINE_IMAGE \
        cp -a /etcd/member /etcd/member.backup-$TIMESTAMP

    # Replace original with cleaned version
    docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \
        sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member"

    # Cleanup temp files (but NOT the timestamped backup in etcd_path)
    docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
    rm -f /tmp/etcdctl-bin
    """

    result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True)
    if result.returncode != 0:
        if opts.o.debug:
            print(f"Warning: etcd cleanup failed: {result.stderr}")
        return False

    if opts.o.debug:
        print("Cleaned etcd, kept only TLS certificates")
    return True
    print("Installed caddy cert backup CronJob")


def create_cluster(name: str, config_file: str):
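A quick way to confirm the restore path worked after a cluster recreate, mirroring what the smoke test asserts further below (the `value` data key matches the test's seeded secret; real Caddy cert Secrets may use different keys, so `SECRET_NAME` and the key are illustrative):

```bash
# Sketch: after `kind delete` + recreate, check certs came back before ACME ran.
kubectl get secret -n caddy-system -l manager=caddy    # should list the restored certs

# Round-trip decode one value (SECRET_NAME is whichever cert Secret you expect).
kubectl get secret "$SECRET_NAME" -n caddy-system -o jsonpath='{.data.value}' | base64 -d
```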
@@ -369,21 +234,10 @@ def create_cluster(name: str, config_file: str):
         print(f"Using existing cluster: {existing}")
         return existing
 
-    # Clean persisted etcd, keeping only TLS certificates
-    etcd_path = _get_etcd_host_path_from_kind_config(config_file)
-    if etcd_path:
-        _clean_etcd_keeping_certs(etcd_path)
-
     print(f"Creating new cluster: {name}")
     result = _run_command(f"kind create cluster --name {name} --config {config_file}")
     if result.returncode != 0:
         raise DeployerException(f"kind create cluster failed: {result}")
 
-    # Persist the etcd image ref so future _clean_etcd_keeping_certs calls
-    # use a version that matches the on-disk format kind is writing now.
-    if etcd_path:
-        _capture_etcd_image(name, etcd_path)
-
     return name
@@ -439,7 +293,9 @@ def wait_for_ingress_in_kind():
         error_exit("ERROR: Timed out waiting for Caddy ingress to become ready")
 
 
-def install_ingress_for_kind(acme_email: str = ""):
+def install_ingress_for_kind(
+    acme_email: str = "", kind_mount_root: Optional[str] = None
+):
     api_client = client.ApiClient()
     ingress_install = os.path.abspath(
         get_k8s_dir().joinpath(
@@ -458,9 +314,23 @@ def install_ingress_for_kind(acme_email: str = ""):
     if opts.o.debug:
         print(f"Configured Caddy with ACME email: {acme_email}")
 
-    # Apply templated YAML
     yaml_objects = list(yaml.safe_load_all(yaml_content))
-    utils.create_from_yaml(api_client, yaml_objects=yaml_objects)
 
+    # Split: apply everything except the Caddy controller Deployment first,
+    # so the namespace + secrets exist before the pod can start and read its
+    # secret_store. Race-free: Caddy has no way to see the cluster until
+    # its Deployment object is created in Phase 3.
+    def _is_caddy_deployment(o):
+        return (
+            o.get("kind") == "Deployment"
+            and o.get("metadata", {}).get("name") == "caddy-ingress-controller"
+        )
+
+    pre_deployment = [o for o in yaml_objects if not _is_caddy_deployment(o)]
+    caddy_deployment = [o for o in yaml_objects if _is_caddy_deployment(o)]
+
+    # Phase 1: namespace, SA, RBAC, ConfigMap, Service, IngressClass
+    utils.create_from_yaml(api_client, yaml_objects=pre_deployment)
+
     # Patch ConfigMap with ACME email if provided
     if acme_email:
@@ -477,6 +347,16 @@ def install_ingress_for_kind(acme_email: str = ""):
             body=configmap,
         )
 
+    # Phase 2: restore caddy cert secrets before Caddy can start
+    _restore_caddy_certs(kind_mount_root)
+
+    # Phase 3: start Caddy (reads restored secrets on startup)
+    utils.create_from_yaml(api_client, yaml_objects=caddy_deployment)
+
+    # Install the backup CronJob last — it targets the same namespace and
+    # depends on nothing in the Caddy Deployment.
+    _install_caddy_cert_backup(api_client, kind_mount_root)
+
 
 def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]):
     for image in image_set:
@@ -676,29 +556,6 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context):
     seen_host_path_mounts = set()  # Track to avoid duplicate mounts
     kind_mount_root = deployment_context.spec.get_kind_mount_root()
 
-    # Cluster state backup for offline data recovery (unique per deployment)
-    # etcd contains all k8s state; PKI certs needed to decrypt etcd offline
-    deployment_id = deployment_context.id
-    backup_subdir = f"cluster-backups/{deployment_id}"
-
-    etcd_host_path = _make_absolute_host_path(
-        Path(f"./data/{backup_subdir}/etcd"), deployment_dir
-    )
-    volume_definitions.append(
-        f"  - hostPath: {etcd_host_path}\n"
-        f"    containerPath: /var/lib/etcd\n"
-        f"    propagation: HostToContainer\n"
-    )
-
-    pki_host_path = _make_absolute_host_path(
-        Path(f"./data/{backup_subdir}/pki"), deployment_dir
-    )
-    volume_definitions.append(
-        f"  - hostPath: {pki_host_path}\n"
-        f"    containerPath: /etc/kubernetes/pki\n"
-        f"    propagation: HostToContainer\n"
-    )
-
     # When kind-mount-root is set, emit a single extraMount for the root.
     # Individual volumes whose host path starts with the root are covered
     # by this single mount and don't need their own extraMount entries.
@@ -83,6 +83,9 @@ assert_no_labeled_resources () {
 TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
 # Set a non-default repo dir
 export CERC_REPO_BASE_DIR=~/stack-orchestrator-test/repo-base-dir
+# kind-mount-root is cluster-level infra, independent of any deployment dir.
+# Previous runs' CronJob wrote files here as root via the Kind bind mount.
+export KIND_MOUNT_ROOT=~/stack-orchestrator-test/kind-mount
 echo "Testing this package: $TEST_TARGET_SO"
 echo "Test version command"
 reported_version_string=$( $TEST_TARGET_SO version )
@@ -90,6 +93,8 @@ echo "Version reported is: ${reported_version_string}"
 echo "Cloning repositories into: $CERC_REPO_BASE_DIR"
 rm -rf $CERC_REPO_BASE_DIR
 mkdir -p $CERC_REPO_BASE_DIR
+sudo rm -rf $KIND_MOUNT_ROOT
+mkdir -p $KIND_MOUNT_ROOT
 $TEST_TARGET_SO --stack test setup-repositories
 $TEST_TARGET_SO --stack test build-containers
 # Test basic stack-orchestrator deploy to k8s
@@ -107,6 +112,9 @@ echo "deploy init test: passed"
 # Switch to a full path for bind mount.
 sed -i "s|^\(\s*test-data-bind:$\)$|\1 ${test_deployment_dir}/data/test-data-bind|" $test_deployment_spec
 
+# Enable caddy cert backup by setting kind-mount-root.
+echo "kind-mount-root: $KIND_MOUNT_ROOT" >> $test_deployment_spec
+
 $TEST_TARGET_SO --stack test deploy create --spec-file $test_deployment_spec --deployment-dir $test_deployment_dir
 # Check the deployment dir exists
 if [ ! -d "$test_deployment_dir" ]; then
@@ -148,6 +156,16 @@ echo "deploy create output file test: passed"
 # because 'start' defaults to --skip-cluster-management)
 $TEST_TARGET_SO deployment --dir $test_deployment_dir start --perform-cluster-management
 wait_for_pods_started
+
+# Caddy cert backup install: CronJob + RBAC should exist in caddy-system
+for kind in serviceaccount role rolebinding cronjob; do
+  if ! kubectl get $kind caddy-cert-backup -n caddy-system >/dev/null 2>&1; then
+    echo "caddy-cert-backup $kind install test: FAILED"
+    cleanup_and_exit
+  fi
+done
+echo "caddy-cert-backup install test: passed"
+
 # Check logs command works
 wait_for_log_output
 sleep 1
@@ -292,7 +310,69 @@ else
   cleanup_and_exit
 fi
 
-# Full teardown: --delete-namespace nukes the namespace after labeled cleanup.
+# --- Caddy cert backup/restore E2E ---
+# Seed a fake cert secret in caddy-system (simulates an LE-issued cert).
+fake_cert_name="caddy.ingress--certificates.test-domain.test-domain.crt"
+fake_cert_value="fake-cert-$(date +%s)"
+kubectl create secret generic "$fake_cert_name" \
+  -n caddy-system \
+  --from-literal=value="$fake_cert_value"
+kubectl label secret "$fake_cert_name" -n caddy-system manager=caddy
+
+# Trigger the CronJob immediately (it fires every 5min on its own).
+kubectl create job --from=cronjob/caddy-cert-backup \
+  caddy-cert-backup-manual -n caddy-system
+if ! kubectl wait --for=condition=complete \
+  job/caddy-cert-backup-manual -n caddy-system --timeout=120s; then
+  echo "caddy cert backup job test: FAILED (job did not complete)"
+  echo "--- job description ---"
+  kubectl describe job/caddy-cert-backup-manual -n caddy-system || true
+  echo "--- pod list ---"
+  kubectl get pod -n caddy-system -l job-name=caddy-cert-backup-manual -o wide || true
+  echo "--- pod logs ---"
+  kubectl logs -n caddy-system -l job-name=caddy-cert-backup-manual --tail=200 || true
+  cleanup_and_exit
+fi
+
+# Backup file is root-owned (CronJob writes as root via kind bind mount).
+# The secret's data.value is base64-encoded in YAML output, so assert on
+# the secret name (which is plaintext in metadata). Value correctness is
+# verified in the restore phase after a round-trip decode.
+backup_file=$KIND_MOUNT_ROOT/caddy-cert-backup/caddy-secrets.yaml
+if ! sudo test -f "$backup_file"; then
+  echo "caddy cert backup file test: FAILED (missing $backup_file)"
+  cleanup_and_exit
+fi
+if ! sudo grep -q "$fake_cert_name" "$backup_file"; then
+  echo "caddy cert backup content test: FAILED (seeded secret not in backup)"
+  sudo head -50 "$backup_file" || true
+  cleanup_and_exit
+fi
+echo "caddy cert backup write test: passed"
+
+# Full teardown including Kind cluster — --perform-cluster-management on stop
+# destroys the cluster, simulating the "recreate from scratch" scenario.
+$TEST_TARGET_SO deployment --dir $test_deployment_dir \
+  stop --delete-volumes --delete-namespace --perform-cluster-management
+
+# Recreate: new Kind cluster, Caddy install should restore from backup BEFORE
+# the Caddy Deployment pod starts.
+$TEST_TARGET_SO deployment --dir $test_deployment_dir start --perform-cluster-management
+wait_for_pods_started
+
+if ! kubectl get secret "$fake_cert_name" -n caddy-system >/dev/null 2>&1; then
+  echo "caddy cert restore test: FAILED (secret missing from new cluster)"
+  cleanup_and_exit
+fi
+restored_value=$(kubectl get secret "$fake_cert_name" -n caddy-system \
+  -o jsonpath='{.data.value}' | base64 -d)
+if [ "$restored_value" != "$fake_cert_value" ]; then
+  echo "caddy cert restore test: FAILED (value mismatch: '$restored_value')"
+  cleanup_and_exit
+fi
+echo "caddy cert restore test: passed"
+
+# Final teardown: --delete-namespace nukes the namespace after labeled cleanup.
 # Verify the namespace is actually gone.
 $TEST_TARGET_SO deployment --dir $test_deployment_dir \
   stop --delete-volumes --delete-namespace --skip-cluster-management