From 52fab97e9b139196da8b6d0438c028ca28055d41 Mon Sep 17 00:00:00 2001
From: Prathamesh Musale
Date: Fri, 17 Apr 2026 09:19:26 +0000
Subject: [PATCH] so-o2o: use alpine/kubectl; surface diagnostics on job timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI test hit 'timed out waiting for the condition on
jobs/caddy-cert-backup-manual'. Root cause: bitnami/kubectl runs as
uid 1001 by default, but the hostPath mount target
(/mnt/caddy-cert-backup on the kind node → /srv/kind/... on the host)
is root-owned because kind creates bind-mounted dirs via the docker
daemon. The pod couldn't write its output.

Switch to alpine/kubectl:1.35.3 which runs as root by default and is
smaller (faster pull in CI). Bump wait timeout to 120s as a cushion.
Dump describe/pod-list/logs on timeout so future failures are
debuggable from log output alone.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../components/ingress/caddy-cert-backup.yaml |  2 +-
 tests/k8s-deploy/run-deploy-test.sh           | 21 +++++++++++++++----
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/stack_orchestrator/data/k8s/components/ingress/caddy-cert-backup.yaml b/stack_orchestrator/data/k8s/components/ingress/caddy-cert-backup.yaml
index 1b68f03d..7a81e3f6 100644
--- a/stack_orchestrator/data/k8s/components/ingress/caddy-cert-backup.yaml
+++ b/stack_orchestrator/data/k8s/components/ingress/caddy-cert-backup.yaml
@@ -76,7 +76,7 @@ spec:
               operator: Equal
           containers:
             - name: backup
-              image: bitnami/kubectl:latest
+              image: alpine/kubectl:1.35.3
               command:
                 - sh
                 - -c
diff --git a/tests/k8s-deploy/run-deploy-test.sh b/tests/k8s-deploy/run-deploy-test.sh
index bf9c613f..f16462ac 100755
--- a/tests/k8s-deploy/run-deploy-test.sh
+++ b/tests/k8s-deploy/run-deploy-test.sh
@@ -322,17 +322,30 @@ kubectl label secret "$fake_cert_name" -n caddy-system manager=caddy
 # Trigger the CronJob immediately (it fires every 5min on its own).
 kubectl create job --from=cronjob/caddy-cert-backup \
   caddy-cert-backup-manual -n caddy-system
-kubectl wait --for=condition=complete \
-  job/caddy-cert-backup-manual -n caddy-system --timeout=60s
+if ! kubectl wait --for=condition=complete \
+  job/caddy-cert-backup-manual -n caddy-system --timeout=120s; then
+    echo "caddy cert backup job test: FAILED (job did not complete)"
+    echo "--- job description ---"
+    kubectl describe job/caddy-cert-backup-manual -n caddy-system || true
+    echo "--- pod list ---"
+    kubectl get pod -n caddy-system -l job-name=caddy-cert-backup-manual -o wide || true
+    echo "--- pod logs ---"
+    kubectl logs -n caddy-system -l job-name=caddy-cert-backup-manual --tail=200 || true
+    cleanup_and_exit
+fi
 
 # Backup file is root-owned (CronJob writes as root via kind bind mount).
+# The secret's data.value is base64-encoded in YAML output, so assert on
+# the secret name (which is plaintext in metadata). Value correctness is
+# verified in the restore phase after a round-trip decode.
 backup_file=$KIND_MOUNT_ROOT/caddy-cert-backup/caddy-secrets.yaml
 if ! sudo test -f "$backup_file"; then
     echo "caddy cert backup file test: FAILED (missing $backup_file)"
     cleanup_and_exit
 fi
-if ! sudo grep -q "$fake_cert_value" "$backup_file"; then
-    echo "caddy cert backup content test: FAILED (value not found in backup)"
+if ! sudo grep -q "$fake_cert_name" "$backup_file"; then
+    echo "caddy cert backup content test: FAILED (seeded secret not in backup)"
+    sudo head -50 "$backup_file" || true
     cleanup_and_exit
 fi
 echo "caddy cert backup write test: passed"