---
# Recover agave validator from any state to healthy
#
# This playbook is idempotent — it assesses current state and picks up
# from wherever the system is. Each step checks its precondition and
# skips if already satisfied.
#
# Steps:
#   1. Scale deployment to 0
#   2. Wait for pods to terminate (io_uring safety check)
#   3. Wipe accounts ramdisk
#   4. Clean old snapshots
#   5. Ensure terminationGracePeriodSeconds is 300 (for graceful shutdown)
#   6. Scale to 1 — container entrypoint downloads snapshot + starts validator
#
# The playbook exits after step 6. The container handles snapshot download
# (60+ min) and validator startup autonomously. Monitor with:
#   scripts/check-status.py --watch
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml

- name: Recover agave validator
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config
  vars:
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    snapshot_dir: /srv/kind/solana/snapshots
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk

  tasks:
    # ---- step 1: scale to 0 ---------------------------------------------------
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }} -o jsonpath='{.spec.replicas}'
      register: current_replicas
      failed_when: false
      changed_when: false

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      when: current_replicas.stdout | default('0') | int > 0
      changed_when: true

    # ---- step 2: wait for pods to terminate ------------------------------------
    # Unconditional (no `when` on current replicas): a prior run may have
    # scaled to 0 and died while pods were still terminating. The check is
    # read-only and converges immediately when no pods exist, so running it
    # every time keeps the playbook truly idempotent.
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }} -o jsonpath='{.items}'
      register: pods_remaining
      retries: 60
      delay: 5
      until: pods_remaining.stdout == "[]" or pods_remaining.stdout == ""
      changed_when: false

    # pgrep exits 0 only when at least one matching process exists, so
    # rc == 0 below means a zombie agave-validator survives pod teardown.
    - name: Verify no agave processes in kind node (io_uring safety check)
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      failed_when: false
      changed_when: false

    - name: Fail if agave zombie detected
      ansible.builtin.fail:
        msg: >-
          agave-validator process still running inside kind node after pod
          termination. This is the io_uring/ZFS deadlock. Do NOT proceed —
          host reboot required. See CLAUDE.md.
      when: agave_procs.rc == 0

    # ---- step 3: wipe accounts ramdisk -----------------------------------------
    # Cannot umount+remount because the kind node's bind mount holds it open.
    # rm -rf is required here (slower than remount but the only option).
    - name: Wipe accounts data
      ansible.builtin.shell: |
        rm -rf {{ accounts_dir }}/*
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      changed_when: true

    # ---- step 4: clean old snapshots -------------------------------------------
    - name: Remove all old snapshots
      ansible.builtin.shell: rm -f {{ snapshot_dir }}/*.tar.* {{ snapshot_dir }}/*.tar
      become: true
      changed_when: true

    # ---- step 5: ensure terminationGracePeriodSeconds -------------------------
    # laconic-so doesn't support this declaratively. Patch the deployment so
    # k8s gives the entrypoint 300s to perform graceful shutdown via admin RPC.
    # kubectl patch reports "(no change)" in stdout when the field already
    # matches, which is how changed_when detects idempotent no-ops.
    - name: Ensure terminationGracePeriodSeconds is 300
      ansible.builtin.command: >
        kubectl patch deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
      register: patch_result
      changed_when: "'no change' not in patch_result.stdout"

    # ---- step 6: scale to 1 — entrypoint handles snapshot download ------------
    # The container's entrypoint.py checks snapshot freshness, cleans stale
    # snapshots, downloads fresh ones (with rolling incremental convergence),
    # then starts the validator. No host-side download needed.
    - name: Scale deployment to 1
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true

    - name: Report
      ansible.builtin.debug:
        msg: >-
          Recovery initiated. The container entrypoint will download a fresh
          snapshot and start the validator. Monitor progress with:
          scripts/check-status.py --watch