---
# Recover agave validator from any state to healthy
#
# This playbook is idempotent — it assesses current state and picks up
# from wherever the system is. Each step checks its precondition and
# skips if already satisfied.
#
# Steps:
#  1. Scale deployment to 0
#  2. Wait for pods to terminate (and fail fast on agave zombie processes)
#  3. Wipe accounts ramdisk
#  4. Clean old snapshots
#  5. Scale to 1 — container entrypoint downloads snapshot + starts validator
#  6. Wait for pod Running
#  7. Wait for snapshot download to complete
#  8. Verify snapshot freshness
#  9. Wait for validator log
# 10. Check RPC health
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml

- name: Recover agave validator
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config
  vars:
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    snapshot_dir: /srv/kind/solana/snapshots
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk
    # Mainnet RPC for slot comparison
    mainnet_rpc: https://api.mainnet-beta.solana.com
    # Maximum slots behind before snapshot is considered stale
    max_slot_lag: 20000

  tasks:
    # ---- step 1: scale to 0 ---------------------------------------------------
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -o jsonpath='{.spec.replicas}'
      register: current_replicas
      failed_when: false
      changed_when: false

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      when: current_replicas.stdout | default('0') | int > 0
      changed_when: true

    # ---- step 2: wait for pods to terminate ------------------------------------
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_remaining
      retries: 60
      delay: 5
      until: pods_remaining.stdout == "[]" or pods_remaining.stdout == ""
      changed_when: false
      when: current_replicas.stdout | default('0') | int > 0

    # pgrep exits 0 only when at least one matching process exists, so rc == 0
    # below means a validator process survived pod termination.
    - name: Verify no agave processes in kind node (io_uring safety check)
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      failed_when: false
      changed_when: false

    - name: Fail if agave zombie detected
      ansible.builtin.fail:
        msg: >-
          agave-validator process still running inside kind node after pod
          termination. This is the io_uring/ZFS deadlock. Do NOT proceed —
          host reboot required. See CLAUDE.md.
      when: agave_procs.rc == 0

    # ---- step 3: wipe accounts ramdisk -----------------------------------------
    # Cannot umount+remount because the kind node's bind mount holds it open.
    # Deleting contents is required here (slower than remount but the only
    # option). find -mindepth 1 -delete is used instead of `rm -rf dir/*`
    # because the glob would silently skip dot-prefixed entries.
    - name: Wipe accounts data
      ansible.builtin.shell: |
        find {{ accounts_dir }} -mindepth 1 -delete
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      changed_when: true

    # ---- step 4: clean old snapshots -------------------------------------------
    - name: Remove all old snapshots
      ansible.builtin.shell: >
        rm -f {{ snapshot_dir }}/*.tar.* {{ snapshot_dir }}/*.tar
      become: true
      changed_when: true

    # ---- step 5: scale to 1 — entrypoint handles snapshot download ------------
    # The container's entrypoint.py checks snapshot freshness, cleans stale
    # snapshots, downloads fresh ones (with rolling incremental convergence),
    # then starts the validator. No host-side download needed.
    - name: Scale deployment to 1
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true

    # ---- step 6: wait for pod running ------------------------------------------
    # The entrypoint downloads the snapshot before starting the validator.
    # The pod reaches Running immediately (entrypoint is PID 1), but the
    # validator log won't appear until download + startup completes.
- name: Wait for pod to be running ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -l app={{ deployment_name }} -o jsonpath='{.items[0].status.phase}' register: pod_status retries: 60 delay: 10 until: pod_status.stdout == "Running" changed_when: false # ---- step 7: wait for snapshot download to complete ----------------------- # The entrypoint writes the snapshot to the PV. Wait for it to appear # on the host (zvol mount is shared). - name: Wait for snapshot file to appear ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1 args: executable: /bin/bash register: snapshot_file retries: 180 delay: 20 until: snapshot_file.stdout != "" changed_when: false # ---- step 8: verify snapshot freshness ------------------------------------ - name: Get snapshot filename ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename args: executable: /bin/bash register: snapshot_filename changed_when: false - name: Extract snapshot slot from filename ansible.builtin.set_fact: snapshot_slot: "{{ snapshot_filename.stdout | regex_search('snapshot-([0-9]+)-', '\\1') | first }}" - name: Get current mainnet slot ansible.builtin.uri: url: "{{ mainnet_rpc }}" method: POST body_format: json body: jsonrpc: "2.0" id: 1 method: getSlot params: - commitment: finalized return_content: true register: mainnet_slot_response - name: Report snapshot freshness ansible.builtin.debug: msg: >- Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }}, {{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind. 
# ---- step 9: wait for validator log --------------------------------------- - name: Wait for validator log file ansible.builtin.command: > kubectl exec -n {{ k8s_namespace }} deployment/{{ deployment_name }} -c agave-validator -- test -f /data/log/validator.log register: log_file_check retries: 30 delay: 20 until: log_file_check.rc == 0 changed_when: false # ---- step 10: check RPC health -------------------------------------------- - name: Check RPC health (non-blocking) ansible.builtin.uri: url: http://{{ inventory_hostname }}:8899/health return_content: true register: rpc_health retries: 6 delay: 30 until: rpc_health.status == 200 failed_when: false - name: Report final status ansible.builtin.debug: msg: >- Recovery complete. Snapshot: slot {{ snapshot_slot }} ({{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind). Pod: {{ pod_status.stdout }}. Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet' }}. RPC: {{ rpc_health.content | default('not yet responding — still catching up') }}.