# stack-orchestrator/playbooks/biscayne-recover.yml
---
# Recover agave validator from any state to healthy
#
# This playbook is idempotent — it assesses current state and picks up
# from wherever the system is. Each step checks its precondition and
# skips if already satisfied.
#
# Steps:
#   1. Scale deployment to 0
#   2. Wait for pods to terminate
#   3. Wipe accounts ramdisk
#   4. Clean old snapshots
#   5. Scale to 1 — container entrypoint downloads snapshot + starts validator
#   6. Verify snapshot freshness
#   7. Wait for pod Running
#   8. Verify validator log
#   9. Check RPC health
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
#
- name: Recover agave validator
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config
  vars:
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    snapshot_dir: /srv/kind/solana/snapshots
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk
    # Mainnet RPC for slot comparison
    mainnet_rpc: https://api.mainnet-beta.solana.com
    # Maximum slots behind before snapshot is considered stale
    # NOTE(review): not referenced by any task in this playbook — presumably
    # consumed by the container entrypoint or a sibling playbook; confirm.
    max_slot_lag: 20000
  tasks:
    # ---- step 1: scale to 0 ---------------------------------------------------
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -o jsonpath='{.spec.replicas}'
      register: current_replicas
      # Tolerate a missing deployment: downstream `when` treats that as 0 replicas.
      failed_when: false
      changed_when: false

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      when: current_replicas.stdout | default('0') | int > 0
      changed_when: true

    # ---- step 2: wait for pods to terminate ------------------------------------
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_remaining
      retries: 60
      delay: 5
      # Empty item list renders as "[]" (or "" on some kubectl versions).
      until: pods_remaining.stdout == "[]" or pods_remaining.stdout == ""
      changed_when: false
      when: current_replicas.stdout | default('0') | int > 0

    - name: Verify no agave processes in kind node (io_uring safety check)
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      # pgrep exits 0 only when at least one process matched; non-zero is the
      # healthy case here, so never fail the task itself.
      failed_when: false
      changed_when: false

    - name: Fail if agave zombie detected
      ansible.builtin.fail:
        msg: >-
          agave-validator process still running inside kind node after pod
          termination. This is the io_uring/ZFS deadlock. Do NOT proceed —
          host reboot required. See CLAUDE.md.
      when: agave_procs.rc == 0

    # ---- step 3: wipe accounts ramdisk -----------------------------------------
    # Cannot umount+remount because the kind node's bind mount holds it open.
    # rm -rf is required here (slower than remount but the only option).
    - name: Wipe accounts data
      ansible.builtin.shell: |
        rm -rf {{ accounts_dir }}/*
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      changed_when: true

    # ---- step 4: clean old snapshots -------------------------------------------
    - name: Remove all old snapshots
      ansible.builtin.shell: rm -f {{ snapshot_dir }}/*.tar.* {{ snapshot_dir }}/*.tar
      become: true
      changed_when: true

    # ---- step 5: scale to 1 — entrypoint handles snapshot download ------------
    # The container's entrypoint.py checks snapshot freshness, cleans stale
    # snapshots, downloads fresh ones (with rolling incremental convergence),
    # then starts the validator. No host-side download needed.
    - name: Scale deployment to 1
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true

    # ---- step 6: wait for pod running ------------------------------------------
    # The entrypoint downloads the snapshot before starting the validator.
    # The pod reaches Running immediately (entrypoint is PID 1), but the
    # validator log won't appear until download + startup completes.
    - name: Wait for pod to be running
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items[0].status.phase}'
      register: pod_status
      retries: 60
      delay: 10
      until: pod_status.stdout == "Running"
      changed_when: false

    # ---- step 7: wait for snapshot download to complete -----------------------
    # The entrypoint writes the snapshot to the PV. Wait for it to appear
    # on the host (zvol mount is shared).
    - name: Wait for snapshot file to appear
      # pipefail makes the task fail while no file matches; the until/retries
      # loop keeps polling until a snapshot shows up (max ~60 min).
      ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1
      args:
        executable: /bin/bash
      register: snapshot_file
      retries: 180
      delay: 20
      until: snapshot_file.stdout != ""
      changed_when: false

    # ---- step 8: verify snapshot freshness ------------------------------------
    - name: Get snapshot filename
      ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
      args:
        executable: /bin/bash
      register: snapshot_filename
      changed_when: false

    - name: Extract snapshot slot from filename
      # Filenames look like snapshot-<slot>-<hash>.tar.zst; capture the slot.
      ansible.builtin.set_fact:
        snapshot_slot: "{{ snapshot_filename.stdout | regex_search('snapshot-([0-9]+)-', '\\1') | first }}"

    - name: Get current mainnet slot
      ansible.builtin.uri:
        url: "{{ mainnet_rpc }}"
        method: POST
        body_format: json
        body:
          jsonrpc: "2.0"
          id: 1
          method: getSlot
          params:
            - commitment: finalized
        return_content: true
      register: mainnet_slot_response

    - name: Report snapshot freshness
      ansible.builtin.debug:
        msg: >-
          Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }},
          {{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind.

    # ---- step 9: wait for validator log ---------------------------------------
    - name: Wait for validator log file
      ansible.builtin.command: >
        kubectl exec -n {{ k8s_namespace }}
        deployment/{{ deployment_name }}
        -c agave-validator -- test -f /data/log/validator.log
      register: log_file_check
      retries: 30
      delay: 20
      until: log_file_check.rc == 0
      changed_when: false

    # ---- step 10: check RPC health --------------------------------------------
    - name: Check RPC health (non-blocking)
      ansible.builtin.uri:
        url: http://{{ inventory_hostname }}:8899/health
        return_content: true
      register: rpc_health
      retries: 6
      delay: 30
      until: rpc_health.status == 200
      # Non-blocking: the validator may still be catching up; report below.
      failed_when: false

    - name: Report final status
      ansible.builtin.debug:
        msg: >-
          Recovery complete.
          Snapshot: slot {{ snapshot_slot }}
          ({{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind).
          Pod: {{ pod_status.stdout }}.
          Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet' }}.
          RPC: {{ rpc_health.content | default('not yet responding — still catching up') }}.