fix: recovery playbook delegates snapshot download to container entrypoint
The container's entrypoint.py already handles snapshot freshness checks, cleanup, download (with rolling incremental convergence), and validator startup. Remove the host-side download and let the container do the work.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

fix/kind-mount-propagation
parent
f842aba56a
commit
3dc345ea7d
|
|
@ -10,19 +10,14 @@
|
||||||
# 2. Wait for pods to terminate
|
# 2. Wait for pods to terminate
|
||||||
# 3. Wipe accounts ramdisk
|
# 3. Wipe accounts ramdisk
|
||||||
# 4. Clean old snapshots
|
# 4. Clean old snapshots
|
||||||
# 5. Download fresh snapshot via aria2c
|
# 5. Scale to 1 — container entrypoint downloads snapshot + starts validator
|
||||||
# 6. Verify snapshot accessible via PV (kubectl)
|
# 6. Wait for pod Running
|
||||||
# 7. Scale deployment to 1
|
# 7. Wait for snapshot download to complete
|
||||||
# 8. Wait for pod Running
|
# 8. Verify snapshot freshness
|
||||||
# 9. Verify validator log shows snapshot unpacking
|
# 9. Wait for validator log
|
||||||
# 10. Check RPC health
|
# 10. Check RPC health
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-recover.yml
|
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
|
||||||
#
|
|
||||||
# # Pass extra args to snapshot-download.py
|
|
||||||
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-recover.yml \
|
|
||||||
# -e 'snapshot_args=--version 2.2'
|
|
||||||
#
|
#
|
||||||
- name: Recover agave validator
|
- name: Recover agave validator
|
||||||
hosts: all
|
hosts: all
|
||||||
|
|
@ -36,9 +31,6 @@
|
||||||
snapshot_dir: /srv/kind/solana/snapshots
|
snapshot_dir: /srv/kind/solana/snapshots
|
||||||
accounts_dir: /srv/kind/solana/ramdisk/accounts
|
accounts_dir: /srv/kind/solana/ramdisk/accounts
|
||||||
ramdisk_mount: /srv/kind/solana/ramdisk
|
ramdisk_mount: /srv/kind/solana/ramdisk
|
||||||
snapshot_script_local: "{{ playbook_dir }}/../scripts/agave-container/snapshot_download.py"
|
|
||||||
snapshot_script: /tmp/snapshot-download.py
|
|
||||||
snapshot_args: ""
|
|
||||||
# Mainnet RPC for slot comparison
|
# Mainnet RPC for slot comparison
|
||||||
mainnet_rpc: https://api.mainnet-beta.solana.com
|
mainnet_rpc: https://api.mainnet-beta.solana.com
|
||||||
# Maximum slots behind before snapshot is considered stale
|
# Maximum slots behind before snapshot is considered stale
|
||||||
|
|
@ -107,32 +99,45 @@
|
||||||
become: true
|
become: true
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
# ---- step 5: download fresh snapshot ---------------------------------------
|
# ---- step 5: scale to 1 — entrypoint handles snapshot download ------------
|
||||||
- name: Verify aria2c installed
|
# The container's entrypoint.py checks snapshot freshness, cleans stale
|
||||||
ansible.builtin.command: which aria2c
|
# snapshots, downloads fresh ones (with rolling incremental convergence),
|
||||||
changed_when: false
|
# then starts the validator. No host-side download needed.
|
||||||
|
- name: Scale deployment to 1
|
||||||
- name: Copy snapshot script to remote
|
ansible.builtin.command: >
|
||||||
ansible.builtin.copy:
|
kubectl scale deployment {{ deployment_name }}
|
||||||
src: "{{ snapshot_script_local }}"
|
-n {{ k8s_namespace }} --replicas=1
|
||||||
dest: "{{ snapshot_script }}"
|
|
||||||
mode: "0755"
|
|
||||||
|
|
||||||
- name: Download snapshot and scale to 1
|
|
||||||
ansible.builtin.shell: |
|
|
||||||
python3 {{ snapshot_script }} \
|
|
||||||
-o {{ snapshot_dir }} \
|
|
||||||
--max-snapshot-age {{ max_slot_lag }} \
|
|
||||||
--max-latency 500 \
|
|
||||||
{{ snapshot_args }} \
|
|
||||||
&& KUBECONFIG=/home/rix/.kube/config kubectl scale deployment \
|
|
||||||
{{ deployment_name }} -n {{ k8s_namespace }} --replicas=1
|
|
||||||
become: true
|
|
||||||
register: snapshot_result
|
|
||||||
timeout: 3600
|
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
# ---- step 6: verify snapshot accessible via PV -----------------------------
|
# ---- step 6: wait for pod running ------------------------------------------
|
||||||
|
# The entrypoint downloads the snapshot before starting the validator.
|
||||||
|
# The pod reaches Running immediately (entrypoint is PID 1), but the
|
||||||
|
# validator log won't appear until download + startup completes.
|
||||||
|
- name: Wait for pod to be running
|
||||||
|
ansible.builtin.command: >
|
||||||
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
|
-l app={{ deployment_name }}
|
||||||
|
-o jsonpath='{.items[0].status.phase}'
|
||||||
|
register: pod_status
|
||||||
|
retries: 60
|
||||||
|
delay: 10
|
||||||
|
until: pod_status.stdout == "Running"
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
|
# ---- step 7: wait for snapshot download to complete -----------------------
|
||||||
|
# The entrypoint writes the snapshot to the PV. Wait for it to appear
|
||||||
|
# on the host (zvol mount is shared).
|
||||||
|
- name: Wait for snapshot file to appear
|
||||||
|
ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1
|
||||||
|
args:
|
||||||
|
executable: /bin/bash
|
||||||
|
register: snapshot_file
|
||||||
|
retries: 180
|
||||||
|
delay: 20
|
||||||
|
until: snapshot_file.stdout != ""
|
||||||
|
changed_when: false
|
||||||
|
|
||||||
|
# ---- step 8: verify snapshot freshness ------------------------------------
|
||||||
- name: Get snapshot filename
|
- name: Get snapshot filename
|
||||||
ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
|
ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
|
||||||
args:
|
args:
|
||||||
|
|
@ -158,48 +163,25 @@
|
||||||
return_content: true
|
return_content: true
|
||||||
register: mainnet_slot_response
|
register: mainnet_slot_response
|
||||||
|
|
||||||
- name: Check snapshot freshness
|
|
||||||
ansible.builtin.fail:
|
|
||||||
msg: >-
|
|
||||||
Snapshot too old: slot {{ snapshot_slot }}, mainnet at
|
|
||||||
{{ mainnet_slot_response.json.result }},
|
|
||||||
{{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind
|
|
||||||
(max {{ max_slot_lag }}).
|
|
||||||
when: (mainnet_slot_response.json.result | int - snapshot_slot | int) > max_slot_lag
|
|
||||||
|
|
||||||
- name: Report snapshot freshness
|
- name: Report snapshot freshness
|
||||||
ansible.builtin.debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }},
|
Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }},
|
||||||
{{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind.
|
{{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind.
|
||||||
|
|
||||||
# ---- step 7: scale already done in download step above ----------------------
|
# ---- step 9: wait for validator log ---------------------------------------
|
||||||
|
|
||||||
# ---- step 8: wait for pod running ------------------------------------------
|
|
||||||
- name: Wait for pod to be running
|
|
||||||
ansible.builtin.command: >
|
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
|
||||||
-l app={{ deployment_name }}
|
|
||||||
-o jsonpath='{.items[0].status.phase}'
|
|
||||||
register: pod_status
|
|
||||||
retries: 60
|
|
||||||
delay: 10
|
|
||||||
until: pod_status.stdout == "Running"
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# ---- step 9: verify validator log ------------------------------------------
|
|
||||||
- name: Wait for validator log file
|
- name: Wait for validator log file
|
||||||
ansible.builtin.command: >
|
ansible.builtin.command: >
|
||||||
kubectl exec -n {{ k8s_namespace }}
|
kubectl exec -n {{ k8s_namespace }}
|
||||||
deployment/{{ deployment_name }}
|
deployment/{{ deployment_name }}
|
||||||
-c agave-validator -- test -f /data/log/validator.log
|
-c agave-validator -- test -f /data/log/validator.log
|
||||||
register: log_file_check
|
register: log_file_check
|
||||||
retries: 12
|
retries: 30
|
||||||
delay: 10
|
delay: 20
|
||||||
until: log_file_check.rc == 0
|
until: log_file_check.rc == 0
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
# ---- step 10: check RPC health ---------------------------------------------
|
# ---- step 10: check RPC health --------------------------------------------
|
||||||
- name: Check RPC health (non-blocking)
|
- name: Check RPC health (non-blocking)
|
||||||
ansible.builtin.uri:
|
ansible.builtin.uri:
|
||||||
url: http://{{ inventory_hostname }}:8899/health
|
url: http://{{ inventory_hostname }}:8899/health
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue