---
# Recover agave validator from any state to healthy
#
# This playbook is idempotent — it assesses current state and picks up
# from wherever the system is. Each step checks its precondition and
# skips if already satisfied.
#
# Steps:
#   1. Scale deployment to 0
#   2. Wait for pods to terminate (io_uring safety check)
#   3. Wipe accounts ramdisk
#   4. Clean old snapshots
#   5. Ensure terminationGracePeriodSeconds is 300 (for graceful shutdown)
#   6. Fix PV permissions (grafana runs as UID 472, laconic-so creates as root)
#   7. Scale to 1 — container entrypoint downloads snapshot + starts validator
#
# The playbook exits after step 7. The container handles snapshot download
# (60+ min) and validator startup autonomously. Monitor with:
#   scripts/check-status.py --watch
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
#
- name: Recover agave validator
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config
  vars:
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    snapshot_dir: /srv/kind/solana/snapshots
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk

  tasks:
    # ---- step 1: scale to 0 ---------------------------------------------------
    # failed_when: false — the deployment may not exist yet; treat that as 0.
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -o jsonpath='{.spec.replicas}'
      register: current_replicas
      failed_when: false
      changed_when: false

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      when: current_replicas.stdout | default('0') | int > 0
      changed_when: true

    # ---- step 2: wait for pods to terminate ------------------------------------
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_remaining
      retries: 60
      delay: 5
      until: pods_remaining.stdout == "[]" or pods_remaining.stdout == ""
      changed_when: false
      when: current_replicas.stdout | default('0') | int > 0

    # pgrep exits 0 only when at least one matching process exists, so rc == 0
    # below means a leftover agave-validator survived pod termination.
    - name: Verify no agave processes in kind node (io_uring safety check)
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      failed_when: false
      changed_when: false

    - name: Fail if agave zombie detected
      ansible.builtin.fail:
        msg: >-
          agave-validator process still running inside kind node after pod
          termination. This is the io_uring/ZFS deadlock. Do NOT proceed —
          host reboot required. See CLAUDE.md.
      when: agave_procs.rc == 0

    # ---- step 3: wipe accounts ramdisk -----------------------------------------
    # Cannot umount+remount because the kind node's bind mount holds it open.
    # rm -rf is required here (slower than remount but the only option).
    - name: Wipe accounts data
      ansible.builtin.shell: |
        rm -rf {{ accounts_dir }}/*
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      changed_when: true

    # ---- step 4: clean old snapshots -------------------------------------------
    - name: Remove all old snapshots
      ansible.builtin.shell: rm -f {{ snapshot_dir }}/*.tar.* {{ snapshot_dir }}/*.tar
      become: true
      changed_when: true

    # ---- step 5: ensure terminationGracePeriodSeconds -------------------------
    # laconic-so doesn't support this declaratively. Patch the deployment so
    # k8s gives the entrypoint 300s to perform graceful shutdown via admin RPC.
    - name: Ensure terminationGracePeriodSeconds is 300
      ansible.builtin.command: >
        kubectl patch deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
      register: patch_result
      changed_when: "'no change' not in patch_result.stdout"

    # ---- step 6: fix PV permissions ---------------------------------------------
    # laconic-so creates PV hostPath dirs as root. Grafana runs as UID 472 and
    # can't write to its data dir. Fix ownership inside the kind node.
    - name: Fix grafana PV ownership in kind node
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        chown 472:472 /tmp/grafana-data
      changed_when: true

    # ---- step 7: scale to 1 — entrypoint handles snapshot download ------------
    # The container's entrypoint.py checks snapshot freshness, cleans stale
    # snapshots, downloads fresh ones (with rolling incremental convergence),
    # then starts the validator. No host-side download needed.
    - name: Scale deployment to 1
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true

    - name: Report
      ansible.builtin.debug:
        msg: >-
          Recovery initiated. The container entrypoint will download a fresh
          snapshot and start the validator. Monitor progress with:
          scripts/check-status.py --watch