---
# Redeploy agave-stack on biscayne
#
# The container entrypoint (entrypoint.py) handles snapshot download and
# agave-validator startup internally. This playbook just manages the k8s
# lifecycle: teardown, optional data wipe, deploy, and verify.
#
# Flow:
#   1. [teardown] Scale to 0, wait for clean exit, delete namespace
#   2. [wipe]     Conditionally clear ledger / accounts / old snapshots
#   3. [deploy]   Preflight checks, laconic-so deployment start
#   4. [verify]   Wait for pod Running, check logs + RPC health
#
# The entrypoint.py inside the container:
#   - Checks snapshot freshness against mainnet
#   - Downloads a fresh snapshot via aria2c if needed
#   - Builds agave-validator args from env vars
#   - Execs agave-validator
#
# Usage:
#   # Standard redeploy
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml
#
#   # Full wipe (accounts + ledger) — slow rebuild
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml \
#     -e wipe_accounts=true -e wipe_ledger=true
#
#   # Skip snapshot cleanup (use existing)
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml \
#     -e skip_snapshot_cleanup=true

- name: Redeploy agave validator on biscayne
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config

  vars:
    deployment_dir: /srv/deployments/agave
    stack_repo: /srv/deployments/agave-stack
    stack_path: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    laconic_so_repo: /home/rix/stack-orchestrator
    laconic_so_branch: fix/kind-mount-propagation
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    snapshot_dir: /srv/kind/solana/snapshots
    ledger_dir: /srv/kind/solana/ledger
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk
    ramdisk_size: 1024G

    # Flags — non-destructive by default
    wipe_accounts: false
    wipe_ledger: false
    skip_snapshot_cleanup: false

  tasks:
    # ---- teardown: graceful stop, then delete namespace ----------------------
    #
    # IMPORTANT: Scale to 0 first, wait for agave to exit cleanly.
    # Deleting the namespace while agave is running causes io_uring/ZFS
    # deadlock (unkillable D-state threads). See CLAUDE.md.
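    #
    # For reference, a rough shell equivalent of the graceful teardown that the
    # tasks below automate (a sketch only; the wait timeout here is an assumed
    # value, and the names come from the vars above):
    #
    #   kubectl scale deployment <deployment_name> -n <k8s_namespace> --replicas=0
    #   kubectl wait --for=delete pod -l app=<deployment_name> -n <k8s_namespace> --timeout=300s
    #   kubectl delete namespace <k8s_namespace> --timeout=120s
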
    - name: Scale deployment to 0 (graceful stop)
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      register: pre_teardown_scale
      changed_when: pre_teardown_scale.rc == 0
      failed_when: false
      tags: [teardown]

    - name: Wait for agave to exit
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }} -o jsonpath='{.items}'
      register: pre_teardown_pods
      retries: 60
      delay: 5
      until: >-
        pre_teardown_pods.stdout == "[]" or
        pre_teardown_pods.stdout == "" or
        pre_teardown_pods.rc != 0
      changed_when: false
      failed_when: false
      when: pre_teardown_scale.rc == 0
      tags: [teardown]

    - name: Delete deployment namespace
      ansible.builtin.command: >
        kubectl delete namespace {{ k8s_namespace }} --timeout=120s
      register: ns_delete
      changed_when: ns_delete.rc == 0
      failed_when: false
      tags: [teardown]

    - name: Wait for namespace to terminate
      ansible.builtin.command: >
        kubectl get namespace {{ k8s_namespace }} -o jsonpath='{.status.phase}'
      register: ns_status
      retries: 30
      delay: 5
      until: ns_status.rc != 0
      changed_when: false
      failed_when: false
      when: ns_delete.rc == 0
      tags: [teardown]

    - name: Clear stale claimRefs on Released PVs
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
            kubectl patch pv "$pv" --type json \
              -p '[{"op":"remove","path":"/spec/claimRef"}]'
          done
        executable: /bin/bash
      register: pv_patch
      changed_when: pv_patch.stdout != ""
      tags: [teardown]

    # ---- wipe: opt-in data cleanup ------------------------------------------

    - name: Wipe ledger data
      ansible.builtin.shell: rm -rf {{ ledger_dir }}/*
      become: true
      changed_when: true
      when: wipe_ledger | bool
      tags: [wipe]

    - name: Wipe accounts ramdisk (remount tmpfs)
      ansible.builtin.shell: |
        umount {{ ramdisk_mount }} 2>/dev/null || true
        mount -t tmpfs -o nodev,nosuid,noexec,nodiratime,size={{ ramdisk_size }} tmpfs {{ ramdisk_mount }}
        mkdir -p {{ accounts_dir }}
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      changed_when: true
      when: wipe_accounts | bool
      tags: [wipe]

    - name: Clean old snapshots (keep newest full + incremental)
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          cd {{ snapshot_dir }} || exit 0
          newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
          if [ -n "$newest" ]; then
            newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
            find . -maxdepth 1 -name '*.tar.*' \
              ! -name "$newest" \
              ! -name "${newest_inc:-__none__}" \
              -delete
          fi
        executable: /bin/bash
      become: true
      changed_when: true
      when: not skip_snapshot_cleanup | bool
      tags: [wipe]

    # ---- preflight: verify ramdisk and mounts before deploy ------------------

    - name: Verify ramdisk is mounted
      ansible.builtin.command: mountpoint -q {{ ramdisk_mount }}
      register: ramdisk_check
      failed_when: ramdisk_check.rc != 0
      changed_when: false
      tags: [deploy, preflight]

    - name: Verify ramdisk is tmpfs (not the underlying ZFS)
      ansible.builtin.shell:
        cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q tmpfs
        executable: /bin/bash
      register: ramdisk_type
      failed_when: ramdisk_type.rc != 0
      changed_when: false
      tags: [deploy, preflight]

    # ---- deploy: bring up cluster, let entrypoint handle snapshot ------------

    - name: Check kind-config.yml mount style
      ansible.builtin.command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
      register: mount_root_check
      changed_when: false
      failed_when: false
      tags: [deploy]

    - name: Warn if unified mount root not found
      ansible.builtin.debug:
        msg: >-
          WARNING: kind-config.yml does not have unified mount root
          (containerPath: /mnt). laconic-so may be using individual PV mounts.
          Verify PV hostPaths match expected paths after deploy.
      when: mount_root_check.stdout | default('0') | int < 1
      tags: [deploy]

    - name: Start deployment (creates kind cluster + deploys pod)
      ansible.builtin.command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
      register: deploy_start
      changed_when: deploy_start.rc == 0
      failed_when: false
      timeout: 1200
      tags: [deploy]

    - name: Verify deployment started or already exists
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }} -n {{ k8s_namespace }}
        -o jsonpath='{.metadata.name}'
      register: deploy_verify
      changed_when: false
      failed_when: deploy_verify.rc != 0
      when: deploy_start.rc != 0
      tags: [deploy]

    - name: Show deployment start warning
      ansible.builtin.debug:
        msg: >-
          laconic-so deployment start returned rc={{ deploy_start.rc }}
          but deployment exists — continuing (idempotent).
      when: deploy_start.rc != 0 and (deploy_verify.rc | default(1)) == 0
      tags: [deploy]

    - name: Wait for deployment to exist
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }} -n {{ k8s_namespace }}
        -o jsonpath='{.metadata.name}'
      register: deploy_exists
      retries: 30
      delay: 10
      until: deploy_exists.rc == 0
      changed_when: false
      tags: [deploy]

    - name: Verify ramdisk visible inside kind node
      ansible.builtin.shell:
        cmd: >
          set -o pipefail &&
          docker exec {{ kind_cluster }}-control-plane
          df -T /mnt/validator-accounts 2>/dev/null | grep -q tmpfs
        executable: /bin/bash
      register: kind_ramdisk_check
      failed_when: kind_ramdisk_check.rc != 0
      changed_when: false
      tags: [deploy]

    # ---- verify: confirm validator is running --------------------------------
    # The entrypoint.py handles snapshot download + agave-validator startup.
    # Pod will be Running once the container starts, but agave-validator won't
    # exec until after snapshot download completes (if needed).
    - name: Wait for pod to be running
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -o jsonpath='{.items[0].status.phase}'
      register: pod_status
      retries: 60
      delay: 10
      until: pod_status.stdout == "Running"
      changed_when: false
      tags: [verify]

    - name: Verify PV mounts inside kind node
      ansible.builtin.shell:
        cmd: >
          set -o pipefail &&
          docker exec {{ kind_cluster }}-control-plane
          df -T /mnt/validator-ledger /mnt/validator-accounts
          /mnt/validator-snapshots /mnt/validator-log 2>&1
        executable: /bin/bash
      register: mount_check
      changed_when: false
      failed_when: false
      tags: [verify]

    - name: Show mount contents
      ansible.builtin.debug:
        msg: "{{ mount_check.stdout_lines }}"
      tags: [verify]

    - name: Check validator log file is being written
      ansible.builtin.command: >
        kubectl exec -n {{ k8s_namespace }} deployment/{{ deployment_name }}
        -c agave-validator -- test -f /data/log/validator.log
      register: log_file_check
      retries: 12
      delay: 10
      until: log_file_check.rc == 0
      changed_when: false
      failed_when: false
      tags: [verify]

    - name: Check RPC health
      ansible.builtin.uri:
        url: http://127.0.0.1:8899/health
        return_content: true
      register: rpc_health
      retries: 6
      delay: 10
      until: rpc_health.status == 200
      failed_when: false
      delegate_to: "{{ inventory_hostname }}"
      tags: [verify]

    - name: Report status
      ansible.builtin.debug:
        msg: >-
          Deployment complete.
          Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
          RPC: {{ rpc_health.content | default('not responding') }}.
          Wiped: ledger={{ wipe_ledger }}, accounts={{ wipe_accounts }}.
      tags: [verify]
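    # A quick manual follow-up once the play finishes (sketch only; these are
    # standard Solana JSON-RPC calls against the same local RPC port the health
    # check above uses):
    #
    #   curl http://127.0.0.1:8899/health
    #   curl -s -X POST -H 'Content-Type: application/json' \
    #     -d '{"jsonrpc":"2.0","id":1,"method":"getSlot"}' http://127.0.0.1:8899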