---
# Health check for biscayne agave-stack deployment
#
# Gathers system, validator, DoubleZero, storage, and network status in a
# single run. All tasks are read-only — safe to run at any time.
#
# Usage:
#   ansible-playbook playbooks/health-check.yml
#   ansible-playbook playbooks/health-check.yml -t validator   # just validator checks
#   ansible-playbook playbooks/health-check.yml -t doublezero  # just DZ checks
#   ansible-playbook playbooks/health-check.yml -t storage     # just storage checks
#   ansible-playbook playbooks/health-check.yml -t mounts      # just mount checks
#   ansible-playbook playbooks/health-check.yml -t system      # just system checks
#   ansible-playbook playbooks/health-check.yml -t network     # just network checks
#
# Discovery tasks are tagged `always` so every tag selection has the cluster,
# namespace and pod facts available. The final summary task is untagged, so it
# only runs on a full run (or with --skip-tags); every value it reads therefore
# carries a default.
#
# Variables read but not defined in this playbook:
#   dz_environment  - passed to `doublezero -e`; expected to come from
#                     inventory / group_vars / extra-vars.

- name: Biscayne agave-stack health check
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config

  tasks:
    # ------------------------------------------------------------------
    # Discover kind cluster and namespace
    # ------------------------------------------------------------------
    - name: Get kind cluster name
      ansible.builtin.command:
        cmd: kind get clusters
      register: kind_clusters
      changed_when: false
      # Fail early: nothing below is meaningful without a cluster.
      failed_when: kind_clusters.rc != 0 or kind_clusters.stdout_lines | length == 0
      tags: [always]

    - name: Set cluster name fact
      ansible.builtin.set_fact:
        # First cluster listed wins.
        kind_cluster: "{{ kind_clusters.stdout_lines[0] }}"
      tags: [always]

    - name: Discover agave namespace
      ansible.builtin.shell:
        cmd: >-
          set -o pipefail &&
          kubectl get namespaces --no-headers
          -o custom-columns=':metadata.name'
          | grep '^laconic-'
        executable: /bin/bash
      register: ns_result
      changed_when: false
      failed_when: ns_result.stdout_lines | length == 0
      tags: [always]

    - name: Set namespace fact
      ansible.builtin.set_fact:
        # First namespace matching ^laconic- wins.
        agave_ns: "{{ ns_result.stdout_lines[0] }}"
      tags: [always]

    - name: Get pod name
      ansible.builtin.shell:
        cmd: >-
          set -o pipefail &&
          kubectl get pods -n {{ agave_ns }} --no-headers
          -o custom-columns=':metadata.name'
          | head -1
        executable: /bin/bash
      register: pod_result
      changed_when: false
      # Best effort: a missing pod leaves agave_pod empty instead of aborting.
      failed_when: false
      tags: [always]

    - name: Set pod fact
      ansible.builtin.set_fact:
        agave_pod: "{{ pod_result.stdout | default('') | trim }}"
      tags: [always]

    - name: Show discovered resources
      ansible.builtin.debug:
        # agave_pod is always defined (possibly ''), so default() needs its
        # boolean flag set to replace an empty string with 'none'.
        msg: "cluster={{ kind_cluster }} ns={{ agave_ns }} pod={{ agave_pod | default('none', true) }}"
      tags: [always]

    # ------------------------------------------------------------------
    # Pod status
    # ------------------------------------------------------------------
    - name: Get pod status
      ansible.builtin.command:
        cmd: kubectl get pods -n {{ agave_ns }} -o wide
      register: pod_status
      changed_when: false
      tags: [validator]

    - name: Show pod status
      ansible.builtin.debug:
        var: pod_status.stdout_lines
      tags: [validator]

    - name: Get container restart counts
      ansible.builtin.shell:
        # The jsonpath expression must stay on one line: folding would insert
        # spaces into the template.
        cmd: >-
          kubectl get pod {{ agave_pod }} -n {{ agave_ns }}
          -o jsonpath='{range .status.containerStatuses[*]}{.name}{" restarts="}{.restartCount}{" ready="}{.ready}{"\n"}{end}'
      register: restart_counts
      changed_when: false
      tags: [validator]

    - name: Show restart counts
      ansible.builtin.debug:
        var: restart_counts.stdout_lines
      tags: [validator]

    # ------------------------------------------------------------------
    # Validator sync status
    # ------------------------------------------------------------------
    - name: Get validator recent logs (replay progress)
      ansible.builtin.command:
        cmd: >-
          kubectl logs -n {{ agave_ns }} {{ agave_pod }}
          -c agave-validator --tail=30
      register: validator_logs
      changed_when: false
      tags: [validator]

    - name: Show validator logs
      ansible.builtin.debug:
        var: validator_logs.stdout_lines
      tags: [validator]

    - name: Check RPC health endpoint
      ansible.builtin.uri:
        url: http://127.0.0.1:8899/health
        method: GET
        return_content: true
        timeout: 5
      register: rpc_health
      # Never abort the health check because RPC is down; report it instead.
      failed_when: false
      tags: [validator]

    - name: Show RPC health
      ansible.builtin.debug:
        # uri reports status -1 (not an undefined value) when the connection
        # itself fails, so map -1 to 'unreachable' explicitly.
        msg: "RPC health: {{ 'unreachable' if (rpc_health.status | default(-1)) == -1 else rpc_health.status }} — {{ rpc_health.content | default('no response') }}"
      tags: [validator]

    - name: Get validator version
      ansible.builtin.shell:
        # `|| true` keeps the task green; any error text lands in stdout.
        cmd: >-
          kubectl exec -n {{ agave_ns }} {{ agave_pod }}
          -c agave-validator -- agave-validator --version 2>&1 || true
      register: validator_version
      changed_when: false
      tags: [validator]

    - name: Show validator version
      ansible.builtin.debug:
        var: validator_version.stdout
      tags: [validator]

    # ------------------------------------------------------------------
    # DoubleZero status
    # ------------------------------------------------------------------
    - name: Get host DZ identity
      ansible.builtin.command:
        cmd: sudo -u solana doublezero address
      register: dz_address
      changed_when: false
      failed_when: false
      tags: [doublezero]

    - name: Get host DZ tunnel status
      ansible.builtin.command:
        # dz_environment is not set in this playbook; it must be supplied
        # externally (inventory, group_vars or -e).
        cmd: sudo -u solana doublezero -e {{ dz_environment }} status
      register: dz_status
      changed_when: false
      failed_when: false
      tags: [doublezero]

    - name: Get DZ routes
      ansible.builtin.shell:
        cmd: >-
          set -o pipefail &&
          ip route | grep doublezero0 || echo "no doublezero0 routes"
        executable: /bin/bash
      register: dz_routes
      changed_when: false
      tags: [doublezero]

    - name: Get host doublezerod service state
      ansible.builtin.systemd:
        name: doublezerod
      register: dz_systemd_info
      failed_when: false
      # Forced check mode: only the unit status is read.
      check_mode: true
      tags: [doublezero]

    - name: Set DZ systemd state
      ansible.builtin.set_fact:
        dz_systemd_state: "{{ dz_systemd_info.status.ActiveState | default('unknown') }}"
      tags: [doublezero]

    - name: Get container DZ status
      ansible.builtin.shell:
        cmd: >-
          kubectl exec -n {{ agave_ns }} {{ agave_pod }}
          -c doublezerod -- doublezero status 2>&1
          || echo "container DZ unavailable"
      register: dz_container_status
      changed_when: false
      tags: [doublezero]

    - name: Show DoubleZero status
      ansible.builtin.debug:
        msg:
          identity: "{{ dz_address.stdout | default('unknown') }}"
          host_tunnel: "{{ dz_status.stdout_lines | default(['unknown']) }}"
          host_systemd: "{{ dz_systemd_state }}"
          container: "{{ dz_container_status.stdout_lines | default(['unknown']) }}"
          routes: "{{ dz_routes.stdout_lines | default([]) }}"
      tags: [doublezero]

    # ------------------------------------------------------------------
    # Storage
    # ------------------------------------------------------------------
    - name: Check ramdisk usage
      ansible.builtin.command:
        cmd: df -h /srv/kind/solana/ramdisk
      register: ramdisk_df
      changed_when: false
      failed_when: false
      tags: [storage]

    - name: Check ZFS dataset usage
      ansible.builtin.command:
        cmd: zfs list -o name,used,avail,mountpoint -r biscayne/DATA
      register: zfs_list
      changed_when: false
      tags: [storage]

    - name: Check ZFS zvol I/O
      ansible.builtin.shell:
        # Two 1-second samples; keep only the last three lines of output.
        cmd: set -o pipefail && iostat -x zd0 1 2 | tail -3
        executable: /bin/bash
      register: zvol_io
      changed_when: false
      failed_when: false
      tags: [storage]

    - name: Check host mount chain
      ansible.builtin.shell:
        cmd: >
          set -o pipefail &&
          findmnt -n -o TARGET,SOURCE,FSTYPE,PROPAGATION
          /srv/kind/solana /srv/kind/solana/ramdisk 2>&1
        executable: /bin/bash
      register: host_mounts
      changed_when: false
      failed_when: false
      tags: [storage, mounts]

    - name: Check kind node mount visibility
      ansible.builtin.shell:
        # Each probe falls back to a "not visible" marker rather than failing.
        cmd: |
          set -o pipefail
          echo "=== PV mount filesystems ==="
          docker exec {{ kind_cluster }}-control-plane df -T /mnt/validator-ledger /mnt/validator-accounts /mnt/validator-snapshots /mnt/validator-log 2>/dev/null || echo "PV mounts not visible"
          echo "=== /mnt/validator-ledger ==="
          docker exec {{ kind_cluster }}-control-plane ls /mnt/validator-ledger/ 2>/dev/null | head -5 || echo "ledger not visible"
          echo "=== /mnt/validator-snapshots ==="
          docker exec {{ kind_cluster }}-control-plane ls /mnt/validator-snapshots/ 2>/dev/null || echo "snapshots not visible"
          echo "=== /mnt/validator-accounts ==="
          docker exec {{ kind_cluster }}-control-plane ls /mnt/validator-accounts/ 2>/dev/null || echo "accounts not visible"
        executable: /bin/bash
      register: kind_mounts
      changed_when: false
      failed_when: false
      tags: [storage, mounts]

    - name: Check mount propagation
      ansible.builtin.shell:
        cmd: >
          set -o pipefail &&
          findmnt -n -o PROPAGATION /srv/kind
        executable: /bin/bash
      register: mount_propagation
      changed_when: false
      failed_when: false
      tags: [storage, mounts]

    - name: Show storage status
      ansible.builtin.debug:
        msg:
          ramdisk: "{{ ramdisk_df.stdout_lines | default(['not mounted']) }}"
          zfs: "{{ zfs_list.stdout_lines | default([]) }}"
          zvol_io: "{{ zvol_io.stdout_lines | default([]) }}"
          host_mounts: "{{ host_mounts.stdout_lines | default([]) }}"
          kind_mounts: "{{ kind_mounts.stdout_lines | default([]) }}"
          mount_propagation: "{{ mount_propagation.stdout | default('unknown') }}"
      tags: [storage, mounts]

    # ------------------------------------------------------------------
    # System resources
    # ------------------------------------------------------------------
    - name: Check memory
      ansible.builtin.command:
        cmd: free -h
      register: mem
      changed_when: false
      tags: [system]

    - name: Check load average
      ansible.builtin.command:
        cmd: cat /proc/loadavg
      register: loadavg
      changed_when: false
      tags: [system]

    - name: Check swap
      ansible.builtin.command:
        cmd: swapon --show
      register: swap
      changed_when: false
      failed_when: false
      tags: [system]

    - name: Show system resources
      ansible.builtin.debug:
        msg:
          memory: "{{ mem.stdout_lines }}"
          load: "{{ loadavg.stdout }}"
          swap: "{{ swap.stdout | default('none') }}"
      tags: [system]

    # ------------------------------------------------------------------
    # Network / shred throughput
    # ------------------------------------------------------------------
    # `timeout` exits 124 whenever it has to stop tcpdump, so with pipefail
    # the capture pipeline always reports failure even when grep found the
    # packet count. The fallback therefore must not print anything (`|| true`),
    # otherwise a stray "0" is appended to the real count; the zero default is
    # applied afterwards via ${count:-0}.
    - name: Count shred packets per interface (5 sec sample)
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          for iface in eno1 doublezero0; do
            count=$(timeout 5 tcpdump -i "$iface" -nn 'udp dst portrange 9000-10000' -q 2>&1 | grep -oP '\d+(?= packets captured)' || true)
            echo "$iface: ${count:-0} packets/5s"
          done
        executable: /bin/bash
      register: shred_counts
      changed_when: false
      failed_when: false
      tags: [network]

    - name: Check interface throughput
      ansible.builtin.shell:
        # Prints the 1st, 2nd and 10th whitespace-separated fields of each
        # matching /proc/net/dev line, labelled iface / rx / tx.
        cmd: >-
          set -o pipefail &&
          grep -E 'eno1|doublezero0' /proc/net/dev
          | awk '{printf "%s rx=%s tx=%s\n", $1, $2, $10}'
        executable: /bin/bash
      register: iface_stats
      changed_when: false
      tags: [network]

    - name: Check gossip/repair port connections
      ansible.builtin.shell:
        cmd: >-
          set -o pipefail &&
          ss -tupn | grep -E ':8001|:900[0-9]' | head -20
          || echo "no connections"
        executable: /bin/bash
      register: gossip_ports
      changed_when: false
      tags: [network]

    - name: Check iptables DNAT rule (TVU shred relay)
      ansible.builtin.shell:
        # Dots are escaped so only the literal address matches.
        cmd: >-
          set -o pipefail &&
          iptables -t nat -L PREROUTING -v -n
          | grep -E '64\.92\.84\.81|20000'
          || echo "no DNAT rule"
        executable: /bin/bash
      register: dnat_rule
      changed_when: false
      tags: [network]

    - name: Show network status
      ansible.builtin.debug:
        msg:
          shred_counts: "{{ shred_counts.stdout_lines | default([]) }}"
          interfaces: "{{ iface_stats.stdout_lines | default([]) }}"
          gossip_ports: "{{ gossip_ports.stdout_lines | default([]) }}"
          tvu_dnat: "{{ dnat_rule.stdout_lines | default([]) }}"
      tags: [network]

    # ------------------------------------------------------------------
    # Summary
    # ------------------------------------------------------------------
    # Untagged: sections may have been skipped via --skip-tags, so every value
    # registered by a tagged task is read with a default.
    - name: Health check summary
      ansible.builtin.debug:
        msg: |
          === Biscayne Health Check ===
          Cluster: {{ kind_cluster }}
          Namespace: {{ agave_ns }}
          Pod: {{ agave_pod | default('none', true) }}
          RPC: {{ 'unreachable' if (rpc_health.status | default(-1)) == -1 else rpc_health.status }}
          DZ identity: {{ dz_address.stdout | default('unknown') | trim }}
          DZ tunnel: {{ 'UP' if dz_status.rc | default(1) == 0 else 'DOWN' }}
          DZ systemd: {{ dz_systemd_state | default('unknown') }}
          Ramdisk: {{ ramdisk_df.stdout_lines[-1] | default('unknown') }}
          Load: {{ loadavg.stdout | default('unknown') }}