2026-03-07 01:44:25 +00:00
|
|
|
---
|
|
|
|
|
# Health check for biscayne agave-stack deployment
|
|
|
|
|
#
|
|
|
|
|
# Gathers system, validator, DoubleZero, and network status in a single run.
|
|
|
|
|
# All tasks are read-only — safe to run at any time.
|
|
|
|
|
#
|
|
|
|
|
# Usage:
|
|
|
|
|
# ansible-playbook playbooks/health-check.yml
|
|
|
|
|
# ansible-playbook playbooks/health-check.yml -t validator # just validator checks
|
|
|
|
|
# ansible-playbook playbooks/health-check.yml -t doublezero # just DZ checks
|
|
|
|
|
# ansible-playbook playbooks/health-check.yml -t network # just network checks
|
|
|
|
|
|
|
|
|
|
# Play header: targets the `biscayne` host pattern, skips fact gathering, and
# exports KUBECONFIG into the environment of every task in the play.
- name: Biscayne agave-stack health check
  hosts: biscayne
  gather_facts: false
  environment:
    KUBECONFIG: '/home/rix/.kube/config'
  tasks:
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Discover kind cluster and namespace
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# List the kind clusters on the host. A non-zero exit or an empty listing is
# treated as a failure because the next task reads the first line of output.
- name: Get kind cluster name
  tags:
    - always
  ansible.builtin.command:
    argv:
      - kind
      - get
      - clusters
  register: kind_clusters
  changed_when: false
  failed_when: >-
    kind_clusters.rc != 0
    or (kind_clusters.stdout_lines | length) == 0
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# The first cluster in the listing is the one inspected by later tasks.
- name: Set cluster name fact
  tags:
    - always
  ansible.builtin.set_fact:
    kind_cluster: "{{ kind_clusters.stdout_lines | first }}"
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# Find namespaces whose name starts with `laconic-`. Success is judged only by
# whether anything was printed, not by the pipeline's exit status.
- name: Discover agave namespace
  tags:
    - always
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: >-
      set -o pipefail &&
      kubectl get namespaces
      --no-headers
      -o custom-columns=':metadata.name'
      | grep '^laconic-'
  register: ns_result
  changed_when: false
  failed_when: (ns_result.stdout_lines | length) == 0
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# Keep the first matching namespace as `agave_ns` for every kubectl call below.
- name: Set namespace fact
  tags:
    - always
  ansible.builtin.set_fact:
    agave_ns: "{{ ns_result.stdout_lines | first }}"
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# Take the first pod listed in the discovered namespace. Failure is tolerated
# (failed_when: false), so pod_result.stdout can legitimately be empty.
- name: Get pod name
  tags:
    - always
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: >-
      set -o pipefail &&
      kubectl get pods
      -n {{ agave_ns }}
      --no-headers
      -o custom-columns=':metadata.name'
      | head -1
  register: pod_result
  changed_when: false
  failed_when: false
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# Normalise the pod lookup into `agave_pod`: whitespace-trimmed stdout, or an
# empty string when the lookup produced no stdout at all.
- name: Set pod fact
  tags:
    - always
  ansible.builtin.set_fact:
    agave_pod: "{{ (pod_result.stdout | default('')) | trim }}"
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# Report what discovery found. `agave_pod` is always defined by the preceding
# set_fact but is an empty string when no pod was found, so default() needs
# its second argument set to true to substitute 'none' for the empty value.
- name: Show discovered resources
  ansible.builtin.debug:
    msg: "cluster={{ kind_cluster }} ns={{ agave_ns }} pod={{ agave_pod | default('none', true) }}"
  tags: [always]
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Pod status
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Wide listing of every pod in the discovered namespace.
- name: Get pod status
  tags:
    - validator
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - pods
      - -n
      - "{{ agave_ns }}"
      - -o
      - wide
  register: pod_status
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Print the pod listing captured above, one output line per list item.
- name: Show pod status
  tags:
    - validator
  ansible.builtin.debug:
    var: pod_status.stdout_lines
|
|
|
|
|
|
|
|
|
|
# Per-container restart count and readiness for the discovered pod.
# Pod discovery is allowed to come back empty; without a pod name this command
# would no longer address a single pod, so the task is skipped in that case.
- name: Get container restart counts
  ansible.builtin.shell:
    cmd: >-
      kubectl get pod {{ agave_pod }} -n {{ agave_ns }}
      -o jsonpath='{range .status.containerStatuses[*]}{.name}{" restarts="}{.restartCount}{" ready="}{.ready}{"\n"}{end}'
  register: restart_counts
  changed_when: false
  when: agave_pod | length > 0
  tags: [validator]
|
|
|
|
|
|
|
|
|
|
# Print the restart/readiness lines collected by the previous task.
- name: Show restart counts
  tags:
    - validator
  ansible.builtin.debug:
    var: restart_counts.stdout_lines
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Validator sync status
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Last 30 log lines from the agave-validator container.
# Pod discovery tolerates "no pod found" and leaves agave_pod empty; running
# `kubectl logs` without a pod name would fail and abort the remaining
# checks, so skip this task when there is no pod to read from.
- name: Get validator recent logs (replay progress)
  ansible.builtin.command:
    cmd: >-
      kubectl logs -n {{ agave_ns }} {{ agave_pod }}
      -c agave-validator --tail=30
  register: validator_logs
  changed_when: false
  when: agave_pod | length > 0
  tags: [validator]
|
|
|
|
|
|
|
|
|
|
# Print the captured validator log tail.
- name: Show validator logs
  tags:
    - validator
  ansible.builtin.debug:
    var: validator_logs.stdout_lines
|
|
|
|
|
|
|
|
|
|
# GET the local RPC /health endpoint with a 5 second timeout. The task never
# fails; the response is registered and interpreted by the display task.
- name: Check RPC health endpoint
  tags:
    - validator
  ansible.builtin.uri:
    method: GET
    url: http://127.0.0.1:8899/health
    timeout: 5
    return_content: true
  register: rpc_health
  failed_when: false
|
|
|
|
|
|
|
|
|
|
# Print the RPC health result.
# The uri module reports a connection failure as status -1 rather than leaving
# `status` undefined, so -1 is mapped to 'unreachable' explicitly. An empty
# body is likewise replaced with 'no response'.
- name: Show RPC health
  ansible.builtin.debug:
    msg: "RPC health: {{ 'unreachable' if (rpc_health.status | default(-1)) == -1 else rpc_health.status }} — {{ rpc_health.content | default('no response', true) }}"
  tags: [validator]
|
|
|
|
|
|
|
|
|
|
# Ask the validator binary inside the pod for its version. stderr is merged
# into stdout and `|| true` keeps the task green when the exec fails.
- name: Get validator version
  tags:
    - validator
  ansible.builtin.shell:
    cmd: >-
      kubectl exec
      -n {{ agave_ns }} {{ agave_pod }}
      -c agave-validator
      -- agave-validator --version 2>&1
      || true
  register: validator_version
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Print whatever the version probe wrote to stdout.
- name: Show validator version
  tags:
    - validator
  ansible.builtin.debug:
    var: validator_version.stdout
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# DoubleZero status
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Run `doublezero address` on the host as the `solana` user. Best effort:
# a failing command does not fail the task.
- name: Get host DZ identity
  tags:
    - doublezero
  ansible.builtin.command:
    cmd: sudo -u solana doublezero address
  register: dz_address
  failed_when: false
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Run `doublezero status` on the host as the `solana` user. Best effort.
# NOTE(review): `dz_environment` is not defined anywhere in this playbook;
# presumably it comes from inventory or group vars — confirm.
- name: Get host DZ tunnel status
  tags:
    - doublezero
  ansible.builtin.command:
    cmd: sudo -u solana doublezero -e {{ dz_environment }} status
  register: dz_status
  failed_when: false
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# List kernel routes that mention the doublezero0 interface, or print a
# placeholder line when none match.
- name: Get DZ routes
  tags:
    - doublezero
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: >-
      set -o pipefail &&
      ip route | grep doublezero0
      || echo "no doublezero0 routes"
  register: dz_routes
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Query the doublezerod unit through the systemd module. Only a unit name is
# given and check mode is forced, so no state change is requested; the
# registered result is read for its status by the following set_fact.
- name: Get host doublezerod service state
  tags:
    - doublezero
  check_mode: true
  ansible.builtin.systemd:
    name: doublezerod
  register: dz_systemd_info
  failed_when: false
|
|
|
|
|
|
|
|
|
|
# Reduce the systemd query to its ActiveState, or 'unknown' when the result
# carries no such field.
- name: Set DZ systemd state
  tags:
    - doublezero
  ansible.builtin.set_fact:
    dz_systemd_state: "{{ (dz_systemd_info.status.ActiveState) | default('unknown') }}"
|
|
|
|
|
|
|
|
|
|
# Run `doublezero status` inside the pod's doublezerod container; if the exec
# fails for any reason a fixed placeholder line is printed instead.
- name: Get container DZ status
  tags:
    - doublezero
  ansible.builtin.shell:
    cmd: >-
      kubectl exec
      -n {{ agave_ns }} {{ agave_pod }}
      -c doublezerod
      -- doublezero status 2>&1
      || echo "container DZ unavailable"
  register: dz_container_status
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Combined DoubleZero report.
# The probes above are best-effort (failed_when: false), so a failed command
# leaves stdout defined but empty. default(..., true) treats that empty value
# like a missing one so the 'unknown' placeholders actually appear.
- name: Show DoubleZero status
  ansible.builtin.debug:
    msg:
      identity: "{{ dz_address.stdout | default('unknown', true) }}"
      host_tunnel: "{{ dz_status.stdout_lines | default(['unknown'], true) }}"
      host_systemd: "{{ dz_systemd_state }}"
      container: "{{ dz_container_status.stdout_lines | default(['unknown'], true) }}"
      routes: "{{ dz_routes.stdout_lines | default([]) }}"
  tags: [doublezero]
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Storage
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Disk usage for the ramdisk path in human-readable units. Best effort.
- name: Check ramdisk usage
  tags:
    - storage
  ansible.builtin.command:
    argv:
      - df
      - '-h'
      - /srv/solana/ramdisk
  register: ramdisk_df
  failed_when: false
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Recursive usage listing for the biscayne/DATA dataset tree.
# Made best-effort like the other storage probes: a missing dataset or zfs
# binary should show up as empty output in the report, not abort the rest of
# the health check (the report already falls back to an empty list).
- name: Check ZFS dataset usage
  ansible.builtin.command:
    cmd: zfs list -o name,used,avail,mountpoint -r biscayne/DATA
  register: zfs_list
  changed_when: false
  failed_when: false
  tags: [storage]
|
|
|
|
|
|
|
|
|
|
# Two one-second extended iostat samples for device zd0; only the last three
# output lines are kept. Best effort.
- name: Check ZFS zvol I/O
  tags:
    - storage
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: >-
      set -o pipefail &&
      iostat -x zd0 1 2
      | tail -3
  register: zvol_io
  failed_when: false
  changed_when: false
|
|
|
|
|
|
2026-03-07 13:07:54 +00:00
|
|
|
# Show target, source, fstype and propagation for each host path in the
# mount chain.
# findmnt accepts at most `<device> <mountpoint>` as positional arguments, so
# passing three paths in one call does not query three mount points. Each
# path is queried on its own, and a path that is not a mount point produces
# an explicit line instead of silently missing output.
- name: Check host mount chain
  ansible.builtin.shell:
    cmd: |
      for mnt in /srv/solana /srv/solana/ramdisk /srv/kind/solana; do
        findmnt -n -o TARGET,SOURCE,FSTYPE,PROPAGATION "$mnt" 2>&1 \
          || echo "$mnt: not a mount point"
      done
    executable: /bin/bash
  register: host_mounts
  changed_when: false
  failed_when: false
  tags: [storage, mounts]
|
|
|
|
|
|
|
|
|
|
# Look at /mnt/solana from inside the kind control-plane container: directory
# listing, filesystem type, and whether the ramdisk, snapshots and ledger
# sub-paths are visible. Optional probes print a placeholder on failure.
- name: Check kind node mount visibility
  tags:
    - storage
    - mounts
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -o pipefail
      node="{{ kind_cluster }}-control-plane"
      echo "=== /mnt/solana contents ==="
      docker exec "$node" ls /mnt/solana/
      echo "=== /mnt/solana filesystem ==="
      docker exec "$node" df -T /mnt/solana
      echo "=== /mnt/solana/ramdisk filesystem ==="
      docker exec "$node" df -T /mnt/solana/ramdisk 2>/dev/null || echo "ramdisk not visible"
      echo "=== /mnt/solana/snapshots ==="
      docker exec "$node" ls /mnt/solana/snapshots/ 2>/dev/null || echo "snapshots not visible"
      echo "=== /mnt/solana/ledger ==="
      docker exec "$node" ls /mnt/solana/ledger/ 2>/dev/null | head -5 || echo "ledger not visible"
  register: kind_mounts
  failed_when: false
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Print only the propagation flag of the /srv/kind mount. Best effort.
- name: Check mount propagation
  tags:
    - storage
    - mounts
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: >
      set -o pipefail
      &&
      findmnt -n -o PROPAGATION /srv/kind
  register: mount_propagation
  failed_when: false
  changed_when: false
|
|
|
|
|
|
2026-03-07 01:44:25 +00:00
|
|
|
# Combined storage report.
# Several probes are best-effort, so a failed command leaves its output
# defined but empty. default(..., true) lets the 'not mounted' / 'unknown'
# placeholders apply to that case as well as to probes skipped by tag
# selection.
- name: Show storage status
  ansible.builtin.debug:
    msg:
      ramdisk: "{{ ramdisk_df.stdout_lines | default(['not mounted'], true) }}"
      zfs: "{{ zfs_list.stdout_lines | default([]) }}"
      zvol_io: "{{ zvol_io.stdout_lines | default([]) }}"
      host_mounts: "{{ host_mounts.stdout_lines | default([]) }}"
      kind_mounts: "{{ kind_mounts.stdout_lines | default([]) }}"
      mount_propagation: "{{ mount_propagation.stdout | default('unknown', true) }}"
  tags: [storage, mounts]
|
2026-03-07 01:44:25 +00:00
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# System resources
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Memory usage in human-readable units.
- name: Check memory
  tags:
    - system
  ansible.builtin.command:
    argv:
      - free
      - '-h'
  register: mem
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Read the raw load-average line from procfs.
- name: Check load average
  tags:
    - system
  ansible.builtin.command:
    argv:
      - cat
      - /proc/loadavg
  register: loadavg
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# List active swap areas. Best effort: a failing command is not fatal.
- name: Check swap
  tags:
    - system
  ansible.builtin.command:
    argv:
      - swapon
      - '--show'
  register: swap
  failed_when: false
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Combined memory / load / swap report.
# The swap probe's stdout is defined but empty when it prints nothing, so
# default() is given `true` to show 'none' for the empty string as well.
- name: Show system resources
  ansible.builtin.debug:
    msg:
      memory: "{{ mem.stdout_lines }}"
      load: "{{ loadavg.stdout }}"
      swap: "{{ swap.stdout | default('none', true) }}"
  tags: [system]
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Network / shred throughput
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Sample UDP traffic to ports 9000-10000 on each interface for 5 seconds and
# report tcpdump's "N packets captured" figure.
# With pipefail enabled, `timeout` ending the capture (exit 124) makes the
# pipeline non-zero even when grep found the count, so a trailing
# `|| echo 0` would append a second line "0" to the captured value. The
# capture's exit status is therefore neutralised inside the pipeline and the
# zero fallback is applied through parameter expansion instead.
- name: Count shred packets per interface (5 sec sample)
  ansible.builtin.shell:
    cmd: |
      set -o pipefail
      for iface in eno1 doublezero0; do
        count=$(
          { timeout 5 tcpdump -i "$iface" -nn 'udp dst portrange 9000-10000' -q 2>&1 || true; } \
            | grep -oP '\d+(?= packets captured)' || true
        )
        echo "$iface: ${count:-0} packets/5s"
      done
    executable: /bin/bash
  register: shred_counts
  changed_when: false
  failed_when: false
  tags: [network]
|
|
|
|
|
|
|
|
|
|
# Pull the eno1 / doublezero0 rows out of /proc/net/dev and print the first,
# second and tenth whitespace-separated fields as name, rx and tx.
- name: Check interface throughput
  tags:
    - network
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: >-
      set -o pipefail &&
      grep -E 'eno1|doublezero0'
      /proc/net/dev
      | awk '{printf "%s rx=%s tx=%s\n", $1, $2, $10}'
  register: iface_stats
  changed_when: false
|
|
|
|
|
|
|
|
|
|
# Show up to 20 sockets whose address mentions :8001 or :9000-:9009, or a
# placeholder line when nothing matches.
# The 20-line cap uses `sed -n '1,20p'` rather than `head -20`: head exits
# early, which under pipefail can kill grep with SIGPIPE and make the
# pipeline fail, printing "no connections" after real matches. sed reads its
# whole input, so the fallback only fires when grep matched nothing.
- name: Check gossip/repair port connections
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      ss -tupn | grep -E ':8001|:900[0-9]' | sed -n '1,20p' || echo "no connections"
    executable: /bin/bash
  register: gossip_ports
  changed_when: false
  tags: [network]
|
|
|
|
|
|
|
|
|
|
# Look in the nat PREROUTING chain for rules mentioning 64.92.84.81 or 20000,
# printing a placeholder line when none match.
# The dots in the address are escaped so the extended regex matches the
# literal IP rather than any character in those positions.
# NOTE(review): listing iptables rules normally needs root; if the play's
# remote user is unprivileged this will report "no DNAT rule" — confirm.
- name: Check iptables DNAT rule (TVU shred relay)
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      iptables -t nat -L PREROUTING -v -n | grep -E '64\.92\.84\.81|20000' || echo "no DNAT rule"
    executable: /bin/bash
  register: dnat_rule
  changed_when: false
  tags: [network]
|
|
|
|
|
|
|
|
|
|
# Combined network report; every field falls back to an empty list when its
# probe did not run.
- name: Show network status
  tags:
    - network
  ansible.builtin.debug:
    msg:
      shred_counts: "{{ (shred_counts.stdout_lines) | default([]) }}"
      interfaces: "{{ (iface_stats.stdout_lines) | default([]) }}"
      gossip_ports: "{{ (gossip_ports.stdout_lines) | default([]) }}"
      tvu_dnat: "{{ (dnat_rule.stdout_lines) | default([]) }}"
|
|
|
|
|
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
# Summary
|
|
|
|
|
# ------------------------------------------------------------------
|
|
|
|
|
- name: Health check summary
|
|
|
|
|
ansible.builtin.debug:
|
|
|
|
|
msg: |
|
|
|
|
|
=== Biscayne Health Check ===
|
|
|
|
|
Cluster: {{ kind_cluster }}
|
|
|
|
|
Namespace: {{ agave_ns }}
|
|
|
|
|
Pod: {{ agave_pod }}
|
|
|
|
|
RPC: {{ rpc_health.status | default('unreachable') }}
|
|
|
|
|
DZ identity: {{ dz_address.stdout | default('unknown') | trim }}
|
|
|
|
|
DZ tunnel: {{ 'UP' if dz_status.rc | default(1) == 0 else 'DOWN' }}
|
|
|
|
|
DZ systemd: {{ dz_systemd_state }}
|
|
|
|
|
Ramdisk: {{ ramdisk_df.stdout_lines[-1] | default('unknown') }}
|
|
|
|
|
Load: {{ loadavg.stdout | default('unknown') }}
|