stack-orchestrator/playbooks/health-check.yml

394 lines
13 KiB
YAML
Raw Normal View History

---
# Health check for biscayne agave-stack deployment
#
# Gathers system, storage, validator, DoubleZero, and network status in a single run.
# All tasks are read-only — safe to run at any time.
#
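# Usage:
#   ansible-playbook playbooks/health-check.yml
#   ansible-playbook playbooks/health-check.yml -t validator   # just validator checks
#   ansible-playbook playbooks/health-check.yml -t doublezero  # just DZ checks
#   ansible-playbook playbooks/health-check.yml -t network     # just network checks
#   ansible-playbook playbooks/health-check.yml -t storage     # just storage checks
#   ansible-playbook playbooks/health-check.yml -t system      # just system checks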
# Single play targeting every inventory host. Fact gathering is disabled;
# the tasks collect the data they need themselves.
- name: Biscayne agave-stack health check
  hosts: all
  # Exported into the environment of every task in this play.
  environment:
    KUBECONFIG: '/home/rix/.kube/config'
  gather_facts: false
  tasks:
# ------------------------------------------------------------------
# Discover kind cluster and namespace
# ------------------------------------------------------------------
# Every task in this section is tagged `always`, so the kind_cluster,
# agave_ns and agave_pod facts exist regardless of the -t selection.

# Hard failure when kind errors out or lists no cluster at all.
- name: Get kind cluster name
  ansible.builtin.command:
    cmd: kind get clusters
  register: kind_clusters
  changed_when: false
  failed_when: kind_clusters.rc != 0 or kind_clusters.stdout_lines | length == 0
  tags: [always]

# Only the first cluster reported by kind is used.
- name: Set cluster name fact
  ansible.builtin.set_fact:
    kind_cluster: "{{ kind_clusters.stdout_lines[0] }}"
  tags: [always]

# Lists namespace names only and keeps those starting with "laconic-".
# Failure is decided by empty output rather than by the pipeline's exit code.
- name: Discover agave namespace
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      kubectl get namespaces --no-headers -o custom-columns=':metadata.name'
      | grep '^laconic-'
    executable: /bin/bash
  register: ns_result
  changed_when: false
  failed_when: ns_result.stdout_lines | length == 0
  tags: [always]

# Only the first matching namespace is used.
- name: Set namespace fact
  ansible.builtin.set_fact:
    agave_ns: "{{ ns_result.stdout_lines[0] }}"
  tags: [always]

# Best effort (failed_when: false): takes the first pod listed in the
# namespace; a lookup failure leaves the result empty instead of aborting.
- name: Get pod name
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      kubectl get pods -n {{ agave_ns }} --no-headers
      -o custom-columns=':metadata.name' | head -1
    executable: /bin/bash
  register: pod_result
  changed_when: false
  failed_when: false
  tags: [always]

# agave_pod is always defined after this task; it is '' when no pod was found.
- name: Set pod fact
  ansible.builtin.set_fact:
    agave_pod: "{{ pod_result.stdout | default('') | trim }}"
  tags: [always]

# default(..., true) also replaces the empty string: agave_pod is always
# defined, so a plain default('none') would never trigger.
- name: Show discovered resources
  ansible.builtin.debug:
    msg: "cluster={{ kind_cluster }} ns={{ agave_ns }} pod={{ agave_pod | default('none', true) }}"
  tags: [always]
# ------------------------------------------------------------------
# Pod status
# ------------------------------------------------------------------
# Wide listing of every pod in the discovered namespace.
- name: Get pod status
  ansible.builtin.command:
    cmd: kubectl get pods -n {{ agave_ns }} -o wide
  register: pod_status
  changed_when: false
  tags: [validator]

- name: Show pod status
  ansible.builtin.debug:
    var: pod_status.stdout_lines
  tags: [validator]

# Prints one "<container> restarts=<n> ready=<bool>" line per container.
# No pipes, redirects or shell operators are involved, so the command module
# is used instead of shell; the single-quoted jsonpath template is still
# passed to kubectl as one argument.
- name: Get container restart counts
  ansible.builtin.command:
    cmd: >-
      kubectl get pod {{ agave_pod }} -n {{ agave_ns }}
      -o jsonpath='{range .status.containerStatuses[*]}{.name}{" restarts="}{.restartCount}{" ready="}{.ready}{"\n"}{end}'
  register: restart_counts
  changed_when: false
  tags: [validator]

- name: Show restart counts
  ansible.builtin.debug:
    var: restart_counts.stdout_lines
  tags: [validator]
# ------------------------------------------------------------------
# Validator sync status
# ------------------------------------------------------------------
# Last 30 log lines of the agave-validator container.
- name: Get validator recent logs (replay progress)
  ansible.builtin.command:
    cmd: >-
      kubectl logs -n {{ agave_ns }} {{ agave_pod }}
      -c agave-validator --tail=30
  register: validator_logs
  changed_when: false
  tags: [validator]

- name: Show validator logs
  ansible.builtin.debug:
    var: validator_logs.stdout_lines
  tags: [validator]

# Best effort (failed_when: false): an unhealthy or unreachable RPC endpoint
# is reported below rather than aborting the play.
- name: Check RPC health endpoint
  ansible.builtin.uri:
    url: http://127.0.0.1:8899/health
    method: GET
    return_content: true
    timeout: 5
  register: rpc_health
  failed_when: false
  tags: [validator]

# The uri module reports status -1 when the request itself fails (for example
# connection refused), so `status` is defined even then; map -1 and a missing
# status to 'unreachable' explicitly.
- name: Show RPC health
  ansible.builtin.debug:
    msg: "RPC health: {{ 'unreachable' if (rpc_health.status | default(-1)) == -1 else rpc_health.status }} — {{ rpc_health.content | default('no response') }}"
  tags: [validator]

# `|| true` keeps the task green when the exec fails; stderr is folded into
# stdout so any error text is still shown.
- name: Get validator version
  ansible.builtin.shell:
    cmd: >-
      kubectl exec -n {{ agave_ns }} {{ agave_pod }}
      -c agave-validator -- agave-validator --version 2>&1 || true
  register: validator_version
  changed_when: false
  tags: [validator]

- name: Show validator version
  ansible.builtin.debug:
    var: validator_version.stdout
  tags: [validator]
# ------------------------------------------------------------------
# DoubleZero status
# ------------------------------------------------------------------
# Host-side CLI calls run as the `solana` user and are best effort
# (failed_when: false).
- name: Get host DZ identity
  ansible.builtin.command:
    cmd: sudo -u solana doublezero address
  register: dz_address
  changed_when: false
  failed_when: false
  tags: [doublezero]

# NOTE(review): dz_environment is not defined in this playbook; it must be
# supplied from outside (inventory / extra vars) — confirm.
- name: Get host DZ tunnel status
  ansible.builtin.command:
    cmd: sudo -u solana doublezero -e {{ dz_environment }} status
  register: dz_status
  changed_when: false
  failed_when: false
  tags: [doublezero]

# Falls back to a fixed message when no route mentions doublezero0.
- name: Get DZ routes
  ansible.builtin.shell:
    cmd: set -o pipefail && ip route | grep doublezero0 || echo "no doublezero0 routes"
    executable: /bin/bash
  register: dz_routes
  changed_when: false
  tags: [doublezero]

# No `state` is requested and check mode is forced, so the module is only
# used to read the unit's status.
- name: Get host doublezerod service state
  ansible.builtin.systemd:
    name: doublezerod
  register: dz_systemd_info
  failed_when: false
  check_mode: true
  tags: [doublezero]

- name: Set DZ systemd state
  ansible.builtin.set_fact:
    dz_systemd_state: "{{ dz_systemd_info.status.ActiveState | default('unknown') }}"
  tags: [doublezero]

# Queries the doublezerod container inside the pod; prints a fixed message
# when the exec fails.
- name: Get container DZ status
  ansible.builtin.shell:
    cmd: >-
      kubectl exec -n {{ agave_ns }} {{ agave_pod }}
      -c doublezerod -- doublezero status 2>&1 || echo "container DZ unavailable"
  register: dz_container_status
  changed_when: false
  tags: [doublezero]

# identity uses default(..., true): a failed `doublezero address` leaves
# stdout as an empty string (not undefined), which a plain default() would
# print as blank.
- name: Show DoubleZero status
  ansible.builtin.debug:
    msg:
      identity: "{{ dz_address.stdout | default('unknown', true) }}"
      host_tunnel: "{{ dz_status.stdout_lines | default(['unknown']) }}"
      host_systemd: "{{ dz_systemd_state }}"
      container: "{{ dz_container_status.stdout_lines | default(['unknown']) }}"
      routes: "{{ dz_routes.stdout_lines | default([]) }}"
  tags: [doublezero]
# ------------------------------------------------------------------
# Storage
# ------------------------------------------------------------------
- name: Check ramdisk usage
  ansible.builtin.command:
    cmd: df -h /srv/kind/solana/ramdisk
  register: ramdisk_df
  changed_when: false
  failed_when: false
  tags: [storage]

- name: Check ZFS dataset usage
  ansible.builtin.command:
    cmd: zfs list -o name,used,avail,mountpoint -r biscayne/DATA
  register: zfs_list
  changed_when: false
  tags: [storage]

# Two one-second iostat samples for zd0; only the last three output lines
# are kept.
- name: Check ZFS zvol I/O
  ansible.builtin.shell:
    cmd: set -o pipefail && iostat -x zd0 1 2 | tail -3
    executable: /bin/bash
  register: zvol_io
  changed_when: false
  failed_when: false
  tags: [storage]

# findmnt interprets two positional arguments as "<source> <target>", not as
# two mountpoints, so each mountpoint is queried on its own. A path that is
# not a mountpoint is reported explicitly instead of yielding empty output.
- name: Check host mount chain
  ansible.builtin.shell:
    cmd: |
      for mp in /srv/kind/solana /srv/kind/solana/ramdisk; do
        findmnt -n -o TARGET,SOURCE,FSTYPE,PROPAGATION --mountpoint "$mp" 2>&1 \
          || echo "$mp not mounted"
      done
    executable: /bin/bash
  register: host_mounts
  changed_when: false
  failed_when: false
  tags: [storage, mounts]

# Inspects the PV mount paths from inside the kind control-plane container;
# each probe prints a fallback line when the path is not visible there.
- name: Check kind node mount visibility
  ansible.builtin.shell:
    cmd: |
      set -o pipefail
      echo "=== PV mount filesystems ==="
      docker exec {{ kind_cluster }}-control-plane df -T /mnt/validator-ledger /mnt/validator-accounts /mnt/validator-snapshots /mnt/validator-log 2>/dev/null || echo "PV mounts not visible"
      echo "=== /mnt/validator-ledger ==="
      docker exec {{ kind_cluster }}-control-plane ls /mnt/validator-ledger/ 2>/dev/null | head -5 || echo "ledger not visible"
      echo "=== /mnt/validator-snapshots ==="
      docker exec {{ kind_cluster }}-control-plane ls /mnt/validator-snapshots/ 2>/dev/null || echo "snapshots not visible"
      echo "=== /mnt/validator-accounts ==="
      docker exec {{ kind_cluster }}-control-plane ls /mnt/validator-accounts/ 2>/dev/null || echo "accounts not visible"
    executable: /bin/bash
  register: kind_mounts
  changed_when: false
  failed_when: false
  tags: [storage, mounts]

# Single command without pipes or redirects, so the command module suffices.
- name: Check mount propagation
  ansible.builtin.command:
    cmd: findmnt -n -o PROPAGATION /srv/kind
  register: mount_propagation
  changed_when: false
  failed_when: false
  tags: [storage, mounts]

- name: Show storage status
  ansible.builtin.debug:
    msg:
      ramdisk: "{{ ramdisk_df.stdout_lines | default(['not mounted']) }}"
      zfs: "{{ zfs_list.stdout_lines | default([]) }}"
      zvol_io: "{{ zvol_io.stdout_lines | default([]) }}"
      host_mounts: "{{ host_mounts.stdout_lines | default([]) }}"
      kind_mounts: "{{ kind_mounts.stdout_lines | default([]) }}"
      mount_propagation: "{{ mount_propagation.stdout | default('unknown') }}"
  tags: [storage, mounts]
# ------------------------------------------------------------------
# System resources
# ------------------------------------------------------------------
- name: Check memory
  ansible.builtin.command:
    cmd: free -h
  register: mem
  changed_when: false
  tags: [system]

- name: Check load average
  ansible.builtin.command:
    cmd: cat /proc/loadavg
  register: loadavg
  changed_when: false
  tags: [system]

# Best effort (failed_when: false).
- name: Check swap
  ansible.builtin.command:
    cmd: swapon --show
  register: swap
  changed_when: false
  failed_when: false
  tags: [system]

# swap uses default(..., true): with no active swap `swapon --show` prints
# nothing, leaving stdout as a defined-but-empty string that a plain
# default('none') would not replace.
- name: Show system resources
  ansible.builtin.debug:
    msg:
      memory: "{{ mem.stdout_lines }}"
      load: "{{ loadavg.stdout }}"
      swap: "{{ swap.stdout | default('none', true) }}"
  tags: [system]
# ------------------------------------------------------------------
# Network / shred throughput
# ------------------------------------------------------------------
# Captures UDP traffic to ports 9000-10000 for 5 seconds on each interface
# and reports tcpdump's "packets captured" figure.
#
# `timeout` always ends tcpdump with a non-zero status (124). With pipefail
# active that status would fail the whole pipeline and fire a trailing
# `|| echo 0` even when grep matched, appending a stray "0" to the count.
# The capture's exit status is therefore neutralised inside the group, and
# the zero fallback is applied through ${count:-0}.
- name: Count shred packets per interface (5 sec sample)
  ansible.builtin.shell:
    cmd: |
      set -o pipefail
      for iface in eno1 doublezero0; do
        count=$({ timeout 5 tcpdump -i "$iface" -nn 'udp dst portrange 9000-10000' -q 2>&1 || true; } | grep -oP '\d+(?= packets captured)' || true)
        echo "$iface: ${count:-0} packets/5s"
      done
    executable: /bin/bash
  register: shred_counts
  changed_when: false
  failed_when: false
  tags: [network]

# Prints fields 1, 2 and 10 of the matching /proc/net/dev rows as
# "<iface> rx=<...> tx=<...>".
- name: Check interface throughput
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      grep -E 'eno1|doublezero0' /proc/net/dev
      | awk '{printf "%s rx=%s tx=%s\n", $1, $2, $10}'
    executable: /bin/bash
  register: iface_stats
  changed_when: false
  tags: [network]

# At most 20 sockets on :8001 or :9000-:9009; fixed message when none match.
- name: Check gossip/repair port connections
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      ss -tupn | grep -E ':8001|:900[0-9]' | head -20 || echo "no connections"
    executable: /bin/bash
  register: gossip_ports
  changed_when: false
  tags: [network]

# Dots in the IP address are escaped so they match literally rather than
# any character.
- name: Check iptables DNAT rule (TVU shred relay)
  ansible.builtin.shell:
    cmd: >-
      set -o pipefail &&
      iptables -t nat -L PREROUTING -v -n | grep -E '64\.92\.84\.81|20000' || echo "no DNAT rule"
    executable: /bin/bash
  register: dnat_rule
  changed_when: false
  tags: [network]

- name: Show network status
  ansible.builtin.debug:
    msg:
      shred_counts: "{{ shred_counts.stdout_lines | default([]) }}"
      interfaces: "{{ iface_stats.stdout_lines | default([]) }}"
      gossip_ports: "{{ gossip_ports.stdout_lines | default([]) }}"
      tvu_dnat: "{{ dnat_rule.stdout_lines | default([]) }}"
  tags: [network]
# ------------------------------------------------------------------
# Summary
# ------------------------------------------------------------------
# One-screen digest of the values registered by the earlier tasks.
# - RPC: the uri module reports status -1 when the request itself fails, so
#   -1 (or a missing status) is shown as 'unreachable'.
# - DZ identity: default(..., true) also replaces an empty stdout left by a
#   failed `doublezero address`.
# - DZ tunnel: UP only when the host `doublezero ... status` command exited 0.
# - Ramdisk: last line of the df output.
- name: Health check summary
  ansible.builtin.debug:
    msg: |
      === Biscayne Health Check ===
      Cluster: {{ kind_cluster }}
      Namespace: {{ agave_ns }}
      Pod: {{ agave_pod }}
      RPC: {{ 'unreachable' if (rpc_health.status | default(-1)) == -1 else rpc_health.status }}
      DZ identity: {{ dz_address.stdout | default('', true) | trim | default('unknown', true) }}
      DZ tunnel: {{ 'UP' if dz_status.rc | default(1) == 0 else 'DOWN' }}
      DZ systemd: {{ dz_systemd_state }}
      Ramdisk: {{ ramdisk_df.stdout_lines[-1] | default('unknown') }}
      Load: {{ loadavg.stdout | default('unknown') }}