From 09728a719c30553b217e60b05d2e58b7606af62e Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 9 Mar 2026 06:39:25 +0000 Subject: [PATCH] fix: recovery playbook is fire-and-forget, add check-status.py The recovery playbook now exits after scaling to 1. The container entrypoint handles snapshot download (60+ min) and validator startup autonomously. Removed all polling/verification steps that would time out waiting. Added scripts/check-status.py for monitoring download progress, validator slot, gap to mainnet, catch-up rate, and ramdisk usage. Co-Authored-By: Claude Opus 4.6 --- playbooks/biscayne-recover.yml | 108 ++----------- scripts/check-status.py | 276 +++++++++++++++++++++++++++++++++ 2 files changed, 285 insertions(+), 99 deletions(-) create mode 100755 scripts/check-status.py diff --git a/playbooks/biscayne-recover.yml b/playbooks/biscayne-recover.yml index 53ebe3e9..1d46c78e 100644 --- a/playbooks/biscayne-recover.yml +++ b/playbooks/biscayne-recover.yml @@ -7,14 +7,14 @@ # # Steps: # 1. Scale deployment to 0 -# 2. Wait for pods to terminate +# 2. Wait for pods to terminate (io_uring safety check) # 3. Wipe accounts ramdisk # 4. Clean old snapshots # 5. Scale to 1 — container entrypoint downloads snapshot + starts validator -# 6. Verify snapshot freshness -# 7. Wait for pod Running -# 8. Verify validator log -# 9. Check RPC health +# +# The playbook exits after step 5. The container handles snapshot download +# (60+ min) and validator startup autonomously. 
Monitor with: +# scripts/check-status.py --watch # # Usage: # ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml @@ -31,10 +31,6 @@ snapshot_dir: /srv/kind/solana/snapshots accounts_dir: /srv/kind/solana/ramdisk/accounts ramdisk_mount: /srv/kind/solana/ramdisk - # Mainnet RPC for slot comparison - mainnet_rpc: https://api.mainnet-beta.solana.com - # Maximum slots behind before snapshot is considered stale - max_slot_lag: 20000 tasks: # ---- step 1: scale to 0 --------------------------------------------------- @@ -109,95 +105,9 @@ -n {{ k8s_namespace }} --replicas=1 changed_when: true - # ---- step 6: wait for pod running ------------------------------------------ - # The entrypoint downloads the snapshot before starting the validator. - # The pod reaches Running immediately (entrypoint is PID 1), but the - # validator log won't appear until download + startup completes. - - name: Wait for pod to be running - ansible.builtin.command: > - kubectl get pods -n {{ k8s_namespace }} - -l app={{ deployment_name }} - -o jsonpath='{.items[0].status.phase}' - register: pod_status - retries: 60 - delay: 10 - until: pod_status.stdout == "Running" - changed_when: false - - # ---- step 7: wait for snapshot download to complete ----------------------- - # The entrypoint writes the snapshot to the PV. Wait for it to appear - # on the host (zvol mount is shared). 
- - name: Wait for snapshot file to appear - ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1 - args: - executable: /bin/bash - register: snapshot_file - retries: 180 - delay: 20 - until: snapshot_file.stdout != "" - changed_when: false - - # ---- step 8: verify snapshot freshness ------------------------------------ - - name: Get snapshot filename - ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename - args: - executable: /bin/bash - register: snapshot_filename - changed_when: false - - - name: Extract snapshot slot from filename - ansible.builtin.set_fact: - snapshot_slot: "{{ snapshot_filename.stdout | regex_search('snapshot-([0-9]+)-', '\\1') | first }}" - - - name: Get current mainnet slot - ansible.builtin.uri: - url: "{{ mainnet_rpc }}" - method: POST - body_format: json - body: - jsonrpc: "2.0" - id: 1 - method: getSlot - params: - - commitment: finalized - return_content: true - register: mainnet_slot_response - - - name: Report snapshot freshness + - name: Report ansible.builtin.debug: msg: >- - Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }}, - {{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind. 
- - # ---- step 9: wait for validator log --------------------------------------- - - name: Wait for validator log file - ansible.builtin.command: > - kubectl exec -n {{ k8s_namespace }} - deployment/{{ deployment_name }} - -c agave-validator -- test -f /data/log/validator.log - register: log_file_check - retries: 30 - delay: 20 - until: log_file_check.rc == 0 - changed_when: false - - # ---- step 10: check RPC health -------------------------------------------- - - name: Check RPC health (non-blocking) - ansible.builtin.uri: - url: http://{{ inventory_hostname }}:8899/health - return_content: true - register: rpc_health - retries: 6 - delay: 30 - until: rpc_health.status == 200 - failed_when: false - - - name: Report final status - ansible.builtin.debug: - msg: >- - Recovery complete. - Snapshot: slot {{ snapshot_slot }} - ({{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind). - Pod: {{ pod_status.stdout }}. - Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet' }}. - RPC: {{ rpc_health.content | default('not yet responding — still catching up') }}. + Recovery initiated. The container entrypoint will download a fresh + snapshot and start the validator. Monitor progress with: + scripts/check-status.py --watch diff --git a/scripts/check-status.py b/scripts/check-status.py new file mode 100755 index 00000000..ae0dc4b4 --- /dev/null +++ b/scripts/check-status.py @@ -0,0 +1,276 @@ +#!/usr/bin/env python3 +"""Check agave validator and snapshot download status on biscayne. 
+
+Runs kubectl and host commands over SSH to report:
+  - Pod phase and container states
+  - Entrypoint logs (snapshot download progress)
+  - Snapshot files on disk
+  - Validator slot vs mainnet slot (gap + catch-up rate)
+  - Ramdisk usage
+
+Usage:
+    scripts/check-status.py                # one-shot
+    scripts/check-status.py --watch        # repeat every 30s
+    scripts/check-status.py --watch -i 10  # repeat every 10s
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+import time
+import urllib.request
+
+# -- Config -------------------------------------------------------------------
+
+SSH_HOST = "biscayne.vaasl.io"
+KUBECONFIG = "/home/rix/.kube/config"
+NAMESPACE = "laconic-laconic-70ce4c4b47e23b85"
+DEPLOYMENT = "laconic-70ce4c4b47e23b85-deployment"
+KIND_CONTAINER = "laconic-70ce4c4b47e23b85-control-plane"
+SNAPSHOT_DIR = "/srv/kind/solana/snapshots"
+RAMDISK = "/srv/kind/solana/ramdisk"
+MAINNET_RPC = "https://api.mainnet-beta.solana.com"
+
+
+# -- Helpers ------------------------------------------------------------------
+
+
+def ssh(cmd: str, timeout: int = 15) -> tuple[int, str]:
+    """Run a command on SSH_HOST via SSH and capture its stdout.
+
+    Args:
+        cmd: Command line passed to ssh as a single string.
+        timeout: Seconds to wait before abandoning the call.
+
+    Returns:
+        ``(rc, stdout)`` with stdout stripped. A call that times out is
+        reported as ``(124, "")`` and a failure to launch the ssh client as
+        ``(127, "")``, so callers that test ``rc != 0`` degrade gracefully
+        instead of the whole script dying on one slow check.
+    """
+    try:
+        r = subprocess.run(
+            ["ssh", SSH_HOST, cmd],
+            capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        # Same exit status timeout(1) uses for an expired command.
+        return 124, ""
+    except OSError:
+        # ssh binary missing or not executable.
+        return 127, ""
+    return r.returncode, r.stdout.strip()
+
+
+def kubectl(args: str, timeout: int = 15) -> tuple[int, str]:
+    """Run kubectl on SSH_HOST with KUBECONFIG set.
+
+    Args:
+        args: Everything after ``kubectl``; appended verbatim, no quoting
+            is applied here.
+        timeout: Seconds to wait, forwarded to ssh().
+
+    Returns:
+        The ``(rc, stdout)`` pair produced by ssh().
+    """
+    return ssh(f"KUBECONFIG={KUBECONFIG} kubectl {args}", timeout)
+
+
+def get_mainnet_slot() -> int | None:
+    """Query MAINNET_RPC for the current finalized slot.
+
+    Returns:
+        The ``result`` field of the getSlot response, or None when the
+        request or the decoding of its reply fails for any reason.
+    """
+    req = urllib.request.Request(
+        MAINNET_RPC,
+        data=json.dumps({
+            "jsonrpc": "2.0", "id": 1,
+            "method": "getSlot",
+            "params": [{"commitment": "finalized"}],
+        }).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            return json.loads(resp.read())["result"]
+    except Exception:
+        # Deliberately broad: this is a best-effort status probe and the
+        # caller renders None as "unreachable".
+        return None
+
+
+# -- Checks -------------------------------------------------------------------
+
+
+def check_pod() -> dict:
+    """Get pod phase and container statuses.
+
+    Returns:
+        ``{"phase": str, "containers": {name: {"ready", "state",
+        "restarts"}}}``. ``phase`` is ``"NoPod"`` when kubectl fails, prints
+        nothing, prints something that is not JSON, or lists no pods. Only
+        the first pod matching the ``app`` label is reported.
+    """
+    no_pod = {"phase": "NoPod", "containers": {}}
+    rc, out = kubectl(f"get pods -n {NAMESPACE} -l app={DEPLOYMENT} -o json")
+    if rc != 0 or not out:
+        return no_pod
+    try:
+        data = json.loads(out)
+    except json.JSONDecodeError:
+        # e.g. a login banner or warning mixed into stdout
+        return no_pod
+    if not data.get("items"):
+        return no_pod
+
+    status = data["items"][0].get("status", {})
+    containers = {}
+    for cs in status.get("containerStatuses", []):
+        # The first key of "state" names the state; fall back to "unknown"
+        # rather than raising if the mapping is missing or empty.
+        state_key, state = next(
+            iter(cs.get("state", {}).items()), ("unknown", {})
+        )
+        reason = state.get("reason", "")
+        containers[cs["name"]] = {
+            "ready": cs["ready"],
+            "state": f"{state_key}({reason})" if reason else state_key,
+            "restarts": cs["restartCount"],
+        }
+    return {"phase": status.get("phase", "Unknown"), "containers": containers}
+
+
+def check_entrypoint_logs(lines: int = 15) -> str:
+    """Get recent logs from the agave-validator container.
+
+    Args:
+        lines: Number of trailing log lines to request (``--tail``).
+
+    Returns:
+        The log text, or ``"(no logs)"`` when kubectl exits non-zero.
+    """
+    rc, out = kubectl(
+        f"logs -n {NAMESPACE} deployment/{DEPLOYMENT} "
+        f"-c agave-validator --tail={lines}",
+        timeout=20,
+    )
+    return out if rc == 0 else "(no logs)"
+
+
+def check_snapshots() -> list[dict]:
+    """List snapshot archives in SNAPSHOT_DIR with their sizes.
+
+    Returns:
+        One ``{"size", "name"}`` dict per ``*.tar.*`` file, in the order
+        printed by ``ls -lhS``. ``size`` is the size column as printed and
+        ``name`` is the basename. Empty when nothing matches.
+    """
+    # When the listing fails (no match), the echo emits a sentinel instead.
+    _, out = ssh(
+        f"ls -lhS {SNAPSHOT_DIR}/*.tar.* 2>/dev/null "
+        "|| echo 'NO_SNAPSHOTS'"
+    )
+    if "NO_SNAPSHOTS" in out:
+        return []
+
+    # Keep only lines with at least 9 whitespace-separated fields; field 4
+    # is taken as the size and the last field as the path.
+    rows = (line.split() for line in out.splitlines())
+    return [
+        {"size": parts[4], "name": parts[-1].split("/")[-1]}
+        for parts in rows
+        if len(parts) >= 9
+    ]
+
+
+def check_validator_slot() -> int | None:
+    """Query the validator's current processed slot via RPC.
+
+    Runs curl inside the agave-validator container against
+    http://localhost:8899.
+
+    Returns:
+        The ``result`` field of the getSlot response, or None when kubectl
+        fails, prints nothing, or the reply is not a JSON object with a
+        ``result`` key.
+    """
+    rc, out = kubectl(
+        f"exec -n {NAMESPACE} deployment/{DEPLOYMENT} "
+        "-c agave-validator -- "
+        "curl -s -X POST -H 'Content-Type: application/json' "
+        "-d '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"getSlot\","
+        "\"params\":[{\"commitment\":\"processed\"}]}' "
+        "http://localhost:8899",
+        timeout=10,
+    )
+    if rc != 0 or not out:
+        return None
+    try:
+        return json.loads(out)["result"]
+    except (json.JSONDecodeError, KeyError, TypeError):
+        # TypeError: valid JSON that is not an object (e.g. a bare list).
+        return None
+
+
+def check_ramdisk() -> str:
+    """Get ramdisk usage.
+
+    Returns:
+        ``"<col 2>/<col 1> (<col 4>)"`` taken from the last line of
+        ``df -h RAMDISK`` (used/size (use%) in the usual df layout),
+        the raw line if it has fewer than 5 columns, or ``"unknown"`` when
+        the command fails.
+    """
+    rc, out = ssh(f"df -h {RAMDISK} | tail -1")
+    if rc != 0:
+        return "unknown"
+    parts = out.split()
+    if len(parts) >= 5:
+        return f"{parts[2]}/{parts[1]} ({parts[4]})"
+    return out
+
+
+# -- Display ------------------------------------------------------------------
+
+
+# Sample remembered from the previous display() call that saw both slots.
+prev_slot: int | None = None
+prev_time: float | None = None
+prev_gap: int | None = None
+
+
+def _format_rate(gap: int, last_gap: int | None, dt: float | None) -> str:
+    """Describe how fast the gap to mainnet changed since the last sample.
+
+    Args:
+        gap: Current distance to mainnet in slots.
+        last_gap: Gap at the previous sample, or None on the first one.
+        dt: Seconds since the previous sample, or None on the first one.
+
+    Returns:
+        A suffix for the "Gap" line, or ``""`` when no rate can be
+        computed yet.
+    """
+    if last_gap is None or dt is None or dt <= 0:
+        return ""
+    # Net rate = our replay rate minus chain production, i.e. how quickly
+    # the gap itself shrinks. Dividing the gap by the raw replay rate would
+    # promise an ETA even while the validator is losing ground.
+    net_rate = (last_gap - gap) / dt
+    if net_rate > 0:
+        # Clamp so a validator already at/ahead of the finalized slot does
+        # not print a negative ETA.
+        eta_min = max(gap, 0) / net_rate / 60
+        return f" net {net_rate:+.1f} slots/s, ETA ~{eta_min:.0f}m"
+    return f" net {net_rate:+.1f} slots/s (falling behind)"
+
+
+def display(iteration: int = 0) -> None:
+    """Run all checks and print a status report to stdout.
+
+    Args:
+        iteration: Index of the current --watch pass. Not used by the
+            report; accepted so callers can pass their loop counter.
+
+    Updates the module-level prev_slot, prev_time and prev_gap whenever
+    both the validator and mainnet slots are known, so the next call can
+    report the net catch-up rate.
+    """
+    global prev_slot, prev_time, prev_gap
+
+    now = time.time()
+    ts = time.strftime("%H:%M:%S")
+
+    # Gather data
+    pod = check_pod()
+    mainnet = get_mainnet_slot()
+    snapshots = check_snapshots()
+    ramdisk = check_ramdisk()
+
+    print(f"\n{'=' * 60}")
+    print(f" Biscayne Agave Status — {ts}")
+    print(f"{'=' * 60}")
+
+    # Pod
+    print(f"\n Pod: {pod['phase']}")
+    for name, cs in pod["containers"].items():
+        ready = "✓" if cs["ready"] else "✗"
+        restarts = f" (restarts: {cs['restarts']})" if cs["restarts"] > 0 else ""
+        print(f" {ready} {name}: {cs['state']}{restarts}")
+
+    # Validator slot: only query RPC once the container reports ready
+    validator_slot = None
+    if pod["phase"] == "Running":
+        agave = pod["containers"].get("agave-validator", {})
+        if agave.get("ready"):
+            validator_slot = check_validator_slot()
+
+    if validator_slot is not None and mainnet is not None:
+        gap = mainnet - validator_slot
+        dt = None if prev_time is None else now - prev_time
+        rate = _format_rate(gap, prev_gap, dt)
+        prev_slot, prev_time, prev_gap = validator_slot, now, gap
+        print(f"\n Validator: slot {validator_slot:,}")
+        print(f" Mainnet: slot {mainnet:,}")
+        print(f" Gap: {gap:,} slots{rate}")
+    elif mainnet is not None:
+        print("\n Validator: not responding (downloading or starting)")
+        print(f" Mainnet: slot {mainnet:,}")
+    else:
+        print("\n Mainnet: unreachable")
+
+    # Snapshots
+    if snapshots:
+        print("\n Snapshots:")
+        for s in snapshots:
+            print(f" {s['size']:>6s} {s['name']}")
+    else:
+        print("\n Snapshots: none on disk")
+
+    # Ramdisk
+    print(f" Ramdisk: {ramdisk}")
+
+    # Entrypoint logs (only if validator not yet responding)
+    if validator_slot is None and pod["phase"] in ("Running", "Pending"):
+        logs = check_entrypoint_logs(10)
+        if logs and logs != "(no logs)":
+            print("\n Entrypoint logs (last 10 lines):")
+            for line in logs.splitlines():
+                print(f" {line}")
+
+    print()
+
+
+# -- Main ---------------------------------------------------------------------
+
+
+def main() -> int:
+    """Parse the command line and print status once or in a --watch loop.
+
+    Returns:
+        Process exit status: 0 on success or Ctrl-C, 1 when a one-shot run
+        fails because a check timed out or returned undecodable JSON.
+        An --interval below 1 exits via argparse's usage error.
+    """
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--watch", action="store_true", help="Repeat every interval")
+    p.add_argument("-i", "--interval", type=int, default=30,
+                   help="Watch interval in seconds (default: 30)")
+    args = p.parse_args()
+    if args.interval < 1:
+        # time.sleep() raises on negative values and 0 would poll the host
+        # in a tight loop.
+        p.error("--interval must be at least 1 second")
+
+    # Failures of a single check that should not be fatal in watch mode.
+    transient = (subprocess.TimeoutExpired, json.JSONDecodeError)
+    try:
+        if not args.watch:
+            display()
+            return 0
+        i = 0
+        while True:
+            try:
+                display(i)
+            except transient as exc:
+                # Recovery takes 60+ minutes; one bad poll must not end it.
+                print(f" Check failed ({exc}); retrying next interval")
+            i += 1
+            time.sleep(args.interval)
+    except transient as exc:
+        print(f"Check failed: {exc}", file=sys.stderr)
+        return 1
+    except KeyboardInterrupt:
+        print()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())