fix: recovery playbook is fire-and-forget, add check-status.py

The recovery playbook now exits after scaling to 1. The container entrypoint handles snapshot download (60+ min) and validator startup autonomously. Removed all polling/verification steps that would time out waiting. Added scripts/check-status.py for monitoring download progress, validator slot, gap to mainnet, catch-up rate, and ramdisk usage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 06:39:25 +00:00 · 2026-03-09 06:39:25 +00:00 · 09728a719c
parent 3dc345ea7d
commit 09728a719c
2 changed files with 285 additions and 99 deletions
--- a/playbooks/biscayne-recover.yml
+++ b/playbooks/biscayne-recover.yml
@ -7,14 +7,14 @@
 #
 # Steps:
 #   1. Scale deployment to 0
-#   2. Wait for pods to terminate
+#   2. Wait for pods to terminate (io_uring safety check)
 #   3. Wipe accounts ramdisk
 #   4. Clean old snapshots
 #   5. Scale to 1 — container entrypoint downloads snapshot + starts validator
-#   6. Verify snapshot freshness
-#   7. Wait for pod Running
-#   8. Verify validator log
-#   9. Check RPC health
+#
+# The playbook exits after step 5. The container handles snapshot download
+# (60+ min) and validator startup autonomously. Monitor with:
+#   scripts/check-status.py --watch
 #
 # Usage:
 #   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
@ -31,10 +31,6 @@
    snapshot_dir: /srv/kind/solana/snapshots
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk
-    # Mainnet RPC for slot comparison
-    mainnet_rpc: https://api.mainnet-beta.solana.com
-    # Maximum slots behind before snapshot is considered stale
-    max_slot_lag: 20000

  tasks:
    # ---- step 1: scale to 0 ---------------------------------------------------
@ -109,95 +105,9 @@
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true

-    # ---- step 6: wait for pod running ------------------------------------------
-    # The entrypoint downloads the snapshot before starting the validator.
-    # The pod reaches Running immediately (entrypoint is PID 1), but the
-    # validator log won't appear until download + startup completes.
-    - name: Wait for pod to be running
-      ansible.builtin.command: >
-        kubectl get pods -n {{ k8s_namespace }}
-        -l app={{ deployment_name }}
-        -o jsonpath='{.items[0].status.phase}'
-      register: pod_status
-      retries: 60
-      delay: 10
-      until: pod_status.stdout == "Running"
-      changed_when: false
-
-    # ---- step 7: wait for snapshot download to complete -----------------------
-    # The entrypoint writes the snapshot to the PV. Wait for it to appear
-    # on the host (zvol mount is shared).
-    - name: Wait for snapshot file to appear
-      ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1
-      args:
-        executable: /bin/bash
-      register: snapshot_file
-      retries: 180
-      delay: 20
-      until: snapshot_file.stdout != ""
-      changed_when: false
-
-    # ---- step 8: verify snapshot freshness ------------------------------------
-    - name: Get snapshot filename
-      ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
-      args:
-        executable: /bin/bash
-      register: snapshot_filename
-      changed_when: false
-
-    - name: Extract snapshot slot from filename
-      ansible.builtin.set_fact:
-        snapshot_slot: "{{ snapshot_filename.stdout | regex_search('snapshot-([0-9]+)-', '\\1') | first }}"
-
-    - name: Get current mainnet slot
-      ansible.builtin.uri:
-        url: "{{ mainnet_rpc }}"
-        method: POST
-        body_format: json
-        body:
-          jsonrpc: "2.0"
-          id: 1
-          method: getSlot
-          params:
-            - commitment: finalized
-        return_content: true
-      register: mainnet_slot_response
-
-    - name: Report snapshot freshness
+    - name: Report
      ansible.builtin.debug:
        msg: >-
-          Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }},
-          {{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind.
-
-    # ---- step 9: wait for validator log ---------------------------------------
-    - name: Wait for validator log file
-      ansible.builtin.command: >
-        kubectl exec -n {{ k8s_namespace }}
-        deployment/{{ deployment_name }}
-        -c agave-validator -- test -f /data/log/validator.log
-      register: log_file_check
-      retries: 30
-      delay: 20
-      until: log_file_check.rc == 0
-      changed_when: false
-
-    # ---- step 10: check RPC health --------------------------------------------
-    - name: Check RPC health (non-blocking)
-      ansible.builtin.uri:
-        url: http://{{ inventory_hostname }}:8899/health
-        return_content: true
-      register: rpc_health
-      retries: 6
-      delay: 30
-      until: rpc_health.status == 200
-      failed_when: false
-
-    - name: Report final status
-      ansible.builtin.debug:
-        msg: >-
-          Recovery complete.
-          Snapshot: slot {{ snapshot_slot }}
-          ({{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind).
-          Pod: {{ pod_status.stdout }}.
-          Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet' }}.
-          RPC: {{ rpc_health.content | default('not yet responding — still catching up') }}.
+          Recovery initiated. The container entrypoint will download a fresh
+          snapshot and start the validator. Monitor progress with:
+          scripts/check-status.py --watch
--- a/scripts/check-status.py
+++ b/scripts/check-status.py
@ -0,0 +1,276 @@
+#!/usr/bin/env python3
+"""Check agave validator and snapshot download status on biscayne.
+
+Runs kubectl and host commands over SSH to report:
+  - Pod phase and container states
+  - Entrypoint logs (snapshot download progress)
+  - Snapshot files on disk
+  - Validator slot vs mainnet slot (gap + catch-up rate)
+  - Ramdisk usage
+
+Usage:
+    scripts/check-status.py                  # one-shot
+    scripts/check-status.py --watch          # repeat every 30s
+    scripts/check-status.py --watch -i 10    # repeat every 10s
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import subprocess
+import sys
+import time
+import urllib.request
+
+# -- Config -------------------------------------------------------------------
+
+# Host that every remote command is run on (`ssh SSH_HOST <cmd>`).
+SSH_HOST = "biscayne.vaasl.io"
+# Exported as KUBECONFIG in front of every remote kubectl invocation.
+KUBECONFIG = "/home/rix/.kube/config"
+# Namespace passed to kubectl via -n.
+NAMESPACE = "laconic-laconic-70ce4c4b47e23b85"
+# Used both as the `app=` label selector value and as the deployment/<name>
+# target for kubectl logs/exec.
+DEPLOYMENT = "laconic-70ce4c4b47e23b85-deployment"
+# NOTE(review): not referenced by any function in this script — confirm
+# whether it is still needed.
+KIND_CONTAINER = "laconic-70ce4c4b47e23b85-control-plane"
+# Directory on SSH_HOST whose *.tar.* files are listed as snapshots.
+SNAPSHOT_DIR = "/srv/kind/solana/snapshots"
+# Path handed to `df -h` to report ramdisk usage.
+RAMDISK = "/srv/kind/solana/ramdisk"
+# JSON-RPC endpoint queried for the finalized mainnet slot.
+MAINNET_RPC = "https://api.mainnet-beta.solana.com"
+
+
+# -- Helpers ------------------------------------------------------------------
+
+
+def ssh(cmd: str, timeout: int = 15) -> tuple[int, str]:
+    """Run a command on biscayne via SSH.
+
+    Args:
+        cmd: Command line passed to ssh for execution on SSH_HOST.
+        timeout: Seconds to wait for the local ssh process before giving up.
+
+    Returns:
+        ``(rc, stdout)`` with stdout stripped; stderr is captured but
+        discarded. A timeout yields ``(124, "")`` and a failure to launch
+        ssh yields ``(127, "")``, so callers' existing ``rc != 0`` checks
+        treat both as ordinary command failures.
+    """
+    try:
+        r = subprocess.run(
+            ["ssh", SSH_HOST, cmd],
+            capture_output=True, text=True, timeout=timeout,
+        )
+    except subprocess.TimeoutExpired:
+        # Without this, a single slow SSH round trip propagates out of
+        # display() and terminates the --watch loop with a traceback.
+        return 124, ""
+    except OSError:
+        # ssh binary missing or not executable on the local machine.
+        return 127, ""
+    return r.returncode, r.stdout.strip()
+
+
+def kubectl(args: str, timeout: int = 15) -> tuple[int, str]:
+    """Invoke kubectl on biscayne with KUBECONFIG set.
+
+    Returns whatever ssh() returns for the composed command: (rc, stdout).
+    """
+    remote_cmd = f"KUBECONFIG={KUBECONFIG} kubectl {args}"
+    return ssh(remote_cmd, timeout)
+
+
+def get_mainnet_slot() -> int | None:
+    """Ask MAINNET_RPC for the current finalized slot.
+
+    Returns the ``result`` field of the getSlot response, or None when the
+    request or the decoding fails for any reason (deliberately best-effort).
+    """
+    payload = {
+        "jsonrpc": "2.0", "id": 1,
+        "method": "getSlot",
+        "params": [{"commitment": "finalized"}],
+    }
+    request = urllib.request.Request(
+        MAINNET_RPC,
+        data=json.dumps(payload).encode(),
+        headers={"Content-Type": "application/json"},
+    )
+    try:
+        with urllib.request.urlopen(request, timeout=10) as response:
+            body = response.read()
+        return json.loads(body)["result"]
+    except Exception:
+        return None
+
+
+# -- Checks -------------------------------------------------------------------
+
+
+def check_pod() -> dict:
+    """Get pod phase and container statuses.
+
+    Queries the pods labelled ``app=DEPLOYMENT`` as JSON and summarises the
+    first one returned.
+
+    Returns:
+        ``{"phase": str, "containers": {name: {"ready", "state",
+        "restarts"}}}``. When kubectl fails, prints nothing, prints output
+        that is not a JSON object, or lists no pods, the phase is
+        ``"NoPod"`` and ``containers`` is empty.
+    """
+    rc, out = kubectl(
+        f"get pods -n {NAMESPACE} -l app={DEPLOYMENT} "
+        "-o json"
+    )
+    no_pod = {"phase": "NoPod", "containers": {}}
+    if rc != 0 or not out:
+        return no_pod
+
+    try:
+        data = json.loads(out)
+    except json.JSONDecodeError:
+        # e.g. a warning or banner printed on stdout instead of JSON
+        return no_pod
+    if not isinstance(data, dict) or not data.get("items"):
+        return no_pod
+
+    status = data["items"][0].get("status", {})
+    phase = status.get("phase", "Unknown")
+    containers = {}
+    for cs in status.get("containerStatuses", []):
+        state = cs.get("state") or {}
+        # The state mapping holds one key naming the state; tolerate it
+        # being empty rather than raising IndexError.
+        state_key = next(iter(state), "unknown")
+        reason = (state.get(state_key) or {}).get("reason", "")
+        detail = f"{state_key}({reason})" if reason else state_key
+        containers[cs["name"]] = {
+            "ready": cs.get("ready", False),
+            "state": detail,
+            "restarts": cs.get("restartCount", 0),
+        }
+    return {"phase": phase, "containers": containers}
+
+
+def check_entrypoint_logs(lines: int = 15) -> str:
+    """Fetch the last ``lines`` log lines of the agave-validator container.
+
+    Returns the log text, or the literal "(no logs)" when kubectl exits
+    non-zero.
+    """
+    log_cmd = (
+        f"logs -n {NAMESPACE} deployment/{DEPLOYMENT} "
+        f"-c agave-validator --tail={lines}"
+    )
+    rc, out = kubectl(log_cmd, timeout=20)
+    if rc != 0:
+        return "(no logs)"
+    return out
+
+
+def check_snapshots() -> list[dict]:
+    """Return the *.tar.* files in SNAPSHOT_DIR as {"size", "name"} dicts.
+
+    Parses `ls -lhS` output, so entries arrive in the order ls prints them.
+    An empty list means the remote command emitted the NO_SNAPSHOTS marker
+    or produced no parseable rows.
+    """
+    listing_cmd = (
+        f"ls -lhS {SNAPSHOT_DIR}/*.tar.* 2>/dev/null "
+        f"|| echo 'NO_SNAPSHOTS'"
+    )
+    _, listing = ssh(listing_cmd)
+    if "NO_SNAPSHOTS" in listing:
+        return []
+
+    entries = []
+    for row in listing.splitlines():
+        fields = row.split()
+        if len(fields) < 9:
+            # not a full `ls -l` row
+            continue
+        entries.append({"size": fields[4], "name": fields[-1].split("/")[-1]})
+    return entries
+
+
+def check_validator_slot() -> int | None:
+    """Query the validator's current processed slot via RPC.
+
+    Runs curl inside the agave-validator container against
+    http://localhost:8899 and decodes the getSlot response.
+
+    Returns:
+        The integer slot, or None when the exec/curl call fails, prints
+        nothing, or the reply is not a JSON object with an integer
+        ``result``.
+    """
+    rc, out = kubectl(
+        f"exec -n {NAMESPACE} deployment/{DEPLOYMENT} "
+        f"-c agave-validator -- "
+        # --max-time bounds curl itself so a hung RPC ends with a non-zero
+        # exit instead of running until the local SSH timeout fires.
+        "curl -s --max-time 5 -X POST -H 'Content-Type: application/json' "
+        "-d '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"getSlot\","
+        "\"params\":[{\"commitment\":\"processed\"}]}' "
+        "http://localhost:8899",
+        timeout=10,
+    )
+    if rc != 0 or not out:
+        return None
+    try:
+        slot = json.loads(out)["result"]
+    except (json.JSONDecodeError, KeyError, TypeError):
+        # TypeError: valid JSON whose top level is not an object
+        return None
+    # Callers do arithmetic on the value; reject anything that is not an int.
+    return slot if isinstance(slot, int) else None
+
+
+def check_ramdisk() -> str:
+    """Get ramdisk usage.
+
+    Returns:
+        "<used>/<size> (<use%>)" taken from the last line of `df -h RAMDISK`,
+        the raw line if it has fewer than five fields, or "unknown" when the
+        command fails or prints nothing.
+    """
+    rc, out = ssh(f"df -h {RAMDISK} | tail -1")
+    # The exit status is normally that of `tail` (last stage of the pipe), so
+    # a failing df shows up as rc 0 with empty stdout — check for that too.
+    if rc != 0 or not out:
+        return "unknown"
+    parts = out.split()
+    if len(parts) >= 5:
+        return f"{parts[2]}/{parts[1]} ({parts[4]})"
+    return out
+
+
+# -- Display ------------------------------------------------------------------
+
+
+# State carried between display() calls in --watch mode. All three are
+# updated together, only when both validator and mainnet slots were obtained.
+prev_slot: int | None = None
+prev_time: float | None = None
+# Gap (mainnet - validator) from the previous successful sample; the change
+# in this value over time is the real catch-up rate.
+prev_gap: int | None = None
+
+
+def display(iteration: int = 0) -> None:
+    """Run all checks and print one status report to stdout.
+
+    The report covers pod/container state, validator vs mainnet slot, the
+    snapshot files on disk, ramdisk usage and, while the validator is not
+    answering RPC, the tail of the container log.
+
+    The catch-up rate is the rate at which the gap to mainnet shrank since
+    the previous successful sample (validator progress minus chain progress
+    over the same interval). The ETA is the current gap divided by that rate.
+
+    Args:
+        iteration: Accepted for the watch loop's counter; not used.
+    """
+    global prev_slot, prev_time, prev_gap
+
+    now = time.time()
+    ts = time.strftime("%H:%M:%S")
+
+    # Gather data
+    pod = check_pod()
+    mainnet = get_mainnet_slot()
+    snapshots = check_snapshots()
+    ramdisk = check_ramdisk()
+
+    print(f"\n{'=' * 60}")
+    print(f"  Biscayne Agave Status — {ts}")
+    print(f"{'=' * 60}")
+
+    # Pod
+    print(f"\n  Pod: {pod['phase']}")
+    for name, cs in pod["containers"].items():
+        ready = "✓" if cs["ready"] else "✗"
+        restarts = f" (restarts: {cs['restarts']})" if cs["restarts"] > 0 else ""
+        print(f"    {ready} {name}: {cs['state']}{restarts}")
+
+    # Validator slot: only queried once the agave-validator container is ready
+    validator_slot = None
+    if pod["phase"] == "Running":
+        agave = pod["containers"].get("agave-validator", {})
+        if agave.get("ready"):
+            validator_slot = check_validator_slot()
+
+    if validator_slot is not None and mainnet is not None:
+        gap = mainnet - validator_slot
+        rate = ""
+        if gap <= 0:
+            # Validator reports a processed slot while mainnet is sampled at
+            # finalized commitment, so a synced node shows a gap <= 0.
+            rate = "  (at or past finalized mainnet slot)"
+        elif prev_gap is not None and prev_time is not None:
+            dt = now - prev_time
+            if dt > 0:
+                # Net rate = how fast the gap shrinks, i.e. our replay rate
+                # minus chain production over the same interval.
+                net_rate = (prev_gap - gap) / dt
+                if net_rate > 0:
+                    eta_sec = gap / net_rate
+                    eta_min = eta_sec / 60
+                    rate = f"  net {net_rate:+.1f} slots/s, ETA ~{eta_min:.0f}m"
+                else:
+                    rate = f"  net {net_rate:+.1f} slots/s (falling behind)"
+        prev_slot = validator_slot
+        prev_time = now
+        prev_gap = gap
+        print(f"\n  Validator: slot {validator_slot:,}")
+        print(f"  Mainnet:   slot {mainnet:,}")
+        print(f"  Gap:       {gap:,} slots{rate}")
+    elif mainnet is not None:
+        print("\n  Validator: not responding (downloading or starting)")
+        print(f"  Mainnet:   slot {mainnet:,}")
+    else:
+        print("\n  Mainnet:   unreachable")
+
+    # Snapshots
+    if snapshots:
+        print("\n  Snapshots:")
+        for s in snapshots:
+            print(f"    {s['size']:>6s}  {s['name']}")
+    else:
+        print("\n  Snapshots: none on disk")
+
+    # Ramdisk
+    print(f"  Ramdisk:   {ramdisk}")
+
+    # Entrypoint logs (only if validator not yet responding)
+    if validator_slot is None and pod["phase"] in ("Running", "Pending"):
+        logs = check_entrypoint_logs(10)
+        if logs and logs != "(no logs)":
+            print("\n  Entrypoint logs (last 10 lines):")
+            for line in logs.splitlines():
+                print(f"    {line}")
+
+    print()
+
+
+# -- Main ---------------------------------------------------------------------
+
+
+def main() -> int:
+    """Parse CLI arguments and print the status once or in a watch loop.
+
+    Returns:
+        0 on normal completion, including when the watch loop is stopped
+        with Ctrl-C. An invalid --interval in watch mode exits through
+        argparse's usage error.
+    """
+    p = argparse.ArgumentParser(description=__doc__,
+                                formatter_class=argparse.RawDescriptionHelpFormatter)
+    p.add_argument("--watch", action="store_true", help="Repeat every interval")
+    p.add_argument("-i", "--interval", type=int, default=30,
+                   help="Watch interval in seconds (default: 30)")
+    args = p.parse_args()
+
+    if args.watch and args.interval < 1:
+        # time.sleep() raises ValueError for negative values, and 0 would
+        # issue SSH/kubectl calls back to back with no pause.
+        p.error("--interval must be at least 1 second")
+
+    try:
+        if args.watch:
+            i = 0
+            while True:
+                display(i)
+                i += 1
+                time.sleep(args.interval)
+        else:
+            display()
+    except KeyboardInterrupt:
+        # Ctrl-C is the normal way to leave watch mode; end the line cleanly.
+        print()
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())