fix: recovery playbook is fire-and-forget, add check-status.py

The recovery playbook now exits after scaling to 1. The container entrypoint handles snapshot download (60+ min) and validator startup autonomously. Removed all polling/verification steps that would time out waiting. Added scripts/check-status.py for monitoring download progress, validator slot, gap to mainnet, catch-up rate, and ramdisk usage. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 06:39:25 +00:00 · 2026-03-09 06:39:25 +00:00 · 09728a719c
parent 3dc345ea7d
commit 09728a719c
2 changed files with 285 additions and 99 deletions
--- a/playbooks/biscayne-recover.yml
+++ b/playbooks/biscayne-recover.yml
@ -7,14 +7,14 @@
 #
 # Steps:
 #   1. Scale deployment to 0
-#   2. Wait for pods to terminate
+#   2. Wait for pods to terminate (io_uring safety check)
 #   3. Wipe accounts ramdisk
 #   4. Clean old snapshots
 #   5. Scale to 1 — container entrypoint downloads snapshot + starts validator
-#   6. Verify snapshot freshness
+#
-#   7. Wait for pod Running
+# The playbook exits after step 5. The container handles snapshot download
-#   8. Verify validator log
+# (60+ min) and validator startup autonomously. Monitor with:
-#   9. Check RPC health
+#   scripts/check-status.py --watch
 #
 # Usage:
 #   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
@ -31,10 +31,6 @@
    snapshot_dir: /srv/kind/solana/snapshots
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk
    # Mainnet RPC for slot comparison
    mainnet_rpc: https://api.mainnet-beta.solana.com
    # Maximum slots behind before snapshot is considered stale
    max_slot_lag: 20000
  tasks:
    # ---- step 1: scale to 0 ---------------------------------------------------
@ -109,95 +105,9 @@
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true
-    # ---- step 6: wait for pod running ------------------------------------------
+    - name: Report
    # The entrypoint downloads the snapshot before starting the validator.
    # The pod reaches Running immediately (entrypoint is PID 1), but the
    # validator log won't appear until download + startup completes.
    - name: Wait for pod to be running
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items[0].status.phase}'
      register: pod_status
      retries: 60
      delay: 10
      until: pod_status.stdout == "Running"
      changed_when: false
    # ---- step 7: wait for snapshot download to complete -----------------------
    # The entrypoint writes the snapshot to the PV. Wait for it to appear
    # on the host (zvol mount is shared).
    - name: Wait for snapshot file to appear
      ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1
      args:
        executable: /bin/bash
      register: snapshot_file
      retries: 180
      delay: 20
      until: snapshot_file.stdout != ""
      changed_when: false
    # ---- step 8: verify snapshot freshness ------------------------------------
    - name: Get snapshot filename
      ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
      args:
        executable: /bin/bash
      register: snapshot_filename
      changed_when: false
    - name: Extract snapshot slot from filename
      ansible.builtin.set_fact:
        snapshot_slot: "{{ snapshot_filename.stdout | regex_search('snapshot-([0-9]+)-', '\\1') | first }}"
    - name: Get current mainnet slot
      ansible.builtin.uri:
        url: "{{ mainnet_rpc }}"
        method: POST
        body_format: json
        body:
          jsonrpc: "2.0"
          id: 1
          method: getSlot
          params:
            - commitment: finalized
        return_content: true
      register: mainnet_slot_response
    - name: Report snapshot freshness
      ansible.builtin.debug:
        msg: >-
-          Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }},
+          Recovery initiated. The container entrypoint will download a fresh
-          {{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind.
+          snapshot and start the validator. Monitor progress with:
-
+          scripts/check-status.py --watch
    # ---- step 9: wait for validator log ---------------------------------------
    - name: Wait for validator log file
      ansible.builtin.command: >
        kubectl exec -n {{ k8s_namespace }}
        deployment/{{ deployment_name }}
        -c agave-validator -- test -f /data/log/validator.log
      register: log_file_check
      retries: 30
      delay: 20
      until: log_file_check.rc == 0
      changed_when: false
    # ---- step 10: check RPC health --------------------------------------------
    - name: Check RPC health (non-blocking)
      ansible.builtin.uri:
        url: http://{{ inventory_hostname }}:8899/health
        return_content: true
      register: rpc_health
      retries: 6
      delay: 30
      until: rpc_health.status == 200
      failed_when: false
    - name: Report final status
      ansible.builtin.debug:
        msg: >-
          Recovery complete.
          Snapshot: slot {{ snapshot_slot }}
          ({{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind).
          Pod: {{ pod_status.stdout }}.
          Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet' }}.
          RPC: {{ rpc_health.content | default('not yet responding — still catching up') }}.
--- a/scripts/check-status.py
+++ b/scripts/check-status.py
@ -0,0 +1,276 @@
 #!/usr/bin/env python3
 """Check agave validator and snapshot download status on biscayne.
 Runs kubectl and host commands over SSH to report:
  - Pod phase and container states
  - Entrypoint logs (snapshot download progress)
  - Snapshot files on disk
  - Validator slot vs mainnet slot (gap + catch-up rate)
  - Ramdisk usage
 Usage:
    scripts/check-status.py                  # one-shot
    scripts/check-status.py --watch          # repeat every 30s
    scripts/check-status.py --watch -i 10    # repeat every 10s
 """
 from __future__ import annotations
 import argparse
 import json
 import subprocess
 import sys
 import time
 import urllib.request
 # -- Config -------------------------------------------------------------------
 SSH_HOST = "biscayne.vaasl.io"
 KUBECONFIG = "/home/rix/.kube/config"
 NAMESPACE = "laconic-laconic-70ce4c4b47e23b85"
 DEPLOYMENT = "laconic-70ce4c4b47e23b85-deployment"
 KIND_CONTAINER = "laconic-70ce4c4b47e23b85-control-plane"
 SNAPSHOT_DIR = "/srv/kind/solana/snapshots"
 RAMDISK = "/srv/kind/solana/ramdisk"
 MAINNET_RPC = "https://api.mainnet-beta.solana.com"
 # -- Helpers ------------------------------------------------------------------
 def ssh(cmd: str, timeout: int = 15) -> tuple[int, str]:
    """Run a command on biscayne via SSH. Returns (rc, stdout)."""
    r = subprocess.run(
        ["ssh", SSH_HOST, cmd],
        capture_output=True, text=True, timeout=timeout,
    )
    return r.returncode, r.stdout.strip()
 def kubectl(args: str, timeout: int = 15) -> tuple[int, str]:
    """Run kubectl on biscayne."""
    return ssh(f"KUBECONFIG={KUBECONFIG} kubectl {args}", timeout)
 def get_mainnet_slot() -> int | None:
    """Query mainnet for current finalized slot."""
    req = urllib.request.Request(
        MAINNET_RPC,
        data=json.dumps({
            "jsonrpc": "2.0", "id": 1,
            "method": "getSlot",
            "params": [{"commitment": "finalized"}],
        }).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(req, timeout=10) as resp:
            return json.loads(resp.read())["result"]
    except Exception:
        return None
 # -- Checks -------------------------------------------------------------------
 def check_pod() -> dict:
    """Get pod phase and container statuses."""
    rc, out = kubectl(
        f"get pods -n {NAMESPACE} -l app={DEPLOYMENT} "
        "-o json"
    )
    if rc != 0 or not out:
        return {"phase": "NoPod", "containers": {}}
    data = json.loads(out)
    if not data.get("items"):
        return {"phase": "NoPod", "containers": {}}
    pod = data["items"][0]
    phase = pod["status"].get("phase", "Unknown")
    containers = {}
    for cs in pod["status"].get("containerStatuses", []):
        state_key = list(cs["state"].keys())[0]
        state = cs["state"][state_key]
        reason = state.get("reason", "")
        detail = f"{state_key}"
        if reason:
            detail += f"({reason})"
        containers[cs["name"]] = {
            "ready": cs["ready"],
            "state": detail,
            "restarts": cs["restartCount"],
        }
    return {"phase": phase, "containers": containers}
 def check_entrypoint_logs(lines: int = 15) -> str:
    """Get recent entrypoint logs from the agave-validator container."""
    rc, out = kubectl(
        f"logs -n {NAMESPACE} deployment/{DEPLOYMENT} "
        f"-c agave-validator --tail={lines}",
        timeout=20,
    )
    return out if rc == 0 else "(no logs)"
 def check_snapshots() -> list[dict]:
    """List snapshot files on disk with sizes."""
    rc, out = ssh(
        f"ls -lhS {SNAPSHOT_DIR}/*.tar.* 2>/dev/null "
        f"|| echo 'NO_SNAPSHOTS'"
    )
    if "NO_SNAPSHOTS" in out:
        return []
    files = []
    for line in out.splitlines():
        parts = line.split()
        if len(parts) >= 9:
            files.append({"size": parts[4], "name": parts[-1].split("/")[-1]})
    return files
 def check_validator_slot() -> int | None:
    """Query the validator's current processed slot via RPC."""
    rc, out = kubectl(
        f"exec -n {NAMESPACE} deployment/{DEPLOYMENT} "
        f"-c agave-validator -- "
        "curl -s -X POST -H 'Content-Type: application/json' "
        "-d '{\"jsonrpc\":\"2.0\",\"id\":1,\"method\":\"getSlot\","
        "\"params\":[{\"commitment\":\"processed\"}]}' "
        "http://localhost:8899",
        timeout=10,
    )
    if rc != 0 or not out:
        return None
    try:
        return json.loads(out)["result"]
    except (json.JSONDecodeError, KeyError):
        return None
 def check_ramdisk() -> str:
    """Get ramdisk usage."""
    rc, out = ssh(f"df -h {RAMDISK} | tail -1")
    if rc != 0:
        return "unknown"
    parts = out.split()
    if len(parts) >= 5:
        return f"{parts[2]}/{parts[1]} ({parts[4]})"
    return out
 # -- Display ------------------------------------------------------------------
 prev_slot: int | None = None
 prev_time: float | None = None
 def display(iteration: int = 0) -> None:
    """Run all checks and print status."""
    global prev_slot, prev_time
    now = time.time()
    ts = time.strftime("%H:%M:%S")
    # Gather data
    pod = check_pod()
    mainnet = get_mainnet_slot()
    snapshots = check_snapshots()
    ramdisk = check_ramdisk()
    print(f"\n{'=' * 60}")
    print(f"  Biscayne Agave Status — {ts}")
    print(f"{'=' * 60}")
    # Pod
    print(f"\n  Pod: {pod['phase']}")
    for name, cs in pod["containers"].items():
        ready = "✓" if cs["ready"] else "✗"
        restarts = f" (restarts: {cs['restarts']})" if cs["restarts"] > 0 else ""
        print(f"    {ready} {name}: {cs['state']}{restarts}")
    # Validator slot
    validator_slot = None
    if pod["phase"] == "Running":
        agave = pod["containers"].get("agave-validator", {})
        if agave.get("ready"):
            validator_slot = check_validator_slot()
    if validator_slot is not None and mainnet is not None:
        gap = mainnet - validator_slot
        rate = ""
        if prev_slot is not None and prev_time is not None:
            dt = now - prev_time
            if dt > 0:
                slots_gained = validator_slot - prev_slot
                # Net rate = our replay rate minus chain production
                net_rate = slots_gained / dt
                if net_rate > 0:
                    eta_sec = gap / net_rate
                    eta_min = eta_sec / 60
                    rate = f"  net {net_rate:+.1f} slots/s, ETA ~{eta_min:.0f}m"
                else:
                    rate = f"  net {net_rate:+.1f} slots/s (falling behind)"
        prev_slot = validator_slot
        prev_time = now
        print(f"\n  Validator: slot {validator_slot:,}")
        print(f"  Mainnet:   slot {mainnet:,}")
        print(f"  Gap:       {gap:,} slots{rate}")
    elif mainnet is not None:
        print(f"\n  Validator: not responding (downloading or starting)")
        print(f"  Mainnet:   slot {mainnet:,}")
    else:
        print(f"\n  Mainnet:   unreachable")
    # Snapshots
    if snapshots:
        print(f"\n  Snapshots:")
        for s in snapshots:
            print(f"    {s['size']:>6s}  {s['name']}")
    else:
        print(f"\n  Snapshots: none on disk")
    # Ramdisk
    print(f"  Ramdisk:   {ramdisk}")
    # Entrypoint logs (only if validator not yet responding)
    if validator_slot is None and pod["phase"] in ("Running", "Pending"):
        logs = check_entrypoint_logs(10)
        if logs and logs != "(no logs)":
            print(f"\n  Entrypoint logs (last 10 lines):")
            for line in logs.splitlines():
                print(f"    {line}")
    print()
 # -- Main ---------------------------------------------------------------------
 def main() -> int:
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("--watch", action="store_true", help="Repeat every interval")
    p.add_argument("-i", "--interval", type=int, default=30,
                   help="Watch interval in seconds (default: 30)")
    args = p.parse_args()
    try:
        if args.watch:
            i = 0
            while True:
                display(i)
                i += 1
                time.sleep(args.interval)
        else:
            display()
    except KeyboardInterrupt:
        print()
    return 0
 if __name__ == "__main__":
    sys.exit(main())