fix: recovery playbook is fire-and-forget, add check-status.py
The recovery playbook now exits after scaling to 1. The container entrypoint handles snapshot download (60+ min) and validator startup autonomously. Removed all polling/verification steps that would time out waiting. Added scripts/check-status.py for monitoring download progress, validator slot, gap to mainnet, catch-up rate, and ramdisk usage.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

fix/kind-mount-propagation
parent
3dc345ea7d
commit
09728a719c
|
|
@ -7,14 +7,14 @@
|
||||||
#
|
#
|
||||||
# Steps:
|
# Steps:
|
||||||
# 1. Scale deployment to 0
|
# 1. Scale deployment to 0
|
||||||
# 2. Wait for pods to terminate
|
# 2. Wait for pods to terminate (io_uring safety check)
|
||||||
# 3. Wipe accounts ramdisk
|
# 3. Wipe accounts ramdisk
|
||||||
# 4. Clean old snapshots
|
# 4. Clean old snapshots
|
||||||
# 5. Scale to 1 — container entrypoint downloads snapshot + starts validator
|
# 5. Scale to 1 — container entrypoint downloads snapshot + starts validator
|
||||||
# 6. Verify snapshot freshness
|
#
|
||||||
# 7. Wait for pod Running
|
# The playbook exits after step 5. The container handles snapshot download
|
||||||
# 8. Verify validator log
|
# (60+ min) and validator startup autonomously. Monitor with:
|
||||||
# 9. Check RPC health
|
# scripts/check-status.py --watch
|
||||||
#
|
#
|
||||||
# Usage:
|
# Usage:
|
||||||
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
|
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-recover.yml
|
||||||
|
|
@ -31,10 +31,6 @@
|
||||||
snapshot_dir: /srv/kind/solana/snapshots
|
snapshot_dir: /srv/kind/solana/snapshots
|
||||||
accounts_dir: /srv/kind/solana/ramdisk/accounts
|
accounts_dir: /srv/kind/solana/ramdisk/accounts
|
||||||
ramdisk_mount: /srv/kind/solana/ramdisk
|
ramdisk_mount: /srv/kind/solana/ramdisk
|
||||||
# Mainnet RPC for slot comparison
|
|
||||||
mainnet_rpc: https://api.mainnet-beta.solana.com
|
|
||||||
# Maximum slots behind before snapshot is considered stale
|
|
||||||
max_slot_lag: 20000
|
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
# ---- step 1: scale to 0 ---------------------------------------------------
|
# ---- step 1: scale to 0 ---------------------------------------------------
|
||||||
|
|
@ -109,95 +105,9 @@
|
||||||
-n {{ k8s_namespace }} --replicas=1
|
-n {{ k8s_namespace }} --replicas=1
|
||||||
changed_when: true
|
changed_when: true
|
||||||
|
|
||||||
# ---- step 6: wait for pod running ------------------------------------------
|
- name: Report
|
||||||
# The entrypoint downloads the snapshot before starting the validator.
|
|
||||||
# The pod reaches Running immediately (entrypoint is PID 1), but the
|
|
||||||
# validator log won't appear until download + startup completes.
|
|
||||||
- name: Wait for pod to be running
|
|
||||||
ansible.builtin.command: >
|
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
|
||||||
-l app={{ deployment_name }}
|
|
||||||
-o jsonpath='{.items[0].status.phase}'
|
|
||||||
register: pod_status
|
|
||||||
retries: 60
|
|
||||||
delay: 10
|
|
||||||
until: pod_status.stdout == "Running"
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# ---- step 7: wait for snapshot download to complete -----------------------
|
|
||||||
# The entrypoint writes the snapshot to the PV. Wait for it to appear
|
|
||||||
# on the host (zvol mount is shared).
|
|
||||||
- name: Wait for snapshot file to appear
|
|
||||||
ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* 2>/dev/null | head -1
|
|
||||||
args:
|
|
||||||
executable: /bin/bash
|
|
||||||
register: snapshot_file
|
|
||||||
retries: 180
|
|
||||||
delay: 20
|
|
||||||
until: snapshot_file.stdout != ""
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# ---- step 8: verify snapshot freshness ------------------------------------
|
|
||||||
- name: Get snapshot filename
|
|
||||||
ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
|
|
||||||
args:
|
|
||||||
executable: /bin/bash
|
|
||||||
register: snapshot_filename
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
- name: Extract snapshot slot from filename
|
|
||||||
ansible.builtin.set_fact:
|
|
||||||
snapshot_slot: "{{ snapshot_filename.stdout | regex_search('snapshot-([0-9]+)-', '\\1') | first }}"
|
|
||||||
|
|
||||||
- name: Get current mainnet slot
|
|
||||||
ansible.builtin.uri:
|
|
||||||
url: "{{ mainnet_rpc }}"
|
|
||||||
method: POST
|
|
||||||
body_format: json
|
|
||||||
body:
|
|
||||||
jsonrpc: "2.0"
|
|
||||||
id: 1
|
|
||||||
method: getSlot
|
|
||||||
params:
|
|
||||||
- commitment: finalized
|
|
||||||
return_content: true
|
|
||||||
register: mainnet_slot_response
|
|
||||||
|
|
||||||
- name: Report snapshot freshness
|
|
||||||
ansible.builtin.debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
Snapshot slot {{ snapshot_slot }}, mainnet {{ mainnet_slot_response.json.result }},
|
Recovery initiated. The container entrypoint will download a fresh
|
||||||
{{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind.
|
snapshot and start the validator. Monitor progress with:
|
||||||
|
scripts/check-status.py --watch
|
||||||
# ---- step 9: wait for validator log ---------------------------------------
|
|
||||||
- name: Wait for validator log file
|
|
||||||
ansible.builtin.command: >
|
|
||||||
kubectl exec -n {{ k8s_namespace }}
|
|
||||||
deployment/{{ deployment_name }}
|
|
||||||
-c agave-validator -- test -f /data/log/validator.log
|
|
||||||
register: log_file_check
|
|
||||||
retries: 30
|
|
||||||
delay: 20
|
|
||||||
until: log_file_check.rc == 0
|
|
||||||
changed_when: false
|
|
||||||
|
|
||||||
# ---- step 10: check RPC health --------------------------------------------
|
|
||||||
- name: Check RPC health (non-blocking)
|
|
||||||
ansible.builtin.uri:
|
|
||||||
url: http://{{ inventory_hostname }}:8899/health
|
|
||||||
return_content: true
|
|
||||||
register: rpc_health
|
|
||||||
retries: 6
|
|
||||||
delay: 30
|
|
||||||
until: rpc_health.status == 200
|
|
||||||
failed_when: false
|
|
||||||
|
|
||||||
- name: Report final status
|
|
||||||
ansible.builtin.debug:
|
|
||||||
msg: >-
|
|
||||||
Recovery complete.
|
|
||||||
Snapshot: slot {{ snapshot_slot }}
|
|
||||||
({{ mainnet_slot_response.json.result | int - snapshot_slot | int }} slots behind).
|
|
||||||
Pod: {{ pod_status.stdout }}.
|
|
||||||
Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet' }}.
|
|
||||||
RPC: {{ rpc_health.content | default('not yet responding — still catching up') }}.
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,276 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Check agave validator and snapshot download status on biscayne.
|
||||||
|
|
||||||
|
Runs kubectl and host commands over SSH to report:
|
||||||
|
- Pod phase and container states
|
||||||
|
- Entrypoint logs (snapshot download progress)
|
||||||
|
- Snapshot files on disk
|
||||||
|
- Validator slot vs mainnet slot (gap + catch-up rate)
|
||||||
|
- Ramdisk usage
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
scripts/check-status.py # one-shot
|
||||||
|
scripts/check-status.py --watch # repeat every 30s
|
||||||
|
scripts/check-status.py --watch -i 10 # repeat every 10s
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
import urllib.request
|
||||||
|
|
||||||
|
# -- Config -------------------------------------------------------------------
|
||||||
|
|
||||||
|
SSH_HOST = "biscayne.vaasl.io"
|
||||||
|
KUBECONFIG = "/home/rix/.kube/config"
|
||||||
|
NAMESPACE = "laconic-laconic-70ce4c4b47e23b85"
|
||||||
|
DEPLOYMENT = "laconic-70ce4c4b47e23b85-deployment"
|
||||||
|
KIND_CONTAINER = "laconic-70ce4c4b47e23b85-control-plane"
|
||||||
|
SNAPSHOT_DIR = "/srv/kind/solana/snapshots"
|
||||||
|
RAMDISK = "/srv/kind/solana/ramdisk"
|
||||||
|
MAINNET_RPC = "https://api.mainnet-beta.solana.com"
|
||||||
|
|
||||||
|
|
||||||
|
# -- Helpers ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def ssh(cmd: str, timeout: int = 15) -> tuple[int, str]:
    """Run a command on biscayne via SSH.

    Args:
        cmd: Command line passed to ``ssh`` as the remote command.
        timeout: Seconds to wait for the local ``ssh`` process to finish.

    Returns:
        ``(rc, stdout)`` with ``stdout`` stripped of surrounding whitespace.
        stderr is captured but discarded. If the command does not finish
        within ``timeout`` seconds, ``(124, "")`` is returned instead of
        raising, so one slow call cannot abort a long-running status loop.
    """
    try:
        r = subprocess.run(
            ["ssh", SSH_HOST, cmd],
            capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # 124 mirrors the exit status used by timeout(1). A non-zero rc with
        # empty stdout is the same shape callers get for any failed command.
        return 124, ""
    return r.returncode, r.stdout.strip()
|
||||||
|
|
||||||
|
|
||||||
|
def kubectl(args: str, timeout: int = 15) -> tuple[int, str]:
    """Invoke ``kubectl`` on biscayne through ``ssh``.

    ``args`` is appended verbatim after ``kubectl``, and the remote command
    is prefixed with a ``KUBECONFIG=...`` environment assignment. Returns
    the ``(rc, stdout)`` pair produced by ``ssh``.
    """
    remote_command = f"KUBECONFIG={KUBECONFIG} kubectl {args}"
    return ssh(remote_command, timeout)
|
||||||
|
|
||||||
|
|
||||||
|
def get_mainnet_slot() -> int | None:
    """Return mainnet's current finalized slot, or ``None`` on any failure.

    Posts a JSON-RPC ``getSlot`` request (commitment ``finalized``) to
    ``MAINNET_RPC`` with a 10 second timeout. Any exception raised while
    sending the request or decoding the reply is swallowed on purpose:
    this is a best-effort lookup and the caller only needs "slot or nothing".
    """
    payload = {
        "jsonrpc": "2.0", "id": 1,
        "method": "getSlot",
        "params": [{"commitment": "finalized"}],
    }
    request = urllib.request.Request(
        MAINNET_RPC,
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    try:
        with urllib.request.urlopen(request, timeout=10) as response:
            reply = json.loads(response.read())
        return reply["result"]
    except Exception:
        return None
|
||||||
|
|
||||||
|
|
||||||
|
# -- Checks -------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def check_pod() -> dict:
    """Get pod phase and container statuses.

    Returns:
        ``{"phase": str, "containers": {name: {"ready", "state",
        "restarts"}}}``. ``phase`` is ``"NoPod"`` (with no containers) when
        kubectl fails, prints nothing, prints something that is not valid
        JSON, or lists no pods. Only the first pod matched by the label
        selector is reported.
    """
    no_pod = {"phase": "NoPod", "containers": {}}

    rc, out = kubectl(
        f"get pods -n {NAMESPACE} -l app={DEPLOYMENT} "
        "-o json"
    )
    if rc != 0 or not out:
        return no_pod

    try:
        data = json.loads(out)
    except json.JSONDecodeError:
        # Truncated or non-JSON output is reported like a missing pod
        # rather than raising out of the status display.
        return no_pod
    if not data.get("items"):
        return no_pod

    status = data["items"][0]["status"]
    containers = {}
    for cs in status.get("containerStatuses", []):
        # The state mapping is keyed by the state name; tolerate it being
        # absent or empty instead of failing on the first-key lookup.
        states = cs.get("state") or {}
        state_key = next(iter(states), "unknown")
        reason = (states.get(state_key) or {}).get("reason", "")
        detail = f"{state_key}({reason})" if reason else f"{state_key}"
        containers[cs["name"]] = {
            "ready": cs["ready"],
            "state": detail,
            "restarts": cs["restartCount"],
        }
    return {"phase": status.get("phase", "Unknown"), "containers": containers}
|
||||||
|
|
||||||
|
|
||||||
|
def check_entrypoint_logs(lines: int = 15) -> str:
    """Return the last ``lines`` log lines of the agave-validator container.

    Runs ``kubectl logs`` against the deployment with a 20 second timeout.
    Returns the placeholder ``"(no logs)"`` when kubectl exits non-zero.
    """
    log_args = (
        f"logs -n {NAMESPACE} deployment/{DEPLOYMENT} "
        f"-c agave-validator --tail={lines}"
    )
    status, text = kubectl(log_args, timeout=20)
    if status != 0:
        return "(no logs)"
    return text
|
||||||
|
|
||||||
|
|
||||||
|
def check_snapshots() -> list[dict]:
    """List snapshot archives found in ``SNAPSHOT_DIR`` on the host.

    Parses ``ls -lhS`` output: each row with at least nine whitespace-
    separated fields becomes ``{"size": <5th field>, "name": <basename of
    last field>}``. Returns an empty list when the remote command emits the
    ``NO_SNAPSHOTS`` sentinel (i.e. ``ls`` failed, typically no match).
    """
    listing_cmd = (
        f"ls -lhS {SNAPSHOT_DIR}/*.tar.* 2>/dev/null "
        "|| echo 'NO_SNAPSHOTS'"
    )
    # The exit status is irrelevant here; the sentinel carries the outcome.
    _, listing = ssh(listing_cmd)
    if "NO_SNAPSHOTS" in listing:
        return []

    rows = (row.split() for row in listing.splitlines())
    return [
        {"size": cols[4], "name": cols[-1].split("/")[-1]}
        for cols in rows
        if len(cols) >= 9
    ]
|
||||||
|
|
||||||
|
|
||||||
|
def check_validator_slot() -> int | None:
    """Ask the validator for its current ``processed`` slot.

    Executes ``curl`` inside the agave-validator container (via
    ``kubectl exec``, 10 second timeout) against the RPC endpoint at
    ``http://localhost:8899``. Returns ``None`` when kubectl fails, prints
    nothing, or the reply is not JSON containing a ``result`` key.
    """
    rpc_body = (
        '{"jsonrpc":"2.0","id":1,"method":"getSlot",'
        '"params":[{"commitment":"processed"}]}'
    )
    exec_args = (
        f"exec -n {NAMESPACE} deployment/{DEPLOYMENT} "
        "-c agave-validator -- "
        "curl -s -X POST -H 'Content-Type: application/json' "
        f"-d '{rpc_body}' "
        "http://localhost:8899"
    )
    status, reply = kubectl(exec_args, timeout=10)
    if status != 0 or not reply:
        return None

    try:
        slot = json.loads(reply)["result"]
    except (json.JSONDecodeError, KeyError):
        return None
    return slot
|
||||||
|
|
||||||
|
|
||||||
|
def check_ramdisk() -> str:
    """Get ramdisk usage.

    Returns:
        ``"<field 3>/<field 2> (<field 5>)"`` built from the last line of
        ``df -h RAMDISK`` (used/size (use%) in the usual df column order),
        the raw line if it has fewer than five fields, or ``"unknown"`` when
        the command fails or prints nothing.
    """
    rc, out = ssh(f"df -h {RAMDISK} | tail -1")
    # The pipeline's exit status is tail's, so a failing df surfaces as
    # rc == 0 with empty output; treat that as unknown too.
    if rc != 0 or not out:
        return "unknown"
    parts = out.split()
    if len(parts) >= 5:
        return f"{parts[2]}/{parts[1]} ({parts[4]})"
    return out
|
||||||
|
|
||||||
|
|
||||||
|
# -- Display ------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
# State carried between display() calls so a catch-up rate can be derived
# from two consecutive samples.
prev_slot: int | None = None
prev_time: float | None = None
prev_gap: int | None = None


def _catchup_suffix(gap: int, previous_gap: int | None,
                    elapsed: float | None) -> str:
    """Return the rate/ETA suffix for the gap line ("" without a prior sample).

    The net rate is how fast the gap is closing, i.e. validator progress
    minus mainnet progress over the same interval. Dividing the remaining
    gap by that rate gives a meaningful ETA; dividing by the raw replay
    rate would ignore that mainnet keeps advancing.
    """
    if previous_gap is None or elapsed is None or elapsed <= 0:
        return ""
    net_rate = (previous_gap - gap) / elapsed
    if net_rate > 0:
        # Clamp so an already-closed (negative) gap never yields a negative ETA.
        eta_min = max(gap, 0) / net_rate / 60
        return f" net {net_rate:+.1f} slots/s, ETA ~{eta_min:.0f}m"
    return f" net {net_rate:+.1f} slots/s (falling behind)"


def display(iteration: int = 0) -> None:
    """Run all checks and print one status report to stdout.

    Args:
        iteration: Watch-loop counter supplied by the caller; currently
            unused by the report itself.

    Updates the module-level ``prev_slot`` / ``prev_time`` / ``prev_gap``
    whenever both the validator slot and the mainnet slot are available.
    """
    global prev_slot, prev_time, prev_gap

    now = time.time()
    ts = time.strftime("%H:%M:%S")

    # Gather data
    pod = check_pod()
    mainnet = get_mainnet_slot()
    snapshots = check_snapshots()
    ramdisk = check_ramdisk()

    print(f"\n{'=' * 60}")
    print(f" Biscayne Agave Status — {ts}")
    print(f"{'=' * 60}")

    # Pod
    print(f"\n Pod: {pod['phase']}")
    for name, cs in pod["containers"].items():
        ready = "✓" if cs["ready"] else "✗"
        restarts = f" (restarts: {cs['restarts']})" if cs["restarts"] > 0 else ""
        print(f" {ready} {name}: {cs['state']}{restarts}")

    # Validator slot: only queried once the validator container is ready.
    validator_slot = None
    if pod["phase"] == "Running":
        agave = pod["containers"].get("agave-validator", {})
        if agave.get("ready"):
            validator_slot = check_validator_slot()

    if validator_slot is not None and mainnet is not None:
        gap = mainnet - validator_slot
        elapsed = None if prev_time is None else now - prev_time
        rate = _catchup_suffix(gap, prev_gap, elapsed)
        prev_slot, prev_time, prev_gap = validator_slot, now, gap
        print(f"\n Validator: slot {validator_slot:,}")
        print(f" Mainnet: slot {mainnet:,}")
        print(f" Gap: {gap:,} slots{rate}")
    elif mainnet is not None:
        print("\n Validator: not responding (downloading or starting)")
        print(f" Mainnet: slot {mainnet:,}")
    else:
        print("\n Mainnet: unreachable")

    # Snapshots
    if snapshots:
        print("\n Snapshots:")
        for s in snapshots:
            print(f" {s['size']:>6s} {s['name']}")
    else:
        print("\n Snapshots: none on disk")

    # Ramdisk
    print(f" Ramdisk: {ramdisk}")

    # Entrypoint logs (only if validator not yet responding)
    if validator_slot is None and pod["phase"] in ("Running", "Pending"):
        logs = check_entrypoint_logs(10)
        if logs and logs != "(no logs)":
            print("\n Entrypoint logs (last 10 lines):")
            for line in logs.splitlines():
                print(f" {line}")

    print()
|
||||||
|
|
||||||
|
|
||||||
|
# -- Main ---------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Parse command-line options and print status once or in a loop.

    Without ``--watch`` a single report is printed. With ``--watch`` the
    report repeats forever, sleeping ``--interval`` seconds between runs,
    until interrupted. Ctrl-C is handled quietly. Always returns 0.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("--watch", action="store_true", help="Repeat every interval")
    parser.add_argument(
        "-i", "--interval", type=int, default=30,
        help="Watch interval in seconds (default: 30)",
    )
    opts = parser.parse_args()

    try:
        if not opts.watch:
            display()
        else:
            count = 0
            while True:
                display(count)
                count += 1
                time.sleep(opts.interval)
    except KeyboardInterrupt:
        # Finish the interrupted terminal line.
        print()
    return 0
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
Loading…
Reference in New Issue