# stack-orchestrator/playbooks/biscayne-redeploy.yml
---
# Redeploy agave-stack on biscayne with aria2c snapshot pre-download
#
# The validator's built-in downloader fetches snapshots at ~18 MB/s (single
# connection). snapshot-download.py uses aria2c with 16 parallel connections to
# saturate available bandwidth, cutting 90+ min downloads to ~10 min.
#
# Flow:
# 1. [teardown] Delete k8s namespace (preserve kind cluster)
# 2. [wipe] Conditionally clear ledger / accounts / old snapshots
# 3. [deploy] laconic-so deployment start, then immediately scale to 0
# 4. [snapshot] Download snapshot via aria2c to host bind mount
# 5. [snapshot] Verify snapshot visible inside kind node
# 6. [deploy] Scale validator back to 1
# 7. [verify] Wait for pod Running, check logs + RPC health
#
# The validator cannot run during snapshot download — it would lock/use the
# snapshot files. laconic-so creates the cluster AND deploys the pod in one
# shot, so we scale to 0 immediately after deploy, download, then scale to 1.
#
# Usage:
# # Standard redeploy (download snapshot, preserve accounts + ledger)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml
#
# # Full wipe (accounts + ledger) — slow rebuild
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# -e wipe_accounts=true -e wipe_ledger=true
#
# # Skip snapshot download (use existing)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# -e skip_snapshot=true
#
# # Pass extra args to snapshot-download.py
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# -e 'snapshot_args=--version 2.2 --min-download-speed 50'
#
# # Snapshot only (no teardown/deploy)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# --tags snapshot
#
- name: Redeploy agave validator on biscayne
  hosts: all
  gather_facts: false
  environment:
    # All kubectl invocations below target the kind cluster's kubeconfig.
    KUBECONFIG: /home/rix/.kube/config
  vars:
    # laconic-so deployment layout
    deployment_dir: /srv/deployments/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    # Host-side data directories (bind-mounted into the kind node as /mnt/solana)
    snapshot_dir: /srv/solana/snapshots
    ledger_dir: /srv/solana/ledger
    accounts_dir: /srv/solana/ramdisk/accounts
    ramdisk_mount: /srv/solana/ramdisk
    ramdisk_device: /dev/ram0
    # aria2c-based snapshot fetcher; copied from the repo to the remote host
    snapshot_script_local: "{{ playbook_dir }}/../scripts/snapshot-download.py"
    snapshot_script: /tmp/snapshot-download.py
    # Flags — non-destructive by default; override with -e on the command line
    wipe_accounts: false
    wipe_ledger: false
    skip_snapshot: false
    snapshot_args: ""
  tasks:
# ---- teardown: graceful stop, then delete namespace ----------------------
#
# IMPORTANT: Scale to 0 first, wait for agave to exit cleanly.
# Deleting the namespace while agave is running causes io_uring/ZFS
# deadlock (unkillable D-state threads). See CLAUDE.md.
- name: Scale deployment to 0 (graceful stop)
command: >
kubectl scale deployment {{ deployment_name }}
-n {{ k8s_namespace }} --replicas=0
register: pre_teardown_scale
failed_when: false
tags: [teardown]
- name: Wait for agave to exit
command: >
kubectl get pods -n {{ k8s_namespace }}
-l app={{ deployment_name }}
-o jsonpath='{.items}'
register: pre_teardown_pods
retries: 60
delay: 5
until: pre_teardown_pods.stdout == "[]" or pre_teardown_pods.stdout == "" or pre_teardown_pods.rc != 0
failed_when: false
when: pre_teardown_scale.rc == 0
tags: [teardown]
- name: Delete deployment namespace
command: >
kubectl delete namespace {{ k8s_namespace }} --timeout=120s
register: ns_delete
failed_when: false
tags: [teardown]
- name: Wait for namespace to terminate
command: >
kubectl get namespace {{ k8s_namespace }}
-o jsonpath='{.status.phase}'
register: ns_status
retries: 30
delay: 5
until: ns_status.rc != 0
failed_when: false
when: ns_delete.rc == 0
tags: [teardown]
# ---- wipe: opt-in data cleanup ------------------------------------------
- name: Wipe ledger data
shell: rm -rf {{ ledger_dir }}/*
become: true
when: wipe_ledger | bool
tags: [wipe]
- name: Wipe accounts ramdisk (umount + mkfs.xfs + mount)
shell: |
mountpoint -q {{ ramdisk_mount }} && umount {{ ramdisk_mount }} || true
mkfs.xfs -f {{ ramdisk_device }}
mount {{ ramdisk_mount }}
mkdir -p {{ accounts_dir }}
chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
become: true
when: wipe_accounts | bool
tags: [wipe]
- name: Clean old snapshots (keep newest full + incremental)
shell: |
cd {{ snapshot_dir }} || exit 0
newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
if [ -n "$newest" ]; then
newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
find . -maxdepth 1 -name '*.tar.*' \
! -name "$newest" \
! -name "${newest_inc:-__none__}" \
-delete
fi
become: true
when: not skip_snapshot | bool
tags: [wipe]
# ---- preflight: verify ramdisk and mounts before deploy ------------------
- name: Verify ramdisk is mounted
command: mountpoint -q {{ ramdisk_mount }}
register: ramdisk_check
failed_when: ramdisk_check.rc != 0
changed_when: false
tags: [deploy, preflight]
- name: Verify ramdisk is xfs (not the underlying ZFS)
shell: df -T {{ ramdisk_mount }} | grep -q xfs
register: ramdisk_type
failed_when: ramdisk_type.rc != 0
changed_when: false
tags: [deploy, preflight]
- name: Verify ramdisk visible inside kind node
shell: >
docker exec {{ kind_cluster }}-control-plane
df -T /mnt/solana/ramdisk 2>/dev/null | grep -q xfs
register: kind_ramdisk_check
failed_when: kind_ramdisk_check.rc != 0
changed_when: false
tags: [deploy, preflight]
# ---- deploy: bring up cluster, scale to 0 immediately -------------------
- name: Verify kind-config.yml has unified mount root
command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
register: mount_root_check
failed_when: mount_root_check.stdout | int < 1
tags: [deploy]
- name: Start deployment (creates kind cluster + deploys pod)
command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
timeout: 1200
tags: [deploy]
- name: Wait for deployment to exist
command: >
kubectl get deployment {{ deployment_name }}
-n {{ k8s_namespace }}
-o jsonpath='{.metadata.name}'
register: deploy_exists
retries: 30
delay: 10
until: deploy_exists.rc == 0
tags: [deploy]
- name: Scale validator to 0 (stop before snapshot download)
command: >
kubectl scale deployment {{ deployment_name }}
-n {{ k8s_namespace }} --replicas=0
tags: [deploy]
- name: Wait for pods to terminate
command: >
kubectl get pods -n {{ k8s_namespace }}
-l app={{ deployment_name }}
-o jsonpath='{.items}'
register: pods_gone
retries: 30
delay: 5
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
failed_when: false
tags: [deploy]
# ---- snapshot: download via aria2c, verify in kind node ------------------
- name: Verify aria2c installed
command: which aria2c
changed_when: false
when: not skip_snapshot | bool
tags: [snapshot]
- name: Copy snapshot script to remote
copy:
src: "{{ snapshot_script_local }}"
dest: "{{ snapshot_script }}"
mode: "0755"
when: not skip_snapshot | bool
tags: [snapshot]
- name: Verify kind node mounts
command: >
docker exec {{ kind_cluster }}-control-plane
ls /mnt/solana/snapshots/
register: kind_mount_check
tags: [snapshot]
- name: Download snapshot via aria2c
shell: >
python3 {{ snapshot_script }}
-o {{ snapshot_dir }}
{{ snapshot_args }}
become: true
register: snapshot_result
when: not skip_snapshot | bool
timeout: 3600
tags: [snapshot]
- name: Show snapshot download result
debug:
msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
tags: [snapshot]
- name: Verify snapshot visible inside kind node
shell: >
docker exec {{ kind_cluster }}-control-plane
ls -lhS /mnt/solana/snapshots/*.tar.* 2>/dev/null | head -5
register: kind_snapshot_check
failed_when: kind_snapshot_check.stdout == ""
when: not skip_snapshot | bool
tags: [snapshot]
- name: Show snapshot files in kind node
debug:
msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}"
when: not skip_snapshot | bool
tags: [snapshot]
# ---- deploy (cont): scale validator back up with snapshot ----------------
- name: Scale validator to 1 (start with downloaded snapshot)
command: >
kubectl scale deployment {{ deployment_name }}
-n {{ k8s_namespace }} --replicas=1
tags: [deploy]
# ---- verify: confirm validator is running --------------------------------
- name: Wait for pod to be running
command: >
kubectl get pods -n {{ k8s_namespace }}
-o jsonpath='{.items[0].status.phase}'
register: pod_status
retries: 60
delay: 10
until: pod_status.stdout == "Running"
tags: [verify]
- name: Verify unified mount inside kind node
command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/"
register: mount_check
tags: [verify]
- name: Show mount contents
debug:
msg: "{{ mount_check.stdout_lines }}"
tags: [verify]
- name: Check validator log file is being written
command: >
kubectl exec -n {{ k8s_namespace }}
deployment/{{ deployment_name }}
-c agave-validator -- test -f /data/log/validator.log
retries: 12
delay: 10
until: log_file_check.rc == 0
register: log_file_check
failed_when: false
tags: [verify]
- name: Check RPC health
uri:
url: http://127.0.0.1:8899/health
return_content: true
register: rpc_health
retries: 6
delay: 10
until: rpc_health.status == 200
failed_when: false
delegate_to: "{{ inventory_hostname }}"
tags: [verify]
- name: Report status
debug:
msg: >-
Deployment complete.
Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
RPC: {{ rpc_health.content | default('not responding') }}.
Wiped: ledger={{ wipe_ledger }}, accounts={{ wipe_accounts }}.
tags: [verify]