---
# Redeploy agave-stack on biscayne with aria2c snapshot pre-download
#
# The validator's built-in downloader fetches snapshots at ~18 MB/s (single
# connection). snapshot-download.py uses aria2c with 16 parallel connections to
# saturate available bandwidth, cutting 90+ min downloads to ~10 min.
#
# Flow:
#   1. [teardown] Delete k8s namespace (preserve kind cluster)
#   2. [wipe] Conditionally clear ledger / accounts / old snapshots
#   3. [deploy] laconic-so deployment start, then immediately scale to 0
#   4. [snapshot] Download snapshot via aria2c to host bind mount
#   5. [snapshot] Verify snapshot visible inside kind node
#   6. [deploy,scale-up] Scale validator back to 1
#   7. [verify] Wait for pod Running, check logs + RPC health
#
# The validator cannot run during snapshot download — it would lock/use the
# snapshot files. laconic-so creates the cluster AND deploys the pod in one
# shot, so we scale to 0 immediately after deploy, download, then scale to 1.
#
# Usage:
#   # Standard redeploy (download snapshot, preserve accounts + ledger)
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml
#
#   # Full wipe (accounts + ledger) — slow rebuild
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
#     -e wipe_accounts=true -e wipe_ledger=true
#
#   # Skip snapshot download (use existing)
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
#     -e skip_snapshot=true
#
#   # Pass extra args to snapshot-download.py
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
#     -e 'snapshot_args=--version 2.2 --min-download-speed 50'
#
#   # Snapshot only (no teardown/deploy)
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
#     --tags snapshot
#
#   # Resume after partial failure (download snapshot, scale up, verify)
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
#     --tags snapshot,scale-up,verify
#
- name: Redeploy agave validator on biscayne
  hosts: all
  gather_facts: false
  environment:
    # All kubectl invocations below target the kind cluster via this kubeconfig.
    KUBECONFIG: /home/rix/.kube/config

  vars:
    # laconic-so deployment directory and stack sources
    deployment_dir: /srv/deployments/agave
    stack_repo: /srv/deployments/agave-stack
    stack_path: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    laconic_so_repo: /home/rix/stack-orchestrator
    laconic_so_branch: main

    # kind cluster / k8s naming — namespace and deployment derive from cluster id
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"

    # Host-side data directories (bind-mounted into the kind node)
    snapshot_dir: /srv/solana/snapshots
    ledger_dir: /srv/solana/ledger
    accounts_dir: /srv/solana/ramdisk/accounts
    ramdisk_mount: /srv/solana/ramdisk
    ramdisk_device: /dev/ram0

    # aria2c snapshot downloader: copied from the repo to the remote host
    snapshot_script_local: "{{ playbook_dir }}/../scripts/snapshot-download.py"
    snapshot_script: /tmp/snapshot-download.py

    # Flags — non-destructive by default
    wipe_accounts: false
    wipe_ledger: false
    skip_snapshot: false
    snapshot_args: ""

  tasks:
|
# ---- teardown: graceful stop, then delete namespace ----------------------
|
|
#
|
|
# IMPORTANT: Scale to 0 first, wait for agave to exit cleanly.
|
|
# Deleting the namespace while agave is running causes io_uring/ZFS
|
|
# deadlock (unkillable D-state threads). See CLAUDE.md.
|
|
- name: Scale deployment to 0 (graceful stop)
|
|
ansible.builtin.command: >
|
|
kubectl scale deployment {{ deployment_name }}
|
|
-n {{ k8s_namespace }} --replicas=0
|
|
register: pre_teardown_scale
|
|
changed_when: pre_teardown_scale.rc == 0
|
|
failed_when: false
|
|
tags: [teardown]
|
|
|
|
- name: Wait for agave to exit
|
|
ansible.builtin.command: >
|
|
kubectl get pods -n {{ k8s_namespace }}
|
|
-l app={{ deployment_name }}
|
|
-o jsonpath='{.items}'
|
|
register: pre_teardown_pods
|
|
retries: 60
|
|
delay: 5
|
|
until: pre_teardown_pods.stdout == "[]" or pre_teardown_pods.stdout == "" or pre_teardown_pods.rc != 0
|
|
changed_when: false
|
|
failed_when: false
|
|
when: pre_teardown_scale.rc == 0
|
|
tags: [teardown]
|
|
|
|
- name: Delete deployment namespace
|
|
ansible.builtin.command: >
|
|
kubectl delete namespace {{ k8s_namespace }} --timeout=120s
|
|
register: ns_delete
|
|
changed_when: ns_delete.rc == 0
|
|
failed_when: false
|
|
tags: [teardown]
|
|
|
|
- name: Wait for namespace to terminate
|
|
ansible.builtin.command: >
|
|
kubectl get namespace {{ k8s_namespace }}
|
|
-o jsonpath='{.status.phase}'
|
|
register: ns_status
|
|
retries: 30
|
|
delay: 5
|
|
until: ns_status.rc != 0
|
|
changed_when: false
|
|
failed_when: false
|
|
when: ns_delete.rc == 0
|
|
tags: [teardown]
|
|
|
|
- name: Clear stale claimRefs on Released PVs
|
|
ansible.builtin.shell: |
|
|
set -o pipefail
|
|
for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
|
|
kubectl patch pv "$pv" --type json \
|
|
-p '[{"op":"remove","path":"/spec/claimRef"}]'
|
|
done
|
|
register: pv_patch
|
|
changed_when: pv_patch.stdout != ""
|
|
tags: [teardown]
|
|
|
|
# ---- wipe: opt-in data cleanup ------------------------------------------
|
|
- name: Wipe ledger data
|
|
ansible.builtin.shell: rm -rf {{ ledger_dir }}/*
|
|
become: true
|
|
changed_when: true
|
|
when: wipe_ledger | bool
|
|
tags: [wipe]
|
|
|
|
- name: Wipe accounts ramdisk (umount + mkfs.xfs + mount)
|
|
ansible.builtin.shell: |
|
|
set -o pipefail
|
|
mountpoint -q {{ ramdisk_mount }} && umount {{ ramdisk_mount }} || true
|
|
mkfs.xfs -f {{ ramdisk_device }}
|
|
mount {{ ramdisk_mount }}
|
|
mkdir -p {{ accounts_dir }}
|
|
chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
|
|
become: true
|
|
changed_when: true
|
|
when: wipe_accounts | bool
|
|
tags: [wipe]
|
|
|
|
- name: Clean old snapshots (keep newest full + incremental)
|
|
ansible.builtin.shell: |
|
|
set -o pipefail
|
|
cd {{ snapshot_dir }} || exit 0
|
|
newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
|
|
if [ -n "$newest" ]; then
|
|
newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
|
|
find . -maxdepth 1 -name '*.tar.*' \
|
|
! -name "$newest" \
|
|
! -name "${newest_inc:-__none__}" \
|
|
-delete
|
|
fi
|
|
become: true
|
|
changed_when: true
|
|
when: not skip_snapshot | bool
|
|
tags: [wipe]
|
|
|
|
# ---- preflight: verify ramdisk and mounts before deploy ------------------
|
|
- name: Verify ramdisk is mounted
|
|
ansible.builtin.command: mountpoint -q {{ ramdisk_mount }}
|
|
register: ramdisk_check
|
|
failed_when: ramdisk_check.rc != 0
|
|
changed_when: false
|
|
tags: [deploy, preflight]
|
|
|
|
- name: Verify ramdisk is xfs (not the underlying ZFS)
|
|
ansible.builtin.shell:
|
|
cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs
|
|
executable: /bin/bash
|
|
register: ramdisk_type
|
|
failed_when: ramdisk_type.rc != 0
|
|
changed_when: false
|
|
tags: [deploy, preflight]
|
|
|
|
# ---- deploy: sync config, bring up cluster, scale to 0 ------------------
|
|
- name: Pull agave-stack repo
|
|
ansible.builtin.shell: |
|
|
cd {{ stack_repo }}
|
|
git fetch origin
|
|
git reset --hard origin/{{ laconic_so_branch }}
|
|
changed_when: true
|
|
tags: [deploy]
|
|
|
|
- name: Regenerate deployment config from updated stack
|
|
ansible.builtin.command: >
|
|
{{ laconic_so }}
|
|
--stack {{ stack_path }}
|
|
deploy create
|
|
--spec-file {{ deployment_dir }}/spec.yml
|
|
--deployment-dir {{ deployment_dir }}
|
|
--update
|
|
changed_when: true
|
|
tags: [deploy]
|
|
|
|
- name: Check kind-config.yml mount style
|
|
ansible.builtin.command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
|
|
register: mount_root_check
|
|
changed_when: false
|
|
failed_when: false
|
|
tags: [deploy]
|
|
|
|
- name: Warn if unified mount root not found
|
|
ansible.builtin.debug:
|
|
msg: >-
|
|
WARNING: kind-config.yml does not have unified mount root
|
|
(containerPath: /mnt). laconic-so may be using individual PV mounts.
|
|
Verify PV hostPaths match expected paths after deploy.
|
|
when: mount_root_check.stdout | default('0') | int < 1
|
|
tags: [deploy]
|
|
|
|
- name: Update laconic-so (editable install)
|
|
ansible.builtin.shell: |
|
|
cd {{ laconic_so_repo }}
|
|
git fetch origin
|
|
git reset --hard origin/{{ laconic_so_branch }}
|
|
changed_when: true
|
|
tags: [deploy]
|
|
|
|
- name: Start deployment (creates kind cluster + deploys pod)
|
|
ansible.builtin.command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
|
|
register: deploy_start
|
|
changed_when: deploy_start.rc == 0
|
|
failed_when: false
|
|
timeout: 1200
|
|
tags: [deploy]
|
|
|
|
- name: Verify deployment started or already exists
|
|
ansible.builtin.command: >
|
|
kubectl get deployment {{ deployment_name }}
|
|
-n {{ k8s_namespace }}
|
|
-o jsonpath='{.metadata.name}'
|
|
register: deploy_verify
|
|
changed_when: false
|
|
failed_when: deploy_verify.rc != 0
|
|
when: deploy_start.rc != 0
|
|
tags: [deploy]
|
|
|
|
- name: Show deployment start warning
|
|
ansible.builtin.debug:
|
|
msg: >-
|
|
laconic-so deployment start returned rc={{ deploy_start.rc }}
|
|
but deployment exists — continuing (idempotent).
|
|
when: deploy_start.rc != 0 and (deploy_verify.rc | default(1)) == 0
|
|
tags: [deploy]
|
|
|
|
- name: Wait for deployment to exist
|
|
ansible.builtin.command: >
|
|
kubectl get deployment {{ deployment_name }}
|
|
-n {{ k8s_namespace }}
|
|
-o jsonpath='{.metadata.name}'
|
|
register: deploy_exists
|
|
retries: 30
|
|
delay: 10
|
|
until: deploy_exists.rc == 0
|
|
changed_when: false
|
|
tags: [deploy]
|
|
|
|
- name: Verify ramdisk visible inside kind node
|
|
ansible.builtin.shell:
|
|
cmd: >
|
|
set -o pipefail &&
|
|
docker exec {{ kind_cluster }}-control-plane
|
|
df -T /mnt/validator-accounts 2>/dev/null | grep -q xfs
|
|
executable: /bin/bash
|
|
register: kind_ramdisk_check
|
|
failed_when: kind_ramdisk_check.rc != 0
|
|
changed_when: false
|
|
tags: [deploy]
|
|
|
|
- name: Scale validator to 0 (stop before snapshot download)
|
|
ansible.builtin.command: >
|
|
kubectl scale deployment {{ deployment_name }}
|
|
-n {{ k8s_namespace }} --replicas=0
|
|
changed_when: true
|
|
tags: [deploy]
|
|
|
|
- name: Wait for pods to terminate
|
|
ansible.builtin.command: >
|
|
kubectl get pods -n {{ k8s_namespace }}
|
|
-l app={{ deployment_name }}
|
|
-o jsonpath='{.items}'
|
|
register: pods_gone
|
|
retries: 30
|
|
delay: 5
|
|
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
|
|
changed_when: false
|
|
failed_when: false
|
|
tags: [deploy]
|
|
|
|
# ---- snapshot: download via aria2c, verify in kind node ------------------
|
|
- name: Verify aria2c installed
|
|
ansible.builtin.command: which aria2c
|
|
changed_when: false
|
|
when: not skip_snapshot | bool
|
|
tags: [snapshot]
|
|
|
|
- name: Copy snapshot script to remote
|
|
ansible.builtin.copy:
|
|
src: "{{ snapshot_script_local }}"
|
|
dest: "{{ snapshot_script }}"
|
|
mode: "0755"
|
|
when: not skip_snapshot | bool
|
|
tags: [snapshot]
|
|
|
|
- name: Verify kind node mounts
|
|
ansible.builtin.command: >
|
|
docker exec {{ kind_cluster }}-control-plane
|
|
ls /mnt/validator-snapshots/
|
|
register: kind_mount_check
|
|
changed_when: false
|
|
tags: [snapshot]
|
|
|
|
- name: Download snapshot via aria2c
|
|
ansible.builtin.shell: >
|
|
python3 {{ snapshot_script }}
|
|
-o {{ snapshot_dir }}
|
|
{{ snapshot_args }}
|
|
become: true
|
|
register: snapshot_result
|
|
changed_when: true
|
|
when: not skip_snapshot | bool
|
|
timeout: 3600
|
|
tags: [snapshot]
|
|
|
|
- name: Show snapshot download result
|
|
ansible.builtin.debug:
|
|
msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
|
|
tags: [snapshot]
|
|
|
|
- name: Verify snapshot visible inside kind node
|
|
ansible.builtin.shell: >
|
|
set -o pipefail &&
|
|
docker exec {{ kind_cluster }}-control-plane
|
|
find /mnt/validator-snapshots/ -name '*.tar.*' -maxdepth 1 | head -5
|
|
register: kind_snapshot_check
|
|
failed_when: kind_snapshot_check.stdout == ""
|
|
changed_when: false
|
|
when: not skip_snapshot | bool
|
|
tags: [snapshot]
|
|
|
|
- name: Show snapshot files in kind node
|
|
ansible.builtin.debug:
|
|
msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}"
|
|
when: not skip_snapshot | bool
|
|
tags: [snapshot]
|
|
|
|
# ---- deploy (cont): scale validator back up with snapshot ----------------
|
|
- name: Scale validator to 1 (start with downloaded snapshot)
|
|
ansible.builtin.command: >
|
|
kubectl scale deployment {{ deployment_name }}
|
|
-n {{ k8s_namespace }} --replicas=1
|
|
changed_when: true
|
|
tags: [deploy, scale-up]
|
|
|
|
# ---- verify: confirm validator is running --------------------------------
|
|
- name: Wait for pod to be running
|
|
ansible.builtin.command: >
|
|
kubectl get pods -n {{ k8s_namespace }}
|
|
-o jsonpath='{.items[0].status.phase}'
|
|
register: pod_status
|
|
retries: 60
|
|
delay: 10
|
|
until: pod_status.stdout == "Running"
|
|
changed_when: false
|
|
tags: [verify]
|
|
|
|
- name: Verify PV mounts inside kind node
|
|
ansible.builtin.shell:
|
|
cmd: >
|
|
set -o pipefail &&
|
|
docker exec {{ kind_cluster }}-control-plane
|
|
df -T /mnt/validator-ledger /mnt/validator-accounts
|
|
/mnt/validator-snapshots /mnt/validator-log 2>&1
|
|
executable: /bin/bash
|
|
register: mount_check
|
|
changed_when: false
|
|
failed_when: false
|
|
tags: [verify]
|
|
|
|
- name: Show mount contents
|
|
ansible.builtin.debug:
|
|
msg: "{{ mount_check.stdout_lines }}"
|
|
tags: [verify]
|
|
|
|
- name: Check validator log file is being written
|
|
ansible.builtin.command: >
|
|
kubectl exec -n {{ k8s_namespace }}
|
|
deployment/{{ deployment_name }}
|
|
-c agave-validator -- test -f /data/log/validator.log
|
|
retries: 12
|
|
delay: 10
|
|
until: log_file_check.rc == 0
|
|
register: log_file_check
|
|
changed_when: false
|
|
failed_when: false
|
|
tags: [verify]
|
|
|
|
- name: Check RPC health
|
|
ansible.builtin.uri:
|
|
url: http://127.0.0.1:8899/health
|
|
return_content: true
|
|
register: rpc_health
|
|
retries: 6
|
|
delay: 10
|
|
until: rpc_health.status == 200
|
|
failed_when: false
|
|
delegate_to: "{{ inventory_hostname }}"
|
|
tags: [verify]
|
|
|
|
- name: Report status
|
|
ansible.builtin.debug:
|
|
msg: >-
|
|
Deployment complete.
|
|
Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
|
|
RPC: {{ rpc_health.content | default('not responding') }}.
|
|
Wiped: ledger={{ wipe_ledger }}, accounts={{ wipe_accounts }}.
|
|
tags: [verify]
|