diff --git a/CLAUDE.md b/CLAUDE.md index 6fb2164c..f8cd1ee3 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -10,16 +10,16 @@ below it are correct. Playbooks belong to exactly one layer. | 1. Base system | Docker, ZFS, packages | Out of scope (manual/PXE) | | 2. Prepare kind | `/srv/kind` exists (ZFS dataset) | None needed (ZFS handles it) | | 3. Install kind | `laconic-so deployment start` creates kind cluster, mounts `/srv/kind` → `/mnt` in kind node | `biscayne-redeploy.yml` (deploy tags) | -| 4. Prepare agave | Host storage for agave: zvol, ramdisk, rbind into `/srv/kind/solana` | `biscayne-prepare-agave.yml` | +| 4. Prepare agave | Host storage for agave: ZFS dataset, ramdisk | `biscayne-prepare-agave.yml` | | 5. Deploy agave | Deploy agave-stack into kind, snapshot download, scale up | `biscayne-redeploy.yml` (snapshot/verify tags), `biscayne-recover.yml` | **Layer 4 invariants** (asserted by `biscayne-prepare-agave.yml`): -- `/srv/kind/solana` is XFS on a zvol — agave uses io_uring which deadlocks on ZFS. `/srv/solana` is NOT the zvol (it's a ZFS dataset directory); never use it for data paths +- `/srv/kind/solana` is a ZFS dataset (`biscayne/DATA/srv/kind/solana`), child of the `/srv/kind` dataset - `/srv/kind/solana/ramdisk` is tmpfs (1TB) — accounts must be in RAM +- `/srv/solana` is NOT the data path — it's a directory on the parent ZFS dataset. All data paths use `/srv/kind/solana` These invariants are checked at runtime and persisted to fstab/systemd so they -survive reboot. They are agave's requirements reaching into the boot sequence, -not base system concerns. +survive reboot. **Cross-cutting**: `health-check.yml` (read-only diagnostics), `biscayne-stop.yml` (layer 5 — graceful shutdown), `fix-pv-mounts.yml` (layer 5 — PV repair). @@ -30,11 +30,8 @@ not base system concerns. The agave validator runs inside a kind-based k8s cluster managed by `laconic-so`. The kind node is a Docker container. 
**Never restart or kill the kind node container -while the validator is running.** Agave uses `io_uring` for async I/O, and on ZFS, -killing the process can produce unkillable kernel threads (D-state in -`io_wq_put_and_exit` blocked on ZFS transaction commits). This deadlocks the -container's PID namespace, making `docker stop`, `docker restart`, `docker exec`, -and even `reboot` hang. +while the validator is running.** Use `agave-validator exit --force` via the admin +RPC socket for graceful shutdown, or scale the deployment to 0 and wait. Correct shutdown sequence: @@ -61,15 +58,16 @@ The accounts directory must be in RAM for performance. tmpfs is used instead of `/dev/ram0` — simpler (no format-on-boot service needed), resizable on the fly with `mount -o remount,size=`, and what most Solana operators use. -**Boot ordering**: fstab entry mounts tmpfs at `/srv/kind/solana/ramdisk` with -`x-systemd.requires=srv-kind-solana.mount`. tmpfs mounts natively via fstab — -no systemd format service needed. **No manual intervention after reboot.** +**Boot ordering**: `/srv/kind/solana` is a ZFS dataset mounted automatically by +`zfs-mount.service`. The tmpfs ramdisk fstab entry uses +`x-systemd.requires=zfs-mount.service` to ensure the dataset is mounted first. +**No manual intervention after reboot.** **Mount propagation**: The kind node bind-mounts `/srv/kind` → `/mnt` at container start. laconic-so sets `propagation: HostToContainer` on all kind extraMounts -(commit `a11d40f2` in stack-orchestrator), so host submounts (like the rbind at -`/srv/kind/solana`) propagate into the kind node automatically. A kind restart -is required to pick up the new config after updating laconic-so. +(commit `a11d40f2` in stack-orchestrator), so host submounts propagate into the +kind node automatically. A kind restart is required to pick up the new config +after updating laconic-so. 
### KUBECONFIG @@ -92,21 +90,20 @@ Then export it: export SSH_AUTH_SOCK=/tmp/ssh-XXXX/agent.NNNN ``` -### io_uring/ZFS Deadlock — Root Cause +### io_uring/ZFS Deadlock — Historical Note -When agave-validator is killed while performing I/O against ZFS-backed paths (not -the ramdisk), io_uring worker threads get stuck in D-state: -``` -io_wq_put_and_exit → dsl_dir_tempreserve_space (ZFS module) -``` -These threads are unkillable (SIGKILL has no effect on D-state processes). They -prevent the container's PID namespace from being reaped (`zap_pid_ns_processes` -waits forever), which breaks `docker stop`, `docker restart`, `docker exec`, and -even `reboot`. The only fix is a hard power cycle. +Agave uses io_uring for async I/O. Killing agave ungracefully while it has +outstanding I/O against ZFS can produce unkillable D-state kernel threads +(`io_wq_put_and_exit` blocked on ZFS transactions), deadlocking the container. -**Prevention**: Always scale the deployment to 0 and wait for the pod to terminate -before any destructive operation (namespace delete, kind restart, host reboot). -The `biscayne-stop.yml` playbook enforces this. +**Prevention**: Use graceful shutdown (`agave-validator exit --force` via admin +RPC, or scale to 0 and wait). The `biscayne-stop.yml` playbook enforces this. +With graceful shutdown, io_uring contexts are closed cleanly and ZFS storage +is safe to use directly (no zvol/XFS workaround needed). + +**ZFS fix**: The underlying io_uring bug is fixed in ZFS 2.2.8+ (PR #17298). +Biscayne currently runs ZFS 2.2.2. Upgrading ZFS will eliminate the deadlock +risk entirely, even for ungraceful shutdowns. ### laconic-so Architecture @@ -133,11 +130,11 @@ kind node via a single bind mount. 
- Deployment: `laconic-70ce4c4b47e23b85-deployment` - Kind node container: `laconic-70ce4c4b47e23b85-control-plane` - Deployment dir: `/srv/deployments/agave` -- Snapshot dir: `/srv/kind/solana/snapshots` (on zvol, visible to kind at `/mnt/validator-snapshots`) -- Ledger dir: `/srv/kind/solana/ledger` (on zvol, visible to kind at `/mnt/validator-ledger`) -- Accounts dir: `/srv/kind/solana/ramdisk/accounts` (on ramdisk `/dev/ram0`, visible to kind at `/mnt/validator-accounts`) -- Log dir: `/srv/kind/solana/log` (on zvol, visible to kind at `/mnt/validator-log`) -- **WARNING**: `/srv/solana` is a ZFS dataset directory, NOT the zvol. Never use it for data paths. +- Snapshot dir: `/srv/kind/solana/snapshots` (ZFS dataset, visible to kind at `/mnt/validator-snapshots`) +- Ledger dir: `/srv/kind/solana/ledger` (ZFS dataset, visible to kind at `/mnt/validator-ledger`) +- Accounts dir: `/srv/kind/solana/ramdisk/accounts` (tmpfs ramdisk, visible to kind at `/mnt/validator-accounts`) +- Log dir: `/srv/kind/solana/log` (ZFS dataset, visible to kind at `/mnt/validator-log`) +- **WARNING**: `/srv/solana` is a different ZFS dataset directory. All data paths use `/srv/kind/solana`. - Host bind mount root: `/srv/kind` -> kind node `/mnt` - laconic-so: `/home/rix/.local/bin/laconic-so` (editable install) diff --git a/playbooks/biscayne-migrate-storage.yml b/playbooks/biscayne-migrate-storage.yml new file mode 100644 index 00000000..995b0001 --- /dev/null +++ b/playbooks/biscayne-migrate-storage.yml @@ -0,0 +1,286 @@ +--- +# One-time migration: zvol/XFS → ZFS dataset for /srv/kind/solana +# +# Background: +# Biscayne used a ZFS zvol formatted as XFS to work around io_uring/ZFS +# deadlocks. The root cause is now handled by graceful shutdown via admin +# RPC (agave-validator exit --force), so the zvol/XFS layer is unnecessary. +# +# What this does: +# 1. Asserts the validator is scaled to 0 (does NOT scale it — that's +# the operator's job via biscayne-stop.yml) +# 2. 
Creates a child ZFS dataset biscayne/DATA/srv/kind/solana +# 3. Copies data from the zvol to the new dataset (rsync) +# 4. Updates fstab (removes zvol line, fixes tmpfs dependency) +# 5. Destroys the zvol after verification +# +# Prerequisites: +# - Validator MUST be stopped (scale 0, no agave processes) +# - Run biscayne-stop.yml first +# +# Usage: +# ansible-playbook -i inventory/ playbooks/biscayne-migrate-storage.yml +# +# After migration, run biscayne-prepare-agave.yml to update its checks, +# then biscayne-start.yml to bring the validator back up. +# +- name: Migrate storage from zvol/XFS to ZFS dataset + hosts: all + gather_facts: false + become: true + environment: + KUBECONFIG: /home/rix/.kube/config + vars: + kind_cluster: laconic-70ce4c4b47e23b85 + k8s_namespace: "laconic-{{ kind_cluster }}" + deployment_name: "{{ kind_cluster }}-deployment" + zvol_device: /dev/zvol/biscayne/DATA/volumes/solana + zvol_dataset: biscayne/DATA/volumes/solana + new_dataset: biscayne/DATA/srv/kind/solana + kind_solana_dir: /srv/kind/solana + ramdisk_mount: /srv/kind/solana/ramdisk + ramdisk_size: 1024G + # Temporary mount for zvol during data copy + zvol_tmp_mount: /mnt/zvol-migration-tmp + + tasks: + # ---- preconditions -------------------------------------------------------- + - name: Check deployment replica count + ansible.builtin.command: > + kubectl get deployment {{ deployment_name }} + -n {{ k8s_namespace }} + -o jsonpath='{.spec.replicas}' + register: current_replicas + failed_when: false + changed_when: false + + - name: Fail if validator is running + ansible.builtin.fail: + msg: >- + Validator must be scaled to 0 before migration. + Current replicas: {{ current_replicas.stdout | default('unknown') }}. + Run biscayne-stop.yml first. 
+ when: current_replicas.stdout | default('0') | int > 0 + + - name: Verify no agave processes in kind node + ansible.builtin.command: > + docker exec {{ kind_cluster }}-control-plane + pgrep -c agave-validator + register: agave_procs + failed_when: false + changed_when: false + + - name: Fail if agave still running + ansible.builtin.fail: + msg: >- + agave-validator process still running inside kind node. + Cannot migrate while validator is active. + when: agave_procs.rc == 0 + + # ---- check current state -------------------------------------------------- + - name: Check if zvol device exists + ansible.builtin.stat: + path: "{{ zvol_device }}" + register: zvol_exists + + - name: Check if ZFS dataset already exists + ansible.builtin.command: zfs list -H -o name {{ new_dataset }} + register: dataset_exists + failed_when: false + changed_when: false + + - name: Check current mount type at {{ kind_solana_dir }} + ansible.builtin.shell: + cmd: set -o pipefail && findmnt -n -o FSTYPE {{ kind_solana_dir }} + executable: /bin/bash + register: current_fstype + failed_when: false + changed_when: false + + - name: Report current state + ansible.builtin.debug: + msg: + zvol_exists: "{{ zvol_exists.stat.exists | default(false) }}" + dataset_exists: "{{ dataset_exists.rc == 0 }}" + current_fstype: "{{ current_fstype.stdout | default('none') }}" + + # ---- skip if already migrated --------------------------------------------- + - name: End play if already on ZFS dataset + ansible.builtin.meta: end_play + when: + - dataset_exists.rc == 0 + - current_fstype.stdout | default('') == 'zfs' + - not (zvol_exists.stat.exists | default(false)) + + # ---- step 1: unmount ramdisk and zvol ------------------------------------ + - name: Unmount ramdisk + ansible.posix.mount: + path: "{{ ramdisk_mount }}" + state: unmounted + + - name: Unmount zvol from {{ kind_solana_dir }} + ansible.posix.mount: + path: "{{ kind_solana_dir }}" + state: unmounted + when: current_fstype.stdout | default('') 
== 'xfs' + + # ---- step 2: create ZFS dataset ----------------------------------------- + - name: Create ZFS dataset {{ new_dataset }} + ansible.builtin.command: > + zfs create -o mountpoint={{ kind_solana_dir }} {{ new_dataset }} + changed_when: true + when: dataset_exists.rc != 0 + + - name: Mount ZFS dataset if it already existed + ansible.builtin.command: zfs mount {{ new_dataset }} + changed_when: true + failed_when: false + when: dataset_exists.rc == 0 + + - name: Verify ZFS dataset is mounted + ansible.builtin.shell: + cmd: set -o pipefail && findmnt -n -o FSTYPE {{ kind_solana_dir }} | grep -q zfs + executable: /bin/bash + changed_when: false + + # ---- step 3: copy data from zvol ---------------------------------------- + - name: Create temporary mount point for zvol + ansible.builtin.file: + path: "{{ zvol_tmp_mount }}" + state: directory + mode: "0755" + when: zvol_exists.stat.exists | default(false) + + - name: Mount zvol at temporary location + ansible.posix.mount: + path: "{{ zvol_tmp_mount }}" + src: "{{ zvol_device }}" + fstype: xfs + state: mounted + when: zvol_exists.stat.exists | default(false) + + - name: Copy data from zvol to ZFS dataset # noqa: command-instead-of-module + ansible.builtin.command: > + rsync -a --info=progress2 + --exclude='ramdisk/' + {{ zvol_tmp_mount }}/ + {{ kind_solana_dir }}/ + changed_when: true + when: zvol_exists.stat.exists | default(false) + + # ---- step 4: verify data integrity -------------------------------------- + - name: Check key directories exist on new dataset + ansible.builtin.stat: + path: "{{ kind_solana_dir }}/{{ item }}" + register: dir_checks + loop: + - ledger + - snapshots + - log + + - name: Report directory verification + ansible.builtin.debug: + msg: "{{ item.item }}: {{ 'exists' if item.stat.exists else 'MISSING' }}" + loop: "{{ dir_checks.results }}" + loop_control: + label: "{{ item.item }}" + + # ---- step 5: update fstab ------------------------------------------------ + - name: Remove zvol 
fstab entry + ansible.builtin.lineinfile: + path: /etc/fstab + regexp: '^\S+zvol\S+\s+{{ kind_solana_dir }}\s' + state: absent + register: fstab_zvol_removed + + # Also match any XFS entry for kind_solana_dir (non-zvol form) + - name: Remove any XFS fstab entry for {{ kind_solana_dir }} + ansible.builtin.lineinfile: + path: /etc/fstab + regexp: '^\S+\s+{{ kind_solana_dir }}\s+xfs' + state: absent + + # ZFS datasets are mounted by zfs-mount.service automatically. + # The tmpfs ramdisk depends on the solana dir existing, which ZFS + # guarantees via zfs-mount.service. Update the systemd dependency. + - name: Update tmpfs ramdisk fstab entry + ansible.builtin.lineinfile: + path: /etc/fstab + regexp: '^\S+\s+{{ ramdisk_mount }}\s' + line: "tmpfs {{ ramdisk_mount }} tmpfs nodev,nosuid,noexec,nodiratime,size={{ ramdisk_size }},nofail,x-systemd.requires=zfs-mount.service 0 0" + + - name: Reload systemd # noqa: no-handler + ansible.builtin.systemd: + daemon_reload: true + when: fstab_zvol_removed.changed + + # ---- step 6: mount ramdisk ----------------------------------------------- + - name: Mount tmpfs ramdisk + ansible.posix.mount: + path: "{{ ramdisk_mount }}" + src: tmpfs + fstype: tmpfs + opts: "nodev,nosuid,noexec,nodiratime,size={{ ramdisk_size }}" + state: mounted + + - name: Ensure accounts directory + ansible.builtin.file: + path: "{{ ramdisk_mount }}/accounts" + state: directory + owner: solana + group: solana + mode: "0755" + + # ---- step 7: clean up zvol ----------------------------------------------- + - name: Unmount zvol from temporary location + ansible.posix.mount: + path: "{{ zvol_tmp_mount }}" + state: unmounted + when: zvol_exists.stat.exists | default(false) + + - name: Remove temporary mount point + ansible.builtin.file: + path: "{{ zvol_tmp_mount }}" + state: absent + + - name: Destroy zvol {{ zvol_dataset }} + ansible.builtin.command: zfs destroy {{ zvol_dataset }} + changed_when: true + when: zvol_exists.stat.exists | default(false) + + # ---- 
step 8: ensure shared propagation for docker ------------------------ + - name: Ensure shared propagation on kind mounts # noqa: command-instead-of-module + ansible.builtin.command: + cmd: mount --make-shared {{ item }} + loop: + - "{{ kind_solana_dir }}" + - "{{ ramdisk_mount }}" + changed_when: false + + # ---- verification --------------------------------------------------------- + - name: Verify solana dir is ZFS + ansible.builtin.shell: + cmd: set -o pipefail && df -T {{ kind_solana_dir }} | grep -q zfs + executable: /bin/bash + changed_when: false + + - name: Verify ramdisk is tmpfs + ansible.builtin.shell: + cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q tmpfs + executable: /bin/bash + changed_when: false + + - name: Verify zvol is destroyed + ansible.builtin.command: zfs list -H -o name {{ zvol_dataset }} + register: zvol_gone + failed_when: zvol_gone.rc == 0 + changed_when: false + + - name: Migration complete + ansible.builtin.debug: + msg: >- + Storage migration complete. + {{ kind_solana_dir }} is now a ZFS dataset ({{ new_dataset }}). + Ramdisk at {{ ramdisk_mount }} (tmpfs, {{ ramdisk_size }}). + zvol {{ zvol_dataset }} destroyed. + Next: update biscayne-prepare-agave.yml, then start the validator. diff --git a/playbooks/biscayne-recover.yml b/playbooks/biscayne-recover.yml index 1d46c78e..ea0d9b3e 100644 --- a/playbooks/biscayne-recover.yml +++ b/playbooks/biscayne-recover.yml @@ -10,7 +10,8 @@ # 2. Wait for pods to terminate (io_uring safety check) # 3. Wipe accounts ramdisk # 4. Clean old snapshots -# 5. Scale to 1 — container entrypoint downloads snapshot + starts validator +# 5. Ensure terminationGracePeriodSeconds is 300 (for graceful shutdown) +# 6. Scale to 1 — container entrypoint downloads snapshot + starts validator # -# The playbook exits after step 5. The container handles snapshot download +# The playbook exits after step 6. The container handles snapshot download # (60+ min) and validator startup autonomously. 
Monitor with: @@ -95,7 +96,18 @@ become: true changed_when: true - # ---- step 5: scale to 1 — entrypoint handles snapshot download ------------ + # ---- step 5: ensure terminationGracePeriodSeconds ------------------------- + # laconic-so doesn't support this declaratively. Patch the deployment so + # k8s gives the entrypoint 300s to perform graceful shutdown via admin RPC. + - name: Ensure terminationGracePeriodSeconds is 300 + ansible.builtin.command: > + kubectl patch deployment {{ deployment_name }} + -n {{ k8s_namespace }} + -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}' + register: patch_result + changed_when: "'no change' not in patch_result.stdout" + + # ---- step 6: scale to 1 — entrypoint handles snapshot download ------------ # The container's entrypoint.py checks snapshot freshness, cleans stale # snapshots, downloads fresh ones (with rolling incremental convergence), # then starts the validator. No host-side download needed. diff --git a/playbooks/biscayne-stop.yml b/playbooks/biscayne-stop.yml index 2f9290f6..4a83ca63 100644 --- a/playbooks/biscayne-stop.yml +++ b/playbooks/biscayne-stop.yml @@ -5,11 +5,12 @@ # This MUST be done before any kind node restart, host reboot, # or docker operations. # -# The agave validator uses io_uring for async I/O. On ZFS, killing -# the process ungracefully (SIGKILL, docker kill, etc.) can produce -# unkillable kernel threads stuck in io_wq_put_and_exit, deadlocking -# the container's PID namespace. A graceful SIGTERM via k8s scale-down -# allows agave to flush and close its io_uring contexts cleanly. +# The container entrypoint (PID 1) traps SIGTERM and runs +# ``agave-validator exit --force --ledger /data/ledger`` which tells +# the validator to flush I/O and exit cleanly via the admin RPC Unix +# socket. This avoids the io_uring/ZFS deadlock that occurs when the +# process is killed. terminationGracePeriodSeconds must be set to 300 +# on the k8s deployment to allow time for the flush. 
# # Usage: # # Stop the validator @@ -42,6 +43,17 @@ failed_when: false changed_when: false + # Ensure k8s gives the entrypoint enough time for graceful shutdown + # via admin RPC before sending SIGKILL. + - name: Ensure terminationGracePeriodSeconds is 300 + ansible.builtin.command: > + kubectl patch deployment {{ deployment_name }} + -n {{ k8s_namespace }} + -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}' + register: patch_result + changed_when: "'no change' not in patch_result.stdout" + when: current_replicas.stdout | default('0') | int > 0 + - name: Scale deployment to 0 ansible.builtin.command: > kubectl scale deployment {{ deployment_name }} diff --git a/playbooks/biscayne-sync-tools.yml b/playbooks/biscayne-sync-tools.yml index 76ba610e..dfecd340 100644 --- a/playbooks/biscayne-sync-tools.yml +++ b/playbooks/biscayne-sync-tools.yml @@ -15,6 +15,10 @@ # ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-sync-tools.yml \ # -e laconic_so_branch=fix/kind-mount-propagation # +# # Sync and rebuild the agave container image +# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-sync-tools.yml \ +# --tags build-container +# - name: Sync laconic-so and agave-stack hosts: all gather_facts: false @@ -30,49 +34,55 @@ stack_branch: main tasks: - # Git operations run as the connecting user (no become) so that - # SSH agent forwarding works. sudo drops SSH_AUTH_SOCK. 
- name: Update laconic-so (editable install) - become: false ansible.builtin.shell: | + set -e cd {{ laconic_so_repo }} git fetch origin {{ laconic_so_branch }} git reset --hard origin/{{ laconic_so_branch }} + vars: + ansible_become: false register: laconic_so_update changed_when: true + tags: [sync, build-container] - name: Show laconic-so version - become: false ansible.builtin.shell: cmd: set -o pipefail && cd {{ laconic_so_repo }} && git log --oneline -1 executable: /bin/bash register: laconic_so_version changed_when: false + tags: [sync, build-container] - name: Report laconic-so ansible.builtin.debug: msg: "laconic-so: {{ laconic_so_version.stdout }}" + tags: [sync, build-container] - name: Pull agave-stack repo - become: false ansible.builtin.shell: | + set -e cd {{ stack_repo }} git fetch origin {{ stack_branch }} git reset --hard origin/{{ stack_branch }} + vars: + ansible_become: false register: stack_update changed_when: true + tags: [sync, build-container] - name: Show agave-stack version - become: false ansible.builtin.shell: cmd: set -o pipefail && cd {{ stack_repo }} && git log --oneline -1 executable: /bin/bash register: stack_version changed_when: false + tags: [sync, build-container] - name: Report agave-stack ansible.builtin.debug: msg: "agave-stack: {{ stack_version.stdout }}" + tags: [sync, build-container] - name: Regenerate deployment config from updated stack ansible.builtin.command: > @@ -84,6 +94,7 @@ --update register: regen_result changed_when: true + tags: [sync, build-container] - name: Report sync complete ansible.builtin.debug: @@ -91,3 +102,27 @@ Sync complete. laconic-so and agave-stack updated to origin/{{ laconic_so_branch }}. Deployment config regenerated. Restart or redeploy required to apply changes. + tags: [sync, build-container] + + # ---- optional: rebuild container image -------------------------------------- + # Only runs when explicitly requested with --tags build-container. 
+ # Safe to run while the validator is running — just builds a new image. + # The running pod keeps the old image until restarted. + - name: Build agave container image + ansible.builtin.command: > + {{ laconic_so }} + --stack {{ stack_path }} + build-containers + --include laconicnetwork-agave + tags: + - build-container + - never + register: build_result + changed_when: true + + - name: Report build complete + ansible.builtin.debug: + msg: "Container image built. Will be used on next pod restart." + tags: + - build-container + - never diff --git a/playbooks/biscayne-upgrade-zfs.yml b/playbooks/biscayne-upgrade-zfs.yml new file mode 100644 index 00000000..a1b38c9d --- /dev/null +++ b/playbooks/biscayne-upgrade-zfs.yml @@ -0,0 +1,158 @@ +--- +# Upgrade ZFS from 2.2.2 to 2.2.9 via arter97's zfs-lts PPA +# +# Fixes the io_uring deadlock (OpenZFS PR #17298) at the kernel module level. +# After this upgrade, the zvol/XFS workaround is unnecessary and can be removed +# with biscayne-migrate-storage.yml. +# +# PPA: ppa:arter97/zfs-lts (Juhyung Park, OpenZFS contributor) +# Builds from source on Launchpad — transparent, auditable. +# +# WARNING: This playbook triggers a reboot at the end. If the io_uring zombie +# is present, the reboot WILL HANG. The operator must hard power cycle the +# machine (IPMI/iDRAC/physical). The reboot task then times out (600s) — +# run the verify tag separately after the machine comes back. 
+# +# Usage: +# # Full upgrade (adds PPA, upgrades, reboots) +# ansible-playbook -i inventory/ playbooks/biscayne-upgrade-zfs.yml +# +# # Verify after reboot +# ansible-playbook -i inventory/ playbooks/biscayne-upgrade-zfs.yml \ +# --tags verify +# +# # Dry run — show what would be upgraded +# ansible-playbook -i inventory/ playbooks/biscayne-upgrade-zfs.yml \ +# --tags dry-run +# +- name: Upgrade ZFS via arter97/zfs-lts PPA + hosts: all + gather_facts: true + become: true + vars: + zfs_min_version: "2.2.8" + ppa_name: "ppa:arter97/zfs-lts" + zfs_packages: + - zfsutils-linux + - zfs-dkms + - libzfs5linux + + tasks: + # ---- pre-flight checks ---------------------------------------------------- + - name: Get current ZFS version + ansible.builtin.command: modinfo -F version zfs + register: zfs_current_version + changed_when: false + tags: [always] + + - name: Report current ZFS version + ansible.builtin.debug: + msg: "Current ZFS: {{ zfs_current_version.stdout }}" + tags: [always] + + - name: Skip if already upgraded + ansible.builtin.meta: end_play + when: zfs_current_version.stdout is version(zfs_min_version, '>=') + tags: [upgrade] # not 'always': ending the play here would skip --tags verify + + # ---- dry run --------------------------------------------------------------- + - name: Show available ZFS packages from PPA (dry run) + ansible.builtin.shell: + cmd: > + set -o pipefail && + apt-cache policy zfsutils-linux zfs-dkms 2>/dev/null | + grep -A2 'zfsutils-linux\|zfs-dkms' + executable: /bin/bash + changed_when: false + failed_when: false + tags: + - dry-run + - never + + # ---- add PPA --------------------------------------------------------------- + - name: Add arter97/zfs-lts PPA + ansible.builtin.apt_repository: + repo: "{{ ppa_name }}" + state: present + update_cache: true + tags: [upgrade] + + # ---- upgrade ZFS packages -------------------------------------------------- + - name: Upgrade ZFS packages + ansible.builtin.apt: + name: "{{ zfs_packages }}" + state: latest # noqa: package-latest + update_cache: true + 
register: zfs_upgrade + tags: [upgrade] + + - name: Show upgrade result + ansible.builtin.debug: + msg: "{{ zfs_upgrade.stdout_lines | default(['no output']) }}" + tags: [upgrade] + + # ---- reboot ---------------------------------------------------------------- + - name: Report pre-reboot status + ansible.builtin.debug: + msg: >- + ZFS packages upgraded. Rebooting now. + If the io_uring zombie is present, this reboot WILL HANG. + Hard power cycle the machine, then run this playbook with + --tags verify to confirm the upgrade. + tags: [upgrade] + + - name: Reboot to load new ZFS modules + ansible.builtin.reboot: + msg: "ZFS upgrade — rebooting to load new kernel modules" + reboot_timeout: 600 + tags: [upgrade] + # This will timeout if io_uring zombie blocks shutdown. + # Operator must hard power cycle. That's expected. + + # ---- post-reboot verification ----------------------------------------------- + - name: Get ZFS version after reboot + ansible.builtin.command: modinfo -F version zfs + register: zfs_new_version + changed_when: false + tags: + - verify + - never + + - name: Verify ZFS version meets minimum + ansible.builtin.assert: + that: + - zfs_new_version.stdout is version(zfs_min_version, '>=') + fail_msg: >- + ZFS version {{ zfs_new_version.stdout }} is below minimum + {{ zfs_min_version }}. Upgrade may have failed. + success_msg: "ZFS {{ zfs_new_version.stdout }} — io_uring fix confirmed." 
+ tags: + - verify + - never + + - name: Verify ZFS pools are healthy + ansible.builtin.command: zpool status -x + register: zpool_status + changed_when: false + failed_when: "'all pools are healthy' not in zpool_status.stdout" + tags: + - verify + - never + + - name: Verify ZFS datasets are mounted + ansible.builtin.command: zfs mount + register: zfs_mounts + changed_when: false + tags: + - verify + - never + + - name: Report verification + ansible.builtin.debug: + msg: + zfs_version: "{{ zfs_new_version.stdout }}" + pools: "{{ zpool_status.stdout }}" + mounts: "{{ zfs_mounts.stdout_lines }}" + tags: + - verify + - never diff --git a/scripts/agave-container/entrypoint.py b/scripts/agave-container/entrypoint.py index 519c7be2..20961624 100644 --- a/scripts/agave-container/entrypoint.py +++ b/scripts/agave-container/entrypoint.py @@ -2,12 +2,17 @@ """Agave validator entrypoint — snapshot management, arg construction, liveness probe. Two subcommands: - entrypoint.py serve (default) — snapshot freshness check + exec agave-validator + entrypoint.py serve (default) — snapshot freshness check + run agave-validator entrypoint.py probe — liveness probe (slot lag check, exits 0/1) Replaces the bash entrypoint.sh / start-rpc.sh / start-validator.sh with a single Python module. Test mode still dispatches to start-test.sh. +Python stays as PID 1 and traps SIGTERM. On SIGTERM, it runs +``agave-validator exit --force --ledger /data/ledger`` which connects to the +admin RPC Unix socket and tells the validator to flush I/O and exit cleanly. +This avoids the io_uring/ZFS deadlock that occurs when the process is killed. + All configuration comes from environment variables — same vars as the original bash scripts. See compose files for defaults. 
""" @@ -18,8 +23,10 @@ import json import logging import os import re +import signal import subprocess import sys +import threading import time import urllib.error import urllib.request @@ -365,11 +372,77 @@ def append_extra_args(args: list[str]) -> list[str]: return args +# -- Graceful shutdown -------------------------------------------------------- + +# Timeout for graceful exit via admin RPC. Leave 30s margin for k8s +# terminationGracePeriodSeconds (300s). +GRACEFUL_EXIT_TIMEOUT = 270 + + +def graceful_exit(child: subprocess.Popen[bytes]) -> None: + """Request graceful shutdown via the admin RPC Unix socket. + + Runs ``agave-validator exit --force --ledger /data/ledger`` which connects + to the admin RPC socket at ``/data/ledger/admin.rpc`` and sets the + validator's exit flag. The validator flushes all I/O and exits cleanly, + avoiding the io_uring/ZFS deadlock. + + If the admin RPC exit fails or the child doesn't exit within the timeout, + falls back to SIGTERM then SIGKILL. + """ + log.info("SIGTERM received — requesting graceful exit via admin RPC") + try: + result = subprocess.run( + ["agave-validator", "exit", "--force", "--ledger", LEDGER_DIR], + capture_output=True, text=True, timeout=30, + ) + if result.returncode == 0: + log.info("Admin RPC exit requested successfully") + else: + log.warning( + "Admin RPC exit returned %d: %s", + result.returncode, result.stderr.strip(), + ) + except subprocess.TimeoutExpired: + log.warning("Admin RPC exit command timed out after 30s") + except FileNotFoundError: + log.warning("agave-validator binary not found for exit command") + + # Wait for child to exit + try: + child.wait(timeout=GRACEFUL_EXIT_TIMEOUT) + log.info("Validator exited cleanly with code %d", child.returncode) + return + except subprocess.TimeoutExpired: + log.warning( + "Validator did not exit within %ds — sending SIGTERM", + GRACEFUL_EXIT_TIMEOUT, + ) + + # Fallback: SIGTERM + child.terminate() + try: + child.wait(timeout=15) + log.info("Validator 
exited after SIGTERM with code %d", child.returncode) + return + except subprocess.TimeoutExpired: + log.warning("Validator did not exit after SIGTERM — sending SIGKILL") + + # Last resort: SIGKILL + child.kill() + child.wait() + log.info("Validator killed with SIGKILL, code %d", child.returncode) + + # -- Serve subcommand --------------------------------------------------------- def cmd_serve() -> None: - """Main serve flow: snapshot check, setup, exec agave-validator.""" + """Main serve flow: snapshot check, setup, run agave-validator as child. + + Python stays as PID 1 and traps SIGTERM to perform graceful shutdown + via the admin RPC Unix socket. + """ mode = env("AGAVE_MODE", "test") log.info("AGAVE_MODE=%s", mode) @@ -407,7 +480,21 @@ def cmd_serve() -> None: Path("/tmp/entrypoint-start").write_text(str(time.time())) log.info("Starting agave-validator with %d arguments", len(args)) - os.execvp("agave-validator", ["agave-validator"] + args) + child = subprocess.Popen(["agave-validator"] + args) + + # Forward SIGUSR1 to child (log rotation) + signal.signal(signal.SIGUSR1, lambda _sig, _frame: child.send_signal(signal.SIGUSR1)) + + # Trap SIGTERM — run graceful_exit in a thread so the signal handler returns + # immediately and child.wait() in the main thread can observe the exit. 
+ def _on_sigterm(_sig: int, _frame: object) -> None: + threading.Thread(target=graceful_exit, args=(child,), daemon=True).start() + + signal.signal(signal.SIGTERM, _on_sigterm) + + # Wait for child — if it exits on its own (crash, normal exit), propagate code + child.wait() + sys.exit(child.returncode) # -- Probe subcommand --------------------------------------------------------- diff --git a/scripts/agave-container/snapshot_download.py b/scripts/agave-container/snapshot_download.py index 151b2f26..146b7291 100644 --- a/scripts/agave-container/snapshot_download.py +++ b/scripts/agave-container/snapshot_download.py @@ -655,8 +655,9 @@ def download_best_snapshot( log.info("Downloading incremental %s (%d mirrors, slot %d, gap %d slots)", inc_fn, len(inc_mirrors), inc_slot, gap) if not download_aria2c(inc_mirrors, output_dir, inc_fn, connections): - log.error("Failed to download incremental %s", inc_fn) - break + log.warning("Failed to download incremental %s — re-probing in 10s", inc_fn) + time.sleep(10) + continue prev_inc_filename = inc_fn diff --git a/scripts/check-status.py b/scripts/check-status.py index 2f9cf131..b3974392 100755 --- a/scripts/check-status.py +++ b/scripts/check-status.py @@ -18,6 +18,7 @@ from __future__ import annotations import argparse import json +import os import subprocess import sys import time @@ -206,9 +207,11 @@ def display(iteration: int = 0) -> None: snapshots = check_snapshots() ramdisk = check_ramdisk() - print(f"\n{'=' * 60}") - print(f" Biscayne Agave Status — {ts}") - print(f"{'=' * 60}") + # Clear screen and home cursor for clean redraw in watch mode + if iteration > 0: + print("\033[2J\033[H", end="") + + print(f"\n Biscayne Agave Status — {ts}\n") # Pod print(f"\n Pod: {pod['phase']}") @@ -275,14 +278,30 @@ def display(iteration: int = 0) -> None: # -- Main --------------------------------------------------------------------- +def spawn_tmux_pane(interval: int) -> None: + """Launch this script with --watch in a new tmux 
pane.""" + script = os.path.abspath(__file__) + cmd = f"python3 {script} --watch -i {interval}" + subprocess.run( + ["tmux", "split-window", "-h", "-d", cmd], + check=True, + ) + + def main() -> int: p = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) p.add_argument("--watch", action="store_true", help="Repeat every interval") + p.add_argument("--pane", action="store_true", + help="Launch --watch in a new tmux pane") p.add_argument("-i", "--interval", type=int, default=30, help="Watch interval in seconds (default: 30)") args = p.parse_args() + if args.pane: + spawn_tmux_pane(args.interval) + return 0 + discover() try: