diff --git a/CLAUDE.md b/CLAUDE.md index 138d8d75..21542520 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,5 +1,30 @@ # Biscayne Agave Runbook +## Deployment Layers + +Operations on biscayne follow a strict layering. Each layer assumes the layers +below it are correct. Each playbook (or tagged phase of a playbook) belongs to exactly one layer. + +| Layer | What | Playbooks | +|-------|------|-----------| +| 1. Base system | Docker, ZFS, packages | Out of scope (manual/PXE) | +| 2. Prepare kind | `/srv/kind` exists (ZFS dataset) | None needed (ZFS handles it) | +| 3. Install kind | `laconic-so deployment start` creates kind cluster, mounts `/srv/kind` → `/mnt` in kind node | `biscayne-redeploy.yml` (deploy tags) | +| 4. Prepare agave | Host storage for agave: zvol, ramdisk, rbind into `/srv/kind/solana` | `biscayne-prepare-agave.yml` | +| 5. Deploy agave | Deploy agave-stack into kind, snapshot download, scale up | `biscayne-redeploy.yml` (snapshot/verify tags), `biscayne-recover.yml` | + +**Layer 4 invariants** (asserted by `biscayne-prepare-agave.yml`): +- `/srv/solana` is XFS on a zvol — agave uses io_uring which deadlocks on ZFS +- `/srv/solana/ramdisk` is XFS on `/dev/ram0` — accounts must be on ramdisk +- `/srv/kind/solana` is an rbind of `/srv/solana` — makes the zvol visible to kind at `/mnt/solana` + +These invariants are checked at runtime and persisted to fstab/systemd so they +survive reboot. They are agave's requirements reaching into the boot sequence, +not base system concerns. + +**Cross-cutting**: `health-check.yml` (read-only diagnostics), `biscayne-stop.yml` +(layer 5 — graceful shutdown), `fix-pv-mounts.yml` (layer 5 — PV repair). + ## Cluster Operations ### Shutdown Order @@ -36,7 +61,7 @@ Correct shutdown sequence: The accounts directory must be on a ramdisk for performance. `/dev/ram0` loses its filesystem on reboot and must be reformatted before mounting. 
-**Boot ordering is handled by systemd units** (installed by `biscayne-boot.yml`): +**Boot ordering is handled by systemd units** (installed by `biscayne-prepare-agave.yml`): - `format-ramdisk.service`: runs `mkfs.xfs -f /dev/ram0` before `local-fs.target` - fstab entry: mounts `/dev/ram0` at `/srv/solana/ramdisk` with `x-systemd.requires=format-ramdisk.service` @@ -46,11 +71,12 @@ filesystem on reboot and must be reformatted before mounting. These units run before docker, so the kind node's bind mounts always see the ramdisk. **No manual intervention is needed after reboot.** -**Mount propagation**: The kind node bind-mounts `/srv/kind` → `/mnt`. Because -the ramdisk is mounted at `/srv/solana/ramdisk` and symlinked/overlaid through -`/srv/kind/solana/ramdisk`, mount propagation makes it visible inside the kind -node at `/mnt/solana/ramdisk` without restarting the kind node. **Do NOT restart -the kind node just to pick up a ramdisk mount.** +**Mount propagation**: The kind node bind-mounts `/srv/kind` → `/mnt` at container +start. New mounts under `/srv/kind` on the host (like the rbind at +`/srv/kind/solana`) do NOT propagate into the kind node because kind's default +mount propagation is `None`. A kind node restart is required to pick up new host +mounts. **TODO**: Fix laconic-so to set `propagation: HostToContainer` on the +kind-mount-root extraMount, which would make host mounts propagate automatically. 
### KUBECONFIG diff --git a/inventory/biscayne.yml b/inventory/biscayne.yml index 722a696a..f0afa001 100644 --- a/inventory/biscayne.yml +++ b/inventory/biscayne.yml @@ -4,6 +4,7 @@ all: ansible_host: biscayne.vaasl.io ansible_user: rix ansible_become: true + ansible_python_interpreter: /usr/bin/python3.12 # DoubleZero identities dz_identity: 3Bw6v7EruQvTwoY79h2QjQCs2KBQFzSneBdYUbcXK1Tr diff --git a/playbooks/biscayne-boot.yml b/playbooks/biscayne-boot.yml deleted file mode 100644 index af89a312..00000000 --- a/playbooks/biscayne-boot.yml +++ /dev/null @@ -1,108 +0,0 @@ ---- -# Configure biscayne OS-level services for agave validator -# -# Installs a systemd unit that formats and mounts the ramdisk on boot. -# /dev/ram0 loses its filesystem on reboot, so mkfs.xfs must run before -# the fstab mount. This unit runs before docker, ensuring the kind node's -# bind mounts always see the ramdisk. -# -# This playbook is idempotent — safe to run multiple times. -# -# Usage: -# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-boot.yml -# -- name: Configure OS-level services for agave - hosts: all - gather_facts: false - become: true - vars: - ramdisk_device: /dev/ram0 - ramdisk_mount: /srv/solana/ramdisk - accounts_dir: /srv/solana/ramdisk/accounts - - tasks: - - name: Install ramdisk format service - ansible.builtin.copy: - dest: /etc/systemd/system/format-ramdisk.service - mode: "0644" - content: | - [Unit] - Description=Format /dev/ram0 as XFS for Solana accounts - DefaultDependencies=no - Before=local-fs.target - After=systemd-modules-load.service - ConditionPathExists={{ ramdisk_device }} - - [Service] - Type=oneshot - RemainAfterExit=yes - ExecStart=/sbin/mkfs.xfs -f {{ ramdisk_device }} - - [Install] - WantedBy=local-fs.target - register: unit_file - - - name: Install ramdisk post-mount service - ansible.builtin.copy: - dest: /etc/systemd/system/ramdisk-accounts.service - mode: "0644" - content: | - [Unit] - Description=Create Solana accounts directory on 
ramdisk - After=srv-solana-ramdisk.mount - Requires=srv-solana-ramdisk.mount - - [Service] - Type=oneshot - RemainAfterExit=yes - ExecStart=/bin/bash -c 'mkdir -p {{ accounts_dir }} && chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}' - - [Install] - WantedBy=multi-user.target - register: accounts_unit - - - name: Ensure fstab entry uses nofail - ansible.builtin.lineinfile: - path: /etc/fstab - regexp: '^{{ ramdisk_device }}\s+{{ ramdisk_mount }}' - line: '{{ ramdisk_device }} {{ ramdisk_mount }} xfs noatime,nodiratime,nofail,x-systemd.requires=format-ramdisk.service 0 0' - register: fstab_entry - - - name: Reload systemd - ansible.builtin.systemd: - daemon_reload: true - when: unit_file.changed or accounts_unit.changed or fstab_entry.changed - - - name: Enable ramdisk services - ansible.builtin.systemd: - name: "{{ item }}" - enabled: true - loop: - - format-ramdisk.service - - ramdisk-accounts.service - - # ---- apply now if ramdisk not mounted ------------------------------------ - - name: Check if ramdisk is mounted - ansible.builtin.command: mountpoint -q {{ ramdisk_mount }} - register: ramdisk_mounted - failed_when: false - changed_when: false - - - name: Format and mount ramdisk now - ansible.builtin.shell: | - mkfs.xfs -f {{ ramdisk_device }} - mount {{ ramdisk_mount }} - mkdir -p {{ accounts_dir }} - chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }} - changed_when: ramdisk_mounted.rc != 0 - when: ramdisk_mounted.rc != 0 - - # ---- verify -------------------------------------------------------------- - - name: Verify ramdisk - ansible.builtin.command: df -hT {{ ramdisk_mount }} - register: ramdisk_df - changed_when: false - - - name: Show ramdisk status - ansible.builtin.debug: - msg: "{{ ramdisk_df.stdout_lines }}" diff --git a/playbooks/biscayne-prepare-agave.yml b/playbooks/biscayne-prepare-agave.yml new file mode 100644 index 00000000..cc2be16b --- /dev/null +++ b/playbooks/biscayne-prepare-agave.yml @@ -0,0 +1,243 @@ +--- +# Prepare 
biscayne host for agave validator +# +# Deployment layers: +# 1. Base system — Docker, ZFS (out of scope) +# 2. Prepare kind — /srv/kind directory exists (ZFS dataset, out of scope) +# 3. laconic-so — Installs kind, mounts /srv/kind → /mnt in kind node +# 4. Prepare agave — THIS PLAYBOOK +# 5. Deploy agave — laconic-so deploys agave-stack into kind +# +# Agave requires three things from the host that kind doesn't provide: +# +# Invariant 1: /srv/solana is XFS on a zvol (not ZFS) +# Why: agave uses io_uring for async I/O. io_uring workers deadlock on +# ZFS datasets (D-state in dsl_dir_tempreserve_space). XFS on a zvol +# (block device) works fine. This is why the data lives on a zvol, not +# a ZFS dataset. +# Persisted as: fstab entry mounting /dev/zvol/.../solana at /srv/solana +# +# Invariant 2: /srv/solana/ramdisk is XFS on /dev/ram0 (600G ramdisk) +# Why: agave accounts must be on ramdisk for performance. /dev/ram0 +# loses its filesystem on reboot, so it must be reformatted before +# mounting each boot. +# Persisted as: format-ramdisk.service (mkfs before mount) + fstab entry +# +# Invariant 3: /srv/kind/solana is an rbind of /srv/solana +# Why: kind mounts /srv/kind → /mnt inside the kind node. PVs reference +# /mnt/solana/*. Without the rbind, /srv/kind/solana resolves to the ZFS +# dataset (biscayne/DATA/srv/kind), not the zvol — violating invariant 1. +# Persisted as: fstab entry with x-systemd.requires=zfs-mount.service +# (must mount AFTER ZFS, or ZFS overlay at /srv/kind hides it) +# +# This playbook checks each invariant and only acts if it's not met. +# Idempotent — safe to run multiple times. 
+#
+# Usage:
+#   ansible-playbook playbooks/biscayne-prepare-agave.yml
+#
+- name: Configure OS-level services for agave
+  hosts: all
+  gather_facts: false
+  become: true
+  vars:
+    ramdisk_device: /dev/ram0
+    zvol_device: /dev/zvol/biscayne/DATA/volumes/solana
+    solana_dir: /srv/solana
+    ramdisk_mount: /srv/solana/ramdisk
+    kind_solana_dir: /srv/kind/solana
+    accounts_dir: /srv/solana/ramdisk/accounts
+    deployment_dir: /srv/deployments/agave
+
+  tasks:
+    # ---- systemd units ----------------------------------------------------------
+    - name: Install ramdisk format service
+      ansible.builtin.copy:
+        dest: /etc/systemd/system/format-ramdisk.service
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=Format /dev/ram0 as XFS for Solana accounts
+          DefaultDependencies=no
+          Before=local-fs.target
+          After=systemd-modules-load.service
+          ConditionPathExists={{ ramdisk_device }}
+
+          [Service]
+          Type=oneshot
+          RemainAfterExit=yes
+          ExecStart=/sbin/mkfs.xfs -f {{ ramdisk_device }}
+
+          [Install]
+          WantedBy=local-fs.target
+      register: unit_file
+
+    - name: Install ramdisk post-mount service
+      ansible.builtin.copy:
+        dest: /etc/systemd/system/ramdisk-accounts.service
+        mode: "0644"
+        content: |
+          [Unit]
+          Description=Create Solana accounts directory on ramdisk
+          After=srv-solana-ramdisk.mount
+          Requires=srv-solana-ramdisk.mount
+
+          [Service]
+          Type=oneshot
+          RemainAfterExit=yes
+          ExecStart=/bin/bash -c 'mkdir -p {{ accounts_dir }} && chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}'
+
+          [Install]
+          WantedBy=multi-user.target
+      register: accounts_unit
+
+    # ---- fstab entries ----------------------------------------------------------
+    - name: Ensure zvol fstab entry
+      ansible.builtin.lineinfile:
+        path: /etc/fstab
+        regexp: '^\S+\s+{{ solana_dir }}\s'
+        line: '{{ zvol_device }} {{ solana_dir }} xfs defaults 0 2'
+      register: fstab_zvol
+
+    - name: Ensure ramdisk fstab entry
+      ansible.builtin.lineinfile:
+        path: /etc/fstab
+        regexp: '^{{ ramdisk_device }}\s+{{ ramdisk_mount }}\s'
+        line: '{{ ramdisk_device }} {{ ramdisk_mount }} xfs noatime,nodiratime,nofail,x-systemd.requires=format-ramdisk.service 0 0'
+      register: fstab_ramdisk
+
+    # rbind /srv/solana to /srv/kind/solana AFTER zfs-mount.service and ramdisk.
+    # Without this ordering, ZFS overlay at /srv/kind hides the bind mount.
+    - name: Ensure kind bind mount fstab entry
+      ansible.builtin.lineinfile:
+        path: /etc/fstab
+        regexp: '^\S+\s+{{ kind_solana_dir }}\s'
+        line: '{{ solana_dir }} {{ kind_solana_dir }} none rbind,nofail,x-systemd.requires=zfs-mount.service,x-systemd.requires=srv-solana-ramdisk.mount 0 0'
+      register: fstab_kind
+
+    # Remove stale fstab entries from previous attempts (direct zvol mount,
+    # separate ramdisk mount at /srv/kind/solana/ramdisk)
+    - name: Remove stale kind zvol fstab entry
+      ansible.builtin.lineinfile:
+        path: /etc/fstab
+        regexp: '^{{ zvol_device }}\s+{{ kind_solana_dir }}\s'
+        state: absent
+      register: fstab_stale_zvol
+
+    - name: Remove stale kind ramdisk fstab entry
+      ansible.builtin.lineinfile:
+        path: /etc/fstab
+        regexp: '^\S+\s+{{ kind_solana_dir }}/ramdisk\s'
+        state: absent
+      register: fstab_stale_ramdisk
+
+    # ---- reload and enable ------------------------------------------------------
+    - name: Reload systemd
+      ansible.builtin.systemd:
+        daemon_reload: true
+      when: >-
+        unit_file.changed or accounts_unit.changed or
+        fstab_zvol.changed or fstab_ramdisk.changed or fstab_kind.changed or
+        fstab_stale_zvol.changed or fstab_stale_ramdisk.changed
+
+    - name: Enable ramdisk services
+      ansible.builtin.systemd:
+        name: "{{ item }}"
+        enabled: true
+      loop:
+        - format-ramdisk.service
+        - ramdisk-accounts.service
+
+    # ---- apply now if ramdisk not mounted --------------------------------------
+    - name: Check if ramdisk is mounted
+      ansible.builtin.command: mountpoint -q {{ ramdisk_mount }}
+      register: ramdisk_mounted
+      failed_when: false
+      changed_when: false
+
+    - name: Format and mount ramdisk now
+      ansible.builtin.shell: |  # chained with && so a failed mkfs/mount fails the task instead of writing to the zvol
+        mkfs.xfs -f {{ ramdisk_device }} &&
+        mount {{ ramdisk_mount }} &&
+        mkdir -p {{ accounts_dir }} &&
+        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
+      changed_when: ramdisk_mounted.rc != 0
+      when: ramdisk_mounted.rc != 0
+
+    # ---- apply kind bind mount now if not correct ------------------------------
+    - name: Check kind bind mount
+      ansible.builtin.shell:
+        cmd: >  # findmnt SOURCE shows the backing device, never the bind source path; compare st_dev instead
+          mountpoint -q {{ kind_solana_dir }} && mountpoint -q {{ kind_solana_dir }}/ramdisk &&
+          [ "$(stat -c %d {{ kind_solana_dir }})" = "$(stat -c %d {{ solana_dir }})" ]
+        executable: /bin/bash
+      register: kind_mount_check
+      failed_when: false
+      changed_when: false
+
+    - name: Unmount stale kind mounts
+      ansible.builtin.shell:
+        cmd: |
+          umount {{ kind_solana_dir }}/ramdisk 2>/dev/null || true
+          umount {{ kind_solana_dir }} 2>/dev/null || true
+        executable: /bin/bash
+      changed_when: kind_mount_check.rc != 0
+      when: kind_mount_check.rc != 0
+
+    - name: Apply kind bind mount now
+      ansible.posix.mount:
+        path: "{{ kind_solana_dir }}"
+        src: "{{ solana_dir }}"
+        fstype: none
+        opts: rbind,nofail,x-systemd.requires=zfs-mount.service,x-systemd.requires=srv-solana-ramdisk.mount
+        state: mounted  # also rewrites the fstab entry, so opts must equal the fstab line written above
+      when: kind_mount_check.rc != 0
+
+    # ---- verify -----------------------------------------------------------------
+    - name: Verify ramdisk is XFS
+      ansible.builtin.shell:
+        cmd: set -o pipefail && mountpoint -q {{ ramdisk_mount }} && df -T {{ ramdisk_mount }} | grep -q xfs
+        executable: /bin/bash
+      changed_when: false  # mountpoint guard: df alone would also match the XFS zvol underneath
+
+    - name: Verify zvol is XFS
+      ansible.builtin.shell:
+        cmd: set -o pipefail && df -T {{ solana_dir }} | grep -q xfs
+        executable: /bin/bash
+      changed_when: false
+
+    - name: Verify kind bind mount contents
+      ansible.builtin.shell:
+        cmd: >
+          set -o pipefail &&
+          ls {{ kind_solana_dir }}/ledger {{ kind_solana_dir }}/snapshots
+          {{ kind_solana_dir }}/ramdisk/accounts 2>&1 | head -5
+        executable: /bin/bash
+      register: kind_mount_verify
+      changed_when: false
+
+    # Assert the kind node sees XFS (zvol), not ZFS. If this fails, kind
+    # needs a restart or laconic-so needs the HostToContainer propagation fix.
+    - name: Read cluster-id from deployment
+      ansible.builtin.shell:
+        cmd: set -o pipefail && grep '^cluster-id:' {{ deployment_dir }}/deployment.yml | awk '{print $2}'
+        executable: /bin/bash
+      register: cluster_id_result
+      changed_when: false
+
+    - name: Verify kind node sees XFS at /mnt/solana
+      ansible.builtin.shell:
+        cmd: >
+          set -o pipefail &&
+          docker exec {{ cluster_id_result.stdout }}-control-plane
+          stat -f -c '%T' /mnt/solana | grep -q xfs
+        executable: /bin/bash
+      register: kind_fstype
+      changed_when: false
+      failed_when: false
+
+    - name: Show status
+      ansible.builtin.debug:
+        msg:
+          kind_mount: "{{ kind_mount_verify.stdout_lines }}"
+          kind_fstype: "{{ 'xfs (correct)' if kind_fstype.rc == 0 else 'NOT XFS — kind restart required' }}"
diff --git a/playbooks/biscayne-redeploy.yml b/playbooks/biscayne-redeploy.yml
index 216091dc..86de9c75 100644
--- a/playbooks/biscayne-redeploy.yml
+++ b/playbooks/biscayne-redeploy.yml
@@ -172,17 +172,21 @@
       tags: [deploy, preflight]
 
     - name: Verify ramdisk is xfs (not the underlying ZFS)
-      ansible.builtin.shell: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs
+      ansible.builtin.shell:
+        cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs
+        executable: /bin/bash
       register: ramdisk_type
       failed_when: ramdisk_type.rc != 0
       changed_when: false
       tags: [deploy, preflight]
 
     - name: Verify ramdisk visible inside kind node
-      ansible.builtin.shell: >
-        set -o pipefail &&
-        docker exec {{ kind_cluster }}-control-plane
-        df -T /mnt/solana/ramdisk 2>/dev/null | grep -q xfs
+      ansible.builtin.shell:
+        cmd: >
+          set -o pipefail &&
+          docker exec {{ kind_cluster }}-control-plane
+          df -T /mnt/solana/ramdisk 2>/dev/null | grep -q xfs
+        executable: /bin/bash
       register: kind_ramdisk_check
       failed_when: kind_ramdisk_check.rc != 0
       changed_when: false
diff --git a/playbooks/health-check.yml b/playbooks/health-check.yml
index c0aa4ee6..138a9aa6 100644
--- a/playbooks/health-check.yml
+++ b/playbooks/health-check.yml
@@ -26,10 +26,12 @@
       register: kind_clusters
       changed_when: false
       failed_when: kind_clusters.rc != 0 or kind_clusters.stdout_lines | length == 0
+      tags: [always]
 
     - name: Set cluster name fact
       ansible.builtin.set_fact:
         kind_cluster: "{{ kind_clusters.stdout_lines[0] }}"
+      tags: [always]
 
     - name: Discover agave namespace
       ansible.builtin.shell:
@@ -41,10 +43,12 @@
       register: ns_result
       changed_when: false
       failed_when: ns_result.stdout_lines | length == 0
+      tags: [always]
 
     - name: Set namespace fact
       ansible.builtin.set_fact:
         agave_ns: "{{ ns_result.stdout_lines[0] }}"
+      tags: [always]
 
     - name: Get pod name
       ansible.builtin.shell:
@@ -55,15 +59,18 @@
         executable: /bin/bash
       register: pod_result
       changed_when: false
-      failed_when: pod_result.stdout | trim == ''
+      failed_when: false
+      tags: [always]
 
     - name: Set pod fact
       ansible.builtin.set_fact:
-        agave_pod: "{{ pod_result.stdout | trim }}"
+        agave_pod: "{{ pod_result.stdout | default('') | trim }}"
+      tags: [always]
 
     - name: Show discovered resources
       ansible.builtin.debug:
-        msg: "cluster={{ kind_cluster }} ns={{ agave_ns }} pod={{ agave_pod }}"
+        msg: "cluster={{ kind_cluster }} ns={{ agave_ns }} pod={{ agave_pod | default('none', true) }}"  # agave_pod is always set, so default() needs boolean=true to catch ''
+      tags: [always]
 
     # ------------------------------------------------------------------
     # Pod status
@@ -226,13 +233,59 @@
       failed_when: false
       tags: [storage]
 
+    - name: Check host mount chain
+      ansible.builtin.shell:
+        cmd: |  # findmnt takes one mountpoint per call (extra positionals become source/target filters), so query each path
+          for mnt in /srv/solana /srv/solana/ramdisk /srv/kind/solana; do
+            findmnt -n -o TARGET,SOURCE,FSTYPE,PROPAGATION -M "$mnt" 2>&1 || echo "$mnt: not a mountpoint"
+          done
+        executable: /bin/bash
+      register: host_mounts
+      changed_when: false
+      failed_when: false
+      tags: [storage, mounts]
+
+    - name: Check kind node mount visibility
+      ansible.builtin.shell:
+        cmd: |
+          set -o pipefail
+          echo "=== /mnt/solana contents ==="
+          docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/
+          echo "=== /mnt/solana filesystem ==="
+          docker exec {{ kind_cluster }}-control-plane df -T /mnt/solana
+          echo "=== /mnt/solana/ramdisk filesystem ==="
+          docker exec {{ kind_cluster }}-control-plane df -T /mnt/solana/ramdisk 2>/dev/null || echo "ramdisk not visible"
+          echo "=== /mnt/solana/snapshots ==="
+          docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/snapshots/ 2>/dev/null || echo "snapshots not visible"
+          echo "=== /mnt/solana/ledger ==="
+          docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/ledger/ 2>/dev/null | head -5 || echo "ledger not visible"
+        executable: /bin/bash
+      register: kind_mounts
+      changed_when: false
+      failed_when: false
+      tags: [storage, mounts]
+
+    - name: Check mount propagation
+      ansible.builtin.shell:
+        cmd: >
+          set -o pipefail &&
+          findmnt -n -o PROPAGATION /srv/kind
+        executable: /bin/bash
+      register: mount_propagation
+      changed_when: false
+      failed_when: false
+      tags: [storage, mounts]
+
     - name: Show storage status
       ansible.builtin.debug:
         msg:
           ramdisk: "{{ ramdisk_df.stdout_lines | default(['not mounted']) }}"
           zfs: "{{ zfs_list.stdout_lines | default([]) }}"
           zvol_io: "{{ zvol_io.stdout_lines | default([]) }}"
-      tags: [storage]
+          host_mounts: "{{ host_mounts.stdout_lines | default([]) }}"
+          kind_mounts: "{{ kind_mounts.stdout_lines | default([]) }}"
+          mount_propagation: "{{ mount_propagation.stdout | default('unknown') }}"
+      tags: [storage, mounts]
 
     # ------------------------------------------------------------------
     # System resources