fix: remove Ansible snapshot download, add sync-tools playbook

The container entrypoint (entrypoint.py) handles snapshot download internally via aria2c. Ansible no longer needs to scale-to-0, download, scale-to-1 — it just deploys and lets the container manage startup.

- biscayne-redeploy.yml: remove snapshot download section, simplify to teardown → wipe → deploy → verify
- biscayne-sync-tools.yml: new playbook to sync laconic-so and agave-stack repos on biscayne, with separate branch controls
- snapshot_download.py: re-probe for fresh incremental after full snapshot download completes (old incremental is stale by then)
- Switch laconic_so_branch to fix/kind-mount-propagation (has hostNetwork translation code)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 05:14:43 +00:00 · 2026-03-09 05:14:43 +00:00 · bd38c1b791
parent 3574e387cc
commit bd38c1b791
4 changed files with 211 additions and 179 deletions
--- a/playbooks/biscayne-redeploy.yml
+++ b/playbooks/biscayne-redeploy.yml
@ -1,46 +1,33 @@
 ---
-# Redeploy agave-stack on biscayne with aria2c snapshot pre-download
+# Redeploy agave-stack on biscayne
 #
-# The validator's built-in downloader fetches snapshots at ~18 MB/s (single
+# The container entrypoint (entrypoint.py) handles snapshot download and
-# connection). snapshot-download.py uses aria2c with 16 parallel connections to
+# agave-validator startup internally. This playbook just manages the k8s
-# saturate available bandwidth, cutting 90+ min downloads to ~10 min.
+# lifecycle: teardown, optional data wipe, deploy, and verify.
 #
 # Flow:
-#   1. [teardown]  Delete k8s namespace (preserve kind cluster)
+#   1. [teardown]  Scale to 0, wait for clean exit, delete namespace
 #   2. [wipe]      Conditionally clear ledger / accounts / old snapshots
-#   3. [deploy]    laconic-so deployment start, then immediately scale to 0
+#   3. [deploy]    Preflight checks, laconic-so deployment start
-#   4. [snapshot]  Download snapshot via aria2c to host bind mount
+#   4. [verify]    Wait for pod Running, check logs + RPC health
 #   5. [snapshot]  Verify snapshot visible inside kind node
 #   6. [deploy,scale-up] Scale validator back to 1
 #   7. [verify]    Wait for pod Running, check logs + RPC health
 #
-# The validator cannot run during snapshot download — it would lock/use the
+# The entrypoint.py inside the container:
-# snapshot files. laconic-so creates the cluster AND deploys the pod in one
+#   - Checks snapshot freshness against mainnet
-# shot, so we scale to 0 immediately after deploy, download, then scale to 1.
+#   - Downloads fresh snapshot via aria2c if needed
 #   - Builds agave-validator args from env vars
 #   - Execs agave-validator
 #
 # Usage:
-#   # Standard redeploy (download snapshot, preserve accounts + ledger)
+#   # Standard redeploy
-#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml
+#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml
 #
 #   # Full wipe (accounts + ledger) — slow rebuild
-#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
+#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml \
 #     -e wipe_accounts=true -e wipe_ledger=true
 #
-#   # Skip snapshot download (use existing)
+#   # Skip snapshot cleanup (use existing)
-#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
+#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml \
-#     -e skip_snapshot=true
+#     -e skip_snapshot_cleanup=true
 #
 #   # Pass extra args to snapshot-download.py
 #   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
 #     -e 'snapshot_args=--version 2.2 --min-download-speed 50'
 #
 #   # Snapshot only (no teardown/deploy)
 #   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
 #     --tags snapshot
 #
 #   # Resume after partial failure (download snapshot, scale up, verify)
 #   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
 #     --tags snapshot,scale-up,verify
 #
 - name: Redeploy agave validator on biscayne
  hosts: all
@ -53,7 +40,7 @@
    stack_path: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    laconic_so_repo: /home/rix/stack-orchestrator
-    laconic_so_branch: main
+    laconic_so_branch: fix/kind-mount-propagation
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
@ -62,13 +49,10 @@
    accounts_dir: /srv/kind/solana/ramdisk/accounts
    ramdisk_mount: /srv/kind/solana/ramdisk
    ramdisk_size: 1024G
    snapshot_script_local: "{{ playbook_dir }}/../scripts/agave-container/snapshot_download.py"
    snapshot_script: /tmp/snapshot-download.py
    # Flags — non-destructive by default
    wipe_accounts: false
    wipe_ledger: false
-    skip_snapshot: false
+    skip_snapshot_cleanup: false
    snapshot_args: ""
  tasks:
    # ---- teardown: graceful stop, then delete namespace ----------------------
@ -121,12 +105,14 @@
      tags: [teardown]
    - name: Clear stale claimRefs on Released PVs
-      ansible.builtin.shell: |
+      ansible.builtin.shell:
-        set -o pipefail
+        cmd: |
-        for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
+          set -o pipefail
-          kubectl patch pv "$pv" --type json \
+          for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
-            -p '[{"op":"remove","path":"/spec/claimRef"}]'
+            kubectl patch pv "$pv" --type json \
-        done
+              -p '[{"op":"remove","path":"/spec/claimRef"}]'
          done
        executable: /bin/bash
      register: pv_patch
      changed_when: pv_patch.stdout != ""
      tags: [teardown]
@ -151,20 +137,22 @@
      tags: [wipe]
    - name: Clean old snapshots (keep newest full + incremental)
-      ansible.builtin.shell: |
+      ansible.builtin.shell:
-        set -o pipefail
+        cmd: |
-        cd {{ snapshot_dir }} || exit 0
+          set -o pipefail
-        newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
+          cd {{ snapshot_dir }} || exit 0
-        if [ -n "$newest" ]; then
+          newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
-          newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
+          if [ -n "$newest" ]; then
-          find . -maxdepth 1 -name '*.tar.*' \
+            newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
-            ! -name "$newest" \
+            find . -maxdepth 1 -name '*.tar.*' \
-            ! -name "${newest_inc:-__none__}" \
+              ! -name "$newest" \
-            -delete
+              ! -name "${newest_inc:-__none__}" \
-        fi
+              -delete
          fi
        executable: /bin/bash
      become: true
      changed_when: true
-      when: not skip_snapshot | bool
+      when: not skip_snapshot_cleanup | bool
      tags: [wipe]
    # ---- preflight: verify ramdisk and mounts before deploy ------------------
@ -175,35 +163,16 @@
      changed_when: false
      tags: [deploy, preflight]
-    - name: Verify ramdisk is xfs (not the underlying ZFS)
+    - name: Verify ramdisk is tmpfs (not the underlying ZFS)
      ansible.builtin.shell:
-        cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs
+        cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q tmpfs
        executable: /bin/bash
      register: ramdisk_type
      failed_when: ramdisk_type.rc != 0
      changed_when: false
      tags: [deploy, preflight]
-    # ---- deploy: sync config, bring up cluster, scale to 0 ------------------
+    # ---- deploy: bring up cluster, let entrypoint handle snapshot ------------
    - name: Pull agave-stack repo
      ansible.builtin.shell: |
        cd {{ stack_repo }}
        git fetch origin
        git reset --hard origin/{{ laconic_so_branch }}
      changed_when: true
      tags: [deploy]
    - name: Regenerate deployment config from updated stack
      ansible.builtin.command: >
        {{ laconic_so }}
        --stack {{ stack_path }}
        deploy create
        --spec-file {{ deployment_dir }}/spec.yml
        --deployment-dir {{ deployment_dir }}
        --update
      changed_when: true
      tags: [deploy]
    - name: Check kind-config.yml mount style
      ansible.builtin.command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
      register: mount_root_check
@ -220,14 +189,6 @@
      when: mount_root_check.stdout | default('0') | int < 1
      tags: [deploy]
    - name: Update laconic-so (editable install)
      ansible.builtin.shell: |
        cd {{ laconic_so_repo }}
        git fetch origin
        git reset --hard origin/{{ laconic_so_branch }}
      changed_when: true
      tags: [deploy]
    - name: Start deployment (creates kind cluster + deploys pod)
      ansible.builtin.command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
      register: deploy_start
@ -272,99 +233,17 @@
        cmd: >
          set -o pipefail &&
          docker exec {{ kind_cluster }}-control-plane
-          df -T /mnt/validator-accounts 2>/dev/null | grep -q xfs
+          df -T /mnt/validator-accounts 2>/dev/null | grep -q tmpfs
        executable: /bin/bash
      register: kind_ramdisk_check
      failed_when: kind_ramdisk_check.rc != 0
      changed_when: false
      tags: [deploy]
    - name: Scale validator to 0 (stop before snapshot download)
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      changed_when: true
      tags: [deploy]
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_gone
      retries: 30
      delay: 5
      until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
      changed_when: false
      failed_when: false
      tags: [deploy]
    # ---- snapshot: download via aria2c, verify in kind node ------------------
    - name: Verify aria2c installed
      ansible.builtin.command: which aria2c
      changed_when: false
      when: not skip_snapshot | bool
      tags: [snapshot]
    - name: Copy snapshot script to remote
      ansible.builtin.copy:
        src: "{{ snapshot_script_local }}"
        dest: "{{ snapshot_script }}"
        mode: "0755"
      when: not skip_snapshot | bool
      tags: [snapshot]
    - name: Verify kind node mounts
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        ls /mnt/validator-snapshots/
      register: kind_mount_check
      changed_when: false
      tags: [snapshot]
    - name: Download snapshot via aria2c
      ansible.builtin.shell: >
        python3 {{ snapshot_script }}
        -o {{ snapshot_dir }}
        {{ snapshot_args }}
      become: true
      register: snapshot_result
      changed_when: true
      when: not skip_snapshot | bool
      timeout: 3600
      tags: [snapshot]
    - name: Show snapshot download result
      ansible.builtin.debug:
        msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
      tags: [snapshot]
    - name: Verify snapshot visible inside kind node
      ansible.builtin.shell: >
        set -o pipefail &&
        docker exec {{ kind_cluster }}-control-plane
        find /mnt/validator-snapshots/ -name '*.tar.*' -maxdepth 1 | head -5
      register: kind_snapshot_check
      failed_when: kind_snapshot_check.stdout == ""
      changed_when: false
      when: not skip_snapshot | bool
      tags: [snapshot]
    - name: Show snapshot files in kind node
      ansible.builtin.debug:
        msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}"
      when: not skip_snapshot | bool
      tags: [snapshot]
    # ---- deploy (cont): scale validator back up with snapshot ----------------
    - name: Scale validator to 1 (start with downloaded snapshot)
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=1
      changed_when: true
      tags: [deploy, scale-up]
    # ---- verify: confirm validator is running --------------------------------
    # The entrypoint.py handles snapshot download + agave-validator startup.
    # Pod will be Running once the container starts, but agave-validator won't
    # exec until after snapshot download completes (if needed).
    - name: Wait for pod to be running
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
--- a/playbooks/biscayne-start.yml
+++ b/playbooks/biscayne-start.yml
@ -61,24 +61,33 @@
    # laconic-so creates individual extraMounts per volume:
    #   /srv/kind/solana/ledger → /mnt/validator-ledger (inside kind node)
    #   /srv/kind/solana/ramdisk/accounts → /mnt/validator-accounts
-    - name: Verify kind node sees XFS at PV paths
+    - name: Verify kind node sees correct filesystems at PV paths
      ansible.builtin.shell:
        cmd: >
          set -o pipefail &&
          docker exec {{ kind_node }}
          df -T /mnt/validator-ledger /mnt/validator-accounts
          | grep -c xfs
        executable: /bin/bash
-      register: kind_xfs_check
+      register: kind_fs_check
      changed_when: false
-    - name: Fail if PV paths are not XFS
+    - name: Fail if ledger is not XFS (zvol)
      ansible.builtin.fail:
        msg: >-
-          Expected 2 XFS mounts (validator-ledger, validator-accounts) but
+          validator-ledger must be XFS (on zvol). Got:
-          found {{ kind_xfs_check.stdout }}. Run biscayne-prepare-agave.yml
+          {{ kind_fs_check.stdout }}
-          and restart the kind container.
+      when: "'xfs' not in kind_fs_check.stdout"
-      when: kind_xfs_check.stdout | int < 2
+
    - name: Fail if accounts is on ZFS (must be tmpfs)
      ansible.builtin.shell:
        cmd: >
          set -o pipefail &&
          docker exec {{ kind_node }}
          df -T /mnt/validator-accounts | grep -q zfs
        executable: /bin/bash
      register: accounts_zfs_check
      changed_when: false
      failed_when: accounts_zfs_check.rc == 0
    - name: Show kind node PV filesystems
      ansible.builtin.shell:
--- a/playbooks/biscayne-sync-tools.yml
+++ b/playbooks/biscayne-sync-tools.yml
@ -0,0 +1,96 @@
 ---
 # Sync laconic-so and agave-stack to latest on biscayne
 #
 # Updates both repos that laconic-so deployment commands depend on:
 #   - stack-orchestrator (laconic-so itself, editable install)
 #   - agave-stack (stack definitions, compose files, container scripts)
 #
 # Then regenerates the deployment config from the updated stack.
 # Does NOT restart anything — just syncs code and config.
 #
 # Usage:
 #   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-sync-tools.yml
 #
 #   # Use feature branches (each repo has its own branch variable)
 #   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-sync-tools.yml \
 #     -e laconic_so_branch=fix/kind-mount-propagation -e stack_branch=main
 #
 - name: Sync laconic-so and agave-stack
  hosts: all
  # None of the tasks in this playbook reference gathered host facts.
  gather_facts: false
  # Play-level environment: exported to every task below.
  environment:
    KUBECONFIG: /home/rix/.kube/config
  vars:
    # Passed to `deploy create` as --deployment-dir; spec.yml is read from here.
    deployment_dir: /srv/deployments/agave
    # agave-stack git checkout; reset to origin/<stack_branch>.
    stack_repo: /srv/deployments/agave-stack
    # Stack definition inside stack_repo, passed to laconic-so as --stack.
    stack_path: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
    # laconic-so executable invoked to regenerate the deployment config.
    laconic_so: /home/rix/.local/bin/laconic-so
    # stack-orchestrator git checkout; reset to origin/<laconic_so_branch>.
    laconic_so_repo: /home/rix/stack-orchestrator
    # The two branches are independent; override either one with -e.
    laconic_so_branch: fix/kind-mount-propagation
    stack_branch: main
  tasks:
    # Sync the stack-orchestrator checkout to the tip of origin/<laconic_so_branch>.
    # `set -e` stops at the first failing command, so a failed `cd` or
    # `git fetch` fails the task instead of being masked by a successful
    # reset to a stale origin ref. `reset --hard` discards local changes.
    - name: Update laconic-so (editable install)
      ansible.builtin.shell:
        cmd: |
          set -e
          cd {{ laconic_so_repo }}
          git fetch origin
          git reset --hard origin/{{ laconic_so_branch }}
      register: laconic_so_update
      changed_when: true
    # Capture the one-line summary of the commit the stack-orchestrator
    # checkout currently points at (read-only, never reported as changed).
    - name: Show laconic-so version
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: >-
          set -o pipefail &&
          cd {{ laconic_so_repo }} &&
          git log --oneline -1
      changed_when: false
      register: laconic_so_version
    # Print the captured commit summary in the play output.
    - name: Report laconic-so
      ansible.builtin.debug:
        msg: 'laconic-so: {{ laconic_so_version.stdout }}'
    # Locate an SSH agent socket path for the agave-stack git fetch below.
    # `ls -t` lists matches newest-first and `head -1` keeps only the first,
    # so the most recently modified /tmp/ssh-*/agent.* path is selected.
    # Errors from `ls` (e.g. no match) are silenced by 2>/dev/null.
    - name: Find SSH agent socket
      ansible.builtin.shell:
        cmd: set -o pipefail && ls -t /tmp/ssh-*/agent.* 2>/dev/null | head -1
        executable: /bin/bash
      register: ssh_agent_socket
      changed_when: false
      # Failure is decided solely by whether a path was printed.
      failed_when: ssh_agent_socket.stdout == ""
    # Sync the agave-stack checkout to the tip of origin/<stack_branch>.
    # The agent socket found by the previous task is exported as
    # SSH_AUTH_SOCK for the git commands (quoted so an unusual path cannot
    # be word-split). `set -e` stops at the first failing command, so a
    # failed `cd` or `git fetch` fails the task instead of being masked by a
    # successful reset to a stale origin ref. `reset --hard` discards local
    # changes in the checkout.
    - name: Pull agave-stack repo
      ansible.builtin.shell:
        cmd: |
          set -e
          export SSH_AUTH_SOCK="{{ ssh_agent_socket.stdout }}"
          cd {{ stack_repo }}
          git fetch origin
          git reset --hard origin/{{ stack_branch }}
      register: stack_update
      changed_when: true
    # Capture the one-line summary of the commit the agave-stack checkout
    # currently points at (read-only, never reported as changed).
    - name: Show agave-stack version
      ansible.builtin.shell:
        executable: /bin/bash
        cmd: >-
          set -o pipefail &&
          cd {{ stack_repo }} &&
          git log --oneline -1
      changed_when: false
      register: stack_version
    # Print the captured commit summary in the play output.
    - name: Report agave-stack
      ansible.builtin.debug:
        msg: 'agave-stack: {{ stack_version.stdout }}'
    # Re-run `laconic-so deploy create` with --update against the existing
    # spec file and deployment directory, using the stack definition at
    # stack_path. Always reported as changed; the command result is kept in
    # regen_result.
    - name: Regenerate deployment config from updated stack
      ansible.builtin.command: >
        {{ laconic_so }}
        --stack {{ stack_path }}
        deploy create
        --spec-file {{ deployment_dir }}/spec.yml
        --deployment-dir {{ deployment_dir }}
        --update
      register: regen_result
      changed_when: true
    # Final summary. Each repo is reset to its own branch, so the message
    # names both laconic_so_branch and stack_branch rather than implying
    # that agave-stack followed laconic_so_branch.
    - name: Report sync complete
      ansible.builtin.debug:
        msg: >-
          Sync complete. laconic-so updated to origin/{{ laconic_so_branch }},
          agave-stack updated to origin/{{ stack_branch }}. Deployment config
          regenerated. Restart or redeploy required to apply changes.
--- a/scripts/agave-container/snapshot_download.py
+++ b/scripts/agave-container/snapshot_download.py
@ -513,11 +513,18 @@ def download_best_snapshot(
    for filename, mirror_urls in download_plan:
        log.info("  %s (%d mirrors)", filename, len(mirror_urls))
-    # Download
+    # Download — full snapshot first, then re-probe for fresh incremental
    os.makedirs(output_dir, exist_ok=True)
    total_start: float = time.monotonic()
    # Separate full and incremental from the initial plan
    full_downloads: list[tuple[str, list[str]]] = []
    for filename, mirror_urls in download_plan:
        if filename.startswith("snapshot-"):
            full_downloads.append((filename, mirror_urls))
    # Download full snapshot(s)
    for filename, mirror_urls in full_downloads:
        filepath: Path = Path(output_dir) / filename
        if filepath.exists() and filepath.stat().st_size > 0:
            log.info("Skipping %s (already exists: %.1f GB)",
@ -527,6 +534,47 @@ def download_best_snapshot(
            log.error("Failed to download %s", filename)
            return False
    # After full snapshot download, re-probe for a fresh incremental.
    # The initial incremental is stale by now (full download takes 10+ min).
    if not full_only:
        # Get the full snapshot slot from the filename we just downloaded
        full_filename: str = full_downloads[0][0]
        fm_post: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
        if fm_post:
            full_snap_slot: int = int(fm_post.group(1))
            log.info("Re-probing for fresh incremental based on slot %d...", full_snap_slot)
            inc_downloaded: bool = False
            for source in fast_sources:
                inc_url_re: str = f"http://{source.rpc_address}/incremental-snapshot.tar.bz2"
                inc_location, _ = head_no_follow(inc_url_re, timeout=2)
                if not inc_location:
                    continue
                inc_fn, inc_fp = _parse_snapshot_filename(inc_location)
                m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
                if not m_inc:
                    continue
                if int(m_inc.group(1)) != full_snap_slot:
                    log.debug("  %s: incremental base slot %s != full %d, skipping",
                              source.rpc_address, m_inc.group(1), full_snap_slot)
                    continue
                # Found a matching incremental — build mirror list and download
                inc_mirrors: list[str] = [f"http://{source.rpc_address}{inc_fp}"]
                for other in fast_sources:
                    if other.rpc_address == source.rpc_address:
                        continue
                    other_loc, _ = head_no_follow(
                        f"http://{other.rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
                    if other_loc:
                        other_fn, other_fp = _parse_snapshot_filename(other_loc)
                        if other_fn == inc_fn:
                            inc_mirrors.append(f"http://{other.rpc_address}{other_fp}")
                log.info("  Found incremental %s (%d mirrors)", inc_fn, len(inc_mirrors))
                if download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
                    inc_downloaded = True
                break
            if not inc_downloaded:
                log.info("No matching incremental found — validator will replay from full snapshot")
    total_elapsed: float = time.monotonic() - total_start
    log.info("All downloads complete in %.0fs", total_elapsed)
    for filename, _ in download_plan: