fix: remove Ansible snapshot download, add sync-tools playbook

The container entrypoint (entrypoint.py) handles snapshot download
internally via aria2c. Ansible no longer needs to scale-to-0, download,
scale-to-1 — it just deploys and lets the container manage startup.

- biscayne-redeploy.yml: remove snapshot download section, simplify to
  teardown → wipe → deploy → verify
- biscayne-sync-tools.yml: new playbook to sync laconic-so and
  agave-stack repos on biscayne, with separate branch controls
- snapshot_download.py: re-probe for fresh incremental after full
  snapshot download completes (old incremental is stale by then)
- Switch laconic_so_branch to fix/kind-mount-propagation (has
  hostNetwork translation code)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix/kind-mount-propagation
A. F. Dudley 2026-03-09 05:14:43 +00:00
parent 3574e387cc
commit bd38c1b791
4 changed files with 211 additions and 179 deletions
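For orientation, the startup flow that now lives inside the container (and that Ansible no longer replicates) looks roughly like the sketch below. Helper names, the env var, and the default path are illustrative assumptions, not the actual entrypoint.py API.

#!/usr/bin/env python3
# Sketch of the container startup flow described in this commit; helper names,
# env vars, and defaults are assumptions, not the real entrypoint.py interface.
import os

SNAPSHOT_DIR = os.environ.get("SNAPSHOT_DIR", "/mnt/validator-snapshots")


def snapshot_is_fresh(snapshot_dir: str) -> bool:
    """Stub: compare the newest local full snapshot against the cluster tip."""
    raise NotImplementedError


def download_snapshot(snapshot_dir: str) -> None:
    """Stub: fetch a fresh snapshot via aria2c, as snapshot_download.py does."""
    raise NotImplementedError


def build_validator_args() -> list[str]:
    """Stub: translate container env vars into agave-validator CLI flags."""
    raise NotImplementedError


def main() -> None:
    if not snapshot_is_fresh(SNAPSHOT_DIR):
        download_snapshot(SNAPSHOT_DIR)
    # exec rather than spawn: agave-validator replaces the entrypoint as PID 1,
    # so container signals and restarts apply to the validator itself.
    os.execvp("agave-validator", ["agave-validator", *build_validator_args()])


if __name__ == "__main__":
    main()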

View File

@@ -1,46 +1,33 @@
---
# Redeploy agave-stack on biscayne with aria2c snapshot pre-download
# Redeploy agave-stack on biscayne
#
# The validator's built-in downloader fetches snapshots at ~18 MB/s (single
# connection). snapshot-download.py uses aria2c with 16 parallel connections to
# saturate available bandwidth, cutting 90+ min downloads to ~10 min.
# The container entrypoint (entrypoint.py) handles snapshot download and
# agave-validator startup internally. This playbook just manages the k8s
# lifecycle: teardown, optional data wipe, deploy, and verify.
#
# Flow:
# 1. [teardown] Delete k8s namespace (preserve kind cluster)
# 1. [teardown] Scale to 0, wait for clean exit, delete namespace
# 2. [wipe] Conditionally clear ledger / accounts / old snapshots
# 3. [deploy] laconic-so deployment start, then immediately scale to 0
# 4. [snapshot] Download snapshot via aria2c to host bind mount
# 5. [snapshot] Verify snapshot visible inside kind node
# 6. [deploy,scale-up] Scale validator back to 1
# 7. [verify] Wait for pod Running, check logs + RPC health
# 3. [deploy] Preflight checks, laconic-so deployment start
# 4. [verify] Wait for pod Running, check logs + RPC health
#
# The validator cannot run during snapshot download — it would lock/use the
# snapshot files. laconic-so creates the cluster AND deploys the pod in one
# shot, so we scale to 0 immediately after deploy, download, then scale to 1.
# The entrypoint.py inside the container:
# - Checks snapshot freshness against mainnet
# - Downloads fresh snapshot via aria2c if needed
# - Builds agave-validator args from env vars
# - Execs agave-validator
#
# Usage:
# # Standard redeploy (download snapshot, preserve accounts + ledger)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml
# # Standard redeploy
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml
#
# # Full wipe (accounts + ledger) — slow rebuild
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml \
# -e wipe_accounts=true -e wipe_ledger=true
#
# # Skip snapshot download (use existing)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# -e skip_snapshot=true
#
# # Pass extra args to snapshot-download.py
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# -e 'snapshot_args=--version 2.2 --min-download-speed 50'
#
# # Snapshot only (no teardown/deploy)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# --tags snapshot
#
# # Resume after partial failure (download snapshot, scale up, verify)
# ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-redeploy.yml \
# --tags snapshot,scale-up,verify
# # Skip old-snapshot cleanup (keep existing snapshot files)
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-redeploy.yml \
# -e skip_snapshot_cleanup=true
#
- name: Redeploy agave validator on biscayne
hosts: all
@@ -53,7 +40,7 @@
stack_path: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
laconic_so: /home/rix/.local/bin/laconic-so
laconic_so_repo: /home/rix/stack-orchestrator
laconic_so_branch: main
laconic_so_branch: fix/kind-mount-propagation
kind_cluster: laconic-70ce4c4b47e23b85
k8s_namespace: "laconic-{{ kind_cluster }}"
deployment_name: "{{ kind_cluster }}-deployment"
@@ -62,13 +49,10 @@
accounts_dir: /srv/kind/solana/ramdisk/accounts
ramdisk_mount: /srv/kind/solana/ramdisk
ramdisk_size: 1024G
snapshot_script_local: "{{ playbook_dir }}/../scripts/agave-container/snapshot_download.py"
snapshot_script: /tmp/snapshot-download.py
# Flags — non-destructive by default
wipe_accounts: false
wipe_ledger: false
skip_snapshot: false
snapshot_args: ""
skip_snapshot_cleanup: false
tasks:
# ---- teardown: graceful stop, then delete namespace ----------------------
@@ -121,12 +105,14 @@
tags: [teardown]
- name: Clear stale claimRefs on Released PVs
ansible.builtin.shell: |
ansible.builtin.shell:
cmd: |
set -o pipefail
for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
kubectl patch pv "$pv" --type json \
-p '[{"op":"remove","path":"/spec/claimRef"}]'
done
executable: /bin/bash
register: pv_patch
changed_when: pv_patch.stdout != ""
tags: [teardown]
@@ -151,7 +137,8 @@
tags: [wipe]
- name: Clean old snapshots (keep newest full + incremental)
ansible.builtin.shell: |
ansible.builtin.shell:
cmd: |
set -o pipefail
cd {{ snapshot_dir }} || exit 0
newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
@@ -162,9 +149,10 @@
! -name "${newest_inc:-__none__}" \
-delete
fi
executable: /bin/bash
become: true
changed_when: true
when: not skip_snapshot | bool
when: not skip_snapshot_cleanup | bool
tags: [wipe]
# ---- preflight: verify ramdisk and mounts before deploy ------------------
@@ -175,35 +163,16 @@
changed_when: false
tags: [deploy, preflight]
- name: Verify ramdisk is xfs (not the underlying ZFS)
- name: Verify ramdisk is tmpfs (not the underlying ZFS)
ansible.builtin.shell:
cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs
cmd: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q tmpfs
executable: /bin/bash
register: ramdisk_type
failed_when: ramdisk_type.rc != 0
changed_when: false
tags: [deploy, preflight]
# ---- deploy: sync config, bring up cluster, scale to 0 ------------------
- name: Pull agave-stack repo
ansible.builtin.shell: |
cd {{ stack_repo }}
git fetch origin
git reset --hard origin/{{ laconic_so_branch }}
changed_when: true
tags: [deploy]
- name: Regenerate deployment config from updated stack
ansible.builtin.command: >
{{ laconic_so }}
--stack {{ stack_path }}
deploy create
--spec-file {{ deployment_dir }}/spec.yml
--deployment-dir {{ deployment_dir }}
--update
changed_when: true
tags: [deploy]
# ---- deploy: bring up cluster, let entrypoint handle snapshot ------------
- name: Check kind-config.yml mount style
ansible.builtin.command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
register: mount_root_check
@@ -220,14 +189,6 @@
when: mount_root_check.stdout | default('0') | int < 1
tags: [deploy]
- name: Update laconic-so (editable install)
ansible.builtin.shell: |
cd {{ laconic_so_repo }}
git fetch origin
git reset --hard origin/{{ laconic_so_branch }}
changed_when: true
tags: [deploy]
- name: Start deployment (creates kind cluster + deploys pod)
ansible.builtin.command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
register: deploy_start
@@ -272,99 +233,17 @@
cmd: >
set -o pipefail &&
docker exec {{ kind_cluster }}-control-plane
df -T /mnt/validator-accounts 2>/dev/null | grep -q xfs
df -T /mnt/validator-accounts 2>/dev/null | grep -q tmpfs
executable: /bin/bash
register: kind_ramdisk_check
failed_when: kind_ramdisk_check.rc != 0
changed_when: false
tags: [deploy]
- name: Scale validator to 0 (stop before snapshot download)
ansible.builtin.command: >
kubectl scale deployment {{ deployment_name }}
-n {{ k8s_namespace }} --replicas=0
changed_when: true
tags: [deploy]
- name: Wait for pods to terminate
ansible.builtin.command: >
kubectl get pods -n {{ k8s_namespace }}
-l app={{ deployment_name }}
-o jsonpath='{.items}'
register: pods_gone
retries: 30
delay: 5
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
changed_when: false
failed_when: false
tags: [deploy]
# ---- snapshot: download via aria2c, verify in kind node ------------------
- name: Verify aria2c installed
ansible.builtin.command: which aria2c
changed_when: false
when: not skip_snapshot | bool
tags: [snapshot]
- name: Copy snapshot script to remote
ansible.builtin.copy:
src: "{{ snapshot_script_local }}"
dest: "{{ snapshot_script }}"
mode: "0755"
when: not skip_snapshot | bool
tags: [snapshot]
- name: Verify kind node mounts
ansible.builtin.command: >
docker exec {{ kind_cluster }}-control-plane
ls /mnt/validator-snapshots/
register: kind_mount_check
changed_when: false
tags: [snapshot]
- name: Download snapshot via aria2c
ansible.builtin.shell: >
python3 {{ snapshot_script }}
-o {{ snapshot_dir }}
{{ snapshot_args }}
become: true
register: snapshot_result
changed_when: true
when: not skip_snapshot | bool
timeout: 3600
tags: [snapshot]
- name: Show snapshot download result
ansible.builtin.debug:
msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
tags: [snapshot]
- name: Verify snapshot visible inside kind node
ansible.builtin.shell: >
set -o pipefail &&
docker exec {{ kind_cluster }}-control-plane
find /mnt/validator-snapshots/ -name '*.tar.*' -maxdepth 1 | head -5
register: kind_snapshot_check
failed_when: kind_snapshot_check.stdout == ""
changed_when: false
when: not skip_snapshot | bool
tags: [snapshot]
- name: Show snapshot files in kind node
ansible.builtin.debug:
msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}"
when: not skip_snapshot | bool
tags: [snapshot]
# ---- deploy (cont): scale validator back up with snapshot ----------------
- name: Scale validator to 1 (start with downloaded snapshot)
ansible.builtin.command: >
kubectl scale deployment {{ deployment_name }}
-n {{ k8s_namespace }} --replicas=1
changed_when: true
tags: [deploy, scale-up]
# ---- verify: confirm validator is running --------------------------------
# The entrypoint.py handles snapshot download + agave-validator startup.
# Pod will be Running once the container starts, but agave-validator won't
# exec until after snapshot download completes (if needed).
- name: Wait for pod to be running
ansible.builtin.command: >
kubectl get pods -n {{ k8s_namespace }}
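The "check logs + RPC health" step mentioned in the playbook header usually amounts to a JSON-RPC getHealth call against the validator's RPC port. A minimal probe, as a sketch (the host, port 8899, and timeout are assumptions; the playbook may address the pod differently):

import json
import urllib.request


def rpc_health(url: str = "http://127.0.0.1:8899") -> str:
    """Send a Solana JSON-RPC getHealth request and return the result or error message."""
    req = urllib.request.Request(
        url,
        data=json.dumps({"jsonrpc": "2.0", "id": 1, "method": "getHealth"}).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        body = json.load(resp)
    # A caught-up validator answers {"result": "ok"}; while still downloading or
    # replaying the snapshot it returns a JSON-RPC error instead.
    return body.get("result") or body.get("error", {}).get("message", "unknown")


if __name__ == "__main__":
    print(rpc_health())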

View File

@@ -61,24 +61,33 @@
# laconic-so creates individual extraMounts per volume:
# /srv/kind/solana/ledger → /mnt/validator-ledger (inside kind node)
# /srv/kind/solana/ramdisk/accounts → /mnt/validator-accounts
- name: Verify kind node sees XFS at PV paths
- name: Verify kind node sees correct filesystems at PV paths
ansible.builtin.shell:
cmd: >
set -o pipefail &&
docker exec {{ kind_node }}
df -T /mnt/validator-ledger /mnt/validator-accounts
| grep -c xfs
executable: /bin/bash
register: kind_xfs_check
register: kind_fs_check
changed_when: false
- name: Fail if PV paths are not XFS
- name: Fail if ledger is not XFS (zvol)
ansible.builtin.fail:
msg: >-
Expected 2 XFS mounts (validator-ledger, validator-accounts) but
found {{ kind_xfs_check.stdout }}. Run biscayne-prepare-agave.yml
and restart the kind container.
when: kind_xfs_check.stdout | int < 2
validator-ledger must be XFS (on zvol). Got:
{{ kind_fs_check.stdout }}
when: "'xfs' not in kind_fs_check.stdout"
- name: Fail if accounts is on ZFS (must be tmpfs)
ansible.builtin.shell:
cmd: >
set -o pipefail &&
docker exec {{ kind_node }}
df -T /mnt/validator-accounts | grep -q zfs
executable: /bin/bash
register: accounts_zfs_check
changed_when: false
failed_when: accounts_zfs_check.rc == 0
- name: Show kind node PV filesystems
ansible.builtin.shell:

View File

@@ -0,0 +1,96 @@
---
# Sync laconic-so and agave-stack to latest on biscayne
#
# Updates both repos that laconic-so deployment commands depend on:
# - stack-orchestrator (laconic-so itself, editable install)
# - agave-stack (stack definitions, compose files, container scripts)
#
# Then regenerates the deployment config from the updated stack.
# Does NOT restart anything — just syncs code and config.
#
# Usage:
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-sync-tools.yml
#
# # Use a feature branch
# ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-sync-tools.yml \
# -e laconic_so_branch=fix/kind-mount-propagation
#
- name: Sync laconic-so and agave-stack
hosts: all
gather_facts: false
environment:
KUBECONFIG: /home/rix/.kube/config
vars:
deployment_dir: /srv/deployments/agave
stack_repo: /srv/deployments/agave-stack
stack_path: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
laconic_so: /home/rix/.local/bin/laconic-so
laconic_so_repo: /home/rix/stack-orchestrator
laconic_so_branch: fix/kind-mount-propagation
stack_branch: main
tasks:
- name: Update laconic-so (editable install)
ansible.builtin.shell: |
cd {{ laconic_so_repo }}
git fetch origin
git reset --hard origin/{{ laconic_so_branch }}
register: laconic_so_update
changed_when: true
- name: Show laconic-so version
ansible.builtin.shell:
cmd: set -o pipefail && cd {{ laconic_so_repo }} && git log --oneline -1
executable: /bin/bash
register: laconic_so_version
changed_when: false
- name: Report laconic-so
ansible.builtin.debug:
msg: "laconic-so: {{ laconic_so_version.stdout }}"
- name: Find SSH agent socket
ansible.builtin.shell:
cmd: set -o pipefail && ls -t /tmp/ssh-*/agent.* 2>/dev/null | head -1
executable: /bin/bash
register: ssh_agent_socket
changed_when: false
failed_when: ssh_agent_socket.stdout == ""
- name: Pull agave-stack repo
ansible.builtin.shell: |
export SSH_AUTH_SOCK={{ ssh_agent_socket.stdout }}
cd {{ stack_repo }}
git fetch origin
git reset --hard origin/{{ stack_branch }}
register: stack_update
changed_when: true
- name: Show agave-stack version
ansible.builtin.shell:
cmd: set -o pipefail && cd {{ stack_repo }} && git log --oneline -1
executable: /bin/bash
register: stack_version
changed_when: false
- name: Report agave-stack
ansible.builtin.debug:
msg: "agave-stack: {{ stack_version.stdout }}"
- name: Regenerate deployment config from updated stack
ansible.builtin.command: >
{{ laconic_so }}
--stack {{ stack_path }}
deploy create
--spec-file {{ deployment_dir }}/spec.yml
--deployment-dir {{ deployment_dir }}
--update
register: regen_result
changed_when: true
- name: Report sync complete
ansible.builtin.debug:
msg: >-
Sync complete. laconic-so updated to origin/{{ laconic_so_branch }},
agave-stack to origin/{{ stack_branch }}. Deployment config regenerated.
Restart or redeploy required to apply changes.

View File

@@ -513,11 +513,18 @@ def download_best_snapshot(
for filename, mirror_urls in download_plan:
log.info(" %s (%d mirrors)", filename, len(mirror_urls))
# Download
# Download — full snapshot first, then re-probe for fresh incremental
os.makedirs(output_dir, exist_ok=True)
total_start: float = time.monotonic()
# Separate full and incremental from the initial plan
full_downloads: list[tuple[str, list[str]]] = []
for filename, mirror_urls in download_plan:
if filename.startswith("snapshot-"):
full_downloads.append((filename, mirror_urls))
# Download full snapshot(s)
for filename, mirror_urls in full_downloads:
filepath: Path = Path(output_dir) / filename
if filepath.exists() and filepath.stat().st_size > 0:
log.info("Skipping %s (already exists: %.1f GB)",
@@ -527,6 +534,47 @@
log.error("Failed to download %s", filename)
return False
# After full snapshot download, re-probe for a fresh incremental.
# The initial incremental is stale by now (full download takes 10+ min).
if not full_only:
# Get the full snapshot slot from the filename we just downloaded
full_filename: str = full_downloads[0][0]
fm_post: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
if fm_post:
full_snap_slot: int = int(fm_post.group(1))
log.info("Re-probing for fresh incremental based on slot %d...", full_snap_slot)
inc_downloaded: bool = False
for source in fast_sources:
inc_url_re: str = f"http://{source.rpc_address}/incremental-snapshot.tar.bz2"
inc_location, _ = head_no_follow(inc_url_re, timeout=2)
if not inc_location:
continue
inc_fn, inc_fp = _parse_snapshot_filename(inc_location)
m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
if not m_inc:
continue
if int(m_inc.group(1)) != full_snap_slot:
log.debug(" %s: incremental base slot %s != full %d, skipping",
source.rpc_address, m_inc.group(1), full_snap_slot)
continue
# Found a matching incremental — build mirror list and download
inc_mirrors: list[str] = [f"http://{source.rpc_address}{inc_fp}"]
for other in fast_sources:
if other.rpc_address == source.rpc_address:
continue
other_loc, _ = head_no_follow(
f"http://{other.rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
if other_loc:
other_fn, other_fp = _parse_snapshot_filename(other_loc)
if other_fn == inc_fn:
inc_mirrors.append(f"http://{other.rpc_address}{other_fp}")
log.info(" Found incremental %s (%d mirrors)", inc_fn, len(inc_mirrors))
if download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
inc_downloaded = True
break
if not inc_downloaded:
log.info("No matching incremental found — validator will replay from full snapshot")
total_elapsed: float = time.monotonic() - total_start
log.info("All downloads complete in %.0fs", total_elapsed)
for filename, _ in download_plan:
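For reference, the base-slot comparison above leans on the standard Agave snapshot naming convention: full archives are snapshot-<slot>-<hash>.tar.<ext> and incrementals are incremental-snapshot-<base_slot>-<slot>-<hash>.tar.<ext>, and an incremental is only usable when its base slot equals the slot of the full snapshot that was just downloaded. A minimal sketch of that check (patterns approximate; the script's own FULL_SNAP_RE and INCR_SNAP_RE may differ):

import re

# Approximate patterns for Agave snapshot archive names (not the script's exact regexes).
FULL = re.compile(r"snapshot-(\d+)-\w+\.tar\.(?:bz2|gz|zst)$")
INCR = re.compile(r"incremental-snapshot-(\d+)-(\d+)-\w+\.tar\.(?:bz2|gz|zst)$")


def incremental_matches(full_name: str, inc_name: str) -> bool:
    """True when the incremental's base slot equals the full snapshot's slot."""
    fm = FULL.match(full_name)
    im = INCR.match(inc_name)
    return bool(fm and im and int(fm.group(1)) == int(im.group(1)))


# Usable pair vs. an incremental taken against an older full snapshot.
assert incremental_matches("snapshot-355000000-8hJ3k.tar.zst",
                           "incremental-snapshot-355000000-355012345-Qm9x.tar.zst")
assert not incremental_matches("snapshot-355000000-8hJ3k.tar.zst",
                               "incremental-snapshot-354900000-355012345-Qm9x.tar.zst")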