From 4f452db6fe3df6844b0055c0159e19aefa5452f0 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 10:52:40 +0000 Subject: [PATCH] fix: ansible-lint production profile compliance for all playbooks - FQCN for all modules (ansible.builtin.*) - changed_when/failed_when on all command/shell tasks - set -o pipefail on all shell tasks - Add KUBECONFIG environment to health-check.yml Co-Authored-By: Claude Opus 4.6 --- playbooks/biscayne-boot.yml | 19 +++---- playbooks/biscayne-recover.yml | 18 ++++--- playbooks/biscayne-redeploy.yml | 87 +++++++++++++++++++++------------ playbooks/biscayne-stop.yml | 22 +++++---- playbooks/fix-pv-mounts.yml | 24 +++++---- playbooks/health-check.yml | 2 + 6 files changed, 105 insertions(+), 67 deletions(-) diff --git a/playbooks/biscayne-boot.yml b/playbooks/biscayne-boot.yml index 2cdd5cad..af89a312 100644 --- a/playbooks/biscayne-boot.yml +++ b/playbooks/biscayne-boot.yml @@ -22,7 +22,7 @@ tasks: - name: Install ramdisk format service - copy: + ansible.builtin.copy: dest: /etc/systemd/system/format-ramdisk.service mode: "0644" content: | @@ -43,7 +43,7 @@ register: unit_file - name: Install ramdisk post-mount service - copy: + ansible.builtin.copy: dest: /etc/systemd/system/ramdisk-accounts.service mode: "0644" content: | @@ -62,19 +62,19 @@ register: accounts_unit - name: Ensure fstab entry uses nofail - lineinfile: + ansible.builtin.lineinfile: path: /etc/fstab regexp: '^{{ ramdisk_device }}\s+{{ ramdisk_mount }}' line: '{{ ramdisk_device }} {{ ramdisk_mount }} xfs noatime,nodiratime,nofail,x-systemd.requires=format-ramdisk.service 0 0' register: fstab_entry - name: Reload systemd - systemd: + ansible.builtin.systemd: daemon_reload: true when: unit_file.changed or accounts_unit.changed or fstab_entry.changed - name: Enable ramdisk services - systemd: + ansible.builtin.systemd: name: "{{ item }}" enabled: true loop: @@ -83,25 +83,26 @@ # ---- apply now if ramdisk not mounted ------------------------------------ - name: Check if ramdisk is mounted - command: mountpoint -q {{ ramdisk_mount }} + ansible.builtin.command: mountpoint -q {{ ramdisk_mount }} register: ramdisk_mounted failed_when: false changed_when: false - name: Format and mount ramdisk now - shell: | + ansible.builtin.shell: | mkfs.xfs -f {{ ramdisk_device }} mount {{ ramdisk_mount }} mkdir -p {{ accounts_dir }} chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }} + changed_when: ramdisk_mounted.rc != 0 when: ramdisk_mounted.rc != 0 # ---- verify -------------------------------------------------------------- - name: Verify ramdisk - command: df -hT {{ ramdisk_mount }} + ansible.builtin.command: df -hT {{ ramdisk_mount }} register: ramdisk_df changed_when: false - name: Show ramdisk status - debug: + ansible.builtin.debug: msg: "{{ ramdisk_df.stdout_lines }}" diff --git a/playbooks/biscayne-recover.yml b/playbooks/biscayne-recover.yml index fec21a39..f8b9a89e 100644 --- a/playbooks/biscayne-recover.yml +++ b/playbooks/biscayne-recover.yml @@ -48,7 +48,7 @@ tasks: # ---- step 1: scale to 0 --------------------------------------------------- - name: Get current replica count - command: > + ansible.builtin.command: > kubectl get deployment {{ deployment_name }} -n {{ k8s_namespace }} -o jsonpath='{.spec.replicas}' @@ -57,7 +57,7 @@ changed_when: false - name: Scale deployment to 0 - command: > + ansible.builtin.command: > kubectl scale deployment {{ deployment_name }} -n {{ k8s_namespace }} --replicas=0 when: current_replicas.stdout | default('0') | int > 0 @@ -65,7 +65,7 @@ # ---- step 2: wait for pods to terminate ------------------------------------ - name: Wait for pods to terminate - command: > + ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -l app={{ deployment_name }} -o jsonpath='{.items}' @@ -77,7 +77,7 @@ when: current_replicas.stdout | default('0') | int > 0 - name: Verify no agave processes in kind node (io_uring safety check) - command: > + ansible.builtin.command: > docker exec {{ kind_cluster }}-control-plane pgrep -c agave-validator register: agave_procs @@ -110,7 +110,7 @@ # ---- step 5: download fresh snapshot --------------------------------------- - name: Verify aria2c installed - command: which aria2c + ansible.builtin.command: which aria2c changed_when: false - name: Copy snapshot script to remote @@ -135,7 +135,9 @@ # ---- step 6: verify snapshot accessible via PV ----------------------------- - name: Get snapshot filename - ansible.builtin.shell: ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename + ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename + args: + executable: /bin/bash register: snapshot_filename changed_when: false @@ -176,7 +178,7 @@ # ---- step 8: wait for pod running ------------------------------------------ - name: Wait for pod to be running - command: > + ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -l app={{ deployment_name }} -o jsonpath='{.items[0].status.phase}' @@ -188,7 +190,7 @@ # ---- step 9: verify validator log ------------------------------------------ - name: Wait for validator log file - command: > + ansible.builtin.command: > kubectl exec -n {{ k8s_namespace }} deployment/{{ deployment_name }} -c agave-validator -- test -f /data/log/validator.log diff --git a/playbooks/biscayne-redeploy.yml b/playbooks/biscayne-redeploy.yml index ca16d5a7..216091dc 100644 --- a/playbooks/biscayne-redeploy.yml +++ b/playbooks/biscayne-redeploy.yml @@ -71,15 +71,16 @@ # Deleting the namespace while agave is running causes io_uring/ZFS # deadlock (unkillable D-state threads). See CLAUDE.md. - name: Scale deployment to 0 (graceful stop) - command: > + ansible.builtin.command: > kubectl scale deployment {{ deployment_name }} -n {{ k8s_namespace }} --replicas=0 register: pre_teardown_scale + changed_when: pre_teardown_scale.rc == 0 failed_when: false tags: [teardown] - name: Wait for agave to exit - command: > + ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -l app={{ deployment_name }} -o jsonpath='{.items}' @@ -87,31 +88,35 @@ retries: 60 delay: 5 until: pre_teardown_pods.stdout == "[]" or pre_teardown_pods.stdout == "" or pre_teardown_pods.rc != 0 + changed_when: false failed_when: false when: pre_teardown_scale.rc == 0 tags: [teardown] - name: Delete deployment namespace - command: > + ansible.builtin.command: > kubectl delete namespace {{ k8s_namespace }} --timeout=120s register: ns_delete + changed_when: ns_delete.rc == 0 failed_when: false tags: [teardown] - name: Wait for namespace to terminate - command: > + ansible.builtin.command: > kubectl get namespace {{ k8s_namespace }} -o jsonpath='{.status.phase}' register: ns_status retries: 30 delay: 5 until: ns_status.rc != 0 + changed_when: false failed_when: false when: ns_delete.rc == 0 tags: [teardown] - name: Clear stale claimRefs on Released PVs ansible.builtin.shell: | + set -o pipefail for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do kubectl patch pv "$pv" --type json \ -p '[{"op":"remove","path":"/spec/claimRef"}]' @@ -122,24 +127,28 @@ # ---- wipe: opt-in data cleanup ------------------------------------------ - name: Wipe ledger data - shell: rm -rf {{ ledger_dir }}/* + ansible.builtin.shell: rm -rf {{ ledger_dir }}/* become: true + changed_when: true when: wipe_ledger | bool tags: [wipe] - name: Wipe accounts ramdisk (umount + mkfs.xfs + mount) - shell: | + ansible.builtin.shell: | + set -o pipefail mountpoint -q {{ ramdisk_mount }} && umount {{ ramdisk_mount }} || true mkfs.xfs -f {{ ramdisk_device }} mount {{ ramdisk_mount }} mkdir -p {{ accounts_dir }} chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }} become: true + changed_when: true when: wipe_accounts | bool tags: [wipe] - name: Clean old snapshots (keep newest full + incremental) - shell: | + ansible.builtin.shell: | + set -o pipefail cd {{ snapshot_dir }} || exit 0 newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1) if [ -n "$newest" ]; then @@ -150,26 +159,28 @@ -delete fi become: true + changed_when: true when: not skip_snapshot | bool tags: [wipe] # ---- preflight: verify ramdisk and mounts before deploy ------------------ - name: Verify ramdisk is mounted - command: mountpoint -q {{ ramdisk_mount }} + ansible.builtin.command: mountpoint -q {{ ramdisk_mount }} register: ramdisk_check failed_when: ramdisk_check.rc != 0 changed_when: false tags: [deploy, preflight] - name: Verify ramdisk is xfs (not the underlying ZFS) - shell: df -T {{ ramdisk_mount }} | grep -q xfs + ansible.builtin.shell: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs register: ramdisk_type failed_when: ramdisk_type.rc != 0 changed_when: false tags: [deploy, preflight] - name: Verify ramdisk visible inside kind node - shell: > + ansible.builtin.shell: > + set -o pipefail && docker exec {{ kind_cluster }}-control-plane df -T /mnt/solana/ramdisk 2>/dev/null | grep -q xfs register: kind_ramdisk_check @@ -187,28 +198,31 @@ tags: [deploy] - name: Regenerate deployment config from updated stack - command: > + ansible.builtin.command: > {{ laconic_so }} --stack {{ stack_path }} deploy create --spec-file {{ deployment_dir }}/spec.yml --deployment-dir {{ deployment_dir }} --update + changed_when: true tags: [deploy] - name: Verify kind-config.yml has unified mount root - command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml" + ansible.builtin.command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml" register: mount_root_check failed_when: mount_root_check.stdout | int < 1 + changed_when: false tags: [deploy] - name: Start deployment (creates kind cluster + deploys pod) - command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start" + ansible.builtin.command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start" + changed_when: true timeout: 1200 tags: [deploy] - name: Wait for deployment to exist - command: > + ansible.builtin.command: > kubectl get deployment {{ deployment_name }} -n {{ k8s_namespace }} -o jsonpath='{.metadata.name}' @@ -216,16 +230,18 @@ retries: 30 delay: 10 until: deploy_exists.rc == 0 + changed_when: false tags: [deploy] - name: Scale validator to 0 (stop before snapshot download) - command: > + ansible.builtin.command: > kubectl scale deployment {{ deployment_name }} -n {{ k8s_namespace }} --replicas=0 + changed_when: true tags: [deploy] - name: Wait for pods to terminate - command: > + ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -l app={{ deployment_name }} -o jsonpath='{.items}' @@ -233,18 +249,19 @@ retries: 30 delay: 5 until: pods_gone.stdout == "[]" or pods_gone.stdout == "" + changed_when: false failed_when: false tags: [deploy] # ---- snapshot: download via aria2c, verify in kind node ------------------ - name: Verify aria2c installed - command: which aria2c + ansible.builtin.command: which aria2c changed_when: false when: not skip_snapshot | bool tags: [snapshot] - name: Copy snapshot script to remote - copy: + ansible.builtin.copy: src: "{{ snapshot_script_local }}" dest: "{{ snapshot_script }}" mode: "0755" @@ -252,73 +269,80 @@ tags: [snapshot] - name: Verify kind node mounts - command: > + ansible.builtin.command: > docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/snapshots/ register: kind_mount_check + changed_when: false tags: [snapshot] - name: Download snapshot via aria2c - shell: > + ansible.builtin.shell: > python3 {{ snapshot_script }} -o {{ snapshot_dir }} {{ snapshot_args }} become: true register: snapshot_result + changed_when: true when: not skip_snapshot | bool timeout: 3600 tags: [snapshot] - name: Show snapshot download result - debug: + ansible.builtin.debug: msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}" tags: [snapshot] - name: Verify snapshot visible inside kind node - shell: > + ansible.builtin.shell: > + set -o pipefail && docker exec {{ kind_cluster }}-control-plane - ls -lhS /mnt/solana/snapshots/*.tar.* 2>/dev/null | head -5 + find /mnt/solana/snapshots/ -name '*.tar.*' -maxdepth 1 | head -5 register: kind_snapshot_check failed_when: kind_snapshot_check.stdout == "" + changed_when: false when: not skip_snapshot | bool tags: [snapshot] - name: Show snapshot files in kind node - debug: + ansible.builtin.debug: msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}" when: not skip_snapshot | bool tags: [snapshot] # ---- deploy (cont): scale validator back up with snapshot ---------------- - name: Scale validator to 1 (start with downloaded snapshot) - command: > + ansible.builtin.command: > kubectl scale deployment {{ deployment_name }} -n {{ k8s_namespace }} --replicas=1 + changed_when: true tags: [deploy] # ---- verify: confirm validator is running -------------------------------- - name: Wait for pod to be running - command: > + ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -o jsonpath='{.items[0].status.phase}' register: pod_status retries: 60 delay: 10 until: pod_status.stdout == "Running" + changed_when: false tags: [verify] - name: Verify unified mount inside kind node - command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/" + ansible.builtin.command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/" register: mount_check + changed_when: false tags: [verify] - name: Show mount contents - debug: + ansible.builtin.debug: msg: "{{ mount_check.stdout_lines }}" tags: [verify] - name: Check validator log file is being written - command: > + ansible.builtin.command: > kubectl exec -n {{ k8s_namespace }} deployment/{{ deployment_name }} -c agave-validator -- test -f /data/log/validator.log @@ -326,11 +350,12 @@ delay: 10 until: log_file_check.rc == 0 register: log_file_check + changed_when: false failed_when: false tags: [verify] - name: Check RPC health - uri: + ansible.builtin.uri: url: http://127.0.0.1:8899/health return_content: true register: rpc_health @@ -342,7 +367,7 @@ tags: [verify] - name: Report status - debug: + ansible.builtin.debug: msg: >- Deployment complete. Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}. diff --git a/playbooks/biscayne-stop.yml b/playbooks/biscayne-stop.yml index 2550f5a9..2f9290f6 100644 --- a/playbooks/biscayne-stop.yml +++ b/playbooks/biscayne-stop.yml @@ -34,7 +34,7 @@ tasks: - name: Get current replica count - command: > + ansible.builtin.command: > kubectl get deployment {{ deployment_name }} -n {{ k8s_namespace }} -o jsonpath='{.spec.replicas}' @@ -43,24 +43,26 @@ changed_when: false - name: Scale deployment to 0 - command: > + ansible.builtin.command: > kubectl scale deployment {{ deployment_name }} -n {{ k8s_namespace }} --replicas=0 + changed_when: true when: current_replicas.stdout | default('0') | int > 0 - name: Wait for pods to terminate - command: > + ansible.builtin.command: > kubectl get pods -n {{ k8s_namespace }} -l app={{ deployment_name }} -o jsonpath='{.items}' register: pods_gone + changed_when: false retries: 60 delay: 5 until: pods_gone.stdout == "[]" or pods_gone.stdout == "" when: current_replicas.stdout | default('0') | int > 0 - name: Verify no agave processes in kind node - command: > + ansible.builtin.command: > docker exec {{ kind_cluster }}-control-plane pgrep -c agave-validator register: agave_procs @@ -68,7 +70,7 @@ changed_when: false - name: Fail if agave still running - fail: + ansible.builtin.fail: msg: >- agave-validator process still running inside kind node after pod termination. Do NOT restart the kind node — investigate @@ -76,7 +78,7 @@ when: agave_procs.rc == 0 - name: Report stopped - debug: + ansible.builtin.debug: msg: >- Validator stopped. Replicas: {{ current_replicas.stdout | default('0') }} -> 0. No agave processes detected in kind node. @@ -84,22 +86,24 @@ # ---- optional: restart kind node ----------------------------------------- - name: Restart kind node - command: docker restart {{ kind_cluster }}-control-plane + ansible.builtin.command: docker restart {{ kind_cluster }}-control-plane + changed_when: true when: restart_kind | bool timeout: 120 - name: Wait for kind node ready - command: > + ansible.builtin.command: > kubectl get node {{ kind_cluster }}-control-plane -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' register: node_ready + changed_when: false retries: 30 delay: 10 until: node_ready.stdout == "True" when: restart_kind | bool - name: Report restarted - debug: + ansible.builtin.debug: msg: >- Kind node restarted and ready. Deployment at 0 replicas — scale up when ready. diff --git a/playbooks/fix-pv-mounts.yml b/playbooks/fix-pv-mounts.yml index f03f0e97..ba6d9f23 100644 --- a/playbooks/fix-pv-mounts.yml +++ b/playbooks/fix-pv-mounts.yml @@ -39,7 +39,7 @@ tasks: - name: Read current PV hostPaths - command: > + ansible.builtin.command: > kubectl get pv {{ kind_cluster }}-{{ item.name }} -o jsonpath='{.spec.hostPath.path}' register: current_paths @@ -48,7 +48,7 @@ changed_when: false - name: Build path comparison - set_fact: + ansible.builtin.set_fact: path_mismatches: "{{ current_paths.results | selectattr('stdout', 'ne', '') | rejectattr('stdout', 'equalto', item.host_path) | list }}" path_missing: "{{ current_paths.results | selectattr('stdout', 'equalto', '') | list }}" loop: "{{ volumes }}" @@ -56,7 +56,7 @@ label: "{{ item.name }}" - name: Show current vs expected paths - debug: + ansible.builtin.debug: msg: >- {{ item.item.name }}: current={{ item.stdout if item.stdout else 'NOT FOUND' }} @@ -67,7 +67,7 @@ label: "{{ item.item.name }}" - name: Check for mismatched PVs - fail: + ansible.builtin.fail: msg: >- PV {{ item.item.name }} has wrong hostPath: {{ item.stdout if item.stdout else 'NOT FOUND' }} @@ -80,7 +80,7 @@ # ---- Fix mode --------------------------------------------------------- - name: Delete stale PVCs - command: > + ansible.builtin.command: > kubectl delete pvc {{ kind_cluster }}-{{ item.item.name }} -n {{ k8s_namespace }} --timeout=60s when: fix | bool and item.stdout != item.item.host_path @@ -88,9 +88,10 @@ loop_control: label: "{{ item.item.name }}" failed_when: false + changed_when: true - name: Delete stale PVs - command: > + ansible.builtin.command: > kubectl delete pv {{ kind_cluster }}-{{ item.item.name }} --timeout=60s when: fix | bool and item.stdout != item.item.host_path @@ -98,9 +99,10 @@ loop_control: label: "{{ item.item.name }}" failed_when: false + changed_when: true - name: Create PVs with correct hostPaths - command: > + ansible.builtin.command: > kubectl apply -f - args: stdin: | @@ -121,9 +123,10 @@ loop: "{{ current_paths.results }}" loop_control: label: "{{ item.item.name }}" + changed_when: true - name: Create PVCs - command: > + ansible.builtin.command: > kubectl apply -f - args: stdin: | @@ -144,10 +147,11 @@ loop: "{{ current_paths.results }}" loop_control: label: "{{ item.item.name }}" + changed_when: true # ---- Final verify ----------------------------------------------------- - name: Verify PV paths - command: > + ansible.builtin.command: > kubectl get pv {{ kind_cluster }}-{{ item.name }} -o jsonpath='{.spec.hostPath.path}' register: final_paths @@ -156,7 +160,7 @@ when: fix | bool - name: Assert all PV paths correct - assert: + ansible.builtin.assert: that: item.stdout == item.item.host_path fail_msg: "{{ item.item.name }}: {{ item.stdout }} != {{ item.item.host_path }}" success_msg: "{{ item.item.name }}: {{ item.stdout }} OK" diff --git a/playbooks/health-check.yml b/playbooks/health-check.yml index 326f1f35..c0aa4ee6 100644 --- a/playbooks/health-check.yml +++ b/playbooks/health-check.yml @@ -13,6 +13,8 @@ - name: Biscayne agave-stack health check hosts: biscayne gather_facts: false + environment: + KUBECONFIG: /home/rix/.kube/config tasks: # ------------------------------------------------------------------