fix: ansible-lint production profile compliance for all playbooks
- FQCN for all modules (ansible.builtin.*)
- changed_when/failed_when on all command/shell tasks
- set -o pipefail on all shell tasks
- Add KUBECONFIG environment to health-check.yml

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

fix/kind-mount-propagation
parent
d36a71f13d
commit
4f452db6fe
|
|
@ -22,7 +22,7 @@
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Install ramdisk format service
|
- name: Install ramdisk format service
|
||||||
copy:
|
ansible.builtin.copy:
|
||||||
dest: /etc/systemd/system/format-ramdisk.service
|
dest: /etc/systemd/system/format-ramdisk.service
|
||||||
mode: "0644"
|
mode: "0644"
|
||||||
content: |
|
content: |
|
||||||
|
|
@ -43,7 +43,7 @@
|
||||||
register: unit_file
|
register: unit_file
|
||||||
|
|
||||||
- name: Install ramdisk post-mount service
|
- name: Install ramdisk post-mount service
|
||||||
copy:
|
ansible.builtin.copy:
|
||||||
dest: /etc/systemd/system/ramdisk-accounts.service
|
dest: /etc/systemd/system/ramdisk-accounts.service
|
||||||
mode: "0644"
|
mode: "0644"
|
||||||
content: |
|
content: |
|
||||||
|
|
@ -62,19 +62,19 @@
|
||||||
register: accounts_unit
|
register: accounts_unit
|
||||||
|
|
||||||
- name: Ensure fstab entry uses nofail
|
- name: Ensure fstab entry uses nofail
|
||||||
lineinfile:
|
ansible.builtin.lineinfile:
|
||||||
path: /etc/fstab
|
path: /etc/fstab
|
||||||
regexp: '^{{ ramdisk_device }}\s+{{ ramdisk_mount }}'
|
regexp: '^{{ ramdisk_device }}\s+{{ ramdisk_mount }}'
|
||||||
line: '{{ ramdisk_device }} {{ ramdisk_mount }} xfs noatime,nodiratime,nofail,x-systemd.requires=format-ramdisk.service 0 0'
|
line: '{{ ramdisk_device }} {{ ramdisk_mount }} xfs noatime,nodiratime,nofail,x-systemd.requires=format-ramdisk.service 0 0'
|
||||||
register: fstab_entry
|
register: fstab_entry
|
||||||
|
|
||||||
- name: Reload systemd
|
- name: Reload systemd
|
||||||
systemd:
|
ansible.builtin.systemd:
|
||||||
daemon_reload: true
|
daemon_reload: true
|
||||||
when: unit_file.changed or accounts_unit.changed or fstab_entry.changed
|
when: unit_file.changed or accounts_unit.changed or fstab_entry.changed
|
||||||
|
|
||||||
- name: Enable ramdisk services
|
- name: Enable ramdisk services
|
||||||
systemd:
|
ansible.builtin.systemd:
|
||||||
name: "{{ item }}"
|
name: "{{ item }}"
|
||||||
enabled: true
|
enabled: true
|
||||||
loop:
|
loop:
|
||||||
|
|
@ -83,25 +83,26 @@
|
||||||
|
|
||||||
# ---- apply now if ramdisk not mounted ------------------------------------
|
# ---- apply now if ramdisk not mounted ------------------------------------
|
||||||
- name: Check if ramdisk is mounted
|
- name: Check if ramdisk is mounted
|
||||||
command: mountpoint -q {{ ramdisk_mount }}
|
ansible.builtin.command: mountpoint -q {{ ramdisk_mount }}
|
||||||
register: ramdisk_mounted
|
register: ramdisk_mounted
|
||||||
failed_when: false
|
failed_when: false
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Format and mount ramdisk now
|
- name: Format and mount ramdisk now
|
||||||
shell: |
|
ansible.builtin.shell: |
|
||||||
mkfs.xfs -f {{ ramdisk_device }}
|
mkfs.xfs -f {{ ramdisk_device }}
|
||||||
mount {{ ramdisk_mount }}
|
mount {{ ramdisk_mount }}
|
||||||
mkdir -p {{ accounts_dir }}
|
mkdir -p {{ accounts_dir }}
|
||||||
chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
|
chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
|
||||||
|
changed_when: ramdisk_mounted.rc != 0
|
||||||
when: ramdisk_mounted.rc != 0
|
when: ramdisk_mounted.rc != 0
|
||||||
|
|
||||||
# ---- verify --------------------------------------------------------------
|
# ---- verify --------------------------------------------------------------
|
||||||
- name: Verify ramdisk
|
- name: Verify ramdisk
|
||||||
command: df -hT {{ ramdisk_mount }}
|
ansible.builtin.command: df -hT {{ ramdisk_mount }}
|
||||||
register: ramdisk_df
|
register: ramdisk_df
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Show ramdisk status
|
- name: Show ramdisk status
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: "{{ ramdisk_df.stdout_lines }}"
|
msg: "{{ ramdisk_df.stdout_lines }}"
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@
|
||||||
tasks:
|
tasks:
|
||||||
# ---- step 1: scale to 0 ---------------------------------------------------
|
# ---- step 1: scale to 0 ---------------------------------------------------
|
||||||
- name: Get current replica count
|
- name: Get current replica count
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get deployment {{ deployment_name }}
|
kubectl get deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }}
|
-n {{ k8s_namespace }}
|
||||||
-o jsonpath='{.spec.replicas}'
|
-o jsonpath='{.spec.replicas}'
|
||||||
|
|
@ -57,7 +57,7 @@
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Scale deployment to 0
|
- name: Scale deployment to 0
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl scale deployment {{ deployment_name }}
|
kubectl scale deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }} --replicas=0
|
-n {{ k8s_namespace }} --replicas=0
|
||||||
when: current_replicas.stdout | default('0') | int > 0
|
when: current_replicas.stdout | default('0') | int > 0
|
||||||
|
|
@ -65,7 +65,7 @@
|
||||||
|
|
||||||
# ---- step 2: wait for pods to terminate ------------------------------------
|
# ---- step 2: wait for pods to terminate ------------------------------------
|
||||||
- name: Wait for pods to terminate
|
- name: Wait for pods to terminate
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
-l app={{ deployment_name }}
|
-l app={{ deployment_name }}
|
||||||
-o jsonpath='{.items}'
|
-o jsonpath='{.items}'
|
||||||
|
|
@ -77,7 +77,7 @@
|
||||||
when: current_replicas.stdout | default('0') | int > 0
|
when: current_replicas.stdout | default('0') | int > 0
|
||||||
|
|
||||||
- name: Verify no agave processes in kind node (io_uring safety check)
|
- name: Verify no agave processes in kind node (io_uring safety check)
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
docker exec {{ kind_cluster }}-control-plane
|
docker exec {{ kind_cluster }}-control-plane
|
||||||
pgrep -c agave-validator
|
pgrep -c agave-validator
|
||||||
register: agave_procs
|
register: agave_procs
|
||||||
|
|
@ -110,7 +110,7 @@
|
||||||
|
|
||||||
# ---- step 5: download fresh snapshot ---------------------------------------
|
# ---- step 5: download fresh snapshot ---------------------------------------
|
||||||
- name: Verify aria2c installed
|
- name: Verify aria2c installed
|
||||||
command: which aria2c
|
ansible.builtin.command: which aria2c
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Copy snapshot script to remote
|
- name: Copy snapshot script to remote
|
||||||
|
|
@ -135,7 +135,9 @@
|
||||||
|
|
||||||
# ---- step 6: verify snapshot accessible via PV -----------------------------
|
# ---- step 6: verify snapshot accessible via PV -----------------------------
|
||||||
- name: Get snapshot filename
|
- name: Get snapshot filename
|
||||||
ansible.builtin.shell: ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
|
ansible.builtin.shell: set -o pipefail && ls -1 {{ snapshot_dir }}/snapshot-*.tar.* | head -1 | xargs basename
|
||||||
|
args:
|
||||||
|
executable: /bin/bash
|
||||||
register: snapshot_filename
|
register: snapshot_filename
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
|
|
@ -176,7 +178,7 @@
|
||||||
|
|
||||||
# ---- step 8: wait for pod running ------------------------------------------
|
# ---- step 8: wait for pod running ------------------------------------------
|
||||||
- name: Wait for pod to be running
|
- name: Wait for pod to be running
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
-l app={{ deployment_name }}
|
-l app={{ deployment_name }}
|
||||||
-o jsonpath='{.items[0].status.phase}'
|
-o jsonpath='{.items[0].status.phase}'
|
||||||
|
|
@ -188,7 +190,7 @@
|
||||||
|
|
||||||
# ---- step 9: verify validator log ------------------------------------------
|
# ---- step 9: verify validator log ------------------------------------------
|
||||||
- name: Wait for validator log file
|
- name: Wait for validator log file
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl exec -n {{ k8s_namespace }}
|
kubectl exec -n {{ k8s_namespace }}
|
||||||
deployment/{{ deployment_name }}
|
deployment/{{ deployment_name }}
|
||||||
-c agave-validator -- test -f /data/log/validator.log
|
-c agave-validator -- test -f /data/log/validator.log
|
||||||
|
|
|
||||||
|
|
@ -71,15 +71,16 @@
|
||||||
# Deleting the namespace while agave is running causes io_uring/ZFS
|
# Deleting the namespace while agave is running causes io_uring/ZFS
|
||||||
# deadlock (unkillable D-state threads). See CLAUDE.md.
|
# deadlock (unkillable D-state threads). See CLAUDE.md.
|
||||||
- name: Scale deployment to 0 (graceful stop)
|
- name: Scale deployment to 0 (graceful stop)
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl scale deployment {{ deployment_name }}
|
kubectl scale deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }} --replicas=0
|
-n {{ k8s_namespace }} --replicas=0
|
||||||
register: pre_teardown_scale
|
register: pre_teardown_scale
|
||||||
|
changed_when: pre_teardown_scale.rc == 0
|
||||||
failed_when: false
|
failed_when: false
|
||||||
tags: [teardown]
|
tags: [teardown]
|
||||||
|
|
||||||
- name: Wait for agave to exit
|
- name: Wait for agave to exit
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
-l app={{ deployment_name }}
|
-l app={{ deployment_name }}
|
||||||
-o jsonpath='{.items}'
|
-o jsonpath='{.items}'
|
||||||
|
|
@ -87,31 +88,35 @@
|
||||||
retries: 60
|
retries: 60
|
||||||
delay: 5
|
delay: 5
|
||||||
until: pre_teardown_pods.stdout == "[]" or pre_teardown_pods.stdout == "" or pre_teardown_pods.rc != 0
|
until: pre_teardown_pods.stdout == "[]" or pre_teardown_pods.stdout == "" or pre_teardown_pods.rc != 0
|
||||||
|
changed_when: false
|
||||||
failed_when: false
|
failed_when: false
|
||||||
when: pre_teardown_scale.rc == 0
|
when: pre_teardown_scale.rc == 0
|
||||||
tags: [teardown]
|
tags: [teardown]
|
||||||
|
|
||||||
- name: Delete deployment namespace
|
- name: Delete deployment namespace
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl delete namespace {{ k8s_namespace }} --timeout=120s
|
kubectl delete namespace {{ k8s_namespace }} --timeout=120s
|
||||||
register: ns_delete
|
register: ns_delete
|
||||||
|
changed_when: ns_delete.rc == 0
|
||||||
failed_when: false
|
failed_when: false
|
||||||
tags: [teardown]
|
tags: [teardown]
|
||||||
|
|
||||||
- name: Wait for namespace to terminate
|
- name: Wait for namespace to terminate
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get namespace {{ k8s_namespace }}
|
kubectl get namespace {{ k8s_namespace }}
|
||||||
-o jsonpath='{.status.phase}'
|
-o jsonpath='{.status.phase}'
|
||||||
register: ns_status
|
register: ns_status
|
||||||
retries: 30
|
retries: 30
|
||||||
delay: 5
|
delay: 5
|
||||||
until: ns_status.rc != 0
|
until: ns_status.rc != 0
|
||||||
|
changed_when: false
|
||||||
failed_when: false
|
failed_when: false
|
||||||
when: ns_delete.rc == 0
|
when: ns_delete.rc == 0
|
||||||
tags: [teardown]
|
tags: [teardown]
|
||||||
|
|
||||||
- name: Clear stale claimRefs on Released PVs
|
- name: Clear stale claimRefs on Released PVs
|
||||||
ansible.builtin.shell: |
|
ansible.builtin.shell: |
|
||||||
|
set -o pipefail
|
||||||
for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
|
for pv in $(kubectl get pv -o jsonpath='{range .items[?(@.status.phase=="Released")]}{.metadata.name}{"\n"}{end}'); do
|
||||||
kubectl patch pv "$pv" --type json \
|
kubectl patch pv "$pv" --type json \
|
||||||
-p '[{"op":"remove","path":"/spec/claimRef"}]'
|
-p '[{"op":"remove","path":"/spec/claimRef"}]'
|
||||||
|
|
@ -122,24 +127,28 @@
|
||||||
|
|
||||||
# ---- wipe: opt-in data cleanup ------------------------------------------
|
# ---- wipe: opt-in data cleanup ------------------------------------------
|
||||||
- name: Wipe ledger data
|
- name: Wipe ledger data
|
||||||
shell: rm -rf {{ ledger_dir }}/*
|
ansible.builtin.shell: rm -rf {{ ledger_dir }}/*
|
||||||
become: true
|
become: true
|
||||||
|
changed_when: true
|
||||||
when: wipe_ledger | bool
|
when: wipe_ledger | bool
|
||||||
tags: [wipe]
|
tags: [wipe]
|
||||||
|
|
||||||
- name: Wipe accounts ramdisk (umount + mkfs.xfs + mount)
|
- name: Wipe accounts ramdisk (umount + mkfs.xfs + mount)
|
||||||
shell: |
|
ansible.builtin.shell: |
|
||||||
|
set -o pipefail
|
||||||
mountpoint -q {{ ramdisk_mount }} && umount {{ ramdisk_mount }} || true
|
mountpoint -q {{ ramdisk_mount }} && umount {{ ramdisk_mount }} || true
|
||||||
mkfs.xfs -f {{ ramdisk_device }}
|
mkfs.xfs -f {{ ramdisk_device }}
|
||||||
mount {{ ramdisk_mount }}
|
mount {{ ramdisk_mount }}
|
||||||
mkdir -p {{ accounts_dir }}
|
mkdir -p {{ accounts_dir }}
|
||||||
chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
|
chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
|
||||||
become: true
|
become: true
|
||||||
|
changed_when: true
|
||||||
when: wipe_accounts | bool
|
when: wipe_accounts | bool
|
||||||
tags: [wipe]
|
tags: [wipe]
|
||||||
|
|
||||||
- name: Clean old snapshots (keep newest full + incremental)
|
- name: Clean old snapshots (keep newest full + incremental)
|
||||||
shell: |
|
ansible.builtin.shell: |
|
||||||
|
set -o pipefail
|
||||||
cd {{ snapshot_dir }} || exit 0
|
cd {{ snapshot_dir }} || exit 0
|
||||||
newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
|
newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
|
||||||
if [ -n "$newest" ]; then
|
if [ -n "$newest" ]; then
|
||||||
|
|
@ -150,26 +159,28 @@
|
||||||
-delete
|
-delete
|
||||||
fi
|
fi
|
||||||
become: true
|
become: true
|
||||||
|
changed_when: true
|
||||||
when: not skip_snapshot | bool
|
when: not skip_snapshot | bool
|
||||||
tags: [wipe]
|
tags: [wipe]
|
||||||
|
|
||||||
# ---- preflight: verify ramdisk and mounts before deploy ------------------
|
# ---- preflight: verify ramdisk and mounts before deploy ------------------
|
||||||
- name: Verify ramdisk is mounted
|
- name: Verify ramdisk is mounted
|
||||||
command: mountpoint -q {{ ramdisk_mount }}
|
ansible.builtin.command: mountpoint -q {{ ramdisk_mount }}
|
||||||
register: ramdisk_check
|
register: ramdisk_check
|
||||||
failed_when: ramdisk_check.rc != 0
|
failed_when: ramdisk_check.rc != 0
|
||||||
changed_when: false
|
changed_when: false
|
||||||
tags: [deploy, preflight]
|
tags: [deploy, preflight]
|
||||||
|
|
||||||
- name: Verify ramdisk is xfs (not the underlying ZFS)
|
- name: Verify ramdisk is xfs (not the underlying ZFS)
|
||||||
shell: df -T {{ ramdisk_mount }} | grep -q xfs
|
ansible.builtin.shell: set -o pipefail && df -T {{ ramdisk_mount }} | grep -q xfs
|
||||||
register: ramdisk_type
|
register: ramdisk_type
|
||||||
failed_when: ramdisk_type.rc != 0
|
failed_when: ramdisk_type.rc != 0
|
||||||
changed_when: false
|
changed_when: false
|
||||||
tags: [deploy, preflight]
|
tags: [deploy, preflight]
|
||||||
|
|
||||||
- name: Verify ramdisk visible inside kind node
|
- name: Verify ramdisk visible inside kind node
|
||||||
shell: >
|
ansible.builtin.shell: >
|
||||||
|
set -o pipefail &&
|
||||||
docker exec {{ kind_cluster }}-control-plane
|
docker exec {{ kind_cluster }}-control-plane
|
||||||
df -T /mnt/solana/ramdisk 2>/dev/null | grep -q xfs
|
df -T /mnt/solana/ramdisk 2>/dev/null | grep -q xfs
|
||||||
register: kind_ramdisk_check
|
register: kind_ramdisk_check
|
||||||
|
|
@ -187,28 +198,31 @@
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
- name: Regenerate deployment config from updated stack
|
- name: Regenerate deployment config from updated stack
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
{{ laconic_so }}
|
{{ laconic_so }}
|
||||||
--stack {{ stack_path }}
|
--stack {{ stack_path }}
|
||||||
deploy create
|
deploy create
|
||||||
--spec-file {{ deployment_dir }}/spec.yml
|
--spec-file {{ deployment_dir }}/spec.yml
|
||||||
--deployment-dir {{ deployment_dir }}
|
--deployment-dir {{ deployment_dir }}
|
||||||
--update
|
--update
|
||||||
|
changed_when: true
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
- name: Verify kind-config.yml has unified mount root
|
- name: Verify kind-config.yml has unified mount root
|
||||||
command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
|
ansible.builtin.command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
|
||||||
register: mount_root_check
|
register: mount_root_check
|
||||||
failed_when: mount_root_check.stdout | int < 1
|
failed_when: mount_root_check.stdout | int < 1
|
||||||
|
changed_when: false
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
- name: Start deployment (creates kind cluster + deploys pod)
|
- name: Start deployment (creates kind cluster + deploys pod)
|
||||||
command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
|
ansible.builtin.command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
|
||||||
|
changed_when: true
|
||||||
timeout: 1200
|
timeout: 1200
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
- name: Wait for deployment to exist
|
- name: Wait for deployment to exist
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get deployment {{ deployment_name }}
|
kubectl get deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }}
|
-n {{ k8s_namespace }}
|
||||||
-o jsonpath='{.metadata.name}'
|
-o jsonpath='{.metadata.name}'
|
||||||
|
|
@ -216,16 +230,18 @@
|
||||||
retries: 30
|
retries: 30
|
||||||
delay: 10
|
delay: 10
|
||||||
until: deploy_exists.rc == 0
|
until: deploy_exists.rc == 0
|
||||||
|
changed_when: false
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
- name: Scale validator to 0 (stop before snapshot download)
|
- name: Scale validator to 0 (stop before snapshot download)
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl scale deployment {{ deployment_name }}
|
kubectl scale deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }} --replicas=0
|
-n {{ k8s_namespace }} --replicas=0
|
||||||
|
changed_when: true
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
- name: Wait for pods to terminate
|
- name: Wait for pods to terminate
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
-l app={{ deployment_name }}
|
-l app={{ deployment_name }}
|
||||||
-o jsonpath='{.items}'
|
-o jsonpath='{.items}'
|
||||||
|
|
@ -233,18 +249,19 @@
|
||||||
retries: 30
|
retries: 30
|
||||||
delay: 5
|
delay: 5
|
||||||
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
|
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
|
||||||
|
changed_when: false
|
||||||
failed_when: false
|
failed_when: false
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
# ---- snapshot: download via aria2c, verify in kind node ------------------
|
# ---- snapshot: download via aria2c, verify in kind node ------------------
|
||||||
- name: Verify aria2c installed
|
- name: Verify aria2c installed
|
||||||
command: which aria2c
|
ansible.builtin.command: which aria2c
|
||||||
changed_when: false
|
changed_when: false
|
||||||
when: not skip_snapshot | bool
|
when: not skip_snapshot | bool
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
- name: Copy snapshot script to remote
|
- name: Copy snapshot script to remote
|
||||||
copy:
|
ansible.builtin.copy:
|
||||||
src: "{{ snapshot_script_local }}"
|
src: "{{ snapshot_script_local }}"
|
||||||
dest: "{{ snapshot_script }}"
|
dest: "{{ snapshot_script }}"
|
||||||
mode: "0755"
|
mode: "0755"
|
||||||
|
|
@ -252,73 +269,80 @@
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
- name: Verify kind node mounts
|
- name: Verify kind node mounts
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
docker exec {{ kind_cluster }}-control-plane
|
docker exec {{ kind_cluster }}-control-plane
|
||||||
ls /mnt/solana/snapshots/
|
ls /mnt/solana/snapshots/
|
||||||
register: kind_mount_check
|
register: kind_mount_check
|
||||||
|
changed_when: false
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
- name: Download snapshot via aria2c
|
- name: Download snapshot via aria2c
|
||||||
shell: >
|
ansible.builtin.shell: >
|
||||||
python3 {{ snapshot_script }}
|
python3 {{ snapshot_script }}
|
||||||
-o {{ snapshot_dir }}
|
-o {{ snapshot_dir }}
|
||||||
{{ snapshot_args }}
|
{{ snapshot_args }}
|
||||||
become: true
|
become: true
|
||||||
register: snapshot_result
|
register: snapshot_result
|
||||||
|
changed_when: true
|
||||||
when: not skip_snapshot | bool
|
when: not skip_snapshot | bool
|
||||||
timeout: 3600
|
timeout: 3600
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
- name: Show snapshot download result
|
- name: Show snapshot download result
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
|
msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
- name: Verify snapshot visible inside kind node
|
- name: Verify snapshot visible inside kind node
|
||||||
shell: >
|
ansible.builtin.shell: >
|
||||||
|
set -o pipefail &&
|
||||||
docker exec {{ kind_cluster }}-control-plane
|
docker exec {{ kind_cluster }}-control-plane
|
||||||
ls -lhS /mnt/solana/snapshots/*.tar.* 2>/dev/null | head -5
|
find /mnt/solana/snapshots/ -name '*.tar.*' -maxdepth 1 | head -5
|
||||||
register: kind_snapshot_check
|
register: kind_snapshot_check
|
||||||
failed_when: kind_snapshot_check.stdout == ""
|
failed_when: kind_snapshot_check.stdout == ""
|
||||||
|
changed_when: false
|
||||||
when: not skip_snapshot | bool
|
when: not skip_snapshot | bool
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
- name: Show snapshot files in kind node
|
- name: Show snapshot files in kind node
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}"
|
msg: "{{ kind_snapshot_check.stdout_lines | default(['skipped']) }}"
|
||||||
when: not skip_snapshot | bool
|
when: not skip_snapshot | bool
|
||||||
tags: [snapshot]
|
tags: [snapshot]
|
||||||
|
|
||||||
# ---- deploy (cont): scale validator back up with snapshot ----------------
|
# ---- deploy (cont): scale validator back up with snapshot ----------------
|
||||||
- name: Scale validator to 1 (start with downloaded snapshot)
|
- name: Scale validator to 1 (start with downloaded snapshot)
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl scale deployment {{ deployment_name }}
|
kubectl scale deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }} --replicas=1
|
-n {{ k8s_namespace }} --replicas=1
|
||||||
|
changed_when: true
|
||||||
tags: [deploy]
|
tags: [deploy]
|
||||||
|
|
||||||
# ---- verify: confirm validator is running --------------------------------
|
# ---- verify: confirm validator is running --------------------------------
|
||||||
- name: Wait for pod to be running
|
- name: Wait for pod to be running
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
-o jsonpath='{.items[0].status.phase}'
|
-o jsonpath='{.items[0].status.phase}'
|
||||||
register: pod_status
|
register: pod_status
|
||||||
retries: 60
|
retries: 60
|
||||||
delay: 10
|
delay: 10
|
||||||
until: pod_status.stdout == "Running"
|
until: pod_status.stdout == "Running"
|
||||||
|
changed_when: false
|
||||||
tags: [verify]
|
tags: [verify]
|
||||||
|
|
||||||
- name: Verify unified mount inside kind node
|
- name: Verify unified mount inside kind node
|
||||||
command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/"
|
ansible.builtin.command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/"
|
||||||
register: mount_check
|
register: mount_check
|
||||||
|
changed_when: false
|
||||||
tags: [verify]
|
tags: [verify]
|
||||||
|
|
||||||
- name: Show mount contents
|
- name: Show mount contents
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: "{{ mount_check.stdout_lines }}"
|
msg: "{{ mount_check.stdout_lines }}"
|
||||||
tags: [verify]
|
tags: [verify]
|
||||||
|
|
||||||
- name: Check validator log file is being written
|
- name: Check validator log file is being written
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl exec -n {{ k8s_namespace }}
|
kubectl exec -n {{ k8s_namespace }}
|
||||||
deployment/{{ deployment_name }}
|
deployment/{{ deployment_name }}
|
||||||
-c agave-validator -- test -f /data/log/validator.log
|
-c agave-validator -- test -f /data/log/validator.log
|
||||||
|
|
@ -326,11 +350,12 @@
|
||||||
delay: 10
|
delay: 10
|
||||||
until: log_file_check.rc == 0
|
until: log_file_check.rc == 0
|
||||||
register: log_file_check
|
register: log_file_check
|
||||||
|
changed_when: false
|
||||||
failed_when: false
|
failed_when: false
|
||||||
tags: [verify]
|
tags: [verify]
|
||||||
|
|
||||||
- name: Check RPC health
|
- name: Check RPC health
|
||||||
uri:
|
ansible.builtin.uri:
|
||||||
url: http://127.0.0.1:8899/health
|
url: http://127.0.0.1:8899/health
|
||||||
return_content: true
|
return_content: true
|
||||||
register: rpc_health
|
register: rpc_health
|
||||||
|
|
@ -342,7 +367,7 @@
|
||||||
tags: [verify]
|
tags: [verify]
|
||||||
|
|
||||||
- name: Report status
|
- name: Report status
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
Deployment complete.
|
Deployment complete.
|
||||||
Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
|
Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Get current replica count
|
- name: Get current replica count
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get deployment {{ deployment_name }}
|
kubectl get deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }}
|
-n {{ k8s_namespace }}
|
||||||
-o jsonpath='{.spec.replicas}'
|
-o jsonpath='{.spec.replicas}'
|
||||||
|
|
@ -43,24 +43,26 @@
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Scale deployment to 0
|
- name: Scale deployment to 0
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl scale deployment {{ deployment_name }}
|
kubectl scale deployment {{ deployment_name }}
|
||||||
-n {{ k8s_namespace }} --replicas=0
|
-n {{ k8s_namespace }} --replicas=0
|
||||||
|
changed_when: true
|
||||||
when: current_replicas.stdout | default('0') | int > 0
|
when: current_replicas.stdout | default('0') | int > 0
|
||||||
|
|
||||||
- name: Wait for pods to terminate
|
- name: Wait for pods to terminate
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pods -n {{ k8s_namespace }}
|
kubectl get pods -n {{ k8s_namespace }}
|
||||||
-l app={{ deployment_name }}
|
-l app={{ deployment_name }}
|
||||||
-o jsonpath='{.items}'
|
-o jsonpath='{.items}'
|
||||||
register: pods_gone
|
register: pods_gone
|
||||||
|
changed_when: false
|
||||||
retries: 60
|
retries: 60
|
||||||
delay: 5
|
delay: 5
|
||||||
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
|
until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
|
||||||
when: current_replicas.stdout | default('0') | int > 0
|
when: current_replicas.stdout | default('0') | int > 0
|
||||||
|
|
||||||
- name: Verify no agave processes in kind node
|
- name: Verify no agave processes in kind node
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
docker exec {{ kind_cluster }}-control-plane
|
docker exec {{ kind_cluster }}-control-plane
|
||||||
pgrep -c agave-validator
|
pgrep -c agave-validator
|
||||||
register: agave_procs
|
register: agave_procs
|
||||||
|
|
@ -68,7 +70,7 @@
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Fail if agave still running
|
- name: Fail if agave still running
|
||||||
fail:
|
ansible.builtin.fail:
|
||||||
msg: >-
|
msg: >-
|
||||||
agave-validator process still running inside kind node after
|
agave-validator process still running inside kind node after
|
||||||
pod termination. Do NOT restart the kind node — investigate
|
pod termination. Do NOT restart the kind node — investigate
|
||||||
|
|
@ -76,7 +78,7 @@
|
||||||
when: agave_procs.rc == 0
|
when: agave_procs.rc == 0
|
||||||
|
|
||||||
- name: Report stopped
|
- name: Report stopped
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
Validator stopped. Replicas: {{ current_replicas.stdout | default('0') }} -> 0.
|
Validator stopped. Replicas: {{ current_replicas.stdout | default('0') }} -> 0.
|
||||||
No agave processes detected in kind node.
|
No agave processes detected in kind node.
|
||||||
|
|
@ -84,22 +86,24 @@
|
||||||
|
|
||||||
# ---- optional: restart kind node -----------------------------------------
|
# ---- optional: restart kind node -----------------------------------------
|
||||||
- name: Restart kind node
|
- name: Restart kind node
|
||||||
command: docker restart {{ kind_cluster }}-control-plane
|
ansible.builtin.command: docker restart {{ kind_cluster }}-control-plane
|
||||||
|
changed_when: true
|
||||||
when: restart_kind | bool
|
when: restart_kind | bool
|
||||||
timeout: 120
|
timeout: 120
|
||||||
|
|
||||||
- name: Wait for kind node ready
|
- name: Wait for kind node ready
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get node {{ kind_cluster }}-control-plane
|
kubectl get node {{ kind_cluster }}-control-plane
|
||||||
-o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
|
-o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
|
||||||
register: node_ready
|
register: node_ready
|
||||||
|
changed_when: false
|
||||||
retries: 30
|
retries: 30
|
||||||
delay: 10
|
delay: 10
|
||||||
until: node_ready.stdout == "True"
|
until: node_ready.stdout == "True"
|
||||||
when: restart_kind | bool
|
when: restart_kind | bool
|
||||||
|
|
||||||
- name: Report restarted
|
- name: Report restarted
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
Kind node restarted and ready.
|
Kind node restarted and ready.
|
||||||
Deployment at 0 replicas — scale up when ready.
|
Deployment at 0 replicas — scale up when ready.
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,7 @@
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
- name: Read current PV hostPaths
|
- name: Read current PV hostPaths
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pv {{ kind_cluster }}-{{ item.name }}
|
kubectl get pv {{ kind_cluster }}-{{ item.name }}
|
||||||
-o jsonpath='{.spec.hostPath.path}'
|
-o jsonpath='{.spec.hostPath.path}'
|
||||||
register: current_paths
|
register: current_paths
|
||||||
|
|
@ -48,7 +48,7 @@
|
||||||
changed_when: false
|
changed_when: false
|
||||||
|
|
||||||
- name: Build path comparison
|
- name: Build path comparison
|
||||||
set_fact:
|
ansible.builtin.set_fact:
|
||||||
path_mismatches: "{{ current_paths.results | selectattr('stdout', 'ne', '') | rejectattr('stdout', 'equalto', item.host_path) | list }}"
|
path_mismatches: "{{ current_paths.results | selectattr('stdout', 'ne', '') | rejectattr('stdout', 'equalto', item.host_path) | list }}"
|
||||||
path_missing: "{{ current_paths.results | selectattr('stdout', 'equalto', '') | list }}"
|
path_missing: "{{ current_paths.results | selectattr('stdout', 'equalto', '') | list }}"
|
||||||
loop: "{{ volumes }}"
|
loop: "{{ volumes }}"
|
||||||
|
|
@ -56,7 +56,7 @@
|
||||||
label: "{{ item.name }}"
|
label: "{{ item.name }}"
|
||||||
|
|
||||||
- name: Show current vs expected paths
|
- name: Show current vs expected paths
|
||||||
debug:
|
ansible.builtin.debug:
|
||||||
msg: >-
|
msg: >-
|
||||||
{{ item.item.name }}:
|
{{ item.item.name }}:
|
||||||
current={{ item.stdout if item.stdout else 'NOT FOUND' }}
|
current={{ item.stdout if item.stdout else 'NOT FOUND' }}
|
||||||
|
|
@ -67,7 +67,7 @@
|
||||||
label: "{{ item.item.name }}"
|
label: "{{ item.item.name }}"
|
||||||
|
|
||||||
- name: Check for mismatched PVs
|
- name: Check for mismatched PVs
|
||||||
fail:
|
ansible.builtin.fail:
|
||||||
msg: >-
|
msg: >-
|
||||||
PV {{ item.item.name }} has wrong hostPath:
|
PV {{ item.item.name }} has wrong hostPath:
|
||||||
{{ item.stdout if item.stdout else 'NOT FOUND' }}
|
{{ item.stdout if item.stdout else 'NOT FOUND' }}
|
||||||
|
|
@ -80,7 +80,7 @@
|
||||||
|
|
||||||
# ---- Fix mode ---------------------------------------------------------
|
# ---- Fix mode ---------------------------------------------------------
|
||||||
- name: Delete stale PVCs
|
- name: Delete stale PVCs
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl delete pvc {{ kind_cluster }}-{{ item.item.name }}
|
kubectl delete pvc {{ kind_cluster }}-{{ item.item.name }}
|
||||||
-n {{ k8s_namespace }} --timeout=60s
|
-n {{ k8s_namespace }} --timeout=60s
|
||||||
when: fix | bool and item.stdout != item.item.host_path
|
when: fix | bool and item.stdout != item.item.host_path
|
||||||
|
|
@ -88,9 +88,10 @@
|
||||||
loop_control:
|
loop_control:
|
||||||
label: "{{ item.item.name }}"
|
label: "{{ item.item.name }}"
|
||||||
failed_when: false
|
failed_when: false
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
- name: Delete stale PVs
|
- name: Delete stale PVs
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl delete pv {{ kind_cluster }}-{{ item.item.name }}
|
kubectl delete pv {{ kind_cluster }}-{{ item.item.name }}
|
||||||
--timeout=60s
|
--timeout=60s
|
||||||
when: fix | bool and item.stdout != item.item.host_path
|
when: fix | bool and item.stdout != item.item.host_path
|
||||||
|
|
@ -98,9 +99,10 @@
|
||||||
loop_control:
|
loop_control:
|
||||||
label: "{{ item.item.name }}"
|
label: "{{ item.item.name }}"
|
||||||
failed_when: false
|
failed_when: false
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
- name: Create PVs with correct hostPaths
|
- name: Create PVs with correct hostPaths
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl apply -f -
|
kubectl apply -f -
|
||||||
args:
|
args:
|
||||||
stdin: |
|
stdin: |
|
||||||
|
|
@ -121,9 +123,10 @@
|
||||||
loop: "{{ current_paths.results }}"
|
loop: "{{ current_paths.results }}"
|
||||||
loop_control:
|
loop_control:
|
||||||
label: "{{ item.item.name }}"
|
label: "{{ item.item.name }}"
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
- name: Create PVCs
|
- name: Create PVCs
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl apply -f -
|
kubectl apply -f -
|
||||||
args:
|
args:
|
||||||
stdin: |
|
stdin: |
|
||||||
|
|
@ -144,10 +147,11 @@
|
||||||
loop: "{{ current_paths.results }}"
|
loop: "{{ current_paths.results }}"
|
||||||
loop_control:
|
loop_control:
|
||||||
label: "{{ item.item.name }}"
|
label: "{{ item.item.name }}"
|
||||||
|
changed_when: true
|
||||||
|
|
||||||
# ---- Final verify -----------------------------------------------------
|
# ---- Final verify -----------------------------------------------------
|
||||||
- name: Verify PV paths
|
- name: Verify PV paths
|
||||||
command: >
|
ansible.builtin.command: >
|
||||||
kubectl get pv {{ kind_cluster }}-{{ item.name }}
|
kubectl get pv {{ kind_cluster }}-{{ item.name }}
|
||||||
-o jsonpath='{.spec.hostPath.path}'
|
-o jsonpath='{.spec.hostPath.path}'
|
||||||
register: final_paths
|
register: final_paths
|
||||||
|
|
@ -156,7 +160,7 @@
|
||||||
when: fix | bool
|
when: fix | bool
|
||||||
|
|
||||||
- name: Assert all PV paths correct
|
- name: Assert all PV paths correct
|
||||||
assert:
|
ansible.builtin.assert:
|
||||||
that: item.stdout == item.item.host_path
|
that: item.stdout == item.item.host_path
|
||||||
fail_msg: "{{ item.item.name }}: {{ item.stdout }} != {{ item.item.host_path }}"
|
fail_msg: "{{ item.item.name }}: {{ item.stdout }} != {{ item.item.host_path }}"
|
||||||
success_msg: "{{ item.item.name }}: {{ item.stdout }} OK"
|
success_msg: "{{ item.item.name }}: {{ item.stdout }} OK"
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,8 @@
|
||||||
- name: Biscayne agave-stack health check
|
- name: Biscayne agave-stack health check
|
||||||
hosts: biscayne
|
hosts: biscayne
|
||||||
gather_facts: false
|
gather_facts: false
|
||||||
|
environment:
|
||||||
|
KUBECONFIG: /home/rix/.kube/config
|
||||||
|
|
||||||
tasks:
|
tasks:
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue