---
# Graceful shutdown of agave validator on biscayne
#
# Scales the deployment to 0 and waits for the pod to terminate.
# This MUST be done before any kind node restart, host reboot,
# or docker operations.
#
# The container entrypoint (PID 1) traps SIGTERM and runs
# ``agave-validator exit --force --ledger /data/ledger`` which tells
# the validator to flush I/O and exit cleanly via the admin RPC Unix
# socket. This avoids the io_uring/ZFS deadlock that occurs when the
# process is killed. terminationGracePeriodSeconds must be set to 300
# on the k8s deployment to allow time for the flush.
#
# Usage:
#   # Stop the validator
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-stop.yml
#
#   # Stop and restart kind node (LAST RESORT — e.g., broken namespace)
#   # Normally unnecessary: mount propagation means ramdisk/ZFS changes
#   # are visible in the kind node without restarting it.
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-stop.yml \
#     -e restart_kind=true
#
- name: Graceful validator shutdown
  hosts: all
  gather_facts: false
  environment:
    KUBECONFIG: /home/rix/.kube/config
  vars:
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    # Set to true (-e restart_kind=true) to also restart the kind node
    # container once the validator is confirmed stopped.
    restart_kind: false

  tasks:
    # Read-only probe. failed_when is disabled so a missing deployment
    # yields empty stdout, which the `| int` filters below turn into 0
    # and the scale-down tasks are skipped.
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -o jsonpath='{.spec.replicas}'
      register: current_replicas
      failed_when: false
      changed_when: false

    # Ensure k8s gives the entrypoint enough time for graceful shutdown
    # via admin RPC before sending SIGKILL.
    # NOTE(review): changing .spec.template starts a Deployment rollout,
    # and the grace period of an already-running pod is fixed at pod
    # creation. If the deployment is not already at 300 this patch will
    # not lengthen the running pod's grace period — confirm the value is
    # set at deploy time rather than relying on this task.
    - name: Ensure terminationGracePeriodSeconds is 300
      ansible.builtin.command: >
        kubectl patch deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
      register: patch_result
      # kubectl appends "(no change)" when the patch was a no-op.
      changed_when: "'no change' not in patch_result.stdout"
      when: current_replicas.stdout | default('0') | int > 0

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        --replicas=0
      changed_when: true
      when: current_replicas.stdout | default('0') | int > 0

    # The pod may legitimately use the full 300 s grace period before
    # SIGKILL, so poll for 360 s (72 x 5 s) rather than exactly 300 s;
    # otherwise the wait can expire at the same moment the pod is killed.
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_gone
      changed_when: false
      retries: 72
      delay: 5
      until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
      when: current_replicas.stdout | default('0') | int > 0

    # pgrep exits 0 when at least one process matches and 1 when none
    # match. failed_when is disabled so the exit code can be interpreted
    # by the tasks below instead of aborting here.
    - name: Verify no agave processes in kind node
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      failed_when: false
      changed_when: false

    - name: Fail if agave still running
      ansible.builtin.fail:
        msg: >-
          agave-validator process still running inside kind node after
          pod termination. Do NOT restart the kind node — investigate
          first to avoid io_uring/ZFS deadlock.
      when: agave_procs.rc == 0

    # Any exit code other than 0/1 means the check itself did not run
    # properly (pgrep usage/fatal error, or docker exec could not launch
    # pgrep). Treating that as "no processes" would let the kind node be
    # restarted with a validator possibly still alive, so stop here.
    - name: Fail if agave process check was inconclusive
      ansible.builtin.fail:
        msg: >-
          Could not determine whether agave-validator is still running
          inside the kind node (rc={{ agave_procs.rc }},
          stderr={{ agave_procs.stderr | default('') }}).
          Do NOT restart the kind node until this check succeeds.
      when: agave_procs.rc not in [0, 1]

    - name: Report stopped
      ansible.builtin.debug:
        msg: >-
          Validator stopped. Replicas: {{ current_replicas.stdout | default('0') }} -> 0.
          No agave processes detected in kind node.
      when: not restart_kind | bool

    # ---- optional: restart kind node -----------------------------------------
    - name: Restart kind node
      ansible.builtin.command: docker restart {{ kind_cluster }}-control-plane
      changed_when: true
      when: restart_kind | bool
      # Abort the task if docker restart hangs for more than two minutes.
      timeout: 120

    # Poll the node's Ready condition for up to 300 s (30 x 10 s).
    - name: Wait for kind node ready
      ansible.builtin.command: >
        kubectl get node {{ kind_cluster }}-control-plane
        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
      register: node_ready
      changed_when: false
      retries: 30
      delay: 10
      until: node_ready.stdout == "True"
      when: restart_kind | bool

    - name: Report restarted
      ansible.builtin.debug:
        msg: >-
          Kind node restarted and ready.
          Deployment at 0 replicas — scale up when ready.
      when: restart_kind | bool