---
# Graceful shutdown of agave validator on biscayne
#
# Scales the deployment to 0 and waits for the pod to terminate.
# This MUST be done before any kind node restart, host reboot,
# or docker operations.
#
# The container entrypoint (PID 1) traps SIGTERM and runs
# ``agave-validator exit --force --ledger /data/ledger`` which tells
# the validator to flush I/O and exit cleanly via the admin RPC Unix
# socket. This avoids the io_uring/ZFS deadlock that occurs when the
# process is killed. terminationGracePeriodSeconds must be set to 300
# on the k8s deployment to allow time for the flush.
#
# Usage:
#   # Stop the validator
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-stop.yml
#
#   # Stop and restart kind node (LAST RESORT — e.g., broken namespace)
#   # Normally unnecessary: mount propagation means ramdisk/ZFS changes
#   # are visible in the kind node without restarting it.
#   ansible-playbook -i biscayne.vaasl.io, playbooks/biscayne-stop.yml \
#     -e restart_kind=true
#
- name: Graceful validator shutdown
  hosts: all
  gather_facts: false

  # All kubectl invocations below talk to the kind cluster via this kubeconfig.
  environment:
    KUBECONFIG: /home/rix/.kube/config

  vars:
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
    # Override with -e restart_kind=true to also restart the kind
    # control-plane container after the validator has stopped (last resort).
    restart_kind: false

  tasks:
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -o jsonpath='{.spec.replicas}'
      register: current_replicas
      failed_when: false  # missing deployment is treated as 0 replicas below
      changed_when: false

    # Ensure k8s gives the entrypoint enough time for graceful shutdown
    # via admin RPC before sending SIGKILL.
    - name: Ensure terminationGracePeriodSeconds is 300
      ansible.builtin.command: >
        kubectl patch deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
      register: patch_result
      # kubectl reports "... patched (no change)" when the field already matches.
      changed_when: "'no change' not in patch_result.stdout"
      when: current_replicas.stdout | default('0') | int > 0

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      changed_when: true
      when: current_replicas.stdout | default('0') | int > 0

    # Poll until the deployment's pod list is empty: 60 x 5s = 5 minutes,
    # matching the 300s terminationGracePeriodSeconds set above.
    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_gone
      changed_when: false
      retries: 60
      delay: 5
      until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
      when: current_replicas.stdout | default('0') | int > 0

    # Belt and braces: confirm the process actually exited inside the kind
    # node before anyone considers restarting it (see io_uring/ZFS note in
    # the file header).
    - name: Verify no agave processes in kind node
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      failed_when: false  # pgrep rc=1 (no match) is the desired outcome
      changed_when: false

    - name: Fail if agave still running
      ansible.builtin.fail:
        msg: >-
          agave-validator process still running inside kind node after
          pod termination. Do NOT restart the kind node — investigate
          first to avoid io_uring/ZFS deadlock.
      when: agave_procs.rc == 0  # rc=0 means pgrep matched at least one process

    - name: Report stopped
      ansible.builtin.debug:
        msg: >-
          Validator stopped. Replicas: {{ current_replicas.stdout | default('0') }} -> 0.
          No agave processes detected in kind node.
      when: not restart_kind | bool

    # ---- optional: restart kind node -----------------------------------------
    - name: Restart kind node
      ansible.builtin.command: docker restart {{ kind_cluster }}-control-plane
      changed_when: true
      when: restart_kind | bool
      timeout: 120  # task-level timeout; docker restart can hang on a wedged node

    - name: Wait for kind node ready
      ansible.builtin.command: >
        kubectl get node {{ kind_cluster }}-control-plane
        -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}'
      register: node_ready
      changed_when: false
      retries: 30
      delay: 10
      until: node_ready.stdout == "True"
      when: restart_kind | bool

    - name: Report restarted
      ansible.builtin.debug:
        msg: >-
          Kind node restarted and ready.
          Deployment at 0 replicas — scale up when ready.
      when: restart_kind | bool