From e143bb45c70c1ce8bd26e80aff7a1b0ed0898497 Mon Sep 17 00:00:00 2001
From: "A. F. Dudley"
Date: Tue, 10 Mar 2026 06:21:46 +0000
Subject: [PATCH] feat: add biscayne-restart.yml for graceful restart without
 cluster teardown

Uses laconic-so deployment restart (GitOps) to pick up new container
images and config. Gracefully stops the validator first (scale to 0,
wait for pod termination, verify no agave processes). Preserves the
kind cluster, all data volumes, and cluster state.

Co-Authored-By: Claude Opus 4.6
---
 playbooks/biscayne-restart.yml | 124 +++++++++++++++++++++++++++++++++
 1 file changed, 124 insertions(+)
 create mode 100644 playbooks/biscayne-restart.yml

diff --git a/playbooks/biscayne-restart.yml b/playbooks/biscayne-restart.yml
new file mode 100644
index 00000000..fa6becd6
--- /dev/null
+++ b/playbooks/biscayne-restart.yml
@@ -0,0 +1,124 @@
+---
+# Restart agave validator with updated image/config
+#
+# Gracefully stops the validator, then uses laconic-so deployment restart
+# to pick up new container images and config changes. Does NOT recreate
+# the kind cluster — preserves all data volumes and cluster state.
+#
+# Prerequisites:
+#   - biscayne-sync-tools.yml has been run (optionally with --tags build-container)
+#
+# Usage:
+#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-restart.yml
+#
+- name: Restart agave validator
+  hosts: all
+  gather_facts: false
+  environment:
+    KUBECONFIG: /home/rix/.kube/config
+  vars:
+    deployment_dir: /srv/deployments/agave
+    laconic_so: /home/rix/.local/bin/laconic-so
+    kind_cluster: laconic-70ce4c4b47e23b85
+    k8s_namespace: "laconic-{{ kind_cluster }}"
+    deployment_name: "{{ kind_cluster }}-deployment"
+
+  tasks:
+    # ---- graceful stop -------------------------------------------------------
+    # A failed lookup leaves stdout empty, which the `| int` filters below read
+    # as 0 replicas, so the scale-down tasks are skipped instead of erroring.
+    - name: Get current replica count
+      ansible.builtin.command: >
+        kubectl get deployment {{ deployment_name }}
+        -n {{ k8s_namespace }}
+        -o jsonpath='{.spec.replicas}'
+      register: current_replicas
+      failed_when: false
+      changed_when: false
+
+    # NOTE(review): this edits the pod template; per the Kubernetes Deployment
+    # docs a template change triggers a rollout, so when the value actually
+    # changes the running pod may be replaced before the scale-down — confirm.
+    - name: Ensure terminationGracePeriodSeconds is 300
+      ansible.builtin.command: >
+        kubectl patch deployment {{ deployment_name }}
+        -n {{ k8s_namespace }}
+        -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
+      register: patch_result
+      changed_when: "'no change' not in patch_result.stdout"
+      when: current_replicas.stdout | default('0') | int > 0
+
+    - name: Scale deployment to 0
+      ansible.builtin.command: >
+        kubectl scale deployment {{ deployment_name }}
+        -n {{ k8s_namespace }} --replicas=0
+      changed_when: true
+      when: current_replicas.stdout | default('0') | int > 0
+
+    # Re-checks every 5 s (roughly 5 minutes) until no pod matches the label.
+    - name: Wait for pods to terminate
+      ansible.builtin.command: >
+        kubectl get pods -n {{ k8s_namespace }}
+        -l app={{ deployment_name }}
+        -o jsonpath='{.items}'
+      register: pods_gone
+      changed_when: false
+      retries: 60
+      delay: 5
+      until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
+      when: current_replicas.stdout | default('0') | int > 0
+
+    # `pgrep -c` prints the match count and exits 0 on a match, 1 on none.
+    # rc 0 is left for the next task to report. rc 1 only counts as "stopped"
+    # when pgrep really printed 0; any other outcome (docker exec failed,
+    # container missing, pgrep not installed) means the check did not run and
+    # must not be reported as a clean stop.
+    - name: Verify no agave processes in kind node
+      ansible.builtin.command: >
+        docker exec {{ kind_cluster }}-control-plane
+        pgrep -c agave-validator
+      register: agave_procs
+      failed_when: >-
+        agave_procs.rc not in [0, 1]
+        or (agave_procs.rc == 1 and (agave_procs.stdout | trim) != '0')
+      changed_when: false
+
+    - name: Fail if agave still running
+      ansible.builtin.fail:
+        msg: >-
+          agave-validator still running inside kind node after pod
+          termination. Investigate before proceeding.
+      when: agave_procs.rc == 0
+
+    - name: Report stopped
+      ansible.builtin.debug:
+        msg: "Validator stopped cleanly. Applying new config..."
+
+    # ---- apply new config and restart ----------------------------------------
+    - name: Restart deployment with updated config/image
+      ansible.builtin.command: >
+        {{ laconic_so }}
+        deployment --dir {{ deployment_dir }}
+        restart
+      register: restart_result
+      changed_when: true
+
+    # ---- verify --------------------------------------------------------------
+    # Re-checks every 10 s (roughly 5 minutes). Only the first matching pod's
+    # phase is read; "Running" is the pod phase, not a readiness signal.
+    - name: Wait for pod running
+      ansible.builtin.command: >
+        kubectl get pods -n {{ k8s_namespace }}
+        -l app={{ deployment_name }}
+        -o jsonpath='{.items[0].status.phase}'
+      register: pod_phase
+      changed_when: false
+      retries: 30
+      delay: 10
+      until: pod_phase.stdout == "Running"
+
+    - name: Report restarted
+      ansible.builtin.debug:
+        msg: >-
+          Validator restarted with new image/config.
+          Pod phase: {{ pod_phase.stdout }}.