feat: add biscayne-restart.yml for graceful restart without cluster teardown

Uses laconic-so deployment restart (GitOps) to pick up new container
images and config. Gracefully stops the validator first (scale to 0,
wait for pod termination, verify no agave processes). Preserves the
kind cluster, all data volumes, and cluster state.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix/kind-mount-propagation
A. F. Dudley 2026-03-10 06:21:46 +00:00
parent 0bbc3b5a64
commit e143bb45c7
1 changed files with 109 additions and 0 deletions

View File

@@ -0,0 +1,109 @@
---
# Playbook: graceful restart of the agave validator.
#
# The play first stops the validator, then runs
# `laconic-so deployment restart` so that new container images and config
# changes are picked up. The kind cluster is NOT recreated, so data volumes
# and cluster state are preserved.
#
# Before running:
#   - run biscayne-sync-tools.yml (optionally with --tags build-container)
#
# Invocation:
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-restart.yml
#
- name: Restart agave validator
  hosts: all
  gather_facts: false
  vars:
    # Directory handed to `laconic-so deployment --dir`.
    deployment_dir: /srv/deployments/agave
    # laconic-so executable used for the restart step.
    laconic_so: /home/rix/.local/bin/laconic-so
    # Base identifier: the namespace, the deployment name and the kind node
    # container name used by the tasks are all derived from it.
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"
  environment:
    # Exported to every task of the play, so each kubectl call sees it.
    KUBECONFIG: /home/rix/.kube/config
  tasks:
# ---- graceful stop -------------------------------------------------------
# Read the deployment's desired replica count. The scale-down tasks are
# gated on this value; an empty stdout is turned into 0 by `| int`.
- name: Get current replica count
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - deployment
      - "{{ deployment_name }}"
      - "-n"
      - "{{ k8s_namespace }}"
      - "-o"
      - 'jsonpath={.spec.replicas}'
  register: current_replicas
  # Read-only query: never reported as a change.
  changed_when: false
  # A failing kubectl call is tolerated here and never fails the task.
  failed_when: false
# Set a 300 s termination grace period on the deployment's pod template.
# Skipped when the deployment currently has no replicas.
# NOTE(review): a pod-template patch normally starts a rollout — confirm
# that is acceptable immediately before the scale-down.
- name: Ensure terminationGracePeriodSeconds is 300
  when: (current_replicas.stdout | default('0') | int) > 0
  ansible.builtin.command:
    cmd: >-
      kubectl patch deployment {{ deployment_name }}
      -n {{ k8s_namespace }}
      -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
  register: patch_result
  # Reported as changed unless kubectl's output contains "no change".
  changed_when: "'no change' not in patch_result.stdout"
# Stop the validator by scaling its deployment down to zero replicas.
# Skipped when the replica count read earlier is already 0.
- name: Scale deployment to 0
  when: (current_replicas.stdout | default('0') | int) > 0
  ansible.builtin.command:
    argv:
      - kubectl
      - scale
      - deployment
      - "{{ deployment_name }}"
      - "-n"
      - "{{ k8s_namespace }}"
      - "--replicas=0"
  # The task only runs when replicas were > 0, so it always counts as a change.
  changed_when: true
# Poll until no pod carrying the deployment's `app` label is left in the
# namespace. Skipped when nothing was scaled down.
- name: Wait for pods to terminate
  when: (current_replicas.stdout | default('0') | int) > 0
  ansible.builtin.command: >-
    kubectl get pods -n {{ k8s_namespace }}
    -l app={{ deployment_name }}
    -o jsonpath='{.items}'
  register: pods_gone
  changed_when: false
  # Done once the item list is empty ("[]") or kubectl prints nothing.
  until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
  # 72 x 5 s = 360 s. The budget must exceed the 300 s termination grace
  # period set on the pod template: a validator that uses its whole grace
  # period is only killed at 300 s, and the pod object disappears slightly
  # later. A 300 s budget would time out just before that happens.
  retries: 72
  delay: 5
# Confirm that no agave-validator process survives inside the kind node
# container. `pgrep -c` prints the number of matching processes and exits
# 0 when at least one matches, 1 when none match.
- name: Verify no agave processes in kind node
  ansible.builtin.command: >-
    docker exec {{ kind_cluster }}-control-plane
    pgrep -c agave-validator
  register: agave_procs
  changed_when: false
  # rc 0 (process found) is handled by the next task. rc 1 only means
  # "nothing running" when pgrep really printed a count of 0: `docker exec`
  # can itself exit 1 (e.g. the container does not exist), and then stdout
  # is empty. Any other rc means the check could not be performed. In both
  # cases fail here instead of silently treating the validator as stopped.
  failed_when: >-
    agave_procs.rc not in [0, 1]
    or (agave_procs.rc == 1 and (agave_procs.stdout | trim) != '0')

# Abort before the restart if the check above found a live process.
- name: Fail if agave still running
  ansible.builtin.fail:
    msg: >-
      agave-validator still running inside kind node after pod
      termination. Investigate before proceeding.
  when: agave_procs.rc == 0
# Progress marker between the stop phase and the restart phase.
- name: Report stopped
  ansible.builtin.debug:
    msg: 'Validator stopped cleanly. Applying new config...'
# ---- apply new config and restart ----------------------------------------
# Run `laconic-so deployment --dir <deployment_dir> restart`.
- name: Restart deployment with updated config/image
  ansible.builtin.command:
    argv:
      - "{{ laconic_so }}"
      - deployment
      - "--dir"
      - "{{ deployment_dir }}"
      - restart
  # Always reported as a change.
  changed_when: true
  register: restart_result
# ---- verify --------------------------------------------------------------
# Poll the phase of the first pod carrying the deployment's `app` label
# until it reads "Running" (up to 30 attempts, 10 s apart).
- name: Wait for pod running
  ansible.builtin.command:
    argv:
      - kubectl
      - get
      - pods
      - "-n"
      - "{{ k8s_namespace }}"
      - "-l"
      - "app={{ deployment_name }}"
      - "-o"
      - 'jsonpath={.items[0].status.phase}'
  register: pod_phase
  until: pod_phase.stdout == "Running"
  delay: 10
  retries: 30
  # Read-only query: never reported as a change.
  changed_when: false
# Final summary, echoing the pod phase captured by the wait task.
- name: Report restarted
  ansible.builtin.debug:
    msg: "Validator restarted with new image/config. Pod phase: {{ pod_phase.stdout }}."