---
# Restart agave validator with updated image/config
#
# Gracefully stops the validator, then uses laconic-so deployment restart
# to pick up new container images and config changes. Does NOT recreate
# the kind cluster. Note that laconic-so restart recreates the namespace,
# dropping PVCs — the underlying PVs (and validator data) survive and are
# rebound by the claimRef cleanup task below.
#
# Prerequisites:
# - biscayne-sync-tools.yml has been run (optionally with --tags build-container)
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml playbooks/biscayne-restart.yml
#
- name: Restart agave validator
  hosts: all
  gather_facts: false
  environment:
    # Every kubectl invocation below targets the kind cluster's kubeconfig.
    KUBECONFIG: /home/rix/.kube/config
  vars:
    deployment_dir: /srv/deployments/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    deployment_name: "{{ kind_cluster }}-deployment"

  tasks:
    # ---- graceful stop -------------------------------------------------------
    - name: Get current replica count
      ansible.builtin.command: >
        kubectl get deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -o jsonpath='{.spec.replicas}'
      register: current_replicas
      # Deployment may not exist (e.g. first run) — treat that as 0 replicas
      # so the stop tasks below are skipped instead of failing the play.
      failed_when: false
      changed_when: false

    - name: Ensure terminationGracePeriodSeconds is 300
      ansible.builtin.command: >
        kubectl patch deployment {{ deployment_name }}
        -n {{ k8s_namespace }}
        -p '{"spec":{"template":{"spec":{"terminationGracePeriodSeconds":300}}}}'
      register: patch_result
      # kubectl reports "... patched (no change)" when the value is already set.
      changed_when: "'no change' not in patch_result.stdout"
      when: current_replicas.stdout | default('0') | int > 0

    - name: Scale deployment to 0
      ansible.builtin.command: >
        kubectl scale deployment {{ deployment_name }}
        -n {{ k8s_namespace }} --replicas=0
      changed_when: true
      when: current_replicas.stdout | default('0') | int > 0

    - name: Wait for pods to terminate
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items}'
      register: pods_gone
      changed_when: false
      # Up to 5 minutes (60 x 5s) — matches terminationGracePeriodSeconds=300.
      retries: 60
      delay: 5
      # An empty item list renders as "[]"; also accept "" defensively.
      until: pods_gone.stdout == "[]" or pods_gone.stdout == ""
      when: current_replicas.stdout | default('0') | int > 0

    - name: Verify no agave processes in kind node
      ansible.builtin.command: >
        docker exec {{ kind_cluster }}-control-plane
        pgrep -c agave-validator
      register: agave_procs
      # pgrep exits non-zero when nothing matches — that is the desired
      # outcome here, so never fail this task; the rc is checked below.
      failed_when: false
      changed_when: false

    - name: Fail if agave still running
      ansible.builtin.fail:
        msg: >-
          agave-validator still running inside kind node after pod
          termination. Investigate before proceeding.
      # pgrep rc == 0 means at least one matching process was found.
      when: agave_procs.rc == 0

    - name: Report stopped
      ansible.builtin.debug:
        msg: "Validator stopped cleanly. Applying new config..."

    # ---- apply new config and restart ----------------------------------------
    - name: Restart deployment with updated config/image
      ansible.builtin.command: >
        {{ laconic_so }}
        deployment --dir {{ deployment_dir }}
        restart
      vars:
        # -E preserves SSH_AUTH_SOCK through sudo so laconic-so can git pull
        ansible_become_flags: "-E"
      register: restart_result
      changed_when: true

    # laconic-so restart deletes the namespace (dropping PVCs) then recreates.
    # PVs survive but enter Released state with stale claimRefs. Clear them
    # so the new PVCs can bind.
    - name: Clear stale claimRefs on Released PVs
      ansible.builtin.shell:
        cmd: >-
          set -o pipefail &&
          kubectl get pv -o json |
          python3 -c "
          import json, subprocess, sys;
          pvs = json.load(sys.stdin)['items'];
          [subprocess.run(['kubectl', 'patch', 'pv', pv['metadata']['name'],
          '--type=json', '-p=[{\"op\":\"remove\",\"path\":\"/spec/claimRef\"}]'],
          check=True) for pv in pvs if pv['status'].get('phase') == 'Released']
          "
        executable: /bin/bash
      changed_when: true
      # Best-effort: when no PV is in Released state there is nothing to do,
      # and a patch failure here should not abort the verification below.
      failed_when: false

    # ---- verify --------------------------------------------------------------
    - name: Wait for pod running
      ansible.builtin.command: >
        kubectl get pods -n {{ k8s_namespace }}
        -l app={{ deployment_name }}
        -o jsonpath='{.items[0].status.phase}'
      register: pod_phase
      changed_when: false
      # Up to 5 minutes (30 x 10s) for the image pull + pod start.
      retries: 30
      delay: 10
      until: pod_phase.stdout == "Running"

    - name: Report restarted
      ansible.builtin.debug:
        msg: >-
          Validator restarted with new image/config.
          Pod phase: {{ pod_phase.stdout }}.