From 3d83c6ad272815ff96a2ab01b3abd443889805ed Mon Sep 17 00:00:00 2001
From: Prathamesh Musale
Date: Thu, 16 Apr 2026 05:28:29 +0000
Subject: [PATCH] so-l2l: make down() synchronous via _wait_for_labeled_deletions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

delete_collection returns before the apiserver actually removes
objects: finalizers on PVs, PVCs, and pod graceful shutdown all
propagate asynchronously. Add _wait_for_labeled_deletions, which polls
the same label selector across every kind we triggered a delete for,
with a 120s timeout. down() now returns only once the cluster has
actually settled, so callers (tests, ansible, cryovial) don't need
their own wait loops.

Update the k8s-deploy test's assert_no_labeled_resources to rely on
that synchronous contract; no polling in the test.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
 stack_orchestrator/deploy/k8s/deploy_k8s.py | 122 ++++++++++++++++++++
 tests/k8s-deploy/run-deploy-test.sh         |   6 +-
 2 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py
index a29e767d..d7eb8990 100644
--- a/stack_orchestrator/deploy/k8s/deploy_k8s.py
+++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py
@@ -1063,6 +1063,128 @@ class K8sDeployer(Deployer):
             if opts.o.debug:
                 print(f"Error listing PVs: {e}")
 
+        self._wait_for_labeled_deletions(
+            namespace, label_selector, delete_volumes=delete_volumes
+        )
+
+    def _wait_for_labeled_deletions(
+        self,
+        namespace: str,
+        label_selector: str,
+        delete_volumes: bool,
+        timeout_seconds: int = 120,
+    ):
+        """Block until stack-labeled resources finish terminating.
+
+        delete_collection returns before the apiserver has actually removed
+        the objects; finalizers (PVs waiting for PVCs, PVCs waiting for
+        VolumeAttachment, pods waiting for graceful shutdown) propagate
+        asynchronously. Poll until everything we triggered a delete for is
+        gone, so callers can assume a synchronous tear-down.
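+
+        Times out after ``timeout_seconds`` (120s by default); on timeout
+        a warning listing the kinds still present is printed rather than
+        raising, since the cluster may still be in a usable state.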
+ """ + import time + + # (kind name, lister callable) — lister returns an object with .items + listers = [ + ( + "deployment", + lambda: self.apps_api.list_namespaced_deployment( + namespace=namespace, label_selector=label_selector + ), + ), + ( + "ingress", + lambda: self.networking_api.list_namespaced_ingress( + namespace=namespace, label_selector=label_selector + ), + ), + ( + "job", + lambda: self.batch_api.list_namespaced_job( + namespace=namespace, label_selector=label_selector + ), + ), + ( + "service", + lambda: self.core_api.list_namespaced_service( + namespace=namespace, label_selector=label_selector + ), + ), + ( + "configmap", + lambda: self.core_api.list_namespaced_config_map( + namespace=namespace, label_selector=label_selector + ), + ), + ( + "secret", + lambda: self.core_api.list_namespaced_secret( + namespace=namespace, label_selector=label_selector + ), + ), + ( + "pod", + lambda: self.core_api.list_namespaced_pod( + namespace=namespace, label_selector=label_selector + ), + ), + ] + if delete_volumes: + listers.append( + ( + "persistentvolumeclaim", + lambda: self.core_api.list_namespaced_persistent_volume_claim( + namespace=namespace, label_selector=label_selector + ), + ) + ) + listers.append( + ( + "persistentvolume", + lambda: self.core_api.list_persistent_volume( + label_selector=label_selector + ), + ) + ) + + deadline = time.monotonic() + timeout_seconds + while time.monotonic() < deadline: + remaining = [] + for kind, lister in listers: + try: + items = lister().items + except ApiException as e: + if e.status == 404: + continue + raise + if items: + remaining.append((kind, len(items))) + if not remaining: + return + if opts.o.debug: + print(f"Waiting for deletions: {remaining}") + time.sleep(2) + + # Timed out — warn but don't raise. Caller may still have the + # cluster in a sensible state. + still_present = [] + for kind, lister in listers: + try: + items = lister().items + except ApiException: + continue + if items: + still_present.append((kind, len(items))) + if still_present: + print( + f"Warning: resources still present after {timeout_seconds}s: " + f"{still_present}" + ) + def status(self): self.connect_api() # Call whatever API we need to get the running container list diff --git a/tests/k8s-deploy/run-deploy-test.sh b/tests/k8s-deploy/run-deploy-test.sh index 7cef3923..a8f5481e 100755 --- a/tests/k8s-deploy/run-deploy-test.sh +++ b/tests/k8s-deploy/run-deploy-test.sh @@ -63,7 +63,9 @@ assert_ns_phase () { fi } -# Count labeled resources in the deployment namespace. Exit 1 on mismatch. +# Count labeled resources in the deployment namespace. down() is +# synchronous on its own cleanup (waits for PVCs/pods to terminate +# before returning) so callers can assert immediately. # Usage: assert_no_labeled_resources assert_no_labeled_resources () { local kind=$1 @@ -260,7 +262,7 @@ $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes --sk assert_ns_phase "Active" echo "stop preserves namespace test: passed" -for kind in deployment service configmap secret pvc; do +for kind in deployment job ingress service configmap secret pvc pod; do assert_no_labeled_resources "$kind" done echo "stop cleans labeled resources test: passed"