so-l2l: make down() synchronous via _wait_for_labeled_deletions

delete_collection returns before the apiserver actually removes the
objects — finalizers on PVs and PVCs, and pod graceful shutdown,
all propagate asynchronously. Add _wait_for_labeled_deletions that polls the
same label selector across every kind we triggered a delete for,
with a 120s timeout. down() now returns only once the cluster has
actually settled, so callers (tests, ansible, cryovial) don't
need their own wait loops.

Update the k8s-deploy test's assert_no_labeled_resources to rely
on that synchronous contract — no polling in the test.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
pull/743/head
Prathamesh Musale 2026-04-16 05:28:29 +00:00
parent 98ad60ca03
commit 3d83c6ad27
2 changed files with 122 additions and 2 deletions

View File

@ -1063,6 +1063,124 @@ class K8sDeployer(Deployer):
if opts.o.debug:
print(f"Error listing PVs: {e}")
self._wait_for_labeled_deletions(
namespace, label_selector, delete_volumes=delete_volumes
)
def _wait_for_labeled_deletions(
    self,
    namespace: str,
    label_selector: str,
    delete_volumes: bool,
    timeout_seconds: int = 120,
):
    """Block until stack-labeled resources finish terminating.

    delete_collection returns before the apiserver has actually removed
    the objects: finalizers (PVs waiting for PVCs, PVCs waiting for
    VolumeAttachment, pods waiting for graceful shutdown) propagate
    async. Poll until everything we triggered a delete for is gone,
    so callers can assume a synchronous tear-down.

    Args:
        namespace: namespace holding the stack's namespaced resources.
        label_selector: label selector shared by the stack's resources.
        delete_volumes: when True, also wait for PVCs and (cluster-scoped)
            PVs carrying the label.
        timeout_seconds: maximum time to wait; on expiry a warning is
            printed and the method returns — it never raises on timeout.
    """
    import time

    # (kind name, lister callable) — lister returns an object with .items
    listers = [
        (
            "deployment",
            lambda: self.apps_api.list_namespaced_deployment(
                namespace=namespace, label_selector=label_selector
            ),
        ),
        (
            "ingress",
            lambda: self.networking_api.list_namespaced_ingress(
                namespace=namespace, label_selector=label_selector
            ),
        ),
        (
            "job",
            lambda: self.batch_api.list_namespaced_job(
                namespace=namespace, label_selector=label_selector
            ),
        ),
        (
            "service",
            lambda: self.core_api.list_namespaced_service(
                namespace=namespace, label_selector=label_selector
            ),
        ),
        (
            "configmap",
            lambda: self.core_api.list_namespaced_config_map(
                namespace=namespace, label_selector=label_selector
            ),
        ),
        (
            "secret",
            lambda: self.core_api.list_namespaced_secret(
                namespace=namespace, label_selector=label_selector
            ),
        ),
        (
            "pod",
            lambda: self.core_api.list_namespaced_pod(
                namespace=namespace, label_selector=label_selector
            ),
        ),
    ]
    if delete_volumes:
        listers.append(
            (
                "persistentvolumeclaim",
                lambda: self.core_api.list_namespaced_persistent_volume_claim(
                    namespace=namespace, label_selector=label_selector
                ),
            )
        )
        listers.append(
            (
                "persistentvolume",
                # PVs are cluster-scoped, hence no namespace argument.
                lambda: self.core_api.list_persistent_volume(
                    label_selector=label_selector
                ),
            )
        )

    def _remaining(strict: bool = True):
        """Return [(kind, count)] for every kind with objects still present.

        strict=True re-raises unexpected API errors (404 is tolerated
        because the namespace itself may already be gone); strict=False
        treats any API error as "kind unobservable" and skips it — used
        for the best-effort report after a timeout.
        """
        found = []
        for kind, lister in listers:
            try:
                items = lister().items
            except ApiException as e:
                if strict and e.status != 404:
                    raise
                continue
            if items:
                found.append((kind, len(items)))
        return found

    deadline = time.monotonic() + timeout_seconds
    while time.monotonic() < deadline:
        remaining = _remaining()
        if not remaining:
            return
        if opts.o.debug:
            print(f"Waiting for deletions: {remaining}")
        time.sleep(2)
    # Timed out — warn but don't raise. Caller may still have the
    # cluster in a sensible state.
    still_present = _remaining(strict=False)
    if still_present:
        print(
            f"Warning: resources still present after {timeout_seconds}s: "
            f"{still_present}"
        )
def status(self):
self.connect_api()
# Call whatever API we need to get the running container list

View File

@ -63,7 +63,9 @@ assert_ns_phase () {
fi
}
# Count labeled resources in the deployment namespace. Exit 1 on mismatch.
# Count labeled resources in the deployment namespace. down() is
# synchronous on its own cleanup (waits for PVCs/pods to terminate
# before returning) so callers can assert immediately.
# Usage: assert_no_labeled_resources <kind>
assert_no_labeled_resources () {
local kind=$1
@ -260,7 +262,7 @@ $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes --sk
assert_ns_phase "Active"
echo "stop preserves namespace test: passed"
for kind in deployment service configmap secret pvc; do
for kind in deployment job ingress service configmap secret pvc pod; do
assert_no_labeled_resources "$kind"
done
echo "stop cleans labeled resources test: passed"