so-l2l: make down() synchronous via _wait_for_labeled_deletions
delete_collection returns before the apiserver actually removes objects: finalizers on PVs, PVCs, and pod graceful shutdown all propagate async. Add _wait_for_labeled_deletions, which polls the same label selector across every kind we triggered a delete for, with a 120s timeout. down() now returns only once the cluster has actually settled, so callers (tests, ansible, cryovial) don't need their own wait loops. Update the k8s-deploy test's assert_no_labeled_resources to rely on that synchronous contract; no polling in the test.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
parent 98ad60ca03
commit 3d83c6ad27
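For context on the commit message's first claim: in the kubernetes Python client, delete_collection_* calls return as soon as the deletion has been accepted by the apiserver, not once the objects are gone. A minimal standalone sketch of the behaviour this commit compensates for; the namespace ("demo-ns") and label selector below are made-up placeholders, not values from this repo:

# Illustrative only: shows why a wait loop is needed after delete_collection.
# "demo-ns" and the label selector are placeholder values.
import time
from kubernetes import client, config

config.load_kube_config()
core = client.CoreV1Api()
selector = "app.kubernetes.io/instance=demo-stack"

# Returns once the delete has been accepted; pods may still be Terminating.
core.delete_collection_namespaced_pod(namespace="demo-ns", label_selector=selector)

# Without a wait, an immediate re-list can still show the doomed pods.
leftover = core.list_namespaced_pod(namespace="demo-ns", label_selector=selector)
print(f"pods still visible right after delete_collection: {len(leftover.items)}")

# The pattern _wait_for_labeled_deletions applies: poll until the list is empty.
deadline = time.monotonic() + 120
while time.monotonic() < deadline:
    if not core.list_namespaced_pod(namespace="demo-ns", label_selector=selector).items:
        break
    time.sleep(2)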
@@ -1063,6 +1063,124 @@ class K8sDeployer(Deployer):
                 if opts.o.debug:
                     print(f"Error listing PVs: {e}")
 
+        self._wait_for_labeled_deletions(
+            namespace, label_selector, delete_volumes=delete_volumes
+        )
+
+    def _wait_for_labeled_deletions(
+        self,
+        namespace: str,
+        label_selector: str,
+        delete_volumes: bool,
+        timeout_seconds: int = 120,
+    ):
+        """Block until stack-labeled resources finish terminating.
+
+        delete_collection returns before the apiserver has actually removed
+        the objects — finalizers (PVs waiting for PVCs, PVCs waiting for
+        VolumeAttachment, pods waiting for graceful shutdown) propagate
+        async. Poll until everything we triggered a delete for is gone,
+        so callers can assume a synchronous tear-down.
+        """
+        import time
+
+        # (kind name, lister callable) — lister returns an object with .items
+        listers = [
+            (
+                "deployment",
+                lambda: self.apps_api.list_namespaced_deployment(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+            (
+                "ingress",
+                lambda: self.networking_api.list_namespaced_ingress(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+            (
+                "job",
+                lambda: self.batch_api.list_namespaced_job(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+            (
+                "service",
+                lambda: self.core_api.list_namespaced_service(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+            (
+                "configmap",
+                lambda: self.core_api.list_namespaced_config_map(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+            (
+                "secret",
+                lambda: self.core_api.list_namespaced_secret(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+            (
+                "pod",
+                lambda: self.core_api.list_namespaced_pod(
+                    namespace=namespace, label_selector=label_selector
+                ),
+            ),
+        ]
+        if delete_volumes:
+            listers.append(
+                (
+                    "persistentvolumeclaim",
+                    lambda: self.core_api.list_namespaced_persistent_volume_claim(
+                        namespace=namespace, label_selector=label_selector
+                    ),
+                )
+            )
+            listers.append(
+                (
+                    "persistentvolume",
+                    lambda: self.core_api.list_persistent_volume(
+                        label_selector=label_selector
+                    ),
+                )
+            )
+
+        deadline = time.monotonic() + timeout_seconds
+        while time.monotonic() < deadline:
+            remaining = []
+            for kind, lister in listers:
+                try:
+                    items = lister().items
+                except ApiException as e:
+                    if e.status == 404:
+                        continue
+                    raise
+                if items:
+                    remaining.append((kind, len(items)))
+            if not remaining:
+                return
+            if opts.o.debug:
+                print(f"Waiting for deletions: {remaining}")
+            time.sleep(2)
+
+        # Timed out — warn but don't raise. Caller may still have the
+        # cluster in a sensible state.
+        still_present = []
+        for kind, lister in listers:
+            try:
+                items = lister().items
+            except ApiException:
+                continue
+            if items:
+                still_present.append((kind, len(items)))
+        if still_present:
+            print(
+                f"Warning: resources still present after {timeout_seconds}s: "
+                f"{still_present}"
+            )
+
     def status(self):
         self.connect_api()
         # Call whatever API we need to get the running container list
@@ -63,7 +63,9 @@ assert_ns_phase () {
   fi
 }
 
-# Count labeled resources in the deployment namespace. Exit 1 on mismatch.
+# Count labeled resources in the deployment namespace. down() is
+# synchronous on its own cleanup (waits for PVCs/pods to terminate
+# before returning) so callers can assert immediately.
 # Usage: assert_no_labeled_resources <kind>
 assert_no_labeled_resources () {
   local kind=$1
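The rest of the helper's body falls outside this hunk. For readers following along, a hypothetical sketch of what a check like this typically looks like with kubectl; the variable names ($test_namespace, $test_label_selector) and the exit-on-failure behaviour are assumptions for illustration, not taken from the repo:

# Hypothetical sketch only; the real body is not shown in this hunk.
# $test_namespace and $test_label_selector are assumed variable names.
assert_no_labeled_resources () {
  local kind=$1
  local count
  count=$(kubectl get "$kind" -n "$test_namespace" \
    -l "$test_label_selector" --no-headers 2>/dev/null | wc -l)
  if [ "$count" -ne 0 ]; then
    echo "FAIL: $count labeled $kind resource(s) still present"
    exit 1
  fi
}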
@@ -260,7 +262,7 @@ $TEST_TARGET_SO deployment --dir $test_deployment_dir stop --delete-volumes --sk
 assert_ns_phase "Active"
 echo "stop preserves namespace test: passed"
 
-for kind in deployment service configmap secret pvc; do
+for kind in deployment job ingress service configmap secret pvc pod; do
   assert_no_labeled_resources "$kind"
 done
 echo "stop cleans labeled resources test: passed"