From 7c65d39bb214acf5dd7ab48a2a6bd89a5f3503db Mon Sep 17 00:00:00 2001
From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com>
Date: Tue, 28 Apr 2026 17:28:02 +0530
Subject: [PATCH] Make deployments self-sufficient and add E2E restart test
 (#750)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- `deploy create` now copies each pod's `commands.py` into
  `<deployment-dir>/hooks/`. `call_stack_deploy_start` loads from there, so
  `deployment start` / `restart` no longer need the live stack source on disk
  to run the `start()` hook
- Only the `start()` hook is affected. `init`, `setup`, and `create` still
  load from the live source — they only run at `deploy create` time, when the
  source is guaranteed to be present
- Multi-repo stacks produce `hooks/commands_0.py`, `hooks/commands_1.py`, …;
  `call_stack_deploy_start` loads them all in sorted order
- Adds `tests/k8s-deploy/run-restart-test.sh` covering the full single-repo
  restart cycle (v1 -> mutate working tree -> `restart` re-copies and
  re-executes v2) and the multi-repo file-naming + multi-hook invocation.
  Wired into the existing **K8s Deploy Test** workflow
---
 .github/workflows/test-k8s-deploy.yml         |   3 +
 .pebbles/events.jsonl                         |   2 +
 .../compose/docker-compose-test-restart.yml   |   5 +
 .../data/stacks/test-restart-multi/README.md  |  14 +
 .../data/stacks/test-restart-multi/stack.yml  |  10 +
 .../data/stacks/test-restart/README.md        |  15 +
 .../stacks/test-restart/deploy/commands.py    |  32 +++
 .../data/stacks/test-restart/stack.yml        |   5 +
 .../deploy/deployment_context.py              |   4 +-
 .../deploy/deployment_create.py               |  66 +++--
 stack_orchestrator/deploy/dns_probe.py        |   4 +-
 stack_orchestrator/deploy/k8s/cluster_info.py |  24 +-
 stack_orchestrator/deploy/k8s/deploy_k8s.py   |  86 ++++--
 stack_orchestrator/deploy/k8s/helpers.py      |  15 +-
 tests/k8s-deploy/run-deploy-test.sh           |  14 +-
 tests/k8s-deploy/run-restart-test.sh          | 265 ++++++++++++++++++
 16 files changed, 478 insertions(+), 86 deletions(-)
 create mode 100644 stack_orchestrator/data/compose/docker-compose-test-restart.yml
 create mode 100644 stack_orchestrator/data/stacks/test-restart-multi/README.md
 create mode 100644 stack_orchestrator/data/stacks/test-restart-multi/stack.yml
 create mode 100644 stack_orchestrator/data/stacks/test-restart/README.md
 create mode 100644 stack_orchestrator/data/stacks/test-restart/deploy/commands.py
 create mode 100644 stack_orchestrator/data/stacks/test-restart/stack.yml
 create mode 100755 tests/k8s-deploy/run-restart-test.sh

diff --git a/.github/workflows/test-k8s-deploy.yml b/.github/workflows/test-k8s-deploy.yml
index aeffa467..62ce4a42 100644
--- a/.github/workflows/test-k8s-deploy.yml
+++ b/.github/workflows/test-k8s-deploy.yml
@@ -16,6 +16,7 @@ on:
       - '.github/workflows/triggers/test-k8s-deploy'
       - '.github/workflows/test-k8s-deploy.yml'
       - 'tests/k8s-deploy/run-deploy-test.sh'
+      - 'tests/k8s-deploy/run-restart-test.sh'
   schedule:
     - cron: '3 15 * * *'

@@ -46,3 +47,5 @@ jobs:
         run: ./tests/scripts/install-kubectl.sh
       - name: "Run k8s deployment test"
         run: ./tests/k8s-deploy/run-deploy-test.sh
+      - name: "Run restart k8s deployment test"
+        run: ./tests/k8s-deploy/run-restart-test.sh
diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl
index ece11c09..1d04bb8a 100644
--- a/.pebbles/events.jsonl
+++ b/.pebbles/events.jsonl
@@ -54,3 +54,5 @@
 {"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}}
 {"type":"update","timestamp":"2026-04-21T09:00:47.364859946Z","issue_id":"so-p3p","payload":{"description":"## Problem\n\nThe Caddy ingress controller image is hardcoded in `ingress-caddy-kind-deploy.yaml`, with no mechanism to update it short of cluster recreation or manual `kubectl patch`. laconic-so should: (1) allow spec.yml to specify a Caddy image, (2) support updating the Caddy image as part of `deployment start`, (3) set `strategy: Recreate` on the Caddy Deployment since hostPort pods can't rolling-update.\n\n## Resolution\n\n- New spec key `caddy-ingress-image`. Fresh install uses it (fallback: manifest default). On subsequent `deployment start`, if the spec key is set and the running Caddy image differs, SO patches the Deployment and waits for rollout.\n- Spec key absent =\u003e SO does **not** touch a running Caddy, to avoid silently reverting images set out-of-band (ansible playbook, another deployment's spec).\n- `strategy: Recreate` added to the Caddy Deployment manifest.\n- Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a plain k8s-API patch, not a cluster lifecycle op).\n- Image substitution locates the container by name instead of string-matching the shipped default, so the spec override wins regardless of what the manifest hardcodes.\n- Cluster-scoped caveat: `caddy-system` is shared across deployments; last `deployment start` that sets the key wins for everyone. Documented in `deployment_patterns.md`."}}
 {"type":"status_update","timestamp":"2026-04-21T09:00:47.745675131Z","issue_id":"so-p3p","payload":{"status":"closed"}}
+{"type":"comment","timestamp":"2026-04-27T13:41:16.962883653Z","issue_id":"so-078","payload":{"body":"Fixed. deploy create now copies commands.py into deployment_dir/hooks/. call_stack_deploy_start loads hooks from the deployment dir instead of resolving via get_stack_path, so deployment start no longer requires the stack repo to be present or cwd to be correct."}}
+{"type":"close","timestamp":"2026-04-27T13:41:17.073012545Z","issue_id":"so-078","payload":{}}
diff --git a/stack_orchestrator/data/compose/docker-compose-test-restart.yml b/stack_orchestrator/data/compose/docker-compose-test-restart.yml
new file mode 100644
index 00000000..e5f69565
--- /dev/null
+++ b/stack_orchestrator/data/compose/docker-compose-test-restart.yml
@@ -0,0 +1,5 @@
+services:
+  test-restart:
+    image: busybox:1.36
+    command: ["sh", "-c", "echo started && sleep infinity"]
+    restart: always
diff --git a/stack_orchestrator/data/stacks/test-restart-multi/README.md b/stack_orchestrator/data/stacks/test-restart-multi/README.md
new file mode 100644
index 00000000..6a638182
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart-multi/README.md
@@ -0,0 +1,14 @@
+# test-restart-multi
+
+E2E test stack used by `tests/k8s-deploy/run-restart-test.sh` to cover the
+multi-repo case: `pods:` references two pod repos, each shipping its own
+`deploy/commands.py`. `deploy create` should produce
+`<deployment-dir>/hooks/commands_0.py` and `<deployment-dir>/hooks/commands_1.py`,
+and `deployment start` should invoke both `start()` hooks (each writes its
+own marker file so neither overwrites the other).
+
+The pod repos themselves are created by the test script as bare-repo +
+working-clone pairs under `$CERC_REPO_BASE_DIR/test-restart-pod-{a,b}`;
+they are not committed to this repository. Each pod repo ships its own
+`docker-compose.yml` (resolved by `get_pod_file_path` for dict-form pods)
+and `stack/deploy/commands.py` — the stack repo only owns `stack.yml`.
diff --git a/stack_orchestrator/data/stacks/test-restart-multi/stack.yml b/stack_orchestrator/data/stacks/test-restart-multi/stack.yml
new file mode 100644
index 00000000..431ebf14
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart-multi/stack.yml
@@ -0,0 +1,10 @@
+version: "1.0"
+name: test-restart-multi
+description: "E2E test stack for the deployment restart command (multi-repo case)"
+pods:
+  - name: test-restart-multi-a
+    repository: test-restart-pod-a
+    path: .
+  - name: test-restart-multi-b
+    repository: test-restart-pod-b
+    path: .
diff --git a/stack_orchestrator/data/stacks/test-restart/README.md b/stack_orchestrator/data/stacks/test-restart/README.md
new file mode 100644
index 00000000..73cdb1fb
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart/README.md
@@ -0,0 +1,15 @@
+# test-restart
+
+E2E test stack used by `tests/k8s-deploy/run-restart-test.sh`.
+
+The stack ships a small `start()` hook that writes a versioned marker file
+into the deployment directory. The test exercises `deployment restart`:
+
+1. `deploy create` → asserts `commands.py` was copied into `<deployment-dir>/hooks/`.
+2. `deployment start` → asserts the marker file contains the v1 string.
+3. Modifies `commands.py` in the stack-source working tree (v1 → v2).
+4. `deployment restart` → asserts the new `commands.py` was re-copied into
+   `<deployment-dir>/hooks/` and the marker file now contains the v2 string.
+
+The pod uses a public `busybox` image that just sleeps; the marker file is
+the only thing under test.
diff --git a/stack_orchestrator/data/stacks/test-restart/deploy/commands.py b/stack_orchestrator/data/stacks/test-restart/deploy/commands.py
new file mode 100644
index 00000000..eaa15860
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart/deploy/commands.py
@@ -0,0 +1,32 @@
+# Copyright © 2026 Vulcanize

+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.

+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.

+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.

+from stack_orchestrator.util import get_yaml
+from stack_orchestrator.deploy.deployment_context import DeploymentContext

+default_spec_file_content = ""


+def init(command_context):
+    return get_yaml().load(default_spec_file_content)


+def start(deployment_context: DeploymentContext):
+    # Writes a marker file the e2e test asserts on. The test flips the
+    # literal below from "v1" to "v2" in the stack-source working tree
+    # before running 'deployment restart' to verify the updated hook is
+    # copied into deployment_dir/hooks/ and re-executed.
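+    #
+    # At runtime this module is loaded from <deployment-dir>/hooks/, not from
+    # the stack source: call_stack_deploy_start (see deployment_create.py in
+    # this patch) globs hooks/commands*.py and calls each module's
+    # start(deployment_context) in sorted order.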
+ marker = deployment_context.deployment_dir / "marker" + marker.write_text("v1") diff --git a/stack_orchestrator/data/stacks/test-restart/stack.yml b/stack_orchestrator/data/stacks/test-restart/stack.yml new file mode 100644 index 00000000..ccb394f6 --- /dev/null +++ b/stack_orchestrator/data/stacks/test-restart/stack.yml @@ -0,0 +1,5 @@ +version: "1.0" +name: test-restart +description: "E2E test stack for the deployment restart command" +pods: + - test-restart diff --git a/stack_orchestrator/deploy/deployment_context.py b/stack_orchestrator/deploy/deployment_context.py index 1776699e..fb475589 100644 --- a/stack_orchestrator/deploy/deployment_context.py +++ b/stack_orchestrator/deploy/deployment_context.py @@ -83,9 +83,7 @@ class DeploymentContext: # Fallback to cluster-id for deployments created before the # deployment-id field was introduced. Keeps existing resource # names stable across this upgrade. - self.deployment_id = obj.get( - constants.deployment_id_key, self.id - ) + self.deployment_id = obj.get(constants.deployment_id_key, self.id) # Handle the case of a legacy deployment with no file # Code below is intended to match the output from _make_default_cluster_name() # TODO: remove when we no longer need to support legacy deployments diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index fd7ec4f1..efdd2efb 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -276,16 +276,17 @@ def call_stack_deploy_start(deployment_context): create additional k8s resources (Services, etc.) in the deployment namespace. The namespace can be derived as f"laconic-{deployment_context.id}". """ - python_file_paths = _commands_plugin_paths(deployment_context.stack.name) - for python_file_path in python_file_paths: - if python_file_path.exists(): - spec = util.spec_from_file_location("commands", python_file_path) - if spec is None or spec.loader is None: - continue - imported_stack = util.module_from_spec(spec) - spec.loader.exec_module(imported_stack) - if _has_method(imported_stack, "start"): - imported_stack.start(deployment_context) + hooks_dir = deployment_context.deployment_dir / "hooks" + if not hooks_dir.exists(): + return + for python_file_path in sorted(hooks_dir.glob("commands*.py")): + spec = util.spec_from_file_location("commands", python_file_path) + if spec is None or spec.loader is None: + continue + imported_stack = util.module_from_spec(spec) + spec.loader.exec_module(imported_stack) + if _has_method(imported_stack, "start"): + imported_stack.start(deployment_context) # Inspect the pod yaml to find config files referenced in subdirectories @@ -379,9 +380,7 @@ def _validate_host_path_mounts(parsed_pod_file, pod_name, pod_file_path): "content at runtime.\n\n" "See docs/deployment_patterns.md." ) - total = sum( - p.stat().st_size for p in entries if p.is_file() - ) + total = sum(p.stat().st_size for p in entries if p.is_file()) if total > _HOST_PATH_CONFIGMAP_BUDGET_BYTES: raise DeployerException( f"Directory host-path bind '{volume_str}' in " @@ -1111,6 +1110,37 @@ def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]] safe_copy_file(src_path, dst_path) +def _copy_hooks(stack_name: str, target_dir: Path): + """Copy commands.py hooks into deployment_dir/hooks/ for self-sufficiency. + + Single repo: hooks/commands.py + Multi-repo: hooks/commands_0.py, hooks/commands_1.py, ... — indexed by + plugin path order. 
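+
+    Example (hypothetical stack with two plugin repos):
+        get_plugin_code_paths(stack_name) -> [repo_a, repo_b]
+        repo_a/deploy/commands.py -> hooks/commands_0.py
+        repo_b/deploy/commands.py -> hooks/commands_1.py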
+ + Note: the whole commands.py file is copied (init/setup/create/start), but + at runtime only call_stack_deploy_start loads from this copied location. + call_stack_deploy_init, call_stack_deploy_setup, and call_stack_deploy_create + still resolve commands.py from the live stack source via + get_plugin_code_paths — they only run at deploy create time when the source + is guaranteed to be present, so they don't need to be self-sufficient. + """ + plugin_paths = get_plugin_code_paths(stack_name) + sources = [ + p.joinpath("deploy", "commands.py") + for p in plugin_paths + if p.joinpath("deploy", "commands.py").exists() + ] + if not sources: + return + hooks_dir = target_dir / "hooks" + hooks_dir.mkdir(exist_ok=True) + if len(sources) == 1: + copyfile(sources[0], hooks_dir / "commands.py") + else: + for i, src in enumerate(sources): + copyfile(src, hooks_dir / f"commands_{i}.py") + + def _write_deployment_files( target_dir: Path, spec_file: Path, @@ -1138,6 +1168,8 @@ def _write_deployment_files( copyfile(spec_file, target_dir.joinpath(constants.spec_file_name)) copyfile(stack_file, target_dir.joinpath(constants.stack_file_name)) + _copy_hooks(stack_name, target_dir) + # Create deployment file if requested if include_deployment_file: _create_deployment_file(target_dir, stack_source=stack_source) @@ -1249,7 +1281,9 @@ def _write_deployment_files( else: source_config_dir = resolve_config_dir(stack_name, configmap_name) if os.path.exists(source_config_dir): - destination_config_dir = target_dir.joinpath("configmaps", configmap_name) + destination_config_dir = target_dir.joinpath( + "configmaps", configmap_name + ) copytree(source_config_dir, destination_config_dir, dirs_exist_ok=True) # Copy the job files into the target dir @@ -1262,9 +1296,7 @@ def _write_deployment_files( if job_file_path and job_file_path.exists(): parsed_job_file = yaml.load(open(job_file_path, "r")) if parsed_spec.is_kubernetes_deployment(): - _validate_host_path_mounts( - parsed_job_file, job, job_file_path - ) + _validate_host_path_mounts(parsed_job_file, job, job_file_path) _fixup_pod_file(parsed_job_file, parsed_spec, destination_compose_dir) with open( destination_compose_jobs_dir.joinpath( diff --git a/stack_orchestrator/deploy/dns_probe.py b/stack_orchestrator/deploy/dns_probe.py index e04b4ea2..6363a72e 100644 --- a/stack_orchestrator/deploy/dns_probe.py +++ b/stack_orchestrator/deploy/dns_probe.py @@ -6,7 +6,7 @@ import secrets import socket import time -from typing import Optional +from typing import List, Optional import requests from kubernetes import client @@ -18,7 +18,7 @@ def get_server_egress_ip() -> str: return response.text.strip() -def resolve_hostname(hostname: str) -> list[str]: +def resolve_hostname(hostname: str) -> List[str]: """Resolve hostname to list of IP addresses.""" try: _, _, ips = socket.gethostbyname_ex(hostname) diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index 2febf6ad..e03b4bea 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -479,9 +479,7 @@ class ClusterInfo: if sanitized in seen: continue seen.add(sanitized) - abs_src = resolve_host_path_for_kind( - src, deployment_dir - ) + abs_src = resolve_host_path_for_kind(src, deployment_dir) data = self._read_host_path_source(abs_src, mount_string) cm = client.V1ConfigMap( metadata=client.V1ObjectMeta( @@ -495,9 +493,7 @@ class ClusterInfo: result.append(cm) return result - def _read_host_path_source( - self, 
abs_src: Path, mount_string: str - ) -> dict: + def _read_host_path_source(self, abs_src: Path, mount_string: str) -> dict: """Read file or flat-directory content for a host-path ConfigMap. Validates shape at read time as a defensive second check — the @@ -517,9 +513,7 @@ class ClusterInfo: for entry in abs_src.iterdir(): if entry.is_file(): with open(entry, "rb") as f: - data[entry.name] = base64.b64encode(f.read()).decode( - "ASCII" - ) + data[entry.name] = base64.b64encode(f.read()).decode("ASCII") return data def get_pvs(self): @@ -711,9 +705,7 @@ class ClusterInfo: volume_mounts = volume_mounts_for_service( parsed_yaml_map, service_name, - Path(self.spec.file_path).parent - if self.spec.file_path - else None, + Path(self.spec.file_path).parent if self.spec.file_path else None, ) # Handle command/entrypoint from compose file # In docker-compose: entrypoint -> k8s command, command -> k8s args @@ -1021,9 +1013,7 @@ class ClusterInfo: metadata=client.V1ObjectMeta( name=deployment_name, labels=self._stack_labels( - {"app.kubernetes.io/component": pod_name} - if multi_pod - else None + {"app.kubernetes.io/component": pod_name} if multi_pod else None ), ), spec=spec, @@ -1071,9 +1061,7 @@ class ClusterInfo: container_ports[container].add(port) if maintenance_svc and ":" in maintenance_svc: maint_container, maint_port_str = maintenance_svc.split(":", 1) - container_ports.setdefault(maint_container, set()).add( - int(maint_port_str) - ) + container_ports.setdefault(maint_container, set()).add(int(maint_port_str)) # Build map: pod_file -> set of service names in that pod pod_services_map: dict = {} diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 688c5696..ac4acf7e 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -219,10 +219,7 @@ class K8sDeployer(Deployer): ) self.core_api.create_namespace(body=ns) if opts.o.debug: - print( - f"Created namespace {self.k8s_namespace} " - f"owned by {my_dir}" - ) + print(f"Created namespace {self.k8s_namespace} " f"owned by {my_dir}") return annotations = (existing.metadata.annotations or {}) if existing.metadata else {} @@ -1025,9 +1022,7 @@ class K8sDeployer(Deployer): call_stack_deploy_start(self.deployment_context) - def down( - self, timeout, volumes, skip_cluster_management, delete_namespace=False - ): + def down(self, timeout, volumes, skip_cluster_management, delete_namespace=False): """Tear down stack-labeled resources. Phases: 1. Delete namespaced resources (if namespace still exists). 
@@ -1221,34 +1216,68 @@ class K8sDeployer(Deployer): listers = [] if namespace_present: listers += [ - ("deployment", lambda: self.apps_api.list_namespaced_deployment( - namespace=namespace, label_selector=selector)), - ("ingress", lambda: self.networking_api.list_namespaced_ingress( - namespace=namespace, label_selector=selector)), - ("job", lambda: self.batch_api.list_namespaced_job( - namespace=namespace, label_selector=selector)), - ("service", lambda: self.core_api.list_namespaced_service( - namespace=namespace, label_selector=selector)), - ("configmap", lambda: self.core_api.list_namespaced_config_map( - namespace=namespace, label_selector=selector)), - ("secret", lambda: self.core_api.list_namespaced_secret( - namespace=namespace, label_selector=selector)), - ("pod", lambda: self.core_api.list_namespaced_pod( - namespace=namespace, label_selector=selector)), + ( + "deployment", + lambda: self.apps_api.list_namespaced_deployment( + namespace=namespace, label_selector=selector + ), + ), + ( + "ingress", + lambda: self.networking_api.list_namespaced_ingress( + namespace=namespace, label_selector=selector + ), + ), + ( + "job", + lambda: self.batch_api.list_namespaced_job( + namespace=namespace, label_selector=selector + ), + ), + ( + "service", + lambda: self.core_api.list_namespaced_service( + namespace=namespace, label_selector=selector + ), + ), + ( + "configmap", + lambda: self.core_api.list_namespaced_config_map( + namespace=namespace, label_selector=selector + ), + ), + ( + "secret", + lambda: self.core_api.list_namespaced_secret( + namespace=namespace, label_selector=selector + ), + ), + ( + "pod", + lambda: self.core_api.list_namespaced_pod( + namespace=namespace, label_selector=selector + ), + ), ] if delete_volumes: listers.append( - ("persistentvolumeclaim", - lambda: self.core_api.list_namespaced_persistent_volume_claim( - namespace=namespace, label_selector=selector)) + ( + "persistentvolumeclaim", + lambda: self.core_api.list_namespaced_persistent_volume_claim( + namespace=namespace, label_selector=selector + ), + ) ) # PVs are cluster-scoped — wait for them even when the namespace # is already gone (orphaned from a prior --delete-namespace). 
if delete_volumes: listers.append( - ("persistentvolume", - lambda: self.core_api.list_persistent_volume( - label_selector=selector)) + ( + "persistentvolume", + lambda: self.core_api.list_persistent_volume( + label_selector=selector + ), + ) ) def remaining(): @@ -1276,8 +1305,7 @@ class K8sDeployer(Deployer): left = remaining() if left: print( - f"Warning: resources still present after {timeout_seconds}s: " - f"{left}" + f"Warning: resources still present after {timeout_seconds}s: " f"{left}" ) def status(self): diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 4cbf3270..682fe442 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -207,9 +207,7 @@ def _install_caddy_cert_backup( print("No kind-mount-root configured; caddy cert backup disabled") return manifest = os.path.abspath( - get_k8s_dir().joinpath( - "components", "ingress", "caddy-cert-backup.yaml" - ) + get_k8s_dir().joinpath("components", "ingress", "caddy-cert-backup.yaml") ) with open(manifest) as f: objects = list(yaml.safe_load_all(f)) @@ -233,9 +231,7 @@ def _parse_kind_extra_mounts(config_file: str) -> List[Dict[str, str]]: host_path = m.get("hostPath") container_path = m.get("containerPath") if host_path and container_path: - mounts.append( - {"hostPath": host_path, "containerPath": container_path} - ) + mounts.append({"hostPath": host_path, "containerPath": container_path}) return mounts @@ -500,12 +496,9 @@ def install_ingress_for_kind( continue if ( obj.get("kind") == "Deployment" - and obj.get("metadata", {}).get("name") - == "caddy-ingress-controller" + and obj.get("metadata", {}).get("name") == "caddy-ingress-controller" ): - for c in ( - obj["spec"]["template"]["spec"].get("containers") or [] - ): + for c in obj["spec"]["template"]["spec"].get("containers") or []: if c.get("name") == "caddy-ingress-controller": c["image"] = caddy_image if opts.o.debug: diff --git a/tests/k8s-deploy/run-deploy-test.sh b/tests/k8s-deploy/run-deploy-test.sh index 08a89c6a..7feb8038 100755 --- a/tests/k8s-deploy/run-deploy-test.sh +++ b/tests/k8s-deploy/run-deploy-test.sh @@ -413,14 +413,16 @@ if [ "$restored_value" != "$fake_cert_value" ]; then fi echo "caddy cert restore test: passed" -# Final teardown: --delete-namespace nukes the namespace after labeled cleanup. -# Verify the namespace is actually gone. +# Final teardown: --delete-namespace nukes the namespace, and +# --perform-cluster-management tears down the Kind cluster so the next test +# step in this CI workflow (e.g. run-restart-test.sh) starts from a clean +# host. 
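+# (run-restart-test.sh creates its own Kind cluster via 'deployment start
+# --perform-cluster-management', so no cluster may be left running here.)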
 $TEST_TARGET_SO deployment --dir $test_deployment_dir \
-  stop --delete-volumes --delete-namespace --skip-cluster-management
-if kubectl get namespace ${deployment_ns} >/dev/null 2>&1; then
-  echo "delete-namespace test: FAILED (namespace still present)"
+  stop --delete-volumes --delete-namespace --perform-cluster-management
+if kind get clusters 2>/dev/null | grep -q .; then
+  echo "cluster teardown test: FAILED (kind cluster still present)"
   exit 1
 fi
-echo "delete-namespace test: passed"
+echo "cluster teardown test: passed"

 echo "Test passed"
diff --git a/tests/k8s-deploy/run-restart-test.sh b/tests/k8s-deploy/run-restart-test.sh
new file mode 100755
index 00000000..c3298d01
--- /dev/null
+++ b/tests/k8s-deploy/run-restart-test.sh
@@ -0,0 +1,265 @@
+#!/usr/bin/env bash
+set -e
+if [ -n "$CERC_SCRIPT_DEBUG" ]; then
+  set -x
+  echo "Environment variables:"
+  env
+fi
+
+# Helper functions: TODO move into a separate file (mirrors run-deploy-test.sh:10).
+wait_for_pods_started () {
+  local dir=$1
+  for i in {1..50}
+  do
+    local ps_output=$( $TEST_TARGET_SO deployment --dir $dir ps )
+
+    if [[ "$ps_output" == *"Running containers:"* ]]; then
+      return
+    else
+      sleep 5
+    fi
+  done
+  echo "waiting for pods to start: FAILED"
+  cleanup_and_exit
+}
+
+# Multi-pod stacks aren't visible to 'deployment ps' (deploy_k8s.py:1366
+# filters by app_name-deployment substring, which doesn't match
+# laconic-<id>-<pod>-deployment-<hash> names). Wait via kubectl.
+wait_for_k8s_pods_ready () {
+  local ns=$1
+  local timeout=240
+  local waited=0
+  # First wait for at least one pod to appear in the namespace.
+  while [ $waited -lt $timeout ]; do
+    local count=$(kubectl get pods -n "$ns" --no-headers 2>/dev/null | wc -l)
+    if [ "$count" -gt 0 ]; then
+      break
+    fi
+    sleep 2
+    waited=$((waited + 2))
+  done
+  if ! kubectl wait --for=condition=Ready pod --all \
+      -n "$ns" --timeout=$((timeout - waited))s 2>&1; then
+    echo "kubectl wait pods ready: FAILED (ns=$ns)"
+    kubectl get pods -n "$ns" 2>&1 || true
+    kubectl describe pods -n "$ns" 2>&1 | tail -80 || true
+    cleanup_and_exit
+  fi
+}
+
+# Best-effort full teardown so CI runners don't leak namespaces/PVs/clusters
+# between runs. Variables may be unset depending on which phase tripped.
+cleanup_and_exit () {
+  if [ -n "$DEP1" ] && [ -d "$DEP1" ]; then
+    $TEST_TARGET_SO deployment --dir $DEP1 \
+      stop --delete-volumes --delete-namespace --skip-cluster-management || true
+  fi
+  if [ -n "$DEP2" ] && [ -d "$DEP2" ]; then
+    $TEST_TARGET_SO deployment --dir $DEP2 \
+      stop --delete-volumes --delete-namespace --perform-cluster-management || true
+  fi
+  exit 1
+}
+
+# Make a clone usable for `git commit` without touching the runner's global config.
+configure_git_identity () {
+  local repo_dir=$1
+  git -C $repo_dir config user.email "test@stack-orchestrator.test"
+  git -C $repo_dir config user.name "test"
+}
+
+TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
+echo "Testing this package: $TEST_TARGET_SO"
+
+WORK_DIR=~/stack-orchestrator-test/restart
+# Multi-repo pod working clones land here; resolved by get_plugin_code_paths.
+export CERC_REPO_BASE_DIR=$WORK_DIR/repo-base
+rm -rf $WORK_DIR
+mkdir -p $WORK_DIR $CERC_REPO_BASE_DIR
+
+# Source location of the test stacks shipped in this checkout. The test stages
+# them into a temp git repo so 'deployment restart' (which runs 'git pull' on
+# the stack source) has a real repo to pull from.
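+#
+# Staged layout built below (illustrative):
+#   $WORK_DIR/stack-single.git   bare repo — the remote that 'git pull' hits
+#   $WORK_DIR/stack-single/      working clone passed via --stack, containing
+#     stack-orchestrator/stacks/test-restart/{stack.yml,deploy/commands.py}
+#     stack-orchestrator/compose/docker-compose-test-restart.yml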
+DATA_DIR=stack_orchestrator/data
+
+# ============================================================================
+# Phase 1 — single-repo restart cycle. Verifies that:
+#  * deploy create copies commands.py into <deployment-dir>/hooks/
+#  * deployment start runs the copied start() hook
+#  * mutating the stack-source commands.py and running 'deployment restart'
+#    re-copies the new file into hooks/ and re-executes the new start()
+# ============================================================================
+echo "=== Phase 1: single-repo restart cycle ==="
+
+BARE1=$WORK_DIR/stack-single.git
+CLONE1=$WORK_DIR/stack-single
+git init -b main --bare $BARE1
+git clone $BARE1 $CLONE1
+configure_git_identity $CLONE1
+
+# External-stack layout: <repo-root>/stack-orchestrator/{stacks,compose}/...
+mkdir -p $CLONE1/stack-orchestrator/stacks $CLONE1/stack-orchestrator/compose
+cp -r $DATA_DIR/stacks/test-restart $CLONE1/stack-orchestrator/stacks/
+cp $DATA_DIR/compose/docker-compose-test-restart.yml $CLONE1/stack-orchestrator/compose/
+
+git -C $CLONE1 add .
+git -C $CLONE1 commit -m "test-restart v1"
+git -C $CLONE1 push -u origin main
+
+STACK_PATH_SINGLE=$CLONE1/stack-orchestrator/stacks/test-restart
+SPEC1=$WORK_DIR/spec-single.yml
+DEP1=$WORK_DIR/dep-single
+
+$TEST_TARGET_SO --stack $STACK_PATH_SINGLE deploy --deploy-to k8s-kind init --output $SPEC1
+$TEST_TARGET_SO --stack $STACK_PATH_SINGLE deploy create --spec-file $SPEC1 --deployment-dir $DEP1
+
+if [ ! -f "$DEP1/hooks/commands.py" ]; then
+  echo "single-repo deploy create test: FAILED (hooks/commands.py missing)"
+  cleanup_and_exit
+fi
+if ! grep -q '"v1"' "$DEP1/hooks/commands.py"; then
+  echo "single-repo deploy create test: FAILED (hooks/commands.py does not contain v1 marker)"
+  cleanup_and_exit
+fi
+echo "single-repo deploy create test: passed"
+
+$TEST_TARGET_SO deployment --dir $DEP1 start --perform-cluster-management
+wait_for_pods_started $DEP1
+
+# call_stack_deploy_start runs synchronously inside the start command
+# (deploy_k8s.py:1026), so the marker is on disk before 'start' returns.
+if [ ! -f "$DEP1/marker" ]; then
+  echo "single-repo start v1 test: FAILED (marker file missing)"
+  cleanup_and_exit
+fi
+marker_v1=$(cat $DEP1/marker)
+if [ "$marker_v1" != "v1" ]; then
+  echo "single-repo start v1 test: FAILED (got: $marker_v1)"
+  cleanup_and_exit
+fi
+echo "single-repo start v1 test: passed"
+
+# Mutate the stack-source working tree v1 -> v2. No commit needed: 'deployment
+# restart' runs 'git pull' against the bare repo, which is a no-op, and
+# _copy_hooks reads the working tree directly via get_plugin_code_paths.
+sed -i 's/"v1"/"v2"/' $STACK_PATH_SINGLE/deploy/commands.py
+
+$TEST_TARGET_SO deployment --dir $DEP1 restart --stack-path $STACK_PATH_SINGLE
+
+if ! grep -q '"v2"' "$DEP1/hooks/commands.py"; then
+  echo "single-repo restart re-copy test: FAILED (hooks/commands.py still v1)"
+  cleanup_and_exit
+fi
+echo "single-repo restart re-copy test: passed"
+
+marker_v2=$(cat $DEP1/marker)
+if [ "$marker_v2" != "v2" ]; then
+  echo "single-repo restart re-execute test: FAILED (got: $marker_v2)"
+  cleanup_and_exit
+fi
+echo "single-repo restart re-execute test: passed"
+
+# Stop phase 1 deployment but keep the cluster for phase 2.
+$TEST_TARGET_SO deployment --dir $DEP1 \
+  stop --delete-volumes --delete-namespace --skip-cluster-management
+
+# ============================================================================
+# Phase 2 — multi-repo create + start. Verifies that a stack with N pods, each
+# from a separate repo, produces hooks/commands_0.py ... commands_{N-1}.py and
+# that call_stack_deploy_start invokes every module's start().
+# ============================================================================
+echo "=== Phase 2: multi-repo create + start ==="
+
+# Pod repos: stack.yml's pods[].repository = 'test-restart-pod-X'
+# resolves (via get_plugin_code_paths) to
+# $CERC_REPO_BASE_DIR/test-restart-pod-X/<path>/stack/...
+for label in a b; do
+  POD_BARE=$WORK_DIR/pod-$label.git
+  POD_CLONE=$CERC_REPO_BASE_DIR/test-restart-pod-$label
+  git init -b main --bare $POD_BARE
+  git clone $POD_BARE $POD_CLONE
+  configure_git_identity $POD_CLONE
+  mkdir -p $POD_CLONE/stack/deploy
+  # For dict-form pods, get_pod_file_path resolves the compose file at
+  # <repo-dir>/<path>/docker-compose.yml — owned by the pod repo, not
+  # the stack repo. get_plugin_code_paths adds the trailing 'stack/', so
+  # commands.py lives at <repo-dir>/<path>/stack/deploy/commands.py.
+  cat > $POD_CLONE/docker-compose.yml <<EOF
…
+  cat > $POD_CLONE/stack/deploy/commands.py <<EOF