From 7c65d39bb214acf5dd7ab48a2a6bd89a5f3503db Mon Sep 17 00:00:00 2001
From: prathamesh0 <42446521+prathamesh0@users.noreply.github.com>
Date: Tue, 28 Apr 2026 17:28:02 +0530
Subject: [PATCH] Make deployments self-sufficient and add E2E restart test
 (#750)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- `deploy create` now copies each pod's `commands.py` into
  `<deployment-dir>/hooks/`. `call_stack_deploy_start` loads from there, so
  `deployment start` / `restart` no longer need the live stack source on disk
  to run the `start()` hook
- Only the `start()` hook is affected. `init`, `setup`, and `create` still
  load from the live source — they only run at `deploy create` time, when the
  source is guaranteed to be present
- Multi-repo stacks produce `hooks/commands_0.py`, `hooks/commands_1.py`, …;
  `call_stack_deploy_start` loads them all in sorted order
- Adds `tests/k8s-deploy/run-restart-test.sh` covering the full single-repo
  restart cycle (v1 -> mutate working tree -> `restart` re-copies and
  re-executes v2) and the multi-repo file-naming + multi-hook invocation.
  Wired into the existing **K8s Deploy Test** workflow
---
 .github/workflows/test-k8s-deploy.yml         |   3 +
 .pebbles/events.jsonl                         |   2 +
 .../compose/docker-compose-test-restart.yml   |   5 +
 .../data/stacks/test-restart-multi/README.md  |  14 +
 .../data/stacks/test-restart-multi/stack.yml  |  10 +
 .../data/stacks/test-restart/README.md        |  15 +
 .../stacks/test-restart/deploy/commands.py    |  32 +++
 .../data/stacks/test-restart/stack.yml        |   5 +
 .../deploy/deployment_context.py              |   4 +-
 .../deploy/deployment_create.py               |  66 +++--
 stack_orchestrator/deploy/dns_probe.py        |   4 +-
 stack_orchestrator/deploy/k8s/cluster_info.py |  24 +-
 stack_orchestrator/deploy/k8s/deploy_k8s.py   |  86 ++++--
 stack_orchestrator/deploy/k8s/helpers.py      |  15 +-
 tests/k8s-deploy/run-deploy-test.sh           |  14 +-
 tests/k8s-deploy/run-restart-test.sh          | 265 ++++++++++++++++++
 16 files changed, 478 insertions(+), 86 deletions(-)
 create mode 100644 stack_orchestrator/data/compose/docker-compose-test-restart.yml
 create mode 100644 stack_orchestrator/data/stacks/test-restart-multi/README.md
 create mode 100644 stack_orchestrator/data/stacks/test-restart-multi/stack.yml
 create mode 100644 stack_orchestrator/data/stacks/test-restart/README.md
 create mode 100644 stack_orchestrator/data/stacks/test-restart/deploy/commands.py
 create mode 100644 stack_orchestrator/data/stacks/test-restart/stack.yml
 create mode 100755 tests/k8s-deploy/run-restart-test.sh

diff --git a/.github/workflows/test-k8s-deploy.yml b/.github/workflows/test-k8s-deploy.yml
index aeffa467..62ce4a42 100644
--- a/.github/workflows/test-k8s-deploy.yml
+++ b/.github/workflows/test-k8s-deploy.yml
@@ -16,6 +16,7 @@ on:
       - '.github/workflows/triggers/test-k8s-deploy'
       - '.github/workflows/test-k8s-deploy.yml'
       - 'tests/k8s-deploy/run-deploy-test.sh'
+      - 'tests/k8s-deploy/run-restart-test.sh'
   schedule:
     - cron: '3 15 * * *'

@@ -46,3 +47,5 @@ jobs:
         run: ./tests/scripts/install-kubectl.sh
       - name: "Run k8s deployment test"
         run: ./tests/k8s-deploy/run-deploy-test.sh
+      - name: "Run restart k8s deployment test"
+        run: ./tests/k8s-deploy/run-restart-test.sh
diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl
index ece11c09..1d04bb8a 100644
--- a/.pebbles/events.jsonl
+++ b/.pebbles/events.jsonl
@@ -54,3 +54,5 @@
 {"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}}
 {"type":"update","timestamp":"2026-04-21T09:00:47.364859946Z","issue_id":"so-p3p","payload":{"description":"## Problem\n\nThe Caddy ingress controller image is hardcoded in `ingress-caddy-kind-deploy.yaml`, with no mechanism to update it short of cluster recreation or manual `kubectl patch`. laconic-so should: (1) allow spec.yml to specify a Caddy image, (2) support updating the Caddy image as part of `deployment start`, (3) set `strategy: Recreate` on the Caddy Deployment since hostPort pods can't rolling-update.\n\n## Resolution\n\n- New spec key `caddy-ingress-image`. Fresh install uses it (fallback: manifest default). On subsequent `deployment start`, if the spec key is set and the running Caddy image differs, SO patches the Deployment and waits for rollout.\n- Spec key absent =\u003e SO does **not** touch a running Caddy, to avoid silently reverting images set out-of-band (ansible playbook, another deployment's spec).\n- `strategy: Recreate` added to the Caddy Deployment manifest.\n- Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a plain k8s-API patch, not a cluster lifecycle op).\n- Image substitution locates the container by name instead of string-matching the shipped default, so the spec override wins regardless of what the manifest hardcodes.\n- Cluster-scoped caveat: `caddy-system` is shared across deployments; last `deployment start` that sets the key wins for everyone. Documented in `deployment_patterns.md`."}}
 {"type":"status_update","timestamp":"2026-04-21T09:00:47.745675131Z","issue_id":"so-p3p","payload":{"status":"closed"}}
+{"type":"comment","timestamp":"2026-04-27T13:41:16.962883653Z","issue_id":"so-078","payload":{"body":"Fixed. deploy create now copies commands.py into deployment_dir/hooks/. call_stack_deploy_start loads hooks from the deployment dir instead of resolving via get_stack_path, so deployment start no longer requires the stack repo to be present or cwd to be correct."}}
+{"type":"close","timestamp":"2026-04-27T13:41:17.073012545Z","issue_id":"so-078","payload":{}}
diff --git a/stack_orchestrator/data/compose/docker-compose-test-restart.yml b/stack_orchestrator/data/compose/docker-compose-test-restart.yml
new file mode 100644
index 00000000..e5f69565
--- /dev/null
+++ b/stack_orchestrator/data/compose/docker-compose-test-restart.yml
@@ -0,0 +1,5 @@
+services:
+  test-restart:
+    image: busybox:1.36
+    command: ["sh", "-c", "echo started && sleep infinity"]
+    restart: always
diff --git a/stack_orchestrator/data/stacks/test-restart-multi/README.md b/stack_orchestrator/data/stacks/test-restart-multi/README.md
new file mode 100644
index 00000000..6a638182
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart-multi/README.md
@@ -0,0 +1,14 @@
+# test-restart-multi
+
+E2E test stack used by `tests/k8s-deploy/run-restart-test.sh` to cover the
+multi-repo case: `pods:` references two pod repos, each shipping its own
+`deploy/commands.py`. `deploy create` should produce
+`<deployment-dir>/hooks/commands_0.py` and `<deployment-dir>/hooks/commands_1.py`,
+and `deployment start` should invoke both `start()` hooks (each writes its
+own marker file so neither overwrites the other).
+
+The pod repos themselves are created by the test script as bare-repo +
+working-clone pairs under `$CERC_REPO_BASE_DIR/test-restart-pod-{a,b}`;
+they are not committed to this repository. Each pod repo ships its own
+`docker-compose.yml` (resolved by `get_pod_file_path` for dict-form pods)
+and `stack/deploy/commands.py` — the stack repo only owns `stack.yml`.
diff --git a/stack_orchestrator/data/stacks/test-restart-multi/stack.yml b/stack_orchestrator/data/stacks/test-restart-multi/stack.yml
new file mode 100644
index 00000000..431ebf14
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart-multi/stack.yml
@@ -0,0 +1,10 @@
+version: "1.0"
+name: test-restart-multi
+description: "E2E test stack for the deployment restart command (multi-repo case)"
+pods:
+  - name: test-restart-multi-a
+    repository: test-restart-pod-a
+    path: .
+  - name: test-restart-multi-b
+    repository: test-restart-pod-b
+    path: .
diff --git a/stack_orchestrator/data/stacks/test-restart/README.md b/stack_orchestrator/data/stacks/test-restart/README.md
new file mode 100644
index 00000000..73cdb1fb
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart/README.md
@@ -0,0 +1,15 @@
+# test-restart
+
+E2E test stack used by `tests/k8s-deploy/run-restart-test.sh`.
+
+The stack ships a small `start()` hook that writes a versioned marker file
+into the deployment directory. The test exercises `deployment restart`:
+
+1. `deploy create` → asserts `commands.py` was copied into `<deployment-dir>/hooks/`.
+2. `deployment start` → asserts the marker file contains the v1 string.
+3. Modifies `commands.py` in the stack-source working tree (v1 → v2).
+4. `deployment restart` → asserts the new `commands.py` was re-copied into
+   `<deployment-dir>/hooks/` and the marker file now contains the v2 string.
+
+The pod uses a public `busybox` image that just sleeps; the marker file is
+the only thing under test.
diff --git a/stack_orchestrator/data/stacks/test-restart/deploy/commands.py b/stack_orchestrator/data/stacks/test-restart/deploy/commands.py
new file mode 100644
index 00000000..eaa15860
--- /dev/null
+++ b/stack_orchestrator/data/stacks/test-restart/deploy/commands.py
@@ -0,0 +1,32 @@
+# Copyright © 2026 Vulcanize

+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.

+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU Affero General Public License for more details.

+# You should have received a copy of the GNU Affero General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.

+from stack_orchestrator.util import get_yaml
+from stack_orchestrator.deploy.deployment_context import DeploymentContext

+default_spec_file_content = ""


+def init(command_context):
+    return get_yaml().load(default_spec_file_content)


+def start(deployment_context: DeploymentContext):
+    # Writes a marker file the e2e test asserts on. The test flips the
+    # literal below from "v1" to "v2" in the stack-source working tree
+    # before running 'deployment restart' to verify the updated hook is
+    # copied into deployment_dir/hooks/ and re-executed.
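+    #
+    # At runtime this module is loaded from <deployment-dir>/hooks/, not from
+    # the stack source: call_stack_deploy_start (see deployment_create.py in
+    # this patch) globs hooks/commands*.py and calls each module's
+    # start(deployment_context) in sorted order.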
+ marker = deployment_context.deployment_dir / "marker" + marker.write_text("v1") diff --git a/stack_orchestrator/data/stacks/test-restart/stack.yml b/stack_orchestrator/data/stacks/test-restart/stack.yml new file mode 100644 index 00000000..ccb394f6 --- /dev/null +++ b/stack_orchestrator/data/stacks/test-restart/stack.yml @@ -0,0 +1,5 @@ +version: "1.0" +name: test-restart +description: "E2E test stack for the deployment restart command" +pods: + - test-restart diff --git a/stack_orchestrator/deploy/deployment_context.py b/stack_orchestrator/deploy/deployment_context.py index 1776699e..fb475589 100644 --- a/stack_orchestrator/deploy/deployment_context.py +++ b/stack_orchestrator/deploy/deployment_context.py @@ -83,9 +83,7 @@ class DeploymentContext: # Fallback to cluster-id for deployments created before the # deployment-id field was introduced. Keeps existing resource # names stable across this upgrade. - self.deployment_id = obj.get( - constants.deployment_id_key, self.id - ) + self.deployment_id = obj.get(constants.deployment_id_key, self.id) # Handle the case of a legacy deployment with no file # Code below is intended to match the output from _make_default_cluster_name() # TODO: remove when we no longer need to support legacy deployments diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index fd7ec4f1..efdd2efb 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -276,16 +276,17 @@ def call_stack_deploy_start(deployment_context): create additional k8s resources (Services, etc.) in the deployment namespace. The namespace can be derived as f"laconic-{deployment_context.id}". """ - python_file_paths = _commands_plugin_paths(deployment_context.stack.name) - for python_file_path in python_file_paths: - if python_file_path.exists(): - spec = util.spec_from_file_location("commands", python_file_path) - if spec is None or spec.loader is None: - continue - imported_stack = util.module_from_spec(spec) - spec.loader.exec_module(imported_stack) - if _has_method(imported_stack, "start"): - imported_stack.start(deployment_context) + hooks_dir = deployment_context.deployment_dir / "hooks" + if not hooks_dir.exists(): + return + for python_file_path in sorted(hooks_dir.glob("commands*.py")): + spec = util.spec_from_file_location("commands", python_file_path) + if spec is None or spec.loader is None: + continue + imported_stack = util.module_from_spec(spec) + spec.loader.exec_module(imported_stack) + if _has_method(imported_stack, "start"): + imported_stack.start(deployment_context) # Inspect the pod yaml to find config files referenced in subdirectories @@ -379,9 +380,7 @@ def _validate_host_path_mounts(parsed_pod_file, pod_name, pod_file_path): "content at runtime.\n\n" "See docs/deployment_patterns.md." ) - total = sum( - p.stat().st_size for p in entries if p.is_file() - ) + total = sum(p.stat().st_size for p in entries if p.is_file()) if total > _HOST_PATH_CONFIGMAP_BUDGET_BYTES: raise DeployerException( f"Directory host-path bind '{volume_str}' in " @@ -1111,6 +1110,37 @@ def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]] safe_copy_file(src_path, dst_path) +def _copy_hooks(stack_name: str, target_dir: Path): + """Copy commands.py hooks into deployment_dir/hooks/ for self-sufficiency. + + Single repo: hooks/commands.py + Multi-repo: hooks/commands_0.py, hooks/commands_1.py, ... — indexed by + plugin path order. 
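+
+    Example (hypothetical stack with two plugin repos):
+        get_plugin_code_paths(stack_name) -> [repo_a, repo_b]
+        repo_a/deploy/commands.py -> hooks/commands_0.py
+        repo_b/deploy/commands.py -> hooks/commands_1.py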
+ + Note: the whole commands.py file is copied (init/setup/create/start), but + at runtime only call_stack_deploy_start loads from this copied location. + call_stack_deploy_init, call_stack_deploy_setup, and call_stack_deploy_create + still resolve commands.py from the live stack source via + get_plugin_code_paths — they only run at deploy create time when the source + is guaranteed to be present, so they don't need to be self-sufficient. + """ + plugin_paths = get_plugin_code_paths(stack_name) + sources = [ + p.joinpath("deploy", "commands.py") + for p in plugin_paths + if p.joinpath("deploy", "commands.py").exists() + ] + if not sources: + return + hooks_dir = target_dir / "hooks" + hooks_dir.mkdir(exist_ok=True) + if len(sources) == 1: + copyfile(sources[0], hooks_dir / "commands.py") + else: + for i, src in enumerate(sources): + copyfile(src, hooks_dir / f"commands_{i}.py") + + def _write_deployment_files( target_dir: Path, spec_file: Path, @@ -1138,6 +1168,8 @@ def _write_deployment_files( copyfile(spec_file, target_dir.joinpath(constants.spec_file_name)) copyfile(stack_file, target_dir.joinpath(constants.stack_file_name)) + _copy_hooks(stack_name, target_dir) + # Create deployment file if requested if include_deployment_file: _create_deployment_file(target_dir, stack_source=stack_source) @@ -1249,7 +1281,9 @@ def _write_deployment_files( else: source_config_dir = resolve_config_dir(stack_name, configmap_name) if os.path.exists(source_config_dir): - destination_config_dir = target_dir.joinpath("configmaps", configmap_name) + destination_config_dir = target_dir.joinpath( + "configmaps", configmap_name + ) copytree(source_config_dir, destination_config_dir, dirs_exist_ok=True) # Copy the job files into the target dir @@ -1262,9 +1296,7 @@ def _write_deployment_files( if job_file_path and job_file_path.exists(): parsed_job_file = yaml.load(open(job_file_path, "r")) if parsed_spec.is_kubernetes_deployment(): - _validate_host_path_mounts( - parsed_job_file, job, job_file_path - ) + _validate_host_path_mounts(parsed_job_file, job, job_file_path) _fixup_pod_file(parsed_job_file, parsed_spec, destination_compose_dir) with open( destination_compose_jobs_dir.joinpath( diff --git a/stack_orchestrator/deploy/dns_probe.py b/stack_orchestrator/deploy/dns_probe.py index e04b4ea2..6363a72e 100644 --- a/stack_orchestrator/deploy/dns_probe.py +++ b/stack_orchestrator/deploy/dns_probe.py @@ -6,7 +6,7 @@ import secrets import socket import time -from typing import Optional +from typing import List, Optional import requests from kubernetes import client @@ -18,7 +18,7 @@ def get_server_egress_ip() -> str: return response.text.strip() -def resolve_hostname(hostname: str) -> list[str]: +def resolve_hostname(hostname: str) -> List[str]: """Resolve hostname to list of IP addresses.""" try: _, _, ips = socket.gethostbyname_ex(hostname) diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index 2febf6ad..e03b4bea 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -479,9 +479,7 @@ class ClusterInfo: if sanitized in seen: continue seen.add(sanitized) - abs_src = resolve_host_path_for_kind( - src, deployment_dir - ) + abs_src = resolve_host_path_for_kind(src, deployment_dir) data = self._read_host_path_source(abs_src, mount_string) cm = client.V1ConfigMap( metadata=client.V1ObjectMeta( @@ -495,9 +493,7 @@ class ClusterInfo: result.append(cm) return result - def _read_host_path_source( - self, 
abs_src: Path, mount_string: str - ) -> dict: + def _read_host_path_source(self, abs_src: Path, mount_string: str) -> dict: """Read file or flat-directory content for a host-path ConfigMap. Validates shape at read time as a defensive second check — the @@ -517,9 +513,7 @@ class ClusterInfo: for entry in abs_src.iterdir(): if entry.is_file(): with open(entry, "rb") as f: - data[entry.name] = base64.b64encode(f.read()).decode( - "ASCII" - ) + data[entry.name] = base64.b64encode(f.read()).decode("ASCII") return data def get_pvs(self): @@ -711,9 +705,7 @@ class ClusterInfo: volume_mounts = volume_mounts_for_service( parsed_yaml_map, service_name, - Path(self.spec.file_path).parent - if self.spec.file_path - else None, + Path(self.spec.file_path).parent if self.spec.file_path else None, ) # Handle command/entrypoint from compose file # In docker-compose: entrypoint -> k8s command, command -> k8s args @@ -1021,9 +1013,7 @@ class ClusterInfo: metadata=client.V1ObjectMeta( name=deployment_name, labels=self._stack_labels( - {"app.kubernetes.io/component": pod_name} - if multi_pod - else None + {"app.kubernetes.io/component": pod_name} if multi_pod else None ), ), spec=spec, @@ -1071,9 +1061,7 @@ class ClusterInfo: container_ports[container].add(port) if maintenance_svc and ":" in maintenance_svc: maint_container, maint_port_str = maintenance_svc.split(":", 1) - container_ports.setdefault(maint_container, set()).add( - int(maint_port_str) - ) + container_ports.setdefault(maint_container, set()).add(int(maint_port_str)) # Build map: pod_file -> set of service names in that pod pod_services_map: dict = {} diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 688c5696..ac4acf7e 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -219,10 +219,7 @@ class K8sDeployer(Deployer): ) self.core_api.create_namespace(body=ns) if opts.o.debug: - print( - f"Created namespace {self.k8s_namespace} " - f"owned by {my_dir}" - ) + print(f"Created namespace {self.k8s_namespace} " f"owned by {my_dir}") return annotations = (existing.metadata.annotations or {}) if existing.metadata else {} @@ -1025,9 +1022,7 @@ class K8sDeployer(Deployer): call_stack_deploy_start(self.deployment_context) - def down( - self, timeout, volumes, skip_cluster_management, delete_namespace=False - ): + def down(self, timeout, volumes, skip_cluster_management, delete_namespace=False): """Tear down stack-labeled resources. Phases: 1. Delete namespaced resources (if namespace still exists). 
@@ -1221,34 +1216,68 @@ class K8sDeployer(Deployer): listers = [] if namespace_present: listers += [ - ("deployment", lambda: self.apps_api.list_namespaced_deployment( - namespace=namespace, label_selector=selector)), - ("ingress", lambda: self.networking_api.list_namespaced_ingress( - namespace=namespace, label_selector=selector)), - ("job", lambda: self.batch_api.list_namespaced_job( - namespace=namespace, label_selector=selector)), - ("service", lambda: self.core_api.list_namespaced_service( - namespace=namespace, label_selector=selector)), - ("configmap", lambda: self.core_api.list_namespaced_config_map( - namespace=namespace, label_selector=selector)), - ("secret", lambda: self.core_api.list_namespaced_secret( - namespace=namespace, label_selector=selector)), - ("pod", lambda: self.core_api.list_namespaced_pod( - namespace=namespace, label_selector=selector)), + ( + "deployment", + lambda: self.apps_api.list_namespaced_deployment( + namespace=namespace, label_selector=selector + ), + ), + ( + "ingress", + lambda: self.networking_api.list_namespaced_ingress( + namespace=namespace, label_selector=selector + ), + ), + ( + "job", + lambda: self.batch_api.list_namespaced_job( + namespace=namespace, label_selector=selector + ), + ), + ( + "service", + lambda: self.core_api.list_namespaced_service( + namespace=namespace, label_selector=selector + ), + ), + ( + "configmap", + lambda: self.core_api.list_namespaced_config_map( + namespace=namespace, label_selector=selector + ), + ), + ( + "secret", + lambda: self.core_api.list_namespaced_secret( + namespace=namespace, label_selector=selector + ), + ), + ( + "pod", + lambda: self.core_api.list_namespaced_pod( + namespace=namespace, label_selector=selector + ), + ), ] if delete_volumes: listers.append( - ("persistentvolumeclaim", - lambda: self.core_api.list_namespaced_persistent_volume_claim( - namespace=namespace, label_selector=selector)) + ( + "persistentvolumeclaim", + lambda: self.core_api.list_namespaced_persistent_volume_claim( + namespace=namespace, label_selector=selector + ), + ) ) # PVs are cluster-scoped — wait for them even when the namespace # is already gone (orphaned from a prior --delete-namespace). 
if delete_volumes: listers.append( - ("persistentvolume", - lambda: self.core_api.list_persistent_volume( - label_selector=selector)) + ( + "persistentvolume", + lambda: self.core_api.list_persistent_volume( + label_selector=selector + ), + ) ) def remaining(): @@ -1276,8 +1305,7 @@ class K8sDeployer(Deployer): left = remaining() if left: print( - f"Warning: resources still present after {timeout_seconds}s: " - f"{left}" + f"Warning: resources still present after {timeout_seconds}s: " f"{left}" ) def status(self): diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 4cbf3270..682fe442 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -207,9 +207,7 @@ def _install_caddy_cert_backup( print("No kind-mount-root configured; caddy cert backup disabled") return manifest = os.path.abspath( - get_k8s_dir().joinpath( - "components", "ingress", "caddy-cert-backup.yaml" - ) + get_k8s_dir().joinpath("components", "ingress", "caddy-cert-backup.yaml") ) with open(manifest) as f: objects = list(yaml.safe_load_all(f)) @@ -233,9 +231,7 @@ def _parse_kind_extra_mounts(config_file: str) -> List[Dict[str, str]]: host_path = m.get("hostPath") container_path = m.get("containerPath") if host_path and container_path: - mounts.append( - {"hostPath": host_path, "containerPath": container_path} - ) + mounts.append({"hostPath": host_path, "containerPath": container_path}) return mounts @@ -500,12 +496,9 @@ def install_ingress_for_kind( continue if ( obj.get("kind") == "Deployment" - and obj.get("metadata", {}).get("name") - == "caddy-ingress-controller" + and obj.get("metadata", {}).get("name") == "caddy-ingress-controller" ): - for c in ( - obj["spec"]["template"]["spec"].get("containers") or [] - ): + for c in obj["spec"]["template"]["spec"].get("containers") or []: if c.get("name") == "caddy-ingress-controller": c["image"] = caddy_image if opts.o.debug: diff --git a/tests/k8s-deploy/run-deploy-test.sh b/tests/k8s-deploy/run-deploy-test.sh index 08a89c6a..7feb8038 100755 --- a/tests/k8s-deploy/run-deploy-test.sh +++ b/tests/k8s-deploy/run-deploy-test.sh @@ -413,14 +413,16 @@ if [ "$restored_value" != "$fake_cert_value" ]; then fi echo "caddy cert restore test: passed" -# Final teardown: --delete-namespace nukes the namespace after labeled cleanup. -# Verify the namespace is actually gone. +# Final teardown: --delete-namespace nukes the namespace, and +# --perform-cluster-management tears down the Kind cluster so the next test +# step in this CI workflow (e.g. run-restart-test.sh) starts from a clean +# host. 
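+# (run-restart-test.sh creates its own Kind cluster via 'deployment start
+# --perform-cluster-management', so no cluster may be left running here.)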
 $TEST_TARGET_SO deployment --dir $test_deployment_dir \
-  stop --delete-volumes --delete-namespace --skip-cluster-management
-if kubectl get namespace ${deployment_ns} >/dev/null 2>&1; then
-  echo "delete-namespace test: FAILED (namespace still present)"
+  stop --delete-volumes --delete-namespace --perform-cluster-management
+if kind get clusters 2>/dev/null | grep -q .; then
+  echo "cluster teardown test: FAILED (kind cluster still present)"
   exit 1
 fi
-echo "delete-namespace test: passed"
+echo "cluster teardown test: passed"

 echo "Test passed"
diff --git a/tests/k8s-deploy/run-restart-test.sh b/tests/k8s-deploy/run-restart-test.sh
new file mode 100755
index 00000000..c3298d01
--- /dev/null
+++ b/tests/k8s-deploy/run-restart-test.sh
@@ -0,0 +1,265 @@
+#!/usr/bin/env bash
+set -e
+if [ -n "$CERC_SCRIPT_DEBUG" ]; then
+  set -x
+  echo "Environment variables:"
+  env
+fi
+
+# Helper functions: TODO move into a separate file (mirrors run-deploy-test.sh:10).
+wait_for_pods_started () {
+  local dir=$1
+  for i in {1..50}
+  do
+    local ps_output=$( $TEST_TARGET_SO deployment --dir $dir ps )
+
+    if [[ "$ps_output" == *"Running containers:"* ]]; then
+      return
+    else
+      sleep 5
+    fi
+  done
+  echo "waiting for pods to start: FAILED"
+  cleanup_and_exit
+}
+
+# Multi-pod stacks aren't visible to 'deployment ps' (deploy_k8s.py:1366
+# filters by app_name-deployment substring, which doesn't match
+# laconic-<id>-<pod>-deployment-<hash> names). Wait via kubectl.
+wait_for_k8s_pods_ready () {
+  local ns=$1
+  local timeout=240
+  local waited=0
+  # First wait for at least one pod to appear in the namespace.
+  while [ $waited -lt $timeout ]; do
+    local count=$(kubectl get pods -n "$ns" --no-headers 2>/dev/null | wc -l)
+    if [ "$count" -gt 0 ]; then
+      break
+    fi
+    sleep 2
+    waited=$((waited + 2))
+  done
+  if ! kubectl wait --for=condition=Ready pod --all \
+      -n "$ns" --timeout=$((timeout - waited))s 2>&1; then
+    echo "kubectl wait pods ready: FAILED (ns=$ns)"
+    kubectl get pods -n "$ns" 2>&1 || true
+    kubectl describe pods -n "$ns" 2>&1 | tail -80 || true
+    cleanup_and_exit
+  fi
+}
+
+# Best-effort full teardown so CI runners don't leak namespaces/PVs/clusters
+# between runs. Variables may be unset depending on which phase tripped.
+cleanup_and_exit () {
+  if [ -n "$DEP1" ] && [ -d "$DEP1" ]; then
+    $TEST_TARGET_SO deployment --dir $DEP1 \
+      stop --delete-volumes --delete-namespace --skip-cluster-management || true
+  fi
+  if [ -n "$DEP2" ] && [ -d "$DEP2" ]; then
+    $TEST_TARGET_SO deployment --dir $DEP2 \
+      stop --delete-volumes --delete-namespace --perform-cluster-management || true
+  fi
+  exit 1
+}
+
+# Make a clone usable for `git commit` without touching the runner's global config.
+configure_git_identity () {
+  local repo_dir=$1
+  git -C $repo_dir config user.email "test@stack-orchestrator.test"
+  git -C $repo_dir config user.name "test"
+}
+
+TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
+echo "Testing this package: $TEST_TARGET_SO"
+
+WORK_DIR=~/stack-orchestrator-test/restart
+# Multi-repo pod working clones land here; resolved by get_plugin_code_paths.
+export CERC_REPO_BASE_DIR=$WORK_DIR/repo-base
+rm -rf $WORK_DIR
+mkdir -p $WORK_DIR $CERC_REPO_BASE_DIR
+
+# Source location of the test stacks shipped in this checkout. The test stages
+# them into a temp git repo so 'deployment restart' (which runs 'git pull' on
+# the stack source) has a real repo to pull from.
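+#
+# Staged layout built below (illustrative):
+#   $WORK_DIR/stack-single.git   bare repo — the remote that 'git pull' hits
+#   $WORK_DIR/stack-single/      working clone passed via --stack, containing
+#     stack-orchestrator/stacks/test-restart/{stack.yml,deploy/commands.py}
+#     stack-orchestrator/compose/docker-compose-test-restart.yml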
+DATA_DIR=stack_orchestrator/data
+
+# ============================================================================
+# Phase 1 — single-repo restart cycle. Verifies that:
+#  * deploy create copies commands.py into <deployment-dir>/hooks/
+#  * deployment start runs the copied start() hook
+#  * mutating the stack-source commands.py and running 'deployment restart'
+#    re-copies the new file into hooks/ and re-executes the new start()
+# ============================================================================
+echo "=== Phase 1: single-repo restart cycle ==="
+
+BARE1=$WORK_DIR/stack-single.git
+CLONE1=$WORK_DIR/stack-single
+git init -b main --bare $BARE1
+git clone $BARE1 $CLONE1
+configure_git_identity $CLONE1
+
+# External-stack layout: <repo-root>/stack-orchestrator/{stacks,compose}/...
+mkdir -p $CLONE1/stack-orchestrator/stacks $CLONE1/stack-orchestrator/compose
+cp -r $DATA_DIR/stacks/test-restart $CLONE1/stack-orchestrator/stacks/
+cp $DATA_DIR/compose/docker-compose-test-restart.yml $CLONE1/stack-orchestrator/compose/
+
+git -C $CLONE1 add .
+git -C $CLONE1 commit -m "test-restart v1"
+git -C $CLONE1 push -u origin main
+
+STACK_PATH_SINGLE=$CLONE1/stack-orchestrator/stacks/test-restart
+SPEC1=$WORK_DIR/spec-single.yml
+DEP1=$WORK_DIR/dep-single
+
+$TEST_TARGET_SO --stack $STACK_PATH_SINGLE deploy --deploy-to k8s-kind init --output $SPEC1
+$TEST_TARGET_SO --stack $STACK_PATH_SINGLE deploy create --spec-file $SPEC1 --deployment-dir $DEP1
+
+if [ ! -f "$DEP1/hooks/commands.py" ]; then
+  echo "single-repo deploy create test: FAILED (hooks/commands.py missing)"
+  cleanup_and_exit
+fi
+if ! grep -q '"v1"' "$DEP1/hooks/commands.py"; then
+  echo "single-repo deploy create test: FAILED (hooks/commands.py does not contain v1 marker)"
+  cleanup_and_exit
+fi
+echo "single-repo deploy create test: passed"
+
+$TEST_TARGET_SO deployment --dir $DEP1 start --perform-cluster-management
+wait_for_pods_started $DEP1
+
+# call_stack_deploy_start runs synchronously inside the start command
+# (deploy_k8s.py:1026), so the marker is on disk before 'start' returns.
+if [ ! -f "$DEP1/marker" ]; then
+  echo "single-repo start v1 test: FAILED (marker file missing)"
+  cleanup_and_exit
+fi
+marker_v1=$(cat $DEP1/marker)
+if [ "$marker_v1" != "v1" ]; then
+  echo "single-repo start v1 test: FAILED (got: $marker_v1)"
+  cleanup_and_exit
+fi
+echo "single-repo start v1 test: passed"
+
+# Mutate the stack-source working tree v1 -> v2. No commit needed: 'deployment
+# restart' runs 'git pull' against the bare repo, which is a no-op, and
+# _copy_hooks reads the working tree directly via get_plugin_code_paths.
+sed -i 's/"v1"/"v2"/' $STACK_PATH_SINGLE/deploy/commands.py
+
+$TEST_TARGET_SO deployment --dir $DEP1 restart --stack-path $STACK_PATH_SINGLE
+
+if ! grep -q '"v2"' "$DEP1/hooks/commands.py"; then
+  echo "single-repo restart re-copy test: FAILED (hooks/commands.py still v1)"
+  cleanup_and_exit
+fi
+echo "single-repo restart re-copy test: passed"
+
+marker_v2=$(cat $DEP1/marker)
+if [ "$marker_v2" != "v2" ]; then
+  echo "single-repo restart re-execute test: FAILED (got: $marker_v2)"
+  cleanup_and_exit
+fi
+echo "single-repo restart re-execute test: passed"
+
+# Stop phase 1 deployment but keep the cluster for phase 2.
+$TEST_TARGET_SO deployment --dir $DEP1 \
+  stop --delete-volumes --delete-namespace --skip-cluster-management
+
+# ============================================================================
+# Phase 2 — multi-repo create + start. Verifies that a stack with N pods, each
+# from a separate repo, produces hooks/commands_0.py ... commands_{N-1}.py and
+# that call_stack_deploy_start invokes every module's start().
+# ============================================================================
+echo "=== Phase 2: multi-repo create + start ==="
+
+# Pod repos: stack.yml's pods[].repository = 'test-restart-pod-X'
+# resolves (via get_plugin_code_paths) to
+# $CERC_REPO_BASE_DIR/test-restart-pod-X/<path>/stack/...
+for label in a b; do
+  POD_BARE=$WORK_DIR/pod-$label.git
+  POD_CLONE=$CERC_REPO_BASE_DIR/test-restart-pod-$label
+  git init -b main --bare $POD_BARE
+  git clone $POD_BARE $POD_CLONE
+  configure_git_identity $POD_CLONE
+  mkdir -p $POD_CLONE/stack/deploy
+  # For dict-form pods, get_pod_file_path resolves the compose file at
+  # <repo-dir>/<path>/docker-compose.yml — owned by the pod repo, not
+  # the stack repo. get_plugin_code_paths adds the trailing 'stack/', so
+  # commands.py lives at <repo-dir>/<path>/stack/deploy/commands.py.
+  cat > $POD_CLONE/docker-compose.yml <<EOF
…
+  cat > $POD_CLONE/stack/deploy/commands.py <<EOF