Compare commits

..

No commits in common. "main" and "v1.1.0-4977e3f-202604210915" have entirely different histories.

26 changed files with 99 additions and 962 deletions

View File

@ -16,7 +16,6 @@ on:
- '.github/workflows/triggers/test-k8s-deploy'
- '.github/workflows/test-k8s-deploy.yml'
- 'tests/k8s-deploy/run-deploy-test.sh'
- 'tests/k8s-deploy/run-restart-test.sh'
schedule:
- cron: '3 15 * * *'
@ -47,5 +46,3 @@ jobs:
run: ./tests/scripts/install-kubectl.sh
- name: "Run k8s deployment test"
run: ./tests/k8s-deploy/run-deploy-test.sh
- name: "Run restart k8s deployment test"
run: ./tests/k8s-deploy/run-restart-test.sh

View File

@ -54,5 +54,3 @@
{"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}}
{"type":"update","timestamp":"2026-04-21T09:00:47.364859946Z","issue_id":"so-p3p","payload":{"description":"## Problem\n\nThe Caddy ingress controller image is hardcoded in `ingress-caddy-kind-deploy.yaml`, with no mechanism to update it short of cluster recreation or manual `kubectl patch`. laconic-so should: (1) allow spec.yml to specify a Caddy image, (2) support updating the Caddy image as part of `deployment start`, (3) set `strategy: Recreate` on the Caddy Deployment since hostPort pods can't rolling-update.\n\n## Resolution\n\n- New spec key `caddy-ingress-image`. Fresh install uses it (fallback: manifest default). On subsequent `deployment start`, if the spec key is set and the running Caddy image differs, SO patches the Deployment and waits for rollout.\n- Spec key absent =\u003e SO does **not** touch a running Caddy, to avoid silently reverting images set out-of-band (ansible playbook, another deployment's spec).\n- `strategy: Recreate` added to the Caddy Deployment manifest.\n- Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a plain k8s-API patch, not a cluster lifecycle op).\n- Image substitution locates the container by name instead of string-matching the shipped default, so the spec override wins regardless of what the manifest hardcodes.\n- Cluster-scoped caveat: `caddy-system` is shared across deployments; last `deployment start` that sets the key wins for everyone. Documented in `deployment_patterns.md`."}}
{"type":"status_update","timestamp":"2026-04-21T09:00:47.745675131Z","issue_id":"so-p3p","payload":{"status":"closed"}}
{"type":"comment","timestamp":"2026-04-27T13:41:16.962883653Z","issue_id":"so-078","payload":{"body":"Fixed. deploy create now copies commands.py into deployment_dir/hooks/. call_stack_deploy_start loads hooks from the deployment dir instead of resolving via get_stack_path, so deployment start no longer requires the stack repo to be present or cwd to be correct."}}
{"type":"close","timestamp":"2026-04-27T13:41:17.073012545Z","issue_id":"so-078","payload":{}}

View File

@ -1,33 +0,0 @@
version: '3.2'
services:
  host-telegraf:
    image: telegraf:1.36
    restart: unless-stopped
    # Host networking + host PID namespace so telegraf reports the
    # machine's real interfaces and process table, not the container's.
    network_mode: host
    pid: host
    entrypoint: ["/scripts/telegraf-entrypoint.sh"]
    environment:
      INFLUXDB_URL: ${INFLUXDB_URL}
      INFLUXDB_DB: ${INFLUXDB_DB:-host_metrics}
      INFLUXDB_USER: ${INFLUXDB_WRITE_USER}
      INFLUXDB_PASSWORD: ${INFLUXDB_WRITE_PASSWORD}
      COLLECT_INTERVAL: ${COLLECT_INTERVAL:-10s}
      HOST_TAG: ${HOST_TAG:-}
      COLLECT_ZFS: ${COLLECT_ZFS:-false}
    volumes:
      - ../config/host-metrics/telegraf.conf.tpl:/etc/telegraf/telegraf.conf.tpl:ro
      - ../config/host-metrics/scripts/telegraf-entrypoint.sh:/scripts/telegraf-entrypoint.sh:ro
      # Read-only views of the host filesystem; the entrypoint exports
      # HOST_PROC/HOST_SYS/HOST_MOUNT_PREFIX so gopsutil reads these.
      - /proc:/hostfs/proc:ro
      - /sys:/hostfs/sys:ro
      - /:/hostfs:ro
      # /dev is needed by inputs.diskio: it enumerates devices from
      # /proc/diskstats and then opens /dev/<name> for udev/uevent lookups.
      # Without this mount telegraf logs an "error reading /dev/<name>" warning
      # per device per collection cycle.
      - /dev:/dev:ro
      # /run/udev is where modern systemd stores the udev database that
      # gopsutil consults for per-device tags. Without it telegraf falls
      # back to the legacy /dev/.udev/db/... path which doesn't exist on
      # systemd hosts, producing "stat /dev/.udev/db/block:..." warnings.
      - /run/udev:/run/udev:ro

View File

@ -1,5 +0,0 @@
services:
  # Minimal always-restarting service; the restart e2e test only cares
  # about the hook-generated marker file, not this container.
  test-restart:
    image: busybox:1.36
    command: ["sh", "-c", "echo started && sleep infinity"]
    restart: always

View File

@ -1,68 +0,0 @@
#!/bin/sh
# host-metrics telegraf-entrypoint.sh
# Render telegraf.conf from telegraf.conf.tpl, then exec telegraf.
#
# Substitutions performed here (by awk):
# @@HOST_TAG_BLOCK@@ -> "[global_tags]\n host = \"$HOST_TAG\"" if set, else empty.
# @@ZFS_BLOCK@@ -> "[[inputs.zfs]]\n poolMetrics = true" if COLLECT_ZFS=true, else empty.
#
# Variables of the form ${VAR} in the template (INFLUXDB_URL, INFLUXDB_DB,
# INFLUXDB_USER, INFLUXDB_PASSWORD, COLLECT_INTERVAL) are resolved by
# telegraf's own env-var substitution at config-load time and are NOT
# touched by this script.
#
# TELEGRAF_CONF_DIR overrides the conf directory for tests; defaults to
# /etc/telegraf which is the standard path inside the official image.
set -eu
CONF_DIR="${TELEGRAF_CONF_DIR:-/etc/telegraf}"
TPL="$CONF_DIR/telegraf.conf.tpl"
OUT="$CONF_DIR/telegraf.conf"
# Fail-fast required env. Empty string counts as missing -- a half-rendered
# conf or a noisy telegraf auth error is worse than a clear startup failure.
# (eval-based indirection because POSIX sh has no ${!var}.)
for v in INFLUXDB_URL INFLUXDB_USER INFLUXDB_PASSWORD; do
eval val=\${$v:-}
if [ -z "$val" ]; then
echo "FATAL: $v is required but empty" >&2
exit 1
fi
done
# Apply defaults for optional vars.
: "${INFLUXDB_DB:=host_metrics}"
: "${COLLECT_INTERVAL:=10s}"
: "${HOST_TAG:=}"
: "${COLLECT_ZFS:=false}"
# Build the marker substitutions. Use printf for the newline so the
# rendered block lands on its own line.
if [ -n "$HOST_TAG" ]; then
HOST_TAG_BLOCK=$(printf '[global_tags]\n host = "%s"' "$HOST_TAG")
else
HOST_TAG_BLOCK=""
fi
if [ "$COLLECT_ZFS" = "true" ]; then
ZFS_BLOCK=$(printf '[[inputs.zfs]]\n poolMetrics = true')
else
ZFS_BLOCK=""
fi
# Export telegraf hostfs envs so /proc, /sys, and root come from the
# bind-mount under /hostfs (set in compose).
export HOST_PROC=/hostfs/proc
export HOST_SYS=/hostfs/sys
export HOST_MOUNT_PREFIX=/hostfs
# Render with awk. The markers are whole-line placeholders (per the
# template header), so replace the entire line. Block values are passed
# through ENVIRON instead of -v + gsub(): -v mangles backslash escapes
# in the value, and gsub() treats '&' and '\' in the replacement text
# specially, so a HOST_TAG like "a&b" would previously have been
# corrupted. ENVIRON lookup + printf '%s' is fully literal, and still
# handles multi-line block values cleanly across BusyBox/GNU/BSD awk.
export HOST_TAG_BLOCK ZFS_BLOCK
awk '
$0 == "@@HOST_TAG_BLOCK@@" { printf "%s\n", ENVIRON["HOST_TAG_BLOCK"]; next }
$0 == "@@ZFS_BLOCK@@" { printf "%s\n", ENVIRON["ZFS_BLOCK"]; next }
{ print }
' "$TPL" > "$OUT"
exec telegraf --config "$OUT"

View File

@ -1,121 +0,0 @@
#!/bin/sh
# Offline tests for host-metrics telegraf-entrypoint.sh.
# Stubs the telegraf binary on PATH so no real telegraf install is needed;
# assertions inspect the telegraf.conf the entrypoint renders.
# Exit status: 0 if every test passed, non-zero otherwise (final line).
set -eu
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
ENTRYPOINT="$SCRIPT_DIR/telegraf-entrypoint.sh"
[ -x "$ENTRYPOINT" ] || { echo "FATAL: $ENTRYPOINT not executable"; exit 2; }
# Scratch area, removed on any exit path via the EXIT trap.
TMP=$(mktemp -d)
trap 'rm -rf "$TMP"' EXIT
mkdir -p "$TMP/bin" "$TMP/etc/telegraf"
# Stub telegraf so `exec telegraf` is a no-op.
cat > "$TMP/bin/telegraf" <<'EOF'
#!/bin/sh
exit 0
EOF
chmod +x "$TMP/bin/telegraf"
# Minimal template that exercises both markers.
cat > "$TMP/etc/telegraf/telegraf.conf.tpl" <<'EOF'
@@HOST_TAG_BLOCK@@
[agent]
interval = "${COLLECT_INTERVAL}"
[[outputs.influxdb]]
urls = ["${INFLUXDB_URL}"]
@@ZFS_BLOCK@@
EOF
PASS=0
FAIL=0
# run sets required env defaults, then layers caller env on top.
# The ${VAR-default} form (no colon) keeps a caller's explicit empty
# string -- that is what lets T6-T8 exercise the empty-required-var
# path. Prints the rendered conf on stdout for the assert helpers.
run() {
env PATH="$TMP/bin:$PATH" \
TELEGRAF_CONF_DIR="$TMP/etc/telegraf" \
INFLUXDB_URL="${INFLUXDB_URL-http://example/}" \
INFLUXDB_USER="${INFLUXDB_USER-writer}" \
INFLUXDB_PASSWORD="${INFLUXDB_PASSWORD-secret}" \
INFLUXDB_DB="${INFLUXDB_DB-host_metrics}" \
COLLECT_INTERVAL="${COLLECT_INTERVAL-10s}" \
HOST_TAG="${HOST_TAG-}" \
COLLECT_ZFS="${COLLECT_ZFS-false}" \
sh "$ENTRYPOINT" >/dev/null
# rc=$? is reachable on failure because T6-T8 invoke run in a
# `|| rc=$?` list, which suppresses set -e inside the function body.
rc=$?
[ -f "$TMP/etc/telegraf/telegraf.conf" ] && cat "$TMP/etc/telegraf/telegraf.conf"
return $rc
}
# assert_grep NAME ACTUAL PATTERN: pass iff ACTUAL matches ERE PATTERN.
assert_grep() {
name=$1; actual=$2; pattern=$3
if printf '%s' "$actual" | grep -qE "$pattern"; then
echo "PASS: $name"; PASS=$((PASS + 1))
else
echo "FAIL: $name"
echo " expected pattern: $pattern"
echo " actual: $actual"
FAIL=$((FAIL + 1))
fi
}
# assert_not_grep NAME ACTUAL PATTERN: pass iff ACTUAL does NOT match.
assert_not_grep() {
name=$1; actual=$2; pattern=$3
if printf '%s' "$actual" | grep -qE "$pattern"; then
echo "FAIL: $name (matched pattern $pattern)"; FAIL=$((FAIL + 1))
else
echo "PASS: $name"; PASS=$((PASS + 1))
fi
}
# T1: HOST_TAG unset -> no [global_tags] block emitted
out=$(HOST_TAG="" run)
assert_not_grep "T1: HOST_TAG empty -> no global_tags" "$out" '^\[global_tags\]'
# T2: HOST_TAG set -> [global_tags] block with host = "<value>"
out=$(HOST_TAG="validator-1" run)
assert_grep "T2: HOST_TAG set -> [global_tags] block" "$out" '^\[global_tags\]'
assert_grep "T2: HOST_TAG set -> host = \"validator-1\"" "$out" 'host = "validator-1"'
# T3: COLLECT_ZFS=true -> [[inputs.zfs]] block present
out=$(COLLECT_ZFS="true" run)
assert_grep "T3: COLLECT_ZFS true -> inputs.zfs block" "$out" '\[\[inputs\.zfs\]\]'
# T4: COLLECT_ZFS=false -> no inputs.zfs block
out=$(COLLECT_ZFS="false" run)
assert_not_grep "T4: COLLECT_ZFS false -> no inputs.zfs" "$out" '\[\[inputs\.zfs\]\]'
# T5: markers fully removed even when block bodies are empty
out=$(HOST_TAG="" COLLECT_ZFS="false" run)
assert_not_grep "T5: no leftover @@HOST_TAG_BLOCK@@" "$out" '@@HOST_TAG_BLOCK@@'
assert_not_grep "T5: no leftover @@ZFS_BLOCK@@" "$out" '@@ZFS_BLOCK@@'
# T6: missing INFLUXDB_URL -> exit non-zero, error on stderr
rc=0
INFLUXDB_URL="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_URL "$TMP/err" \
&& { echo "PASS: T6: missing INFLUXDB_URL -> error"; PASS=$((PASS + 1)); } \
|| { echo "FAIL: T6: missing INFLUXDB_URL handling (rc=$rc)"; FAIL=$((FAIL + 1)); }
# T7: missing INFLUXDB_USER -> exit non-zero
rc=0
INFLUXDB_USER="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_USER "$TMP/err" \
&& { echo "PASS: T7: missing INFLUXDB_USER -> error"; PASS=$((PASS + 1)); } \
|| { echo "FAIL: T7: missing INFLUXDB_USER handling (rc=$rc)"; FAIL=$((FAIL + 1)); }
# T8: missing INFLUXDB_PASSWORD -> exit non-zero
rc=0
INFLUXDB_PASSWORD="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_PASSWORD "$TMP/err" \
&& { echo "PASS: T8: missing INFLUXDB_PASSWORD -> error"; PASS=$((PASS + 1)); } \
|| { echo "FAIL: T8: missing INFLUXDB_PASSWORD handling (rc=$rc)"; FAIL=$((FAIL + 1)); }
echo
echo "Results: $PASS passed, $FAIL failed"
# Overall exit status: non-zero if any test failed.
[ "$FAIL" = "0" ]

View File

@ -1,66 +0,0 @@
# host-metrics telegraf template.
# Rendered at container start by telegraf-entrypoint.sh. The entrypoint
# replaces two single-line markers in this file with TOML block fragments;
# see telegraf-entrypoint.sh for the substitution details. All ${...}
# variables are resolved by telegraf's native env substitution at
# config-load time.
# NOTE: the markers must stay alone on their own lines (column 0) for the
# entrypoint's whole-line replacement to find them.
@@HOST_TAG_BLOCK@@

[agent]
  interval = "${COLLECT_INTERVAL}"
  round_interval = true
  collection_jitter = "0s"
  flush_interval = "${COLLECT_INTERVAL}"
  flush_jitter = "0s"
  precision = "0s"
  hostname = ""
  omit_hostname = false

[[outputs.influxdb]]
  urls = ["${INFLUXDB_URL}"]
  database = "${INFLUXDB_DB}"
  skip_database_creation = true
  username = "${INFLUXDB_USER}"
  password = "${INFLUXDB_PASSWORD}"
  retention_policy = ""
  write_consistency = "any"
  timeout = "10s"

[[inputs.cpu]]
  percpu = false
  totalcpu = true
  collect_cpu_time = false
  report_active = true

[[inputs.mem]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.processes]]

[[inputs.disk]]
  # gopsutil with HOST_MOUNT_PREFIX=/hostfs strips the /hostfs prefix
  # from /proc/mounts entries, so the mountpoints telegraf sees are
  # the host's real paths (/, /boot, /home, ...). No mount_points
  # filter; let ignore_fs do the noise reduction.
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay",
               "aufs", "squashfs", "nsfs", "tracefs", "proc", "sysfs",
               "cgroup", "cgroup2", "fuse.lxcfs"]

[[inputs.diskio]]
  device_tags = ["DEVNAME"]
  skip_serial_number = true
  name_templates = ["$DEVNAME"]

[[inputs.net]]
  # Allowlist covers physical ethernet (eth*, en*), wireless (wl*),
  # cellular (wwan*), and bonded/teamed interfaces (bond*). Excludes
  # docker bridges, veth pairs, tun/tap, and lo. Adjust per host if
  # you need a more specific scope.
  ignore_protocol_stats = true
  interfaces = ["eth*", "en*", "wl*", "wwan*", "bond*"]

@@ZFS_BLOCK@@

View File

@ -1,127 +0,0 @@
# host-metrics stack
Per-host system metrics collector. Runs telegraf with host networking, host
PID namespace, and read-only bind mounts of /proc, /sys, and / so it can
report real CPU, memory, disk, network, and process metrics for the machine
it runs on. Writes to an InfluxDB 1.x endpoint of your choosing.
Deploy one instance per machine you want monitored.
## What gets collected
| Input | Measurements (in InfluxDB) |
|-------|----------------------------|
| inputs.cpu (totalcpu only) | cpu (`cpu=cpu-total`) |
| inputs.mem | mem |
| inputs.swap | swap |
| inputs.system | system (uptime, load1/5/15, n_users, n_cpus) |
| inputs.processes | processes (running/sleeping/blocked/zombies) |
| inputs.disk | disk (used/free/used_percent per mount) |
| inputs.diskio | diskio (read/write bytes/ops per device) |
| inputs.net | net (bytes/packets/err in/out per interface) |
| inputs.zfs (opt-in via COLLECT_ZFS=true) | zfs (ARC stats, pool state) |
All rows are tagged with `host` (kernel hostname, or `HOST_TAG` override).
## Deploy
### Create a spec
```bash
laconic-so --stack host-metrics deploy init --output spec-host-metrics.yml
```
Edit `spec-host-metrics.yml` to look like:
```yaml
stack: host-metrics
deploy-to: compose
credentials-files:
- ~/.credentials/host-metrics.env
config:
INFLUXDB_URL: 'https://influxdb.example.com'
INFLUXDB_DB: 'host_metrics' # default; override for a custom DB
HOST_TAG: 'validator-1' # optional; defaults to kernel hostname
COLLECT_INTERVAL: '10s' # telegraf collection + flush cadence
COLLECT_ZFS: 'false' # set to 'true' on ZFS hosts
```
`~/.credentials/host-metrics.env` must contain:
```
INFLUXDB_WRITE_USER=<writer-username>
INFLUXDB_WRITE_PASSWORD=<writer-password>
```
These are issued by the InfluxDB admin (the monitoring host operator); they
are the same writer-only credentials used by validators/RPCs to push agave
metrics.
### Create and start
```bash
laconic-so --stack host-metrics deploy create \
--spec-file spec-host-metrics.yml \
--deployment-dir ./deployment-host-metrics
laconic-so deployment --dir ./deployment-host-metrics start
```
`deploy create` builds the deployment dir from the spec; `deployment start`
brings the containers up. The `--stack` option is required for `deploy`
subcommands but rejected on `deployment` subcommands (the deployment dir
already knows its stack).
### Verify
```bash
laconic-so deployment --dir ./deployment-host-metrics logs host-telegraf | head
```
Expected: telegraf prints its startup banner and `Loaded inputs: ...`. No
errors about missing config or auth failures.
Within ~20 seconds, the host's data appears in the InfluxDB endpoint's
`host_metrics` database (or whichever DB you set in INFLUXDB_DB) and in
any Grafana dashboards bound to that DB.
## Configuration reference
| Env | Required | Default | Notes |
|-----|----------|---------|-------|
| `INFLUXDB_URL` | yes | - | Full URL including scheme. Example: `https://influxdb.example.com`. |
| `INFLUXDB_DB` | no | `host_metrics` | Target database. Must exist (writer is not granted CREATE). |
| `INFLUXDB_WRITE_USER` | yes | - | Writer-only user. |
| `INFLUXDB_WRITE_PASSWORD` | yes | - | Writer-only password. |
| `COLLECT_INTERVAL` | no | `10s` | Telegraf collection and flush cadence. |
| `HOST_TAG` | no | empty | Overrides the kernel hostname for the `host` tag on every row. Useful when a VM has a generic hostname. |
| `COLLECT_ZFS` | no | `false` | Set to `true` to enable `inputs.zfs` (pool state + ARC stats). |
## ZFS hosts
`inputs.disk` already reports used/free per mount for any filesystem type
including ZFS, so the disk-usage view works out of the box. Setting
`COLLECT_ZFS=true` additionally enables `inputs.zfs` which reads
`/proc/spl/kstat/zfs/...` and emits ARC hit ratio, ARC size, and per-pool
health metrics. The bind mount of `/proc` provides the necessary
visibility; no extra mounts are needed.
If you set `COLLECT_ZFS=true` on a non-ZFS host, telegraf logs an error
once per collection cycle and skips the input. Harmless but noisy; leave
the toggle off on non-ZFS machines.
## Troubleshooting
| Symptom | Likely cause |
|---------|-------------|
| Container fails to start with `FATAL: INFLUXDB_URL is required but empty` | Missing required env. Check spec.yml + credentials file. |
| Container starts, no rows appear in InfluxDB | Writer credentials wrong, or InfluxDB unreachable from this host's network. Check `docker logs <host-telegraf>` for `Post ... 401` / `connection refused`. |
| Two hosts overwriting each other's series | Both use the same kernel hostname. Set distinct `HOST_TAG` values. |
| `inputs.processes` reports only 1 process | `pid: host` missing from compose. Re-deploy. |
## Caveats
- Requires Docker with privileges to bind-mount `/`, `/proc`, `/sys`, and to
share the host PID namespace. Rootless Docker installations may refuse
`pid: host` and the `/` bind mount.
- One deployment per host. Running two on the same machine writes
duplicate rows under the same `host` tag.

View File

@ -1,5 +0,0 @@
version: "1.1"
name: host-metrics
description: "Per-host system metrics collector (telegraf -> InfluxDB)"
pods:
  - host-metrics

View File

@ -1,14 +0,0 @@
# test-restart-multi
E2E test stack used by `tests/k8s-deploy/run-restart-test.sh` to cover the
multi-repo case: `pods:` references two pod repos, each shipping its own
`deploy/commands.py`. `deploy create` should produce
`<deployment>/hooks/commands_0.py` and `<deployment>/hooks/commands_1.py`,
and `deployment start` should invoke both `start()` hooks (each writes its
own marker file so neither overwrites the other).
The pod repos themselves are created by the test script as bare-repo +
working-clone pairs under `$CERC_REPO_BASE_DIR/test-restart-pod-{a,b}`;
they are not committed to this repository. Each pod repo ships its own
`docker-compose.yml` (resolved by `get_pod_file_path` for dict-form pods)
and `stack/deploy/commands.py` — the stack repo only owns `stack.yml`.

View File

@ -1,10 +0,0 @@
version: "1.0"
name: test-restart-multi
description: "E2E test stack for the deployment restart command (multi-repo case)"
# Dict-form pods: each entry names a separate pod repository so the test
# covers multi-repo hook copying (hooks/commands_0.py, hooks/commands_1.py).
pods:
  - name: test-restart-multi-a
    repository: test-restart-pod-a
    path: .
  - name: test-restart-multi-b
    repository: test-restart-pod-b
    path: .

View File

@ -1,15 +0,0 @@
# test-restart
E2E test stack used by `tests/k8s-deploy/run-restart-test.sh`.
The stack ships a small `start()` hook that writes a versioned marker file
into the deployment directory. The test exercises `deployment restart`:
1. `deploy create` → asserts `commands.py` was copied into `<deployment>/hooks/`.
2. `deployment start` → asserts the marker file contains the v1 string.
3. Modifies `commands.py` in the stack-source working tree (v1 → v2).
4. `deployment restart` → asserts the new `commands.py` was re-copied into
`<deployment>/hooks/` and the marker file now contains the v2 string.
The pod uses a public `busybox` image that just sleeps; the marker file is
the only thing under test.

View File

@ -1,32 +0,0 @@
# Copyright © 2026 Vulcanize
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
from stack_orchestrator.util import get_yaml
from stack_orchestrator.deploy.deployment_context import DeploymentContext

# Empty spec: this test stack needs no generated spec content.
default_spec_file_content = ""


def init(command_context):
    """`deploy init` hook: return the (empty) default spec for this stack."""
    return get_yaml().load(default_spec_file_content)


def start(deployment_context: DeploymentContext):
    """`deployment start` hook: write a versioned marker file.

    Writes a marker file the e2e test asserts on. The test flips the
    literal below from "v1" to "v2" in the stack-source working tree
    before running 'deployment restart' to verify the updated hook is
    copied into deployment_dir/hooks/ and re-executed.
    """
    marker = deployment_context.deployment_dir / "marker"
    marker.write_text("v1")

View File

@ -1,5 +0,0 @@
version: "1.0"
name: test-restart
description: "E2E test stack for the deployment restart command"
pods:
  - test-restart

View File

@ -48,21 +48,10 @@ class DockerDeployer(Deployer):
self.compose_project_name = compose_project_name
self.compose_env_file = compose_env_file
def up(
self,
detach,
skip_cluster_management,
services,
image_overrides=None,
force_recreate=False,
):
def up(self, detach, skip_cluster_management, services, image_overrides=None):
if not opts.o.dry_run:
try:
return self.docker.compose.up(
detach=detach,
services=services,
force_recreate=force_recreate,
)
return self.docker.compose.up(detach=detach, services=services)
except DockerException as e:
raise DeployerException(e)

View File

@ -142,7 +142,6 @@ def up_operation(
stay_attached=False,
skip_cluster_management=False,
image_overrides=None,
force_recreate=False,
):
global_context = ctx.parent.parent.obj
deploy_context = ctx.obj
@ -162,7 +161,6 @@ def up_operation(
skip_cluster_management=skip_cluster_management,
services=services_list,
image_overrides=image_overrides,
force_recreate=force_recreate,
)
for post_start_command in cluster_context.post_start_commands:
_run_command(global_context, cluster_context.cluster, post_start_command)

View File

@ -20,14 +20,7 @@ from typing import Optional
class Deployer(ABC):
@abstractmethod
def up(
self,
detach,
skip_cluster_management,
services,
image_overrides=None,
force_recreate=False,
):
def up(self, detach, skip_cluster_management, services, image_overrides=None):
pass
@abstractmethod

View File

@ -471,18 +471,12 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip, image):
ctx, deployment_context, maintenance_svc, image_overrides
)
else:
# force_recreate=True so source-file edits (alert rules, dashboards,
# entrypoint scripts, etc. mounted via bind volumes) are picked up.
# docker compose up -d alone is a no-op when the service definition
# itself is unchanged, leaving the running container with stale
# in-memory state.
up_operation(
ctx,
services_list=None,
stay_attached=False,
skip_cluster_management=True,
image_overrides=image_overrides or None,
force_recreate=True,
)
# Restore cwd after both create_operation and up_operation have run.
@ -520,15 +514,12 @@ def _restart_with_maintenance(
# Step 1: Apply the full deployment (creates/updates all pods + services)
# This ensures maintenance pod exists before we swap Ingress to it.
# force_recreate intent matches the non-maintenance restart path; the
# k8s deployer currently ignores the flag (TODO in deploy_k8s.up).
up_operation(
ctx,
services_list=None,
stay_attached=False,
skip_cluster_management=True,
image_overrides=image_overrides or None,
force_recreate=True,
)
# Parse maintenance service spec: "container-name:port"

View File

@ -83,7 +83,9 @@ class DeploymentContext:
# Fallback to cluster-id for deployments created before the
# deployment-id field was introduced. Keeps existing resource
# names stable across this upgrade.
self.deployment_id = obj.get(constants.deployment_id_key, self.id)
self.deployment_id = obj.get(
constants.deployment_id_key, self.id
)
# Handle the case of a legacy deployment with no file
# Code below is intended to match the output from _make_default_cluster_name()
# TODO: remove when we no longer need to support legacy deployments

View File

@ -276,10 +276,9 @@ def call_stack_deploy_start(deployment_context):
create additional k8s resources (Services, etc.) in the deployment namespace.
The namespace can be derived as f"laconic-{deployment_context.id}".
"""
hooks_dir = deployment_context.deployment_dir / "hooks"
if not hooks_dir.exists():
return
for python_file_path in sorted(hooks_dir.glob("commands*.py")):
python_file_paths = _commands_plugin_paths(deployment_context.stack.name)
for python_file_path in python_file_paths:
if python_file_path.exists():
spec = util.spec_from_file_location("commands", python_file_path)
if spec is None or spec.loader is None:
continue
@ -380,7 +379,9 @@ def _validate_host_path_mounts(parsed_pod_file, pod_name, pod_file_path):
"content at runtime.\n\n"
"See docs/deployment_patterns.md."
)
total = sum(p.stat().st_size for p in entries if p.is_file())
total = sum(
p.stat().st_size for p in entries if p.is_file()
)
if total > _HOST_PATH_CONFIGMAP_BUDGET_BYTES:
raise DeployerException(
f"Directory host-path bind '{volume_str}' in "
@ -1110,37 +1111,6 @@ def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]]
safe_copy_file(src_path, dst_path)
def _copy_hooks(stack_name: str, target_dir: Path):
"""Copy commands.py hooks into deployment_dir/hooks/ for self-sufficiency.
Single repo: hooks/commands.py
Multi-repo: hooks/commands_0.py, hooks/commands_1.py, ... indexed by
plugin path order.
Note: the whole commands.py file is copied (init/setup/create/start), but
at runtime only call_stack_deploy_start loads from this copied location.
call_stack_deploy_init, call_stack_deploy_setup, and call_stack_deploy_create
still resolve commands.py from the live stack source via
get_plugin_code_paths they only run at deploy create time when the source
is guaranteed to be present, so they don't need to be self-sufficient.
"""
plugin_paths = get_plugin_code_paths(stack_name)
sources = [
p.joinpath("deploy", "commands.py")
for p in plugin_paths
if p.joinpath("deploy", "commands.py").exists()
]
if not sources:
return
hooks_dir = target_dir / "hooks"
hooks_dir.mkdir(exist_ok=True)
if len(sources) == 1:
copyfile(sources[0], hooks_dir / "commands.py")
else:
for i, src in enumerate(sources):
copyfile(src, hooks_dir / f"commands_{i}.py")
def _write_deployment_files(
target_dir: Path,
spec_file: Path,
@ -1168,8 +1138,6 @@ def _write_deployment_files(
copyfile(spec_file, target_dir.joinpath(constants.spec_file_name))
copyfile(stack_file, target_dir.joinpath(constants.stack_file_name))
_copy_hooks(stack_name, target_dir)
# Create deployment file if requested
if include_deployment_file:
_create_deployment_file(target_dir, stack_source=stack_source)
@ -1281,9 +1249,7 @@ def _write_deployment_files(
else:
source_config_dir = resolve_config_dir(stack_name, configmap_name)
if os.path.exists(source_config_dir):
destination_config_dir = target_dir.joinpath(
"configmaps", configmap_name
)
destination_config_dir = target_dir.joinpath("configmaps", configmap_name)
copytree(source_config_dir, destination_config_dir, dirs_exist_ok=True)
# Copy the job files into the target dir
@ -1296,7 +1262,9 @@ def _write_deployment_files(
if job_file_path and job_file_path.exists():
parsed_job_file = yaml.load(open(job_file_path, "r"))
if parsed_spec.is_kubernetes_deployment():
_validate_host_path_mounts(parsed_job_file, job, job_file_path)
_validate_host_path_mounts(
parsed_job_file, job, job_file_path
)
_fixup_pod_file(parsed_job_file, parsed_spec, destination_compose_dir)
with open(
destination_compose_jobs_dir.joinpath(

View File

@ -6,7 +6,7 @@
import secrets
import socket
import time
from typing import List, Optional
from typing import Optional
import requests
from kubernetes import client
@ -18,7 +18,7 @@ def get_server_egress_ip() -> str:
return response.text.strip()
def resolve_hostname(hostname: str) -> List[str]:
def resolve_hostname(hostname: str) -> list[str]:
"""Resolve hostname to list of IP addresses."""
try:
_, _, ips = socket.gethostbyname_ex(hostname)

View File

@ -479,7 +479,9 @@ class ClusterInfo:
if sanitized in seen:
continue
seen.add(sanitized)
abs_src = resolve_host_path_for_kind(src, deployment_dir)
abs_src = resolve_host_path_for_kind(
src, deployment_dir
)
data = self._read_host_path_source(abs_src, mount_string)
cm = client.V1ConfigMap(
metadata=client.V1ObjectMeta(
@ -493,7 +495,9 @@ class ClusterInfo:
result.append(cm)
return result
def _read_host_path_source(self, abs_src: Path, mount_string: str) -> dict:
def _read_host_path_source(
self, abs_src: Path, mount_string: str
) -> dict:
"""Read file or flat-directory content for a host-path ConfigMap.
Validates shape at read time as a defensive second check the
@ -513,7 +517,9 @@ class ClusterInfo:
for entry in abs_src.iterdir():
if entry.is_file():
with open(entry, "rb") as f:
data[entry.name] = base64.b64encode(f.read()).decode("ASCII")
data[entry.name] = base64.b64encode(f.read()).decode(
"ASCII"
)
return data
def get_pvs(self):
@ -705,7 +711,9 @@ class ClusterInfo:
volume_mounts = volume_mounts_for_service(
parsed_yaml_map,
service_name,
Path(self.spec.file_path).parent if self.spec.file_path else None,
Path(self.spec.file_path).parent
if self.spec.file_path
else None,
)
# Handle command/entrypoint from compose file
# In docker-compose: entrypoint -> k8s command, command -> k8s args
@ -1013,7 +1021,9 @@ class ClusterInfo:
metadata=client.V1ObjectMeta(
name=deployment_name,
labels=self._stack_labels(
{"app.kubernetes.io/component": pod_name} if multi_pod else None
{"app.kubernetes.io/component": pod_name}
if multi_pod
else None
),
),
spec=spec,
@ -1061,7 +1071,9 @@ class ClusterInfo:
container_ports[container].add(port)
if maintenance_svc and ":" in maintenance_svc:
maint_container, maint_port_str = maintenance_svc.split(":", 1)
container_ports.setdefault(maint_container, set()).add(int(maint_port_str))
container_ports.setdefault(maint_container, set()).add(
int(maint_port_str)
)
# Build map: pod_file -> set of service names in that pod
pod_services_map: dict = {}

View File

@ -30,7 +30,6 @@ from stack_orchestrator.deploy.k8s.helpers import (
create_cluster,
destroy_cluster,
get_kind_cluster,
is_image_available_locally,
load_images_into_kind,
)
from stack_orchestrator.deploy.k8s.helpers import (
@ -220,7 +219,10 @@ class K8sDeployer(Deployer):
)
self.core_api.create_namespace(body=ns)
if opts.o.debug:
print(f"Created namespace {self.k8s_namespace} " f"owned by {my_dir}")
print(
f"Created namespace {self.k8s_namespace} "
f"owned by {my_dir}"
)
return
annotations = (existing.metadata.annotations or {}) if existing.metadata else {}
@ -834,17 +836,16 @@ class K8sDeployer(Deployer):
actual_cluster = create_cluster(self.kind_cluster_name, kind_config)
if actual_cluster != self.kind_cluster_name:
self.kind_cluster_name = actual_cluster
# Only load locally-built images into kind
local_containers = self.deployment_context.stack.obj.get("containers", [])
images_to_preload = set((self.image_overrides or {}).values()) | {
if local_containers:
local_images = {
img
for img in self.cluster_info.image_set
if any(c in img for c in local_containers)
}
images_to_preload = {
img for img in images_to_preload if is_image_available_locally(img)
}
if images_to_preload:
load_images_into_kind(self.kind_cluster_name, images_to_preload)
if local_images:
load_images_into_kind(self.kind_cluster_name, local_images)
elif self.is_kind():
# --skip-cluster-management (default): cluster must already exist.
# Without this check, connect_api() below raises a cryptic
@ -987,19 +988,7 @@ class K8sDeployer(Deployer):
else:
raise
def up(
self,
detach,
skip_cluster_management,
services,
image_overrides=None,
force_recreate=False,
):
# TODO: honor force_recreate by stamping the
# kubectl.kubernetes.io/restartedAt annotation on managed
# Deployments so a rollout occurs even when the manifest is
# unchanged. Today this method is a no-op for that flag.
# Tracked separately from the compose-side fix.
def up(self, detach, skip_cluster_management, services, image_overrides=None):
# Merge spec-level image overrides with CLI overrides
spec_overrides = self.cluster_info.spec.get("image-overrides", {})
if spec_overrides:
@ -1036,7 +1025,9 @@ class K8sDeployer(Deployer):
call_stack_deploy_start(self.deployment_context)
def down(self, timeout, volumes, skip_cluster_management, delete_namespace=False):
def down(
self, timeout, volumes, skip_cluster_management, delete_namespace=False
):
"""Tear down stack-labeled resources. Phases:
1. Delete namespaced resources (if namespace still exists).
@ -1230,68 +1221,34 @@ class K8sDeployer(Deployer):
listers = []
if namespace_present:
listers += [
(
"deployment",
lambda: self.apps_api.list_namespaced_deployment(
namespace=namespace, label_selector=selector
),
),
(
"ingress",
lambda: self.networking_api.list_namespaced_ingress(
namespace=namespace, label_selector=selector
),
),
(
"job",
lambda: self.batch_api.list_namespaced_job(
namespace=namespace, label_selector=selector
),
),
(
"service",
lambda: self.core_api.list_namespaced_service(
namespace=namespace, label_selector=selector
),
),
(
"configmap",
lambda: self.core_api.list_namespaced_config_map(
namespace=namespace, label_selector=selector
),
),
(
"secret",
lambda: self.core_api.list_namespaced_secret(
namespace=namespace, label_selector=selector
),
),
(
"pod",
lambda: self.core_api.list_namespaced_pod(
namespace=namespace, label_selector=selector
),
),
("deployment", lambda: self.apps_api.list_namespaced_deployment(
namespace=namespace, label_selector=selector)),
("ingress", lambda: self.networking_api.list_namespaced_ingress(
namespace=namespace, label_selector=selector)),
("job", lambda: self.batch_api.list_namespaced_job(
namespace=namespace, label_selector=selector)),
("service", lambda: self.core_api.list_namespaced_service(
namespace=namespace, label_selector=selector)),
("configmap", lambda: self.core_api.list_namespaced_config_map(
namespace=namespace, label_selector=selector)),
("secret", lambda: self.core_api.list_namespaced_secret(
namespace=namespace, label_selector=selector)),
("pod", lambda: self.core_api.list_namespaced_pod(
namespace=namespace, label_selector=selector)),
]
if delete_volumes:
listers.append(
(
"persistentvolumeclaim",
("persistentvolumeclaim",
lambda: self.core_api.list_namespaced_persistent_volume_claim(
namespace=namespace, label_selector=selector
),
)
namespace=namespace, label_selector=selector))
)
# PVs are cluster-scoped — wait for them even when the namespace
# is already gone (orphaned from a prior --delete-namespace).
if delete_volumes:
listers.append(
(
"persistentvolume",
("persistentvolume",
lambda: self.core_api.list_persistent_volume(
label_selector=selector
),
)
label_selector=selector))
)
def remaining():
@ -1319,7 +1276,8 @@ class K8sDeployer(Deployer):
left = remaining()
if left:
print(
f"Warning: resources still present after {timeout_seconds}s: " f"{left}"
f"Warning: resources still present after {timeout_seconds}s: "
f"{left}"
)
def status(self):

View File

@ -207,7 +207,9 @@ def _install_caddy_cert_backup(
print("No kind-mount-root configured; caddy cert backup disabled")
return
manifest = os.path.abspath(
get_k8s_dir().joinpath("components", "ingress", "caddy-cert-backup.yaml")
get_k8s_dir().joinpath(
"components", "ingress", "caddy-cert-backup.yaml"
)
)
with open(manifest) as f:
objects = list(yaml.safe_load_all(f))
@ -231,7 +233,9 @@ def _parse_kind_extra_mounts(config_file: str) -> List[Dict[str, str]]:
host_path = m.get("hostPath")
container_path = m.get("containerPath")
if host_path and container_path:
mounts.append({"hostPath": host_path, "containerPath": container_path})
mounts.append(
{"hostPath": host_path, "containerPath": container_path}
)
return mounts
@ -496,9 +500,12 @@ def install_ingress_for_kind(
continue
if (
obj.get("kind") == "Deployment"
and obj.get("metadata", {}).get("name") == "caddy-ingress-controller"
and obj.get("metadata", {}).get("name")
== "caddy-ingress-controller"
):
for c in (
obj["spec"]["template"]["spec"].get("containers") or []
):
for c in obj["spec"]["template"]["spec"].get("containers") or []:
if c.get("name") == "caddy-ingress-controller":
c["image"] = caddy_image
if opts.o.debug:
@ -607,14 +614,6 @@ def update_caddy_ingress_image(caddy_image: str) -> bool:
return True
def is_image_available_locally(image: str) -> bool:
result = subprocess.run(
["docker", "image", "inspect", image],
capture_output=True,
)
return result.returncode == 0
def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]):
for image in image_set:
result = _run_command(

View File

@ -413,16 +413,14 @@ if [ "$restored_value" != "$fake_cert_value" ]; then
fi
echo "caddy cert restore test: passed"
# Final teardown: --delete-namespace nukes the namespace, and
# --perform-cluster-management tears down the Kind cluster so the next test
# step in this CI workflow (e.g. run-restart-test.sh) starts from a clean
# host.
# Final teardown: --delete-namespace nukes the namespace after labeled cleanup.
# Verify the namespace is actually gone.
$TEST_TARGET_SO deployment --dir $test_deployment_dir \
stop --delete-volumes --delete-namespace --perform-cluster-management
if kind get clusters 2>/dev/null | grep -q .; then
echo "cluster teardown test: FAILED (kind cluster still present)"
stop --delete-volumes --delete-namespace --skip-cluster-management
if kubectl get namespace ${deployment_ns} >/dev/null 2>&1; then
echo "delete-namespace test: FAILED (namespace still present)"
exit 1
fi
echo "cluster teardown test: passed"
echo "delete-namespace test: passed"
echo "Test passed"

View File

@ -1,265 +0,0 @@
#!/usr/bin/env bash
set -e

# When CERC_SCRIPT_DEBUG is set, trace every command and dump the
# environment so CI failures are easier to diagnose.
if [[ -n "${CERC_SCRIPT_DEBUG}" ]]; then
  set -x
  echo "Environment variables:"
  env
fi
# Helper functions: TODO move into a separate file (mirrors run-deploy-test.sh:10).

# Poll 'deployment ps' until it reports running containers.
# Arguments: $1 - deployment directory
# Gives up after 50 attempts (~250s) and fails the test via cleanup_and_exit.
wait_for_pods_started () {
  local dir=$1
  local i ps_output
  for i in {1..50}; do
    # '|| true' keeps a transient ps failure from killing the script under
    # set -e; we just retry on the next iteration.
    ps_output=$( "$TEST_TARGET_SO" deployment --dir "$dir" ps || true )
    if [[ "$ps_output" == *"Running containers:"* ]]; then
      return
    fi
    sleep 5
  done
  echo "waiting for pods to start: FAILED"
  cleanup_and_exit
}
# Multi-pod stacks aren't visible to 'deployment ps' (deploy_k8s.py:1366
# filters by app_name-deployment substring, which doesn't match
# laconic-<id>-<podname>-deployment-<hash> names). Wait via kubectl.
# Arguments: $1 - kubernetes namespace
# Fails the test via cleanup_and_exit on timeout or if no pod ever appears.
wait_for_k8s_pods_ready () {
  local ns=$1
  local timeout=240
  local waited=0
  local count=0
  # First wait for at least one pod to appear in the namespace.
  while [ "$waited" -lt "$timeout" ]; do
    count=$( kubectl get pods -n "$ns" --no-headers 2>/dev/null | wc -l )
    if [ "$count" -gt 0 ]; then
      break
    fi
    sleep 2
    waited=$((waited + 2))
  done
  # If the budget was exhausted with zero pods, fail explicitly here:
  # 'kubectl wait --all' on an empty namespace is version-dependent
  # (vacuous success on older kubectl) and the remaining --timeout would
  # be 0s, which means "check once" rather than "wait".
  if [ "$count" -eq 0 ]; then
    echo "kubectl wait pods ready: FAILED (no pods appeared in ns=$ns)"
    cleanup_and_exit
  fi
  if ! kubectl wait --for=condition=Ready pod --all \
      -n "$ns" --timeout=$((timeout - waited))s 2>&1; then
    echo "kubectl wait pods ready: FAILED (ns=$ns)"
    kubectl get pods -n "$ns" 2>&1 || true
    kubectl describe pods -n "$ns" 2>&1 | tail -80 || true
    cleanup_and_exit
  fi
}
# Best-effort full teardown so CI runners don't leak namespaces/PVs/clusters
# between runs. Variables may be unset depending on which phase tripped.
# DEP1 is stopped with --skip-cluster-management (cluster kept for phase 2),
# DEP2 with --perform-cluster-management (cluster destroyed). Always exits 1.
cleanup_and_exit () {
  if [ -n "$DEP1" ] && [ -d "$DEP1" ]; then
    "$TEST_TARGET_SO" deployment --dir "$DEP1" \
      stop --delete-volumes --delete-namespace --skip-cluster-management || true
  fi
  if [ -n "$DEP2" ] && [ -d "$DEP2" ]; then
    "$TEST_TARGET_SO" deployment --dir "$DEP2" \
      stop --delete-volumes --delete-namespace --perform-cluster-management || true
  fi
  exit 1
}
# Make a clone usable for `git commit` without touching the runner's global config.
# Arguments: $1 - path to the git working tree to configure
configure_git_identity () {
  local repo_dir=$1
  git -C "$repo_dir" config user.email "test@stack-orchestrator.test"
  git -C "$repo_dir" config user.name "test"
}
# Pick the most recently built package artifact as the system under test.
TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
echo "Testing this package: $TEST_TARGET_SO"

WORK_DIR=~/stack-orchestrator-test/restart
# Multi-repo pod working clones land here; resolved by get_plugin_code_paths.
export CERC_REPO_BASE_DIR="$WORK_DIR/repo-base"
# Quote the targets so a space in $HOME can't turn rm -rf destructive.
rm -rf "$WORK_DIR"
mkdir -p "$WORK_DIR" "$CERC_REPO_BASE_DIR"

# Source location of the test stacks shipped in this checkout. The test stages
# them into a temp git repo so 'deployment restart' (which runs 'git pull' on
# the stack source) has a real repo to pull from.
DATA_DIR=stack_orchestrator/data
# ============================================================================
# Phase 1 — single-repo restart cycle. Verifies that:
#   * deploy create copies commands.py into <deployment>/hooks/
#   * deployment start runs the copied start() hook
#   * mutating the stack-source commands.py and running 'deployment restart'
#     re-copies the new file into hooks/ and re-executes the new start()
# ============================================================================
echo "=== Phase 1: single-repo restart cycle ==="

BARE1=$WORK_DIR/stack-single.git
CLONE1=$WORK_DIR/stack-single
git init -b main --bare "$BARE1"
git clone "$BARE1" "$CLONE1"
configure_git_identity "$CLONE1"

# External-stack layout: <repo>/stack-orchestrator/{stacks,compose}/...
mkdir -p "$CLONE1/stack-orchestrator/stacks" "$CLONE1/stack-orchestrator/compose"
cp -r "$DATA_DIR/stacks/test-restart" "$CLONE1/stack-orchestrator/stacks/"
cp "$DATA_DIR/compose/docker-compose-test-restart.yml" "$CLONE1/stack-orchestrator/compose/"
git -C "$CLONE1" add .
git -C "$CLONE1" commit -m "test-restart v1"
git -C "$CLONE1" push -u origin main

STACK_PATH_SINGLE=$CLONE1/stack-orchestrator/stacks/test-restart
SPEC1=$WORK_DIR/spec-single.yml
DEP1=$WORK_DIR/dep-single
"$TEST_TARGET_SO" --stack "$STACK_PATH_SINGLE" deploy --deploy-to k8s-kind init --output "$SPEC1"
"$TEST_TARGET_SO" --stack "$STACK_PATH_SINGLE" deploy create --spec-file "$SPEC1" --deployment-dir "$DEP1"

if [ ! -f "$DEP1/hooks/commands.py" ]; then
  echo "single-repo deploy create test: FAILED (hooks/commands.py missing)"
  cleanup_and_exit
fi
if ! grep -q '"v1"' "$DEP1/hooks/commands.py"; then
  echo "single-repo deploy create test: FAILED (hooks/commands.py does not contain v1 marker)"
  cleanup_and_exit
fi
echo "single-repo deploy create test: passed"

"$TEST_TARGET_SO" deployment --dir "$DEP1" start --perform-cluster-management
wait_for_pods_started "$DEP1"

# call_stack_deploy_start runs synchronously inside the start command
# (deploy_k8s.py:1026), so the marker is on disk before 'start' returns.
if [ ! -f "$DEP1/marker" ]; then
  echo "single-repo start v1 test: FAILED (marker file missing)"
  cleanup_and_exit
fi
marker_v1=$(cat "$DEP1/marker")
if [ "$marker_v1" != "v1" ]; then
  echo "single-repo start v1 test: FAILED (got: $marker_v1)"
  cleanup_and_exit
fi
echo "single-repo start v1 test: passed"

# Mutate the stack-source working tree v1 -> v2. No commit needed: 'deployment
# restart' runs 'git pull' against the bare which is a no-op, and _copy_hooks
# reads the working tree directly via get_plugin_code_paths.
sed -i 's/"v1"/"v2"/' "$STACK_PATH_SINGLE/deploy/commands.py"
"$TEST_TARGET_SO" deployment --dir "$DEP1" restart --stack-path "$STACK_PATH_SINGLE"

if ! grep -q '"v2"' "$DEP1/hooks/commands.py"; then
  echo "single-repo restart re-copy test: FAILED (hooks/commands.py still v1)"
  cleanup_and_exit
fi
echo "single-repo restart re-copy test: passed"

marker_v2=$(cat "$DEP1/marker")
if [ "$marker_v2" != "v2" ]; then
  echo "single-repo restart re-execute test: FAILED (got: $marker_v2)"
  cleanup_and_exit
fi
echo "single-repo restart re-execute test: passed"

# Stop phase 1 deployment but keep the cluster for phase 2.
"$TEST_TARGET_SO" deployment --dir "$DEP1" \
  stop --delete-volumes --delete-namespace --skip-cluster-management
# ============================================================================
# Phase 2 — multi-repo create + start. Verifies that a stack with N pods, each
# from a separate repo, produces hooks/commands_0.py ... commands_{N-1}.py and
# that call_stack_deploy_start invokes every module's start().
# ============================================================================
echo "=== Phase 2: multi-repo create + start ==="

# Pod repos: stack.yml's pods[].repository = 'cerc-io/test-restart-pod-X'
# resolves (via get_plugin_code_paths) to
# $CERC_REPO_BASE_DIR/test-restart-pod-X/<pod_path>/stack/...
for label in a b; do
  POD_BARE=$WORK_DIR/pod-$label.git
  POD_CLONE=$CERC_REPO_BASE_DIR/test-restart-pod-$label
  git init -b main --bare "$POD_BARE"
  git clone "$POD_BARE" "$POD_CLONE"
  configure_git_identity "$POD_CLONE"
  mkdir -p "$POD_CLONE/stack/deploy"
  # For dict-form pods, get_pod_file_path resolves the compose file at
  # <pod_repo>/<pod_path>/docker-compose.yml — owned by the pod repo, not
  # the stack repo. get_plugin_code_paths adds the trailing 'stack/', so
  # commands.py lives at <pod_repo>/<pod_path>/stack/deploy/commands.py.
  cat > "$POD_CLONE/docker-compose.yml" <<EOF
services:
  test-restart-multi-$label:
    image: busybox:1.36
    command: ["sh", "-c", "sleep infinity"]
    restart: always
EOF
  # Each pod hook writes a distinct marker file so neither overwrites the
  # other when both start() hooks are loaded by call_stack_deploy_start.
  cat > "$POD_CLONE/stack/deploy/commands.py" <<EOF
from stack_orchestrator.deploy.deployment_context import DeploymentContext

def start(deployment_context: DeploymentContext):
    marker = deployment_context.deployment_dir / "marker-$label"
    marker.write_text("v1")
EOF
  git -C "$POD_CLONE" add .
  git -C "$POD_CLONE" commit -m "pod $label v1"
  git -C "$POD_CLONE" push -u origin main
done

# Stack repo
BARE2=$WORK_DIR/stack-multi.git
CLONE2=$WORK_DIR/stack-multi
git init -b main --bare "$BARE2"
git clone "$BARE2" "$CLONE2"
configure_git_identity "$CLONE2"
# For multi-repo (dict-form pods), the stack repo only owns stack.yml — pod
# compose files and hooks live in the per-pod repos under CERC_REPO_BASE_DIR.
mkdir -p "$CLONE2/stack-orchestrator/stacks"
cp -r "$DATA_DIR/stacks/test-restart-multi" "$CLONE2/stack-orchestrator/stacks/"
git -C "$CLONE2" add .
git -C "$CLONE2" commit -m "test-restart-multi v1"
git -C "$CLONE2" push -u origin main

STACK_PATH_MULTI=$CLONE2/stack-orchestrator/stacks/test-restart-multi
SPEC2=$WORK_DIR/spec-multi.yml
DEP2=$WORK_DIR/dep-multi
"$TEST_TARGET_SO" --stack "$STACK_PATH_MULTI" deploy --deploy-to k8s-kind init --output "$SPEC2"
"$TEST_TARGET_SO" --stack "$STACK_PATH_MULTI" deploy create --spec-file "$SPEC2" --deployment-dir "$DEP2"

# get_plugin_code_paths returns list(set(...)) so the index ordering is not
# guaranteed; we assert presence of both files rather than mapping each to
# a specific pod.
if [ ! -f "$DEP2/hooks/commands_0.py" ] || [ ! -f "$DEP2/hooks/commands_1.py" ]; then
  echo "multi-repo deploy create test: FAILED (hooks/commands_{0,1}.py missing)"
  ls -la "$DEP2/hooks/" || true
  cleanup_and_exit
fi
echo "multi-repo deploy create test: passed"

"$TEST_TARGET_SO" deployment --dir "$DEP2" start --skip-cluster-management
wait_for_k8s_pods_ready laconic-test-restart-multi

for label in a b; do
  if [ ! -f "$DEP2/marker-$label" ]; then
    echo "multi-repo start test: FAILED (marker-$label missing)"
    cleanup_and_exit
  fi
  val=$(cat "$DEP2/marker-$label")
  if [ "$val" != "v1" ]; then
    echo "multi-repo start test: FAILED (marker-$label content: $val)"
    cleanup_and_exit
  fi
done
echo "multi-repo start test: passed"

# Final teardown — destroy the cluster for the next CI run.
"$TEST_TARGET_SO" deployment --dir "$DEP2" \
  stop --delete-volumes --delete-namespace --perform-cluster-management
rm -rf "$WORK_DIR"
echo "Test passed"