From fb69cc58ffd015e187d208910b233c140128046c Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Tue, 3 Mar 2026 05:28:52 +0000 Subject: [PATCH 01/19] feat(k8s): map compose service ports to Kind extraPortMappings and support hostNetwork Kind's extraPortMappings only included ports 80/443 for Caddy. Compose service ports (RPC, gossip, UDP) were never forwarded, making them unreachable from the host. Also adds hostNetwork/dnsPolicy to the k8s pod spec when any compose service uses network_mode: host. Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/cluster_info.py | 13 ++++++++++ stack_orchestrator/deploy/k8s/helpers.py | 24 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index da24bdc2..161fbd03 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -394,6 +394,14 @@ class ClusterInfo: result.append(pv) return result + def _any_service_has_host_network(self): + for pod_name in self.parsed_pod_yaml_map: + pod = self.parsed_pod_yaml_map[pod_name] + for svc in pod.get("services", {}).values(): + if svc.get("network_mode") == "host": + return True + return False + # TODO: put things like image pull policy into an object-scope struct def get_deployment(self, image_pull_policy: Optional[str] = None): containers = [] @@ -568,6 +576,7 @@ class ClusterInfo: ) ) + use_host_network = self._any_service_has_host_network() template = client.V1PodTemplateSpec( metadata=client.V1ObjectMeta(annotations=annotations, labels=labels), spec=client.V1PodSpec( @@ -577,6 +586,10 @@ class ClusterInfo: affinity=affinity, tolerations=tolerations, runtime_class_name=self.spec.get_runtime_class(), + host_network=use_host_network or None, + dns_policy=( + "ClusterFirstWithHostNet" if use_host_network else None + ), ), ) spec = client.V1DeploymentSpec( diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 8b367f86..4d9cbe3b 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -683,11 +683,35 @@ def _generate_kind_port_mappings_from_services(parsed_pod_files): def _generate_kind_port_mappings(parsed_pod_files): port_definitions = [] + seen = set() # Map port 80 and 443 for the Caddy ingress controller (HTTPS support) for port_string in ["80", "443"]: port_definitions.append( f" - containerPort: {port_string}\n hostPort: {port_string}\n" ) + seen.add((port_string, "TCP")) + # Map ports declared in compose services + for pod in parsed_pod_files: + parsed_pod_file = parsed_pod_files[pod] + if "services" in parsed_pod_file: + for service_name in parsed_pod_file["services"]: + service_obj = parsed_pod_file["services"][service_name] + for port_entry in service_obj.get("ports", []): + port_str = str(port_entry) + protocol = "TCP" + if "/" in port_str: + port_str, proto = port_str.split("/", 1) + protocol = proto.upper() + if ":" in port_str: + port_str = port_str.split(":")[-1] + port_num = port_str.strip("'\"") + if (port_num, protocol) not in seen: + seen.add((port_num, protocol)) + port_definitions.append( + f" - containerPort: {port_num}\n" + f" hostPort: {port_num}\n" + f" protocol: {protocol}\n" + ) return ( "" if len(port_definitions) == 0 From f305214ce1b6a545c2d5c1533e7a3b5e6b391f01 Mon Sep 17 00:00:00 2001 From: "A. F. 
Dudley" Date: Tue, 3 Mar 2026 05:28:55 +0000 Subject: [PATCH 02/19] add local test runner script Co-Authored-By: Claude Opus 4.6 --- tests/scripts/run-test-local.sh | 53 +++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100755 tests/scripts/run-test-local.sh diff --git a/tests/scripts/run-test-local.sh b/tests/scripts/run-test-local.sh new file mode 100755 index 00000000..f6f32346 --- /dev/null +++ b/tests/scripts/run-test-local.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Run a test suite locally in an isolated venv. +# +# Usage: +# ./tests/scripts/run-test-local.sh +# +# Examples: +# ./tests/scripts/run-test-local.sh tests/webapp-test/run-webapp-test.sh +# ./tests/scripts/run-test-local.sh tests/smoke-test/run-smoke-test.sh +# ./tests/scripts/run-test-local.sh tests/k8s-deploy/run-deploy-test.sh +# +# The script creates a temporary venv, installs shiv, builds the laconic-so +# package, runs the requested test, then cleans up. + +set -euo pipefail + +if [ $# -lt 1 ]; then + echo "Usage: $0 [args...]" + exit 1 +fi + +TEST_SCRIPT="$1" +shift + +if [ ! -f "$TEST_SCRIPT" ]; then + echo "Error: $TEST_SCRIPT not found" + exit 1 +fi + +REPO_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)" +VENV_DIR=$(mktemp -d /tmp/so-test-XXXXXX) + +cleanup() { + echo "Cleaning up venv: $VENV_DIR" + rm -rf "$VENV_DIR" +} +trap cleanup EXIT + +cd "$REPO_DIR" + +echo "==> Creating venv in $VENV_DIR" +python3 -m venv "$VENV_DIR" +source "$VENV_DIR/bin/activate" + +echo "==> Installing shiv" +pip install -q shiv + +echo "==> Building laconic-so package" +./scripts/create_build_tag_file.sh +./scripts/build_shiv_package.sh + +echo "==> Running: $TEST_SCRIPT $*" +exec "./$TEST_SCRIPT" "$@" From 7cd5043a835ff8866b58bf7fd8e3173aaddb89af Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Wed, 4 Mar 2026 16:41:16 +0000 Subject: [PATCH 03/19] feat(k8s): add kind-mount-root for unified kind extraMount When kind-mount-root is set in spec.yml, emit a single extraMount mapping the root to /mnt instead of per-volume mounts. This allows adding new volumes without recreating the kind cluster. Volumes whose host path is under the root are skipped for individual extraMounts and their PV paths resolve to /mnt/{relative_path}. Volumes outside the root keep individual extraMounts as before. 
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/constants.py | 1 + stack_orchestrator/deploy/k8s/cluster_info.py | 6 +++++- stack_orchestrator/deploy/k8s/helpers.py | 21 ++++++++++++++++++- stack_orchestrator/deploy/spec.py | 3 +++ 4 files changed, 29 insertions(+), 2 deletions(-) diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py index 75bd0ebc..2c0c8de0 100644 --- a/stack_orchestrator/constants.py +++ b/stack_orchestrator/constants.py @@ -45,3 +45,4 @@ runtime_class_key = "runtime-class" high_memlock_runtime = "high-memlock" high_memlock_spec_filename = "high-memlock-spec.json" acme_email_key = "acme-email" +kind_mount_root_key = "kind-mount-root" diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index 161fbd03..dc967469 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -371,7 +371,11 @@ class ClusterInfo: if self.spec.is_kind_deployment(): host_path = client.V1HostPathVolumeSource( - path=get_kind_pv_bind_mount_path(volume_name) + path=get_kind_pv_bind_mount_path( + volume_name, + kind_mount_root=self.spec.get_kind_mount_root(), + host_path=volume_path, + ) ) else: host_path = client.V1HostPathVolumeSource(path=volume_path) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 4d9cbe3b..a316f0c7 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -440,7 +440,11 @@ def named_volumes_from_pod_files(parsed_pod_files): return named_volumes -def get_kind_pv_bind_mount_path(volume_name: str): +def get_kind_pv_bind_mount_path(volume_name: str, kind_mount_root: Optional[str] = None, + host_path: Optional[str] = None): + if kind_mount_root and host_path and host_path.startswith(kind_mount_root): + rel = os.path.relpath(host_path, kind_mount_root) + return f"/mnt/{rel}" return f"/mnt/{volume_name}" @@ -563,6 +567,7 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_definitions = [] volume_host_path_map = _get_host_paths_for_volumes(deployment_context) seen_host_path_mounts = set() # Track to avoid duplicate mounts + kind_mount_root = deployment_context.spec.get_kind_mount_root() # Cluster state backup for offline data recovery (unique per deployment) # etcd contains all k8s state; PKI certs needed to decrypt etcd offline @@ -583,6 +588,17 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): f" - hostPath: {pki_host_path}\n" f" containerPath: /etc/kubernetes/pki\n" ) + # When kind-mount-root is set, emit a single extraMount for the root. + # Individual volumes whose host path starts with the root are covered + # by this single mount and don't need their own extraMount entries. + mount_root_emitted = False + if kind_mount_root: + volume_definitions.append( + f" - hostPath: {kind_mount_root}\n" + f" containerPath: /mnt\n" + ) + mount_root_emitted = True + # Note these paths are relative to the location of the pod files (at present) # So we need to fix up to make them correct and absolute because kind assumes # relative to the cwd. 
@@ -642,6 +658,9 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_host_path_map[volume_name], deployment_dir, ) + # Skip individual extraMount if covered by mount root + if mount_root_emitted and str(host_path).startswith(kind_mount_root): + continue container_path = get_kind_pv_bind_mount_path( volume_name ) diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index e5647b04..c62d0aea 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -202,5 +202,8 @@ class Spec: def is_kind_deployment(self): return self.get_deployment_type() in [constants.k8s_kind_deploy_type] + def get_kind_mount_root(self): + return self.obj.get(constants.kind_mount_root_key) + def is_docker_deployment(self): return self.get_deployment_type() in [constants.compose_deploy_type] From 26dea540e98cf145ab9b83a5cfb648d56334f8e8 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Wed, 4 Mar 2026 17:13:08 +0000 Subject: [PATCH 04/19] fix(k8s): use deployment namespace for pod and container lookups pods_in_deployment() and containers_in_pod() were hardcoded to search the "default" namespace, but deployments are created in a per-deployment namespace (laconic-{name}). This caused logs() to report "Pods not running" even when pods were healthy. Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 4 ++-- stack_orchestrator/deploy/k8s/helpers.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index f7f8ad43..58801d33 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -574,14 +574,14 @@ class K8sDeployer(Deployer): def logs(self, services, tail, follow, stream): self.connect_api() - pods = pods_in_deployment(self.core_api, self.cluster_info.app_name) + pods = pods_in_deployment(self.core_api, self.cluster_info.app_name, namespace=self.k8s_namespace) if len(pods) > 1: print("Warning: more than one pod in the deployment") if len(pods) == 0: log_data = "******* Pods not running ********\n" else: k8s_pod_name = pods[0] - containers = containers_in_pod(self.core_api, k8s_pod_name) + containers = containers_in_pod(self.core_api, k8s_pod_name, namespace=self.k8s_namespace) # If pod not started, logs request below will throw an exception try: log_data = "" diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index a316f0c7..c7b9703d 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -393,10 +393,10 @@ def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): raise DeployerException(f"kind load docker-image failed: {result}") -def pods_in_deployment(core_api: client.CoreV1Api, deployment_name: str): +def pods_in_deployment(core_api: client.CoreV1Api, deployment_name: str, namespace: str = "default"): pods = [] pod_response = core_api.list_namespaced_pod( - namespace="default", label_selector=f"app={deployment_name}" + namespace=namespace, label_selector=f"app={deployment_name}" ) if opts.o.debug: print(f"pod_response: {pod_response}") @@ -406,10 +406,10 @@ def pods_in_deployment(core_api: client.CoreV1Api, deployment_name: str): return pods -def containers_in_pod(core_api: client.CoreV1Api, pod_name: str) -> List[str]: +def containers_in_pod(core_api: client.CoreV1Api, pod_name: str, namespace: str = "default") -> 
List[str]: containers: List[str] = [] pod_response = cast( - client.V1Pod, core_api.read_namespaced_pod(pod_name, namespace="default") + client.V1Pod, core_api.read_namespaced_pod(pod_name, namespace=namespace) ) if opts.o.debug: print(f"pod_response: {pod_response}") From d090f2064e60ac0e1ffa10069b7cdcb437b0602b Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 08:47:12 +0000 Subject: [PATCH 05/19] docs: annotate spec.yml config layering conventions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compose file owns application defaults. spec.yml config: section is for deployment-specific overrides only (hostnames, IPs, secrets). Start scripts should not have their own defaults — they read what the compose file provides. Annotations added: - CLAUDE.md: config layering table and anti-pattern callout - spec.py: Spec class docstring with good/bad config examples - deployment_create.py: _write_config_file docstring Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 27 +++++++++++++++++ .../deploy/deployment_create.py | 12 ++++++++ stack_orchestrator/deploy/spec.py | 29 +++++++++++++++++++ 3 files changed, 68 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 845cbd22..33054f8d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -114,6 +114,33 @@ One Kind cluster per host by design. Never request or expect separate clusters. - `helpers.py`: `create_cluster()`, etcd cleanup, kind operations - `cluster_info.py`: K8s resource generation (Deployment, Service, Ingress) +## spec.yml: Config Layering + +**The compose file is the single source of truth for application defaults.** + +The configuration chain is: compose defaults → spec.yml overrides → container env. + +| Layer | Owns | Example | +|-------|------|---------| +| **compose file** | All env vars and their defaults | `RPC_PORT: ${RPC_PORT:-8899}` | +| **spec.yml config:** | Deployment-specific overrides only | `GOSSIP_HOST: 10.0.0.1` | +| **start script** | Reads env vars, no defaults of its own | `${RPC_PORT}` | + +**What goes in spec.yml config:** +- Values unique to this deployment (hostnames, IPs, endpoints) +- Secrets (`$generate:hex:32$`) +- Overrides that differ from the compose default for this specific deployment + +**What does NOT go in spec.yml config:** +- Application defaults (ports, log levels, intervals, feature flags) +- Values that would be the same across all deployments of this stack +- Every env var the service accepts — that's the compose file's job + +**Anti-pattern:** Dumping all env vars from the compose file into spec.yml. +This creates three sources of truth (compose, spec, start script) that +inevitably diverge. If someone changes the default in the compose file, +spec.yml still has the old value and silently overrides it. + ## Insights and Observations ### Design Principles diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index 511445be..0e2effc2 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -639,6 +639,18 @@ def create_registry_secret(spec: Spec, deployment_name: str) -> Optional[str]: def _write_config_file( spec_file: Path, config_env_file: Path, deployment_name: Optional[str] = None ): + """Write spec.yml config: entries to config.env. + + The config: section in spec.yml should contain only deployment-specific + overrides — values that differ between deployments (hostnames, endpoints, + credentials, secrets via $generate:...$). 
+ + Application defaults (ports, log levels, feature flags, tuning params) + belong in the compose file's environment section. The compose file is + the single source of truth for what env vars a service accepts and + their default values. spec.yml overrides those defaults for a specific + deployment. + """ spec_content = get_parsed_deployment_spec(spec_file) config_vars = spec_content.get("config", {}) or {} diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index c62d0aea..072b035c 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -73,6 +73,35 @@ class Resources: class Spec: + """Deployment spec (spec.yml) — describes WHERE and HOW to deploy a stack. + + A spec.yml contains deployment-specific infrastructure configuration: + - stack: path to the stack definition + - deploy-to: target platform (k8s-kind, k8s, compose) + - network: ports, http-proxy, acme-email + - resources: CPU/memory limits and reservations + - security: privileged, capabilities, memlock + - volumes: host path mappings for persistent data + - configmaps: directories mounted as k8s ConfigMaps + - config: deployment-specific env var OVERRIDES (see below) + + The config: section is for deployment-specific values only — things + that differ between deployments (hostnames, endpoints, secrets). + Application defaults belong in the compose file's environment section, + not here. If a value would be the same across all deployments of this + stack, it belongs in the compose file, not in spec.yml. + + Good config: entries (deployment-specific): + VALIDATOR_ENTRYPOINT: my-cluster.example.com:8001 + PUBLIC_RPC_ADDRESS: my-node.example.com:8899 + GOSSIP_HOST: 10.0.0.1 + + Bad config: entries (these are application defaults): + RPC_PORT: '8899' # same everywhere, belongs in compose + LIMIT_LEDGER_SIZE: '50000000' # same everywhere, belongs in compose + RUST_LOG: info # same everywhere, belongs in compose + """ + obj: typing.Any file_path: Optional[Path] From d4dcbedd48bd3fe984fbc3fe17c6a51258aed6f5 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 09:55:24 +0000 Subject: [PATCH 06/19] bug: deploy create doesn't auto-generate volume mappings for new pods Co-Authored-By: Claude Opus 4.6 --- TODO.md | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/TODO.md b/TODO.md index 349530c8..65439ab5 100644 --- a/TODO.md +++ b/TODO.md @@ -7,6 +7,25 @@ We need an "update stack" command in stack orchestrator and cleaner documentatio **Context**: Currently, `deploy init` generates a spec file and `deploy create` creates a deployment directory. The `deployment update` command (added by Thomas Lackey) only syncs env vars and restarts - it doesn't regenerate configurations. There's a gap in the workflow for updating stack configurations after initial deployment. +## Bugs + +### `deploy create` doesn't auto-generate volume mappings for new pods + +When a new pod is added to `stack.yml` (e.g. `monitoring`), `deploy create` +does not generate default host path mappings in spec.yml for the new pod's +volumes. The deployment then fails at scheduling because the PVCs don't exist. + +**Expected**: `deploy create` enumerates all volumes from all compose files +in the stack and generates default host paths for any that aren't already +mapped in the spec.yml `volumes:` section. + +**Actual**: Only volumes already in spec.yml get PVs. New volumes are silently +missing, causing `FailedScheduling: persistentvolumeclaim not found`. 
+
+**Workaround**: Manually add volume entries to spec.yml and create host dirs.
+
+**Files**: `deployment_create.py` (`_write_config_file`, volume handling)
+
 ## Architecture Refactoring
 
 ### Separate Deployer from Stack Orchestrator CLI

From eae4c3cdffac48d2b9c8ea685dc634db1de17060 Mon Sep 17 00:00:00 2001
From: "A. F. Dudley"
Date: Sat, 7 Mar 2026 10:26:10 +0000
Subject: [PATCH 08/19] feat(k8s): per-service resource layering in deployer

Resolve container resources using layered priority:

1. spec.yml per-container override (resources.containers.<container_name>)
2. Compose file deploy.resources block
3. spec.yml global resources
4. DEFAULT_CONTAINER_RESOURCES fallback

This prevents monitoring sidecars from inheriting the validator's
resource requests (e.g., 256G memory). Each service gets appropriate
resources from its compose definition unless explicitly overridden.

Note: existing deployments with a global resources block in spec.yml
can remove it once compose files declare per-service defaults.
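
For example (service names and sizes are illustrative), a sidecar
declares its own defaults in the compose file:

    services:
      node-exporter:
        deploy:
          resources:
            limits:
              memory: 512M

and spec.yml overrides only the container that needs it:

    resources:
      containers:
        validator:
          limits:
            memory: 256G

node-exporter then resolves to its compose-declared 512M instead of
inheriting the validator's global request.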
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/cluster_info.py | 44 +++++++++++++++++-- stack_orchestrator/deploy/spec.py | 21 +++++++++ 2 files changed, 61 insertions(+), 4 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index da24bdc2..2ebf96f2 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -394,13 +394,43 @@ class ClusterInfo: result.append(pv) return result + def _any_service_has_host_network(self): + for pod_name in self.parsed_pod_yaml_map: + pod = self.parsed_pod_yaml_map[pod_name] + for svc in pod.get("services", {}).values(): + if svc.get("network_mode") == "host": + return True + return False + + def _resolve_container_resources( + self, container_name: str, service_info: dict, global_resources: Resources + ) -> Resources: + """Resolve resources for a container using layered priority. + + Priority: spec per-container > compose deploy.resources + > spec global > DEFAULT + """ + # 1. Check spec.yml for per-container override + per_container = self.spec.get_container_resources_for(container_name) + if per_container: + return per_container + + # 2. Check compose service_info for deploy.resources + deploy_block = service_info.get("deploy", {}) + compose_resources = deploy_block.get("resources", {}) if deploy_block else {} + if compose_resources: + return Resources(compose_resources) + + # 3. Fall back to spec.yml global (already resolved with DEFAULT fallback) + return global_resources + # TODO: put things like image pull policy into an object-scope struct def get_deployment(self, image_pull_policy: Optional[str] = None): containers = [] services = {} - resources = self.spec.get_container_resources() - if not resources: - resources = DEFAULT_CONTAINER_RESOURCES + global_resources = self.spec.get_container_resources() + if not global_resources: + global_resources = DEFAULT_CONTAINER_RESOURCES for pod_name in self.parsed_pod_yaml_map: pod = self.parsed_pod_yaml_map[pod_name] services = pod["services"] @@ -483,6 +513,9 @@ class ClusterInfo: ) ) ] + container_resources = self._resolve_container_resources( + container_name, service_info, global_resources + ) container = client.V1Container( name=container_name, image=image_to_use, @@ -501,7 +534,7 @@ class ClusterInfo: if self.spec.get_capabilities() else None, ), - resources=to_k8s_resource_requirements(resources), + resources=to_k8s_resource_requirements(container_resources), ) containers.append(container) volumes = volumes_for_pod_files( @@ -568,6 +601,7 @@ class ClusterInfo: ) ) + use_host_network = self._any_service_has_host_network() template = client.V1PodTemplateSpec( metadata=client.V1ObjectMeta(annotations=annotations, labels=labels), spec=client.V1PodSpec( @@ -577,6 +611,8 @@ class ClusterInfo: affinity=affinity, tolerations=tolerations, runtime_class_name=self.spec.get_runtime_class(), + host_network=use_host_network or None, + dns_policy=("ClusterFirstWithHostNet" if use_host_network else None), ), ) spec = client.V1DeploymentSpec( diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index e5647b04..bd62779e 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -120,6 +120,27 @@ class Spec: self.obj.get(constants.resources_key, {}).get("containers", {}) ) + def get_container_resources_for( + self, container_name: str + ) -> typing.Optional[Resources]: + """Look up per-container resource overrides from 
spec.yml.
+
+        Checks resources.containers.<container_name> in the spec. Returns None
+        if no per-container override exists (caller falls back to other sources).
+        """
+        containers_block = self.obj.get(constants.resources_key, {}).get(
+            "containers", {}
+        )
+        if container_name in containers_block:
+            entry = containers_block[container_name]
+            # Only treat it as a per-container override if it's a dict with
+            # reservations/limits nested inside (not a top-level global key)
+            if isinstance(entry, dict) and (
+                "reservations" in entry or "limits" in entry
+            ):
+                return Resources(entry)
+        return None
+
     def get_volume_resources(self):
         return Resources(
             self.obj.get(constants.resources_key, {}).get(constants.volumes_key, {})
         )

From b6d6ad81455d91479884f2ffd05af41c8dc1d339 Mon Sep 17 00:00:00 2001
From: "A. F. Dudley"
Date: Wed, 4 Mar 2026 16:41:16 +0000
Subject: [PATCH 09/19] feat(k8s): add kind-mount-root for unified kind
 extraMount

When kind-mount-root is set in spec.yml, emit a single extraMount
mapping the root to /mnt instead of per-volume mounts. This allows
adding new volumes without recreating the kind cluster.

Volumes whose host path is under the root are skipped for individual
extraMounts and their PV paths resolve to /mnt/{relative_path}. Volumes
outside the root keep individual extraMounts as before.

Co-Authored-By: Claude Opus 4.6
---
 stack_orchestrator/constants.py               |  1 +
 stack_orchestrator/deploy/k8s/cluster_info.py |  6 +++++-
 stack_orchestrator/deploy/k8s/helpers.py      | 21 ++++++++++++++++++-
 stack_orchestrator/deploy/spec.py             |  3 +++
 4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/stack_orchestrator/constants.py b/stack_orchestrator/constants.py
index 75bd0ebc..2c0c8de0 100644
--- a/stack_orchestrator/constants.py
+++ b/stack_orchestrator/constants.py
@@ -45,3 +45,4 @@ runtime_class_key = "runtime-class"
 high_memlock_runtime = "high-memlock"
 high_memlock_spec_filename = "high-memlock-spec.json"
 acme_email_key = "acme-email"
+kind_mount_root_key = "kind-mount-root"
diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py
index 2ebf96f2..818ffa25 100644
--- a/stack_orchestrator/deploy/k8s/cluster_info.py
+++ b/stack_orchestrator/deploy/k8s/cluster_info.py
@@ -371,7 +371,11 @@ class ClusterInfo:
 
         if self.spec.is_kind_deployment():
             host_path = client.V1HostPathVolumeSource(
-                path=get_kind_pv_bind_mount_path(volume_name)
+                path=get_kind_pv_bind_mount_path(
+                    volume_name,
+                    kind_mount_root=self.spec.get_kind_mount_root(),
+                    host_path=volume_path,
+                )
             )
         else:
             host_path = client.V1HostPathVolumeSource(path=volume_path)
diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py
index 8b367f86..95e53d73 100644
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -440,7 +440,11 @@ def named_volumes_from_pod_files(parsed_pod_files):
     return named_volumes
 
 
-def get_kind_pv_bind_mount_path(volume_name: str):
+def get_kind_pv_bind_mount_path(volume_name: str, kind_mount_root: Optional[str] = None,
+                                host_path: Optional[str] = None):
+    if kind_mount_root and host_path and host_path.startswith(kind_mount_root):
+        rel = os.path.relpath(host_path, kind_mount_root)
+        return f"/mnt/{rel}"
     return f"/mnt/{volume_name}"
 
 
@@ -563,6 +567,7 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context):
     volume_definitions = []
     volume_host_path_map = _get_host_paths_for_volumes(deployment_context)
     seen_host_path_mounts = set()  # Track to avoid duplicate mounts
+
kind_mount_root = deployment_context.spec.get_kind_mount_root() # Cluster state backup for offline data recovery (unique per deployment) # etcd contains all k8s state; PKI certs needed to decrypt etcd offline @@ -583,6 +588,17 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): f" - hostPath: {pki_host_path}\n" f" containerPath: /etc/kubernetes/pki\n" ) + # When kind-mount-root is set, emit a single extraMount for the root. + # Individual volumes whose host path starts with the root are covered + # by this single mount and don't need their own extraMount entries. + mount_root_emitted = False + if kind_mount_root: + volume_definitions.append( + f" - hostPath: {kind_mount_root}\n" + f" containerPath: /mnt\n" + ) + mount_root_emitted = True + # Note these paths are relative to the location of the pod files (at present) # So we need to fix up to make them correct and absolute because kind assumes # relative to the cwd. @@ -642,6 +658,9 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_host_path_map[volume_name], deployment_dir, ) + # Skip individual extraMount if covered by mount root + if mount_root_emitted and str(host_path).startswith(kind_mount_root): + continue container_path = get_kind_pv_bind_mount_path( volume_name ) diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index bd62779e..c4cde6f8 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -223,5 +223,8 @@ class Spec: def is_kind_deployment(self): return self.get_deployment_type() in [constants.k8s_kind_deploy_type] + def get_kind_mount_root(self): + return self.obj.get(constants.kind_mount_root_key) + def is_docker_deployment(self): return self.get_deployment_type() in [constants.compose_deploy_type] From 929bdab8a421a96f0ef3bb4db6ad0b8a5cfea383 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 12:58:04 +0000 Subject: [PATCH 10/19] fix(k8s): add HostToContainer mount propagation to kind-mount-root The kind-mount-root extraMount entry used kind's default propagation (None), so new bind mounts under the root on the host (e.g. zvols mounted under /srv/kind) were not visible inside the kind node until restart. Setting propagation to HostToContainer makes host-side mount changes propagate into the kind node automatically. 
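
The emitted entry in the kind node config (hostPath shown with an
example root of /srv/kind) now reads:

    extraMounts:
      - hostPath: /srv/kind
        containerPath: /mnt
        propagation: HostToContainer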
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/helpers.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 95e53d73..ec136233 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -440,8 +440,11 @@ def named_volumes_from_pod_files(parsed_pod_files): return named_volumes -def get_kind_pv_bind_mount_path(volume_name: str, kind_mount_root: Optional[str] = None, - host_path: Optional[str] = None): +def get_kind_pv_bind_mount_path( + volume_name: str, + kind_mount_root: Optional[str] = None, + host_path: Optional[str] = None, +): if kind_mount_root and host_path and host_path.startswith(kind_mount_root): rel = os.path.relpath(host_path, kind_mount_root) return f"/mnt/{rel}" @@ -596,6 +599,7 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_definitions.append( f" - hostPath: {kind_mount_root}\n" f" containerPath: /mnt\n" + f" propagation: HostToContainer\n" ) mount_root_emitted = True @@ -658,8 +662,10 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_host_path_map[volume_name], deployment_dir, ) - # Skip individual extraMount if covered by mount root - if mount_root_emitted and str(host_path).startswith(kind_mount_root): + # Skip if covered by mount root + if mount_root_emitted and str(host_path).startswith( + kind_mount_root + ): continue container_path = get_kind_pv_bind_mount_path( volume_name From a11d40f2f340011f06d918c6552e1ac4ba8ad360 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 13:07:12 +0000 Subject: [PATCH 11/19] fix(k8s): add HostToContainer mount propagation to kind extraMounts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without propagation, rbind submounts on the host (e.g., XFS zvol at /srv/kind/solana) are invisible inside the kind node — it sees the underlying filesystem (ZFS) instead. This causes agave's io_uring to deadlock on ZFS transaction commits (D-state in dsl_dir_tempreserve_space). HostToContainer propagation ensures host submounts propagate into the kind node, so /mnt/solana correctly resolves to the XFS zvol. 
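
A sketch of how to verify (node container name assumes kind's default
cluster name; device and paths are examples):

    # On the host, mount the XFS zvol under a path covered by an extraMount
    mount /dev/zvol/tank/solana /srv/kind/solana
    # Inside the kind node the submount is now visible without a restart
    docker exec kind-control-plane findmnt -n -o FSTYPE /mnt/solana
    # -> xfs; without propagation this reports the parent filesystem (zfs)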
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/helpers.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 8b367f86..ac4e8603 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -573,14 +573,18 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): Path(f"./data/{backup_subdir}/etcd"), deployment_dir ) volume_definitions.append( - f" - hostPath: {etcd_host_path}\n" f" containerPath: /var/lib/etcd\n" + f" - hostPath: {etcd_host_path}\n" + f" containerPath: /var/lib/etcd\n" + f" propagation: HostToContainer\n" ) pki_host_path = _make_absolute_host_path( Path(f"./data/{backup_subdir}/pki"), deployment_dir ) volume_definitions.append( - f" - hostPath: {pki_host_path}\n" f" containerPath: /etc/kubernetes/pki\n" + f" - hostPath: {pki_host_path}\n" + f" containerPath: /etc/kubernetes/pki\n" + f" propagation: HostToContainer\n" ) # Note these paths are relative to the location of the pod files (at present) @@ -621,6 +625,7 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_definitions.append( f" - hostPath: {host_path}\n" f" containerPath: {container_path}\n" + f" propagation: HostToContainer\n" ) if opts.o.debug: print(f"Added host path mount: {host_path}") @@ -648,6 +653,7 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context): volume_definitions.append( f" - hostPath: {host_path}\n" f" containerPath: {container_path}\n" + f" propagation: HostToContainer\n" ) return ( "" @@ -703,7 +709,11 @@ def _generate_high_memlock_spec_mount(deployment_dir: Path): references an absolute path. """ spec_path = deployment_dir.joinpath(constants.high_memlock_spec_filename).resolve() - return f" - hostPath: {spec_path}\n" f" containerPath: {spec_path}\n" + return ( + f" - hostPath: {spec_path}\n" + f" containerPath: {spec_path}\n" + f" propagation: HostToContainer\n" + ) def generate_high_memlock_spec_json(): From 7f205732f2eadf01548ba2839a530d3cbce58ddb Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sat, 7 Mar 2026 17:56:13 +0000 Subject: [PATCH 12/19] fix(k8s): expand etcd cleanup whitelist to preserve core cluster services _clean_etcd_keeping_certs() only preserved /registry/secrets/caddy-system, deleting everything else including the kubernetes ClusterIP service in the default namespace. When kind recreated the cluster with the cleaned etcd, kube-apiserver saw existing data and skipped bootstrapping the service. kindnet panicked on KUBERNETES_SERVICE_HOST missing, blocking all pod networking. Expand the whitelist to also preserve: - /registry/services/specs/default/kubernetes - /registry/services/endpoints/default/kubernetes Loop over multiple prefixes instead of a single etcdctl get --prefix call. See docs/bug-laconic-so-etcd-cleanup.md in biscayne-agave-runbook. 
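
A quick post-restart sanity check (illustrative):

    # Must exist, or kindnet panics on a missing KUBERNETES_SERVICE_HOST
    kubectl get svc kubernetes -n default
    # Pod networking should come up (kindnet Running, pods scheduling)
    kubectl -n kube-system get pods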
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/helpers.py | 28 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index ac4e8603..85f3d5f7 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -148,8 +148,16 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: etcd_image = "gcr.io/etcd-development/etcd:v3.5.9" temp_dir = "/tmp/laconic-etcd-cleanup" - # Whitelist: prefixes to KEEP - everything else gets deleted - keep_prefixes = "/registry/secrets/caddy-system" + # Whitelist: prefixes to KEEP - everything else gets deleted. + # Must include core cluster resources (kubernetes service, kube-system + # secrets) or kindnet panics on restart — KUBERNETES_SERVICE_HOST is + # injected from the kubernetes ClusterIP service in default namespace. + keep_prefixes = [ + "/registry/secrets/caddy-system", + "/registry/services/specs/default/kubernetes", + "/registry/services/endpoints/default/kubernetes", + ] + keep_prefixes_str = " ".join(keep_prefixes) # The etcd image is distroless (no shell). We extract the statically-linked # etcdctl binary and run it from alpine which has shell + jq support. @@ -195,13 +203,21 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool: sleep 3 # Use alpine with extracted etcdctl to run commands (alpine has shell + jq) - # Export caddy secrets + # Export whitelisted keys (caddy TLS certs + core cluster services) docker run --rm \ -v {temp_dir}:/backup \ --network container:laconic-etcd-cleanup \ - $ALPINE_IMAGE sh -c \ - '/backup/etcdctl get --prefix "{keep_prefixes}" -w json \ - > /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json' + $ALPINE_IMAGE sh -c ' + apk add --no-cache jq >/dev/null 2>&1 + echo "[]" > /backup/all-kvs.json + for prefix in {keep_prefixes_str}; do + /backup/etcdctl get --prefix "$prefix" -w json 2>/dev/null \ + | jq ".kvs // []" >> /backup/all-kvs.json || true + done + jq -s "add" /backup/all-kvs.json \ + | jq "{{kvs: .}}" > /backup/kept.json 2>/dev/null \ + || echo "{{}}" > /backup/kept.json + ' # Delete ALL registry keys docker run --rm \ From 806c1bb723f3a627a6efcee4414a5459ce8ba860 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 8 Mar 2026 02:33:20 +0000 Subject: [PATCH 13/19] refactor: rename `deployment update` to `deployment update-envs` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The update command only patches environment variables and adds a restart annotation. It does not update ports, volumes, configmaps, or any other deployment spec. The old name was misleading — it implied a full spec update, causing operators to expect changes that never took effect. 
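
The renamed invocation (deployment directory path is illustrative):

    laconic-so deployment --dir ./my-deployment update-envs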
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/compose/deploy_docker.py | 2 +- stack_orchestrator/deploy/deploy.py | 4 ++-- stack_orchestrator/deploy/deployer.py | 2 +- stack_orchestrator/deploy/deployment.py | 8 ++++---- stack_orchestrator/deploy/k8s/deploy_k8s.py | 2 +- stack_orchestrator/deploy/webapp/util.py | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/stack_orchestrator/deploy/compose/deploy_docker.py b/stack_orchestrator/deploy/compose/deploy_docker.py index c6397aad..fa0ac1d4 100644 --- a/stack_orchestrator/deploy/compose/deploy_docker.py +++ b/stack_orchestrator/deploy/compose/deploy_docker.py @@ -62,7 +62,7 @@ class DockerDeployer(Deployer): except DockerException as e: raise DeployerException(e) - def update(self): + def update_envs(self): if not opts.o.dry_run: try: return self.docker.compose.restart() diff --git a/stack_orchestrator/deploy/deploy.py b/stack_orchestrator/deploy/deploy.py index 86c1856c..f2bf977c 100644 --- a/stack_orchestrator/deploy/deploy.py +++ b/stack_orchestrator/deploy/deploy.py @@ -182,8 +182,8 @@ def status_operation(ctx): ctx.obj.deployer.status() -def update_operation(ctx): - ctx.obj.deployer.update() +def update_envs_operation(ctx): + ctx.obj.deployer.update_envs() def ps_operation(ctx): diff --git a/stack_orchestrator/deploy/deployer.py b/stack_orchestrator/deploy/deployer.py index d8fb656b..11fb6592 100644 --- a/stack_orchestrator/deploy/deployer.py +++ b/stack_orchestrator/deploy/deployer.py @@ -28,7 +28,7 @@ class Deployer(ABC): pass @abstractmethod - def update(self): + def update_envs(self): pass @abstractmethod diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index b76e6486..902780fb 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -31,7 +31,7 @@ from stack_orchestrator.deploy.deploy import ( exec_operation, logs_operation, create_deploy_context, - update_operation, + update_envs_operation, ) from stack_orchestrator.deploy.deploy_types import DeployCommandContext from stack_orchestrator.deploy.deployment_context import DeploymentContext @@ -210,11 +210,11 @@ def status(ctx): status_operation(ctx) -@command.command() +@command.command(name="update-envs") @click.pass_context -def update(ctx): +def update_envs(ctx): ctx.obj = make_deploy_context(ctx) - update_operation(ctx) + update_envs_operation(ctx) @command.command() diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index f7f8ad43..3b235538 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -598,7 +598,7 @@ class K8sDeployer(Deployer): log_data = "******* No logs available ********\n" return log_stream_from_string(log_data) - def update(self): + def update_envs(self): self.connect_api() ref_deployment = self.cluster_info.get_deployment() if not ref_deployment or not ref_deployment.metadata: diff --git a/stack_orchestrator/deploy/webapp/util.py b/stack_orchestrator/deploy/webapp/util.py index 3c536477..84accbcd 100644 --- a/stack_orchestrator/deploy/webapp/util.py +++ b/stack_orchestrator/deploy/webapp/util.py @@ -696,7 +696,7 @@ def deploy_to_k8s(deploy_record, deployment_dir, recreate, logger): if not deploy_record: commands_to_run = ["start"] else: - commands_to_run = ["update"] + commands_to_run = ["update-envs"] for command in commands_to_run: logger.log(f"Running {command} command on deployment dir: {deployment_dir}") From 
cc6acd5f0940c0f77e7c9faa5ec4f3f3c05a7415 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 8 Mar 2026 02:41:25 +0000 Subject: [PATCH 14/19] fix: default skip-cluster-management to true Destroying the kind cluster on stop/start is almost never the intent. The cluster holds PVs, ConfigMaps, and networking state that are expensive to recreate. Default to preserving the cluster; pass --perform-cluster-management explicitly when a full teardown is needed. Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/deployment.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index 902780fb..1182d23f 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -114,7 +114,7 @@ def up(ctx, stay_attached, skip_cluster_management, extra_args): ) @click.option( "--skip-cluster-management/--perform-cluster-management", - default=False, + default=True, help="Skip cluster initialization/tear-down (only for kind-k8s deployments)", ) @click.argument("extra_args", nargs=-1) # help: command: up @@ -132,7 +132,7 @@ def start(ctx, stay_attached, skip_cluster_management, extra_args): ) @click.option( "--skip-cluster-management/--perform-cluster-management", - default=False, + default=True, help="Skip cluster initialization/tear-down (only for kind-k8s deployments)", ) @click.argument("extra_args", nargs=-1) # help: command: down @@ -151,7 +151,7 @@ def down(ctx, delete_volumes, skip_cluster_management, extra_args): ) @click.option( "--skip-cluster-management/--perform-cluster-management", - default=False, + default=True, help="Skip cluster initialization/tear-down (only for kind-k8s deployments)", ) @click.argument("extra_args", nargs=-1) # help: command: down From 1da69cf739352ffc8632c7612dd5a2b3541b8afc Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 8 Mar 2026 04:15:03 +0000 Subject: [PATCH 15/19] fix(k8s): make deploy_k8s.py idempotent with create-or-replace semantics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All K8s resource creation in deploy_k8s.py now uses try-create, catch ApiException(409), then replace — matching the pattern already used for secrets in deployment_create.py. This allows `deployment start` to be safely re-run without 409 Conflict errors. Resources made idempotent: - Deployment (create_namespaced_deployment → replace on 409) - Service (create_namespaced_service → replace on 409) - Ingress (create_namespaced_ingress → replace on 409) - NodePort services (same as Service) - ConfigMap (create_namespaced_config_map → replace on 409) - PV/PVC: bare `except: pass` replaced with explicit ApiException catch for 404 Extracted _ensure_deployment(), _ensure_service(), _ensure_ingress(), and _ensure_config_map() helpers to keep cyclomatic complexity in check. 
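
The shared shape of all four helpers, condensed (Service shown; the
diff below is authoritative):

    try:
        core_api.create_namespaced_service(namespace=ns, body=service)
    except ApiException as e:
        if e.status == 409:  # already exists: fall back to replace
            core_api.replace_namespaced_service(
                name=service.metadata.name, namespace=ns, body=service
            )
        else:
            raise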
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 149 ++++++++++++++------ 1 file changed, 104 insertions(+), 45 deletions(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 3b235538..c0272be7 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -192,6 +192,99 @@ class K8sDeployer(Deployer): else: raise + def _ensure_config_map(self, cfg_map): + """Create or replace a ConfigMap (idempotent).""" + try: + resp = self.core_api.create_namespaced_config_map( + body=cfg_map, namespace=self.k8s_namespace + ) + if opts.o.debug: + print(f"ConfigMap created: {resp}") + except ApiException as e: + if e.status == 409: + resp = self.core_api.replace_namespaced_config_map( + name=cfg_map.metadata.name, + namespace=self.k8s_namespace, + body=cfg_map, + ) + if opts.o.debug: + print(f"ConfigMap updated: {resp}") + else: + raise + + def _ensure_deployment(self, deployment): + """Create or replace a Deployment (idempotent).""" + try: + resp = cast( + client.V1Deployment, + self.apps_api.create_namespaced_deployment( + body=deployment, namespace=self.k8s_namespace + ), + ) + if opts.o.debug: + print("Deployment created:") + except ApiException as e: + if e.status == 409: + resp = cast( + client.V1Deployment, + self.apps_api.replace_namespaced_deployment( + name=deployment.metadata.name, + namespace=self.k8s_namespace, + body=deployment, + ), + ) + if opts.o.debug: + print("Deployment updated:") + else: + raise + if opts.o.debug: + meta = resp.metadata + spec = resp.spec + if meta and spec and spec.template.spec: + containers = spec.template.spec.containers + img = containers[0].image if containers else None + print(f"{meta.namespace} {meta.name} {meta.generation} {img}") + + def _ensure_service(self, service, kind: str = "Service"): + """Create or replace a Service (idempotent).""" + try: + resp = self.core_api.create_namespaced_service( + namespace=self.k8s_namespace, body=service + ) + if opts.o.debug: + print(f"{kind} created: {resp}") + except ApiException as e: + if e.status == 409: + resp = self.core_api.replace_namespaced_service( + name=service.metadata.name, + namespace=self.k8s_namespace, + body=service, + ) + if opts.o.debug: + print(f"{kind} updated: {resp}") + else: + raise + + def _ensure_ingress(self, ingress): + """Create or replace an Ingress (idempotent).""" + try: + resp = self.networking_api.create_namespaced_ingress( + namespace=self.k8s_namespace, body=ingress + ) + if opts.o.debug: + print(f"Ingress created: {resp}") + except ApiException as e: + if e.status == 409: + resp = self.networking_api.replace_namespaced_ingress( + name=ingress.metadata.name, + namespace=self.k8s_namespace, + body=ingress, + ) + if opts.o.debug: + print(f"Ingress updated: {resp}") + else: + raise + def _create_volume_data(self): # Create the host-path-mounted PVs for this deployment pvs = self.cluster_info.get_pvs() @@ -208,8 +301,9 @@ class K8sDeployer(Deployer): print("PVs already present:") print(f"{pv_resp}") continue - except: # noqa: E722 - pass + except ApiException as e: + if e.status != 404: + raise pv_resp = self.core_api.create_persistent_volume(body=pv) if opts.o.debug: @@ -232,8 +326,9 @@ class K8sDeployer(Deployer): print("PVCs already present:") print(f"{pvc_resp}") continue - except: # noqa: E722 - pass + except ApiException as e: + if e.status != 404: + raise pvc_resp = self.core_api.create_namespaced_persistent_volume_claim( body=pvc, 
namespace=self.k8s_namespace @@ -248,12 +343,7 @@ class K8sDeployer(Deployer): if opts.o.debug: print(f"Sending this ConfigMap: {cfg_map}") if not opts.o.dry_run: - cfg_rsp = self.core_api.create_namespaced_config_map( - body=cfg_map, namespace=self.k8s_namespace - ) - if opts.o.debug: - print("ConfigMap created:") - print(f"{cfg_rsp}") + self._ensure_config_map(cfg_map) def _create_deployment(self): # Process compose files into a Deployment @@ -264,34 +354,13 @@ class K8sDeployer(Deployer): if opts.o.debug: print(f"Sending this deployment: {deployment}") if not opts.o.dry_run: - deployment_resp = cast( - client.V1Deployment, - self.apps_api.create_namespaced_deployment( - body=deployment, namespace=self.k8s_namespace - ), - ) - if opts.o.debug: - print("Deployment created:") - meta = deployment_resp.metadata - spec = deployment_resp.spec - if meta and spec and spec.template.spec: - ns = meta.namespace - name = meta.name - gen = meta.generation - containers = spec.template.spec.containers - img = containers[0].image if containers else None - print(f"{ns} {name} {gen} {img}") + self._ensure_deployment(deployment) service = self.cluster_info.get_service() if opts.o.debug: print(f"Sending this service: {service}") if service and not opts.o.dry_run: - service_resp = self.core_api.create_namespaced_service( - namespace=self.k8s_namespace, body=service - ) - if opts.o.debug: - print("Service created:") - print(f"{service_resp}") + self._ensure_service(service) def _find_certificate_for_host_name(self, host_name): all_certificates = self.custom_obj_api.list_namespaced_custom_object( @@ -404,12 +473,7 @@ class K8sDeployer(Deployer): if opts.o.debug: print(f"Sending this ingress: {ingress}") if not opts.o.dry_run: - ingress_resp = self.networking_api.create_namespaced_ingress( - namespace=self.k8s_namespace, body=ingress - ) - if opts.o.debug: - print("Ingress created:") - print(f"{ingress_resp}") + self._ensure_ingress(ingress) else: if opts.o.debug: print("No ingress configured") @@ -419,12 +483,7 @@ class K8sDeployer(Deployer): if opts.o.debug: print(f"Sending this nodeport: {nodeport}") if not opts.o.dry_run: - nodeport_resp = self.core_api.create_namespaced_service( - namespace=self.k8s_namespace, body=nodeport - ) - if opts.o.debug: - print("NodePort created:") - print(f"{nodeport_resp}") + self._ensure_service(nodeport, kind="NodePort") def down(self, timeout, volumes, skip_cluster_management): self.skip_cluster_management = skip_cluster_management From 14f423ea0c04c624ebaca9be5d6a223bc7402ef1 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 8 Mar 2026 04:32:20 +0000 Subject: [PATCH 16/19] fix(k8s): read existing resourceVersion/clusterIP before replace K8s PUT (replace) operations require metadata.resourceVersion for optimistic concurrency control. Services additionally have immutable spec.clusterIP that must be preserved from the existing object. On 409 conflict, all _ensure_* methods now read the existing resource first and copy resourceVersion (and clusterIP for Services) into the body before calling replace. 
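
Condensed sketch of the Service path (the diff below is authoritative):

    existing = core_api.read_namespaced_service(name=name, namespace=ns)
    service.metadata.resource_version = existing.metadata.resource_version
    if existing.spec.cluster_ip:  # immutable once allocated
        service.spec.cluster_ip = existing.spec.cluster_ip
    core_api.replace_namespaced_service(name=name, namespace=ns, body=service)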
Co-Authored-By: Claude Opus 4.6 --- stack_orchestrator/deploy/k8s/deploy_k8s.py | 27 ++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index c0272be7..b34e3291 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -202,6 +202,10 @@ class K8sDeployer(Deployer): print(f"ConfigMap created: {resp}") except ApiException as e: if e.status == 409: + existing = self.core_api.read_namespaced_config_map( + name=cfg_map.metadata.name, namespace=self.k8s_namespace + ) + cfg_map.metadata.resource_version = existing.metadata.resource_version resp = self.core_api.replace_namespaced_config_map( name=cfg_map.metadata.name, namespace=self.k8s_namespace, @@ -225,6 +229,13 @@ class K8sDeployer(Deployer): print("Deployment created:") except ApiException as e: if e.status == 409: + existing = self.apps_api.read_namespaced_deployment( + name=deployment.metadata.name, + namespace=self.k8s_namespace, + ) + deployment.metadata.resource_version = ( + existing.metadata.resource_version + ) resp = cast( client.V1Deployment, self.apps_api.replace_namespaced_deployment( @@ -246,7 +257,11 @@ class K8sDeployer(Deployer): print(f"{meta.namespace} {meta.name} {meta.generation} {img}") def _ensure_service(self, service, kind: str = "Service"): - """Create or replace a Service (idempotent).""" + """Create or replace a Service (idempotent). + + Services have immutable fields (spec.clusterIP) that must be + preserved from the existing object on replace. + """ try: resp = self.core_api.create_namespaced_service( namespace=self.k8s_namespace, body=service @@ -255,6 +270,12 @@ class K8sDeployer(Deployer): print(f"{kind} created: {resp}") except ApiException as e: if e.status == 409: + existing = self.core_api.read_namespaced_service( + name=service.metadata.name, namespace=self.k8s_namespace + ) + service.metadata.resource_version = existing.metadata.resource_version + if existing.spec.cluster_ip: + service.spec.cluster_ip = existing.spec.cluster_ip resp = self.core_api.replace_namespaced_service( name=service.metadata.name, namespace=self.k8s_namespace, @@ -275,6 +296,10 @@ class K8sDeployer(Deployer): print(f"Ingress created: {resp}") except ApiException as e: if e.status == 409: + existing = self.networking_api.read_namespaced_ingress( + name=ingress.metadata.name, namespace=self.k8s_namespace + ) + ingress.metadata.resource_version = existing.metadata.resource_version resp = self.networking_api.replace_namespaced_ingress( name=ingress.metadata.name, namespace=self.k8s_namespace, From 9c5b8e3f4e429f6e51b4373088a2cbdfaf89ba38 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 8 Mar 2026 06:56:25 +0000 Subject: [PATCH 17/19] chore: initialize pebbles issue tracker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Track stack-orchestrator work items with pebbles (append-only event log). Epic so-076: Stack composition — deploy multiple stacks into one kind cluster with independent lifecycle management per sub-stack. 
Co-Authored-By: Claude Opus 4.6 --- .pebbles/config.json | 3 +++ .pebbles/events.jsonl | 15 +++++++++++++++ 2 files changed, 18 insertions(+) create mode 100644 .pebbles/config.json create mode 100644 .pebbles/events.jsonl diff --git a/.pebbles/config.json b/.pebbles/config.json new file mode 100644 index 00000000..806dcad9 --- /dev/null +++ b/.pebbles/config.json @@ -0,0 +1,3 @@ +{ + "prefix": "so" +} \ No newline at end of file diff --git a/.pebbles/events.jsonl b/.pebbles/events.jsonl new file mode 100644 index 00000000..944b62d0 --- /dev/null +++ b/.pebbles/events.jsonl @@ -0,0 +1,15 @@ +{"type":"create","timestamp":"2026-03-08T06:56:07.080584539Z","issue_id":"so-076","payload":{"description":"Currently laconic-so maps one stack to one deployment to one pod. All containers\nin a stack's compose files become containers in a single k8s pod. This means:\n\n- Can't upgrade doublezero without restarting agave-validator\n- Can't restart monitoring without disrupting the validator\n- Can't independently scale or lifecycle-manage components\n\nThe fix is stack composition. A meta-stack (e.g. biscayne-stack) composes\nsub-stacks (agave, doublezero, agave-monitoring), each becoming its own\nk8s Deployment with independent lifecycle.","priority":"2","title":"Stack composition: deploy multiple stacks into one kind cluster","type":"epic"}} +{"type":"create","timestamp":"2026-03-08T06:56:07.551986919Z","issue_id":"so-ab0","payload":{"description":"Add laconic-so deployment prepare that creates cluster infrastructure without pods. Already implemented, needs review.","priority":"2","title":"deployment prepare command","type":"task"}} +{"type":"create","timestamp":"2026-03-08T06:56:07.884418759Z","issue_id":"so-04f","payload":{"description":"deployment stop on ANY deployment deletes the shared kind cluster. Should only delete its own namespace.","priority":"2","title":"deployment stop should not destroy shared cluster","type":"bug"}} +{"type":"create","timestamp":"2026-03-08T06:56:08.253520249Z","issue_id":"so-370","payload":{"description":"Allow stack.yml to reference sub-stacks. Each sub-stack becomes its own k8s Deployment sharing namespace and PVs.","priority":"2","title":"Add stacks: field to stack.yml for composition","type":"task"}} +{"type":"create","timestamp":"2026-03-08T06:56:08.646764337Z","issue_id":"so-f7c","payload":{"description":"Create three independent stacks from the monolithic agave-stack. 
Each gets its own compose file and independent lifecycle.","priority":"2","title":"Split agave-stack into agave + doublezero + monitoring","type":"task"}} +{"type":"rename","timestamp":"2026-03-08T06:56:14.499990161Z","issue_id":"so-ab0","payload":{"new_id":"so-076.1"}} +{"type":"dep_add","timestamp":"2026-03-08T06:56:14.499992031Z","issue_id":"so-076.1","payload":{"dep_type":"parent-child","depends_on":"so-076"}} +{"type":"rename","timestamp":"2026-03-08T06:56:14.786407752Z","issue_id":"so-04f","payload":{"new_id":"so-076.2"}} +{"type":"dep_add","timestamp":"2026-03-08T06:56:14.786409842Z","issue_id":"so-076.2","payload":{"dep_type":"parent-child","depends_on":"so-076"}} +{"type":"rename","timestamp":"2026-03-08T06:56:15.058959714Z","issue_id":"so-370","payload":{"new_id":"so-076.3"}} +{"type":"dep_add","timestamp":"2026-03-08T06:56:15.058961364Z","issue_id":"so-076.3","payload":{"dep_type":"parent-child","depends_on":"so-076"}} +{"type":"rename","timestamp":"2026-03-08T06:56:15.410080785Z","issue_id":"so-f7c","payload":{"new_id":"so-076.4"}} +{"type":"dep_add","timestamp":"2026-03-08T06:56:15.410082305Z","issue_id":"so-076.4","payload":{"dep_type":"parent-child","depends_on":"so-076"}} +{"type":"dep_add","timestamp":"2026-03-08T06:56:16.313585082Z","issue_id":"so-076.3","payload":{"dep_type":"blocks","depends_on":"so-076.2"}} +{"type":"dep_add","timestamp":"2026-03-08T06:56:16.567629422Z","issue_id":"so-076.4","payload":{"dep_type":"blocks","depends_on":"so-076.3"}} From 974eed0c733324da2b3d844821a7923297843b6b Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Sun, 8 Mar 2026 06:56:34 +0000 Subject: [PATCH 18/19] feat: add `deployment prepare` command (so-076.1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactors K8sDeployer.up() into three composable methods: - _setup_cluster_and_namespace(): kind cluster, API, namespace, ingress - _create_infrastructure(): PVs, PVCs, ConfigMaps, Services, NodePorts - _create_deployment(): Deployment resource (pods) `prepare` calls the first two only — creates all cluster infrastructure without starting pods. This eliminates the scale-to-0 workaround where operators had to run `deployment start` then immediately scale down. 
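For example, an operator can now prepare infrastructure first and start pods later (deployment directory name illustrative):

    laconic-so deployment --dir my-deployment prepare
    # seed volumes / adjust ConfigMaps while no pods are running
    laconic-so deployment --dir my-deployment start --skip-cluster-management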
Usage: laconic-so deployment --dir prepare Co-Authored-By: Claude Opus 4.6 --- .gitignore | 1 + stack_orchestrator/deploy/deploy.py | 6 ++ stack_orchestrator/deploy/deployer.py | 9 +++ stack_orchestrator/deploy/deployment.py | 22 +++++++ stack_orchestrator/deploy/k8s/deploy_k8s.py | 65 +++++++++++---------- 5 files changed, 73 insertions(+), 30 deletions(-) diff --git a/.gitignore b/.gitignore index 3aaa220b..6abbf941 100644 --- a/.gitignore +++ b/.gitignore @@ -8,3 +8,4 @@ __pycache__ package stack_orchestrator/data/build_tag.txt /build +.worktrees diff --git a/stack_orchestrator/deploy/deploy.py b/stack_orchestrator/deploy/deploy.py index f2bf977c..6e914b92 100644 --- a/stack_orchestrator/deploy/deploy.py +++ b/stack_orchestrator/deploy/deploy.py @@ -182,6 +182,12 @@ def status_operation(ctx): ctx.obj.deployer.status() +def prepare_operation(ctx, skip_cluster_management=False): + ctx.obj.deployer.prepare( + skip_cluster_management=skip_cluster_management, + ) + + def update_envs_operation(ctx): ctx.obj.deployer.update_envs() diff --git a/stack_orchestrator/deploy/deployer.py b/stack_orchestrator/deploy/deployer.py index 11fb6592..b950e29b 100644 --- a/stack_orchestrator/deploy/deployer.py +++ b/stack_orchestrator/deploy/deployer.py @@ -69,6 +69,15 @@ class Deployer(ABC): def run_job(self, job_name: str, release_name: Optional[str] = None): pass + def prepare(self, skip_cluster_management): + """Create cluster infrastructure (namespace, PVs, services) without starting pods. + + Only supported for k8s deployers. Compose deployers raise an error. + """ + raise DeployerException( + "prepare is only supported for k8s deployments" + ) + class DeployerException(Exception): def __init__(self, *args: object) -> None: diff --git a/stack_orchestrator/deploy/deployment.py b/stack_orchestrator/deploy/deployment.py index 1182d23f..0dc9ac37 100644 --- a/stack_orchestrator/deploy/deployment.py +++ b/stack_orchestrator/deploy/deployment.py @@ -23,6 +23,7 @@ from stack_orchestrator.deploy.images import push_images_operation from stack_orchestrator.deploy.deploy import ( up_operation, down_operation, + prepare_operation, ps_operation, port_operation, status_operation, @@ -125,6 +126,27 @@ def start(ctx, stay_attached, skip_cluster_management, extra_args): up_operation(ctx, services_list, stay_attached, skip_cluster_management) +@command.command() +@click.option( + "--skip-cluster-management/--perform-cluster-management", + default=False, + help="Skip cluster initialization (only for kind-k8s deployments)", +) +@click.pass_context +def prepare(ctx, skip_cluster_management): + """Create cluster infrastructure without starting pods. + + Sets up the kind cluster, namespace, PVs, PVCs, ConfigMaps, Services, + and Ingresses — everything that 'start' does EXCEPT creating the + Deployment resource. No pods will be scheduled. + + Use 'start --skip-cluster-management' afterward to create the Deployment + and start pods when ready. 
+ """ + ctx.obj = make_deploy_context(ctx) + prepare_operation(ctx, skip_cluster_management) + + # TODO: remove legacy up command since it's an alias for stop @command.command() @click.option( diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index b34e3291..1eee8ffd 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -371,22 +371,15 @@ class K8sDeployer(Deployer): self._ensure_config_map(cfg_map) def _create_deployment(self): - # Process compose files into a Deployment + """Create the k8s Deployment resource (which starts pods).""" deployment = self.cluster_info.get_deployment( image_pull_policy=None if self.is_kind() else "Always" ) - # Create the k8s objects if opts.o.debug: print(f"Sending this deployment: {deployment}") if not opts.o.dry_run: self._ensure_deployment(deployment) - service = self.cluster_info.get_service() - if opts.o.debug: - print(f"Sending this service: {service}") - if service and not opts.o.dry_run: - self._ensure_service(service) - def _find_certificate_for_host_name(self, host_name): all_certificates = self.custom_obj_api.list_namespaced_custom_object( group="cert-manager.io", @@ -424,24 +417,25 @@ class K8sDeployer(Deployer): return None def up(self, detach, skip_cluster_management, services): + self._setup_cluster_and_namespace(skip_cluster_management) + self._create_infrastructure() + self._create_deployment() + + def _setup_cluster_and_namespace(self, skip_cluster_management): + """Create kind cluster (if needed) and namespace. Shared by up() and prepare().""" self.skip_cluster_management = skip_cluster_management if not opts.o.dry_run: if self.is_kind() and not self.skip_cluster_management: - # Create the kind cluster (or reuse existing one) kind_config = str( self.deployment_dir.joinpath(constants.kind_config_filename) ) actual_cluster = create_cluster(self.kind_cluster_name, kind_config) if actual_cluster != self.kind_cluster_name: - # An existing cluster was found, use it instead self.kind_cluster_name = actual_cluster - # Only load locally-built images into kind - # Registry images (docker.io, ghcr.io, etc.) will be pulled by k8s local_containers = self.deployment_context.stack.obj.get( "containers", [] ) if local_containers: - # Filter image_set to only images matching local containers local_images = { img for img in self.cluster_info.image_set @@ -449,47 +443,48 @@ class K8sDeployer(Deployer): } if local_images: load_images_into_kind(self.kind_cluster_name, local_images) - # Note: if no local containers defined, all images come from registries self.connect_api() - # Create deployment-specific namespace for resource isolation self._ensure_namespace() if self.is_kind() and not self.skip_cluster_management: - # Configure ingress controller (not installed by default in kind) - # Skip if already running (idempotent for shared cluster) if not is_ingress_running(): install_ingress_for_kind(self.cluster_info.spec.get_acme_email()) - # Wait for ingress to start - # (deployment provisioning will fail unless this is done) wait_for_ingress_in_kind() - # Create RuntimeClass if unlimited_memlock is enabled if self.cluster_info.spec.get_unlimited_memlock(): _create_runtime_class( constants.high_memlock_runtime, constants.high_memlock_runtime, ) - else: print("Dry run mode enabled, skipping k8s API connect") - # Create registry secret if configured + def _create_infrastructure(self): + """Create PVs, PVCs, ConfigMaps, Services, Ingresses, NodePorts. 
+ + Everything except the Deployment resource (which starts pods). + Shared by up() and prepare(). + """ from stack_orchestrator.deploy.deployment_create import create_registry_secret create_registry_secret(self.cluster_info.spec, self.cluster_info.app_name) self._create_volume_data() - self._create_deployment() + + # Create the ClusterIP service (paired with the deployment) + service = self.cluster_info.get_service() + if service and not opts.o.dry_run: + if opts.o.debug: + print(f"Sending this service: {service}") + self._ensure_service(service) http_proxy_info = self.cluster_info.spec.get_http_proxy() - # Note: we don't support tls for kind (enabling tls causes errors) use_tls = http_proxy_info and not self.is_kind() certificate = ( self._find_certificate_for_host_name(http_proxy_info[0]["host-name"]) if use_tls else None ) - if opts.o.debug: - if certificate: - print(f"Using existing certificate: {certificate}") + if opts.o.debug and certificate: + print(f"Using existing certificate: {certificate}") ingress = self.cluster_info.get_ingress( use_tls=use_tls, certificate=certificate @@ -499,9 +494,8 @@ class K8sDeployer(Deployer): print(f"Sending this ingress: {ingress}") if not opts.o.dry_run: self._ensure_ingress(ingress) - else: - if opts.o.debug: - print("No ingress configured") + elif opts.o.debug: + print("No ingress configured") nodeports: List[client.V1Service] = self.cluster_info.get_nodeports() for nodeport in nodeports: @@ -510,6 +504,17 @@ class K8sDeployer(Deployer): if not opts.o.dry_run: self._ensure_service(nodeport, kind="NodePort") + def prepare(self, skip_cluster_management): + """Create cluster infrastructure without starting pods. + + Sets up kind cluster, namespace, PVs, PVCs, ConfigMaps, Services, + Ingresses, and NodePorts — everything that up() does EXCEPT creating + the Deployment resource. + """ + self._setup_cluster_and_namespace(skip_cluster_management) + self._create_infrastructure() + print("Cluster infrastructure prepared (no pods started).") + def down(self, timeout, volumes, skip_cluster_management): self.skip_cluster_management = skip_cluster_management self.connect_api() From 36c37d2bdec49e4e32a7fb0a50c880d2a86d5b89 Mon Sep 17 00:00:00 2001 From: "A. F. Dudley" Date: Mon, 16 Mar 2026 08:01:11 +0000 Subject: [PATCH 19/19] wd-a7b: Fix cluster-id and namespace naming - Replace token_hex cluster IDs with sortable timestamp-based IDs (laconic-{base62_timestamp}{random_suffix}) via new ids.py module - Check for existing Kind cluster before generating a new cluster-id - Derive k8s namespace from stack name instead of compose_project_name (e.g. 
laconic-dumpster instead of laconic-) - Plumb namespace through to secret generation instead of hardcoding 'default' Co-Authored-By: Claude Opus 4.6 (1M context) --- .../deploy/deployment_create.py | 53 +++++++++++--- stack_orchestrator/deploy/k8s/cluster_info.py | 73 ++++++++++++------- stack_orchestrator/deploy/k8s/deploy_k8s.py | 32 +++++--- stack_orchestrator/deploy/k8s/helpers.py | 8 +- stack_orchestrator/deploy/spec.py | 8 +- stack_orchestrator/ids.py | 47 ++++++++++++ 6 files changed, 164 insertions(+), 57 deletions(-) create mode 100644 stack_orchestrator/ids.py diff --git a/stack_orchestrator/deploy/deployment_create.py b/stack_orchestrator/deploy/deployment_create.py index 792d8e3d..3ff3e169 100644 --- a/stack_orchestrator/deploy/deployment_create.py +++ b/stack_orchestrator/deploy/deployment_create.py @@ -24,11 +24,13 @@ from typing import List, Optional import random from shutil import copy, copyfile, copytree, rmtree from secrets import token_hex +import subprocess import sys import filecmp import tempfile from stack_orchestrator import constants +from stack_orchestrator.ids import generate_id from stack_orchestrator.opts import opts from stack_orchestrator.util import ( get_stack_path, @@ -513,7 +515,9 @@ def init_operation( GENERATE_TOKEN_PATTERN = re.compile(r"\$generate:(\w+):(\d+)\$") -def _generate_and_store_secrets(config_vars: dict, deployment_name: str): +def _generate_and_store_secrets( + config_vars: dict, deployment_name: str, namespace: str = "default" +): """Generate secrets for $generate:...$ tokens and store in K8s Secret. Called by `deploy create` - generates fresh secrets and stores them. @@ -555,7 +559,6 @@ def _generate_and_store_secrets(config_vars: dict, deployment_name: str): v1 = client.CoreV1Api() secret_name = f"{deployment_name}-generated-secrets" - namespace = "default" secret_data = {k: base64.b64encode(v.encode()).decode() for k, v in secrets.items()} k8s_secret = client.V1Secret( @@ -659,7 +662,10 @@ def create_registry_secret(spec: Spec, deployment_name: str) -> Optional[str]: def _write_config_file( - spec_file: Path, config_env_file: Path, deployment_name: Optional[str] = None + spec_file: Path, + config_env_file: Path, + deployment_name: Optional[str] = None, + namespace: str = "default", ): spec_content = get_parsed_deployment_spec(spec_file) config_vars = spec_content.get("config", {}) or {} @@ -671,7 +677,7 @@ def _write_config_file( for v in config_vars.values() ) if has_generate_tokens: - _generate_and_store_secrets(config_vars, deployment_name) + _generate_and_store_secrets(config_vars, deployment_name, namespace) # Write non-secret config to config.env (exclude $generate:...$ tokens) with open(config_env_file, "w") as output_file: @@ -697,9 +703,31 @@ def _copy_files_to_directory(file_paths: List[Path], directory: Path): copy(path, os.path.join(directory, os.path.basename(path))) +def _get_existing_kind_cluster() -> Optional[str]: + """Return the name of an existing Kind cluster, or None.""" + try: + result = subprocess.run( + ["kind", "get", "clusters"], + capture_output=True, + text=True, + timeout=10, + ) + if result.returncode == 0: + clusters = [ + c.strip() for c in result.stdout.strip().splitlines() if c.strip() + ] + if clusters: + return clusters[0] + except (FileNotFoundError, subprocess.TimeoutExpired): + pass + return None + + def _create_deployment_file(deployment_dir: Path, stack_source: Optional[Path] = None): deployment_file_path = deployment_dir.joinpath(constants.deployment_file_name) - cluster = 
f"{constants.cluster_name_prefix}{token_hex(8)}" + # Reuse existing Kind cluster if one exists, otherwise generate a timestamp-based ID + existing = _get_existing_kind_cluster() + cluster = existing if existing else generate_id("laconic") deployment_content = {constants.cluster_id_key: cluster} if stack_source: deployment_content["stack-source"] = str(stack_source) @@ -953,8 +981,13 @@ def _write_deployment_files( # Use stack_name as deployment_name for K8s secret naming # Extract just the name part if stack_name is a path ("path/to/stack" -> "stack") deployment_name = Path(stack_name).name.replace("_", "-") + # Derive namespace from spec or stack name, matching deploy_k8s logic + namespace = parsed_spec.get_namespace() or f"laconic-{deployment_name}" _write_config_file( - spec_file, target_dir.joinpath(constants.config_file_name), deployment_name + spec_file, + target_dir.joinpath(constants.config_file_name), + deployment_name, + namespace=namespace, ) # Copy any k8s config file into the target dir @@ -1032,12 +1065,8 @@ def _write_deployment_files( for configmap in parsed_spec.get_configmaps(): source_config_dir = resolve_config_dir(stack_name, configmap) if os.path.exists(source_config_dir): - destination_config_dir = target_dir.joinpath( - "configmaps", configmap - ) - copytree( - source_config_dir, destination_config_dir, dirs_exist_ok=True - ) + destination_config_dir = target_dir.joinpath("configmaps", configmap) + copytree(source_config_dir, destination_config_dir, dirs_exist_ok=True) # Copy the job files into the target dir jobs = get_job_list(parsed_stack) diff --git a/stack_orchestrator/deploy/k8s/cluster_info.py b/stack_orchestrator/deploy/k8s/cluster_info.py index d84474ab..34f41c99 100644 --- a/stack_orchestrator/deploy/k8s/cluster_info.py +++ b/stack_orchestrator/deploy/k8s/cluster_info.py @@ -82,7 +82,14 @@ class ClusterInfo: def __init__(self) -> None: self.parsed_job_yaml_map = {} - def int(self, pod_files: List[str], compose_env_file, deployment_name, spec: Spec, stack_name=""): + def int( + self, + pod_files: List[str], + compose_env_file, + deployment_name, + spec: Spec, + stack_name="", + ): self.parsed_pod_yaml_map = parsed_pod_files_map_from_file_names(pod_files) # Find the set of images in the pods self.image_set = images_for_deployment(pod_files) @@ -292,8 +299,7 @@ class ClusterInfo: # Per-volume resources override global, which overrides default. 
vol_resources = ( - self.spec.get_volume_resources_for(volume_name) - or global_resources + self.spec.get_volume_resources_for(volume_name) or global_resources ) labels = { @@ -395,8 +401,7 @@ class ClusterInfo: continue vol_resources = ( - self.spec.get_volume_resources_for(volume_name) - or global_resources + self.spec.get_volume_resources_for(volume_name) or global_resources ) if self.spec.is_kind_deployment(): host_path = client.V1HostPathVolumeSource( @@ -531,9 +536,7 @@ class ClusterInfo: if self.spec.get_image_registry() is not None else image ) - volume_mounts = volume_mounts_for_service( - parsed_yaml_map, service_name - ) + volume_mounts = volume_mounts_for_service(parsed_yaml_map, service_name) # Handle command/entrypoint from compose file # In docker-compose: entrypoint -> k8s command, command -> k8s args container_command = None @@ -581,7 +584,9 @@ class ClusterInfo: volume_mounts=volume_mounts, security_context=client.V1SecurityContext( privileged=self.spec.get_privileged(), - run_as_user=int(service_info["user"]) if "user" in service_info else None, + run_as_user=int(service_info["user"]) + if "user" in service_info + else None, capabilities=client.V1Capabilities( add=self.spec.get_capabilities() ) @@ -595,19 +600,17 @@ class ClusterInfo: svc_labels = service_info.get("labels", {}) if isinstance(svc_labels, list): # docker-compose labels can be a list of "key=value" - svc_labels = dict( - item.split("=", 1) for item in svc_labels - ) - is_init = str( - svc_labels.get("laconic.init-container", "") - ).lower() in ("true", "1", "yes") + svc_labels = dict(item.split("=", 1) for item in svc_labels) + is_init = str(svc_labels.get("laconic.init-container", "")).lower() in ( + "true", + "1", + "yes", + ) if is_init: init_containers.append(container) else: containers.append(container) - volumes = volumes_for_pod_files( - parsed_yaml_map, self.spec, self.app_name - ) + volumes = volumes_for_pod_files(parsed_yaml_map, self.spec, self.app_name) return containers, init_containers, services, volumes # TODO: put things like image pull policy into an object-scope struct @@ -704,7 +707,14 @@ class ClusterInfo: kind="Deployment", metadata=client.V1ObjectMeta( name=f"{self.app_name}-deployment", - labels={"app": self.app_name, **({"app.kubernetes.io/stack": self.stack_name} if self.stack_name else {})}, + labels={ + "app": self.app_name, + **( + {"app.kubernetes.io/stack": self.stack_name} + if self.stack_name + else {} + ), + }, ), spec=spec, ) @@ -732,8 +742,8 @@ class ClusterInfo: for job_file in self.parsed_job_yaml_map: # Build containers for this single job file single_job_map = {job_file: self.parsed_job_yaml_map[job_file]} - containers, init_containers, _services, volumes = ( - self._build_containers(single_job_map, image_pull_policy) + containers, init_containers, _services, volumes = self._build_containers( + single_job_map, image_pull_policy ) # Derive job name from file path: docker-compose-.yml -> @@ -741,7 +751,7 @@ class ClusterInfo: # Strip docker-compose- prefix and .yml suffix job_name = base if job_name.startswith("docker-compose-"): - job_name = job_name[len("docker-compose-"):] + job_name = job_name[len("docker-compose-") :] if job_name.endswith(".yml"): job_name = job_name[: -len(".yml")] elif job_name.endswith(".yaml"): @@ -751,12 +761,14 @@ class ClusterInfo: # picked up by pods_in_deployment() which queries app={app_name}. 
pod_labels = { "app": f"{self.app_name}-job", - **({"app.kubernetes.io/stack": self.stack_name} if self.stack_name else {}), + **( + {"app.kubernetes.io/stack": self.stack_name} + if self.stack_name + else {} + ), } template = client.V1PodTemplateSpec( - metadata=client.V1ObjectMeta( - labels=pod_labels - ), + metadata=client.V1ObjectMeta(labels=pod_labels), spec=client.V1PodSpec( containers=containers, init_containers=init_containers or None, @@ -769,7 +781,14 @@ class ClusterInfo: template=template, backoff_limit=0, ) - job_labels = {"app": self.app_name, **({"app.kubernetes.io/stack": self.stack_name} if self.stack_name else {})} + job_labels = { + "app": self.app_name, + **( + {"app.kubernetes.io/stack": self.stack_name} + if self.stack_name + else {} + ), + } job = client.V1Job( api_version="batch/v1", kind="Job", diff --git a/stack_orchestrator/deploy/k8s/deploy_k8s.py b/stack_orchestrator/deploy/k8s/deploy_k8s.py index 8a41acea..dfb148f9 100644 --- a/stack_orchestrator/deploy/k8s/deploy_k8s.py +++ b/stack_orchestrator/deploy/k8s/deploy_k8s.py @@ -122,14 +122,18 @@ class K8sDeployer(Deployer): return self.deployment_dir = deployment_context.deployment_dir self.deployment_context = deployment_context - self.kind_cluster_name = deployment_context.spec.get_kind_cluster_name() or compose_project_name - # Use spec namespace if provided, otherwise derive from cluster-id - self.k8s_namespace = deployment_context.spec.get_namespace() or f"laconic-{compose_project_name}" - self.cluster_info = ClusterInfo() + self.kind_cluster_name = ( + deployment_context.spec.get_kind_cluster_name() or compose_project_name + ) # stack.name may be an absolute path (from spec "stack:" key after # path resolution). Extract just the directory basename for labels. raw_name = deployment_context.stack.name if deployment_context else "" stack_name = Path(raw_name).name if raw_name else "" + # Use spec namespace if provided, otherwise derive from stack name + self.k8s_namespace = deployment_context.spec.get_namespace() or ( + f"laconic-{stack_name}" if stack_name else f"laconic-{compose_project_name}" + ) + self.cluster_info = ClusterInfo() self.cluster_info.int( compose_files, compose_env_file, @@ -232,7 +236,8 @@ class K8sDeployer(Deployer): for job in jobs.items: print(f"Deleting Job {job.metadata.name}") self.batch_api.delete_namespaced_job( - name=job.metadata.name, namespace=ns, + name=job.metadata.name, + namespace=ns, body=client.V1DeleteOptions(propagation_policy="Background"), ) except ApiException as e: @@ -555,7 +560,10 @@ class K8sDeployer(Deployer): # Call start() hooks — stacks can create additional k8s resources if self.deployment_context: - from stack_orchestrator.deploy.deployment_create import call_stack_deploy_start + from stack_orchestrator.deploy.deployment_create import ( + call_stack_deploy_start, + ) + call_stack_deploy_start(self.deployment_context) def down(self, timeout, volumes, skip_cluster_management): @@ -567,9 +575,7 @@ class K8sDeployer(Deployer): # PersistentVolumes are cluster-scoped (not namespaced), so delete by label if volumes: try: - pvs = self.core_api.list_persistent_volume( - label_selector=app_label - ) + pvs = self.core_api.list_persistent_volume(label_selector=app_label) for pv in pvs.items: if opts.o.debug: print(f"Deleting PV: {pv.metadata.name}") @@ -713,14 +719,18 @@ class K8sDeployer(Deployer): def logs(self, services, tail, follow, stream): self.connect_api() - pods = pods_in_deployment(self.core_api, self.cluster_info.app_name, namespace=self.k8s_namespace) + pods 
= pods_in_deployment( + self.core_api, self.cluster_info.app_name, namespace=self.k8s_namespace + ) if len(pods) > 1: print("Warning: more than one pod in the deployment") if len(pods) == 0: log_data = "******* Pods not running ********\n" else: k8s_pod_name = pods[0] - containers = containers_in_pod(self.core_api, k8s_pod_name, namespace=self.k8s_namespace) + containers = containers_in_pod( + self.core_api, k8s_pod_name, namespace=self.k8s_namespace + ) # If pod not started, logs request below will throw an exception try: log_data = "" diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py index 1eedfd5f..426e3125 100644 --- a/stack_orchestrator/deploy/k8s/helpers.py +++ b/stack_orchestrator/deploy/k8s/helpers.py @@ -393,7 +393,9 @@ def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]): raise DeployerException(f"kind load docker-image failed: {result}") -def pods_in_deployment(core_api: client.CoreV1Api, deployment_name: str, namespace: str = "default"): +def pods_in_deployment( + core_api: client.CoreV1Api, deployment_name: str, namespace: str = "default" +): pods = [] pod_response = core_api.list_namespaced_pod( namespace=namespace, label_selector=f"app={deployment_name}" @@ -406,7 +408,9 @@ def pods_in_deployment(core_api: client.CoreV1Api, deployment_name: str, namespa return pods -def containers_in_pod(core_api: client.CoreV1Api, pod_name: str, namespace: str = "default") -> List[str]: +def containers_in_pod( + core_api: client.CoreV1Api, pod_name: str, namespace: str = "default" +) -> List[str]: containers: List[str] = [] pod_response = cast( client.V1Pod, core_api.read_namespaced_pod(pod_name, namespace=namespace) diff --git a/stack_orchestrator/deploy/spec.py b/stack_orchestrator/deploy/spec.py index 83762b35..8eb9456f 100644 --- a/stack_orchestrator/deploy/spec.py +++ b/stack_orchestrator/deploy/spec.py @@ -170,15 +170,13 @@ class Spec: Returns the per-volume Resources if found, otherwise None. The caller should fall back to get_volume_resources() then the default. """ - vol_section = ( - self.obj.get(constants.resources_key, {}).get(constants.volumes_key, {}) + vol_section = self.obj.get(constants.resources_key, {}).get( + constants.volumes_key, {} ) if volume_name not in vol_section: return None entry = vol_section[volume_name] - if isinstance(entry, dict) and ( - "reservations" in entry or "limits" in entry - ): + if isinstance(entry, dict) and ("reservations" in entry or "limits" in entry): return Resources(entry) return None diff --git a/stack_orchestrator/ids.py b/stack_orchestrator/ids.py new file mode 100644 index 00000000..e6a67782 --- /dev/null +++ b/stack_orchestrator/ids.py @@ -0,0 +1,47 @@ +"""Sortable timestamp-based ID generation for cluster naming. + +Uses base62 encoding with 100ms resolution and a 2024-01-01 epoch +to produce compact, sortable IDs like 'laconic-iqE6Za'. 
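+
+Example (illustrative; the timestamp varies with the clock and the
+two-char suffix is random, so this exact value is not reproducible):
+
+    >>> generate_id("laconic")  # doctest: +SKIP
+    'laconic-iqE6Zab'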
+
+Format: {prefix}-{timestamp}{random}
+- timestamp: 5 chars until late 2026, then 6 (100ms resolution; 6 chars cover ~180 years from the 2024 epoch)
+- random: 2 chars (3,844 unique per 100ms slot)
+"""
+# Adapted from exophial/src/exophial/ids.py
+
+import random
+import time
+
+# 2024-01-01 00:00:00 UTC in milliseconds
+EPOCH_2024 = 1704067200000
+
+# Sortable base62 alphabet (0-9, A-Z, a-z)
+ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
+
+
+def _base62(n: int) -> str:
+    """Encode integer as base62 string."""
+    if n == 0:
+        return ALPHABET[0]
+    s = ""
+    while n:
+        n, r = divmod(n, 62)
+        s = ALPHABET[r] + s
+    return s
+
+
+def _random_suffix(length: int = 2) -> str:
+    """Generate random base62 suffix."""
+    return "".join(random.choice(ALPHABET) for _ in range(length))
+
+
+def _timestamp_id() -> str:
+    """Generate a sortable timestamp ID (100ms resolution, 2024 epoch) with random suffix."""
+    now_ms = int(time.time() * 1000)
+    offset = (now_ms - EPOCH_2024) // 100  # 100ms resolution
+    return f"{_base62(offset)}{_random_suffix()}"
+
+
+def generate_id(prefix: str) -> str:
+    """Generate a sortable ID with an arbitrary prefix like 'laconic-iqE6Za'."""
+    return f"{prefix}-{_timestamp_id()}"