diff --git a/docs/deployment_patterns.md b/docs/deployment_patterns.md
index 9fd7ed0b..c29e0c22 100644
--- a/docs/deployment_patterns.md
+++ b/docs/deployment_patterns.md
@@ -164,6 +164,9 @@ To stop a single deployment without affecting the cluster:
 laconic-so deployment --dir my-deployment stop --skip-cluster-management
 ```
 
+Stacks sharing a cluster must agree on mount topology. See
+[Volume Persistence in k8s-kind](#volume-persistence-in-k8s-kind).
+
 ## Volume Persistence in k8s-kind
 
 k8s-kind has 3 storage layers:
@@ -172,7 +175,9 @@ k8s-kind has 3 storage layers:
 - **Kind Node**: A Docker container simulating a k8s node
 - **Pod Container**: Your workload
 
-For k8s-kind, volumes with paths are mounted from Docker Host → Kind Node → Pod via extraMounts.
+Volumes with paths are mounted from Docker Host → Kind Node → Pod via kind
+`extraMounts`. Kind applies `extraMounts` only at cluster creation — they
+cannot be added to a running cluster.
 
 | spec.yml volume | Storage Location | Survives Pod Restart | Survives Cluster Restart |
 |-----------------|------------------|---------------------|-------------------------|
@@ -200,3 +205,51 @@ Empty-path volumes appear persistent because they survive pod restarts (data lives
 in Kind Node container). However, this data is lost when the kind cluster is
 recreated. This "false persistence" has caused data loss when operators assumed
 their data was safe.
+
+### Shared Clusters: Use `kind-mount-root`
+
+Because kind `extraMounts` can only be set at cluster creation, the first
+deployment to start locks in the mount topology. Later deployments that
+declare new `extraMounts` have them silently ignored — their PVs fall
+through to the kind node's overlay filesystem and lose data on cluster
+destroy.
+
+The fix is an umbrella mount. Set `kind-mount-root` in the spec, pointing
+at a host directory all stacks will share:
+
+```yaml
+# spec.yml
+kind-mount-root: /srv/kind
+
+volumes:
+  my-data: /srv/kind/my-stack/data  # visible at /mnt/my-stack/data in-node
+```
+
+SO emits a single `extraMount` (the `kind-mount-root` directory → `/mnt`).
+Any new host subdirectory under the root is visible in the node immediately —
+no cluster recreate needed to add stacks. (A sketch of the generated kind
+config appears at the end of this page.)
+
+**All stacks sharing a cluster must agree on `kind-mount-root`** and keep
+their host paths under it.
+
+### Mount Compatibility Enforcement
+
+`laconic-so deployment start` validates mount topology:
+
+- **On first cluster creation** without an umbrella mount: prints a
+  warning (future stacks may require a full recreate to add mounts).
+- **On cluster reuse**: compares the new deployment's `extraMounts`
+  against the live mounts on the control-plane container. Any mismatch
+  (wrong host path, or mount missing) fails the deploy.
+
+### Migrating an Existing Cluster
+
+If a cluster was created without an umbrella mount and you need to add a
+stack that requires new host-path mounts, the cluster must be recreated:
+
+1. Back up ephemeral state (DBs, caches) from PVs that lack host mounts —
+   these are in the kind node overlay FS and do not survive `kind delete`.
+2. Update every stack's spec to set a shared `kind-mount-root` and place
+   host paths under it.
+3. Stop all deployments, destroy the cluster, recreate it by starting any
+   stack (umbrella now active), and restore state (see the sketch below).
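+
+A minimal sketch of step 3 (the deployment directory and cluster name are
+placeholders; repeat the stop and start for every deployment sharing the
+cluster):
+
+```bash
+# stop each deployment sharing the cluster
+laconic-so deployment --dir my-deployment stop
+# destroy the kind cluster (name as reported by `kind get clusters`)
+kind delete cluster --name <cluster-name>
+# start any stack; its spec's kind-mount-root now shapes the new cluster
+laconic-so deployment --dir my-deployment start
+# then restore the backed-up state into the re-provisioned volumes
+```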
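+
+For reference, with the example spec above (`kind-mount-root: /srv/kind`),
+the umbrella mount in the generated kind config looks roughly like this
+(a sketch; the exact file SO emits may differ):
+
+```yaml
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+nodes:
+  - role: control-plane
+    extraMounts:
+      - hostPath: /srv/kind    # the kind-mount-root directory
+        containerPath: /mnt    # the umbrella mount the compatibility check expects
+```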
diff --git a/stack_orchestrator/deploy/k8s/helpers.py b/stack_orchestrator/deploy/k8s/helpers.py
index 9f0f2171..1864bbf1 100644
--- a/stack_orchestrator/deploy/k8s/helpers.py
+++ b/stack_orchestrator/deploy/k8s/helpers.py
@@ -15,11 +15,13 @@
 from kubernetes import client, utils, watch
 from kubernetes.client.exceptions import ApiException
+import json
 import os
 from pathlib import Path
 import subprocess
 import re
-from typing import Set, Mapping, List, Optional, cast
+import sys
+from typing import Dict, Set, Mapping, List, Optional, cast
 import yaml
 
 from stack_orchestrator.util import get_k8s_dir, error_exit
@@ -216,6 +218,142 @@ def _install_caddy_cert_backup(
     print("Installed caddy cert backup CronJob")
 
 
+def _parse_kind_extra_mounts(config_file: str) -> List[Dict[str, str]]:
+    """Return the list of extraMounts declared in a kind config file."""
+    try:
+        with open(config_file) as f:
+            config = yaml.safe_load(f) or {}
+    except (OSError, yaml.YAMLError) as e:
+        if opts.o.debug:
+            print(f"Could not parse kind config {config_file}: {e}")
+        return []
+    mounts: List[Dict[str, str]] = []
+    for node in config.get("nodes", []) or []:
+        for m in node.get("extraMounts", []) or []:
+            host_path = m.get("hostPath")
+            container_path = m.get("containerPath")
+            if host_path and container_path:
+                mounts.append(
+                    {"hostPath": host_path, "containerPath": container_path}
+                )
+    return mounts
+
+
+def _get_control_plane_node(cluster_name: str) -> Optional[str]:
+    """Return the kind control-plane node container name for a cluster."""
+    result = subprocess.run(
+        ["kind", "get", "nodes", "--name", cluster_name],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        return None
+    for line in result.stdout.splitlines():
+        line = line.strip()
+        if line.endswith("control-plane"):
+            return line
+    return None
+
+
+def _get_running_cluster_mounts(cluster_name: str) -> Dict[str, str]:
+    """Return {containerPath: hostPath} for bind mounts on the control-plane."""
+    node = _get_control_plane_node(cluster_name)
+    if not node:
+        return {}
+    result = subprocess.run(
+        ["docker", "inspect", node, "--format", "{{json .Mounts}}"],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        return {}
+    try:
+        mounts = json.loads(result.stdout or "[]")
+    except json.JSONDecodeError:
+        return {}
+    return {
+        m["Destination"]: m["Source"]
+        for m in mounts
+        if m.get("Type") == "bind" and m.get("Destination") and m.get("Source")
+    }
+
+
+def _check_mounts_compatible(cluster_name: str, config_file: str) -> None:
+    """Fail if the new deployment's extraMounts aren't active on the cluster.
+
+    Kind applies extraMounts only at cluster creation. When a deployment
+    joins an existing cluster, any extraMount its kind-config declares that
+    isn't already active on the running node will silently fall through to
+    the node's overlay filesystem — data looks persisted but is lost on
+    cluster destroy. Catch this up front.
+    """
+    required = _parse_kind_extra_mounts(config_file)
+    if not required:
+        return
+    live = _get_running_cluster_mounts(cluster_name)
+    if not live:
+        # Could not inspect — don't block deployment, but warn.
+        print(
+            f"WARNING: could not inspect mounts on cluster '{cluster_name}'; "
+            "skipping extraMount compatibility check",
+            file=sys.stderr,
+        )
+        return
+    mismatches = []
+    for m in required:
+        dest = m["containerPath"]
+        want = m["hostPath"]
+        have = live.get(dest)
+        if have != want:
+            mismatches.append((dest, want, have))
+    if not mismatches:
+        return
+    lines = [
+        "This deployment declares extraMounts that are not active on the "
+        f"running cluster '{cluster_name}':",
+    ]
+    for dest, want, have in mismatches:
+        lines.append(
+            f"  - {dest}: expected host path '{want}', "
+            f"actual '{have or 'NOT MOUNTED'}'"
+        )
+    lines.extend(
+        [
+            "",
+            "Kind applies extraMounts only at cluster creation — neither "
+            "kind nor Docker supports adding bind mounts to a running "
+            "container. Without a recreate, any PV backed by one of the "
+            "missing mounts will silently fall through to the node's "
+            "overlay filesystem and lose data on cluster destroy.",
+            "",
+            "Fix: destroy and recreate the cluster with a kind-config that "
+            "includes an umbrella mount via 'kind-mount-root'. All stacks "
+            "sharing the cluster must agree on 'kind-mount-root' and place "
+            "their host paths under it. See docs/deployment_patterns.md.",
+        ]
+    )
+    raise DeployerException("\n".join(lines))
+
+
+def _warn_if_no_umbrella(config_file: str) -> None:
+    """Warn if creating a cluster without a '/mnt' umbrella mount.
+
+    Without an umbrella, future stacks joining this cluster that need new
+    host-path mounts will fail the compatibility check and require a full
+    cluster recreate to add them.
+    """
+    mounts = _parse_kind_extra_mounts(config_file)
+    if any(m.get("containerPath") == "/mnt" for m in mounts):
+        return
+    print(
+        "WARNING: creating kind cluster without an umbrella mount "
+        "('kind-mount-root' not set). Future stacks added to this cluster "
+        "that require new host-path mounts cannot add them without a "
+        "full cluster recreate. See docs/deployment_patterns.md.",
+        file=sys.stderr,
+    )
+
+
 def create_cluster(name: str, config_file: str):
     """Create or reuse the single kind cluster for this host.
@@ -232,8 +370,10 @@ def create_cluster(name: str, config_file: str):
     existing = get_kind_cluster()
     if existing:
         print(f"Using existing cluster: {existing}")
+        _check_mounts_compatible(existing, config_file)
         return existing
 
+    _warn_if_no_umbrella(config_file)
     print(f"Creating new cluster: {name}")
     result = _run_command(f"kind create cluster --name {name} --config {config_file}")
     if result.returncode != 0:
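
To eyeball what `_check_mounts_compatible` compares against, inspect the
control-plane container directly (a sketch; `laconic-cluster` is an example
name, and `kind get nodes` prints the real node container name):

```bash
# list kind clusters, then the node containers for one of them
kind get clusters
kind get nodes --name laconic-cluster
# dump the live bind mounts that the compatibility check reads
docker inspect laconic-cluster-control-plane --format '{{json .Mounts}}'
```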