Compare commits: v1.1.0-eb4...main

7 commits

| SHA1 |
|------|
| b3e9366ca0 |
| 3d703708c4 |
| 2ff7e5eb77 |
| cf0e230b66 |
| 7c65d39bb2 |
| 4977e3ff43 |
| 421b83c430 |
@@ -16,6 +16,7 @@ on:
      - '.github/workflows/triggers/test-k8s-deploy'
      - '.github/workflows/test-k8s-deploy.yml'
      - 'tests/k8s-deploy/run-deploy-test.sh'
      - 'tests/k8s-deploy/run-restart-test.sh'
  schedule:
    - cron: '3 15 * * *'

@@ -46,3 +47,5 @@ jobs:
        run: ./tests/scripts/install-kubectl.sh
      - name: "Run k8s deployment test"
        run: ./tests/k8s-deploy/run-deploy-test.sh
      - name: "Run restart k8s deployment test"
        run: ./tests/k8s-deploy/run-restart-test.sh

@@ -46,3 +46,13 @@
{"type":"comment","timestamp":"2026-04-17T08:13:32.753112339Z","issue_id":"so-o2o","payload":{"body":"Tested the version-detection fix (commit 832ab66d) locally. Fix works for its scope but surfaces two more bugs downstream. Current approach is broken at the architectural level, not just one-bug-fixable.\n\nWhat 832ab66d does: captures etcd image ref from crictl after cluster create, writes to {backup_dir}/etcd-image.txt, reads it on subsequent cleanup runs. Self-adapts to Kind upgrades. No more hardcoded v3.5.9. Confirmed locally: etcd-image.txt is written after first create, cleanup on second start uses it, member.backup-YYYYMMDD-HHMMSS dir is produced (proves cleanup ran end-to-end).\n\nWhat still fails after version fix: kubeadm init on cluster recreate. apiserver comes up but returns:\n- 403 Forbidden: User \"kubernetes-admin\" cannot get path /livez\n- 500: Body was not decodable ... json: cannot unmarshal array into Go value of type struct\n- eventually times out waiting for apiserver /livez\n\nTwo new bugs behind those:\n\n(a) Restore step corrupts binary values. In _clean_etcd_keeping_certs the restore loop is:\n key=$(echo $encoded | base64 -d | jq -r .key | base64 -d)\n val=$(echo $encoded | base64 -d | jq -r .value | base64 -d)\n echo \"$val\" | /backup/etcdctl put \"$key\"\nk8s stores objects as protobuf. Piping raw protobuf through bash variable expansion + echo mangles non-printable bytes, truncates at null bytes, and appends a trailing newline. Explains the \"cannot unmarshal\" from apiserver — the kubernetes Service/Endpoints objects in /registry are corrupted on re-put.\n\n(b) Whitelist is too narrow. We keep only /registry/secrets/caddy-system and the /registry/services entries for kubernetes. Everything else is deleted — including /registry/clusterrolebindings (cluster-admin is gone), /registry/serviceaccounts, /registry/secrets/kube-system (bootstrap tokens), RBAC roles, apiserver's auth config. Explains the 403 for kubernetes-admin — cluster-admin binding doesn't exist yet and kubeadm's pre-addon health check can't authorize.\n\nFixing (a) would mean rewriting the restore step to not use shell piping — either use a proper etcdctl-based Go tool, or write directly to the on-disk snapshot format. Fixing (b) means exhaustively whitelisting everything kubeadm/apiserver bootstrapping needs — a moving target across k8s versions. Both together are a significant undertaking for the actual requirement (\"keep 4 Caddy secrets across cluster recreate\").\n\nDecision: merge 832ab66d for the narrow version-detection fix + diagnosis trail, then implement the kubectl-level backup/restore on a separate branch. The etcd approach is not salvageable at reasonable cost."}}
{"type":"comment","timestamp":"2026-04-17T11:04:26.542659482Z","issue_id":"so-o2o","payload":{"body":"Shipped in PR #746. Etcd-persistence approach replaced with a kubectl-level Caddy Secret backup/restore gated on kind-mount-root.\n\nSummary of what landed:\n- components/ingress/caddy-cert-backup.yaml: SA/Role/RoleBinding + CronJob (alpine/kubectl:1.35.3) firing every 5min, writes {kind-mount-root}/caddy-cert-backup/caddy-secrets.yaml via atomic tmp+rename.\n- install_ingress_for_kind splits into 3 phases: pre-Deployment manifests → _restore_caddy_certs (kubectl apply from backup file) → Caddy Deployment → _install_caddy_cert_backup. Caddy pod can't exist until phase 3, so certs are always in place before secret_store startup.\n- Deleted _clean_etcd_keeping_certs, _get_etcd_host_path_from_kind_config, _capture_etcd_image, _read_etcd_image_ref, _etcd_image_ref_path and the etcd+PKI block in _generate_kind_mounts.\n- No new spec keys.\n\nTest coverage in tests/k8s-deploy/run-deploy-test.sh: install assertion after first --perform-cluster-management start, plus full E2E (seed fake manager=caddy Secret → trigger CronJob → verify backup file → stop/start --perform-cluster-management for cluster recreate → assert secret restored with matching decoded value).\n\nWoodburn migration: one-shot host-kubectl export to seed {kind-mount-root}/caddy-cert-backup/caddy-secrets.yaml was done manually on the running cluster (the in-cluster CronJob couldn't reach the host because the /srv/kind → /mnt extraMount was staged in kind-config.yml but never applied to the running cluster — it was added after cluster creation). File is in place for the eventual cluster recreate."}}
{"type":"close","timestamp":"2026-04-17T11:04:26.999711375Z","issue_id":"so-o2o","payload":{}}
{"type":"create","timestamp":"2026-04-20T13:14:26.312724048Z","issue_id":"so-7fc","payload":{"description":"## Problem\n\nFile-level host-path compose volumes (e.g. `../config/foo.sh:/opt/foo.sh`) were synthesized into a kind extraMount + k8s hostPath PV chain with a sanitized containerPath (`/mnt/host-path-\u003csanitized\u003e`).\n\n- On kind: two deployments of the same stack sharing a cluster collide at that containerPath — kind only honors the first deployment's bind, so subsequent deployments' pods silently read the first's file. No error, no warning.\n- On real k8s: the same code emits `hostPath: /mnt/host-path-*` but nothing populates that path on worker nodes — effectively broken.\n\nFile-level host-path binds are conceptually k8s ConfigMaps. The `snowballtools-base-backend` stack already uses the ConfigMap-backed named-volume pattern manually; this issue is to make that automatic for all stacks.\n\n## Resolution\n\nImplemented on branch `feat/so-b86-auto-configmap-host-path` (commit `cb84388d`), stacked on top of `feat/kind-mount-invariant-check`.\n\n**No deployment-dir file rewriting.** Compose files, spec.yml, and `{deployment_dir}/config/\u003cpod\u003e/` are untouched — trivially diffable against stack source, no synthetic volume names. ConfigMaps are materialized at deploy start and visible only in k8s (`kubectl get cm -n \u003cns\u003e`).\n\n### Deploy create — validation only\n\n| Source shape | Behavior |\n|---|---|\n| Single file | Accepted |\n| Flat directory, no subdirs, ≤ ~700 KiB | Accepted |\n| Directory with subdirs | `DeployerException` — guidance: embed in image / split configmaps / initContainer |\n| File or directory \u003e ~700 KiB | `DeployerException` — ConfigMap budget (accounts for base64 + metadata) |\n| `:rw` on any host-path bind | `DeployerException` — use a named volume for writable data |\n\n### Deploy start — k8s object generation\n\n- `cluster_info.get_configmaps()` walks pod + job compose volumes and emits a `V1ConfigMap` per host-path bind (deduped by sanitized name), content read from `{deployment_dir}/config/\u003cpod\u003e/\u003cfile\u003e`.\n- `volumes_for_pod_files` emits `V1ConfigMapVolumeSource` instead of `V1HostPathVolumeSource` for host-path binds.\n- `volume_mounts_for_service` stats the source and sets `V1VolumeMount.sub_path` to the filename when source is a regular file.\n- `_generate_kind_mounts` no longer emits `/mnt/host-path-*` extraMounts — ConfigMap path bypasses the kind node FS entirely.\n\n### Transition\n\nThe `/mnt/host-path-*` skip in `check_mounts_compatible` is retained as a transition tolerance for deployments created before this change. Test coverage in `tests/k8s-deploy/run-deploy-test.sh` asserts host-path ConfigMaps exist in the namespace, compose/spec in deployment dir unchanged, and no `/mnt/host-path-*` entries in kind-config.yml.","priority":"2","title":"File-level host-path compose volumes alias across deployments sharing a kind cluster","type":"bug"}}
{"type":"status_update","timestamp":"2026-04-20T13:14:26.833816262Z","issue_id":"so-7fc","payload":{"status":"closed"}}
{"type":"comment","timestamp":"2026-04-21T05:57:12.476299839Z","issue_id":"so-n1n","payload":{"body":"Already merged: 929bdab8 is an ancestor of origin/main; all four extraMount emit sites in helpers.py carry `propagation: HostToContainer` (umbrella, per-volume named, per-volume host-path, high-memlock spec)."}}
{"type":"status_update","timestamp":"2026-04-21T05:57:12.928842469Z","issue_id":"so-n1n","payload":{"status":"closed"}}
{"type":"comment","timestamp":"2026-04-21T06:08:13.933886638Z","issue_id":"so-ad7","payload":{"body":"Fixed in PR #744 (cf8b7533). get_services() now includes the maintenance pod in the container-ports map so its per-pod Service is built and available for the Ingress swap."}}
{"type":"status_update","timestamp":"2026-04-21T06:08:14.457815115Z","issue_id":"so-ad7","payload":{"status":"closed"}}
{"type":"update","timestamp":"2026-04-21T09:00:47.364859946Z","issue_id":"so-p3p","payload":{"description":"## Problem\n\nThe Caddy ingress controller image is hardcoded in `ingress-caddy-kind-deploy.yaml`, with no mechanism to update it short of cluster recreation or manual `kubectl patch`. laconic-so should: (1) allow spec.yml to specify a Caddy image, (2) support updating the Caddy image as part of `deployment start`, (3) set `strategy: Recreate` on the Caddy Deployment since hostPort pods can't rolling-update.\n\n## Resolution\n\n- New spec key `caddy-ingress-image`. Fresh install uses it (fallback: manifest default). On subsequent `deployment start`, if the spec key is set and the running Caddy image differs, SO patches the Deployment and waits for rollout.\n- Spec key absent =\u003e SO does **not** touch a running Caddy, to avoid silently reverting images set out-of-band (ansible playbook, another deployment's spec).\n- `strategy: Recreate` added to the Caddy Deployment manifest.\n- Reconcile runs under both `--perform-cluster-management` and the default `--skip-cluster-management` (it's a plain k8s-API patch, not a cluster lifecycle op).\n- Image substitution locates the container by name instead of string-matching the shipped default, so the spec override wins regardless of what the manifest hardcodes.\n- Cluster-scoped caveat: `caddy-system` is shared across deployments; last `deployment start` that sets the key wins for everyone. Documented in `deployment_patterns.md`."}}
{"type":"status_update","timestamp":"2026-04-21T09:00:47.745675131Z","issue_id":"so-p3p","payload":{"status":"closed"}}
{"type":"comment","timestamp":"2026-04-27T13:41:16.962883653Z","issue_id":"so-078","payload":{"body":"Fixed. deploy create now copies commands.py into deployment_dir/hooks/. call_stack_deploy_start loads hooks from the deployment dir instead of resolving via get_stack_path, so deployment start no longer requires the stack repo to be present or cwd to be correct."}}
{"type":"close","timestamp":"2026-04-27T13:41:17.073012545Z","issue_id":"so-078","payload":{}}
@@ -164,6 +164,79 @@ To stop a single deployment without affecting the cluster:
laconic-so deployment --dir my-deployment stop --skip-cluster-management
```

Stacks sharing a cluster must agree on mount topology. See
[Volume Persistence in k8s-kind](#volume-persistence-in-k8s-kind).

### cluster-id vs deployment-id

Each deployment's `deployment.yml` carries two identifiers with
different roles:

- **`cluster-id`** — which kind cluster this deployment attaches to.
  Used for the kube-config context name (`kind-{cluster-id}`) and for
  kind lifecycle ops. Inherited from the running cluster at
  `deploy create` time when one exists; freshly generated otherwise.
  Shared across every deployment that joins the same cluster.
- **`deployment-id`** — this particular deployment's identity.
  Generated fresh on every `deploy create` and never inherited. Flows
  into `app_name`, the prefix on every k8s resource name this
  deployment creates (PVs, ConfigMaps, Deployments, PVCs, …). Distinct
  per deployment even when the cluster is shared.

The split prevents silent resource-name collisions between
deployments sharing a cluster: two deployments of the same stack,
or any two deployments that happen to declare a volume with the same
name, still produce distinct `{deployment-id}-{vol}` PV names.
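For concreteness, a minimal sketch of a `deployment.yml` carrying both fields; the identifier values here are invented and other keys are omitted:

```yaml
# deployment.yml (sketch; identifier values invented)
cluster-id: laconic-f185a2b6f5d03f82     # shared by every deployment on this kind cluster
deployment-id: laconic-9c41d7e0aa12be34  # unique to this deployment; prefixes its k8s resource names
```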
**Backward compatibility**: `deployment.yml` files written before the
`deployment-id` field existed fall back to using `cluster-id` as the
deployment-id. Existing resource names stay stable across this
upgrade — no PV renames, no re-bind, no data orphaning. The next
`deploy create` writes both fields going forward.

**Namespace ownership**: on top of distinct resource names, SO stamps
the k8s namespace with a `laconic.com/deployment-dir` annotation on
first creation. A subsequent `deployment start` from a different
deployment directory that would land in the same namespace fails
with a `DeployerException` pointing at the `namespace:` spec
override. Catches operator-error cases where the same deployment dir
is effectively registered twice.
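A sketch of the stamped namespace; the annotation key comes from the text above, while the namespace name and path are invented for illustration:

```yaml
# Namespace as stamped on first creation (sketch; name and path invented)
apiVersion: v1
kind: Namespace
metadata:
  name: my-stack-ns
  annotations:
    laconic.com/deployment-dir: /home/ops/deployments/my-deployment
```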
### Caddy ingress image lifecycle

The Caddy ingress controller lives in the cluster-scoped
`caddy-system` namespace and is installed on first `deployment start`.
Its image is configurable per deployment:

```yaml
# spec.yml
caddy-ingress-image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3
```

Two cases, intentionally different:

- **Spec key set**: on first install the manifest is templated with
  this image. On subsequent `deployment start`, if the running Caddy
  Deployment's image differs, laconic-so patches it and waits for the
  rollout. The Deployment uses `strategy: Recreate` (hostPort 80/443
  blocks rolling updates from ever completing), so expect ~10–30s of
  ingress downtime while the old pod terminates and the new one
  starts.
- **Spec key absent**: on first install the manifest's hardcoded
  default (`ghcr.io/laconicnetwork/caddy-ingress:latest`) is used.
  On subsequent `deployment start`, laconic-so does **not** touch the
  running Caddy Deployment. This matters when the image was set
  out-of-band (via an ansible playbook, or by another deployment's
  spec that's since been removed) — a silent revert to the default
  would be worse than doing nothing. If you want to go back to the
  default image, set `caddy-ingress-image` to it explicitly.

**Cluster-scoped caveat**: `caddy-system` is shared by every
deployment on the cluster. Setting `caddy-ingress-image` in any one
deployment's spec rolls the controller for all of them — last
`deployment start` wins. Treat it as a cluster-level knob; keep the
value consistent across the deployments sharing a cluster.
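Conceptually, the reconcile amounts to a patch of the Deployment's pod-template image, roughly as sketched below. The container name here is an assumption (SO locates the container by name rather than by matching the shipped default image string):

```yaml
# Effective patch applied by the reconcile (sketch; container name assumed)
spec:
  template:
    spec:
      containers:
        - name: caddy-ingress-controller
          image: ghcr.io/laconicnetwork/caddy-ingress:v1.2.3
```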
## Volume Persistence in k8s-kind

k8s-kind has 3 storage layers:
@@ -172,7 +245,9 @@ k8s-kind has 3 storage layers:
- **Kind Node**: A Docker container simulating a k8s node
- **Pod Container**: Your workload

Volumes with paths are mounted from Docker Host → Kind Node → Pod via kind
`extraMounts`. Kind applies `extraMounts` only at cluster creation — they
cannot be added to a running cluster.

| spec.yml volume | Storage Location | Survives Pod Restart | Survives Cluster Restart |
|-----------------|------------------|---------------------|-------------------------|
@@ -200,3 +275,100 @@ Empty-path volumes appear persistent because they survive pod restarts (data liv
in Kind Node container). However, this data is lost when the kind cluster is
recreated. This "false persistence" has caused data loss when operators assumed
their data was safe.

### Shared Clusters: Use `kind-mount-root`

Because kind `extraMounts` can only be set at cluster creation, the first
deployment to start locks in the mount topology. Later deployments that
declare new `extraMounts` have them silently ignored — their PVs fall
through to the kind node's overlay filesystem and lose data on cluster
destroy.

The fix is an umbrella mount. Set `kind-mount-root` in the spec, pointing
at a host directory all stacks will share:

```yaml
# spec.yml
kind-mount-root: /srv/kind

volumes:
  my-data: /srv/kind/my-stack/data  # visible at /mnt/my-stack/data in-node
```

SO emits a single `extraMount` (`<kind-mount-root>` → `/mnt`). Any new
host subdirectory under the root is visible in the node immediately — no
cluster recreate needed to add stacks.
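The generated kind config then carries a single node-level mount, roughly as in the excerpt below (a sketch, assuming the `/srv/kind` → `/mnt` mapping above and the `propagation: HostToContainer` setting noted elsewhere in this change set):

```yaml
# kind-config.yml excerpt (generated; sketch)
nodes:
  - role: control-plane
    extraMounts:
      - hostPath: /srv/kind        # <kind-mount-root> from spec.yml
        containerPath: /mnt
        propagation: HostToContainer
```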
**All stacks sharing a cluster must agree on `kind-mount-root`** and keep
their host paths under it.
### Mount Compatibility Enforcement

`laconic-so deployment start` validates mount topology:

- **On first cluster creation** without an umbrella mount: prints a
  warning (future stacks may require a full recreate to add mounts).
- **On cluster reuse**: compares the new deployment's `extraMounts`
  against the live mounts on the control-plane container. Any mismatch
  (wrong host path, or mount missing) fails the deploy.
### Static files in compose volumes → auto-ConfigMap

Compose volumes that bind a host file or flat directory into a container
(e.g. `../config/test/script.sh:/opt/run.sh`) are used to inject static
content that ships with the stack. k8s doesn't have a native notion of
this — the canonical way to inject static content is a ConfigMap.

At `deploy start`, laconic-so auto-generates a namespace-scoped
ConfigMap per host-path compose volume (deduped by source) and mounts
it into the pod instead of routing the bind through the kind node:

| Source shape | Behavior |
|---|---|
| Single file | ConfigMap with one key (the filename); pod mount uses `subPath` so the single key lands at the compose target path |
| Flat directory (no subdirs, ≤ ~700 KiB) | ConfigMap with one key per file; pod mount exposes all keys at the target path |
| Directory with subdirs, or over budget | Rejected at `deploy create` — embed in the container image, split into multiple ConfigMaps, or use an initContainer |
| `:rw` on any host-path bind | Rejected at `deploy create` — use a named volume with a spec-configured host path for writable data |

The deployment dir layout is unchanged: compose files stay verbatim and
`spec.yml` is not rewritten. Source files remain under
`{deployment_dir}/config/{pod}/` (as copied by `deploy create`); the
ConfigMap is built from them at deploy start and no kind extraMount is
emitted for these paths.

This works identically on kind and real k8s (ConfigMaps are
cluster-native; no node-side landing pad required), and two deployments
of the same stack sharing a cluster get their own per-namespace
ConfigMaps — no aliasing.
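For the single-file case, the generated objects look roughly like the sketch below. The resource and key names are illustrative only (the actual sanitized naming scheme is not reproduced here); the mount path matches the `../config/test/script.sh:/opt/run.sh` example above:

```yaml
# Generated at deploy start (sketch; names illustrative)
apiVersion: v1
kind: ConfigMap
metadata:
  name: my-deployment-id-test-script
data:
  script.sh: |
    #!/bin/sh
    echo "hello"
---
# Pod spec fragment: the single key lands at the compose target path via subPath
volumes:
  - name: test-script
    configMap:
      name: my-deployment-id-test-script
containers:
  - name: app
    volumeMounts:
      - name: test-script
        mountPath: /opt/run.sh
        subPath: script.sh
```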
### Writable / generated data → named volume + host path

For volumes the workload *writes to* (databases, ledgers, caches, logs),
use a named volume backed by a spec-configured host path under
`kind-mount-root`:

```yaml
# compose
volumes:
  - my-data:/var/lib/foo

# spec.yml
kind-mount-root: /srv/kind
volumes:
  my-data: /srv/kind/my-stack/data
```

Works on both kind (via the umbrella mount) and real k8s (operator
provisions `/srv/kind/my-stack/data` on each node).

### Migrating an Existing Cluster

If a cluster was created without an umbrella mount and you need to add a
stack that requires new host-path mounts, the cluster must be recreated:

1. Back up ephemeral state (DBs, caches) from PVs that lack host mounts —
   these are in the kind node overlay FS and do not survive `kind delete`.
2. Update every stack's spec to set a shared `kind-mount-root` and place
   host paths under it.
3. Stop all deployments, destroy the cluster, recreate it by starting any
   stack (umbrella now active), and restore state.
@@ -23,6 +23,7 @@ compose_deploy_type = "compose"
k8s_kind_deploy_type = "k8s-kind"
k8s_deploy_type = "k8s"
cluster_id_key = "cluster-id"
deployment_id_key = "deployment-id"
kube_config_key = "kube-config"
deploy_to_key = "deploy-to"
network_key = "network"

@@ -47,5 +48,7 @@ high_memlock_runtime = "high-memlock"
high_memlock_spec_filename = "high-memlock-spec.json"
acme_email_key = "acme-email"
kind_mount_root_key = "kind-mount-root"
caddy_ingress_image_key = "caddy-ingress-image"
default_caddy_ingress_image = "ghcr.io/laconicnetwork/caddy-ingress:latest"
external_services_key = "external-services"
ca_certificates_key = "ca-certificates"
@@ -0,0 +1,33 @@
version: '3.2'

services:
  host-telegraf:
    image: telegraf:1.36
    restart: unless-stopped
    network_mode: host
    pid: host
    entrypoint: ["/scripts/telegraf-entrypoint.sh"]
    environment:
      INFLUXDB_URL: ${INFLUXDB_URL}
      INFLUXDB_DB: ${INFLUXDB_DB:-host_metrics}
      INFLUXDB_USER: ${INFLUXDB_WRITE_USER}
      INFLUXDB_PASSWORD: ${INFLUXDB_WRITE_PASSWORD}
      COLLECT_INTERVAL: ${COLLECT_INTERVAL:-10s}
      HOST_TAG: ${HOST_TAG:-}
      COLLECT_ZFS: ${COLLECT_ZFS:-false}
    volumes:
      - ../config/host-metrics/telegraf.conf.tpl:/etc/telegraf/telegraf.conf.tpl:ro
      - ../config/host-metrics/scripts/telegraf-entrypoint.sh:/scripts/telegraf-entrypoint.sh:ro
      - /proc:/hostfs/proc:ro
      - /sys:/hostfs/sys:ro
      - /:/hostfs:ro
      # /dev is needed by inputs.diskio: it enumerates devices from
      # /proc/diskstats and then opens /dev/<name> for udev/uevent lookups.
      # Without this mount telegraf logs an "error reading /dev/<name>" warning
      # per device per collection cycle.
      - /dev:/dev:ro
      # /run/udev is where modern systemd stores the udev database that
      # gopsutil consults for per-device tags. Without it telegraf falls
      # back to the legacy /dev/.udev/db/... path which doesn't exist on
      # systemd hosts, producing "stat /dev/.udev/db/block:..." warnings.
      - /run/udev:/run/udev:ro
@@ -0,0 +1,5 @@
services:
  test-restart:
    image: busybox:1.36
    command: ["sh", "-c", "echo started && sleep infinity"]
    restart: always
@@ -0,0 +1,68 @@
#!/bin/sh
# host-metrics telegraf-entrypoint.sh
# Render telegraf.conf from telegraf.conf.tpl, then exec telegraf.
#
# Substitutions performed here (by awk):
#   @@HOST_TAG_BLOCK@@ -> "[global_tags]\n  host = \"$HOST_TAG\"" if set, else empty.
#   @@ZFS_BLOCK@@      -> "[[inputs.zfs]]\n  poolMetrics = true" if COLLECT_ZFS=true, else empty.
#
# Variables of the form ${VAR} in the template (INFLUXDB_URL, INFLUXDB_DB,
# INFLUXDB_USER, INFLUXDB_PASSWORD, COLLECT_INTERVAL) are resolved by
# telegraf's own env-var substitution at config-load time and are NOT
# touched by this script.
#
# TELEGRAF_CONF_DIR overrides the conf directory for tests; defaults to
# /etc/telegraf which is the standard path inside the official image.

set -eu

CONF_DIR="${TELEGRAF_CONF_DIR:-/etc/telegraf}"
TPL="$CONF_DIR/telegraf.conf.tpl"
OUT="$CONF_DIR/telegraf.conf"

# Fail-fast required env. Empty string counts as missing -- a half-rendered
# conf or a noisy telegraf auth error is worse than a clear startup failure.
for v in INFLUXDB_URL INFLUXDB_USER INFLUXDB_PASSWORD; do
  eval val=\${$v:-}
  if [ -z "$val" ]; then
    echo "FATAL: $v is required but empty" >&2
    exit 1
  fi
done

# Apply defaults for optional vars.
: "${INFLUXDB_DB:=host_metrics}"
: "${COLLECT_INTERVAL:=10s}"
: "${HOST_TAG:=}"
: "${COLLECT_ZFS:=false}"

# Build the marker substitutions. Use printf for the newline so the
# rendered block lands on its own line.
if [ -n "$HOST_TAG" ]; then
  HOST_TAG_BLOCK=$(printf '[global_tags]\n  host = "%s"' "$HOST_TAG")
else
  HOST_TAG_BLOCK=""
fi

if [ "$COLLECT_ZFS" = "true" ]; then
  ZFS_BLOCK=$(printf '[[inputs.zfs]]\n  poolMetrics = true')
else
  ZFS_BLOCK=""
fi

# Export telegraf hostfs envs so /proc, /sys, and root come from the
# bind-mount under /hostfs (set in compose).
export HOST_PROC=/hostfs/proc
export HOST_SYS=/hostfs/sys
export HOST_MOUNT_PREFIX=/hostfs

# Render with awk: handles multi-line replacement values cleanly,
# avoids sed's newline-in-replacement portability quirks across BusyBox /
# GNU / BSD sed.
awk -v ht="$HOST_TAG_BLOCK" -v zb="$ZFS_BLOCK" '
  { gsub(/@@HOST_TAG_BLOCK@@/, ht);
    gsub(/@@ZFS_BLOCK@@/, zb);
    print }
' "$TPL" > "$OUT"

exec telegraf --config "$OUT"
@@ -0,0 +1,121 @@
#!/bin/sh
# Offline tests for host-metrics telegraf-entrypoint.sh.
# Stubs the telegraf binary (the downstream consumer of the rendered config);
# no real telegraf or InfluxDB is needed.
set -eu

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
ENTRYPOINT="$SCRIPT_DIR/telegraf-entrypoint.sh"

[ -x "$ENTRYPOINT" ] || { echo "FATAL: $ENTRYPOINT not executable"; exit 2; }

TMP=$(mktemp -d)
trap 'rm -rf "$TMP"' EXIT
mkdir -p "$TMP/bin" "$TMP/etc/telegraf"

# Stub telegraf so `exec telegraf` is a no-op.
cat > "$TMP/bin/telegraf" <<'EOF'
#!/bin/sh
exit 0
EOF
chmod +x "$TMP/bin/telegraf"

# Minimal template that exercises both markers.
cat > "$TMP/etc/telegraf/telegraf.conf.tpl" <<'EOF'
@@HOST_TAG_BLOCK@@

[agent]
  interval = "${COLLECT_INTERVAL}"

[[outputs.influxdb]]
  urls = ["${INFLUXDB_URL}"]

@@ZFS_BLOCK@@
EOF

PASS=0
FAIL=0

# run sets required env defaults, then layers caller env on top.
run() {
  env PATH="$TMP/bin:$PATH" \
    TELEGRAF_CONF_DIR="$TMP/etc/telegraf" \
    INFLUXDB_URL="${INFLUXDB_URL-http://example/}" \
    INFLUXDB_USER="${INFLUXDB_USER-writer}" \
    INFLUXDB_PASSWORD="${INFLUXDB_PASSWORD-secret}" \
    INFLUXDB_DB="${INFLUXDB_DB-host_metrics}" \
    COLLECT_INTERVAL="${COLLECT_INTERVAL-10s}" \
    HOST_TAG="${HOST_TAG-}" \
    COLLECT_ZFS="${COLLECT_ZFS-false}" \
    sh "$ENTRYPOINT" >/dev/null
  rc=$?
  [ -f "$TMP/etc/telegraf/telegraf.conf" ] && cat "$TMP/etc/telegraf/telegraf.conf"
  return $rc
}

assert_grep() {
  name=$1; actual=$2; pattern=$3
  if printf '%s' "$actual" | grep -qE "$pattern"; then
    echo "PASS: $name"; PASS=$((PASS + 1))
  else
    echo "FAIL: $name"
    echo "  expected pattern: $pattern"
    echo "  actual: $actual"
    FAIL=$((FAIL + 1))
  fi
}

assert_not_grep() {
  name=$1; actual=$2; pattern=$3
  if printf '%s' "$actual" | grep -qE "$pattern"; then
    echo "FAIL: $name (matched pattern $pattern)"; FAIL=$((FAIL + 1))
  else
    echo "PASS: $name"; PASS=$((PASS + 1))
  fi
}

# T1: HOST_TAG unset -> no [global_tags] block emitted
out=$(HOST_TAG="" run)
assert_not_grep "T1: HOST_TAG empty -> no global_tags" "$out" '^\[global_tags\]'

# T2: HOST_TAG set -> [global_tags] block with host = "<value>"
out=$(HOST_TAG="validator-1" run)
assert_grep "T2: HOST_TAG set -> [global_tags] block" "$out" '^\[global_tags\]'
assert_grep "T2: HOST_TAG set -> host = \"validator-1\"" "$out" 'host = "validator-1"'

# T3: COLLECT_ZFS=true -> [[inputs.zfs]] block present
out=$(COLLECT_ZFS="true" run)
assert_grep "T3: COLLECT_ZFS true -> inputs.zfs block" "$out" '\[\[inputs\.zfs\]\]'

# T4: COLLECT_ZFS=false -> no inputs.zfs block
out=$(COLLECT_ZFS="false" run)
assert_not_grep "T4: COLLECT_ZFS false -> no inputs.zfs" "$out" '\[\[inputs\.zfs\]\]'

# T5: markers fully removed even when block bodies are empty
out=$(HOST_TAG="" COLLECT_ZFS="false" run)
assert_not_grep "T5: no leftover @@HOST_TAG_BLOCK@@" "$out" '@@HOST_TAG_BLOCK@@'
assert_not_grep "T5: no leftover @@ZFS_BLOCK@@" "$out" '@@ZFS_BLOCK@@'

# T6: missing INFLUXDB_URL -> exit non-zero, error on stderr
rc=0
INFLUXDB_URL="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_URL "$TMP/err" \
  && { echo "PASS: T6: missing INFLUXDB_URL -> error"; PASS=$((PASS + 1)); } \
  || { echo "FAIL: T6: missing INFLUXDB_URL handling (rc=$rc)"; FAIL=$((FAIL + 1)); }

# T7: missing INFLUXDB_USER -> exit non-zero
rc=0
INFLUXDB_USER="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_USER "$TMP/err" \
  && { echo "PASS: T7: missing INFLUXDB_USER -> error"; PASS=$((PASS + 1)); } \
  || { echo "FAIL: T7: missing INFLUXDB_USER handling (rc=$rc)"; FAIL=$((FAIL + 1)); }

# T8: missing INFLUXDB_PASSWORD -> exit non-zero
rc=0
INFLUXDB_PASSWORD="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_PASSWORD "$TMP/err" \
  && { echo "PASS: T8: missing INFLUXDB_PASSWORD -> error"; PASS=$((PASS + 1)); } \
  || { echo "FAIL: T8: missing INFLUXDB_PASSWORD handling (rc=$rc)"; FAIL=$((FAIL + 1)); }

echo
echo "Results: $PASS passed, $FAIL failed"
[ "$FAIL" = "0" ]
@@ -0,0 +1,66 @@
# host-metrics telegraf template.
# Rendered at container start by telegraf-entrypoint.sh. The entrypoint
# replaces two single-line markers in this file with TOML block fragments;
# see telegraf-entrypoint.sh for the substitution details. All ${...}
# variables are resolved by telegraf's native env substitution at
# config-load time.

@@HOST_TAG_BLOCK@@

[agent]
  interval = "${COLLECT_INTERVAL}"
  round_interval = true
  collection_jitter = "0s"
  flush_interval = "${COLLECT_INTERVAL}"
  flush_jitter = "0s"
  precision = "0s"
  hostname = ""
  omit_hostname = false

[[outputs.influxdb]]
  urls = ["${INFLUXDB_URL}"]
  database = "${INFLUXDB_DB}"
  skip_database_creation = true
  username = "${INFLUXDB_USER}"
  password = "${INFLUXDB_PASSWORD}"
  retention_policy = ""
  write_consistency = "any"
  timeout = "10s"

[[inputs.cpu]]
  percpu = false
  totalcpu = true
  collect_cpu_time = false
  report_active = true

[[inputs.mem]]

[[inputs.swap]]

[[inputs.system]]

[[inputs.processes]]

[[inputs.disk]]
  # gopsutil with HOST_MOUNT_PREFIX=/hostfs strips the /hostfs prefix
  # from /proc/mounts entries, so the mountpoints telegraf sees are
  # the host's real paths (/, /boot, /home, ...). No mount_points
  # filter; let ignore_fs do the noise reduction.
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay",
               "aufs", "squashfs", "nsfs", "tracefs", "proc", "sysfs",
               "cgroup", "cgroup2", "fuse.lxcfs"]

[[inputs.diskio]]
  device_tags = ["DEVNAME"]
  skip_serial_number = true
  name_templates = ["$DEVNAME"]

[[inputs.net]]
  # Allowlist covers physical ethernet (eth*, en*), wireless (wl*),
  # cellular (wwan*), and bonded/teamed interfaces (bond*). Excludes
  # docker bridges, veth pairs, tun/tap, and lo. Adjust per host if
  # you need a more specific scope.
  ignore_protocol_stats = true
  interfaces = ["eth*", "en*", "wl*", "wwan*", "bond*"]

@@ZFS_BLOCK@@
@@ -160,6 +160,11 @@ metadata:
    app.kubernetes.io/component: controller
spec:
  replicas: 1
  # Recreate is required: the pod binds hostPort 80/443, which a
  # RollingUpdate would try to double-claim during cutover (new pod
  # pending until old pod exits — never exits, rollout deadlocks).
  strategy:
    type: Recreate
  selector:
    matchLabels:
      app.kubernetes.io/name: caddy-ingress-controller
@@ -0,0 +1,127 @@
# host-metrics stack

Per-host system metrics collector. Runs telegraf with host networking, host
PID namespace, and read-only bind mounts of /proc, /sys, and / so it can
report real CPU, memory, disk, network, and process metrics for the machine
it runs on. Writes to an InfluxDB 1.x endpoint of your choosing.

Deploy one instance per machine you want monitored.

## What gets collected

| Input | Measurements (in InfluxDB) |
|-------|----------------------------|
| inputs.cpu (totalcpu only) | cpu (`cpu=cpu-total`) |
| inputs.mem | mem |
| inputs.swap | swap |
| inputs.system | system (uptime, load1/5/15, n_users, n_cpus) |
| inputs.processes | processes (running/sleeping/blocked/zombies) |
| inputs.disk | disk (used/free/used_percent per mount) |
| inputs.diskio | diskio (read/write bytes/ops per device) |
| inputs.net | net (bytes/packets/err in/out per interface) |
| inputs.zfs (opt-in via COLLECT_ZFS=true) | zfs (ARC stats, pool state) |

All rows are tagged with `host` (kernel hostname, or `HOST_TAG` override).

## Deploy

### Create a spec

```bash
laconic-so --stack host-metrics deploy init --output spec-host-metrics.yml
```

Edit `spec-host-metrics.yml` to look like:

```yaml
stack: host-metrics
deploy-to: compose
credentials-files:
  - ~/.credentials/host-metrics.env
config:
  INFLUXDB_URL: 'https://influxdb.example.com'
  INFLUXDB_DB: 'host_metrics'   # default; override for a custom DB
  HOST_TAG: 'validator-1'       # optional; defaults to kernel hostname
  COLLECT_INTERVAL: '10s'       # telegraf collection + flush cadence
  COLLECT_ZFS: 'false'          # set to 'true' on ZFS hosts
```

`~/.credentials/host-metrics.env` must contain:

```
INFLUXDB_WRITE_USER=<writer-username>
INFLUXDB_WRITE_PASSWORD=<writer-password>
```

These are issued by the InfluxDB admin (the monitoring host operator); they
are the same writer-only credentials used by validators/RPCs to push agave
metrics.

### Create and start

```bash
laconic-so --stack host-metrics deploy create \
  --spec-file spec-host-metrics.yml \
  --deployment-dir ./deployment-host-metrics
laconic-so deployment --dir ./deployment-host-metrics start
```

`deploy create` builds the deployment dir from the spec; `deployment start`
brings the containers up. The `--stack` option is required for `deploy`
subcommands but rejected on `deployment` subcommands (the deployment dir
already knows its stack).

### Verify

```bash
laconic-so deployment --dir ./deployment-host-metrics logs host-telegraf | head
```

Expected: telegraf prints its startup banner and `Loaded inputs: ...`. No
errors about missing config or auth failures.

Within ~20 seconds, the host's data appears in the InfluxDB endpoint's
`host_metrics` database (or whichever DB you set in INFLUXDB_DB) and in
any Grafana dashboards bound to that DB.

## Configuration reference

| Env | Required | Default | Notes |
|-----|----------|---------|-------|
| `INFLUXDB_URL` | yes | - | Full URL including scheme. Example: `https://influxdb.example.com`. |
| `INFLUXDB_DB` | no | `host_metrics` | Target database. Must exist (writer is not granted CREATE). |
| `INFLUXDB_WRITE_USER` | yes | - | Writer-only user. |
| `INFLUXDB_WRITE_PASSWORD` | yes | - | Writer-only password. |
| `COLLECT_INTERVAL` | no | `10s` | Telegraf collection and flush cadence. |
| `HOST_TAG` | no | empty | Overrides the kernel hostname for the `host` tag on every row. Useful when a VM has a generic hostname. |
| `COLLECT_ZFS` | no | `false` | Set to `true` to enable `inputs.zfs` (pool state + ARC stats). |

## ZFS hosts

`inputs.disk` already reports used/free per mount for any filesystem type
including ZFS, so the disk-usage view works out of the box. Setting
`COLLECT_ZFS=true` additionally enables `inputs.zfs` which reads
`/proc/spl/kstat/zfs/...` and emits ARC hit ratio, ARC size, and per-pool
health metrics. The bind mount of `/proc` provides the necessary
visibility; no extra mounts are needed.

If you set `COLLECT_ZFS=true` on a non-ZFS host, telegraf logs an error
once per collection cycle and skips the input. Harmless but noisy; leave
the toggle off on non-ZFS machines.

## Troubleshooting

| Symptom | Likely cause |
|---------|-------------|
| Container fails to start with `FATAL: INFLUXDB_URL is required but empty` | Missing required env. Check spec.yml + credentials file. |
| Container starts, no rows appear in InfluxDB | Writer credentials wrong, or InfluxDB unreachable from this host's network. Check `docker logs <host-telegraf>` for `Post ... 401` / `connection refused`. |
| Two hosts overwriting each other's series | Both use the same kernel hostname. Set distinct `HOST_TAG` values. |
| `inputs.processes` reports only 1 process | `pid: host` missing from compose. Re-deploy. |

## Caveats

- Requires Docker with privileges to bind-mount `/`, `/proc`, `/sys`, and to
  share the host PID namespace. Rootless Docker installations may refuse
  `pid: host` and the `/` bind mount.
- One deployment per host. Running two on the same machine writes
  duplicate rows under the same `host` tag.
@@ -0,0 +1,5 @@
version: "1.1"
name: host-metrics
description: "Per-host system metrics collector (telegraf -> InfluxDB)"
pods:
  - host-metrics
@@ -0,0 +1,14 @@
# test-restart-multi

E2E test stack used by `tests/k8s-deploy/run-restart-test.sh` to cover the
multi-repo case: `pods:` references two pod repos, each shipping its own
`deploy/commands.py`. `deploy create` should produce
`<deployment>/hooks/commands_0.py` and `<deployment>/hooks/commands_1.py`,
and `deployment start` should invoke both `start()` hooks (each writes its
own marker file so neither overwrites the other).

The pod repos themselves are created by the test script as bare-repo +
working-clone pairs under `$CERC_REPO_BASE_DIR/test-restart-pod-{a,b}`;
they are not committed to this repository. Each pod repo ships its own
`docker-compose.yml` (resolved by `get_pod_file_path` for dict-form pods)
and `stack/deploy/commands.py` — the stack repo only owns `stack.yml`.
@@ -0,0 +1,10 @@
version: "1.0"
name: test-restart-multi
description: "E2E test stack for the deployment restart command (multi-repo case)"
pods:
  - name: test-restart-multi-a
    repository: test-restart-pod-a
    path: .
  - name: test-restart-multi-b
    repository: test-restart-pod-b
    path: .
@@ -0,0 +1,15 @@
# test-restart

E2E test stack used by `tests/k8s-deploy/run-restart-test.sh`.

The stack ships a small `start()` hook that writes a versioned marker file
into the deployment directory. The test exercises `deployment restart`:

1. `deploy create` → asserts `commands.py` was copied into `<deployment>/hooks/`.
2. `deployment start` → asserts the marker file contains the v1 string.
3. Modifies `commands.py` in the stack-source working tree (v1 → v2).
4. `deployment restart` → asserts the new `commands.py` was re-copied into
   `<deployment>/hooks/` and the marker file now contains the v2 string.

The pod uses a public `busybox` image that just sleeps; the marker file is
the only thing under test.
@@ -0,0 +1,32 @@
# Copyright © 2026 Vulcanize

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from stack_orchestrator.util import get_yaml
from stack_orchestrator.deploy.deployment_context import DeploymentContext

default_spec_file_content = ""


def init(command_context):
    return get_yaml().load(default_spec_file_content)


def start(deployment_context: DeploymentContext):
    # Writes a marker file the e2e test asserts on. The test flips the
    # literal below from "v1" to "v2" in the stack-source working tree
    # before running 'deployment restart' to verify the updated hook is
    # copied into deployment_dir/hooks/ and re-executed.
    marker = deployment_context.deployment_dir / "marker"
    marker.write_text("v1")
@@ -0,0 +1,5 @@
version: "1.0"
name: test-restart
description: "E2E test stack for the deployment restart command"
pods:
  - test-restart
@@ -48,10 +48,21 @@ class DockerDeployer(Deployer):
        self.compose_project_name = compose_project_name
        self.compose_env_file = compose_env_file

    def up(
        self,
        detach,
        skip_cluster_management,
        services,
        image_overrides=None,
        force_recreate=False,
    ):
        if not opts.o.dry_run:
            try:
                return self.docker.compose.up(
                    detach=detach,
                    services=services,
                    force_recreate=force_recreate,
                )
            except DockerException as e:
                raise DeployerException(e)
@@ -142,6 +142,7 @@ def up_operation(
    stay_attached=False,
    skip_cluster_management=False,
    image_overrides=None,
    force_recreate=False,
):
    global_context = ctx.parent.parent.obj
    deploy_context = ctx.obj

@@ -161,6 +162,7 @@ def up_operation(
        skip_cluster_management=skip_cluster_management,
        services=services_list,
        image_overrides=image_overrides,
        force_recreate=force_recreate,
    )
    for post_start_command in cluster_context.post_start_commands:
        _run_command(global_context, cluster_context.cluster, post_start_command)
@@ -20,7 +20,14 @@ from typing import Optional

class Deployer(ABC):
    @abstractmethod
    def up(
        self,
        detach,
        skip_cluster_management,
        services,
        image_overrides=None,
        force_recreate=False,
    ):
        pass

    @abstractmethod
@@ -471,12 +471,18 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip, image):
            ctx, deployment_context, maintenance_svc, image_overrides
        )
    else:
        # force_recreate=True so source-file edits (alert rules, dashboards,
        # entrypoint scripts, etc. mounted via bind volumes) are picked up.
        # docker compose up -d alone is a no-op when the service definition
        # itself is unchanged, leaving the running container with stale
        # in-memory state.
        up_operation(
            ctx,
            services_list=None,
            stay_attached=False,
            skip_cluster_management=True,
            image_overrides=image_overrides or None,
            force_recreate=True,
        )

    # Restore cwd after both create_operation and up_operation have run.

@@ -514,12 +520,15 @@ def _restart_with_maintenance(

    # Step 1: Apply the full deployment (creates/updates all pods + services)
    # This ensures maintenance pod exists before we swap Ingress to it.
    # force_recreate intent matches the non-maintenance restart path; the
    # k8s deployer currently ignores the flag (TODO in deploy_k8s.up).
    up_operation(
        ctx,
        services_list=None,
        stay_attached=False,
        skip_cluster_management=True,
        image_overrides=image_overrides or None,
        force_recreate=True,
    )

    # Parse maintenance service spec: "container-name:port"
@@ -26,6 +26,7 @@ from stack_orchestrator.deploy.spec import Spec
 class DeploymentContext:
     deployment_dir: Path
     id: str
+    deployment_id: str
     spec: Spec
     stack: Stack

@@ -48,8 +49,27 @@ class DeploymentContext:
         return self.get_compose_dir() / f"docker-compose-{name}.yml"

     def get_cluster_id(self):
+        """Identifier of the kind cluster this deployment attaches to.
+
+        Shared across deployments that join the same kind cluster. Used
+        for the kube-config context name (`kind-{cluster-id}`) and for
+        kind cluster lifecycle ops.
+        """
         return self.id

+    def get_deployment_id(self):
+        """Identifier of this particular deployment's k8s resources.
+
+        Distinct per deployment even when multiple deployments share a
+        cluster. Used as compose_project_name → app_name → prefix for
+        all k8s resource names (PVs, ConfigMaps, Deployments, …).
+
+        Backward compat: for deployment.yml files written before this
+        field existed, falls back to cluster-id so existing on-disk
+        resource names remain stable (no PV renames, no re-bind).
+        """
+        return self.deployment_id
+
     def init(self, dir: Path):
         self.deployment_dir = dir.absolute()
         self.spec = Spec()

@@ -60,6 +80,10 @@ class DeploymentContext:
         if deployment_file_path.exists():
             obj = get_yaml().load(open(deployment_file_path, "r"))
             self.id = obj[constants.cluster_id_key]
+            # Fallback to cluster-id for deployments created before the
+            # deployment-id field was introduced. Keeps existing resource
+            # names stable across this upgrade.
+            self.deployment_id = obj.get(constants.deployment_id_key, self.id)
         # Handle the case of a legacy deployment with no file
         # Code below is intended to match the output from _make_default_cluster_name()
         # TODO: remove when we no longer need to support legacy deployments

@@ -68,6 +92,7 @@ class DeploymentContext:
             unique_cluster_descriptor = f"{path},{self.get_stack_file()},None,None"
             hash = hashlib.md5(unique_cluster_descriptor.encode()).hexdigest()[:16]
             self.id = f"{constants.cluster_name_prefix}{hash}"
+            self.deployment_id = self.id

     def modify_yaml(self, file_path: Path, modifier_func):
         """Load a YAML, apply a modification function, and write it back."""

@@ -51,8 +51,10 @@ from stack_orchestrator.util import (
 )
 from stack_orchestrator.deploy.spec import Spec
 from stack_orchestrator.deploy.deploy_types import LaconicStackSetupCommand
+from stack_orchestrator.deploy.deployer import DeployerException
 from stack_orchestrator.deploy.deployer_factory import getDeployerConfigGenerator
 from stack_orchestrator.deploy.deployment_context import DeploymentContext
+from stack_orchestrator.deploy.k8s.helpers import is_host_path_mount


 def _make_default_deployment_dir():

@@ -274,19 +276,125 @@ def call_stack_deploy_start(deployment_context):
     create additional k8s resources (Services, etc.) in the deployment namespace.
     The namespace can be derived as f"laconic-{deployment_context.id}".
     """
-    python_file_paths = _commands_plugin_paths(deployment_context.stack.name)
-    for python_file_path in python_file_paths:
-        if python_file_path.exists():
-            spec = util.spec_from_file_location("commands", python_file_path)
-            if spec is None or spec.loader is None:
-                continue
-            imported_stack = util.module_from_spec(spec)
-            spec.loader.exec_module(imported_stack)
-            if _has_method(imported_stack, "start"):
-                imported_stack.start(deployment_context)
+    hooks_dir = deployment_context.deployment_dir / "hooks"
+    if not hooks_dir.exists():
+        return
+    for python_file_path in sorted(hooks_dir.glob("commands*.py")):
+        spec = util.spec_from_file_location("commands", python_file_path)
+        if spec is None or spec.loader is None:
+            continue
+        imported_stack = util.module_from_spec(spec)
+        spec.loader.exec_module(imported_stack)
+        if _has_method(imported_stack, "start"):
+            imported_stack.start(deployment_context)


 # Inspect the pod yaml to find config files referenced in subdirectories
+# Safety margin under the k8s ConfigMap 1 MiB hard limit. Accounts for
+# base64 expansion (~33%) and ConfigMap metadata overhead.
+_HOST_PATH_CONFIGMAP_BUDGET_BYTES = 700 * 1024
+
+
+def _validate_host_path_mounts(parsed_pod_file, pod_name, pod_file_path):
+    """Fail fast at deploy create on unsupported host-path compose volumes.
+
+    Host-path compose volumes (`<src>:<dst>[:opts]` with src starting
+    with /, ., or ~) flow through auto-generated ConfigMaps at deploy
+    start. ConfigMaps can't represent:
+    - directories with subdirectories (flat key space)
+    - content exceeding ~700 KiB (k8s 1 MiB limit minus base64/overhead)
+    - writable mounts (ConfigMap mounts are read-only)
+
+    Reject those shapes up front with a clear error so users don't hit
+    the failure later at start time.
+
+    Source resolution: compose paths like `../config/foo.sh` are
+    relative to the compose file location in the stack source tree at
+    deploy create time. At deploy start, the file is read from the
+    matching copy under `{deployment_dir}/config/{pod}/` that deploy
+    create lays down.
+    """
+    compose_stack_dir = Path(pod_file_path).resolve().parent
+    services = parsed_pod_file.get("services") or {}
+    for service_name, service_info in services.items():
+        for volume_str in service_info.get("volumes") or []:
+            parts = volume_str.split(":")
+            if len(parts) < 2:
+                continue
+            src = parts[0]
+            if not is_host_path_mount(src):
+                continue
+            mount_opts = parts[2] if len(parts) > 2 else None
+            opt_tokens = (
+                [t.strip() for t in mount_opts.split(",") if t.strip()]
+                if mount_opts
+                else []
+            )
+            if "rw" in opt_tokens:
+                raise DeployerException(
+                    f"Writable host-path bind not supported: "
+                    f"'{volume_str}' in {pod_name}/{service_name}.\n"
+                    "Host-path binds from the deployment directory are "
+                    "static content injected as ConfigMaps (read-only). "
+                    "Use a named volume with a spec-configured host path "
+                    "under 'kind-mount-root' for writable data. See "
+                    "docs/deployment_patterns.md."
+                )
+
+            abs_src = (compose_stack_dir / src).resolve()
+            if not abs_src.exists():
+                # Preserve existing behavior — compose-level binds with
+                # missing sources fail later; don't introduce a new
+                # early failure mode here.
+                continue
+            if abs_src.is_file():
+                # Single files are always fine — one-key ConfigMap with
+                # subPath. Budget check here too in case of huge single
+                # files.
+                size = abs_src.stat().st_size
+                if size > _HOST_PATH_CONFIGMAP_BUDGET_BYTES:
+                    raise DeployerException(
+                        f"Host-path bind '{volume_str}' in "
+                        f"{pod_name}/{service_name} points at a file of "
+                        f"{size} bytes, exceeding the ConfigMap budget "
+                        f"({_HOST_PATH_CONFIGMAP_BUDGET_BYTES} bytes "
+                        f"after base64/overhead).\n\n"
+                        "Embed the file in the container image at build "
+                        "time, or split into multiple smaller files."
+                    )
+                continue
+            if abs_src.is_dir():
+                entries = list(abs_src.iterdir())
+                if any(p.is_dir() for p in entries):
+                    raise DeployerException(
+                        f"Directory host-path bind '{volume_str}' in "
+                        f"{pod_name}/{service_name} contains "
+                        "subdirectories, which cannot be represented "
+                        "in a k8s ConfigMap.\n\n"
+                        "Restructure the stack to either:\n"
+                        " - embed the directory in the container "
+                        "image at build time,\n"
+                        " - split into multiple ConfigMap entries "
+                        "(one per subdir),\n"
+                        " - or use an initContainer to populate the "
+                        "content at runtime.\n\n"
+                        "See docs/deployment_patterns.md."
+                    )
+                total = sum(p.stat().st_size for p in entries if p.is_file())
+                if total > _HOST_PATH_CONFIGMAP_BUDGET_BYTES:
+                    raise DeployerException(
+                        f"Directory host-path bind '{volume_str}' in "
+                        f"{pod_name}/{service_name} totals {total} "
+                        f"bytes, exceeding the ConfigMap budget "
+                        f"({_HOST_PATH_CONFIGMAP_BUDGET_BYTES} bytes "
+                        f"after base64/overhead).\n\n"
+                        "Embed the content in the container image at "
+                        "build time, or split into smaller ConfigMaps. "
+                        "See docs/deployment_patterns.md."
+                    )
+
+
+# _find_extra_config_dirs: Find config dirs referenced in the pod files
 # other than the one associated with the pod
 def _find_extra_config_dirs(parsed_pod_file, pod):
     config_dirs = set()

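A quick back-of-the-envelope check of the 700 KiB budget chosen above (illustrative arithmetic only, not part of the change):

```python
# Why 700 KiB of raw content fits under the 1 MiB ConfigMap cap once base64
# expansion is accounted for.
limit = 1024 * 1024               # k8s object size cap (1 MiB)
budget = 700 * 1024               # raw-content budget used by the validator
encoded = (budget * 4 + 2) // 3   # base64-encoded size of 700 KiB of content
print(encoded)                    # 955734 bytes on the wire
print(limit - encoded)            # 92842 bytes of headroom for metadata
```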
@@ -778,7 +886,15 @@ def _create_deployment_file(deployment_dir: Path, stack_source: Optional[Path] =
     # Reuse existing Kind cluster if one exists, otherwise generate a timestamp-based ID
     existing = _get_existing_kind_cluster()
     cluster = existing if existing else generate_id("laconic")
-    deployment_content = {constants.cluster_id_key: cluster}
+    # deployment-id is always fresh per `deploy create`, even when
+    # cluster-id is inherited from a running cluster. Keeps each
+    # deployment's k8s resource names (PVs, ConfigMaps, Deployment)
+    # distinct even when multiple deployments share a cluster.
+    deployment_id = generate_id("laconic")
+    deployment_content = {
+        constants.cluster_id_key: cluster,
+        constants.deployment_id_key: deployment_id,
+    }
     if stack_source:
         deployment_content["stack-source"] = str(stack_source)
     with open(deployment_file_path, "w") as output_file:

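A hedged sketch of the round trip through deployment.yml — the key names (`cluster-id`, `deployment-id`) and the ID values below are assumptions for illustration; the real code goes through `constants.*` and `get_yaml()`:

```python
# Illustrative only: write both keys at `deploy create`, read them back at
# start, falling back to cluster-id for files written before deployment-id
# existed (keeps on-disk resource names stable).
import yaml

written = yaml.safe_dump(
    {"cluster-id": "laconic-2f9d8c41", "deployment-id": "laconic-7b13aa02"}
)

obj = yaml.safe_load(written)
cluster_id = obj["cluster-id"]
deployment_id = obj.get("deployment-id", cluster_id)
```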
@@ -994,6 +1110,37 @@ def _safe_copy_tree(src: Path, dst: Path, exclude_patterns: Optional[List[str]]
             safe_copy_file(src_path, dst_path)


+def _copy_hooks(stack_name: str, target_dir: Path):
+    """Copy commands.py hooks into deployment_dir/hooks/ for self-sufficiency.
+
+    Single repo: hooks/commands.py
+    Multi-repo: hooks/commands_0.py, hooks/commands_1.py, ... — indexed by
+    plugin path order.
+
+    Note: the whole commands.py file is copied (init/setup/create/start), but
+    at runtime only call_stack_deploy_start loads from this copied location.
+    call_stack_deploy_init, call_stack_deploy_setup, and call_stack_deploy_create
+    still resolve commands.py from the live stack source via
+    get_plugin_code_paths — they only run at deploy create time when the source
+    is guaranteed to be present, so they don't need to be self-sufficient.
+    """
+    plugin_paths = get_plugin_code_paths(stack_name)
+    sources = [
+        p.joinpath("deploy", "commands.py")
+        for p in plugin_paths
+        if p.joinpath("deploy", "commands.py").exists()
+    ]
+    if not sources:
+        return
+    hooks_dir = target_dir / "hooks"
+    hooks_dir.mkdir(exist_ok=True)
+    if len(sources) == 1:
+        copyfile(sources[0], hooks_dir / "commands.py")
+    else:
+        for i, src in enumerate(sources):
+            copyfile(src, hooks_dir / f"commands_{i}.py")
+
+
 def _write_deployment_files(
     target_dir: Path,
     spec_file: Path,

@@ -1021,6 +1168,8 @@ def _write_deployment_files(
     copyfile(spec_file, target_dir.joinpath(constants.spec_file_name))
     copyfile(stack_file, target_dir.joinpath(constants.stack_file_name))

+    _copy_hooks(stack_name, target_dir)
+
     # Create deployment file if requested
     if include_deployment_file:
         _create_deployment_file(target_dir, stack_source=stack_source)

@@ -1058,6 +1207,12 @@ def _write_deployment_files(
         if pod_file_path is None:
             continue
         parsed_pod_file = yaml.load(open(pod_file_path, "r"))
+        # Reject host-path compose volumes whose shape can't land as a
+        # ConfigMap (dir-with-subdirs, oversize, writable). File-level
+        # and flat-dir host-path binds are accepted — they auto-convert
+        # to ConfigMaps at deploy start via cluster_info.get_configmaps.
+        if parsed_spec.is_kubernetes_deployment():
+            _validate_host_path_mounts(parsed_pod_file, pod, pod_file_path)
         extra_config_dirs = _find_extra_config_dirs(parsed_pod_file, pod)
         destination_pod_dir = destination_pods_dir.joinpath(pod)
         os.makedirs(destination_pod_dir, exist_ok=True)

@@ -1126,7 +1281,9 @@ def _write_deployment_files(
             else:
                 source_config_dir = resolve_config_dir(stack_name, configmap_name)
                 if os.path.exists(source_config_dir):
-                    destination_config_dir = target_dir.joinpath("configmaps", configmap_name)
+                    destination_config_dir = target_dir.joinpath(
+                        "configmaps", configmap_name
+                    )
                     copytree(source_config_dir, destination_config_dir, dirs_exist_ok=True)

     # Copy the job files into the target dir

@@ -1138,6 +1295,8 @@ def _write_deployment_files(
         job_file_path = get_job_file_path(stack_name, parsed_stack, job)
         if job_file_path and job_file_path.exists():
             parsed_job_file = yaml.load(open(job_file_path, "r"))
+            if parsed_spec.is_kubernetes_deployment():
+                _validate_host_path_mounts(parsed_job_file, job, job_file_path)
             _fixup_pod_file(parsed_job_file, parsed_spec, destination_compose_dir)
             with open(
                 destination_compose_jobs_dir.joinpath(

@@ -6,7 +6,7 @@
 import secrets
 import socket
 import time
-from typing import Optional
+from typing import List, Optional
 import requests
 from kubernetes import client

@@ -18,7 +18,7 @@ def get_server_egress_ip() -> str:
     return response.text.strip()


-def resolve_hostname(hostname: str) -> list[str]:
+def resolve_hostname(hostname: str) -> List[str]:
     """Resolve hostname to list of IP addresses."""
     try:
         _, _, ips = socket.gethostbyname_ex(hostname)

@@ -15,6 +15,7 @@

 import os
 import base64
+from pathlib import Path

 from kubernetes import client
 from typing import Any, List, Optional, Set

@@ -22,7 +23,10 @@ from typing import Any, List, Optional, Set
 from stack_orchestrator.opts import opts
 from stack_orchestrator.util import env_var_map_from_file
 from stack_orchestrator.deploy.k8s.helpers import (
+    is_host_path_mount,
     named_volumes_from_pod_files,
+    resolve_host_path_for_kind,
+    sanitize_host_path_to_volume_name,
     volume_mounts_for_service,
     volumes_for_pod_files,
 )

@@ -433,8 +437,85 @@ class ClusterInfo:
                 binary_data=data,
             )
             result.append(spec)
+
+        # Auto-generated ConfigMaps for file-level and flat-dir host-path
+        # compose volumes. Avoids the aliasing failure mode where two
+        # deployments sharing a cluster would collide at the same kind
+        # node path — each deployment gets its own namespace-scoped
+        # ConfigMap instead. See docs/deployment_patterns.md.
+        result.extend(self._host_path_bind_configmaps())
         return result

+    def _host_path_bind_configmaps(self) -> List[client.V1ConfigMap]:
+        """Build V1ConfigMap objects for host-path compose volumes.
+
+        Walks every service in every parsed pod/job compose file. For each
+        volume whose source is a host path (starts with /, ., or ~),
+        reads the resolved file or flat directory from the deployment
+        directory and packages it as a V1ConfigMap.
+
+        Dedupes by sanitized name across pods and services — a source
+        referenced from N places yields one ConfigMap.
+        """
+        if self.spec.file_path is None:
+            return []
+        deployment_dir = Path(self.spec.file_path).parent
+        seen: Set[str] = set()
+        result: List[client.V1ConfigMap] = []
+
+        all_pod_maps = [self.parsed_pod_yaml_map, self.parsed_job_yaml_map]
+        for pod_map in all_pod_maps:
+            for _pod_key, pod in pod_map.items():
+                services = pod.get("services") or {}
+                for _svc_name, svc in services.items():
+                    for mount_string in svc.get("volumes") or []:
+                        parts = mount_string.split(":")
+                        if len(parts) < 2:
+                            continue
+                        src = parts[0]
+                        if not is_host_path_mount(src):
+                            continue
+                        sanitized = sanitize_host_path_to_volume_name(src)
+                        if sanitized in seen:
+                            continue
+                        seen.add(sanitized)
+                        abs_src = resolve_host_path_for_kind(src, deployment_dir)
+                        data = self._read_host_path_source(abs_src, mount_string)
+                        cm = client.V1ConfigMap(
+                            metadata=client.V1ObjectMeta(
+                                name=f"{self.app_name}-{sanitized}",
+                                labels=self._stack_labels(
+                                    {"configmap-label": sanitized}
+                                ),
+                            ),
+                            binary_data=data,
+                        )
+                        result.append(cm)
+        return result
+
+    def _read_host_path_source(self, abs_src: Path, mount_string: str) -> dict:
+        """Read file or flat-directory content for a host-path ConfigMap.
+
+        Validates shape at read time as a defensive second check — the
+        same rules are enforced earlier at `deploy create`, but deploy-
+        dir content may have been edited since then.
+        """
+        if not abs_src.exists():
+            raise RuntimeError(
+                f"Source for host-path compose volume does not exist: "
+                f"{abs_src} (volume: '{mount_string}')"
+            )
+        data = {}
+        if abs_src.is_file():
+            with open(abs_src, "rb") as f:
+                data[abs_src.name] = base64.b64encode(f.read()).decode("ASCII")
+        elif abs_src.is_dir():
+            for entry in abs_src.iterdir():
+                if entry.is_file():
+                    with open(entry, "rb") as f:
+                        data[entry.name] = base64.b64encode(f.read()).decode("ASCII")
+        return data
+
     def get_pvs(self):
         result = []
         spec_volumes = self.spec.get_volumes()

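The `binary_data` packaging leans on the standard ConfigMap contract: values are base64 strings on the wire and arrive back as the original bytes when mounted into the pod. A minimal round-trip sketch (illustrative; the file content is hypothetical):

```python
import base64

raw = b"#!/bin/sh\necho hello\n"                 # bytes read from the deploy dir
encoded = base64.b64encode(raw).decode("ASCII")  # value stored in binary_data

# What the kubelet materialises in the mounted file is the decoded content.
assert base64.b64decode(encoded) == raw
```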
@@ -621,7 +702,11 @@ class ClusterInfo:
             if self.spec.get_image_registry() is not None
             else image
         )
-        volume_mounts = volume_mounts_for_service(parsed_yaml_map, service_name)
+        volume_mounts = volume_mounts_for_service(
+            parsed_yaml_map,
+            service_name,
+            Path(self.spec.file_path).parent if self.spec.file_path else None,
+        )
         # Handle command/entrypoint from compose file
         # In docker-compose: entrypoint -> k8s command, command -> k8s args
         container_command = None

@@ -928,9 +1013,7 @@ class ClusterInfo:
             metadata=client.V1ObjectMeta(
                 name=deployment_name,
                 labels=self._stack_labels(
-                    {"app.kubernetes.io/component": pod_name}
-                    if multi_pod
-                    else None
+                    {"app.kubernetes.io/component": pod_name} if multi_pod else None
                 ),
             ),
             spec=spec,

@@ -978,9 +1061,7 @@ class ClusterInfo:
                 container_ports[container].add(port)
         if maintenance_svc and ":" in maintenance_svc:
             maint_container, maint_port_str = maintenance_svc.split(":", 1)
-            container_ports.setdefault(maint_container, set()).add(
-                int(maint_port_str)
-            )
+            container_ports.setdefault(maint_container, set()).add(int(maint_port_str))

         # Build map: pod_file -> set of service names in that pod
         pod_services_map: dict = {}

@@ -20,14 +20,22 @@ from kubernetes.client.exceptions import ApiException
 from typing import Any, Dict, List, Optional, cast

 from stack_orchestrator import constants
-from stack_orchestrator.deploy.deployer import Deployer, DeployerConfigGenerator
+from stack_orchestrator.deploy.deployer import (
+    Deployer,
+    DeployerConfigGenerator,
+    DeployerException,
+)
 from stack_orchestrator.deploy.k8s.helpers import (
+    check_mounts_compatible,
     create_cluster,
     destroy_cluster,
+    get_kind_cluster,
+    is_image_available_locally,
     load_images_into_kind,
 )
 from stack_orchestrator.deploy.k8s.helpers import (
     install_ingress_for_kind,
+    update_caddy_ingress_image,
     wait_for_ingress_in_kind,
     is_ingress_running,
 )

@@ -123,27 +131,34 @@ class K8sDeployer(Deployer):
             return
         self.deployment_dir = deployment_context.deployment_dir
         self.deployment_context = deployment_context
+        # kind cluster name comes from cluster-id — which kind cluster this
+        # deployment attaches to. Shared across deployments that join the
+        # same cluster. compose_project_name is kept as a parameter for
+        # interface compatibility with the compose deployer path.
+        cluster_id = deployment_context.get_cluster_id()
+        deployment_id = deployment_context.get_deployment_id()
         self.kind_cluster_name = (
-            deployment_context.spec.get_kind_cluster_name() or compose_project_name
-        )
-        # Use spec namespace if provided, otherwise derive from cluster-id
-        self.k8s_namespace = (
-            deployment_context.spec.get_namespace() or f"laconic-{compose_project_name}"
+            deployment_context.spec.get_kind_cluster_name() or cluster_id
         )
         self.cluster_info = ClusterInfo()
         # stack.name may be an absolute path (from spec "stack:" key after
         # path resolution). Extract just the directory basename for labels.
         raw_name = deployment_context.stack.name if deployment_context else ""
         stack_name = Path(raw_name).name if raw_name else ""
-        # Use spec namespace if provided, otherwise derive from stack name
+        # Namespace: spec override wins; else derive from stack name; else
+        # fall back to deployment-id. (On older deployment.yml files without
+        # deployment-id, get_deployment_id() returns cluster-id — same as
+        # the pre-decouple behavior.)
         self.k8s_namespace = deployment_context.spec.get_namespace() or (
-            f"laconic-{stack_name}" if stack_name else f"laconic-{compose_project_name}"
+            f"laconic-{stack_name}" if stack_name else f"laconic-{deployment_id}"
         )
         self.cluster_info = ClusterInfo()
+        # app_name comes from deployment-id so each deployment owns its own
+        # k8s resource names, even when multiple deployments share a cluster.
        self.cluster_info.int(
             compose_files,
             compose_env_file,
-            compose_project_name,
+            deployment_id,
             deployment_context.spec,
             stack_name=stack_name,
         )

@@ -175,28 +190,71 @@ class K8sDeployer(Deployer):
         self.custom_obj_api = client.CustomObjectsApi()

     def _ensure_namespace(self):
-        """Create the deployment namespace if it doesn't exist."""
+        """Create the deployment namespace if it doesn't exist.
+
+        Stamps the namespace with a `laconic.com/deployment-dir`
+        annotation so that a subsequent `deployment start` from a
+        different deployment dir — which would otherwise silently
+        patch this deployment's k8s resources in place — fails with
+        a clear error directing at the `namespace:` spec override.
+        """
         if opts.o.dry_run:
             print(f"Dry run: would create namespace {self.k8s_namespace}")
             return
+        owner_key = "laconic.com/deployment-dir"
+        my_dir = str(Path(self.deployment_dir).resolve())
         try:
-            self.core_api.read_namespace(name=self.k8s_namespace)
-            if opts.o.debug:
-                print(f"Namespace {self.k8s_namespace} already exists")
+            existing = self.core_api.read_namespace(name=self.k8s_namespace)
         except ApiException as e:
-            if e.status == 404:
-                # Create the namespace
-                ns = client.V1Namespace(
-                    metadata=client.V1ObjectMeta(
-                        name=self.k8s_namespace,
-                        labels=self.cluster_info._stack_labels(),
-                    )
-                )
-                self.core_api.create_namespace(body=ns)
-                if opts.o.debug:
-                    print(f"Created namespace {self.k8s_namespace}")
-            else:
+            if e.status != 404:
                 raise
+            existing = None

+        if existing is None:
+            ns = client.V1Namespace(
+                metadata=client.V1ObjectMeta(
+                    name=self.k8s_namespace,
+                    labels=self.cluster_info._stack_labels(),
+                    annotations={owner_key: my_dir},
+                )
+            )
+            self.core_api.create_namespace(body=ns)
+            if opts.o.debug:
+                print(f"Created namespace {self.k8s_namespace} " f"owned by {my_dir}")
+            return
+
+        annotations = (existing.metadata.annotations or {}) if existing.metadata else {}
+        owner = annotations.get(owner_key)
+        if owner and owner != my_dir:
+            raise DeployerException(
+                f"Namespace '{self.k8s_namespace}' is already owned by "
+                f"another deployment at:\n {owner}\n"
+                f"\nThis deployment is at:\n {my_dir}\n"
+                "\nTwo deployments of the same stack sharing a cluster "
+                "cannot share a namespace — every namespace-scoped "
+                "resource (Deployment, ConfigMaps, Services, PVCs) "
+                "would collide and silently patch each other.\n"
+                "\nFix: add an explicit `namespace:` override to this "
+                "deployment's spec.yml so it lands in its own "
+                "namespace. For example:\n"
+                f" namespace: {self.k8s_namespace}-<suffix>\n"
+                "\n(k8s namespace names must be lowercase alphanumeric "
+                "plus '-', start and end with an alphanumeric character, "
+                "≤63 chars.)"
+            )
+        if not owner:
+            # Legacy namespace (pre-dates this check) or user-created.
+            # Adopt it by stamping the ownership annotation so
+            # subsequent conflicting deployments fail loudly.
+            patch = {"metadata": {"annotations": {owner_key: my_dir}}}
+            self.core_api.patch_namespace(name=self.k8s_namespace, body=patch)
+            if opts.o.debug:
+                print(
+                    f"Adopted existing namespace {self.k8s_namespace} "
+                    f"as owned by {my_dir}"
+                )
+        elif opts.o.debug:
+            print(f"Namespace {self.k8s_namespace} already owned by {my_dir}")

     def _delete_namespace(self):
         """Delete the deployment namespace and all resources within it."""

|
||||||
actual_cluster = create_cluster(self.kind_cluster_name, kind_config)
|
actual_cluster = create_cluster(self.kind_cluster_name, kind_config)
|
||||||
if actual_cluster != self.kind_cluster_name:
|
if actual_cluster != self.kind_cluster_name:
|
||||||
self.kind_cluster_name = actual_cluster
|
self.kind_cluster_name = actual_cluster
|
||||||
# Only load locally-built images into kind
|
|
||||||
local_containers = self.deployment_context.stack.obj.get("containers", [])
|
local_containers = self.deployment_context.stack.obj.get("containers", [])
|
||||||
if local_containers:
|
images_to_preload = set((self.image_overrides or {}).values()) | {
|
||||||
local_images = {
|
img
|
||||||
img
|
for img in self.cluster_info.image_set
|
||||||
for img in self.cluster_info.image_set
|
if any(c in img for c in local_containers)
|
||||||
if any(c in img for c in local_containers)
|
}
|
||||||
}
|
images_to_preload = {
|
||||||
if local_images:
|
img for img in images_to_preload if is_image_available_locally(img)
|
||||||
load_images_into_kind(self.kind_cluster_name, local_images)
|
}
|
||||||
|
if images_to_preload:
|
||||||
|
load_images_into_kind(self.kind_cluster_name, images_to_preload)
|
||||||
|
elif self.is_kind():
|
||||||
|
# --skip-cluster-management (default): cluster must already exist.
|
||||||
|
# Without this check, connect_api() below raises a cryptic
|
||||||
|
# kubernetes.config.ConfigException when the context is missing.
|
||||||
|
existing = get_kind_cluster()
|
||||||
|
if existing is None:
|
||||||
|
raise DeployerException(
|
||||||
|
f"No kind cluster is running. This deployment expects "
|
||||||
|
f"cluster '{self.kind_cluster_name}' to exist.\n"
|
||||||
|
"\n"
|
||||||
|
"--skip-cluster-management is the default; pass "
|
||||||
|
"--perform-cluster-management to have laconic-so "
|
||||||
|
"create the cluster, or start it manually first."
|
||||||
|
)
|
||||||
|
if existing != self.kind_cluster_name:
|
||||||
|
raise DeployerException(
|
||||||
|
f"Running kind cluster '{existing}' does not match the "
|
||||||
|
f"cluster-id '{self.kind_cluster_name}' in "
|
||||||
|
f"{self.deployment_dir}/deployment.yml.\n"
|
||||||
|
"\n"
|
||||||
|
"Fix by either:\n"
|
||||||
|
" - editing deployment.yml to set "
|
||||||
|
f"cluster-id: {existing}, or\n"
|
||||||
|
" - passing --perform-cluster-management to create a "
|
||||||
|
"fresh cluster (note: destroys the existing one if "
|
||||||
|
"names collide)."
|
||||||
|
)
|
||||||
|
# Mount topology applies regardless of who owns cluster
|
||||||
|
# lifecycle — validate here too.
|
||||||
|
kind_config = str(
|
||||||
|
self.deployment_dir.joinpath(constants.kind_config_filename)
|
||||||
|
)
|
||||||
|
check_mounts_compatible(existing, kind_config)
|
||||||
self.connect_api()
|
self.connect_api()
|
||||||
self._ensure_namespace()
|
self._ensure_namespace()
|
||||||
|
caddy_image = self.cluster_info.spec.get_caddy_ingress_image()
|
||||||
|
# Fresh-install path: gated on cluster lifecycle ownership
|
||||||
|
# because install_ingress_for_kind also seeds caddy-system
|
||||||
|
# (namespace, secrets restore, cert-backup CronJob).
|
||||||
if self.is_kind() and not self.skip_cluster_management:
|
if self.is_kind() and not self.skip_cluster_management:
|
||||||
if not is_ingress_running():
|
if not is_ingress_running():
|
||||||
install_ingress_for_kind(
|
install_ingress_for_kind(
|
||||||
self.cluster_info.spec.get_acme_email(),
|
self.cluster_info.spec.get_acme_email(),
|
||||||
self.cluster_info.spec.get_kind_mount_root(),
|
self.cluster_info.spec.get_kind_mount_root(),
|
||||||
|
caddy_image=caddy_image,
|
||||||
)
|
)
|
||||||
wait_for_ingress_in_kind()
|
wait_for_ingress_in_kind()
|
||||||
if self.cluster_info.spec.get_unlimited_memlock():
|
if self.cluster_info.spec.get_unlimited_memlock():
|
||||||
|
|
@ -800,6 +897,18 @@ class K8sDeployer(Deployer):
|
||||||
constants.high_memlock_runtime,
|
constants.high_memlock_runtime,
|
||||||
constants.high_memlock_runtime,
|
constants.high_memlock_runtime,
|
||||||
)
|
)
|
||||||
|
# Reconcile Caddy image whenever the operator explicitly set
|
||||||
|
# it in spec, regardless of cluster lifecycle ownership —
|
||||||
|
# --skip-cluster-management (the default) shouldn't prevent
|
||||||
|
# a routine k8s-API-level patch of a running Deployment.
|
||||||
|
# Spec absent => don't touch: the operator may have set the
|
||||||
|
# image out-of-band (ansible playbook, prior explicit spec on
|
||||||
|
# a different deployment) and a silent revert would be worse
|
||||||
|
# than doing nothing. caddy-system is cluster-scoped, so
|
||||||
|
# whichever deployment's spec sets the image last wins.
|
||||||
|
if self.is_kind() and caddy_image is not None and is_ingress_running():
|
||||||
|
if update_caddy_ingress_image(caddy_image):
|
||||||
|
wait_for_ingress_in_kind()
|
||||||
|
|
||||||
def _create_ingress(self):
|
def _create_ingress(self):
|
||||||
"""Create or update Ingress with TLS certificate lookup."""
|
"""Create or update Ingress with TLS certificate lookup."""
|
||||||
|
|
@@ -878,7 +987,19 @@ class K8sDeployer(Deployer):
             else:
                 raise

-    def up(self, detach, skip_cluster_management, services, image_overrides=None):
+    def up(
+        self,
+        detach,
+        skip_cluster_management,
+        services,
+        image_overrides=None,
+        force_recreate=False,
+    ):
+        # TODO: honor force_recreate by stamping the
+        # kubectl.kubernetes.io/restartedAt annotation on managed
+        # Deployments so a rollout occurs even when the manifest is
+        # unchanged. Today this method is a no-op for that flag.
+        # Tracked separately from the compose-side fix.
         # Merge spec-level image overrides with CLI overrides
         spec_overrides = self.cluster_info.spec.get("image-overrides", {})
         if spec_overrides:

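A hedged sketch of what honoring the flag could look like — the same annotation stamp that `kubectl rollout restart` applies — assuming the managed Deployments are found via the stack label selector; names below are illustrative and not part of the change:

```python
from datetime import datetime, timezone
from kubernetes import client


def _force_rollout(apps_api: client.AppsV1Api, namespace: str, selector: str):
    # Changing spec.template.metadata.annotations alters the pod template,
    # so a rollout happens even when the manifest is otherwise unchanged.
    stamp = datetime.now(timezone.utc).isoformat()
    patch = {
        "spec": {
            "template": {
                "metadata": {
                    "annotations": {"kubectl.kubernetes.io/restartedAt": stamp}
                }
            }
        }
    }
    deployments = apps_api.list_namespaced_deployment(
        namespace=namespace, label_selector=selector
    )
    for dep in deployments.items:
        apps_api.patch_namespaced_deployment(
            name=dep.metadata.name, namespace=namespace, body=patch
        )
```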
@@ -915,9 +1036,7 @@ class K8sDeployer(Deployer):

         call_stack_deploy_start(self.deployment_context)

-    def down(
-        self, timeout, volumes, skip_cluster_management, delete_namespace=False
-    ):
+    def down(self, timeout, volumes, skip_cluster_management, delete_namespace=False):
         """Tear down stack-labeled resources. Phases:

         1. Delete namespaced resources (if namespace still exists).

@@ -1111,34 +1230,68 @@ class K8sDeployer(Deployer):
         listers = []
         if namespace_present:
             listers += [
-                ("deployment", lambda: self.apps_api.list_namespaced_deployment(
-                    namespace=namespace, label_selector=selector)),
-                ("ingress", lambda: self.networking_api.list_namespaced_ingress(
-                    namespace=namespace, label_selector=selector)),
-                ("job", lambda: self.batch_api.list_namespaced_job(
-                    namespace=namespace, label_selector=selector)),
-                ("service", lambda: self.core_api.list_namespaced_service(
-                    namespace=namespace, label_selector=selector)),
-                ("configmap", lambda: self.core_api.list_namespaced_config_map(
-                    namespace=namespace, label_selector=selector)),
-                ("secret", lambda: self.core_api.list_namespaced_secret(
-                    namespace=namespace, label_selector=selector)),
-                ("pod", lambda: self.core_api.list_namespaced_pod(
-                    namespace=namespace, label_selector=selector)),
+                (
+                    "deployment",
+                    lambda: self.apps_api.list_namespaced_deployment(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
+                (
+                    "ingress",
+                    lambda: self.networking_api.list_namespaced_ingress(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
+                (
+                    "job",
+                    lambda: self.batch_api.list_namespaced_job(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
+                (
+                    "service",
+                    lambda: self.core_api.list_namespaced_service(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
+                (
+                    "configmap",
+                    lambda: self.core_api.list_namespaced_config_map(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
+                (
+                    "secret",
+                    lambda: self.core_api.list_namespaced_secret(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
+                (
+                    "pod",
+                    lambda: self.core_api.list_namespaced_pod(
+                        namespace=namespace, label_selector=selector
+                    ),
+                ),
             ]
         if delete_volumes:
             listers.append(
-                ("persistentvolumeclaim",
-                 lambda: self.core_api.list_namespaced_persistent_volume_claim(
-                     namespace=namespace, label_selector=selector))
+                (
+                    "persistentvolumeclaim",
+                    lambda: self.core_api.list_namespaced_persistent_volume_claim(
+                        namespace=namespace, label_selector=selector
+                    ),
+                )
             )
         # PVs are cluster-scoped — wait for them even when the namespace
         # is already gone (orphaned from a prior --delete-namespace).
         if delete_volumes:
             listers.append(
-                ("persistentvolume",
-                 lambda: self.core_api.list_persistent_volume(
-                     label_selector=selector))
+                (
+                    "persistentvolume",
+                    lambda: self.core_api.list_persistent_volume(
+                        label_selector=selector
+                    ),
+                )
             )

         def remaining():

@@ -1166,8 +1319,7 @@ class K8sDeployer(Deployer):
         left = remaining()
         if left:
             print(
-                f"Warning: resources still present after {timeout_seconds}s: "
-                f"{left}"
+                f"Warning: resources still present after {timeout_seconds}s: " f"{left}"
             )

     def status(self):

@@ -15,11 +15,13 @@

 from kubernetes import client, utils, watch
 from kubernetes.client.exceptions import ApiException
+import json
 import os
 from pathlib import Path
 import subprocess
 import re
-from typing import Set, Mapping, List, Optional, cast
+import sys
+from typing import Dict, Set, Mapping, List, Optional, cast
 import yaml

 from stack_orchestrator.util import get_k8s_dir, error_exit

@@ -205,9 +207,7 @@ def _install_caddy_cert_backup(
         print("No kind-mount-root configured; caddy cert backup disabled")
         return
     manifest = os.path.abspath(
-        get_k8s_dir().joinpath(
-            "components", "ingress", "caddy-cert-backup.yaml"
-        )
+        get_k8s_dir().joinpath("components", "ingress", "caddy-cert-backup.yaml")
     )
     with open(manifest) as f:
         objects = list(yaml.safe_load_all(f))

@@ -216,6 +216,172 @@ def _install_caddy_cert_backup(
     print("Installed caddy cert backup CronJob")


+def _parse_kind_extra_mounts(config_file: str) -> List[Dict[str, str]]:
+    """Return the list of extraMounts declared in a kind config file."""
+    try:
+        with open(config_file) as f:
+            config = yaml.safe_load(f) or {}
+    except (OSError, yaml.YAMLError) as e:
+        if opts.o.debug:
+            print(f"Could not parse kind config {config_file}: {e}")
+        return []
+    mounts = []
+    for node in config.get("nodes", []) or []:
+        for m in node.get("extraMounts", []) or []:
+            host_path = m.get("hostPath")
+            container_path = m.get("containerPath")
+            if host_path and container_path:
+                mounts.append({"hostPath": host_path, "containerPath": container_path})
+    return mounts
+
+
+def _get_control_plane_node(cluster_name: str) -> Optional[str]:
+    """Return the kind control-plane node container name for a cluster."""
+    result = subprocess.run(
+        ["kind", "get", "nodes", "--name", cluster_name],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        return None
+    for line in result.stdout.splitlines():
+        line = line.strip()
+        if line.endswith("control-plane"):
+            return line
+    return None
+
+
+def _get_running_cluster_mounts(cluster_name: str) -> Dict[str, str]:
+    """Return {containerPath: hostPath} for bind mounts on the control-plane."""
+    node = _get_control_plane_node(cluster_name)
+    if not node:
+        return {}
+    result = subprocess.run(
+        ["docker", "inspect", node, "--format", "{{json .Mounts}}"],
+        capture_output=True,
+        text=True,
+    )
+    if result.returncode != 0:
+        return {}
+    try:
+        mounts = json.loads(result.stdout or "[]")
+    except json.JSONDecodeError:
+        return {}
+    return {
+        m["Destination"]: m["Source"]
+        for m in mounts
+        if m.get("Type") == "bind" and m.get("Destination") and m.get("Source")
+    }
+
+
+def check_mounts_compatible(cluster_name: str, config_file: str) -> None:
+    """Fail if the new deployment's extraMounts aren't active on the cluster.
+
+    Kind applies extraMounts only at cluster creation. When a deployment
+    joins an existing cluster, any extraMount its kind-config declares that
+    isn't already active on the running node will silently fall through to
+    the node's overlay filesystem — data looks persisted but is lost on
+    cluster destroy. Catch this up front.
+    """
+    required = _parse_kind_extra_mounts(config_file)
+    if not required:
+        return
+    live = _get_running_cluster_mounts(cluster_name)
+    if not live:
+        # Could not inspect — don't block deployment, but warn.
+        print(
+            f"WARNING: could not inspect mounts on cluster '{cluster_name}'; "
+            "skipping extraMount compatibility check",
+            file=sys.stderr,
+        )
+        return
+    # File-level host-path binds (e.g. `./config/x.sh` from compose volumes)
+    # are emitted per-deployment with containerPath `/mnt/host-path-*` and
+    # source paths under each deployment's own directory. Two deployments
+    # of the same stack will always clash here — a pre-existing SO aliasing
+    # misfeature that's orthogonal to umbrella compatibility. Skip them so
+    # this check stays focused on the umbrella and named-volume data mounts
+    # it was designed for.
+    mismatches = []
+    for m in required:
+        dest = m["containerPath"]
+        if dest.startswith("/mnt/host-path-"):
+            continue
+        want = m["hostPath"]
+        have = live.get(dest)
+        if have != want:
+            mismatches.append((dest, want, have))
+    if not mismatches:
+        return
+    lines = [
+        f"This deployment declares extraMounts incompatible with the "
+        f"running cluster '{cluster_name}':",
+    ]
+    for dest, want, have in mismatches:
+        lines.append(
+            f" - {dest}: expected host path '{want}', "
+            f"actual '{have or 'NOT MOUNTED'}'"
+        )
+    lines.append("")
+
+    cluster_umbrella = live.get("/mnt")
+    if cluster_umbrella:
+        lines.extend(
+            [
+                f"The running cluster has an umbrella mount: "
+                f"'{cluster_umbrella}' -> /mnt.",
+                "",
+                f"Fix: set 'kind-mount-root: {cluster_umbrella}' in this "
+                "deployment's spec and place host paths for its volumes "
+                f"under '{cluster_umbrella}/'. Kind applies extraMounts "
+                "only at cluster creation, so new bind mounts cannot be "
+                "added to the running cluster without a recreate — but "
+                "the existing umbrella already covers any subdirectory "
+                "you create on the host.",
+            ]
+        )
+    else:
+        lines.extend(
+            [
+                "The running cluster has no umbrella mount "
+                "(no extraMount with containerPath=/mnt).",
+                "",
+                "Kind applies extraMounts only at cluster creation — "
+                "neither kind nor Docker supports adding bind mounts to "
+                "a running container. Without a recreate, any PV backed "
+                "by one of the missing mounts will silently fall through "
+                "to the node's overlay filesystem and lose data on "
+                "cluster destroy.",
+                "",
+                "Fix: destroy and recreate the cluster with a kind-config "
+                "that sets 'kind-mount-root' so future stacks can share "
+                "an umbrella without recreating.",
+            ]
+        )
+    lines.append("")
+    lines.append("See docs/deployment_patterns.md.")
+    raise DeployerException("\n".join(lines))
+
+
+def _warn_if_no_umbrella(config_file: str) -> None:
+    """Warn if creating a cluster without a '/mnt' umbrella mount.
+
+    Without an umbrella, future stacks joining this cluster that need new
+    host-path mounts will fail the compatibility check and require a full
+    cluster recreate to add them.
+    """
+    mounts = _parse_kind_extra_mounts(config_file)
+    if any(m.get("containerPath") == "/mnt" for m in mounts):
+        return
+    print(
+        "WARNING: creating kind cluster without an umbrella mount "
+        "('kind-mount-root' not set). Future stacks added to this cluster "
+        "that require new host-path mounts will not be able to without a "
+        "full cluster recreate. See docs/deployment_patterns.md.",
+        file=sys.stderr,
+    )
+
+
 def create_cluster(name: str, config_file: str):
     """Create or reuse the single kind cluster for this host.

|
||||||
existing = get_kind_cluster()
|
existing = get_kind_cluster()
|
||||||
if existing:
|
if existing:
|
||||||
print(f"Using existing cluster: {existing}")
|
print(f"Using existing cluster: {existing}")
|
||||||
|
check_mounts_compatible(existing, config_file)
|
||||||
return existing
|
return existing
|
||||||
|
|
||||||
|
_warn_if_no_umbrella(config_file)
|
||||||
print(f"Creating new cluster: {name}")
|
print(f"Creating new cluster: {name}")
|
||||||
result = _run_command(f"kind create cluster --name {name} --config {config_file}")
|
result = _run_command(f"kind create cluster --name {name} --config {config_file}")
|
||||||
if result.returncode != 0:
|
if result.returncode != 0:
|
||||||
|
|
@@ -294,7 +462,9 @@ def wait_for_ingress_in_kind():


 def install_ingress_for_kind(
-    acme_email: str = "", kind_mount_root: Optional[str] = None
+    acme_email: str = "",
+    kind_mount_root: Optional[str] = None,
+    caddy_image: Optional[str] = None,
 ):
     api_client = client.ApiClient()
     ingress_install = os.path.abspath(

|
||||||
if opts.o.debug:
|
if opts.o.debug:
|
||||||
print("Installing Caddy ingress controller in kind cluster")
|
print("Installing Caddy ingress controller in kind cluster")
|
||||||
|
|
||||||
# Template the YAML with email before applying
|
# Template the YAML with email and image before applying
|
||||||
with open(ingress_install) as f:
|
with open(ingress_install) as f:
|
||||||
yaml_content = f.read()
|
yaml_content = f.read()
|
||||||
|
|
||||||
|
|
@ -316,6 +486,24 @@ def install_ingress_for_kind(
|
||||||
|
|
||||||
yaml_objects = list(yaml.safe_load_all(yaml_content))
|
yaml_objects = list(yaml.safe_load_all(yaml_content))
|
||||||
|
|
||||||
|
# Override the Caddy container's image when a spec value is set.
|
||||||
|
# Works regardless of what's hardcoded in the manifest — we locate
|
||||||
|
# the container by name and overwrite its image field, rather than
|
||||||
|
# relying on a string match of the default.
|
||||||
|
if caddy_image:
|
||||||
|
for obj in yaml_objects:
|
||||||
|
if not obj:
|
||||||
|
continue
|
||||||
|
if (
|
||||||
|
obj.get("kind") == "Deployment"
|
||||||
|
and obj.get("metadata", {}).get("name") == "caddy-ingress-controller"
|
||||||
|
):
|
||||||
|
for c in obj["spec"]["template"]["spec"].get("containers") or []:
|
||||||
|
if c.get("name") == "caddy-ingress-controller":
|
||||||
|
c["image"] = caddy_image
|
||||||
|
if opts.o.debug:
|
||||||
|
print(f"Configured Caddy image: {caddy_image}")
|
||||||
|
|
||||||
# Split: apply everything except the Caddy controller Deployment first,
|
# Split: apply everything except the Caddy controller Deployment first,
|
||||||
# so the namespace + secrets exist before the pod can start and read its
|
# so the namespace + secrets exist before the pod can start and read its
|
||||||
# secret_store. Race-free: Caddy has no way to see the cluster until
|
# secret_store. Race-free: Caddy has no way to see the cluster until
|
||||||
|
|
@@ -358,6 +546,75 @@ def install_ingress_for_kind(
     _install_caddy_cert_backup(api_client, kind_mount_root)
 
 
+def update_caddy_ingress_image(caddy_image: str) -> bool:
+    """Patch the running Caddy ingress Deployment to a new image.
+
+    No-op if the live Deployment already runs the requested image.
+    Returns True if a patch was applied, False otherwise.
+
+    Caddy lives in the cluster-scoped `caddy-system` namespace, so
+    this affects every deployment sharing the cluster. The
+    `strategy: Recreate` in the Deployment manifest handles the
+    hostPort-80/443 handoff; expect ~10-30s of ingress downtime while
+    the old pod terminates and the new one starts.
+    """
+    apps_api = client.AppsV1Api()
+    try:
+        dep = apps_api.read_namespaced_deployment(
+            name="caddy-ingress-controller", namespace="caddy-system"
+        )
+    except ApiException as e:
+        if e.status == 404:
+            if opts.o.debug:
+                print(
+                    "Caddy ingress Deployment not found; nothing to "
+                    "update (install path handles fresh clusters)"
+                )
+            return False
+        raise
+
+    containers = dep.spec.template.spec.containers or []
+    current = containers[0].image if containers else None
+    if current == caddy_image:
+        if opts.o.debug:
+            print(f"Caddy image already at {caddy_image}; no update needed")
+        return False
+
+    print(
+        f"Updating Caddy ingress image: {current} -> {caddy_image} "
+        "(expect brief ingress downtime)"
+    )
+    patch = {
+        "spec": {
+            "template": {
+                "spec": {
+                    "containers": [
+                        {
+                            "name": "caddy-ingress-controller",
+                            "image": caddy_image,
+                            "imagePullPolicy": "Always",
+                        }
+                    ]
+                }
+            }
+        }
+    }
+    apps_api.patch_namespaced_deployment(
+        name="caddy-ingress-controller",
+        namespace="caddy-system",
+        body=patch,
+    )
+    return True
+
+
+def is_image_available_locally(image: str) -> bool:
+    result = subprocess.run(
+        ["docker", "image", "inspect", image],
+        capture_output=True,
+    )
+    return result.returncode == 0
+
+
 def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]):
     for image in image_set:
         result = _run_command(
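
For operators following along, the strategic-merge patch issued above is roughly equivalent to the kubectl invocation below. This is a sketch only; the image reference is a placeholder, and kubectl's default patch type is strategic merge so no --type flag is needed.

    # rough kubectl equivalent of update_caddy_ingress_image (image value is illustrative)
    kubectl -n caddy-system patch deployment caddy-ingress-controller \
      -p '{"spec":{"template":{"spec":{"containers":[{"name":"caddy-ingress-controller","image":"REGISTRY/caddy-ingress:TAG","imagePullPolicy":"Always"}]}}}}'
    # watch the Recreate rollout; brief ingress downtime is expected
    kubectl -n caddy-system rollout status deployment/caddy-ingress-controller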
@@ -435,7 +692,7 @@ def get_kind_pv_bind_mount_path(
     return f"/mnt/{volume_name}"
 
 
-def volume_mounts_for_service(parsed_pod_files, service):
+def volume_mounts_for_service(parsed_pod_files, service, deployment_dir=None):
     result = []
     # Find the service
     for pod in parsed_pod_files:
@@ -459,11 +716,24 @@ def volume_mounts_for_service(parsed_pod_files, service):
                             mount_options = (
                                 mount_split[2] if len(mount_split) == 3 else None
                             )
-                            # For host path mounts, use sanitized name
+                            sub_path = None
+                            # For host path mounts, use sanitized name.
+                            # When the source resolves to a single file,
+                            # the auto-generated ConfigMap has one key
+                            # (the file basename). Set subPath so the
+                            # mount lands at the compose target as a
+                            # single file, not as a directory with the
+                            # key as a child entry.
                             if is_host_path_mount(volume_name):
                                 k8s_volume_name = sanitize_host_path_to_volume_name(
                                     volume_name
                                 )
+                                if deployment_dir is not None:
+                                    abs_src = resolve_host_path_for_kind(
+                                        volume_name, deployment_dir
+                                    )
+                                    if abs_src.is_file():
+                                        sub_path = abs_src.name
                             else:
                                 k8s_volume_name = volume_name
                             if opts.o.debug:
@@ -471,10 +741,12 @@ def volume_mounts_for_service(parsed_pod_files, service):
                                 print(f"k8s_volume_name: {k8s_volume_name}")
                                 print(f"mount path: {mount_path}")
                                 print(f"mount options: {mount_options}")
+                                print(f"sub_path: {sub_path}")
                             volume_device = client.V1VolumeMount(
                                 mount_path=mount_path,
                                 name=k8s_volume_name,
                                 read_only="ro" == mount_options,
+                                sub_path=sub_path,
                             )
                             result.append(volume_device)
     return result
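
A quick way to confirm the effect of the new sub_path handling is to read the rendered volumeMounts back from a live Deployment. The namespace below is the one the tests later in this diff use, and the index selector is illustrative.

    # inspect the first container's volumeMounts (mountPath plus subPath) of a deployed pod
    kubectl -n laconic-test get deployment \
      -o jsonpath='{.items[0].spec.template.spec.containers[0].volumeMounts}'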
@@ -507,7 +779,11 @@ def volumes_for_pod_files(parsed_pod_files, spec, app_name):
                 )
                 result.append(volume)
 
-        # Handle host path mounts from service volumes
+        # File-level and flat-dir host-path compose volumes flow through
+        # auto-generated ConfigMaps. Emit a ConfigMap-backed V1Volume so
+        # the pod reads from the namespace-scoped ConfigMap rather than
+        # a kind-node hostPath (which would alias across deployments
+        # sharing a cluster and not work on real k8s at all).
         if "services" in parsed_pod_file:
             services = parsed_pod_file["services"]
             for service_name in services:
@@ -522,19 +798,19 @@ def volumes_for_pod_files(parsed_pod_files, spec, app_name):
                     )
                     if sanitized_name not in seen_host_path_volumes:
                         seen_host_path_volumes.add(sanitized_name)
-                        # Create hostPath volume for mount inside kind node
-                        kind_mount_path = get_kind_host_path_mount_path(
-                            sanitized_name
-                        )
-                        host_path_source = client.V1HostPathVolumeSource(
-                            path=kind_mount_path, type="FileOrCreate"
-                        )
+                        config_map = client.V1ConfigMapVolumeSource(
+                            name=f"{app_name}-{sanitized_name}",
+                            default_mode=0o755,
+                        )
                         volume = client.V1Volume(
-                            name=sanitized_name, host_path=host_path_source
+                            name=sanitized_name, config_map=config_map
                         )
                         result.append(volume)
                         if opts.o.debug:
-                            print(f"Created hostPath volume: {sanitized_name}")
+                            print(
+                                f"Created configmap-backed host-path "
+                                f"volume: {sanitized_name}"
+                            )
     return result
@@ -553,7 +829,6 @@ def _make_absolute_host_path(data_mount_path: Path, deployment_dir: Path) -> Pat
 def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context):
     volume_definitions = []
     volume_host_path_map = _get_host_paths_for_volumes(deployment_context)
-    seen_host_path_mounts = set()  # Track to avoid duplicate mounts
     kind_mount_root = deployment_context.spec.get_kind_mount_root()
 
     # When kind-mount-root is set, emit a single extraMount for the root.
@@ -590,26 +865,12 @@ def _generate_kind_mounts(parsed_pod_files, deployment_dir, deployment_context):
                     mount_path = mount_split[1]
 
                     if is_host_path_mount(volume_name):
-                        # Host path mount - add extraMount for kind
-                        sanitized_name = sanitize_host_path_to_volume_name(
-                            volume_name
-                        )
-                        if sanitized_name not in seen_host_path_mounts:
-                            seen_host_path_mounts.add(sanitized_name)
-                            # Resolve path relative to compose directory
-                            host_path = resolve_host_path_for_kind(
-                                volume_name, deployment_dir
-                            )
-                            container_path = get_kind_host_path_mount_path(
-                                sanitized_name
-                            )
-                            volume_definitions.append(
-                                f"  - hostPath: {host_path}\n"
-                                f"    containerPath: {container_path}\n"
-                                f"    propagation: HostToContainer\n"
-                            )
-                            if opts.o.debug:
-                                print(f"Added host path mount: {host_path}")
+                        # File-level host-path binds (e.g. compose
+                        # `../config/foo.sh:/opt/foo.sh`) flow
+                        # through an auto-generated k8s ConfigMap at
+                        # deploy start — no extraMount needed. See
+                        # cluster_info.get_configmaps().
+                        continue
                     else:
                         # Named volume
                         if opts.o.debug:

@@ -304,6 +304,24 @@ class Spec:
         """
         return self.obj.get(constants.kind_mount_root_key)
 
+    def get_caddy_ingress_image(self) -> typing.Optional[str]:
+        """Return the Caddy ingress controller image override, or None.
+
+        Returns None (not the default image) when the spec key is
+        absent. That distinction matters: the install path falls back
+        to the hardcoded default so there's always *some* image to
+        deploy, while the update-on-reuse path treats None as "operator
+        didn't ask to touch Caddy" and skips the patch — avoiding
+        silent reverts of an image set out-of-band (e.g. via an
+        ansible playbook or a prior deployment's spec).
+
+        Cluster-scoped: the Caddy ingress lives in the shared
+        `caddy-system` namespace, so setting this key in any
+        deployment's spec rolls the controller for every deployment
+        using the cluster.
+        """
+        return self.obj.get(constants.caddy_ingress_image_key)
+
     def get_maintenance_service(self) -> typing.Optional[str]:
         """Return maintenance-service value (e.g. 'dumpster-maintenance:8000') or None.
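
Because the override is cluster-scoped, the image the shared controller is actually running can be read back before (or after) setting the key in any deployment's spec. A sketch:

    # show the image currently deployed for the shared Caddy ingress controller
    kubectl -n caddy-system get deployment caddy-ingress-controller \
      -o jsonpath='{.spec.template.spec.containers[0].image}'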

@@ -147,7 +147,13 @@ deployment_spec_file=${test_deployment_dir}/spec.yml
 sed -i 's/^secrets: {}$/secrets:\n  test-secret:\n    - TEST_SECRET_KEY/' ${deployment_spec_file}
 
 # Get the deployment ID and namespace for kubectl queries
-deployment_id=$(cat ${test_deployment_dir}/deployment.yml | cut -d ' ' -f 2)
+# deployment-id is what flows into app_name → resource name prefix.
+# Fall back to cluster-id for deployment.yml files written before the
+# deployment-id field existed (pre-decouple compatibility).
+deployment_id=$(awk '/^deployment-id:/ {print $2; exit}' ${test_deployment_dir}/deployment.yml)
+if [ -z "$deployment_id" ]; then
+  deployment_id=$(awk '/^cluster-id:/ {print $2; exit}' ${test_deployment_dir}/deployment.yml)
+fi
 # Namespace is derived from stack name: laconic-{stack_name}
 deployment_ns="laconic-test"
 
@@ -166,6 +172,41 @@ for kind in serviceaccount role rolebinding cronjob; do
 done
 echo "caddy-cert-backup install test: passed"
 
+# Host-path compose volumes (../config/test/script.sh, ../config/test/settings.env)
+# should flow through auto-generated per-namespace ConfigMaps — no kind
+# extraMount, no compose/spec rewriting. The pod mount lands via
+# ConfigMap + subPath.
+for cm_name in \
+    "${deployment_id}-host-path-config-test-script-sh" \
+    "${deployment_id}-host-path-config-test-settings-env"; do
+  if ! kubectl get configmap "$cm_name" -n "$deployment_ns" >/dev/null 2>&1; then
+    echo "host-path configmap test: ConfigMap $cm_name not found"
+    cleanup_and_exit
+  fi
+done
+echo "host-path configmap test: passed"
+
+# Deployment dir should be untouched — compose file still has the
+# original host-path volume entries and no synthetic configmap dirs.
+if ! grep -q '\.\./config/test/script\.sh:/opt/run\.sh' \
+    "$test_deployment_dir/compose/docker-compose-test.yml"; then
+  echo "compose unchanged test: host-path volume entry missing"
+  cleanup_and_exit
+fi
+if [ -d "$test_deployment_dir/configmaps/host-path-config-test-script-sh" ]; then
+  echo "compose unchanged test: unexpected configmaps/host-path-* dir present"
+  cleanup_and_exit
+fi
+echo "compose unchanged test: passed"
+
+# kind-config.yml should NOT contain /mnt/host-path-* extraMounts —
+# they are replaced by the ConfigMap mechanism.
+if grep -q 'containerPath: /mnt/host-path-' "$test_deployment_dir/kind-config.yml"; then
+  echo "no-host-path-extramount test: FAILED"
+  cleanup_and_exit
+fi
+echo "no-host-path-extramount test: passed"
+
 # Check logs command works
 wait_for_log_output
 sleep 1
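
When one of the ConfigMap checks above fails, the quickest way to see what the deploy actually generated is to dump the object itself; a sketch reusing the same shell variables the test defines:

    # dump one generated ConfigMap; its data keys should be file basenames (e.g. script.sh)
    kubectl get configmap "${deployment_id}-host-path-config-test-script-sh" \
      -n "$deployment_ns" -o yaml | head -20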
@@ -372,14 +413,16 @@ if [ "$restored_value" != "$fake_cert_value" ]; then
 fi
 echo "caddy cert restore test: passed"
 
-# Final teardown: --delete-namespace nukes the namespace after labeled cleanup.
-# Verify the namespace is actually gone.
+# Final teardown: --delete-namespace nukes the namespace, and
+# --perform-cluster-management tears down the Kind cluster so the next test
+# step in this CI workflow (e.g. run-restart-test.sh) starts from a clean
+# host.
 $TEST_TARGET_SO deployment --dir $test_deployment_dir \
-  stop --delete-volumes --delete-namespace --skip-cluster-management
-if kubectl get namespace ${deployment_ns} >/dev/null 2>&1; then
-  echo "delete-namespace test: FAILED (namespace still present)"
+  stop --delete-volumes --delete-namespace --perform-cluster-management
+if kind get clusters 2>/dev/null | grep -q .; then
+  echo "cluster teardown test: FAILED (kind cluster still present)"
   exit 1
 fi
-echo "delete-namespace test: passed"
+echo "cluster teardown test: passed"
 
 echo "Test passed"

@@ -0,0 +1,265 @@
#!/usr/bin/env bash
set -e
if [ -n "$CERC_SCRIPT_DEBUG" ]; then
    set -x
    echo "Environment variables:"
    env
fi

# Helper functions: TODO move into a separate file (mirrors run-deploy-test.sh:10).
wait_for_pods_started () {
    local dir=$1
    for i in {1..50}
    do
        local ps_output=$( $TEST_TARGET_SO deployment --dir $dir ps )

        if [[ "$ps_output" == *"Running containers:"* ]]; then
            return
        else
            sleep 5
        fi
    done
    echo "waiting for pods to start: FAILED"
    cleanup_and_exit
}

# Multi-pod stacks aren't visible to 'deployment ps' (deploy_k8s.py:1366
# filters by app_name-deployment substring, which doesn't match
# laconic-<id>-<podname>-deployment-<hash> names). Wait via kubectl.
wait_for_k8s_pods_ready () {
    local ns=$1
    local timeout=240
    local waited=0
    # First wait for at least one pod to appear in the namespace.
    while [ $waited -lt $timeout ]; do
        local count=$(kubectl get pods -n "$ns" --no-headers 2>/dev/null | wc -l)
        if [ "$count" -gt 0 ]; then
            break
        fi
        sleep 2
        waited=$((waited + 2))
    done
    if ! kubectl wait --for=condition=Ready pod --all \
        -n "$ns" --timeout=$((timeout - waited))s 2>&1; then
        echo "kubectl wait pods ready: FAILED (ns=$ns)"
        kubectl get pods -n "$ns" 2>&1 || true
        kubectl describe pods -n "$ns" 2>&1 | tail -80 || true
        cleanup_and_exit
    fi
}

# Best-effort full teardown so CI runners don't leak namespaces/PVs/clusters
# between runs. Variables may be unset depending on which phase tripped.
cleanup_and_exit () {
    if [ -n "$DEP1" ] && [ -d "$DEP1" ]; then
        $TEST_TARGET_SO deployment --dir $DEP1 \
            stop --delete-volumes --delete-namespace --skip-cluster-management || true
    fi
    if [ -n "$DEP2" ] && [ -d "$DEP2" ]; then
        $TEST_TARGET_SO deployment --dir $DEP2 \
            stop --delete-volumes --delete-namespace --perform-cluster-management || true
    fi
    exit 1
}

# Make a clone usable for `git commit` without touching the runner's global config.
configure_git_identity () {
    local repo_dir=$1
    git -C $repo_dir config user.email "test@stack-orchestrator.test"
    git -C $repo_dir config user.name "test"
}

TEST_TARGET_SO=$( ls -t1 ./package/laconic-so* | head -1 )
echo "Testing this package: $TEST_TARGET_SO"

WORK_DIR=~/stack-orchestrator-test/restart
# Multi-repo pod working clones land here; resolved by get_plugin_code_paths.
export CERC_REPO_BASE_DIR=$WORK_DIR/repo-base
rm -rf $WORK_DIR
mkdir -p $WORK_DIR $CERC_REPO_BASE_DIR

# Source location of the test stacks shipped in this checkout. The test stages
# them into a temp git repo so 'deployment restart' (which runs 'git pull' on
# the stack source) has a real repo to pull from.
DATA_DIR=stack_orchestrator/data

# ============================================================================
# Phase 1 — single-repo restart cycle. Verifies that:
#   * deploy create copies commands.py into <deployment>/hooks/
#   * deployment start runs the copied start() hook
#   * mutating the stack-source commands.py and running 'deployment restart'
#     re-copies the new file into hooks/ and re-executes the new start()
# ============================================================================
echo "=== Phase 1: single-repo restart cycle ==="

BARE1=$WORK_DIR/stack-single.git
CLONE1=$WORK_DIR/stack-single
git init -b main --bare $BARE1
git clone $BARE1 $CLONE1
configure_git_identity $CLONE1

# External-stack layout: <repo>/stack-orchestrator/{stacks,compose}/...
mkdir -p $CLONE1/stack-orchestrator/stacks $CLONE1/stack-orchestrator/compose
cp -r $DATA_DIR/stacks/test-restart $CLONE1/stack-orchestrator/stacks/
cp $DATA_DIR/compose/docker-compose-test-restart.yml $CLONE1/stack-orchestrator/compose/

git -C $CLONE1 add .
git -C $CLONE1 commit -m "test-restart v1"
git -C $CLONE1 push -u origin main

STACK_PATH_SINGLE=$CLONE1/stack-orchestrator/stacks/test-restart
SPEC1=$WORK_DIR/spec-single.yml
DEP1=$WORK_DIR/dep-single

$TEST_TARGET_SO --stack $STACK_PATH_SINGLE deploy --deploy-to k8s-kind init --output $SPEC1
$TEST_TARGET_SO --stack $STACK_PATH_SINGLE deploy create --spec-file $SPEC1 --deployment-dir $DEP1

if [ ! -f "$DEP1/hooks/commands.py" ]; then
    echo "single-repo deploy create test: FAILED (hooks/commands.py missing)"
    cleanup_and_exit
fi
if ! grep -q '"v1"' "$DEP1/hooks/commands.py"; then
    echo "single-repo deploy create test: FAILED (hooks/commands.py does not contain v1 marker)"
    cleanup_and_exit
fi
echo "single-repo deploy create test: passed"

$TEST_TARGET_SO deployment --dir $DEP1 start --perform-cluster-management
wait_for_pods_started $DEP1

# call_stack_deploy_start runs synchronously inside the start command
# (deploy_k8s.py:1026), so the marker is on disk before 'start' returns.
if [ ! -f "$DEP1/marker" ]; then
    echo "single-repo start v1 test: FAILED (marker file missing)"
    cleanup_and_exit
fi
marker_v1=$(cat $DEP1/marker)
if [ "$marker_v1" != "v1" ]; then
    echo "single-repo start v1 test: FAILED (got: $marker_v1)"
    cleanup_and_exit
fi
echo "single-repo start v1 test: passed"

# Mutate the stack-source working tree v1 -> v2. No commit needed: 'deployment
# restart' runs 'git pull' against the bare which is a no-op, and _copy_hooks
# reads the working tree directly via get_plugin_code_paths.
sed -i 's/"v1"/"v2"/' $STACK_PATH_SINGLE/deploy/commands.py

$TEST_TARGET_SO deployment --dir $DEP1 restart --stack-path $STACK_PATH_SINGLE

if ! grep -q '"v2"' "$DEP1/hooks/commands.py"; then
    echo "single-repo restart re-copy test: FAILED (hooks/commands.py still v1)"
    cleanup_and_exit
fi
echo "single-repo restart re-copy test: passed"

marker_v2=$(cat $DEP1/marker)
if [ "$marker_v2" != "v2" ]; then
    echo "single-repo restart re-execute test: FAILED (got: $marker_v2)"
    cleanup_and_exit
fi
echo "single-repo restart re-execute test: passed"

# Stop phase 1 deployment but keep the cluster for phase 2.
$TEST_TARGET_SO deployment --dir $DEP1 \
    stop --delete-volumes --delete-namespace --skip-cluster-management

# ============================================================================
# Phase 2 — multi-repo create + start. Verifies that a stack with N pods, each
# from a separate repo, produces hooks/commands_0.py ... commands_{N-1}.py and
# that call_stack_deploy_start invokes every module's start().
# ============================================================================
echo "=== Phase 2: multi-repo create + start ==="

# Pod repos: stack.yml's pods[].repository = 'cerc-io/test-restart-pod-X'
# resolves (via get_plugin_code_paths) to
# $CERC_REPO_BASE_DIR/test-restart-pod-X/<pod_path>/stack/...
for label in a b; do
    POD_BARE=$WORK_DIR/pod-$label.git
    POD_CLONE=$CERC_REPO_BASE_DIR/test-restart-pod-$label
    git init -b main --bare $POD_BARE
    git clone $POD_BARE $POD_CLONE
    configure_git_identity $POD_CLONE
    mkdir -p $POD_CLONE/stack/deploy
    # For dict-form pods, get_pod_file_path resolves the compose file at
    # <pod_repo>/<pod_path>/docker-compose.yml — owned by the pod repo, not
    # the stack repo. get_plugin_code_paths adds the trailing 'stack/', so
    # commands.py lives at <pod_repo>/<pod_path>/stack/deploy/commands.py.
    cat > $POD_CLONE/docker-compose.yml <<EOF
services:
  test-restart-multi-$label:
    image: busybox:1.36
    command: ["sh", "-c", "sleep infinity"]
    restart: always
EOF
    # Each pod hook writes a distinct marker file so neither overwrites the
    # other when both start() hooks are loaded by call_stack_deploy_start.
    cat > $POD_CLONE/stack/deploy/commands.py <<EOF
from stack_orchestrator.deploy.deployment_context import DeploymentContext


def start(deployment_context: DeploymentContext):
    marker = deployment_context.deployment_dir / "marker-$label"
    marker.write_text("v1")
EOF
    git -C $POD_CLONE add .
    git -C $POD_CLONE commit -m "pod $label v1"
    git -C $POD_CLONE push -u origin main
done

# Stack repo
BARE2=$WORK_DIR/stack-multi.git
CLONE2=$WORK_DIR/stack-multi
git init -b main --bare $BARE2
git clone $BARE2 $CLONE2
configure_git_identity $CLONE2

# For multi-repo (dict-form pods), the stack repo only owns stack.yml — pod
# compose files and hooks live in the per-pod repos under CERC_REPO_BASE_DIR.
mkdir -p $CLONE2/stack-orchestrator/stacks
cp -r $DATA_DIR/stacks/test-restart-multi $CLONE2/stack-orchestrator/stacks/

git -C $CLONE2 add .
git -C $CLONE2 commit -m "test-restart-multi v1"
git -C $CLONE2 push -u origin main

STACK_PATH_MULTI=$CLONE2/stack-orchestrator/stacks/test-restart-multi
SPEC2=$WORK_DIR/spec-multi.yml
DEP2=$WORK_DIR/dep-multi

$TEST_TARGET_SO --stack $STACK_PATH_MULTI deploy --deploy-to k8s-kind init --output $SPEC2
$TEST_TARGET_SO --stack $STACK_PATH_MULTI deploy create --spec-file $SPEC2 --deployment-dir $DEP2

# get_plugin_code_paths returns list(set(...)) so the index ordering is not
# guaranteed; we assert presence of both files rather than mapping each to
# a specific pod.
if [ ! -f "$DEP2/hooks/commands_0.py" ] || [ ! -f "$DEP2/hooks/commands_1.py" ]; then
    echo "multi-repo deploy create test: FAILED (hooks/commands_{0,1}.py missing)"
    ls -la $DEP2/hooks/ || true
    cleanup_and_exit
fi
echo "multi-repo deploy create test: passed"

$TEST_TARGET_SO deployment --dir $DEP2 start --skip-cluster-management
wait_for_k8s_pods_ready laconic-test-restart-multi

for label in a b; do
    if [ ! -f "$DEP2/marker-$label" ]; then
        echo "multi-repo start test: FAILED (marker-$label missing)"
        cleanup_and_exit
    fi
    val=$(cat $DEP2/marker-$label)
    if [ "$val" != "v1" ]; then
        echo "multi-repo start test: FAILED (marker-$label content: $val)"
        cleanup_and_exit
    fi
done
echo "multi-repo start test: passed"

# Final teardown — destroy the cluster for the next CI run.
$TEST_TARGET_SO deployment --dir $DEP2 \
    stop --delete-volumes --delete-namespace --perform-cluster-management

rm -rf $WORK_DIR

echo "Test passed"

@@ -185,8 +185,15 @@ node-tolerations:
     value: c
 EOF
 
-# Get the deployment ID so we can generate low level kubectl commands later
-deployment_id=$(cat ${test_deployment_dir}/deployment.yml | cut -d ' ' -f 2)
+# cluster-id names the kind cluster (and its worker node names).
+# deployment-id is what flows into app_name / resource name prefixes.
+# Fall back to cluster-id for deployment.yml files written before the
+# deployment-id field existed.
+cluster_id=$(awk '/^cluster-id:/ {print $2; exit}' ${test_deployment_dir}/deployment.yml)
+deployment_id=$(awk '/^deployment-id:/ {print $2; exit}' ${test_deployment_dir}/deployment.yml)
+if [ -z "$deployment_id" ]; then
+  deployment_id=$cluster_id
+fi
 
 # Try to start the deployment
 $TEST_TARGET_SO deployment --dir $test_deployment_dir start --perform-cluster-management
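
The expected_node check further down keys off the kind node naming, which embeds cluster-id; a quick way to list the node names for a given cluster (sketch):

    # list the kind nodes for this cluster; names look like ${cluster_id}-control-plane, ${cluster_id}-worker, ...
    kind get nodes --name "$cluster_id"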
@@ -208,7 +215,7 @@ fi
 # Get get the node onto which the stack pod has been deployed
 # Namespace is now derived from stack name, not cluster-id
 deployment_node=$(kubectl get pods -n laconic-test -l app=${deployment_id} -o=jsonpath='{.items..spec.nodeName}')
-expected_node=${deployment_id}-worker3
+expected_node=${cluster_id}-worker3
 echo "Stack pod deployed to node: ${deployment_node}"
 if [[ ${deployment_node} == ${expected_node} ]]; then
   echo "deployment of pod test: passed"