Compare commits
4 Commits
v1.1.0-7c6
...
main
| Author | SHA1 | Date |
|---|---|---|
|
|
b3e9366ca0 | |
|
|
3d703708c4 | |
|
|
2ff7e5eb77 | |
|
|
cf0e230b66 |
|
|
@ -0,0 +1,33 @@
|
|||
# docker-compose definition for the host-metrics stack: a single telegraf
# container given enough host visibility to report real machine metrics.
version: '3.2'

services:
  host-telegraf:
    image: telegraf:1.36
    restart: unless-stopped
    # Host network + host PID namespace so telegraf sees the machine's real
    # interfaces and process table instead of the container's.
    network_mode: host
    pid: host
    # Custom entrypoint renders telegraf.conf from the mounted template
    # before exec'ing telegraf.
    entrypoint: ["/scripts/telegraf-entrypoint.sh"]
    environment:
      INFLUXDB_URL: ${INFLUXDB_URL}
      INFLUXDB_DB: ${INFLUXDB_DB:-host_metrics}
      INFLUXDB_USER: ${INFLUXDB_WRITE_USER}
      INFLUXDB_PASSWORD: ${INFLUXDB_WRITE_PASSWORD}
      COLLECT_INTERVAL: ${COLLECT_INTERVAL:-10s}
      HOST_TAG: ${HOST_TAG:-}
      COLLECT_ZFS: ${COLLECT_ZFS:-false}
    volumes:
      - ../config/host-metrics/telegraf.conf.tpl:/etc/telegraf/telegraf.conf.tpl:ro
      - ../config/host-metrics/scripts/telegraf-entrypoint.sh:/scripts/telegraf-entrypoint.sh:ro
      # Read-only views of the host for gopsutil (entrypoint exports
      # HOST_PROC/HOST_SYS/HOST_MOUNT_PREFIX pointing at /hostfs).
      - /proc:/hostfs/proc:ro
      - /sys:/hostfs/sys:ro
      - /:/hostfs:ro
      # /dev is needed by inputs.diskio: it enumerates devices from
      # /proc/diskstats and then opens /dev/<name> for udev/uevent lookups.
      # Without this mount telegraf logs an "error reading /dev/<name>" warning
      # per device per collection cycle.
      - /dev:/dev:ro
      # /run/udev is where modern systemd stores the udev database that
      # gopsutil consults for per-device tags. Without it telegraf falls
      # back to the legacy /dev/.udev/db/... path which doesn't exist on
      # systemd hosts, producing "stat /dev/.udev/db/block:..." warnings.
      - /run/udev:/run/udev:ro
|
||||
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/sh
# host-metrics telegraf-entrypoint.sh
# Render telegraf.conf from telegraf.conf.tpl, then exec telegraf.
#
# Substitutions performed here (by awk):
#   @@HOST_TAG_BLOCK@@ -> "[global_tags]\n host = \"$HOST_TAG\"" if set, else empty.
#   @@ZFS_BLOCK@@ -> "[[inputs.zfs]]\n poolMetrics = true" if COLLECT_ZFS=true, else empty.
#
# Variables of the form ${VAR} in the template (INFLUXDB_URL, INFLUXDB_DB,
# INFLUXDB_USER, INFLUXDB_PASSWORD, COLLECT_INTERVAL) are resolved by
# telegraf's own env-var substitution at config-load time and are NOT
# touched by this script.
#
# TELEGRAF_CONF_DIR overrides the conf directory for tests; defaults to
# /etc/telegraf which is the standard path inside the official image.

set -eu

CONF_DIR="${TELEGRAF_CONF_DIR:-/etc/telegraf}"
TPL="$CONF_DIR/telegraf.conf.tpl"
OUT="$CONF_DIR/telegraf.conf"

# Fail-fast required env. Empty string counts as missing -- a half-rendered
# conf or a noisy telegraf auth error is worse than a clear startup failure.
for v in INFLUXDB_URL INFLUXDB_USER INFLUXDB_PASSWORD; do
  eval val=\${$v:-}
  if [ -z "$val" ]; then
    echo "FATAL: $v is required but empty" >&2
    exit 1
  fi
done

# Apply defaults for optional vars.
: "${INFLUXDB_DB:=host_metrics}"
: "${COLLECT_INTERVAL:=10s}"
: "${HOST_TAG:=}"
: "${COLLECT_ZFS:=false}"

# Build the marker substitutions. Use printf for the newline so the
# rendered block lands on its own line.
if [ -n "$HOST_TAG" ]; then
  HOST_TAG_BLOCK=$(printf '[global_tags]\n host = "%s"' "$HOST_TAG")
else
  HOST_TAG_BLOCK=""
fi

if [ "$COLLECT_ZFS" = "true" ]; then
  ZFS_BLOCK=$(printf '[[inputs.zfs]]\n poolMetrics = true')
else
  ZFS_BLOCK=""
fi

# Export telegraf hostfs envs so /proc, /sys, and root come from the
# bind-mount under /hostfs (set in compose).
export HOST_PROC=/hostfs/proc
export HOST_SYS=/hostfs/sys
export HOST_MOUNT_PREFIX=/hostfs

# Render with awk: handles multi-line replacement values cleanly, and
# avoids sed's newline-in-replacement portability quirks across BusyBox /
# GNU / BSD sed.
#
# BUGFIX over the earlier gsub/-v version: the blocks are handed to awk
# via ENVIRON[] instead of -v (POSIX awk processes backslash escapes in
# -v values), and spliced with a literal index()/substr() helper instead
# of gsub() (gsub treats '&' and '\' in the replacement text as
# metacharacters). Either quirk would corrupt output for a HOST_TAG
# containing '&' or '\'.
export HOST_TAG_BLOCK ZFS_BLOCK
awk '
  # subst: replace every literal occurrence of marker in line with repl.
  # Consumes the input left-to-right so repl is never rescanned.
  function subst(line, marker, repl,    out, i) {
    out = ""
    while (i = index(line, marker)) {
      out = out substr(line, 1, i - 1) repl
      line = substr(line, i + length(marker))
    }
    return out line
  }
  {
    $0 = subst($0, "@@HOST_TAG_BLOCK@@", ENVIRON["HOST_TAG_BLOCK"])
    $0 = subst($0, "@@ZFS_BLOCK@@", ENVIRON["ZFS_BLOCK"])
    print
  }
' "$TPL" > "$OUT"

exec telegraf --config "$OUT"
|
||||
|
|
@ -0,0 +1,121 @@
|
|||
#!/bin/sh
# Offline tests for host-metrics telegraf-entrypoint.sh.
# No real telegraf binary is needed: a stub `telegraf` placed first on PATH
# turns the entrypoint's final `exec telegraf` into a no-op, so each case
# exercises only the rendering logic and inspects the rendered conf.
set -eu

SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
ENTRYPOINT="$SCRIPT_DIR/telegraf-entrypoint.sh"

# Sanity: the script under test must exist and be executable.
[ -x "$ENTRYPOINT" ] || { echo "FATAL: $ENTRYPOINT not executable"; exit 2; }

# Scratch area (stub PATH dir + fake conf dir); removed on any exit.
TMP=$(mktemp -d)
trap 'rm -rf "$TMP"' EXIT
mkdir -p "$TMP/bin" "$TMP/etc/telegraf"

# Stub telegraf so `exec telegraf` is a no-op.
cat > "$TMP/bin/telegraf" <<'EOF'
#!/bin/sh
exit 0
EOF
chmod +x "$TMP/bin/telegraf"

# Minimal template that exercises both markers.
cat > "$TMP/etc/telegraf/telegraf.conf.tpl" <<'EOF'
@@HOST_TAG_BLOCK@@

[agent]
  interval = "${COLLECT_INTERVAL}"

[[outputs.influxdb]]
  urls = ["${INFLUXDB_URL}"]

@@ZFS_BLOCK@@
EOF

PASS=0
FAIL=0

# run: invoke the entrypoint under a clean env. Required vars get sane
# defaults; the caller's exported values (e.g. `HOST_TAG=x run`) layer on
# top via the ${VAR-default} expansions. Prints the rendered conf (if one
# was written) on stdout and propagates the entrypoint's exit status.
run() {
  env PATH="$TMP/bin:$PATH" \
    TELEGRAF_CONF_DIR="$TMP/etc/telegraf" \
    INFLUXDB_URL="${INFLUXDB_URL-http://example/}" \
    INFLUXDB_USER="${INFLUXDB_USER-writer}" \
    INFLUXDB_PASSWORD="${INFLUXDB_PASSWORD-secret}" \
    INFLUXDB_DB="${INFLUXDB_DB-host_metrics}" \
    COLLECT_INTERVAL="${COLLECT_INTERVAL-10s}" \
    HOST_TAG="${HOST_TAG-}" \
    COLLECT_ZFS="${COLLECT_ZFS-false}" \
    sh "$ENTRYPOINT" >/dev/null
  rc=$?
  [ -f "$TMP/etc/telegraf/telegraf.conf" ] && cat "$TMP/etc/telegraf/telegraf.conf"
  return $rc
}

# assert_grep NAME ACTUAL PATTERN: pass when ACTUAL matches the ERE PATTERN.
assert_grep() {
  name=$1; actual=$2; pattern=$3
  if printf '%s' "$actual" | grep -qE "$pattern"; then
    echo "PASS: $name"; PASS=$((PASS + 1))
  else
    echo "FAIL: $name"
    echo " expected pattern: $pattern"
    echo " actual: $actual"
    FAIL=$((FAIL + 1))
  fi
}

# assert_not_grep NAME ACTUAL PATTERN: pass when ACTUAL does NOT match PATTERN.
assert_not_grep() {
  name=$1; actual=$2; pattern=$3
  if printf '%s' "$actual" | grep -qE "$pattern"; then
    echo "FAIL: $name (matched pattern $pattern)"; FAIL=$((FAIL + 1))
  else
    echo "PASS: $name"; PASS=$((PASS + 1))
  fi
}

# T1: HOST_TAG unset -> no [global_tags] block emitted
out=$(HOST_TAG="" run)
assert_not_grep "T1: HOST_TAG empty -> no global_tags" "$out" '^\[global_tags\]'

# T2: HOST_TAG set -> [global_tags] block with host = "<value>"
out=$(HOST_TAG="validator-1" run)
assert_grep "T2: HOST_TAG set -> [global_tags] block" "$out" '^\[global_tags\]'
assert_grep "T2: HOST_TAG set -> host = \"validator-1\"" "$out" 'host = "validator-1"'

# T3: COLLECT_ZFS=true -> [[inputs.zfs]] block present
out=$(COLLECT_ZFS="true" run)
assert_grep "T3: COLLECT_ZFS true -> inputs.zfs block" "$out" '\[\[inputs\.zfs\]\]'

# T4: COLLECT_ZFS=false -> no inputs.zfs block
out=$(COLLECT_ZFS="false" run)
assert_not_grep "T4: COLLECT_ZFS false -> no inputs.zfs" "$out" '\[\[inputs\.zfs\]\]'

# T5: markers fully removed even when block bodies are empty
out=$(HOST_TAG="" COLLECT_ZFS="false" run)
assert_not_grep "T5: no leftover @@HOST_TAG_BLOCK@@" "$out" '@@HOST_TAG_BLOCK@@'
assert_not_grep "T5: no leftover @@ZFS_BLOCK@@" "$out" '@@ZFS_BLOCK@@'

# T6: missing INFLUXDB_URL -> exit non-zero, error on stderr
# (the `|| rc=$?` guard also disables set -e for the whole call chain)
rc=0
INFLUXDB_URL="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_URL "$TMP/err" \
  && { echo "PASS: T6: missing INFLUXDB_URL -> error"; PASS=$((PASS + 1)); } \
  || { echo "FAIL: T6: missing INFLUXDB_URL handling (rc=$rc)"; FAIL=$((FAIL + 1)); }

# T7: missing INFLUXDB_USER -> exit non-zero
rc=0
INFLUXDB_USER="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_USER "$TMP/err" \
  && { echo "PASS: T7: missing INFLUXDB_USER -> error"; PASS=$((PASS + 1)); } \
  || { echo "FAIL: T7: missing INFLUXDB_USER handling (rc=$rc)"; FAIL=$((FAIL + 1)); }

# T8: missing INFLUXDB_PASSWORD -> exit non-zero
rc=0
INFLUXDB_PASSWORD="" run 2>"$TMP/err" || rc=$?
[ "$rc" -ne 0 ] && grep -q INFLUXDB_PASSWORD "$TMP/err" \
  && { echo "PASS: T8: missing INFLUXDB_PASSWORD -> error"; PASS=$((PASS + 1)); } \
  || { echo "FAIL: T8: missing INFLUXDB_PASSWORD handling (rc=$rc)"; FAIL=$((FAIL + 1)); }

echo
echo "Results: $PASS passed, $FAIL failed"
# Overall exit status: zero only when no case failed.
[ "$FAIL" = "0" ]
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
# host-metrics telegraf template.
# Rendered at container start by telegraf-entrypoint.sh. The entrypoint
# replaces two single-line markers in this file with TOML block fragments;
# see telegraf-entrypoint.sh for the substitution details. All ${...}
# variables are resolved by telegraf's native env substitution at
# config-load time.

@@HOST_TAG_BLOCK@@

[agent]
  interval = "${COLLECT_INTERVAL}"
  round_interval = true
  collection_jitter = "0s"
  flush_interval = "${COLLECT_INTERVAL}"
  flush_jitter = "0s"
  precision = "0s"
  # Empty hostname -> telegraf derives the host tag itself (its default
  # behavior); a rendered @@HOST_TAG_BLOCK@@ above overrides it via
  # [global_tags].
  hostname = ""
  omit_hostname = false

[[outputs.influxdb]]
  urls = ["${INFLUXDB_URL}"]
  database = "${INFLUXDB_DB}"
  # The writer credentials are not granted CREATE; the DB must pre-exist.
  skip_database_creation = true
  username = "${INFLUXDB_USER}"
  password = "${INFLUXDB_PASSWORD}"
  retention_policy = ""
  write_consistency = "any"
  timeout = "10s"

[[inputs.cpu]]
  percpu = false
  totalcpu = true
  collect_cpu_time = false
  report_active = true

[[inputs.mem]]

[[inputs.swap]]

[[inputs.system]]

[[inputs.processes]]

[[inputs.disk]]
  # gopsutil with HOST_MOUNT_PREFIX=/hostfs strips the /hostfs prefix
  # from /proc/mounts entries, so the mountpoints telegraf sees are
  # the host's real paths (/, /boot, /home, ...). No mount_points
  # filter; let ignore_fs do the noise reduction.
  ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay",
               "aufs", "squashfs", "nsfs", "tracefs", "proc", "sysfs",
               "cgroup", "cgroup2", "fuse.lxcfs"]

[[inputs.diskio]]
  device_tags = ["DEVNAME"]
  skip_serial_number = true
  name_templates = ["$DEVNAME"]

[[inputs.net]]
  # Allowlist covers physical ethernet (eth*, en*), wireless (wl*),
  # cellular (wwan*), and bonded/teamed interfaces (bond*). Excludes
  # docker bridges, veth pairs, tun/tap, and lo. Adjust per host if
  # you need a more specific scope.
  ignore_protocol_stats = true
  interfaces = ["eth*", "en*", "wl*", "wwan*", "bond*"]

@@ZFS_BLOCK@@
|
||||
|
|
@ -0,0 +1,127 @@
|
|||
# host-metrics stack
|
||||
|
||||
Per-host system metrics collector. Runs telegraf with host networking, host
|
||||
PID namespace, and read-only bind mounts of /proc, /sys, and / so it can
|
||||
report real CPU, memory, disk, network, and process metrics for the machine
|
||||
it runs on. Writes to an InfluxDB 1.x endpoint of your choosing.
|
||||
|
||||
Deploy one instance per machine you want monitored.
|
||||
|
||||
## What gets collected
|
||||
|
||||
| Input | Measurements (in InfluxDB) |
|
||||
|-------|----------------------------|
|
||||
| inputs.cpu (totalcpu only) | cpu (`cpu=cpu-total`) |
|
||||
| inputs.mem | mem |
|
||||
| inputs.swap | swap |
|
||||
| inputs.system | system (uptime, load1/5/15, n_users, n_cpus) |
|
||||
| inputs.processes | processes (running/sleeping/blocked/zombies) |
|
||||
| inputs.disk | disk (used/free/used_percent per mount) |
|
||||
| inputs.diskio | diskio (read/write bytes/ops per device) |
|
||||
| inputs.net | net (bytes/packets/err in/out per interface) |
|
||||
| inputs.zfs (opt-in via COLLECT_ZFS=true) | zfs (ARC stats, pool state) |
|
||||
|
||||
All rows are tagged with `host` (kernel hostname, or `HOST_TAG` override).
|
||||
|
||||
## Deploy
|
||||
|
||||
### Create a spec
|
||||
|
||||
```bash
|
||||
laconic-so --stack host-metrics deploy init --output spec-host-metrics.yml
|
||||
```
|
||||
|
||||
Edit `spec-host-metrics.yml` to look like:
|
||||
|
||||
```yaml
|
||||
stack: host-metrics
|
||||
deploy-to: compose
|
||||
credentials-files:
|
||||
- ~/.credentials/host-metrics.env
|
||||
config:
|
||||
INFLUXDB_URL: 'https://influxdb.example.com'
|
||||
INFLUXDB_DB: 'host_metrics' # default; override for a custom DB
|
||||
HOST_TAG: 'validator-1' # optional; defaults to kernel hostname
|
||||
COLLECT_INTERVAL: '10s' # telegraf collection + flush cadence
|
||||
COLLECT_ZFS: 'false' # set to 'true' on ZFS hosts
|
||||
```
|
||||
|
||||
`~/.credentials/host-metrics.env` must contain:
|
||||
|
||||
```
|
||||
INFLUXDB_WRITE_USER=<writer-username>
|
||||
INFLUXDB_WRITE_PASSWORD=<writer-password>
|
||||
```
|
||||
|
||||
These are issued by the InfluxDB admin (the monitoring host operator); they
|
||||
are the same writer-only credentials used by validators/RPCs to push agave
|
||||
metrics.
|
||||
|
||||
### Create and start
|
||||
|
||||
```bash
|
||||
laconic-so --stack host-metrics deploy create \
|
||||
--spec-file spec-host-metrics.yml \
|
||||
--deployment-dir ./deployment-host-metrics
|
||||
laconic-so deployment --dir ./deployment-host-metrics start
|
||||
```
|
||||
|
||||
`deploy create` builds the deployment dir from the spec; `deployment start`
|
||||
brings the containers up. The `--stack` option is required for `deploy`
|
||||
subcommands but rejected on `deployment` subcommands (the deployment dir
|
||||
already knows its stack).
|
||||
|
||||
### Verify
|
||||
|
||||
```bash
|
||||
laconic-so deployment --dir ./deployment-host-metrics logs host-telegraf | head
|
||||
```
|
||||
|
||||
Expected: telegraf prints its startup banner and `Loaded inputs: ...`. No
|
||||
errors about missing config or auth failures.
|
||||
|
||||
Within ~20 seconds, the host's data appears in the InfluxDB endpoint's
|
||||
`host_metrics` database (or whichever DB you set in INFLUXDB_DB) and in
|
||||
any Grafana dashboards bound to that DB.
|
||||
|
||||
## Configuration reference
|
||||
|
||||
| Env | Required | Default | Notes |
|
||||
|-----|----------|---------|-------|
|
||||
| `INFLUXDB_URL` | yes | - | Full URL including scheme. Example: `https://influxdb.example.com`. |
|
||||
| `INFLUXDB_DB` | no | `host_metrics` | Target database. Must exist (writer is not granted CREATE). |
|
||||
| `INFLUXDB_WRITE_USER` | yes | - | Writer-only user. |
|
||||
| `INFLUXDB_WRITE_PASSWORD` | yes | - | Writer-only password. |
|
||||
| `COLLECT_INTERVAL` | no | `10s` | Telegraf collection and flush cadence. |
|
||||
| `HOST_TAG` | no | empty | Overrides the kernel hostname for the `host` tag on every row. Useful when a VM has a generic hostname. |
|
||||
| `COLLECT_ZFS` | no | `false` | Set to `true` to enable `inputs.zfs` (pool state + ARC stats). |
|
||||
|
||||
## ZFS hosts
|
||||
|
||||
`inputs.disk` already reports used/free per mount for any filesystem type
|
||||
including ZFS, so the disk-usage view works out of the box. Setting
|
||||
`COLLECT_ZFS=true` additionally enables `inputs.zfs` which reads
|
||||
`/proc/spl/kstat/zfs/...` and emits ARC hit ratio, ARC size, and per-pool
|
||||
health metrics. The bind mount of `/proc` provides the necessary
|
||||
visibility; no extra mounts are needed.
|
||||
|
||||
If you set `COLLECT_ZFS=true` on a non-ZFS host, telegraf logs an error
|
||||
once per collection cycle and skips the input. Harmless but noisy; leave
|
||||
the toggle off on non-ZFS machines.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
| Symptom | Likely cause |
|
||||
|---------|-------------|
|
||||
| Container fails to start with `FATAL: INFLUXDB_URL is required but empty` | Missing required env. Check spec.yml + credentials file. |
|
||||
| Container starts, no rows appear in InfluxDB | Writer credentials wrong, or InfluxDB unreachable from this host's network. Check `docker logs <host-telegraf>` for `Post ... 401` / `connection refused`. |
|
||||
| Two hosts overwriting each other's series | Both use the same kernel hostname. Set distinct `HOST_TAG` values. |
|
||||
| `inputs.processes` reports only 1 process | `pid: host` missing from compose. Re-deploy. |
|
||||
|
||||
## Caveats
|
||||
|
||||
- Requires Docker with privileges to bind-mount `/`, `/proc`, `/sys`, and to
|
||||
share the host PID namespace. Rootless Docker installations may refuse
|
||||
`pid: host` and the `/` bind mount.
|
||||
- One deployment per host. Running two on the same machine writes
|
||||
duplicate rows under the same `host` tag.
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
# laconic-so stack descriptor for host-metrics: a single pod whose
# compose/config files live under the matching host-metrics directories.
version: "1.1"
name: host-metrics
description: "Per-host system metrics collector (telegraf -> InfluxDB)"
pods:
  - host-metrics
|
||||
|
|
@ -48,10 +48,21 @@ class DockerDeployer(Deployer):
|
|||
self.compose_project_name = compose_project_name
|
||||
self.compose_env_file = compose_env_file
|
||||
|
||||
    def up(
        self,
        detach,
        skip_cluster_management,
        services,
        image_overrides=None,
        force_recreate=False,
    ):
        """Start the compose project's services via `docker compose up`.

        Args:
            detach: run containers in the background.
            skip_cluster_management: accepted for interface parity with the
                k8s deployer; not consulted on the compose path (no cluster
                to manage here).
            services: service names to bring up (passed through to compose).
            image_overrides: accepted for interface parity; not consulted in
                this method.
            force_recreate: when True, compose recreates containers even if
                their service definition is unchanged -- used so edits to
                bind-mounted source files take effect on restart.

        Raises:
            DeployerException: wrapping any DockerException from compose.
        """
        # Dry-run mode performs no docker operations at all.
        if not opts.o.dry_run:
            try:
                return self.docker.compose.up(
                    detach=detach,
                    services=services,
                    force_recreate=force_recreate,
                )
            except DockerException as e:
                raise DeployerException(e)
|
||||
|
||||
|
|
|
|||
|
|
@ -142,6 +142,7 @@ def up_operation(
|
|||
stay_attached=False,
|
||||
skip_cluster_management=False,
|
||||
image_overrides=None,
|
||||
force_recreate=False,
|
||||
):
|
||||
global_context = ctx.parent.parent.obj
|
||||
deploy_context = ctx.obj
|
||||
|
|
@ -161,6 +162,7 @@ def up_operation(
|
|||
skip_cluster_management=skip_cluster_management,
|
||||
services=services_list,
|
||||
image_overrides=image_overrides,
|
||||
force_recreate=force_recreate,
|
||||
)
|
||||
for post_start_command in cluster_context.post_start_commands:
|
||||
_run_command(global_context, cluster_context.cluster, post_start_command)
|
||||
|
|
|
|||
|
|
@ -20,7 +20,14 @@ from typing import Optional
|
|||
|
||||
class Deployer(ABC):
|
||||
    @abstractmethod
    def up(
        self,
        detach,
        skip_cluster_management,
        services,
        image_overrides=None,
        force_recreate=False,
    ):
        """Bring the deployment's services up.

        Implemented by the concrete deployers (compose, k8s).

        Args:
            detach: run services in the background rather than attached.
            skip_cluster_management: skip backing-cluster create/teardown
                steps (presumably only meaningful for the k8s/kind backend
                -- confirm against the implementations).
            services: optional list of service names to start.
            image_overrides: optional image-override mapping; backends may
                ignore it.
            force_recreate: request recreation of containers even when the
                service definition is unchanged; backends may treat this as
                a no-op (see the k8s implementation's TODO).
        """
        pass
|
||||
|
||||
@abstractmethod
|
||||
|
|
|
|||
|
|
@ -471,12 +471,18 @@ def restart(ctx, stack_path, spec_file, config_file, force, expected_ip, image):
|
|||
ctx, deployment_context, maintenance_svc, image_overrides
|
||||
)
|
||||
else:
|
||||
# force_recreate=True so source-file edits (alert rules, dashboards,
|
||||
# entrypoint scripts, etc. mounted via bind volumes) are picked up.
|
||||
# docker compose up -d alone is a no-op when the service definition
|
||||
# itself is unchanged, leaving the running container with stale
|
||||
# in-memory state.
|
||||
up_operation(
|
||||
ctx,
|
||||
services_list=None,
|
||||
stay_attached=False,
|
||||
skip_cluster_management=True,
|
||||
image_overrides=image_overrides or None,
|
||||
force_recreate=True,
|
||||
)
|
||||
|
||||
# Restore cwd after both create_operation and up_operation have run.
|
||||
|
|
@ -514,12 +520,15 @@ def _restart_with_maintenance(
|
|||
|
||||
# Step 1: Apply the full deployment (creates/updates all pods + services)
|
||||
# This ensures maintenance pod exists before we swap Ingress to it.
|
||||
# force_recreate intent matches the non-maintenance restart path; the
|
||||
# k8s deployer currently ignores the flag (TODO in deploy_k8s.up).
|
||||
up_operation(
|
||||
ctx,
|
||||
services_list=None,
|
||||
stay_attached=False,
|
||||
skip_cluster_management=True,
|
||||
image_overrides=image_overrides or None,
|
||||
force_recreate=True,
|
||||
)
|
||||
|
||||
# Parse maintenance service spec: "container-name:port"
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ from stack_orchestrator.deploy.k8s.helpers import (
|
|||
create_cluster,
|
||||
destroy_cluster,
|
||||
get_kind_cluster,
|
||||
is_image_available_locally,
|
||||
load_images_into_kind,
|
||||
)
|
||||
from stack_orchestrator.deploy.k8s.helpers import (
|
||||
|
|
@ -833,16 +834,17 @@ class K8sDeployer(Deployer):
|
|||
actual_cluster = create_cluster(self.kind_cluster_name, kind_config)
|
||||
if actual_cluster != self.kind_cluster_name:
|
||||
self.kind_cluster_name = actual_cluster
|
||||
# Only load locally-built images into kind
|
||||
local_containers = self.deployment_context.stack.obj.get("containers", [])
|
||||
if local_containers:
|
||||
local_images = {
|
||||
images_to_preload = set((self.image_overrides or {}).values()) | {
|
||||
img
|
||||
for img in self.cluster_info.image_set
|
||||
if any(c in img for c in local_containers)
|
||||
}
|
||||
if local_images:
|
||||
load_images_into_kind(self.kind_cluster_name, local_images)
|
||||
images_to_preload = {
|
||||
img for img in images_to_preload if is_image_available_locally(img)
|
||||
}
|
||||
if images_to_preload:
|
||||
load_images_into_kind(self.kind_cluster_name, images_to_preload)
|
||||
elif self.is_kind():
|
||||
# --skip-cluster-management (default): cluster must already exist.
|
||||
# Without this check, connect_api() below raises a cryptic
|
||||
|
|
@ -985,7 +987,19 @@ class K8sDeployer(Deployer):
|
|||
else:
|
||||
raise
|
||||
|
||||
def up(self, detach, skip_cluster_management, services, image_overrides=None):
|
||||
def up(
|
||||
self,
|
||||
detach,
|
||||
skip_cluster_management,
|
||||
services,
|
||||
image_overrides=None,
|
||||
force_recreate=False,
|
||||
):
|
||||
# TODO: honor force_recreate by stamping the
|
||||
# kubectl.kubernetes.io/restartedAt annotation on managed
|
||||
# Deployments so a rollout occurs even when the manifest is
|
||||
# unchanged. Today this method is a no-op for that flag.
|
||||
# Tracked separately from the compose-side fix.
|
||||
# Merge spec-level image overrides with CLI overrides
|
||||
spec_overrides = self.cluster_info.spec.get("image-overrides", {})
|
||||
if spec_overrides:
|
||||
|
|
|
|||
|
|
@ -607,6 +607,14 @@ def update_caddy_ingress_image(caddy_image: str) -> bool:
|
|||
return True
|
||||
|
||||
|
||||
def is_image_available_locally(image: str) -> bool:
    """Return True when *image* exists in the local docker image store.

    Probes with `docker image inspect`, which exits 0 only for images
    that are present locally; the probe's output is discarded so nothing
    leaks to the caller's stdout/stderr.
    """
    probe = subprocess.run(
        ["docker", "image", "inspect", image],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
    return probe.returncode == 0
|
||||
|
||||
|
||||
def load_images_into_kind(kind_cluster_name: str, image_set: Set[str]):
|
||||
for image in image_set:
|
||||
result = _run_command(
|
||||
|
|
|
|||
Loading…
Reference in New Issue