Add host-metrics stack: per-host system metrics via telegraf (#753)

- New host-metrics native stack: one telegraf container per host, pushes CPU/memory/disk/network/load/processes/swap to a user-specified InfluxDB 1.x endpoint
- `HOST_TAG` overrides the kernel hostname for the host tag; `COLLECT_ZFS=true` enables `inputs.zfs` (pool health + ARC stats)
- Offline shell tests cover marker substitution and required-env validation; no telegraf binary needed to run them
main
prathamesh0 2026-05-11 18:31:38 +05:30 committed by GitHub
parent 2ff7e5eb77
commit 3d703708c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 420 additions and 0 deletions

View File

@ -0,0 +1,33 @@
# Compose definition for the host-metrics stack: a single telegraf service
# that collects metrics about the machine it runs on.
version: '3.2'
services:
telegraf:
image: telegraf:1.36
restart: unless-stopped
# Host networking and the host PID namespace let telegraf observe the
# machine itself rather than an isolated container view.
network_mode: host
pid: host
# The mounted entrypoint script renders telegraf.conf from the template
# below and then execs telegraf.
entrypoint: ["/scripts/telegraf-entrypoint.sh"]
environment:
INFLUXDB_URL: ${INFLUXDB_URL}
INFLUXDB_DB: ${INFLUXDB_DB:-host_metrics}
# Container-side INFLUXDB_USER / INFLUXDB_PASSWORD are fed from the
# deployment's INFLUXDB_WRITE_USER / INFLUXDB_WRITE_PASSWORD.
INFLUXDB_USER: ${INFLUXDB_WRITE_USER}
INFLUXDB_PASSWORD: ${INFLUXDB_WRITE_PASSWORD}
COLLECT_INTERVAL: ${COLLECT_INTERVAL:-10s}
HOST_TAG: ${HOST_TAG:-}
COLLECT_ZFS: ${COLLECT_ZFS:-false}
volumes:
- ../config/host-metrics/telegraf.conf.tpl:/etc/telegraf/telegraf.conf.tpl:ro
- ../config/host-metrics/scripts/telegraf-entrypoint.sh:/scripts/telegraf-entrypoint.sh:ro
# Read-only views of the host's /proc, /sys and / under /hostfs; the
# entrypoint points telegraf at them via HOST_PROC, HOST_SYS and
# HOST_MOUNT_PREFIX.
- /proc:/hostfs/proc:ro
- /sys:/hostfs/sys:ro
- /:/hostfs:ro
# /dev is needed by inputs.diskio: it enumerates devices from
# /proc/diskstats and then opens /dev/<name> for udev/uevent lookups.
# Without this mount telegraf logs an "error reading /dev/<name>" warning
# per device per collection cycle.
- /dev:/dev:ro
# /run/udev is where modern systemd stores the udev database that
# gopsutil consults for per-device tags. Without it telegraf falls
# back to the legacy /dev/.udev/db/... path which doesn't exist on
# systemd hosts, producing "stat /dev/.udev/db/block:..." warnings.
- /run/udev:/run/udev:ro

View File

@ -0,0 +1,68 @@
#!/bin/sh
# host-metrics telegraf-entrypoint.sh
# Render telegraf.conf from telegraf.conf.tpl, then exec telegraf.
#
# Substitutions performed here (by awk):
# @@HOST_TAG_BLOCK@@ -> "[global_tags]\n host = \"$HOST_TAG\"" if set, else empty.
# @@ZFS_BLOCK@@ -> "[[inputs.zfs]]\n poolMetrics = true" if COLLECT_ZFS=true, else empty.
#
# HOST_TAG is written as a TOML basic string: backslashes and double quotes
# are escaped, and marker replacement is literal, so values containing
# "&", "\" or '"' end up verbatim in the host tag.
#
# Variables of the form ${VAR} in the template (INFLUXDB_URL, INFLUXDB_DB,
# INFLUXDB_USER, INFLUXDB_PASSWORD, COLLECT_INTERVAL) are resolved by
# telegraf's own env-var substitution at config-load time and are NOT
# touched by this script.
#
# TELEGRAF_CONF_DIR overrides the conf directory for tests; defaults to
# /etc/telegraf which is the standard path inside the official image.
set -eu

CONF_DIR="${TELEGRAF_CONF_DIR:-/etc/telegraf}"
TPL="$CONF_DIR/telegraf.conf.tpl"
OUT="$CONF_DIR/telegraf.conf"

# require_env NAME VALUE
# Abort with a clear message when VALUE is empty.
require_env() {
  if [ -z "$2" ]; then
    echo "FATAL: $1 is required but empty" >&2
    exit 1
  fi
}

# Fail-fast required env. Empty string counts as missing -- a half-rendered
# conf or a noisy telegraf auth error is worse than a clear startup failure.
require_env INFLUXDB_URL "${INFLUXDB_URL:-}"
require_env INFLUXDB_USER "${INFLUXDB_USER:-}"
require_env INFLUXDB_PASSWORD "${INFLUXDB_PASSWORD:-}"

# Apply defaults for optional vars.
: "${INFLUXDB_DB:=host_metrics}"
: "${COLLECT_INTERVAL:=10s}"
: "${HOST_TAG:=}"
: "${COLLECT_ZFS:=false}"
# telegraf resolves ${INFLUXDB_DB} and ${COLLECT_INTERVAL} from its own
# environment, so a default applied here must be exported to reach it when
# the variable was absent from the container environment.
export INFLUXDB_DB COLLECT_INTERVAL

# Build the marker substitutions. Use printf for the newline so the
# rendered block lands on its own line.
if [ -n "$HOST_TAG" ]; then
  # A raw newline cannot appear inside a TOML basic string; fail clearly
  # here rather than hand telegraf an unparsable config.
  nl='
'
  case $HOST_TAG in
    *"$nl"*)
      echo "FATAL: HOST_TAG must not contain a newline" >&2
      exit 1
      ;;
  esac
  # Escape backslash first, then double quote, for the TOML basic string.
  host_tag_toml=$(printf '%s\n' "$HOST_TAG" | sed -e 's/\\/\\\\/g' -e 's/"/\\"/g')
  HOST_TAG_BLOCK=$(printf '[global_tags]\n host = "%s"' "$host_tag_toml")
else
  HOST_TAG_BLOCK=""
fi

if [ "$COLLECT_ZFS" = "true" ]; then
  ZFS_BLOCK=$(printf '[[inputs.zfs]]\n poolMetrics = true')
else
  ZFS_BLOCK=""
fi

# Export telegraf hostfs envs so /proc, /sys, and root come from the
# bind-mount under /hostfs (set in compose).
export HOST_PROC=/hostfs/proc
export HOST_SYS=/hostfs/sys
export HOST_MOUNT_PREFIX=/hostfs

# Render with awk: handles multi-line replacement values cleanly,
# avoids sed's newline-in-replacement portability quirks across BusyBox /
# GNU / BSD sed.
#
# The values are handed over through the environment (ENVIRON) rather than
# -v, because -v assignments undergo backslash-escape processing, and they
# are spliced in with index()/substr() rather than gsub(), because gsub()
# expands "&" in the replacement to the matched text.
HOST_TAG_BLOCK="$HOST_TAG_BLOCK" ZFS_BLOCK="$ZFS_BLOCK" awk '
function replace_all(text, marker, value,    out, pos) {
  out = ""
  while ((pos = index(text, marker)) > 0) {
    out = out substr(text, 1, pos - 1) value
    text = substr(text, pos + length(marker))
  }
  return out text
}
{
  # ZFS marker first: its value is fixed text, so the user-supplied host
  # tag inserted afterwards is never rescanned for markers.
  line = replace_all($0, "@@ZFS_BLOCK@@", ENVIRON["ZFS_BLOCK"])
  line = replace_all(line, "@@HOST_TAG_BLOCK@@", ENVIRON["HOST_TAG_BLOCK"])
  print line
}
' "$TPL" > "$OUT"

exec telegraf --config "$OUT"

View File

@ -0,0 +1,121 @@
#!/bin/sh
# Offline tests for host-metrics telegraf-entrypoint.sh.
# Runs the real entrypoint against a throwaway conf dir with a stub
# `telegraf` placed first on PATH; no telegraf binary needed.
set -eu

# CDPATH= keeps `cd` from echoing the directory it resolved through CDPATH,
# which would otherwise end up inside SCRIPT_DIR.
SCRIPT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)
ENTRYPOINT="$SCRIPT_DIR/telegraf-entrypoint.sh"

# Setup problems are reported on stderr with exit status 2.
if [ ! -x "$ENTRYPOINT" ]; then
  echo "FATAL: $ENTRYPOINT not executable" >&2
  exit 2
fi

# Scratch tree holding the stub binary and the conf dir; removed on exit.
TMP=$(mktemp -d)
trap 'rm -rf "$TMP"' EXIT
mkdir -p "$TMP/bin" "$TMP/etc/telegraf"

# Stub telegraf so `exec telegraf` is a no-op.
cat > "$TMP/bin/telegraf" <<'EOF'
#!/bin/sh
exit 0
EOF
chmod +x "$TMP/bin/telegraf"

# Minimal template that exercises both markers.
cat > "$TMP/etc/telegraf/telegraf.conf.tpl" <<'EOF'
@@HOST_TAG_BLOCK@@
[agent]
interval = "${COLLECT_INTERVAL}"
[[outputs.influxdb]]
urls = ["${INFLUXDB_URL}"]
@@ZFS_BLOCK@@
EOF

# Counters updated by the assert helpers and reported at the end.
PASS=0
FAIL=0
# run sets required env defaults, then layers caller env on top.
#
# Executes the entrypoint with the stub telegraf first on PATH and the
# scratch conf dir, then prints the rendered telegraf.conf on stdout (the
# entrypoint's own stdout is discarded). Returns the entrypoint's exit
# status; the status is also left in the global `rc`.
run() {
  # Drop any conf left by a previous case so a run that fails before
  # rendering cannot print (or be judged on) stale output.
  rm -f "$TMP/etc/telegraf/telegraf.conf"
  rc=0
  # `|| rc=$?` records the status even when the caller runs under set -e.
  env PATH="$TMP/bin:$PATH" \
    TELEGRAF_CONF_DIR="$TMP/etc/telegraf" \
    INFLUXDB_URL="${INFLUXDB_URL-http://example/}" \
    INFLUXDB_USER="${INFLUXDB_USER-writer}" \
    INFLUXDB_PASSWORD="${INFLUXDB_PASSWORD-secret}" \
    INFLUXDB_DB="${INFLUXDB_DB-host_metrics}" \
    COLLECT_INTERVAL="${COLLECT_INTERVAL-10s}" \
    HOST_TAG="${HOST_TAG-}" \
    COLLECT_ZFS="${COLLECT_ZFS-false}" \
    sh "$ENTRYPOINT" >/dev/null || rc=$?
  if [ -f "$TMP/etc/telegraf/telegraf.conf" ]; then
    cat "$TMP/etc/telegraf/telegraf.conf"
  fi
  return "$rc"
}
# assert_grep NAME ACTUAL PATTERN
# Counts a PASS when ACTUAL matches the extended regex PATTERN; otherwise
# prints the expected pattern next to the actual text and counts a FAIL.
assert_grep() {
  name=$1
  actual=$2
  pattern=$3
  # Handle the mismatch first and bail out; the success path follows.
  if ! printf '%s' "$actual" | grep -qE "$pattern"; then
    echo "FAIL: $name"
    echo " expected pattern: $pattern"
    echo " actual: $actual"
    FAIL=$((FAIL + 1))
    return 0
  fi
  echo "PASS: $name"
  PASS=$((PASS + 1))
}
# assert_not_grep NAME ACTUAL PATTERN
# Counts a PASS when ACTUAL does NOT match the extended regex PATTERN;
# a match is reported together with the offending pattern and counts a FAIL.
assert_not_grep() {
  name=$1
  actual=$2
  pattern=$3
  # Success path first: no match means the assertion holds.
  if ! printf '%s' "$actual" | grep -qE "$pattern"; then
    echo "PASS: $name"
    PASS=$((PASS + 1))
    return 0
  fi
  echo "FAIL: $name (matched pattern $pattern)"
  FAIL=$((FAIL + 1))
}
# assert_required_error NAME VAR RC
# Counts a PASS when the preceding run exited non-zero (RC) and the stderr
# captured in $TMP/err names VAR; counts a FAIL otherwise.
assert_required_error() {
  if [ "$3" -ne 0 ] && grep -q "$2" "$TMP/err"; then
    echo "PASS: $1: missing $2 -> error"
    PASS=$((PASS + 1))
  else
    echo "FAIL: $1: missing $2 handling (rc=$3)"
    FAIL=$((FAIL + 1))
  fi
}

# T1: HOST_TAG empty -> no [global_tags] block emitted
out=$(HOST_TAG="" run)
assert_not_grep "T1: HOST_TAG empty -> no global_tags" "$out" '^\[global_tags\]'

# T2: HOST_TAG set -> [global_tags] block with host = "<value>"
out=$(HOST_TAG="validator-1" run)
assert_grep "T2: HOST_TAG set -> [global_tags] block" "$out" '^\[global_tags\]'
assert_grep "T2: HOST_TAG set -> host = \"validator-1\"" "$out" 'host = "validator-1"'

# T3: COLLECT_ZFS=true -> [[inputs.zfs]] block present
out=$(COLLECT_ZFS="true" run)
assert_grep "T3: COLLECT_ZFS true -> inputs.zfs block" "$out" '\[\[inputs\.zfs\]\]'

# T4: COLLECT_ZFS=false -> no inputs.zfs block
out=$(COLLECT_ZFS="false" run)
assert_not_grep "T4: COLLECT_ZFS false -> no inputs.zfs" "$out" '\[\[inputs\.zfs\]\]'

# T5: markers fully removed even when block bodies are empty
out=$(HOST_TAG="" COLLECT_ZFS="false" run)
assert_not_grep "T5: no leftover @@HOST_TAG_BLOCK@@" "$out" '@@HOST_TAG_BLOCK@@'
assert_not_grep "T5: no leftover @@ZFS_BLOCK@@" "$out" '@@ZFS_BLOCK@@'

# T6-T8 run inside a subshell: whether a `VAR=value func` assignment
# persists after the function returns is unspecified in POSIX, and a leaked
# empty value would make the later cases fail on the wrong variable.
# stdout is discarded because only the exit status and stderr matter here.

# T6: missing INFLUXDB_URL -> exit non-zero, error on stderr
rc=0
( INFLUXDB_URL="" run ) >/dev/null 2>"$TMP/err" || rc=$?
assert_required_error "T6" INFLUXDB_URL "$rc"

# T7: missing INFLUXDB_USER -> exit non-zero
rc=0
( INFLUXDB_USER="" run ) >/dev/null 2>"$TMP/err" || rc=$?
assert_required_error "T7" INFLUXDB_USER "$rc"

# T8: missing INFLUXDB_PASSWORD -> exit non-zero
rc=0
( INFLUXDB_PASSWORD="" run ) >/dev/null 2>"$TMP/err" || rc=$?
assert_required_error "T8" INFLUXDB_PASSWORD "$rc"

echo
echo "Results: $PASS passed, $FAIL failed"
# Exit status of the script: 0 only when no assertion failed.
[ "$FAIL" = "0" ]

View File

@ -0,0 +1,66 @@
# host-metrics telegraf template.
# Rendered at container start by telegraf-entrypoint.sh. The entrypoint
# replaces two single-line markers in this file with TOML block fragments;
# see telegraf-entrypoint.sh for the substitution details. All ${...}
# variables are resolved by telegraf's native env substitution at
# config-load time.
# Marker below: becomes a [global_tags] block with `host` when HOST_TAG is
# set, otherwise an empty line.
@@HOST_TAG_BLOCK@@
[agent]
# Collection and flush share one cadence (COLLECT_INTERVAL).
interval = "${COLLECT_INTERVAL}"
round_interval = true
collection_jitter = "0s"
flush_interval = "${COLLECT_INTERVAL}"
flush_jitter = "0s"
precision = "0s"
# NOTE(review): with omit_hostname = false telegraf itself sets the `host`
# tag from `hostname` (the OS hostname when empty). Confirm that a `host`
# value rendered into [global_tags] via @@HOST_TAG_BLOCK@@ is not
# overridden by this; if it is, HOST_TAG would need to feed `hostname`.
hostname = ""
omit_hostname = false
[[outputs.influxdb]]
urls = ["${INFLUXDB_URL}"]
database = "${INFLUXDB_DB}"
# The target database must already exist; telegraf does not try to create it.
skip_database_creation = true
username = "${INFLUXDB_USER}"
password = "${INFLUXDB_PASSWORD}"
retention_policy = ""
write_consistency = "any"
timeout = "10s"
[[inputs.cpu]]
# Aggregate CPU only (no per-core series).
percpu = false
totalcpu = true
collect_cpu_time = false
report_active = true
[[inputs.mem]]
[[inputs.swap]]
[[inputs.system]]
[[inputs.processes]]
[[inputs.disk]]
# gopsutil with HOST_MOUNT_PREFIX=/hostfs strips the /hostfs prefix
# from /proc/mounts entries, so the mountpoints telegraf sees are
# the host's real paths (/, /boot, /home, ...). No mount_points
# filter; let ignore_fs do the noise reduction.
ignore_fs = ["tmpfs", "devtmpfs", "devfs", "iso9660", "overlay",
"aufs", "squashfs", "nsfs", "tracefs", "proc", "sysfs",
"cgroup", "cgroup2", "fuse.lxcfs"]
[[inputs.diskio]]
device_tags = ["DEVNAME"]
skip_serial_number = true
name_templates = ["$DEVNAME"]
[[inputs.net]]
# Allowlist covers physical ethernet (eth*, en*), wireless (wl*),
# cellular (wwan*), and bonded/teamed interfaces (bond*). Excludes
# docker bridges, veth pairs, tun/tap, and lo. Adjust per host if
# you need a more specific scope.
ignore_protocol_stats = true
interfaces = ["eth*", "en*", "wl*", "wwan*", "bond*"]
# Marker below: becomes an [[inputs.zfs]] block when COLLECT_ZFS=true,
# otherwise an empty line.
@@ZFS_BLOCK@@

View File

@ -0,0 +1,127 @@
# host-metrics stack
Per-host system metrics collector. Runs telegraf with host networking, host
PID namespace, and read-only bind mounts of /proc, /sys, and / so it can
report real CPU, memory, disk, network, and process metrics for the machine
it runs on. Writes to an InfluxDB 1.x endpoint of your choosing.
Deploy one instance per machine you want monitored.
## What gets collected
| Input | Measurements (in InfluxDB) |
|-------|----------------------------|
| inputs.cpu (totalcpu only) | cpu (`cpu=cpu-total`) |
| inputs.mem | mem |
| inputs.swap | swap |
| inputs.system | system (uptime, load1/5/15, n_users, n_cpus) |
| inputs.processes | processes (running/sleeping/blocked/zombies) |
| inputs.disk | disk (used/free/used_percent per mount) |
| inputs.diskio | diskio (read/write bytes/ops per device) |
| inputs.net | net (bytes/packets/err in/out per interface) |
| inputs.zfs (opt-in via COLLECT_ZFS=true) | zfs (ARC stats, pool state) |
All rows are tagged with `host` (kernel hostname, or `HOST_TAG` override).
## Deploy
### Create a spec
```bash
laconic-so --stack host-metrics deploy init --output spec-host-metrics.yml
```
Edit `spec-host-metrics.yml` to look like:
```yaml
stack: host-metrics
deploy-to: compose
credentials-files:
- ~/.credentials/host-metrics.env
config:
INFLUXDB_URL: 'https://influxdb.example.com'
INFLUXDB_DB: 'host_metrics' # default; override for a custom DB
HOST_TAG: 'validator-1' # optional; defaults to kernel hostname
COLLECT_INTERVAL: '10s' # telegraf collection + flush cadence
COLLECT_ZFS: 'false' # set to 'true' on ZFS hosts
```
`~/.credentials/host-metrics.env` must contain:
```
INFLUXDB_WRITE_USER=<writer-username>
INFLUXDB_WRITE_PASSWORD=<writer-password>
```
These are issued by the InfluxDB admin (the monitoring host operator); they
are the same writer-only credentials used by validators/RPCs to push agave
metrics.
### Create and start
```bash
laconic-so --stack host-metrics deploy create \
--spec-file spec-host-metrics.yml \
--deployment-dir ./deployment-host-metrics
laconic-so deployment --dir ./deployment-host-metrics start
```
`deploy create` builds the deployment dir from the spec; `deployment start`
brings the containers up. The `--stack` option is required for `deploy`
subcommands but rejected on `deployment` subcommands (the deployment dir
already knows its stack).
### Verify
```bash
laconic-so deployment --dir ./deployment-host-metrics logs telegraf | head
```
Expected: telegraf prints its startup banner and `Loaded inputs: ...`. No
errors about missing config or auth failures.
Within ~20 seconds, the host's data appears in the InfluxDB endpoint's
`host_metrics` database (or whichever DB you set in INFLUXDB_DB) and in
any Grafana dashboards bound to that DB.
## Configuration reference
| Env | Required | Default | Notes |
|-----|----------|---------|-------|
| `INFLUXDB_URL` | yes | - | Full URL including scheme. Example: `https://influxdb.example.com`. |
| `INFLUXDB_DB` | no | `host_metrics` | Target database. Must exist (writer is not granted CREATE). |
| `INFLUXDB_WRITE_USER` | yes | - | Writer-only user. |
| `INFLUXDB_WRITE_PASSWORD` | yes | - | Writer-only password. |
| `COLLECT_INTERVAL` | no | `10s` | Telegraf collection and flush cadence. |
| `HOST_TAG` | no | empty | Overrides the kernel hostname for the `host` tag on every row. Useful when a VM has a generic hostname. |
| `COLLECT_ZFS` | no | `false` | Set to `true` to enable `inputs.zfs` (pool state + ARC stats). |
## ZFS hosts
`inputs.disk` already reports used/free per mount for any filesystem type
including ZFS, so the disk-usage view works out of the box. Setting
`COLLECT_ZFS=true` additionally enables `inputs.zfs` which reads
`/proc/spl/kstat/zfs/...` and emits ARC hit ratio, ARC size, and per-pool
health metrics. The bind mount of `/proc` provides the necessary
visibility; no extra mounts are needed.
If you set `COLLECT_ZFS=true` on a non-ZFS host, telegraf logs an error
once per collection cycle and skips the input. Harmless but noisy; leave
the toggle off on non-ZFS machines.
## Troubleshooting
| Symptom | Likely cause |
|---------|-------------|
| Container fails to start with `FATAL: INFLUXDB_URL is required but empty` | Missing required env. Check `spec-host-metrics.yml` and the credentials file. |
| Container starts, no rows appear in InfluxDB | Writer credentials wrong, or InfluxDB unreachable from this host's network. Check `docker logs <telegraf>` for `Post ... 401` / `connection refused`. |
| Two hosts overwriting each other's series | Both use the same kernel hostname. Set distinct `HOST_TAG` values. |
| `inputs.processes` reports only 1 process | `pid: host` missing from compose. Re-deploy. |
## Caveats
- Requires Docker with privileges to bind-mount `/`, `/proc`, `/sys`, and to
share the host PID namespace. Rootless Docker installations may refuse
`pid: host` and the `/` bind mount.
- One deployment per host. Running two on the same machine writes
duplicate rows under the same `host` tag.

View File

@ -0,0 +1,5 @@
# Stack definition for host-metrics: declares a single pod, host-metrics.
version: "1.1"
name: host-metrics
description: "Per-host system metrics collector (telegraf -> InfluxDB)"
pods:
- host-metrics