stack-orchestrator/agave-stack/scripts/backlog.sh

235 lines
7.4 KiB
Bash
Raw Normal View History

#!/bin/bash
# Snapshot the deployments dataset and replicate it to the remotes below.
# Usage: backlog.sh [suffix]
#   suffix  optional label appended (after a dash) to the dated snapshot name
set -Eeuo pipefail
export PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
# Assign first, export second: `export VAR="$(cmd)"` would hide a failing `id`.
XDG_RUNTIME_DIR="/run/user/$(id -u)"
export XDG_RUNTIME_DIR
mkdir -p "$XDG_RUNTIME_DIR"
# optional suffix from command-line, prepend dash if non-empty
SUFFIX="${1:-}"
# The suffix becomes part of a ZFS snapshot name. Refuse anything outside
# alphanumerics, '_', '.', ':', '-' and space now, rather than letting
# `zfs snapshot` reject it later in the run.
suffix_re='^[A-Za-z0-9_.: -]*$'
if [[ ! "$SUFFIX" =~ $suffix_re ]]; then
  printf 'ERROR: invalid snapshot suffix: %s\n' "$SUFFIX" >&2
  exit 2
fi
SUFFIX="${SUFFIX:+-$SUFFIX}"
# define variables
DATASET="biscayne/DATA/deployments"
DEPLOYMENT_DIR="/srv/deployments/agave"
LOG_FILE="$HOME/.backlog_history"
ZFS_HOLD="backlog:pending"
SERVICE_STOP_TIMEOUT="300"
SNAPSHOT_RETENTION="6"
SNAPSHOT_PREFIX="backlog"
SNAPSHOT_TAG="$(date +%Y%m%d)${SUFFIX}"
SNAPSHOT="${DATASET}@${SNAPSHOT_PREFIX}-${SNAPSHOT_TAG}"
# remote replication targets, one "host:dataset" entry per destination
REMOTES=(
  "mysterio:edith/DATA/backlog/biscayne-main"
  "ardham:batterywharf/DATA/backlog/biscayne-main"
)
# log functions
# Append one timestamped line to $LOG_FILE.
# Arguments: $1 - message text
# Globals:   LOG_FILE (appended to)
# The timestamp is UTC, formatted as YYYY-MM-DDTHH:MM:SSZ inside brackets.
log() {
  local stamp
  stamp="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
# Log the total run time, then end the log entry with a blank line.
# Globals: start_time (read, epoch seconds), LOG_FILE (appended to)
log_close() {
  local now elapsed
  now="$(date +%s)"
  elapsed=$(( now - start_time ))
  log "Backlog completed in ${elapsed}s"
  printf '\n' >> "$LOG_FILE"
}
# service controls
# Stop or start the agave deployment through laconic-so.
# Arguments: $1 - "stop" or "start"
# Globals:   DEPLOYMENT_DIR, SERVICE_STOP_TIMEOUT (read)
# Returns:   exits the script with status 2 on an unknown action
#
# "stop" polls `docker ps` until no container carrying the label
# com.docker.compose.project.working_dir=$DEPLOYMENT_DIR is listed, or until
# SERVICE_STOP_TIMEOUT seconds have passed. A timeout is logged and the
# function still returns normally.
services() {
  local action="$1"
  case "$action" in
    stop)
      log "Stopping agave deployment..."
      laconic-so deployment --dir "$DEPLOYMENT_DIR" stop
      log "Waiting for services to fully stop..."
      local now deadline ids warned=0
      now=$(date +%s)
      deadline=$(( now + SERVICE_STOP_TIMEOUT ))
      while true; do
        # docker's stderr is discarded because this runs several times a
        # second; a failing query is reported once below instead of killing
        # the script (pipefail/errexit) while the deployment is stopped.
        if ids=$(docker ps --filter "label=com.docker.compose.project.working_dir=$DEPLOYMENT_DIR" -q 2>/dev/null); then
          if [[ -z "$ids" ]]; then
            break
          fi
        elif (( ! warned )); then
          log "WARNING: Unable to query docker for running containers; retrying until timeout."
          warned=1
        fi
        now=$(date +%s)
        if (( now >= deadline )); then
          log "WARNING: Timeout waiting for services to stop; continuing."
          break
        fi
        sleep 0.2
      done
      ;;
    start)
      log "Starting agave deployment..."
      laconic-so deployment --dir "$DEPLOYMENT_DIR" start
      ;;
    *)
      log "ERROR: Unknown action '$action' in services()"
      exit 2
      ;;
  esac
}
# send a snapshot to one remote
# args: snap remote_host remote_dataset
# Globals: DATASET, SNAPSHOT_PREFIX (read)
# Returns: 0 when the send succeeded; 1 when the remote could not be reached,
#          the send failed, or the remote has prefixed snapshots but none in
#          common with the local dataset.
#
# Snapshots are matched between the two sides by the name after '@'. The
# newest local snapshot that also exists remotely is used as the incremental
# base; a full send happens only when the remote lists no prefixed snapshots.
# NOTE(review): `zfs receive -s` keeps resumable state after an interrupted
# receive and nothing here resumes or aborts it — confirm that is handled
# elsewhere.
snapshot_send_one() {
  local snap="$1" remote_host="$2" remote_dataset="$3"
  local local_prefix="${DATASET}@${SNAPSHOT_PREFIX}-"
  local remote_prefix="${remote_dataset}@${SNAPSHOT_PREFIX}-"

  log "Checking remote snapshots on $remote_host..."
  local remote_list rc=0
  remote_list=$(ssh "$remote_host" zfs list -H -t snapshot -o name -s creation "$remote_dataset") || rc=$?
  # ssh itself exits 255 when it fails (as opposed to the remote command
  # failing). An unreachable host must not be mistaken for an empty
  # destination, because that would select a full send into `receive -F`.
  if (( rc == 255 )); then
    log "ERROR: Could not list snapshots on $remote_host (ssh failure) — skipping."
    return 1
  fi

  # index the remote snapshot names (the part after '@') for O(1) lookup
  local -A remote_tags=()
  local remote_count=0 line
  while IFS= read -r line; do
    if [[ "$line" == "$remote_prefix"* ]]; then
      remote_tags["${line#*@}"]=1
      remote_count=$((remote_count + 1))
    fi
  done <<< "$remote_list"

  # find latest common snapshot: the local list is sorted oldest first, so
  # the last match wins
  local base="" candidate tag
  while IFS= read -r candidate; do
    [[ "$candidate" == "$local_prefix"* ]] || continue
    tag="${candidate#*@}"
    if [[ -n "${remote_tags[$tag]+x}" ]]; then
      base="$candidate"
    fi
  done < <(zfs list -H -t snapshot -o name -s creation -d1 "$DATASET")

  if [[ -n "$base" ]]; then
    log "Common base snapshot $base found — sending incremental to $remote_host."
    if zfs send -i "$base" "$snap" | ssh "$remote_host" zfs receive -sF "$remote_dataset"; then
      log "Incremental send to $remote_host succeeded."
      return 0
    fi
    log "ERROR: Incremental send to $remote_host failed."
    return 1
  fi

  if (( remote_count == 0 )); then
    log "No remote snapshots found on $remote_host — sending full snapshot."
    if zfs send "$snap" | ssh "$remote_host" zfs receive -sF "$remote_dataset"; then
      log "Full send to $remote_host succeeded."
      return 0
    fi
    log "ERROR: Full send to $remote_host failed."
    return 1
  fi

  log "STALE DESTINATION: $remote_host has snapshots but no common base with local — skipping."
  return 1
}
# Replicate one snapshot to every destination listed in REMOTES.
# Arguments: $1 - snapshot to send
# Globals:   REMOTES (read) - entries of the form "host:dataset"
# Returns:   0 when every destination succeeded, 1 otherwise
# Every destination is attempted even if an earlier one fails.
snapshot_send() {
  local snap="$1"
  local failed=0
  local target host dataset
  set +e
  for target in "${REMOTES[@]}"; do
    # host is everything before the first ':', dataset everything after it
    host="${target%%:*}"
    dataset="${target#*:}"
    snapshot_send_one "$snap" "$host" "$dataset" || failed=$((failed + 1))
  done
  set -e
  if (( failed == 0 )); then
    return 0
  fi
  log "WARNING: $failed destination(s) failed or are out of sync."
  return 1
}
# snapshot management
# Create, replicate, or prune the prefixed snapshots of $DATASET.
# Arguments: $1 - "create", "send" or "prune"
# Globals:   SNAPSHOT, ZFS_HOLD, DATASET, SNAPSHOT_PREFIX, SNAPSHOT_RETENTION
# Returns:   exits the script with status 2 on an unknown action
#
#   create  takes $SNAPSHOT and places the $ZFS_HOLD hold on it
#   send    replicates $SNAPSHOT; the hold is released only when every
#           destination succeeded
#   prune   destroys the oldest unheld snapshots so that at most
#           $SNAPSHOT_RETENTION of them remain; held snapshots are neither
#           destroyed nor counted
snapshot() {
  local action="$1"
  case "$action" in
    create)
      log "Creating snapshot: $SNAPSHOT"
      zfs snapshot "$SNAPSHOT"
      zfs hold "$ZFS_HOLD" "$SNAPSHOT" || log "ERROR: Failed to hold $SNAPSHOT"
      ;;
    send)
      log "Sending snapshot $SNAPSHOT..."
      if snapshot_send "$SNAPSHOT"; then
        log "Snapshot send completed. Releasing hold."
        zfs release "$ZFS_HOLD" "$SNAPSHOT" || log "ERROR: Failed to release hold on $SNAPSHOT"
      else
        log "WARNING: Snapshot send encountered errors. Hold retained on $SNAPSHOT."
      fi
      ;;
    prune)
      if [[ "$SNAPSHOT_RETENTION" -gt 0 ]]; then
        log "Pruning old snapshots in $DATASET (retaining $SNAPSHOT_RETENTION destroyable snapshots)..."
        local -a all_snaps destroyable=()
        # oldest first, so the front of the list is what gets destroyed
        mapfile -t all_snaps < <(zfs list -H -t snapshot -o name -s creation -d1 "$DATASET" | grep -F "${DATASET}@${SNAPSHOT_PREFIX}-")
        local snap holds
        for snap in "${all_snaps[@]}"; do
          # Holds are read from the `userrefs` property instead of being
          # inferred from the dry run. NOTE(review): `zfs destroy -n` appears
          # not to consult holds — confirm; the dry run is kept as an
          # additional check either way. A held snapshot counted as
          # destroyable would eat into the retention quota and push unheld
          # snapshots out early.
          holds=$(zfs get -H -p -o value userrefs "$snap" 2>/dev/null) || holds=""
          if [[ "$holds" == "0" ]] && zfs destroy -n -- "$snap" &>/dev/null; then
            destroyable+=("$snap")
          else
            log "Skipping $snap — snapshot not destroyable (likely held)"
          fi
        done
        local count="${#destroyable[@]}"
        local to_destroy=$(( count - SNAPSHOT_RETENTION ))
        if (( to_destroy <= 0 )); then
          log "Nothing to prune — only $count destroyable snapshots exist"
        else
          for snap in "${destroyable[@]:0:to_destroy}"; do
            log "Destroying snapshot: $snap"
            if ! zfs destroy -- "$snap"; then
              log "WARNING: Failed to destroy $snap despite earlier check"
            fi
          done
        fi
      else
        log "Skipping pruning — retention is set to $SNAPSHOT_RETENTION"
      fi
      ;;
    *)
      log "ERROR: Snapshot unknown action: $action"
      exit 2
      ;;
  esac
}
# open logging and begin execution
mkdir -p "$(dirname -- "$LOG_FILE")"
start_time=$(date +%s)
# from here on, stdout and stderr of every command land in the log file
exec >> "$LOG_FILE" 2>&1

# 1 while the deployment is (or may be) stopped for the snapshot
services_stopped=0

# EXIT handler: if the run ends while the deployment is stopped, try to
# bring it back up, then close the log entry. It does not call `exit`, so
# the script's original exit status is preserved.
backlog_exit() {
  if (( services_stopped )); then
    services_stopped=0
    log "ERROR: Exiting while deployment is stopped — attempting restart."
    services start || log "ERROR: Failed to restart agave deployment"
  fi
  log_close
}
trap 'backlog_exit' EXIT
trap 'rc=$?; log "ERROR: command failed at line $LINENO (exit $rc)"; exit $rc' ERR
log "Backlog Started"
# Capture the listing before matching. With pipefail, `zfs list | grep -q`
# reports failure when grep exits early and zfs gets SIGPIPE, and also when
# zfs list itself fails, so an existing snapshot could go unnoticed. Here a
# failing `zfs list` aborts before anything is stopped.
existing_snaps=$(zfs list -H -t snapshot -o name -d1 "$DATASET")
if grep -qxF -- "$SNAPSHOT" <<< "$existing_snaps"; then
  log "WARNING: Snapshot $SNAPSHOT already exists. Exiting."
  exit 1
fi
services_stopped=1
services stop
snapshot create
services start
services_stopped=0
snapshot send
snapshot prune
# end