Fix restart command for GitOps deployments

- Remove init_operation() from restart - don't regenerate spec from
  commands.py defaults, use existing git-tracked spec.yml instead
- Add docs/deployment_patterns.md documenting GitOps workflow
- Add pre-commit rule to CLAUDE.md
- Fix line length issues in helpers.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
fix-kind-relative-volume-paths
A. F. Dudley 2026-02-02 22:18:19 -05:00
parent be334ca39f
commit ca3153bb78
5 changed files with 158 additions and 81 deletions

View File

@ -8,6 +8,7 @@ NEVER assume your hypotheses are true without evidence
ALWAYS clearly state when something is a hypothesis ALWAYS clearly state when something is a hypothesis
ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses
ALWAYS run `pre-commit run --all-files` before committing changes
## Key Principles ## Key Principles

View File

@ -0,0 +1,77 @@
# Deployment Patterns
## GitOps Pattern
For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control.
### Overview
- **spec.yml is your source of truth**: Maintain it in your operator repository
- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit
- **Use restart for updates**: The restart command respects your git-tracked spec.yml
### Workflow
1. **Initial setup**: Run `deploy init` once to generate a spec.yml template
2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo
3. **Deploy from git**: Use the committed spec.yml for deployments
4. **Update via git**: Make changes in git, then restart to apply
```bash
# Initial setup (run once)
laconic-so --stack my-stack deploy init --output spec.yml
# Customize for your environment
vim spec.yml # Set hostname, resources, etc.
# Commit to your operator repository
git add spec.yml
git commit -m "Add my-stack deployment configuration"
git push
# On deployment server: deploy from git-tracked spec
laconic-so deploy create \
--spec-file /path/to/operator-repo/spec.yml \
--deployment-dir my-deployment
laconic-so deployment --dir my-deployment start
```
### Updating Deployments
When you need to update a deployment:
```bash
# 1. Make changes in your operator repo
vim /path/to/operator-repo/spec.yml
git commit -am "Update configuration"
git push
# 2. On deployment server: pull and restart
cd /path/to/operator-repo && git pull
laconic-so deployment --dir my-deployment restart
```
The `restart` command:
- Pulls latest code from the stack repository
- Uses your git-tracked spec.yml (does NOT regenerate from defaults)
- Syncs the deployment directory
- Restarts services
### Anti-patterns
**Don't do this:**
```bash
# BAD: Regenerating spec on every deployment
laconic-so --stack my-stack deploy init --output spec.yml
laconic-so deploy create --spec-file spec.yml ...
```
This overwrites your customizations with defaults from the stack's `commands.py`.
**Do this instead:**
```bash
# GOOD: Use your git-tracked spec
git pull # Get latest spec.yml from your operator repo
laconic-so deployment --dir my-deployment restart
```

View File

@ -17,7 +17,6 @@ import click
from pathlib import Path from pathlib import Path
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
from stack_orchestrator import constants from stack_orchestrator import constants
from stack_orchestrator.deploy.images import push_images_operation from stack_orchestrator.deploy.images import push_images_operation
@ -248,13 +247,13 @@ def run_job(ctx, job_name, helm_release):
) )
@click.pass_context @click.pass_context
def restart(ctx, stack_path, config_file, force, expected_ip): def restart(ctx, stack_path, config_file, force, expected_ip):
"""Pull latest stack, regenerate spec, and restart deployment. """Pull latest code and restart deployment using git-tracked spec.
This command: GitOps workflow:
1. Pulls latest code from the stack git repository 1. Operator maintains spec.yml in their git repository
2. Regenerates spec.yml from the stack's commands.py 2. This command pulls latest code (including updated spec.yml)
3. If hostname changed, verifies DNS routes to this server 3. If hostname changed, verifies DNS routes to this server
4. Syncs the deployment directory (preserves cluster ID and data) 4. Syncs deployment directory with the git-tracked spec
5. Stops and restarts the deployment 5. Stops and restarts the deployment
Data volumes are always preserved. The cluster is never destroyed. Data volumes are always preserved. The cluster is never destroyed.
@ -264,19 +263,17 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
2. stack-source field in deployment.yml (if stored) 2. stack-source field in deployment.yml (if stored)
3. Error if neither available 3. Error if neither available
Note: After restart, Caddy will automatically provision TLS certificates Note: spec.yml should be maintained in git, not regenerated from
for any new hostnames. commands.py on each restart. Use 'deploy init' only for initial
spec generation, then customize and commit to your operator repo.
""" """
from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec
from stack_orchestrator.deploy.deployment_create import ( from stack_orchestrator.deploy.deployment_create import create_operation
init_operation,
create_operation,
)
from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe
deployment_context: DeploymentContext = ctx.obj deployment_context: DeploymentContext = ctx.obj
# Get current spec info # Get current spec info (before git pull)
current_spec = deployment_context.spec current_spec = deployment_context.spec
current_http_proxy = current_spec.get_http_proxy() current_http_proxy = current_spec.get_http_proxy()
current_hostname = ( current_hostname = (
@ -310,8 +307,8 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
print(f"Stack source: {stack_source}") print(f"Stack source: {stack_source}")
print(f"Current hostname: {current_hostname}") print(f"Current hostname: {current_hostname}")
# Step 1: Git pull # Step 1: Git pull (brings in updated spec.yml from operator's repo)
print("\n[1/6] Pulling latest code from stack repository...") print("\n[1/4] Pulling latest code from stack repository...")
git_result = subprocess.run( git_result = subprocess.run(
["git", "pull"], cwd=stack_source, capture_output=True, text=True ["git", "pull"], cwd=stack_source, capture_output=True, text=True
) )
@ -320,36 +317,23 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
sys.exit(1) sys.exit(1)
print(f"Git pull: {git_result.stdout.strip()}") print(f"Git pull: {git_result.stdout.strip()}")
# Step 2: Regenerate spec # Use the spec.yml from the deployment directory (updated by git pull if tracked)
print("\n[2/6] Regenerating spec from commands.py...") spec_file_path = deployment_context.deployment_dir / "spec.yml"
with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp: if not spec_file_path.exists():
new_spec_path = tmp.name print(f"Error: spec.yml not found at {spec_file_path}")
print("Ensure spec.yml exists in the deployment directory.")
sys.exit(1)
# Build deploy context for init # Parse spec to check for hostname changes
deploy_ctx = make_deploy_context(ctx) new_spec_obj = get_parsed_deployment_spec(str(spec_file_path))
init_operation(
deploy_command_context=deploy_ctx,
stack=str(stack_source),
deployer_type=current_spec.obj[constants.deploy_to_key],
config=None,
config_file=config_file,
kube_config=None,
image_registry=None,
output=new_spec_path,
map_ports_to_host=None,
)
# Parse new spec to get new hostname
new_spec_obj = get_parsed_deployment_spec(new_spec_path)
new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", [])
new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None
print(f"New hostname: {new_hostname}") print(f"Spec hostname: {new_hostname}")
# Step 3: DNS verification (only if hostname changed) # Step 2: DNS verification (only if hostname changed)
if new_hostname and new_hostname != current_hostname: if new_hostname and new_hostname != current_hostname:
print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}") print(f"\n[2/4] Hostname changed: {current_hostname} -> {new_hostname}")
if force: if force:
print("DNS verification skipped (--force)") print("DNS verification skipped (--force)")
else: else:
@ -360,25 +344,26 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
print("Use --force to skip this check.") print("Use --force to skip this check.")
sys.exit(1) sys.exit(1)
else: else:
print("\n[3/6] Hostname unchanged, skipping DNS verification") print("\n[2/4] Hostname unchanged, skipping DNS verification")
# Step 4: Sync deployment directory # Step 3: Sync deployment directory with spec
print("\n[4/6] Syncing deployment directory...") print("\n[3/4] Syncing deployment directory...")
deploy_ctx = make_deploy_context(ctx)
create_operation( create_operation(
deployment_command_context=deploy_ctx, deployment_command_context=deploy_ctx,
spec_file=new_spec_path, spec_file=str(spec_file_path),
deployment_dir=str(deployment_context.deployment_dir), deployment_dir=str(deployment_context.deployment_dir),
update=True, update=True,
network_dir=None, network_dir=None,
initial_peers=None, initial_peers=None,
) )
# Reload deployment context with new spec # Reload deployment context with updated spec
deployment_context.init(deployment_context.deployment_dir) deployment_context.init(deployment_context.deployment_dir)
ctx.obj = deployment_context ctx.obj = deployment_context
# Step 5: Stop deployment # Stop deployment
print("\n[5/6] Stopping deployment...") print("\n[4/4] Restarting deployment...")
ctx.obj = make_deploy_context(ctx) ctx.obj = make_deploy_context(ctx)
down_operation( down_operation(
ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
@ -387,17 +372,13 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
# Brief pause to ensure clean shutdown # Brief pause to ensure clean shutdown
time.sleep(5) time.sleep(5)
# Step 6: Start deployment # Start deployment
print("\n[6/6] Starting deployment...")
up_operation( up_operation(
ctx, services_list=None, stay_attached=False, skip_cluster_management=True ctx, services_list=None, stay_attached=False, skip_cluster_management=True
) )
print("\n=== Restart Complete ===") print("\n=== Restart Complete ===")
print("Deployment restarted with updated configuration.") print("Deployment restarted with git-tracked configuration.")
if new_hostname and new_hostname != current_hostname: if new_hostname and new_hostname != current_hostname:
print(f"\nNew hostname: {new_hostname}") print(f"\nNew hostname: {new_hostname}")
print("Caddy will automatically provision TLS certificate.") print("Caddy will automatically provision TLS certificate.")
# Cleanup temp file
Path(new_spec_path).unlink(missing_ok=True)

View File

@ -123,6 +123,9 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
specific stale resources (blacklist), we keep only the valuable data specific stale resources (blacklist), we keep only the valuable data
(caddy TLS certs) and delete everything else (whitelist approach). (caddy TLS certs) and delete everything else (whitelist approach).
The etcd image is distroless (no shell), so we extract the statically-linked
etcdctl binary and run it from alpine which has shell support.
Returns True if cleanup succeeded, False if no action needed or failed. Returns True if cleanup succeeded, False if no action needed or failed.
""" """
db_path = Path(etcd_path) / "member" / "snap" / "db" db_path = Path(etcd_path) / "member" / "snap" / "db"
@ -146,14 +149,26 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
# Whitelist: prefixes to KEEP - everything else gets deleted # Whitelist: prefixes to KEEP - everything else gets deleted
keep_prefixes = "/registry/secrets/caddy-system" keep_prefixes = "/registry/secrets/caddy-system"
# All operations in docker to handle root-owned etcd files # The etcd image is distroless (no shell). We extract the statically-linked
# etcdctl binary and run it from alpine which has shell + jq support.
cleanup_script = f""" cleanup_script = f"""
set -e set -e
ALPINE_IMAGE="alpine:3.19" ALPINE_IMAGE="alpine:3.19"
# Cleanup previous runs
docker rm -f laconic-etcd-cleanup 2>/dev/null || true
docker rm -f etcd-extract 2>/dev/null || true
docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
# Create temp dir # Create temp dir
docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ docker run --rm -v /tmp:/tmp $ALPINE_IMAGE mkdir -p {temp_dir}
sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}"
# Extract etcdctl binary (it's statically linked)
docker create --name etcd-extract {etcd_image}
docker cp etcd-extract:/usr/local/bin/etcdctl /tmp/etcdctl-bin
docker rm etcd-extract
docker run --rm -v /tmp/etcdctl-bin:/src:ro -v {temp_dir}:/dst $ALPINE_IMAGE \
sh -c "cp /src /dst/etcdctl && chmod +x /dst/etcdctl"
# Copy db to temp location # Copy db to temp location
docker run --rm \ docker run --rm \
@ -166,8 +181,7 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
etcdutl snapshot restore /work/etcd-snapshot.db \ etcdutl snapshot restore /work/etcd-snapshot.db \
--data-dir=/work/etcd-data --skip-hash-check 2>/dev/null --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null
# Start temp etcd # Start temp etcd (runs the etcd binary, no shell needed)
docker rm -f laconic-etcd-cleanup 2>/dev/null || true
docker run -d --name laconic-etcd-cleanup \ docker run -d --name laconic-etcd-cleanup \
-v {temp_dir}/etcd-data:/etcd-data \ -v {temp_dir}/etcd-data:/etcd-data \
-v {temp_dir}:/backup \ -v {temp_dir}:/backup \
@ -178,31 +192,34 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
sleep 3 sleep 3
# Export caddy secrets to backup file (the only thing we keep) # Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
docker exec laconic-etcd-cleanup \ # Export caddy secrets
etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \ docker run --rm \
2>/dev/null || echo '{{}}' > {temp_dir}/kept.json -v {temp_dir}:/backup \
--network container:laconic-etcd-cleanup \
$ALPINE_IMAGE sh -c \
'/backup/etcdctl get --prefix "{keep_prefixes}" -w json \
> /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json'
# Delete ALL registry keys # Delete ALL registry keys
docker exec laconic-etcd-cleanup etcdctl del --prefix /registry docker run --rm \
-v {temp_dir}:/backup \
--network container:laconic-etcd-cleanup \
$ALPINE_IMAGE /backup/etcdctl del --prefix /registry
# Restore kept keys using etcdctl txn # Restore kept keys using jq
docker exec laconic-etcd-cleanup sh -c ' docker run --rm \
cat /backup/kept.json 2>/dev/null | \ -v {temp_dir}:/backup \
(python3 -c " --network container:laconic-etcd-cleanup \
import sys, json, base64 $ALPINE_IMAGE sh -c '
try: apk add --no-cache jq >/dev/null 2>&1
data = json.load(sys.stdin) jq -r ".kvs[] | @base64" /backup/kept.json 2>/dev/null | \
for kv in data.get(\"kvs\", []): while read encoded; do
k = base64.b64decode(kv[\"key\"]).decode() key=$(echo $encoded | base64 -d | jq -r ".key" | base64 -d)
v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\") val=$(echo $encoded | base64 -d | jq -r ".value" | base64 -d)
print(k) echo "$val" | /backup/etcdctl put "$key"
print(v) done
except: pass ' || true
" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do
printf \"%s\" \"$value\" | etcdctl put \"$key\"
done
' 2>/dev/null || true
# Save cleaned snapshot # Save cleaned snapshot
docker exec laconic-etcd-cleanup \ docker exec laconic-etcd-cleanup \
@ -228,8 +245,9 @@ except: pass
docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \
sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member" sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member"
# Cleanup temp (but NOT the backup) # Cleanup temp files (but NOT the timestamped backup in etcd_path)
docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
rm -f /tmp/etcdctl-bin
""" """
result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True) result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True)

View File

@ -180,7 +180,7 @@ class Spec:
return self.obj.get(constants.deploy_to_key) return self.obj.get(constants.deploy_to_key)
def get_acme_email(self): def get_acme_email(self):
return self.obj.get(constants.acme_email_key, "") return self.obj.get(constants.network_key, {}).get(constants.acme_email_key, "")
def is_kubernetes_deployment(self): def is_kubernetes_deployment(self):
return self.get_deployment_type() in [ return self.get_deployment_type() in [