Fix restart command for GitOps deployments

- Remove init_operation() from restart - don't regenerate spec from
  commands.py defaults, use existing git-tracked spec.yml instead
- Add docs/deployment_patterns.md documenting GitOps workflow
- Add pre-commit rule to CLAUDE.md
- Fix line length issues in helpers.py

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
fix-kind-relative-volume-paths
A. F. Dudley 2026-02-02 22:18:19 -05:00
parent be334ca39f
commit ca3153bb78
5 changed files with 158 additions and 81 deletions

View File

@ -8,6 +8,7 @@ NEVER assume your hypotheses are true without evidence
ALWAYS clearly state when something is a hypothesis ALWAYS clearly state when something is a hypothesis
ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses
ALWAYS run `pre-commit run --all-files` before committing changes
## Key Principles ## Key Principles

View File

@ -0,0 +1,77 @@
# Deployment Patterns
## GitOps Pattern
For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control.
### Overview
- **spec.yml is your source of truth**: Maintain it in your operator repository
- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit
- **Use restart for updates**: The restart command respects your git-tracked spec.yml
### Workflow
1. **Initial setup**: Run `deploy init` once to generate a spec.yml template
2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo
3. **Deploy from git**: Use the committed spec.yml for deployments
4. **Update via git**: Make changes in git, then restart to apply
```bash
# Initial setup (run once)
laconic-so --stack my-stack deploy init --output spec.yml
# Customize for your environment
vim spec.yml # Set hostname, resources, etc.
# Commit to your operator repository
git add spec.yml
git commit -m "Add my-stack deployment configuration"
git push
# On deployment server: deploy from git-tracked spec
laconic-so deploy create \
--spec-file /path/to/operator-repo/spec.yml \
--deployment-dir my-deployment
laconic-so deployment --dir my-deployment start
```
### Updating Deployments
When you need to update a deployment:
```bash
# 1. Make changes in your operator repo
vim /path/to/operator-repo/spec.yml
git commit -am "Update configuration"
git push
# 2. On deployment server: pull and restart
cd /path/to/operator-repo && git pull
laconic-so deployment --dir my-deployment restart
```
The `restart` command:
- Pulls latest code from the stack repository
- Uses your git-tracked spec.yml (does NOT regenerate from defaults)
- Syncs the deployment directory
- Restarts services
### Anti-patterns
**Don't do this:**
```bash
# BAD: Regenerating spec on every deployment
laconic-so --stack my-stack deploy init --output spec.yml
laconic-so deploy create --spec-file spec.yml ...
```
This overwrites your customizations with defaults from the stack's `commands.py`.
**Do this instead:**
```bash
# GOOD: Use your git-tracked spec
git pull # Get latest spec.yml from your operator repo
laconic-so deployment --dir my-deployment restart
```

View File

@ -17,7 +17,6 @@ import click
from pathlib import Path from pathlib import Path
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
from stack_orchestrator import constants from stack_orchestrator import constants
from stack_orchestrator.deploy.images import push_images_operation from stack_orchestrator.deploy.images import push_images_operation
@ -248,13 +247,13 @@ def run_job(ctx, job_name, helm_release):
) )
@click.pass_context @click.pass_context
def restart(ctx, stack_path, config_file, force, expected_ip): def restart(ctx, stack_path, config_file, force, expected_ip):
"""Pull latest stack, regenerate spec, and restart deployment. """Pull latest code and restart deployment using git-tracked spec.
This command: GitOps workflow:
1. Pulls latest code from the stack git repository 1. Operator maintains spec.yml in their git repository
2. Regenerates spec.yml from the stack's commands.py 2. This command pulls latest code (including updated spec.yml)
3. If hostname changed, verifies DNS routes to this server 3. If hostname changed, verifies DNS routes to this server
4. Syncs the deployment directory (preserves cluster ID and data) 4. Syncs deployment directory with the git-tracked spec
5. Stops and restarts the deployment 5. Stops and restarts the deployment
Data volumes are always preserved. The cluster is never destroyed. Data volumes are always preserved. The cluster is never destroyed.
@ -264,19 +263,17 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
2. stack-source field in deployment.yml (if stored) 2. stack-source field in deployment.yml (if stored)
3. Error if neither available 3. Error if neither available
Note: After restart, Caddy will automatically provision TLS certificates Note: spec.yml should be maintained in git, not regenerated from
for any new hostnames. commands.py on each restart. Use 'deploy init' only for initial
spec generation, then customize and commit to your operator repo.
""" """
from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec from stack_orchestrator.util import get_yaml, get_parsed_deployment_spec
from stack_orchestrator.deploy.deployment_create import ( from stack_orchestrator.deploy.deployment_create import create_operation
init_operation,
create_operation,
)
from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe from stack_orchestrator.deploy.dns_probe import verify_dns_via_probe
deployment_context: DeploymentContext = ctx.obj deployment_context: DeploymentContext = ctx.obj
# Get current spec info # Get current spec info (before git pull)
current_spec = deployment_context.spec current_spec = deployment_context.spec
current_http_proxy = current_spec.get_http_proxy() current_http_proxy = current_spec.get_http_proxy()
current_hostname = ( current_hostname = (
@ -310,8 +307,8 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
print(f"Stack source: {stack_source}") print(f"Stack source: {stack_source}")
print(f"Current hostname: {current_hostname}") print(f"Current hostname: {current_hostname}")
# Step 1: Git pull # Step 1: Git pull (brings in updated spec.yml from operator's repo)
print("\n[1/6] Pulling latest code from stack repository...") print("\n[1/4] Pulling latest code from stack repository...")
git_result = subprocess.run( git_result = subprocess.run(
["git", "pull"], cwd=stack_source, capture_output=True, text=True ["git", "pull"], cwd=stack_source, capture_output=True, text=True
) )
@ -320,36 +317,23 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
sys.exit(1) sys.exit(1)
print(f"Git pull: {git_result.stdout.strip()}") print(f"Git pull: {git_result.stdout.strip()}")
# Step 2: Regenerate spec # Use the spec.yml from the deployment directory (updated by git pull if tracked)
print("\n[2/6] Regenerating spec from commands.py...") spec_file_path = deployment_context.deployment_dir / "spec.yml"
with tempfile.NamedTemporaryFile(mode="w", suffix=".yml", delete=False) as tmp: if not spec_file_path.exists():
new_spec_path = tmp.name print(f"Error: spec.yml not found at {spec_file_path}")
print("Ensure spec.yml exists in the deployment directory.")
sys.exit(1)
# Build deploy context for init # Parse spec to check for hostname changes
deploy_ctx = make_deploy_context(ctx) new_spec_obj = get_parsed_deployment_spec(str(spec_file_path))
init_operation(
deploy_command_context=deploy_ctx,
stack=str(stack_source),
deployer_type=current_spec.obj[constants.deploy_to_key],
config=None,
config_file=config_file,
kube_config=None,
image_registry=None,
output=new_spec_path,
map_ports_to_host=None,
)
# Parse new spec to get new hostname
new_spec_obj = get_parsed_deployment_spec(new_spec_path)
new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", []) new_http_proxy = new_spec_obj.get("network", {}).get("http-proxy", [])
new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None new_hostname = new_http_proxy[0]["host-name"] if new_http_proxy else None
print(f"New hostname: {new_hostname}") print(f"Spec hostname: {new_hostname}")
# Step 3: DNS verification (only if hostname changed) # Step 2: DNS verification (only if hostname changed)
if new_hostname and new_hostname != current_hostname: if new_hostname and new_hostname != current_hostname:
print(f"\n[3/6] Hostname changed: {current_hostname} -> {new_hostname}") print(f"\n[2/4] Hostname changed: {current_hostname} -> {new_hostname}")
if force: if force:
print("DNS verification skipped (--force)") print("DNS verification skipped (--force)")
else: else:
@ -360,25 +344,26 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
print("Use --force to skip this check.") print("Use --force to skip this check.")
sys.exit(1) sys.exit(1)
else: else:
print("\n[3/6] Hostname unchanged, skipping DNS verification") print("\n[2/4] Hostname unchanged, skipping DNS verification")
# Step 4: Sync deployment directory # Step 3: Sync deployment directory with spec
print("\n[4/6] Syncing deployment directory...") print("\n[3/4] Syncing deployment directory...")
deploy_ctx = make_deploy_context(ctx)
create_operation( create_operation(
deployment_command_context=deploy_ctx, deployment_command_context=deploy_ctx,
spec_file=new_spec_path, spec_file=str(spec_file_path),
deployment_dir=str(deployment_context.deployment_dir), deployment_dir=str(deployment_context.deployment_dir),
update=True, update=True,
network_dir=None, network_dir=None,
initial_peers=None, initial_peers=None,
) )
# Reload deployment context with new spec # Reload deployment context with updated spec
deployment_context.init(deployment_context.deployment_dir) deployment_context.init(deployment_context.deployment_dir)
ctx.obj = deployment_context ctx.obj = deployment_context
# Step 5: Stop deployment # Stop deployment
print("\n[5/6] Stopping deployment...") print("\n[4/4] Restarting deployment...")
ctx.obj = make_deploy_context(ctx) ctx.obj = make_deploy_context(ctx)
down_operation( down_operation(
ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True ctx, delete_volumes=False, extra_args_list=[], skip_cluster_management=True
@ -387,17 +372,13 @@ def restart(ctx, stack_path, config_file, force, expected_ip):
# Brief pause to ensure clean shutdown # Brief pause to ensure clean shutdown
time.sleep(5) time.sleep(5)
# Step 6: Start deployment # Start deployment
print("\n[6/6] Starting deployment...")
up_operation( up_operation(
ctx, services_list=None, stay_attached=False, skip_cluster_management=True ctx, services_list=None, stay_attached=False, skip_cluster_management=True
) )
print("\n=== Restart Complete ===") print("\n=== Restart Complete ===")
print("Deployment restarted with updated configuration.") print("Deployment restarted with git-tracked configuration.")
if new_hostname and new_hostname != current_hostname: if new_hostname and new_hostname != current_hostname:
print(f"\nNew hostname: {new_hostname}") print(f"\nNew hostname: {new_hostname}")
print("Caddy will automatically provision TLS certificate.") print("Caddy will automatically provision TLS certificate.")
# Cleanup temp file
Path(new_spec_path).unlink(missing_ok=True)

View File

@ -123,6 +123,9 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
specific stale resources (blacklist), we keep only the valuable data specific stale resources (blacklist), we keep only the valuable data
(caddy TLS certs) and delete everything else (whitelist approach). (caddy TLS certs) and delete everything else (whitelist approach).
The etcd image is distroless (no shell), so we extract the statically-linked
etcdctl binary and run it from alpine which has shell support.
Returns True if cleanup succeeded, False if no action needed or failed. Returns True if cleanup succeeded, False if no action needed or failed.
""" """
db_path = Path(etcd_path) / "member" / "snap" / "db" db_path = Path(etcd_path) / "member" / "snap" / "db"
@ -146,14 +149,26 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
# Whitelist: prefixes to KEEP - everything else gets deleted # Whitelist: prefixes to KEEP - everything else gets deleted
keep_prefixes = "/registry/secrets/caddy-system" keep_prefixes = "/registry/secrets/caddy-system"
# All operations in docker to handle root-owned etcd files # The etcd image is distroless (no shell). We extract the statically-linked
# etcdctl binary and run it from alpine which has shell + jq support.
cleanup_script = f""" cleanup_script = f"""
set -e set -e
ALPINE_IMAGE="alpine:3.19" ALPINE_IMAGE="alpine:3.19"
# Cleanup previous runs
docker rm -f laconic-etcd-cleanup 2>/dev/null || true
docker rm -f etcd-extract 2>/dev/null || true
docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
# Create temp dir # Create temp dir
docker run --rm -v /tmp:/tmp $ALPINE_IMAGE \ docker run --rm -v /tmp:/tmp $ALPINE_IMAGE mkdir -p {temp_dir}
sh -c "rm -rf {temp_dir} && mkdir -p {temp_dir}"
# Extract etcdctl binary (it's statically linked)
docker create --name etcd-extract {etcd_image}
docker cp etcd-extract:/usr/local/bin/etcdctl /tmp/etcdctl-bin
docker rm etcd-extract
docker run --rm -v /tmp/etcdctl-bin:/src:ro -v {temp_dir}:/dst $ALPINE_IMAGE \
sh -c "cp /src /dst/etcdctl && chmod +x /dst/etcdctl"
# Copy db to temp location # Copy db to temp location
docker run --rm \ docker run --rm \
@ -166,8 +181,7 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
etcdutl snapshot restore /work/etcd-snapshot.db \ etcdutl snapshot restore /work/etcd-snapshot.db \
--data-dir=/work/etcd-data --skip-hash-check 2>/dev/null --data-dir=/work/etcd-data --skip-hash-check 2>/dev/null
# Start temp etcd # Start temp etcd (runs the etcd binary, no shell needed)
docker rm -f laconic-etcd-cleanup 2>/dev/null || true
docker run -d --name laconic-etcd-cleanup \ docker run -d --name laconic-etcd-cleanup \
-v {temp_dir}/etcd-data:/etcd-data \ -v {temp_dir}/etcd-data:/etcd-data \
-v {temp_dir}:/backup \ -v {temp_dir}:/backup \
@ -178,31 +192,34 @@ def _clean_etcd_keeping_certs(etcd_path: str) -> bool:
sleep 3 sleep 3
# Export caddy secrets to backup file (the only thing we keep) # Use alpine with extracted etcdctl to run commands (alpine has shell + jq)
docker exec laconic-etcd-cleanup \ # Export caddy secrets
etcdctl get --prefix "{keep_prefixes}" -w json > {temp_dir}/kept.json \ docker run --rm \
2>/dev/null || echo '{{}}' > {temp_dir}/kept.json -v {temp_dir}:/backup \
--network container:laconic-etcd-cleanup \
$ALPINE_IMAGE sh -c \
'/backup/etcdctl get --prefix "{keep_prefixes}" -w json \
> /backup/kept.json 2>/dev/null || echo "{{}}" > /backup/kept.json'
# Delete ALL registry keys # Delete ALL registry keys
docker exec laconic-etcd-cleanup etcdctl del --prefix /registry docker run --rm \
-v {temp_dir}:/backup \
--network container:laconic-etcd-cleanup \
$ALPINE_IMAGE /backup/etcdctl del --prefix /registry
# Restore kept keys using etcdctl txn # Restore kept keys using jq
docker exec laconic-etcd-cleanup sh -c ' docker run --rm \
cat /backup/kept.json 2>/dev/null | \ -v {temp_dir}:/backup \
(python3 -c " --network container:laconic-etcd-cleanup \
import sys, json, base64 $ALPINE_IMAGE sh -c '
try: apk add --no-cache jq >/dev/null 2>&1
data = json.load(sys.stdin) jq -r ".kvs[] | @base64" /backup/kept.json 2>/dev/null | \
for kv in data.get(\"kvs\", []): while read encoded; do
k = base64.b64decode(kv[\"key\"]).decode() key=$(echo $encoded | base64 -d | jq -r ".key" | base64 -d)
v = base64.b64decode(kv[\"value\"]).decode(\"latin-1\") val=$(echo $encoded | base64 -d | jq -r ".value" | base64 -d)
print(k) echo "$val" | /backup/etcdctl put "$key"
print(v) done
except: pass ' || true
" 2>/dev/null || true) | while IFS= read -r key && IFS= read -r value; do
printf \"%s\" \"$value\" | etcdctl put \"$key\"
done
' 2>/dev/null || true
# Save cleaned snapshot # Save cleaned snapshot
docker exec laconic-etcd-cleanup \ docker exec laconic-etcd-cleanup \
@ -228,8 +245,9 @@ except: pass
docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \ docker run --rm -v {etcd_path}:/etcd -v {temp_dir}:/tmp-work $ALPINE_IMAGE \
sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member" sh -c "rm -rf /etcd/member && mv /tmp-work/new-etcd/member /etcd/member"
# Cleanup temp (but NOT the backup) # Cleanup temp files (but NOT the timestamped backup in etcd_path)
docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir} docker run --rm -v /tmp:/tmp $ALPINE_IMAGE rm -rf {temp_dir}
rm -f /tmp/etcdctl-bin
""" """
result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True) result = subprocess.run(cleanup_script, shell=True, capture_output=True, text=True)

View File

@ -180,7 +180,7 @@ class Spec:
return self.obj.get(constants.deploy_to_key) return self.obj.get(constants.deploy_to_key)
def get_acme_email(self): def get_acme_email(self):
return self.obj.get(constants.acme_email_key, "") return self.obj.get(constants.network_key, {}).get(constants.acme_email_key, "")
def is_kubernetes_deployment(self): def is_kubernetes_deployment(self):
return self.get_deployment_type() in [ return self.get_deployment_type() in [