Merge commit '19bb90f8148833ea7ff79cba312b048abc0d790b' as 'stack-orchestrator'

fix/kind-mount-propagation
A. F. Dudley 2026-03-10 08:08:04 +00:00
commit 03a5b5e39e
850 changed files with 119212 additions and 63770 deletions

View File

@ -0,0 +1,66 @@
# Runs the Laconicd fixturenet and Laconic CLI tests on trigger-file change or nightly schedule.
name: Fixturenet-Laconicd-Test
on:
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/fixturenet-laconicd-test'
  schedule:
    - cron: '1 13 * * *'
jobs:
  test:
    name: "Run Laconicd fixturenet and Laconic CLI tests"
    runs-on: ubuntu-latest
    steps:
      - name: 'Update'
        run: apt-get update
      - name: 'Setup jq'
        run: apt-get install jq -y
      - name: 'Check jq'
        run: |
          which jq
          jq --version
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run fixturenet-laconicd tests"
        run: ./tests/fixturenet-laconicd/run-test.sh
      - name: "Run laconic CLI tests"
        run: ./tests/fixturenet-laconicd/run-cli-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,37 @@
# Runs flake8 lint checks on every push and pull request.
name: Lint Checks
on:
  pull_request:
    branches: '*'
  push:
    branches: '*'
jobs:
  test:
    name: "Run linter"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      # Fixed invalid `- name : ...` (space before colon) from the original.
      - name: "Run flake8"
        uses: py-actions/flake8@v2
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,72 @@
# Builds the laconic-so shiv package and publishes a release on pushes to main/publish-test.
name: Publish
on:
  push:
    branches:
      - main
      - publish-test
    paths-ignore:
      - '.gitea/workflows/triggers/*'
jobs:
  publish:
    name: "Build and publish"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Get build info"
        id: build-info
        run: |
          build_tag=$(./scripts/create_build_tag_file.sh)
          echo "build-tag=v${build_tag}" >> $GITHUB_OUTPUT
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Build local shiv package"
        id: build
        # NOTE(review): the runner's default bash shell runs with -e, so a failing
        # build script aborts the step before `result_code=$?` executes — the
        # capture/exit pair is redundant but harmless; confirm before removing.
        run: |
          ./scripts/build_shiv_package.sh
          result_code=$?
          echo "package-file=$(ls ./package/*)" >> $GITHUB_OUTPUT
          exit $result_code
      - name: "Stage artifact file"
        run: |
          cp ${{ steps.build.outputs.package-file }} ./laconic-so
      - name: "Create release"
        uses: https://gitea.com/cerc-io/action-gh-release@gitea-v2
        with:
          tag_name: ${{ steps.build-info.outputs.build-tag }}
          # On the publish test branch, mark our release as a draft
          # Hack using endsWith to workaround Gitea sometimes sending "publish-test" vs "refs/heads/publish-test"
          draft: ${{ endsWith('publish-test', github.ref ) }}
          files: ./laconic-so
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,69 @@
# Runs the container-registry hosting test on kind/k8s, on trigger-file change or nightly schedule.
name: Container Registry Test
on:
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/test-container-registry'
      - '.gitea/workflows/test-container-registry.yml'
      - 'tests/container-registry/run-test.sh'
  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
    - cron: '6 19 * * *'
jobs:
  test:
    # Fixed typo: "contaier" -> "container"
    name: "Run container registry hosting test on kind/k8s"
    runs-on: ubuntu-22.04
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Check cgroups version"
        run: mount | grep cgroup
      - name: "Install kind"
        run: ./tests/scripts/install-kind.sh
      - name: "Install Kubectl"
        run: ./tests/scripts/install-kubectl.sh
      - name: "Install ed" # Only needed until we remove the need to edit the spec file
        run: apt update && apt install -y ed
      - name: "Run container registry deployment test"
        run: |
          source /opt/bash-utils/cgroup-helper.sh
          join_cgroup
          ./tests/container-registry/run-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,67 @@
# Runs the database hosting test on kind/k8s, on trigger-file change or nightly schedule.
name: Database Test
on:
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/test-database'
      - '.gitea/workflows/test-database.yml'
      - 'tests/database/run-test.sh'
  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
    - cron: '5 18 * * *'
jobs:
  test:
    name: "Run database hosting test on kind/k8s"
    runs-on: ubuntu-22.04
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Check cgroups version"
        run: mount | grep cgroup
      - name: "Install kind"
        run: ./tests/scripts/install-kind.sh
      - name: "Install Kubectl"
        run: ./tests/scripts/install-kubectl.sh
      - name: "Run database deployment test"
        run: |
          source /opt/bash-utils/cgroup-helper.sh
          join_cgroup
          ./tests/database/run-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,59 @@
# Runs the deploy test suite on PRs to main and pushes to main/ci-test.
name: Deploy Test
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
      - ci-test
    paths-ignore:
      - '.gitea/workflows/triggers/*'
jobs:
  test:
    name: "Run deploy test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run deploy tests"
        run: ./tests/deploy/run-deploy-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,58 @@
# Runs the external stack test suite on trigger-file change or nightly schedule.
name: External Stack Test
on:
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/test-external-stack'
      - '.gitea/workflows/test-external-stack.yml'
      - 'tests/external-stack/run-test.sh'
  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
    - cron: '8 19 * * *'
jobs:
  test:
    name: "Run external stack test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run external stack tests"
        run: ./tests/external-stack/run-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,70 @@
# Runs the deploy test suite on kind/k8s, on PRs to main, trigger-file change, or nightly schedule.
name: K8s Deploy Test
on:
  pull_request:
    branches:
      - main
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/test-k8s-deploy'
      - '.gitea/workflows/test-k8s-deploy.yml'
      - 'tests/k8s-deploy/run-deploy-test.sh'
  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
    - cron: '3 15 * * *'
jobs:
  test:
    name: "Run deploy test suite on kind/k8s"
    runs-on: ubuntu-22.04
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Check cgroups version"
        run: mount | grep cgroup
      - name: "Install kind"
        run: ./tests/scripts/install-kind.sh
      - name: "Install Kubectl"
        run: ./tests/scripts/install-kubectl.sh
      - name: "Run k8s deployment test"
        run: |
          source /opt/bash-utils/cgroup-helper.sh
          join_cgroup
          ./tests/k8s-deploy/run-deploy-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,70 @@
# Runs the deployment control test suite on kind/k8s, on PRs to main, trigger-file change, or nightly schedule.
name: K8s Deployment Control Test
on:
  pull_request:
    branches:
      - main
  push:
    branches: '*'
    paths:
      - '!**'
      - '.gitea/workflows/triggers/test-k8s-deployment-control'
      - '.gitea/workflows/test-k8s-deployment-control.yml'
      - 'tests/k8s-deployment-control/run-test.sh'
  schedule: # Note: coordinate with other tests to not overload runners at the same time of day
    # Fixed invalid cron '3 30 * * *': the hour field was 30 (valid range 0-23),
    # so the schedule would never fire / be rejected. Assuming minute and hour
    # were transposed — confirm the intended run time.
    - cron: '30 3 * * *'
jobs:
  test:
    name: "Run deployment control suite on kind/k8s"
    runs-on: ubuntu-22.04
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Check cgroups version"
        run: mount | grep cgroup
      - name: "Install kind"
        run: ./tests/scripts/install-kind.sh
      - name: "Install Kubectl"
        run: ./tests/scripts/install-kubectl.sh
      - name: "Run k8s deployment control test"
        run: |
          source /opt/bash-utils/cgroup-helper.sh
          join_cgroup
          ./tests/k8s-deployment-control/run-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,60 @@
# Runs the webapp test suite on PRs to main and pushes to main/ci-test.
name: Webapp Test
on:
  pull_request:
    branches:
      - main
  push:
    branches:
      - main
      - ci-test
    paths-ignore:
      - '.gitea/workflows/triggers/*'
jobs:
  test:
    name: "Run webapp test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Install wget" # 20240109 - Only needed until the executors are updated.
        run: apt update && apt install -y wget
      - name: "Run webapp tests"
        run: ./tests/webapp-test/run-webapp-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,58 @@
# Runs the basic smoke test suite on all PRs and pushes to main/ci-test.
name: Smoke Test
on:
  pull_request:
    branches: '*'
  push:
    branches:
      - main
      - ci-test
    paths-ignore:
      - '.gitea/workflows/triggers/*'
jobs:
  test:
    name: "Run basic test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      # At present the stock setup-python action fails on Linux/aarch64
      # Conditional steps below work around this by using deadsnakes for that case only
      - name: "Install Python for ARM on Linux"
        if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
        uses: deadsnakes/action@v3.0.1
        with:
          python-version: '3.8'
      - name: "Install Python cases other than ARM on Linux"
        if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv==1.0.6
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run smoke tests"
        run: ./tests/smoke-test/run-smoke-test.sh
      - name: Notify Vulcanize Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
      - name: Notify DeepStack Slack on CI failure
        if: ${{ always() && github.ref_name == 'main' }}
        uses: ravsamhq/notify-slack-action@v2
        with:
          status: ${{ job.status }}
          notify_when: 'failure'
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}

View File

@ -0,0 +1,10 @@
Change this file to trigger running the fixturenet-laconicd-test CI job
Trigger
Trigger
Trigger
Trigger
Trigger
Trigger
Trigger
Trigger
Trigger

View File

@ -0,0 +1,3 @@
Change this file to trigger running the test-container-registry CI job
Triggered: 2026-01-21
Triggered: 2026-01-21 19:28:29

View File

@ -0,0 +1,2 @@
Change this file to trigger running the test-database CI job
Trigger test run

View File

@ -0,0 +1,2 @@
Change this file to trigger running the external-stack CI job
trigger

View File

@ -0,0 +1,2 @@
Change this file to trigger running the test-k8s-deploy CI job
Trigger test on PR branch

View File

@ -0,0 +1,30 @@
# Runs the fixturenet-eth test suite when its trigger file changes.
name: Fixturenet-Eth Test
on:
  push:
    branches: '*'
    paths:
      - '!**'
      - '.github/workflows/triggers/fixturenet-eth-test'
jobs:
  test:
    name: "Run fixturenet-eth test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run fixturenet-eth tests"
        run: ./tests/fixturenet-eth/run-test.sh

View File

@ -0,0 +1,30 @@
# Runs the fixturenet-laconicd test suite when its trigger file changes.
name: Fixturenet-Laconicd Test
on:
  push:
    branches: '*'
    paths:
      - '!**'
      - '.github/workflows/triggers/fixturenet-laconicd-test'
jobs:
  test:
    name: "Run fixturenet-laconicd test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run fixturenet-laconicd tests"
        run: ./tests/fixturenet-laconicd/run-test.sh

21
.github/workflows/lint.yml vendored 100644
View File

@ -0,0 +1,21 @@
# Runs flake8 lint checks on every push and pull request.
name: Lint Checks
on:
  pull_request:
    branches: '*'
  push:
    branches: '*'
jobs:
  test:
    name: "Run linter"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      # Fixed invalid `- name : ...` (space before colon) from the original.
      - name: "Run flake8"
        uses: py-actions/flake8@v2

46
.github/workflows/publish.yml vendored 100644
View File

@ -0,0 +1,46 @@
# Builds the laconic-so shiv package and publishes a GitHub release on pushes to main/publish-test.
name: Publish
on:
  push:
    branches:
      - main
      - publish-test
jobs:
  publish:
    name: "Build and publish"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Get build info"
        id: build-info
        run: |
          build_tag=$(./scripts/create_build_tag_file.sh)
          echo "build-tag=v${build_tag}" >> $GITHUB_OUTPUT
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Build local shiv package"
        id: build
        # NOTE(review): the runner's default bash shell runs with -e, so a failing
        # build script aborts the step before `result_code=$?` executes — the
        # capture/exit pair is redundant but harmless; confirm before removing.
        run: |
          ./scripts/build_shiv_package.sh
          result_code=$?
          echo "package-file=$(ls ./package/*)" >> $GITHUB_OUTPUT
          exit $result_code
      - name: "Stage artifact file"
        run: |
          cp ${{ steps.build.outputs.package-file }} ./laconic-so
      - name: "Create release"
        uses: softprops/action-gh-release@v1
        with:
          tag_name: ${{ steps.build-info.outputs.build-tag }}
          # On the publish test branch, mark our release as a draft
          # Hack using endsWith to workaround Gitea sometimes sending "publish-test" vs "refs/heads/publish-test"
          draft: ${{ endsWith('publish-test', github.ref ) }}
          files: ./laconic-so

View File

@ -0,0 +1,29 @@
# Runs the deploy test suite on every push and pull request.
name: Deploy Test
on:
  pull_request:
    branches: '*'
  push:
    branches: '*'
jobs:
  test:
    name: "Run deploy test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run deploy tests"
        run: ./tests/deploy/run-deploy-test.sh

View File

@ -0,0 +1,29 @@
# Runs the webapp test suite on every push and pull request.
name: Webapp Test
on:
  pull_request:
    branches: '*'
  push:
    branches: '*'
jobs:
  test:
    name: "Run webapp test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run webapp tests"
        run: ./tests/webapp-test/run-webapp-test.sh

29
.github/workflows/test.yml vendored 100644
View File

@ -0,0 +1,29 @@
# Runs the basic smoke test suite on every push and pull request.
name: Smoke Test
on:
  pull_request:
    branches: '*'
  push:
    branches: '*'
jobs:
  test:
    name: "Run basic test suite"
    runs-on: ubuntu-latest
    steps:
      - name: "Clone project repository"
        uses: actions/checkout@v3
      - name: "Install Python"
        uses: actions/setup-python@v4
        with:
          python-version: '3.8'
      - name: "Print Python version"
        run: python3 --version
      - name: "Install shiv"
        run: pip install shiv
      - name: "Generate build version file"
        run: ./scripts/create_build_tag_file.sh
      - name: "Build local shiv package"
        run: ./scripts/build_shiv_package.sh
      - name: "Run smoke tests"
        run: ./tests/smoke-test/run-smoke-test.sh

View File

@ -0,0 +1 @@
Change this file to trigger running the fixturenet-eth-test CI job

View File

@ -0,0 +1,3 @@
Change this file to trigger running the fixturenet-laconicd-test CI job
trigger

13
.gitignore vendored
View File

@ -1,4 +1,11 @@
.venv/
sessions.duckdb
sessions.duckdb.wal
.idea
venv
.vscode
laconic-so
laconic_stack_orchestrator.egg-info
__pycache__
*~
package
stack_orchestrator/data/build_tag.txt
/build
.worktrees

1
.pebbles/.gitignore vendored
View File

@ -1 +0,0 @@
pebbles.db

View File

@ -1,3 +1,3 @@
{
"prefix": "bar"
"prefix": "so"
}

View File

@ -1,52 +1,15 @@
{"type":"create","timestamp":"2026-03-06T07:57:55.427398426Z","issue_id":"bar-48f","payload":{"description":"Route all validator traffic (gossip, repair, TVU, TPU) through 137.239.194.65 on laconic-was-sw01 in Ashburn. Supersedes old TVU-only shred relay. See docs/ashburn-validator-relay.md for full design.","priority":"1","title":"Ashburn Full Validator Traffic Relay","type":"epic"}}
{"type":"create","timestamp":"2026-03-06T07:58:01.589463071Z","issue_id":"bar-a47","payload":{"description":"Create Loopback101 (137.239.194.65/32), VALIDATOR-RELAY ACL + traffic-policy on Et1/1, replacing old SHRED-RELAY. Uses 5-min auto-revert config session. Playbook: playbooks/ashburn-relay-was-sw01.yml","priority":"1","title":"was-sw01: Inbound validator relay config","type":"task"}}
{"type":"create","timestamp":"2026-03-06T07:58:07.292140983Z","issue_id":"bar-0e5","payload":{"description":"Add 137.239.194.65/32 to lo, DNAT rules for ports 8001,9000-9025 to kind node 172.20.0.2. Playbook: playbooks/ashburn-relay-biscayne.yml -t inbound","priority":"1","title":"biscayne: Inbound DNAT rules","type":"task"}}
{"type":"create","timestamp":"2026-03-06T07:58:10.838534858Z","issue_id":"bar-f9b","payload":{"description":"Ping 137.239.194.65 from external host, check DNAT counters on biscayne, verify traffic-policy counters on was-sw01.","priority":"1","title":"Verify inbound relay","type":"task"}}
{"type":"create","timestamp":"2026-03-06T07:58:15.228970622Z","issue_id":"bar-bf4","payload":{"description":"Pre-flight to discover GRE tunnel interface, then apply VALIDATOR-OUTBOUND traffic-policy redirecting src 137.239.194.65 to was-sw01 via backbone. Playbook: playbooks/ashburn-relay-mia-sw01.yml","priority":"1","title":"mia-sw01: Outbound validator redirect","type":"task"}}
{"type":"create","timestamp":"2026-03-06T07:58:19.571640837Z","issue_id":"bar-78d","payload":{"description":"fwmark 100 on validator source ports, SNAT to 137.239.194.65, policy route via doublezero0 table ashburn. Playbook: playbooks/ashburn-relay-biscayne.yml -t outbound","priority":"1","title":"biscayne: Outbound SNAT + policy routing","type":"task"}}
{"type":"create","timestamp":"2026-03-06T07:58:23.377441628Z","issue_id":"bar-f3b","payload":{"description":"Verify traffic-policy counters on both switches, iptables counters on biscayne, validator gossip ContactInfo shows 137.239.194.65, repair peer count increases, slot catchup rate improves. Write memory on both switches.","priority":"1","title":"End-to-end verification","type":"task"}}
{"type":"create","timestamp":"2026-03-06T07:58:27.341320984Z","issue_id":"bar-8a9","payload":{"description":"After stable: remove old SHRED-RELAY policy and ACL from was-sw01, remove old 64.92.84.81:20000 DNAT from biscayne.","priority":"2","title":"Cleanup old SHRED-RELAY","type":"task"}}
{"type":"rename","timestamp":"2026-03-06T07:58:32.091645662Z","issue_id":"bar-a47","payload":{"new_id":"bar-48f.1"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.091647902Z","issue_id":"bar-48f.1","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"rename","timestamp":"2026-03-06T07:58:32.274391159Z","issue_id":"bar-0e5","payload":{"new_id":"bar-48f.2"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.274392749Z","issue_id":"bar-48f.2","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"rename","timestamp":"2026-03-06T07:58:32.468426932Z","issue_id":"bar-f9b","payload":{"new_id":"bar-48f.3"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.468428522Z","issue_id":"bar-48f.3","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"rename","timestamp":"2026-03-06T07:58:32.657295386Z","issue_id":"bar-bf4","payload":{"new_id":"bar-48f.4"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.657297846Z","issue_id":"bar-48f.4","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"rename","timestamp":"2026-03-06T07:58:32.864939519Z","issue_id":"bar-78d","payload":{"new_id":"bar-48f.5"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.864941739Z","issue_id":"bar-48f.5","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"rename","timestamp":"2026-03-06T07:58:33.364299485Z","issue_id":"bar-f3b","payload":{"new_id":"bar-48f.6"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:33.364301305Z","issue_id":"bar-48f.6","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"rename","timestamp":"2026-03-06T07:58:33.639638369Z","issue_id":"bar-8a9","payload":{"new_id":"bar-48f.7"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:33.639640069Z","issue_id":"bar-48f.7","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:39.486721446Z","issue_id":"bar-48f.2","payload":{"dep_type":"blocks","depends_on":"bar-48f.1"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:39.911749641Z","issue_id":"bar-48f.3","payload":{"dep_type":"blocks","depends_on":"bar-48f.2"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:40.398532353Z","issue_id":"bar-48f.4","payload":{"dep_type":"blocks","depends_on":"bar-48f.3"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:40.762666046Z","issue_id":"bar-48f.5","payload":{"dep_type":"blocks","depends_on":"bar-48f.4"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:41.173027726Z","issue_id":"bar-48f.6","payload":{"dep_type":"blocks","depends_on":"bar-48f.5"}}
{"type":"dep_add","timestamp":"2026-03-06T07:58:41.467313496Z","issue_id":"bar-48f.7","payload":{"dep_type":"blocks","depends_on":"bar-48f.6"}}
{"type":"update","timestamp":"2026-03-06T18:32:00.041874266Z","issue_id":"bar-48f.1","payload":{"description":"Run ansible playbook (pane A) to apply config session with 5-min auto-revert. Review output. In pane B, SSH to install@137.239.200.198 and manually verify (show session-config diffs, show traffic-policy counters). Type 'configure session validator-relay commit' and 'write memory' when satisfied. Playbook: playbooks/ashburn-relay-was-sw01.yml (do NOT use -e commit=true; commit is manual via SSH)."}}
{"type":"update","timestamp":"2026-03-06T18:32:05.861153312Z","issue_id":"bar-48f.4","payload":{"description":"Run ansible playbook pre-flight (pane A) to discover GRE tunnel interface. Then run with -e apply=true -e tunnel_interface=TunnelX for 5-min auto-revert. In pane B, SSH to install@209.42.167.133 and manually verify. Type 'configure session validator-outbound commit' and 'write memory' when satisfied. Playbook: playbooks/ashburn-relay-mia-sw01.yml (do NOT use -e commit=true; commit is manual via SSH)."}}
{"type":"status_update","timestamp":"2026-03-06T18:35:35.320628231Z","issue_id":"bar-48f","payload":{"status":"in_progress"}}
{"type":"status_update","timestamp":"2026-03-06T18:35:35.717040604Z","issue_id":"bar-48f.1","payload":{"status":"in_progress"}}
{"type":"close","timestamp":"2026-03-06T20:12:45.087966093Z","issue_id":"bar-48f.1","payload":{}}
{"type":"status_update","timestamp":"2026-03-06T20:16:34.00466057Z","issue_id":"bar-48f.2","payload":{"status":"in_progress"}}
{"type":"close","timestamp":"2026-03-06T20:17:18.681131396Z","issue_id":"bar-48f.2","payload":{}}
{"type":"status_update","timestamp":"2026-03-06T20:17:19.159927405Z","issue_id":"bar-48f.3","payload":{"status":"in_progress"}}
{"type":"close","timestamp":"2026-03-06T20:18:42.42112937Z","issue_id":"bar-48f.3","payload":{}}
{"type":"status_update","timestamp":"2026-03-06T20:18:42.930237032Z","issue_id":"bar-48f.4","payload":{"status":"in_progress"}}
{"type":"create","timestamp":"2026-03-08T06:58:52.122307149Z","issue_id":"bar-02e","payload":{"description":"/srv/solana is a directory on the ZFS dataset biscayne/DATA/srv (mounted at /srv\nwith overlay=on). The fstab zvol mount at /srv/solana was shadowed by ZFS.\n\nFixed 2026-03-08: removed /srv/solana fstab entries, canonical data path is now\n/srv/kind/solana. All playbooks updated. fstab clean. Mounts verified.","priority":"1","title":"zvol mount: /srv/solana resolves to ZFS dataset, not zvol","type":"bug"}}
{"type":"create","timestamp":"2026-03-08T06:58:52.557582445Z","issue_id":"bar-41a","payload":{"description":"laconic-so creates configmap resources for telegraf but does not generate\nvolumeMounts in the pod spec. The telegraf container crashes because\n/etc/telegraf and /scripts are empty. Manual configmap creation works but\nthe volume mounts are still missing. Root cause is in laconic-so's stack\nmigration — configmap volume mount generation is incomplete.","priority":"1","title":"telegraf volume mounts missing from pod spec","type":"bug"}}
{"type":"create","timestamp":"2026-03-08T06:58:53.065888933Z","issue_id":"bar-a3b","payload":{"description":"Validator exits shortly after starting. Log shows UDP port reachability checks\nand TCP port checks failing. Needs full log analysis from kind node path\n(/mnt/validator-log/validator.log). May be related to networking/firewall\nconfiguration or the shred relay setup.","priority":"0","title":"agave-validator crash after ~57 seconds","type":"bug"}}
{"type":"create","timestamp":"2026-03-08T06:58:53.589221516Z","issue_id":"bar-b04","payload":{"description":"Once laconic-so deployment prepare lands, update biscayne-redeploy.yml to use\nprepare instead of start+scale-to-0 workaround. The deploy tag section should\ncall deployment prepare, and scale-up should call deployment start\n--skip-cluster-management.","priority":"2","title":"update biscayne-redeploy to use deployment prepare","type":"task"}}
{"type":"create","timestamp":"2026-03-08T06:58:54.238136989Z","issue_id":"bar-b41","payload":{"description":"Automate the leapfrog recovery strategy documented in CLAUDE.md. When the\nvalidator is stuck in a repair-dependent gap, download a fresh snapshot past\nthe incomplete zone while preserving the existing ledger (which has turbine\nshreds at the tip). Needs: shred completeness check, snapshot slot targeting,\nselective wipe (accounts+snapshots only, keep ledger).","priority":"2","title":"snapshot leapfrog recovery playbook","type":"feature"}}
{"type":"create","timestamp":"2026-03-08T06:58:54.756609299Z","issue_id":"bar-0b4","payload":{"description":"biscayne-prepare-agave.yml unconditionally imports ashburn-relay-biscayne.yml\nat the end. This couples filesystem preparation to relay setup. The relay\nplaybook fails if the kind node isn't running (ping to 172.20.0.2 fails).\nShould be a separate playbook invocation, not an import.","priority":"3","title":"biscayne-prepare-agave imports ashburn-relay-biscayne unconditionally","type":"bug"}}
{"type":"close","timestamp":"2026-03-08T06:59:00.140156099Z","issue_id":"bar-02e","payload":{}}
{"type":"create","timestamp":"2026-03-10T08:05:07.190617713Z","issue_id":"bar-2c9","payload":{"description":"laconic-so build-containers --include filter does exact string match via\ninclude_exclude_check(). Container names use slash (laconicnetwork/agave),\nnot dash. Using --include laconicnetwork-agave silently skips the build\nand reports success.\n\nFixed in biscayne-sync-tools.yml (commit ceea8f0) but the underlying\nlaconic-so behavior of silently skipping with no warning is a bug.","priority":"2","title":"build-containers --include uses slash not dash in container names","type":"bug"}}
{"type":"create","timestamp":"2026-03-10T08:05:12.506655809Z","issue_id":"bar-6cb","payload":{"description":"When laconic-so deployment restart deletes the namespace, PVCs are\ncascade-deleted but PVs (cluster-scoped) survive in Released state with\nstale claimRefs pointing to the old PVC UIDs. New PVCs created by the\nrestarted deployment can't bind because the PVs still reference the\ndeleted PVCs.\n\nWorkaround: patch Released PVs to clear claimRef after restart.\nAdded to biscayne-restart.yml. Root cause is in laconic-so — it should\nclear stale claimRefs as part of the restart flow.\n\nRelated: so-933 (namespace termination race).","priority":"1","title":"PV claimRefs go stale after deployment restart","type":"bug"}}
{"type":"create","timestamp":"2026-03-10T08:05:15.941416301Z","issue_id":"bar-fec","payload":{"description":"monitoring-grafana-data volume is defined in spec.yml but laconic-so's\nget_pvcs() does not generate a PVC for it. The PV is created but no\nmatching PVC exists, so the grafana container can't mount its data volume.\n\nWorkaround: manually kubectl apply the PVC after each deployment restart.\nRoot cause is in stack-orchestrator deploy_k8s.py get_pvcs().","priority":"2","title":"grafana PVC not generated by get_pvcs()","type":"bug"}}
{"type":"create","timestamp":"2026-03-10T08:05:22.853965263Z","issue_id":"bar-822","payload":{"description":"Rebuilding a container image on the Docker host does NOT update the image\ninside the kind node. With imagePullPolicy: IfNotPresent (the default for\n:local tags), kind uses its cached copy. Must run:\n\n kind load docker-image laconicnetwork/agave:local \\\n --name laconic-70ce4c4b47e23b85\n\nafter every rebuild. This step is not in any playbook or laconic-so flow.\nShould be added to biscayne-sync-tools.yml build-container tag or to\nlaconic-so build-containers itself.","priority":"2","title":"kind load docker-image required after container rebuild","type":"bug"}}
{"type":"create","timestamp":"2026-03-10T08:05:28.585915055Z","issue_id":"bar-571","payload":{"description":"Full snapshot slots differ per validator depending on when each started.\nThe entrypoint's incremental download loop assumes it can find an\nincremental keyed to any full snapshot's base slot, but no other validator\nmay have produced a full at that exact slot.\n\nThis causes the incremental download to retry forever when the local\nfull snapshot has a base slot that no network peer has incrementals for.\n\nDocumented for awareness. The entrypoint's infinite retry is intentional\n(user decision) — eventually a matching incremental will appear or the\nentrypoint falls through to download a fresh full+incremental pair.","priority":"3","title":"snapshot base slots are not consensus-aligned across validators","type":"bug"}}
{"type":"create","timestamp":"2026-03-10T08:05:32.262889286Z","issue_id":"bar-2d9","payload":{"description":"When spec.yml has explicit values for env vars that also have defaults in\nthe compose file, the spec.yml values win. Changing compose file defaults\nhas no effect unless the spec.yml override is also removed.\n\nThis is by design (spec.yml is deployment-specific config) but the\ninteraction is non-obvious. Bit us when changing snapshot settings in\ncompose but spec.yml still had the old values.\n\nNot a code bug — more a documentation/workflow issue. Operators must\ncheck both compose defaults and spec.yml overrides.","priority":"3","title":"spec.yml overrides compose defaults silently","type":"bug"}}
{"type":"create","timestamp":"2026-03-10T08:05:36.212405156Z","issue_id":"bar-31a","payload":{"description":"laconic-so deployment restart sleeps only 5s between down and up. If the\nnamespace is still terminating when 'up' runs, k8s returns 403 Forbidden\ncreating configmaps in the new namespace.\n\nCross-ref: so-933 in the stack-orchestrator pebbles project.\n\nWorkaround: retry the restart or wait manually. The restart playbook\n(biscayne-restart.yml) handles this by scaling to 0 first, waiting for\npod termination, then calling laconic-so restart.","priority":"1","title":"deployment restart namespace termination race","type":"bug"}}
{"type":"create","timestamp":"2026-03-08T06:56:07.080584539Z","issue_id":"so-076","payload":{"description":"Currently laconic-so maps one stack to one deployment to one pod. All containers\nin a stack's compose files become containers in a single k8s pod. This means:\n\n- Can't upgrade doublezero without restarting agave-validator\n- Can't restart monitoring without disrupting the validator\n- Can't independently scale or lifecycle-manage components\n\nThe fix is stack composition. A meta-stack (e.g. biscayne-stack) composes\nsub-stacks (agave, doublezero, agave-monitoring), each becoming its own\nk8s Deployment with independent lifecycle.","priority":"2","title":"Stack composition: deploy multiple stacks into one kind cluster","type":"epic"}}
{"type":"create","timestamp":"2026-03-08T06:56:07.551986919Z","issue_id":"so-ab0","payload":{"description":"Add laconic-so deployment prepare that creates cluster infrastructure without pods. Already implemented, needs review.","priority":"2","title":"deployment prepare command","type":"task"}}
{"type":"create","timestamp":"2026-03-08T06:56:07.884418759Z","issue_id":"so-04f","payload":{"description":"deployment stop on ANY deployment deletes the shared kind cluster. Should only delete its own namespace.","priority":"2","title":"deployment stop should not destroy shared cluster","type":"bug"}}
{"type":"create","timestamp":"2026-03-08T06:56:08.253520249Z","issue_id":"so-370","payload":{"description":"Allow stack.yml to reference sub-stacks. Each sub-stack becomes its own k8s Deployment sharing namespace and PVs.","priority":"2","title":"Add stacks: field to stack.yml for composition","type":"task"}}
{"type":"create","timestamp":"2026-03-08T06:56:08.646764337Z","issue_id":"so-f7c","payload":{"description":"Create three independent stacks from the monolithic agave-stack. Each gets its own compose file and independent lifecycle.","priority":"2","title":"Split agave-stack into agave + doublezero + monitoring","type":"task"}}
{"type":"rename","timestamp":"2026-03-08T06:56:14.499990161Z","issue_id":"so-ab0","payload":{"new_id":"so-076.1"}}
{"type":"dep_add","timestamp":"2026-03-08T06:56:14.499992031Z","issue_id":"so-076.1","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
{"type":"rename","timestamp":"2026-03-08T06:56:14.786407752Z","issue_id":"so-04f","payload":{"new_id":"so-076.2"}}
{"type":"dep_add","timestamp":"2026-03-08T06:56:14.786409842Z","issue_id":"so-076.2","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
{"type":"rename","timestamp":"2026-03-08T06:56:15.058959714Z","issue_id":"so-370","payload":{"new_id":"so-076.3"}}
{"type":"dep_add","timestamp":"2026-03-08T06:56:15.058961364Z","issue_id":"so-076.3","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
{"type":"rename","timestamp":"2026-03-08T06:56:15.410080785Z","issue_id":"so-f7c","payload":{"new_id":"so-076.4"}}
{"type":"dep_add","timestamp":"2026-03-08T06:56:15.410082305Z","issue_id":"so-076.4","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
{"type":"dep_add","timestamp":"2026-03-08T06:56:16.313585082Z","issue_id":"so-076.3","payload":{"dep_type":"blocks","depends_on":"so-076.2"}}
{"type":"dep_add","timestamp":"2026-03-08T06:56:16.567629422Z","issue_id":"so-076.4","payload":{"dep_type":"blocks","depends_on":"so-076.3"}}

View File

@ -0,0 +1,34 @@
# pre-commit hook configuration: generic hygiene checks, Python formatting
# (black), linting (flake8), type checking (pyright), and YAML linting.
repos:
  # Generic file hygiene checks from the pre-commit project.
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
        # Repo contains multi-document YAML (e.g. k8s-style manifests).
        args: ['--allow-multiple-documents']
      - id: check-json
      - id: check-merge-conflict
      - id: check-added-large-files
  # Opinionated Python code formatter.
  - repo: https://github.com/psf/black
    rev: 23.12.1
    hooks:
      - id: black
        language_version: python3
  # Python linter; E203/W503 ignored for black compatibility, E402 for
  # late imports.
  - repo: https://github.com/PyCQA/flake8
    rev: 7.1.1
    hooks:
      - id: flake8
        args: ['--max-line-length=88', '--extend-ignore=E203,W503,E402']
  # Static type checker.
  - repo: https://github.com/RobertCraigie/pyright-python
    rev: v1.1.345
    hooks:
      - id: pyright
  # YAML linter in relaxed mode (warnings instead of errors for style issues).
  - repo: https://github.com/adrienverge/yamllint
    rev: v1.35.1
    hooks:
      - id: yamllint
        args: [-d, relaxed]

151
AI-FRIENDLY-PLAN.md 100644
View File

@ -0,0 +1,151 @@
# Plan: Make Stack-Orchestrator AI-Friendly
## Goal
Make the stack-orchestrator repository easier for AI tools (Claude Code, Cursor, Copilot) to understand and use for generating stacks, including adding a `create-stack` command.
---
## Part 1: Documentation & Context Files
### 1.1 Add CLAUDE.md
Create a root-level context file for AI assistants.
**File:** `CLAUDE.md`
Contents:
- Project overview (what stack-orchestrator does)
- Stack creation workflow (step-by-step)
- File naming conventions
- Required vs optional fields in stack.yml
- Common patterns and anti-patterns
- Links to example stacks (simple, medium, complex)
### 1.2 Add JSON Schema for stack.yml
Create formal validation schema.
**File:** `schemas/stack-schema.json`
Benefits:
- AI tools can validate generated stacks
- IDEs provide autocomplete
- CI can catch errors early
### 1.3 Add Template Stack with Comments
Create an annotated template for reference.
**File:** `stack_orchestrator/data/stacks/_template/stack.yml`
```yaml
# Stack definition template - copy this directory to create a new stack
version: "1.2" # Required: 1.0, 1.1, or 1.2
name: my-stack # Required: lowercase, hyphens only
description: "Human-readable description" # Optional
repos: # Git repositories to clone
- github.com/org/repo
containers: # Container images to build (must have matching container-build/)
- cerc/my-container
pods: # Deployment units (must have matching docker-compose-{pod}.yml)
- my-pod
```
### 1.4 Document Validation Rules
Create explicit documentation of constraints currently scattered in code.
**File:** `docs/stack-format.md`
Contents:
- Container names must start with `cerc/`
- Pod names must match compose file: `docker-compose-{pod}.yml`
- Repository format: `host/org/repo[@ref]`
- Stack directory name should match `name` field
- Version field options and differences
---
## Part 2: Add `create-stack` Command
### 2.1 Command Overview
```bash
laconic-so create-stack --repo github.com/org/my-app [--name my-app] [--type webapp]
```
**Behavior:**
1. Parse repo URL to extract app name (if --name not provided)
2. Create `stacks/{name}/stack.yml`
3. Create `container-build/cerc-{name}/Dockerfile` and `build.sh`
4. Create `compose/docker-compose-{name}.yml`
5. Update list files (repository-list.txt, container-image-list.txt, pod-list.txt)
### 2.2 Files to Create
| File | Purpose |
|------|---------|
| `stack_orchestrator/create/__init__.py` | Package init |
| `stack_orchestrator/create/create_stack.py` | Command implementation |
### 2.3 Files to Modify
| File | Change |
|------|--------|
| `stack_orchestrator/main.py` | Add import and `cli.add_command()` |
### 2.4 Command Options
| Option | Required | Description |
|--------|----------|-------------|
| `--repo` | Yes | Git repository URL (e.g., github.com/org/repo) |
| `--name` | No | Stack name (defaults to repo name) |
| `--type` | No | Template type: webapp, service, empty (default: webapp) |
| `--force` | No | Overwrite existing files |
### 2.5 Template Types
| Type | Base Image | Port | Use Case |
|------|------------|------|----------|
| webapp | node:20-bullseye-slim | 3000 | React/Vue/Next.js apps |
| service | python:3.11-slim | 8080 | Python backend services |
| empty | none | none | Custom from scratch |
---
## Part 3: Implementation Summary
### New Files (6)
1. `CLAUDE.md` - AI assistant context
2. `schemas/stack-schema.json` - Validation schema
3. `stack_orchestrator/data/stacks/_template/stack.yml` - Annotated template
4. `docs/stack-format.md` - Stack format documentation
5. `stack_orchestrator/create/__init__.py` - Package init
6. `stack_orchestrator/create/create_stack.py` - Command implementation
### Modified Files (1)
1. `stack_orchestrator/main.py` - Register create-stack command
---
## Verification
```bash
# 1. Command appears in help
laconic-so --help | grep create-stack
# 2. Dry run works
laconic-so --dry-run create-stack --repo github.com/org/test-app
# 3. Creates all expected files
laconic-so create-stack --repo github.com/org/test-app
ls stack_orchestrator/data/stacks/test-app/
ls stack_orchestrator/data/container-build/cerc-test-app/
ls stack_orchestrator/data/compose/docker-compose-test-app.yml
# 4. Build works with generated stack
laconic-so --stack test-app build-containers
```

278
CLAUDE.md
View File

@ -1,221 +1,121 @@
# Biscayne Agave Runbook
# CLAUDE.md
## Deployment Layers
This file provides guidance to Claude Code when working with the stack-orchestrator project.
Operations on biscayne follow a strict layering. Each layer assumes the layers
below it are correct. Playbooks belong to exactly one layer.
## Some rules to follow
NEVER speculate about the cause of something
NEVER assume your hypotheses are true without evidence
| Layer | What | Playbooks |
|-------|------|-----------|
| 1. Base system | Docker, ZFS, packages | Out of scope (manual/PXE) |
| 2. Prepare kind | `/srv/kind` exists (ZFS dataset) | None needed (ZFS handles it) |
| 3. Install kind | `laconic-so deployment start` creates kind cluster, mounts `/srv/kind``/mnt` in kind node | `biscayne-redeploy.yml` (deploy tags) |
| 4. Prepare agave | Host storage for agave: ZFS dataset, ramdisk | `biscayne-prepare-agave.yml` |
| 5. Deploy agave | Deploy agave-stack into kind, snapshot download, scale up | `biscayne-redeploy.yml` (snapshot/verify tags), `biscayne-recover.yml` |
ALWAYS clearly state when something is a hypothesis
ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses
ALWAYS run `pre-commit run --all-files` before committing changes
**Layer 4 invariants** (asserted by `biscayne-prepare-agave.yml`):
- `/srv/kind/solana` is a ZFS dataset (`biscayne/DATA/srv/kind/solana`), child of the `/srv/kind` dataset
- `/srv/kind/solana/ramdisk` is tmpfs (1TB) — accounts must be in RAM
- `/srv/solana` is NOT the data path — it's a directory on the parent ZFS dataset. All data paths use `/srv/kind/solana`
## Key Principles
These invariants are checked at runtime and persisted to fstab/systemd so they
survive reboot.
### Development Guidelines
- **Single responsibility** - Each component has one clear purpose
- **Fail fast** - Let errors propagate, don't hide failures
- **DRY/KISS** - Minimize duplication and complexity
**Cross-cutting**: `health-check.yml` (read-only diagnostics), `biscayne-stop.yml`
(layer 5 — graceful shutdown), `fix-pv-mounts.yml` (layer 5 — PV repair).
## Development Philosophy: Conversational Literate Programming
## Cluster Operations
### Approach
This project follows principles inspired by literate programming, where development happens through explanatory conversation rather than code-first implementation.
### Shutdown Order
### Core Principles
- **Documentation-First**: All changes begin with discussion of intent and reasoning
- **Narrative-Driven**: Complex systems are explained through conversational exploration
- **Justification Required**: Every coding task must have a corresponding TODO.md item explaining the "why"
- **Iterative Understanding**: Architecture and implementation evolve through dialogue
The agave validator runs inside a kind-based k8s cluster managed by `laconic-so`.
The kind node is a Docker container. **Never restart or kill the kind node container
while the validator is running.** Use `agave-validator exit --force` via the admin
RPC socket for graceful shutdown, or scale the deployment to 0 and wait.
### Working Method
1. **Explore and Understand**: Read existing code to understand current state
2. **Discuss Architecture**: Workshop complex design decisions through conversation
3. **Document Intent**: Update TODO.md with clear justification before coding
4. **Explain Changes**: Each modification includes reasoning and context
5. **Maintain Narrative**: Conversations serve as living documentation of design evolution
Correct shutdown sequence:
### Implementation Guidelines
- Treat conversations as primary documentation
- Explain architectural decisions before implementing
- Use TODO.md as the "literate document" that justifies all work
- Maintain clear narrative threads across sessions
- Workshop complex ideas before coding
1. Scale the deployment to 0 and wait for the pod to terminate:
```
kubectl scale deployment laconic-70ce4c4b47e23b85-deployment \
-n laconic-laconic-70ce4c4b47e23b85 --replicas=0
kubectl wait --for=delete pod -l app=laconic-70ce4c4b47e23b85-deployment \
-n laconic-laconic-70ce4c4b47e23b85 --timeout=120s
```
2. Only then restart the kind node if needed:
```
docker restart laconic-70ce4c4b47e23b85-control-plane
```
3. Scale back up:
```
kubectl scale deployment laconic-70ce4c4b47e23b85-deployment \
-n laconic-laconic-70ce4c4b47e23b85 --replicas=1
```
This approach treats the human-AI collaboration as a form of **conversational literate programming** where understanding emerges through dialogue before code implementation.
### Ramdisk
## External Stacks Preferred
The accounts directory must be in RAM for performance. tmpfs is used instead of
`/dev/ram0` — simpler (no format-on-boot service needed), resizable on the fly
with `mount -o remount,size=<new>`, and what most Solana operators use.
When creating new stacks for any reason, **use the external stack pattern** rather than adding stacks directly to this repository.
**Boot ordering**: `/srv/kind/solana` is a ZFS dataset mounted automatically by
`zfs-mount.service`. The tmpfs ramdisk fstab entry uses
`x-systemd.requires=zfs-mount.service` to ensure the dataset is mounted first.
**No manual intervention after reboot.**
External stacks follow this structure:
**Mount propagation**: The kind node bind-mounts `/srv/kind``/mnt` at container
start. laconic-so sets `propagation: HostToContainer` on all kind extraMounts
(commit `a11d40f2` in stack-orchestrator), so host submounts propagate into the
kind node automatically. A kind restart is required to pick up the new config
after updating laconic-so.
### KUBECONFIG
kubectl must be told where the kubeconfig is when running as root or via ansible:
```
KUBECONFIG=/home/rix/.kube/config kubectl ...
my-stack/
└── stack-orchestrator/
├── stacks/
│ └── my-stack/
│ ├── stack.yml
│ └── README.md
├── compose/
│ └── docker-compose-my-stack.yml
└── config/
└── my-stack/
└── (config files)
```
The ansible playbooks set `environment: KUBECONFIG: /home/rix/.kube/config`.
### Usage
### SSH Agent
```bash
# Fetch external stack
laconic-so fetch-stack github.com/org/my-stack
SSH to biscayne goes through a ProxyCommand jump host (abernathy.ch2.vaasl.io).
The SSH agent socket rotates when the user reconnects. Find the current one:
```
ls -t /tmp/ssh-*/agent.* | head -1
```
Then export it:
```
export SSH_AUTH_SOCK=/tmp/ssh-XXXX/agent.NNNN
# Use external stack
STACK_PATH=~/cerc/my-stack/stack-orchestrator/stacks/my-stack
laconic-so --stack $STACK_PATH deploy init --output spec.yml
laconic-so --stack $STACK_PATH deploy create --spec-file spec.yml --deployment-dir deployment
laconic-so deployment --dir deployment start
```
### io_uring/ZFS Deadlock — Historical Note
### Examples
Agave uses io_uring for async I/O. Killing agave ungracefully while it has
outstanding I/O against ZFS can produce unkillable D-state kernel threads
(`io_wq_put_and_exit` blocked on ZFS transactions), deadlocking the container.
- `zenith-karma-stack` - Karma watcher deployment
- `urbit-stack` - Fake Urbit ship for testing
- `zenith-desk-stack` - Desk deployment stack
**Prevention**: Use graceful shutdown (`agave-validator exit --force` via admin
RPC, or scale to 0 and wait). The `biscayne-stop.yml` playbook enforces this.
With graceful shutdown, io_uring contexts are closed cleanly and ZFS storage
is safe to use directly (no zvol/XFS workaround needed).
## Architecture: k8s-kind Deployments
**ZFS fix**: The underlying io_uring bug is fixed in ZFS 2.2.8+ (PR #17298).
Biscayne currently runs ZFS 2.2.2. Upgrading ZFS will eliminate the deadlock
risk entirely, even for ungraceful shutdowns.
### One Cluster Per Host
One Kind cluster per host by design. Never request or expect separate clusters.
### laconic-so Architecture
- `create_cluster()` in `helpers.py` reuses any existing cluster
- `cluster-id` in deployment.yml is an identifier, not a cluster request
- All deployments share: ingress controller, etcd, certificates
`laconic-so` manages kind clusters atomically — `deployment start` creates the
kind cluster, namespace, PVs, PVCs, and deployment in one shot. There is no way
to create the cluster without deploying the pod.
### Stack Resolution
- External stacks detected via `Path(stack).exists()` in `util.py`
- Config/compose resolution: external path first, then internal fallback
- External path structure: `stack_orchestrator/data/stacks/<name>/stack.yml`
Key code paths in stack-orchestrator:
- `deploy_k8s.py:up()` — creates everything atomically
- `cluster_info.py:get_pvs()` — translates host paths using `kind-mount-root`
- `helpers_k8s.py:get_kind_pv_bind_mount_path()` — strips `kind-mount-root`
prefix and prepends `/mnt/`
- `helpers_k8s.py:_generate_kind_mounts()` — when `kind-mount-root` is set,
emits a single `/srv/kind` → `/mnt` mount instead of individual mounts
### Secret Generation Implementation
- `GENERATE_TOKEN_PATTERN` in `deployment_create.py` matches `$generate:type:length$`
- `_generate_and_store_secrets()` creates K8s Secret
- `cluster_info.py` adds `envFrom` with `secretRef` to containers
- Non-secret config written to `config.env`
The `kind-mount-root: /srv/kind` setting in `spec.yml` means all data volumes
whose host paths start with `/srv/kind` get translated to `/mnt/...` inside the
kind node via a single bind mount.
### Repository Cloning
`setup-repositories --git-ssh` clones repos defined in stack.yml's `repos:` field. Requires SSH agent.
### Key Identifiers
### Key Files (for codebase navigation)
- `repos/setup_repositories.py`: `setup-repositories` command (git clone)
- `deployment_create.py`: `deploy create` command, secret generation
- `deployment.py`: `deployment start/stop/restart` commands
- `deploy_k8s.py`: K8s deployer, cluster management calls
- `helpers.py`: `create_cluster()`, etcd cleanup, kind operations
- `cluster_info.py`: K8s resource generation (Deployment, Service, Ingress)
- Kind cluster: `laconic-70ce4c4b47e23b85`
- Namespace: `laconic-laconic-70ce4c4b47e23b85`
- Deployment: `laconic-70ce4c4b47e23b85-deployment`
- Kind node container: `laconic-70ce4c4b47e23b85-control-plane`
- Deployment dir: `/srv/deployments/agave`
- Snapshot dir: `/srv/kind/solana/snapshots` (ZFS dataset, visible to kind at `/mnt/validator-snapshots`)
- Ledger dir: `/srv/kind/solana/ledger` (ZFS dataset, visible to kind at `/mnt/validator-ledger`)
- Accounts dir: `/srv/kind/solana/ramdisk/accounts` (tmpfs ramdisk, visible to kind at `/mnt/validator-accounts`)
- Log dir: `/srv/kind/solana/log` (ZFS dataset, visible to kind at `/mnt/validator-log`)
- **WARNING**: `/srv/solana` is a different ZFS dataset directory. All data paths use `/srv/kind/solana`.
- Host bind mount root: `/srv/kind` -> kind node `/mnt`
- laconic-so: `/home/rix/.local/bin/laconic-so` (editable install)
## Insights and Observations
### PV Mount Paths (inside kind node)
| PV Name | hostPath |
|----------------------|-------------------------------|
| validator-snapshots | /mnt/validator-snapshots |
| validator-ledger | /mnt/validator-ledger |
| validator-accounts | /mnt/validator-accounts |
| validator-log | /mnt/validator-log |
### Snapshot Freshness
If the snapshot is more than **20,000 slots behind** the current mainnet tip, it is
too old. Stop the validator, download a fresh snapshot, and restart. Do NOT let it
try to catch up from an old snapshot — it will take too long and may never converge.
Check with:
```
# Snapshot slot (from filename)
ls /srv/kind/solana/snapshots/snapshot-*.tar.*
# Current mainnet slot
curl -s -X POST -H "Content-Type: application/json" \
-d '{"jsonrpc":"2.0","id":1,"method":"getSlot","params":[{"commitment":"finalized"}]}' \
https://api.mainnet-beta.solana.com
```
### Snapshot Leapfrog Recovery
When the validator is stuck in a repair-dependent gap (incomplete shreds from a
relay outage or insufficient turbine coverage), "grinding through" doesn't work.
At 0.4 slots/sec replay through incomplete blocks vs 2.5 slots/sec chain
production, the gap grows faster than it shrinks.
**Strategy**: Download a fresh snapshot whose slot lands *past* the incomplete zone,
into the range where turbine+relay shreds are accumulating in the blockstore.
**Keep the existing ledger** — it has those shreds. The validator replays from
local blockstore data instead of waiting on repair.
**Steps**:
1. Let the validator run — turbine+relay accumulate shreds at the tip
2. Monitor shred completeness at the tip:
`scripts/check-shred-completeness.sh 500`
3. When there's a contiguous run of complete blocks (>100 slots), note the
starting slot of that run
4. Scale to 0, wipe accounts (ramdisk), wipe old snapshots
5. **Do NOT wipe ledger** — it has the turbine shreds
6. Download a fresh snapshot (its slot should be within the complete run)
7. Scale to 1 — validator replays from local blockstore at 3-5 slots/sec
**Why this works**: Turbine delivers ~60% of shreds in real-time. Repair fills
the rest for recent slots quickly (peers prioritize recent data). The only
problem is repair for *old* slots (minutes/hours behind) which peers deprioritize.
By snapshotting past the gap, we skip the old-slot repair bottleneck entirely.
### Shred Relay (Ashburn)
The TVU shred relay from laconic-was-sw01 provides ~4,000 additional shreds/sec.
Without it, turbine alone delivers ~60% of blocks. With it, completeness improves
but still requires repair for full coverage.
**Current state**: Old pipeline (monitor session + socat + shred-unwrap.py).
The traffic-policy redirect was never committed (auto-revert after 5 min timer).
See `docs/tvu-shred-relay.md` for the traffic-policy config that needs to be
properly applied.
**Boot dependency**: `shred-unwrap.py` must be running on biscayne for the old
pipeline to work. It is NOT persistent across reboots. The iptables DNAT rule
for the new pipeline IS persistent (iptables-persistent installed).
### Redeploy Flow
See `playbooks/biscayne-redeploy.yml`. The scale-to-0 pattern is required because
`laconic-so` creates the cluster and deploys the pod atomically:
1. Delete namespace (teardown)
2. Optionally wipe data
3. `laconic-so deployment start` (creates cluster + pod)
4. Immediately scale to 0
5. Download snapshot via aria2c
6. Scale to 1
7. Verify
### Design Principles
- **When something times out, that doesn't mean it needs a longer timeout — it means something that was expected never happened, not that we need to wait longer for it.**
- **NEVER change a timeout because you believe something was truncated — you don't understand timeouts; do not edit them unless explicitly told to by the user.**

661
LICENSE 100644
View File

@ -0,0 +1,661 @@
GNU AFFERO GENERAL PUBLIC LICENSE
Version 3, 19 November 2007
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The GNU Affero General Public License is a free, copyleft license for
software and other kinds of works, specifically designed to ensure
cooperation with the community in the case of network server software.
The licenses for most software and other practical works are designed
to take away your freedom to share and change the works. By contrast,
our General Public Licenses are intended to guarantee your freedom to
share and change all versions of a program--to make sure it remains free
software for all its users.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
them if you wish), that you receive source code or can get it if you
want it, that you can change the software or use pieces of it in new
free programs, and that you know you can do these things.
Developers that use our General Public Licenses protect your rights
with two steps: (1) assert copyright on the software, and (2) offer
you this License which gives you legal permission to copy, distribute
and/or modify the software.
A secondary benefit of defending all users' freedom is that
improvements made in alternate versions of the program, if they
receive widespread use, become available for other developers to
incorporate. Many developers of free software are heartened and
encouraged by the resulting cooperation. However, in the case of
software used on network servers, this result may fail to come about.
The GNU General Public License permits making a modified version and
letting the public access it on a server without ever releasing its
source code to the public.
The GNU Affero General Public License is designed specifically to
ensure that, in such cases, the modified source code becomes available
to the community. It requires the operator of a network server to
provide the source code of the modified version running there to the
users of that server. Therefore, public use of a modified version, on
a publicly accessible server, gives the public access to the source
code of the modified version.
An older license, called the Affero General Public License and
published by Affero, was designed to accomplish similar goals. This is
a different license, not a version of the Affero GPL, but Affero has
released a new version of the Affero GPL which permits relicensing under
this license.
The precise terms and conditions for copying, distribution and
modification follow.
TERMS AND CONDITIONS
0. Definitions.
"This License" refers to version 3 of the GNU Affero General Public License.
"Copyright" also means copyright-like laws that apply to other kinds of
works, such as semiconductor masks.
"The Program" refers to any copyrightable work licensed under this
License. Each licensee is addressed as "you". "Licensees" and
"recipients" may be individuals or organizations.
To "modify" a work means to copy from or adapt all or part of the work
in a fashion requiring copyright permission, other than the making of an
exact copy. The resulting work is called a "modified version" of the
earlier work or a work "based on" the earlier work.
A "covered work" means either the unmodified Program or a work based
on the Program.
To "propagate" a work means to do anything with it that, without
permission, would make you directly or secondarily liable for
infringement under applicable copyright law, except executing it on a
computer or modifying a private copy. Propagation includes copying,
distribution (with or without modification), making available to the
public, and in some countries other activities as well.
To "convey" a work means any kind of propagation that enables other
parties to make or receive copies. Mere interaction with a user through
a computer network, with no transfer of a copy, is not conveying.
An interactive user interface displays "Appropriate Legal Notices"
to the extent that it includes a convenient and prominently visible
feature that (1) displays an appropriate copyright notice, and (2)
tells the user that there is no warranty for the work (except to the
extent that warranties are provided), that licensees may convey the
work under this License, and how to view a copy of this License. If
the interface presents a list of user commands or options, such as a
menu, a prominent item in the list meets this criterion.
1. Source Code.
The "source code" for a work means the preferred form of the work
for making modifications to it. "Object code" means any non-source
form of a work.
A "Standard Interface" means an interface that either is an official
standard defined by a recognized standards body, or, in the case of
interfaces specified for a particular programming language, one that
is widely used among developers working in that language.
The "System Libraries" of an executable work include anything, other
than the work as a whole, that (a) is included in the normal form of
packaging a Major Component, but which is not part of that Major
Component, and (b) serves only to enable use of the work with that
Major Component, or to implement a Standard Interface for which an
implementation is available to the public in source code form. A
"Major Component", in this context, means a major essential component
(kernel, window system, and so on) of the specific operating system
(if any) on which the executable work runs, or a compiler used to
produce the work, or an object code interpreter used to run it.
The "Corresponding Source" for a work in object code form means all
the source code needed to generate, install, and (for an executable
work) run the object code and to modify the work, including scripts to
control those activities. However, it does not include the work's
System Libraries, or general-purpose tools or generally available free
programs which are used unmodified in performing those activities but
which are not part of the work. For example, Corresponding Source
includes interface definition files associated with source files for
the work, and the source code for shared libraries and dynamically
linked subprograms that the work is specifically designed to require,
such as by intimate data communication or control flow between those
subprograms and other parts of the work.
The Corresponding Source need not include anything that users
can regenerate automatically from other parts of the Corresponding
Source.
The Corresponding Source for a work in source code form is that
same work.
2. Basic Permissions.
All rights granted under this License are granted for the term of
copyright on the Program, and are irrevocable provided the stated
conditions are met. This License explicitly affirms your unlimited
permission to run the unmodified Program. The output from running a
covered work is covered by this License only if the output, given its
content, constitutes a covered work. This License acknowledges your
rights of fair use or other equivalent, as provided by copyright law.
You may make, run and propagate covered works that you do not
convey, without conditions so long as your license otherwise remains
in force. You may convey covered works to others for the sole purpose
of having them make modifications exclusively for you, or provide you
with facilities for running those works, provided that you comply with
the terms of this License in conveying all material for which you do
not control copyright. Those thus making or running the covered works
for you must do so exclusively on your behalf, under your direction
and control, on terms that prohibit them from making any copies of
your copyrighted material outside their relationship with you.
Conveying under any other circumstances is permitted solely under
the conditions stated below. Sublicensing is not allowed; section 10
makes it unnecessary.
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
No covered work shall be deemed part of an effective technological
measure under any applicable law fulfilling obligations under article
11 of the WIPO copyright treaty adopted on 20 December 1996, or
similar laws prohibiting or restricting circumvention of such
measures.
When you convey a covered work, you waive any legal power to forbid
circumvention of technological measures to the extent such circumvention
is effected by exercising rights under this License with respect to
the covered work, and you disclaim any intention to limit operation or
modification of the work as a means of enforcing, against the work's
users, your or third parties' legal rights to forbid circumvention of
technological measures.
4. Conveying Verbatim Copies.
You may convey verbatim copies of the Program's source code as you
receive it, in any medium, provided that you conspicuously and
appropriately publish on each copy an appropriate copyright notice;
keep intact all notices stating that this License and any
non-permissive terms added in accord with section 7 apply to the code;
keep intact all notices of the absence of any warranty; and give all
recipients a copy of this License along with the Program.
You may charge any price or no price for each copy that you convey,
and you may offer support or warranty protection for a fee.
5. Conveying Modified Source Versions.
You may convey a work based on the Program, or the modifications to
produce it from the Program, in the form of source code under the
terms of section 4, provided that you also meet all of these conditions:
a) The work must carry prominent notices stating that you modified
it, and giving a relevant date.
b) The work must carry prominent notices stating that it is
released under this License and any conditions added under section
7. This requirement modifies the requirement in section 4 to
"keep intact all notices".
c) You must license the entire work, as a whole, under this
License to anyone who comes into possession of a copy. This
License will therefore apply, along with any applicable section 7
additional terms, to the whole of the work, and all its parts,
regardless of how they are packaged. This License gives no
permission to license the work in any other way, but it does not
invalidate such permission if you have separately received it.
d) If the work has interactive user interfaces, each must display
Appropriate Legal Notices; however, if the Program has interactive
interfaces that do not display Appropriate Legal Notices, your
work need not make them do so.
A compilation of a covered work with other separate and independent
works, which are not by their nature extensions of the covered work,
and which are not combined with it such as to form a larger program,
in or on a volume of a storage or distribution medium, is called an
"aggregate" if the compilation and its resulting copyright are not
used to limit the access or legal rights of the compilation's users
beyond what the individual works permit. Inclusion of a covered work
in an aggregate does not cause this License to apply to the other
parts of the aggregate.
6. Conveying Non-Source Forms.
You may convey a covered work in object code form under the terms
of sections 4 and 5, provided that you also convey the
machine-readable Corresponding Source under the terms of this License,
in one of these ways:
a) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by the
Corresponding Source fixed on a durable physical medium
customarily used for software interchange.
b) Convey the object code in, or embodied in, a physical product
(including a physical distribution medium), accompanied by a
written offer, valid for at least three years and valid for as
long as you offer spare parts or customer support for that product
model, to give anyone who possesses the object code either (1) a
copy of the Corresponding Source for all the software in the
product that is covered by this License, on a durable physical
medium customarily used for software interchange, for a price no
more than your reasonable cost of physically performing this
conveying of source, or (2) access to copy the
Corresponding Source from a network server at no charge.
c) Convey individual copies of the object code with a copy of the
written offer to provide the Corresponding Source. This
alternative is allowed only occasionally and noncommercially, and
only if you received the object code with such an offer, in accord
with subsection 6b.
d) Convey the object code by offering access from a designated
place (gratis or for a charge), and offer equivalent access to the
Corresponding Source in the same way through the same place at no
further charge. You need not require recipients to copy the
Corresponding Source along with the object code. If the place to
copy the object code is a network server, the Corresponding Source
may be on a different server (operated by you or a third party)
that supports equivalent copying facilities, provided you maintain
clear directions next to the object code saying where to find the
Corresponding Source. Regardless of what server hosts the
Corresponding Source, you remain obligated to ensure that it is
available for as long as needed to satisfy these requirements.
e) Convey the object code using peer-to-peer transmission, provided
you inform other peers where the object code and Corresponding
Source of the work are being offered to the general public at no
charge under subsection 6d.
A separable portion of the object code, whose source code is excluded
from the Corresponding Source as a System Library, need not be
included in conveying the object code work.
A "User Product" is either (1) a "consumer product", which means any
tangible personal property which is normally used for personal, family,
or household purposes, or (2) anything designed or sold for incorporation
into a dwelling. In determining whether a product is a consumer product,
doubtful cases shall be resolved in favor of coverage. For a particular
product received by a particular user, "normally used" refers to a
typical or common use of that class of product, regardless of the status
of the particular user or of the way in which the particular user
actually uses, or expects or is expected to use, the product. A product
is a consumer product regardless of whether the product has substantial
commercial, industrial or non-consumer uses, unless such uses represent
the only significant mode of use of the product.
"Installation Information" for a User Product means any methods,
procedures, authorization keys, or other information required to install
and execute modified versions of a covered work in that User Product from
a modified version of its Corresponding Source. The information must
suffice to ensure that the continued functioning of the modified object
code is in no case prevented or interfered with solely because
modification has been made.
If you convey an object code work under this section in, or with, or
specifically for use in, a User Product, and the conveying occurs as
part of a transaction in which the right of possession and use of the
User Product is transferred to the recipient in perpetuity or for a
fixed term (regardless of how the transaction is characterized), the
Corresponding Source conveyed under this section must be accompanied
by the Installation Information. But this requirement does not apply
if neither you nor any third party retains the ability to install
modified object code on the User Product (for example, the work has
been installed in ROM).
The requirement to provide Installation Information does not include a
requirement to continue to provide support service, warranty, or updates
for a work that has been modified or installed by the recipient, or for
the User Product in which it has been modified or installed. Access to a
network may be denied when the modification itself materially and
adversely affects the operation of the network or violates the rules and
protocols for communication across the network.
Corresponding Source conveyed, and Installation Information provided,
in accord with this section must be in a format that is publicly
documented (and with an implementation available to the public in
source code form), and must require no special password or key for
unpacking, reading or copying.
7. Additional Terms.
"Additional permissions" are terms that supplement the terms of this
License by making exceptions from one or more of its conditions.
Additional permissions that are applicable to the entire Program shall
be treated as though they were included in this License, to the extent
that they are valid under applicable law. If additional permissions
apply only to part of the Program, that part may be used separately
under those permissions, but the entire Program remains governed by
this License without regard to the additional permissions.
When you convey a copy of a covered work, you may at your option
remove any additional permissions from that copy, or from any part of
it. (Additional permissions may be written to require their own
removal in certain cases when you modify the work.) You may place
additional permissions on material, added by you to a covered work,
for which you have or can give appropriate copyright permission.
Notwithstanding any other provision of this License, for material you
add to a covered work, you may (if authorized by the copyright holders of
that material) supplement the terms of this License with terms:
a) Disclaiming warranty or limiting liability differently from the
terms of sections 15 and 16 of this License; or
b) Requiring preservation of specified reasonable legal notices or
author attributions in that material or in the Appropriate Legal
Notices displayed by works containing it; or
c) Prohibiting misrepresentation of the origin of that material, or
requiring that modified versions of such material be marked in
reasonable ways as different from the original version; or
d) Limiting the use for publicity purposes of names of licensors or
authors of the material; or
e) Declining to grant rights under trademark law for use of some
trade names, trademarks, or service marks; or
f) Requiring indemnification of licensors and authors of that
material by anyone who conveys the material (or modified versions of
it) with contractual assumptions of liability to the recipient, for
any liability that these contractual assumptions directly impose on
those licensors and authors.
All other non-permissive additional terms are considered "further
restrictions" within the meaning of section 10. If the Program as you
received it, or any part of it, contains a notice stating that it is
governed by this License along with a term that is a further
restriction, you may remove that term. If a license document contains
a further restriction but permits relicensing or conveying under this
License, you may add to a covered work material governed by the terms
of that license document, provided that the further restriction does
not survive such relicensing or conveying.
If you add terms to a covered work in accord with this section, you
must place, in the relevant source files, a statement of the
additional terms that apply to those files, or a notice indicating
where to find the applicable terms.
Additional terms, permissive or non-permissive, may be stated in the
form of a separately written license, or stated as exceptions;
the above requirements apply either way.
8. Termination.
You may not propagate or modify a covered work except as expressly
provided under this License. Any attempt otherwise to propagate or
modify it is void, and will automatically terminate your rights under
this License (including any patent licenses granted under the third
paragraph of section 11).
However, if you cease all violation of this License, then your
license from a particular copyright holder is reinstated (a)
provisionally, unless and until the copyright holder explicitly and
finally terminates your license, and (b) permanently, if the copyright
holder fails to notify you of the violation by some reasonable means
prior to 60 days after the cessation.
Moreover, your license from a particular copyright holder is
reinstated permanently if the copyright holder notifies you of the
violation by some reasonable means, this is the first time you have
received notice of violation of this License (for any work) from that
copyright holder, and you cure the violation prior to 30 days after
your receipt of the notice.
Termination of your rights under this section does not terminate the
licenses of parties who have received copies or rights from you under
this License. If your rights have been terminated and not permanently
reinstated, you do not qualify to receive new licenses for the same
material under section 10.
9. Acceptance Not Required for Having Copies.
You are not required to accept this License in order to receive or
run a copy of the Program. Ancillary propagation of a covered work
occurring solely as a consequence of using peer-to-peer transmission
to receive a copy likewise does not require acceptance. However,
nothing other than this License grants you permission to propagate or
modify any covered work. These actions infringe copyright if you do
not accept this License. Therefore, by modifying or propagating a
covered work, you indicate your acceptance of this License to do so.
10. Automatic Licensing of Downstream Recipients.
Each time you convey a covered work, the recipient automatically
receives a license from the original licensors, to run, modify and
propagate that work, subject to this License. You are not responsible
for enforcing compliance by third parties with this License.
An "entity transaction" is a transaction transferring control of an
organization, or substantially all assets of one, or subdividing an
organization, or merging organizations. If propagation of a covered
work results from an entity transaction, each party to that
transaction who receives a copy of the work also receives whatever
licenses to the work the party's predecessor in interest had or could
give under the previous paragraph, plus a right to possession of the
Corresponding Source of the work from the predecessor in interest, if
the predecessor has it or can get it with reasonable efforts.
You may not impose any further restrictions on the exercise of the
rights granted or affirmed under this License. For example, you may
not impose a license fee, royalty, or other charge for exercise of
rights granted under this License, and you may not initiate litigation
(including a cross-claim or counterclaim in a lawsuit) alleging that
any patent claim is infringed by making, using, selling, offering for
sale, or importing the Program or any portion of it.
11. Patents.
A "contributor" is a copyright holder who authorizes use under this
License of the Program or a work on which the Program is based. The
work thus licensed is called the contributor's "contributor version".
A contributor's "essential patent claims" are all patent claims
owned or controlled by the contributor, whether already acquired or
hereafter acquired, that would be infringed by some manner, permitted
by this License, of making, using, or selling its contributor version,
but do not include claims that would be infringed only as a
consequence of further modification of the contributor version. For
purposes of this definition, "control" includes the right to grant
patent sublicenses in a manner consistent with the requirements of
this License.
Each contributor grants you a non-exclusive, worldwide, royalty-free
patent license under the contributor's essential patent claims, to
make, use, sell, offer for sale, import and otherwise run, modify and
propagate the contents of its contributor version.
In the following three paragraphs, a "patent license" is any express
agreement or commitment, however denominated, not to enforce a patent
(such as an express permission to practice a patent or covenant not to
sue for patent infringement). To "grant" such a patent license to a
party means to make such an agreement or commitment not to enforce a
patent against the party.
If you convey a covered work, knowingly relying on a patent license,
and the Corresponding Source of the work is not available for anyone
to copy, free of charge and under the terms of this License, through a
publicly available network server or other readily accessible means,
then you must either (1) cause the Corresponding Source to be so
available, or (2) arrange to deprive yourself of the benefit of the
patent license for this particular work, or (3) arrange, in a manner
consistent with the requirements of this License, to extend the patent
license to downstream recipients. "Knowingly relying" means you have
actual knowledge that, but for the patent license, your conveying the
covered work in a country, or your recipient's use of the covered work
in a country, would infringe one or more identifiable patents in that
country that you have reason to believe are valid.
If, pursuant to or in connection with a single transaction or
arrangement, you convey, or propagate by procuring conveyance of, a
covered work, and grant a patent license to some of the parties
receiving the covered work authorizing them to use, propagate, modify
or convey a specific copy of the covered work, then the patent license
you grant is automatically extended to all recipients of the covered
work and works based on it.
A patent license is "discriminatory" if it does not include within
the scope of its coverage, prohibits the exercise of, or is
conditioned on the non-exercise of one or more of the rights that are
specifically granted under this License. You may not convey a covered
work if you are a party to an arrangement with a third party that is
in the business of distributing software, under which you make payment
to the third party based on the extent of your activity of conveying
the work, and under which the third party grants, to any of the
parties who would receive the covered work from you, a discriminatory
patent license (a) in connection with copies of the covered work
conveyed by you (or copies made from those copies), or (b) primarily
for and in connection with specific products or compilations that
contain the covered work, unless you entered into that arrangement,
or that patent license was granted, prior to 28 March 2007.
Nothing in this License shall be construed as excluding or limiting
any implied license or other defenses to infringement that may
otherwise be available to you under applicable patent law.
12. No Surrender of Others' Freedom.
If conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot convey a
covered work so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you may
not convey it at all. For example, if you agree to terms that obligate you
to collect a royalty for further conveying from those to whom you convey
the Program, the only way you could satisfy both those terms and this
License would be to refrain entirely from conveying the Program.
13. Remote Network Interaction; Use with the GNU General Public License.
Notwithstanding any other provision of this License, if you modify the
Program, your modified version must prominently offer all users
interacting with it remotely through a computer network (if your version
supports such interaction) an opportunity to receive the Corresponding
Source of your version by providing access to the Corresponding Source
from a network server at no charge, through some standard or customary
means of facilitating copying of software. This Corresponding Source
shall include the Corresponding Source for any work covered by version 3
of the GNU General Public License that is incorporated pursuant to the
following paragraph.
Notwithstanding any other provision of this License, you have
permission to link or combine any covered work with a work licensed
under version 3 of the GNU General Public License into a single
combined work, and to convey the resulting work. The terms of this
License will continue to apply to the part which is the covered work,
but the work with which it is combined will remain governed by version
3 of the GNU General Public License.
14. Revised Versions of this License.
The Free Software Foundation may publish revised and/or new versions of
the GNU Affero General Public License from time to time. Such new versions
will be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the
Program specifies that a certain numbered version of the GNU Affero General
Public License "or any later version" applies to it, you have the
option of following the terms and conditions either of that numbered
version or of any later version published by the Free Software
Foundation. If the Program does not specify a version number of the
GNU Affero General Public License, you may choose any version ever published
by the Free Software Foundation.
If the Program specifies that a proxy can decide which future
versions of the GNU Affero General Public License can be used, that proxy's
public statement of acceptance of a version permanently authorizes you
to choose that version for the Program.
Later license versions may give you additional or different
permissions. However, no additional obligations are imposed on any
author or copyright holder as a result of your choosing to follow a
later version.
15. Disclaimer of Warranty.
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
16. Limitation of Liability.
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
SUCH DAMAGES.
17. Interpretation of Sections 15 and 16.
If the disclaimer of warranty and limitation of liability provided
above cannot be given local legal effect according to their terms,
reviewing courts shall apply local law that most closely approximates
an absolute waiver of all civil liability in connection with the
Program, unless a warranty or assumption of liability accompanies a
copy of the Program in return for a fee.
END OF TERMS AND CONDITIONS
How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
state the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) <year> <name of author>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
Also add information on how to contact you by electronic and paper mail.
If your software can interact with users remotely through a computer
network, you should also make sure that it provides a way for users to
get its source. For example, if your program is a web application, its
interface could display a "Source" link that leads users to an archive
of the code. There are many ways you could offer source, and different
solutions will be better for different programs; see section 13 for the
specific requirements.
You should also get your employer (if you work as a programmer) or school,
if any, to sign a "copyright disclaimer" for the program, if necessary.
For more information on this, and how to apply and follow the GNU AGPL, see
<http://www.gnu.org/licenses/>.

1
MANIFEST.in 100644
View File

@ -0,0 +1 @@
include LICENSE

134
README.md
View File

@ -1,3 +1,133 @@
# biscayne-agave-runbook
# Stack Orchestrator
Ansible playbooks for operating the kind-based agave-stack deployment on biscayne.vaasl.io.
Stack Orchestrator allows building and deployment of a Laconic Stack on a single machine with minimal prerequisites. It is a Python3 CLI tool that runs on any OS with Python3 and Docker. The following diagram summarizes the relevant repositories in the Laconic Stack - and the relationship to Stack Orchestrator.
![The Stack](/docs/images/laconic-stack.png)
## Install
**To get started quickly** on a fresh Ubuntu instance (e.g, Digital Ocean); [try this script](./scripts/quick-install-linux.sh). **WARNING:** always review scripts prior to running them so that you know what is happening on your machine.
For any other installation, follow along below and **adapt these instructions based on the specifics of your system.**
Ensure that the following are already installed:
- [Python3](https://wiki.python.org/moin/BeginnersGuide/Download): `python3 --version` >= `3.8.10` (the Python3 shipped in Ubuntu 20+ is good to go)
- [Docker](https://docs.docker.com/get-docker/): `docker --version` >= `20.10.21`
- [jq](https://stedolan.github.io/jq/download/): `jq --version` >= `1.5`
- [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git): `git --version` >= `2.10.3`
Note: if installing docker-compose via package manager on Linux (as opposed to Docker Desktop), you must [install the plugin](https://docs.docker.com/compose/install/linux/#install-the-plugin-manually), e.g.:
```bash
mkdir -p ~/.docker/cli-plugins
curl -SL https://github.com/docker/compose/releases/download/v2.11.2/docker-compose-linux-x86_64 -o ~/.docker/cli-plugins/docker-compose
chmod +x ~/.docker/cli-plugins/docker-compose
```
Next decide on a directory where you would like to put the stack-orchestrator program. Typically this would be
a "user" binary directory such as `~/bin` or perhaps `/usr/local/laconic` or possibly just the current working directory.
Now, having selected that directory, download the latest release from [this page](https://git.vdb.to/cerc-io/stack-orchestrator/tags) into it (we're using `~/bin` below for concreteness but edit to suit if you selected a different directory). Also be sure that the destination directory exists and is writable:
```bash
curl -L -o ~/bin/laconic-so https://git.vdb.to/cerc-io/stack-orchestrator/releases/download/latest/laconic-so
```
Give it execute permissions:
```bash
chmod +x ~/bin/laconic-so
```
Ensure `laconic-so` is on the [`PATH`](https://unix.stackexchange.com/a/26059)
Verify operation (your version will probably be different, just check here that you see some version output and not an error):
```
laconic-so version
Version: 1.1.0-7a607c2-202304260513
```
Save the distribution url to `~/.laconic-so/config.yml`:
```bash
mkdir ~/.laconic-so
echo "distribution-url: https://git.vdb.to/cerc-io/stack-orchestrator/releases/download/latest/laconic-so" > ~/.laconic-so/config.yml
```
### Update
If Stack Orchestrator was installed using the process described above, it is able to subsequently self-update to the current latest version by running:
```bash
laconic-so update
```
## Usage
The various [stacks](/stack_orchestrator/data/stacks) each contain instructions for running different stacks based on your use case. For example:
- [self-hosted Gitea](/stack_orchestrator/data/stacks/build-support)
- [an Optimism Fixturenet](/stack_orchestrator/data/stacks/fixturenet-optimism)
- [laconicd with console and CLI](stack_orchestrator/data/stacks/fixturenet-laconic-loaded)
- [kubo (IPFS)](stack_orchestrator/data/stacks/kubo)
## Deployment Types
- **compose**: Docker Compose on local machine
- **k8s**: External Kubernetes cluster (requires kubeconfig)
- **k8s-kind**: Local Kubernetes via Kind - one cluster per host, shared by all deployments
## External Stacks
Stacks can live in external git repositories. Required structure:
```
<repo>/
stack_orchestrator/data/
stacks/<stack-name>/stack.yml
compose/docker-compose-<pod-name>.yml
deployment/spec.yml
```
## Deployment Commands
```bash
# Create deployment from spec
laconic-so --stack <path> deploy create --spec-file <spec.yml> --deployment-dir <dir>
# Start (creates cluster on first run)
laconic-so deployment --dir <dir> start
# GitOps restart (git pull + redeploy, preserves data)
laconic-so deployment --dir <dir> restart
# Stop
laconic-so deployment --dir <dir> stop
```
## spec.yml Reference
```yaml
stack: stack-name-or-path
deploy-to: k8s-kind
network:
http-proxy:
- host-name: app.example.com
routes:
- path: /
proxy-to: service-name:port
acme-email: admin@example.com
config:
ENV_VAR: value
SECRET_VAR: $generate:hex:32$ # Auto-generated, stored in K8s Secret
volumes:
volume-name:
```
## Contributing
See the [CONTRIBUTING.md](/docs/CONTRIBUTING.md) for developer mode install.
## Platform Support
Native aarch64 is _not_ currently supported. x64 emulation on ARM64 macOS should work (not yet tested).

View File

@ -0,0 +1,413 @@
# Implementing `laconic-so create-stack` Command
A plan for adding a new CLI command to scaffold stack files automatically.
---
## Overview
Add a `create-stack` command that generates all required files for a new stack:
```bash
laconic-so create-stack --name my-stack --type webapp
```
**Output:**
```
stack_orchestrator/data/
├── stacks/my-stack/stack.yml
├── container-build/cerc-my-stack/
│ ├── Dockerfile
│ └── build.sh
└── compose/docker-compose-my-stack.yml
Updated: repository-list.txt, container-image-list.txt, pod-list.txt
```
---
## CLI Architecture Summary
### Command Registration Pattern
Commands are Click functions registered in `main.py`:
```python
# main.py (line ~70)
from stack_orchestrator.create import create_stack
cli.add_command(create_stack.command, "create-stack")
```
### Global Options Access
```python
from stack_orchestrator.opts import opts
if not opts.o.quiet:
print("message")
if opts.o.dry_run:
print("(would create files)")
```
### Key Utilities
| Function | Location | Purpose |
|----------|----------|---------|
| `get_yaml()` | `util.py` | YAML parser (ruamel.yaml) |
| `get_stack_path(stack)` | `util.py` | Resolve stack directory path |
| `error_exit(msg)` | `util.py` | Print error and exit(1) |
---
## Files to Create
### 1. Command Module
**`stack_orchestrator/create/__init__.py`**
```python
# Empty file to make this a package
```
**`stack_orchestrator/create/create_stack.py`**
```python
import click
import os
from pathlib import Path
from shutil import copy
from stack_orchestrator.opts import opts
from stack_orchestrator.util import error_exit, get_yaml
# Template types
# Maps the --type CLI choice to the defaults used when generating the
# Dockerfile and compose file for a new stack.
# NOTE(review): "empty" stores explicit None values; the generators read
# these with dict.get(key, default), which returns None (not the default)
# when the key is present — confirm the intended behavior for "empty".
STACK_TEMPLATES = {
    "webapp": {
        "description": "Web application with Node.js",
        "base_image": "node:20-bullseye-slim",
        "port": 3000,
    },
    "service": {
        "description": "Backend service",
        "base_image": "python:3.11-slim",
        "port": 8080,
    },
    "empty": {
        "description": "Minimal stack with no defaults",
        "base_image": None,
        "port": None,
    },
}
def get_data_dir() -> Path:
    """Return the absolute path of the bundled stack_orchestrator/data directory.

    Resolved relative to this module's location (create/../data), so it
    works regardless of the current working directory.
    """
    package_root = Path(__file__).absolute().parent.parent
    return package_root / "data"
def validate_stack_name(name: str) -> None:
    """Validate that a stack name follows naming conventions.

    Accepts lowercase alphanumeric names with interior hyphens and no
    leading or trailing hyphen; single-character names are allowed.
    Exits via error_exit() with a message on failure.

    :param name: proposed stack name from the --name option
    """
    import re
    # Bug fix: the original condition was
    #   `not re.match(...) and len(name) > 2`
    # which silently ACCEPTED invalid names of length <= 2 (e.g. "-a").
    # The trailing group is optional so one-character names remain valid.
    if not re.match(r'^[a-z0-9]([a-z0-9-]*[a-z0-9])?$', name):
        error_exit(f"Invalid stack name '{name}'. Use lowercase alphanumeric with hyphens.")
    # Container images are automatically prefixed with "cerc-"; a stack
    # already named "cerc-..." would yield doubled "cerc-cerc-..." names.
    if name.startswith("cerc-"):
        error_exit("Stack name should not start with 'cerc-' (container names will add this prefix)")
def create_stack_yml(stack_dir: Path, name: str, template: dict, repo_url: str) -> None:
    """Write stack_dir/stack.yml describing the new stack.

    :param stack_dir: stacks/<name> directory; created if missing
    :param name: stack name (also used for the container/pod entries)
    :param template: chosen template dict; "description" is read if present
    :param repo_url: source repository; omitted from "repos" when falsy
    """
    stack_dir.mkdir(parents=True, exist_ok=True)
    stack_definition = {
        "version": "1.2",
        "name": name,
        "description": template.get("description", f"Stack: {name}"),
        "repos": [repo_url] if repo_url else [],
        "containers": [f"cerc/{name}"],
        "pods": [name],
    }
    with open(stack_dir / "stack.yml", "w") as out:
        get_yaml().dump(stack_definition, out)
def create_dockerfile(container_dir: Path, name: str, template: dict) -> None:
    """Write a two-stage Node-style Dockerfile into container_dir.

    :param container_dir: container-build/cerc-<name> directory; created if missing
    :param name: stack name (kept for interface parity; not used in the file body)
    :param template: template dict; "base_image" and "port" fall back to
        node:20-bullseye-slim / 3000 when missing OR explicitly None
    """
    # Bug fix: dict.get(key, default) returns None when the key is present
    # with value None (the "empty" template), which produced "FROM None"
    # and "EXPOSE None". Use `or` so falsy values also take the default.
    base_image = template.get("base_image") or "node:20-bullseye-slim"
    port = template.get("port") or 3000
    dockerfile_content = f'''# Build stage
FROM {base_image} AS builder
WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build
# Production stage
FROM {base_image}
WORKDIR /app
COPY package*.json ./
RUN npm ci --only=production
COPY --from=builder /app/dist ./dist
EXPOSE {port}
CMD ["npm", "run", "start"]
'''
    container_dir.mkdir(parents=True, exist_ok=True)
    with open(container_dir / "Dockerfile", "w") as f:
        f.write(dockerfile_content)
def create_build_script(container_dir: Path, name: str) -> None:
    """Write an executable build.sh that builds the cerc/<name> image.

    The script follows the laconic-so container-build convention:
    it sources build-base.sh and uses CERC_REPO_BASE_DIR for context.

    :param container_dir: existing container-build/cerc-<name> directory
    :param name: stack name used for the image tag and repo path
    """
    script_text = f'''#!/usr/bin/env bash
# Build cerc/{name}
source ${{CERC_CONTAINER_BASE_DIR}}/build-base.sh
SCRIPT_DIR=$( cd -- "$( dirname -- "${{BASH_SOURCE[0]}}" )" &> /dev/null && pwd )
docker build -t cerc/{name}:local \\
-f ${{SCRIPT_DIR}}/Dockerfile \\
${{build_command_args}} \\
${{CERC_REPO_BASE_DIR}}/{name}
'''
    script_path = container_dir / "build.sh"
    script_path.write_text(script_text)
    # rwxr-xr-x so laconic-so can invoke the script directly
    script_path.chmod(0o755)
def create_compose_file(compose_dir: Path, name: str, template: dict) -> None:
    """Write compose_dir/docker-compose-<name>.yml for the new stack.

    :param compose_dir: existing data/compose directory
    :param name: stack name; used for the service name and image tag
    :param template: template dict; "port" falls back to 3000 when
        missing OR explicitly None
    """
    # Bug fix: dict.get(key, default) returns None for the "empty"
    # template (key present with value None), which produced a port
    # mapping of "${HOST_PORT:-None}:None". Use `or` for the fallback.
    port = template.get("port") or 3000
    service = {
        "image": f"cerc/{name}:local",
        "restart": "unless-stopped",
        # HOST_PORT env var overrides the host-side port at deploy time
        "ports": [f"${{HOST_PORT:-{port}}}:{port}"],
        "environment": {
            "NODE_ENV": "${NODE_ENV:-production}",
        },
    }
    compose_content = {
        "version": "3.8",
        "services": {name: service},
    }
    with open(compose_dir / f"docker-compose-{name}.yml", "w") as f:
        get_yaml().dump(compose_content, f)
def update_list_file(data_dir: Path, filename: str, entry: str) -> None:
    """Append entry to data_dir/filename unless an identical
    (whitespace-trimmed) line is already present.

    Creates the file on first append; existing lines are never rewritten,
    so the operation is idempotent.

    :param data_dir: directory containing the list file
    :param filename: e.g. "pod-list.txt"
    :param entry: line to append (newline added automatically)
    """
    target = data_dir / filename
    current = set()
    if target.exists():
        current = {ln.strip() for ln in target.read_text().splitlines() if ln.strip()}
    if entry in current:
        return
    with open(target, "a") as fh:
        fh.write(entry + "\n")
# Click entry point registered in main.py as `laconic-so create-stack`.
@click.command()
@click.option("--name", required=True, help="Name of the new stack (lowercase, hyphens)")
@click.option("--type", "stack_type", default="webapp",
              type=click.Choice(list(STACK_TEMPLATES.keys())),
              help="Stack template type")
@click.option("--repo", help="Git repository URL (e.g., github.com/org/repo)")
@click.option("--force", is_flag=True, help="Overwrite existing files")
@click.pass_context
def command(ctx, name: str, stack_type: str, repo: str, force: bool):
    """Create a new stack with all required files.
    Examples:
    laconic-so create-stack --name my-app --type webapp
    laconic-so create-stack --name my-service --type service --repo github.com/org/repo
    """
    # Validate
    # Exits via error_exit() if the name is malformed or cerc- prefixed.
    validate_stack_name(name)
    template = STACK_TEMPLATES[stack_type]
    data_dir = get_data_dir()
    # Define paths
    stack_dir = data_dir / "stacks" / name
    container_dir = data_dir / "container-build" / f"cerc-{name}"
    compose_dir = data_dir / "compose"
    # Check for existing files
    # Refuse to clobber an existing stack unless --force was given.
    if not force:
        if stack_dir.exists():
            error_exit(f"Stack already exists: {stack_dir}\nUse --force to overwrite")
        if container_dir.exists():
            error_exit(f"Container build dir exists: {container_dir}\nUse --force to overwrite")
    # Dry run check
    # With --dry-run, report what would be written and exit without side effects.
    if opts.o.dry_run:
        print(f"Would create stack '{name}' with template '{stack_type}':")
        print(f" - {stack_dir}/stack.yml")
        print(f" - {container_dir}/Dockerfile")
        print(f" - {container_dir}/build.sh")
        print(f" - {compose_dir}/docker-compose-{name}.yml")
        print(f" - Update repository-list.txt")
        print(f" - Update container-image-list.txt")
        print(f" - Update pod-list.txt")
        return
    # Create files
    if not opts.o.quiet:
        print(f"Creating stack '{name}' with template '{stack_type}'...")
    create_stack_yml(stack_dir, name, template, repo)
    if opts.o.verbose:
        print(f" Created {stack_dir}/stack.yml")
    create_dockerfile(container_dir, name, template)
    if opts.o.verbose:
        print(f" Created {container_dir}/Dockerfile")
    create_build_script(container_dir, name)
    if opts.o.verbose:
        print(f" Created {container_dir}/build.sh")
    create_compose_file(compose_dir, name, template)
    if opts.o.verbose:
        print(f" Created {compose_dir}/docker-compose-{name}.yml")
    # Update list files
    # repository-list.txt only gains an entry when --repo was supplied.
    if repo:
        update_list_file(data_dir, "repository-list.txt", repo)
        if opts.o.verbose:
            print(f" Added {repo} to repository-list.txt")
    update_list_file(data_dir, "container-image-list.txt", f"cerc/{name}")
    if opts.o.verbose:
        print(f" Added cerc/{name} to container-image-list.txt")
    update_list_file(data_dir, "pod-list.txt", name)
    if opts.o.verbose:
        print(f" Added {name} to pod-list.txt")
    # Summary
    if not opts.o.quiet:
        print(f"\nStack '{name}' created successfully!")
        print(f"\nNext steps:")
        print(f" 1. Edit {stack_dir}/stack.yml")
        print(f" 2. Customize {container_dir}/Dockerfile")
        print(f" 3. Run: laconic-so --stack {name} build-containers")
        print(f" 4. Run: laconic-so --stack {name} deploy-system up")
```
### 2. Register Command in main.py
**Edit `stack_orchestrator/main.py`**
Add import:
```python
from stack_orchestrator.create import create_stack
```
Add command registration (after line ~78):
```python
cli.add_command(create_stack.command, "create-stack")
```
---
## Implementation Steps
### Step 1: Create module structure
```bash
mkdir -p stack_orchestrator/create
touch stack_orchestrator/create/__init__.py
```
### Step 2: Create the command file
Create `stack_orchestrator/create/create_stack.py` with the code above.
### Step 3: Register in main.py
Add the import and `cli.add_command()` line.
### Step 4: Test the command
```bash
# Show help
laconic-so create-stack --help
# Dry run
laconic-so --dry-run create-stack --name test-app --type webapp
# Create a stack
laconic-so create-stack --name test-app --type webapp --repo github.com/org/test-app
# Verify
ls -la stack_orchestrator/data/stacks/test-app/
cat stack_orchestrator/data/stacks/test-app/stack.yml
```
---
## Template Types
| Type | Base Image | Port | Use Case |
|------|------------|------|----------|
| `webapp` | node:20-bullseye-slim | 3000 | React/Vue/Next.js apps |
| `service` | python:3.11-slim | 8080 | Python backend services |
| `empty` | none | none | Custom from scratch |
---
## Future Enhancements
1. **Interactive mode** - Prompt for values if not provided
2. **More templates** - Go, Rust, database stacks
3. **Template from existing** - `--from-stack existing-stack`
4. **External stack support** - Create in custom directory
5. **Validation command** - `laconic-so validate-stack --name my-stack`
---
## Files Modified
| File | Change |
|------|--------|
| `stack_orchestrator/create/__init__.py` | New (empty) |
| `stack_orchestrator/create/create_stack.py` | New (command implementation) |
| `stack_orchestrator/main.py` | Add import and `cli.add_command()` |
---
## Verification
```bash
# 1. Command appears in help
laconic-so --help | grep create-stack
# 2. Dry run works
laconic-so --dry-run create-stack --name verify-test --type webapp
# 3. Full creation works
laconic-so create-stack --name verify-test --type webapp
ls stack_orchestrator/data/stacks/verify-test/
ls stack_orchestrator/data/container-build/cerc-verify-test/
ls stack_orchestrator/data/compose/docker-compose-verify-test.yml
# 4. Build works
laconic-so --stack verify-test build-containers
# 5. Cleanup
rm -rf stack_orchestrator/data/stacks/verify-test
rm -rf stack_orchestrator/data/container-build/cerc-verify-test
rm stack_orchestrator/data/compose/docker-compose-verify-test.yml
```

35
TODO.md 100644
View File

@ -0,0 +1,35 @@
# TODO
## Features Needed
### Update Stack Command
We need an "update stack" command in stack orchestrator and cleaner documentation regarding how to do continuous deployment with and without payments.
**Context**: Currently, `deploy init` generates a spec file and `deploy create` creates a deployment directory. The `deployment update` command (added by Thomas Lackey) only syncs env vars and restarts - it doesn't regenerate configurations. There's a gap in the workflow for updating stack configurations after initial deployment.
## Bugs
### `deploy create` doesn't auto-generate volume mappings for new pods
When a new pod is added to `stack.yml` (e.g. `monitoring`), `deploy create`
does not generate default host path mappings in spec.yml for the new pod's
volumes. The deployment then fails at scheduling because the PVCs don't exist.
**Expected**: `deploy create` enumerates all volumes from all compose files
in the stack and generates default host paths for any that aren't already
mapped in the spec.yml `volumes:` section.
**Actual**: Only volumes already in spec.yml get PVs. New volumes are silently
missing, causing `FailedScheduling: persistentvolumeclaim not found`.
**Workaround**: Manually add volume entries to spec.yml and create host dirs.
**Files**: `deployment_create.py` (`_write_config_file`, volume handling)
## Architecture Refactoring
### Separate Deployer from Stack Orchestrator CLI
The deployer logic should be decoupled from the CLI tool to allow independent development and reuse.
### Separate Stacks from Stack Orchestrator Repo
Stacks should live in their own repositories, not bundled with the orchestrator tool. This allows stacks to evolve independently and be maintained by different teams.

View File

@ -1,277 +0,0 @@
# agave-stack
Unified Agave/Jito Solana stack for [laconic-so](https://github.com/LaconicNetwork/stack-orchestrator). Deploys Solana validators, RPC nodes, and test validators as containers with optional [DoubleZero](https://doublezero.xyz) network routing.
## Modes
| Mode | Compose file | Use case |
|------|-------------|----------|
| `validator` | `docker-compose-agave.yml` | Voting validator (mainnet/testnet) |
| `rpc` | `docker-compose-agave-rpc.yml` | Non-voting RPC node |
| `test` | `docker-compose-agave-test.yml` | Local dev with instant finality |
Mode is selected via the `AGAVE_MODE` environment variable.
## Repository layout
```
agave-stack/
├── deployment/ # Reference deployment (biscayne)
│ ├── spec.yml # k8s-kind deployment spec
│ └── k8s-manifests/
│ └── doublezero-daemonset.yaml # DZ DaemonSet (hostNetwork)
├── stack-orchestrator/
│ ├── stacks/agave/
│ │ ├── stack.yml # laconic-so stack definition
│ │ └── README.md # Stack-level docs
│ ├── compose/
│ │ ├── docker-compose-agave.yml # Voting validator
│ │ ├── docker-compose-agave-rpc.yml # Non-voting RPC
│ │ ├── docker-compose-agave-test.yml # Test validator
│ │ └── docker-compose-doublezero.yml # DoubleZero daemon
│ ├── container-build/
│ │ ├── laconicnetwork-agave/ # Agave/Jito image
│ │ │ ├── Dockerfile # Two-stage build from source
│ │ │ ├── build.sh # laconic-so build script
│ │ │ ├── entrypoint.sh # Mode router
│ │ │ ├── start-validator.sh # Voting validator startup
│ │ │ ├── start-rpc.sh # RPC node startup
│ │ │ └── start-test.sh # Test validator + SPL setup
│ │ └── laconicnetwork-doublezero/ # DoubleZero image
│ │ ├── Dockerfile # Installs from Cloudsmith apt
│ │ ├── build.sh
│ │ └── entrypoint.sh
│ └── config/agave/
│ ├── restart-node.sh # Container restart helper
│       └── restart.cron # Cron schedule for periodic restarts
```
## Prerequisites
- [laconic-so](https://github.com/LaconicNetwork/stack-orchestrator) (stack orchestrator)
- Docker
- Kind (for k8s deployments)
## Building
```bash
# Vanilla Agave v3.1.9
laconic-so --stack agave build-containers
# Jito v3.1.8 (required for MEV)
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
AGAVE_VERSION=v3.1.8-jito \
laconic-so --stack agave build-containers
```
Build compiles from source (~30-60 min on first build). This produces both the `laconicnetwork/agave:local` and `laconicnetwork/doublezero:local` images.
## Deploying
### Test validator (local dev)
```bash
laconic-so --stack agave deploy init --output spec.yml
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-test
laconic-so deployment --dir my-test start
```
The test validator starts with instant finality and optionally creates SPL token mints and airdrops to configured pubkeys.
### Mainnet/testnet (Docker Compose)
```bash
laconic-so --stack agave deploy init --output spec.yml
# Edit spec.yml: set AGAVE_MODE, VALIDATOR_ENTRYPOINT, KNOWN_VALIDATOR, etc.
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-node
laconic-so deployment --dir my-node start
```
### Kind/k8s deployment
The `deployment/spec.yml` provides a reference spec targeting `k8s-kind`. The compose files use `network_mode: host` which works for Docker Compose and is silently ignored by laconic-so's k8s conversion (it uses explicit ports from the deployment spec instead).
```bash
laconic-so --stack agave deploy create \
--spec-file deployment/spec.yml \
--deployment-dir my-deployment
# Mount validator keypairs
cp validator-identity.json my-deployment/data/validator-config/
cp vote-account-keypair.json my-deployment/data/validator-config/ # validator mode only
laconic-so deployment --dir my-deployment start
```
## Configuration
### Common (all modes)
| Variable | Default | Description |
|----------|---------|-------------|
| `AGAVE_MODE` | `test` | `test`, `rpc`, or `validator` |
| `VALIDATOR_ENTRYPOINT` | *required* | Cluster entrypoint (host:port) |
| `KNOWN_VALIDATOR` | *required* | Known validator pubkey |
| `EXTRA_ENTRYPOINTS` | | Space-separated additional entrypoints |
| `EXTRA_KNOWN_VALIDATORS` | | Space-separated additional known validators |
| `RPC_PORT` | `8899` | RPC HTTP port |
| `RPC_BIND_ADDRESS` | `127.0.0.1` | RPC bind address |
| `GOSSIP_PORT` | `8001` | Gossip protocol port |
| `DYNAMIC_PORT_RANGE` | `8000-10000` | TPU/TVU/repair UDP port range |
| `LIMIT_LEDGER_SIZE` | `50000000` | Max ledger slots to retain |
| `SNAPSHOT_INTERVAL_SLOTS` | `1000` | Full snapshot interval |
| `MAXIMUM_SNAPSHOTS_TO_RETAIN` | `5` | Max full snapshots |
| `EXPECTED_GENESIS_HASH` | | Cluster genesis verification |
| `EXPECTED_SHRED_VERSION` | | Shred version verification |
| `RUST_LOG` | `info` | Log level |
| `SOLANA_METRICS_CONFIG` | | Metrics reporting config |
### Validator mode
| Variable | Default | Description |
|----------|---------|-------------|
| `VOTE_ACCOUNT_KEYPAIR` | `/data/config/vote-account-keypair.json` | Vote account keypair path |
Identity keypair must be mounted at `/data/config/validator-identity.json`.
### RPC mode
| Variable | Default | Description |
|----------|---------|-------------|
| `PUBLIC_RPC_ADDRESS` | | If set, advertise as public RPC |
| `ACCOUNT_INDEXES` | `program-id,spl-token-owner,spl-token-mint` | Account indexes for queries |
Identity is auto-generated if not mounted.
### Jito MEV (validator and RPC modes)
Set `JITO_ENABLE=true` and provide:
| Variable | Description |
|----------|-------------|
| `JITO_BLOCK_ENGINE_URL` | Block engine endpoint |
| `JITO_SHRED_RECEIVER_ADDR` | Shred receiver (region-specific) |
| `JITO_RELAYER_URL` | Relayer URL (validator mode) |
| `JITO_TIP_PAYMENT_PROGRAM` | Tip payment program pubkey |
| `JITO_DISTRIBUTION_PROGRAM` | Tip distribution program pubkey |
| `JITO_MERKLE_ROOT_AUTHORITY` | Merkle root upload authority |
| `JITO_COMMISSION_BPS` | Commission basis points |
Image must be built from `jito-foundation/jito-solana` for Jito flags to work.
### Test mode
| Variable | Default | Description |
|----------|---------|-------------|
| `FACILITATOR_PUBKEY` | | Pubkey to airdrop SOL |
| `SERVER_PUBKEY` | | Pubkey to airdrop SOL |
| `CLIENT_PUBKEY` | | Pubkey to airdrop SOL + create ATA |
| `MINT_DECIMALS` | `6` | SPL token decimals |
| `MINT_AMOUNT` | `1000000` | SPL tokens to mint |
## DoubleZero
[DoubleZero](https://doublezero.xyz) provides optimized network routing for Solana validators via GRE tunnels (IP protocol 47) and BGP (TCP/179) over link-local 169.254.0.0/16. Validator traffic to other DZ participants is routed through private fiber instead of the public internet.
### How it works
`doublezerod` creates a `doublezero0` GRE tunnel interface and runs BGP peering through it. Routes are injected into the host routing table, so the validator transparently sends traffic over the fiber backbone. IBRL mode falls back to public internet if DZ is down.
### Requirements
- Validator identity keypair at `/data/config/validator-identity.json`
- `privileged: true` + `NET_ADMIN` (GRE tunnel + route table manipulation)
- `hostNetwork: true` (GRE uses IP protocol 47 — cannot be port-mapped)
- Node registered with DoubleZero passport system
### Docker Compose
`docker-compose-doublezero.yml` runs alongside the validator with `network_mode: host`, sharing the `validator-config` volume for identity access.
### k8s
laconic-so does not pass `hostNetwork` through to generated k8s resources. DoubleZero runs as a DaemonSet applied after `deployment start`:
```bash
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
```
Since the validator pods share the node's network namespace, they automatically see the GRE routes injected by `doublezerod`.
| Variable | Default | Description |
|----------|---------|-------------|
| `VALIDATOR_IDENTITY_PATH` | `/data/config/validator-identity.json` | Validator identity keypair |
| `DOUBLEZERO_RPC_ENDPOINT` | `http://127.0.0.1:8899` | Solana RPC for DZ registration |
| `DOUBLEZERO_EXTRA_ARGS` | | Additional doublezerod arguments |
## Runtime requirements
The container requires the following (already set in compose files):
| Setting | Value | Why |
|---------|-------|-----|
| `privileged` | `true` | `mlock()` syscall and raw network access |
| `cap_add` | `IPC_LOCK` | Memory page locking for account indexes and ledger |
| `ulimits.memlock` | `-1` (unlimited) | Agave locks gigabytes of memory |
| `ulimits.nofile` | `1000000` | Gossip/TPU connections + memory-mapped ledger files |
| `network_mode` | `host` | Direct host network stack for gossip, TPU, UDP ranges |
Without these, Agave either refuses to start or dies under load.
## Container overhead
Containers with `privileged: true` and `network_mode: host` add **zero measurable overhead** vs bare metal. Linux containers are not VMs:
- **Network**: Host network namespace directly — no bridge, no NAT, no veth. Same kernel code path as bare metal.
- **CPU**: No hypervisor. Same physical cores, same scheduler priority.
- **Memory**: `IPC_LOCK` + unlimited memlock = identical `mlock()` behavior.
- **Disk I/O**: hostPath-backed PVs have identical I/O characteristics.
The only overhead is cgroup accounting (nanoseconds per syscall) and overlayfs for cold file opens (single-digit microseconds, zero once cached).
## Scheduled restarts
The `config/agave/restart.cron` defines periodic restarts to mitigate memory growth:
- **Validator**: every 4 hours
- **RPC**: every 6 hours (staggered 30 min offset)
Uses `restart-node.sh` which sends TERM to the matching container for graceful shutdown.
## Biscayne reference deployment
The `deployment/` directory contains a reference deployment for biscayne.vaasl.io (186.233.184.235), a mainnet voting validator with Jito MEV and DoubleZero:
```bash
# Build Jito image
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
AGAVE_VERSION=v3.1.8-jito \
laconic-so --stack agave build-containers
# Create deployment
laconic-so --stack agave deploy create \
--spec-file deployment/spec.yml \
--deployment-dir biscayne-deployment
# Mount keypairs
cp validator-identity.json biscayne-deployment/data/validator-config/
cp vote-account-keypair.json biscayne-deployment/data/validator-config/
# Start
laconic-so deployment --dir biscayne-deployment start
# Start DoubleZero
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
```
To run as non-voting RPC, change `AGAVE_MODE: rpc` in `deployment/spec.yml`.
## Volumes
| Volume | Mount | Content |
|--------|-------|---------|
| `validator-config` / `rpc-config` | `/data/config` | Identity keypairs, node config |
| `validator-ledger` / `rpc-ledger` | `/data/ledger` | Blockchain ledger data |
| `validator-accounts` / `rpc-accounts` | `/data/accounts` | Account state cache |
| `validator-snapshots` / `rpc-snapshots` | `/data/snapshots` | Full and incremental snapshots |
| `doublezero-config` | `~/.config/doublezero` | DZ identity and state |

View File

@ -1,198 +0,0 @@
# Work in Progress: Biscayne TVU Shred Relay
## Overview
Biscayne's agave validator was shred-starved (~1.7 slots/sec replay vs ~2.5 mainnet).
Root cause: not enough turbine shreds arriving. Solution: advertise a TVU address in
Ashburn (dense validator population, better turbine tree neighbors) and relay shreds
to biscayne in Miami over the laconic backbone.
### Architecture
```
Turbine peers (hundreds of validators)
|
v UDP shreds to port 20000
laconic-was-sw01 Et1/1 (64.92.84.81, Ashburn)
| ASIC receives on front-panel interface
| EOS monitor session mirrors matched packets to CPU
v
mirror0 interface (Linux userspace)
| socat reads raw frames, sends as UDP
v 172.16.1.188 -> 186.233.184.235:9100 (Et4/1 backbone, 25.4ms)
laconic-mia-sw01 Et4/1 (172.16.1.189, Miami)
| forwards via default route (Et1/1, same metro)
v 0.13ms
biscayne:9100 (186.233.184.235, Miami)
| shred-unwrap.py strips IP+UDP headers
v clean shred payload to localhost:9000
agave-validator TVU port
```
Total one-way relay latency: ~12.8ms
### Results
Before relay: ~1.7 slots/sec replay, falling behind ~0.8 slots/sec.
After relay: ~3.32 slots/sec replay, catching up ~0.82 slots/sec.
---
## Changes by Host
### laconic-was-sw01 (Ashburn) — `install@137.239.200.198`
All changes are ephemeral (not persisted, lost on reboot).
**1. EOS monitor session (running-config, not in startup-config)**
Mirrors inbound UDP port 20000 traffic on Et1/1 to a CPU-accessible `mirror0` interface.
Required because the Arista 7280CR3A ASIC handles front-panel traffic without punting to
Linux userspace — regular sockets cannot receive packets on front-panel IPs.
```
monitor session 1 source Ethernet1/1 rx
monitor session 1 ip access-group SHRED-RELAY
monitor session 1 destination Cpu
```
**2. EOS ACL (running-config, not in startup-config)**
```
ip access-list SHRED-RELAY
10 permit udp any any eq 20000
```
**3. EOS static route (running-config, not in startup-config)**
```
ip route 186.233.184.235/32 172.16.1.189
```
Routes biscayne traffic via Et4/1 backbone to laconic-mia-sw01 instead of the default
route (64.92.84.80, Cogent public internet).
**4. Linux kernel static route (ephemeral, `ip route add`)**
```
ip route add 186.233.184.235/32 via 172.16.1.189 dev et4_1
```
Required because socat runs in Linux userspace. The EOS static route programs the ASIC
but does not always sync to the Linux kernel routing table. Without this, socat's UDP
packets egress via the default route (et1_1, public internet).
**5. socat relay process (foreground, pts/5)**
```bash
sudo socat -u INTERFACE:mirror0,type=2 UDP-SENDTO:186.233.184.235:9100
```
Reads raw L2 frames from mirror0 (SOCK_DGRAM strips ethernet header, leaving IP+UDP+payload).
Sends each frame as a UDP datagram to biscayne:9100. Runs as root (raw socket access to mirror0).
PID: 27743 (child of sudo PID 27742)
---
### laconic-mia-sw01 (Miami) — `install@209.42.167.130`
**No changes made.** MIA already reaches biscayne at 0.13ms via its default route
(`209.42.167.132` on Et1/1, same metro). Relay traffic from WAS arrives on Et4/1
(`172.16.1.189`) and MIA forwards to `186.233.184.235` natively.
Key interfaces for reference:
- Et1/1: `209.42.167.133/31` (public uplink, default route via 209.42.167.132)
- Et4/1: `172.16.1.189/31` (backbone link to WAS, peer 172.16.1.188)
- Et8/1: `172.16.1.192/31` (another backbone link, not used for relay)
---
### biscayne (Miami) — `rix@biscayne.vaasl.io`
**1. Custom agave image: `laconicnetwork/agave:tvu-relay`**
Stock agave v3.1.9 with cherry-picked commit 9f4b3ae from anza master (adds
`--public-tvu-address` flag, from anza PR #6778). Built in `/tmp/agave-tvu-patch/`,
transferred via `docker save | scp | docker load | kind load docker-image`.
**2. K8s deployment changes**
Namespace: `laconic-laconic-70ce4c4b47e23b85`
Deployment: `laconic-70ce4c4b47e23b85-deployment`
Changes from previous deployment:
- Image: `laconicnetwork/agave:local` -> `laconicnetwork/agave:tvu-relay`
- Added env: `PUBLIC_TVU_ADDRESS=64.92.84.81:20000`
- Set: `JITO_ENABLE=false` (stock agave has no Jito flags)
- Strategy: changed to `Recreate` (hostNetwork port conflicts prevent RollingUpdate)
The validator runs with `--public-tvu-address 64.92.84.81:20000`, causing it to
advertise the Ashburn switch IP as its TVU address in gossip. Turbine tree peers
send shreds to Ashburn instead of directly to Miami.
**3. shred-unwrap.py (foreground process, PID 2497694)**
```bash
python3 /tmp/shred-unwrap.py 9100 127.0.0.1 9000
```
Listens on UDP port 9100, strips IP+UDP headers from mirrored packets (variable-length
IP header via IHL field + 8-byte UDP header), forwards clean shred payloads to
localhost:9000 (the validator's TVU port). Running as user `rix`.
Script location: `/tmp/shred-unwrap.py`
**4. agave-stack repo changes (uncommitted)**
- `stack-orchestrator/container-build/laconicnetwork-agave/start-rpc.sh`:
Added `PUBLIC_TVU_ADDRESS` to header docs and
`[ -n "${PUBLIC_TVU_ADDRESS:-}" ] && ARGS+=(--public-tvu-address "$PUBLIC_TVU_ADDRESS")`
- `stack-orchestrator/compose/docker-compose-agave-rpc.yml`:
Added `PUBLIC_TVU_ADDRESS: ${PUBLIC_TVU_ADDRESS:-}` to environment section
---
## What's NOT Production-Ready
### Ephemeral processes
- socat on laconic-was-sw01: foreground process in a terminal session
- shred-unwrap.py on biscayne: foreground process, running from /tmp
- Both die if the terminal disconnects or the host reboots
- Need systemd units for both
### Ephemeral switch config
- Monitor session, ACL, and static routes on was-sw01 are in running-config only
- Not saved to startup-config (`write memory` was run but the route didn't persist)
- Linux kernel route (`ip route add`) is completely ephemeral
- All lost on switch reboot
### No monitoring
- No alerting on relay health (socat crash, shred-unwrap crash, packet loss)
- No metrics on relay throughput vs direct turbine throughput
- No comparison of before/after slot gap trends
### Validator still catching up
- ~50k slots behind as of initial relay activation
- Catching up at ~0.82 slots/sec (~2,950 slots/hour)
- ~17 hours to catch up from current position, or reset with fresh snapshot (~15-30 min)
---
## Key Details
| Item | Value |
|------|-------|
| Biscayne validator identity | `4WeLUxfQghbhsLEuwaAzjZiHg2VBw87vqHc4iZrGvKPr` |
| Biscayne IP | `186.233.184.235` |
| laconic-was-sw01 public IP | `64.92.84.81` (Et1/1) |
| laconic-was-sw01 backbone IP | `172.16.1.188` (Et4/1) |
| laconic-was-sw01 SSH | `install@137.239.200.198` |
| laconic-mia-sw01 backbone IP | `172.16.1.189` (Et4/1) |
| laconic-mia-sw01 SSH | `install@209.42.167.130` |
| Biscayne SSH | `rix@biscayne.vaasl.io` (via ProxyJump abernathy) |
| Backbone RTT (WAS-MIA) | 25.4ms (Et4/1 ↔ Et4/1, 0.01ms jitter) |
| Relay one-way latency | ~12.8ms |
| Agave image | `laconicnetwork/agave:tvu-relay` (v3.1.9 + commit 9f4b3ae) |
| EOS version | 4.34.0F |

View File

@ -1,193 +0,0 @@
---
# Redeploy agave-stack on biscayne with aria2c snapshot pre-download
#
# Usage:
#   # Standard redeploy (download snapshot, preserve accounts + ledger)
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml
#
#   # Full wipe (accounts + ledger) — slow rebuild
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml \
#     -e wipe_accounts=true -e wipe_ledger=true
#
#   # Skip snapshot download (use existing)
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml \
#     -e skip_snapshot=true
#
#   # Pass extra args to snapshot-download.py
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml \
#     -e 'snapshot_args=--version 2.2 --min-download-speed 50'
#
#   # Snapshot only (no redeploy)
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml --tags snapshot
#
- name: Redeploy agave validator on biscayne
  hosts: all
  gather_facts: false

  vars:
    # Paths and identifiers on the target host
    deployment_dir: /srv/deployments/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    snapshot_dir: /srv/solana/snapshots
    ledger_dir: /srv/solana/ledger
    accounts_dir: /srv/solana/ramdisk/accounts
    ramdisk_mount: /srv/solana/ramdisk
    ramdisk_device: /dev/ram0
    snapshot_script_local: "{{ playbook_dir }}/../scripts/snapshot-download.py"
    snapshot_script: /tmp/snapshot-download.py
    # Flags — non-destructive by default
    wipe_accounts: false
    wipe_ledger: false
    skip_snapshot: false
    snapshot_args: ""

  tasks:
    # --- Snapshot download (runs while validator is still up) ---
    - name: Verify aria2c installed
      command: which aria2c
      changed_when: false
      when: not skip_snapshot | bool
      tags: [snapshot]

    - name: Copy snapshot script to remote
      copy:
        src: "{{ snapshot_script_local }}"
        dest: "{{ snapshot_script }}"
        mode: "0755"
      when: not skip_snapshot | bool
      tags: [snapshot]

    - name: Download snapshot via aria2c
      command: >
        python3 {{ snapshot_script }}
        -o {{ snapshot_dir }}
        {{ snapshot_args }}
      become: true
      register: snapshot_result
      when: not skip_snapshot | bool
      # Hard cap on the download; task fails if it takes longer.
      timeout: 3600
      tags: [snapshot]

    - name: Show snapshot download result
      debug:
        msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
      tags: [snapshot]

    # --- Teardown (namespace only, preserve kind cluster) ---
    - name: Delete deployment namespace
      command: >
        kubectl delete namespace {{ k8s_namespace }} --timeout=120s
      register: ns_delete
      # Non-fatal: namespace may already be gone.
      failed_when: false
      tags: [teardown]

    - name: Wait for namespace to terminate
      command: >
        kubectl get namespace {{ k8s_namespace }}
        -o jsonpath='{.status.phase}'
      register: ns_status
      retries: 30
      delay: 5
      # Succeeds once `kubectl get` fails, i.e. the namespace no longer exists.
      until: ns_status.rc != 0
      failed_when: false
      when: ns_delete.rc == 0
      tags: [teardown]

    # --- Data wipe (opt-in) ---
    - name: Wipe ledger data
      shell: rm -rf {{ ledger_dir }}/*
      become: true
      when: wipe_ledger | bool
      tags: [wipe]

    - name: Wipe accounts ramdisk (umount + mkfs + mount)
      shell: |
        mountpoint -q {{ ramdisk_mount }} && umount {{ ramdisk_mount }} || true
        mkfs.ext4 -q {{ ramdisk_device }}
        mount {{ ramdisk_device }} {{ ramdisk_mount }}
        mkdir -p {{ accounts_dir }}
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      when: wipe_accounts | bool
      tags: [wipe]

    - name: Clean old snapshots (keep newest full + incremental)
      shell: |
        cd {{ snapshot_dir }} || exit 0
        newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
        if [ -n "$newest" ]; then
          newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
          find . -maxdepth 1 -name '*.tar.*' \
            ! -name "$newest" \
            ! -name "${newest_inc:-__none__}" \
            -delete
        fi
      become: true
      when: not skip_snapshot | bool
      tags: [wipe]

    # --- Deploy ---
    - name: Verify kind-config.yml has unified mount root
      command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
      register: mount_root_check
      failed_when: mount_root_check.stdout | int < 1
      tags: [deploy]

    - name: Start deployment
      command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
      timeout: 600
      tags: [deploy]

    - name: Wait for pod to be running
      command: >
        kubectl get pods -n {{ k8s_namespace }}
        -o jsonpath='{.items[0].status.phase}'
      register: pod_status
      retries: 60
      delay: 10
      until: pod_status.stdout == "Running"
      tags: [deploy]

    # --- Verify ---
    - name: Verify unified mount inside kind node
      command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/"
      register: mount_check
      tags: [verify]

    - name: Show mount contents
      debug:
        msg: "{{ mount_check.stdout_lines }}"
      tags: [verify]

    - name: Check validator log file is being written
      command: >
        kubectl exec -n {{ k8s_namespace }}
        deployment/{{ kind_cluster }}-deployment
        -c agave-validator -- test -f /data/log/validator.log
      retries: 12
      delay: 10
      # `until` references the register declared below — key order within
      # a task mapping is not significant to Ansible.
      until: log_file_check.rc == 0
      register: log_file_check
      failed_when: false
      tags: [verify]

    - name: Check RPC health
      uri:
        url: http://127.0.0.1:8899/health
        return_content: true
      register: rpc_health
      retries: 6
      delay: 10
      until: rpc_health.status == 200
      failed_when: false
      # Runs on the target host itself — the RPC is bound locally there.
      delegate_to: "{{ inventory_hostname }}"
      tags: [verify]

    - name: Report status
      debug:
        msg: >-
          Deployment complete.
          Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
          RPC: {{ rpc_health.content | default('not responding') }}.
          Wiped: ledger={{ wipe_ledger }}, accounts={{ wipe_accounts }}.
      tags: [verify]

View File

@ -1,50 +0,0 @@
# DoubleZero DaemonSet - applied separately from laconic-so deployment
# laconic-so does not support hostNetwork in generated k8s resources,
# so this manifest is applied via kubectl after 'deployment start'.
#
# DoubleZero creates GRE tunnels (IP protocol 47) and runs BGP (tcp/179)
# on link-local 169.254.0.0/16. This requires host network access.
# The GRE routes injected into the node routing table are automatically
# visible to all pods using hostNetwork.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: doublezero
  labels:
    app: doublezero
spec:
  selector:
    matchLabels:
      app: doublezero
  template:
    metadata:
      labels:
        app: doublezero
    spec:
      # Host network namespace: GRE uses IP protocol 47 and cannot be
      # port-mapped; routes must land in the node routing table.
      hostNetwork: true
      containers:
        - name: doublezerod
          image: laconicnetwork/doublezero:local
          securityContext:
            # privileged + NET_ADMIN: tunnel creation and route-table edits
            privileged: true
            capabilities:
              add:
                - NET_ADMIN
          env:
            - name: VALIDATOR_IDENTITY_PATH
              value: /data/config/validator-identity.json
            - name: DOUBLEZERO_RPC_ENDPOINT
              value: http://127.0.0.1:8899
          volumeMounts:
            # Validator identity keypair, read-only (shared with validator pod)
            - name: validator-config
              mountPath: /data/config
              readOnly: true
            - name: doublezero-config
              mountPath: /root/.config/doublezero
      volumes:
        - name: validator-config
          persistentVolumeClaim:
            claimName: validator-config
        - name: doublezero-config
          persistentVolumeClaim:
            claimName: doublezero-config

View File

@ -1,112 +0,0 @@
# Biscayne Solana Validator deployment spec
# Host: biscayne.vaasl.io (186.233.184.235)
# Identity: 4WeLUxfQghbhsLEuwaAzjZiHg2VBw87vqHc4iZrGvKPr
stack: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
deploy-to: k8s-kind
# Single host directory mounted into the kind node
kind-mount-root: /srv/kind
network:
  http-proxy:
    - host-name: biscayne.vaasl.io
      routes:
        # NOTE(review): both routes use path '/' — whether the second
        # (8900, websocket) is reachable depends on laconic-so's route
        # matching semantics; confirm the intended RPC vs websocket paths.
        - path: /
          proxy-to: agave-validator:8899
        - path: /
          proxy-to: agave-validator:8900
          websocket: true
  ports:
    agave-validator:
      - '8899'
      - '8900'
      - '8001'
      - 8001/udp
      # UDP range for TPU/TVU/repair (matches DYNAMIC_PORT_RANGE below)
      - 9000/udp
      - 9001/udp
      - 9002/udp
      - 9003/udp
      - 9004/udp
      - 9005/udp
      - 9006/udp
      - 9007/udp
      - 9008/udp
      - 9009/udp
      - 9010/udp
      - 9011/udp
      - 9012/udp
      - 9013/udp
      - 9014/udp
      - 9015/udp
      - 9016/udp
      - 9017/udp
      - 9018/udp
      - 9019/udp
      - 9020/udp
      - 9021/udp
      - 9022/udp
      - 9023/udp
      - 9024/udp
      - 9025/udp
resources:
  containers:
    reservations:
      cpus: '4.0'
      memory: 256000M
    limits:
      cpus: '32.0'
      memory: 921600M
security:
  privileged: true
  unlimited-memlock: true
  capabilities:
    - IPC_LOCK
volumes:
  # Config volumes — on ZFS dataset (backed up via snapshots)
  validator-config: /srv/deployments/agave/data/validator-config
  # NOTE(review): same host path as validator-config — presumably a
  # deliberate shared mount for the identity keypair; confirm.
  doublezero-validator-identity: /srv/deployments/agave/data/validator-config
  doublezero-config: /srv/deployments/agave/data/doublezero-config
  # Heavy data volumes — on zvol/ramdisk (not backed up, rebuildable)
  validator-ledger: /srv/kind/solana/ledger
  validator-accounts: /srv/kind/solana/ramdisk/accounts
  validator-snapshots: /srv/kind/solana/snapshots
  validator-log: /srv/kind/solana/log
  # Monitoring
  monitoring-influxdb-data: /srv/kind/solana/monitoring/influxdb
  monitoring-grafana-data: /srv/kind/solana/monitoring/grafana
configmaps:
  monitoring-telegraf-config: config/monitoring/telegraf-config
  monitoring-telegraf-scripts: config/monitoring/scripts
  monitoring-grafana-datasources: config/monitoring/grafana-datasources
  monitoring-grafana-dashboards: config/monitoring/grafana-dashboards
config:
  # Mode: 'rpc' (non-voting) — matches current biscayne systemd config
  AGAVE_MODE: rpc
  # Mainnet entrypoints
  VALIDATOR_ENTRYPOINT: entrypoint.mainnet-beta.solana.com:8001
  EXTRA_ENTRYPOINTS: entrypoint2.mainnet-beta.solana.com:8001 entrypoint3.mainnet-beta.solana.com:8001 entrypoint4.mainnet-beta.solana.com:8001 entrypoint5.mainnet-beta.solana.com:8001
  # Known validators (Solana Foundation, Everstake, Chorus One)
  KNOWN_VALIDATOR: 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2
  EXTRA_KNOWN_VALIDATORS: GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ dDzy5SR3AXdYWVqbDEkVFdvSPCtS9ihF5kJkHCtXoFs DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S C1ocKDYMCm2ooWptMMnpd5VEB2Nx4UMJgRuYofysyzcA GwHH8ciFhR8vejWCqmg8FWZUCNtubPY2esALvy5tBvji 6WgdYhhGE53WrZ7ywJA15hBVkw7CRbQ8yDBBTwmBtAHN
  # Network
  RPC_PORT: '8899'
  RPC_BIND_ADDRESS: 0.0.0.0
  GOSSIP_PORT: '8001'
  # NOTE(review): differs from the host IP in the header (186.233.184.235)
  # — presumably the externally advertised gossip address; confirm.
  GOSSIP_HOST: 137.239.194.65
  DYNAMIC_PORT_RANGE: 9000-10000
  # Cluster verification
  EXPECTED_GENESIS_HASH: 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d
  EXPECTED_SHRED_VERSION: '50093'
  # Storage
  LIMIT_LEDGER_SIZE: '50000000'
  MAXIMUM_SNAPSHOTS_TO_RETAIN: '1'
  NO_INCREMENTAL_SNAPSHOTS: 'false'
  RUST_LOG: info,solana_metrics=warn
  SOLANA_METRICS_CONFIG: host=http://localhost:8086,db=agave_metrics,u=admin,p=admin
  # Jito MEV (NY region shred receiver) — disabled until voting enabled
  JITO_ENABLE: 'false'
  JITO_BLOCK_ENGINE_URL: https://mainnet.block-engine.jito.wtf
  JITO_SHRED_RECEIVER_ADDR: 141.98.216.96:1002
  JITO_TIP_PAYMENT_PROGRAM: T1pyyaTNZsKv2WcRAB8oVnk93mLJw2XzjtVYqCsaHqt
  JITO_DISTRIBUTION_PROGRAM: 4R3gSG8BpU4t19KYj8CfnbtRpnT8gtk4dvTHxVRwc2r7
  JITO_MERKLE_ROOT_AUTHORITY: 8F4jGUmxF36vQ6yabnsxX6AQVXdKBhs8kGSUuRKSg8Xt
  JITO_COMMISSION_BPS: '800'
  # DoubleZero
  DOUBLEZERO_RPC_ENDPOINT: http://127.0.0.1:8899

View File

@ -1,234 +0,0 @@
#!/bin/bash
# Backlog: stop the agave deployment, snapshot the deployments dataset,
# restart, then replicate the snapshot to each remote and prune old ones.
set -Eeuo pipefail
export PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
export XDG_RUNTIME_DIR="/run/user/$(id -u)"
mkdir -p "$XDG_RUNTIME_DIR"

# optional suffix from command-line, prepend dash if non-empty
SUFFIX="${1:-}"
SUFFIX="${SUFFIX:+-$SUFFIX}"

# define variables
DATASET="biscayne/DATA/deployments"
DEPLOYMENT_DIR="/srv/deployments/agave"
LOG_FILE="$HOME/.backlog_history"
ZFS_HOLD="backlog:pending"      # hold tag protecting unsent snapshots
SERVICE_STOP_TIMEOUT="300"      # seconds to wait for containers to stop
SNAPSHOT_RETENTION="6"          # destroyable snapshots to keep on prune
SNAPSHOT_PREFIX="backlog"
SNAPSHOT_TAG="$(date +%Y%m%d)${SUFFIX}"
SNAPSHOT="${DATASET}@${SNAPSHOT_PREFIX}-${SNAPSHOT_TAG}"

# remote replication targets ("host:dataset")
REMOTES=(
  "mysterio:edith/DATA/backlog/biscayne-main"
  "ardham:batterywharf/DATA/backlog/biscayne-main"
)
# log functions
# Append one UTC ISO-8601 timestamped line to the backlog history log.
log() {
  local stamp
  stamp="$(date -u +'%Y-%m-%dT%H:%M:%SZ')"
  echo "[$stamp] $1" >> "$LOG_FILE"
}
# EXIT-trap handler: log total runtime (seconds since $start_time) and
# terminate the entry with a blank line in $LOG_FILE.
log_close() {
  local now elapsed
  now="$(date +%s)"
  elapsed=$(( now - start_time ))
  log "Backlog completed in ${elapsed}s"
  echo "" >> "$LOG_FILE"
}
# service controls
# services ACTION — stop|start the agave deployment via laconic-so.
# 'stop' additionally polls docker until no containers labelled with this
# deployment's working dir remain, up to SERVICE_STOP_TIMEOUT seconds.
services() {
  local action="$1"
  case "$action" in
    stop)
      log "Stopping agave deployment..."
      laconic-so deployment --dir "$DEPLOYMENT_DIR" stop
      log "Waiting for services to fully stop..."
      local deadline=$(( $(date +%s) + SERVICE_STOP_TIMEOUT ))
      while true; do
        local running
        # Count containers still belonging to this compose deployment.
        running=$(docker ps --filter "label=com.docker.compose.project.working_dir=$DEPLOYMENT_DIR" -q 2>/dev/null | wc -l)
        if [[ "$running" -eq 0 ]]; then
          break
        fi
        if (( $(date +%s) >= deadline )); then
          # Best effort: proceed anyway after the timeout, but record it.
          log "WARNING: Timeout waiting for services to stop; continuing."
          break
        fi
        sleep 0.2
      done
      ;;
    start)
      log "Starting agave deployment..."
      laconic-so deployment --dir "$DEPLOYMENT_DIR" start
      ;;
    *)
      log "ERROR: Unknown action '$action' in services()"
      exit 2
      ;;
  esac
}
# send a snapshot to one remote
# args: snap remote_host remote_dataset
# Chooses a full vs incremental send based on the newest snapshot common
# to both sides. Returns non-zero on send failure or a stale destination
# (remote has snapshots but shares no base with local).
snapshot_send_one() {
  local snap="$1" remote_host="$2" remote_dataset="$3"
  log "Checking remote snapshots on $remote_host..."
  local -a local_snaps remote_snaps
  # Both lists sorted oldest-first (-s creation); only backlog-prefixed
  # snapshot names are considered. -d1 limits to the dataset's own snapshots.
  mapfile -t local_snaps < <(zfs list -H -t snapshot -o name -s creation -d1 "$DATASET" | grep -F "${DATASET}@${SNAPSHOT_PREFIX}-")
  mapfile -t remote_snaps < <(ssh "$remote_host" zfs list -H -t snapshot -o name -s creation "$remote_dataset" | grep -F "${remote_dataset}@${SNAPSHOT_PREFIX}-" || true)
  # find latest common snapshot
  # (outer loop runs oldest→newest, so the last match wins)
  local base=""
  local local_snap remote_snap remote_check
  for local_snap in "${local_snaps[@]}"; do
    remote_snap="${local_snap/$DATASET/$remote_dataset}"
    for remote_check in "${remote_snaps[@]}"; do
      if [[ "$remote_check" == "$remote_snap" ]]; then
        base="$local_snap"
        break
      fi
    done
  done
  if [[ -z "$base" && ${#remote_snaps[@]} -eq 0 ]]; then
    log "No remote snapshots found on $remote_host — sending full snapshot."
    # receive -s: resumable; -F: force rollback of the destination
    if zfs send "$snap" | ssh "$remote_host" zfs receive -sF "$remote_dataset"; then
      log "Full send to $remote_host succeeded."
      return 0
    else
      log "ERROR: Full send to $remote_host failed."
      return 1
    fi
  elif [[ -n "$base" ]]; then
    log "Common base snapshot $base found — sending incremental to $remote_host."
    if zfs send -i "$base" "$snap" | ssh "$remote_host" zfs receive -sF "$remote_dataset"; then
      log "Incremental send to $remote_host succeeded."
      return 0
    else
      log "ERROR: Incremental send to $remote_host failed."
      return 1
    fi
  else
    log "STALE DESTINATION: $remote_host has snapshots but no common base with local — skipping."
    return 1
  fi
}
# send snapshot to all remotes
# Attempts every remote even if one fails (errexit suspended around the
# loop); returns non-zero if any destination failed or was out of sync.
snapshot_send() {
  local snap="$1"
  local failure_count=0
  set +e
  local entry remote_host remote_dataset
  for entry in "${REMOTES[@]}"; do
    # REMOTES entries are "host:dataset"
    remote_host="${entry%%:*}"
    remote_dataset="${entry#*:}"
    if ! snapshot_send_one "$snap" "$remote_host" "$remote_dataset"; then
      failure_count=$((failure_count + 1))
    fi
  done
  set -e
  if [[ "$failure_count" -gt 0 ]]; then
    log "WARNING: $failure_count destination(s) failed or are out of sync."
    return 1
  fi
  return 0
}
# snapshot management
# snapshot ACTION — create|send|prune the day's backlog snapshot.
snapshot() {
  local action="$1"
  case "$action" in
    create)
      log "Creating snapshot: $SNAPSHOT"
      zfs snapshot "$SNAPSHOT"
      # Hold prevents pruning until a successful send releases it.
      zfs hold "$ZFS_HOLD" "$SNAPSHOT" || log "ERROR: Failed to hold $SNAPSHOT"
      ;;
    send)
      log "Sending snapshot $SNAPSHOT..."
      if snapshot_send "$SNAPSHOT"; then
        log "Snapshot send completed. Releasing hold."
        zfs release "$ZFS_HOLD" "$SNAPSHOT" || log "ERROR: Failed to release hold on $SNAPSHOT"
      else
        # Keep the hold so an unsent snapshot survives pruning.
        log "WARNING: Snapshot send encountered errors. Hold retained on $SNAPSHOT."
      fi
      ;;
    prune)
      if [[ "$SNAPSHOT_RETENTION" -gt 0 ]]; then
        log "Pruning old snapshots in $DATASET (retaining $SNAPSHOT_RETENTION destroyable snapshots)..."
        local -a all_snaps destroyable
        mapfile -t all_snaps < <(zfs list -H -t snapshot -o name -s creation -d1 "$DATASET" | grep -F "${DATASET}@${SNAPSHOT_PREFIX}-")
        destroyable=()
        for snap in "${all_snaps[@]}"; do
          # Dry-run destroy (-n): held snapshots fail here and are skipped.
          if zfs destroy -n -- "$snap" &>/dev/null; then
            destroyable+=("$snap")
          else
            log "Skipping $snap — snapshot not destroyable (likely held)"
          fi
        done
        local count to_destroy
        count="${#destroyable[@]}"
        to_destroy=$((count - SNAPSHOT_RETENTION))
        if [[ "$to_destroy" -le 0 ]]; then
          log "Nothing to prune — only $count destroyable snapshots exist"
        else
          # destroyable[] is oldest-first, so destroy from the front.
          local i
          for (( i=0; i<to_destroy; i++ )); do
            snap="${destroyable[$i]}"
            log "Destroying snapshot: $snap"
            if ! zfs destroy -- "$snap"; then
              log "WARNING: Failed to destroy $snap despite earlier check"
            fi
          done
        fi
      else
        log "Skipping pruning — retention is set to $SNAPSHOT_RETENTION"
      fi
      ;;
    *)
      log "ERROR: Snapshot unknown action: $action"
      exit 2
      ;;
  esac
}
# open logging and begin execution
mkdir -p "$(dirname -- "$LOG_FILE")"
start_time=$(date +%s)
# All further stdout/stderr goes to the history log.
exec >> "$LOG_FILE" 2>&1
trap 'log_close' EXIT
trap 'rc=$?; log "ERROR: command failed at line $LINENO (exit $rc)"; exit $rc' ERR
log "Backlog Started"

# Refuse to run twice for the same day/suffix tag.
if zfs list -H -t snapshot -o name -d1 "$DATASET" | grep -qxF "$SNAPSHOT"; then
  log "WARNING: Snapshot $SNAPSHOT already exists. Exiting."
  exit 1
fi

services stop
snapshot create   # snapshot taken while services are stopped — consistent state
services start
snapshot send
snapshot prune
# end

View File

@ -1,280 +0,0 @@
#!/usr/bin/env python3
"""Biscayne agave validator status check.
Collects and displays key health metrics:
- Slot position (local vs mainnet, gap, replay rate)
- Pod status (running, restarts, age)
- Memory usage (cgroup current vs limit, % used)
- OOM kills (recent dmesg entries)
- Shred relay (packets/sec on port 9100, shred-unwrap.py alive)
- Validator process state (from logs)
"""
import json
import subprocess
import sys
import time
# Kubernetes namespace / deployment / kind node hosting the validator pod.
NAMESPACE = "laconic-laconic-70ce4c4b47e23b85"
DEPLOYMENT = "laconic-70ce4c4b47e23b85-deployment"
KIND_NODE = "laconic-70ce4c4b47e23b85-control-plane"
# user@host target used by ssh() for all remote checks.
SSH = "rix@biscayne.vaasl.io"
MAINNET_RPC = "https://api.mainnet-beta.solana.com"
# Validator RPC as seen from the remote host (queried over ssh by default).
LOCAL_RPC = "http://127.0.0.1:8899"
def ssh(cmd: str, timeout: int = 10) -> str:
    """Run a command on the remote host via ssh.

    Returns stripped stdout concatenated with stripped stderr, or the
    sentinel "<timeout>" when the command exceeds `timeout` seconds.
    """
    argv = ["ssh", SSH, cmd]
    try:
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return "<timeout>"
    return proc.stdout.strip() + proc.stderr.strip()
def local(cmd: str, timeout: int = 10) -> str:
    """Run a shell command locally; return stripped stdout ("<timeout>" on timeout)."""
    try:
        proc = subprocess.run(
            cmd, shell=True, capture_output=True, text=True, timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return "<timeout>"
    return proc.stdout.strip()
def rpc_call(method: str, url: str = LOCAL_RPC, remote: bool = True, params: list | None = None) -> dict | None:
    """Issue a JSON-RPC call via curl; over ssh when remote=True, locally otherwise.

    Returns the parsed JSON response dict, or None when the output is not JSON.
    """
    body = {"jsonrpc": "2.0", "id": 1, "method": method, "params": params or []}
    payload = json.dumps(body)
    cmd = f"curl -s {url} -X POST -H 'Content-Type: application/json' -d '{payload}'"
    if remote:
        raw = ssh(cmd)
    else:
        raw = local(cmd)
    try:
        return json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None
def get_slots() -> tuple[int | None, int | None]:
    """Return (local_slot, mainnet_slot); either is None when its RPC call fails."""
    resp_local = rpc_call("getSlot")
    resp_mainnet = rpc_call("getSlot", MAINNET_RPC, remote=False)
    slot_local = resp_local.get("result") if resp_local else None
    slot_mainnet = resp_mainnet.get("result") if resp_mainnet else None
    return slot_local, slot_mainnet
def get_health() -> str:
    """Summarize the local validator's getHealth response as a short string."""
    resp = rpc_call("getHealth")
    if not resp:
        return "unreachable"
    if resp.get("result") == "ok":
        return "healthy"
    # Unhealthy: prefer the slots-behind detail when the node reports it.
    err = resp.get("error", {})
    behind = err.get("data", {}).get("numSlotsBehind")
    if behind is not None:
        return f"behind {behind:,} slots"
    return err.get("message", "unknown")
def get_pod_status() -> str:
    """Summarize the first pod in NAMESPACE: ready/total, phase, restarts, age.

    NOTE(review): only items[0] is reported — assumes a single-pod
    deployment; confirm more pods cannot appear in this namespace.
    """
    cmd = f"kubectl -n {NAMESPACE} get pods -o json"
    raw = ssh(cmd, timeout=15)
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return "unknown"
    items = data.get("items", [])
    if not items:
        return "no pods"
    pod = items[0]
    # Shorten the pod name to its trailing hash segment for display.
    name = pod["metadata"]["name"].split("-")[-1]
    phase = pod["status"].get("phase", "?")
    containers = pod["status"].get("containerStatuses", [])
    restarts = sum(c.get("restartCount", 0) for c in containers)
    ready = sum(1 for c in containers if c.get("ready"))
    total = len(containers)
    age = pod["metadata"].get("creationTimestamp", "?")
    return f"{ready}/{total} {phase} restarts={restarts} pod=..{name} created={age}"
def get_memory() -> str:
    """Report pod cgroup memory as "current / limit (pct)" in whole GB.

    Runs a pipeline inside the kind node container: finds the first
    memory.current under a *burstable* cgroup and pairs it with the
    sibling memory.max. Returns the raw output (or "unknown") when the
    two numbers cannot be parsed.
    """
    cmd = (
        f"docker exec {KIND_NODE} bash -c '"
        "find /sys/fs/cgroup -name memory.current -path \"*burstable*\" 2>/dev/null | head -1 | "
        "while read f; do "
        "  dir=$(dirname $f); "
        "  cur=$(cat $f); "
        "  max=$(cat $dir/memory.max 2>/dev/null || echo unknown); "
        "  echo $cur $max; "
        "done'"
    )
    raw = ssh(cmd, timeout=10)
    try:
        parts = raw.split()
        current = int(parts[0])
        limit_str = parts[1]
        cur_gb = current / (1024**3)
        # A limit of the literal string "max" means no limit is configured.
        if limit_str == "max":
            return f"{cur_gb:.0f}GB / unlimited"
        limit = int(limit_str)
        lim_gb = limit / (1024**3)
        pct = (current / limit) * 100
        return f"{cur_gb:.0f}GB / {lim_gb:.0f}GB ({pct:.0f}%)"
    except (IndexError, ValueError):
        return raw or "unknown"
def get_oom_kills() -> str:
    """Report the dmesg OOM-kill count, with the UTC time of the latest kill.

    Returns "none" when no kills are recorded and "check failed" when the
    remote output cannot be parsed.
    """
    # BUG FIX: `grep -c` prints "0" AND exits non-zero when nothing matches,
    # so `|| echo 0` produced two lines ("0\n0") and int(raw) raised — the
    # healthy no-OOM case was reported as "check failed". Parse only the
    # first whitespace-separated token to be robust to the duplicate.
    raw = ssh("sudo dmesg | grep -c 'oom-kill' || echo 0")
    try:
        count = int(raw.split()[0])
    except (IndexError, ValueError):
        return "check failed"
    if count == 0:
        return "none"
    # Get kernel uptime-relative timestamp and convert to UTC
    # dmesg timestamps are seconds since boot; combine with boot time
    raw = ssh(
        "BOOT=$(date -d \"$(uptime -s)\" +%s); "
        "KERN_TS=$(sudo dmesg | grep 'oom-kill' | tail -1 | "
        "  sed 's/\\[\\s*\\([0-9.]*\\)\\].*/\\1/'); "
        "echo $BOOT $KERN_TS"
    )
    try:
        parts = raw.split()
        boot_epoch = int(parts[0])
        kern_secs = float(parts[1])
        oom_epoch = boot_epoch + int(kern_secs)
        from datetime import datetime, timezone
        oom_utc = datetime.fromtimestamp(oom_epoch, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        return f"{count} total (last: {oom_utc})"
    except (IndexError, ValueError):
        return f"{count} total (timestamp parse failed)"
def get_relay_rate() -> str:
    """Measure host-wide inbound UDP datagram rate over a 3-second window.

    Samples column 2 of the Udp data row in /proc/net/snmp twice, 3s apart
    (presumably the InDatagrams counter — confirm against the kernel docs).
    Counts ALL ports, as the returned string states.
    """
    # Two samples 3s apart from /proc/net/snmp
    cmd = (
        "T0=$(cat /proc/net/snmp | grep '^Udp:' | tail -1 | awk '{print $2}'); "
        "sleep 3; "
        "T1=$(cat /proc/net/snmp | grep '^Udp:' | tail -1 | awk '{print $2}'); "
        "echo $T0 $T1"
    )
    raw = ssh(cmd, timeout=15)
    try:
        parts = raw.split()
        t0, t1 = int(parts[0]), int(parts[1])
        rate = (t1 - t0) / 3
        return f"{rate:,.0f} UDP dgrams/sec (all ports)"
    except (IndexError, ValueError):
        return raw or "unknown"
def get_shreds_per_sec() -> str:
    """Count UDP packets on TVU port 9000 over 3 seconds using tcpdump."""
    # NOTE(review): the module docstring mentions port 9100 for the shred
    # relay while this probes 9000 — confirm which port is correct.
    cmd = "sudo timeout 3 tcpdump -i any udp dst port 9000 -q 2>&1 | grep -oP '\\d+(?= packets captured)'"
    raw = ssh(cmd, timeout=15)
    try:
        count = int(raw.strip())
        rate = count / 3
        return f"{rate:,.0f} shreds/sec ({count:,} in 3s)"
    except (ValueError, TypeError):
        return raw or "unknown"
def get_unwrap_status() -> str:
    """Report pid/uptime/RSS of the shred-unwrap process, or "NOT RUNNING"."""
    raw = ssh("ps -p $(pgrep -f shred-unwrap | head -1) -o pid,etime,rss --no-headers 2>/dev/null || echo dead")
    if not raw.strip() or "dead" in raw:
        return "NOT RUNNING"
    fields = raw.split()
    if len(fields) < 3:
        return raw
    pid, etime, rss_kb = fields[0], fields[1], fields[2]
    rss_mb = int(rss_kb) / 1024
    return f"pid={pid} uptime={etime} rss={rss_mb:.0f}MB"
def get_replay_rate() -> tuple[float | None, int | None, int | None]:
    """Sample processed slot twice over 10s to measure replay rate.

    Returns (slots_per_sec, first_slot, second_slot); elements are None
    when the corresponding RPC call fails.
    """
    params = [{"commitment": "processed"}]
    r0 = rpc_call("getSlot", params=params)
    s0 = r0.get("result") if r0 else None
    if s0 is None:
        return None, None, None
    t0 = time.monotonic()
    time.sleep(10)
    r1 = rpc_call("getSlot", params=params)
    s1 = r1.get("result") if r1 else None
    if s1 is None:
        return None, s0, None
    # Use the measured wall time (RPC round-trips included), not a fixed 10s.
    dt = time.monotonic() - t0
    rate = (s1 - s0) / dt if s1 != s0 else 0
    return rate, s0, s1
def main() -> None:
    """Print a consolidated status report: RPC, replay, pod, memory, relay."""
    print("=" * 60)
    print(" BISCAYNE VALIDATOR STATUS")
    print("=" * 60)
    # Health + slots
    print("\n--- RPC ---")
    health = get_health()
    local_slot, mainnet_slot = get_slots()
    print(f" Health: {health}")
    if local_slot is not None:
        print(f" Local slot: {local_slot:,}")
    else:
        print(" Local slot: unreachable")
    if mainnet_slot is not None:
        print(f" Mainnet slot: {mainnet_slot:,}")
    # NOTE(review): truthiness also skips a legitimate slot 0 — harmless here.
    if local_slot and mainnet_slot:
        gap = mainnet_slot - local_slot
        print(f" Gap: {gap:,} slots")
    # Replay rate (10s sample)
    print("\n--- Replay ---")
    print(" Sampling replay rate (10s)...", end="", flush=True)
    rate, s0, s1 = get_replay_rate()
    if rate is not None:
        print(f"\r Replay rate: {rate:.1f} slots/sec ({s0:,}{s1:,})")
        # assumes mainnet advances ~2.5 slots/sec — TODO confirm this constant
        net = rate - 2.5
        if net > 0:
            print(f" Net catchup: +{net:.1f} slots/sec (gaining)")
        elif net < 0:
            print(f" Net catchup: {net:.1f} slots/sec (falling behind)")
        else:
            print(" Net catchup: 0 (keeping pace)")
    else:
        print("\r Replay rate: could not measure")
    # Pod
    print("\n--- Pod ---")
    pod = get_pod_status()
    print(f" {pod}")
    # Memory
    print("\n--- Memory ---")
    mem = get_memory()
    print(f" Cgroup: {mem}")
    # OOM
    oom = get_oom_kills()
    print(f" OOM kills: {oom}")
    # Relay
    print("\n--- Shred Relay ---")
    unwrap = get_unwrap_status()
    print(f" shred-unwrap: {unwrap}")
    print(" Measuring shred rate (3s)...", end="", flush=True)
    shreds = get_shreds_per_sec()
    print(f"\r TVU shreds: {shreds} ")
    print(" Measuring UDP rate (3s)...", end="", flush=True)
    relay = get_relay_rate()
    print(f"\r UDP inbound: {relay} ")
    print("\n" + "=" * 60)
if __name__ == "__main__":
main()

View File

@ -1,546 +0,0 @@
#!/usr/bin/env python3
"""Download Solana snapshots using aria2c for parallel multi-connection downloads.
Discovers snapshot sources by querying getClusterNodes for all RPCs in the
cluster, probing each for available snapshots, benchmarking download speed,
and downloading from the fastest source using aria2c (16 connections by default).
Based on the discovery approach from etcusr/solana-snapshot-finder but replaces
the single-connection wget download with aria2c parallel chunked downloads.
Usage:
# Download to /srv/solana/snapshots (mainnet, 16 connections)
./snapshot-download.py -o /srv/solana/snapshots
# Dry run — find best source, print URL
./snapshot-download.py --dry-run
# Custom RPC for cluster node discovery + 32 connections
./snapshot-download.py -r https://api.mainnet-beta.solana.com -n 32
# Testnet
./snapshot-download.py -c testnet -o /data/snapshots
Requirements:
- aria2c (apt install aria2)
- python3 >= 3.10 (stdlib only, no pip dependencies)
"""
from __future__ import annotations
import argparse
import concurrent.futures
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from http.client import HTTPResponse
from pathlib import Path
from typing import NoReturn
from urllib.request import Request
# Module logger; configured in main() via logging.basicConfig.
log: logging.Logger = logging.getLogger("snapshot-download")
# Public entry RPC per cluster, used for discovery when -r/--rpc is not given.
CLUSTER_RPC: dict[str, str] = {
    "mainnet-beta": "https://api.mainnet-beta.solana.com",
    "testnet": "https://api.testnet.solana.com",
    "devnet": "https://api.devnet.solana.com",
}
# Snapshot filenames:
#   snapshot-<slot>-<hash>.tar.zst
#   incremental-snapshot-<base_slot>-<slot>-<hash>.tar.zst
FULL_SNAP_RE: re.Pattern[str] = re.compile(
    r"^snapshot-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
INCR_SNAP_RE: re.Pattern[str] = re.compile(
    r"^incremental-snapshot-(\d+)-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
@dataclass
class SnapshotSource:
    """A snapshot file available from a specific RPC node."""
    # host:port of the RPC node serving the snapshot
    rpc_address: str
    # Full redirect paths as returned by the server (e.g. /snapshot-123-hash.tar.zst)
    file_paths: list[str] = field(default_factory=list)
    # current_slot minus the full snapshot's slot (snapshot age, in slots)
    slots_diff: int = 0
    # HEAD-probe latency in milliseconds
    latency_ms: float = 0.0
    download_speed: float = 0.0  # bytes/sec
# -- JSON-RPC helpers ----------------------------------------------------------
class _NoRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Handler that captures redirect Location instead of following it.

    Returning None from redirect_request makes urllib surface 3xx responses
    as HTTPError, whose headers carry the Location value the caller wants.
    """
    def redirect_request(
        self,
        req: Request,
        fp: HTTPResponse,
        code: int,
        msg: str,
        headers: dict[str, str],  # type: ignore[override]
        newurl: str,
    ) -> None:
        # Never follow redirects — force urllib to raise instead.
        return None
def rpc_post(url: str, method: str, params: list[object] | None = None,
             timeout: int = 25) -> object | None:
    """JSON-RPC POST. Returns parsed 'result' field or None on error."""
    body = {"jsonrpc": "2.0", "id": 1, "method": method, "params": params or []}
    req = Request(url, data=json.dumps(body).encode(),
                  headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            parsed: dict[str, object] = json.loads(resp.read())
    except (urllib.error.URLError, json.JSONDecodeError, OSError, TimeoutError) as e:
        log.debug("rpc_post %s %s failed: %s", url, method, e)
        return None
    return parsed.get("result")
def head_no_follow(url: str, timeout: float = 3) -> tuple[str | None, float]:
    """HEAD request without following redirects.

    Returns (Location header value, latency_sec) if the server returned a
    3xx redirect. Returns (None, 0.0) on any error or non-redirect response.
    """
    opener: urllib.request.OpenerDirector = urllib.request.build_opener(_NoRedirectHandler)
    req = Request(url, method="HEAD")
    try:
        start: float = time.monotonic()
        resp: HTTPResponse = opener.open(req, timeout=timeout)  # type: ignore[assignment]
        latency: float = time.monotonic() - start
        # Non-redirect (2xx) — server didn't redirect, not useful for discovery
        location: str | None = resp.headers.get("Location")
        resp.close()
        return location, latency
    except urllib.error.HTTPError as e:
        # 3xx redirects raise HTTPError with the redirect info (the custom
        # handler returns None, so urllib raises instead of following).
        # `start` is always bound here: open() is the only call that can
        # raise HTTPError and it runs after the assignment above.
        latency = time.monotonic() - start  # type: ignore[possibly-undefined]
        location = e.headers.get("Location")
        if location and 300 <= e.code < 400:
            return location, latency
        return None, 0.0
    except (urllib.error.URLError, OSError, TimeoutError):
        return None, 0.0
# -- Discovery -----------------------------------------------------------------
def get_current_slot(rpc_url: str) -> int | None:
    """Get current slot from RPC; None when the call fails or is not an int."""
    slot: object | None = rpc_post(rpc_url, "getSlot")
    return slot if isinstance(slot, int) else None
def get_cluster_rpc_nodes(rpc_url: str, version_filter: str | None = None) -> list[str]:
    """Get all RPC node addresses from getClusterNodes (deduplicated, unordered)."""
    nodes: object | None = rpc_post(rpc_url, "getClusterNodes")
    if not isinstance(nodes, list):
        return []
    addrs: set[str] = set()
    for node in nodes:
        if not isinstance(node, dict):
            continue
        # Optional prefix filter on the reported node version.
        if version_filter is not None:
            ver: str | None = node.get("version")
            if ver and not ver.startswith(version_filter):
                continue
        addr: str | None = node.get("rpc")
        if addr:
            addrs.add(addr)
    return list(addrs)
def _parse_snapshot_filename(location: str) -> tuple[str, str | None]:
"""Extract filename and full redirect path from Location header.
Returns (filename, full_path). full_path includes any path prefix
the server returned (e.g. '/snapshots/snapshot-123-hash.tar.zst').
"""
# Location may be absolute URL or relative path
if location.startswith("http://") or location.startswith("https://"):
# Absolute URL — extract path
from urllib.parse import urlparse
path: str = urlparse(location).path
else:
path = location
filename: str = path.rsplit("/", 1)[-1]
return filename, path
def probe_rpc_snapshot(
    rpc_address: str,
    current_slot: int,
    max_age_slots: int,
    max_latency_ms: float,
) -> SnapshotSource | None:
    """Probe a single RPC node for available snapshots.

    Probes for full snapshot first (required), then incremental. Records all
    available files. Which files to actually download is decided at download
    time based on what already exists locally not here.

    Based on the discovery approach from etcusr/solana-snapshot-finder.

    Returns None when the node is unusable: no redirect, latency above
    max_latency_ms, unparsable filename, or snapshot too old.
    """
    full_url: str = f"http://{rpc_address}/snapshot.tar.bz2"
    # Full snapshot is required — every source must have one
    full_location, full_latency = head_no_follow(full_url, timeout=2)
    if not full_location:
        return None
    latency_ms: float = full_latency * 1000
    if latency_ms > max_latency_ms:
        return None
    full_filename, full_path = _parse_snapshot_filename(full_location)
    fm: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
    if not fm:
        return None
    full_snap_slot: int = int(fm.group(1))
    slots_diff: int = current_slot - full_snap_slot
    # Reject stale snapshots; the -100 floor tolerates a node slightly
    # ahead of the reference RPC's view of the current slot.
    if slots_diff > max_age_slots or slots_diff < -100:
        return None
    file_paths: list[str] = [full_path]
    # Also check for incremental snapshot
    inc_url: str = f"http://{rpc_address}/incremental-snapshot.tar.bz2"
    inc_location, _ = head_no_follow(inc_url, timeout=2)
    if inc_location:
        inc_filename, inc_path = _parse_snapshot_filename(inc_location)
        m: re.Match[str] | None = INCR_SNAP_RE.match(inc_filename)
        if m:
            inc_base_slot: int = int(m.group(1))
            # Incremental must be based on this source's full snapshot
            if inc_base_slot == full_snap_slot:
                file_paths.append(inc_path)
    return SnapshotSource(
        rpc_address=rpc_address,
        file_paths=file_paths,
        slots_diff=slots_diff,
        latency_ms=latency_ms,
    )
def discover_sources(
    rpc_url: str,
    current_slot: int,
    max_age_slots: int,
    max_latency_ms: float,
    threads: int,
    version_filter: str | None,
) -> list[SnapshotSource]:
    """Discover all snapshot sources from the cluster.

    Fans probe_rpc_snapshot out over a thread pool (the probes are
    network-bound HEAD requests), logging progress every 200 completions.
    """
    rpc_nodes: list[str] = get_cluster_rpc_nodes(rpc_url, version_filter)
    if not rpc_nodes:
        log.error("No RPC nodes found via getClusterNodes")
        return []
    log.info("Found %d RPC nodes, probing for snapshots...", len(rpc_nodes))
    sources: list[SnapshotSource] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as pool:
        # Map each future back to its node address for error reporting.
        futures: dict[concurrent.futures.Future[SnapshotSource | None], str] = {
            pool.submit(
                probe_rpc_snapshot, addr, current_slot,
                max_age_slots, max_latency_ms,
            ): addr
            for addr in rpc_nodes
        }
        done: int = 0
        for future in concurrent.futures.as_completed(futures):
            done += 1
            if done % 200 == 0:
                log.info(" probed %d/%d nodes, %d sources found",
                         done, len(rpc_nodes), len(sources))
            try:
                result: SnapshotSource | None = future.result()
            except (urllib.error.URLError, OSError, TimeoutError) as e:
                log.debug("Probe failed for %s: %s", futures[future], e)
                continue
            if result:
                sources.append(result)
    log.info("Found %d RPC nodes with suitable snapshots", len(sources))
    return sources
# -- Speed benchmark -----------------------------------------------------------
def measure_speed(rpc_address: str, measure_time: int = 7) -> float:
    """Measure download speed from an RPC node. Returns bytes/sec (0.0 on error)."""
    req = Request(f"http://{rpc_address}/snapshot.tar.bz2")
    try:
        with urllib.request.urlopen(req, timeout=measure_time + 5) as resp:
            started: float = time.monotonic()
            received: int = 0
            # Pull fixed-size chunks until the measurement window closes
            # or the stream ends.
            while time.monotonic() - started < measure_time:
                chunk: bytes = resp.read(81920)
                if not chunk:
                    break
                received += len(chunk)
            duration: float = time.monotonic() - started
    except (urllib.error.URLError, OSError, TimeoutError):
        return 0.0
    if duration <= 0:
        return 0.0
    return received / duration
# -- Download ------------------------------------------------------------------
def download_aria2c(
    urls: list[str],
    output_dir: str,
    filename: str,
    connections: int = 16,
) -> bool:
    """Download a file using aria2c with parallel connections.

    When multiple URLs are provided, aria2c treats them as mirrors of the
    same file and distributes chunks across all of them.

    Returns True only when aria2c exits 0 AND the expected output file
    exists in output_dir.
    """
    num_mirrors: int = len(urls)
    total_splits: int = max(connections, connections * num_mirrors)
    cmd: list[str] = [
        "aria2c",
        "--file-allocation=none",
        "--continue=true",
        f"--max-connection-per-server={connections}",
        f"--split={total_splits}",
        "--min-split-size=50M",
        # aria2c retries individual chunk connections on transient network
        # errors (TCP reset, timeout). This is transport-level retry analogous
        # to TCP retransmit, not application-level retry of a failed operation.
        "--max-tries=5",
        "--retry-wait=5",
        "--timeout=60",
        "--connect-timeout=10",
        "--summary-interval=10",
        "--console-log-level=notice",
        f"--dir={output_dir}",
        # BUG FIX: this was the literal string "--out=(unknown)" (a broken
        # template expansion), so aria2c wrote to a file named "(unknown)"
        # and the existence check below — and callers' skip-if-exists logic —
        # never matched. The output must be named after the requested file.
        f"--out={filename}",
        "--auto-file-renaming=false",
        "--allow-overwrite=true",
        *urls,
    ]
    log.info("Downloading %s", filename)
    log.info(" aria2c: %d connections × %d mirrors (%d splits)",
             connections, num_mirrors, total_splits)
    start: float = time.monotonic()
    result: subprocess.CompletedProcess[bytes] = subprocess.run(cmd)
    elapsed: float = time.monotonic() - start
    if result.returncode != 0:
        log.error("aria2c failed with exit code %d", result.returncode)
        return False
    # Belt-and-braces: verify the file actually landed where expected.
    filepath: Path = Path(output_dir) / filename
    if not filepath.exists():
        log.error("aria2c reported success but %s does not exist", filepath)
        return False
    size_bytes: int = filepath.stat().st_size
    size_gb: float = size_bytes / (1024 ** 3)
    avg_mb: float = size_bytes / elapsed / (1024 ** 2) if elapsed > 0 else 0
    log.info(" Done: %.1f GB in %.0fs (%.1f MiB/s avg)", size_gb, elapsed, avg_mb)
    return True
# -- Main ----------------------------------------------------------------------
def main() -> int:
    """CLI entry point.

    Returns a process exit code: 0 on success or dry-run, 1 on any failure
    (missing aria2c, no current slot, no sources, no fast source, download
    error).
    """
    p: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Download Solana snapshots with aria2c parallel downloads",
    )
    p.add_argument("-o", "--output", default="/srv/solana/snapshots",
                   help="Snapshot output directory (default: /srv/solana/snapshots)")
    p.add_argument("-c", "--cluster", default="mainnet-beta",
                   choices=list(CLUSTER_RPC),
                   help="Solana cluster (default: mainnet-beta)")
    p.add_argument("-r", "--rpc", default=None,
                   help="RPC URL for cluster discovery (default: public RPC)")
    p.add_argument("-n", "--connections", type=int, default=16,
                   help="aria2c connections per download (default: 16)")
    p.add_argument("-t", "--threads", type=int, default=500,
                   help="Threads for parallel RPC probing (default: 500)")
    p.add_argument("--max-snapshot-age", type=int, default=1300,
                   help="Max snapshot age in slots (default: 1300)")
    p.add_argument("--max-latency", type=float, default=100,
                   help="Max RPC probe latency in ms (default: 100)")
    p.add_argument("--min-download-speed", type=int, default=20,
                   help="Min download speed in MiB/s (default: 20)")
    p.add_argument("--measurement-time", type=int, default=7,
                   help="Speed measurement duration in seconds (default: 7)")
    p.add_argument("--max-speed-checks", type=int, default=15,
                   help="Max nodes to benchmark before giving up (default: 15)")
    p.add_argument("--version", default=None,
                   help="Filter nodes by version prefix (e.g. '2.2')")
    p.add_argument("--full-only", action="store_true",
                   help="Download only full snapshot, skip incremental")
    p.add_argument("--dry-run", action="store_true",
                   help="Find best source and print URL, don't download")
    p.add_argument("-v", "--verbose", action="store_true")
    args: argparse.Namespace = p.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    rpc_url: str = args.rpc or CLUSTER_RPC[args.cluster]
    # aria2c is required for actual downloads (not dry-run)
    if not args.dry_run and not shutil.which("aria2c"):
        log.error("aria2c not found. Install with: apt install aria2")
        return 1
    # Get current slot
    log.info("Cluster: %s | RPC: %s", args.cluster, rpc_url)
    current_slot: int | None = get_current_slot(rpc_url)
    if current_slot is None:
        log.error("Cannot get current slot from %s", rpc_url)
        return 1
    log.info("Current slot: %d", current_slot)
    # Discover sources
    sources: list[SnapshotSource] = discover_sources(
        rpc_url, current_slot,
        max_age_slots=args.max_snapshot_age,
        max_latency_ms=args.max_latency,
        threads=args.threads,
        version_filter=args.version,
    )
    if not sources:
        log.error("No snapshot sources found")
        return 1
    # Sort by latency (lowest first) for speed benchmarking
    sources.sort(key=lambda s: s.latency_ms)
    # Benchmark top candidates — all speeds in MiB/s (binary, 1 MiB = 1048576 bytes)
    log.info("Benchmarking download speed on top %d sources...", args.max_speed_checks)
    fast_sources: list[SnapshotSource] = []
    checked: int = 0
    min_speed_bytes: int = args.min_download_speed * 1024 * 1024  # MiB to bytes
    for source in sources:
        if checked >= args.max_speed_checks:
            break
        checked += 1
        speed: float = measure_speed(source.rpc_address, args.measurement_time)
        source.download_speed = speed
        speed_mib: float = speed / (1024 ** 2)
        if speed < min_speed_bytes:
            log.info(" %s: %.1f MiB/s (too slow, need >=%d MiB/s)",
                     source.rpc_address, speed_mib, args.min_download_speed)
            continue
        log.info(" %s: %.1f MiB/s (latency: %.0fms, age: %d slots)",
                 source.rpc_address, speed_mib,
                 source.latency_ms, source.slots_diff)
        fast_sources.append(source)
    if not fast_sources:
        log.error("No source met minimum speed requirement (%d MiB/s)",
                  args.min_download_speed)
        log.info("Try: --min-download-speed 10")
        return 1
    # Use the fastest source as primary, collect mirrors for each file
    best: SnapshotSource = fast_sources[0]
    file_paths: list[str] = best.file_paths
    if args.full_only:
        file_paths = [fp for fp in file_paths
                      if fp.rsplit("/", 1)[-1].startswith("snapshot-")]
    # Build mirror URL lists: for each file, collect URLs from all fast sources
    # that serve the same filename
    download_plan: list[tuple[str, list[str]]] = []
    for fp in file_paths:
        filename: str = fp.rsplit("/", 1)[-1]
        mirror_urls: list[str] = [f"http://{best.rpc_address}{fp}"]
        for other in fast_sources[1:]:
            for other_fp in other.file_paths:
                if other_fp.rsplit("/", 1)[-1] == filename:
                    mirror_urls.append(f"http://{other.rpc_address}{other_fp}")
                    break
        download_plan.append((filename, mirror_urls))
    speed_mib: float = best.download_speed / (1024 ** 2)
    log.info("Best source: %s (%.1f MiB/s), %d mirrors total",
             best.rpc_address, speed_mib, len(fast_sources))
    for filename, mirror_urls in download_plan:
        log.info(" %s (%d mirrors)", filename, len(mirror_urls))
        for url in mirror_urls:
            log.info(" %s", url)
    if args.dry_run:
        for _, mirror_urls in download_plan:
            for url in mirror_urls:
                print(url)
        return 0
    # Download — skip files that already exist locally
    os.makedirs(args.output, exist_ok=True)
    total_start: float = time.monotonic()
    for filename, mirror_urls in download_plan:
        filepath: Path = Path(args.output) / filename
        if filepath.exists() and filepath.stat().st_size > 0:
            log.info("Skipping %s (already exists: %.1f GB)",
                     filename, filepath.stat().st_size / (1024 ** 3))
            continue
        if not download_aria2c(mirror_urls, args.output, filename, args.connections):
            log.error("Failed to download %s", filename)
            return 1
    total_elapsed: float = time.monotonic() - total_start
    log.info("All downloads complete in %.0fs", total_elapsed)
    # Final summary of what landed on disk.
    for filename, _ in download_plan:
        fp: Path = Path(args.output) / filename
        if fp.exists():
            log.info(" %s (%.1f GB)", fp.name, fp.stat().st_size / (1024 ** 3))
    return 0
if __name__ == "__main__":
sys.exit(main())

View File

@ -1,109 +0,0 @@
# ZFS Setup for Biscayne
## Current State
```
biscayne none (pool root)
biscayne/DATA none
biscayne/DATA/home /home 42G
biscayne/DATA/home/solana /home/solana 2.9G
biscayne/DATA/srv /srv 712G
biscayne/DATA/srv/backups /srv/backups 208G
biscayne/DATA/volumes/solana (zvol, 4T) → block-mounted at /srv/solana
```
Docker root: `/var/lib/docker` on root filesystem (`/dev/md0`, 439G).
## Target State
```
biscayne/DATA/deployments /srv/deployments ← laconic-so deployment dirs (snapshotted)
biscayne/DATA/var/docker /var/lib/docker ← docker storage on ZFS
biscayne/DATA/volumes/solana (zvol, 4T) ← bulk solana data (not backed up)
```
## Steps
### 1. Create deployments dataset
```bash
zfs create -o mountpoint=/srv/deployments biscayne/DATA/deployments
```
### 2. Move docker onto ZFS
Stop docker and all containers first:
```bash
systemctl stop docker.socket docker.service
```
Move the existing directory aside, then create the datasets. The parent dataset
gets no mountpoint so that the child mounts directly at `/var/lib/docker`
(matching the Target State table above):
```bash
mv /var/lib/docker /var/lib/docker.bak
zfs create -o mountpoint=none biscayne/DATA/var
zfs create -o mountpoint=/var/lib/docker biscayne/DATA/var/docker
```
Copy existing docker data (if any worth keeping):
```bash
rsync -aHAX /var/lib/docker.bak/ /var/lib/docker/
```
Or just start fresh — the only running containers are telegraf/influxdb monitoring
which can be recreated.
Start docker:
```bash
systemctl start docker.service
```
### 3. Grant ZFS permissions to the backup user
```bash
zfs allow -u <backup-user> destroy,snapshot,send,hold,release,mount biscayne/DATA/deployments
```
### 4. Create remote receiving datasets
On mysterio:
```bash
zfs create -p edith/DATA/backlog/biscayne-main
```
On ardham:
```bash
zfs create -p batterywharf/DATA/backlog/biscayne-main
```
Snapshot sends from biscayne to these hosts will fail until SSH keys and
network access are configured; the backup script handles a failed send
gracefully (the snapshot hold is retained so the send can be retried).
### 5. Install backlog.sh and crontab
```bash
mkdir -p ~/.local/bin
cp scripts/backlog.sh ~/.local/bin/backlog.sh
chmod +x ~/.local/bin/backlog.sh
crontab -e
# Add: 01 0 * * * /home/<user>/.local/bin/backlog.sh
```
## Volume Layout
laconic-so deployment at `/srv/deployments/agave/`:
| Volume | Location | Backed up |
|---|---|---|
| validator-config | `/srv/deployments/agave/data/validator-config/` | Yes (ZFS snapshot) |
| doublezero-config | `/srv/deployments/agave/data/doublezero-config/` | Yes (ZFS snapshot) |
| validator-ledger | `/srv/solana/ledger/` (zvol) | No (rebuildable) |
| validator-accounts | `/srv/solana/accounts/` (zvol) | No (rebuildable) |
| validator-snapshots | `/srv/solana/snapshots/` (zvol) | No (rebuildable) |
The laconic-so spec.yml must map the heavy volumes to zvol paths and the small
config volumes to the deployment directory.

View File

@ -1,112 +0,0 @@
services:
  agave-rpc:
    restart: unless-stopped
    image: laconicnetwork/agave:local
    network_mode: host
    privileged: true
    cap_add:
      - IPC_LOCK
    # Compose owns all defaults. spec.yml overrides per-deployment.
    environment:
      AGAVE_MODE: rpc
      # Required — no defaults
      VALIDATOR_ENTRYPOINT: ${VALIDATOR_ENTRYPOINT}
      KNOWN_VALIDATOR: ${KNOWN_VALIDATOR}
      # Optional with defaults
      EXTRA_ENTRYPOINTS: ${EXTRA_ENTRYPOINTS:-}
      EXTRA_KNOWN_VALIDATORS: ${EXTRA_KNOWN_VALIDATORS:-}
      RPC_PORT: ${RPC_PORT:-8899}
      RPC_BIND_ADDRESS: ${RPC_BIND_ADDRESS:-127.0.0.1}
      GOSSIP_PORT: ${GOSSIP_PORT:-8001}
      DYNAMIC_PORT_RANGE: ${DYNAMIC_PORT_RANGE:-9000-10000}
      EXPECTED_GENESIS_HASH: ${EXPECTED_GENESIS_HASH:-}
      EXPECTED_SHRED_VERSION: ${EXPECTED_SHRED_VERSION:-}
      LIMIT_LEDGER_SIZE: ${LIMIT_LEDGER_SIZE:-50000000}
      NO_SNAPSHOTS: ${NO_SNAPSHOTS:-false}
      SNAPSHOT_INTERVAL_SLOTS: ${SNAPSHOT_INTERVAL_SLOTS:-100000}
      MAXIMUM_SNAPSHOTS_TO_RETAIN: ${MAXIMUM_SNAPSHOTS_TO_RETAIN:-1}
      NO_INCREMENTAL_SNAPSHOTS: ${NO_INCREMENTAL_SNAPSHOTS:-false}
      ACCOUNT_INDEXES: ${ACCOUNT_INDEXES:-}
      PUBLIC_RPC_ADDRESS: ${PUBLIC_RPC_ADDRESS:-}
      GOSSIP_HOST: ${GOSSIP_HOST:-}
      PUBLIC_TVU_ADDRESS: ${PUBLIC_TVU_ADDRESS:-}
      RUST_LOG: ${RUST_LOG:-info}
      SOLANA_METRICS_CONFIG: ${SOLANA_METRICS_CONFIG:-}
      JITO_ENABLE: ${JITO_ENABLE:-false}
      JITO_BLOCK_ENGINE_URL: ${JITO_BLOCK_ENGINE_URL:-}
      JITO_SHRED_RECEIVER_ADDR: ${JITO_SHRED_RECEIVER_ADDR:-}
      JITO_TIP_PAYMENT_PROGRAM: ${JITO_TIP_PAYMENT_PROGRAM:-}
      JITO_DISTRIBUTION_PROGRAM: ${JITO_DISTRIBUTION_PROGRAM:-}
      JITO_MERKLE_ROOT_AUTHORITY: ${JITO_MERKLE_ROOT_AUTHORITY:-}
      JITO_COMMISSION_BPS: ${JITO_COMMISSION_BPS:-0}
      EXTRA_ARGS: ${EXTRA_ARGS:-}
      SNAPSHOT_AUTO_DOWNLOAD: ${SNAPSHOT_AUTO_DOWNLOAD:-true}
      SNAPSHOT_MAX_AGE_SLOTS: ${SNAPSHOT_MAX_AGE_SLOTS:-20000}
      PROBE_GRACE_SECONDS: ${PROBE_GRACE_SECONDS:-600}
      PROBE_MAX_SLOT_LAG: ${PROBE_MAX_SLOT_LAG:-20000}
    deploy:
      resources:
        reservations:
          cpus: '4.0'
          memory: 256000M
        limits:
          cpus: '32.0'
          memory: 921600M
    volumes:
      - rpc-config:/data/config
      - rpc-ledger:/data/ledger
      - rpc-accounts:/data/accounts
      - rpc-snapshots:/data/snapshots
    # NOTE(review): with `network_mode: host` the container uses the host's
    # ports directly and Compose ignores this `ports:` section; it serves
    # here as documentation of the ports in use — confirm intent.
    ports:
      # RPC ports
      - "8899"
      - "8900"
      # Gossip port
      - "8001"
      - "8001/udp"
      # Dynamic port range for TPU/TVU/repair (9000-9025, 26 ports)
      - "9000/udp"
      - "9001/udp"
      - "9002/udp"
      - "9003/udp"
      - "9004/udp"
      - "9005/udp"
      - "9006/udp"
      - "9007/udp"
      - "9008/udp"
      - "9009/udp"
      - "9010/udp"
      - "9011/udp"
      - "9012/udp"
      - "9013/udp"
      - "9014/udp"
      - "9015/udp"
      - "9016/udp"
      - "9017/udp"
      - "9018/udp"
      - "9019/udp"
      - "9020/udp"
      - "9021/udp"
      - "9022/udp"
      - "9023/udp"
      - "9024/udp"
      - "9025/udp"
    # Unlimited locked memory (see cap_add: IPC_LOCK) and a high fd limit.
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 1000000
        hard: 1000000
    healthcheck:
      test: ["CMD", "entrypoint.py", "probe"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 600s

volumes:
  rpc-config:
  rpc-ledger:
  rpc-accounts:
  rpc-snapshots:

View File

@ -1,27 +0,0 @@
services:
  agave-test:
    restart: unless-stopped
    image: laconicnetwork/agave:local
    security_opt:
      - seccomp=unconfined
    environment:
      AGAVE_MODE: test
      # Optional — fall back to empty / the shown defaults when unset
      FACILITATOR_PUBKEY: ${FACILITATOR_PUBKEY:-}
      SERVER_PUBKEY: ${SERVER_PUBKEY:-}
      CLIENT_PUBKEY: ${CLIENT_PUBKEY:-}
      MINT_DECIMALS: ${MINT_DECIMALS:-6}
      MINT_AMOUNT: ${MINT_AMOUNT:-1000000000}
    volumes:
      - test-ledger:/data/ledger
    ports:
      - "8899"
      - "8900"
    # Healthy once the node answers `solana cluster-version` over local RPC.
    healthcheck:
      test: ["CMD", "solana", "cluster-version", "--url", "http://127.0.0.1:8899"]
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 10s

volumes:
  test-ledger:

View File

@ -1,115 +0,0 @@
# Voting Agave validator service (entrypoint.py "validator" mode).
services:
  agave-validator:
    restart: unless-stopped
    image: laconicnetwork/agave:local
    # Host networking: gossip/TVU/TPU need stable, externally reachable ports.
    network_mode: host
    # NOTE(review): privileged plus IPC_LOCK (memlock) — confirm whether full
    # privileged mode is really required or specific capabilities would do.
    privileged: true
    cap_add:
      - IPC_LOCK
    # Compose owns all defaults. spec.yml overrides per-deployment.
    environment:
      AGAVE_MODE: ${AGAVE_MODE:-validator}
      # Required — no defaults
      VALIDATOR_ENTRYPOINT: ${VALIDATOR_ENTRYPOINT}
      KNOWN_VALIDATOR: ${KNOWN_VALIDATOR}
      # Optional with defaults
      EXTRA_ENTRYPOINTS: ${EXTRA_ENTRYPOINTS:-}
      EXTRA_KNOWN_VALIDATORS: ${EXTRA_KNOWN_VALIDATORS:-}
      RPC_PORT: ${RPC_PORT:-8899}
      RPC_BIND_ADDRESS: ${RPC_BIND_ADDRESS:-127.0.0.1}
      GOSSIP_PORT: ${GOSSIP_PORT:-8001}
      DYNAMIC_PORT_RANGE: ${DYNAMIC_PORT_RANGE:-9000-10000}
      EXPECTED_GENESIS_HASH: ${EXPECTED_GENESIS_HASH:-}
      EXPECTED_SHRED_VERSION: ${EXPECTED_SHRED_VERSION:-}
      LIMIT_LEDGER_SIZE: ${LIMIT_LEDGER_SIZE:-50000000}
      NO_SNAPSHOTS: ${NO_SNAPSHOTS:-false}
      SNAPSHOT_INTERVAL_SLOTS: ${SNAPSHOT_INTERVAL_SLOTS:-100000}
      MAXIMUM_SNAPSHOTS_TO_RETAIN: ${MAXIMUM_SNAPSHOTS_TO_RETAIN:-1}
      NO_INCREMENTAL_SNAPSHOTS: ${NO_INCREMENTAL_SNAPSHOTS:-false}
      ACCOUNT_INDEXES: ${ACCOUNT_INDEXES:-}
      VOTE_ACCOUNT_KEYPAIR: ${VOTE_ACCOUNT_KEYPAIR:-/data/config/vote-account-keypair.json}
      GOSSIP_HOST: ${GOSSIP_HOST:-}
      PUBLIC_TVU_ADDRESS: ${PUBLIC_TVU_ADDRESS:-}
      RUST_LOG: ${RUST_LOG:-info}
      SOLANA_METRICS_CONFIG: ${SOLANA_METRICS_CONFIG:-}
      # Jito MEV settings — only read by entrypoint.py when JITO_ENABLE=true.
      JITO_ENABLE: ${JITO_ENABLE:-false}
      JITO_BLOCK_ENGINE_URL: ${JITO_BLOCK_ENGINE_URL:-}
      JITO_RELAYER_URL: ${JITO_RELAYER_URL:-}
      JITO_SHRED_RECEIVER_ADDR: ${JITO_SHRED_RECEIVER_ADDR:-}
      JITO_TIP_PAYMENT_PROGRAM: ${JITO_TIP_PAYMENT_PROGRAM:-}
      JITO_DISTRIBUTION_PROGRAM: ${JITO_DISTRIBUTION_PROGRAM:-}
      JITO_MERKLE_ROOT_AUTHORITY: ${JITO_MERKLE_ROOT_AUTHORITY:-}
      JITO_COMMISSION_BPS: ${JITO_COMMISSION_BPS:-0}
      EXTRA_ARGS: ${EXTRA_ARGS:-}
      # Snapshot/probe tuning consumed by entrypoint.py serve/probe.
      SNAPSHOT_AUTO_DOWNLOAD: ${SNAPSHOT_AUTO_DOWNLOAD:-true}
      SNAPSHOT_MAX_AGE_SLOTS: ${SNAPSHOT_MAX_AGE_SLOTS:-20000}
      PROBE_GRACE_SECONDS: ${PROBE_GRACE_SECONDS:-600}
      PROBE_MAX_SLOT_LAG: ${PROBE_MAX_SLOT_LAG:-20000}
    deploy:
      resources:
        reservations:
          cpus: '4.0'
          memory: 256000M
        limits:
          cpus: '32.0'
          memory: 921600M
    volumes:
      - validator-config:/data/config
      - validator-ledger:/data/ledger
      - validator-accounts:/data/accounts
      - validator-snapshots:/data/snapshots
      - validator-log:/data/log
    # NOTE(review): with network_mode: host these port entries are ignored by
    # Docker — presumably kept as documentation of the ports the node uses.
    ports:
      # RPC ports
      - "8899"
      - "8900"
      # Gossip port
      - "8001"
      - "8001/udp"
      # Dynamic port range for TPU/TVU/repair (9000-9025, 26 ports)
      - "9000/udp"
      - "9001/udp"
      - "9002/udp"
      - "9003/udp"
      - "9004/udp"
      - "9005/udp"
      - "9006/udp"
      - "9007/udp"
      - "9008/udp"
      - "9009/udp"
      - "9010/udp"
      - "9011/udp"
      - "9012/udp"
      - "9013/udp"
      - "9014/udp"
      - "9015/udp"
      - "9016/udp"
      - "9017/udp"
      - "9018/udp"
      - "9019/udp"
      - "9020/udp"
      - "9021/udp"
      - "9022/udp"
      - "9023/udp"
      - "9024/udp"
      - "9025/udp"
    ulimits:
      # Unlimited locked memory; very high fd limit for accounts/ledger I/O.
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 1000000
        hard: 1000000
    healthcheck:
      # Delegates to entrypoint.py's probe subcommand (slot-lag check).
      test: ["CMD", "entrypoint.py", "probe"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 600s
volumes:
  validator-config:
  validator-ledger:
  validator-accounts:
  validator-snapshots:
  validator-log:

View File

@ -1,19 +0,0 @@
# DoubleZero daemon sidecar for the validator deployment.
services:
  doublezerod:
    restart: unless-stopped
    image: laconicnetwork/doublezero:local
    # Host networking + NET_ADMIN: the daemon manipulates host network state.
    network_mode: host
    # NOTE(review): privileged AND NET_ADMIN — privileged already implies all
    # capabilities; confirm both are required.
    privileged: true
    cap_add:
      - NET_ADMIN
    environment:
      DOUBLEZERO_RPC_ENDPOINT: ${DOUBLEZERO_RPC_ENDPOINT:-http://127.0.0.1:8899}
      DOUBLEZERO_ENV: ${DOUBLEZERO_ENV:-mainnet-beta}
      DOUBLEZERO_EXTRA_ARGS: ${DOUBLEZERO_EXTRA_ARGS:-}
    volumes:
      # Validator identity is shared read-only from the validator deployment.
      - doublezero-validator-identity:/data/config:ro
      - doublezero-config:/root/.config/doublezero
volumes:
  doublezero-validator-identity:
  doublezero-config:

View File

@ -1,49 +0,0 @@
# Monitoring stack: Telegraf → InfluxDB 1.8 → Grafana.
services:
  # Time-series store for agave metrics written by telegraf.
  monitoring-influxdb:
    image: influxdb:1.8
    restart: unless-stopped
    environment:
      INFLUXDB_DB: agave_metrics
      INFLUXDB_HTTP_AUTH_ENABLED: "true"
      # NOTE(review): default admin/admin credentials — override per deployment.
      INFLUXDB_ADMIN_USER: admin
      INFLUXDB_ADMIN_PASSWORD: admin
      INFLUXDB_REPORTING_DISABLED: "true"
    volumes:
      - monitoring-influxdb-data:/var/lib/influxdb
    ports:
      - "8086"
  # Dashboards; datasource/dashboard provisioning is mounted read-only.
  monitoring-grafana:
    image: grafana/grafana:latest
    restart: unless-stopped
    environment:
      # NOTE(review): default admin/admin credentials — override per deployment.
      GF_SECURITY_ADMIN_PASSWORD: admin
      GF_SECURITY_ADMIN_USER: admin
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_PATHS_DATA: /var/lib/grafana
    volumes:
      - monitoring-grafana-data:/var/lib/grafana
      - monitoring-grafana-datasources:/etc/grafana/provisioning/datasources:ro
      - monitoring-grafana-dashboards:/etc/grafana/provisioning/dashboards:ro
    ports:
      - "3000"
  # Metric collector: runs the exec-input scripts against the node's RPC.
  monitoring-telegraf:
    image: telegraf:1.36
    restart: unless-stopped
    # Host networking so "localhost" URLs reach the host-networked node RPC.
    network_mode: host
    environment:
      NODE_RPC_URL: ${NODE_RPC_URL:-http://localhost:8899}
      CANONICAL_RPC_URL: ${CANONICAL_RPC_URL:-https://api.mainnet-beta.solana.com}
      INFLUXDB_URL: ${INFLUXDB_URL:-http://localhost:8086}
    volumes:
      - monitoring-telegraf-config:/etc/telegraf:ro
      - monitoring-telegraf-scripts:/scripts:ro
volumes:
  monitoring-influxdb-data:
  monitoring-grafana-data:
  monitoring-grafana-datasources:
  monitoring-grafana-dashboards:
  monitoring-telegraf-config:
  monitoring-telegraf-scripts:

View File

@ -1,8 +0,0 @@
#!/bin/sh
# Restart container(s) matching a docker label filter.
# Used by the cron-based restarter sidecar.
#
# Usage: restart-node.sh <label-filter>    e.g. restart-node.sh role=validator
label_filter="$1"
# `docker ps -q` may print several IDs (one per line). Restart each one
# individually — passing the whole multi-line result as a single quoted
# argument would make `docker restart` fail on the second ID.
docker ps -qf "label=$label_filter" | while read -r container; do
    [ -n "$container" ] && docker restart -s TERM "$container" > /dev/null
done

View File

@ -1,4 +0,0 @@
# Periodic node restarts executed by the restarter sidecar via restart-node.sh.
# Restart validator every 4 hours (mitigate memory leaks)
0 */4 * * * /scripts/restart-node.sh role=validator
# Restart RPC every 6 hours (staggered from validator)
# The :30 minute offset guarantees the two restarts never fire simultaneously.
30 */6 * * * /scripts/restart-node.sh role=rpc

View File

@ -1,12 +0,0 @@
# Grafana dashboard provisioning: load dashboard definitions from the
# mounted provisioning directory (flat layout, no folder mirroring).
apiVersion: 1
providers:
  - name: 'default'
    orgId: 1
    folder: ''
    type: file
    disableDeletion: false
    editable: true
    options:
      path: /etc/grafana/provisioning/dashboards
      foldersFromFilesStructure: false

View File

@ -1,16 +0,0 @@
# Grafana datasource provisioning: points at the monitoring InfluxDB service.
apiVersion: 1
datasources:
  - name: InfluxDB
    type: influxdb
    access: proxy
    url: http://monitoring-influxdb:8086
    database: agave_metrics
    user: admin
    isDefault: true
    editable: true
    secureJsonData:
      # NOTE(review): hardcoded default password (matches the compose
      # INFLUXDB_ADMIN_PASSWORD) — override/rotate per deployment.
      password: admin
    jsonData:
      timeInterval: 10s
      httpMode: GET

View File

@ -1,17 +0,0 @@
#!/bin/bash
# Query canonical mainnet slot for sync lag comparison
# Emits InfluxDB line protocol on stdout (consumed by telegraf inputs.exec).
set -euo pipefail
CANONICAL_RPC="${CANONICAL_RPC_URL:-https://api.mainnet-beta.solana.com}"
# On curl failure fall back to a zero result so the parsing below still works
# under `set -e`/pipefail.
response=$(curl -s --max-time 10 -X POST \
    -H "Content-Type: application/json" \
    -d '{"jsonrpc":"2.0","id":1,"method":"getSlot"}' \
    "$CANONICAL_RPC" 2>/dev/null || echo '{"result":0}')
# Extract the integer result with grep (keeps the image free of jq).
slot=$(echo "$response" | grep -o '"result":[0-9]*' | grep -o '[0-9]*' || echo "0")
# Suppress the metric entirely when the slot is unknown (0).
if [ "$slot" != "0" ]; then
    echo "canonical_slot slot=${slot}i"
fi

View File

@ -1,33 +0,0 @@
#!/bin/bash
# Check getSlot RPC latency
# Outputs metrics in InfluxDB line protocol format
set -euo pipefail
RPC_URL="${NODE_RPC_URL:-http://localhost:8899}"
RPC_PAYLOAD='{"jsonrpc":"2.0","id":1,"method":"getSlot"}'
# -w appends http_code and time_total after the body; on curl failure emit a
# sentinel "000 / 0" record so the parsing below still succeeds.
response=$(curl -sk --max-time 10 -X POST \
    -H "Content-Type: application/json" \
    -d "$RPC_PAYLOAD" \
    -w "\n%{http_code}\n%{time_total}" \
    "$RPC_URL" 2>/dev/null || echo -e "\n000\n0")
json_response=$(echo "$response" | head -n 1)
# curl -w output follows response body; blank lines may appear between them
http_code=$(echo "$response" | tail -2 | head -1)
time_total=$(echo "$response" | tail -1)
# Convert curl's seconds (float) to whole milliseconds.
latency_ms="$(awk -v t="$time_total" 'BEGIN { printf "%.0f", (t * 1000) }')"
# Strip leading zeros from http_code (influx line protocol rejects 000i)
http_code=$((10#${http_code:-0}))
if [ "$http_code" = "200" ]; then
    # A zero/absent result counts as failure even with HTTP 200.
    slot=$(echo "$json_response" | grep -o '"result":[0-9]*' | grep -o '[0-9]*' || echo "0")
    [ "$slot" != "0" ] && success=1 || success=0
else
    success=0
    slot=0
fi
# NOTE(review): latency_ms has no trailing `i`, so InfluxDB stores it as a
# float while the other fields are integers — confirm dashboards expect that.
echo "rpc_latency,endpoint=direct,method=getSlot latency_ms=${latency_ms},success=${success}i,http_code=${http_code}i,slot=${slot}i"

View File

@ -1,36 +0,0 @@
# Telegraf configuration for Agave monitoring
[agent]
  # Collect and flush every 10s; buffer up to 10k points when InfluxDB is down.
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = "0s"
  hostname = "telegraf"
  omit_hostname = false

# Output to InfluxDB
[[outputs.influxdb]]
  # localhost works because the telegraf container runs with host networking.
  urls = ["http://localhost:8086"]
  database = "agave_metrics"
  # Database is provisioned by the influxdb container (INFLUXDB_DB).
  skip_database_creation = true
  # NOTE(review): default admin/admin credentials — keep in sync with the
  # compose file and override per deployment.
  username = "admin"
  password = "admin"
  retention_policy = ""
  write_consistency = "any"
  timeout = "5s"

# Custom getSlot latency check
[[inputs.exec]]
  commands = ["/scripts/check_getslot_latency.sh"]
  timeout = "30s"
  data_format = "influx"

# Canonical mainnet slot tracking
[[inputs.exec]]
  commands = ["/scripts/check_canonical_slot.sh"]
  timeout = "30s"
  data_format = "influx"

View File

@ -1,81 +0,0 @@
# Unified Agave/Jito Solana image
# Supports three modes via AGAVE_MODE env: test, rpc, validator
#
# Build args:
#   AGAVE_REPO    - git repo URL (anza-xyz/agave or jito-foundation/jito-solana)
#   AGAVE_VERSION - git tag to build (e.g. v3.1.9, v3.1.8-jito)
ARG AGAVE_REPO=https://github.com/anza-xyz/agave.git
ARG AGAVE_VERSION=v3.1.9

# ---------- Stage 1: Build ----------
FROM rust:1.85-bookworm AS builder

# Re-declare build args inside the stage (ARG scope resets at each FROM).
ARG AGAVE_REPO
ARG AGAVE_VERSION

# Toolchain and native dependencies needed to compile agave.
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    pkg-config \
    libssl-dev \
    libudev-dev \
    libclang-dev \
    protobuf-compiler \
    ca-certificates \
    git \
    cmake \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build
# Shallow clone of just the requested tag.
RUN git clone "$AGAVE_REPO" --depth 1 --branch "$AGAVE_VERSION" --recurse-submodules agave
WORKDIR /build/agave

# Cherry-pick --public-tvu-address support (anza-xyz/agave PR #6778, commit 9f4b3ae)
# This flag only exists on master, not in v3.1.9 — fetch the PR ref and cherry-pick
# Set TVU_ADDRESS_PR to "" to skip the cherry-pick.
ARG TVU_ADDRESS_PR=6778
RUN if [ -n "$TVU_ADDRESS_PR" ]; then \
    git fetch --depth 50 origin "pull/${TVU_ADDRESS_PR}/head:tvu-pr" && \
    git cherry-pick --no-commit tvu-pr; \
    fi

# Build all binaries using the upstream install script
RUN CI_COMMIT=$(git rev-parse HEAD) scripts/cargo-install-all.sh /solana-release

# ---------- Stage 2: Runtime ----------
FROM debian:bookworm-slim

# Runtime-only dependencies (python3 for entrypoint.py; aria2 presumably for
# snapshot downloads via snapshot_download.py — confirm).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl3 \
    libudev1 \
    curl \
    sudo \
    aria2 \
    python3 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user with sudo
# NOTE(review): passwordless sudo effectively grants in-container root to the
# runtime user — entrypoint.py uses `sudo chown`; confirm this is acceptable.
RUN useradd -m -s /bin/bash agave \
    && echo "agave ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

# Copy all compiled binaries
COPY --from=builder /solana-release/bin/ /usr/local/bin/

# Copy entrypoint and support scripts
COPY entrypoint.py snapshot_download.py ip_echo_preflight.py /usr/local/bin/
COPY start-test.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/entrypoint.py /usr/local/bin/start-test.sh

# Create data directories
RUN mkdir -p /data/config /data/ledger /data/accounts /data/snapshots \
    && chown -R agave:agave /data

USER agave
WORKDIR /data

ENV RUST_LOG=info
ENV RUST_BACKTRACE=1

# RPC, pubsub, gossip (tcp+udp).
EXPOSE 8899 8900 8001 8001/udp

ENTRYPOINT ["entrypoint.py"]

View File

@ -1,17 +0,0 @@
#!/usr/bin/env bash
# Build laconicnetwork/agave
# Set AGAVE_REPO and AGAVE_VERSION env vars to build Jito or a different version
# Quote path expansions so directories containing spaces don't word-split.
source "${CERC_CONTAINER_BASE_DIR}/build-base.sh"
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
AGAVE_REPO="${AGAVE_REPO:-https://github.com/anza-xyz/agave.git}"
AGAVE_VERSION="${AGAVE_VERSION:-v3.1.9}"
# ${build_command_args} is intentionally unquoted: it carries zero or more
# whitespace-separated extra flags for `docker build` (set by build-base.sh).
docker build -t laconicnetwork/agave:local \
    --build-arg AGAVE_REPO="$AGAVE_REPO" \
    --build-arg AGAVE_VERSION="$AGAVE_VERSION" \
    ${build_command_args} \
    -f "${SCRIPT_DIR}/Dockerfile" \
    "${SCRIPT_DIR}"

View File

@ -1,686 +0,0 @@
#!/usr/bin/env python3
"""Agave validator entrypoint — snapshot management, arg construction, liveness probe.
Two subcommands:
entrypoint.py serve (default) snapshot freshness check + run agave-validator
entrypoint.py probe liveness probe (slot lag check, exits 0/1)
Replaces the bash entrypoint.sh / start-rpc.sh / start-validator.sh with a single
Python module. Test mode still dispatches to start-test.sh.
Python stays as PID 1 and traps SIGTERM. On SIGTERM, it runs
``agave-validator exit --force --ledger /data/ledger`` which connects to the
admin RPC Unix socket and tells the validator to flush I/O and exit cleanly.
This avoids the io_uring/ZFS deadlock that occurs when the process is killed.
All configuration comes from environment variables — the same vars as the
original bash scripts. See compose files for defaults.
"""
from __future__ import annotations
import json
import logging
import os
import re
import signal
import subprocess
import sys
import threading
import time
import urllib.error
import urllib.request
from pathlib import Path
from urllib.request import Request
log: logging.Logger = logging.getLogger("entrypoint")

# Data directories inside the container (volume mounts; see compose files).
CONFIG_DIR = "/data/config"
LEDGER_DIR = "/data/ledger"
ACCOUNTS_DIR = "/data/accounts"
SNAPSHOTS_DIR = "/data/snapshots"
LOG_DIR = "/data/log"
# Node identity keypair: mounted for validator mode, generated for RPC mode.
IDENTITY_FILE = f"{CONFIG_DIR}/validator-identity.json"

# Snapshot filename patterns:
#   snapshot-<slot>-<hash>.tar.(zst|bz2)
#   incremental-snapshot-<base_slot>-<slot>-<hash>.tar.(zst|bz2)
FULL_SNAP_RE: re.Pattern[str] = re.compile(
    r"^snapshot-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$"
)
INCR_SNAP_RE: re.Pattern[str] = re.compile(
    r"^incremental-snapshot-(\d+)-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$"
)

# Public RPC endpoint used as the canonical slot reference.
MAINNET_RPC = "https://api.mainnet-beta.solana.com"
# -- Helpers -------------------------------------------------------------------
def env(name: str, default: str = "") -> str:
    """Return environment variable *name*, or *default* when unset."""
    return os.getenv(name, default)
def env_required(name: str) -> str:
    """Return env var *name*; log an error and exit(1) when unset or empty."""
    value = os.environ.get(name)
    if value:
        return value
    log.error("%s is required but not set", name)
    sys.exit(1)
def env_bool(name: str, default: bool = False) -> bool:
    """Interpret env var *name* as a boolean.

    "true"/"1"/"yes" (case-insensitive) are True; any other non-empty value
    is False; unset or empty returns *default*.
    """
    raw = os.environ.get(name, "").lower()
    return raw in ("true", "1", "yes") if raw else default
def rpc_get_slot(url: str, timeout: int = 10) -> int | None:
    """Return the current slot reported by the Solana RPC at *url*.

    Sends a JSON-RPC ``getSlot`` request. Returns None on any network,
    timeout, or decode error, or when the result is not an int — callers
    treat None as "unreachable".
    """
    body = json.dumps(
        {"jsonrpc": "2.0", "id": 1, "method": "getSlot", "params": []}
    ).encode()
    request = Request(url, data=body,
                      headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            slot = json.loads(response.read()).get("result")
    except (urllib.error.URLError, json.JSONDecodeError, OSError, TimeoutError):
        return None
    return slot if isinstance(slot, int) else None
# -- Snapshot management -------------------------------------------------------
def get_local_snapshot_slot(snapshots_dir: str) -> int | None:
"""Find the highest slot among local snapshot files."""
best_slot: int | None = None
snap_path = Path(snapshots_dir)
if not snap_path.is_dir():
return None
for entry in snap_path.iterdir():
m = FULL_SNAP_RE.match(entry.name)
if m:
slot = int(m.group(1))
if best_slot is None or slot > best_slot:
best_slot = slot
return best_slot
def clean_snapshots(snapshots_dir: str) -> None:
    """Delete every full and incremental snapshot file in *snapshots_dir*.

    A missing directory is a no-op; files that do not look like snapshots
    are left untouched.
    """
    directory = Path(snapshots_dir)
    if not directory.is_dir():
        return
    logger = logging.getLogger("entrypoint")
    for entry in directory.iterdir():
        if entry.name.startswith(("snapshot-", "incremental-snapshot-")):
            logger.info("Removing old snapshot: %s", entry.name)
            entry.unlink(missing_ok=True)
def get_incremental_slot(snapshots_dir: str, full_slot: int | None) -> int | None:
"""Get the highest incremental snapshot slot matching the full's base slot."""
if full_slot is None:
return None
snap_path = Path(snapshots_dir)
if not snap_path.is_dir():
return None
best: int | None = None
for entry in snap_path.iterdir():
m = INCR_SNAP_RE.match(entry.name)
if m and int(m.group(1)) == full_slot:
slot = int(m.group(2))
if best is None or slot > best:
best = slot
return best
def maybe_download_snapshot(snapshots_dir: str) -> None:
    """Ensure full + incremental snapshots exist before starting.

    The validator should always start from a full + incremental pair to
    minimize replay time. If either is missing or the full is too old,
    download fresh ones via download_best_snapshot (which does rolling
    incremental convergence after downloading the full).

    Controlled by env vars:
      SNAPSHOT_AUTO_DOWNLOAD (default: true)   enable/disable
      SNAPSHOT_MAX_AGE_SLOTS (default: 100000) full snapshot staleness threshold
                                               (one full snapshot generation,
                                               ~11 hours; compose files override
                                               this, e.g. to 20000)

    Blocks (with retries) until suitable snapshots exist or the check is
    skipped; never raises on download failure.
    """
    if not env_bool("SNAPSHOT_AUTO_DOWNLOAD", default=True):
        log.info("Snapshot auto-download disabled")
        return
    max_age = int(env("SNAPSHOT_MAX_AGE_SLOTS", "100000"))
    mainnet_slot = rpc_get_slot(MAINNET_RPC)
    if mainnet_slot is None:
        # Without a reference slot we cannot judge staleness — proceed with
        # whatever snapshots are on disk.
        log.warning("Cannot reach mainnet RPC — skipping snapshot check")
        return
    # Deferred import: snapshot_download.py lives next to this script, which
    # is not necessarily on sys.path.
    script_dir = Path(__file__).resolve().parent
    sys.path.insert(0, str(script_dir))
    from snapshot_download import download_best_snapshot, download_incremental_for_slot
    convergence = int(env("SNAPSHOT_CONVERGENCE_SLOTS", "500"))
    retry_delay = int(env("SNAPSHOT_RETRY_DELAY", "60"))
    # Check local full snapshot
    local_slot = get_local_snapshot_slot(snapshots_dir)
    have_fresh_full = (local_slot is not None
                       and (mainnet_slot - local_slot) <= max_age)
    if have_fresh_full:
        assert local_slot is not None
        inc_slot = get_incremental_slot(snapshots_dir, local_slot)
        if inc_slot is not None:
            inc_gap = mainnet_slot - inc_slot
            if inc_gap <= convergence:
                # Both snapshots fresh enough — nothing to download.
                log.info("Full (slot %d) + incremental (slot %d, gap %d) "
                         "within convergence, starting",
                         local_slot, inc_slot, inc_gap)
                return
            log.info("Incremental too stale (slot %d, gap %d > %d)",
                     inc_slot, inc_gap, convergence)
        # Fresh full, need a fresh incremental
        log.info("Downloading incremental for full at slot %d", local_slot)
        while True:
            if download_incremental_for_slot(snapshots_dir, local_slot,
                                             convergence_slots=convergence):
                return
            log.warning("Incremental download failed — retrying in %ds",
                        retry_delay)
            time.sleep(retry_delay)
    # No full or full too old — download both
    log.info("Downloading full + incremental")
    clean_snapshots(snapshots_dir)
    while True:
        if download_best_snapshot(snapshots_dir, convergence_slots=convergence):
            return
        log.warning("Snapshot download failed — retrying in %ds", retry_delay)
        time.sleep(retry_delay)
# -- Directory and identity setup ----------------------------------------------
def ensure_dirs(*dirs: str) -> None:
    """Create each directory (if needed) and chown it to the current uid:gid.

    The recursive chown runs via sudo and is best-effort: a non-zero exit is
    ignored (check=False) and a missing sudo binary is tolerated — in that
    case the directories are assumed to be owned correctly already.
    """
    owner = f"{os.getuid()}:{os.getgid()}"
    for directory in dirs:
        os.makedirs(directory, exist_ok=True)
        try:
            subprocess.run(
                ["sudo", "chown", "-R", owner, directory],
                check=False, capture_output=True,
            )
        except FileNotFoundError:
            # sudo not available — dirs already owned correctly
            pass
def ensure_identity_rpc() -> None:
    """Create an ephemeral RPC-node identity keypair unless one is mounted.

    Raises CalledProcessError if solana-keygen fails (check=True).
    """
    if os.path.isfile(IDENTITY_FILE):
        return
    logging.getLogger("entrypoint").info(
        "Generating RPC node identity keypair...")
    subprocess.run(
        ["solana-keygen", "new", "--no-passphrase", "--silent",
         "--force", "--outfile", IDENTITY_FILE],
        check=True,
    )
def print_identity() -> None:
    """Log this node's identity pubkey (best-effort; failures are silent)."""
    proc = subprocess.run(
        ["solana-keygen", "pubkey", IDENTITY_FILE],
        capture_output=True, text=True, check=False,
    )
    if proc.returncode == 0:
        logging.getLogger("entrypoint").info(
            "Node identity: %s", proc.stdout.strip())
# -- Arg construction ----------------------------------------------------------
def build_common_args() -> list[str]:
    """Build agave-validator args common to both RPC and validator modes.

    Configuration is read entirely from environment variables (defaults live
    in the compose files). VALIDATOR_ENTRYPOINT and KNOWN_VALIDATOR are
    required — env_required() exits(1) when they are missing.

    Returns:
        Flag list (no executable name) in a stable order.
    """
    args: list[str] = [
        "--identity", IDENTITY_FILE,
        "--entrypoint", env_required("VALIDATOR_ENTRYPOINT"),
        "--known-validator", env_required("KNOWN_VALIDATOR"),
        "--ledger", LEDGER_DIR,
        "--accounts", ACCOUNTS_DIR,
        "--snapshots", SNAPSHOTS_DIR,
        "--rpc-port", env("RPC_PORT", "8899"),
        "--rpc-bind-address", env("RPC_BIND_ADDRESS", "127.0.0.1"),
        "--gossip-port", env("GOSSIP_PORT", "8001"),
        "--dynamic-port-range", env("DYNAMIC_PORT_RANGE", "9000-10000"),
        "--no-os-network-limits-test",
        "--wal-recovery-mode", "skip_any_corrupted_record",
        "--limit-ledger-size", env("LIMIT_LEDGER_SIZE", "50000000"),
        "--no-snapshot-fetch",  # entrypoint handles snapshot download
    ]
    # Snapshot generation: either fully disabled, or full (+ optionally
    # incremental) snapshot retention flags.
    if env("NO_SNAPSHOTS") == "true":
        args.append("--no-snapshots")
    else:
        args += [
            "--full-snapshot-interval-slots", env("SNAPSHOT_INTERVAL_SLOTS", "100000"),
            "--maximum-full-snapshots-to-retain", env("MAXIMUM_SNAPSHOTS_TO_RETAIN", "1"),
        ]
        if env("NO_INCREMENTAL_SNAPSHOTS") != "true":
            args += ["--maximum-incremental-snapshots-to-retain", "2"]
    # Account indexes (comma-separated list; each entry becomes one
    # --account-index flag)
    account_indexes = env("ACCOUNT_INDEXES")
    if account_indexes:
        for idx in account_indexes.split(","):
            idx = idx.strip()
            if idx:
                args += ["--account-index", idx]
    # Additional entrypoints (whitespace-separated)
    for ep in env("EXTRA_ENTRYPOINTS").split():
        if ep:
            args += ["--entrypoint", ep]
    # Additional known validators (whitespace-separated)
    for kv in env("EXTRA_KNOWN_VALIDATORS").split():
        if kv:
            args += ["--known-validator", kv]
    # Cluster verification
    genesis_hash = env("EXPECTED_GENESIS_HASH")
    if genesis_hash:
        args += ["--expected-genesis-hash", genesis_hash]
    shred_version = env("EXPECTED_SHRED_VERSION")
    if shred_version:
        args += ["--expected-shred-version", shred_version]
    # Metrics — just needs to be in the environment, agave reads it directly
    # (env var is already set, nothing to pass as arg)
    # Gossip host / TVU address: GOSSIP_HOST takes precedence; otherwise
    # advertise PUBLIC_TVU_ADDRESS (needs the cherry-picked
    # --public-tvu-address flag, see the image Dockerfile).
    gossip_host = env("GOSSIP_HOST")
    if gossip_host:
        args += ["--gossip-host", gossip_host]
    elif env("PUBLIC_TVU_ADDRESS"):
        args += ["--public-tvu-address", env("PUBLIC_TVU_ADDRESS")]
    # Jito flags: each env var maps to one CLI flag; empty values are skipped.
    if env("JITO_ENABLE") == "true":
        log.info("Jito MEV enabled")
        jito_flags: list[tuple[str, str]] = [
            ("JITO_TIP_PAYMENT_PROGRAM", "--tip-payment-program-pubkey"),
            ("JITO_DISTRIBUTION_PROGRAM", "--tip-distribution-program-pubkey"),
            ("JITO_MERKLE_ROOT_AUTHORITY", "--merkle-root-upload-authority"),
            ("JITO_COMMISSION_BPS", "--commission-bps"),
            ("JITO_BLOCK_ENGINE_URL", "--block-engine-url"),
            ("JITO_SHRED_RECEIVER_ADDR", "--shred-receiver-address"),
        ]
        for env_name, flag in jito_flags:
            val = env(env_name)
            if val:
                args += [flag, val]
    return args
def build_rpc_args() -> list[str]:
    """Build agave-validator args for RPC (non-voting) mode.

    Extends the common args with full-RPC-API flags, and either advertises a
    public RPC address or locks the node down to private/known peers.
    """
    rpc_args = build_common_args()
    rpc_args.extend([
        "--no-voting",
        "--log", f"{LOG_DIR}/validator.log",
        "--full-rpc-api",
        "--enable-rpc-transaction-history",
        "--rpc-pubsub-enable-block-subscription",
        "--enable-extended-tx-metadata-storage",
        "--no-wait-for-vote-to-start-leader",
    ])
    # Public vs private RPC
    public_rpc = env("PUBLIC_RPC_ADDRESS")
    if public_rpc:
        rpc_args.extend(["--public-rpc-address", public_rpc])
    else:
        rpc_args.extend(
            ["--private-rpc", "--allow-private-addr", "--only-known-rpc"])
    # Jito relayer URL (RPC mode doesn't use it, but validator mode does —
    # handled in build_validator_args)
    return rpc_args
def build_validator_args() -> list[str]:
    """Build agave-validator args for voting validator mode.

    Requires the identity keypair (IDENTITY_FILE) and the vote-account
    keypair to exist as mounted files; logs an error and exits(1) otherwise.
    """
    vote_keypair = env("VOTE_ACCOUNT_KEYPAIR",
                       "/data/config/vote-account-keypair.json")
    # Identity must be mounted for validator mode
    if not os.path.isfile(IDENTITY_FILE):
        log.error("Validator identity keypair not found at %s", IDENTITY_FILE)
        log.error("Mount your validator keypair to %s", IDENTITY_FILE)
        sys.exit(1)
    # Vote account keypair must exist
    if not os.path.isfile(vote_keypair):
        log.error("Vote account keypair not found at %s", vote_keypair)
        log.error("Mount your vote account keypair or set VOTE_ACCOUNT_KEYPAIR")
        sys.exit(1)
    # Print vote account pubkey (best-effort: failures just skip the log line)
    result = subprocess.run(
        ["solana-keygen", "pubkey", vote_keypair],
        capture_output=True, text=True, check=False,
    )
    if result.returncode == 0:
        log.info("Vote account: %s", result.stdout.strip())
    args = build_common_args()
    args += [
        "--vote-account", vote_keypair,
        "--log", "-",  # "-" sends the validator log to stdout
    ]
    # Jito relayer URL (validator-only)
    relayer_url = env("JITO_RELAYER_URL")
    if env("JITO_ENABLE") == "true" and relayer_url:
        args += ["--relayer-url", relayer_url]
    return args
def append_extra_args(args: list[str]) -> list[str]:
    """Append whitespace-separated EXTRA_ARGS passthrough flags to *args*.

    Mutates *args* in place and returns the same list. An unset or empty
    EXTRA_ARGS appends nothing.
    """
    args.extend(os.environ.get("EXTRA_ARGS", "").split())
    return args
# -- Graceful shutdown --------------------------------------------------------
# Timeout for graceful exit via admin RPC. Leave 30s margin for k8s
# terminationGracePeriodSeconds (300s).
GRACEFUL_EXIT_TIMEOUT = 270


def graceful_exit(child: subprocess.Popen[bytes], reason: str = "SIGTERM") -> None:
    """Request graceful shutdown via the admin RPC Unix socket.

    Runs ``agave-validator exit --force --ledger /data/ledger`` which connects
    to the admin RPC socket at ``/data/ledger/admin.rpc`` and sets the
    validator's exit flag. The validator flushes all I/O and exits cleanly,
    avoiding the io_uring/ZFS deadlock.

    If the admin RPC exit fails or the child doesn't exit within the timeout,
    falls back to SIGTERM then SIGKILL.

    Args:
        child: the running agave-validator process.
        reason: label for log messages (e.g. "SIGTERM", "Leapfrog").
    """
    log.info("%s — requesting graceful exit via admin RPC", reason)
    # Step 1: ask the validator to exit via its admin RPC socket.
    try:
        result = subprocess.run(
            ["agave-validator", "exit", "--force", "--ledger", LEDGER_DIR],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode == 0:
            log.info("Admin RPC exit requested successfully")
        else:
            log.warning(
                "Admin RPC exit returned %d: %s",
                result.returncode, result.stderr.strip(),
            )
    except subprocess.TimeoutExpired:
        log.warning("Admin RPC exit command timed out after 30s")
    except FileNotFoundError:
        log.warning("agave-validator binary not found for exit command")
    # Wait for child to exit
    try:
        child.wait(timeout=GRACEFUL_EXIT_TIMEOUT)
        log.info("Validator exited cleanly with code %d", child.returncode)
        return
    except subprocess.TimeoutExpired:
        log.warning(
            "Validator did not exit within %ds — sending SIGTERM",
            GRACEFUL_EXIT_TIMEOUT,
        )
    # Fallback: SIGTERM
    child.terminate()
    try:
        child.wait(timeout=15)
        log.info("Validator exited after SIGTERM with code %d", child.returncode)
        return
    except subprocess.TimeoutExpired:
        log.warning("Validator did not exit after SIGTERM — sending SIGKILL")
    # Last resort: SIGKILL
    child.kill()
    child.wait()
    log.info("Validator killed with SIGKILL, code %d", child.returncode)
# -- Serve subcommand ---------------------------------------------------------
def _gap_monitor(
    child: subprocess.Popen[bytes],
    leapfrog: threading.Event,
    shutting_down: threading.Event,
) -> None:
    """Background thread: poll slot gap and trigger leapfrog if too far behind.

    Waits for a grace period (SNAPSHOT_MONITOR_GRACE, default 600s) before
    monitoring — the validator needs time to extract snapshots and catch up.
    Then polls every SNAPSHOT_MONITOR_INTERVAL (default 30s). If the gap
    exceeds SNAPSHOT_LEAPFROG_SLOTS (default 5000) for SNAPSHOT_LEAPFROG_CHECKS
    (default 3) consecutive checks, triggers graceful shutdown and sets the
    leapfrog event so cmd_serve loops back to download a fresh incremental.
    """
    threshold = int(env("SNAPSHOT_LEAPFROG_SLOTS", "5000"))
    required_checks = int(env("SNAPSHOT_LEAPFROG_CHECKS", "3"))
    interval = int(env("SNAPSHOT_MONITOR_INTERVAL", "30"))
    grace = int(env("SNAPSHOT_MONITOR_GRACE", "600"))
    rpc_port = env("RPC_PORT", "8899")
    local_url = f"http://127.0.0.1:{rpc_port}"
    # Grace period — don't monitor during initial catch-up.
    # Event.wait() returns True only when shutdown was signalled meanwhile.
    if shutting_down.wait(grace):
        return
    consecutive = 0
    while not shutting_down.is_set():
        local_slot = rpc_get_slot(local_url, timeout=5)
        mainnet_slot = rpc_get_slot(MAINNET_RPC, timeout=10)
        # If either endpoint is unreachable, skip this round without
        # touching the consecutive-failure counter.
        if local_slot is not None and mainnet_slot is not None:
            gap = mainnet_slot - local_slot
            if gap > threshold:
                consecutive += 1
                log.warning("Gap %d > %d (%d/%d consecutive)",
                            gap, threshold, consecutive, required_checks)
                if consecutive >= required_checks:
                    log.warning("Leapfrog triggered: gap %d", gap)
                    leapfrog.set()
                    graceful_exit(child, reason="Leapfrog")
                    return
            else:
                if consecutive > 0:
                    log.info("Gap %d within threshold, resetting counter", gap)
                consecutive = 0
        # Sleep interruptibly so SIGTERM shuts the monitor down promptly.
        shutting_down.wait(interval)
def cmd_serve() -> None:
    """Main serve flow: snapshot download, run validator, monitor gap, leapfrog.

    Python stays as PID 1. On each iteration:
      1. Download full + incremental snapshots (if needed)
      2. Start agave-validator as child process
      3. Monitor slot gap in background thread
      4. If gap exceeds threshold -> graceful stop -> loop back to step 1
      5. If SIGTERM -> graceful stop -> exit
      6. If validator crashes -> exit with its return code
    """
    mode = env("AGAVE_MODE", "test")
    log.info("AGAVE_MODE=%s", mode)
    if mode == "test":
        # Test mode still uses the legacy shell script; exec replaces this
        # process entirely (no return).
        os.execvp("start-test.sh", ["start-test.sh"])
    if mode not in ("rpc", "validator"):
        log.error("Unknown AGAVE_MODE: %s (valid: test, rpc, validator)", mode)
        sys.exit(1)
    # One-time setup
    dirs = [CONFIG_DIR, LEDGER_DIR, ACCOUNTS_DIR, SNAPSHOTS_DIR]
    if mode == "rpc":
        dirs.append(LOG_DIR)
    ensure_dirs(*dirs)
    # UDP port reachability preflight — fail fast before the (long) snapshot
    # download. Deferred import: the module sits next to this script.
    if not env_bool("SKIP_IP_ECHO_PREFLIGHT"):
        script_dir = Path(__file__).resolve().parent
        sys.path.insert(0, str(script_dir))
        from ip_echo_preflight import main as ip_echo_main
        if ip_echo_main() != 0:
            sys.exit(1)
    if mode == "rpc":
        ensure_identity_rpc()
    print_identity()
    if mode == "rpc":
        args = build_rpc_args()
    else:
        args = build_validator_args()
    args = append_extra_args(args)
    # Main loop: download → run → monitor → leapfrog if needed
    while True:
        maybe_download_snapshot(SNAPSHOTS_DIR)
        # Timestamp consumed by cmd_probe's grace-period check.
        Path("/tmp/entrypoint-start").write_text(str(time.time()))
        log.info("Starting agave-validator with %d arguments", len(args))
        child = subprocess.Popen(["agave-validator"] + args)
        shutting_down = threading.Event()
        leapfrog = threading.Event()
        # Forward SIGUSR1 from PID 1 to the validator child.
        signal.signal(signal.SIGUSR1,
                      lambda _sig, _frame: child.send_signal(signal.SIGUSR1))
        def _on_sigterm(_sig: int, _frame: object) -> None:
            # Run graceful_exit off the signal-handler thread so the handler
            # itself returns immediately.
            shutting_down.set()
            threading.Thread(
                target=graceful_exit, args=(child,), daemon=True,
            ).start()
        signal.signal(signal.SIGTERM, _on_sigterm)
        # Start gap monitor
        monitor = threading.Thread(
            target=_gap_monitor,
            args=(child, leapfrog, shutting_down),
            daemon=True,
        )
        monitor.start()
        child.wait()
        if leapfrog.is_set():
            log.info("Leapfrog: restarting with fresh incremental")
            continue
        sys.exit(child.returncode)
# -- Probe subcommand ---------------------------------------------------------
def cmd_probe() -> None:
    """Liveness probe: check local RPC slot vs mainnet.

    Exit 0 = healthy, exit 1 = unhealthy.

    Grace period: PROBE_GRACE_SECONDS (default 600) — probe always passes
    during grace period to allow for snapshot unpacking and initial replay.
    The period is measured from the timestamp cmd_serve writes to
    /tmp/entrypoint-start.
    """
    grace_seconds = int(env("PROBE_GRACE_SECONDS", "600"))
    max_lag = int(env("PROBE_MAX_SLOT_LAG", "20000"))
    # Check grace period
    start_file = Path("/tmp/entrypoint-start")
    if start_file.exists():
        try:
            start_time = float(start_file.read_text().strip())
            elapsed = time.time() - start_time
            if elapsed < grace_seconds:
                # Within grace period — always healthy
                sys.exit(0)
        except (ValueError, OSError):
            # Unreadable/corrupt start file — fall through to the live checks.
            pass
    else:
        # No start file — serve hasn't started yet, within grace
        sys.exit(0)
    # Query local RPC
    rpc_port = env("RPC_PORT", "8899")
    local_url = f"http://127.0.0.1:{rpc_port}"
    local_slot = rpc_get_slot(local_url, timeout=5)
    if local_slot is None:
        # Local RPC unreachable after grace period — unhealthy
        sys.exit(1)
    # Query mainnet
    mainnet_slot = rpc_get_slot(MAINNET_RPC, timeout=10)
    if mainnet_slot is None:
        # Can't reach mainnet to compare — assume healthy (don't penalize
        # the validator for mainnet RPC being down)
        sys.exit(0)
    lag = mainnet_slot - local_slot
    if lag > max_lag:
        sys.exit(1)
    sys.exit(0)
# -- Main ----------------------------------------------------------------------
def main() -> None:
    """CLI dispatch: configure logging, then run the requested subcommand.

    argv[1] selects "serve" (default) or "probe"; anything else logs an
    error and exits(1).
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        datefmt="%H:%M:%S",
    )
    subcommand = sys.argv[1] if len(sys.argv) > 1 else "serve"
    handler = {"serve": cmd_serve, "probe": cmd_probe}.get(subcommand)
    if handler is None:
        log.error("Unknown subcommand: %s (valid: serve, probe)", subcommand)
        sys.exit(1)
    handler()


if __name__ == "__main__":
    main()

View File

@ -1,249 +0,0 @@
#!/usr/bin/env python3
"""ip_echo preflight — verify UDP port reachability before starting the validator.
Implements the Solana ip_echo client protocol exactly:
1. Bind UDP sockets on the ports the validator will use
2. TCP connect to entrypoint gossip port, send IpEchoServerMessage
3. Parse IpEchoServerResponse (our IP as seen by entrypoint)
4. Wait for entrypoint's UDP probes on each port
5. Exit 0 if all ports reachable, exit 1 if any fail
Wire format (from agave net-utils/src/):
Request: 4 null bytes + [u16; 4] tcp_ports LE + [u16; 4] udp_ports LE + \n
Response: 4 null bytes + bincode IpAddr (variant byte + addr) + optional shred_version
Called from entrypoint.py before snapshot download. Prevents wasting hours
downloading a snapshot only to crash-loop on port reachability.
"""
from __future__ import annotations
import logging
import os
import socket
import struct
import sys
import threading
import time
log = logging.getLogger("ip_echo_preflight")

# ip_echo wire-format constants (see the module docstring).
HEADER = b"\x00\x00\x00\x00"  # 4-byte request/response header
TERMINUS = b"\x0a"            # request terminator ('\n')
# Max response size: 4 header + 4 variant + 16 IPv6 + 1 option tag + 2 u16.
RESPONSE_BUF = 27
IO_TIMEOUT = 5.0              # TCP connect/read timeout (seconds)
PROBE_TIMEOUT = 10.0          # how long to wait for UDP probes (seconds)
MAX_RETRIES = 3
RETRY_DELAY = 2.0             # seconds between retries
def build_request(tcp_ports: list[int], udp_ports: list[int]) -> bytes:
    """Serialize an IpEchoServerMessage.

    Layout: 4-byte null header, four little-endian u16 TCP ports, four
    little-endian u16 UDP ports, then a trailing newline. Each port list is
    zero-padded (or truncated) to exactly four entries.
    """
    def as_four(ports: list[int]) -> list[int]:
        # Pad with zeros, then keep only the first four slots.
        return (ports + [0, 0, 0, 0])[:4]

    packed = struct.pack("<4H", *as_four(tcp_ports)) + struct.pack("<4H", *as_four(udp_ports))
    return HEADER + packed + TERMINUS
def parse_response(data: bytes) -> tuple[str, int | None]:
    """Decode an IpEchoServerResponse into (ip_string, shred_version or None).

    Bincode layout:
      bytes 0-3   null header (\\0\\0\\0\\0)
      bytes 4-7   IpAddr enum variant, u32 LE (0 = IPv4, 1 = IPv6)
      next 4|16   raw address octets
      1 byte      Option tag (0 = None, 1 = Some)
      2 bytes     shred_version, u16 LE (present only when Some)

    Raises ValueError on truncated or malformed input.
    """
    if len(data) < 8:
        raise ValueError(f"response too short: {len(data)} bytes")
    if data[:4] == b"HTTP":
        raise ValueError("got HTTP response — not an ip_echo server")
    if data[:4] != HEADER:
        raise ValueError(f"unexpected header: {data[:4].hex()}")
    (variant,) = struct.unpack("<I", data[4:8])
    if variant == 0:
        # IPv4: four address octets follow the variant tag.
        if len(data) < 12:
            raise ValueError(f"IPv4 response truncated: {len(data)} bytes")
        ip_text = socket.inet_ntoa(data[8:12])
        trailer = data[12:]
    elif variant == 1:
        # IPv6: sixteen address octets follow the variant tag.
        if len(data) < 24:
            raise ValueError(f"IPv6 response truncated: {len(data)} bytes")
        ip_text = socket.inet_ntop(socket.AF_INET6, data[8:24])
        trailer = data[24:]
    else:
        raise ValueError(f"unknown IpAddr variant: {variant}")
    shred: int | None = None
    # Option tag byte 1 means Some(u16 LE shred_version); anything else → None.
    if len(trailer) >= 3 and trailer[0] == 1:
        (shred,) = struct.unpack("<H", trailer[1:3])
    return ip_text, shred
def _listen_udp(port: int, results: dict, stop: threading.Event) -> None:
    """Bind a UDP socket on `port` and record the first probe packet received.

    Writes into `results` keyed by port: ("ok", sender_addr) on success, or
    ("bind_error", message) on any OSError. Polls with a short timeout so the
    `stop` event is honored promptly. Intended to run as a daemon thread.
    """
    try:
        listener = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        listener.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        listener.bind(("0.0.0.0", port))
        listener.settimeout(0.5)
        with listener:
            while not stop.is_set():
                try:
                    _payload, sender = listener.recvfrom(64)
                except socket.timeout:
                    continue
                results[port] = ("ok", sender)
                return
    except OSError as exc:
        results[port] = ("bind_error", str(exc))
def ip_echo_check(
    entrypoint_host: str,
    entrypoint_port: int,
    udp_ports: list[int],
) -> tuple[str, dict[int, bool]]:
    """Run one ip_echo exchange and return (seen_ip, {port: reachable}).

    Sequence: start local UDP listeners, send the IpEchoServerMessage over
    TCP to the entrypoint, parse the IP it sees us as, then wait up to
    PROBE_TIMEOUT for the entrypoint's UDP probes to land on each port.

    Raises on TCP failure (caller retries).
    """
    # ip_echo messages carry at most 4 ports; zero is the "unused" filler.
    udp_ports = [p for p in udp_ports if p != 0][:4]
    # Start UDP listeners before sending the TCP request, so the entrypoint's
    # probes can't arrive before we are ready to receive them.
    results: dict[int, tuple] = {}
    stop = threading.Event()
    threads = []
    for port in udp_ports:
        t = threading.Thread(target=_listen_udp, args=(port, results, stop), daemon=True)
        t.start()
        threads.append(t)
    time.sleep(0.1)  # let listeners bind
    # TCP: send request, read response
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.settimeout(IO_TIMEOUT)
    try:
        sock.connect((entrypoint_host, entrypoint_port))
        sock.sendall(build_request([], udp_ports))
        resp = sock.recv(RESPONSE_BUF)
    finally:
        sock.close()
    seen_ip, shred_version = parse_response(resp)
    log.info(
        "entrypoint %s:%d sees us as %s (shred_version=%s)",
        entrypoint_host, entrypoint_port, seen_ip, shred_version,
    )
    # Wait for UDP probes — poll until every port has a result or we time out.
    deadline = time.monotonic() + PROBE_TIMEOUT
    while time.monotonic() < deadline:
        if all(p in results for p in udp_ports):
            break
        time.sleep(0.2)
    stop.set()
    for t in threads:
        t.join(timeout=1)
    # Fold listener results into a simple per-port boolean map, logging detail.
    port_ok: dict[int, bool] = {}
    for port in udp_ports:
        if port not in results:
            log.error("port %d: no probe received within %.0fs", port, PROBE_TIMEOUT)
            port_ok[port] = False
        else:
            status, detail = results[port]
            if status == "ok":
                log.info("port %d: probe received from %s", port, detail)
                port_ok[port] = True
            else:
                # bind_error from _listen_udp — detail is the OSError text.
                log.error("port %d: %s: %s", port, status, detail)
                port_ok[port] = False
    return seen_ip, port_ok
def run_preflight(
    entrypoint_host: str,
    entrypoint_port: int,
    udp_ports: list[int],
    expected_ip: str = "",
) -> bool:
    """Run the ip_echo check with retries. Returns True if all ports pass.

    Args:
        entrypoint_host: Hostname/IP of the entrypoint's ip_echo service.
        entrypoint_port: TCP port of the ip_echo service.
        udp_ports: UDP ports that must each receive a probe.
        expected_ip: If non-empty, the public IP we expect the entrypoint to
            see (GOSSIP_HOST); a mismatch is logged and retried.

    Returns:
        True when every UDP port received a probe on some attempt; False
        after MAX_RETRIES attempts without full success.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        # FIX: format string was "%d/%d%s:%d" — retries and host ran together.
        log.info("ip_echo attempt %d/%d to %s:%d, ports %s",
                 attempt, MAX_RETRIES, entrypoint_host, entrypoint_port, udp_ports)
        try:
            seen_ip, port_ok = ip_echo_check(entrypoint_host, entrypoint_port, udp_ports)
        except Exception as exc:
            log.error("attempt %d TCP failed: %s", attempt, exc)
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)
            # FIX: previously the final failed attempt fell through and read
            # seen_ip/port_ok before assignment (NameError). Always restart
            # the loop; after the last attempt it exits to the FAIL path.
            continue
        if expected_ip and seen_ip != expected_ip:
            log.error(
                "IP MISMATCH: entrypoint sees %s, expected %s (GOSSIP_HOST). "
                "Outbound mangle/SNAT path is broken.",
                seen_ip, expected_ip,
            )
            if attempt < MAX_RETRIES:
                time.sleep(RETRY_DELAY)
                continue
            # Final attempt: mismatch is logged, but port results below still
            # decide the outcome (original behavior preserved).
        reachable = [p for p, ok in port_ok.items() if ok]
        unreachable = [p for p, ok in port_ok.items() if not ok]
        if not unreachable:
            log.info("PASS: all ports reachable %s, seen as %s", reachable, seen_ip)
            return True
        log.error(
            "attempt %d: unreachable %s, reachable %s, seen as %s",
            attempt, unreachable, reachable, seen_ip,
        )
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)
    log.error("FAIL: ip_echo preflight exhausted %d attempts", MAX_RETRIES)
    return False
def main() -> int:
    """Preflight entry point. Returns process exit code (0 pass, 1 fail)."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        datefmt="%H:%M:%S",
    )
    # Entrypoint comes from VALIDATOR_ENTRYPOINT ("host:port"), or argv[1].
    target = os.environ.get("VALIDATOR_ENTRYPOINT", "")
    if not target and len(sys.argv) > 1:
        target = sys.argv[1]
    if not target:
        log.error("set VALIDATOR_ENTRYPOINT or pass host:port as argument")
        return 1
    # Split on the LAST colon; a bare hostname gets the default gossip port.
    host, sep, port_text = target.rpartition(":")
    if sep:
        ep_port = int(port_text)
    else:
        host = target
        ep_port = 8001
    gossip_port = int(os.environ.get("GOSSIP_PORT", "8001"))
    dynamic_range = os.environ.get("DYNAMIC_PORT_RANGE", "9000-10000")
    range_start = int(dynamic_range.split("-")[0])
    expected_ip = os.environ.get("GOSSIP_HOST", "")
    # Probe gossip plus three ports drawn from the dynamic range (an ip_echo
    # message carries at most 4 UDP ports).
    udp_ports = [gossip_port, range_start, range_start + 2, range_start + 3]
    ok = run_preflight(host, ep_port, udp_ports, expected_ip)
    return 0 if ok else 1
if __name__ == "__main__":
    sys.exit(main())

View File

@ -1,878 +0,0 @@
#!/usr/bin/env python3
"""Download Solana snapshots using aria2c for parallel multi-connection downloads.
Discovers snapshot sources by querying getClusterNodes for all RPCs in the
cluster, probing each for available snapshots, benchmarking download speed,
and downloading from the fastest source using aria2c (16 connections by default).
Based on the discovery approach from etcusr/solana-snapshot-finder but replaces
the single-connection wget download with aria2c parallel chunked downloads.
Usage:
# Download to /srv/kind/solana/snapshots (mainnet, 16 connections)
./snapshot_download.py -o /srv/kind/solana/snapshots
# Dry run — find best source, print URL
./snapshot_download.py --dry-run
# Custom RPC for cluster discovery + 32 connections
./snapshot_download.py -r https://api.mainnet-beta.solana.com -n 32
# Testnet
./snapshot_download.py -c testnet -o /data/snapshots
# Programmatic use from entrypoint.py:
from snapshot_download import download_best_snapshot
ok = download_best_snapshot("/data/snapshots")
Requirements:
- aria2c (apt install aria2)
- python3 >= 3.10 (stdlib only, no pip dependencies)
"""
from __future__ import annotations
import argparse
import concurrent.futures
import json
import logging
import os
import re
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request
from dataclasses import dataclass, field
from http.client import HTTPResponse
from pathlib import Path
from urllib.request import Request
log: logging.Logger = logging.getLogger("snapshot-download")
CLUSTER_RPC: dict[str, str] = {
"mainnet-beta": "https://api.mainnet-beta.solana.com",
"testnet": "https://api.testnet.solana.com",
"devnet": "https://api.devnet.solana.com",
}
# Snapshot filenames:
# snapshot-<slot>-<hash>.tar.zst
# incremental-snapshot-<base_slot>-<slot>-<hash>.tar.zst
FULL_SNAP_RE: re.Pattern[str] = re.compile(
r"^snapshot-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
INCR_SNAP_RE: re.Pattern[str] = re.compile(
r"^incremental-snapshot-(\d+)-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
@dataclass
class SnapshotSource:
    """A snapshot file available from a specific RPC node."""
    # Address of the RPC node serving the snapshot (as reported by
    # getClusterNodes and used to build http://{rpc_address}/... URLs).
    rpc_address: str
    # Full redirect paths as returned by the server (e.g. /snapshot-123-hash.tar.zst)
    file_paths: list[str] = field(default_factory=list)
    # Snapshot age: current cluster slot minus the full snapshot's slot.
    slots_diff: int = 0
    # HEAD-probe round-trip latency in milliseconds.
    latency_ms: float = 0.0
    # Measured throughput; 0.0 until benchmarked by measure_speed().
    download_speed: float = 0.0  # bytes/sec
# -- JSON-RPC helpers ----------------------------------------------------------
class _NoRedirectHandler(urllib.request.HTTPRedirectHandler):
    """Handler that captures redirect Location instead of following it."""
    def redirect_request(
        self,
        req: Request,
        fp: HTTPResponse,
        code: int,
        msg: str,
        headers: dict[str, str],  # type: ignore[override]
        newurl: str,
    ) -> None:
        # Returning None tells urllib NOT to follow the redirect; the 3xx
        # then surfaces to the caller as an HTTPError carrying the headers
        # (including Location), which head_no_follow() inspects.
        return None
def rpc_post(url: str, method: str, params: list[object] | None = None,
             timeout: int = 25) -> object | None:
    """Issue a JSON-RPC 2.0 POST; return the parsed 'result' field or None on error."""
    body = json.dumps({
        "jsonrpc": "2.0", "id": 1,
        "method": method, "params": params or [],
    }).encode()
    request = Request(url, data=body,
                      headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(request, timeout=timeout) as resp:
            parsed: dict[str, object] = json.loads(resp.read())
    except (urllib.error.URLError, json.JSONDecodeError, OSError, TimeoutError) as e:
        # Transport or decode failure — callers treat None as "unavailable".
        log.debug("rpc_post %s %s failed: %s", url, method, e)
        return None
    return parsed.get("result")
def head_no_follow(url: str, timeout: float = 3) -> tuple[str | None, float]:
    """HEAD request without following redirects.

    Returns (Location header value, latency_sec) when the server answered
    with a 3xx redirect. Returns (None, 0.0) on any error or non-redirect
    response.
    """
    opener: urllib.request.OpenerDirector = urllib.request.build_opener(_NoRedirectHandler)
    request = Request(url, method="HEAD")
    start: float = time.monotonic()
    try:
        response: HTTPResponse = opener.open(request, timeout=timeout)  # type: ignore[assignment]
        latency: float = time.monotonic() - start
        # 2xx — the server did not redirect; Location is normally absent.
        location: str | None = response.headers.get("Location")
        response.close()
        return location, latency
    except urllib.error.HTTPError as e:
        # _NoRedirectHandler refuses to follow, so 3xx surfaces here with
        # the redirect headers attached.
        latency = time.monotonic() - start
        location = e.headers.get("Location")
        if location and 300 <= e.code < 400:
            return location, latency
        return None, 0.0
    except (urllib.error.URLError, OSError, TimeoutError):
        return None, 0.0
# -- Discovery -----------------------------------------------------------------
def get_current_slot(rpc_url: str) -> int | None:
    """Return the cluster's current slot via getSlot, or None on failure."""
    slot = rpc_post(rpc_url, "getSlot")
    return slot if isinstance(slot, int) else None
def get_cluster_rpc_nodes(rpc_url: str, version_filter: str | None = None) -> list[str]:
    """Collect unique RPC addresses advertised by getClusterNodes.

    Nodes that report no version always pass the filter; only nodes whose
    version exists and does not start with `version_filter` are dropped.
    Order of the returned list is unspecified (set-based dedup).
    """
    nodes = rpc_post(rpc_url, "getClusterNodes")
    if not isinstance(nodes, list):
        return []
    unique_addrs: set[str] = set()
    for node in nodes:
        if not isinstance(node, dict):
            continue
        if version_filter is not None:
            reported: str | None = node.get("version")
            if reported and not reported.startswith(version_filter):
                continue
        addr: str | None = node.get("rpc")
        if addr:
            unique_addrs.add(addr)
    return list(unique_addrs)
def _parse_snapshot_filename(location: str) -> tuple[str, str | None]:
"""Extract filename and full redirect path from Location header.
Returns (filename, full_path). full_path includes any path prefix
the server returned (e.g. '/snapshots/snapshot-123-hash.tar.zst').
"""
# Location may be absolute URL or relative path
if location.startswith("http://") or location.startswith("https://"):
# Absolute URL — extract path
from urllib.parse import urlparse
path: str = urlparse(location).path
else:
path = location
filename: str = path.rsplit("/", 1)[-1]
return filename, path
def probe_rpc_snapshot(
    rpc_address: str,
    current_slot: int,
) -> SnapshotSource | None:
    """Probe a single RPC node for its advertised snapshots.

    Pure discovery — no filtering here. Filtering happens after all probes
    complete so rejected sources remain visible for debugging. Returns None
    when the node is unreachable or serves no parseable full snapshot.
    """
    # A full snapshot is mandatory; without one the node is useless to us.
    full_location, full_latency = head_no_follow(
        f"http://{rpc_address}/snapshot.tar.bz2", timeout=2)
    if not full_location:
        return None
    full_filename, full_path = _parse_snapshot_filename(full_location)
    full_match = FULL_SNAP_RE.match(full_filename)
    if full_match is None:
        return None
    base_slot = int(full_match.group(1))
    paths = [full_path]
    # Opportunistically record an incremental, but only one whose base slot
    # matches this node's own full snapshot.
    inc_location, _ = head_no_follow(
        f"http://{rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
    if inc_location:
        inc_filename, inc_path = _parse_snapshot_filename(inc_location)
        inc_match = INCR_SNAP_RE.match(inc_filename)
        if inc_match and int(inc_match.group(1)) == base_slot:
            paths.append(inc_path)
    return SnapshotSource(
        rpc_address=rpc_address,
        file_paths=paths,
        slots_diff=current_slot - base_slot,
        latency_ms=full_latency * 1000,
    )
def discover_sources(
    rpc_url: str,
    current_slot: int,
    max_age_slots: int,
    max_latency_ms: float,
    threads: int,
    version_filter: str | None,
) -> list[SnapshotSource]:
    """Discover all snapshot sources, then filter.

    Probing and filtering are separate: all reachable sources are collected
    first so we can report what exists even if filters reject everything.

    Args:
        rpc_url: RPC endpoint used for getClusterNodes discovery.
        current_slot: Cluster head slot (used by probes to compute age).
        max_age_slots: Reject sources whose full snapshot is older than this.
        max_latency_ms: Reject sources whose HEAD probe exceeded this latency.
        threads: Thread-pool size for parallel probing.
        version_filter: Optional node-version prefix passed to discovery.

    Returns:
        Sources that passed both the age and latency filters (possibly empty).
    """
    rpc_nodes: list[str] = get_cluster_rpc_nodes(rpc_url, version_filter)
    if not rpc_nodes:
        log.error("No RPC nodes found via getClusterNodes")
        return []
    log.info("Found %d RPC nodes, probing for snapshots...", len(rpc_nodes))
    all_sources: list[SnapshotSource] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as pool:
        futures: dict[concurrent.futures.Future[SnapshotSource | None], str] = {
            pool.submit(probe_rpc_snapshot, addr, current_slot): addr
            for addr in rpc_nodes
        }
        done: int = 0
        for future in concurrent.futures.as_completed(futures):
            done += 1
            # Progress line every 200 probes — node counts can be in the thousands.
            if done % 200 == 0:
                log.info("  probed %d/%d nodes, %d reachable",
                         done, len(rpc_nodes), len(all_sources))
            try:
                result: SnapshotSource | None = future.result()
            except (urllib.error.URLError, OSError, TimeoutError) as e:
                log.debug("Probe failed for %s: %s", futures[future], e)
                continue
            if result:
                all_sources.append(result)
    log.info("Discovered %d reachable sources", len(all_sources))
    # Apply filters
    filtered: list[SnapshotSource] = []
    rejected_age: int = 0
    rejected_latency: int = 0
    for src in all_sources:
        # slots_diff < -100 means the source claims a snapshot well AHEAD of
        # our reference slot — treated as bogus and rejected as an age failure.
        if src.slots_diff > max_age_slots or src.slots_diff < -100:
            rejected_age += 1
            continue
        if src.latency_ms > max_latency_ms:
            rejected_latency += 1
            continue
        filtered.append(src)
    if rejected_age or rejected_latency:
        log.info("Filtered: %d rejected by age (>%d slots), %d by latency (>%.0fms)",
                 rejected_age, max_age_slots, rejected_latency, max_latency_ms)
    if not filtered and all_sources:
        # Show what was available so the user can adjust filters
        all_sources.sort(key=lambda s: s.slots_diff)
        best = all_sources[0]
        log.warning("All %d sources rejected by filters. Best available: "
                    "%s (age=%d slots, latency=%.0fms). "
                    "Try --max-snapshot-age %d --max-latency %.0f",
                    len(all_sources), best.rpc_address,
                    best.slots_diff, best.latency_ms,
                    best.slots_diff + 500,
                    max(best.latency_ms * 1.5, 500))
    log.info("Found %d sources after filtering", len(filtered))
    return filtered
# -- Speed benchmark -----------------------------------------------------------
def measure_speed(rpc_address: str, measure_time: int = 7) -> float:
    """Sample download throughput from an RPC node.

    Streams the full-snapshot endpoint for up to `measure_time` seconds and
    returns the observed rate in bytes/sec (0.0 on any error).
    """
    request = Request(f"http://{rpc_address}/snapshot.tar.bz2")
    try:
        with urllib.request.urlopen(request, timeout=measure_time + 5) as resp:
            started = time.monotonic()
            received = 0
            while time.monotonic() - started < measure_time:
                chunk = resp.read(81920)
                if not chunk:
                    break  # stream ended before the measurement window elapsed
                received += len(chunk)
            window = time.monotonic() - started
    except (urllib.error.URLError, OSError, TimeoutError):
        return 0.0
    if window <= 0:
        return 0.0
    return received / window
# -- Incremental probing -------------------------------------------------------
def probe_incremental(
    fast_sources: list[SnapshotSource],
    full_snap_slot: int,
) -> tuple[str | None, list[str]]:
    """Probe fast sources for the best incremental matching full_snap_slot.

    Returns (filename, mirror_urls) or (None, []) if no match found.
    The "best" incremental is the one with the highest slot (closest to head).
    Each source is probed live via HEAD, so results reflect what is served
    right now rather than stale discovery data.
    """
    best_filename: str | None = None
    best_slot: int = 0
    best_source: SnapshotSource | None = None
    best_path: str | None = None
    # Pass 1: find the freshest incremental whose base slot matches ours.
    for source in fast_sources:
        inc_url: str = f"http://{source.rpc_address}/incremental-snapshot.tar.bz2"
        inc_location, _ = head_no_follow(inc_url, timeout=2)
        if not inc_location:
            continue
        inc_fn, inc_fp = _parse_snapshot_filename(inc_location)
        m: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
        if not m:
            continue
        # An incremental is only usable if it is based on OUR full snapshot.
        if int(m.group(1)) != full_snap_slot:
            log.debug("  %s: incremental base slot %s != full %d, skipping",
                      source.rpc_address, m.group(1), full_snap_slot)
            continue
        inc_slot: int = int(m.group(2))
        if inc_slot > best_slot:
            best_slot = inc_slot
            best_filename = inc_fn
            best_source = source
            best_path = inc_fp
    if best_filename is None or best_source is None or best_path is None:
        return None, []
    # Build mirror list — check other sources for the same filename
    mirror_urls: list[str] = [f"http://{best_source.rpc_address}{best_path}"]
    # Pass 2: any other source currently serving the identical file becomes
    # an aria2c mirror for chunked parallel download.
    for other in fast_sources:
        if other.rpc_address == best_source.rpc_address:
            continue
        other_loc, _ = head_no_follow(
            f"http://{other.rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
        if other_loc:
            other_fn, other_fp = _parse_snapshot_filename(other_loc)
            if other_fn == best_filename:
                mirror_urls.append(f"http://{other.rpc_address}{other_fp}")
    return best_filename, mirror_urls
# -- Download ------------------------------------------------------------------
def download_aria2c(
    urls: list[str],
    output_dir: str,
    filename: str,
    connections: int = 16,
) -> bool:
    """Download a file using aria2c with parallel connections.

    When multiple URLs are provided, aria2c treats them as mirrors of the
    same file and distributes chunks across all of them.

    Args:
        urls: One or more mirror URLs for the SAME file.
        output_dir: Directory the file is written into.
        filename: Output filename within output_dir.
        connections: Max connections per server.

    Returns:
        True when aria2c exits 0 and the expected output file exists.
    """
    num_mirrors: int = len(urls)
    total_splits: int = max(connections, connections * num_mirrors)
    cmd: list[str] = [
        "aria2c",
        "--file-allocation=none",
        "--continue=false",
        f"--max-connection-per-server={connections}",
        f"--split={total_splits}",
        "--min-split-size=50M",
        # aria2c retries individual chunk connections on transient network
        # errors (TCP reset, timeout). This is transport-level retry analogous
        # to TCP retransmit, not application-level retry of a failed operation.
        "--max-tries=5",
        "--retry-wait=5",
        "--timeout=60",
        "--connect-timeout=10",
        "--summary-interval=10",
        "--console-log-level=notice",
        f"--dir={output_dir}",
        # BUG FIX: was the literal string "--out=(unknown)" — the download
        # must be written under `filename`, otherwise the existence check
        # below always fails and the function reports failure.
        f"--out={filename}",
        "--auto-file-renaming=false",
        "--allow-overwrite=true",
        *urls,
    ]
    log.info("Downloading %s", filename)
    log.info("  aria2c: %d connections x %d mirrors (%d splits)",
             connections, num_mirrors, total_splits)
    start: float = time.monotonic()
    result: subprocess.CompletedProcess[bytes] = subprocess.run(cmd)
    elapsed: float = time.monotonic() - start
    if result.returncode != 0:
        log.error("aria2c failed with exit code %d", result.returncode)
        return False
    filepath: Path = Path(output_dir) / filename
    if not filepath.exists():
        log.error("aria2c reported success but %s does not exist", filepath)
        return False
    size_bytes: int = filepath.stat().st_size
    size_gb: float = size_bytes / (1024 ** 3)
    avg_mb: float = size_bytes / elapsed / (1024 ** 2) if elapsed > 0 else 0
    log.info("  Done: %.1f GB in %.0fs (%.1f MiB/s avg)", size_gb, elapsed, avg_mb)
    return True
# -- Shared helpers ------------------------------------------------------------
def _discover_and_benchmark(
    rpc_url: str,
    current_slot: int,
    *,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    threads: int = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
) -> list[SnapshotSource]:
    """Discover snapshot sources and benchmark download speed.

    Returns sources that meet the minimum speed requirement. NOTE: despite
    benchmarking, the returned list keeps the latency sort order from
    discovery (lowest-latency first) — it is NOT re-sorted by measured
    speed. Only the first `max_speed_checks` lowest-latency sources are
    ever benchmarked; the rest are discarded.
    """
    sources: list[SnapshotSource] = discover_sources(
        rpc_url, current_slot,
        max_age_slots=max_snapshot_age,
        max_latency_ms=max_latency,
        threads=threads,
        version_filter=version_filter,
    )
    if not sources:
        return []
    # Benchmark the most promising (lowest-latency) candidates first.
    sources.sort(key=lambda s: s.latency_ms)
    log.info("Benchmarking download speed on top %d sources...", max_speed_checks)
    fast_sources: list[SnapshotSource] = []
    checked: int = 0
    min_speed_bytes: int = min_download_speed * 1024 * 1024
    for source in sources:
        if checked >= max_speed_checks:
            break
        checked += 1
        speed: float = measure_speed(source.rpc_address, measurement_time)
        # Record the measurement on the source for later reporting/selection.
        source.download_speed = speed
        speed_mib: float = speed / (1024 ** 2)
        if speed < min_speed_bytes:
            log.info("  %s: %.1f MiB/s (too slow, need >=%d MiB/s)",
                     source.rpc_address, speed_mib, min_download_speed)
            continue
        log.info("  %s: %.1f MiB/s (latency: %.0fms, age: %d slots)",
                 source.rpc_address, speed_mib,
                 source.latency_ms, source.slots_diff)
        fast_sources.append(source)
    return fast_sources
def _rolling_incremental_download(
    fast_sources: list[SnapshotSource],
    full_snap_slot: int,
    output_dir: str,
    convergence_slots: int,
    connections: int,
    rpc_url: str,
) -> str | None:
    """Download incrementals in a loop until converged.

    Probes fast_sources for incrementals matching full_snap_slot, downloads
    the freshest one, then re-probes until the gap to head is within
    convergence_slots. Returns the filename of the final incremental,
    or None if no incremental was found.

    Args:
        fast_sources: Speed-qualified sources to probe for incrementals.
        full_snap_slot: Base slot the incremental must be built on.
        output_dir: Directory downloads are written into.
        convergence_slots: Stop once (head slot - incremental slot) <= this.
        connections: aria2c connections per server.
        rpc_url: RPC endpoint used to read the current head slot.
    """
    prev_inc_filename: str | None = None
    loop_start: float = time.monotonic()
    max_convergence_time: float = 1800.0  # 30 min wall-clock limit
    while True:
        # Hard wall-clock cap so a fast-moving head can't trap us forever.
        if time.monotonic() - loop_start > max_convergence_time:
            if prev_inc_filename:
                log.warning("Convergence timeout (%.0fs) — using %s",
                            max_convergence_time, prev_inc_filename)
            else:
                log.warning("Convergence timeout (%.0fs) — no incremental downloaded",
                            max_convergence_time)
            break
        inc_fn, inc_mirrors = probe_incremental(fast_sources, full_snap_slot)
        if inc_fn is None:
            if prev_inc_filename is None:
                log.error("No matching incremental found for base slot %d",
                          full_snap_slot)
            else:
                log.info("No newer incremental available, using %s", prev_inc_filename)
            break
        # probe_incremental only returns filenames already matched by
        # INCR_SNAP_RE, so this re-match cannot fail.
        m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
        assert m_inc is not None
        inc_slot: int = int(m_inc.group(2))
        head_slot: int | None = get_current_slot(rpc_url)
        if head_slot is None:
            log.warning("Cannot get current slot — downloading best available incremental")
            # Force "not converged" so the download below still happens; the
            # head_slot-is-None check after it then exits the loop.
            gap: int = convergence_slots + 1
        else:
            gap = head_slot - inc_slot
        if inc_fn == prev_inc_filename:
            # Nothing newer appeared since the last download.
            if gap <= convergence_slots:
                log.info("Incremental %s already downloaded (gap %d slots, converged)",
                         inc_fn, gap)
                break
            log.info("No newer incremental yet (slot %d, gap %d slots), waiting...",
                     inc_slot, gap)
            time.sleep(10)
            continue
        if prev_inc_filename is not None:
            # A fresher incremental supersedes the previous download on disk.
            old_path: Path = Path(output_dir) / prev_inc_filename
            if old_path.exists():
                log.info("Removing superseded incremental %s", prev_inc_filename)
                old_path.unlink()
        log.info("Downloading incremental %s (%d mirrors, slot %d, gap %d slots)",
                 inc_fn, len(inc_mirrors), inc_slot, gap)
        if not download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
            log.warning("Failed to download incremental %s — re-probing in 10s", inc_fn)
            time.sleep(10)
            continue
        prev_inc_filename = inc_fn
        if gap <= convergence_slots:
            log.info("Converged: incremental slot %d is %d slots behind head",
                     inc_slot, gap)
            break
        if head_slot is None:
            # Head unknown: keep the one we just downloaded and stop.
            break
        log.info("Not converged (gap %d > %d), re-probing in 10s...",
                 gap, convergence_slots)
        time.sleep(10)
    return prev_inc_filename
# -- Public API ----------------------------------------------------------------
def download_incremental_for_slot(
    output_dir: str,
    full_snap_slot: int,
    *,
    cluster: str = "mainnet-beta",
    rpc_url: str | None = None,
    connections: int = 16,
    threads: int = 500,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
    convergence_slots: int = 500,
) -> bool:
    """Download an incremental snapshot for an existing full snapshot.

    Discovers sources, benchmarks speed, then runs the rolling incremental
    download loop for the given full snapshot base slot. Does NOT download
    a full snapshot.

    Returns True if an incremental was downloaded, False otherwise.
    """
    # An explicit rpc_url wins; otherwise use the cluster's public RPC.
    resolved_rpc: str = rpc_url or CLUSTER_RPC[cluster]
    # Fail fast before any network work if the download tool is missing.
    if not shutil.which("aria2c"):
        log.error("aria2c not found. Install with: apt install aria2")
        return False
    log.info("Incremental download for base slot %d", full_snap_slot)
    current_slot: int | None = get_current_slot(resolved_rpc)
    if current_slot is None:
        log.error("Cannot get current slot from %s", resolved_rpc)
        return False
    fast_sources: list[SnapshotSource] = _discover_and_benchmark(
        resolved_rpc, current_slot,
        max_snapshot_age=max_snapshot_age,
        max_latency=max_latency,
        threads=threads,
        min_download_speed=min_download_speed,
        measurement_time=measurement_time,
        max_speed_checks=max_speed_checks,
        version_filter=version_filter,
    )
    if not fast_sources:
        log.error("No fast sources found")
        return False
    os.makedirs(output_dir, exist_ok=True)
    result: str | None = _rolling_incremental_download(
        fast_sources, full_snap_slot, output_dir,
        convergence_slots, connections, resolved_rpc,
    )
    return result is not None
def download_best_snapshot(
    output_dir: str,
    *,
    cluster: str = "mainnet-beta",
    rpc_url: str | None = None,
    connections: int = 16,
    threads: int = 500,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
    full_only: bool = False,
    convergence_slots: int = 500,
) -> bool:
    """Download the best available snapshot to output_dir.

    This is the programmatic API called by entrypoint.py for automatic
    snapshot download. Returns True on success, False on failure.
    All parameters have sensible defaults matching the CLI interface.
    """
    resolved_rpc: str = rpc_url or CLUSTER_RPC[cluster]
    if not shutil.which("aria2c"):
        log.error("aria2c not found. Install with: apt install aria2")
        return False
    log.info("Cluster: %s | RPC: %s", cluster, resolved_rpc)
    current_slot: int | None = get_current_slot(resolved_rpc)
    if current_slot is None:
        log.error("Cannot get current slot from %s", resolved_rpc)
        return False
    log.info("Current slot: %d", current_slot)
    fast_sources: list[SnapshotSource] = _discover_and_benchmark(
        resolved_rpc, current_slot,
        max_snapshot_age=max_snapshot_age,
        max_latency=max_latency,
        threads=threads,
        min_download_speed=min_download_speed,
        measurement_time=measurement_time,
        max_speed_checks=max_speed_checks,
        version_filter=version_filter,
    )
    if not fast_sources:
        log.error("No fast sources found")
        return False
    # Use the fastest source as primary, build full snapshot download plan
    # NOTE(review): fast_sources keeps discovery's latency order, so [0] is
    # the lowest-latency speed-qualified source, not necessarily the one with
    # the highest measured throughput — confirm whether that is intended.
    best: SnapshotSource = fast_sources[0]
    full_paths: list[str] = [fp for fp in best.file_paths
                             if fp.rsplit("/", 1)[-1].startswith("snapshot-")]
    if not full_paths:
        log.error("Best source has no full snapshot")
        return False
    # Build mirror URLs for the full snapshot
    full_filename: str = full_paths[0].rsplit("/", 1)[-1]
    full_mirrors: list[str] = [f"http://{best.rpc_address}{full_paths[0]}"]
    for other in fast_sources[1:]:
        for other_fp in other.file_paths:
            # Only sources serving the IDENTICAL file can act as mirrors.
            if other_fp.rsplit("/", 1)[-1] == full_filename:
                full_mirrors.append(f"http://{other.rpc_address}{other_fp}")
                break
    speed_mib: float = best.download_speed / (1024 ** 2)
    log.info("Best source: %s (%.1f MiB/s), %d mirrors",
             best.rpc_address, speed_mib, len(full_mirrors))
    # Download full snapshot
    os.makedirs(output_dir, exist_ok=True)
    total_start: float = time.monotonic()
    filepath: Path = Path(output_dir) / full_filename
    if filepath.exists() and filepath.stat().st_size > 0:
        # Resume-friendly: a non-empty file with the exact name is trusted.
        log.info("Skipping %s (already exists: %.1f GB)",
                 full_filename, filepath.stat().st_size / (1024 ** 3))
    else:
        if not download_aria2c(full_mirrors, output_dir, full_filename, connections):
            log.error("Failed to download %s", full_filename)
            return False
    # Download incremental separately — the full download took minutes,
    # so any incremental from discovery is stale. Re-probe for fresh ones.
    if not full_only:
        fm: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
        if fm:
            full_snap_slot: int = int(fm.group(1))
            log.info("Downloading incremental for base slot %d...", full_snap_slot)
            # Best-effort: the full snapshot alone is enough to boot, so an
            # incremental failure here does not fail the overall download.
            _rolling_incremental_download(
                fast_sources, full_snap_slot, output_dir,
                convergence_slots, connections, resolved_rpc,
            )
    total_elapsed: float = time.monotonic() - total_start
    log.info("All downloads complete in %.0fs", total_elapsed)
    return True
# -- Main (CLI) ----------------------------------------------------------------
def main() -> int:
    """CLI entry point: parse arguments, download, optionally run a post-command.

    Returns a process exit code: 0 on success, 1 on any failure.
    """
    p: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Download Solana snapshots with aria2c parallel downloads",
    )
    p.add_argument("-o", "--output", default="/srv/kind/solana/snapshots",
                   help="Snapshot output directory (default: /srv/kind/solana/snapshots)")
    p.add_argument("-c", "--cluster", default="mainnet-beta",
                   choices=list(CLUSTER_RPC),
                   help="Solana cluster (default: mainnet-beta)")
    p.add_argument("-r", "--rpc", default=None,
                   help="RPC URL for cluster discovery (default: public RPC)")
    p.add_argument("-n", "--connections", type=int, default=16,
                   help="aria2c connections per download (default: 16)")
    p.add_argument("-t", "--threads", type=int, default=500,
                   help="Threads for parallel RPC probing (default: 500)")
    p.add_argument("--max-snapshot-age", type=int, default=10000,
                   help="Max snapshot age in slots (default: 10000)")
    p.add_argument("--max-latency", type=float, default=500,
                   help="Max RPC probe latency in ms (default: 500)")
    p.add_argument("--min-download-speed", type=int, default=20,
                   help="Min download speed in MiB/s (default: 20)")
    p.add_argument("--measurement-time", type=int, default=7,
                   help="Speed measurement duration in seconds (default: 7)")
    p.add_argument("--max-speed-checks", type=int, default=15,
                   help="Max nodes to benchmark before giving up (default: 15)")
    p.add_argument("--version", default=None,
                   help="Filter nodes by version prefix (e.g. '2.2')")
    p.add_argument("--convergence-slots", type=int, default=500,
                   help="Max slot gap for incremental convergence (default: 500)")
    p.add_argument("--full-only", action="store_true",
                   help="Download only full snapshot, skip incremental")
    p.add_argument("--dry-run", action="store_true",
                   help="Find best source and print URL, don't download")
    p.add_argument("--post-cmd",
                   help="Shell command to run after successful download "
                        "(e.g. 'kubectl scale deployment ... --replicas=1')")
    p.add_argument("-v", "--verbose", action="store_true")
    args: argparse.Namespace = p.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )
    # Dry-run uses the original inline flow (needs access to sources for URL printing)
    if args.dry_run:
        rpc_url: str = args.rpc or CLUSTER_RPC[args.cluster]
        current_slot: int | None = get_current_slot(rpc_url)
        if current_slot is None:
            log.error("Cannot get current slot from %s", rpc_url)
            return 1
        sources: list[SnapshotSource] = discover_sources(
            rpc_url, current_slot,
            max_age_slots=args.max_snapshot_age,
            max_latency_ms=args.max_latency,
            threads=args.threads,
            version_filter=args.version,
        )
        if not sources:
            log.error("No snapshot sources found")
            return 1
        # Dry-run skips speed benchmarking: "best" here is lowest latency only.
        sources.sort(key=lambda s: s.latency_ms)
        best = sources[0]
        for fp in best.file_paths:
            print(f"http://{best.rpc_address}{fp}")
        return 0
    ok: bool = download_best_snapshot(
        args.output,
        cluster=args.cluster,
        rpc_url=args.rpc,
        connections=args.connections,
        threads=args.threads,
        max_snapshot_age=args.max_snapshot_age,
        max_latency=args.max_latency,
        min_download_speed=args.min_download_speed,
        measurement_time=args.measurement_time,
        max_speed_checks=args.max_speed_checks,
        version_filter=args.version,
        full_only=args.full_only,
        convergence_slots=args.convergence_slots,
    )
    if ok and args.post_cmd:
        log.info("Running post-download command: %s", args.post_cmd)
        # shell=True is intentional: --post-cmd is an operator-supplied shell
        # snippet, not untrusted input.
        result: subprocess.CompletedProcess[bytes] = subprocess.run(
            args.post_cmd, shell=True,
        )
        if result.returncode != 0:
            log.error("Post-download command failed with exit code %d",
                      result.returncode)
            return 1
        log.info("Post-download command completed successfully")
    return 0 if ok else 1
if __name__ == "__main__":
    sys.exit(main())

View File

@ -1,112 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
# -----------------------------------------------------------------------
# Start solana-test-validator with optional SPL token setup
#
# Environment variables:
#   FACILITATOR_PUBKEY - facilitator fee-payer public key (base58)
#   SERVER_PUBKEY      - server/payee wallet public key (base58)
#   CLIENT_PUBKEY      - client/payer wallet public key (base58)
#   MINT_DECIMALS      - token decimals (default: 6, matching USDC)
#   MINT_AMOUNT        - amount to mint to client (default: 1000000000)
#   LEDGER_DIR         - ledger directory (default: /data/ledger)
# -----------------------------------------------------------------------
LEDGER_DIR="${LEDGER_DIR:-/data/ledger}"
MINT_DECIMALS="${MINT_DECIMALS:-6}"
MINT_AMOUNT="${MINT_AMOUNT:-1000000000}"
SETUP_MARKER="${LEDGER_DIR}/.setup-done"

# Take ownership of the ledger dir (it may be a root-owned volume mount);
# best-effort so an absent sudo does not abort the script.
sudo chown -R "$(id -u):$(id -g)" "$LEDGER_DIR" 2>/dev/null || true

# Start test-validator in the background
solana-test-validator \
  --ledger "${LEDGER_DIR}" \
  --rpc-port 8899 \
  --bind-address 0.0.0.0 \
  --quiet &
VALIDATOR_PID=$!

# Wait for RPC to become available. Fail hard if it never does: the
# original loop fell through silently after 60 attempts and the script
# carried on "successfully" against a dead validator.
echo "Waiting for test-validator RPC..."
RPC_READY=false
for i in $(seq 1 60); do
  if solana cluster-version --url http://127.0.0.1:8899 >/dev/null 2>&1; then
    echo "Test-validator is ready (attempt ${i})"
    RPC_READY=true
    break
  fi
  sleep 1
done
if [ "${RPC_READY}" != "true" ]; then
  echo "ERROR: test-validator RPC did not become available" >&2
  kill "${VALIDATOR_PID}" 2>/dev/null || true
  exit 1
fi

solana config set --url http://127.0.0.1:8899

# Only run setup once (idempotent via marker file)
if [ ! -f "${SETUP_MARKER}" ]; then
  echo "Running first-time setup..."
  # Airdrop SOL to all wallets for gas
  for PUBKEY in "${FACILITATOR_PUBKEY:-}" "${SERVER_PUBKEY:-}" "${CLIENT_PUBKEY:-}"; do
    if [ -n "${PUBKEY}" ]; then
      echo "Airdropping 100 SOL to ${PUBKEY}..."
      solana airdrop 100 "${PUBKEY}" --url http://127.0.0.1:8899 || true
    fi
  done
  # Create a USDC-equivalent SPL token mint if any pubkeys are set
  if [ -n "${CLIENT_PUBKEY:-}" ] || [ -n "${FACILITATOR_PUBKEY:-}" ] || [ -n "${SERVER_PUBKEY:-}" ]; then
    # The mint authority keypair is kept on the ledger volume so that
    # later minting (after restart) can reuse it.
    MINT_AUTHORITY_FILE="${LEDGER_DIR}/mint-authority.json"
    if [ ! -f "${MINT_AUTHORITY_FILE}" ]; then
      solana-keygen new --no-bip39-passphrase --outfile "${MINT_AUTHORITY_FILE}" --force
      MINT_AUTH_PUBKEY=$(solana-keygen pubkey "${MINT_AUTHORITY_FILE}")
      solana airdrop 10 "${MINT_AUTH_PUBKEY}" --url http://127.0.0.1:8899
    fi
    MINT_ADDRESS_FILE="${LEDGER_DIR}/usdc-mint-address.txt"
    if [ ! -f "${MINT_ADDRESS_FILE}" ]; then
      # Parse the mint address out of "Creating token <ADDRESS> ..." output.
      spl-token create-token \
        --decimals "${MINT_DECIMALS}" \
        --mint-authority "${MINT_AUTHORITY_FILE}" \
        --url http://127.0.0.1:8899 \
        2>&1 | grep "Creating token" | awk '{print $3}' > "${MINT_ADDRESS_FILE}"
      echo "Created USDC mint: $(cat "${MINT_ADDRESS_FILE}")"
    fi
    USDC_MINT=$(cat "${MINT_ADDRESS_FILE}")
    # Create ATAs and mint tokens for the client
    if [ -n "${CLIENT_PUBKEY:-}" ]; then
      echo "Creating ATA for client ${CLIENT_PUBKEY}..."
      spl-token create-account "${USDC_MINT}" \
        --owner "${CLIENT_PUBKEY}" \
        --fee-payer "${MINT_AUTHORITY_FILE}" \
        --url http://127.0.0.1:8899 || true
      echo "Minting ${MINT_AMOUNT} tokens to client..."
      spl-token mint "${USDC_MINT}" "${MINT_AMOUNT}" \
        --recipient-owner "${CLIENT_PUBKEY}" \
        --mint-authority "${MINT_AUTHORITY_FILE}" \
        --url http://127.0.0.1:8899 || true
    fi
    # Create ATAs for server and facilitator
    for PUBKEY in "${SERVER_PUBKEY:-}" "${FACILITATOR_PUBKEY:-}"; do
      if [ -n "${PUBKEY}" ]; then
        echo "Creating ATA for ${PUBKEY}..."
        spl-token create-account "${USDC_MINT}" \
          --owner "${PUBKEY}" \
          --fee-payer "${MINT_AUTHORITY_FILE}" \
          --url http://127.0.0.1:8899 || true
      fi
    done
    # Expose mint address for other containers
    # NOTE(review): /tmp is container-local unless mounted as a shared
    # volume — confirm consumers actually see this file.
    cp "${MINT_ADDRESS_FILE}" /tmp/usdc-mint-address.txt 2>/dev/null || true
  fi
  touch "${SETUP_MARKER}"
  echo "Setup complete."
fi

echo "solana-test-validator running (PID ${VALIDATOR_PID})"
# Keep the container alive for as long as the validator runs and
# propagate its exit status.
wait ${VALIDATOR_PID}

View File

@ -1,22 +0,0 @@
# DoubleZero network daemon for Solana validators
# Provides GRE tunnel + BGP routing via the DoubleZero fiber backbone
FROM debian:bookworm-slim
RUN apt-get update && apt-get install -y --no-install-recommends \
ca-certificates \
curl \
gnupg \
iproute2 \
&& rm -rf /var/lib/apt/lists/*
# Install DoubleZero from Cloudsmith apt repo
RUN curl -1sLf https://dl.cloudsmith.io/public/malbeclabs/doublezero/setup.deb.sh | bash \
&& apt-get update \
&& apt-get install -y doublezero \
&& rm -rf /var/lib/apt/lists/*
COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh
ENTRYPOINT ["entrypoint.sh"]

View File

@ -1,9 +0,0 @@
#!/usr/bin/env bash
# Build laconicnetwork/doublezero
# Path variables are quoted so the build works if CERC_CONTAINER_BASE_DIR
# contains spaces; ${build_command_args} is deliberately left unquoted
# because it carries zero or more whitespace-separated docker flags.
source "${CERC_CONTAINER_BASE_DIR}/build-base.sh"
docker build -t laconicnetwork/doublezero:local \
  ${build_command_args} \
  -f "${CERC_CONTAINER_BASE_DIR}/laconicnetwork-doublezero/Dockerfile" \
  "${CERC_CONTAINER_BASE_DIR}/laconicnetwork-doublezero"

View File

@ -1,38 +0,0 @@
#!/usr/bin/env bash
set -euo pipefail
# -----------------------------------------------------------------------
# Launch the doublezerod daemon, generating an identity on first run.
#
# Optional environment:
#   DOUBLEZERO_RPC_ENDPOINT - Solana RPC endpoint (default: http://127.0.0.1:8899)
#   DOUBLEZERO_ENV          - DoubleZero environment (default: mainnet-beta)
#   DOUBLEZERO_EXTRA_ARGS   - additional doublezerod arguments
# -----------------------------------------------------------------------
rpc_endpoint="${DOUBLEZERO_RPC_ENDPOINT:-http://127.0.0.1:8899}"
dz_env="${DOUBLEZERO_ENV:-mainnet-beta}"

# State directories doublezerod expects to exist.
mkdir -p /var/lib/doublezerod /var/run/doublezerod

# First run: create a DoubleZero identity keypair if none is present.
config_dir="${HOME}/.config/doublezero"
mkdir -p "$config_dir"
if [ ! -f "$config_dir/id.json" ]; then
  echo "Generating DoubleZero identity..."
  doublezero keygen
fi

echo "Starting doublezerod..."
echo "Environment: $dz_env"
echo "RPC endpoint: $rpc_endpoint"
echo "DZ address: $(doublezero address)"

# Word-split any optional extra arguments into an array.
extra_args=()
if [ -n "${DOUBLEZERO_EXTRA_ARGS:-}" ]; then
  read -ra extra_args <<< "$DOUBLEZERO_EXTRA_ARGS"
fi

# Replace this shell so doublezerod receives container signals directly.
exec doublezerod \
  -env "$dz_env" \
  -solana-rpc-endpoint "$rpc_endpoint" \
  "${extra_args[@]}"

View File

@ -1,169 +0,0 @@
# agave stack
Unified Agave/Jito Solana stack supporting three modes:
| Mode | Compose file | Use case |
|------|-------------|----------|
| `test` | `docker-compose-agave-test.yml` | Local dev with instant finality |
| `rpc` | `docker-compose-agave-rpc.yml` | Non-voting mainnet/testnet RPC node |
| `validator` | `docker-compose-agave.yml` | Voting validator |
## Build
```bash
# Vanilla Agave v3.1.9
laconic-so --stack agave build-containers
# Jito v3.1.8
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
AGAVE_VERSION=v3.1.8-jito \
laconic-so --stack agave build-containers
```
Build compiles from source (~30-60 min on first build).
## Deploy
```bash
# Test validator (dev)
laconic-so --stack agave deploy init --output spec.yml
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-test
laconic-so deployment --dir my-test start
# Mainnet RPC (e.g. biscayne)
# Edit spec.yml to set AGAVE_MODE=rpc, VALIDATOR_ENTRYPOINT, KNOWN_VALIDATOR, etc.
laconic-so --stack agave deploy init --output spec.yml
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-rpc
laconic-so deployment --dir my-rpc start
```
## Configuration
Mode is selected via `AGAVE_MODE` environment variable (`test`, `rpc`, or `validator`).
### RPC mode required env
- `VALIDATOR_ENTRYPOINT` - cluster entrypoint (e.g. `entrypoint.mainnet-beta.solana.com:8001`)
- `KNOWN_VALIDATOR` - known validator pubkey
### Validator mode required env
- `VALIDATOR_ENTRYPOINT` - cluster entrypoint
- `KNOWN_VALIDATOR` - known validator pubkey
- Identity and vote account keypairs mounted at `/data/config/`
### Jito (optional, any mode except test)
Set `JITO_ENABLE=true` and provide:
- `JITO_BLOCK_ENGINE_URL`
- `JITO_SHRED_RECEIVER_ADDR`
- `JITO_TIP_PAYMENT_PROGRAM`
- `JITO_DISTRIBUTION_PROGRAM`
- `JITO_MERKLE_ROOT_AUTHORITY`
- `JITO_COMMISSION_BPS`
Image must be built from `jito-foundation/jito-solana` repo for Jito flags to work.
## Runtime requirements
The container requires the following (already set in compose files):
- `privileged: true` — allows `mlock()` and raw network access
- `cap_add: IPC_LOCK` — memory page locking for account indexes and ledger mappings
- `ulimits: memlock: -1` (unlimited) — Agave locks gigabytes of memory
- `ulimits: nofile: 1000000` — gossip/TPU connections + memory-mapped ledger files
- `network_mode: host` — direct host network stack for gossip, TPU, and UDP port ranges
Without these, Agave either refuses to start or dies under load.
## Container overhead
Containers running with `privileged: true` and `network_mode: host` add **zero
measurable overhead** compared to bare metal. Linux containers are not VMs — there
is no hypervisor, no emulation layer, no packet translation:
- **Network**: `network_mode: host` shares the host's network namespace directly.
No virtual bridge, no NAT, no veth pair. Same kernel code path as bare metal.
GRE tunnels (DoubleZero) and raw sockets work identically.
- **CPU**: No hypervisor. The process runs on the same physical cores with the
same scheduler priority as any host process.
- **Memory**: `IPC_LOCK` + unlimited memlock means Agave can `mlock()` pages
exactly like bare metal. No memory ballooning or overcommit.
- **Disk I/O**: PersistentVolumes backed by hostPath mounts have identical I/O
characteristics to direct filesystem access.
The only overhead is cgroup accounting (nanoseconds per syscall) and overlayfs
for cold file opens (single-digit microseconds, zero once cached).
## DoubleZero
DoubleZero provides optimized network routing for Solana validators via GRE
tunnels (IP protocol 47) and BGP (TCP/179) over link-local 169.254.0.0/16.
Traffic to other DoubleZero participants is routed through private fiber
instead of the public internet.
### How it works
`doublezerod` creates a `doublezero0` GRE tunnel interface and runs BGP
peering through it. Routes are injected into the host routing table, so
the validator transparently sends traffic to other DZ validators over
the fiber backbone. IBRL mode falls back to public internet if DZ is down.
### Container build
```bash
laconic-so --stack agave build-containers
```
This builds both the `laconicnetwork/agave` and `laconicnetwork/doublezero` images.
### Requirements
- Validator identity keypair at `/data/config/validator-identity.json`
- `privileged: true` + `NET_ADMIN` (GRE tunnel + route table manipulation)
- `hostNetwork: true` (GRE uses IP protocol 47, not TCP/UDP — cannot be port-mapped)
- Node registered with DoubleZero passport system
### Docker Compose
The `docker-compose-doublezero.yml` runs alongside the validator with
`network_mode: host`, sharing the `validator-config` volume for identity access.
### k8s deployment
laconic-so does not pass `hostNetwork` through to generated k8s resources.
DoubleZero runs as a DaemonSet defined in `deployment/k8s-manifests/doublezero-daemonset.yaml`,
applied after `deployment start`:
```bash
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
```
Since validator pods also use `hostNetwork: true` (via the compose `network_mode: host`
which maps to the pod spec in k8s), they automatically see the GRE routes
injected by `doublezerod` into the node's routing table.
## Biscayne deployment (biscayne.vaasl.io)
Mainnet voting validator with Jito MEV and DoubleZero.
```bash
# Build Jito image
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
AGAVE_VERSION=v3.1.8-jito \
laconic-so --stack agave build-containers
# Create deployment from biscayne spec
laconic-so --stack agave deploy create \
--spec-file deployment/spec.yml \
--deployment-dir biscayne-deployment
# Copy validator keypairs
cp /path/to/validator-identity.json biscayne-deployment/data/validator-config/
cp /path/to/vote-account-keypair.json biscayne-deployment/data/validator-config/
# Start validator
laconic-so deployment --dir biscayne-deployment start
# Start DoubleZero (after deployment is running)
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
```
To run as non-voting RPC instead, change `AGAVE_MODE: rpc` in `deployment/spec.yml`.

View File

@ -1,10 +0,0 @@
# Stack definition consumed by laconic-so: the container images to build
# and the compose pods to deploy for the Agave stack.
version: "1.1"
name: agave
description: "Agave/Jito Solana validator, RPC node, or test-validator"
# Images built by `laconic-so --stack agave build-containers`.
containers:
  - laconicnetwork/agave
  - laconicnetwork/doublezero
# Compose pods included in a deployment.
pods:
  - agave
  - doublezero
  - monitoring

View File

@ -1,14 +0,0 @@
# Ansible configuration for the deployment playbooks in this directory.
[defaults]
# Inventory lives alongside the playbooks.
inventory = inventory/
stdout_callback = ansible.builtin.default
# Render task results as YAML and report per-task timing.
result_format = yaml
callbacks_enabled = profile_tasks
# Do not litter the tree with .retry files on failure.
retry_files_enabled = false

[privilege_escalation]
# Escalate to root via sudo for all tasks by default.
become = true
become_method = sudo

[ssh_connection]
# Pipelining reduces SSH round-trips; forward the agent for auth that
# needs the operator's keys on the remote host.
pipelining = true
ssh_args = -o ForwardAgent=yes

View File

@ -1,38 +0,0 @@
hostname mia-sw01
!
ip routing
!
! Downlink toward the biscayne host (GRE underlay network)
interface Ethernet1
   no switchport
   ip address 10.0.2.1/24
!
! Backbone /31 point-to-point link toward was-sw01
interface Ethernet2
   no switchport
   ip address 172.16.1.189/31
!
! GRE tunnel to biscayne (simulates doublezero0)
interface Tunnel1
   mtu 1476
   ip address 169.254.7.6/31
   tunnel mode gre
   tunnel source 10.0.2.1
   tunnel destination 10.0.2.2
!
! Inbound: route 137.239.194.65 to biscayne via GRE tunnel
ip route 137.239.194.65/32 169.254.7.7
!
! Outbound: redirect traffic sourced from 137.239.194.65 to was-sw01 via backbone
ip access-list VALIDATOR-OUTBOUND-ACL
   10 permit ip 137.239.194.65/32 any
!
traffic-policy VALIDATOR-OUTBOUND
   match VALIDATOR-OUTBOUND-ACL
      set nexthop 172.16.1.188
   system-rule overriding-action redirect
!
! Apply on the GRE tunnel interface — this is what we're validating.
! If cEOS doesn't support traffic-policy on Tunnel, test.sh has a
! fallback that applies it on Ethernet1 instead.
interface Tunnel1
   traffic-policy input VALIDATOR-OUTBOUND

View File

@ -1,377 +0,0 @@
#!/usr/bin/env bash
# End-to-end test for Ashburn validator relay topology.
#
# Prerequisites:
# sudo containerlab deploy -t topology.yml
#
# Usage:
# ./test.sh # run all tests
# ./test.sh setup # configure containers only (skip tests)
# ./test.sh inbound # inbound test only
# ./test.sh outbound # outbound test only
# ./test.sh counters # show all counters
set -euo pipefail

# Containerlab project prefix and fixed addresses used throughout.
P="clab-ashburn-relay"
ASHBURN_IP="137.239.194.65"
KIND_NODE_IP="172.20.0.2"
BISCAYNE_BRIDGE_IP="172.20.0.1"

# Result counters.
PASS=0
FAIL=0
SKIP=0

# Record one result. POSIX arithmetic assignment is used instead of
# ((VAR++)) because under `set -e` the post-increment expression evaluates
# to 0 when the counter is 0, giving exit status 1 and aborting the whole
# script on the very first recorded result.
pass() { echo " PASS: $1"; PASS=$((PASS + 1)); }
fail() { echo " FAIL: $1"; FAIL=$((FAIL + 1)); }
skip() { echo " SKIP: $1"; SKIP=$((SKIP + 1)); }

# Run a shell command in a lab container (foreground / detached), or a
# command on a cEOS node's CLI.
dexec() { sudo docker exec "$P-$1" sh -c "$2"; }
dexec_d() { sudo docker exec -d "$P-$1" sh -c "$2"; }
eos() { sudo docker exec "$P-$1" Cli -c "$2" 2>/dev/null; }
# ======================================================================
# Wait for cEOS readiness
# ======================================================================
# Block until the given cEOS node answers "show version", exiting the
# script if it never becomes ready. Uses `i=$((i + 1))` rather than
# ((i++)): under `set -e` the post-increment evaluates to 0 on the first
# iteration (i=0), returning status 1 and killing the script before the
# error message could ever print.
# Note: each attempt sleeps 2s, so the total wait is up to 2*max seconds.
wait_eos() {
  local node="$1" max=60 i=0
  echo "Waiting for $node EOS to boot..."
  while ! eos "$node" "show version" &>/dev/null; do
    i=$((i + 1))
    if ((i >= max)); then
      echo "ERROR: $node did not become ready in ${max}s"
      exit 1
    fi
    sleep 2
  done
  echo " $node ready (${i}s)"
}
# ======================================================================
# Setup: configure linux containers
# ======================================================================
# Configure every lab container: wait for both switches to boot, then set
# up addressing on the Linux nodes and the full DNAT/SNAT + fwmark +
# policy-routing + GRE configuration on the biscayne host. Idempotent:
# each mutating command tolerates "already exists" so setup can be re-run.
setup() {
  echo "=== Waiting for cEOS nodes ==="
  wait_eos was-sw01
  wait_eos mia-sw01
  echo ""
  echo "=== Configuring internet-peer ==="
  # Public IP on the uplink plus a route back to the Ashburn address.
  dexec internet-peer '
    ip addr add 64.92.84.82/24 dev eth1 2>/dev/null || true
    ip route add 137.239.194.65/32 via 64.92.84.81 2>/dev/null || true
  '
  # install tcpdump + socat for tests
  dexec internet-peer 'apk add -q --no-cache tcpdump socat 2>/dev/null || true'
  echo "=== Configuring kind-node ==="
  # Bridge-side address with biscayne as the default gateway.
  dexec kind-node '
    ip addr add 172.20.0.2/24 dev eth1 2>/dev/null || true
    ip route add default via 172.20.0.1 2>/dev/null || true
  '
  dexec kind-node 'apk add -q --no-cache socat 2>/dev/null || true'
  echo "=== Configuring biscayne ==="
  # One long remote script; the embedded comments document each stanza.
  dexec biscayne '
    apk add -q --no-cache iptables iproute2 tcpdump 2>/dev/null || true
    # Enable forwarding
    sysctl -w net.ipv4.ip_forward=1 >/dev/null
    # Interfaces
    ip addr add 10.0.2.2/24 dev eth1 2>/dev/null || true
    ip addr add 172.20.0.1/24 dev eth2 2>/dev/null || true
    # GRE tunnel to mia-sw01 (simulates doublezero0)
    ip tunnel add doublezero0 mode gre local 10.0.2.2 remote 10.0.2.1 2>/dev/null || true
    ip addr add 169.254.7.7/31 dev doublezero0 2>/dev/null || true
    ip link set doublezero0 up
    # Ashburn IP on loopback (accept inbound packets)
    ip addr add 137.239.194.65/32 dev lo 2>/dev/null || true
    # --- Inbound DNAT: 137.239.194.65 → kind-node (172.20.0.2) ---
    iptables -t nat -C PREROUTING -p udp -d 137.239.194.65 --dport 8001 \
      -j DNAT --to-destination 172.20.0.2:8001 2>/dev/null || \
    iptables -t nat -A PREROUTING -p udp -d 137.239.194.65 --dport 8001 \
      -j DNAT --to-destination 172.20.0.2:8001
    iptables -t nat -C PREROUTING -p tcp -d 137.239.194.65 --dport 8001 \
      -j DNAT --to-destination 172.20.0.2:8001 2>/dev/null || \
    iptables -t nat -A PREROUTING -p tcp -d 137.239.194.65 --dport 8001 \
      -j DNAT --to-destination 172.20.0.2:8001
    iptables -t nat -C PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 \
      -j DNAT --to-destination 172.20.0.2 2>/dev/null || \
    iptables -t nat -A PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 \
      -j DNAT --to-destination 172.20.0.2
    # --- Outbound: fwmark + SNAT + policy routing ---
    # Mark validator traffic from kind-node
    iptables -t mangle -C PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 \
      -j MARK --set-mark 100 2>/dev/null || \
    iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 \
      -j MARK --set-mark 100
    iptables -t mangle -C PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 \
      -j MARK --set-mark 100 2>/dev/null || \
    iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 \
      -j MARK --set-mark 100
    iptables -t mangle -C PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 \
      -j MARK --set-mark 100 2>/dev/null || \
    iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 \
      -j MARK --set-mark 100
    # SNAT to Ashburn IP (must be first in POSTROUTING, before any MASQUERADE)
    iptables -t nat -C POSTROUTING -m mark --mark 100 \
      -j SNAT --to-source 137.239.194.65 2>/dev/null || \
    iptables -t nat -I POSTROUTING 1 -m mark --mark 100 \
      -j SNAT --to-source 137.239.194.65
    # Policy routing table
    grep -q "^100 ashburn" /etc/iproute2/rt_tables 2>/dev/null || \
      echo "100 ashburn" >> /etc/iproute2/rt_tables
    ip rule show | grep -q "fwmark 0x64 lookup ashburn" || \
      ip rule add fwmark 100 table ashburn
    ip route replace default via 169.254.7.6 dev doublezero0 table ashburn
  '
  echo ""
  echo "=== Setup complete ==="
}
# ======================================================================
# Test 1: GRE tunnel connectivity
# ======================================================================
# Verify the GRE underlay by pinging the switch end of the tunnel
# (169.254.7.6 on mia-sw01) from inside the biscayne container; dump
# tunnel/route state on failure to aid debugging.
test_gre() {
  echo ""
  echo "=== Test: GRE tunnel (biscayne ↔ mia-sw01) ==="
  if ! dexec biscayne 'ping -c 2 -W 2 169.254.7.6' &>/dev/null; then
    fail "GRE tunnel not working (biscayne cannot reach 169.254.7.6)"
    echo " Debugging:"
    dexec biscayne 'ip tunnel show; ip addr show doublezero0; ip route' 2>/dev/null || true
    eos mia-sw01 'show interfaces Tunnel1' 2>/dev/null || true
    return 0
  fi
  pass "biscayne → mia-sw01 via GRE tunnel"
}
# ======================================================================
# Test 2: Inbound path (internet-peer → 137.239.194.65:8001 → kind-node)
# ======================================================================
# End-to-end inbound check: a UDP packet sent from internet-peer to the
# public Ashburn address must traverse both switches, get DNATed on
# biscayne, and arrive at a listener on kind-node. Exercised on the
# gossip port (8001) and on one port of the dynamic range (9000).
test_inbound() {
  echo ""
  echo "=== Test: Inbound path ==="
  echo " internet-peer → $ASHBURN_IP:8001 → was-sw01 → mia-sw01 → biscayne → kind-node"
  # Start UDP listener on kind-node port 8001
  dexec kind-node 'rm -f /tmp/inbound.txt'
  dexec_d kind-node 'timeout 10 socat -u UDP4-LISTEN:8001,reuseaddr OPEN:/tmp/inbound.txt,creat,trunc'
  sleep 1
  # Send test packet from internet-peer to 137.239.194.65:8001
  dexec internet-peer "echo 'INBOUND_TEST_8001' | socat - UDP4-SENDTO:$ASHBURN_IP:8001"
  sleep 2
  local received
  # `|| true` keeps `set -e` from aborting if the file never appeared.
  received=$(dexec kind-node 'cat /tmp/inbound.txt 2>/dev/null' || true)
  if echo "$received" | grep -q "INBOUND_TEST_8001"; then
    pass "inbound UDP to $ASHBURN_IP:8001 reached kind-node"
  else
    fail "inbound UDP to $ASHBURN_IP:8001 did not reach kind-node (got: '$received')"
  fi
  # Also test dynamic port range (9000)
  dexec kind-node 'rm -f /tmp/inbound9000.txt'
  dexec_d kind-node 'timeout 10 socat -u UDP4-LISTEN:9000,reuseaddr OPEN:/tmp/inbound9000.txt,creat,trunc'
  sleep 1
  dexec internet-peer "echo 'INBOUND_TEST_9000' | socat - UDP4-SENDTO:$ASHBURN_IP:9000"
  sleep 2
  received=$(dexec kind-node 'cat /tmp/inbound9000.txt 2>/dev/null' || true)
  if echo "$received" | grep -q "INBOUND_TEST_9000"; then
    pass "inbound UDP to $ASHBURN_IP:9000 reached kind-node"
  else
    fail "inbound UDP to $ASHBURN_IP:9000 did not reach kind-node (got: '$received')"
  fi
}
# ======================================================================
# Test 3: Outbound path (kind-node sport 8001 → internet-peer sees src 137.239.194.65)
# ======================================================================
# End-to-end outbound check: packets leaving kind-node from a validator
# source port must be fwmarked and SNATed on biscayne and exit toward the
# internet with source address $ASHBURN_IP, observed via tcpdump on
# internet-peer. Exercised for sport 8001 and sport 9000.
test_outbound() {
  echo ""
  echo "=== Test: Outbound path ==="
  echo " kind-node:8001 → biscayne (SNAT) → doublezero0 → mia-sw01 → was-sw01 → internet-peer"
  # Start tcpdump on internet-peer
  dexec internet-peer 'rm -f /tmp/outbound.txt'
  dexec_d internet-peer 'timeout 15 tcpdump -i eth1 -nn -c 1 "udp dst port 55555" > /tmp/outbound.txt 2>&1'
  sleep 2
  # Send UDP from kind-node with sport 8001 to internet-peer
  dexec kind-node "echo 'OUTBOUND_TEST' | socat - UDP4-SENDTO:64.92.84.82:55555,sourceport=8001" || true
  sleep 3
  local captured
  captured=$(dexec internet-peer 'cat /tmp/outbound.txt 2>/dev/null' || true)
  echo " tcpdump captured: $captured"
  if echo "$captured" | grep -q "$ASHBURN_IP"; then
    pass "outbound from sport 8001 exits with src $ASHBURN_IP"
  else
    fail "outbound from sport 8001 does not show src $ASHBURN_IP"
    # Dump the relevant biscayne state to localize the failure.
    echo " Debugging biscayne iptables:"
    dexec biscayne 'iptables -t mangle -L PREROUTING -v -n 2>/dev/null' || true
    dexec biscayne 'iptables -t nat -L POSTROUTING -v -n 2>/dev/null' || true
    dexec biscayne 'ip rule show; ip route show table ashburn 2>/dev/null' || true
  fi
  # Test with dynamic port range (sport 9000)
  dexec internet-peer 'rm -f /tmp/outbound9000.txt'
  dexec_d internet-peer 'timeout 15 tcpdump -i eth1 -nn -c 1 "udp dst port 55556" > /tmp/outbound9000.txt 2>&1'
  sleep 2
  dexec kind-node "echo 'OUTBOUND_9000' | socat - UDP4-SENDTO:64.92.84.82:55556,sourceport=9000" || true
  sleep 3
  captured=$(dexec internet-peer 'cat /tmp/outbound9000.txt 2>/dev/null' || true)
  if echo "$captured" | grep -q "$ASHBURN_IP"; then
    pass "outbound from sport 9000 exits with src $ASHBURN_IP"
  else
    fail "outbound from sport 9000 does not show src $ASHBURN_IP"
  fi
}
# ======================================================================
# Test 4: Isolation — RPC traffic (sport 8899) should NOT be relayed
# ======================================================================
# Negative check: RPC traffic (sport 8899) must NOT be matched by the
# relay's fwmark rules. The rule set only matches sport 8001 and
# 9000-9025, so this sends one RPC-port packet and then displays the
# mangle counters for manual verification.
# Changes vs original: the `before`/`after` locals were computed but never
# compared — dead code, removed; the grep|head pipeline gets `|| true` so
# an empty match cannot abort the script under `set -o pipefail`.
test_isolation() {
  echo ""
  echo "=== Test: Isolation (RPC port 8899 should NOT be relayed) ==="
  # Send from sport 8899 (RPC — should not match mangle rules)
  dexec kind-node "echo 'RPC_TEST' | socat - UDP4-SENDTO:64.92.84.82:55557,sourceport=8899" 2>/dev/null || true
  sleep 1
  # Show the mark rules/counters so a human can eyeball that nothing matched.
  local mangle_out
  mangle_out=$(dexec biscayne 'iptables -t mangle -L PREROUTING -v -n 2>/dev/null' || true)
  echo " mangle PREROUTING rules (verify sport 8899 not matched):"
  echo "$mangle_out" | grep -E "MARK|pkts" | head -5 || true
  # The fwmark rules only match sport 8001 and 9000-9025, so 8899 won't match.
  pass "RPC port 8899 not in fwmark rule set (by design — rules only match 8001, 9000-9025)"
}
# ======================================================================
# Test 5: Traffic-policy on Tunnel interface (answers open question #1/#3)
# ======================================================================
# Probe whether cEOS accepts a traffic-policy applied directly on the GRE
# Tunnel1 interface of mia-sw01 (an open design question for the real
# deployment). If the policy is not visible there, attempt the documented
# fallback of applying it on Ethernet1 instead.
test_tunnel_policy() {
  echo ""
  echo "=== Test: traffic-policy on mia-sw01 Tunnel1 ==="
  local tp_out
  tp_out=$(eos mia-sw01 "show traffic-policy interface Tunnel1" 2>/dev/null || true)
  if echo "$tp_out" | grep -qi "VALIDATOR-OUTBOUND"; then
    pass "traffic-policy VALIDATOR-OUTBOUND applied on Tunnel1"
  else
    skip "traffic-policy on Tunnel1 may not be supported on cEOS"
    echo " Output: $tp_out"
    echo ""
    echo " Attempting fallback: apply on Ethernet1 instead..."
    # Move the policy from Tunnel1 to the physical interface; on Ethernet1
    # the policy sees packets before GRE decapsulation.
    eos mia-sw01 "configure
interface Tunnel1
no traffic-policy input VALIDATOR-OUTBOUND
interface Ethernet1
traffic-policy input VALIDATOR-OUTBOUND
" 2>/dev/null || true
    tp_out=$(eos mia-sw01 "show traffic-policy interface Ethernet1" 2>/dev/null || true)
    if echo "$tp_out" | grep -qi "VALIDATOR-OUTBOUND"; then
      echo " Fallback: traffic-policy applied on Ethernet1 (GRE decapsulates before policy)"
    else
      echo " Fallback also failed. Check mia-sw01 config manually."
    fi
  fi
}
# ======================================================================
# Counters
# ======================================================================
# Dump all relevant counters for manual inspection: switch traffic-policy
# counters (where supported by cEOS) plus biscayne's iptables and
# policy-routing state. Purely informational — records no pass/fail.
show_counters() {
  echo ""
  echo "=== Traffic-policy counters ==="
  echo "--- was-sw01 ---"
  eos was-sw01 "show traffic-policy counters" 2>/dev/null || echo "(not available on cEOS)"
  echo "--- mia-sw01 ---"
  eos mia-sw01 "show traffic-policy counters" 2>/dev/null || echo "(not available on cEOS)"
  echo ""
  echo "--- biscayne iptables nat ---"
  dexec biscayne 'iptables -t nat -L -v -n 2>/dev/null' || true
  echo ""
  echo "--- biscayne iptables mangle ---"
  dexec biscayne 'iptables -t mangle -L PREROUTING -v -n 2>/dev/null' || true
  echo ""
  echo "--- biscayne policy routing ---"
  dexec biscayne 'ip rule show 2>/dev/null' || true
  dexec biscayne 'ip route show table ashburn 2>/dev/null' || true
}
# ======================================================================
# Main
# ======================================================================
# Run the full suite and print a summary; exits nonzero on any failure.
run_all() {
  setup
  test_gre
  test_tunnel_policy
  test_inbound
  test_outbound
  test_isolation
  show_counters
  echo ""
  echo "==============================="
  echo "Results: $PASS passed, $FAIL failed, $SKIP skipped"
  echo "==============================="
  if ((FAIL > 0)); then
    exit 1
  fi
}

# Entry point: dispatch on the requested mode (default: all).
main() {
  local cmd="${1:-all}"
  case "$cmd" in
    setup)
      setup
      ;;
    inbound)
      test_gre
      test_inbound
      ;;
    outbound)
      test_outbound
      ;;
    counters)
      show_counters
      ;;
    all)
      run_all
      ;;
    *)
      echo "Usage: $0 [setup|inbound|outbound|counters|all]"
      exit 1
      ;;
  esac
}
main "$@"

View File

@ -1,43 +0,0 @@
# Containerlab topology for the Ashburn validator-relay simulation
# (exercised by test.sh in this directory).
name: ashburn-relay
topology:
  kinds:
    ceos:
      image: ceos:4.34.0F
    linux:
      image: alpine:3.20
  nodes:
    # Ashburn switch — inbound traffic-policy + Loopback101 for 137.239.194.65
    was-sw01:
      kind: ceos
      startup-config: was-sw01-startup.cfg
    # Miami switch — outbound traffic-policy + GRE tunnel to biscayne
    mia-sw01:
      kind: ceos
      startup-config: mia-sw01-startup.cfg
    # Biscayne host — iptables DNAT/SNAT, fwmark, policy routing, GRE
    biscayne:
      kind: linux
    # Simulates kind node (172.20.0.2) running the validator
    kind-node:
      kind: linux
    # Simulates an internet peer sending/receiving validator traffic
    internet-peer:
      kind: linux
  links:
    # was-sw01 Et1 (uplink) <-> internet-peer
    - endpoints: ["was-sw01:et1", "internet-peer:eth1"]
    # was-sw01 Et2 <-> mia-sw01 Et2 (backbone, 172.16.1.188/31)
    - endpoints: ["was-sw01:et2", "mia-sw01:et2"]
    # mia-sw01 Et1 <-> biscayne (GRE underlay, 10.0.2.0/24)
    - endpoints: ["mia-sw01:et1", "biscayne:eth1"]
    # biscayne <-> kind-node (Docker bridge simulation, 172.20.0.0/24)
    - endpoints: ["biscayne:eth2", "kind-node:eth1"]

View File

@ -1,26 +0,0 @@
hostname was-sw01
!
ip routing
!
! Public Ashburn validator IP, anchored on a loopback
interface Loopback101
   ip address 137.239.194.65/32
!
! Internet-facing uplink; validator relay policy applied inbound
interface Ethernet1
   no switchport
   ip address 64.92.84.81/24
   traffic-policy input VALIDATOR-RELAY
!
! Backbone /31 point-to-point link toward mia-sw01
interface Ethernet2
   no switchport
   ip address 172.16.1.188/31
!
! Match Solana validator traffic: gossip (8001) and dynamic range 9000-9025
ip access-list VALIDATOR-RELAY-ACL
   10 permit udp any any eq 8001
   20 permit udp any any range 9000 9025
   30 permit tcp any any eq 8001
!
! Redirect matched traffic across the backbone toward mia-sw01
traffic-policy VALIDATOR-RELAY
   match VALIDATOR-RELAY-ACL
      set nexthop 172.16.1.189
   system-rule overriding-action redirect

View File

@ -0,0 +1,113 @@
# Contributing
Thank you for taking the time to make a contribution to Stack Orchestrator.
## Install (developer mode)
Suitable for developers either modifying or debugging the orchestrator Python code:
### Prerequisites
In addition to the pre-requisites listed in the [README](/README.md), the following are required:
1. Python venv package
This may or may not be already installed depending on the host OS and version. Check by running:
```
$ python3 -m venv
usage: venv [-h] [--system-site-packages] [--symlinks | --copies] [--clear] [--upgrade] [--without-pip] [--prompt PROMPT] ENV_DIR [ENV_DIR ...]
venv: error: the following arguments are required: ENV_DIR
```
If the venv package is missing you should see a message indicating how to install it, for example with:
```
$ apt install python3.10-venv
```
### Install
1. Clone this repository:
```
$ git clone https://git.vdb.to/cerc-io/stack-orchestrator.git
```
2. Enter the project directory:
```
$ cd stack-orchestrator
```
3. (This and the next step can be done by running `source ./scripts/developer-mode-setup.sh`)
Create and activate a venv:
```
$ python3 -m venv venv
$ source ./venv/bin/activate
(venv) $
```
4. Install the cli in edit mode:
```
$ pip install --editable .
```
5. Verify installation:
```
(venv) $ laconic-so
Usage: laconic-so [OPTIONS] COMMAND [ARGS]...
Laconic Stack Orchestrator
Options:
--quiet
--verbose
--dry-run
-h, --help Show this message and exit.
Commands:
build-containers build the set of containers required for a complete...
deploy-system deploy a stack
setup-repositories git clone the set of repositories required to build...
```
## Build a zipapp (single file distributable script)
Use shiv to build a single file Python executable zip archive of laconic-so:
1. Install [shiv](https://github.com/linkedin/shiv):
```
(venv) $ pip install shiv
(venv) $ pip install wheel
```
2. Run shiv to create a zipapp file:
```
(venv) $ shiv -c laconic-so -o laconic-so .
```
This creates a file `./laconic-so` that is executable outside of any venv and on other machines, OSes, and architectures, requiring only the system Python 3.
3. Verify it works:
```
$ cp stack-orchestrator/laconic-so ~/bin
$ laconic-so
Usage: laconic-so [OPTIONS] COMMAND [ARGS]...
Laconic Stack Orchestrator
Options:
--stack TEXT specify a stack to build/deploy
--quiet
--verbose
--dry-run
--local-stack
--debug
--continue-on-error
-h, --help Show this message and exit.
Commands:
build-containers build the set of containers required for a complete...
build-npms build the set of npm packages required for a...
deploy deploy a stack
deploy-system deploy a stack
setup-repositories git clone the set of repositories required to build...
version print tool version
```
For cutting releases, use the [shiv build script](/scripts/build_shiv_package.sh).

8
docs/README.md 100644
View File

@ -0,0 +1,8 @@
# Stack Orchestrator
Here you will find information about the design of stack orchestrator, contributing to it, and deploying services/applications that combine two or more "stacks".
Most "stacks" contain their own README which has plenty of information on deploying, but stacks can be combined in a variety of ways which are documented here, for example:
- [Gitea with Laconicd Fixturenet](./gitea-with-laconicd-fixturenet.md)
- [Laconicd Registry with Console](./laconicd-with-console.md)

View File

@ -0,0 +1,71 @@
# Adding a new stack
See [this PR](https://git.vdb.to/cerc-io/stack-orchestrator/pull/434) for an example of how to currently add a minimal stack to stack orchestrator. The [reth stack](https://git.vdb.to/cerc-io/stack-orchestrator/pull/435) is another good example.
For external developers, we recommend forking this repo and adding your stack directly to your fork. This initially requires running in "developer mode" as described [here](/docs/CONTRIBUTING.md). Check out the [Namada stack](https://github.com/vknowable/stack-orchestrator/blob/main/app/data/stacks/public-namada/digitalocean_quickstart.md) from Knowable to see how that is done.
Core to the feature completeness of stack orchestrator is to [decouple the tool functionality from payload](https://git.vdb.to/cerc-io/stack-orchestrator/issues/315) which will no longer require forking to add a stack.
## Example
- in `stack_orchestrator/data/stacks/my-new-stack/stack.yml` add:
```yaml
version: "0.1"
name: my-new-stack
repos:
- github.com/my-org/my-new-stack
containers:
- cerc/my-new-stack
pods:
- my-new-stack
```
- in `stack_orchestrator/data/container-build/cerc-my-new-stack/build.sh` add:
```bash
#!/usr/bin/env bash
# Build the my-new-stack image
source ${CERC_CONTAINER_BASE_DIR}/build-base.sh
docker build -t cerc/my-new-stack:local -f ${CERC_REPO_BASE_DIR}/my-new-stack/Dockerfile ${build_command_args} ${CERC_REPO_BASE_DIR}/my-new-stack
```
- in `stack_orchestrator/data/compose/docker-compose-my-new-stack.yml` add:
```yaml
version: "3.2"
services:
my-new-stack:
image: cerc/my-new-stack:local
restart: always
ports:
- "0.0.0.0:3000:3000"
```
- in `stack_orchestrator/data/repository-list.txt` add:
```bash
github.com/my-org/my-new-stack
```
whereby that repository contains your source code and a `Dockerfile`, and matches the `repos:` field in the `stack.yml`.
- in `stack_orchestrator/data/container-image-list.txt` add:
```bash
cerc/my-new-stack
```
- in `stack_orchestrator/data/pod-list.txt` add:
```bash
my-new-stack
```
Now, the following commands will fetch, build, and deploy your app:
```bash
laconic-so --stack my-new-stack setup-repositories
laconic-so --stack my-new-stack build-containers
laconic-so --stack my-new-stack deploy-system up
```

View File

@ -1,114 +0,0 @@
# Arista EOS Reference Notes
Collected from live switch CLI (`?` help) and Arista documentation search
results. Switch platform: 7280CR3A, EOS 4.34.0F.
## PBR (Policy-Based Routing)
EOS uses `policy-map type pbr` — NOT `traffic-policy` (which is a different
feature for ASIC-level traffic policies, not available on all platforms/modes).
### Syntax
```
! ACL to match traffic
ip access-list <ACL-NAME>
10 permit <proto> <src> <dst> [ports]
! Class-map referencing the ACL
class-map type pbr match-any <CLASS-NAME>
match ip access-group <ACL-NAME>
! Policy-map with nexthop redirect
policy-map type pbr <POLICY-NAME>
class <CLASS-NAME>
set nexthop <A.B.C.D> ! direct nexthop IP
set nexthop recursive <A.B.C.D> ! recursive resolution
! set nexthop-group <NAME> ! nexthop group
! set ttl <value> ! TTL override
! Apply on interface
interface <INTF>
service-policy type pbr input <POLICY-NAME>
```
### PBR `set` options (from CLI `?`)
```
set ?
nexthop Next hop IP address for forwarding
nexthop-group next hop group name
ttl TTL effective with nexthop/nexthop-group
```
```
set nexthop ?
A.B.C.D next hop IP address
A:B:C:D:E:F:G:H next hop IPv6 address
recursive Enable Recursive Next hop resolution
```
**No VRF qualifier on `set nexthop`.** The nexthop must be reachable in the
VRF where the policy is applied. For cross-VRF PBR, use a static inter-VRF
route to make the nexthop reachable (see below).
## Static Inter-VRF Routes
Source: [EOS 4.34.0F - Static Inter-VRF Route](https://www.arista.com/en/um-eos/eos-static-inter-vrf-route)
Allows configuring a static route in one VRF with a nexthop evaluated in a
different VRF. Uses the `egress-vrf` keyword.
### Syntax
```
ip route vrf <ingress-vrf> <prefix>/<mask> egress-vrf <egress-vrf> <nexthop-ip>
ip route vrf <ingress-vrf> <prefix>/<mask> egress-vrf <egress-vrf> <interface>
```
### Examples (from Arista docs)
```
! Route in vrf1 with nexthop resolved in default VRF
ip route vrf vrf1 1.0.1.0/24 egress-vrf default 1.0.0.2
! show ip route vrf vrf1 output:
! S 1.0.1.0/24 [1/0] via 1.0.0.2, Vlan2180 (egress VRF default)
```
### Key points
- For bidirectional traffic, static inter-VRF routes must be configured in
both VRFs.
- ECMP next-hop sets across same or heterogeneous egress VRFs are supported.
- The `show ip route vrf` output displays the egress VRF name when it differs
from the source VRF.
## Inter-VRF Local Route Leaking
Source: [EOS 4.35.1F - Inter-VRF Local Route Leaking](https://www.arista.com/en/um-eos/eos-inter-vrf-local-route-leaking)
An alternative to static inter-VRF routes that leaks routes dynamically from
one VRF (source) to another VRF (destination) on the same router.
## Config Sessions
```
configure session <name> ! enter named session
show session-config diffs ! MUST be run from inside the session
commit timer HH:MM:SS ! commit with auto-revert timer
abort ! discard session
```
From enable mode:
```
configure session <name> commit ! finalize a pending session
```
## Checkpoints and Rollback
```
configure checkpoint save <name>
rollback running-config checkpoint <name>
write memory
```

File diff suppressed because it is too large Load Diff

View File

@ -1,181 +0,0 @@
<!-- Source: https://www.arista.com/um-eos/eos-ingress-and-egress-per-port-for-ipv4-and-ipv6-counters -->
<!-- Scraped: 2026-03-06T20:50:41.080Z -->
# Ingress and Egress Per-Port for IPv4 and IPv6 Counters
This feature supports per-interface ingress and egress packet and byte counters for IPv4
and IPv6.
This section describes Ingress and Egress per-port for IPv4 and IPv6 counters, including
configuration instructions and command descriptions.
Topics covered by this chapter include:
- Configuration
- Show commands
- Dedicated ARP Entry for TX IPv4 and IPv6 Counters
- Considerations
## Configuration
IPv4 and IPv6 ingress counters (count **bridged and routed**
traffic, supported only on front-panel ports) can be enabled and disabled using the
**hardware counter feature ip in**
command:
```
`**[no] hardware counter feature ip in**`
```
For IPv4 and IPv6 ingress and egress counters that include only
**routed** traffic (supported on Layer3 interfaces such as
routed ports and L3 subinterfaces only), use the following commands:
Note: The DCS-7300X, DCS-7250X, DCS-7050X, and DCS-7060X platforms
do not require configuration for IPv4 and IPv6 packet counters for only routed
traffic. They are collected by default. Other platforms (DCS-7280SR, DCS-7280CR, and
DCS-7500-R) need the feature enabled.
```
`**[no] hardware counter feature ip in layer3**`
```
```
`**[no] hardware counter feature ip out layer3**`
```
### hardware counter feature ip
Use the **hardware counter feature ip** command to enable ingress
and egress counters at Layer 3. The **no** and **default** forms of the command
disable the feature. The feature is enabled by default.
**Command Mode**
Configuration mode
**Command Syntax**
**hardware counter feature ip in|out layer3**
**no hardware counter feature ip in|out layer3**
**default hardware counter feature in|out layer3**
**Example**
This example enables ingress and egress ip counters for Layer 3.
```
`**switch(config)# hardware counter feature in layer3**`
```
```
`**switch(config)# hardware counter feature out layer3**`
```
## Show commands
Use the [**show interfaces counters ip**](/um-eos/eos-ethernet-ports#xzx_RbdvgrfI6B) command to
display IPv4, IPv6 packets, and octets.
**Example**
```
`switch# **show interfaces counters ip**
Interface IPv4InOctets IPv4InPkts IPv6InOctets IPv6InPkts
Et1/1 0 0 0 0
Et1/2 0 0 0 0
Et1/3 0 0 0 0
Et1/4 0 0 0 0
...
Interface IPv4OutOctets IPv4OutPkts IPv6OutOctets IPv6OutPkts
Et1/1 0 0 0 0
Et1/2 0 0 0 0
Et1/3 0 0 0 0
Et1/4 0 0 0 0
...`
```
You can also query the output from the **show interfaces counters
ip** command through snmp via the ARISTA-IP-MIB.
To clear the IPv4 or IPv6 counters, use the [**clear
counters**](/um-eos/eos-ethernet-ports#topic_dnd_1nm_vnb) command.
**Example**
```
`switch# **clear counters**`
```
## Dedicated ARP Entry for TX IPv4 and IPv6 Counters
IPv4/IPv6 egress Layer 3 (**hardware counter feature ip out layer3**)
counting on DCS-7280SR, DCS-7280CR, and DCS-7500-R platforms work based on ARP entry of
the next hop. By default, IPv4's next-hop and IPv6's next-hop resolve to the same MAC
address and interface that shared the ARP entry.
To differentiate the counters between IPv4 and IPv6, disable
**arp** entry sharing with the following command:
```
`**ip hardware fib next-hop arp dedicated**`
```
Note: This command is required for IPv4 and IPv6 egress counters
to operate on the DCS-7280SR, DCS-7280CR, and DCS-7500-R platforms.
## Considerations
- Packet sizes greater than 9236 bytes are not counted by per-port IPv4 and IPv6 counters.
- Only the DCS-7260X3, DCS-7368, DCS-7300, DCS-7050SX3, DCS-7050CX3, DCS-7280SR,
DCS-7280CR and DCS-7500-R platforms support the **hardware counter feature ip in** command.
- Only the DCS-7280SR, DCS-7280CR and DCS-7500-R platforms support the **hardware counter feature ip [in|out] layer3** command.

View File

@ -1,305 +0,0 @@
<!-- Source: https://www.arista.com/en/um-eos/eos-inter-vrf-local-route-leaking -->
<!-- Scraped: 2026-03-06T20:43:28.363Z -->
# Inter-VRF Local Route Leaking
Inter-VRF local route leaking allows the leaking of routes from one VRF (the source VRF) to
another VRF (the destination VRF) on the same router.
Inter-VRF routes can exist in any VRF (including the
default VRF) on the system. Routes can be leaked using the
following methods:
- Inter-VRF Local Route Leaking using BGP
VPN
- Inter-VRF Local Route Leaking using VRF-leak
Agent
## Inter-VRF Local Route Leaking using BGP VPN
Inter-VRF local route leaking allows the user to export and import routes from one VRF to another
on the same device. This is implemented by exporting routes from a VRF to the local VPN table
using the route target extended community list and importing the same route target extended
community lists from the local VPN table into the target VRF. VRF route leaking is supported
on VPN-IPv4, VPN-IPv6, and EVPN types.
Figure 1. Inter-VRF Local Route Leaking using Local VPN Table
### Accessing Shared Resources Across VPNs
To access shared resources across VPNs, all the routes from the shared services VRF must be
leaked into each of the VPN VRFs, and customer routes must be leaked into the shared
services VRF for return traffic. Accessing shared resources allows the route target of the
shared services VRF to be exported into all customer VRFs, and allows the shared services
VRF to import route targets from customers A and B. The following figure shows how to
provide customers, corresponding to multiple VPN domains, access to services like DHCP
available in the shared VRF.
Route leaking across the VRFs is supported
on VPN-IPv4, VPN-IPv6, and EVPN.
Figure 2. Accessing Shared Resources Across VPNs
### Configuring Inter-VRF Local Route Leaking
Inter-VRF local route leaking is configured using VPN-IPv4, VPN-IPv6, and EVPN. Prefixes can be
exported and imported using any of the configured VPN types. Ensure that the same VPN
type that is exported is used while importing.
Leaking unicast IPv4 or IPv6 prefixes is supported and achieved by exporting prefixes locally to
the VPN table and importing locally from the VPN table into the target VRF on the same
device as shown in the figure titled **Inter-VRF Local Route Leaking using Local VPN
Table** using the **route-target** command.
Exporting or importing the routes to or from the EVPN table is accomplished with the following
two methods:
- Using VXLAN for encapsulation
- Using MPLS for encapsulation
#### Using VXLAN for Encapsulation
To use VXLAN encapsulation type, make sure that VRF to VNI mapping is present and the interface
status for the VXLAN interface is up. This is the default encapsulation type for
EVPN.
**Example**
The configuration for VXLAN encapsulation type is as
follows:
```
`switch(config)# **router bgp 65001**
switch(config-router-bgp)# **address-family evpn**
switch(config-router-bgp-af)# **neighbor default encapsulation VXLAN next-hop-self source-interface Loopback0**
switch(config)# **hardware tcam**
switch(config-hw-tcam)# **system profile VXLAN-routing**
switch(config-hw-tcam)# **interface VXLAN1**
switch(config-hw-tcam-if-Vx1)# **VXLAN source-interface Loopback0**
switch(config-hw-tcam-if-Vx1)# **VXLAN udp-port 4789**
switch(config-hw-tcam-if-Vx1)# **VXLAN vrf vrf-blue vni 20001**
switch(config-hw-tcam-if-Vx1)# **VXLAN vrf vrf-red vni 10001**`
```
#### Using MPLS for Encapsulation
To use MPLS encapsulation type to export
to the EVPN table, MPLS needs to be enabled globally on the device and
the encapsulation method needs to be changed from default type, that
is VXLAN to MPLS under the EVPN address-family sub-mode.
**Example**
```
`switch(config)# **router bgp 65001**
switch(config-router-bgp)# **address-family evpn**
switch(config-router-bgp-af)# **neighbor default encapsulation mpls next-hop-self source-interface Loopback0**`
```
### Route-Distinguisher
Route-Distinguisher (RD) uniquely identifies routes from a particular VRF.
Route-Distinguisher is configured for every VRF from which routes are exported from or
imported into.
The following commands are used to configure Route-Distinguisher for a VRF.
```
`switch(config-router-bgp)# **vrf vrf-services**
switch(config-router-bgp-vrf-vrf-services)# **rd 1.0.0.1:1**
switch(config-router-bgp)# **vrf vrf-blue**
switch(config-router-bgp-vrf-vrf-blue)# **rd 2.0.0.1:2**`
```
### Exporting Routes from a VRF
Use the **route-target export** command to export routes from a VRF to the
local VPN or EVPN table using the route target
extended community list.
**Examples**
- These commands export routes from
**vrf-red** to the local VPN
table.
```
`switch(config)# **service routing protocols model multi-agent**
switch(config)# **mpls ip**
switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-red**
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv4 10:10**
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv6 10:20**`
```
- These commands export routes from
**vrf-red** to the EVPN
table.
```
`switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-red**
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
switch(config-router-bgp-vrf-vrf-red)# **route-target export evpn 10:1**`
```
### Importing Routes into a VRF
Use the **route-target import** command to import the exported routes from
the local VPN or EVPN table to the target VRF
using the route target extended community
list.
**Examples**
- These commands import routes from the VPN
table to
**vrf-blue**.
```
`switch(config)# **service routing protocols model multi-agent**
switch(config)# **mpls ip**
switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-blue**
switch(config-router-bgp-vrf-vrf-blue)# **rd 2:2**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv4 10:10**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv6 10:20**`
```
- These commands import routes from the EVPN
table to
**vrf-blue**.
```
`switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-blue**
switch(config-router-bgp-vrf-vrf-blue)# **rd 2:2**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import evpn 10:1**`
```
### Exporting and Importing Routes using Route
Map
To manage VRF route leaking, control the export and import prefixes with route-map export or
import commands. The route map is effective only if the VRF or the VPN
paths are already candidates for export or import. The route-target
export or import command must be configured first. Setting BGP
attributes using route maps is effective only on the export end.
Note: Prefixes that are leaked are not re-exported to the VPN table from the target VRF.
**Examples**
- These commands export routes from
**vrf-red** to the local VPN
table.
```
`switch(config)# **service routing protocols model multi-agent**
switch(config)# **mpls ip**
switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-red**
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv4 10:10**
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv6 10:20**
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv4 route-map EXPORT_V4_ROUTES_T0_VPN_TABLE**
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv6 route-map EXPORT_V6_ROUTES_T0_VPN_TABLE**`
```
- These commands export routes from
**vrf-red** to the EVPN
table.
```
`switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-red**
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
switch(config-router-bgp-vrf-vrf-red)# **route-target export evpn 10:1**
switch(config-router-bgp-vrf-vrf-red)# **route-target export evpn route-map EXPORT_ROUTES_T0_EVPN_TABLE**`
```
- These commands import routes from the VPN table to
**vrf-blue**.
```
`switch(config)# **service routing protocols model multi-agent**
switch(config)# **mpls ip**
switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-blue**
switch(config-router-bgp-vrf-vrf-blue)# **rd 1:1**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv4 10:10**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv6 10:20**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv4 route-map IMPORT_V4_ROUTES_VPN_TABLE**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv6 route-map IMPORT_V6_ROUTES_VPN_TABLE**`
```
- These commands import routes from the EVPN table to
**vrf-blue**.
```
`switch(config)# **router bgp 65001**
switch(config-router-bgp)# **vrf vrf-blue**
switch(config-router-bgp-vrf-vrf-blue)# **rd 2:2**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import evpn 10:1**
switch(config-router-bgp-vrf-vrf-blue)# **route-target import evpn route-map IMPORT_ROUTES_FROM_EVPN_TABLE**`
```
## Inter-VRF Local Route Leaking using VRF-leak
Agent
Inter-VRF local route leaking allows routes to leak from one VRF to another using a route
map as a VRF-leak agent. VRFs are leaked based on the preferences assigned to each
VRF.
### Configuring Route Maps
To leak routes from one VRF to another using a route map, use the [router general](/um-eos/eos-evpn-and-vcs-commands#xx1351777) command to enter Router-General
Configuration Mode, then enter the VRF submode for the destination VRF, and use the
[leak routes](/um-eos/eos-evpn-and-vcs-commands#reference_g2h_2z3_hwb) command to specify the source
VRF and the route map to be used. Routes in the source VRF that match the policy in the
route map will then be considered for leaking into the configuration-mode VRF. If two or
more policies specify leaking the same prefix to the same destination VRF, the route
with a higher (post-set-clause) distance and preference is chosen.
**Example**
These commands configure a route map to leak routes from **VRF1**
to **VRF2** using route map
**RM1**.
```
`switch(config)# **router general**
switch(config-router-general)# **vrf VRF2**
switch(config-router-general-vrf-VRF2)# **leak routes source-vrf VRF1 subscribe-policy RM1**
switch(config-router-general-vrf-VRF2)#`
```

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,82 +0,0 @@
<!-- Source: https://www.arista.com/en/um-eos/eos-static-inter-vrf-route -->
<!-- Scraped: 2026-03-06T20:43:17.977Z -->
# Static Inter-VRF Route
The Static Inter-VRF Route feature adds support for static inter-VRF routes. This enables the configuration of routes to destinations in one ingress VRF with an ability to specify a next-hop in a different egress VRF through a static configuration.
You can configure static inter-VRF routes in default and non-default VRFs. A different
egress VRF is achieved by “tagging” the **next-hop** or **forwarding
via** with a reference to an egress VRF (different from the source
VRF) in which that next-hop should be evaluated. Static inter-VRF routes
with ECMP next-hop sets in the same egress VRF or heterogenous egress VRFs
can be specified.
The Static Inter-VRF Route feature is independent and complementary to other mechanisms that can be used to setup local inter-VRF routes. The other supported mechanisms in EOS and the broader use-cases they support are documented here:
- [Inter-VRF Local Route Leaking using BGP VPN](/um-eos/eos-inter-vrf-local-route-leaking#xx1348142)
- [Inter-VRF Local Route Leaking using VRF-leak Agent](/um-eos/eos-inter-vrf-local-route-leaking#xx1346287)
## Configuration
The configuration to setup static-Inter VRF routes in an ingress (source) VRF to forward IP traffic to a different egress (target) VRF can be done in the following modes:
- This command creates a static route in one ingress VRF that points to a next-hop
in a different egress VRF.
ip | ipv6
route [vrf
vrf-name
destination-prefix [egress-vrf
egress-next-hop-vrf-name]
next-hop]
## Show Commands
Use the **show ip route vrf** to display the egress VRF name if it
differs from the source VRF.
**Example**
```
`switch# **show ip route vrf vrf1**
VRF: vrf1
Codes: C - connected, S - static, K - kernel,
O - OSPF, IA - OSPF inter area, E1 - OSPF external type 1,
E2 - OSPF external type 2, N1 - OSPF NSSA external type 1,
N2 - OSPF NSSA external type2, B - BGP, B I - iBGP, B E - eBGP,
R - RIP, I L1 - IS-IS level 1, I L2 - IS-IS level 2,
O3 - OSPFv3, A B - BGP Aggregate, A O - OSPF Summary,
NG - Nexthop Group Static Route, V - VXLAN Control Service,
DH - DHCP client installed default route, M - Martian,
DP - Dynamic Policy Route, L - VRF Leaked
Gateway of last resort is not set
S 1.0.1.0/24 [1/0] via 1.0.0.2, Vlan2180 (egress VRF default)
S 1.0.7.0/24 [1/0] via 1.0.6.2, Vlan2507 (egress VRF vrf3)`
```
## Limitations
- For bidirectional traffic to work correctly between a pair of VRFs, static inter-VRF
routes in both VRFs must be configured.
- Static Inter-VRF routing is supported only in multi-agent routing protocol mode.

File diff suppressed because it is too large Load Diff

View File

@ -1,168 +0,0 @@
# Ashburn Relay / ip_echo Port Reachability Checklist
The validator exits when it can't verify UDP ports (8001, 9000, 9002, 9003) are
reachable from entrypoint servers. The ip_echo protocol: validator TCP-connects
to entrypoint on port 8001, entrypoint sees source IP, sends UDP probes back to
that IP on the validator's ports. If probes don't arrive, validator crashes.
## Layer 1: Biscayne outbound path
Validator's outbound ip_echo TCP (dport 8001) must exit via GRE tunnel so
entrypoints see `137.239.194.65`, not biscayne's real IP via Docker MASQUERADE.
```
[ ] 1.1 Mangle rules (4 rules in mangle PREROUTING):
- udp sport 8001 (gossip outbound)
- udp sport 9000:9025 (TVU/repair outbound)
- tcp sport 8001 (gossip TCP outbound)
- tcp dport 8001 (ip_echo outbound — THE CRITICAL ONE)
[ ] 1.2 SNAT rule at position 1 (before Docker MASQUERADE):
POSTROUTING -m mark --mark 100 -j SNAT --to-source 137.239.194.65
[ ] 1.3 Policy routing rule:
fwmark 0x64 lookup ashburn
[ ] 1.4 Ashburn routing table default route:
default via 169.254.100.0 dev gre-ashburn
[ ] 1.5 Mangle counters incrementing (pkts/bytes on tcp dport 8001 rule)
```
## Layer 2: GRE tunnel (biscayne ↔ mia-sw01)
```
[ ] 2.1 Tunnel exists and UP:
gre-ashburn with 169.254.100.1/31
[ ] 2.2 Tunnel peer reachable:
ping 169.254.100.0
[ ] 2.3 Ashburn IP on loopback:
137.239.194.65/32 dev lo
```
## Layer 3: Biscayne inbound path (DNAT + DOCKER-USER)
Entrypoint UDP probes arrive at `137.239.194.65` and must reach kind node
`172.20.0.2`.
```
[ ] 3.1 DNAT rules at position 1 in nat PREROUTING
(before Docker's ADDRTYPE LOCAL rule):
- udp dport 8001 → 172.20.0.2:8001
- tcp dport 8001 → 172.20.0.2:8001
- udp dport 9000:9025 → 172.20.0.2
[ ] 3.2 DOCKER-USER ACCEPT rules (3 rules):
- udp dport 8001 → ACCEPT
- tcp dport 8001 → ACCEPT
- udp dport 9000:9025 → ACCEPT
[ ] 3.3 DNAT counters incrementing
```
## Layer 4: mia-sw01
```
[ ] 4.1 Tunnel100 UP in VRF relay
src 209.42.167.137, dst 186.233.184.235, link 169.254.100.0/31
[ ] 4.2 VRF relay default route:
0.0.0.0/0 egress-vrf default 172.16.1.188
[ ] 4.3 Default VRF route to relay IP:
137.239.194.65/32 egress-vrf relay 169.254.100.1
[ ] 4.4 ACL SEC-VALIDATOR-100-IN permits all needed traffic
[ ] 4.5 Backbone Et4/1 UP (172.16.1.189/31)
```
## Layer 5: was-sw01
```
[ ] 5.1 Static route: 137.239.194.65/32 via 172.16.1.189
[ ] 5.2 Backbone Et4/1 UP (172.16.1.188/31)
[ ] 5.3 No Loopback101 (removed to avoid absorbing traffic locally)
```
## Layer 6: Persistence
```
[ ] 6.1 ashburn-relay.service enabled and active (runs After=docker.service)
[ ] 6.2 /usr/local/sbin/ashburn-relay-setup.sh exists
```
## Layer 7: End-to-end tests
All tests run via Ansible playbooks. The test scripts in `scripts/` are
utilities invoked by the playbooks — never run them manually via SSH.
```
[ ] 7.1 relay-test-tcp-dport.py (via ashburn-relay-check.yml or ad-hoc play)
Tests: outbound tcp dport 8001 mangle → SNAT → tunnel
Pass: entrypoint sees 137.239.194.65
Fail: entrypoint sees 186.233.184.235 (Docker MASQUERADE)
[ ] 7.2 relay-test-ip-echo.py (via ashburn-relay-check.yml or ad-hoc play)
Tests: FULL END-TO-END (outbound SNAT + inbound DNAT + DOCKER-USER)
Pass: UDP probe received from entrypoint
Fail: no UDP probes — inbound path broken
[ ] 7.3 relay-inbound-udp-test.yml (cross-inventory: biscayne + kelce)
Tests: inbound UDP from external host → DNAT → kind node
Pass: UDP arrives in kind netns
```
## Playbooks
```bash
# Read-only check of all relay state (biscayne + both switches):
ansible-playbook -i inventory-switches/switches.yml \
-i inventory/biscayne.yml playbooks/ashburn-relay-check.yml
# Apply all biscayne relay rules (idempotent):
ansible-playbook -i inventory/biscayne.yml playbooks/ashburn-relay-biscayne.yml
# Apply outbound only (the ip_echo fix):
ansible-playbook -i inventory/biscayne.yml \
playbooks/ashburn-relay-biscayne.yml -t outbound
# Apply inbound only (DNAT + DOCKER-USER):
ansible-playbook -i inventory/biscayne.yml \
playbooks/ashburn-relay-biscayne.yml -t inbound
# Apply mia-sw01 config:
ansible-playbook -i inventory-switches/switches.yml \
playbooks/ashburn-relay-mia-sw01.yml
# Apply was-sw01 config:
ansible-playbook -i inventory-switches/switches.yml \
playbooks/ashburn-relay-was-sw01.yml
# Cross-inventory inbound UDP test (biscayne + kelce):
ansible-playbook -i inventory/biscayne.yml -i inventory/kelce.yml \
playbooks/relay-inbound-udp-test.yml
```
## Historical root causes
1. **TCP dport 8001 mangle rule missing** — ip_echo TCP exits via Docker
MASQUERADE, entrypoint sees wrong IP, UDP probes go to wrong address.
2. **DOCKER-USER ACCEPT rules missing** — DNAT'd traffic hits Docker's FORWARD
DROP policy, never reaches kind node.
3. **DNAT rule position wrong** — Docker's `ADDRTYPE LOCAL` rule in PREROUTING
catches traffic to loopback IPs before our DNAT rules. Must use `-I
PREROUTING 1`.
4. **mia-sw01 egress-vrf route with interface specified** — silently fails in
EOS (accepted in config, never installed in RIB). Must use nexthop-only form.
5. **was-sw01 Loopback101 absorbing traffic** — local delivery instead of
forwarding to mia-sw01 via backbone.

View File

@ -1,275 +0,0 @@
# Ashburn Validator Relay — Full Traffic Redirect
## Overview
All validator traffic (gossip, repair, TVU, TPU) enters and exits from
`137.239.194.65` (laconic-was-sw01, Ashburn). Peers see the validator as an
Ashburn node. This improves repair peer count and slot catchup rate by reducing
RTT to the TeraSwitch/Pittsburgh cluster from ~30ms (direct Miami) to ~5ms
(Ashburn).
Supersedes the previous TVU-only shred relay (see `tvu-shred-relay.md`).
## Architecture
```
OUTBOUND (validator → peers)
agave-validator (kind pod, ports 8001, 9000-9025)
↓ Docker bridge → host FORWARD chain
biscayne host (186.233.184.235)
↓ mangle PREROUTING: fwmark 100 on sport 8001,9000-9025 from 172.20.0.0/16
↓ nat POSTROUTING: SNAT → src 137.239.194.65
↓ policy route: fwmark 100 → table ashburn → via 169.254.7.6 dev doublezero0
laconic-mia-sw01 (209.42.167.133, Miami)
↓ traffic-policy VALIDATOR-OUTBOUND: src 137.239.194.65 → nexthop 172.16.1.188
↓ backbone Et4/1 (25.4ms)
laconic-was-sw01 Et4/1 (Ashburn)
↓ default route via 64.92.84.80 out Et1/1
Internet (peers see src 137.239.194.65)
INBOUND (peers → validator)
Solana peers → 137.239.194.65:8001,9000-9025
↓ internet routing to was-sw01
laconic-was-sw01 Et1/1 (Ashburn)
↓ traffic-policy VALIDATOR-RELAY: ASIC redirect, line rate
↓ nexthop 172.16.1.189 via Et4/1 backbone (25.4ms)
laconic-mia-sw01 Et4/1 (Miami)
↓ L3 forward → biscayne via doublezero0 GRE or ISP routing
biscayne (186.233.184.235)
↓ nat PREROUTING: DNAT dst 137.239.194.65:* → 172.20.0.2:* (kind node)
↓ Docker bridge → validator pod
agave-validator
```
RPC traffic (port 8899) is NOT relayed — clients connect directly to biscayne.
## Switch Config: laconic-was-sw01
SSH: `install@137.239.200.198`
### Pre-change
```
configure checkpoint save pre-validator-relay
```
Rollback: `rollback running-config checkpoint pre-validator-relay` then `write memory`.
### Config session with auto-revert
```
configure session validator-relay
! Loopback for 137.239.194.65 (do NOT touch Loopback100 which has .64)
interface Loopback101
ip address 137.239.194.65/32
! ACL covering all validator ports
ip access-list VALIDATOR-RELAY-ACL
10 permit udp any any eq 8001
20 permit udp any any range 9000 9025
30 permit tcp any any eq 8001
! Traffic-policy: ASIC redirect to backbone (mia-sw01)
traffic-policy VALIDATOR-RELAY
match VALIDATOR-RELAY-ACL
set nexthop 172.16.1.189
! Replace old SHRED-RELAY on Et1/1
interface Ethernet1/1
no traffic-policy input SHRED-RELAY
traffic-policy input VALIDATOR-RELAY
! system-rule overriding-action redirect (already present from SHRED-RELAY)
show session-config diffs
commit timer 00:05:00
```
After verification: `configure session validator-relay commit` then `write memory`.
### Cleanup (after stable)
Old SHRED-RELAY policy and ACL can be removed once VALIDATOR-RELAY is confirmed:
```
configure session cleanup-shred-relay
no traffic-policy SHRED-RELAY
no ip access-list SHRED-RELAY-ACL
show session-config diffs
commit
write memory
```
## Switch Config: laconic-mia-sw01
### Pre-flight checks
Before applying config, verify:
1. Which EOS interface terminates the doublezero0 GRE from biscayne
(endpoint 209.42.167.133). Check with `show interfaces tunnel` or
`show ip interface brief | include Tunnel`.
2. Whether `system-rule overriding-action redirect` is already configured.
Check with `show running-config | include system-rule`.
3. Whether EOS traffic-policy works on tunnel interfaces. If not, apply on
the physical interface where GRE packets arrive (likely Et<X> facing
biscayne's ISP network or the DZ infrastructure).
### Config session
```
configure checkpoint save pre-validator-outbound
configure session validator-outbound
! ACL matching outbound validator traffic (source = Ashburn IP)
ip access-list VALIDATOR-OUTBOUND-ACL
10 permit ip 137.239.194.65/32 any
! Redirect to was-sw01 via backbone
traffic-policy VALIDATOR-OUTBOUND
match VALIDATOR-OUTBOUND-ACL
set nexthop 172.16.1.188
! Apply on the interface where biscayne GRE traffic arrives
! Replace Tunnel<X> with the actual interface from pre-flight check #1
interface Tunnel<X>
traffic-policy input VALIDATOR-OUTBOUND
! Add system-rule if not already present (pre-flight check #2)
system-rule overriding-action redirect
show session-config diffs
commit timer 00:05:00
```
After verification: commit + `write memory`.
## Host Config: biscayne
Automated via ansible playbook `playbooks/ashburn-validator-relay.yml`.
### Manual equivalent
```bash
# 1. Accept packets destined for 137.239.194.65
sudo ip addr add 137.239.194.65/32 dev lo
# 2. Inbound DNAT to kind node (172.20.0.2)
sudo iptables -t nat -A PREROUTING -p udp -d 137.239.194.65 --dport 8001 \
-j DNAT --to-destination 172.20.0.2:8001
sudo iptables -t nat -A PREROUTING -p tcp -d 137.239.194.65 --dport 8001 \
-j DNAT --to-destination 172.20.0.2:8001
sudo iptables -t nat -A PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 \
-j DNAT --to-destination 172.20.0.2
# 3. Outbound: mark validator traffic
sudo iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 \
-j MARK --set-mark 100
sudo iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 \
-j MARK --set-mark 100
sudo iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 \
-j MARK --set-mark 100
# 4. Outbound: SNAT to Ashburn IP (INSERT before Docker MASQUERADE)
sudo iptables -t nat -I POSTROUTING 1 -m mark --mark 100 \
-j SNAT --to-source 137.239.194.65
# 5. Policy routing table
echo "100 ashburn" | sudo tee -a /etc/iproute2/rt_tables
sudo ip rule add fwmark 100 table ashburn
sudo ip route add default via 169.254.7.6 dev doublezero0 table ashburn
# 6. Persist
sudo netfilter-persistent save
# ip rule + ip route persist via /etc/network/if-up.d/ashburn-routing
```
### Docker NAT port preservation
**Must verify before going live:** Docker masquerade must preserve source ports
for kind's hostNetwork pods. If Docker rewrites the source port, the mangle
PREROUTING matches on `--sport 8001` and `--sport 9000:9025` will miss traffic.
Test: `tcpdump -i br-cf46a62ab5b2 -nn 'udp src port 8001'` — if you see
packets with sport 8001 from 172.20.0.2, port preservation works.
If Docker does NOT preserve ports, the mark must be set inside the kind node
container (on the pod's veth) rather than on the host.
## Execution Order
1. **was-sw01**: checkpoint → config session with 5min auto-revert → verify counters → commit
2. **biscayne**: add 137.239.194.65/32 to lo, add inbound DNAT rules
3. **Verify inbound**: `ping 137.239.194.65` from external host, check DNAT counters
4. **mia-sw01**: pre-flight checks → config session with 5min auto-revert → commit
5. **biscayne**: add outbound fwmark + policy routing + SNAT rules
6. **Test outbound**: from biscayne, send UDP from port 8001, verify src 137.239.194.65 on was-sw01
7. **Verify**: traffic-policy counters on both switches, iptables hit counts on biscayne
8. **Restart validator** if needed (gossip should auto-refresh, but restart ensures clean state)
9. **was-sw01 + mia-sw01**: `write memory` to persist
10. **Cleanup**: remove old SHRED-RELAY and the 64.92.84.81:20000 DNAT once the new path is confirmed stable
## Verification
1. `show traffic-policy counters` on was-sw01 — VALIDATOR-RELAY-ACL matches
2. `show traffic-policy counters` on mia-sw01 — VALIDATOR-OUTBOUND-ACL matches
3. `sudo iptables -t nat -L -v -n` on biscayne — DNAT and SNAT hit counts
4. `sudo iptables -t mangle -L -v -n` on biscayne — fwmark hit counts
5. `ip rule show` on biscayne — fwmark 100 lookup ashburn
6. Validator gossip ContactInfo shows 137.239.194.65 for ALL addresses (gossip, repair, TVU, TPU)
7. Repair peer count increases (target: 20+ peers)
8. Slot catchup rate improves from ~0.9 toward ~2.5 slots/sec
9. `traceroute --sport=8001 <remote_peer>` from biscayne routes via doublezero0/was-sw01
## Rollback
### biscayne
```bash
sudo ip addr del 137.239.194.65/32 dev lo
sudo iptables -t nat -D PREROUTING -p udp -d 137.239.194.65 --dport 8001 -j DNAT --to-destination 172.20.0.2:8001
sudo iptables -t nat -D PREROUTING -p tcp -d 137.239.194.65 --dport 8001 -j DNAT --to-destination 172.20.0.2:8001
sudo iptables -t nat -D PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 -j DNAT --to-destination 172.20.0.2
sudo iptables -t mangle -D PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 -j MARK --set-mark 100
sudo iptables -t mangle -D PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 -j MARK --set-mark 100
sudo iptables -t mangle -D PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 -j MARK --set-mark 100
sudo iptables -t nat -D POSTROUTING -m mark --mark 100 -j SNAT --to-source 137.239.194.65
sudo ip rule del fwmark 100 table ashburn
sudo ip route del default table ashburn
sudo netfilter-persistent save
```
### was-sw01
```
rollback running-config checkpoint pre-validator-relay
write memory
```
### mia-sw01
```
rollback running-config checkpoint pre-validator-outbound
write memory
```
## Key Details
| Item | Value |
|------|-------|
| Ashburn relay IP | `137.239.194.65` (Loopback101 on was-sw01) |
| Ashburn LAN block | `137.239.194.64/29` on was-sw01 Et1/1 |
| Biscayne IP | `186.233.184.235` |
| Kind node IP | `172.20.0.2` (Docker bridge br-cf46a62ab5b2) |
| Validator ports | 8001 (gossip), 9000-9025 (TVU/repair/TPU) |
| Excluded ports | 8899 (RPC), 8900 (WebSocket) — direct to biscayne |
| GRE tunnel | doublezero0: 169.254.7.7 ↔ 169.254.7.6, remote 209.42.167.133 |
| Backbone | was-sw01 Et4/1 172.16.1.188/31 ↔ mia-sw01 Et4/1 172.16.1.189/31 |
| Policy routing table | 100 ashburn |
| Fwmark | 100 |
| was-sw01 SSH | `install@137.239.200.198` |
| EOS version | 4.34.0F |

View File

@ -1,416 +0,0 @@
# Blue-Green Upgrades for Biscayne
Zero-downtime upgrade procedures for the agave-stack deployment on biscayne.
Uses ZFS clones for instant data duplication, Caddy health-check routing for
traffic shifting, and k8s native sidecars for independent container upgrades.
## Architecture
```
Caddy ingress (biscayne.vaasl.io)
├── upstream A: localhost:8899 ← health: /health
└── upstream B: localhost:8897 ← health: /health
┌─────────────────┴──────────────────┐
│ kind cluster │
│ │
│ Deployment A Deployment B │
│ ┌─────────────┐ ┌─────────────┐ │
│ │ agave :8899 │ │ agave :8897 │ │
│ │ doublezerod │ │ doublezerod │ │
│ └──────┬──────┘ └──────┬──────┘ │
└─────────┼─────────────────┼─────────┘
│ │
ZFS dataset A ZFS clone B
(original) (instant CoW copy)
```
Both deployments run in the same kind cluster with `hostNetwork: true`.
Caddy active health checks route traffic to whichever deployment has a
healthy `/health` endpoint.
## Storage Layout
| Data | Path | Type | Survives restart? |
|------|------|------|-------------------|
| Ledger | `/srv/solana/ledger` | ZFS zvol (xfs) | Yes |
| Snapshots | `/srv/solana/snapshots` | ZFS zvol (xfs) | Yes |
| Accounts | `/srv/solana/ramdisk/accounts` | `/dev/ram0` (xfs) | Until host reboot |
| Validator config | `/srv/deployments/agave/data/validator-config` | ZFS | Yes |
| DZ config | `/srv/deployments/agave/data/doublezero-config` | ZFS | Yes |
The ZFS zvol `biscayne/DATA/volumes/solana` backs `/srv/solana` (ledger, snapshots).
The ramdisk at `/dev/ram0` holds accounts — it's a host RAM-backed block device
(not pod-scoped storage like a tmpfs emptyDir), so its contents survive container
and process restarts but are lost when the host reboots.
---
## Procedure 1: DoubleZero Binary Upgrade (zero downtime, single pod)
The GRE tunnel (`doublezero0`) and BGP routes live in kernel space. They persist
across doublezerod process restarts. Upgrading the DZ binary does not require
tearing down the tunnel or restarting the validator.
### Prerequisites
- doublezerod is defined as a k8s native sidecar (`spec.initContainers` with
`restartPolicy: Always`). See [Required Changes](#required-changes) below.
- k8s 1.29+ (biscayne runs 1.35.1)
### Steps
1. Build or pull the new doublezero container image.
2. Patch the pod's sidecar image:
```bash
kubectl -n <ns> patch pod <pod> --type='json' -p='[
{"op": "replace", "path": "/spec/initContainers/0/image",
"value": "laconicnetwork/doublezero:new-version"}
]'
```
3. Only the doublezerod container restarts. The agave container is unaffected.
The GRE tunnel interface and BGP routes remain in the kernel throughout.
4. Verify:
```bash
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero --version
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero status
ip route | grep doublezero0 # routes still present
```
### Rollback
Patch the image back to the previous version. Same process, same zero downtime.
---
## Procedure 2: Agave Version Upgrade (zero RPC downtime, blue-green)
Agave is the main container and must be restarted for a version change. To maintain
zero RPC downtime, we run two deployments simultaneously and let Caddy shift traffic
based on health checks.
### Prerequisites
- Caddy ingress configured with dual upstreams and active health checks
- A parameterized spec.yml that accepts alternate ports and volume paths
- ZFS snapshot/clone scripts
### Steps
#### Phase 1: Prepare (no downtime, no risk)
1. **ZFS snapshot** for rollback safety:
```bash
zfs snapshot -r biscayne/DATA@pre-upgrade-$(date +%Y%m%d)
```
2. **ZFS clone** the validator volumes:
```bash
zfs clone biscayne/DATA/volumes/solana@pre-upgrade-$(date +%Y%m%d) \
biscayne/DATA/volumes/solana-blue
```
This is instant (copy-on-write). No additional storage until writes diverge.
3. **Clone the ramdisk accounts** (not on ZFS):
```bash
mkdir -p /srv/solana-blue/ramdisk/accounts
cp -a /srv/solana/ramdisk/accounts/* /srv/solana-blue/ramdisk/accounts/
```
This is the slow step — 460GB on ramdisk. Consider `rsync` with `--inplace`
to minimize copy time, or investigate whether the ramdisk can move to a ZFS
dataset for instant cloning in future deployments.
4. **Build or pull** the new agave container image.
#### Phase 2: Start blue deployment (no downtime)
5. **Create Deployment B** in the same kind cluster, pointing at cloned volumes,
with RPC on port 8897:
```bash
# Apply the blue deployment manifest (parameterized spec)
kubectl apply -f deployment/k8s-manifests/agave-blue.yaml
```
6. **Deployment B catches up.** It starts from the snapshot point and replays.
Monitor progress:
```bash
kubectl -n <ns> exec <blue-pod> -c agave-validator -- \
solana -u http://127.0.0.1:8897 slot
```
7. **Validate** the new version works:
- RPC responds: `curl -sf http://localhost:8897/health`
- Correct version: `kubectl -n <ns> exec <blue-pod> -c agave-validator -- agave-validator --version`
- doublezerod connected (if applicable)
Take as long as needed. Deployment A is still serving all traffic.
#### Phase 3: Traffic shift (zero downtime)
8. **Shift Caddy traffic to B.** Note: with `lb_policy first`, Caddy keeps
   routing to A for as long as A's `/health` passes, even after B becomes
   healthy. To shift traffic, either reorder the upstreams so :8897 is listed
   first, or take A out of rotation (stop it or fail its health check).
9. **Verify** B is serving live traffic:
```bash
curl -sf https://biscayne.vaasl.io/health
# Check Caddy access logs for requests hitting port 8897
```
#### Phase 4: Cleanup
10. **Stop Deployment A:**
```bash
kubectl -n <ns> delete deployment agave-green
```
11. **Reconfigure B to use standard port** (8899) if desired, or update Caddy
to only route to 8897.
12. **Clean up ZFS clone** (or keep as rollback):
```bash
zfs destroy biscayne/DATA/volumes/solana-blue
```
### Rollback
At any point before Phase 4:
- Deployment A is untouched and still serving traffic (or can be restarted)
- Delete Deployment B: `kubectl -n <ns> delete deployment agave-blue`
- Destroy the ZFS clone: `zfs destroy biscayne/DATA/volumes/solana-blue`
After Phase 4 (A already stopped):
- `zfs rollback` to restore original data
- Redeploy A with old image
---
## Required Changes to agave-stack
### 1. Move doublezerod to native sidecar
In the pod spec generation (laconic-so or compose override), doublezerod must be
defined as a native sidecar container instead of a regular container:
```yaml
spec:
initContainers:
- name: doublezerod
image: laconicnetwork/doublezero:local
restartPolicy: Always # makes it a native sidecar
securityContext:
privileged: true
capabilities:
add: [NET_ADMIN]
env:
- name: DOUBLEZERO_RPC_ENDPOINT
value: https://api.mainnet-beta.solana.com
volumeMounts:
- name: doublezero-config
mountPath: /root/.config/doublezero
containers:
- name: agave-validator
image: laconicnetwork/agave:local
# ... existing config
```
This change means:
- doublezerod starts before agave and stays running
- Patching the doublezerod image restarts only that container
- agave can be restarted independently without affecting doublezerod
This requires a laconic-so change to support `initContainers` with `restartPolicy`
in compose-to-k8s translation — or a post-deployment patch.
### 2. Caddy dual-upstream config
Add health-checked upstreams for both blue and green deployments:
```caddyfile
biscayne.vaasl.io {
reverse_proxy {
to localhost:8899 localhost:8897
health_uri /health
health_interval 5s
health_timeout 3s
lb_policy first
}
}
```
`lb_policy first` routes to the first healthy upstream in listed order. When only
A is running, all traffic goes to :8899. While both upstreams are healthy, A
(:8899, listed first) keeps serving; traffic shifts to B only when A goes down,
fails its health check, or the upstream order is changed to prefer :8897.
### 3. Parameterized deployment spec
Create a parameterized spec or kustomize overlay that accepts:
- RPC port (8899 vs 8897)
- Volume paths (original vs ZFS clone)
- Deployment name suffix (green vs blue)
### 4. Delete DaemonSet workaround
Remove `deployment/k8s-manifests/doublezero-daemonset.yaml` from agave-stack.
### 5. Fix container DZ identity
Copy the registered identity into the container volume:
```bash
sudo cp /home/solana/.config/doublezero/id.json \
/srv/deployments/agave/data/doublezero-config/id.json
```
### 6. Disable host systemd doublezerod
After the container sidecar is working:
```bash
sudo systemctl stop doublezerod
sudo systemctl disable doublezerod
```
---
## Implementation Order
This is a spec-driven, test-driven plan. Each step produces a testable artifact.
### Step 1: Fix existing DZ bugs (no code changes to laconic-so)
Fixes BUG-1 through BUG-5 from [doublezero-status.md](doublezero-status.md).
**Spec:** Container doublezerod shows correct identity, connects to laconic-mia-sw01,
host systemd doublezerod is disabled.
**Test:**
```bash
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero address
# assert: 3Bw6v7EruQvTwoY79h2QjQCs2KBQFzSneBdYUbcXK1Tr
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero status
# assert: BGP Session Up, laconic-mia-sw01
systemctl is-active doublezerod
# assert: inactive
```
**Changes:**
- Copy `id.json` to container volume
- Update `DOUBLEZERO_RPC_ENDPOINT` in spec.yml
- Deploy with hostNetwork-enabled stack-orchestrator
- Stop and disable host doublezerod
- Delete DaemonSet manifest from agave-stack
### Step 2: Native sidecar for doublezerod
**Spec:** doublezerod image can be patched without restarting the agave container.
GRE tunnel and routes persist across doublezerod restart.
**Test:**
```bash
# Record current agave container start time
BEFORE=$(kubectl -n <ns> get pod <pod> -o jsonpath='{.status.containerStatuses[?(@.name=="agave-validator")].state.running.startedAt}')
# Patch DZ image
kubectl -n <ns> patch pod <pod> --type='json' -p='[
{"op":"replace","path":"/spec/initContainers/0/image","value":"laconicnetwork/doublezero:test"}
]'
# Wait for DZ container to restart
sleep 10
# Verify agave was NOT restarted
AFTER=$(kubectl -n <ns> get pod <pod> -o jsonpath='{.status.containerStatuses[?(@.name=="agave-validator")].state.running.startedAt}')
[ "$BEFORE" = "$AFTER" ] # assert: same start time
# Verify tunnel survived
ip route | grep doublezero0 # assert: routes present
```
**Changes:**
- laconic-so: support `initContainers` with `restartPolicy: Always` in
compose-to-k8s translation (or: define doublezerod as native sidecar in
compose via `x-kubernetes-init-container` extension or equivalent)
- Alternatively: post-deploy kubectl patch to move doublezerod to initContainers
### Step 3: Caddy dual-upstream routing
**Spec:** Caddy routes RPC traffic to whichever backend is healthy. Adding a second
healthy backend on :8897 causes traffic to shift without configuration changes.
**Test:**
```bash
# Start a test HTTP server on :8897 with /health
python3 -c "
from http.server import HTTPServer, BaseHTTPRequestHandler
class H(BaseHTTPRequestHandler):
def do_GET(self):
self.send_response(200); self.end_headers(); self.wfile.write(b'ok')
HTTPServer(('', 8897), H).serve_forever()
" &
# Verify Caddy discovers it
sleep 10
curl -sf https://biscayne.vaasl.io/health
# assert: 200
kill %1
```
**Changes:**
- Update Caddy ingress config with dual upstreams and health checks
### Step 4: ZFS clone and blue-green tooling
**Spec:** A script creates a ZFS clone, starts a blue deployment on alternate ports
using the cloned data, and the deployment catches up and becomes healthy.
**Test:**
```bash
# Run the clone + deploy script
./scripts/blue-green-prepare.sh --target-version v2.2.1
# assert: ZFS clone exists
zfs list biscayne/DATA/volumes/solana-blue
# assert: blue deployment exists and is catching up
kubectl -n <ns> get deployment agave-blue
# assert: blue RPC eventually becomes healthy
timeout 600 bash -c 'until curl -sf http://localhost:8897/health; do sleep 5; done'
```
**Changes:**
- `scripts/blue-green-prepare.sh` — ZFS snapshot, clone, deploy B
- `scripts/blue-green-promote.sh` — tear down A, optional port swap
- `scripts/blue-green-rollback.sh` — destroy B, restore A
- Parameterized deployment spec (kustomize overlay or env-driven)
### Step 5: End-to-end upgrade test
**Spec:** Full upgrade cycle completes with zero dropped RPC requests.
**Test:**
```bash
# Start continuous health probe in background
while true; do
curl -sf -o /dev/null -w "%{http_code} %{time_total}\n" \
https://biscayne.vaasl.io/health || echo "FAIL $(date)"
sleep 0.5
done > /tmp/health-probe.log &
# Execute full blue-green upgrade
./scripts/blue-green-prepare.sh --target-version v2.2.1
# wait for blue to sync...
./scripts/blue-green-promote.sh
# Stop probe
kill %1
# assert: no FAIL lines in probe log
grep -c FAIL /tmp/health-probe.log
# assert: 0
```

View File

@ -1,61 +0,0 @@
# Bug: Ashburn Relay — Outbound Gossip Dropped by DZ Agent ACL
## Summary
`--gossip-host 137.239.194.65` correctly advertises the Ashburn relay IP in
ContactInfo for all sockets (gossip, TVU, repair, TPU). The inbound path
works end-to-end (proven with kelce UDP tests through every hop). However,
outbound gossip from biscayne (src 137.239.194.65) is dropped by the
DoubleZero agent's ACL on mia-sw01's Tunnel500, preventing ContactInfo from
propagating to the cluster. Peers never learn our TVU address.
## Evidence
- Inbound path confirmed hop by hop (kelce → was-sw01 → mia-sw01 → Tunnel500
→ biscayne doublezero0 → DNAT → kind bridge → kind node eth0):
```
01:04:12.136633 IP 69.112.108.72.58856 > 172.20.0.2.9000: UDP, length 13
```
- Outbound gossip leaves biscayne correctly (src 137.239.194.65:8001 on
doublezero0), enters mia-sw01 via Tunnel500, hits SEC-USER-500-IN ACL:
```
60 deny ip any any [match 26355968 packets, 0:00:02 ago]
```
The ACL only permits src 186.233.184.235 and 169.254.7.7 — not 137.239.194.65.
- Validator not visible in public RPC getClusterNodes (gossip not propagating)
- Validator sees 775 nodes vs 5,045 on public RPC
## Root Cause
The `doublezero-agent` daemon on mia-sw01 manages Tunnel500 and its ACL
(SEC-USER-500-IN). The agent periodically reconciles the ACL to its expected
state, overwriting any custom entries we add. We cannot modify the ACL
without the agent reverting it.
137.239.194.65 is from the was-sw01 LAN block (137.239.194.64/29), routed
by the ISP to was-sw01 via the WAN link. It IS publicly routable (confirmed
by kelce ping/UDP tests). The earlier hypothesis that it was unroutable was
wrong — the IP reaches was-sw01, gets forwarded to mia-sw01 via backbone,
and reaches biscayne through Tunnel500 (inbound ACL direction is fine).
The problem is outbound only: the Tunnel500 ingress ACL (traffic FROM
biscayne TO mia-sw01) drops src 137.239.194.65.
## Fix
Create a dedicated GRE tunnel (Tunnel100) between biscayne and mia-sw01
that bypasses the DZ-managed Tunnel500 entirely:
- **mia-sw01 Tunnel100**: src 209.42.167.137 (free LAN IP), dst 186.233.184.235
(biscayne), link 169.254.100.0/31, ACL SEC-VALIDATOR-100-IN (we control)
- **biscayne gre-ashburn**: src 186.233.184.235, dst 209.42.167.137,
link 169.254.100.1/31
Traffic flow unchanged except the tunnel:
- Inbound: was-sw01 → backbone → mia-sw01 → Tunnel100 → biscayne → DNAT → agave
- Outbound: agave → SNAT 137.239.194.65 → Tunnel100 → mia-sw01 → backbone → was-sw01
See:
- `playbooks/ashburn-relay-mia-sw01.yml` (Tunnel100 + ACL + routes)
- `playbooks/ashburn-relay-biscayne.yml` (gre-ashburn + DNAT + SNAT + policy routing)
- `playbooks/ashburn-relay-was-sw01.yml` (static route, unchanged)

View File

@ -1,51 +0,0 @@
# Bug: laconic-so etcd cleanup wipes core kubernetes service
## Summary
`_clean_etcd_keeping_certs()` in laconic-stack-orchestrator 1.1.0 deletes the `kubernetes` service from etcd, breaking cluster networking on restart.
## Component
`stack_orchestrator/deploy/k8s/helpers.py``_clean_etcd_keeping_certs()`
## Reproduction
1. Deploy with `laconic-so` to a k8s-kind target with persisted etcd (hostPath mount in kind-config.yml)
2. `laconic-so deployment --dir <dir> stop` (destroys cluster)
3. `laconic-so deployment --dir <dir> start` (recreates cluster with cleaned etcd)
## Symptoms
- `kindnet` pods enter CrashLoopBackOff with: `panic: unable to load in-cluster configuration, KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT must be defined`
- `kubectl get svc kubernetes -n default` returns `NotFound`
- coredns, caddy, local-path-provisioner stuck in Pending (no CNI without kindnet)
- No pods can be scheduled
## Root Cause
`_clean_etcd_keeping_certs()` uses a whitelist that only preserves `/registry/secrets/caddy-system` keys. All other etcd keys are deleted, including `/registry/services/specs/default/kubernetes` — the core `kubernetes` ClusterIP service that kube-apiserver auto-creates.
When the kind cluster starts with the cleaned etcd, kube-apiserver sees the existing etcd data and does not re-create the `kubernetes` service. kindnet depends on the `KUBERNETES_SERVICE_HOST` environment variable which is injected by the kubelet from this service — without it, kindnet panics.
## Fix Options
1. **Expand the whitelist** to include `/registry/services/specs/default/kubernetes` and other core cluster resources
2. **Fully wipe etcd** instead of selective cleanup — let the cluster bootstrap fresh (simpler, but loses Caddy TLS certs)
3. **Don't persist etcd at all** — ephemeral etcd means clean state every restart (recommended for kind deployments)
## Workaround
Fully delete the kind cluster before `start`:
```bash
kind delete cluster --name <cluster-name>
laconic-so deployment --dir <dir> start
```
This forces fresh etcd bootstrap. Downside: all other services deployed to the cluster (DaemonSets, other namespaces) are destroyed.
## Impact
- Affects any k8s-kind deployment with persisted etcd
- Cluster is unrecoverable without full destroy+recreate
- All non-laconic-so-managed workloads in the cluster are lost

View File

@ -1,75 +0,0 @@
# Bug: laconic-so crashes on re-deploy when caddy ingress already exists
## Summary
`laconic-so deployment start` crashes with `FailToCreateError` when the kind cluster already has caddy ingress resources installed. The deployer uses `create_from_yaml()` which fails on `AlreadyExists` conflicts instead of applying idempotently. This prevents the application deployment from ever being reached — the crash happens before any app manifests are applied.
## Component
`stack_orchestrator/deploy/k8s/deploy_k8s.py:366``up()` method
`stack_orchestrator/deploy/k8s/helpers.py:369``install_ingress_for_kind()`
## Reproduction
1. `kind delete cluster --name laconic-70ce4c4b47e23b85`
2. `laconic-so deployment --dir /srv/deployments/agave start` — creates cluster, loads images, installs caddy ingress, but times out or is interrupted before app deployment completes
3. `laconic-so deployment --dir /srv/deployments/agave start` — crashes immediately after image loading
## Symptoms
- Traceback ending in:
```
kubernetes.utils.create_from_yaml.FailToCreateError:
Error from server (Conflict): namespaces "caddy-system" already exists
Error from server (Conflict): serviceaccounts "caddy-ingress-controller" already exists
Error from server (Conflict): clusterroles.rbac.authorization.k8s.io "caddy-ingress-controller" already exists
...
```
- Namespace `laconic-laconic-70ce4c4b47e23b85` exists but is empty — no pods, no deployments, no events
- Cluster is healthy, images are loaded, but no app manifests are applied
## Root Cause
`install_ingress_for_kind()` calls `kubernetes.utils.create_from_yaml()` which uses `POST` (create) semantics. If the resources already exist (from a previous partial run), every resource returns `409 Conflict` and `create_from_yaml` raises `FailToCreateError`, aborting the entire `up()` method before the app deployment step.
The first `laconic-so start` after a fresh `kind delete` works because:
1. Image loading into the kind node takes 5-10 minutes (images are ~10GB+)
2. Caddy ingress is installed successfully
3. App deployment begins
But if that first run is interrupted (timeout, Ctrl-C, ansible timeout), the second run finds caddy already installed and crashes.
## Fix Options
1. **Use server-side apply** instead of `create_from_yaml()``kubectl apply` is idempotent
2. **Check if ingress exists before installing** — skip `install_ingress_for_kind()` if caddy-system namespace exists
3. **Catch `AlreadyExists` and continue** — treat 409 as success for infrastructure resources
## Workaround
Delete the caddy ingress resources before re-running:
```bash
kubectl delete namespace caddy-system
kubectl delete clusterrole caddy-ingress-controller
kubectl delete clusterrolebinding caddy-ingress-controller
kubectl delete ingressclass caddy
laconic-so deployment --dir /srv/deployments/agave start
```
Or nuke the entire cluster and start fresh:
```bash
kind delete cluster --name laconic-70ce4c4b47e23b85
laconic-so deployment --dir /srv/deployments/agave start
```
## Interaction with ansible timeout
The `biscayne-redeploy.yml` playbook sets a 600s timeout on the `laconic-so deployment start` task. Image loading alone can exceed this on a fresh cluster (images must be re-loaded into the new kind node). When ansible kills the process at 600s, the caddy ingress is already installed but the app is not — putting the cluster into the broken state described above. Subsequent playbook runs hit this bug on every attempt.
## Impact
- Blocks all re-deploys on biscayne without manual cleanup
- The playbook cannot recover automatically — every retry hits the same conflict
- Discovered 2026-03-05 during full wipe redeploy of biscayne validator

135
docs/cli.md 100644
View File

@ -0,0 +1,135 @@
# laconic-so
Sub-commands and flags
## setup-repositories
Clone a single repository:
```
$ laconic-so setup-repositories --include github.com/cerc-io/go-ethereum
```
Clone the repositories for a stack:
```
$ laconic-so --stack fixturenet-eth setup-repositories
```
Pull latest commits from origin:
```
$ laconic-so --stack fixturenet-eth setup-repositories --pull
```
Use SSH rather than https:
```
$ laconic-so --stack fixturenet-eth setup-repositories --git-ssh
```
## build-containers
Build a single container:
```
$ laconic-so build-containers --include <container-name>
```
e.g.
```
$ laconic-so build-containers --include cerc/go-ethereum
```
Build the containers for a stack:
```
$ laconic-so --stack <stack-name> build-containers
```
e.g.
```
$ laconic-so --stack fixturenet-eth build-containers
```
Force full rebuild of container images:
```
$ laconic-so build-containers --include <container-name> --force-rebuild
```
## build-npms
Build a single package:
```
$ laconic-so build-npms --include <package-name>
```
e.g.
```
$ laconic-so build-npms --include registry-sdk
```
Build the packages for a stack:
```
$ laconic-so --stack <stack-name> build-npms
```
e.g.
```
$ laconic-so --stack fixturenet-laconicd build-npms
```
Force full rebuild of packages:
```
$ laconic-so build-npms --include <package-name> --force-rebuild
```
## deploy
The `deploy` command group manages persistent deployments. The general workflow is `deploy init` to generate a spec file, then `deploy create` to create a deployment directory from the spec, then runtime commands like `deploy up` and `deploy down`.
### deploy init
Generate a deployment spec file from a stack definition:
```
$ laconic-so --stack <stack-name> deploy init --output <spec-file>
```
Options:
- `--output` (required): write spec file here
- `--config`: provide config variables for the deployment
- `--config-file`: provide config variables in a file
- `--kube-config`: provide a config file for a k8s deployment
- `--image-registry`: provide a container image registry url for this k8s cluster
- `--map-ports-to-host`: map ports to the host (`any-variable-random`, `localhost-same`, `any-same`, `localhost-fixed-random`, `any-fixed-random`)
### deploy create
Create a deployment directory from a spec file:
```
$ laconic-so --stack <stack-name> deploy create --spec-file <spec-file> --deployment-dir <dir>
```
Update an existing deployment in-place (preserving data volumes and env file):
```
$ laconic-so --stack <stack-name> deploy create --spec-file <spec-file> --deployment-dir <dir> --update
```
Options:
- `--spec-file` (required): spec file to use
- `--deployment-dir`: target directory for deployment files
- `--update`: update an existing deployment directory, preserving data volumes and env file. Changed files are backed up with a `.bak` suffix. The deployment's `config.env` and `deployment.yml` are also preserved.
- `--network-dir`: network configuration supplied in this directory
- `--initial-peers`: initial set of persistent peers
### deploy up
Start a deployment:
```
$ laconic-so deployment --dir <deployment-dir> up
```
### deploy down
Stop a deployment:
```
$ laconic-so deployment --dir <deployment-dir> down
```
Use `--delete-volumes` to also remove data volumes.
### deploy ps
Show running services:
```
$ laconic-so deployment --dir <deployment-dir> ps
```
### deploy logs
View service logs:
```
$ laconic-so deployment --dir <deployment-dir> logs
```
Use `-f` to follow and `-n <count>` to tail.

View File

@ -0,0 +1,202 @@
# Deployment Patterns
## GitOps Pattern
For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control.
### Overview
- **spec.yml is your source of truth**: Maintain it in your operator repository
- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit
- **Use restart for updates**: The restart command respects your git-tracked spec.yml
### Workflow
1. **Initial setup**: Run `deploy init` once to generate a spec.yml template
2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo
3. **Deploy from git**: Use the committed spec.yml for deployments
4. **Update via git**: Make changes in git, then restart to apply
```bash
# Initial setup (run once)
laconic-so --stack my-stack deploy init --output spec.yml
# Customize for your environment
vim spec.yml # Set hostname, resources, etc.
# Commit to your operator repository
git add spec.yml
git commit -m "Add my-stack deployment configuration"
git push
# On deployment server: deploy from git-tracked spec
laconic-so deploy create \
--spec-file /path/to/operator-repo/spec.yml \
--deployment-dir my-deployment
laconic-so deployment --dir my-deployment start
```
### Updating Deployments
When you need to update a deployment:
```bash
# 1. Make changes in your operator repo
vim /path/to/operator-repo/spec.yml
git commit -am "Update configuration"
git push
# 2. On deployment server: pull and restart
cd /path/to/operator-repo && git pull
laconic-so deployment --dir my-deployment restart
```
The `restart` command:
- Pulls latest code from the stack repository
- Uses your git-tracked spec.yml (does NOT regenerate from defaults)
- Syncs the deployment directory
- Restarts services
### Anti-patterns
**Don't do this:**
```bash
# BAD: Regenerating spec on every deployment
laconic-so --stack my-stack deploy init --output spec.yml
laconic-so deploy create --spec-file spec.yml ...
```
This overwrites your customizations with defaults from the stack's `commands.py`.
**Do this instead:**
```bash
# GOOD: Use your git-tracked spec
git pull # Get latest spec.yml from your operator repo
laconic-so deployment --dir my-deployment restart
```
## Private Registry Authentication
For deployments using images from private container registries (e.g., GitHub Container Registry), configure authentication in your spec.yml:
### Configuration
Add a `registry-credentials` section to your spec.yml:
```yaml
registry-credentials:
server: ghcr.io
username: your-org-or-username
token-env: REGISTRY_TOKEN
```
**Fields:**
- `server`: The registry hostname (e.g., `ghcr.io`, `docker.io`, `gcr.io`)
- `username`: Registry username (for GHCR, use your GitHub username or org name)
- `token-env`: Name of the environment variable containing your API token/PAT
### Token Environment Variable
The `token-env` pattern keeps credentials out of version control. Set the environment variable when running `deployment start`:
```bash
export REGISTRY_TOKEN="your-personal-access-token"
laconic-so deployment --dir my-deployment start
```
For GHCR, create a Personal Access Token (PAT) with `read:packages` scope.
### Ansible Integration
When using Ansible for deployments, pass the token from a credentials file:
```yaml
- name: Start deployment
ansible.builtin.command:
cmd: laconic-so deployment --dir {{ deployment_dir }} start
environment:
REGISTRY_TOKEN: "{{ lookup('file', '~/.credentials/ghcr_token') }}"
```
### How It Works
1. laconic-so reads the `registry-credentials` config from spec.yml
2. Creates a Kubernetes `docker-registry` secret named `{deployment}-registry`
3. The deployment's pods reference this secret for image pulls
## Cluster and Volume Management
### Stopping Deployments
The `deployment stop` command has two important flags:
```bash
# Default: stops deployment, deletes cluster, PRESERVES volumes
laconic-so deployment --dir my-deployment stop
# Explicitly delete volumes (USE WITH CAUTION)
laconic-so deployment --dir my-deployment stop --delete-volumes
```
### Volume Persistence
Volumes persist across cluster deletion by design. This is important because:
- **Data survives cluster recreation**: Ledger data, databases, and other state are preserved
- **Faster recovery**: No need to re-sync or rebuild data after cluster issues
- **Safe cluster upgrades**: Delete and recreate cluster without data loss
**Only use `--delete-volumes` when:**
- You explicitly want to start fresh with no data
- The user specifically requests volume deletion
- You're cleaning up a test/dev environment completely
### Shared Cluster Architecture
In kind deployments, multiple stacks share a single cluster:
- First `deployment start` creates the cluster
- Subsequent deployments reuse the existing cluster
- `deployment stop` on ANY deployment deletes the shared cluster
- Other deployments will fail until cluster is recreated
To stop a single deployment without affecting the cluster:
```bash
laconic-so deployment --dir my-deployment stop --skip-cluster-management
```
## Volume Persistence in k8s-kind
k8s-kind has 3 storage layers:
- **Docker Host**: The physical server running Docker
- **Kind Node**: A Docker container simulating a k8s node
- **Pod Container**: Your workload
For k8s-kind, volumes with paths are mounted from Docker Host → Kind Node → Pod via extraMounts.
| spec.yml volume | Storage Location | Survives Pod Restart | Survives Cluster Restart |
|-----------------|------------------|---------------------|-------------------------|
| `vol:` (empty) | Kind Node PVC | ✅ | ❌ |
| `vol: ./data/x` | Docker Host | ✅ | ✅ |
| `vol: /abs/path`| Docker Host | ✅ | ✅ |
**Recommendation**: Always use paths for data you want to keep. Relative paths
(e.g., `./data/rpc-config`) resolve to `$DEPLOYMENT_DIR/data/rpc-config` on the
Docker Host.
### Example
```yaml
# In spec.yml
volumes:
rpc-config: ./data/rpc-config # Persists to $DEPLOYMENT_DIR/data/rpc-config
chain-data: ./data/chain # Persists to $DEPLOYMENT_DIR/data/chain
temp-cache: # Empty = Kind Node PVC (lost on cluster delete)
```
### The Antipattern
Empty-path volumes appear persistent because they survive pod restarts (data lives
in Kind Node container). However, this data is lost when the kind cluster is
recreated. This "false persistence" has caused data loss when operators assumed
their data was safe.

View File

@ -0,0 +1,550 @@
# Docker Compose Deployment Guide
## Introduction
### What is a Deployer?
In stack-orchestrator, a **deployer** provides a uniform interface for orchestrating containerized applications. This guide focuses on deployments using Docker Compose, which is the default and recommended deployment mode.
While stack-orchestrator also supports Kubernetes (`k8s`) and Kind (`k8s-kind`) deployments, those are out of scope for this guide. See the [Kubernetes Enhancements](./k8s-deployment-enhancements.md) documentation for advanced deployment options.
## Prerequisites
To deploy stacks using Docker Compose, you need:
- Docker Engine (20.10+)
- Docker Compose plugin (v2.0+)
- Python 3.8+
- stack-orchestrator installed (`laconic-so`)
**That's it!** No additional infrastructure is required. If you have Docker installed, you're ready to deploy.
## Deployment Workflow
The typical deployment workflow consists of four main steps:
1. **Setup repositories and build containers** (first time only)
2. **Initialize deployment specification**
3. **Create deployment directory**
4. **Start and manage services**
## Quick Start Example
Here's a complete example using the built-in `test` stack:
```bash
# Step 1: Setup (first time only)
laconic-so --stack test setup-repositories
laconic-so --stack test build-containers
# Step 2: Initialize deployment spec
laconic-so --stack test deploy init --output test-spec.yml
# Step 3: Create deployment directory
laconic-so --stack test deploy create \
--spec-file test-spec.yml \
--deployment-dir test-deployment
# Step 4: Start services
laconic-so deployment --dir test-deployment start
# View running services
laconic-so deployment --dir test-deployment ps
# View logs
laconic-so deployment --dir test-deployment logs
# Stop services (preserves data)
laconic-so deployment --dir test-deployment stop
```
## Deployment Workflows
Stack-orchestrator supports two deployment workflows:
### 1. Deployment Directory Workflow (Recommended)
This workflow creates a persistent deployment directory that contains all configuration and data.
**When to use:**
- Production deployments
- When you need to preserve configuration
- When you want to manage multiple deployments
- When you need persistent volume data
**Example:**
```bash
# Initialize deployment spec
laconic-so --stack fixturenet-eth deploy init --output eth-spec.yml
# Optionally edit eth-spec.yml to customize configuration
# Create deployment directory
laconic-so --stack fixturenet-eth deploy create \
--spec-file eth-spec.yml \
--deployment-dir my-eth-deployment
# Start the deployment
laconic-so deployment --dir my-eth-deployment start
# Manage the deployment
laconic-so deployment --dir my-eth-deployment ps
laconic-so deployment --dir my-eth-deployment logs
laconic-so deployment --dir my-eth-deployment stop
```
### 2. Quick Deploy Workflow
This workflow deploys directly without creating a persistent deployment directory.
**When to use:**
- Quick testing
- Temporary deployments
- Simple stacks that don't require customization
**Example:**
```bash
# Start the stack directly
laconic-so --stack test deploy up
# Show the host port mapped to container port 80 on the 'test' service
laconic-so --stack test deploy port test 80
# View logs
laconic-so --stack test deploy logs
# Stop (preserves volumes)
laconic-so --stack test deploy down
# Stop and remove volumes
laconic-so --stack test deploy down --delete-volumes
```
## Real-World Example: Ethereum Fixturenet
Deploy a local Ethereum testnet with Geth and Lighthouse:
```bash
# Setup (first time only)
laconic-so --stack fixturenet-eth setup-repositories
laconic-so --stack fixturenet-eth build-containers
# Initialize with default configuration
laconic-so --stack fixturenet-eth deploy init --output eth-spec.yml
# Create deployment
laconic-so --stack fixturenet-eth deploy create \
--spec-file eth-spec.yml \
--deployment-dir fixturenet-eth-deployment
# Start the network
laconic-so deployment --dir fixturenet-eth-deployment start
# Check status
laconic-so deployment --dir fixturenet-eth-deployment ps
# Access logs from specific service
laconic-so deployment --dir fixturenet-eth-deployment logs fixturenet-eth-geth-1
# Stop the network (preserves blockchain data)
laconic-so deployment --dir fixturenet-eth-deployment stop
# Start again - blockchain data is preserved
laconic-so deployment --dir fixturenet-eth-deployment start
# Clean up everything including data
laconic-so deployment --dir fixturenet-eth-deployment stop --delete-volumes
```
## Configuration
### Passing Configuration Parameters
Configuration can be passed in three ways:
**1. At init time via `--config` flag:**
```bash
laconic-so --stack test deploy init --output spec.yml \
--config PARAM1=value1,PARAM2=value2
```
**2. Edit the spec file after init:**
```bash
# Initialize
laconic-so --stack test deploy init --output spec.yml
# Edit spec.yml
vim spec.yml
```
Example spec.yml:
```yaml
stack: test
config:
PARAM1: value1
PARAM2: value2
```
**3. Docker Compose defaults:**
Environment variables defined in the stack's `docker-compose-*.yml` files are used as defaults. Configuration from the spec file overrides these defaults.
### Port Mapping
By default, services are accessible on randomly assigned host ports. To find the mapped port:
```bash
# Find the host port for container port 80 on service 'webapp'
laconic-so deployment --dir my-deployment port webapp 80
# Output example: 0.0.0.0:32768
```
To configure fixed ports, edit the spec file before creating the deployment:
```yaml
network:
ports:
webapp:
- '8080:80' # Maps host port 8080 to container port 80
api:
- '3000:3000'
```
Then create the deployment:
```bash
laconic-so --stack my-stack deploy create \
--spec-file spec.yml \
--deployment-dir my-deployment
```
### Volume Persistence
Volumes are preserved between stop/start cycles by default:
```bash
# Stop but keep data
laconic-so deployment --dir my-deployment stop
# Start again - data is still there
laconic-so deployment --dir my-deployment start
```
To completely remove all data:
```bash
# Stop and delete all volumes
laconic-so deployment --dir my-deployment stop --delete-volumes
```
Volume data is stored in `<deployment-dir>/data/`.
## Common Operations
### Viewing Logs
```bash
# All services, continuous follow
laconic-so deployment --dir my-deployment logs --follow
# Last 100 lines from all services
laconic-so deployment --dir my-deployment logs --tail 100
# Specific service only
laconic-so deployment --dir my-deployment logs webapp
# Combine options
laconic-so deployment --dir my-deployment logs --tail 50 --follow webapp
```
### Executing Commands in Containers
```bash
# Execute a command in a running service
laconic-so deployment --dir my-deployment exec webapp ls -la
# Interactive shell
laconic-so deployment --dir my-deployment exec webapp /bin/bash
# Run command with specific environment variables
laconic-so deployment --dir my-deployment exec webapp env VAR=value command
```
### Checking Service Status
```bash
# List all running services
laconic-so deployment --dir my-deployment ps
# Check using Docker directly
docker ps
```
### Updating a Running Deployment
If you need to change configuration after deployment:
```bash
# 1. Edit the spec file
vim my-deployment/spec.yml
# 2. Regenerate configuration
laconic-so deployment --dir my-deployment update
# 3. Restart services to apply changes
laconic-so deployment --dir my-deployment stop
laconic-so deployment --dir my-deployment start
```
## Multi-Service Deployments
Many stacks deploy multiple services that work together:
```bash
# Deploy a stack with multiple services
laconic-so --stack laconicd-with-console deploy init --output spec.yml
laconic-so --stack laconicd-with-console deploy create \
--spec-file spec.yml \
--deployment-dir laconicd-deployment
laconic-so deployment --dir laconicd-deployment start
# View all services
laconic-so deployment --dir laconicd-deployment ps
# View logs from specific services
laconic-so deployment --dir laconicd-deployment logs laconicd
laconic-so deployment --dir laconicd-deployment logs console
```
## ConfigMaps
ConfigMaps allow you to mount configuration files into containers:
```bash
# 1. Create the config directory in your deployment
mkdir -p my-deployment/data/my-config
echo "database_url=postgres://localhost" > my-deployment/data/my-config/app.conf
# 2. Reference in spec file
vim my-deployment/spec.yml
```
Add to spec.yml:
```yaml
configmaps:
my-config: ./data/my-config
```
```bash
# 3. Restart to apply
laconic-so deployment --dir my-deployment stop
laconic-so deployment --dir my-deployment start
```
The files will be mounted in the container at `/config/` (or as specified by the stack).
## Deployment Directory Structure
A typical deployment directory contains:
```
my-deployment/
├── compose/
│ └── docker-compose-*.yml # Generated compose files
├── config.env # Environment variables
├── deployment.yml # Deployment metadata
├── spec.yml # Deployment specification
└── data/ # Volume mounts and configs
├── service-data/ # Persistent service data
└── config-maps/ # ConfigMap files
```
## Troubleshooting
### Common Issues
**Problem: "Cannot connect to Docker daemon"**
```bash
# Ensure Docker is running
docker ps
# Start Docker if needed (macOS)
open -a Docker
# Start Docker (Linux)
sudo systemctl start docker
```
**Problem: "Port already in use"**
```bash
# Either stop the conflicting service or use different ports
# Edit spec.yml before creating deployment:
network:
ports:
webapp:
- '8081:80' # Use 8081 instead of 8080
```
**Problem: "Image not found"**
```bash
# Build containers first
laconic-so --stack your-stack build-containers
```
**Problem: Volumes not persisting**
```bash
# Check if you used --delete-volumes when stopping
# Volume data is in: <deployment-dir>/data/
# Don't use --delete-volumes if you want to keep data:
laconic-so deployment --dir my-deployment stop
# Only use --delete-volumes when you want to reset completely:
laconic-so deployment --dir my-deployment stop --delete-volumes
```
**Problem: Services not starting**
```bash
# Check logs for errors
laconic-so deployment --dir my-deployment logs
# Check Docker container status
docker ps -a
# Try stopping and starting again
laconic-so deployment --dir my-deployment stop
laconic-so deployment --dir my-deployment start
```
### Inspecting Deployment State
```bash
# Check deployment directory structure
ls -la my-deployment/
# Check running containers
docker ps
# Check container details
docker inspect <container-name>
# Check networks
docker network ls
# Check volumes
docker volume ls
```
## CLI Commands Reference
### Stack Operations
```bash
# Clone required repositories
laconic-so --stack <name> setup-repositories
# Build container images
laconic-so --stack <name> build-containers
```
### Deployment Initialization
```bash
# Initialize deployment spec with defaults
laconic-so --stack <name> deploy init --output <spec-file>
# Initialize with configuration
laconic-so --stack <name> deploy init --output <spec-file> \
--config PARAM1=value1,PARAM2=value2
```
### Deployment Creation
```bash
# Create deployment directory from spec
laconic-so --stack <name> deploy create \
--spec-file <spec-file> \
--deployment-dir <dir>
```
### Deployment Management
```bash
# Start all services
laconic-so deployment --dir <dir> start
# Stop services (preserves volumes)
laconic-so deployment --dir <dir> stop
# Stop and remove volumes
laconic-so deployment --dir <dir> stop --delete-volumes
# List running services
laconic-so deployment --dir <dir> ps
# View logs
laconic-so deployment --dir <dir> logs [--tail N] [--follow] [service]
# Show mapped port
laconic-so deployment --dir <dir> port <service> <private-port>
# Execute command in service
laconic-so deployment --dir <dir> exec <service> <command>
# Update configuration
laconic-so deployment --dir <dir> update
```
### Quick Deploy Commands
```bash
# Start stack directly
laconic-so --stack <name> deploy up
# Stop stack
laconic-so --stack <name> deploy down [--delete-volumes]
# View logs
laconic-so --stack <name> deploy logs
# Show port mapping
laconic-so --stack <name> deploy port <service> <port>
```
## Related Documentation
- [CLI Reference](./cli.md) - Complete CLI command documentation
- [Adding a New Stack](./adding-a-new-stack.md) - Creating custom stacks
- [Specification](./spec.md) - Internal structure and design
- [Kubernetes Enhancements](./k8s-deployment-enhancements.md) - Advanced K8s deployment options
- [Web App Deployment](./webapp.md) - Deploying web applications
## Examples
For more examples, see the test scripts:
- `scripts/quick-deploy-test.sh` - Quick deployment example
- `tests/deploy/run-deploy-test.sh` - Comprehensive test showing all features
## Summary
- Docker Compose is the default and recommended deployment mode
- Two workflows: deployment directory (recommended) or quick deploy
- The standard workflow is: setup → build → init → create → start
- Configuration is flexible with multiple override layers
- Volume persistence is automatic unless explicitly deleted
- All deployment state is contained in the deployment directory
- For Kubernetes deployments, see separate K8s documentation
You're now ready to deploy stacks using stack-orchestrator with Docker Compose!

View File

@ -1,80 +0,0 @@
# DoubleZero Agent — Managed Configuration
The `doublezero-agent` daemon runs on both mia-sw01 and was-sw01. It manages
GRE tunnels, ACLs, BGP neighbors, and route-maps via EOS config sessions
(named `doublezero-agent-<timestamp>`). It periodically creates pending
sessions and commits them, overwriting any manual changes to the objects
it manages.
**Do NOT modify any of the items listed below.** The agent will silently
overwrite your changes.
## mia-sw01
### Tunnel interfaces (all DZ-managed)
| Interface | Description | VRF | Peer | ACL |
|------------|-----------------|---------|-----------------|------------------------------|
| Tunnel500 | USER-UCAST-500 | vrf1 | 186.233.184.235 | SEC-USER-500-IN |
| Tunnel501 | USER-MCAST-501 | default | 186.233.185.50 | SEC-USER-SUB-MCAST-IN |
| Tunnel502 | USER-UCAST-502 | vrf1 | 155.138.213.71 | SEC-USER-502-IN |
| Tunnel503 | USER-MCAST-503 | default | 155.138.213.71 | SEC-USER-PUB-MCAST-IN |
| Tunnel504 | (empty) | | | |
| Tunnel505 | USER-UCAST-505 | vrf1 | 186.233.185.50 | SEC-USER-505-IN |
| Tunnel506 | (exists) | | | |
### ACLs (DZ-managed — do NOT modify)
- `SEC-DIA-IN` — ingress ACL on Et1/1 (bogon/RFC1918 filter)
- `SEC-USER-500-IN` — ingress ACL on Tunnel500
- `SEC-USER-502-IN` — ingress ACL on Tunnel502
- `SEC-USER-505-IN` — ingress ACL on Tunnel505
- `SEC-USER-SUB-MCAST-IN` — ingress ACL on Tunnel501
- `SEC-USER-PUB-MCAST-IN` — ingress ACL on Tunnel503
- `SEC-USER-MCAST-BOUNDARY-501-OUT` — multicast boundary on Tunnel501
- `SEC-USER-MCAST-BOUNDARY-503-OUT` — multicast boundary on Tunnel503
### VRF (DZ-managed)
- `vrf1` — used by Tunnel500, Tunnel502, Tunnel505 (unicast tunnels)
- `ip route vrf vrf1 0.0.0.0/0 egress-vrf default Ethernet4/1 172.16.1.188`
### BGP (DZ-managed)
- `router bgp 65342` — iBGP mesh with DZ fabric switches (ny7, sea001, ld4, etc.)
- BGP neighbors on tunnel link IPs (169.254.x.x) with `RM-USER-*` route-maps
- All `RM-USER-*-IN` and `RM-USER-*-OUT` route-maps
### Loopbacks (DZ-managed)
- `Loopback255`, `Loopback256` — BGP update sources for iBGP mesh
## was-sw01
### ACLs (DZ-managed)
- `SEC-DIA-IN` — ingress ACL on Et1/1
- `SEC-USER-PUB-MCAST-IN`
- `SEC-USER-SUB-MCAST-IN`
### Daemons
- `doublezero-agent` — config management
- `doublezero-telemetry` — metrics (writes to influxdb `doublezero-mainnet-beta`)
## Safe to modify (NOT managed by DZ agent)
### mia-sw01
- `Tunnel100` — our dedicated validator relay tunnel (VRF relay)
- `SEC-VALIDATOR-100-IN` — our ACL on Tunnel100
- `Loopback101` — tunnel source IP (209.42.167.137)
- VRF `relay` — our outbound isolation VRF
- `ip route 137.239.194.65/32 egress-vrf relay 169.254.100.1`
- `ip route vrf relay 0.0.0.0/0 egress-vrf default 172.16.1.188`
- Backbone `Ethernet4/1` — physical interface, not DZ-managed
### was-sw01
- `ip route 137.239.194.65/32 172.16.1.189` — our static route
- Backbone `Ethernet4/1` — physical interface, not DZ-managed

Some files were not shown because too many files have changed in this diff Show More