Merge commit '19bb90f8148833ea7ff79cba312b048abc0d790b' as 'stack-orchestrator'
commit
03a5b5e39e
|
|
@ -0,0 +1,66 @@
|
|||
name: Fixturenet-Laconicd-Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.gitea/workflows/triggers/fixturenet-laconicd-test'
|
||||
schedule:
|
||||
- cron: '1 13 * * *'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run Laconicd fixturenet and Laconic CLI tests"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: 'Update'
|
||||
run: apt-get update
|
||||
- name: 'Setup jq'
|
||||
run: apt-get install jq -y
|
||||
- name: 'Check jq'
|
||||
run: |
|
||||
which jq
|
||||
jq --version
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run fixturenet-laconicd tests"
|
||||
run: ./tests/fixturenet-laconicd/run-test.sh
|
||||
- name: "Run laconic CLI tests"
|
||||
run: ./tests/fixturenet-laconicd/run-cli-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,37 @@
|
|||
name: Lint Checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: '*'
|
||||
push:
|
||||
branches: '*'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run linter"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Run flake8"
|
||||
uses: py-actions/flake8@v2
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,72 @@
|
|||
name: Publish
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- publish-test
|
||||
paths-ignore:
|
||||
- '.gitea/workflows/triggers/*'
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
name: "Build and publish"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Get build info"
|
||||
id: build-info
|
||||
run: |
|
||||
build_tag=$(./scripts/create_build_tag_file.sh)
|
||||
echo "build-tag=v${build_tag}" >> $GITHUB_OUTPUT
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Build local shiv package"
|
||||
id: build
|
||||
run: |
|
||||
./scripts/build_shiv_package.sh
|
||||
result_code=$?
|
||||
echo "package-file=$(ls ./package/*)" >> $GITHUB_OUTPUT
|
||||
exit $result_code
|
||||
- name: "Stage artifact file"
|
||||
run: |
|
||||
cp ${{ steps.build.outputs.package-file }} ./laconic-so
|
||||
- name: "Create release"
|
||||
uses: https://gitea.com/cerc-io/action-gh-release@gitea-v2
|
||||
with:
|
||||
tag_name: ${{ steps.build-info.outputs.build-tag }}
|
||||
# On the publish test branch, mark our release as a draft
|
||||
# Hack using endsWith to work around Gitea sometimes sending "publish-test" vs "refs/heads/publish-test"
|
||||
draft: ${{ endsWith(github.ref, 'publish-test') }}  # endsWith(searchString, searchValue): the ref is the string being searched, so both "publish-test" and "refs/heads/publish-test" match
|
||||
files: ./laconic-so
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
name: Container Registry Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.gitea/workflows/triggers/test-container-registry'
|
||||
- '.gitea/workflows/test-container-registry.yml'
|
||||
- 'tests/container-registry/run-test.sh'
|
||||
schedule: # Note: coordinate with other tests to not overload runners at the same time of day
|
||||
- cron: '6 19 * * *'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run container registry hosting test on kind/k8s"
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Check cgroups version"
|
||||
run: mount | grep cgroup
|
||||
- name: "Install kind"
|
||||
run: ./tests/scripts/install-kind.sh
|
||||
- name: "Install Kubectl"
|
||||
run: ./tests/scripts/install-kubectl.sh
|
||||
- name: "Install ed" # Only needed until we remove the need to edit the spec file
|
||||
run: apt update && apt install -y ed
|
||||
- name: "Run container registry deployment test"
|
||||
run: |
|
||||
source /opt/bash-utils/cgroup-helper.sh
|
||||
join_cgroup
|
||||
./tests/container-registry/run-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,67 @@
|
|||
name: Database Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.gitea/workflows/triggers/test-database'
|
||||
- '.gitea/workflows/test-database.yml'
|
||||
- 'tests/database/run-test.sh'
|
||||
schedule: # Note: coordinate with other tests to not overload runners at the same time of day
|
||||
- cron: '5 18 * * *'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run database hosting test on kind/k8s"
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Check cgroups version"
|
||||
run: mount | grep cgroup
|
||||
- name: "Install kind"
|
||||
run: ./tests/scripts/install-kind.sh
|
||||
- name: "Install Kubectl"
|
||||
run: ./tests/scripts/install-kubectl.sh
|
||||
- name: "Run database deployment test"
|
||||
run: |
|
||||
source /opt/bash-utils/cgroup-helper.sh
|
||||
join_cgroup
|
||||
./tests/database/run-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,59 @@
|
|||
name: Deploy Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- ci-test
|
||||
paths-ignore:
|
||||
- '.gitea/workflows/triggers/*'
|
||||
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run deploy test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run deploy tests"
|
||||
run: ./tests/deploy/run-deploy-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
name: External Stack Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.gitea/workflows/triggers/test-external-stack'
|
||||
- '.gitea/workflows/test-external-stack.yml'
|
||||
- 'tests/external-stack/run-test.sh'
|
||||
schedule: # Note: coordinate with other tests to not overload runners at the same time of day
|
||||
- cron: '8 19 * * *'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run external stack test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run external stack tests"
|
||||
run: ./tests/external-stack/run-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
name: K8s Deploy Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.gitea/workflows/triggers/test-k8s-deploy'
|
||||
- '.gitea/workflows/test-k8s-deploy.yml'
|
||||
- 'tests/k8s-deploy/run-deploy-test.sh'
|
||||
schedule: # Note: coordinate with other tests to not overload runners at the same time of day
|
||||
- cron: '3 15 * * *'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run deploy test suite on kind/k8s"
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Check cgroups version"
|
||||
run: mount | grep cgroup
|
||||
- name: "Install kind"
|
||||
run: ./tests/scripts/install-kind.sh
|
||||
- name: "Install Kubectl"
|
||||
run: ./tests/scripts/install-kubectl.sh
|
||||
- name: "Run k8s deployment test"
|
||||
run: |
|
||||
source /opt/bash-utils/cgroup-helper.sh
|
||||
join_cgroup
|
||||
./tests/k8s-deploy/run-deploy-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,70 @@
|
|||
name: K8s Deployment Control Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.gitea/workflows/triggers/test-k8s-deployment-control'
|
||||
- '.gitea/workflows/test-k8s-deployment-control.yml'
|
||||
- 'tests/k8s-deployment-control/run-test.sh'
|
||||
schedule: # Note: coordinate with other tests to not overload runners at the same time of day
|
||||
- cron: '30 3 * * *'  # was '3 30 * * *': hour 30 is outside 0-23; assumed minute/hour were transposed (03:30 daily) — TODO confirm intended time
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run deployment control suite on kind/k8s"
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Check cgroups version"
|
||||
run: mount | grep cgroup
|
||||
- name: "Install kind"
|
||||
run: ./tests/scripts/install-kind.sh
|
||||
- name: "Install Kubectl"
|
||||
run: ./tests/scripts/install-kubectl.sh
|
||||
- name: "Run k8s deployment control test"
|
||||
run: |
|
||||
source /opt/bash-utils/cgroup-helper.sh
|
||||
join_cgroup
|
||||
./tests/k8s-deployment-control/run-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,60 @@
|
|||
name: Webapp Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- ci-test
|
||||
paths-ignore:
|
||||
- '.gitea/workflows/triggers/*'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run webapp test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Install wget" # 20240109 - Only needed until the executors are updated.
|
||||
run: apt update && apt install -y wget
|
||||
- name: "Run webapp tests"
|
||||
run: ./tests/webapp-test/run-webapp-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
name: Smoke Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: '*'
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- ci-test
|
||||
paths-ignore:
|
||||
- '.gitea/workflows/triggers/*'
|
||||
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run basic test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
# At present the stock setup-python action fails on Linux/aarch64
|
||||
# Conditional steps below work around this by using deadsnakes for that case only
|
||||
- name: "Install Python for ARM on Linux"
|
||||
if: ${{ runner.arch == 'arm64' && runner.os == 'Linux' }}
|
||||
uses: deadsnakes/action@v3.0.1
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Install Python cases other than ARM on Linux"
|
||||
if: ${{ ! (runner.arch == 'arm64' && runner.os == 'Linux') }}
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv==1.0.6
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run smoke tests"
|
||||
run: ./tests/smoke-test/run-smoke-test.sh
|
||||
- name: Notify Vulcanize Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.VULCANIZE_SLACK_CI_ALERTS }}
|
||||
- name: Notify DeepStack Slack on CI failure
|
||||
if: ${{ always() && github.ref_name == 'main' }}
|
||||
uses: ravsamhq/notify-slack-action@v2
|
||||
with:
|
||||
status: ${{ job.status }}
|
||||
notify_when: 'failure'
|
||||
env:
|
||||
SLACK_WEBHOOK_URL: ${{ secrets.DEEPSTACK_SLACK_CI_ALERTS }}
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
Change this file to trigger running the fixturenet-laconicd-test CI job
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
Trigger
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
Change this file to trigger running the test-container-registry CI job
|
||||
Triggered: 2026-01-21
|
||||
Triggered: 2026-01-21 19:28:29
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
Change this file to trigger running the test-database CI job
|
||||
Trigger test run
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
Change this file to trigger running the external-stack CI job
|
||||
trigger
|
||||
|
|
@ -0,0 +1,2 @@
|
|||
Change this file to trigger running the test-k8s-deploy CI job
|
||||
Trigger test on PR branch
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
name: Fixturenet-Eth Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.github/workflows/triggers/fixturenet-eth-test'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run fixturenet-eth test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run fixturenet-eth tests"
|
||||
run: ./tests/fixturenet-eth/run-test.sh
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
name: Fixturenet-Laconicd Test
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: '*'
|
||||
paths:
|
||||
- '!**'
|
||||
- '.github/workflows/triggers/fixturenet-laconicd-test'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run fixturenet-laconicd test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run fixturenet-laconicd tests"
|
||||
run: ./tests/fixturenet-laconicd/run-test.sh
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
name: Lint Checks
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: '*'
|
||||
push:
|
||||
branches: '*'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run linter"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Run flake8"
|
||||
uses: py-actions/flake8@v2
|
||||
|
|
@ -0,0 +1,46 @@
|
|||
name: Publish
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
- publish-test
|
||||
|
||||
jobs:
|
||||
publish:
|
||||
name: "Build and publish"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Get build info"
|
||||
id: build-info
|
||||
run: |
|
||||
build_tag=$(./scripts/create_build_tag_file.sh)
|
||||
echo "build-tag=v${build_tag}" >> $GITHUB_OUTPUT
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv
|
||||
- name: "Build local shiv package"
|
||||
id: build
|
||||
run: |
|
||||
./scripts/build_shiv_package.sh
|
||||
result_code=$?
|
||||
echo "package-file=$(ls ./package/*)" >> $GITHUB_OUTPUT
|
||||
exit $result_code
|
||||
- name: "Stage artifact file"
|
||||
run: |
|
||||
cp ${{ steps.build.outputs.package-file }} ./laconic-so
|
||||
- name: "Create release"
|
||||
uses: softprops/action-gh-release@v1
|
||||
with:
|
||||
tag_name: ${{ steps.build-info.outputs.build-tag }}
|
||||
# On the publish test branch, mark our release as a draft
|
||||
# Hack using endsWith to work around Gitea sometimes sending "publish-test" vs "refs/heads/publish-test"
|
||||
draft: ${{ endsWith(github.ref, 'publish-test') }}  # endsWith(searchString, searchValue): the ref is the string being searched, so both "publish-test" and "refs/heads/publish-test" match
|
||||
files: ./laconic-so
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
name: Deploy Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: '*'
|
||||
push:
|
||||
branches: '*'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run deploy test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run deploy tests"
|
||||
run: ./tests/deploy/run-deploy-test.sh
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
name: Webapp Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: '*'
|
||||
push:
|
||||
branches: '*'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run webapp test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run webapp tests"
|
||||
run: ./tests/webapp-test/run-webapp-test.sh
|
||||
|
|
@ -0,0 +1,29 @@
|
|||
name: Smoke Test
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches: '*'
|
||||
push:
|
||||
branches: '*'
|
||||
|
||||
jobs:
|
||||
test:
|
||||
name: "Run basic test suite"
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: "Clone project repository"
|
||||
uses: actions/checkout@v3
|
||||
- name: "Install Python"
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: '3.8'
|
||||
- name: "Print Python version"
|
||||
run: python3 --version
|
||||
- name: "Install shiv"
|
||||
run: pip install shiv
|
||||
- name: "Generate build version file"
|
||||
run: ./scripts/create_build_tag_file.sh
|
||||
- name: "Build local shiv package"
|
||||
run: ./scripts/build_shiv_package.sh
|
||||
- name: "Run smoke tests"
|
||||
run: ./tests/smoke-test/run-smoke-test.sh
|
||||
|
|
@ -0,0 +1 @@
|
|||
Change this file to trigger running the fixturenet-eth-test CI job
|
||||
|
|
@ -0,0 +1,3 @@
|
|||
Change this file to trigger running the fixturenet-laconicd-test CI job
|
||||
|
||||
trigger
|
||||
|
|
@ -1,4 +1,11 @@
|
|||
.venv/
|
||||
sessions.duckdb
|
||||
sessions.duckdb.wal
|
||||
.idea
|
||||
venv
|
||||
.vscode
|
||||
laconic-so
|
||||
laconic_stack_orchestrator.egg-info
|
||||
__pycache__
|
||||
*~
|
||||
package
|
||||
stack_orchestrator/data/build_tag.txt
|
||||
/build
|
||||
.worktrees
|
||||
|
|
|
|||
|
|
@ -1 +0,0 @@
|
|||
pebbles.db
|
||||
|
|
@ -1,3 +1,3 @@
|
|||
{
|
||||
"prefix": "bar"
|
||||
"prefix": "so"
|
||||
}
|
||||
|
|
@ -1,52 +1,15 @@
|
|||
{"type":"create","timestamp":"2026-03-06T07:57:55.427398426Z","issue_id":"bar-48f","payload":{"description":"Route all validator traffic (gossip, repair, TVU, TPU) through 137.239.194.65 on laconic-was-sw01 in Ashburn. Supersedes old TVU-only shred relay. See docs/ashburn-validator-relay.md for full design.","priority":"1","title":"Ashburn Full Validator Traffic Relay","type":"epic"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:01.589463071Z","issue_id":"bar-a47","payload":{"description":"Create Loopback101 (137.239.194.65/32), VALIDATOR-RELAY ACL + traffic-policy on Et1/1, replacing old SHRED-RELAY. Uses 5-min auto-revert config session. Playbook: playbooks/ashburn-relay-was-sw01.yml","priority":"1","title":"was-sw01: Inbound validator relay config","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:07.292140983Z","issue_id":"bar-0e5","payload":{"description":"Add 137.239.194.65/32 to lo, DNAT rules for ports 8001,9000-9025 to kind node 172.20.0.2. Playbook: playbooks/ashburn-relay-biscayne.yml -t inbound","priority":"1","title":"biscayne: Inbound DNAT rules","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:10.838534858Z","issue_id":"bar-f9b","payload":{"description":"Ping 137.239.194.65 from external host, check DNAT counters on biscayne, verify traffic-policy counters on was-sw01.","priority":"1","title":"Verify inbound relay","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:15.228970622Z","issue_id":"bar-bf4","payload":{"description":"Pre-flight to discover GRE tunnel interface, then apply VALIDATOR-OUTBOUND traffic-policy redirecting src 137.239.194.65 to was-sw01 via backbone. Playbook: playbooks/ashburn-relay-mia-sw01.yml","priority":"1","title":"mia-sw01: Outbound validator redirect","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:19.571640837Z","issue_id":"bar-78d","payload":{"description":"fwmark 100 on validator source ports, SNAT to 137.239.194.65, policy route via doublezero0 table ashburn. Playbook: playbooks/ashburn-relay-biscayne.yml -t outbound","priority":"1","title":"biscayne: Outbound SNAT + policy routing","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:23.377441628Z","issue_id":"bar-f3b","payload":{"description":"Verify traffic-policy counters on both switches, iptables counters on biscayne, validator gossip ContactInfo shows 137.239.194.65, repair peer count increases, slot catchup rate improves. Write memory on both switches.","priority":"1","title":"End-to-end verification","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-06T07:58:27.341320984Z","issue_id":"bar-8a9","payload":{"description":"After stable: remove old SHRED-RELAY policy and ACL from was-sw01, remove old 64.92.84.81:20000 DNAT from biscayne.","priority":"2","title":"Cleanup old SHRED-RELAY","type":"task"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:32.091645662Z","issue_id":"bar-a47","payload":{"new_id":"bar-48f.1"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.091647902Z","issue_id":"bar-48f.1","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:32.274391159Z","issue_id":"bar-0e5","payload":{"new_id":"bar-48f.2"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.274392749Z","issue_id":"bar-48f.2","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:32.468426932Z","issue_id":"bar-f9b","payload":{"new_id":"bar-48f.3"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.468428522Z","issue_id":"bar-48f.3","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:32.657295386Z","issue_id":"bar-bf4","payload":{"new_id":"bar-48f.4"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.657297846Z","issue_id":"bar-48f.4","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:32.864939519Z","issue_id":"bar-78d","payload":{"new_id":"bar-48f.5"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:32.864941739Z","issue_id":"bar-48f.5","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:33.364299485Z","issue_id":"bar-f3b","payload":{"new_id":"bar-48f.6"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:33.364301305Z","issue_id":"bar-48f.6","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"rename","timestamp":"2026-03-06T07:58:33.639638369Z","issue_id":"bar-8a9","payload":{"new_id":"bar-48f.7"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:33.639640069Z","issue_id":"bar-48f.7","payload":{"dep_type":"parent-child","depends_on":"bar-48f"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:39.486721446Z","issue_id":"bar-48f.2","payload":{"dep_type":"blocks","depends_on":"bar-48f.1"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:39.911749641Z","issue_id":"bar-48f.3","payload":{"dep_type":"blocks","depends_on":"bar-48f.2"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:40.398532353Z","issue_id":"bar-48f.4","payload":{"dep_type":"blocks","depends_on":"bar-48f.3"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:40.762666046Z","issue_id":"bar-48f.5","payload":{"dep_type":"blocks","depends_on":"bar-48f.4"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:41.173027726Z","issue_id":"bar-48f.6","payload":{"dep_type":"blocks","depends_on":"bar-48f.5"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-06T07:58:41.467313496Z","issue_id":"bar-48f.7","payload":{"dep_type":"blocks","depends_on":"bar-48f.6"}}
|
||||
{"type":"update","timestamp":"2026-03-06T18:32:00.041874266Z","issue_id":"bar-48f.1","payload":{"description":"Run ansible playbook (pane A) to apply config session with 5-min auto-revert. Review output. In pane B, SSH to install@137.239.200.198 and manually verify (show session-config diffs, show traffic-policy counters). Type 'configure session validator-relay commit' and 'write memory' when satisfied. Playbook: playbooks/ashburn-relay-was-sw01.yml (do NOT use -e commit=true; commit is manual via SSH)."}}
|
||||
{"type":"update","timestamp":"2026-03-06T18:32:05.861153312Z","issue_id":"bar-48f.4","payload":{"description":"Run ansible playbook pre-flight (pane A) to discover GRE tunnel interface. Then run with -e apply=true -e tunnel_interface=TunnelX for 5-min auto-revert. In pane B, SSH to install@209.42.167.133 and manually verify. Type 'configure session validator-outbound commit' and 'write memory' when satisfied. Playbook: playbooks/ashburn-relay-mia-sw01.yml (do NOT use -e commit=true; commit is manual via SSH)."}}
|
||||
{"type":"status_update","timestamp":"2026-03-06T18:35:35.320628231Z","issue_id":"bar-48f","payload":{"status":"in_progress"}}
|
||||
{"type":"status_update","timestamp":"2026-03-06T18:35:35.717040604Z","issue_id":"bar-48f.1","payload":{"status":"in_progress"}}
|
||||
{"type":"close","timestamp":"2026-03-06T20:12:45.087966093Z","issue_id":"bar-48f.1","payload":{}}
|
||||
{"type":"status_update","timestamp":"2026-03-06T20:16:34.00466057Z","issue_id":"bar-48f.2","payload":{"status":"in_progress"}}
|
||||
{"type":"close","timestamp":"2026-03-06T20:17:18.681131396Z","issue_id":"bar-48f.2","payload":{}}
|
||||
{"type":"status_update","timestamp":"2026-03-06T20:17:19.159927405Z","issue_id":"bar-48f.3","payload":{"status":"in_progress"}}
|
||||
{"type":"close","timestamp":"2026-03-06T20:18:42.42112937Z","issue_id":"bar-48f.3","payload":{}}
|
||||
{"type":"status_update","timestamp":"2026-03-06T20:18:42.930237032Z","issue_id":"bar-48f.4","payload":{"status":"in_progress"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:58:52.122307149Z","issue_id":"bar-02e","payload":{"description":"/srv/solana is a directory on the ZFS dataset biscayne/DATA/srv (mounted at /srv\nwith overlay=on). The fstab zvol mount at /srv/solana was shadowed by ZFS.\n\nFixed 2026-03-08: removed /srv/solana fstab entries, canonical data path is now\n/srv/kind/solana. All playbooks updated. fstab clean. Mounts verified.","priority":"1","title":"zvol mount: /srv/solana resolves to ZFS dataset, not zvol","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:58:52.557582445Z","issue_id":"bar-41a","payload":{"description":"laconic-so creates configmap resources for telegraf but does not generate\nvolumeMounts in the pod spec. The telegraf container crashes because\n/etc/telegraf and /scripts are empty. Manual configmap creation works but\nthe volume mounts are still missing. Root cause is in laconic-so's stack\nmigration — configmap volume mount generation is incomplete.","priority":"1","title":"telegraf volume mounts missing from pod spec","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:58:53.065888933Z","issue_id":"bar-a3b","payload":{"description":"Validator exits shortly after starting. Log shows UDP port reachability checks\nand TCP port checks failing. Needs full log analysis from kind node path\n(/mnt/validator-log/validator.log). May be related to networking/firewall\nconfiguration or the shred relay setup.","priority":"0","title":"agave-validator crash after ~57 seconds","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:58:53.589221516Z","issue_id":"bar-b04","payload":{"description":"Once laconic-so deployment prepare lands, update biscayne-redeploy.yml to use\nprepare instead of start+scale-to-0 workaround. The deploy tag section should\ncall deployment prepare, and scale-up should call deployment start\n--skip-cluster-management.","priority":"2","title":"update biscayne-redeploy to use deployment prepare","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:58:54.238136989Z","issue_id":"bar-b41","payload":{"description":"Automate the leapfrog recovery strategy documented in CLAUDE.md. When the\nvalidator is stuck in a repair-dependent gap, download a fresh snapshot past\nthe incomplete zone while preserving the existing ledger (which has turbine\nshreds at the tip). Needs: shred completeness check, snapshot slot targeting,\nselective wipe (accounts+snapshots only, keep ledger).","priority":"2","title":"snapshot leapfrog recovery playbook","type":"feature"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:58:54.756609299Z","issue_id":"bar-0b4","payload":{"description":"biscayne-prepare-agave.yml unconditionally imports ashburn-relay-biscayne.yml\nat the end. This couples filesystem preparation to relay setup. The relay\nplaybook fails if the kind node isn't running (ping to 172.20.0.2 fails).\nShould be a separate playbook invocation, not an import.","priority":"3","title":"biscayne-prepare-agave imports ashburn-relay-biscayne unconditionally","type":"bug"}}
|
||||
{"type":"close","timestamp":"2026-03-08T06:59:00.140156099Z","issue_id":"bar-02e","payload":{}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:07.190617713Z","issue_id":"bar-2c9","payload":{"description":"laconic-so build-containers --include filter does exact string match via\ninclude_exclude_check(). Container names use slash (laconicnetwork/agave),\nnot dash. Using --include laconicnetwork-agave silently skips the build\nand reports success.\n\nFixed in biscayne-sync-tools.yml (commit ceea8f0) but the underlying\nlaconic-so behavior of silently skipping with no warning is a bug.","priority":"2","title":"build-containers --include uses slash not dash in container names","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:12.506655809Z","issue_id":"bar-6cb","payload":{"description":"When laconic-so deployment restart deletes the namespace, PVCs are\ncascade-deleted but PVs (cluster-scoped) survive in Released state with\nstale claimRefs pointing to the old PVC UIDs. New PVCs created by the\nrestarted deployment can't bind because the PVs still reference the\ndeleted PVCs.\n\nWorkaround: patch Released PVs to clear claimRef after restart.\nAdded to biscayne-restart.yml. Root cause is in laconic-so — it should\nclear stale claimRefs as part of the restart flow.\n\nRelated: so-933 (namespace termination race).","priority":"1","title":"PV claimRefs go stale after deployment restart","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:15.941416301Z","issue_id":"bar-fec","payload":{"description":"monitoring-grafana-data volume is defined in spec.yml but laconic-so's\nget_pvcs() does not generate a PVC for it. The PV is created but no\nmatching PVC exists, so the grafana container can't mount its data volume.\n\nWorkaround: manually kubectl apply the PVC after each deployment restart.\nRoot cause is in stack-orchestrator deploy_k8s.py get_pvcs().","priority":"2","title":"grafana PVC not generated by get_pvcs()","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:22.853965263Z","issue_id":"bar-822","payload":{"description":"Rebuilding a container image on the Docker host does NOT update the image\ninside the kind node. With imagePullPolicy: IfNotPresent (the default for\n:local tags), kind uses its cached copy. Must run:\n\n kind load docker-image laconicnetwork/agave:local \\\n --name laconic-70ce4c4b47e23b85\n\nafter every rebuild. This step is not in any playbook or laconic-so flow.\nShould be added to biscayne-sync-tools.yml build-container tag or to\nlaconic-so build-containers itself.","priority":"2","title":"kind load docker-image required after container rebuild","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:28.585915055Z","issue_id":"bar-571","payload":{"description":"Full snapshot slots differ per validator depending on when each started.\nThe entrypoint's incremental download loop assumes it can find an\nincremental keyed to any full snapshot's base slot, but no other validator\nmay have produced a full at that exact slot.\n\nThis causes the incremental download to retry forever when the local\nfull snapshot has a base slot that no network peer has incrementals for.\n\nDocumented for awareness. The entrypoint's infinite retry is intentional\n(user decision) — eventually a matching incremental will appear or the\nentrypoint falls through to download a fresh full+incremental pair.","priority":"3","title":"snapshot base slots are not consensus-aligned across validators","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:32.262889286Z","issue_id":"bar-2d9","payload":{"description":"When spec.yml has explicit values for env vars that also have defaults in\nthe compose file, the spec.yml values win. Changing compose file defaults\nhas no effect unless the spec.yml override is also removed.\n\nThis is by design (spec.yml is deployment-specific config) but the\ninteraction is non-obvious. Bit us when changing snapshot settings in\ncompose but spec.yml still had the old values.\n\nNot a code bug — more a documentation/workflow issue. Operators must\ncheck both compose defaults and spec.yml overrides.","priority":"3","title":"spec.yml overrides compose defaults silently","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-10T08:05:36.212405156Z","issue_id":"bar-31a","payload":{"description":"laconic-so deployment restart sleeps only 5s between down and up. If the\nnamespace is still terminating when 'up' runs, k8s returns 403 Forbidden\ncreating configmaps in the new namespace.\n\nCross-ref: so-933 in the stack-orchestrator pebbles project.\n\nWorkaround: retry the restart or wait manually. The restart playbook\n(biscayne-restart.yml) handles this by scaling to 0 first, waiting for\npod termination, then calling laconic-so restart.","priority":"1","title":"deployment restart namespace termination race","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:56:07.080584539Z","issue_id":"so-076","payload":{"description":"Currently laconic-so maps one stack to one deployment to one pod. All containers\nin a stack's compose files become containers in a single k8s pod. This means:\n\n- Can't upgrade doublezero without restarting agave-validator\n- Can't restart monitoring without disrupting the validator\n- Can't independently scale or lifecycle-manage components\n\nThe fix is stack composition. A meta-stack (e.g. biscayne-stack) composes\nsub-stacks (agave, doublezero, agave-monitoring), each becoming its own\nk8s Deployment with independent lifecycle.","priority":"2","title":"Stack composition: deploy multiple stacks into one kind cluster","type":"epic"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:56:07.551986919Z","issue_id":"so-ab0","payload":{"description":"Add laconic-so deployment prepare that creates cluster infrastructure without pods. Already implemented, needs review.","priority":"2","title":"deployment prepare command","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:56:07.884418759Z","issue_id":"so-04f","payload":{"description":"deployment stop on ANY deployment deletes the shared kind cluster. Should only delete its own namespace.","priority":"2","title":"deployment stop should not destroy shared cluster","type":"bug"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:56:08.253520249Z","issue_id":"so-370","payload":{"description":"Allow stack.yml to reference sub-stacks. Each sub-stack becomes its own k8s Deployment sharing namespace and PVs.","priority":"2","title":"Add stacks: field to stack.yml for composition","type":"task"}}
|
||||
{"type":"create","timestamp":"2026-03-08T06:56:08.646764337Z","issue_id":"so-f7c","payload":{"description":"Create three independent stacks from the monolithic agave-stack. Each gets its own compose file and independent lifecycle.","priority":"2","title":"Split agave-stack into agave + doublezero + monitoring","type":"task"}}
|
||||
{"type":"rename","timestamp":"2026-03-08T06:56:14.499990161Z","issue_id":"so-ab0","payload":{"new_id":"so-076.1"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-08T06:56:14.499992031Z","issue_id":"so-076.1","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
|
||||
{"type":"rename","timestamp":"2026-03-08T06:56:14.786407752Z","issue_id":"so-04f","payload":{"new_id":"so-076.2"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-08T06:56:14.786409842Z","issue_id":"so-076.2","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
|
||||
{"type":"rename","timestamp":"2026-03-08T06:56:15.058959714Z","issue_id":"so-370","payload":{"new_id":"so-076.3"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-08T06:56:15.058961364Z","issue_id":"so-076.3","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
|
||||
{"type":"rename","timestamp":"2026-03-08T06:56:15.410080785Z","issue_id":"so-f7c","payload":{"new_id":"so-076.4"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-08T06:56:15.410082305Z","issue_id":"so-076.4","payload":{"dep_type":"parent-child","depends_on":"so-076"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-08T06:56:16.313585082Z","issue_id":"so-076.3","payload":{"dep_type":"blocks","depends_on":"so-076.2"}}
|
||||
{"type":"dep_add","timestamp":"2026-03-08T06:56:16.567629422Z","issue_id":"so-076.4","payload":{"dep_type":"blocks","depends_on":"so-076.3"}}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,34 @@
|
|||
repos:
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: v5.0.0
|
||||
hooks:
|
||||
- id: trailing-whitespace
|
||||
- id: end-of-file-fixer
|
||||
- id: check-yaml
|
||||
args: ['--allow-multiple-documents']
|
||||
- id: check-json
|
||||
- id: check-merge-conflict
|
||||
- id: check-added-large-files
|
||||
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 23.12.1
|
||||
hooks:
|
||||
- id: black
|
||||
language_version: python3
|
||||
|
||||
- repo: https://github.com/PyCQA/flake8
|
||||
rev: 7.1.1
|
||||
hooks:
|
||||
- id: flake8
|
||||
args: ['--max-line-length=88', '--extend-ignore=E203,W503,E402']
|
||||
|
||||
- repo: https://github.com/RobertCraigie/pyright-python
|
||||
rev: v1.1.345
|
||||
hooks:
|
||||
- id: pyright
|
||||
|
||||
- repo: https://github.com/adrienverge/yamllint
|
||||
rev: v1.35.1
|
||||
hooks:
|
||||
- id: yamllint
|
||||
args: [-d, relaxed]
|
||||
|
|
@ -0,0 +1,151 @@
|
|||
# Plan: Make Stack-Orchestrator AI-Friendly
|
||||
|
||||
## Goal
|
||||
|
||||
Make the stack-orchestrator repository easier for AI tools (Claude Code, Cursor, Copilot) to understand and use for generating stacks, including adding a `create-stack` command.
|
||||
|
||||
---
|
||||
|
||||
## Part 1: Documentation & Context Files
|
||||
|
||||
### 1.1 Add CLAUDE.md
|
||||
|
||||
Create a root-level context file for AI assistants.
|
||||
|
||||
**File:** `CLAUDE.md`
|
||||
|
||||
Contents:
|
||||
- Project overview (what stack-orchestrator does)
|
||||
- Stack creation workflow (step-by-step)
|
||||
- File naming conventions
|
||||
- Required vs optional fields in stack.yml
|
||||
- Common patterns and anti-patterns
|
||||
- Links to example stacks (simple, medium, complex)
|
||||
|
||||
### 1.2 Add JSON Schema for stack.yml
|
||||
|
||||
Create formal validation schema.
|
||||
|
||||
**File:** `schemas/stack-schema.json`
|
||||
|
||||
Benefits:
|
||||
- AI tools can validate generated stacks
|
||||
- IDEs provide autocomplete
|
||||
- CI can catch errors early
|
||||
|
||||
### 1.3 Add Template Stack with Comments
|
||||
|
||||
Create an annotated template for reference.
|
||||
|
||||
**File:** `stack_orchestrator/data/stacks/_template/stack.yml`
|
||||
|
||||
```yaml
|
||||
# Stack definition template - copy this directory to create a new stack
|
||||
version: "1.2" # Required: 1.0, 1.1, or 1.2
|
||||
name: my-stack # Required: lowercase, hyphens only
|
||||
description: "Human-readable description" # Optional
|
||||
repos: # Git repositories to clone
|
||||
- github.com/org/repo
|
||||
containers: # Container images to build (must have matching container-build/)
|
||||
- cerc/my-container
|
||||
pods: # Deployment units (must have matching docker-compose-{pod}.yml)
|
||||
- my-pod
|
||||
```
|
||||
|
||||
### 1.4 Document Validation Rules
|
||||
|
||||
Create explicit documentation of constraints currently scattered in code.
|
||||
|
||||
**File:** `docs/stack-format.md`
|
||||
|
||||
Contents:
|
||||
- Container names must start with `cerc/`
|
||||
- Pod names must match compose file: `docker-compose-{pod}.yml`
|
||||
- Repository format: `host/org/repo[@ref]`
|
||||
- Stack directory name should match `name` field
|
||||
- Version field options and differences
|
||||
|
||||
---
|
||||
|
||||
## Part 2: Add `create-stack` Command
|
||||
|
||||
### 2.1 Command Overview
|
||||
|
||||
```bash
|
||||
laconic-so create-stack --repo github.com/org/my-app [--name my-app] [--type webapp]
|
||||
```
|
||||
|
||||
**Behavior:**
|
||||
1. Parse repo URL to extract app name (if --name not provided)
|
||||
2. Create `stacks/{name}/stack.yml`
|
||||
3. Create `container-build/cerc-{name}/Dockerfile` and `build.sh`
|
||||
4. Create `compose/docker-compose-{name}.yml`
|
||||
5. Update list files (repository-list.txt, container-image-list.txt, pod-list.txt)
|
||||
|
||||
### 2.2 Files to Create
|
||||
|
||||
| File | Purpose |
|
||||
|------|---------|
|
||||
| `stack_orchestrator/create/__init__.py` | Package init |
|
||||
| `stack_orchestrator/create/create_stack.py` | Command implementation |
|
||||
|
||||
### 2.3 Files to Modify
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `stack_orchestrator/main.py` | Add import and `cli.add_command()` |
|
||||
|
||||
### 2.4 Command Options
|
||||
|
||||
| Option | Required | Description |
|
||||
|--------|----------|-------------|
|
||||
| `--repo` | Yes | Git repository URL (e.g., github.com/org/repo) |
|
||||
| `--name` | No | Stack name (defaults to repo name) |
|
||||
| `--type` | No | Template type: webapp, service, empty (default: webapp) |
|
||||
| `--force` | No | Overwrite existing files |
|
||||
|
||||
### 2.5 Template Types
|
||||
|
||||
| Type | Base Image | Port | Use Case |
|
||||
|------|------------|------|----------|
|
||||
| webapp | node:20-bullseye-slim | 3000 | React/Vue/Next.js apps |
|
||||
| service | python:3.11-slim | 8080 | Python backend services |
|
||||
| empty | none | none | Custom from scratch |
|
||||
|
||||
---
|
||||
|
||||
## Part 3: Implementation Summary
|
||||
|
||||
### New Files (6)
|
||||
|
||||
1. `CLAUDE.md` - AI assistant context
|
||||
2. `schemas/stack-schema.json` - Validation schema
|
||||
3. `stack_orchestrator/data/stacks/_template/stack.yml` - Annotated template
|
||||
4. `docs/stack-format.md` - Stack format documentation
|
||||
5. `stack_orchestrator/create/__init__.py` - Package init
|
||||
6. `stack_orchestrator/create/create_stack.py` - Command implementation
|
||||
|
||||
### Modified Files (1)
|
||||
|
||||
1. `stack_orchestrator/main.py` - Register create-stack command
|
||||
|
||||
---
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
# 1. Command appears in help
|
||||
laconic-so --help | grep create-stack
|
||||
|
||||
# 2. Dry run works
|
||||
laconic-so --dry-run create-stack --repo github.com/org/test-app
|
||||
|
||||
# 3. Creates all expected files
|
||||
laconic-so create-stack --repo github.com/org/test-app
|
||||
ls stack_orchestrator/data/stacks/test-app/
|
||||
ls stack_orchestrator/data/container-build/cerc-test-app/
|
||||
ls stack_orchestrator/data/compose/docker-compose-test-app.yml
|
||||
|
||||
# 4. Build works with generated stack
|
||||
laconic-so --stack test-app build-containers
|
||||
```
|
||||
278
CLAUDE.md
278
CLAUDE.md
|
|
@ -1,221 +1,121 @@
|
|||
# Biscayne Agave Runbook
|
||||
# CLAUDE.md
|
||||
|
||||
## Deployment Layers
|
||||
This file provides guidance to Claude Code when working with the stack-orchestrator project.
|
||||
|
||||
Operations on biscayne follow a strict layering. Each layer assumes the layers
|
||||
below it are correct. Playbooks belong to exactly one layer.
|
||||
## Some rules to follow
|
||||
NEVER speculate about the cause of something
|
||||
NEVER assume your hypotheses are true without evidence
|
||||
|
||||
| Layer | What | Playbooks |
|
||||
|-------|------|-----------|
|
||||
| 1. Base system | Docker, ZFS, packages | Out of scope (manual/PXE) |
|
||||
| 2. Prepare kind | `/srv/kind` exists (ZFS dataset) | None needed (ZFS handles it) |
|
||||
| 3. Install kind | `laconic-so deployment start` creates kind cluster, mounts `/srv/kind` → `/mnt` in kind node | `biscayne-redeploy.yml` (deploy tags) |
|
||||
| 4. Prepare agave | Host storage for agave: ZFS dataset, ramdisk | `biscayne-prepare-agave.yml` |
|
||||
| 5. Deploy agave | Deploy agave-stack into kind, snapshot download, scale up | `biscayne-redeploy.yml` (snapshot/verify tags), `biscayne-recover.yml` |
|
||||
ALWAYS clearly state when something is a hypothesis
|
||||
ALWAYS use evidence from the systems you're interacting with to support your claims and hypotheses
|
||||
ALWAYS run `pre-commit run --all-files` before committing changes
|
||||
|
||||
**Layer 4 invariants** (asserted by `biscayne-prepare-agave.yml`):
|
||||
- `/srv/kind/solana` is a ZFS dataset (`biscayne/DATA/srv/kind/solana`), child of the `/srv/kind` dataset
|
||||
- `/srv/kind/solana/ramdisk` is tmpfs (1TB) — accounts must be in RAM
|
||||
- `/srv/solana` is NOT the data path — it's a directory on the parent ZFS dataset. All data paths use `/srv/kind/solana`
|
||||
## Key Principles
|
||||
|
||||
These invariants are checked at runtime and persisted to fstab/systemd so they
|
||||
survive reboot.
|
||||
### Development Guidelines
|
||||
- **Single responsibility** - Each component has one clear purpose
|
||||
- **Fail fast** - Let errors propagate, don't hide failures
|
||||
- **DRY/KISS** - Minimize duplication and complexity
|
||||
|
||||
**Cross-cutting**: `health-check.yml` (read-only diagnostics), `biscayne-stop.yml`
|
||||
(layer 5 — graceful shutdown), `fix-pv-mounts.yml` (layer 5 — PV repair).
|
||||
## Development Philosophy: Conversational Literate Programming
|
||||
|
||||
## Cluster Operations
|
||||
### Approach
|
||||
This project follows principles inspired by literate programming, where development happens through explanatory conversation rather than code-first implementation.
|
||||
|
||||
### Shutdown Order
|
||||
### Core Principles
|
||||
- **Documentation-First**: All changes begin with discussion of intent and reasoning
|
||||
- **Narrative-Driven**: Complex systems are explained through conversational exploration
|
||||
- **Justification Required**: Every coding task must have a corresponding TODO.md item explaining the "why"
|
||||
- **Iterative Understanding**: Architecture and implementation evolve through dialogue
|
||||
|
||||
The agave validator runs inside a kind-based k8s cluster managed by `laconic-so`.
|
||||
The kind node is a Docker container. **Never restart or kill the kind node container
|
||||
while the validator is running.** Use `agave-validator exit --force` via the admin
|
||||
RPC socket for graceful shutdown, or scale the deployment to 0 and wait.
|
||||
### Working Method
|
||||
1. **Explore and Understand**: Read existing code to understand current state
|
||||
2. **Discuss Architecture**: Workshop complex design decisions through conversation
|
||||
3. **Document Intent**: Update TODO.md with clear justification before coding
|
||||
4. **Explain Changes**: Each modification includes reasoning and context
|
||||
5. **Maintain Narrative**: Conversations serve as living documentation of design evolution
|
||||
|
||||
Correct shutdown sequence:
|
||||
### Implementation Guidelines
|
||||
- Treat conversations as primary documentation
|
||||
- Explain architectural decisions before implementing
|
||||
- Use TODO.md as the "literate document" that justifies all work
|
||||
- Maintain clear narrative threads across sessions
|
||||
- Workshop complex ideas before coding
|
||||
|
||||
1. Scale the deployment to 0 and wait for the pod to terminate:
|
||||
```
|
||||
kubectl scale deployment laconic-70ce4c4b47e23b85-deployment \
|
||||
-n laconic-laconic-70ce4c4b47e23b85 --replicas=0
|
||||
kubectl wait --for=delete pod -l app=laconic-70ce4c4b47e23b85-deployment \
|
||||
-n laconic-laconic-70ce4c4b47e23b85 --timeout=120s
|
||||
```
|
||||
2. Only then restart the kind node if needed:
|
||||
```
|
||||
docker restart laconic-70ce4c4b47e23b85-control-plane
|
||||
```
|
||||
3. Scale back up:
|
||||
```
|
||||
kubectl scale deployment laconic-70ce4c4b47e23b85-deployment \
|
||||
-n laconic-laconic-70ce4c4b47e23b85 --replicas=1
|
||||
```
|
||||
This approach treats the human-AI collaboration as a form of **conversational literate programming** where understanding emerges through dialogue before code implementation.
|
||||
|
||||
### Ramdisk
|
||||
## External Stacks Preferred
|
||||
|
||||
The accounts directory must be in RAM for performance. tmpfs is used instead of
|
||||
`/dev/ram0` — simpler (no format-on-boot service needed), resizable on the fly
|
||||
with `mount -o remount,size=<new>`, and what most Solana operators use.
|
||||
When creating new stacks for any reason, **use the external stack pattern** rather than adding stacks directly to this repository.
|
||||
|
||||
**Boot ordering**: `/srv/kind/solana` is a ZFS dataset mounted automatically by
|
||||
`zfs-mount.service`. The tmpfs ramdisk fstab entry uses
|
||||
`x-systemd.requires=zfs-mount.service` to ensure the dataset is mounted first.
|
||||
**No manual intervention after reboot.**
|
||||
External stacks follow this structure:
|
||||
|
||||
**Mount propagation**: The kind node bind-mounts `/srv/kind` → `/mnt` at container
|
||||
start. laconic-so sets `propagation: HostToContainer` on all kind extraMounts
|
||||
(commit `a11d40f2` in stack-orchestrator), so host submounts propagate into the
|
||||
kind node automatically. A kind restart is required to pick up the new config
|
||||
after updating laconic-so.
|
||||
|
||||
### KUBECONFIG
|
||||
|
||||
kubectl must be told where the kubeconfig is when running as root or via ansible:
|
||||
```
|
||||
KUBECONFIG=/home/rix/.kube/config kubectl ...
|
||||
my-stack/
|
||||
└── stack-orchestrator/
|
||||
├── stacks/
|
||||
│ └── my-stack/
|
||||
│ ├── stack.yml
|
||||
│ └── README.md
|
||||
├── compose/
|
||||
│ └── docker-compose-my-stack.yml
|
||||
└── config/
|
||||
└── my-stack/
|
||||
└── (config files)
|
||||
```
|
||||
|
||||
The ansible playbooks set `environment: KUBECONFIG: /home/rix/.kube/config`.
|
||||
### Usage
|
||||
|
||||
### SSH Agent
|
||||
```bash
|
||||
# Fetch external stack
|
||||
laconic-so fetch-stack github.com/org/my-stack
|
||||
|
||||
SSH to biscayne goes through a ProxyCommand jump host (abernathy.ch2.vaasl.io).
|
||||
The SSH agent socket rotates when the user reconnects. Find the current one:
|
||||
```
|
||||
ls -t /tmp/ssh-*/agent.* | head -1
|
||||
```
|
||||
Then export it:
|
||||
```
|
||||
export SSH_AUTH_SOCK=/tmp/ssh-XXXX/agent.NNNN
|
||||
# Use external stack
|
||||
STACK_PATH=~/cerc/my-stack/stack-orchestrator/stacks/my-stack
|
||||
laconic-so --stack $STACK_PATH deploy init --output spec.yml
|
||||
laconic-so --stack $STACK_PATH deploy create --spec-file spec.yml --deployment-dir deployment
|
||||
laconic-so deployment --dir deployment start
|
||||
```
|
||||
|
||||
### io_uring/ZFS Deadlock — Historical Note
|
||||
### Examples
|
||||
|
||||
Agave uses io_uring for async I/O. Killing agave ungracefully while it has
|
||||
outstanding I/O against ZFS can produce unkillable D-state kernel threads
|
||||
(`io_wq_put_and_exit` blocked on ZFS transactions), deadlocking the container.
|
||||
- `zenith-karma-stack` - Karma watcher deployment
|
||||
- `urbit-stack` - Fake Urbit ship for testing
|
||||
- `zenith-desk-stack` - Desk deployment stack
|
||||
|
||||
**Prevention**: Use graceful shutdown (`agave-validator exit --force` via admin
|
||||
RPC, or scale to 0 and wait). The `biscayne-stop.yml` playbook enforces this.
|
||||
With graceful shutdown, io_uring contexts are closed cleanly and ZFS storage
|
||||
is safe to use directly (no zvol/XFS workaround needed).
|
||||
## Architecture: k8s-kind Deployments
|
||||
|
||||
**ZFS fix**: The underlying io_uring bug is fixed in ZFS 2.2.8+ (PR #17298).
|
||||
Biscayne currently runs ZFS 2.2.2. Upgrading ZFS will eliminate the deadlock
|
||||
risk entirely, even for ungraceful shutdowns.
|
||||
### One Cluster Per Host
|
||||
One Kind cluster per host by design. Never request or expect separate clusters.
|
||||
|
||||
### laconic-so Architecture
|
||||
- `create_cluster()` in `helpers.py` reuses any existing cluster
|
||||
- `cluster-id` in deployment.yml is an identifier, not a cluster request
|
||||
- All deployments share: ingress controller, etcd, certificates
|
||||
|
||||
`laconic-so` manages kind clusters atomically — `deployment start` creates the
|
||||
kind cluster, namespace, PVs, PVCs, and deployment in one shot. There is no way
|
||||
to create the cluster without deploying the pod.
|
||||
### Stack Resolution
|
||||
- External stacks detected via `Path(stack).exists()` in `util.py`
|
||||
- Config/compose resolution: external path first, then internal fallback
|
||||
- External path structure: `stack_orchestrator/data/stacks/<name>/stack.yml`
|
||||
|
||||
Key code paths in stack-orchestrator:
|
||||
- `deploy_k8s.py:up()` — creates everything atomically
|
||||
- `cluster_info.py:get_pvs()` — translates host paths using `kind-mount-root`
|
||||
- `helpers_k8s.py:get_kind_pv_bind_mount_path()` — strips `kind-mount-root`
|
||||
prefix and prepends `/mnt/`
|
||||
- `helpers_k8s.py:_generate_kind_mounts()` — when `kind-mount-root` is set,
|
||||
emits a single `/srv/kind` → `/mnt` mount instead of individual mounts
|
||||
### Secret Generation Implementation
|
||||
- `GENERATE_TOKEN_PATTERN` in `deployment_create.py` matches `$generate:type:length$`
|
||||
- `_generate_and_store_secrets()` creates K8s Secret
|
||||
- `cluster_info.py` adds `envFrom` with `secretRef` to containers
|
||||
- Non-secret config written to `config.env`
|
||||
|
||||
The `kind-mount-root: /srv/kind` setting in `spec.yml` means all data volumes
|
||||
whose host paths start with `/srv/kind` get translated to `/mnt/...` inside the
|
||||
kind node via a single bind mount.
|
||||
### Repository Cloning
|
||||
`setup-repositories --git-ssh` clones repos defined in stack.yml's `repos:` field. Requires SSH agent.
|
||||
|
||||
### Key Identifiers
|
||||
### Key Files (for codebase navigation)
|
||||
- `repos/setup_repositories.py`: `setup-repositories` command (git clone)
|
||||
- `deployment_create.py`: `deploy create` command, secret generation
|
||||
- `deployment.py`: `deployment start/stop/restart` commands
|
||||
- `deploy_k8s.py`: K8s deployer, cluster management calls
|
||||
- `helpers.py`: `create_cluster()`, etcd cleanup, kind operations
|
||||
- `cluster_info.py`: K8s resource generation (Deployment, Service, Ingress)
|
||||
|
||||
- Kind cluster: `laconic-70ce4c4b47e23b85`
|
||||
- Namespace: `laconic-laconic-70ce4c4b47e23b85`
|
||||
- Deployment: `laconic-70ce4c4b47e23b85-deployment`
|
||||
- Kind node container: `laconic-70ce4c4b47e23b85-control-plane`
|
||||
- Deployment dir: `/srv/deployments/agave`
|
||||
- Snapshot dir: `/srv/kind/solana/snapshots` (ZFS dataset, visible to kind at `/mnt/validator-snapshots`)
|
||||
- Ledger dir: `/srv/kind/solana/ledger` (ZFS dataset, visible to kind at `/mnt/validator-ledger`)
|
||||
- Accounts dir: `/srv/kind/solana/ramdisk/accounts` (tmpfs ramdisk, visible to kind at `/mnt/validator-accounts`)
|
||||
- Log dir: `/srv/kind/solana/log` (ZFS dataset, visible to kind at `/mnt/validator-log`)
|
||||
- **WARNING**: `/srv/solana` is a different ZFS dataset directory. All data paths use `/srv/kind/solana`.
|
||||
- Host bind mount root: `/srv/kind` → kind node `/mnt`
|
||||
- laconic-so: `/home/rix/.local/bin/laconic-so` (editable install)
|
||||
## Insights and Observations
|
||||
|
||||
### PV Mount Paths (inside kind node)
|
||||
|
||||
| PV Name | hostPath |
|
||||
|----------------------|-------------------------------|
|
||||
| validator-snapshots | /mnt/validator-snapshots |
|
||||
| validator-ledger | /mnt/validator-ledger |
|
||||
| validator-accounts | /mnt/validator-accounts |
|
||||
| validator-log | /mnt/validator-log |
|
||||
|
||||
### Snapshot Freshness
|
||||
|
||||
If the snapshot is more than **20,000 slots behind** the current mainnet tip, it is
|
||||
too old. Stop the validator, download a fresh snapshot, and restart. Do NOT let it
|
||||
try to catch up from an old snapshot — it will take too long and may never converge.
|
||||
|
||||
Check with:
|
||||
```
|
||||
# Snapshot slot (from filename)
|
||||
ls /srv/kind/solana/snapshots/snapshot-*.tar.*
|
||||
|
||||
# Current mainnet slot
|
||||
curl -s -X POST -H "Content-Type: application/json" \
|
||||
-d '{"jsonrpc":"2.0","id":1,"method":"getSlot","params":[{"commitment":"finalized"}]}' \
|
||||
https://api.mainnet-beta.solana.com
|
||||
```
|
||||
|
||||
### Snapshot Leapfrog Recovery
|
||||
|
||||
When the validator is stuck in a repair-dependent gap (incomplete shreds from a
|
||||
relay outage or insufficient turbine coverage), "grinding through" doesn't work.
|
||||
At 0.4 slots/sec replay through incomplete blocks vs 2.5 slots/sec chain
|
||||
production, the gap widens faster than replay can close it.
|
||||
|
||||
**Strategy**: Download a fresh snapshot whose slot lands *past* the incomplete zone,
|
||||
into the range where turbine+relay shreds are accumulating in the blockstore.
|
||||
**Keep the existing ledger** — it has those shreds. The validator replays from
|
||||
local blockstore data instead of waiting on repair.
|
||||
|
||||
**Steps**:
|
||||
1. Let the validator run — turbine+relay accumulate shreds at the tip
|
||||
2. Monitor shred completeness at the tip:
|
||||
`scripts/check-shred-completeness.sh 500`
|
||||
3. When there's a contiguous run of complete blocks (>100 slots), note the
|
||||
starting slot of that run
|
||||
4. Scale to 0, wipe accounts (ramdisk), wipe old snapshots
|
||||
5. **Do NOT wipe ledger** — it has the turbine shreds
|
||||
6. Download a fresh snapshot (its slot should be within the complete run)
|
||||
7. Scale to 1 — validator replays from local blockstore at 3-5 slots/sec
|
||||
|
||||
**Why this works**: Turbine delivers ~60% of shreds in real-time. Repair fills
|
||||
the rest for recent slots quickly (peers prioritize recent data). The only
|
||||
problem is repair for *old* slots (minutes/hours behind) which peers deprioritize.
|
||||
By snapshotting past the gap, we skip the old-slot repair bottleneck entirely.
|
||||
|
||||
### Shred Relay (Ashburn)
|
||||
|
||||
The TVU shred relay from laconic-was-sw01 provides ~4,000 additional shreds/sec.
|
||||
Without it, turbine alone delivers ~60% of blocks. With it, completeness improves
|
||||
but still requires repair for full coverage.
|
||||
|
||||
**Current state**: Old pipeline (monitor session + socat + shred-unwrap.py).
|
||||
The traffic-policy redirect was never committed (auto-revert after 5 min timer).
|
||||
See `docs/tvu-shred-relay.md` for the traffic-policy config that needs to be
|
||||
properly applied.
|
||||
|
||||
**Boot dependency**: `shred-unwrap.py` must be running on biscayne for the old
|
||||
pipeline to work. It is NOT persistent across reboots. The iptables DNAT rule
|
||||
for the new pipeline IS persistent (iptables-persistent installed).
|
||||
|
||||
### Redeploy Flow
|
||||
|
||||
See `playbooks/biscayne-redeploy.yml`. The scale-to-0 pattern is required because
|
||||
`laconic-so` creates the cluster and deploys the pod atomically:
|
||||
|
||||
1. Delete namespace (teardown)
|
||||
2. Optionally wipe data
|
||||
3. `laconic-so deployment start` (creates cluster + pod)
|
||||
4. Immediately scale to 0
|
||||
5. Download snapshot via aria2c
|
||||
6. Scale to 1
|
||||
7. Verify
|
||||
### Design Principles
|
||||
- **When something times out, that doesn't mean it needs a longer timeout; it means something that was expected never happened — not that we need to wait longer for it.**
|
||||
- **NEVER change a timeout because you believe something was truncated. You don't understand timeouts; don't edit them unless explicitly told to by the user.**
|
||||
|
|
|
|||
|
|
@ -0,0 +1,661 @@
|
|||
GNU AFFERO GENERAL PUBLIC LICENSE
|
||||
Version 3, 19 November 2007
|
||||
|
||||
Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
|
||||
Everyone is permitted to copy and distribute verbatim copies
|
||||
of this license document, but changing it is not allowed.
|
||||
|
||||
Preamble
|
||||
|
||||
The GNU Affero General Public License is a free, copyleft license for
|
||||
software and other kinds of works, specifically designed to ensure
|
||||
cooperation with the community in the case of network server software.
|
||||
|
||||
The licenses for most software and other practical works are designed
|
||||
to take away your freedom to share and change the works. By contrast,
|
||||
our General Public Licenses are intended to guarantee your freedom to
|
||||
share and change all versions of a program--to make sure it remains free
|
||||
software for all its users.
|
||||
|
||||
When we speak of free software, we are referring to freedom, not
|
||||
price. Our General Public Licenses are designed to make sure that you
|
||||
have the freedom to distribute copies of free software (and charge for
|
||||
them if you wish), that you receive source code or can get it if you
|
||||
want it, that you can change the software or use pieces of it in new
|
||||
free programs, and that you know you can do these things.
|
||||
|
||||
Developers that use our General Public Licenses protect your rights
|
||||
with two steps: (1) assert copyright on the software, and (2) offer
|
||||
you this License which gives you legal permission to copy, distribute
|
||||
and/or modify the software.
|
||||
|
||||
A secondary benefit of defending all users' freedom is that
|
||||
improvements made in alternate versions of the program, if they
|
||||
receive widespread use, become available for other developers to
|
||||
incorporate. Many developers of free software are heartened and
|
||||
encouraged by the resulting cooperation. However, in the case of
|
||||
software used on network servers, this result may fail to come about.
|
||||
The GNU General Public License permits making a modified version and
|
||||
letting the public access it on a server without ever releasing its
|
||||
source code to the public.
|
||||
|
||||
The GNU Affero General Public License is designed specifically to
|
||||
ensure that, in such cases, the modified source code becomes available
|
||||
to the community. It requires the operator of a network server to
|
||||
provide the source code of the modified version running there to the
|
||||
users of that server. Therefore, public use of a modified version, on
|
||||
a publicly accessible server, gives the public access to the source
|
||||
code of the modified version.
|
||||
|
||||
An older license, called the Affero General Public License and
|
||||
published by Affero, was designed to accomplish similar goals. This is
|
||||
a different license, not a version of the Affero GPL, but Affero has
|
||||
released a new version of the Affero GPL which permits relicensing under
|
||||
this license.
|
||||
|
||||
The precise terms and conditions for copying, distribution and
|
||||
modification follow.
|
||||
|
||||
TERMS AND CONDITIONS
|
||||
|
||||
0. Definitions.
|
||||
|
||||
"This License" refers to version 3 of the GNU Affero General Public License.
|
||||
|
||||
"Copyright" also means copyright-like laws that apply to other kinds of
|
||||
works, such as semiconductor masks.
|
||||
|
||||
"The Program" refers to any copyrightable work licensed under this
|
||||
License. Each licensee is addressed as "you". "Licensees" and
|
||||
"recipients" may be individuals or organizations.
|
||||
|
||||
To "modify" a work means to copy from or adapt all or part of the work
|
||||
in a fashion requiring copyright permission, other than the making of an
|
||||
exact copy. The resulting work is called a "modified version" of the
|
||||
earlier work or a work "based on" the earlier work.
|
||||
|
||||
A "covered work" means either the unmodified Program or a work based
|
||||
on the Program.
|
||||
|
||||
To "propagate" a work means to do anything with it that, without
|
||||
permission, would make you directly or secondarily liable for
|
||||
infringement under applicable copyright law, except executing it on a
|
||||
computer or modifying a private copy. Propagation includes copying,
|
||||
distribution (with or without modification), making available to the
|
||||
public, and in some countries other activities as well.
|
||||
|
||||
To "convey" a work means any kind of propagation that enables other
|
||||
parties to make or receive copies. Mere interaction with a user through
|
||||
a computer network, with no transfer of a copy, is not conveying.
|
||||
|
||||
An interactive user interface displays "Appropriate Legal Notices"
|
||||
to the extent that it includes a convenient and prominently visible
|
||||
feature that (1) displays an appropriate copyright notice, and (2)
|
||||
tells the user that there is no warranty for the work (except to the
|
||||
extent that warranties are provided), that licensees may convey the
|
||||
work under this License, and how to view a copy of this License. If
|
||||
the interface presents a list of user commands or options, such as a
|
||||
menu, a prominent item in the list meets this criterion.
|
||||
|
||||
1. Source Code.
|
||||
|
||||
The "source code" for a work means the preferred form of the work
|
||||
for making modifications to it. "Object code" means any non-source
|
||||
form of a work.
|
||||
|
||||
A "Standard Interface" means an interface that either is an official
|
||||
standard defined by a recognized standards body, or, in the case of
|
||||
interfaces specified for a particular programming language, one that
|
||||
is widely used among developers working in that language.
|
||||
|
||||
The "System Libraries" of an executable work include anything, other
|
||||
than the work as a whole, that (a) is included in the normal form of
|
||||
packaging a Major Component, but which is not part of that Major
|
||||
Component, and (b) serves only to enable use of the work with that
|
||||
Major Component, or to implement a Standard Interface for which an
|
||||
implementation is available to the public in source code form. A
|
||||
"Major Component", in this context, means a major essential component
|
||||
(kernel, window system, and so on) of the specific operating system
|
||||
(if any) on which the executable work runs, or a compiler used to
|
||||
produce the work, or an object code interpreter used to run it.
|
||||
|
||||
The "Corresponding Source" for a work in object code form means all
|
||||
the source code needed to generate, install, and (for an executable
|
||||
work) run the object code and to modify the work, including scripts to
|
||||
control those activities. However, it does not include the work's
|
||||
System Libraries, or general-purpose tools or generally available free
|
||||
programs which are used unmodified in performing those activities but
|
||||
which are not part of the work. For example, Corresponding Source
|
||||
includes interface definition files associated with source files for
|
||||
the work, and the source code for shared libraries and dynamically
|
||||
linked subprograms that the work is specifically designed to require,
|
||||
such as by intimate data communication or control flow between those
|
||||
subprograms and other parts of the work.
|
||||
|
||||
The Corresponding Source need not include anything that users
|
||||
can regenerate automatically from other parts of the Corresponding
|
||||
Source.
|
||||
|
||||
The Corresponding Source for a work in source code form is that
|
||||
same work.
|
||||
|
||||
2. Basic Permissions.
|
||||
|
||||
All rights granted under this License are granted for the term of
|
||||
copyright on the Program, and are irrevocable provided the stated
|
||||
conditions are met. This License explicitly affirms your unlimited
|
||||
permission to run the unmodified Program. The output from running a
|
||||
covered work is covered by this License only if the output, given its
|
||||
content, constitutes a covered work. This License acknowledges your
|
||||
rights of fair use or other equivalent, as provided by copyright law.
|
||||
|
||||
You may make, run and propagate covered works that you do not
|
||||
convey, without conditions so long as your license otherwise remains
|
||||
in force. You may convey covered works to others for the sole purpose
|
||||
of having them make modifications exclusively for you, or provide you
|
||||
with facilities for running those works, provided that you comply with
|
||||
the terms of this License in conveying all material for which you do
|
||||
not control copyright. Those thus making or running the covered works
|
||||
for you must do so exclusively on your behalf, under your direction
|
||||
and control, on terms that prohibit them from making any copies of
|
||||
your copyrighted material outside their relationship with you.
|
||||
|
||||
Conveying under any other circumstances is permitted solely under
|
||||
the conditions stated below. Sublicensing is not allowed; section 10
|
||||
makes it unnecessary.
|
||||
|
||||
3. Protecting Users' Legal Rights From Anti-Circumvention Law.
|
||||
|
||||
No covered work shall be deemed part of an effective technological
|
||||
measure under any applicable law fulfilling obligations under article
|
||||
11 of the WIPO copyright treaty adopted on 20 December 1996, or
|
||||
similar laws prohibiting or restricting circumvention of such
|
||||
measures.
|
||||
|
||||
When you convey a covered work, you waive any legal power to forbid
|
||||
circumvention of technological measures to the extent such circumvention
|
||||
is effected by exercising rights under this License with respect to
|
||||
the covered work, and you disclaim any intention to limit operation or
|
||||
modification of the work as a means of enforcing, against the work's
|
||||
users, your or third parties' legal rights to forbid circumvention of
|
||||
technological measures.
|
||||
|
||||
4. Conveying Verbatim Copies.
|
||||
|
||||
You may convey verbatim copies of the Program's source code as you
|
||||
receive it, in any medium, provided that you conspicuously and
|
||||
appropriately publish on each copy an appropriate copyright notice;
|
||||
keep intact all notices stating that this License and any
|
||||
non-permissive terms added in accord with section 7 apply to the code;
|
||||
keep intact all notices of the absence of any warranty; and give all
|
||||
recipients a copy of this License along with the Program.
|
||||
|
||||
You may charge any price or no price for each copy that you convey,
|
||||
and you may offer support or warranty protection for a fee.
|
||||
|
||||
5. Conveying Modified Source Versions.
|
||||
|
||||
You may convey a work based on the Program, or the modifications to
|
||||
produce it from the Program, in the form of source code under the
|
||||
terms of section 4, provided that you also meet all of these conditions:
|
||||
|
||||
a) The work must carry prominent notices stating that you modified
|
||||
it, and giving a relevant date.
|
||||
|
||||
b) The work must carry prominent notices stating that it is
|
||||
released under this License and any conditions added under section
|
||||
7. This requirement modifies the requirement in section 4 to
|
||||
"keep intact all notices".
|
||||
|
||||
c) You must license the entire work, as a whole, under this
|
||||
License to anyone who comes into possession of a copy. This
|
||||
License will therefore apply, along with any applicable section 7
|
||||
additional terms, to the whole of the work, and all its parts,
|
||||
regardless of how they are packaged. This License gives no
|
||||
permission to license the work in any other way, but it does not
|
||||
invalidate such permission if you have separately received it.
|
||||
|
||||
d) If the work has interactive user interfaces, each must display
|
||||
Appropriate Legal Notices; however, if the Program has interactive
|
||||
interfaces that do not display Appropriate Legal Notices, your
|
||||
work need not make them do so.
|
||||
|
||||
A compilation of a covered work with other separate and independent
|
||||
works, which are not by their nature extensions of the covered work,
|
||||
and which are not combined with it such as to form a larger program,
|
||||
in or on a volume of a storage or distribution medium, is called an
|
||||
"aggregate" if the compilation and its resulting copyright are not
|
||||
used to limit the access or legal rights of the compilation's users
|
||||
beyond what the individual works permit. Inclusion of a covered work
|
||||
in an aggregate does not cause this License to apply to the other
|
||||
parts of the aggregate.
|
||||
|
||||
6. Conveying Non-Source Forms.
|
||||
|
||||
You may convey a covered work in object code form under the terms
|
||||
of sections 4 and 5, provided that you also convey the
|
||||
machine-readable Corresponding Source under the terms of this License,
|
||||
in one of these ways:
|
||||
|
||||
a) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by the
|
||||
Corresponding Source fixed on a durable physical medium
|
||||
customarily used for software interchange.
|
||||
|
||||
b) Convey the object code in, or embodied in, a physical product
|
||||
(including a physical distribution medium), accompanied by a
|
||||
written offer, valid for at least three years and valid for as
|
||||
long as you offer spare parts or customer support for that product
|
||||
model, to give anyone who possesses the object code either (1) a
|
||||
copy of the Corresponding Source for all the software in the
|
||||
product that is covered by this License, on a durable physical
|
||||
medium customarily used for software interchange, for a price no
|
||||
more than your reasonable cost of physically performing this
|
||||
conveying of source, or (2) access to copy the
|
||||
Corresponding Source from a network server at no charge.
|
||||
|
||||
c) Convey individual copies of the object code with a copy of the
|
||||
written offer to provide the Corresponding Source. This
|
||||
alternative is allowed only occasionally and noncommercially, and
|
||||
only if you received the object code with such an offer, in accord
|
||||
with subsection 6b.
|
||||
|
||||
d) Convey the object code by offering access from a designated
|
||||
place (gratis or for a charge), and offer equivalent access to the
|
||||
Corresponding Source in the same way through the same place at no
|
||||
further charge. You need not require recipients to copy the
|
||||
Corresponding Source along with the object code. If the place to
|
||||
copy the object code is a network server, the Corresponding Source
|
||||
may be on a different server (operated by you or a third party)
|
||||
that supports equivalent copying facilities, provided you maintain
|
||||
clear directions next to the object code saying where to find the
|
||||
Corresponding Source. Regardless of what server hosts the
|
||||
Corresponding Source, you remain obligated to ensure that it is
|
||||
available for as long as needed to satisfy these requirements.
|
||||
|
||||
e) Convey the object code using peer-to-peer transmission, provided
|
||||
you inform other peers where the object code and Corresponding
|
||||
Source of the work are being offered to the general public at no
|
||||
charge under subsection 6d.
|
||||
|
||||
A separable portion of the object code, whose source code is excluded
|
||||
from the Corresponding Source as a System Library, need not be
|
||||
included in conveying the object code work.
|
||||
|
||||
A "User Product" is either (1) a "consumer product", which means any
|
||||
tangible personal property which is normally used for personal, family,
|
||||
or household purposes, or (2) anything designed or sold for incorporation
|
||||
into a dwelling. In determining whether a product is a consumer product,
|
||||
doubtful cases shall be resolved in favor of coverage. For a particular
|
||||
product received by a particular user, "normally used" refers to a
|
||||
typical or common use of that class of product, regardless of the status
|
||||
of the particular user or of the way in which the particular user
|
||||
actually uses, or expects or is expected to use, the product. A product
|
||||
is a consumer product regardless of whether the product has substantial
|
||||
commercial, industrial or non-consumer uses, unless such uses represent
|
||||
the only significant mode of use of the product.
|
||||
|
||||
"Installation Information" for a User Product means any methods,
|
||||
procedures, authorization keys, or other information required to install
|
||||
and execute modified versions of a covered work in that User Product from
|
||||
a modified version of its Corresponding Source. The information must
|
||||
suffice to ensure that the continued functioning of the modified object
|
||||
code is in no case prevented or interfered with solely because
|
||||
modification has been made.
|
||||
|
||||
If you convey an object code work under this section in, or with, or
|
||||
specifically for use in, a User Product, and the conveying occurs as
|
||||
part of a transaction in which the right of possession and use of the
|
||||
User Product is transferred to the recipient in perpetuity or for a
|
||||
fixed term (regardless of how the transaction is characterized), the
|
||||
Corresponding Source conveyed under this section must be accompanied
|
||||
by the Installation Information. But this requirement does not apply
|
||||
if neither you nor any third party retains the ability to install
|
||||
modified object code on the User Product (for example, the work has
|
||||
been installed in ROM).
|
||||
|
||||
The requirement to provide Installation Information does not include a
|
||||
requirement to continue to provide support service, warranty, or updates
|
||||
for a work that has been modified or installed by the recipient, or for
|
||||
the User Product in which it has been modified or installed. Access to a
|
||||
network may be denied when the modification itself materially and
|
||||
adversely affects the operation of the network or violates the rules and
|
||||
protocols for communication across the network.
|
||||
|
||||
Corresponding Source conveyed, and Installation Information provided,
|
||||
in accord with this section must be in a format that is publicly
|
||||
documented (and with an implementation available to the public in
|
||||
source code form), and must require no special password or key for
|
||||
unpacking, reading or copying.
|
||||
|
||||
7. Additional Terms.
|
||||
|
||||
"Additional permissions" are terms that supplement the terms of this
|
||||
License by making exceptions from one or more of its conditions.
|
||||
Additional permissions that are applicable to the entire Program shall
|
||||
be treated as though they were included in this License, to the extent
|
||||
that they are valid under applicable law. If additional permissions
|
||||
apply only to part of the Program, that part may be used separately
|
||||
under those permissions, but the entire Program remains governed by
|
||||
this License without regard to the additional permissions.
|
||||
|
||||
When you convey a copy of a covered work, you may at your option
|
||||
remove any additional permissions from that copy, or from any part of
|
||||
it. (Additional permissions may be written to require their own
|
||||
removal in certain cases when you modify the work.) You may place
|
||||
additional permissions on material, added by you to a covered work,
|
||||
for which you have or can give appropriate copyright permission.
|
||||
|
||||
Notwithstanding any other provision of this License, for material you
|
||||
add to a covered work, you may (if authorized by the copyright holders of
|
||||
that material) supplement the terms of this License with terms:
|
||||
|
||||
a) Disclaiming warranty or limiting liability differently from the
|
||||
terms of sections 15 and 16 of this License; or
|
||||
|
||||
b) Requiring preservation of specified reasonable legal notices or
|
||||
author attributions in that material or in the Appropriate Legal
|
||||
Notices displayed by works containing it; or
|
||||
|
||||
c) Prohibiting misrepresentation of the origin of that material, or
|
||||
requiring that modified versions of such material be marked in
|
||||
reasonable ways as different from the original version; or
|
||||
|
||||
d) Limiting the use for publicity purposes of names of licensors or
|
||||
authors of the material; or
|
||||
|
||||
e) Declining to grant rights under trademark law for use of some
|
||||
trade names, trademarks, or service marks; or
|
||||
|
||||
f) Requiring indemnification of licensors and authors of that
|
||||
material by anyone who conveys the material (or modified versions of
|
||||
it) with contractual assumptions of liability to the recipient, for
|
||||
any liability that these contractual assumptions directly impose on
|
||||
those licensors and authors.
|
||||
|
||||
All other non-permissive additional terms are considered "further
|
||||
restrictions" within the meaning of section 10. If the Program as you
|
||||
received it, or any part of it, contains a notice stating that it is
|
||||
governed by this License along with a term that is a further
|
||||
restriction, you may remove that term. If a license document contains
|
||||
a further restriction but permits relicensing or conveying under this
|
||||
License, you may add to a covered work material governed by the terms
|
||||
of that license document, provided that the further restriction does
|
||||
not survive such relicensing or conveying.
|
||||
|
||||
If you add terms to a covered work in accord with this section, you
|
||||
must place, in the relevant source files, a statement of the
|
||||
additional terms that apply to those files, or a notice indicating
|
||||
where to find the applicable terms.
|
||||
|
||||
Additional terms, permissive or non-permissive, may be stated in the
|
||||
form of a separately written license, or stated as exceptions;
|
||||
the above requirements apply either way.
|
||||
|
||||
8. Termination.
|
||||
|
||||
You may not propagate or modify a covered work except as expressly
|
||||
provided under this License. Any attempt otherwise to propagate or
|
||||
modify it is void, and will automatically terminate your rights under
|
||||
this License (including any patent licenses granted under the third
|
||||
paragraph of section 11).
|
||||
|
||||
However, if you cease all violation of this License, then your
|
||||
license from a particular copyright holder is reinstated (a)
|
||||
provisionally, unless and until the copyright holder explicitly and
|
||||
finally terminates your license, and (b) permanently, if the copyright
|
||||
holder fails to notify you of the violation by some reasonable means
|
||||
prior to 60 days after the cessation.
|
||||
|
||||
Moreover, your license from a particular copyright holder is
|
||||
reinstated permanently if the copyright holder notifies you of the
|
||||
violation by some reasonable means, this is the first time you have
|
||||
received notice of violation of this License (for any work) from that
|
||||
copyright holder, and you cure the violation prior to 30 days after
|
||||
your receipt of the notice.
|
||||
|
||||
Termination of your rights under this section does not terminate the
|
||||
licenses of parties who have received copies or rights from you under
|
||||
this License. If your rights have been terminated and not permanently
|
||||
reinstated, you do not qualify to receive new licenses for the same
|
||||
material under section 10.
|
||||
|
||||
9. Acceptance Not Required for Having Copies.
|
||||
|
||||
You are not required to accept this License in order to receive or
|
||||
run a copy of the Program. Ancillary propagation of a covered work
|
||||
occurring solely as a consequence of using peer-to-peer transmission
|
||||
to receive a copy likewise does not require acceptance. However,
|
||||
nothing other than this License grants you permission to propagate or
|
||||
modify any covered work. These actions infringe copyright if you do
|
||||
not accept this License. Therefore, by modifying or propagating a
|
||||
covered work, you indicate your acceptance of this License to do so.
|
||||
|
||||
10. Automatic Licensing of Downstream Recipients.
|
||||
|
||||
Each time you convey a covered work, the recipient automatically
|
||||
receives a license from the original licensors, to run, modify and
|
||||
propagate that work, subject to this License. You are not responsible
|
||||
for enforcing compliance by third parties with this License.
|
||||
|
||||
An "entity transaction" is a transaction transferring control of an
|
||||
organization, or substantially all assets of one, or subdividing an
|
||||
organization, or merging organizations. If propagation of a covered
|
||||
work results from an entity transaction, each party to that
|
||||
transaction who receives a copy of the work also receives whatever
|
||||
licenses to the work the party's predecessor in interest had or could
|
||||
give under the previous paragraph, plus a right to possession of the
|
||||
Corresponding Source of the work from the predecessor in interest, if
|
||||
the predecessor has it or can get it with reasonable efforts.
|
||||
|
||||
You may not impose any further restrictions on the exercise of the
|
||||
rights granted or affirmed under this License. For example, you may
|
||||
not impose a license fee, royalty, or other charge for exercise of
|
||||
rights granted under this License, and you may not initiate litigation
|
||||
(including a cross-claim or counterclaim in a lawsuit) alleging that
|
||||
any patent claim is infringed by making, using, selling, offering for
|
||||
sale, or importing the Program or any portion of it.
|
||||
|
||||
11. Patents.
|
||||
|
||||
A "contributor" is a copyright holder who authorizes use under this
|
||||
License of the Program or a work on which the Program is based. The
|
||||
work thus licensed is called the contributor's "contributor version".
|
||||
|
||||
A contributor's "essential patent claims" are all patent claims
|
||||
owned or controlled by the contributor, whether already acquired or
|
||||
hereafter acquired, that would be infringed by some manner, permitted
|
||||
by this License, of making, using, or selling its contributor version,
|
||||
but do not include claims that would be infringed only as a
|
||||
consequence of further modification of the contributor version. For
|
||||
purposes of this definition, "control" includes the right to grant
|
||||
patent sublicenses in a manner consistent with the requirements of
|
||||
this License.
|
||||
|
||||
Each contributor grants you a non-exclusive, worldwide, royalty-free
|
||||
patent license under the contributor's essential patent claims, to
|
||||
make, use, sell, offer for sale, import and otherwise run, modify and
|
||||
propagate the contents of its contributor version.
|
||||
|
||||
In the following three paragraphs, a "patent license" is any express
|
||||
agreement or commitment, however denominated, not to enforce a patent
|
||||
(such as an express permission to practice a patent or covenant not to
|
||||
sue for patent infringement). To "grant" such a patent license to a
|
||||
party means to make such an agreement or commitment not to enforce a
|
||||
patent against the party.
|
||||
|
||||
If you convey a covered work, knowingly relying on a patent license,
|
||||
and the Corresponding Source of the work is not available for anyone
|
||||
to copy, free of charge and under the terms of this License, through a
|
||||
publicly available network server or other readily accessible means,
|
||||
then you must either (1) cause the Corresponding Source to be so
|
||||
available, or (2) arrange to deprive yourself of the benefit of the
|
||||
patent license for this particular work, or (3) arrange, in a manner
|
||||
consistent with the requirements of this License, to extend the patent
|
||||
license to downstream recipients. "Knowingly relying" means you have
|
||||
actual knowledge that, but for the patent license, your conveying the
|
||||
covered work in a country, or your recipient's use of the covered work
|
||||
in a country, would infringe one or more identifiable patents in that
|
||||
country that you have reason to believe are valid.
|
||||
|
||||
If, pursuant to or in connection with a single transaction or
|
||||
arrangement, you convey, or propagate by procuring conveyance of, a
|
||||
covered work, and grant a patent license to some of the parties
|
||||
receiving the covered work authorizing them to use, propagate, modify
|
||||
or convey a specific copy of the covered work, then the patent license
|
||||
you grant is automatically extended to all recipients of the covered
|
||||
work and works based on it.
|
||||
|
||||
A patent license is "discriminatory" if it does not include within
|
||||
the scope of its coverage, prohibits the exercise of, or is
|
||||
conditioned on the non-exercise of one or more of the rights that are
|
||||
specifically granted under this License. You may not convey a covered
|
||||
work if you are a party to an arrangement with a third party that is
|
||||
in the business of distributing software, under which you make payment
|
||||
to the third party based on the extent of your activity of conveying
|
||||
the work, and under which the third party grants, to any of the
|
||||
parties who would receive the covered work from you, a discriminatory
|
||||
patent license (a) in connection with copies of the covered work
|
||||
conveyed by you (or copies made from those copies), or (b) primarily
|
||||
for and in connection with specific products or compilations that
|
||||
contain the covered work, unless you entered into that arrangement,
|
||||
or that patent license was granted, prior to 28 March 2007.
|
||||
|
||||
Nothing in this License shall be construed as excluding or limiting
|
||||
any implied license or other defenses to infringement that may
|
||||
otherwise be available to you under applicable patent law.
|
||||
|
||||
12. No Surrender of Others' Freedom.
|
||||
|
||||
If conditions are imposed on you (whether by court order, agreement or
|
||||
otherwise) that contradict the conditions of this License, they do not
|
||||
excuse you from the conditions of this License. If you cannot convey a
|
||||
covered work so as to satisfy simultaneously your obligations under this
|
||||
License and any other pertinent obligations, then as a consequence you may
|
||||
not convey it at all. For example, if you agree to terms that obligate you
|
||||
to collect a royalty for further conveying from those to whom you convey
|
||||
the Program, the only way you could satisfy both those terms and this
|
||||
License would be to refrain entirely from conveying the Program.
|
||||
|
||||
13. Remote Network Interaction; Use with the GNU General Public License.
|
||||
|
||||
Notwithstanding any other provision of this License, if you modify the
|
||||
Program, your modified version must prominently offer all users
|
||||
interacting with it remotely through a computer network (if your version
|
||||
supports such interaction) an opportunity to receive the Corresponding
|
||||
Source of your version by providing access to the Corresponding Source
|
||||
from a network server at no charge, through some standard or customary
|
||||
means of facilitating copying of software. This Corresponding Source
|
||||
shall include the Corresponding Source for any work covered by version 3
|
||||
of the GNU General Public License that is incorporated pursuant to the
|
||||
following paragraph.
|
||||
|
||||
Notwithstanding any other provision of this License, you have
|
||||
permission to link or combine any covered work with a work licensed
|
||||
under version 3 of the GNU General Public License into a single
|
||||
combined work, and to convey the resulting work. The terms of this
|
||||
License will continue to apply to the part which is the covered work,
|
||||
but the work with which it is combined will remain governed by version
|
||||
3 of the GNU General Public License.
|
||||
|
||||
14. Revised Versions of this License.
|
||||
|
||||
The Free Software Foundation may publish revised and/or new versions of
|
||||
the GNU Affero General Public License from time to time. Such new versions
|
||||
will be similar in spirit to the present version, but may differ in detail to
|
||||
address new problems or concerns.
|
||||
|
||||
Each version is given a distinguishing version number. If the
|
||||
Program specifies that a certain numbered version of the GNU Affero General
|
||||
Public License "or any later version" applies to it, you have the
|
||||
option of following the terms and conditions either of that numbered
|
||||
version or of any later version published by the Free Software
|
||||
Foundation. If the Program does not specify a version number of the
|
||||
GNU Affero General Public License, you may choose any version ever published
|
||||
by the Free Software Foundation.
|
||||
|
||||
If the Program specifies that a proxy can decide which future
|
||||
versions of the GNU Affero General Public License can be used, that proxy's
|
||||
public statement of acceptance of a version permanently authorizes you
|
||||
to choose that version for the Program.
|
||||
|
||||
Later license versions may give you additional or different
|
||||
permissions. However, no additional obligations are imposed on any
|
||||
author or copyright holder as a result of your choosing to follow a
|
||||
later version.
|
||||
|
||||
15. Disclaimer of Warranty.
|
||||
|
||||
THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
|
||||
APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
|
||||
HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
|
||||
OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
|
||||
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
|
||||
IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
|
||||
ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
|
||||
|
||||
16. Limitation of Liability.
|
||||
|
||||
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
|
||||
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
|
||||
THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
|
||||
GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
|
||||
USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
|
||||
DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
|
||||
PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
|
||||
EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
|
||||
SUCH DAMAGES.
|
||||
|
||||
17. Interpretation of Sections 15 and 16.
|
||||
|
||||
If the disclaimer of warranty and limitation of liability provided
|
||||
above cannot be given local legal effect according to their terms,
|
||||
reviewing courts shall apply local law that most closely approximates
|
||||
an absolute waiver of all civil liability in connection with the
|
||||
Program, unless a warranty or assumption of liability accompanies a
|
||||
copy of the Program in return for a fee.
|
||||
|
||||
END OF TERMS AND CONDITIONS
|
||||
|
||||
How to Apply These Terms to Your New Programs
|
||||
|
||||
If you develop a new program, and you want it to be of the greatest
|
||||
possible use to the public, the best way to achieve this is to make it
|
||||
free software which everyone can redistribute and change under these terms.
|
||||
|
||||
To do so, attach the following notices to the program. It is safest
|
||||
to attach them to the start of each source file to most effectively
|
||||
state the exclusion of warranty; and each file should have at least
|
||||
the "copyright" line and a pointer to where the full notice is found.
|
||||
|
||||
<one line to give the program's name and a brief idea of what it does.>
|
||||
Copyright (C) <year> <name of author>
|
||||
|
||||
This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Affero General Public License as published by
|
||||
the Free Software Foundation, either version 3 of the License, or
|
||||
(at your option) any later version.
|
||||
|
||||
This program is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
GNU Affero General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Affero General Public License
|
||||
along with this program. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
Also add information on how to contact you by electronic and paper mail.
|
||||
|
||||
If your software can interact with users remotely through a computer
|
||||
network, you should also make sure that it provides a way for users to
|
||||
get its source. For example, if your program is a web application, its
|
||||
interface could display a "Source" link that leads users to an archive
|
||||
of the code. There are many ways you could offer source, and different
|
||||
solutions will be better for different programs; see section 13 for the
|
||||
specific requirements.
|
||||
|
||||
You should also get your employer (if you work as a programmer) or school,
|
||||
if any, to sign a "copyright disclaimer" for the program, if necessary.
|
||||
For more information on this, and how to apply and follow the GNU AGPL, see
|
||||
<http://www.gnu.org/licenses/>.
|
||||
|
|
@ -0,0 +1 @@
|
|||
include LICENSE
|
||||
134
README.md
134
README.md
|
|
@ -1,3 +1,133 @@
|
|||
# biscayne-agave-runbook
|
||||
# Stack Orchestrator
|
||||
|
||||
Ansible playbooks for operating the kind-based agave-stack deployment on biscayne.vaasl.io.
|
||||
Stack Orchestrator allows building and deployment of a Laconic Stack on a single machine with minimal prerequisites. It is a Python3 CLI tool that runs on any OS with Python3 and Docker. The following diagram summarizes the relevant repositories in the Laconic Stack - and the relationship to Stack Orchestrator.
|
||||
|
||||

|
||||
|
||||
## Install
|
||||
|
||||
**To get started quickly** on a fresh Ubuntu instance (e.g., Digital Ocean), [try this script](./scripts/quick-install-linux.sh). **WARNING:** always review scripts prior to running them so that you know what is happening on your machine.
|
||||
|
||||
For any other installation, follow along below and **adapt these instructions based on the specifics of your system.**
|
||||
|
||||
|
||||
Ensure that the following are already installed:
|
||||
|
||||
- [Python3](https://wiki.python.org/moin/BeginnersGuide/Download): `python3 --version` >= `3.8.10` (the Python3 shipped in Ubuntu 20+ is good to go)
|
||||
- [Docker](https://docs.docker.com/get-docker/): `docker --version` >= `20.10.21`
|
||||
- [jq](https://stedolan.github.io/jq/download/): `jq --version` >= `1.5`
|
||||
- [git](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git): `git --version` >= `2.10.3`
|
||||
|
||||
Note: if installing docker-compose via package manager on Linux (as opposed to Docker Desktop), you must [install the plugin](https://docs.docker.com/compose/install/linux/#install-the-plugin-manually), e.g. :
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.docker/cli-plugins
|
||||
curl -SL https://github.com/docker/compose/releases/download/v2.11.2/docker-compose-linux-x86_64 -o ~/.docker/cli-plugins/docker-compose
|
||||
chmod +x ~/.docker/cli-plugins/docker-compose
|
||||
```
|
||||
|
||||
Next decide on a directory where you would like to put the stack-orchestrator program. Typically this would be
|
||||
a "user" binary directory such as `~/bin` or perhaps `/usr/local/laconic` or possibly just the current working directory.
|
||||
|
||||
Now, having selected that directory, download the latest release from [this page](https://git.vdb.to/cerc-io/stack-orchestrator/tags) into it (we're using `~/bin` below for concreteness but edit to suit if you selected a different directory). Also be sure that the destination directory exists and is writable:
|
||||
|
||||
```bash
|
||||
curl -L -o ~/bin/laconic-so https://git.vdb.to/cerc-io/stack-orchestrator/releases/download/latest/laconic-so
|
||||
```
|
||||
|
||||
Give it execute permissions:
|
||||
|
||||
```bash
|
||||
chmod +x ~/bin/laconic-so
|
||||
```
|
||||
|
||||
Ensure `laconic-so` is on the [`PATH`](https://unix.stackexchange.com/a/26059)
|
||||
|
||||
Verify operation (your version will probably be different, just check here that you see some version output and not an error):
|
||||
|
||||
```
|
||||
laconic-so version
|
||||
Version: 1.1.0-7a607c2-202304260513
|
||||
```
|
||||
Save the distribution URL to `~/.laconic-so/config.yml`:
|
||||
```bash
|
||||
mkdir ~/.laconic-so
|
||||
echo "distribution-url: https://git.vdb.to/cerc-io/stack-orchestrator/releases/download/latest/laconic-so" > ~/.laconic-so/config.yml
|
||||
```
|
||||
|
||||
### Update
|
||||
If Stack Orchestrator was installed using the process described above, it is able to subsequently self-update to the current latest version by running:
|
||||
|
||||
```bash
|
||||
laconic-so update
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
The various [stacks](/stack_orchestrator/data/stacks) each contain instructions for running different stacks based on your use case. For example:
|
||||
|
||||
- [self-hosted Gitea](/stack_orchestrator/data/stacks/build-support)
|
||||
- [an Optimism Fixturenet](/stack_orchestrator/data/stacks/fixturenet-optimism)
|
||||
- [laconicd with console and CLI](stack_orchestrator/data/stacks/fixturenet-laconic-loaded)
|
||||
- [kubo (IPFS)](stack_orchestrator/data/stacks/kubo)
|
||||
|
||||
## Deployment Types
|
||||
|
||||
- **compose**: Docker Compose on local machine
|
||||
- **k8s**: External Kubernetes cluster (requires kubeconfig)
|
||||
- **k8s-kind**: Local Kubernetes via Kind - one cluster per host, shared by all deployments
|
||||
|
||||
## External Stacks
|
||||
|
||||
Stacks can live in external git repositories. Required structure:
|
||||
|
||||
```
|
||||
<repo>/
|
||||
stack_orchestrator/data/
|
||||
stacks/<stack-name>/stack.yml
|
||||
compose/docker-compose-<pod-name>.yml
|
||||
deployment/spec.yml
|
||||
```
|
||||
|
||||
## Deployment Commands
|
||||
|
||||
```bash
|
||||
# Create deployment from spec
|
||||
laconic-so --stack <path> deploy create --spec-file <spec.yml> --deployment-dir <dir>
|
||||
|
||||
# Start (creates cluster on first run)
|
||||
laconic-so deployment --dir <dir> start
|
||||
|
||||
# GitOps restart (git pull + redeploy, preserves data)
|
||||
laconic-so deployment --dir <dir> restart
|
||||
|
||||
# Stop
|
||||
laconic-so deployment --dir <dir> stop
|
||||
```
|
||||
|
||||
## spec.yml Reference
|
||||
|
||||
```yaml
|
||||
stack: stack-name-or-path
|
||||
deploy-to: k8s-kind
|
||||
network:
|
||||
http-proxy:
|
||||
- host-name: app.example.com
|
||||
routes:
|
||||
- path: /
|
||||
proxy-to: service-name:port
|
||||
acme-email: admin@example.com
|
||||
config:
|
||||
ENV_VAR: value
|
||||
SECRET_VAR: $generate:hex:32$ # Auto-generated, stored in K8s Secret
|
||||
volumes:
|
||||
volume-name:
|
||||
```
|
||||
|
||||
## Contributing
|
||||
|
||||
See the [CONTRIBUTING.md](/docs/CONTRIBUTING.md) for developer mode install.
|
||||
|
||||
## Platform Support
|
||||
|
||||
Native aarch64 (ARM64) is _not_ currently supported. x64 emulation on ARM64 macOS should work (not yet tested).
|
||||
|
|
|
|||
|
|
@ -0,0 +1,413 @@
|
|||
# Implementing `laconic-so create-stack` Command
|
||||
|
||||
A plan for adding a new CLI command to scaffold stack files automatically.
|
||||
|
||||
---
|
||||
|
||||
## Overview
|
||||
|
||||
Add a `create-stack` command that generates all required files for a new stack:
|
||||
|
||||
```bash
|
||||
laconic-so create-stack --name my-stack --type webapp
|
||||
```
|
||||
|
||||
**Output:**
|
||||
```
|
||||
stack_orchestrator/data/
|
||||
├── stacks/my-stack/stack.yml
|
||||
├── container-build/cerc-my-stack/
|
||||
│ ├── Dockerfile
|
||||
│ └── build.sh
|
||||
└── compose/docker-compose-my-stack.yml
|
||||
|
||||
Updated: repository-list.txt, container-image-list.txt, pod-list.txt
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## CLI Architecture Summary
|
||||
|
||||
### Command Registration Pattern
|
||||
|
||||
Commands are Click functions registered in `main.py`:
|
||||
|
||||
```python
|
||||
# main.py (line ~70)
|
||||
from stack_orchestrator.create import create_stack
|
||||
cli.add_command(create_stack.command, "create-stack")
|
||||
```
|
||||
|
||||
### Global Options Access
|
||||
|
||||
```python
|
||||
from stack_orchestrator.opts import opts
|
||||
|
||||
if not opts.o.quiet:
|
||||
print("message")
|
||||
if opts.o.dry_run:
|
||||
print("(would create files)")
|
||||
```
|
||||
|
||||
### Key Utilities
|
||||
|
||||
| Function | Location | Purpose |
|
||||
|----------|----------|---------|
|
||||
| `get_yaml()` | `util.py` | YAML parser (ruamel.yaml) |
|
||||
| `get_stack_path(stack)` | `util.py` | Resolve stack directory path |
|
||||
| `error_exit(msg)` | `util.py` | Print error and exit(1) |
|
||||
|
||||
---
|
||||
|
||||
## Files to Create
|
||||
|
||||
### 1. Command Module
|
||||
|
||||
**`stack_orchestrator/create/__init__.py`**
|
||||
```python
|
||||
# Empty file to make this a package
|
||||
```
|
||||
|
||||
**`stack_orchestrator/create/create_stack.py`**
|
||||
```python
|
||||
import click
|
||||
import os
|
||||
from pathlib import Path
|
||||
from shutil import copy
|
||||
from stack_orchestrator.opts import opts
|
||||
from stack_orchestrator.util import error_exit, get_yaml
|
||||
|
||||
# Template types
|
||||
STACK_TEMPLATES = {
|
||||
"webapp": {
|
||||
"description": "Web application with Node.js",
|
||||
"base_image": "node:20-bullseye-slim",
|
||||
"port": 3000,
|
||||
},
|
||||
"service": {
|
||||
"description": "Backend service",
|
||||
"base_image": "python:3.11-slim",
|
||||
"port": 8080,
|
||||
},
|
||||
"empty": {
|
||||
"description": "Minimal stack with no defaults",
|
||||
"base_image": None,
|
||||
"port": None,
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def get_data_dir() -> Path:
|
||||
"""Get path to stack_orchestrator/data directory"""
|
||||
return Path(__file__).absolute().parent.parent.joinpath("data")
|
||||
|
||||
|
||||
def validate_stack_name(name: str) -> None:
|
||||
"""Validate stack name follows conventions"""
|
||||
import re
|
||||
if not re.match(r'^[a-z0-9][a-z0-9-]*[a-z0-9]$', name) and len(name) > 2:
|
||||
error_exit(f"Invalid stack name '{name}'. Use lowercase alphanumeric with hyphens.")
|
||||
if name.startswith("cerc-"):
|
||||
error_exit("Stack name should not start with 'cerc-' (container names will add this prefix)")
|
||||
|
||||
|
||||
def create_stack_yml(stack_dir: Path, name: str, template: dict, repo_url: str) -> None:
    """Write ``stack.yml`` for a new stack into ``stack_dir``.

    Args:
        stack_dir: Directory that receives ``stack.yml``; created if missing.
        name: Stack name, also used for the single container and pod entry.
        template: Template settings; only ``description`` is read here.
        repo_url: Repository to list under ``repos``; a falsy value yields an
            empty list.
    """
    repos = []
    if repo_url:
        repos.append(repo_url)

    # Keys are inserted in the order they should appear in the file.
    config = {}
    config["version"] = "1.2"
    config["name"] = name
    config["description"] = template.get("description", f"Stack: {name}")
    config["repos"] = repos
    config["containers"] = [f"cerc/{name}"]
    config["pods"] = [name]

    stack_dir.mkdir(parents=True, exist_ok=True)
    stack_file = stack_dir / "stack.yml"
    with open(stack_file, "w") as out:
        get_yaml().dump(config, out)
|
||||
|
||||
|
||||
def create_dockerfile(container_dir: Path, name: str, template: dict) -> None:
    """Write a two-stage Dockerfile into ``container_dir``.

    Args:
        container_dir: Directory that receives ``Dockerfile``; created if
            missing.
        name: Stack name (not used in the generated file).
        template: Template settings; ``base_image`` and ``port`` are read.
            When either is missing or ``None`` the Node.js defaults
            (``node:20-bullseye-slim``, port ``3000``) are used.
    """
    # `dict.get(key, default)` only falls back when the key is absent. The
    # "empty" template stores explicit None values, which previously rendered
    # as "FROM None" / "EXPOSE None"; `or` covers both cases.
    base_image = template.get("base_image") or "node:20-bullseye-slim"
    port = template.get("port") or 3000

    dockerfile_content = f'''# Build stage
FROM {base_image} AS builder

WORKDIR /app
COPY package*.json ./
RUN npm ci
COPY . .
RUN npm run build

# Production stage
FROM {base_image}

WORKDIR /app
COPY package*.json ./
RUN npm ci --only=production
COPY --from=builder /app/dist ./dist

EXPOSE {port}
CMD ["npm", "run", "start"]
'''

    container_dir.mkdir(parents=True, exist_ok=True)
    with open(container_dir / "Dockerfile", "w") as f:
        f.write(dockerfile_content)
|
||||
|
||||
|
||||
def create_build_script(container_dir: Path, name: str) -> None:
    """Write an executable ``build.sh`` into ``container_dir``.

    Args:
        container_dir: Directory that receives ``build.sh``; created if
            missing so this function does not depend on another creator
            having run first.
        name: Stack name, used for the image tag ``cerc/<name>:local`` and the
            build context ``${CERC_REPO_BASE_DIR}/<name>``.
    """
    # Doubled braces are f-string escapes: they render as literal shell
    # ${VAR} expansions in the generated script.
    build_script = f'''#!/usr/bin/env bash
# Build cerc/{name}

source ${{CERC_CONTAINER_BASE_DIR}}/build-base.sh

SCRIPT_DIR=$( cd -- "$( dirname -- "${{BASH_SOURCE[0]}}" )" &> /dev/null && pwd )

docker build -t cerc/{name}:local \\
-f ${{SCRIPT_DIR}}/Dockerfile \\
${{build_command_args}} \\
${{CERC_REPO_BASE_DIR}}/{name}
'''

    container_dir.mkdir(parents=True, exist_ok=True)
    build_path = container_dir / "build.sh"
    with open(build_path, "w") as f:
        f.write(build_script)

    # Make executable
    os.chmod(build_path, 0o755)
|
||||
|
||||
|
||||
def create_compose_file(compose_dir: Path, name: str, template: dict) -> None:
    """Write ``docker-compose-<name>.yml`` into ``compose_dir``.

    Args:
        compose_dir: Directory that receives the compose file; created if
            missing.
        name: Stack name, used as the service name, in the image tag
            ``cerc/<name>:local`` and in the file name.
        template: Template settings; only ``port`` is read. A missing or
            ``None`` port falls back to ``3000``.
    """
    # `dict.get(key, default)` does not apply the default when the key is
    # present with value None (as in the "empty" template), which previously
    # produced the mapping "${HOST_PORT:-None}:None".
    port = template.get("port") or 3000

    compose_content = {
        "version": "3.8",
        "services": {
            name: {
                "image": f"cerc/{name}:local",
                "restart": "unless-stopped",
                # Renders as "${HOST_PORT:-<port>}:<port>".
                "ports": [f"${{HOST_PORT:-{port}}}:{port}"],
                "environment": {
                    "NODE_ENV": "${NODE_ENV:-production}",
                },
            }
        }
    }

    compose_dir.mkdir(parents=True, exist_ok=True)
    with open(compose_dir / f"docker-compose-{name}.yml", "w") as f:
        get_yaml().dump(compose_content, f)
|
||||
|
||||
|
||||
def update_list_file(data_dir: Path, filename: str, entry: str) -> None:
    """Append ``entry`` to a line-oriented list file unless already present.

    Args:
        data_dir: Directory containing the list file.
        filename: Name of the list file inside ``data_dir``; created if it
            does not exist.
        entry: Line to add. Existing lines are compared after stripping
            surrounding whitespace; blank lines are ignored.
    """
    list_path = data_dir / filename

    # Read existing content (empty when the file does not exist yet).
    current = ""
    if list_path.exists():
        with open(list_path, "r") as f:
            current = f.read()

    existing = {line.strip() for line in current.split("\n") if line.strip()}
    if entry in existing:
        return

    # If the file does not end with a newline, start a fresh line first so the
    # new entry is not glued onto the last existing one.
    separator = "\n" if current and not current.endswith("\n") else ""
    with open(list_path, "a") as f:
        f.write(f"{separator}{entry}\n")
|
||||
|
||||
|
||||
@click.command()
@click.option("--name", required=True, help="Name of the new stack (lowercase, hyphens)")
@click.option("--type", "stack_type", default="webapp",
              type=click.Choice(list(STACK_TEMPLATES.keys())),
              help="Stack template type")
@click.option("--repo", help="Git repository URL (e.g., github.com/org/repo)")
@click.option("--force", is_flag=True, help="Overwrite existing files")
@click.pass_context
def command(ctx, name: str, stack_type: str, repo: str, force: bool):
    """Create a new stack with all required files.

    Examples:

        laconic-so create-stack --name my-app --type webapp

        laconic-so create-stack --name my-service --type service --repo github.com/org/repo
    """
    # NOTE: the docstring above is the CLI help text; keep it user-facing.

    # Validate before touching the filesystem.
    validate_stack_name(name)

    template = STACK_TEMPLATES[stack_type]
    data_dir = get_data_dir()

    # Define paths
    stack_dir = data_dir / "stacks" / name
    container_dir = data_dir / "container-build" / f"cerc-{name}"
    compose_dir = data_dir / "compose"
    compose_file = compose_dir / f"docker-compose-{name}.yml"

    # Check for existing files. The compose file is checked too: it lives in
    # a shared directory and would otherwise be overwritten without --force.
    if not force:
        if stack_dir.exists():
            error_exit(f"Stack already exists: {stack_dir}\nUse --force to overwrite")
        if container_dir.exists():
            error_exit(f"Container build dir exists: {container_dir}\nUse --force to overwrite")
        if compose_file.exists():
            error_exit(f"Compose file exists: {compose_file}\nUse --force to overwrite")

    # Dry run: report what would be written, then stop.
    if opts.o.dry_run:
        print(f"Would create stack '{name}' with template '{stack_type}':")
        print(f" - {stack_dir}/stack.yml")
        print(f" - {container_dir}/Dockerfile")
        print(f" - {container_dir}/build.sh")
        print(f" - {compose_dir}/docker-compose-{name}.yml")
        # repository-list.txt is only touched when --repo was given, so only
        # announce it in that case.
        if repo:
            print(" - Update repository-list.txt")
        print(" - Update container-image-list.txt")
        print(" - Update pod-list.txt")
        return

    # Create files
    if not opts.o.quiet:
        print(f"Creating stack '{name}' with template '{stack_type}'...")

    create_stack_yml(stack_dir, name, template, repo)
    if opts.o.verbose:
        print(f" Created {stack_dir}/stack.yml")

    create_dockerfile(container_dir, name, template)
    if opts.o.verbose:
        print(f" Created {container_dir}/Dockerfile")

    create_build_script(container_dir, name)
    if opts.o.verbose:
        print(f" Created {container_dir}/build.sh")

    create_compose_file(compose_dir, name, template)
    if opts.o.verbose:
        print(f" Created {compose_dir}/docker-compose-{name}.yml")

    # Update list files
    if repo:
        update_list_file(data_dir, "repository-list.txt", repo)
        if opts.o.verbose:
            print(f" Added {repo} to repository-list.txt")

    update_list_file(data_dir, "container-image-list.txt", f"cerc/{name}")
    if opts.o.verbose:
        print(f" Added cerc/{name} to container-image-list.txt")

    update_list_file(data_dir, "pod-list.txt", name)
    if opts.o.verbose:
        print(f" Added {name} to pod-list.txt")

    # Summary
    if not opts.o.quiet:
        print(f"\nStack '{name}' created successfully!")
        print("\nNext steps:")
        print(f" 1. Edit {stack_dir}/stack.yml")
        print(f" 2. Customize {container_dir}/Dockerfile")
        print(f" 3. Run: laconic-so --stack {name} build-containers")
        print(f" 4. Run: laconic-so --stack {name} deploy-system up")
|
||||
```
|
||||
|
||||
### 2. Register Command in main.py
|
||||
|
||||
**Edit `stack_orchestrator/main.py`**
|
||||
|
||||
Add import:
|
||||
```python
|
||||
from stack_orchestrator.create import create_stack
|
||||
```
|
||||
|
||||
Add command registration (after line ~78):
|
||||
```python
|
||||
cli.add_command(create_stack.command, "create-stack")
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Steps
|
||||
|
||||
### Step 1: Create module structure
|
||||
```bash
|
||||
mkdir -p stack_orchestrator/create
|
||||
touch stack_orchestrator/create/__init__.py
|
||||
```
|
||||
|
||||
### Step 2: Create the command file
|
||||
Create `stack_orchestrator/create/create_stack.py` with the code above.
|
||||
|
||||
### Step 3: Register in main.py
|
||||
Add the import and `cli.add_command()` line.
|
||||
|
||||
### Step 4: Test the command
|
||||
```bash
|
||||
# Show help
|
||||
laconic-so create-stack --help
|
||||
|
||||
# Dry run
|
||||
laconic-so --dry-run create-stack --name test-app --type webapp
|
||||
|
||||
# Create a stack
|
||||
laconic-so create-stack --name test-app --type webapp --repo github.com/org/test-app
|
||||
|
||||
# Verify
|
||||
ls -la stack_orchestrator/data/stacks/test-app/
|
||||
cat stack_orchestrator/data/stacks/test-app/stack.yml
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Template Types
|
||||
|
||||
| Type | Base Image | Port | Use Case |
|
||||
|------|------------|------|----------|
|
||||
| `webapp` | node:20-bullseye-slim | 3000 | React/Vue/Next.js apps |
|
||||
| `service` | python:3.11-slim | 8080 | Python backend services |
|
||||
| `empty` | none | none | Custom from scratch |
|
||||
|
||||
---
|
||||
|
||||
## Future Enhancements
|
||||
|
||||
1. **Interactive mode** - Prompt for values if not provided
|
||||
2. **More templates** - Go, Rust, database stacks
|
||||
3. **Template from existing** - `--from-stack existing-stack`
|
||||
4. **External stack support** - Create in custom directory
|
||||
5. **Validation command** - `laconic-so validate-stack --name my-stack`
|
||||
|
||||
---
|
||||
|
||||
## Files Modified
|
||||
|
||||
| File | Change |
|
||||
|------|--------|
|
||||
| `stack_orchestrator/create/__init__.py` | New (empty) |
|
||||
| `stack_orchestrator/create/create_stack.py` | New (command implementation) |
|
||||
| `stack_orchestrator/main.py` | Add import and `cli.add_command()` |
|
||||
|
||||
---
|
||||
|
||||
## Verification
|
||||
|
||||
```bash
|
||||
# 1. Command appears in help
|
||||
laconic-so --help | grep create-stack
|
||||
|
||||
# 2. Dry run works
|
||||
laconic-so --dry-run create-stack --name verify-test --type webapp
|
||||
|
||||
# 3. Full creation works
|
||||
laconic-so create-stack --name verify-test --type webapp
|
||||
ls stack_orchestrator/data/stacks/verify-test/
|
||||
ls stack_orchestrator/data/container-build/cerc-verify-test/
|
||||
ls stack_orchestrator/data/compose/docker-compose-verify-test.yml
|
||||
|
||||
# 4. Build works
|
||||
laconic-so --stack verify-test build-containers
|
||||
|
||||
# 5. Cleanup
|
||||
rm -rf stack_orchestrator/data/stacks/verify-test
|
||||
rm -rf stack_orchestrator/data/container-build/cerc-verify-test
|
||||
rm stack_orchestrator/data/compose/docker-compose-verify-test.yml
|
||||
```
|
||||
|
|
@ -0,0 +1,35 @@
|
|||
# TODO
|
||||
|
||||
## Features Needed
|
||||
|
||||
### Update Stack Command
|
||||
We need an "update stack" command in stack orchestrator and cleaner documentation regarding how to do continuous deployment with and without payments.
|
||||
|
||||
**Context**: Currently, `deploy init` generates a spec file and `deploy create` creates a deployment directory. The `deployment update` command (added by Thomas Lackey) only syncs env vars and restarts - it doesn't regenerate configurations. There's a gap in the workflow for updating stack configurations after initial deployment.
|
||||
|
||||
## Bugs
|
||||
|
||||
### `deploy create` doesn't auto-generate volume mappings for new pods
|
||||
|
||||
When a new pod is added to `stack.yml` (e.g. `monitoring`), `deploy create`
|
||||
does not generate default host path mappings in spec.yml for the new pod's
|
||||
volumes. The deployment then fails at scheduling because the PVCs don't exist.
|
||||
|
||||
**Expected**: `deploy create` enumerates all volumes from all compose files
|
||||
in the stack and generates default host paths for any that aren't already
|
||||
mapped in the spec.yml `volumes:` section.
|
||||
|
||||
**Actual**: Only volumes already in spec.yml get PVs. New volumes are silently
|
||||
missing, causing `FailedScheduling: persistentvolumeclaim not found`.
|
||||
|
||||
**Workaround**: Manually add volume entries to spec.yml and create host dirs.
|
||||
|
||||
**Files**: `deployment_create.py` (`_write_config_file`, volume handling)
|
||||
|
||||
## Architecture Refactoring
|
||||
|
||||
### Separate Deployer from Stack Orchestrator CLI
|
||||
The deployer logic should be decoupled from the CLI tool to allow independent development and reuse.
|
||||
|
||||
### Separate Stacks from Stack Orchestrator Repo
|
||||
Stacks should live in their own repositories, not bundled with the orchestrator tool. This allows stacks to evolve independently and be maintained by different teams.
|
||||
|
|
@ -1,277 +0,0 @@
|
|||
# agave-stack
|
||||
|
||||
Unified Agave/Jito Solana stack for [laconic-so](https://github.com/LaconicNetwork/stack-orchestrator). Deploys Solana validators, RPC nodes, and test validators as containers with optional [DoubleZero](https://doublezero.xyz) network routing.
|
||||
|
||||
## Modes
|
||||
|
||||
| Mode | Compose file | Use case |
|
||||
|------|-------------|----------|
|
||||
| `validator` | `docker-compose-agave.yml` | Voting validator (mainnet/testnet) |
|
||||
| `rpc` | `docker-compose-agave-rpc.yml` | Non-voting RPC node |
|
||||
| `test` | `docker-compose-agave-test.yml` | Local dev with instant finality |
|
||||
|
||||
Mode is selected via the `AGAVE_MODE` environment variable.
|
||||
|
||||
## Repository layout
|
||||
|
||||
```
|
||||
agave-stack/
|
||||
├── deployment/ # Reference deployment (biscayne)
|
||||
│ ├── spec.yml # k8s-kind deployment spec
|
||||
│ └── k8s-manifests/
|
||||
│ └── doublezero-daemonset.yaml # DZ DaemonSet (hostNetwork)
|
||||
├── stack-orchestrator/
|
||||
│ ├── stacks/agave/
|
||||
│ │ ├── stack.yml # laconic-so stack definition
|
||||
│ │ └── README.md # Stack-level docs
|
||||
│ ├── compose/
|
||||
│ │ ├── docker-compose-agave.yml # Voting validator
|
||||
│ │ ├── docker-compose-agave-rpc.yml # Non-voting RPC
|
||||
│ │ ├── docker-compose-agave-test.yml # Test validator
|
||||
│ │ └── docker-compose-doublezero.yml # DoubleZero daemon
|
||||
│ ├── container-build/
|
||||
│ │ ├── laconicnetwork-agave/ # Agave/Jito image
|
||||
│ │ │ ├── Dockerfile # Two-stage build from source
|
||||
│ │ │ ├── build.sh # laconic-so build script
|
||||
│ │ │ ├── entrypoint.sh # Mode router
|
||||
│ │ │ ├── start-validator.sh # Voting validator startup
|
||||
│ │ │ ├── start-rpc.sh # RPC node startup
|
||||
│ │ │ └── start-test.sh # Test validator + SPL setup
|
||||
│ │ └── laconicnetwork-doublezero/ # DoubleZero image
|
||||
│ │ ├── Dockerfile # Installs from Cloudsmith apt
|
||||
│ │ ├── build.sh
|
||||
│ │ └── entrypoint.sh
|
||||
│ └── config/agave/
|
||||
│ ├── restart-node.sh # Container restart helper
|
||||
│ └── restart.cron # Cron schedule for periodic restarts
|
||||
```
|
||||
|
||||
## Prerequisites
|
||||
|
||||
- [laconic-so](https://github.com/LaconicNetwork/stack-orchestrator) (stack orchestrator)
|
||||
- Docker
|
||||
- Kind (for k8s deployments)
|
||||
|
||||
## Building
|
||||
|
||||
```bash
|
||||
# Vanilla Agave v3.1.9
|
||||
laconic-so --stack agave build-containers
|
||||
|
||||
# Jito v3.1.8 (required for MEV)
|
||||
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
|
||||
AGAVE_VERSION=v3.1.8-jito \
|
||||
laconic-so --stack agave build-containers
|
||||
```
|
||||
|
||||
Build compiles from source (~30-60 min on first build). This produces both the `laconicnetwork/agave:local` and `laconicnetwork/doublezero:local` images.
|
||||
|
||||
## Deploying
|
||||
|
||||
### Test validator (local dev)
|
||||
|
||||
```bash
|
||||
laconic-so --stack agave deploy init --output spec.yml
|
||||
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-test
|
||||
laconic-so deployment --dir my-test start
|
||||
```
|
||||
|
||||
The test validator starts with instant finality and optionally creates SPL token mints and airdrops to configured pubkeys.
|
||||
|
||||
### Mainnet/testnet (Docker Compose)
|
||||
|
||||
```bash
|
||||
laconic-so --stack agave deploy init --output spec.yml
|
||||
# Edit spec.yml: set AGAVE_MODE, VALIDATOR_ENTRYPOINT, KNOWN_VALIDATOR, etc.
|
||||
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-node
|
||||
laconic-so deployment --dir my-node start
|
||||
```
|
||||
|
||||
### Kind/k8s deployment
|
||||
|
||||
The `deployment/spec.yml` provides a reference spec targeting `k8s-kind`. The compose files use `network_mode: host` which works for Docker Compose and is silently ignored by laconic-so's k8s conversion (it uses explicit ports from the deployment spec instead).
|
||||
|
||||
```bash
|
||||
laconic-so --stack agave deploy create \
|
||||
--spec-file deployment/spec.yml \
|
||||
--deployment-dir my-deployment
|
||||
|
||||
# Mount validator keypairs
|
||||
cp validator-identity.json my-deployment/data/validator-config/
|
||||
cp vote-account-keypair.json my-deployment/data/validator-config/ # validator mode only
|
||||
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Common (all modes)
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `AGAVE_MODE` | `test` | `test`, `rpc`, or `validator` |
|
||||
| `VALIDATOR_ENTRYPOINT` | *required* | Cluster entrypoint (host:port) |
|
||||
| `KNOWN_VALIDATOR` | *required* | Known validator pubkey |
|
||||
| `EXTRA_ENTRYPOINTS` | | Space-separated additional entrypoints |
|
||||
| `EXTRA_KNOWN_VALIDATORS` | | Space-separated additional known validators |
|
||||
| `RPC_PORT` | `8899` | RPC HTTP port |
|
||||
| `RPC_BIND_ADDRESS` | `127.0.0.1` | RPC bind address |
|
||||
| `GOSSIP_PORT` | `8001` | Gossip protocol port |
|
||||
| `DYNAMIC_PORT_RANGE` | `8000-10000` | TPU/TVU/repair UDP port range |
|
||||
| `LIMIT_LEDGER_SIZE` | `50000000` | Max ledger slots to retain |
|
||||
| `SNAPSHOT_INTERVAL_SLOTS` | `1000` | Full snapshot interval |
|
||||
| `MAXIMUM_SNAPSHOTS_TO_RETAIN` | `5` | Max full snapshots |
|
||||
| `EXPECTED_GENESIS_HASH` | | Cluster genesis verification |
|
||||
| `EXPECTED_SHRED_VERSION` | | Shred version verification |
|
||||
| `RUST_LOG` | `info` | Log level |
|
||||
| `SOLANA_METRICS_CONFIG` | | Metrics reporting config |
|
||||
|
||||
### Validator mode
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `VOTE_ACCOUNT_KEYPAIR` | `/data/config/vote-account-keypair.json` | Vote account keypair path |
|
||||
|
||||
Identity keypair must be mounted at `/data/config/validator-identity.json`.
|
||||
|
||||
### RPC mode
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `PUBLIC_RPC_ADDRESS` | | If set, advertise as public RPC |
|
||||
| `ACCOUNT_INDEXES` | `program-id,spl-token-owner,spl-token-mint` | Account indexes for queries |
|
||||
|
||||
Identity is auto-generated if not mounted.
|
||||
|
||||
### Jito MEV (validator and RPC modes)
|
||||
|
||||
Set `JITO_ENABLE=true` and provide:
|
||||
|
||||
| Variable | Description |
|
||||
|----------|-------------|
|
||||
| `JITO_BLOCK_ENGINE_URL` | Block engine endpoint |
|
||||
| `JITO_SHRED_RECEIVER_ADDR` | Shred receiver (region-specific) |
|
||||
| `JITO_RELAYER_URL` | Relayer URL (validator mode) |
|
||||
| `JITO_TIP_PAYMENT_PROGRAM` | Tip payment program pubkey |
|
||||
| `JITO_DISTRIBUTION_PROGRAM` | Tip distribution program pubkey |
|
||||
| `JITO_MERKLE_ROOT_AUTHORITY` | Merkle root upload authority |
|
||||
| `JITO_COMMISSION_BPS` | Commission basis points |
|
||||
|
||||
Image must be built from `jito-foundation/jito-solana` for Jito flags to work.
|
||||
|
||||
### Test mode
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `FACILITATOR_PUBKEY` | | Pubkey to airdrop SOL |
|
||||
| `SERVER_PUBKEY` | | Pubkey to airdrop SOL |
|
||||
| `CLIENT_PUBKEY` | | Pubkey to airdrop SOL + create ATA |
|
||||
| `MINT_DECIMALS` | `6` | SPL token decimals |
|
||||
| `MINT_AMOUNT` | `1000000` | SPL tokens to mint |
|
||||
|
||||
## DoubleZero
|
||||
|
||||
[DoubleZero](https://doublezero.xyz) provides optimized network routing for Solana validators via GRE tunnels (IP protocol 47) and BGP (TCP/179) over link-local 169.254.0.0/16. Validator traffic to other DZ participants is routed through private fiber instead of the public internet.
|
||||
|
||||
### How it works
|
||||
|
||||
`doublezerod` creates a `doublezero0` GRE tunnel interface and runs BGP peering through it. Routes are injected into the host routing table, so the validator transparently sends traffic over the fiber backbone. IBRL mode falls back to public internet if DZ is down.
|
||||
|
||||
### Requirements
|
||||
|
||||
- Validator identity keypair at `/data/config/validator-identity.json`
|
||||
- `privileged: true` + `NET_ADMIN` (GRE tunnel + route table manipulation)
|
||||
- `hostNetwork: true` (GRE uses IP protocol 47 — cannot be port-mapped)
|
||||
- Node registered with DoubleZero passport system
|
||||
|
||||
### Docker Compose
|
||||
|
||||
`docker-compose-doublezero.yml` runs alongside the validator with `network_mode: host`, sharing the `validator-config` volume for identity access.
|
||||
|
||||
### k8s
|
||||
|
||||
laconic-so does not pass `hostNetwork` through to generated k8s resources. DoubleZero runs as a DaemonSet applied after `deployment start`:
|
||||
|
||||
```bash
|
||||
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
|
||||
```
|
||||
|
||||
Since the validator pods share the node's network namespace, they automatically see the GRE routes injected by `doublezerod`.
|
||||
|
||||
| Variable | Default | Description |
|
||||
|----------|---------|-------------|
|
||||
| `VALIDATOR_IDENTITY_PATH` | `/data/config/validator-identity.json` | Validator identity keypair |
|
||||
| `DOUBLEZERO_RPC_ENDPOINT` | `http://127.0.0.1:8899` | Solana RPC for DZ registration |
|
||||
| `DOUBLEZERO_EXTRA_ARGS` | | Additional doublezerod arguments |
|
||||
|
||||
## Runtime requirements
|
||||
|
||||
The container requires the following (already set in compose files):
|
||||
|
||||
| Setting | Value | Why |
|
||||
|---------|-------|-----|
|
||||
| `privileged` | `true` | `mlock()` syscall and raw network access |
|
||||
| `cap_add` | `IPC_LOCK` | Memory page locking for account indexes and ledger |
|
||||
| `ulimits.memlock` | `-1` (unlimited) | Agave locks gigabytes of memory |
|
||||
| `ulimits.nofile` | `1000000` | Gossip/TPU connections + memory-mapped ledger files |
|
||||
| `network_mode` | `host` | Direct host network stack for gossip, TPU, UDP ranges |
|
||||
|
||||
Without these, Agave either refuses to start or dies under load.
|
||||
|
||||
## Container overhead
|
||||
|
||||
Containers with `privileged: true` and `network_mode: host` add **zero measurable overhead** vs bare metal. Linux containers are not VMs:
|
||||
|
||||
- **Network**: Host network namespace directly — no bridge, no NAT, no veth. Same kernel code path as bare metal.
|
||||
- **CPU**: No hypervisor. Same physical cores, same scheduler priority.
|
||||
- **Memory**: `IPC_LOCK` + unlimited memlock = identical `mlock()` behavior.
|
||||
- **Disk I/O**: hostPath-backed PVs have identical I/O characteristics.
|
||||
|
||||
The only overhead is cgroup accounting (nanoseconds per syscall) and overlayfs for cold file opens (single-digit microseconds, zero once cached).
|
||||
|
||||
## Scheduled restarts
|
||||
|
||||
The `config/agave/restart.cron` defines periodic restarts to mitigate memory growth:
|
||||
|
||||
- **Validator**: every 4 hours
|
||||
- **RPC**: every 6 hours (staggered 30 min offset)
|
||||
|
||||
Uses `restart-node.sh` which sends TERM to the matching container for graceful shutdown.
|
||||
|
||||
## Biscayne reference deployment
|
||||
|
||||
The `deployment/` directory contains a reference deployment for biscayne.vaasl.io (186.233.184.235), a mainnet voting validator with Jito MEV and DoubleZero:
|
||||
|
||||
```bash
|
||||
# Build Jito image
|
||||
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
|
||||
AGAVE_VERSION=v3.1.8-jito \
|
||||
laconic-so --stack agave build-containers
|
||||
|
||||
# Create deployment
|
||||
laconic-so --stack agave deploy create \
|
||||
--spec-file deployment/spec.yml \
|
||||
--deployment-dir biscayne-deployment
|
||||
|
||||
# Mount keypairs
|
||||
cp validator-identity.json biscayne-deployment/data/validator-config/
|
||||
cp vote-account-keypair.json biscayne-deployment/data/validator-config/
|
||||
|
||||
# Start
|
||||
laconic-so deployment --dir biscayne-deployment start
|
||||
|
||||
# Start DoubleZero
|
||||
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
|
||||
```
|
||||
|
||||
To run as non-voting RPC, change `AGAVE_MODE: rpc` in `deployment/spec.yml`.
|
||||
|
||||
## Volumes
|
||||
|
||||
| Volume | Mount | Content |
|
||||
|--------|-------|---------|
|
||||
| `validator-config` / `rpc-config` | `/data/config` | Identity keypairs, node config |
|
||||
| `validator-ledger` / `rpc-ledger` | `/data/ledger` | Blockchain ledger data |
|
||||
| `validator-accounts` / `rpc-accounts` | `/data/accounts` | Account state cache |
|
||||
| `validator-snapshots` / `rpc-snapshots` | `/data/snapshots` | Full and incremental snapshots |
|
||||
| `doublezero-config` | `~/.config/doublezero` | DZ identity and state |
|
||||
|
|
@ -1,198 +0,0 @@
|
|||
# Work in Progress: Biscayne TVU Shred Relay
|
||||
|
||||
## Overview
|
||||
|
||||
Biscayne's agave validator was shred-starved (~1.7 slots/sec replay vs ~2.5 mainnet).
|
||||
Root cause: not enough turbine shreds arriving. Solution: advertise a TVU address in
|
||||
Ashburn (dense validator population, better turbine tree neighbors) and relay shreds
|
||||
to biscayne in Miami over the laconic backbone.
|
||||
|
||||
### Architecture
|
||||
|
||||
```
|
||||
Turbine peers (hundreds of validators)
|
||||
|
|
||||
v UDP shreds to port 20000
|
||||
laconic-was-sw01 Et1/1 (64.92.84.81, Ashburn)
|
||||
| ASIC receives on front-panel interface
|
||||
| EOS monitor session mirrors matched packets to CPU
|
||||
v
|
||||
mirror0 interface (Linux userspace)
|
||||
| socat reads raw frames, sends as UDP
|
||||
v 172.16.1.188 -> 186.233.184.235:9100 (Et4/1 backbone, 25.4ms RTT)
|
||||
laconic-mia-sw01 Et4/1 (172.16.1.189, Miami)
|
||||
| forwards via default route (Et1/1, same metro)
|
||||
v 0.13ms
|
||||
biscayne:9100 (186.233.184.235, Miami)
|
||||
| shred-unwrap.py strips IP+UDP headers
|
||||
v clean shred payload to localhost:9000
|
||||
agave-validator TVU port
|
||||
```
|
||||
|
||||
Total one-way relay latency: ~12.8ms
|
||||
|
||||
### Results
|
||||
|
||||
Before relay: ~1.7 slots/sec replay, falling behind ~0.8 slots/sec.
|
||||
After relay: ~3.32 slots/sec replay, catching up ~0.82 slots/sec.
|
||||
|
||||
---
|
||||
|
||||
## Changes by Host
|
||||
|
||||
### laconic-was-sw01 (Ashburn) — `install@137.239.200.198`
|
||||
|
||||
All changes are ephemeral (not persisted, lost on reboot).
|
||||
|
||||
**1. EOS monitor session (running-config, not in startup-config)**
|
||||
|
||||
Mirrors inbound UDP port 20000 traffic on Et1/1 to a CPU-accessible `mirror0` interface.
|
||||
Required because the Arista 7280CR3A ASIC handles front-panel traffic without punting to
|
||||
Linux userspace — regular sockets cannot receive packets on front-panel IPs.
|
||||
|
||||
```
|
||||
monitor session 1 source Ethernet1/1 rx
|
||||
monitor session 1 ip access-group SHRED-RELAY
|
||||
monitor session 1 destination Cpu
|
||||
```
|
||||
|
||||
**2. EOS ACL (running-config, not in startup-config)**
|
||||
|
||||
```
|
||||
ip access-list SHRED-RELAY
|
||||
10 permit udp any any eq 20000
|
||||
```
|
||||
|
||||
**3. EOS static route (running-config, not in startup-config)**
|
||||
|
||||
```
|
||||
ip route 186.233.184.235/32 172.16.1.189
|
||||
```
|
||||
|
||||
Routes biscayne traffic via Et4/1 backbone to laconic-mia-sw01 instead of the default
|
||||
route (64.92.84.80, Cogent public internet).
|
||||
|
||||
**4. Linux kernel static route (ephemeral, `ip route add`)**
|
||||
|
||||
```
|
||||
ip route add 186.233.184.235/32 via 172.16.1.189 dev et4_1
|
||||
```
|
||||
|
||||
Required because socat runs in Linux userspace. The EOS static route programs the ASIC
|
||||
but does not always sync to the Linux kernel routing table. Without this, socat's UDP
|
||||
packets egress via the default route (et1_1, public internet).
|
||||
|
||||
**5. socat relay process (foreground, pts/5)**
|
||||
|
||||
```bash
|
||||
sudo socat -u INTERFACE:mirror0,type=2 UDP-SENDTO:186.233.184.235:9100
|
||||
```
|
||||
|
||||
Reads raw L2 frames from mirror0 (SOCK_DGRAM strips ethernet header, leaving IP+UDP+payload).
|
||||
Sends each frame as a UDP datagram to biscayne:9100. Runs as root (raw socket access to mirror0).
|
||||
|
||||
PID: 27743 (child of sudo PID 27742)
|
||||
|
||||
---
|
||||
|
||||
### laconic-mia-sw01 (Miami) — `install@209.42.167.130`
|
||||
|
||||
**No changes made.** MIA already reaches biscayne at 0.13ms via its default route
|
||||
(`209.42.167.132` on Et1/1, same metro). Relay traffic from WAS arrives on Et4/1
|
||||
(`172.16.1.189`) and MIA forwards to `186.233.184.235` natively.
|
||||
|
||||
Key interfaces for reference:
|
||||
- Et1/1: `209.42.167.133/31` (public uplink, default route via 209.42.167.132)
|
||||
- Et4/1: `172.16.1.189/31` (backbone link to WAS, peer 172.16.1.188)
|
||||
- Et8/1: `172.16.1.192/31` (another backbone link, not used for relay)
|
||||
|
||||
---
|
||||
|
||||
### biscayne (Miami) — `rix@biscayne.vaasl.io`
|
||||
|
||||
**1. Custom agave image: `laconicnetwork/agave:tvu-relay`**
|
||||
|
||||
Stock agave v3.1.9 with cherry-picked commit 9f4b3ae from anza master (adds
|
||||
`--public-tvu-address` flag, from anza PR #6778). Built in `/tmp/agave-tvu-patch/`,
|
||||
transferred via `docker save | scp | docker load | kind load docker-image`.
|
||||
|
||||
**2. K8s deployment changes**
|
||||
|
||||
Namespace: `laconic-laconic-70ce4c4b47e23b85`
|
||||
Deployment: `laconic-70ce4c4b47e23b85-deployment`
|
||||
|
||||
Changes from previous deployment:
|
||||
- Image: `laconicnetwork/agave:local` -> `laconicnetwork/agave:tvu-relay`
|
||||
- Added env: `PUBLIC_TVU_ADDRESS=64.92.84.81:20000`
|
||||
- Set: `JITO_ENABLE=false` (stock agave has no Jito flags)
|
||||
- Strategy: changed to `Recreate` (hostNetwork port conflicts prevent RollingUpdate)
|
||||
|
||||
The validator runs with `--public-tvu-address 64.92.84.81:20000`, causing it to
|
||||
advertise the Ashburn switch IP as its TVU address in gossip. Turbine tree peers
|
||||
send shreds to Ashburn instead of directly to Miami.
|
||||
|
||||
**3. shred-unwrap.py (foreground process, PID 2497694)**
|
||||
|
||||
```bash
|
||||
python3 /tmp/shred-unwrap.py 9100 127.0.0.1 9000
|
||||
```
|
||||
|
||||
Listens on UDP port 9100, strips IP+UDP headers from mirrored packets (variable-length
|
||||
IP header via IHL field + 8-byte UDP header), forwards clean shred payloads to
|
||||
localhost:9000 (the validator's TVU port). Running as user `rix`.
|
||||
|
||||
Script location: `/tmp/shred-unwrap.py`
|
||||
|
||||
**4. agave-stack repo changes (uncommitted)**
|
||||
|
||||
- `stack-orchestrator/container-build/laconicnetwork-agave/start-rpc.sh`:
|
||||
Added `PUBLIC_TVU_ADDRESS` to header docs and
|
||||
`[ -n "${PUBLIC_TVU_ADDRESS:-}" ] && ARGS+=(--public-tvu-address "$PUBLIC_TVU_ADDRESS")`
|
||||
|
||||
- `stack-orchestrator/compose/docker-compose-agave-rpc.yml`:
|
||||
Added `PUBLIC_TVU_ADDRESS: ${PUBLIC_TVU_ADDRESS:-}` to environment section
|
||||
|
||||
---
|
||||
|
||||
## What's NOT Production-Ready
|
||||
|
||||
### Ephemeral processes
|
||||
- socat on laconic-was-sw01: foreground process in a terminal session
|
||||
- shred-unwrap.py on biscayne: foreground process, running from /tmp
|
||||
- Both die if the terminal disconnects or the host reboots
|
||||
- Need systemd units for both
|
||||
|
||||
### Ephemeral switch config
|
||||
- Monitor session, ACL, and static routes on was-sw01 are in running-config only
|
||||
- Not saved to startup-config (`write memory` was run but the route didn't persist)
|
||||
- Linux kernel route (`ip route add`) is completely ephemeral
|
||||
- All lost on switch reboot
|
||||
|
||||
### No monitoring
|
||||
- No alerting on relay health (socat crash, shred-unwrap crash, packet loss)
|
||||
- No metrics on relay throughput vs direct turbine throughput
|
||||
- No comparison of before/after slot gap trends
|
||||
|
||||
### Validator still catching up
|
||||
- ~50k slots behind as of initial relay activation
|
||||
- Catching up at ~0.82 slots/sec (~2,950 slots/hour)
|
||||
- ~17 hours to catch up from current position, or reset with fresh snapshot (~15-30 min)
|
||||
|
||||
---
|
||||
|
||||
## Key Details
|
||||
|
||||
| Item | Value |
|
||||
|------|-------|
|
||||
| Biscayne validator identity | `4WeLUxfQghbhsLEuwaAzjZiHg2VBw87vqHc4iZrGvKPr` |
|
||||
| Biscayne IP | `186.233.184.235` |
|
||||
| laconic-was-sw01 public IP | `64.92.84.81` (Et1/1) |
|
||||
| laconic-was-sw01 backbone IP | `172.16.1.188` (Et4/1) |
|
||||
| laconic-was-sw01 SSH | `install@137.239.200.198` |
|
||||
| laconic-mia-sw01 backbone IP | `172.16.1.189` (Et4/1) |
|
||||
| laconic-mia-sw01 SSH | `install@209.42.167.130` |
|
||||
| Biscayne SSH | `rix@biscayne.vaasl.io` (via ProxyJump abernathy) |
|
||||
| Backbone RTT (WAS-MIA) | 25.4ms (Et4/1 ↔ Et4/1, 0.01ms jitter) |
|
||||
| Relay one-way latency | ~12.8ms |
|
||||
| Agave image | `laconicnetwork/agave:tvu-relay` (v3.1.9 + commit 9f4b3ae) |
|
||||
| EOS version | 4.34.0F |
|
||||
|
|
@ -1,193 +0,0 @@
|
|||
---
# Redeploy agave-stack on biscayne with aria2c snapshot pre-download
#
# Usage:
#   # Standard redeploy (download snapshot, preserve accounts + ledger)
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml
#
#   # Full wipe (accounts + ledger) — slow rebuild
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml \
#     -e wipe_accounts=true -e wipe_ledger=true
#
#   # Skip snapshot download (use existing)
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml \
#     -e skip_snapshot=true
#
#   # Pass extra args to snapshot-download.py
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml \
#     -e 'snapshot_args=--version 2.2 --min-download-speed 50'
#
#   # Snapshot only (no redeploy)
#   ansible-playbook -i biscayne.vaasl.io, ansible/biscayne-redeploy.yml --tags snapshot
#
- name: Redeploy agave validator on biscayne
  hosts: all
  gather_facts: false
  vars:
    deployment_dir: /srv/deployments/agave
    laconic_so: /home/rix/.local/bin/laconic-so
    kind_cluster: laconic-70ce4c4b47e23b85
    k8s_namespace: "laconic-{{ kind_cluster }}"
    snapshot_dir: /srv/solana/snapshots
    ledger_dir: /srv/solana/ledger
    accounts_dir: /srv/solana/ramdisk/accounts
    ramdisk_mount: /srv/solana/ramdisk
    ramdisk_device: /dev/ram0
    snapshot_script_local: "{{ playbook_dir }}/../scripts/snapshot-download.py"
    snapshot_script: /tmp/snapshot-download.py
    # Flags — non-destructive by default
    wipe_accounts: false
    wipe_ledger: false
    skip_snapshot: false
    snapshot_args: ""

  tasks:
    # --- Snapshot download (runs while validator is still up) ---
    - name: Verify aria2c installed
      command: which aria2c
      changed_when: false
      when: not skip_snapshot | bool
      tags: [snapshot]

    - name: Copy snapshot script to remote
      copy:
        src: "{{ snapshot_script_local }}"
        dest: "{{ snapshot_script }}"
        mode: "0755"
      when: not skip_snapshot | bool
      tags: [snapshot]

    - name: Download snapshot via aria2c
      command: >-
        python3 {{ snapshot_script }}
        -o {{ snapshot_dir }}
        {{ snapshot_args }}
      become: true
      register: snapshot_result
      when: not skip_snapshot | bool
      timeout: 3600
      tags: [snapshot]

    # stdout_lines is absent when the download task was skipped, hence the default.
    - name: Show snapshot download result
      debug:
        msg: "{{ snapshot_result.stdout_lines | default(['skipped']) }}"
      tags: [snapshot]

    # --- Teardown (namespace only, preserve kind cluster) ---
    # A failing delete is tolerated; the wait below only runs when it succeeded.
    - name: Delete deployment namespace
      command: >-
        kubectl delete namespace {{ k8s_namespace }} --timeout=120s
      register: ns_delete
      changed_when: ns_delete.rc == 0
      failed_when: false
      tags: [teardown]

    # Polls until `kubectl get namespace` itself fails (rc != 0); read-only probe.
    - name: Wait for namespace to terminate
      command: >-
        kubectl get namespace {{ k8s_namespace }}
        -o jsonpath='{.status.phase}'
      register: ns_status
      retries: 30
      delay: 5
      until: ns_status.rc != 0
      changed_when: false
      failed_when: false
      when: ns_delete.rc == 0
      tags: [teardown]

    # --- Data wipe (opt-in) ---
    - name: Wipe ledger data
      shell: |
        set -eu
        ledger_dir="{{ ledger_dir }}"
        # ${var:?} aborts instead of expanding to "/*" if the path is empty.
        rm -rf -- "${ledger_dir:?}"/*
      become: true
      when: wipe_ledger | bool
      tags: [wipe]

    # set -e makes every step fatal. Previously a failed umount was swallowed
    # by "|| true", mkfs/mount then failed too, and the trailing mkdir/chown
    # still made the task exit 0 without wiping anything.
    - name: Wipe accounts ramdisk (umount + mkfs + mount)
      shell: |
        set -eu
        if mountpoint -q {{ ramdisk_mount }}; then
          umount {{ ramdisk_mount }}
        fi
        mkfs.ext4 -q {{ ramdisk_device }}
        mount {{ ramdisk_device }} {{ ramdisk_mount }}
        mkdir -p {{ accounts_dir }}
        chown solana:solana {{ ramdisk_mount }} {{ accounts_dir }}
      become: true
      when: wipe_accounts | bool
      tags: [wipe]

    # Keeps the most recently modified full snapshot and incremental snapshot;
    # every other *.tar.* file directly inside snapshot_dir is deleted.
    - name: Clean old snapshots (keep newest full + incremental)
      shell: |
        cd {{ snapshot_dir }} || exit 0
        newest=$(ls -t snapshot-*.tar.* 2>/dev/null | head -1)
        if [ -n "$newest" ]; then
          newest_inc=$(ls -t incremental-snapshot-*.tar.* 2>/dev/null | head -1)
          find . -maxdepth 1 -name '*.tar.*' \
            ! -name "$newest" \
            ! -name "${newest_inc:-__none__}" \
            -delete
        fi
      become: true
      when: not skip_snapshot | bool
      tags: [wipe]

    # --- Deploy ---
    - name: Verify kind-config.yml has unified mount root
      command: "grep -c 'containerPath: /mnt$' {{ deployment_dir }}/kind-config.yml"
      register: mount_root_check
      changed_when: false
      failed_when: mount_root_check.stdout | int < 1
      tags: [deploy]

    - name: Start deployment
      command: "{{ laconic_so }} deployment --dir {{ deployment_dir }} start"
      timeout: 600
      tags: [deploy]

    # Only the first pod in the namespace is inspected.
    - name: Wait for pod to be running
      command: >-
        kubectl get pods -n {{ k8s_namespace }}
        -o jsonpath='{.items[0].status.phase}'
      register: pod_status
      retries: 60
      delay: 10
      until: pod_status.stdout == "Running"
      changed_when: false
      tags: [deploy]

    # --- Verify ---
    - name: Verify unified mount inside kind node
      command: "docker exec {{ kind_cluster }}-control-plane ls /mnt/solana/"
      register: mount_check
      changed_when: false
      tags: [verify]

    - name: Show mount contents
      debug:
        msg: "{{ mount_check.stdout_lines }}"
      tags: [verify]

    # Best-effort: a missing log file is reported by "Report status", not fatal.
    - name: Check validator log file is being written
      command: >-
        kubectl exec -n {{ k8s_namespace }}
        deployment/{{ kind_cluster }}-deployment
        -c agave-validator -- test -f /data/log/validator.log
      register: log_file_check
      retries: 12
      delay: 10
      until: log_file_check.rc == 0
      changed_when: false
      failed_when: false
      tags: [verify]

    # Best-effort: an unhealthy RPC is reported by "Report status", not fatal.
    - name: Check RPC health
      uri:
        url: http://127.0.0.1:8899/health
        return_content: true
      register: rpc_health
      retries: 6
      delay: 10
      until: rpc_health.status == 200
      failed_when: false
      delegate_to: "{{ inventory_hostname }}"
      tags: [verify]

    - name: Report status
      debug:
        msg: >-
          Deployment complete.
          Log: {{ 'writing' if log_file_check.rc == 0 else 'not yet created' }}.
          RPC: {{ rpc_health.content | default('not responding') }}.
          Wiped: ledger={{ wipe_ledger }}, accounts={{ wipe_accounts }}.
      tags: [verify]
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
# DoubleZero DaemonSet.
#
# Applied with kubectl after 'deployment start', separately from the
# laconic-so deployment: laconic-so does not support hostNetwork in the
# k8s resources it generates.
#
# DoubleZero creates GRE tunnels (IP protocol 47) and runs BGP (tcp/179)
# on link-local 169.254.0.0/16, which requires host network access. The
# GRE routes injected into the node routing table are automatically
# visible to all pods using hostNetwork.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: doublezero
  labels: {app: doublezero}
spec:
  selector:
    matchLabels: {app: doublezero}
  template:
    metadata:
      labels: {app: doublezero}
    spec:
      hostNetwork: true
      volumes:
        - name: validator-config
          persistentVolumeClaim: {claimName: validator-config}
        - name: doublezero-config
          persistentVolumeClaim: {claimName: doublezero-config}
      containers:
        - name: doublezerod
          image: laconicnetwork/doublezero:local
          securityContext:
            privileged: true
            capabilities:
              add: [NET_ADMIN]
          env:
            - name: VALIDATOR_IDENTITY_PATH
              value: /data/config/validator-identity.json
            - name: DOUBLEZERO_RPC_ENDPOINT
              value: http://127.0.0.1:8899
          volumeMounts:
            # Validator config is mounted read-only.
            - name: validator-config
              mountPath: /data/config
              readOnly: true
            - name: doublezero-config
              mountPath: /root/.config/doublezero
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
# Biscayne Solana Validator deployment spec
# Host: biscayne.vaasl.io (186.233.184.235)
# Identity: 4WeLUxfQghbhsLEuwaAzjZiHg2VBw87vqHc4iZrGvKPr
stack: /srv/deployments/agave-stack/stack-orchestrator/stacks/agave
deploy-to: k8s-kind
kind-mount-root: /srv/kind

network:
  http-proxy:
    - host-name: biscayne.vaasl.io
      routes:
        # NOTE(review): both routes use path "/"; confirm the proxy tells them
        # apart by the websocket flag.
        - path: /
          proxy-to: "agave-validator:8899"
        - path: /
          proxy-to: "agave-validator:8900"
          websocket: true
  ports:
    agave-validator:
      - "8899"
      - "8900"
      - "8001"
      - 8001/udp
      # NOTE(review): 9000-9025/udp are listed here while DYNAMIC_PORT_RANGE
      # below is 9000-10000 — confirm the narrower published range is intended.
      - 9000/udp
      - 9001/udp
      - 9002/udp
      - 9003/udp
      - 9004/udp
      - 9005/udp
      - 9006/udp
      - 9007/udp
      - 9008/udp
      - 9009/udp
      - 9010/udp
      - 9011/udp
      - 9012/udp
      - 9013/udp
      - 9014/udp
      - 9015/udp
      - 9016/udp
      - 9017/udp
      - 9018/udp
      - 9019/udp
      - 9020/udp
      - 9021/udp
      - 9022/udp
      - 9023/udp
      - 9024/udp
      - 9025/udp

resources:
  containers:
    reservations:
      cpus: "4.0"
      memory: 256000M
    limits:
      cpus: "32.0"
      memory: 921600M

security:
  privileged: true
  unlimited-memlock: true
  capabilities:
    - IPC_LOCK

volumes:
  # Config volumes — on ZFS dataset (backed up via snapshots)
  validator-config: /srv/deployments/agave/data/validator-config
  doublezero-validator-identity: /srv/deployments/agave/data/validator-config
  doublezero-config: /srv/deployments/agave/data/doublezero-config
  # Heavy data volumes — on zvol/ramdisk (not backed up, rebuildable)
  validator-ledger: /srv/kind/solana/ledger
  validator-accounts: /srv/kind/solana/ramdisk/accounts
  validator-snapshots: /srv/kind/solana/snapshots
  validator-log: /srv/kind/solana/log
  # Monitoring
  monitoring-influxdb-data: /srv/kind/solana/monitoring/influxdb
  monitoring-grafana-data: /srv/kind/solana/monitoring/grafana

configmaps:
  monitoring-telegraf-config: config/monitoring/telegraf-config
  monitoring-telegraf-scripts: config/monitoring/scripts
  monitoring-grafana-datasources: config/monitoring/grafana-datasources
  monitoring-grafana-dashboards: config/monitoring/grafana-dashboards

config:
  # Mode: 'rpc' (non-voting) — matches current biscayne systemd config
  AGAVE_MODE: rpc

  # Mainnet entrypoints
  VALIDATOR_ENTRYPOINT: "entrypoint.mainnet-beta.solana.com:8001"
  EXTRA_ENTRYPOINTS: >-
    entrypoint2.mainnet-beta.solana.com:8001
    entrypoint3.mainnet-beta.solana.com:8001
    entrypoint4.mainnet-beta.solana.com:8001
    entrypoint5.mainnet-beta.solana.com:8001

  # Known validators (Solana Foundation, Everstake, Chorus One)
  KNOWN_VALIDATOR: 7Np41oeYqPefeNQEHSv1UDhYrehxin3NStELsSKCT4K2
  EXTRA_KNOWN_VALIDATORS: >-
    GdnSyH3YtwcxFvQrVVJMm1JhTS4QVX7MFsX56uJLUfiZ
    dDzy5SR3AXdYWVqbDEkVFdvSPCtS9ihF5kJkHCtXoFs
    DE1bawNcRJB9rVm3buyMVfr8mBEoyyu73NBovf2oXJsJ
    CakcnaRDHka2gXyfbEd2d3xsvkJkqsLw2akB3zsN1D2S
    C1ocKDYMCm2ooWptMMnpd5VEB2Nx4UMJgRuYofysyzcA
    GwHH8ciFhR8vejWCqmg8FWZUCNtubPY2esALvy5tBvji
    6WgdYhhGE53WrZ7ywJA15hBVkw7CRbQ8yDBBTwmBtAHN

  # Network
  RPC_PORT: "8899"
  RPC_BIND_ADDRESS: "0.0.0.0"
  GOSSIP_PORT: "8001"
  # NOTE(review): differs from the host IP in the header comment — confirm.
  GOSSIP_HOST: "137.239.194.65"
  DYNAMIC_PORT_RANGE: "9000-10000"

  # Cluster verification
  EXPECTED_GENESIS_HASH: 5eykt4UsFv8P8NJdTREpY1vzqKqZKvdpKuc147dw2N9d
  EXPECTED_SHRED_VERSION: "50093"

  # Storage
  LIMIT_LEDGER_SIZE: "50000000"
  MAXIMUM_SNAPSHOTS_TO_RETAIN: "1"
  NO_INCREMENTAL_SNAPSHOTS: "false"
  RUST_LOG: "info,solana_metrics=warn"
  # NOTE(review): credentials are embedded in this value — confirm that is acceptable.
  SOLANA_METRICS_CONFIG: "host=http://localhost:8086,db=agave_metrics,u=admin,p=admin"

  # Jito MEV (NY region shred receiver) — disabled until voting enabled
  JITO_ENABLE: "false"
  JITO_BLOCK_ENGINE_URL: "https://mainnet.block-engine.jito.wtf"
  JITO_SHRED_RECEIVER_ADDR: "141.98.216.96:1002"
  JITO_TIP_PAYMENT_PROGRAM: T1pyyaTNZsKv2WcRAB8oVnk93mLJw2XzjtVYqCsaHqt
  JITO_DISTRIBUTION_PROGRAM: 4R3gSG8BpU4t19KYj8CfnbtRpnT8gtk4dvTHxVRwc2r7
  JITO_MERKLE_ROOT_AUTHORITY: 8F4jGUmxF36vQ6yabnsxX6AQVXdKBhs8kGSUuRKSg8Xt
  JITO_COMMISSION_BPS: "800"

  # DoubleZero
  DOUBLEZERO_RPC_ENDPOINT: "http://127.0.0.1:8899"
|
||||
|
|
@ -1,234 +0,0 @@
|
|||
#!/bin/bash

set -E -e -u -o pipefail

export PATH=/sbin:/bin:/usr/sbin:/usr/bin:/usr/local/sbin:/usr/local/bin
export XDG_RUNTIME_DIR="/run/user/$(id -u)"
mkdir -p "$XDG_RUNTIME_DIR"

# Optional first argument: becomes "-<arg>" when non-empty, otherwise "".
SUFFIX="${1:+-$1}"

# --- what to snapshot, and where the deployment lives ---
DATASET="biscayne/DATA/deployments"
DEPLOYMENT_DIR="/srv/deployments/agave"

# --- logging and timing ---
LOG_FILE="$HOME/.backlog_history"
SERVICE_STOP_TIMEOUT=300

# --- snapshot naming, hold tag and retention ---
ZFS_HOLD="backlog:pending"
SNAPSHOT_RETENTION=6
SNAPSHOT_PREFIX="backlog"
SNAPSHOT_TAG="$(date +%Y%m%d)${SUFFIX}"
SNAPSHOT="${DATASET}@${SNAPSHOT_PREFIX}-${SNAPSHOT_TAG}"

# --- remote replication targets, one "host:dataset" entry each ---
REMOTES=(
  "mysterio:edith/DATA/backlog/biscayne-main"
  "ardham:batterywharf/DATA/backlog/biscayne-main"
)
|
||||
|
||||
# log functions
|
||||
log() {
  # Append "$1" to $LOG_FILE, prefixed with a UTC ISO-8601 timestamp.
  local stamp
  stamp=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
  printf '[%s] %s\n' "$stamp" "$1" >> "$LOG_FILE"
}
|
||||
|
||||
log_close() {
  # Log the elapsed time since $start_time, then a blank separator line.
  local finished elapsed
  finished=$(date +%s)
  elapsed=$(( finished - start_time ))
  log "Backlog completed in ${elapsed}s"
  printf '\n' >> "$LOG_FILE"
}
|
||||
|
||||
# service controls
|
||||
services() {
  # Stop or start the agave deployment via laconic-so.
  # args: action — "stop" or "start"; anything else is logged and exits 2.
  local action="$1"

  case "$action" in
    start)
      log "Starting agave deployment..."
      laconic-so deployment --dir "$DEPLOYMENT_DIR" start
      ;;
    stop)
      log "Stopping agave deployment..."
      laconic-so deployment --dir "$DEPLOYMENT_DIR" stop

      log "Waiting for services to fully stop..."
      local give_up_at=$(( $(date +%s) + SERVICE_STOP_TIMEOUT ))
      local running
      # Poll until docker lists no container whose compose working_dir label
      # is the deployment directory, or until the timeout elapses.
      while true; do
        running=$(docker ps --filter "label=com.docker.compose.project.working_dir=$DEPLOYMENT_DIR" -q 2>/dev/null | wc -l)
        if [[ "$running" -eq 0 ]]; then
          break
        elif (( $(date +%s) >= give_up_at )); then
          log "WARNING: Timeout waiting for services to stop; continuing."
          break
        fi
        sleep 0.2
      done
      ;;
    *)
      log "ERROR: Unknown action '$action' in services()"
      exit 2
      ;;
  esac
}
|
||||
|
||||
# Replicate one snapshot to one remote over ssh.
# args: snap remote_host remote_dataset
# returns: 0 when the send succeeded, 1 when it failed or the remote is stale
snapshot_send_one() {
  local snap="$1" remote_host="$2" remote_dataset="$3"

  log "Checking remote snapshots on $remote_host..."

  local -a local_snaps remote_snaps
  mapfile -t local_snaps < <(zfs list -H -t snapshot -o name -s creation -d1 "$DATASET" | grep -F "${DATASET}@${SNAPSHOT_PREFIX}-")
  mapfile -t remote_snaps < <(ssh "$remote_host" zfs list -H -t snapshot -o name -s creation "$remote_dataset" | grep -F "${remote_dataset}@${SNAPSHOT_PREFIX}-" || true)

  # Walk local snapshots oldest to newest; the last one that also exists on
  # the remote wins, so $base ends up as the latest common snapshot.
  local base=""
  local candidate wanted present
  for candidate in "${local_snaps[@]}"; do
    wanted="${candidate/$DATASET/$remote_dataset}"
    for present in "${remote_snaps[@]}"; do
      if [[ "$present" == "$wanted" ]]; then
        base="$candidate"
        break
      fi
    done
  done

  # Common base available: incremental send.
  if [[ -n "$base" ]]; then
    log "Common base snapshot $base found — sending incremental to $remote_host."
    if zfs send -i "$base" "$snap" | ssh "$remote_host" zfs receive -sF "$remote_dataset"; then
      log "Incremental send to $remote_host succeeded."
      return 0
    fi
    log "ERROR: Incremental send to $remote_host failed."
    return 1
  fi

  # Remote has snapshots but none in common with local: refuse to send.
  if [[ ${#remote_snaps[@]} -ne 0 ]]; then
    log "STALE DESTINATION: $remote_host has snapshots but no common base with local — skipping."
    return 1
  fi

  # Nothing on the remote yet: full send.
  log "No remote snapshots found on $remote_host — sending full snapshot."
  if zfs send "$snap" | ssh "$remote_host" zfs receive -sF "$remote_dataset"; then
    log "Full send to $remote_host succeeded."
    return 0
  fi
  log "ERROR: Full send to $remote_host failed."
  return 1
}
|
||||
|
||||
# Replicate a snapshot to every entry in REMOTES, continuing past failures.
# args: snap
# returns: 0 when all remotes succeeded, 1 when at least one did not
snapshot_send() {
  local snap="$1"
  local failure_count=0
  local target

  # errexit is switched off for the loop so one remote cannot abort the rest.
  set +e
  for target in "${REMOTES[@]}"; do
    # Each entry is "host:dataset"; split on the first colon.
    snapshot_send_one "$snap" "${target%%:*}" "${target#*:}" \
      || failure_count=$((failure_count + 1))
  done
  set -e

  if [[ "$failure_count" -eq 0 ]]; then
    return 0
  fi

  log "WARNING: $failure_count destination(s) failed or are out of sync."
  return 1
}
|
||||
|
||||
# snapshot management
# args: action — one of:
#   create  take $SNAPSHOT and place a $ZFS_HOLD hold on it
#   send    replicate $SNAPSHOT to all remotes; release the hold on success
#   prune   destroy the oldest destroyable snapshots beyond $SNAPSHOT_RETENTION
# Any other action is logged and exits 2.
snapshot() {
  local action="$1"

  case "$action" in
    create)
      log "Creating snapshot: $SNAPSHOT"
      zfs snapshot "$SNAPSHOT"
      # A failed hold is logged but does not abort the run.
      zfs hold "$ZFS_HOLD" "$SNAPSHOT" || log "ERROR: Failed to hold $SNAPSHOT"
      ;;
    send)
      log "Sending snapshot $SNAPSHOT..."
      if snapshot_send "$SNAPSHOT"; then
        log "Snapshot send completed. Releasing hold."
        zfs release "$ZFS_HOLD" "$SNAPSHOT" || log "ERROR: Failed to release hold on $SNAPSHOT"
      else
        # Keeping the hold makes the prune step skip this snapshot.
        log "WARNING: Snapshot send encountered errors. Hold retained on $SNAPSHOT."
      fi
      ;;
    prune)
      if [[ "$SNAPSHOT_RETENTION" -gt 0 ]]; then
        log "Pruning old snapshots in $DATASET (retaining $SNAPSHOT_RETENTION destroyable snapshots)..."

        local -a all_snaps destroyable
        # Declared local so the loop variable no longer leaks into global scope.
        local snap
        mapfile -t all_snaps < <(zfs list -H -t snapshot -o name -s creation -d1 "$DATASET" | grep -F "${DATASET}@${SNAPSHOT_PREFIX}-")

        # A dry-run destroy tells us which snapshots can actually be removed.
        destroyable=()
        for snap in "${all_snaps[@]}"; do
          if zfs destroy -n -- "$snap" &>/dev/null; then
            destroyable+=("$snap")
          else
            log "Skipping $snap — snapshot not destroyable (likely held)"
          fi
        done

        local count to_destroy
        count="${#destroyable[@]}"
        to_destroy=$((count - SNAPSHOT_RETENTION))

        if [[ "$to_destroy" -le 0 ]]; then
          log "Nothing to prune — only $count destroyable snapshots exist"
        else
          # The list is sorted by creation time, so the first entries are the oldest.
          local i
          for (( i=0; i<to_destroy; i++ )); do
            snap="${destroyable[$i]}"
            log "Destroying snapshot: $snap"
            if ! zfs destroy -- "$snap"; then
              log "WARNING: Failed to destroy $snap despite earlier check"
            fi
          done
        fi
      else
        log "Skipping pruning — retention is set to $SNAPSHOT_RETENTION"
      fi
      ;;
    *)
      log "ERROR: Snapshot unknown action: $action"
      exit 2
      ;;
  esac
}
|
||||
|
||||
# open logging and begin execution
mkdir -p "$(dirname -- "$LOG_FILE")"

start_time=$(date +%s)
exec >> "$LOG_FILE" 2>&1

# 1 while this script may have left the deployment stopped.
services_down=0

# EXIT handler: if the run aborts between "services stop" and a successful
# "services start" (for example because `zfs snapshot` failed), bring the
# deployment back up instead of leaving the validator down.
on_exit() {
  if (( services_down )); then
    services_down=0
    log "ERROR: Exiting while agave deployment is stopped. Attempting restart."
    services start || log "ERROR: Failed to restart agave deployment."
  fi
  log_close
}

trap 'on_exit' EXIT
trap 'rc=$?; log "ERROR: command failed at line $LINENO (exit $rc)"; exit $rc' ERR

log "Backlog Started"

# Refuse to run twice for the same tag; nothing has been stopped at this point.
if zfs list -H -t snapshot -o name -d1 "$DATASET" | grep -qxF "$SNAPSHOT"; then
  log "WARNING: Snapshot $SNAPSHOT already exists. Exiting."
  exit 1
fi

# The flag is raised before stopping because a failed stop may still have
# taken part of the deployment down.
services_down=1
services stop
snapshot create
services start
services_down=0

snapshot send
snapshot prune

# end
|
||||
|
|
@ -1,280 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Biscayne agave validator status check.
|
||||
|
||||
Collects and displays key health metrics:
|
||||
- Slot position (local vs mainnet, gap, replay rate)
|
||||
- Pod status (running, restarts, age)
|
||||
- Memory usage (cgroup current vs limit, % used)
|
||||
- OOM kills (recent dmesg entries)
|
||||
- Shred relay (UDP packets/sec on TVU port 9000, overall inbound UDP rate, shred-unwrap.py alive)
|
||||
- Validator process state (from logs)
|
||||
"""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
|
||||
# How to reach the host, and the RPC endpoints to query.
SSH = "rix@biscayne.vaasl.io"
LOCAL_RPC = "http://127.0.0.1:8899"
MAINNET_RPC = "https://api.mainnet-beta.solana.com"

# Kubernetes / kind resource names for the deployment.
KIND_NODE = "laconic-70ce4c4b47e23b85-control-plane"
NAMESPACE = "laconic-laconic-70ce4c4b47e23b85"
DEPLOYMENT = "laconic-70ce4c4b47e23b85-deployment"
|
||||
|
||||
|
||||
def ssh(cmd: str, timeout: int = 10) -> str:
    """Run ``cmd`` on the ``SSH`` host.

    Returns stripped stdout immediately followed by stripped stderr, or the
    literal ``"<timeout>"`` when the call exceeds ``timeout`` seconds.
    """
    argv = ["ssh", SSH, cmd]
    try:
        proc = subprocess.run(argv, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired:
        return "<timeout>"
    return proc.stdout.strip() + proc.stderr.strip()
|
||||
|
||||
|
||||
def local(cmd: str, timeout: int = 10) -> str:
    """Run ``cmd`` through the local shell.

    Returns stripped stdout only (stderr is captured and discarded), or the
    literal ``"<timeout>"`` when the call exceeds ``timeout`` seconds.
    """
    try:
        proc = subprocess.run(
            cmd,
            shell=True,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        return "<timeout>"
    return proc.stdout.strip()
|
||||
|
||||
|
||||
def rpc_call(method: str, url: str = LOCAL_RPC, remote: bool = True, params: list | None = None) -> dict | None:
    """POST a JSON-RPC 2.0 request with curl and return the decoded reply.

    Args:
        method: JSON-RPC method name.
        url: Endpoint to POST to.
        remote: Run curl on the SSH host when true, on this machine otherwise.
        params: Positional JSON-RPC params; ``None`` sends an empty list.

    Returns:
        The decoded response object, or ``None`` when the output is not a
        JSON object (curl failure, timeout marker, non-JSON body).
    """
    import shlex

    payload = json.dumps({"jsonrpc": "2.0", "id": 1, "method": method, "params": params or []})
    # Quote through shlex rather than hand-written '...' so a quote character
    # in the URL or params cannot terminate the shell word early.
    cmd = (
        f"curl -s {shlex.quote(url)} -X POST "
        f"-H 'Content-Type: application/json' -d {shlex.quote(payload)}"
    )
    raw = ssh(cmd) if remote else local(cmd)
    try:
        reply = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return None
    # Callers use .get() on the result, so anything but an object is a failure.
    return reply if isinstance(reply, dict) else None
|
||||
|
||||
|
||||
def get_slots() -> tuple[int | None, int | None]:
    """Return ``(local_slot, mainnet_slot)``; either is ``None`` if its RPC call failed."""

    def _result(reply: dict | None) -> int | None:
        return reply.get("result") if reply else None

    # Local node is queried first (via SSH), then mainnet from this machine.
    local_slot = _result(rpc_call("getSlot"))
    mainnet_slot = _result(rpc_call("getSlot", MAINNET_RPC, remote=False))
    return local_slot, mainnet_slot
|
||||
|
||||
|
||||
def get_health() -> str:
    """Summarise the local node's ``getHealth`` reply as a short string.

    Returns "unreachable" when there is no reply, "healthy" when the result is
    "ok", "behind N slots" when the error data carries ``numSlotsBehind``, and
    otherwise the error message (or "unknown").
    """
    resp = rpc_call("getHealth")
    if not resp:
        return "unreachable"
    if resp.get("result") == "ok":
        return "healthy"

    err = resp.get("error", {})
    msg = err.get("message", "unknown")
    behind = err.get("data", {}).get("numSlotsBehind")
    return msg if behind is None else f"behind {behind:,} slots"
|
||||
|
||||
|
||||
def get_pod_status() -> str:
    """Describe the first pod in ``NAMESPACE`` on one line.

    Returns "unknown" when kubectl output is not JSON and "no pods" when the
    item list is empty.
    """
    cmd = f"kubectl -n {NAMESPACE} get pods -o json"
    raw = ssh(cmd, timeout=15)
    try:
        data = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return "unknown"

    items = data.get("items", [])
    if not items:
        return "no pods"

    # Only the first pod is reported.
    pod = items[0]
    name = pod["metadata"]["name"].split("-")[-1]
    phase = pod["status"].get("phase", "?")

    restarts = 0
    ready = 0
    total = 0
    for status in pod["status"].get("containerStatuses", []):
        total += 1
        restarts += status.get("restartCount", 0)
        if status.get("ready"):
            ready += 1

    age = pod["metadata"].get("creationTimestamp", "?")
    return f"{ready}/{total} {phase} restarts={restarts} pod=..{name} created={age}"
|
||||
|
||||
|
||||
def get_memory() -> str:
    """Report cgroup memory usage against its limit, read inside the kind node.

    Uses the first ``memory.current`` found under a "burstable" cgroup path.
    Falls back to the raw command output (or "unknown") when parsing fails.
    """
    script = (
        "find /sys/fs/cgroup -name memory.current -path \"*burstable*\" 2>/dev/null | head -1 | "
        "while read f; do "
        " dir=$(dirname $f); "
        " cur=$(cat $f); "
        " max=$(cat $dir/memory.max 2>/dev/null || echo unknown); "
        " echo $cur $max; "
        "done'"
    )
    raw = ssh(f"docker exec {KIND_NODE} bash -c '" + script, timeout=10)

    gib = 1024**3
    fields = raw.split()
    try:
        current = int(fields[0])
        limit_str = fields[1]
        cur_gb = current / gib
        # memory.max reads "max" when no limit is set.
        if limit_str == "max":
            return f"{cur_gb:.0f}GB / unlimited"
        limit = int(limit_str)
    except (IndexError, ValueError):
        return raw or "unknown"

    lim_gb = limit / gib
    pct = (current / limit) * 100
    return f"{cur_gb:.0f}GB / {lim_gb:.0f}GB ({pct:.0f}%)"
|
||||
|
||||
|
||||
def get_oom_kills() -> str:
    """Count 'oom-kill' lines in the host's dmesg and date the latest one.

    Returns:
        "none" when there are no matches, "check failed" when the count cannot
        be parsed, otherwise "<count> total (last: <UTC time>)" or
        "<count> total (timestamp parse failed)".
    """
    # grep -c already prints "0" when nothing matches, but exits 1. The old
    # "|| echo 0" fallback therefore printed a second "0", int() failed on
    # "0\n0", and a clean host was reported as "check failed". "|| true" only
    # neutralises the exit status.
    raw = ssh("sudo dmesg | grep -c 'oom-kill' || true")
    try:
        count = int(raw.strip())
    except ValueError:
        return "check failed"
    if count == 0:
        return "none"

    # dmesg timestamps are seconds since boot; combine with the boot time
    # (from `uptime -s`) to get an absolute time for the latest entry.
    raw = ssh(
        "BOOT=$(date -d \"$(uptime -s)\" +%s); "
        "KERN_TS=$(sudo dmesg | grep 'oom-kill' | tail -1 | "
        " sed 's/\\[\\s*\\([0-9.]*\\)\\].*/\\1/'); "
        "echo $BOOT $KERN_TS"
    )
    try:
        parts = raw.split()
        boot_epoch = int(parts[0])
        kern_secs = float(parts[1])
        oom_epoch = boot_epoch + int(kern_secs)
        from datetime import datetime, timezone
        oom_utc = datetime.fromtimestamp(oom_epoch, tz=timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
        return f"{count} total (last: {oom_utc})"
    except (IndexError, ValueError):
        return f"{count} total (timestamp parse failed)"
|
||||
|
||||
|
||||
def get_relay_rate() -> str:
    """Estimate host-wide UDP datagrams/sec from two /proc/net/snmp samples 3s apart.

    Falls back to the raw command output (or "unknown") when parsing fails.
    """
    sample = "$(cat /proc/net/snmp | grep '^Udp:' | tail -1 | awk '{print $2}'); "
    cmd = "T0=" + sample + "sleep 3; " + "T1=" + sample + "echo $T0 $T1"
    raw = ssh(cmd, timeout=15)

    fields = raw.split()
    try:
        t0 = int(fields[0])
        t1 = int(fields[1])
    except (IndexError, ValueError):
        return raw or "unknown"

    rate = (t1 - t0) / 3
    return f"{rate:,.0f} UDP dgrams/sec (all ports)"
|
||||
|
||||
|
||||
def get_shreds_per_sec() -> str:
    """Measure UDP packets/sec to destination port 9000 with a 3-second tcpdump.

    Falls back to the raw command output (or "unknown") when no packet count
    can be parsed.
    """
    raw = ssh(
        "sudo timeout 3 tcpdump -i any udp dst port 9000 -q 2>&1 | grep -oP '\\d+(?= packets captured)'",
        timeout=15,
    )
    try:
        count = int(raw.strip())
    except (ValueError, TypeError):
        return raw or "unknown"

    rate = count / 3
    return f"{rate:,.0f} shreds/sec ({count:,} in 3s)"
|
||||
|
||||
|
||||
def get_unwrap_status() -> str:
    """Report whether a shred-unwrap process is alive on the host.

    Returns:
        "NOT RUNNING" when no process is found, "pid=… uptime=… rss=…MB" when
        ps output parses, otherwise the raw command output.
    """
    # The pattern is written as '[s]hred-unwrap' so that pgrep -f does not
    # match the remote shell executing this very command line, whose arguments
    # would otherwise contain the literal text and make a dead relay look alive.
    raw = ssh(
        "ps -p $(pgrep -f '[s]hred-unwrap' | head -1) "
        "-o pid,etime,rss --no-headers 2>/dev/null || echo dead"
    )
    if "dead" in raw or not raw.strip():
        return "NOT RUNNING"

    parts = raw.split()
    if len(parts) >= 3:
        pid, etime, rss_kb = parts[0], parts[1], parts[2]
        try:
            rss_mb = int(rss_kb) / 1024
        except ValueError:
            # Unexpected ps output (e.g. "<timeout>" plus noise): show it as-is.
            return raw
        return f"pid={pid} uptime={etime} rss={rss_mb:.0f}MB"
    return raw
|
||||
|
||||
|
||||
def get_replay_rate() -> tuple[float | None, int | None, int | None]:
    """Sample the processed slot twice, 10 seconds apart.

    Returns ``(rate, first_slot, second_slot)`` with rate in slots/sec. The
    result is ``(None, None, None)`` if the first sample fails and
    ``(None, first_slot, None)`` if the second one does.
    """
    params = [{"commitment": "processed"}]

    def _processed_slot() -> int | None:
        reply = rpc_call("getSlot", params=params)
        return reply.get("result") if reply else None

    first = _processed_slot()
    if first is None:
        return None, None, None

    started = time.monotonic()
    time.sleep(10)

    second = _processed_slot()
    if second is None:
        return None, first, None

    # Elapsed time includes the second RPC round trip.
    elapsed = time.monotonic() - started
    rate = 0 if second == first else (second - first) / elapsed
    return rate, first, second
|
||||
|
||||
|
||||
def main() -> None:
    """Collect every metric in turn and print the status report to stdout."""
    rule = "=" * 60
    print(rule)
    print(" BISCAYNE VALIDATOR STATUS")
    print(rule)

    # Health + slots
    print("\n--- RPC ---")
    health = get_health()
    local_slot, mainnet_slot = get_slots()
    print(f" Health: {health}")
    if local_slot is None:
        print(" Local slot: unreachable")
    else:
        print(f" Local slot: {local_slot:,}")
    if mainnet_slot is not None:
        print(f" Mainnet slot: {mainnet_slot:,}")
    if local_slot and mainnet_slot:
        gap = mainnet_slot - local_slot
        print(f" Gap: {gap:,} slots")

    # Replay rate (10s sample)
    print("\n--- Replay ---")
    print(" Sampling replay rate (10s)...", end="", flush=True)
    rate, s0, s1 = get_replay_rate()
    if rate is None:
        print("\r Replay rate: could not measure")
    else:
        # "\r" overwrites the "Sampling..." progress line.
        print(f"\r Replay rate: {rate:.1f} slots/sec ({s0:,} → {s1:,})")
        # 2.5 slots/sec is the baseline the replay rate is compared against.
        net = rate - 2.5
        if net > 0:
            print(f" Net catchup: +{net:.1f} slots/sec (gaining)")
        elif net < 0:
            print(f" Net catchup: {net:.1f} slots/sec (falling behind)")
        else:
            print(" Net catchup: 0 (keeping pace)")

    # Pod
    print("\n--- Pod ---")
    pod = get_pod_status()
    print(f" {pod}")

    # Memory + OOM
    print("\n--- Memory ---")
    mem = get_memory()
    print(f" Cgroup: {mem}")
    oom = get_oom_kills()
    print(f" OOM kills: {oom}")

    # Relay
    print("\n--- Shred Relay ---")
    unwrap = get_unwrap_status()
    print(f" shred-unwrap: {unwrap}")
    print(" Measuring shred rate (3s)...", end="", flush=True)
    shreds = get_shreds_per_sec()
    print(f"\r TVU shreds: {shreds} ")
    print(" Measuring UDP rate (3s)...", end="", flush=True)
    relay = get_relay_rate()
    print(f"\r UDP inbound: {relay} ")

    print("\n" + "=" * 60)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,546 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download Solana snapshots using aria2c for parallel multi-connection downloads.
|
||||
|
||||
Discovers snapshot sources by querying getClusterNodes for all RPCs in the
|
||||
cluster, probing each for available snapshots, benchmarking download speed,
|
||||
and downloading from the fastest source using aria2c (16 connections by default).
|
||||
|
||||
Based on the discovery approach from etcusr/solana-snapshot-finder but replaces
|
||||
the single-connection wget download with aria2c parallel chunked downloads.
|
||||
|
||||
Usage:
|
||||
# Download to /srv/solana/snapshots (mainnet, 16 connections)
|
||||
./snapshot-download.py -o /srv/solana/snapshots
|
||||
|
||||
# Dry run — find best source, print URL
|
||||
./snapshot-download.py --dry-run
|
||||
|
||||
# Custom RPC for cluster node discovery + 32 connections
|
||||
./snapshot-download.py -r https://api.mainnet-beta.solana.com -n 32
|
||||
|
||||
# Testnet
|
||||
./snapshot-download.py -c testnet -o /data/snapshots
|
||||
|
||||
Requirements:
|
||||
- aria2c (apt install aria2)
|
||||
- python3 >= 3.10 (stdlib only, no pip dependencies)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import dataclass, field
|
||||
from http.client import HTTPResponse
|
||||
from pathlib import Path
|
||||
from typing import NoReturn
|
||||
from urllib.request import Request
|
||||
|
||||
# Module-wide logger; handlers and level are configured in main().
log: logging.Logger = logging.getLogger("snapshot-download")

# Default JSON-RPC endpoint for each cluster name accepted by --cluster.
CLUSTER_RPC: dict[str, str] = {
    "mainnet-beta": "https://api.mainnet-beta.solana.com",
    "testnet": "https://api.testnet.solana.com",
    "devnet": "https://api.devnet.solana.com",
}

# Snapshot archive names recognised by the probes:
#   snapshot-<slot>-<hash>.tar.zst
#   incremental-snapshot-<base_slot>-<slot>-<hash>.tar.zst
# Group 1 is always the first slot number in the name (the full snapshot's
# slot, or the base slot of an incremental snapshot).
FULL_SNAP_RE: re.Pattern[str] = re.compile(r"^snapshot-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$")
INCR_SNAP_RE: re.Pattern[str] = re.compile(
    r"^incremental-snapshot-(\d+)-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
|
||||
|
||||
|
||||
@dataclass
class SnapshotSource:
    """Snapshot archives offered by one RPC node, with its probe metrics."""

    # Address of the node; URLs are built as http://<rpc_address><path>.
    rpc_address: str
    # Redirect paths exactly as the server returned them
    # (e.g. /snapshot-123-hash.tar.zst). The full snapshot comes first,
    # optionally followed by a matching incremental snapshot.
    file_paths: list[str] = field(default_factory=list)
    # Current cluster slot minus the slot of the full snapshot.
    slots_diff: int = 0
    # Round-trip time of the full-snapshot HEAD probe, in milliseconds.
    latency_ms: float = 0.0
    # Measured download throughput in bytes/sec (0.0 until benchmarked).
    download_speed: float = 0.0
|
||||
|
||||
|
||||
# -- JSON-RPC helpers ----------------------------------------------------------
|
||||
|
||||
|
||||
class _NoRedirectHandler(urllib.request.HTTPRedirectHandler):
|
||||
"""Handler that captures redirect Location instead of following it."""
|
||||
|
||||
def redirect_request(
|
||||
self,
|
||||
req: Request,
|
||||
fp: HTTPResponse,
|
||||
code: int,
|
||||
msg: str,
|
||||
headers: dict[str, str], # type: ignore[override]
|
||||
newurl: str,
|
||||
) -> None:
|
||||
return None
|
||||
|
||||
|
||||
def rpc_post(url: str, method: str, params: list[object] | None = None,
             timeout: int = 25) -> object | None:
    """Send one JSON-RPC 2.0 request and return its ``result`` member.

    Args:
        url: HTTP(S) endpoint of the RPC server.
        method: JSON-RPC method name.
        params: Positional parameters; ``None`` or empty sends ``[]``.
        timeout: Socket timeout in seconds passed to ``urlopen``.

    Returns:
        The value of the reply's ``"result"`` key, or ``None`` when the
        request fails, the body is not valid JSON, the reply is not a JSON
        object, or the object has no ``"result"`` key.
    """
    payload: bytes = json.dumps({
        "jsonrpc": "2.0", "id": 1,
        "method": method, "params": params or [],
    }).encode()
    req = Request(url, data=payload,
                  headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data: object = json.loads(resp.read())
    except (urllib.error.URLError, ValueError, OSError, TimeoutError) as e:
        # ValueError covers json.JSONDecodeError as well as the
        # UnicodeDecodeError raised for bodies that are not valid text.
        log.debug("rpc_post %s %s failed: %s", url, method, e)
        return None

    # A syntactically valid reply can still be a list/number/string; only an
    # object can carry a "result" member.
    if not isinstance(data, dict):
        log.debug("rpc_post %s %s failed: reply is %s, not an object",
                  url, method, type(data).__name__)
        return None
    return data.get("result")
|
||||
|
||||
|
||||
def head_no_follow(url: str, timeout: float = 3) -> tuple[str | None, float]:
    """Issue a HEAD request and report where the server redirects to.

    Redirects are not followed. With ``_NoRedirectHandler`` installed, a 3xx
    reply surfaces as ``urllib.error.HTTPError``; its ``Location`` header is
    what this function returns.

    Args:
        url: URL to probe.
        timeout: Socket timeout in seconds.

    Returns:
        ``(location, latency_sec)`` when the server answered with a 3xx
        redirect carrying a ``Location`` header, otherwise ``(None, 0.0)``
        (non-redirect reply, HTTP error status, malformed reply, or any
        network error).
    """
    # Function-scope import: the file only imports HTTPResponse from
    # http.client, and the exception base class is needed below.
    import http.client

    opener: urllib.request.OpenerDirector = urllib.request.build_opener(_NoRedirectHandler)
    req = Request(url, method="HEAD")
    # Started before the try so the except branch can always compute latency.
    start: float = time.monotonic()
    try:
        # A reply that comes back without raising was not a redirect, so it
        # is useless for discovery; just release the connection.
        opener.open(req, timeout=timeout).close()
        return None, 0.0
    except urllib.error.HTTPError as e:
        latency: float = time.monotonic() - start
        location: str | None = e.headers.get("Location")
        status: int = e.code
        # HTTPError wraps the live response; close it so the socket is freed.
        e.close()
        if location and 300 <= status < 400:
            return location, latency
        return None, 0.0
    except (urllib.error.URLError, http.client.HTTPException, OSError, TimeoutError):
        # HTTPException covers garbage replies (e.g. BadStatusLine), which
        # are not OSError subclasses.
        return None, 0.0
|
||||
|
||||
|
||||
# -- Discovery -----------------------------------------------------------------
|
||||
|
||||
|
||||
def get_current_slot(rpc_url: str) -> int | None:
    """Return the slot reported by the ``getSlot`` RPC method.

    Args:
        rpc_url: JSON-RPC endpoint to query.

    Returns:
        The slot number, or ``None`` when the call fails or the reply is
        not an integer.
    """
    slot: object | None = rpc_post(rpc_url, "getSlot")
    # bool is a subclass of int, so a JSON true/false must be rejected
    # explicitly or it would be treated as slot 1/0.
    if isinstance(slot, int) and not isinstance(slot, bool):
        return slot
    return None
|
||||
|
||||
|
||||
def get_cluster_rpc_nodes(rpc_url: str, version_filter: str | None = None) -> list[str]:
    """Collect the distinct RPC addresses advertised by ``getClusterNodes``.

    Args:
        rpc_url: JSON-RPC endpoint to query.
        version_filter: When given, nodes whose reported ``version`` does
            not start with this prefix are dropped. Nodes that report no
            version are kept.

    Returns:
        Unique ``rpc`` addresses in the order they first appear in the
        reply; an empty list when the call fails or returns a non-list.
    """
    nodes: object | None = rpc_post(rpc_url, "getClusterNodes")
    if not isinstance(nodes, list):
        return []

    # A dict is used as an insertion-ordered set so the result is
    # deterministic from run to run (a plain set is not, for strings).
    unique: dict[str, None] = {}
    for node in nodes:
        if not isinstance(node, dict):
            continue
        if version_filter is not None:
            version: object = node.get("version")
            if version and not str(version).startswith(version_filter):
                continue
        rpc: object = node.get("rpc")
        # Only non-empty strings are usable as addresses.
        if isinstance(rpc, str) and rpc:
            unique.setdefault(rpc)
    return list(unique)
|
||||
|
||||
|
||||
def _parse_snapshot_filename(location: str) -> tuple[str, str | None]:
|
||||
"""Extract filename and full redirect path from Location header.
|
||||
|
||||
Returns (filename, full_path). full_path includes any path prefix
|
||||
the server returned (e.g. '/snapshots/snapshot-123-hash.tar.zst').
|
||||
"""
|
||||
# Location may be absolute URL or relative path
|
||||
if location.startswith("http://") or location.startswith("https://"):
|
||||
# Absolute URL — extract path
|
||||
from urllib.parse import urlparse
|
||||
path: str = urlparse(location).path
|
||||
else:
|
||||
path = location
|
||||
|
||||
filename: str = path.rsplit("/", 1)[-1]
|
||||
return filename, path
|
||||
|
||||
|
||||
def probe_rpc_snapshot(
    rpc_address: str,
    current_slot: int,
    max_age_slots: int,
    max_latency_ms: float,
) -> SnapshotSource | None:
    """Check one RPC node for downloadable snapshots.

    The node must redirect ``/snapshot.tar.bz2`` to a full snapshot whose
    name matches ``FULL_SNAP_RE``, answer within ``max_latency_ms``, and the
    snapshot must be no more than ``max_age_slots`` behind ``current_slot``.
    An incremental snapshot is recorded as well when the node offers one
    based on that same full snapshot. Choosing which of the recorded files
    to fetch is left to the download step.

    Based on the discovery approach from etcusr/solana-snapshot-finder.

    Returns:
        A ``SnapshotSource`` for a usable node, otherwise ``None``.
    """
    base_url: str = f"http://{rpc_address}"

    # A full snapshot is mandatory; without a redirect the node is useless.
    redirect, elapsed = head_no_follow(f"{base_url}/snapshot.tar.bz2", timeout=2)
    if not redirect:
        return None

    probe_ms: float = elapsed * 1000
    if probe_ms > max_latency_ms:
        return None

    full_name, full_path = _parse_snapshot_filename(redirect)
    full_match: re.Match[str] | None = FULL_SNAP_RE.match(full_name)
    if not full_match:
        return None

    base_slot: int = int(full_match.group(1))
    age: int = current_slot - base_slot
    # Reject snapshots that are too old, or more than 100 slots ahead of
    # the reference slot.
    # NOTE(review): the 100-slot allowance presumably tolerates a lagging
    # reference RPC - confirm.
    if not (-100 <= age <= max_age_slots):
        return None

    paths: list[str] = [full_path]

    # Optional incremental snapshot; only usable when built on this node's
    # own full snapshot.
    inc_redirect, _ = head_no_follow(f"{base_url}/incremental-snapshot.tar.bz2", timeout=2)
    if inc_redirect:
        inc_name, inc_path = _parse_snapshot_filename(inc_redirect)
        inc_match: re.Match[str] | None = INCR_SNAP_RE.match(inc_name)
        if inc_match and int(inc_match.group(1)) == base_slot:
            paths.append(inc_path)

    return SnapshotSource(
        rpc_address=rpc_address,
        file_paths=paths,
        slots_diff=age,
        latency_ms=probe_ms,
    )
|
||||
|
||||
|
||||
def discover_sources(
    rpc_url: str,
    current_slot: int,
    max_age_slots: int,
    max_latency_ms: float,
    threads: int,
    version_filter: str | None,
) -> list[SnapshotSource]:
    """Probe every RPC node in the cluster and return the usable ones.

    Args:
        rpc_url: Endpoint used for ``getClusterNodes``.
        current_slot: Reference slot for the snapshot-age check.
        max_age_slots: Maximum accepted snapshot age in slots.
        max_latency_ms: Maximum accepted probe latency in milliseconds.
        threads: Upper bound on concurrent probes. Values below 1 are
            treated as 1, and the pool never exceeds the node count.
        version_filter: Optional version prefix passed to
            ``get_cluster_rpc_nodes``.

    Returns:
        Sources in completion order; empty when no node qualifies.
    """
    rpc_nodes: list[str] = get_cluster_rpc_nodes(rpc_url, version_filter)
    if not rpc_nodes:
        log.error("No RPC nodes found via getClusterNodes")
        return []

    log.info("Found %d RPC nodes, probing for snapshots...", len(rpc_nodes))

    # ThreadPoolExecutor rejects max_workers <= 0, and more workers than
    # jobs only wastes threads.
    workers: int = max(1, min(threads, len(rpc_nodes)))

    sources: list[SnapshotSource] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as pool:
        futures: dict[concurrent.futures.Future[SnapshotSource | None], str] = {
            pool.submit(
                probe_rpc_snapshot, addr, current_slot,
                max_age_slots, max_latency_ms,
            ): addr
            for addr in rpc_nodes
        }
        for done, future in enumerate(concurrent.futures.as_completed(futures), start=1):
            try:
                result: SnapshotSource | None = future.result()
            except Exception as e:  # noqa: BLE001 - probing is best-effort
                # Arbitrary hosts answer with arbitrary garbage; one bad
                # node must not abort discovery of all the others.
                log.debug("Probe failed for %s: %s", futures[future], e)
                result = None
            if result:
                sources.append(result)
            if done % 200 == 0:
                log.info("  probed %d/%d nodes, %d sources found",
                         done, len(rpc_nodes), len(sources))

    log.info("Found %d RPC nodes with suitable snapshots", len(sources))
    return sources
|
||||
|
||||
|
||||
# -- Speed benchmark -----------------------------------------------------------
|
||||
|
||||
|
||||
def measure_speed(rpc_address: str, measure_time: int = 7) -> float:
    """Sample the download throughput of a node's full snapshot.

    Streams ``http://<rpc_address>/snapshot.tar.bz2`` for up to
    ``measure_time`` seconds. The clock starts once the response is open,
    so connection set-up is not counted.

    Args:
        rpc_address: Address of the node to benchmark.
        measure_time: Sampling window in seconds.

    Returns:
        Average bytes/sec over the window, or ``0.0`` on any network or
        protocol error.
    """
    # Function-scope import: the file only imports HTTPResponse from
    # http.client, and the exception base class is needed below.
    import http.client

    url: str = f"http://{rpc_address}/snapshot.tar.bz2"
    try:
        with urllib.request.urlopen(Request(url), timeout=measure_time + 5) as resp:
            start: float = time.monotonic()
            total: int = 0
            while time.monotonic() - start < measure_time:
                chunk: bytes = resp.read(81920)
                if not chunk:
                    break  # server finished sending before the window closed
                total += len(chunk)
            elapsed: float = time.monotonic() - start
    except (urllib.error.URLError, http.client.HTTPException, OSError, TimeoutError):
        # HTTPException (BadStatusLine, IncompleteRead, ...) is not an
        # OSError; without it one misbehaving node would crash the caller.
        return 0.0

    if elapsed <= 0:
        return 0.0
    return total / elapsed
|
||||
|
||||
|
||||
# -- Download ------------------------------------------------------------------
|
||||
|
||||
|
||||
def download_aria2c(
    urls: list[str],
    output_dir: str,
    filename: str,
    connections: int = 16,
) -> bool:
    """Download one file with aria2c using parallel connections.

    When several URLs are given, aria2c treats them as mirrors of the same
    file and spreads chunks across all of them.

    Args:
        urls: One or more mirror URLs for the same file.
        output_dir: Directory the file is written to.
        filename: Name of the file inside ``output_dir``.
        connections: Requested connections per mirror. aria2c only accepts
            1-16 for ``--max-connection-per-server``, so the value is
            clamped to that range.

    Returns:
        ``True`` when aria2c exits with status 0 and the file exists,
        ``False`` otherwise (including an empty ``urls`` list).
    """
    if not urls:
        log.error("No download URLs given for %s", filename)
        return False

    num_mirrors: int = len(urls)
    # Out-of-range values make aria2c refuse to start, so clamp instead of
    # passing them through.
    per_server: int = min(max(connections, 1), 16)
    total_splits: int = per_server * num_mirrors
    cmd: list[str] = [
        "aria2c",
        "--file-allocation=none",
        "--continue=true",
        f"--max-connection-per-server={per_server}",
        f"--split={total_splits}",
        "--min-split-size=50M",
        # aria2c retries individual chunk connections on transient network
        # errors (TCP reset, timeout). This is transport-level retry analogous
        # to TCP retransmit, not application-level retry of a failed operation.
        "--max-tries=5",
        "--retry-wait=5",
        "--timeout=60",
        "--connect-timeout=10",
        "--summary-interval=10",
        "--console-log-level=notice",
        f"--dir={output_dir}",
        f"--out={filename}",
        "--auto-file-renaming=false",
        "--allow-overwrite=true",
        *urls,
    ]

    log.info("Downloading %s", filename)
    log.info("  aria2c: %d connections × %d mirrors (%d splits)",
             per_server, num_mirrors, total_splits)

    start: float = time.monotonic()
    # Output is deliberately not captured so aria2c's progress reaches the
    # terminal.
    result: subprocess.CompletedProcess[bytes] = subprocess.run(cmd)
    elapsed: float = time.monotonic() - start

    if result.returncode != 0:
        log.error("aria2c failed with exit code %d", result.returncode)
        return False

    filepath: Path = Path(output_dir) / filename
    if not filepath.exists():
        log.error("aria2c reported success but %s does not exist", filepath)
        return False

    size_bytes: int = filepath.stat().st_size
    size_gb: float = size_bytes / (1024 ** 3)
    avg_mb: float = size_bytes / elapsed / (1024 ** 2) if elapsed > 0 else 0
    log.info("  Done: %.1f GB in %.0fs (%.1f MiB/s avg)", size_gb, elapsed, avg_mb)
    return True
|
||||
|
||||
|
||||
# -- Main ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> int:
    """Command-line entry point: discover, benchmark and download snapshots.

    Steps: parse arguments, read the current slot, probe all cluster RPC
    nodes, benchmark the lowest-latency candidates, then download the
    snapshot files of the fastest node with aria2c, using every other fast
    node that serves the same filename as a mirror.

    Returns:
        Process exit status: ``0`` on success (including ``--dry-run``),
        ``1`` on any failure.
    """
    p: argparse.ArgumentParser = argparse.ArgumentParser(
        description="Download Solana snapshots with aria2c parallel downloads",
    )
    p.add_argument("-o", "--output", default="/srv/solana/snapshots",
                   help="Snapshot output directory (default: /srv/solana/snapshots)")
    p.add_argument("-c", "--cluster", default="mainnet-beta",
                   choices=list(CLUSTER_RPC),
                   help="Solana cluster (default: mainnet-beta)")
    p.add_argument("-r", "--rpc", default=None,
                   help="RPC URL for cluster discovery (default: public RPC)")
    p.add_argument("-n", "--connections", type=int, default=16,
                   help="aria2c connections per download (default: 16)")
    p.add_argument("-t", "--threads", type=int, default=500,
                   help="Threads for parallel RPC probing (default: 500)")
    p.add_argument("--max-snapshot-age", type=int, default=1300,
                   help="Max snapshot age in slots (default: 1300)")
    p.add_argument("--max-latency", type=float, default=100,
                   help="Max RPC probe latency in ms (default: 100)")
    p.add_argument("--min-download-speed", type=int, default=20,
                   help="Min download speed in MiB/s (default: 20)")
    p.add_argument("--measurement-time", type=int, default=7,
                   help="Speed measurement duration in seconds (default: 7)")
    p.add_argument("--max-speed-checks", type=int, default=15,
                   help="Max nodes to benchmark before giving up (default: 15)")
    p.add_argument("--version", default=None,
                   help="Filter nodes by version prefix (e.g. '2.2')")
    p.add_argument("--full-only", action="store_true",
                   help="Download only full snapshot, skip incremental")
    p.add_argument("--dry-run", action="store_true",
                   help="Find best source and print URL, don't download")
    p.add_argument("-v", "--verbose", action="store_true")
    args: argparse.Namespace = p.parse_args()

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    rpc_url: str = args.rpc or CLUSTER_RPC[args.cluster]

    # aria2c is required for actual downloads (not dry-run)
    if not args.dry_run and not shutil.which("aria2c"):
        log.error("aria2c not found. Install with: apt install aria2")
        return 1

    # Get current slot
    log.info("Cluster: %s | RPC: %s", args.cluster, rpc_url)
    current_slot: int | None = get_current_slot(rpc_url)
    if current_slot is None:
        log.error("Cannot get current slot from %s", rpc_url)
        return 1
    log.info("Current slot: %d", current_slot)

    # Discover sources
    sources: list[SnapshotSource] = discover_sources(
        rpc_url, current_slot,
        max_age_slots=args.max_snapshot_age,
        max_latency_ms=args.max_latency,
        threads=args.threads,
        version_filter=args.version,
    )
    if not sources:
        log.error("No snapshot sources found")
        return 1

    # Benchmark the lowest-latency candidates first.
    sources.sort(key=lambda s: s.latency_ms)

    # All speeds are reported in MiB/s (binary, 1 MiB = 1048576 bytes).
    log.info("Benchmarking download speed on top %d sources...", args.max_speed_checks)
    fast_sources: list[SnapshotSource] = []
    min_speed_bytes: int = args.min_download_speed * 1024 * 1024  # MiB to bytes

    for source in sources[:max(args.max_speed_checks, 0)]:
        speed: float = measure_speed(source.rpc_address, args.measurement_time)
        source.download_speed = speed
        speed_mib: float = speed / (1024 ** 2)

        if speed < min_speed_bytes:
            log.info("  %s: %.1f MiB/s (too slow, need >=%d MiB/s)",
                     source.rpc_address, speed_mib, args.min_download_speed)
            continue

        log.info("  %s: %.1f MiB/s (latency: %.0fms, age: %d slots)",
                 source.rpc_address, speed_mib,
                 source.latency_ms, source.slots_diff)
        fast_sources.append(source)

    if not fast_sources:
        log.error("No source met minimum speed requirement (%d MiB/s)",
                  args.min_download_speed)
        log.info("Try: --min-download-speed 10")
        return 1

    # The candidates were benchmarked in latency order; rank them by the
    # measured throughput so the primary source really is the fastest one.
    fast_sources.sort(key=lambda s: s.download_speed, reverse=True)
    best: SnapshotSource = fast_sources[0]
    file_paths: list[str] = best.file_paths
    if args.full_only:
        file_paths = [path for path in file_paths
                      if path.rsplit("/", 1)[-1].startswith("snapshot-")]

    # Build mirror URL lists: for each file, collect URLs from all fast
    # sources that serve the same filename.
    download_plan: list[tuple[str, list[str]]] = []
    for path in file_paths:
        filename: str = path.rsplit("/", 1)[-1]
        mirror_urls: list[str] = [f"http://{best.rpc_address}{path}"]
        for other in fast_sources[1:]:
            for other_path in other.file_paths:
                if other_path.rsplit("/", 1)[-1] == filename:
                    mirror_urls.append(f"http://{other.rpc_address}{other_path}")
                    break
        download_plan.append((filename, mirror_urls))

    best_mib: float = best.download_speed / (1024 ** 2)
    log.info("Best source: %s (%.1f MiB/s), %d mirrors total",
             best.rpc_address, best_mib, len(fast_sources))
    for filename, mirror_urls in download_plan:
        log.info("  %s (%d mirrors)", filename, len(mirror_urls))
        for url in mirror_urls:
            log.info("    %s", url)

    if args.dry_run:
        for _, mirror_urls in download_plan:
            for url in mirror_urls:
                print(url)
        return 0

    # Download - skip files that are already complete locally.
    os.makedirs(args.output, exist_ok=True)
    total_start: float = time.monotonic()

    for filename, mirror_urls in download_plan:
        filepath: Path = Path(args.output) / filename
        # aria2c keeps "<file>.aria2" next to an unfinished download and
        # removes it on completion; while it exists the file is partial and
        # must be resumed, not skipped.
        control_file: Path = filepath.with_name(filepath.name + ".aria2")
        if filepath.exists() and filepath.stat().st_size > 0 and not control_file.exists():
            log.info("Skipping %s (already exists: %.1f GB)",
                     filename, filepath.stat().st_size / (1024 ** 3))
            continue
        if not download_aria2c(mirror_urls, args.output, filename, args.connections):
            log.error("Failed to download %s", filename)
            return 1

    total_elapsed: float = time.monotonic() - total_start
    log.info("All downloads complete in %.0fs", total_elapsed)
    for filename, _ in download_plan:
        downloaded: Path = Path(args.output) / filename
        if downloaded.exists():
            log.info("  %s (%.1f GB)", downloaded.name, downloaded.stat().st_size / (1024 ** 3))

    return 0


if __name__ == "__main__":
    sys.exit(main())
|
||||
|
|
@ -1,109 +0,0 @@
|
|||
# ZFS Setup for Biscayne
|
||||
|
||||
## Current State
|
||||
|
||||
```
|
||||
biscayne none (pool root)
|
||||
biscayne/DATA none
|
||||
biscayne/DATA/home /home 42G
|
||||
biscayne/DATA/home/solana /home/solana 2.9G
|
||||
biscayne/DATA/srv /srv 712G
|
||||
biscayne/DATA/srv/backups /srv/backups 208G
|
||||
biscayne/DATA/volumes/solana (zvol, 4T) → block-mounted at /srv/solana
|
||||
```
|
||||
|
||||
Docker root: `/var/lib/docker` on root filesystem (`/dev/md0`, 439G).
|
||||
|
||||
## Target State
|
||||
|
||||
```
|
||||
biscayne/DATA/deployments /srv/deployments ← laconic-so deployment dirs (snapshotted)
|
||||
biscayne/DATA/var/docker /var/lib/docker ← docker storage on ZFS
|
||||
biscayne/DATA/volumes/solana (zvol, 4T) ← bulk solana data (not backed up)
|
||||
```
|
||||
|
||||
## Steps
|
||||
|
||||
### 1. Create deployments dataset
|
||||
|
||||
```bash
|
||||
zfs create -o mountpoint=/srv/deployments biscayne/DATA/deployments
|
||||
```
|
||||
|
||||
### 2. Move docker onto ZFS
|
||||
|
||||
Stop docker and all containers first:
|
||||
|
||||
```bash
|
||||
systemctl stop docker.socket docker.service
|
||||
```
|
||||
|
||||
Create the dataset:
|
||||
|
||||
```bash
|
||||
zfs create -o mountpoint=none biscayne/DATA/var
|
||||
zfs create -o mountpoint=/var/lib/docker biscayne/DATA/var/docker
|
||||
```
|
||||
|
||||
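Copy existing docker data (if any is worth keeping). This assumes the old `/var/lib/docker` directory was moved aside to `/var/lib/docker.bak` before the dataset was created: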
|
||||
|
||||
```bash
|
||||
rsync -aHAX /var/lib/docker.bak/ /var/lib/docker/
|
||||
```
|
||||
|
||||
Or just start fresh — the only running containers are telegraf/influxdb monitoring
|
||||
which can be recreated.
|
||||
|
||||
Start docker:
|
||||
|
||||
```bash
|
||||
systemctl start docker.service
|
||||
```
|
||||
|
||||
### 3. Grant ZFS permissions to the backup user
|
||||
|
||||
```bash
|
||||
zfs allow -u <backup-user> destroy,snapshot,send,hold,release,mount biscayne/DATA/deployments
|
||||
```
|
||||
|
||||
### 4. Create remote receiving datasets
|
||||
|
||||
On mysterio:
|
||||
|
||||
```bash
|
||||
zfs create -p edith/DATA/backlog/biscayne-main
|
||||
```
|
||||
|
||||
On ardham:
|
||||
|
||||
```bash
|
||||
zfs create -p batterywharf/DATA/backlog/biscayne-main
|
||||
```
|
||||
|
||||
Backups sent to these hosts will fail until SSH keys and network access are configured for biscayne
|
||||
to reach them. The backup script handles this gracefully.
|
||||
|
||||
### 5. Install backlog.sh and crontab
|
||||
|
||||
```bash
|
||||
mkdir -p ~/.local/bin
|
||||
cp scripts/backlog.sh ~/.local/bin/backlog.sh
|
||||
chmod +x ~/.local/bin/backlog.sh
|
||||
crontab -e
|
||||
# Add: 01 0 * * * /home/<user>/.local/bin/backlog.sh
|
||||
```
|
||||
|
||||
## Volume Layout
|
||||
|
||||
laconic-so deployment at `/srv/deployments/agave/`:
|
||||
|
||||
| Volume | Location | Backed up |
|
||||
|---|---|---|
|
||||
| validator-config | `/srv/deployments/agave/data/validator-config/` | Yes (ZFS snapshot) |
|
||||
| doublezero-config | `/srv/deployments/agave/data/doublezero-config/` | Yes (ZFS snapshot) |
|
||||
| validator-ledger | `/srv/solana/ledger/` (zvol) | No (rebuildable) |
|
||||
| validator-accounts | `/srv/solana/accounts/` (zvol) | No (rebuildable) |
|
||||
| validator-snapshots | `/srv/solana/snapshots/` (zvol) | No (rebuildable) |
|
||||
|
||||
The laconic-so spec.yml must map the heavy volumes to zvol paths and the small
|
||||
config volumes to the deployment directory.
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
# Compose definition for the Agave node in RPC mode.
services:
  agave-rpc:
    restart: unless-stopped
    image: laconicnetwork/agave:local
    network_mode: host
    privileged: true
    cap_add:
      - IPC_LOCK
    # Compose owns all defaults. spec.yml overrides per-deployment.
    environment:
      AGAVE_MODE: rpc
      # Required — no defaults
      VALIDATOR_ENTRYPOINT: "${VALIDATOR_ENTRYPOINT}"
      KNOWN_VALIDATOR: "${KNOWN_VALIDATOR}"
      # Optional with defaults
      EXTRA_ENTRYPOINTS: "${EXTRA_ENTRYPOINTS:-}"
      EXTRA_KNOWN_VALIDATORS: "${EXTRA_KNOWN_VALIDATORS:-}"
      RPC_PORT: "${RPC_PORT:-8899}"
      RPC_BIND_ADDRESS: "${RPC_BIND_ADDRESS:-127.0.0.1}"
      GOSSIP_PORT: "${GOSSIP_PORT:-8001}"
      DYNAMIC_PORT_RANGE: "${DYNAMIC_PORT_RANGE:-9000-10000}"
      EXPECTED_GENESIS_HASH: "${EXPECTED_GENESIS_HASH:-}"
      EXPECTED_SHRED_VERSION: "${EXPECTED_SHRED_VERSION:-}"
      LIMIT_LEDGER_SIZE: "${LIMIT_LEDGER_SIZE:-50000000}"
      NO_SNAPSHOTS: "${NO_SNAPSHOTS:-false}"
      SNAPSHOT_INTERVAL_SLOTS: "${SNAPSHOT_INTERVAL_SLOTS:-100000}"
      MAXIMUM_SNAPSHOTS_TO_RETAIN: "${MAXIMUM_SNAPSHOTS_TO_RETAIN:-1}"
      NO_INCREMENTAL_SNAPSHOTS: "${NO_INCREMENTAL_SNAPSHOTS:-false}"
      ACCOUNT_INDEXES: "${ACCOUNT_INDEXES:-}"
      PUBLIC_RPC_ADDRESS: "${PUBLIC_RPC_ADDRESS:-}"
      GOSSIP_HOST: "${GOSSIP_HOST:-}"
      PUBLIC_TVU_ADDRESS: "${PUBLIC_TVU_ADDRESS:-}"
      RUST_LOG: "${RUST_LOG:-info}"
      SOLANA_METRICS_CONFIG: "${SOLANA_METRICS_CONFIG:-}"
      JITO_ENABLE: "${JITO_ENABLE:-false}"
      JITO_BLOCK_ENGINE_URL: "${JITO_BLOCK_ENGINE_URL:-}"
      JITO_SHRED_RECEIVER_ADDR: "${JITO_SHRED_RECEIVER_ADDR:-}"
      JITO_TIP_PAYMENT_PROGRAM: "${JITO_TIP_PAYMENT_PROGRAM:-}"
      JITO_DISTRIBUTION_PROGRAM: "${JITO_DISTRIBUTION_PROGRAM:-}"
      JITO_MERKLE_ROOT_AUTHORITY: "${JITO_MERKLE_ROOT_AUTHORITY:-}"
      JITO_COMMISSION_BPS: "${JITO_COMMISSION_BPS:-0}"
      EXTRA_ARGS: "${EXTRA_ARGS:-}"
      SNAPSHOT_AUTO_DOWNLOAD: "${SNAPSHOT_AUTO_DOWNLOAD:-true}"
      SNAPSHOT_MAX_AGE_SLOTS: "${SNAPSHOT_MAX_AGE_SLOTS:-20000}"
      PROBE_GRACE_SECONDS: "${PROBE_GRACE_SECONDS:-600}"
      PROBE_MAX_SLOT_LAG: "${PROBE_MAX_SLOT_LAG:-20000}"
    deploy:
      resources:
        reservations:
          cpus: '4.0'
          memory: 256000M
        limits:
          cpus: '32.0'
          memory: 921600M
    volumes:
      - rpc-config:/data/config
      - rpc-ledger:/data/ledger
      - rpc-accounts:/data/accounts
      - rpc-snapshots:/data/snapshots
    ports:
      # RPC ports
      - "8899"
      - "8900"
      # Gossip port
      - "8001"
      - "8001/udp"
      # Dynamic port range for TPU/TVU/repair (9000-9025, 26 ports)
      # NOTE(review): the DYNAMIC_PORT_RANGE default above is 9000-10000,
      # wider than the ports listed here — confirm which one is intended.
      - "9000/udp"
      - "9001/udp"
      - "9002/udp"
      - "9003/udp"
      - "9004/udp"
      - "9005/udp"
      - "9006/udp"
      - "9007/udp"
      - "9008/udp"
      - "9009/udp"
      - "9010/udp"
      - "9011/udp"
      - "9012/udp"
      - "9013/udp"
      - "9014/udp"
      - "9015/udp"
      - "9016/udp"
      - "9017/udp"
      - "9018/udp"
      - "9019/udp"
      - "9020/udp"
      - "9021/udp"
      - "9022/udp"
      - "9023/udp"
      - "9024/udp"
      - "9025/udp"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 1000000
        hard: 1000000
    healthcheck:
      test:
        - "CMD"
        - "entrypoint.py"
        - "probe"
      interval: 30s
      timeout: 10s
      retries: 3
      # Same length as the PROBE_GRACE_SECONDS default (600).
      start_period: 600s

volumes:
  rpc-config:
  rpc-ledger:
  rpc-accounts:
  rpc-snapshots:
|
||||
|
|
@ -1,27 +0,0 @@
|
|||
# Compose definition for the Agave node in test mode.
services:
  agave-test:
    restart: unless-stopped
    image: laconicnetwork/agave:local
    security_opt:
      - seccomp=unconfined
    environment:
      AGAVE_MODE: test
      FACILITATOR_PUBKEY: "${FACILITATOR_PUBKEY:-}"
      SERVER_PUBKEY: "${SERVER_PUBKEY:-}"
      CLIENT_PUBKEY: "${CLIENT_PUBKEY:-}"
      MINT_DECIMALS: "${MINT_DECIMALS:-6}"
      MINT_AMOUNT: "${MINT_AMOUNT:-1000000000}"
    volumes:
      - test-ledger:/data/ledger
    ports:
      - "8899"
      - "8900"
    healthcheck:
      # Healthy once the local RPC endpoint answers a cluster-version query.
      test:
        - "CMD"
        - "solana"
        - "cluster-version"
        - "--url"
        - "http://127.0.0.1:8899"
      interval: 5s
      timeout: 5s
      retries: 30
      start_period: 10s

volumes:
  test-ledger:
|
||||
|
|
@ -1,115 +0,0 @@
|
|||
# Compose definition for the Agave node in validator mode.
services:
  agave-validator:
    restart: unless-stopped
    image: laconicnetwork/agave:local
    network_mode: host
    privileged: true
    cap_add:
      - IPC_LOCK
    # Compose owns all defaults. spec.yml overrides per-deployment.
    environment:
      AGAVE_MODE: "${AGAVE_MODE:-validator}"
      # Required — no defaults
      VALIDATOR_ENTRYPOINT: "${VALIDATOR_ENTRYPOINT}"
      KNOWN_VALIDATOR: "${KNOWN_VALIDATOR}"
      # Optional with defaults
      EXTRA_ENTRYPOINTS: "${EXTRA_ENTRYPOINTS:-}"
      EXTRA_KNOWN_VALIDATORS: "${EXTRA_KNOWN_VALIDATORS:-}"
      RPC_PORT: "${RPC_PORT:-8899}"
      RPC_BIND_ADDRESS: "${RPC_BIND_ADDRESS:-127.0.0.1}"
      GOSSIP_PORT: "${GOSSIP_PORT:-8001}"
      DYNAMIC_PORT_RANGE: "${DYNAMIC_PORT_RANGE:-9000-10000}"
      EXPECTED_GENESIS_HASH: "${EXPECTED_GENESIS_HASH:-}"
      EXPECTED_SHRED_VERSION: "${EXPECTED_SHRED_VERSION:-}"
      LIMIT_LEDGER_SIZE: "${LIMIT_LEDGER_SIZE:-50000000}"
      NO_SNAPSHOTS: "${NO_SNAPSHOTS:-false}"
      SNAPSHOT_INTERVAL_SLOTS: "${SNAPSHOT_INTERVAL_SLOTS:-100000}"
      MAXIMUM_SNAPSHOTS_TO_RETAIN: "${MAXIMUM_SNAPSHOTS_TO_RETAIN:-1}"
      NO_INCREMENTAL_SNAPSHOTS: "${NO_INCREMENTAL_SNAPSHOTS:-false}"
      ACCOUNT_INDEXES: "${ACCOUNT_INDEXES:-}"
      VOTE_ACCOUNT_KEYPAIR: "${VOTE_ACCOUNT_KEYPAIR:-/data/config/vote-account-keypair.json}"
      GOSSIP_HOST: "${GOSSIP_HOST:-}"
      PUBLIC_TVU_ADDRESS: "${PUBLIC_TVU_ADDRESS:-}"
      RUST_LOG: "${RUST_LOG:-info}"
      SOLANA_METRICS_CONFIG: "${SOLANA_METRICS_CONFIG:-}"
      JITO_ENABLE: "${JITO_ENABLE:-false}"
      JITO_BLOCK_ENGINE_URL: "${JITO_BLOCK_ENGINE_URL:-}"
      JITO_RELAYER_URL: "${JITO_RELAYER_URL:-}"
      JITO_SHRED_RECEIVER_ADDR: "${JITO_SHRED_RECEIVER_ADDR:-}"
      JITO_TIP_PAYMENT_PROGRAM: "${JITO_TIP_PAYMENT_PROGRAM:-}"
      JITO_DISTRIBUTION_PROGRAM: "${JITO_DISTRIBUTION_PROGRAM:-}"
      JITO_MERKLE_ROOT_AUTHORITY: "${JITO_MERKLE_ROOT_AUTHORITY:-}"
      JITO_COMMISSION_BPS: "${JITO_COMMISSION_BPS:-0}"
      EXTRA_ARGS: "${EXTRA_ARGS:-}"
      SNAPSHOT_AUTO_DOWNLOAD: "${SNAPSHOT_AUTO_DOWNLOAD:-true}"
      SNAPSHOT_MAX_AGE_SLOTS: "${SNAPSHOT_MAX_AGE_SLOTS:-20000}"
      PROBE_GRACE_SECONDS: "${PROBE_GRACE_SECONDS:-600}"
      PROBE_MAX_SLOT_LAG: "${PROBE_MAX_SLOT_LAG:-20000}"
    deploy:
      resources:
        reservations:
          cpus: '4.0'
          memory: 256000M
        limits:
          cpus: '32.0'
          memory: 921600M
    volumes:
      - validator-config:/data/config
      - validator-ledger:/data/ledger
      - validator-accounts:/data/accounts
      - validator-snapshots:/data/snapshots
      - validator-log:/data/log
    ports:
      # RPC ports
      - "8899"
      - "8900"
      # Gossip port
      - "8001"
      - "8001/udp"
      # Dynamic port range for TPU/TVU/repair (9000-9025, 26 ports)
      # NOTE(review): the DYNAMIC_PORT_RANGE default above is 9000-10000,
      # wider than the ports listed here — confirm which one is intended.
      - "9000/udp"
      - "9001/udp"
      - "9002/udp"
      - "9003/udp"
      - "9004/udp"
      - "9005/udp"
      - "9006/udp"
      - "9007/udp"
      - "9008/udp"
      - "9009/udp"
      - "9010/udp"
      - "9011/udp"
      - "9012/udp"
      - "9013/udp"
      - "9014/udp"
      - "9015/udp"
      - "9016/udp"
      - "9017/udp"
      - "9018/udp"
      - "9019/udp"
      - "9020/udp"
      - "9021/udp"
      - "9022/udp"
      - "9023/udp"
      - "9024/udp"
      - "9025/udp"
    ulimits:
      memlock:
        soft: -1
        hard: -1
      nofile:
        soft: 1000000
        hard: 1000000
    healthcheck:
      test:
        - "CMD"
        - "entrypoint.py"
        - "probe"
      interval: 30s
      timeout: 10s
      retries: 3
      # Same length as the PROBE_GRACE_SECONDS default (600).
      start_period: 600s

volumes:
  validator-config:
  validator-ledger:
  validator-accounts:
  validator-snapshots:
  validator-log:
|
||||
|
|
@ -1,19 +0,0 @@
|
|||
# Compose definition for the doublezerod daemon.
services:
  doublezerod:
    restart: unless-stopped
    image: laconicnetwork/doublezero:local
    network_mode: host
    privileged: true
    cap_add:
      - NET_ADMIN
    environment:
      DOUBLEZERO_RPC_ENDPOINT: "${DOUBLEZERO_RPC_ENDPOINT:-http://127.0.0.1:8899}"
      DOUBLEZERO_ENV: "${DOUBLEZERO_ENV:-mainnet-beta}"
      DOUBLEZERO_EXTRA_ARGS: "${DOUBLEZERO_EXTRA_ARGS:-}"
    volumes:
      # Mounted read-only: this service only reads the identity volume.
      - doublezero-validator-identity:/data/config:ro
      - doublezero-config:/root/.config/doublezero

volumes:
  doublezero-validator-identity:
  doublezero-config:
|
||||
|
|
@ -1,49 +0,0 @@
|
|||
# Monitoring stack: InfluxDB (storage), Grafana (dashboards), Telegraf (collection).
services:
  monitoring-influxdb:
    image: influxdb:1.8
    restart: unless-stopped
    environment:
      INFLUXDB_DB: agave_metrics
      INFLUXDB_HTTP_AUTH_ENABLED: "true"
      # NOTE(review): these credentials are also hard-coded in the Telegraf
      # output and the Grafana datasource; change all three together.
      INFLUXDB_ADMIN_USER: admin
      INFLUXDB_ADMIN_PASSWORD: admin
      INFLUXDB_REPORTING_DISABLED: "true"
    volumes:
      - monitoring-influxdb-data:/var/lib/influxdb
    ports:
      - "8086"

  monitoring-grafana:
    # NOTE(review): ":latest" is unpinned, so upgrades happen implicitly on pull.
    image: grafana/grafana:latest
    restart: unless-stopped
    environment:
      # Overridable from the deployment environment; defaults are unchanged.
      GF_SECURITY_ADMIN_PASSWORD: ${GF_SECURITY_ADMIN_PASSWORD:-admin}
      GF_SECURITY_ADMIN_USER: ${GF_SECURITY_ADMIN_USER:-admin}
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_PATHS_DATA: /var/lib/grafana
    volumes:
      - monitoring-grafana-data:/var/lib/grafana
      - monitoring-grafana-datasources:/etc/grafana/provisioning/datasources:ro
      - monitoring-grafana-dashboards:/etc/grafana/provisioning/dashboards:ro
    ports:
      - "3000"

  monitoring-telegraf:
    image: telegraf:1.36
    restart: unless-stopped
    # Host networking, so the localhost defaults below refer to the host.
    network_mode: host
    environment:
      NODE_RPC_URL: ${NODE_RPC_URL:-http://localhost:8899}
      CANONICAL_RPC_URL: ${CANONICAL_RPC_URL:-https://api.mainnet-beta.solana.com}
      INFLUXDB_URL: ${INFLUXDB_URL:-http://localhost:8086}
    volumes:
      - monitoring-telegraf-config:/etc/telegraf:ro
      - monitoring-telegraf-scripts:/scripts:ro

volumes:
  monitoring-influxdb-data:
  monitoring-grafana-data:
  monitoring-grafana-datasources:
  monitoring-grafana-dashboards:
  monitoring-telegraf-config:
  monitoring-telegraf-scripts:
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
#!/bin/sh
# Restart every running container that matches a Docker label filter.
# Used by the cron-based restarter sidecar.
#
# Usage: restart-node.sh <label>[=<value>]     e.g. restart-node.sh role=validator

label_filter="$1"
if [ -z "$label_filter" ]; then
  echo "usage: $0 <label>[=<value>]" >&2
  exit 2
fi

# `docker ps -q` prints one container ID per line. Restart them one at a time:
# passing several IDs as a single quoted argument would be rejected by docker
# as one (invalid) container name.
docker ps -qf "label=$label_filter" | while IFS= read -r container; do
  [ -n "$container" ] || continue
  docker restart -s TERM "$container" > /dev/null
done
|
||||
|
|
@ -1,4 +0,0 @@
|
|||
# Restart validator every 4 hours (mitigate memory leaks)
|
||||
0 */4 * * * /scripts/restart-node.sh role=validator
|
||||
# Restart RPC every 6 hours (staggered from validator)
|
||||
30 */6 * * * /scripts/restart-node.sh role=rpc
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,12 +0,0 @@
|
|||
# Grafana dashboard provisioning: load dashboards from files on disk.
apiVersion: 1

providers:
  - name: default
    orgId: 1
    folder: ""
    type: file
    disableDeletion: false
    editable: true
    options:
      # Directory Grafana reads dashboard files from.
      path: /etc/grafana/provisioning/dashboards
      foldersFromFilesStructure: false
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,16 +0,0 @@
|
|||
# Grafana datasource provisioning: the InfluxDB instance holding agave_metrics.
apiVersion: 1

datasources:
  - name: InfluxDB
    type: influxdb
    # Queries are proxied through the Grafana backend.
    access: proxy
    url: http://monitoring-influxdb:8086
    database: agave_metrics
    user: admin
    isDefault: true
    editable: true
    secureJsonData:
      password: admin
    jsonData:
      timeInterval: 10s
      httpMode: GET
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
#!/bin/bash
# Report the current slot of the canonical mainnet RPC so it can be compared
# with the local node's slot (sync lag). Emits one InfluxDB line-protocol
# record, or nothing when no slot could be obtained.

set -euo pipefail

CANONICAL_RPC="${CANONICAL_RPC_URL:-https://api.mainnet-beta.solana.com}"
payload='{"jsonrpc":"2.0","id":1,"method":"getSlot"}'

# A failed request is replaced by a synthetic "slot 0" reply, which is
# filtered out below.
response=$(curl -s --max-time 10 -X POST \
  -H "Content-Type: application/json" \
  -d "$payload" \
  "$CANONICAL_RPC" 2>/dev/null || echo '{"result":0}')

# Pull the integer that follows "result": out of the JSON reply.
slot=$(grep -o '"result":[0-9]*' <<<"$response" | grep -o '[0-9]*' || echo "0")

if [ "$slot" = "0" ]; then
  exit 0
fi

printf '%s\n' "canonical_slot slot=${slot}i"
|
||||
|
|
@ -1,33 +0,0 @@
|
|||
#!/bin/bash
# Measure getSlot latency against the node RPC.
# Emits one record in InfluxDB line-protocol format.

set -euo pipefail

RPC_URL="${NODE_RPC_URL:-http://localhost:8899}"
RPC_PAYLOAD='{"jsonrpc":"2.0","id":1,"method":"getSlot"}'

# -w appends the HTTP status and total time on their own lines after the body.
curl_args=(
  -sk --max-time 10 -X POST
  -H "Content-Type: application/json"
  -d "$RPC_PAYLOAD"
  -w "\n%{http_code}\n%{time_total}"
  "$RPC_URL"
)
response=$(curl "${curl_args[@]}" 2>/dev/null || echo -e "\n000\n0")

# First line is the JSON body; the last two lines are status code and time.
# Blank lines may sit between the body and the -w output.
json_response=$(head -n 1 <<<"$response")
http_code=$(tail -n 2 <<<"$response" | head -n 1)
time_total=$(tail -n 1 <<<"$response")

latency_ms="$(awk -v t="$time_total" 'BEGIN { printf "%.0f", (t * 1000) }')"
# Force base 10 to drop leading zeros (influx line protocol rejects 000i).
http_code=$((10#${http_code:-0}))

success=0
slot=0
if [ "$http_code" = "200" ]; then
  slot=$(grep -o '"result":[0-9]*' <<<"$json_response" | grep -o '[0-9]*' || echo "0")
  if [ "$slot" != "0" ]; then
    success=1
  fi
fi

printf '%s\n' "rpc_latency,endpoint=direct,method=getSlot latency_ms=${latency_ms},success=${success}i,http_code=${http_code}i,slot=${slot}i"
|
||||
|
|
@ -1,36 +0,0 @@
|
|||
# Telegraf configuration for Agave monitoring

[agent]
  interval = "10s"
  round_interval = true
  metric_batch_size = 1000
  metric_buffer_limit = 10000
  collection_jitter = "0s"
  flush_interval = "10s"
  flush_jitter = "0s"
  precision = "0s"
  hostname = "telegraf"
  omit_hostname = false

# Output to InfluxDB
[[outputs.influxdb]]
  # Honour INFLUXDB_URL, which the compose file passes to this container;
  # fall back to the previous hard-coded address when it is unset.
  urls = ["${INFLUXDB_URL:-http://localhost:8086}"]
  database = "agave_metrics"
  skip_database_creation = true
  # NOTE(review): credentials mirror the InfluxDB admin user in the compose file.
  username = "admin"
  password = "admin"
  retention_policy = ""
  write_consistency = "any"
  timeout = "5s"

# Custom getSlot latency check
[[inputs.exec]]
  commands = ["/scripts/check_getslot_latency.sh"]
  timeout = "30s"
  data_format = "influx"

# Canonical mainnet slot tracking
[[inputs.exec]]
  commands = ["/scripts/check_canonical_slot.sh"]
  timeout = "30s"
  data_format = "influx"
|
||||
|
|
@ -1,81 +0,0 @@
|
|||
# Unified Agave/Jito Solana image
# Supports three modes via AGAVE_MODE env: test, rpc, validator
#
# Build args:
#   AGAVE_REPO       - git repo URL (anza-xyz/agave or jito-foundation/jito-solana)
#   AGAVE_VERSION    - git tag to build (e.g. v3.1.9, v3.1.8-jito)
#   TVU_ADDRESS_PR   - upstream PR number to cherry-pick (empty string disables)
#   TVU_ADDRESS_REPO - repo the PR ref is fetched from

ARG AGAVE_REPO=https://github.com/anza-xyz/agave.git
ARG AGAVE_VERSION=v3.1.9

# ---------- Stage 1: Build ----------
FROM rust:1.85-bookworm AS builder

ARG AGAVE_REPO
ARG AGAVE_VERSION

RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    pkg-config \
    libssl-dev \
    libudev-dev \
    libclang-dev \
    protobuf-compiler \
    ca-certificates \
    git \
    cmake \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /build
RUN git clone "$AGAVE_REPO" --depth 1 --branch "$AGAVE_VERSION" --recurse-submodules agave
WORKDIR /build/agave

# Cherry-pick --public-tvu-address support (anza-xyz/agave PR #6778, commit 9f4b3ae)
# This flag only exists on master, not in v3.1.9 — fetch the PR ref and cherry-pick.
# The ref is fetched from the anza-xyz repository explicitly: PR numbers are
# per-repository, so "origin" would resolve to a different PR (or none) when
# AGAVE_REPO points at the Jito fork.
ARG TVU_ADDRESS_PR=6778
ARG TVU_ADDRESS_REPO=https://github.com/anza-xyz/agave.git
RUN if [ -n "$TVU_ADDRESS_PR" ]; then \
      git fetch --depth 50 "$TVU_ADDRESS_REPO" "pull/${TVU_ADDRESS_PR}/head:tvu-pr" && \
      git cherry-pick --no-commit tvu-pr; \
    fi

# Build all binaries using the upstream install script
RUN CI_COMMIT=$(git rev-parse HEAD) scripts/cargo-install-all.sh /solana-release

# ---------- Stage 2: Runtime ----------
FROM debian:bookworm-slim

RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    libssl3 \
    libudev1 \
    curl \
    sudo \
    aria2 \
    python3 \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user with sudo
RUN useradd -m -s /bin/bash agave \
    && echo "agave ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers

# Copy all compiled binaries
COPY --from=builder /solana-release/bin/ /usr/local/bin/

# Copy entrypoint and support scripts
COPY entrypoint.py snapshot_download.py ip_echo_preflight.py /usr/local/bin/
COPY start-test.sh /usr/local/bin/
RUN chmod +x /usr/local/bin/entrypoint.py /usr/local/bin/start-test.sh

# Create data directories. /data/log is included because the entrypoint writes
# validator.log there in rpc mode and the compose file mounts a volume on it.
RUN mkdir -p /data/config /data/ledger /data/accounts /data/snapshots /data/log \
    && chown -R agave:agave /data

USER agave
WORKDIR /data

ENV RUST_LOG=info
ENV RUST_BACKTRACE=1

EXPOSE 8899 8900 8001 8001/udp

ENTRYPOINT ["entrypoint.py"]
|
||||
|
|
@ -1,17 +0,0 @@
|
|||
#!/usr/bin/env bash

# Build laconicnetwork/agave
# Set AGAVE_REPO and AGAVE_VERSION env vars to build Jito or a different version
source "${CERC_CONTAINER_BASE_DIR}/build-base.sh"

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )

AGAVE_REPO="${AGAVE_REPO:-https://github.com/anza-xyz/agave.git}"
AGAVE_VERSION="${AGAVE_VERSION:-v3.1.9}"

# Paths are quoted so a checkout directory containing spaces still works.
# build_command_args is left unquoted on purpose so it word-splits into
# separate docker options (presumably set by build-base.sh — confirm).
docker build -t laconicnetwork/agave:local \
  --build-arg AGAVE_REPO="$AGAVE_REPO" \
  --build-arg AGAVE_VERSION="$AGAVE_VERSION" \
  ${build_command_args} \
  -f "${SCRIPT_DIR}/Dockerfile" \
  "${SCRIPT_DIR}"
|
||||
|
|
@ -1,686 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Agave validator entrypoint — snapshot management, arg construction, liveness probe.
|
||||
|
||||
Two subcommands:
|
||||
entrypoint.py serve (default) — snapshot freshness check + run agave-validator
|
||||
entrypoint.py probe — liveness probe (slot lag check, exits 0/1)
|
||||
|
||||
Replaces the bash entrypoint.sh / start-rpc.sh / start-validator.sh with a single
|
||||
Python module. Test mode still dispatches to start-test.sh.
|
||||
|
||||
Python stays as PID 1 and traps SIGTERM. On SIGTERM, it runs
|
||||
``agave-validator exit --force --ledger /data/ledger`` which connects to the
|
||||
admin RPC Unix socket and tells the validator to flush I/O and exit cleanly.
|
||||
This avoids the io_uring/ZFS deadlock that occurs when the process is killed.
|
||||
|
||||
All configuration comes from environment variables — same vars as the original
|
||||
bash scripts. See compose files for defaults.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import subprocess
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from pathlib import Path
|
||||
from urllib.request import Request
|
||||
|
||||
log: logging.Logger = logging.getLogger("entrypoint")
|
||||
|
||||
# Directories
|
||||
CONFIG_DIR = "/data/config"
|
||||
LEDGER_DIR = "/data/ledger"
|
||||
ACCOUNTS_DIR = "/data/accounts"
|
||||
SNAPSHOTS_DIR = "/data/snapshots"
|
||||
LOG_DIR = "/data/log"
|
||||
IDENTITY_FILE = f"{CONFIG_DIR}/validator-identity.json"
|
||||
|
||||
# Snapshot filename patterns
|
||||
FULL_SNAP_RE: re.Pattern[str] = re.compile(
|
||||
r"^snapshot-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$"
|
||||
)
|
||||
INCR_SNAP_RE: re.Pattern[str] = re.compile(
|
||||
r"^incremental-snapshot-(\d+)-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$"
|
||||
)
|
||||
|
||||
MAINNET_RPC = "https://api.mainnet-beta.solana.com"
|
||||
|
||||
|
||||
# -- Helpers -------------------------------------------------------------------
|
||||
|
||||
|
||||
def env(name: str, default: str = "") -> str:
    """Return environment variable ``name``, or ``default`` when it is unset."""
    value = os.environ.get(name)
    return default if value is None else value
|
||||
|
||||
|
||||
def env_required(name: str) -> str:
    """Return environment variable ``name``; log an error and exit(1) if unset or empty."""
    value = os.environ.get(name)
    if value:
        return value
    log.error("%s is required but not set", name)
    sys.exit(1)
|
||||
|
||||
|
||||
def env_bool(name: str, default: bool = False) -> bool:
    """Read a boolean environment variable.

    ``true``, ``1`` and ``yes`` (case-insensitive) are truthy; any other
    non-empty value is falsy. Surrounding whitespace is ignored, so a value
    such as ``"true "`` from an env file is still recognised.

    Args:
        name: Environment variable to read.
        default: Value returned when the variable is unset, empty or blank.

    Returns:
        The parsed boolean, or ``default``.
    """
    val = os.environ.get(name, "").strip().lower()
    if not val:
        return default
    return val in ("true", "1", "yes")
|
||||
|
||||
|
||||
def rpc_get_slot(url: str, timeout: int = 10) -> int | None:
    """Get the current slot from a Solana RPC endpoint.

    Sends a JSON-RPC ``getSlot`` request to ``url``.

    Args:
        url: RPC endpoint URL.
        timeout: Timeout in seconds passed to ``urlopen``.

    Returns:
        The slot as an ``int``, or ``None`` on any failure: invalid URL,
        connection or HTTP error, timeout, a body that is not valid JSON, or
        a reply without an integer ``result``. Never raises for these cases,
        so callers such as the probe and gap monitor can treat ``None`` as
        "unknown".
    """
    import http.client  # local import: only needed for the exception type

    payload = json.dumps({
        "jsonrpc": "2.0", "id": 1,
        "method": "getSlot", "params": [],
    }).encode()
    try:
        # Request() itself raises ValueError for a malformed URL, so it must
        # be constructed inside the try block.
        req = Request(url, data=payload,
                      headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read())
    except (OSError, ValueError, http.client.HTTPException):
        # OSError covers URLError and TimeoutError; ValueError covers
        # JSONDecodeError, UnicodeDecodeError and invalid URLs.
        return None

    if not isinstance(data, dict):
        return None
    result = data.get("result")
    # bool is a subclass of int; a JSON true/false is not a slot number.
    if isinstance(result, int) and not isinstance(result, bool):
        return result
    return None
|
||||
|
||||
|
||||
# -- Snapshot management -------------------------------------------------------
|
||||
|
||||
|
||||
def get_local_snapshot_slot(snapshots_dir: str) -> int | None:
    """Return the highest slot of any full snapshot archive in ``snapshots_dir``.

    Only names matching ``FULL_SNAP_RE`` are considered. Returns ``None`` when
    the directory does not exist or holds no full snapshot.
    """
    directory = Path(snapshots_dir)
    if not directory.is_dir():
        return None
    matches = (FULL_SNAP_RE.match(item.name) for item in directory.iterdir())
    slots = [int(found.group(1)) for found in matches if found]
    return max(slots) if slots else None
|
||||
|
||||
|
||||
def clean_snapshots(snapshots_dir: str) -> None:
    """Delete every entry named ``snapshot-*`` or ``incremental-snapshot-*``.

    Does nothing when ``snapshots_dir`` is not a directory. Each removal is logged.
    """
    directory = Path(snapshots_dir)
    if directory.is_dir():
        prefixes = ("snapshot-", "incremental-snapshot-")
        for item in directory.iterdir():
            if not item.name.startswith(prefixes):
                continue
            log.info("Removing old snapshot: %s", item.name)
            item.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def get_incremental_slot(snapshots_dir: str, full_slot: int | None) -> int | None:
    """Return the highest incremental snapshot slot built on ``full_slot``.

    Only names matching ``INCR_SNAP_RE`` whose base slot equals ``full_slot``
    count. Returns ``None`` when ``full_slot`` is ``None``, the directory is
    missing, or no matching incremental exists.
    """
    if full_slot is None:
        return None
    directory = Path(snapshots_dir)
    if not directory.is_dir():
        return None
    matches = (INCR_SNAP_RE.match(item.name) for item in directory.iterdir())
    slots = [
        int(found.group(2))
        for found in matches
        if found and int(found.group(1)) == full_slot
    ]
    return max(slots) if slots else None
|
||||
|
||||
|
||||
def maybe_download_snapshot(snapshots_dir: str) -> None:
    """Make sure a usable full + incremental snapshot pair is on disk.

    Starting from a full + incremental pair keeps replay time short. Outcomes:

    * fresh full and fresh incremental present: return immediately;
    * fresh full, incremental missing or stale: download an incremental;
    * full missing or too old: wipe local snapshots and download both.

    Downloads are retried forever, sleeping SNAPSHOT_RETRY_DELAY seconds
    between attempts. If the mainnet RPC is unreachable the check is skipped.

    Controlled by env vars:
        SNAPSHOT_AUTO_DOWNLOAD (default: true) — enable/disable
        SNAPSHOT_MAX_AGE_SLOTS (default: 100000) — full snapshot staleness
            threshold (one full snapshot generation, ~11 hours)
        SNAPSHOT_CONVERGENCE_SLOTS (default: 500) — max incremental gap
        SNAPSHOT_RETRY_DELAY (default: 60) — seconds between retries
    """
    if not env_bool("SNAPSHOT_AUTO_DOWNLOAD", default=True):
        log.info("Snapshot auto-download disabled")
        return

    max_age = int(env("SNAPSHOT_MAX_AGE_SLOTS", "100000"))

    mainnet_slot = rpc_get_slot(MAINNET_RPC)
    if mainnet_slot is None:
        log.warning("Cannot reach mainnet RPC — skipping snapshot check")
        return

    # snapshot_download sits next to this script; make it importable.
    sys.path.insert(0, str(Path(__file__).resolve().parent))
    from snapshot_download import download_best_snapshot, download_incremental_for_slot

    convergence = int(env("SNAPSHOT_CONVERGENCE_SLOTS", "500"))
    retry_delay = int(env("SNAPSHOT_RETRY_DELAY", "60"))

    def _retry_forever(attempt, failure_message: str) -> None:
        # Call attempt() until it reports success, pausing between tries.
        while not attempt():
            log.warning(failure_message, retry_delay)
            time.sleep(retry_delay)

    local_slot = get_local_snapshot_slot(snapshots_dir)
    full_is_fresh = local_slot is not None and (mainnet_slot - local_slot) <= max_age

    if not full_is_fresh:
        # No full or full too old — download both
        log.info("Downloading full + incremental")
        clean_snapshots(snapshots_dir)
        _retry_forever(
            lambda: download_best_snapshot(snapshots_dir,
                                           convergence_slots=convergence),
            "Snapshot download failed — retrying in %ds",
        )
        return

    assert local_slot is not None
    inc_slot = get_incremental_slot(snapshots_dir, local_slot)
    if inc_slot is not None:
        inc_gap = mainnet_slot - inc_slot
        if inc_gap <= convergence:
            log.info("Full (slot %d) + incremental (slot %d, gap %d) "
                     "within convergence, starting",
                     local_slot, inc_slot, inc_gap)
            return
        log.info("Incremental too stale (slot %d, gap %d > %d)",
                 inc_slot, inc_gap, convergence)

    # Fresh full, need a fresh incremental
    log.info("Downloading incremental for full at slot %d", local_slot)
    _retry_forever(
        lambda: download_incremental_for_slot(snapshots_dir, local_slot,
                                              convergence_slots=convergence),
        "Incremental download failed — retrying in %ds",
    )
|
||||
|
||||
|
||||
# -- Directory and identity setup ----------------------------------------------
|
||||
|
||||
|
||||
def ensure_dirs(*dirs: str) -> None:
    """Create each directory and chown it recursively to the current uid:gid.

    Ownership is changed with ``sudo chown -R``; a non-zero exit from it is
    ignored, and a missing ``sudo`` binary is tolerated.
    """
    owner = f"{os.getuid()}:{os.getgid()}"
    for path in dirs:
        os.makedirs(path, exist_ok=True)
        chown_cmd = ["sudo", "chown", "-R", owner, path]
        try:
            subprocess.run(chown_cmd, check=False, capture_output=True)
        except FileNotFoundError:
            # sudo not available — dirs already owned correctly
            continue
|
||||
|
||||
|
||||
def ensure_identity_rpc() -> None:
    """Create an identity keypair at IDENTITY_FILE for RPC mode unless one exists.

    Raises:
        subprocess.CalledProcessError: if ``solana-keygen`` exits non-zero.
    """
    if not os.path.isfile(IDENTITY_FILE):
        log.info("Generating RPC node identity keypair...")
        keygen_cmd = [
            "solana-keygen", "new",
            "--no-passphrase", "--silent", "--force",
            "--outfile", IDENTITY_FILE,
        ]
        subprocess.run(keygen_cmd, check=True)
|
||||
|
||||
|
||||
def print_identity() -> None:
    """Log the public key of the node identity; stay silent if it cannot be read."""
    pubkey_cmd = ["solana-keygen", "pubkey", IDENTITY_FILE]
    outcome = subprocess.run(pubkey_cmd, capture_output=True, text=True, check=False)
    if outcome.returncode != 0:
        return
    log.info("Node identity: %s", outcome.stdout.strip())
|
||||
|
||||
|
||||
# -- Arg construction ----------------------------------------------------------
|
||||
|
||||
|
||||
def build_common_args() -> list[str]:
    """Assemble the agave-validator flags shared by RPC and validator modes.

    Everything is read from environment variables. VALIDATOR_ENTRYPOINT and
    KNOWN_VALIDATOR are mandatory (the process exits if either is missing);
    all other settings have defaults or are optional.
    """
    args: list[str] = ["--identity", IDENTITY_FILE]
    args += ["--entrypoint", env_required("VALIDATOR_ENTRYPOINT")]
    args += ["--known-validator", env_required("KNOWN_VALIDATOR")]
    args += [
        "--ledger", LEDGER_DIR,
        "--accounts", ACCOUNTS_DIR,
        "--snapshots", SNAPSHOTS_DIR,
        "--rpc-port", env("RPC_PORT", "8899"),
        "--rpc-bind-address", env("RPC_BIND_ADDRESS", "127.0.0.1"),
        "--gossip-port", env("GOSSIP_PORT", "8001"),
        "--dynamic-port-range", env("DYNAMIC_PORT_RANGE", "9000-10000"),
        "--no-os-network-limits-test",
        "--wal-recovery-mode", "skip_any_corrupted_record",
        "--limit-ledger-size", env("LIMIT_LEDGER_SIZE", "50000000"),
        "--no-snapshot-fetch",  # entrypoint handles snapshot download
    ]

    # Snapshot generation
    if env("NO_SNAPSHOTS") == "true":
        args.append("--no-snapshots")
    else:
        args.extend([
            "--full-snapshot-interval-slots",
            env("SNAPSHOT_INTERVAL_SLOTS", "100000"),
            "--maximum-full-snapshots-to-retain",
            env("MAXIMUM_SNAPSHOTS_TO_RETAIN", "1"),
        ])
        if env("NO_INCREMENTAL_SNAPSHOTS") != "true":
            args.extend(["--maximum-incremental-snapshots-to-retain", "2"])

    # Account indexes: comma-separated list, blank items dropped
    for index_name in (part.strip() for part in env("ACCOUNT_INDEXES").split(",")):
        if index_name:
            args.extend(["--account-index", index_name])

    # Additional entrypoints and known validators: whitespace-separated lists
    for extra_entrypoint in env("EXTRA_ENTRYPOINTS").split():
        args.extend(["--entrypoint", extra_entrypoint])
    for extra_validator in env("EXTRA_KNOWN_VALIDATORS").split():
        args.extend(["--known-validator", extra_validator])

    # Cluster verification
    optional_flags = (
        ("EXPECTED_GENESIS_HASH", "--expected-genesis-hash"),
        ("EXPECTED_SHRED_VERSION", "--expected-shred-version"),
    )
    for variable, flag in optional_flags:
        setting = env(variable)
        if setting:
            args.extend([flag, setting])

    # Metrics — just needs to be in the environment, agave reads it directly
    # (env var is already set, nothing to pass as arg)

    # Gossip host wins over the public TVU address when both are set
    gossip_host = env("GOSSIP_HOST")
    tvu_address = env("PUBLIC_TVU_ADDRESS")
    if gossip_host:
        args.extend(["--gossip-host", gossip_host])
    elif tvu_address:
        args.extend(["--public-tvu-address", tvu_address])

    # Jito flags
    if env("JITO_ENABLE") == "true":
        log.info("Jito MEV enabled")
        jito_flags = {
            "JITO_TIP_PAYMENT_PROGRAM": "--tip-payment-program-pubkey",
            "JITO_DISTRIBUTION_PROGRAM": "--tip-distribution-program-pubkey",
            "JITO_MERKLE_ROOT_AUTHORITY": "--merkle-root-upload-authority",
            "JITO_COMMISSION_BPS": "--commission-bps",
            "JITO_BLOCK_ENGINE_URL": "--block-engine-url",
            "JITO_SHRED_RECEIVER_ADDR": "--shred-receiver-address",
        }
        for variable, flag in jito_flags.items():
            setting = env(variable)
            if setting:
                args.extend([flag, setting])

    return args
|
||||
|
||||
|
||||
def build_rpc_args() -> list[str]:
    """Assemble the agave-validator flags for RPC (non-voting) mode.

    Starts from the common flags, adds the RPC feature flags and file
    logging, then marks the RPC as public (PUBLIC_RPC_ADDRESS set) or private.
    """
    rpc_only = [
        "--no-voting",
        "--log", f"{LOG_DIR}/validator.log",
        "--full-rpc-api",
        "--enable-rpc-transaction-history",
        "--rpc-pubsub-enable-block-subscription",
        "--enable-extended-tx-metadata-storage",
        "--no-wait-for-vote-to-start-leader",
    ]
    args = build_common_args()
    args.extend(rpc_only)

    # Public vs private RPC
    public_rpc = env("PUBLIC_RPC_ADDRESS")
    if not public_rpc:
        args.extend(["--private-rpc", "--allow-private-addr", "--only-known-rpc"])
    else:
        args.extend(["--public-rpc-address", public_rpc])

    # The Jito relayer URL is validator-only; see build_validator_args.
    return args
|
||||
|
||||
|
||||
def build_validator_args() -> list[str]:
    """Assemble the agave-validator flags for voting validator mode.

    Exits with status 1 when the identity keypair or the vote account keypair
    (VOTE_ACCOUNT_KEYPAIR, default /data/config/vote-account-keypair.json) is
    missing. Logs the vote account pubkey when it can be read.
    """
    vote_keypair = env("VOTE_ACCOUNT_KEYPAIR",
                       "/data/config/vote-account-keypair.json")

    # Identity must be mounted for validator mode
    if not os.path.isfile(IDENTITY_FILE):
        log.error("Validator identity keypair not found at %s", IDENTITY_FILE)
        log.error("Mount your validator keypair to %s", IDENTITY_FILE)
        sys.exit(1)

    # Vote account keypair must exist
    if not os.path.isfile(vote_keypair):
        log.error("Vote account keypair not found at %s", vote_keypair)
        log.error("Mount your vote account keypair or set VOTE_ACCOUNT_KEYPAIR")
        sys.exit(1)

    # Print vote account pubkey (best effort)
    pubkey_cmd = ["solana-keygen", "pubkey", vote_keypair]
    outcome = subprocess.run(pubkey_cmd, capture_output=True, text=True, check=False)
    if outcome.returncode == 0:
        log.info("Vote account: %s", outcome.stdout.strip())

    args = build_common_args()
    args.extend(["--vote-account", vote_keypair, "--log", "-"])

    # Jito relayer URL (validator-only)
    relayer_url = env("JITO_RELAYER_URL")
    if relayer_url and env("JITO_ENABLE") == "true":
        args.extend(["--relayer-url", relayer_url])

    return args
|
||||
|
||||
|
||||
def append_extra_args(args: list[str]) -> list[str]:
    """Append the EXTRA_ARGS passthrough flags to ``args``.

    EXTRA_ARGS is tokenised with shell quoting rules, so a value containing
    spaces can be passed as ``--flag "two words"``. Input without quotes or
    backslashes is split exactly as plain whitespace splitting would.

    Args:
        args: Argument list to extend; modified in place.

    Returns:
        The same list object, for call chaining.
    """
    import shlex  # local import: only this function needs it

    extra = env("EXTRA_ARGS")
    if not extra:
        return args
    try:
        tokens = shlex.split(extra)
    except ValueError as exc:
        # Unbalanced quotes: keep the old whitespace splitting rather than
        # refusing to start.
        log.warning("EXTRA_ARGS could not be parsed with shell quoting (%s) "
                    "— splitting on whitespace", exc)
        tokens = extra.split()
    args += tokens
    return args
|
||||
|
||||
|
||||
# -- Graceful shutdown --------------------------------------------------------
|
||||
|
||||
# Timeout for graceful exit via admin RPC. Leave 30s margin for k8s
|
||||
# terminationGracePeriodSeconds (300s).
|
||||
GRACEFUL_EXIT_TIMEOUT = 270
|
||||
|
||||
|
||||
def graceful_exit(child: subprocess.Popen[bytes], reason: str = "SIGTERM") -> None:
    """Request graceful shutdown via the admin RPC Unix socket.

    Runs ``agave-validator exit --force --ledger /data/ledger`` which connects
    to the admin RPC socket at ``/data/ledger/admin.rpc`` and sets the
    validator's exit flag. The validator flushes all I/O and exits cleanly,
    avoiding the io_uring/ZFS deadlock.

    The exit command and the wait for the child share one budget of
    GRACEFUL_EXIT_TIMEOUT seconds, so time spent in the exit command (up to
    30s) is not added on top of the wait. Once the budget is used up, falls
    back to SIGTERM (15s) and then SIGKILL.

    Args:
        child: The running agave-validator process.
        reason: Short label for the log line explaining why we are stopping.
    """
    log.info("%s — requesting graceful exit via admin RPC", reason)
    deadline = time.monotonic() + GRACEFUL_EXIT_TIMEOUT
    try:
        result = subprocess.run(
            ["agave-validator", "exit", "--force", "--ledger", LEDGER_DIR],
            capture_output=True, text=True, timeout=30,
        )
        if result.returncode == 0:
            log.info("Admin RPC exit requested successfully")
        else:
            log.warning(
                "Admin RPC exit returned %d: %s",
                result.returncode, result.stderr.strip(),
            )
    except subprocess.TimeoutExpired:
        log.warning("Admin RPC exit command timed out after 30s")
    except FileNotFoundError:
        log.warning("agave-validator binary not found for exit command")
    except OSError as exc:
        # e.g. PermissionError: fall through to the signal-based fallback
        log.warning("Admin RPC exit command could not be run: %s", exc)

    # Wait for child to exit, for whatever is left of the shared budget
    remaining = max(0.0, deadline - time.monotonic())
    try:
        child.wait(timeout=remaining)
        log.info("Validator exited cleanly with code %d", child.returncode)
        return
    except subprocess.TimeoutExpired:
        log.warning(
            "Validator did not exit within %ds — sending SIGTERM",
            GRACEFUL_EXIT_TIMEOUT,
        )

    # Fallback: SIGTERM
    child.terminate()
    try:
        child.wait(timeout=15)
        log.info("Validator exited after SIGTERM with code %d", child.returncode)
        return
    except subprocess.TimeoutExpired:
        log.warning("Validator did not exit after SIGTERM — sending SIGKILL")

    # Last resort: SIGKILL
    child.kill()
    child.wait()
    log.info("Validator killed with SIGKILL, code %d", child.returncode)
|
||||
|
||||
|
||||
# -- Serve subcommand ---------------------------------------------------------
|
||||
|
||||
|
||||
def _gap_monitor(
    child: subprocess.Popen[bytes],
    leapfrog: threading.Event,
    shutting_down: threading.Event,
) -> None:
    """Background thread: watch the slot gap and trigger a leapfrog restart.

    Sleeps for SNAPSHOT_MONITOR_GRACE seconds (default 600) first, then
    compares the local slot with the mainnet slot every
    SNAPSHOT_MONITOR_INTERVAL seconds (default 30). When the gap exceeds
    SNAPSHOT_LEAPFROG_SLOTS (default 5000) on SNAPSHOT_LEAPFROG_CHECKS
    (default 3) consecutive readings, sets ``leapfrog`` and stops the
    validator gracefully so cmd_serve can fetch a fresh incremental.

    A poll where either slot is unavailable changes nothing. The thread ends
    as soon as ``shutting_down`` is set.
    """
    threshold = int(env("SNAPSHOT_LEAPFROG_SLOTS", "5000"))
    required_checks = int(env("SNAPSHOT_LEAPFROG_CHECKS", "3"))
    interval = int(env("SNAPSHOT_MONITOR_INTERVAL", "30"))
    grace = int(env("SNAPSHOT_MONITOR_GRACE", "600"))
    rpc_port = env("RPC_PORT", "8899")
    local_url = f"http://127.0.0.1:{rpc_port}"

    # Grace period — don't monitor during initial catch-up
    if shutting_down.wait(grace):
        return

    strikes = 0
    while not shutting_down.is_set():
        ours = rpc_get_slot(local_url, timeout=5)
        theirs = rpc_get_slot(MAINNET_RPC, timeout=10)

        if ours is not None and theirs is not None:
            gap = theirs - ours
            if gap <= threshold:
                if strikes:
                    log.info("Gap %d within threshold, resetting counter", gap)
                strikes = 0
            else:
                strikes += 1
                log.warning("Gap %d > %d (%d/%d consecutive)",
                            gap, threshold, strikes, required_checks)
                if strikes >= required_checks:
                    log.warning("Leapfrog triggered: gap %d", gap)
                    leapfrog.set()
                    graceful_exit(child, reason="Leapfrog")
                    return

        shutting_down.wait(interval)
|
||||
|
||||
|
||||
def cmd_serve() -> None:
    """Main serve flow: snapshot download, run validator, monitor gap, leapfrog.

    Python stays as PID 1. On each iteration:
      1. Download full + incremental snapshots (if needed)
      2. Start agave-validator as child process
      3. Monitor slot gap in background thread
      4. If gap exceeds threshold → graceful stop → loop back to step 1
      5. If SIGTERM → graceful stop → exit
      6. If validator crashes → exit with its return code

    Signal handlers are installed once, before any long-running work, and
    always act on the current child. A SIGTERM that arrives while no validator
    is running (setup or snapshot download) exits immediately, and a SIGTERM
    during a leapfrog stop suppresses the restart.
    """
    mode = env("AGAVE_MODE", "test")
    log.info("AGAVE_MODE=%s", mode)

    if mode == "test":
        os.execvp("start-test.sh", ["start-test.sh"])

    if mode not in ("rpc", "validator"):
        log.error("Unknown AGAVE_MODE: %s (valid: test, rpc, validator)", mode)
        sys.exit(1)

    shutting_down = threading.Event()
    # Rebound on every loop iteration; the handlers below read the current
    # value through the closure.
    child: subprocess.Popen[bytes] | None = None

    def _on_sigusr1(_sig: int, _frame: object) -> None:
        # Forward to the validator if one is running; otherwise ignore.
        if child is not None and child.poll() is None:
            child.send_signal(signal.SIGUSR1)

    def _on_sigterm(_sig: int, _frame: object) -> None:
        shutting_down.set()
        if child is None or child.poll() is not None:
            # Nothing to stop gracefully. Handlers run in the main thread, so
            # this SystemExit also interrupts a snapshot download in progress.
            log.info("SIGTERM received with no validator running — exiting")
            sys.exit(0)
        # graceful_exit blocks for a long time; keep the handler short.
        threading.Thread(
            target=graceful_exit, args=(child,), daemon=True,
        ).start()

    signal.signal(signal.SIGUSR1, _on_sigusr1)
    signal.signal(signal.SIGTERM, _on_sigterm)

    # One-time setup
    dirs = [CONFIG_DIR, LEDGER_DIR, ACCOUNTS_DIR, SNAPSHOTS_DIR]
    if mode == "rpc":
        dirs.append(LOG_DIR)
    ensure_dirs(*dirs)

    if not env_bool("SKIP_IP_ECHO_PREFLIGHT"):
        script_dir = Path(__file__).resolve().parent
        sys.path.insert(0, str(script_dir))
        from ip_echo_preflight import main as ip_echo_main
        if ip_echo_main() != 0:
            sys.exit(1)

    if mode == "rpc":
        ensure_identity_rpc()
    print_identity()

    if mode == "rpc":
        args = build_rpc_args()
    else:
        args = build_validator_args()
    args = append_extra_args(args)

    # Main loop: download → run → monitor → leapfrog if needed
    while True:
        maybe_download_snapshot(SNAPSHOTS_DIR)

        # Start marker for the liveness probe's grace period.
        Path("/tmp/entrypoint-start").write_text(str(time.time()))
        log.info("Starting agave-validator with %d arguments", len(args))
        child = subprocess.Popen(["agave-validator"] + args)

        leapfrog = threading.Event()

        # Start gap monitor
        monitor = threading.Thread(
            target=_gap_monitor,
            args=(child, leapfrog, shutting_down),
            daemon=True,
        )
        monitor.start()

        child.wait()

        # Restart only for a leapfrog that was not overtaken by a stop request.
        if leapfrog.is_set() and not shutting_down.is_set():
            log.info("Leapfrog: restarting with fresh incremental")
            continue

        sys.exit(child.returncode)
|
||||
|
||||
|
||||
# -- Probe subcommand ---------------------------------------------------------
|
||||
|
||||
|
||||
def cmd_probe() -> None:
    """Liveness probe comparing the local RPC slot against mainnet.

    Always terminates the process via ``sys.exit``: status 0 means healthy,
    status 1 means unhealthy.

    The probe reports healthy unconditionally while fewer than
    PROBE_GRACE_SECONDS (default 600) have elapsed since the timestamp stored
    in /tmp/entrypoint-start, and also when that file does not exist yet.
    After the grace period the node is unhealthy if the local RPC does not
    answer, or if it trails mainnet by more than PROBE_MAX_SLOT_LAG
    (default 20000) slots.
    """
    grace_seconds = int(env("PROBE_GRACE_SECONDS", "600"))
    max_lag = int(env("PROBE_MAX_SLOT_LAG", "20000"))

    marker = Path("/tmp/entrypoint-start")
    if not marker.exists():
        # No start marker: serve has not started yet, treat as within grace.
        sys.exit(0)

    try:
        started_at = float(marker.read_text().strip())
    except (ValueError, OSError):
        # Unreadable or malformed marker: skip the grace check entirely and
        # fall through to the real slot comparison.
        started_at = None
    if started_at is not None and time.time() - started_at < grace_seconds:
        sys.exit(0)

    # Local RPC must answer once the grace period is over.
    rpc_port = env("RPC_PORT", "8899")
    local_slot = rpc_get_slot(f"http://127.0.0.1:{rpc_port}", timeout=5)
    if local_slot is None:
        sys.exit(1)

    # If mainnet cannot be reached there is nothing to compare against;
    # do not penalize the validator for the reference RPC being down.
    mainnet_slot = rpc_get_slot(MAINNET_RPC, timeout=10)
    if mainnet_slot is None:
        sys.exit(0)

    sys.exit(1 if mainnet_slot - local_slot > max_lag else 0)
|
||||
|
||||
|
||||
# -- Main ----------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Configure logging and dispatch to the requested subcommand.

    The subcommand is ``sys.argv[1]`` and defaults to ``serve`` when no
    argument is given. Valid values are ``serve`` and ``probe``; anything
    else is logged as an error and the process exits with status 1.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    subcmd = sys.argv[1] if len(sys.argv) > 1 else "serve"

    handlers = {"serve": cmd_serve, "probe": cmd_probe}
    handler = handlers.get(subcmd)
    if handler is None:
        log.error("Unknown subcommand: %s (valid: serve, probe)", subcmd)
        sys.exit(1)
    handler()
|
||||
|
||||
|
||||
# Script entry point: only dispatch when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,249 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""ip_echo preflight — verify UDP port reachability before starting the validator.
|
||||
|
||||
Implements the Solana ip_echo client protocol exactly:
|
||||
1. Bind UDP sockets on the ports the validator will use
|
||||
2. TCP connect to entrypoint gossip port, send IpEchoServerMessage
|
||||
3. Parse IpEchoServerResponse (our IP as seen by entrypoint)
|
||||
4. Wait for entrypoint's UDP probes on each port
|
||||
5. Exit 0 if all ports reachable, exit 1 if any fail
|
||||
|
||||
Wire format (from agave net-utils/src/):
|
||||
Request: 4 null bytes + [u16; 4] tcp_ports LE + [u16; 4] udp_ports LE + \n
|
||||
Response: 4 null bytes + bincode IpAddr (u32 LE variant + addr) + optional shred_version
|
||||
|
||||
Called from entrypoint.py before snapshot download. Prevents wasting hours
|
||||
downloading a snapshot only to crash-loop on port reachability.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import struct
|
||||
import sys
|
||||
import threading
|
||||
import time
|
||||
|
||||
log = logging.getLogger("ip_echo_preflight")

# -- Wire-format constants --
# Four null bytes that open every request and every response.
HEADER = b"\x00\x00\x00\x00"
# Newline byte that terminates a request.
TERMINUS = b"\x0a"
# Largest response that is read: 4 header + 4 variant + 16 IPv6 octets
# + 1 Option tag + 2 shred_version bytes.
RESPONSE_BUF = 27

# -- Timing and retry policy (all durations in seconds) --
# Timeout applied to the TCP socket used for the ip_echo exchange.
IO_TIMEOUT = 5.0
# How long to wait for the entrypoint's UDP probes after the TCP exchange.
PROBE_TIMEOUT = 10.0
# Number of complete preflight attempts before giving up.
MAX_RETRIES = 3
# Pause between consecutive attempts.
RETRY_DELAY = 2.0
|
||||
|
||||
|
||||
def build_request(tcp_ports: list[int], udp_ports: list[int]) -> bytes:
    """Serialize an IpEchoServerMessage.

    Layout: HEADER, four little-endian u16 TCP ports, four little-endian u16
    UDP ports, TERMINUS. Each port list is truncated to four entries and
    zero-padded when shorter.
    """
    def _exactly_four(ports: list[int]) -> list[int]:
        # Keep at most four entries, then pad with zeros up to four.
        head = list(ports[:4])
        head.extend([0] * (4 - len(head)))
        return head

    body = struct.pack("<8H", *_exactly_four(tcp_ports), *_exactly_four(udp_ports))
    return b"".join((HEADER, body, TERMINUS))
|
||||
|
||||
|
||||
def parse_response(data: bytes) -> tuple[str, int | None]:
    """Decode an IpEchoServerResponse into (ip_string, shred_version | None).

    Expected byte layout (bincode):
        4 bytes     header (\\0\\0\\0\\0)
        4 bytes     IpAddr enum variant (u32 LE: 0=IPv4, 1=IPv6)
        4|16 bytes  address octets
        1 byte      Option tag (0=None, 1=Some)
        2 bytes     shred_version (u16 LE, only if Some)

    Raises:
        ValueError: the payload is shorter than 8 bytes, looks like an HTTP
            reply, has an unexpected header, carries an unknown variant, or
            is truncated before the end of the address.
    """
    if len(data) < 8:
        raise ValueError(f"response too short: {len(data)} bytes")

    if data[:4] == b"HTTP":
        raise ValueError("got HTTP response — not an ip_echo server")
    if data[:4] != HEADER:
        raise ValueError(f"unexpected header: {data[:4].hex()}")

    (variant,) = struct.unpack_from("<I", data, 4)
    if variant not in (0, 1):
        raise ValueError(f"unknown IpAddr variant: {variant}")

    if variant == 0:  # IPv4
        if len(data) < 12:
            raise ValueError(f"IPv4 response truncated: {len(data)} bytes")
        ip, tail = socket.inet_ntop(socket.AF_INET, data[8:12]), data[12:]
    else:  # IPv6
        if len(data) < 24:
            raise ValueError(f"IPv6 response truncated: {len(data)} bytes")
        ip, tail = socket.inet_ntop(socket.AF_INET6, data[8:24]), data[24:]

    # The shred version is only decoded when the Option tag says "Some" and
    # both of its bytes are actually present.
    has_shred_version = len(tail) >= 3 and tail[0] == 1
    shred_version = struct.unpack_from("<H", tail, 1)[0] if has_shred_version else None
    return ip, shred_version
|
||||
|
||||
|
||||
def _listen_udp(port: int, results: dict, stop: threading.Event) -> None:
|
||||
"""Bind a UDP socket and wait for a probe packet."""
|
||||
try:
|
||||
sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
||||
sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
|
||||
sock.bind(("0.0.0.0", port))
|
||||
sock.settimeout(0.5)
|
||||
try:
|
||||
while not stop.is_set():
|
||||
try:
|
||||
_data, addr = sock.recvfrom(64)
|
||||
results[port] = ("ok", addr)
|
||||
return
|
||||
except socket.timeout:
|
||||
continue
|
||||
finally:
|
||||
sock.close()
|
||||
except OSError as exc:
|
||||
results[port] = ("bind_error", str(exc))
|
||||
|
||||
|
||||
def ip_echo_check(
    entrypoint_host: str,
    entrypoint_port: int,
    udp_ports: list[int],
) -> tuple[str, dict[int, bool]]:
    """Run one ip_echo exchange and return (seen_ip, {port: reachable}).

    UDP listeners are started for up to four non-zero ports, then the
    request is sent to the entrypoint over TCP and the function waits up to
    PROBE_TIMEOUT seconds for a probe packet on every port.

    The listener threads are always told to stop and are joined before this
    function returns or raises, so a failed attempt does not leave the
    probed UDP ports bound in this process.

    Args:
        entrypoint_host: host of the entrypoint's ip_echo server.
        entrypoint_port: TCP port of the entrypoint's ip_echo server.
        udp_ports: candidate UDP ports; zeros are dropped and only the first
            four remaining entries are used.

    Returns:
        The IP address the entrypoint reports for us, and a mapping from
        each tested port to whether a probe was received on it.

    Raises:
        OSError: the TCP connect/send/receive failed (caller retries).
        ValueError: the response could not be parsed.
    """
    udp_ports = [p for p in udp_ports if p != 0][:4]

    results: dict[int, tuple] = {}
    stop = threading.Event()
    threads: list[threading.Thread] = []
    try:
        # Start UDP listeners before sending the TCP request.
        for port in udp_ports:
            t = threading.Thread(target=_listen_udp, args=(port, results, stop), daemon=True)
            t.start()
            threads.append(t)
        time.sleep(0.1)  # let listeners bind

        # TCP: send request, read response.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.settimeout(IO_TIMEOUT)
        try:
            sock.connect((entrypoint_host, entrypoint_port))
            sock.sendall(build_request([], udp_ports))
            resp = sock.recv(RESPONSE_BUF)
        finally:
            sock.close()

        seen_ip, shred_version = parse_response(resp)
        log.info(
            "entrypoint %s:%d sees us as %s (shred_version=%s)",
            entrypoint_host, entrypoint_port, seen_ip, shred_version,
        )

        # Wait for UDP probes; stop early once every port has reported.
        deadline = time.monotonic() + PROBE_TIMEOUT
        while time.monotonic() < deadline:
            if all(p in results for p in udp_ports):
                break
            time.sleep(0.2)
    finally:
        # Release the listeners on every path, including TCP or parse
        # failures, so the sockets do not stay bound after this attempt.
        stop.set()
        for t in threads:
            t.join(timeout=1)

    port_ok: dict[int, bool] = {}
    for port in udp_ports:
        if port not in results:
            log.error("port %d: no probe received within %.0fs", port, PROBE_TIMEOUT)
            port_ok[port] = False
        else:
            status, detail = results[port]
            if status == "ok":
                log.info("port %d: probe received from %s", port, detail)
                port_ok[port] = True
            else:
                log.error("port %d: %s: %s", port, status, detail)
                port_ok[port] = False

    return seen_ip, port_ok
|
||||
|
||||
|
||||
def run_preflight(
    entrypoint_host: str,
    entrypoint_port: int,
    udp_ports: list[int],
    expected_ip: str = "",
) -> bool:
    """Run the ip_echo check up to MAX_RETRIES times.

    An attempt passes only when the exchange succeeds, the IP reported by
    the entrypoint matches ``expected_ip`` (skipped when ``expected_ip`` is
    empty), and every tested port received a probe. Failed attempts are
    separated by RETRY_DELAY seconds.

    Returns:
        True as soon as one attempt passes, False once all are exhausted.
    """
    def _attempt_passes(attempt: int) -> bool:
        # One complete attempt; logs the reason whenever it fails.
        log.info("ip_echo attempt %d/%d → %s:%d, ports %s",
                 attempt, MAX_RETRIES, entrypoint_host, entrypoint_port, udp_ports)
        try:
            seen_ip, port_ok = ip_echo_check(entrypoint_host, entrypoint_port, udp_ports)
        except Exception as exc:
            log.error("attempt %d TCP failed: %s", attempt, exc)
            return False

        if expected_ip and seen_ip != expected_ip:
            log.error(
                "IP MISMATCH: entrypoint sees %s, expected %s (GOSSIP_HOST). "
                "Outbound mangle/SNAT path is broken.",
                seen_ip, expected_ip,
            )
            return False

        reachable = [p for p, ok in port_ok.items() if ok]
        unreachable = [p for p, ok in port_ok.items() if not ok]
        if unreachable:
            log.error(
                "attempt %d: unreachable %s, reachable %s, seen as %s",
                attempt, unreachable, reachable, seen_ip,
            )
            return False

        log.info("PASS: all ports reachable %s, seen as %s", reachable, seen_ip)
        return True

    attempt = 1
    while attempt <= MAX_RETRIES:
        if _attempt_passes(attempt):
            return True
        # No pause after the final attempt.
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)
        attempt += 1

    log.error("FAIL: ip_echo preflight exhausted %d attempts", MAX_RETRIES)
    return False
|
||||
|
||||
|
||||
def main() -> int:
    """Read configuration from the environment and run the preflight.

    Inputs:
        VALIDATOR_ENTRYPOINT (or ``sys.argv[1]`` when the variable is unset
            or empty): ``host:port`` or just ``host`` (port defaults to 8001).
        GOSSIP_PORT: local gossip port, default 8001.
        DYNAMIC_PORT_RANGE: ``start-end``, default ``9000-10000``; only the
            start is used.
        GOSSIP_HOST: optional IP the entrypoint is expected to see.

    Returns:
        0 when the preflight passes, 1 when it fails or no entrypoint was
        given.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    # Environment wins; the first CLI argument is only a fallback.
    raw = os.environ.get("VALIDATOR_ENTRYPOINT", "") or (
        sys.argv[1] if len(sys.argv) > 1 else ""
    )
    if not raw:
        log.error("set VALIDATOR_ENTRYPOINT or pass host:port as argument")
        return 1

    # Split on the last colon; without one the whole value is the host.
    head, colon, tail = raw.rpartition(":")
    if colon:
        host, ep_port = head, int(tail)
    else:
        host, ep_port = raw, 8001

    gossip_port = int(os.environ.get("GOSSIP_PORT", "8001"))
    dynamic_range = os.environ.get("DYNAMIC_PORT_RANGE", "9000-10000")
    range_start = int(dynamic_range.split("-")[0])
    expected_ip = os.environ.get("GOSSIP_HOST", "")

    # Gossip port plus three ports from the dynamic range (an ip_echo
    # message carries at most four UDP ports).
    # NOTE(review): the offsets used are +0, +2 and +3, so range_start + 1 is
    # not probed; confirm whether skipping it is intentional.
    udp_ports = [gossip_port, range_start, range_start + 2, range_start + 3]

    return 0 if run_preflight(host, ep_port, udp_ports, expected_ip) else 1
|
||||
|
||||
|
||||
# Script entry point: the preflight result becomes the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
||||
|
|
@ -1,878 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download Solana snapshots using aria2c for parallel multi-connection downloads.
|
||||
|
||||
Discovers snapshot sources by querying getClusterNodes for all RPCs in the
|
||||
cluster, probing each for available snapshots, benchmarking download speed,
|
||||
and downloading from the fastest source using aria2c (16 connections by default).
|
||||
|
||||
Based on the discovery approach from etcusr/solana-snapshot-finder but replaces
|
||||
the single-connection wget download with aria2c parallel chunked downloads.
|
||||
|
||||
Usage:
|
||||
# Download to /srv/kind/solana/snapshots (mainnet, 16 connections)
|
||||
./snapshot_download.py -o /srv/kind/solana/snapshots
|
||||
|
||||
# Dry run — find best source, print URL
|
||||
./snapshot_download.py --dry-run
|
||||
|
||||
# Custom RPC for cluster discovery + 32 connections
|
||||
./snapshot_download.py -r https://api.mainnet-beta.solana.com -n 32
|
||||
|
||||
# Testnet
|
||||
./snapshot_download.py -c testnet -o /data/snapshots
|
||||
|
||||
# Programmatic use from entrypoint.py:
|
||||
from snapshot_download import download_best_snapshot
|
||||
ok = download_best_snapshot("/data/snapshots")
|
||||
|
||||
Requirements:
|
||||
- aria2c (apt install aria2)
|
||||
- python3 >= 3.10 (stdlib only, no pip dependencies)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import concurrent.futures
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
import urllib.error
|
||||
import urllib.request
|
||||
from dataclasses import dataclass, field
|
||||
from http.client import HTTPResponse
|
||||
from pathlib import Path
|
||||
from urllib.request import Request
|
||||
|
||||
log: logging.Logger = logging.getLogger("snapshot-download")

# Default JSON-RPC endpoint for each supported cluster name.
CLUSTER_RPC: dict[str, str] = {
    "mainnet-beta": "https://api.mainnet-beta.solana.com",
    "testnet": "https://api.testnet.solana.com",
    "devnet": "https://api.devnet.solana.com",
}

# Snapshot filenames:
#   snapshot-<slot>-<hash>.tar.zst
#   incremental-snapshot-<base_slot>-<slot>-<hash>.tar.zst
# Groups of FULL_SNAP_RE: (slot, hash, compression extension).
FULL_SNAP_RE: re.Pattern[str] = re.compile(
    r"^snapshot-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
# Groups of INCR_SNAP_RE: (base_slot, slot, hash, compression extension).
INCR_SNAP_RE: re.Pattern[str] = re.compile(
    r"^incremental-snapshot-(\d+)-(\d+)-([A-Za-z0-9]+)\.tar\.(zst|bz2)$"
)
|
||||
|
||||
|
||||
@dataclass
class SnapshotSource:
    """Snapshot files offered by one RPC node, plus probe measurements."""

    # host:port of the RPC node, as used to build http:// download URLs.
    rpc_address: str
    # Full redirect paths as returned by the server
    # (e.g. /snapshot-123-hash.tar.zst).
    file_paths: list[str] = field(default_factory=list)
    # Current slot minus the full snapshot's slot at probe time.
    slots_diff: int = 0
    # Latency of the HEAD probe, in milliseconds.
    latency_ms: float = 0.0
    # Measured download throughput, in bytes/sec.
    download_speed: float = 0.0
|
||||
|
||||
|
||||
# -- JSON-RPC helpers ----------------------------------------------------------
|
||||
|
||||
|
||||
class _NoRedirectHandler(urllib.request.HTTPRedirectHandler):
|
||||
"""Handler that captures redirect Location instead of following it."""
|
||||
|
||||
def redirect_request(
|
||||
self,
|
||||
req: Request,
|
||||
fp: HTTPResponse,
|
||||
code: int,
|
||||
msg: str,
|
||||
headers: dict[str, str], # type: ignore[override]
|
||||
newurl: str,
|
||||
) -> None:
|
||||
return None
|
||||
|
||||
|
||||
def rpc_post(url: str, method: str, params: list[object] | None = None,
             timeout: int = 25) -> object | None:
    """Send a JSON-RPC 2.0 POST and return the parsed ``result`` field.

    Args:
        url: JSON-RPC endpoint.
        method: RPC method name.
        params: positional parameters; ``None`` is sent as an empty list.
        timeout: socket timeout in seconds passed to ``urlopen``.

    Returns:
        The value of the ``result`` key, or None when the request fails,
        the body is not valid JSON, the body is not a JSON object, or the
        object has no ``result`` key.
    """
    # Imported locally: the module only imports HTTPResponse from http.client.
    from http.client import HTTPException

    payload: bytes = json.dumps({
        "jsonrpc": "2.0", "id": 1,
        "method": method, "params": params or [],
    }).encode()
    req = Request(url, data=payload,
                  headers={"Content-Type": "application/json"})
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data: object = json.loads(resp.read())
    except (urllib.error.URLError, HTTPException, ValueError,
            OSError, TimeoutError) as e:
        # ValueError covers json.JSONDecodeError and undecodable bytes;
        # HTTPException covers protocol errors that urllib does not wrap.
        log.debug("rpc_post %s %s failed: %s", url, method, e)
        return None

    # A well-formed JSON-RPC reply is an object; anything else (list, string,
    # number) has no "result" to return.
    if not isinstance(data, dict):
        return None
    return data.get("result")
|
||||
|
||||
|
||||
def head_no_follow(url: str, timeout: float = 3) -> tuple[str | None, float]:
    """Issue a HEAD request without following redirects.

    Args:
        url: URL to probe.
        timeout: socket timeout in seconds.

    Returns:
        ``(location, latency_sec)`` when a Location header is available:
        either from a 3xx response that the opener refused to follow, or
        from a response that was returned normally. ``(None, 0.0)`` on any
        network or protocol error, on a non-3xx HTTP error, and on a
        response without a Location header.
    """
    # Imported locally: the module only imports HTTPResponse from http.client.
    from http.client import HTTPException

    opener: urllib.request.OpenerDirector = urllib.request.build_opener(_NoRedirectHandler)
    req = Request(url, method="HEAD")
    start: float = time.monotonic()
    try:
        resp: HTTPResponse = opener.open(req, timeout=timeout)  # type: ignore[assignment]
    except urllib.error.HTTPError as e:
        # With redirects disabled, a 3xx surfaces as HTTPError carrying the
        # response headers.
        latency: float = time.monotonic() - start
        location: str | None = e.headers.get("Location")
        code: int = e.code
        e.close()  # release the underlying response
        if location and 300 <= code < 400:
            return location, latency
        return None, 0.0
    except (urllib.error.URLError, HTTPException, OSError, TimeoutError):
        # HTTPException (e.g. a malformed status line from a non-HTTP
        # service) is raised by http.client and is not wrapped by urllib.
        return None, 0.0

    latency = time.monotonic() - start
    try:
        location = resp.headers.get("Location")
    finally:
        resp.close()
    # Response without a Location header: not useful for discovery.
    if not location:
        return None, 0.0
    return location, latency
|
||||
|
||||
|
||||
# -- Discovery -----------------------------------------------------------------
|
||||
|
||||
|
||||
def get_current_slot(rpc_url: str) -> int | None:
    """Return the slot reported by ``getSlot``, or None if unavailable.

    None is returned whenever the RPC call does not yield an integer.
    """
    slot: object | None = rpc_post(rpc_url, "getSlot")
    return slot if isinstance(slot, int) else None
|
||||
|
||||
|
||||
def get_cluster_rpc_nodes(rpc_url: str, version_filter: str | None = None) -> list[str]:
    """Collect the distinct RPC addresses advertised by ``getClusterNodes``.

    Args:
        rpc_url: endpoint used for the ``getClusterNodes`` call.
        version_filter: when given, nodes whose reported version does not
            start with this prefix are dropped. Nodes that report no version
            are kept.

    Returns:
        De-duplicated addresses in unspecified order; an empty list when the
        RPC call does not return a list.
    """
    nodes: object | None = rpc_post(rpc_url, "getClusterNodes")
    if not isinstance(nodes, list):
        return []

    def _version_ok(node: dict) -> bool:
        # Only a present, non-matching version excludes a node.
        if version_filter is None:
            return True
        node_version: str | None = node.get("version")
        return not (node_version and not node_version.startswith(version_filter))

    advertised = [
        node.get("rpc")
        for node in nodes
        if isinstance(node, dict) and _version_ok(node)
    ]
    # Drop nodes without an RPC address, then de-duplicate.
    return list(set(addr for addr in advertised if addr))
|
||||
|
||||
|
||||
def _parse_snapshot_filename(location: str) -> tuple[str, str | None]:
|
||||
"""Extract filename and full redirect path from Location header.
|
||||
|
||||
Returns (filename, full_path). full_path includes any path prefix
|
||||
the server returned (e.g. '/snapshots/snapshot-123-hash.tar.zst').
|
||||
"""
|
||||
# Location may be absolute URL or relative path
|
||||
if location.startswith("http://") or location.startswith("https://"):
|
||||
# Absolute URL — extract path
|
||||
from urllib.parse import urlparse
|
||||
path: str = urlparse(location).path
|
||||
else:
|
||||
path = location
|
||||
|
||||
filename: str = path.rsplit("/", 1)[-1]
|
||||
return filename, path
|
||||
|
||||
|
||||
def probe_rpc_snapshot(
    rpc_address: str,
    current_slot: int,
) -> SnapshotSource | None:
    """Ask one RPC node which snapshots it serves.

    This is discovery only: no age or latency filtering is applied here, so
    the caller sees every usable source and decides what to keep.

    Args:
        rpc_address: host:port of the node to probe.
        current_slot: reference slot used to compute the snapshot's age.

    Returns:
        A SnapshotSource whose ``file_paths`` holds the full snapshot path
        and, when the node's incremental is based on that same full
        snapshot, the incremental path as well. None when the node offers no
        full-snapshot redirect or the redirect target does not look like a
        full snapshot filename.
    """
    # A full snapshot is mandatory; without one the node is not a source.
    full_location, full_latency = head_no_follow(
        f"http://{rpc_address}/snapshot.tar.bz2", timeout=2)
    if not full_location:
        return None

    full_filename, full_path = _parse_snapshot_filename(full_location)
    full_match: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
    if full_match is None:
        return None

    full_snap_slot: int = int(full_match.group(1))
    paths: list[str] = [full_path]

    # The incremental is optional and only kept when its base slot matches
    # this node's full snapshot.
    inc_location, _ = head_no_follow(
        f"http://{rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
    if inc_location:
        inc_filename, inc_path = _parse_snapshot_filename(inc_location)
        inc_match: re.Match[str] | None = INCR_SNAP_RE.match(inc_filename)
        if inc_match and int(inc_match.group(1)) == full_snap_slot:
            paths.append(inc_path)

    return SnapshotSource(
        rpc_address=rpc_address,
        file_paths=paths,
        slots_diff=current_slot - full_snap_slot,
        latency_ms=full_latency * 1000,
    )
|
||||
|
||||
|
||||
def discover_sources(
    rpc_url: str,
    current_slot: int,
    max_age_slots: int,
    max_latency_ms: float,
    threads: int,
    version_filter: str | None,
) -> list[SnapshotSource]:
    """Discover all snapshot sources in the cluster, then filter them.

    Probing and filtering are deliberately separate: every reachable source
    is collected first so that, if the filters reject everything, the best
    rejected candidate can still be reported.

    Args:
        rpc_url: endpoint used for ``getClusterNodes``.
        current_slot: reference slot for computing snapshot age.
        max_age_slots: sources whose full snapshot is more than this many
            slots behind ``current_slot`` are rejected (as are sources more
            than 100 slots ahead of it).
        max_latency_ms: sources with a higher probe latency are rejected.
        threads: size of the probing thread pool.
        version_filter: optional version prefix forwarded to
            ``get_cluster_rpc_nodes``.

    Returns:
        Sources that passed both filters; empty if none were found.
    """
    rpc_nodes: list[str] = get_cluster_rpc_nodes(rpc_url, version_filter)
    if not rpc_nodes:
        log.error("No RPC nodes found via getClusterNodes")
        return []

    log.info("Found %d RPC nodes, probing for snapshots...", len(rpc_nodes))

    all_sources: list[SnapshotSource] = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as pool:
        futures: dict[concurrent.futures.Future[SnapshotSource | None], str] = {
            pool.submit(probe_rpc_snapshot, addr, current_slot): addr
            for addr in rpc_nodes
        }
        done: int = 0
        for future in concurrent.futures.as_completed(futures):
            done += 1
            if done % 200 == 0:
                log.info(" probed %d/%d nodes, %d reachable",
                         done, len(rpc_nodes), len(all_sources))
            try:
                result: SnapshotSource | None = future.result()
            except Exception as e:
                # Probing is best-effort against arbitrary third-party
                # nodes: whatever one node makes the probe raise, it must
                # not abort discovery of the remaining nodes.
                log.debug("Probe failed for %s: %s", futures[future], e)
                continue
            if result:
                all_sources.append(result)

    log.info("Discovered %d reachable sources", len(all_sources))

    # Apply filters
    filtered: list[SnapshotSource] = []
    rejected_age: int = 0
    rejected_latency: int = 0
    for src in all_sources:
        # Too old, or implausibly far ahead of the reference slot.
        if src.slots_diff > max_age_slots or src.slots_diff < -100:
            rejected_age += 1
            continue
        if src.latency_ms > max_latency_ms:
            rejected_latency += 1
            continue
        filtered.append(src)

    if rejected_age or rejected_latency:
        log.info("Filtered: %d rejected by age (>%d slots), %d by latency (>%.0fms)",
                 rejected_age, max_age_slots, rejected_latency, max_latency_ms)

    if not filtered and all_sources:
        # Show what was available so the user can adjust filters
        all_sources.sort(key=lambda s: s.slots_diff)
        best = all_sources[0]
        log.warning("All %d sources rejected by filters. Best available: "
                    "%s (age=%d slots, latency=%.0fms). "
                    "Try --max-snapshot-age %d --max-latency %.0f",
                    len(all_sources), best.rpc_address,
                    best.slots_diff, best.latency_ms,
                    best.slots_diff + 500,
                    max(best.latency_ms * 1.5, 500))

    log.info("Found %d sources after filtering", len(filtered))
    return filtered
|
||||
|
||||
|
||||
# -- Speed benchmark -----------------------------------------------------------
|
||||
|
||||
|
||||
def measure_speed(rpc_address: str, measure_time: int = 7) -> float:
    """Sample download throughput from an RPC node's full snapshot.

    Reads ``http://<rpc_address>/snapshot.tar.bz2`` in 81920-byte chunks for
    up to ``measure_time`` seconds, or until the stream ends.

    Returns:
        Average bytes/sec over the sampling window; 0.0 on any network
        error or when no time elapsed.
    """
    request = Request(f"http://{rpc_address}/snapshot.tar.bz2")
    try:
        with urllib.request.urlopen(request, timeout=measure_time + 5) as resp:
            # The clock starts once the connection is open, so connection
            # setup is not counted against the throughput.
            began: float = time.monotonic()
            received: int = 0
            while time.monotonic() - began < measure_time:
                chunk: bytes = resp.read(81920)
                if not chunk:
                    break
                received += len(chunk)
            duration: float = time.monotonic() - began
            return received / duration if duration > 0 else 0.0
    except (urllib.error.URLError, OSError, TimeoutError):
        return 0.0
|
||||
|
||||
|
||||
# -- Incremental probing -------------------------------------------------------
|
||||
|
||||
|
||||
def probe_incremental(
    fast_sources: list[SnapshotSource],
    full_snap_slot: int,
) -> tuple[str | None, list[str]]:
    """Find the freshest incremental snapshot built on ``full_snap_slot``.

    Every source is probed for its current incremental; candidates whose
    base slot differs from ``full_snap_slot`` are ignored. The winner is the
    candidate with the highest slot. A second pass then collects every other
    source currently serving that same filename as an extra mirror.

    Returns:
        ``(filename, mirror_urls)`` with the winning source's URL first, or
        ``(None, [])`` when no source has a matching incremental.
    """
    winner_name: str | None = None
    winner_slot: int = 0
    winner_source: SnapshotSource | None = None
    winner_path: str | None = None

    for candidate in fast_sources:
        location, _ = head_no_follow(
            f"http://{candidate.rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
        if not location:
            continue
        name, path = _parse_snapshot_filename(location)
        match: re.Match[str] | None = INCR_SNAP_RE.match(name)
        if not match:
            continue
        if int(match.group(1)) != full_snap_slot:
            log.debug(" %s: incremental base slot %s != full %d, skipping",
                      candidate.rpc_address, match.group(1), full_snap_slot)
            continue
        slot: int = int(match.group(2))
        # Strictly greater: on a tie the earlier source stays the winner.
        if slot > winner_slot:
            winner_slot, winner_name = slot, name
            winner_source, winner_path = candidate, path

    if winner_name is None or winner_source is None or winner_path is None:
        return None, []

    # Mirror list: the winner first, then any other source that currently
    # redirects to the very same filename.
    mirrors: list[str] = [f"http://{winner_source.rpc_address}{winner_path}"]
    for other in fast_sources:
        if other.rpc_address == winner_source.rpc_address:
            continue
        other_location, _ = head_no_follow(
            f"http://{other.rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
        if not other_location:
            continue
        other_name, other_path = _parse_snapshot_filename(other_location)
        if other_name == winner_name:
            mirrors.append(f"http://{other.rpc_address}{other_path}")

    return winner_name, mirrors
|
||||
|
||||
|
||||
# -- Download ------------------------------------------------------------------
|
||||
|
||||
|
||||
def download_aria2c(
    urls: list[str],
    output_dir: str,
    filename: str,
    connections: int = 16,
) -> bool:
    """Fetch one file with aria2c using parallel connections.

    All ``urls`` are passed to a single aria2c invocation, which treats
    them as mirrors of the same file and spreads chunks across them.

    Args:
        urls: one or more mirror URLs of the same file.
        output_dir: directory the file is written to.
        filename: name of the output file inside ``output_dir``.
        connections: maximum connections per server.

    Returns:
        True when aria2c exits with status 0 and the output file exists,
        False otherwise.
    """
    num_mirrors: int = len(urls)
    total_splits: int = max(connections, connections * num_mirrors)

    options: list[str] = [
        "--file-allocation=none",
        "--continue=false",
        f"--max-connection-per-server={connections}",
        f"--split={total_splits}",
        "--min-split-size=50M",
        # aria2c retries individual chunk connections on transient network
        # errors (TCP reset, timeout). This is transport-level retry analogous
        # to TCP retransmit, not application-level retry of a failed operation.
        "--max-tries=5",
        "--retry-wait=5",
        "--timeout=60",
        "--connect-timeout=10",
        "--summary-interval=10",
        "--console-log-level=notice",
        f"--dir={output_dir}",
        f"--out={filename}",
        "--auto-file-renaming=false",
        "--allow-overwrite=true",
    ]
    cmd: list[str] = ["aria2c"]
    cmd.extend(options)
    cmd.extend(urls)

    log.info("Downloading %s", filename)
    log.info(" aria2c: %d connections x %d mirrors (%d splits)",
             connections, num_mirrors, total_splits)

    started: float = time.monotonic()
    completed: subprocess.CompletedProcess[bytes] = subprocess.run(cmd)
    elapsed: float = time.monotonic() - started

    if completed.returncode != 0:
        log.error("aria2c failed with exit code %d", completed.returncode)
        return False

    target: Path = Path(output_dir) / filename
    if not target.exists():
        log.error("aria2c reported success but %s does not exist", target)
        return False

    size_bytes: int = target.stat().st_size
    size_gb: float = size_bytes / (1024 ** 3)
    if elapsed > 0:
        avg_mb: float = size_bytes / elapsed / (1024 ** 2)
    else:
        avg_mb = 0
    log.info(" Done: %.1f GB in %.0fs (%.1f MiB/s avg)", size_gb, elapsed, avg_mb)
    return True
|
||||
|
||||
|
||||
# -- Shared helpers ------------------------------------------------------------
|
||||
|
||||
|
||||
def _discover_and_benchmark(
    rpc_url: str,
    current_slot: int,
    *,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    threads: int = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
) -> list[SnapshotSource]:
    """Discover snapshot sources and keep those that download fast enough.

    Sources returned by ``discover_sources`` are ordered by probe latency;
    at most ``max_speed_checks`` of them are benchmarked, each for
    ``measurement_time`` seconds. The measured speed is stored on the
    source's ``download_speed`` attribute.

    Args:
        min_download_speed: required throughput in MiB/s.
        (All other arguments are forwarded to ``discover_sources`` or
        ``measure_speed``.)

    Returns:
        The benchmarked sources that reached ``min_download_speed``, in the
        order they were tested (lowest latency first).
    """
    discovered: list[SnapshotSource] = discover_sources(
        rpc_url, current_slot,
        max_age_slots=max_snapshot_age,
        max_latency_ms=max_latency,
        threads=threads,
        version_filter=version_filter,
    )
    if not discovered:
        return []

    by_latency: list[SnapshotSource] = sorted(discovered, key=lambda s: s.latency_ms)

    log.info("Benchmarking download speed on top %d sources...", max_speed_checks)
    required_bytes_per_sec: int = min_download_speed * 1024 * 1024
    accepted: list[SnapshotSource] = []

    # Clamp at zero so a non-positive limit benchmarks nothing.
    for source in by_latency[:max(max_speed_checks, 0)]:
        speed: float = measure_speed(source.rpc_address, measurement_time)
        source.download_speed = speed
        speed_mib: float = speed / (1024 ** 2)

        if speed >= required_bytes_per_sec:
            log.info(" %s: %.1f MiB/s (latency: %.0fms, age: %d slots)",
                     source.rpc_address, speed_mib,
                     source.latency_ms, source.slots_diff)
            accepted.append(source)
        else:
            log.info(" %s: %.1f MiB/s (too slow, need >=%d MiB/s)",
                     source.rpc_address, speed_mib, min_download_speed)

    return accepted
|
||||
|
||||
|
||||
def _rolling_incremental_download(
    fast_sources: list[SnapshotSource],
    full_snap_slot: int,
    output_dir: str,
    convergence_slots: int,
    connections: int,
    rpc_url: str,
) -> str | None:
    """Download incrementals in a loop until close enough to the chain head.

    Each round probes ``fast_sources`` for the freshest incremental based on
    ``full_snap_slot`` and downloads it if it is new. The loop ends when the
    downloaded incremental is within ``convergence_slots`` of the current
    slot, when no matching incremental is offered any more, when the current
    slot cannot be determined after a download, or after a 30-minute
    wall-clock limit.

    A superseded incremental is removed only after its replacement has been
    downloaded successfully, so the returned filename always refers to a
    file this function left on disk.

    Args:
        fast_sources: benchmarked sources to probe.
        full_snap_slot: slot of the full snapshot the incremental must
            build on.
        output_dir: directory that receives the downloaded file.
        convergence_slots: maximum acceptable gap to the head slot.
        connections: aria2c connections per server.
        rpc_url: endpoint used to query the current slot.

    Returns:
        Filename of the last successfully downloaded incremental, or None
        if none was downloaded.
    """
    prev_inc_filename: str | None = None
    loop_start: float = time.monotonic()
    max_convergence_time: float = 1800.0  # 30 min wall-clock limit

    while True:
        if time.monotonic() - loop_start > max_convergence_time:
            if prev_inc_filename:
                log.warning("Convergence timeout (%.0fs) — using %s",
                            max_convergence_time, prev_inc_filename)
            else:
                log.warning("Convergence timeout (%.0fs) — no incremental downloaded",
                            max_convergence_time)
            break

        inc_fn, inc_mirrors = probe_incremental(fast_sources, full_snap_slot)
        if inc_fn is None:
            if prev_inc_filename is None:
                log.error("No matching incremental found for base slot %d",
                          full_snap_slot)
            else:
                log.info("No newer incremental available, using %s", prev_inc_filename)
            break

        m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
        assert m_inc is not None
        inc_slot: int = int(m_inc.group(2))

        head_slot: int | None = get_current_slot(rpc_url)
        if head_slot is None:
            log.warning("Cannot get current slot — downloading best available incremental")
            # Force "not converged" so the incremental is still downloaded.
            gap: int = convergence_slots + 1
        else:
            gap = head_slot - inc_slot

        if inc_fn == prev_inc_filename:
            if gap <= convergence_slots:
                log.info("Incremental %s already downloaded (gap %d slots, converged)",
                         inc_fn, gap)
                break
            log.info("No newer incremental yet (slot %d, gap %d slots), waiting...",
                     inc_slot, gap)
            time.sleep(10)
            continue

        log.info("Downloading incremental %s (%d mirrors, slot %d, gap %d slots)",
                 inc_fn, len(inc_mirrors), inc_slot, gap)
        if not download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
            # The previous incremental (if any) is still on disk and remains
            # the fallback result.
            log.warning("Failed to download incremental %s — re-probing in 10s", inc_fn)
            time.sleep(10)
            continue

        # The replacement is safely on disk; only now drop the old file.
        superseded: str | None = prev_inc_filename
        prev_inc_filename = inc_fn
        if superseded is not None:
            old_path: Path = Path(output_dir) / superseded
            if old_path.exists():
                log.info("Removing superseded incremental %s", superseded)
                old_path.unlink()

        if gap <= convergence_slots:
            log.info("Converged: incremental slot %d is %d slots behind head",
                     inc_slot, gap)
            break

        if head_slot is None:
            # Without a head slot convergence cannot be judged; keep what
            # was just downloaded.
            break

        log.info("Not converged (gap %d > %d), re-probing in 10s...",
                 gap, convergence_slots)
        time.sleep(10)

    return prev_inc_filename
|
||||
|
||||
|
||||
# -- Public API ----------------------------------------------------------------
|
||||
|
||||
|
||||
def download_incremental_for_slot(
    output_dir: str,
    full_snap_slot: int,
    *,
    cluster: str = "mainnet-beta",
    rpc_url: str | None = None,
    connections: int = 16,
    threads: int = 500,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
    convergence_slots: int = 500,
) -> bool:
    """Fetch an incremental snapshot on top of an already-present full snapshot.

    The function resolves the RPC endpoint, verifies that ``aria2c`` is on
    PATH, asks the RPC for the current slot, discovers and benchmarks
    snapshot sources, and finally hands over to the rolling incremental
    download loop for ``full_snap_slot``. A full snapshot is never
    downloaded here.

    Args:
        output_dir: Directory that receives the incremental (created if missing).
        full_snap_slot: Base slot of the full snapshot the incremental must match.
        cluster: Key into ``CLUSTER_RPC``; only consulted when ``rpc_url`` is falsy.
        rpc_url: Explicit RPC endpoint; overrides the cluster default.
        connections: Forwarded to the rolling download loop (aria2c connections).
        threads, max_snapshot_age, max_latency, min_download_speed,
        measurement_time, max_speed_checks, version_filter: Forwarded
            unchanged to ``_discover_and_benchmark``.
        convergence_slots: Forwarded to the rolling download loop.

    Returns:
        True if the rolling loop reported a downloaded incremental filename,
        False on any earlier failure or when no incremental was obtained.
    """
    endpoint: str = rpc_url if rpc_url else CLUSTER_RPC[cluster]

    aria2c_path = shutil.which("aria2c")
    if not aria2c_path:
        log.error("aria2c not found. Install with: apt install aria2")
        return False

    log.info("Incremental download for base slot %d", full_snap_slot)
    head: int | None = get_current_slot(endpoint)
    if head is None:
        log.error("Cannot get current slot from %s", endpoint)
        return False

    # Discovery/benchmark tuning knobs are passed through untouched.
    benchmark_options = {
        "max_snapshot_age": max_snapshot_age,
        "max_latency": max_latency,
        "threads": threads,
        "min_download_speed": min_download_speed,
        "measurement_time": measurement_time,
        "max_speed_checks": max_speed_checks,
        "version_filter": version_filter,
    }
    candidates: list[SnapshotSource] = _discover_and_benchmark(
        endpoint, head, **benchmark_options
    )
    if not candidates:
        log.error("No fast sources found")
        return False

    os.makedirs(output_dir, exist_ok=True)
    downloaded: str | None = _rolling_incremental_download(
        candidates,
        full_snap_slot,
        output_dir,
        convergence_slots,
        connections,
        endpoint,
    )
    return downloaded is not None
|
||||
|
||||
|
||||
def download_best_snapshot(
    output_dir: str,
    *,
    cluster: str = "mainnet-beta",
    rpc_url: str | None = None,
    connections: int = 16,
    threads: int = 500,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
    full_only: bool = False,
    convergence_slots: int = 500,
) -> bool:
    """Download the best available snapshot to output_dir.

    This is the programmatic API — called by entrypoint.py for automatic
    snapshot download. All parameters have sensible defaults matching the
    CLI interface.

    Flow: resolve the RPC endpoint, verify ``aria2c`` is installed, read the
    current slot, discover and benchmark sources, download the first full
    snapshot advertised by the first (primary) source using every other
    source that serves the same filename as a mirror, then — unless
    ``full_only`` — run the rolling incremental download for that base slot.

    Args:
        output_dir: Destination directory (created if missing).
        cluster: Key into ``CLUSTER_RPC``; only consulted when ``rpc_url`` is falsy.
        rpc_url: Explicit RPC endpoint; overrides the cluster default.
        connections: aria2c connections per download.
        threads, max_snapshot_age, max_latency, min_download_speed,
        measurement_time, max_speed_checks, version_filter: Forwarded
            unchanged to ``_discover_and_benchmark``.
        full_only: Skip the incremental phase entirely.
        convergence_slots: Forwarded to the rolling incremental loop.

    Returns:
        True when the full snapshot is on disk (freshly downloaded or already
        present and non-empty); False on any failure before or during the
        full download. The incremental phase is best-effort: a missing
        incremental is logged as a warning but does not change the result.
    """
    resolved_rpc: str = rpc_url or CLUSTER_RPC[cluster]

    if not shutil.which("aria2c"):
        log.error("aria2c not found. Install with: apt install aria2")
        return False

    log.info("Cluster: %s | RPC: %s", cluster, resolved_rpc)
    current_slot: int | None = get_current_slot(resolved_rpc)
    if current_slot is None:
        log.error("Cannot get current slot from %s", resolved_rpc)
        return False
    log.info("Current slot: %d", current_slot)

    fast_sources: list[SnapshotSource] = _discover_and_benchmark(
        resolved_rpc, current_slot,
        max_snapshot_age=max_snapshot_age,
        max_latency=max_latency,
        threads=threads,
        min_download_speed=min_download_speed,
        measurement_time=measurement_time,
        max_speed_checks=max_speed_checks,
        version_filter=version_filter,
    )
    if not fast_sources:
        log.error("No fast sources found")
        return False

    # Use the fastest source as primary, build full snapshot download plan
    best: SnapshotSource = fast_sources[0]
    full_paths: list[str] = [fp for fp in best.file_paths
                             if fp.rsplit("/", 1)[-1].startswith("snapshot-")]
    if not full_paths:
        log.error("Best source has no full snapshot")
        return False

    # Build mirror URLs for the full snapshot: the primary first, then every
    # other source that advertises a file with exactly the same name.
    full_filename: str = full_paths[0].rsplit("/", 1)[-1]
    full_mirrors: list[str] = [f"http://{best.rpc_address}{full_paths[0]}"]
    for other in fast_sources[1:]:
        for other_fp in other.file_paths:
            if other_fp.rsplit("/", 1)[-1] == full_filename:
                full_mirrors.append(f"http://{other.rpc_address}{other_fp}")
                break  # one URL per mirror host is enough

    speed_mib: float = best.download_speed / (1024 ** 2)
    log.info("Best source: %s (%.1f MiB/s), %d mirrors",
             best.rpc_address, speed_mib, len(full_mirrors))

    # Download full snapshot
    os.makedirs(output_dir, exist_ok=True)
    total_start: float = time.monotonic()

    filepath: Path = Path(output_dir) / full_filename
    # Stat once instead of exists() + two stat() calls.
    existing_size: int = filepath.stat().st_size if filepath.exists() else 0
    if existing_size > 0:
        log.info("Skipping %s (already exists: %.1f GB)",
                 full_filename, existing_size / (1024 ** 3))
    else:
        if not download_aria2c(full_mirrors, output_dir, full_filename, connections):
            log.error("Failed to download %s", full_filename)
            return False

    # Download incremental separately — the full download took minutes,
    # so any incremental from discovery is stale. Re-probe for fresh ones.
    if not full_only:
        fm: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
        if fm:
            full_snap_slot: int = int(fm.group(1))
            log.info("Downloading incremental for base slot %d...", full_snap_slot)
            inc_filename: str | None = _rolling_incremental_download(
                fast_sources, full_snap_slot, output_dir,
                convergence_slots, connections, resolved_rpc,
            )
            if inc_filename is None:
                # Best-effort phase: report it instead of silently claiming
                # that everything was downloaded.
                log.warning("No incremental snapshot was downloaded for base slot %d",
                            full_snap_slot)
        else:
            # Previously this case skipped the incremental without any trace.
            log.warning("Cannot parse base slot from %s — skipping incremental download",
                        full_filename)

    total_elapsed: float = time.monotonic() - total_start
    log.info("All downloads complete in %.0fs", total_elapsed)

    return True
|
||||
|
||||
|
||||
# -- Main (CLI) ----------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: parse options, then either dry-run or download.

    In ``--dry-run`` mode the sources are discovered, sorted by latency and
    the URLs of the lowest-latency source are printed; nothing is
    downloaded. Otherwise the work is delegated to ``download_best_snapshot``
    and, on success, the optional ``--post-cmd`` shell command is executed.

    Returns:
        Process exit code: 0 on success, 1 on any failure (including a
        non-zero exit status of the post-download command).
    """
    parser = argparse.ArgumentParser(
        description="Download Solana snapshots with aria2c parallel downloads",
    )
    parser.add_argument(
        "-o", "--output",
        default="/srv/kind/solana/snapshots",
        help="Snapshot output directory (default: /srv/kind/solana/snapshots)",
    )
    parser.add_argument(
        "-c", "--cluster",
        default="mainnet-beta",
        choices=list(CLUSTER_RPC),
        help="Solana cluster (default: mainnet-beta)",
    )
    parser.add_argument(
        "-r", "--rpc",
        default=None,
        help="RPC URL for cluster discovery (default: public RPC)",
    )
    parser.add_argument(
        "-n", "--connections",
        type=int, default=16,
        help="aria2c connections per download (default: 16)",
    )
    parser.add_argument(
        "-t", "--threads",
        type=int, default=500,
        help="Threads for parallel RPC probing (default: 500)",
    )
    parser.add_argument(
        "--max-snapshot-age",
        type=int, default=10000,
        help="Max snapshot age in slots (default: 10000)",
    )
    parser.add_argument(
        "--max-latency",
        type=float, default=500,
        help="Max RPC probe latency in ms (default: 500)",
    )
    parser.add_argument(
        "--min-download-speed",
        type=int, default=20,
        help="Min download speed in MiB/s (default: 20)",
    )
    parser.add_argument(
        "--measurement-time",
        type=int, default=7,
        help="Speed measurement duration in seconds (default: 7)",
    )
    parser.add_argument(
        "--max-speed-checks",
        type=int, default=15,
        help="Max nodes to benchmark before giving up (default: 15)",
    )
    parser.add_argument(
        "--version",
        default=None,
        help="Filter nodes by version prefix (e.g. '2.2')",
    )
    parser.add_argument(
        "--convergence-slots",
        type=int, default=500,
        help="Max slot gap for incremental convergence (default: 500)",
    )
    parser.add_argument(
        "--full-only",
        action="store_true",
        help="Download only full snapshot, skip incremental",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Find best source and print URL, don't download",
    )
    parser.add_argument(
        "--post-cmd",
        help="Shell command to run after successful download "
             "(e.g. 'kubectl scale deployment ... --replicas=1')",
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    opts = parser.parse_args()

    log_level = logging.DEBUG if opts.verbose else logging.INFO
    logging.basicConfig(
        level=log_level,
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
    )

    # Dry-run keeps its own inline flow because it needs the discovered
    # sources themselves in order to print their URLs.
    if opts.dry_run:
        endpoint = opts.rpc or CLUSTER_RPC[opts.cluster]
        head = get_current_slot(endpoint)
        if head is None:
            log.error("Cannot get current slot from %s", endpoint)
            return 1

        found = discover_sources(
            endpoint, head,
            max_age_slots=opts.max_snapshot_age,
            max_latency_ms=opts.max_latency,
            threads=opts.threads,
            version_filter=opts.version,
        )
        if not found:
            log.error("No snapshot sources found")
            return 1

        found.sort(key=lambda src: src.latency_ms)
        quickest = found[0]
        for path in quickest.file_paths:
            print(f"http://{quickest.rpc_address}{path}")
        return 0

    succeeded = download_best_snapshot(
        opts.output,
        cluster=opts.cluster,
        rpc_url=opts.rpc,
        connections=opts.connections,
        threads=opts.threads,
        max_snapshot_age=opts.max_snapshot_age,
        max_latency=opts.max_latency,
        min_download_speed=opts.min_download_speed,
        measurement_time=opts.measurement_time,
        max_speed_checks=opts.max_speed_checks,
        version_filter=opts.version,
        full_only=opts.full_only,
        convergence_slots=opts.convergence_slots,
    )
    if not succeeded:
        return 1

    if opts.post_cmd:
        log.info("Running post-download command: %s", opts.post_cmd)
        # The command comes from the operator's own command line, so it is
        # intentionally run through the shell.
        completed = subprocess.run(opts.post_cmd, shell=True)
        if completed.returncode != 0:
            log.error("Post-download command failed with exit code %d",
                      completed.returncode)
            return 1
        log.info("Post-download command completed successfully")

    return 0
|
||||
|
||||
|
||||
# Script entry point: the integer returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
|
|
@ -1,112 +0,0 @@
|
|||
#!/usr/bin/env bash
set -euo pipefail

# -----------------------------------------------------------------------
# Start solana-test-validator with optional SPL token setup
#
# Environment variables:
#   FACILITATOR_PUBKEY  - facilitator fee-payer public key (base58)
#   SERVER_PUBKEY       - server/payee wallet public key (base58)
#   CLIENT_PUBKEY       - client/payer wallet public key (base58)
#   MINT_DECIMALS       - token decimals (default: 6, matching USDC)
#   MINT_AMOUNT         - amount to mint to client (default: 1000000000)
#   LEDGER_DIR          - ledger directory (default: /data/ledger)
# -----------------------------------------------------------------------

LEDGER_DIR="${LEDGER_DIR:-/data/ledger}"
MINT_DECIMALS="${MINT_DECIMALS:-6}"
MINT_AMOUNT="${MINT_AMOUNT:-1000000000}"
SETUP_MARKER="${LEDGER_DIR}/.setup-done"
RPC_URL="http://127.0.0.1:8899"

# Best effort: the ledger volume may be owned by another user.
sudo chown -R "$(id -u):$(id -g)" "$LEDGER_DIR" 2>/dev/null || true

# Start test-validator in the background
solana-test-validator \
  --ledger "${LEDGER_DIR}" \
  --rpc-port 8899 \
  --bind-address 0.0.0.0 \
  --quiet &

VALIDATOR_PID=$!

# Wait for RPC to become available
echo "Waiting for test-validator RPC..."
RPC_READY=false
for i in $(seq 1 60); do
  # Fail fast if the validator process died instead of polling a dead port.
  if ! kill -0 "${VALIDATOR_PID}" 2>/dev/null; then
    echo "ERROR: solana-test-validator exited during startup" >&2
    exit 1
  fi
  if solana cluster-version --url "${RPC_URL}" >/dev/null 2>&1; then
    echo "Test-validator is ready (attempt ${i})"
    RPC_READY=true
    break
  fi
  sleep 1
done

if [ "${RPC_READY}" != true ]; then
  # Keep the original behaviour (continue), but make the timeout visible.
  echo "WARNING: test-validator RPC not reachable after 60 attempts, continuing" >&2
fi

solana config set --url "${RPC_URL}"

# Only run setup once (idempotent via marker file)
if [ ! -f "${SETUP_MARKER}" ]; then
  echo "Running first-time setup..."

  # Airdrop SOL to all wallets for gas
  for PUBKEY in "${FACILITATOR_PUBKEY:-}" "${SERVER_PUBKEY:-}" "${CLIENT_PUBKEY:-}"; do
    if [ -n "${PUBKEY}" ]; then
      echo "Airdropping 100 SOL to ${PUBKEY}..."
      solana airdrop 100 "${PUBKEY}" --url "${RPC_URL}" || true
    fi
  done

  # Create a USDC-equivalent SPL token mint if any pubkeys are set
  if [ -n "${CLIENT_PUBKEY:-}" ] || [ -n "${FACILITATOR_PUBKEY:-}" ] || [ -n "${SERVER_PUBKEY:-}" ]; then
    MINT_AUTHORITY_FILE="${LEDGER_DIR}/mint-authority.json"
    if [ ! -f "${MINT_AUTHORITY_FILE}" ]; then
      solana-keygen new --no-bip39-passphrase --outfile "${MINT_AUTHORITY_FILE}" --force
      MINT_AUTH_PUBKEY=$(solana-keygen pubkey "${MINT_AUTHORITY_FILE}")
      solana airdrop 10 "${MINT_AUTH_PUBKEY}" --url "${RPC_URL}"
    fi

    MINT_ADDRESS_FILE="${LEDGER_DIR}/usdc-mint-address.txt"
    # -s instead of -f: an empty file left behind by a failed earlier run
    # must not be mistaken for a valid mint address.
    if [ ! -s "${MINT_ADDRESS_FILE}" ]; then
      if ! CREATE_OUTPUT=$(spl-token create-token \
          --decimals "${MINT_DECIMALS}" \
          --mint-authority "${MINT_AUTHORITY_FILE}" \
          --url "${RPC_URL}" 2>&1); then
        echo "ERROR: spl-token create-token failed:" >&2
        printf '%s\n' "${CREATE_OUTPUT}" >&2
        exit 1
      fi
      NEW_MINT=$(printf '%s\n' "${CREATE_OUTPUT}" | awk '/Creating token/ {print $3; exit}')
      if [ -z "${NEW_MINT}" ]; then
        echo "ERROR: could not parse mint address from spl-token output:" >&2
        printf '%s\n' "${CREATE_OUTPUT}" >&2
        exit 1
      fi
      # Write the file only once a non-empty address is known.
      printf '%s\n' "${NEW_MINT}" > "${MINT_ADDRESS_FILE}"
      echo "Created USDC mint: ${NEW_MINT}"
    fi

    USDC_MINT=$(cat "${MINT_ADDRESS_FILE}")

    # Create ATAs and mint tokens for the client
    if [ -n "${CLIENT_PUBKEY:-}" ]; then
      echo "Creating ATA for client ${CLIENT_PUBKEY}..."
      spl-token create-account "${USDC_MINT}" \
        --owner "${CLIENT_PUBKEY}" \
        --fee-payer "${MINT_AUTHORITY_FILE}" \
        --url "${RPC_URL}" || true

      echo "Minting ${MINT_AMOUNT} tokens to client..."
      spl-token mint "${USDC_MINT}" "${MINT_AMOUNT}" \
        --recipient-owner "${CLIENT_PUBKEY}" \
        --mint-authority "${MINT_AUTHORITY_FILE}" \
        --url "${RPC_URL}" || true
    fi

    # Create ATAs for server and facilitator
    for PUBKEY in "${SERVER_PUBKEY:-}" "${FACILITATOR_PUBKEY:-}"; do
      if [ -n "${PUBKEY}" ]; then
        echo "Creating ATA for ${PUBKEY}..."
        spl-token create-account "${USDC_MINT}" \
          --owner "${PUBKEY}" \
          --fee-payer "${MINT_AUTHORITY_FILE}" \
          --url "${RPC_URL}" || true
      fi
    done

    # Expose mint address for other containers
    cp "${MINT_ADDRESS_FILE}" /tmp/usdc-mint-address.txt 2>/dev/null || true
  fi

  touch "${SETUP_MARKER}"
  echo "Setup complete."
fi

echo "solana-test-validator running (PID ${VALIDATOR_PID})"
# Stay in the foreground for as long as the validator runs.
wait ${VALIDATOR_PID}
|
||||
|
|
@ -1,22 +0,0 @@
|
|||
# DoubleZero network daemon for Solana validators
# Provides GRE tunnel + BGP routing via the DoubleZero fiber backbone

FROM debian:bookworm-slim

# Run RUN instructions with pipefail so that a failed `curl` in
# `curl ... | bash` fails the build instead of piping nothing into bash.
SHELL ["/bin/bash", "-o", "pipefail", "-c"]

# Base tooling needed to add the apt repo (curl, gnupg, CA certs) and to
# manage network interfaces/routes at runtime (iproute2).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ca-certificates \
    curl \
    gnupg \
    iproute2 \
    && rm -rf /var/lib/apt/lists/*

# Install DoubleZero from Cloudsmith apt repo
RUN curl -1sLf https://dl.cloudsmith.io/public/malbeclabs/doublezero/setup.deb.sh | bash \
    && apt-get update \
    && apt-get install -y doublezero \
    && rm -rf /var/lib/apt/lists/*

COPY entrypoint.sh /usr/local/bin/entrypoint.sh
RUN chmod +x /usr/local/bin/entrypoint.sh

# Resolved through PATH (/usr/local/bin).
ENTRYPOINT ["entrypoint.sh"]
|
||||
|
|
@ -1,9 +0,0 @@
|
|||
#!/usr/bin/env bash

# Build laconicnetwork/doublezero
#
# Expects CERC_CONTAINER_BASE_DIR to point at the container-build directory.
# build-base.sh is sourced first; build_command_args is used below and is
# presumably set by it — verify against build-base.sh.

# Quoted so that a base directory containing spaces does not break the paths.
source "${CERC_CONTAINER_BASE_DIR}/build-base.sh"

# build_command_args is deliberately left unquoted: it may hold several
# space-separated docker build options that must be word-split.
docker build -t laconicnetwork/doublezero:local \
  ${build_command_args} \
  -f "${CERC_CONTAINER_BASE_DIR}/laconicnetwork-doublezero/Dockerfile" \
  "${CERC_CONTAINER_BASE_DIR}/laconicnetwork-doublezero"
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
#!/usr/bin/env bash
set -euo pipefail

# -----------------------------------------------------------------------
# Launch doublezerod in the foreground.
#
# Optional environment:
#   DOUBLEZERO_RPC_ENDPOINT  - Solana RPC endpoint (default: http://127.0.0.1:8899)
#   DOUBLEZERO_ENV           - DoubleZero environment (default: mainnet-beta)
#   DOUBLEZERO_EXTRA_ARGS    - additional doublezerod arguments
# -----------------------------------------------------------------------

solana_rpc="${DOUBLEZERO_RPC_ENDPOINT:-http://127.0.0.1:8899}"
dz_environment="${DOUBLEZERO_ENV:-mainnet-beta}"

# State directories used by the daemon must exist before it starts.
mkdir -p /var/lib/doublezerod /var/run/doublezerod

# Create a DoubleZero identity on first start only.
identity_dir="${HOME}/.config/doublezero"
mkdir -p "${identity_dir}"
if [[ ! -f "${identity_dir}/id.json" ]]; then
  echo "Generating DoubleZero identity..."
  doublezero keygen
fi

echo "Starting doublezerod..."
echo "Environment: ${dz_environment}"
echo "RPC endpoint: ${solana_rpc}"
echo "DZ address: $(doublezero address)"

# Split the optional extra arguments on whitespace into an array.
extra_args=()
if [[ -n "${DOUBLEZERO_EXTRA_ARGS:-}" ]]; then
  read -ra extra_args <<< "${DOUBLEZERO_EXTRA_ARGS}"
fi

# exec replaces this shell so doublezerod receives signals directly.
exec doublezerod \
  -env "${dz_environment}" \
  -solana-rpc-endpoint "${solana_rpc}" \
  "${extra_args[@]}"
|
||||
|
|
@ -1,169 +0,0 @@
|
|||
# agave stack
|
||||
|
||||
Unified Agave/Jito Solana stack supporting three modes:
|
||||
|
||||
| Mode | Compose file | Use case |
|
||||
|------|-------------|----------|
|
||||
| `test` | `docker-compose-agave-test.yml` | Local dev with instant finality |
|
||||
| `rpc` | `docker-compose-agave-rpc.yml` | Non-voting mainnet/testnet RPC node |
|
||||
| `validator` | `docker-compose-agave.yml` | Voting validator |
|
||||
|
||||
## Build
|
||||
|
||||
```bash
|
||||
# Vanilla Agave v3.1.9
|
||||
laconic-so --stack agave build-containers
|
||||
|
||||
# Jito v3.1.8
|
||||
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
|
||||
AGAVE_VERSION=v3.1.8-jito \
|
||||
laconic-so --stack agave build-containers
|
||||
```
|
||||
|
||||
Build compiles from source (~30-60 min on first build).
|
||||
|
||||
## Deploy
|
||||
|
||||
```bash
|
||||
# Test validator (dev)
|
||||
laconic-so --stack agave deploy init --output spec.yml
|
||||
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-test
|
||||
laconic-so deployment --dir my-test start
|
||||
|
||||
# Mainnet RPC (e.g. biscayne)
|
||||
# After `deploy init`, edit spec.yml to set AGAVE_MODE=rpc, VALIDATOR_ENTRYPOINT, KNOWN_VALIDATOR, etc. before running `deploy create`
|
||||
laconic-so --stack agave deploy init --output spec.yml
|
||||
laconic-so --stack agave deploy create --spec-file spec.yml --deployment-dir my-rpc
|
||||
laconic-so deployment --dir my-rpc start
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
Mode is selected via `AGAVE_MODE` environment variable (`test`, `rpc`, or `validator`).
|
||||
|
||||
### RPC mode required env
|
||||
- `VALIDATOR_ENTRYPOINT` - cluster entrypoint (e.g. `entrypoint.mainnet-beta.solana.com:8001`)
|
||||
- `KNOWN_VALIDATOR` - known validator pubkey
|
||||
|
||||
### Validator mode required env
|
||||
- `VALIDATOR_ENTRYPOINT` - cluster entrypoint
|
||||
- `KNOWN_VALIDATOR` - known validator pubkey
|
||||
- Identity and vote account keypairs mounted at `/data/config/`
|
||||
|
||||
### Jito (optional, any mode except test)
|
||||
Set `JITO_ENABLE=true` and provide:
|
||||
- `JITO_BLOCK_ENGINE_URL`
|
||||
- `JITO_SHRED_RECEIVER_ADDR`
|
||||
- `JITO_TIP_PAYMENT_PROGRAM`
|
||||
- `JITO_DISTRIBUTION_PROGRAM`
|
||||
- `JITO_MERKLE_ROOT_AUTHORITY`
|
||||
- `JITO_COMMISSION_BPS`
|
||||
|
||||
Image must be built from `jito-foundation/jito-solana` repo for Jito flags to work.
|
||||
|
||||
## Runtime requirements
|
||||
|
||||
The container requires the following (already set in compose files):
|
||||
|
||||
- `privileged: true` — allows `mlock()` and raw network access
|
||||
- `cap_add: IPC_LOCK` — memory page locking for account indexes and ledger mappings
|
||||
- `ulimits: memlock: -1` (unlimited) — Agave locks gigabytes of memory
|
||||
- `ulimits: nofile: 1000000` — gossip/TPU connections + memory-mapped ledger files
|
||||
- `network_mode: host` — direct host network stack for gossip, TPU, and UDP port ranges
|
||||
|
||||
Without these, Agave either refuses to start or dies under load.
|
||||
|
||||
## Container overhead
|
||||
|
||||
Containers running with `privileged: true` and `network_mode: host` add **zero
|
||||
measurable overhead** compared to bare metal. Linux containers are not VMs — there
|
||||
is no hypervisor, no emulation layer, no packet translation:
|
||||
|
||||
- **Network**: `network_mode: host` shares the host's network namespace directly.
|
||||
No virtual bridge, no NAT, no veth pair. Same kernel code path as bare metal.
|
||||
GRE tunnels (DoubleZero) and raw sockets work identically.
|
||||
- **CPU**: No hypervisor. The process runs on the same physical cores with the
|
||||
same scheduler priority as any host process.
|
||||
- **Memory**: `IPC_LOCK` + unlimited memlock means Agave can `mlock()` pages
|
||||
exactly like bare metal. No memory ballooning or overcommit.
|
||||
- **Disk I/O**: PersistentVolumes backed by hostPath mounts have identical I/O
|
||||
characteristics to direct filesystem access.
|
||||
|
||||
The only overhead is cgroup accounting (nanoseconds per syscall) and overlayfs
|
||||
for cold file opens (single-digit microseconds, zero once cached).
|
||||
|
||||
## DoubleZero
|
||||
|
||||
DoubleZero provides optimized network routing for Solana validators via GRE
|
||||
tunnels (IP protocol 47) and BGP (TCP/179) over link-local 169.254.0.0/16.
|
||||
Traffic to other DoubleZero participants is routed through private fiber
|
||||
instead of the public internet.
|
||||
|
||||
### How it works
|
||||
|
||||
`doublezerod` creates a `doublezero0` GRE tunnel interface and runs BGP
|
||||
peering through it. Routes are injected into the host routing table, so
|
||||
the validator transparently sends traffic to other DZ validators over
|
||||
the fiber backbone. IBRL mode falls back to public internet if DZ is down.
|
||||
|
||||
### Container build
|
||||
|
||||
```bash
|
||||
laconic-so --stack agave build-containers
|
||||
```
|
||||
|
||||
This builds both the `laconicnetwork/agave` and `laconicnetwork/doublezero` images.
|
||||
|
||||
### Requirements
|
||||
|
||||
- Validator identity keypair at `/data/config/validator-identity.json`
|
||||
- `privileged: true` + `NET_ADMIN` (GRE tunnel + route table manipulation)
|
||||
- `hostNetwork: true` (GRE uses IP protocol 47, not TCP/UDP — cannot be port-mapped)
|
||||
- Node registered with DoubleZero passport system
|
||||
|
||||
### Docker Compose
|
||||
|
||||
The `docker-compose-doublezero.yml` runs alongside the validator with
|
||||
`network_mode: host`, sharing the `validator-config` volume for identity access.
|
||||
|
||||
### k8s deployment
|
||||
|
||||
laconic-so does not pass `hostNetwork` through to generated k8s resources.
|
||||
DoubleZero runs as a DaemonSet defined in `deployment/k8s-manifests/doublezero-daemonset.yaml`,
|
||||
applied after `deployment start`:
|
||||
|
||||
```bash
|
||||
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
|
||||
```
|
||||
|
||||
Since validator pods also use `hostNetwork: true` (via the compose `network_mode: host`
|
||||
which maps to the pod spec in k8s), they automatically see the GRE routes
|
||||
injected by `doublezerod` into the node's routing table.
|
||||
|
||||
## Biscayne deployment (biscayne.vaasl.io)
|
||||
|
||||
Mainnet voting validator with Jito MEV and DoubleZero.
|
||||
|
||||
```bash
|
||||
# Build Jito image
|
||||
AGAVE_REPO=https://github.com/jito-foundation/jito-solana.git \
|
||||
AGAVE_VERSION=v3.1.8-jito \
|
||||
laconic-so --stack agave build-containers
|
||||
|
||||
# Create deployment from biscayne spec
|
||||
laconic-so --stack agave deploy create \
|
||||
--spec-file deployment/spec.yml \
|
||||
--deployment-dir biscayne-deployment
|
||||
|
||||
# Copy validator keypairs
|
||||
cp /path/to/validator-identity.json biscayne-deployment/data/validator-config/
|
||||
cp /path/to/vote-account-keypair.json biscayne-deployment/data/validator-config/
|
||||
|
||||
# Start validator
|
||||
laconic-so deployment --dir biscayne-deployment start
|
||||
|
||||
# Start DoubleZero (after deployment is running)
|
||||
kubectl apply -f deployment/k8s-manifests/doublezero-daemonset.yaml
|
||||
```
|
||||
|
||||
To run as a non-voting RPC node instead, set `AGAVE_MODE: rpc` in `deployment/spec.yml`.
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
version: "1.1"
|
||||
name: agave
|
||||
description: "Agave/Jito Solana validator, RPC node, or test-validator"
|
||||
containers:
|
||||
- laconicnetwork/agave
|
||||
- laconicnetwork/doublezero
|
||||
pods:
|
||||
- agave
|
||||
- doublezero
|
||||
- monitoring
|
||||
14
ansible.cfg
14
ansible.cfg
|
|
@ -1,14 +0,0 @@
|
|||
[defaults]
|
||||
inventory = inventory/
|
||||
stdout_callback = ansible.builtin.default
|
||||
result_format = yaml
|
||||
callbacks_enabled = profile_tasks
|
||||
retry_files_enabled = false
|
||||
|
||||
[privilege_escalation]
|
||||
become = true
|
||||
become_method = sudo
|
||||
|
||||
[ssh_connection]
|
||||
pipelining = true
|
||||
ssh_args = -o ForwardAgent=yes
|
||||
|
|
@ -1,38 +0,0 @@
|
|||
hostname mia-sw01
|
||||
|
||||
ip routing
|
||||
|
||||
interface Ethernet1
|
||||
no switchport
|
||||
ip address 10.0.2.1/24
|
||||
|
||||
interface Ethernet2
|
||||
no switchport
|
||||
ip address 172.16.1.189/31
|
||||
|
||||
! GRE tunnel to biscayne (simulates doublezero0)
|
||||
interface Tunnel1
|
||||
mtu 1476
|
||||
ip address 169.254.7.6/31
|
||||
tunnel mode gre
|
||||
tunnel source 10.0.2.1
|
||||
tunnel destination 10.0.2.2
|
||||
|
||||
! Inbound: route 137.239.194.65 to biscayne via GRE tunnel
|
||||
ip route 137.239.194.65/32 169.254.7.7
|
||||
|
||||
! Outbound: redirect traffic sourced from 137.239.194.65 to was-sw01 via backbone
|
||||
ip access-list VALIDATOR-OUTBOUND-ACL
|
||||
10 permit ip 137.239.194.65/32 any
|
||||
|
||||
traffic-policy VALIDATOR-OUTBOUND
|
||||
match VALIDATOR-OUTBOUND-ACL
|
||||
set nexthop 172.16.1.188
|
||||
|
||||
system-rule overriding-action redirect
|
||||
|
||||
! Apply on the GRE tunnel interface — this is what we're validating.
|
||||
! If cEOS doesn't support traffic-policy on Tunnel, test.sh has a
|
||||
! fallback that applies it on Ethernet1 instead.
|
||||
interface Tunnel1
|
||||
traffic-policy input VALIDATOR-OUTBOUND
|
||||
|
|
@ -1,377 +0,0 @@
|
|||
#!/usr/bin/env bash
|
||||
# End-to-end test for Ashburn validator relay topology.
|
||||
#
|
||||
# Prerequisites:
|
||||
# sudo containerlab deploy -t topology.yml
|
||||
#
|
||||
# Usage:
|
||||
# ./test.sh # run all tests
|
||||
# ./test.sh setup # configure containers only (skip tests)
|
||||
# ./test.sh inbound # inbound test only
|
||||
# ./test.sh outbound # outbound test only
|
||||
# ./test.sh counters # show all counters
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
P="clab-ashburn-relay"
|
||||
ASHBURN_IP="137.239.194.65"
|
||||
KIND_NODE_IP="172.20.0.2"
|
||||
BISCAYNE_BRIDGE_IP="172.20.0.1"
|
||||
|
||||
PASS=0
|
||||
FAIL=0
|
||||
SKIP=0
|
||||
|
||||
# Record a passing check: print the message ($1) and bump the PASS counter.
# The counter is updated with an arithmetic assignment instead of ((PASS++)):
# the post-increment form evaluates to the old value, so it returns exit
# status 1 when PASS is 0 and would abort the script under `set -e`.
pass() { echo " PASS: $1"; PASS=$((PASS + 1)); }
|
||||
# Record a failing check: print the message ($1) and bump the FAIL counter.
# Uses an arithmetic assignment instead of ((FAIL++)), which returns exit
# status 1 when FAIL is 0 and would abort the script under `set -e` before
# the remaining checks and the summary can run.
fail() { echo " FAIL: $1"; FAIL=$((FAIL + 1)); }
|
||||
# Record a skipped check: print the message ($1) and bump the SKIP counter.
# Uses an arithmetic assignment instead of ((SKIP++)), which returns exit
# status 1 when SKIP is 0 and would abort the script under `set -e`.
skip() { echo " SKIP: $1"; SKIP=$((SKIP + 1)); }
|
||||
|
||||
# Run a shell command string ($2) inside the lab container "$P-<node>" ($1)
# using `sudo docker exec … sh -c`. The exit status is that of docker exec.
dexec() {
  local node="$1" script="$2"
  sudo docker exec "${P}-${node}" sh -c "$script"
}
|
||||
# Same invocation as dexec but with `docker exec -d`, so the command ($2) is
# started detached inside container "$P-<node>" ($1).
dexec_d() {
  local node="$1" script="$2"
  sudo docker exec -d "${P}-${node}" sh -c "$script"
}
|
||||
# Run one EOS CLI command ($2) on switch container "$P-<node>" ($1) through
# `Cli -c`. Anything written to stderr is discarded.
eos() {
  local node="$1" cli_cmd="$2"
  sudo docker exec "${P}-${node}" Cli -c "$cli_cmd" 2>/dev/null
}
|
||||
|
||||
# ======================================================================
|
||||
# Wait for cEOS readiness
|
||||
# ======================================================================
|
||||
# Block until the EOS CLI on switch $1 answers `show version`.
#
# Polls up to `max` times with `interval` seconds between attempts and exits
# the whole script with status 1 if the switch never becomes ready.
#
# The attempt counter is advanced with an arithmetic assignment: ((i++))
# returns status 1 when i is 0, which aborted the script on the very first
# retry under `set -e`. Reported durations are attempts * interval so they
# reflect the time actually slept.
wait_eos() {
  local node="$1" max=60 interval=2 i=0
  echo "Waiting for $node EOS to boot..."
  while ! eos "$node" "show version" &>/dev/null; do
    i=$((i + 1))
    if ((i >= max)); then
      echo "ERROR: $node did not become ready in $((max * interval))s"
      exit 1
    fi
    sleep "$interval"
  done
  echo " $node ready ($((i * interval))s)"
}
|
||||
|
||||
# ======================================================================
|
||||
# Setup: configure linux containers
|
||||
# ======================================================================
|
||||
# Bring the lab into its test configuration. Every step is written to be
# re-runnable (address/route adds tolerate "already exists", iptables rules
# are checked before being added).
#
#   1. wait for both cEOS switches to answer CLI commands
#   2. internet-peer: address on eth1, host route to the Ashburn IP, tools
#   3. kind-node: address on eth1, default route via biscayne, socat
#   4. biscayne: forwarding, GRE tunnel, DNAT / fwmark / SNAT rules and the
#      policy-routing table that sends marked traffic into the tunnel
#
# The biscayne script runs under `set -e` so that a failing step stops setup
# with a non-zero status instead of being hidden behind the status of the
# last command in the script.
setup() {
  echo "=== Waiting for cEOS nodes ==="
  wait_eos was-sw01
  wait_eos mia-sw01

  echo ""
  echo "=== Configuring internet-peer ==="
  dexec internet-peer '
    ip addr add 64.92.84.82/24 dev eth1 2>/dev/null || true
    ip route add 137.239.194.65/32 via 64.92.84.81 2>/dev/null || true
  '
  # install tcpdump + socat for tests
  dexec internet-peer 'apk add -q --no-cache tcpdump socat 2>/dev/null || true'

  echo "=== Configuring kind-node ==="
  dexec kind-node '
    ip addr add 172.20.0.2/24 dev eth1 2>/dev/null || true
    ip route add default via 172.20.0.1 2>/dev/null || true
  '
  dexec kind-node 'apk add -q --no-cache socat 2>/dev/null || true'

  echo "=== Configuring biscayne ==="
  dexec biscayne '
    # Stop at the first step that fails unexpectedly; steps that are allowed
    # to fail carry an explicit "|| true" or "|| <fallback>".
    set -e

    apk add -q --no-cache iptables iproute2 tcpdump 2>/dev/null || true

    # Append an iptables rule only when an identical rule is not present yet.
    # usage: ensure_rule <table> <chain> <rule-spec...>
    ensure_rule() {
      table="$1"; chain="$2"; shift 2
      iptables -t "$table" -C "$chain" "$@" 2>/dev/null || \
        iptables -t "$table" -A "$chain" "$@"
    }

    # Enable forwarding
    sysctl -w net.ipv4.ip_forward=1 >/dev/null

    # Interfaces
    ip addr add 10.0.2.2/24 dev eth1 2>/dev/null || true
    ip addr add 172.20.0.1/24 dev eth2 2>/dev/null || true

    # GRE tunnel to mia-sw01 (simulates doublezero0)
    ip tunnel add doublezero0 mode gre local 10.0.2.2 remote 10.0.2.1 2>/dev/null || true
    ip addr add 169.254.7.7/31 dev doublezero0 2>/dev/null || true
    ip link set doublezero0 up

    # Ashburn IP on loopback (accept inbound packets)
    ip addr add 137.239.194.65/32 dev lo 2>/dev/null || true

    # --- Inbound DNAT: 137.239.194.65 → kind-node (172.20.0.2) ---
    ensure_rule nat PREROUTING -p udp -d 137.239.194.65 --dport 8001 \
      -j DNAT --to-destination 172.20.0.2:8001
    ensure_rule nat PREROUTING -p tcp -d 137.239.194.65 --dport 8001 \
      -j DNAT --to-destination 172.20.0.2:8001
    ensure_rule nat PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 \
      -j DNAT --to-destination 172.20.0.2

    # --- Outbound: fwmark + SNAT + policy routing ---
    # Mark validator traffic from kind-node
    ensure_rule mangle PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 \
      -j MARK --set-mark 100
    ensure_rule mangle PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 \
      -j MARK --set-mark 100
    ensure_rule mangle PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 \
      -j MARK --set-mark 100

    # SNAT to Ashburn IP (must be first in POSTROUTING, before any MASQUERADE),
    # hence an insert at position 1 rather than the append done by ensure_rule.
    iptables -t nat -C POSTROUTING -m mark --mark 100 \
      -j SNAT --to-source 137.239.194.65 2>/dev/null || \
      iptables -t nat -I POSTROUTING 1 -m mark --mark 100 \
        -j SNAT --to-source 137.239.194.65

    # Policy routing table
    # NOTE(review): the directory may be absent on images whose iproute2
    # keeps its defaults elsewhere; creating it is harmless when it exists.
    mkdir -p /etc/iproute2
    grep -q "^100 ashburn" /etc/iproute2/rt_tables 2>/dev/null || \
      echo "100 ashburn" >> /etc/iproute2/rt_tables
    ip rule show | grep -q "fwmark 0x64 lookup ashburn" || \
      ip rule add fwmark 100 table ashburn
    ip route replace default via 169.254.7.6 dev doublezero0 table ashburn
  '

  echo ""
  echo "=== Setup complete ==="
}
|
||||
|
||||
# ======================================================================
|
||||
# Test 1: GRE tunnel connectivity
|
||||
# ======================================================================
|
||||
# Test 1: biscayne must be able to ping the far end of the GRE tunnel
# (169.254.7.6). On failure, record a FAIL and dump tunnel state from both
# ends; the diagnostic commands are best-effort.
test_gre() {
  printf '\n%s\n' "=== Test: GRE tunnel (biscayne ↔ mia-sw01) ==="

  if dexec biscayne 'ping -c 2 -W 2 169.254.7.6' >/dev/null 2>&1; then
    pass "biscayne → mia-sw01 via GRE tunnel"
    return
  fi

  fail "GRE tunnel not working (biscayne cannot reach 169.254.7.6)"
  echo " Debugging:"
  dexec biscayne 'ip tunnel show; ip addr show doublezero0; ip route' 2>/dev/null || true
  eos mia-sw01 'show interfaces Tunnel1' 2>/dev/null || true
}
|
||||
|
||||
# ======================================================================
|
||||
# Test 2: Inbound path (internet-peer → 137.239.194.65:8001 → kind-node)
|
||||
# ======================================================================
|
||||
# Check one inbound UDP port end to end.
#   $1  destination port on the Ashburn IP
#   $2  file on kind-node that the listener writes received data to
# Starts a short-lived socat listener on kind-node, sends a marker payload
# from internet-peer to $ASHBURN_IP:<port>, then records PASS/FAIL depending
# on whether the marker arrived.
_inbound_udp_check() {
  local port="$1" capture="$2"
  local payload="INBOUND_TEST_${port}" received

  # Start UDP listener on kind-node
  dexec kind-node "rm -f ${capture}"
  dexec_d kind-node "timeout 10 socat -u UDP4-LISTEN:${port},reuseaddr OPEN:${capture},creat,trunc"
  sleep 1

  # Send test packet from internet-peer. A failed send must surface as a FAIL
  # below, not abort the whole run under `set -e` (same as test_outbound).
  dexec internet-peer "echo '${payload}' | socat - UDP4-SENDTO:${ASHBURN_IP}:${port}" || true
  sleep 2

  received=$(dexec kind-node "cat ${capture} 2>/dev/null" || true)
  if [[ "$received" == *"$payload"* ]]; then
    pass "inbound UDP to $ASHBURN_IP:$port reached kind-node"
  else
    fail "inbound UDP to $ASHBURN_IP:$port did not reach kind-node (got: '$received')"
  fi
}

# Test 2: inbound path. Traffic sent by internet-peer to the Ashburn IP must
# arrive on kind-node, both on the fixed port 8001 and on a port from the
# dynamic range (9000).
test_inbound() {
  echo ""
  echo "=== Test: Inbound path ==="
  echo " internet-peer → $ASHBURN_IP:8001 → was-sw01 → mia-sw01 → biscayne → kind-node"

  _inbound_udp_check 8001 /tmp/inbound.txt
  # Also test dynamic port range (9000)
  _inbound_udp_check 9000 /tmp/inbound9000.txt
}
|
||||
|
||||
# ======================================================================
|
||||
# Test 3: Outbound path (kind-node sport 8001 → internet-peer sees src 137.239.194.65)
|
||||
# ======================================================================
|
||||
# Test 3: outbound path. A datagram sent by kind-node from source port 8001
# (and from 9000, in the dynamic range) must show up on internet-peer with
# $ASHBURN_IP in the tcpdump line. Each case captures a single packet into a
# file on internet-peer and then searches that file for the address.
test_outbound() {
  local seen

  printf '\n%s\n' "=== Test: Outbound path ==="
  echo " kind-node:8001 → biscayne (SNAT) → doublezero0 → mia-sw01 → was-sw01 → internet-peer"

  # Case 1: source port 8001, captured on destination port 55555
  dexec internet-peer 'rm -f /tmp/outbound.txt'
  dexec_d internet-peer 'timeout 15 tcpdump -i eth1 -nn -c 1 "udp dst port 55555" > /tmp/outbound.txt 2>&1'
  sleep 2

  dexec kind-node "echo 'OUTBOUND_TEST' | socat - UDP4-SENDTO:64.92.84.82:55555,sourceport=8001" || true
  sleep 3

  seen=$(dexec internet-peer 'cat /tmp/outbound.txt 2>/dev/null' || true)
  echo " tcpdump captured: $seen"

  if ! grep -q "$ASHBURN_IP" <<<"$seen"; then
    fail "outbound from sport 8001 does not show src $ASHBURN_IP"
    # Dump the rules involved in marking, SNAT and policy routing (best-effort)
    echo " Debugging biscayne iptables:"
    dexec biscayne 'iptables -t mangle -L PREROUTING -v -n 2>/dev/null' || true
    dexec biscayne 'iptables -t nat -L POSTROUTING -v -n 2>/dev/null' || true
    dexec biscayne 'ip rule show; ip route show table ashburn 2>/dev/null' || true
  else
    pass "outbound from sport 8001 exits with src $ASHBURN_IP"
  fi

  # Case 2: source port 9000 (dynamic range), captured on destination port 55556
  dexec internet-peer 'rm -f /tmp/outbound9000.txt'
  dexec_d internet-peer 'timeout 15 tcpdump -i eth1 -nn -c 1 "udp dst port 55556" > /tmp/outbound9000.txt 2>&1'
  sleep 2

  dexec kind-node "echo 'OUTBOUND_9000' | socat - UDP4-SENDTO:64.92.84.82:55556,sourceport=9000" || true
  sleep 3

  seen=$(dexec internet-peer 'cat /tmp/outbound9000.txt 2>/dev/null' || true)
  if ! grep -q "$ASHBURN_IP" <<<"$seen"; then
    fail "outbound from sport 9000 does not show src $ASHBURN_IP"
  else
    pass "outbound from sport 9000 exits with src $ASHBURN_IP"
  fi
}
|
||||
|
||||
# ======================================================================
|
||||
# Test 4: Isolation — RPC traffic (sport 8899) should NOT be relayed
|
||||
# ======================================================================
|
||||
# Test 4: isolation. A datagram sent from source port 8899 must NOT be marked
# by the mangle/PREROUTING MARK rules on biscayne.
#
# The check sums the exact packet counters (`iptables -x`) of all MARK rules
# before and after sending one datagram from sport 8899 and compares them:
#   unchanged  -> PASS
#   increased  -> FAIL
#   unreadable -> SKIP
# Previously the function read the counters but always reported PASS.
test_isolation() {
  echo ""
  echo "=== Test: Isolation (RPC port 8899 should NOT be relayed) ==="

  # Runs inside biscayne: total packets matched by MARK rules in PREROUTING.
  local count_cmd='iptables -t mangle -L PREROUTING -v -n -x 2>/dev/null | awk "/MARK/ { s += \$1 } END { print s + 0 }"'
  local before after mangle_out

  before=$(dexec biscayne "$count_cmd" || true)

  # Send from sport 8899 (RPC — should not match mangle rules)
  dexec kind-node "echo 'RPC_TEST' | socat - UDP4-SENDTO:64.92.84.82:55557,sourceport=8899" 2>/dev/null || true
  sleep 1

  after=$(dexec biscayne "$count_cmd" || true)

  # Show the rules for the operator; an empty listing must not abort the run
  # under `set -o pipefail`.
  mangle_out=$(dexec biscayne 'iptables -t mangle -L PREROUTING -v -n 2>/dev/null' || true)
  echo " mangle PREROUTING rules (verify sport 8899 not matched):"
  echo "$mangle_out" | grep -E "MARK|pkts" | head -5 || true

  if [[ -z "$before" || -z "$after" ]]; then
    skip "could not read mangle MARK counters on biscayne"
  elif [[ "$before" == "$after" ]]; then
    pass "RPC port 8899 not marked (MARK packet count unchanged at $after)"
  else
    fail "RPC port 8899 traffic was marked (MARK packet count $before -> $after)"
  fi
}
|
||||
|
||||
# ======================================================================
|
||||
# Test 5: Traffic-policy on Tunnel interface (answers open question #1/#3)
|
||||
# ======================================================================
|
||||
# Test 5: is traffic-policy VALIDATOR-OUTBOUND attached to Tunnel1 on
# mia-sw01? If the show output mentions the policy, record a PASS. Otherwise
# record a SKIP, move the policy from Tunnel1 to Ethernet1 and print whether
# that fallback is now visible on Ethernet1 (informational only, no counter).
test_tunnel_policy() {
  local policy_view fallback_cfg

  printf '\n%s\n' "=== Test: traffic-policy on mia-sw01 Tunnel1 ==="

  policy_view=$(eos mia-sw01 "show traffic-policy interface Tunnel1" 2>/dev/null || true)
  if grep -qi "VALIDATOR-OUTBOUND" <<<"$policy_view"; then
    pass "traffic-policy VALIDATOR-OUTBOUND applied on Tunnel1"
    return
  fi

  skip "traffic-policy on Tunnel1 may not be supported on cEOS"
  echo " Output: $policy_view"
  echo ""
  echo " Attempting fallback: apply on Ethernet1 instead..."

  # One CLI command per line: detach from Tunnel1, attach to Ethernet1.
  fallback_cfg=$'configure\ninterface Tunnel1\nno traffic-policy input VALIDATOR-OUTBOUND\ninterface Ethernet1\ntraffic-policy input VALIDATOR-OUTBOUND\n'
  eos mia-sw01 "$fallback_cfg" 2>/dev/null || true

  policy_view=$(eos mia-sw01 "show traffic-policy interface Ethernet1" 2>/dev/null || true)
  if grep -qi "VALIDATOR-OUTBOUND" <<<"$policy_view"; then
    echo " Fallback: traffic-policy applied on Ethernet1 (GRE decapsulates before policy)"
  else
    echo " Fallback also failed. Check mia-sw01 config manually."
  fi
}
|
||||
|
||||
# ======================================================================
|
||||
# Counters
|
||||
# ======================================================================
|
||||
# Print diagnostic state: traffic-policy counters from both switches, then
# the nat table, the mangle PREROUTING chain and the policy-routing rules and
# "ashburn" table on biscayne. Every query is best-effort; a switch that
# cannot answer prints "(not available on cEOS)" instead.
show_counters() {
  local switch

  printf '\n%s\n' "=== Traffic-policy counters ==="

  for switch in was-sw01 mia-sw01; do
    echo "--- $switch ---"
    eos "$switch" "show traffic-policy counters" 2>/dev/null || echo "(not available on cEOS)"
  done

  printf '\n%s\n' "--- biscayne iptables nat ---"
  dexec biscayne 'iptables -t nat -L -v -n 2>/dev/null' || true

  printf '\n%s\n' "--- biscayne iptables mangle ---"
  dexec biscayne 'iptables -t mangle -L PREROUTING -v -n 2>/dev/null' || true

  printf '\n%s\n' "--- biscayne policy routing ---"
  dexec biscayne 'ip rule show 2>/dev/null' || true
  dexec biscayne 'ip route show table ashburn 2>/dev/null' || true
}
|
||||
|
||||
# ======================================================================
|
||||
# Main
|
||||
# ======================================================================
|
||||
# Entry point. $1 selects what to run (default: all):
#   setup     configure the containers only
#   inbound   GRE check + inbound test
#   outbound  outbound test
#   counters  print diagnostic counters
#   all       setup, every test, then counters
#
# Every mode that runs tests ends with the PASS/FAIL/SKIP summary and exits 1
# when at least one check failed. Previously only `all` did this, so
# `inbound` and `outbound` exited 0 even when their checks failed.
main() {
  local mode="${1:-all}"

  case "$mode" in
    setup)
      setup
      return
      ;;
    counters)
      show_counters
      return
      ;;
    inbound)
      test_gre
      test_inbound
      ;;
    outbound)
      test_outbound
      ;;
    all)
      setup
      test_gre
      test_tunnel_policy
      test_inbound
      test_outbound
      test_isolation
      show_counters
      ;;
    *)
      echo "Usage: $0 [setup|inbound|outbound|counters|all]"
      exit 1
      ;;
  esac

  # Summary for the test-running modes
  echo ""
  echo "==============================="
  echo "Results: $PASS passed, $FAIL failed, $SKIP skipped"
  echo "==============================="
  if ((FAIL > 0)); then
    exit 1
  fi
}
|
||||
|
||||
main "$@"
|
||||
|
|
@ -1,43 +0,0 @@
|
|||
name: ashburn-relay
|
||||
topology:
|
||||
kinds:
|
||||
ceos:
|
||||
image: ceos:4.34.0F
|
||||
linux:
|
||||
image: alpine:3.20
|
||||
|
||||
nodes:
|
||||
# Ashburn switch — inbound traffic-policy + Loopback101 for 137.239.194.65
|
||||
was-sw01:
|
||||
kind: ceos
|
||||
startup-config: was-sw01-startup.cfg
|
||||
|
||||
# Miami switch — outbound traffic-policy + GRE tunnel to biscayne
|
||||
mia-sw01:
|
||||
kind: ceos
|
||||
startup-config: mia-sw01-startup.cfg
|
||||
|
||||
# Biscayne host — iptables DNAT/SNAT, fwmark, policy routing, GRE
|
||||
biscayne:
|
||||
kind: linux
|
||||
|
||||
# Simulates kind node (172.20.0.2) running the validator
|
||||
kind-node:
|
||||
kind: linux
|
||||
|
||||
# Simulates an internet peer sending/receiving validator traffic
|
||||
internet-peer:
|
||||
kind: linux
|
||||
|
||||
links:
|
||||
# was-sw01 Et1 (uplink) <-> internet-peer
|
||||
- endpoints: ["was-sw01:et1", "internet-peer:eth1"]
|
||||
|
||||
# was-sw01 Et2 <-> mia-sw01 Et2 (backbone, 172.16.1.188/31)
|
||||
- endpoints: ["was-sw01:et2", "mia-sw01:et2"]
|
||||
|
||||
# mia-sw01 Et1 <-> biscayne (GRE underlay, 10.0.2.0/24)
|
||||
- endpoints: ["mia-sw01:et1", "biscayne:eth1"]
|
||||
|
||||
# biscayne <-> kind-node (Docker bridge simulation, 172.20.0.0/24)
|
||||
- endpoints: ["biscayne:eth2", "kind-node:eth1"]
|
||||
|
|
@ -1,26 +0,0 @@
|
|||
hostname was-sw01
|
||||
|
||||
ip routing
|
||||
|
||||
interface Loopback101
|
||||
ip address 137.239.194.65/32
|
||||
|
||||
interface Ethernet1
|
||||
no switchport
|
||||
ip address 64.92.84.81/24
|
||||
traffic-policy input VALIDATOR-RELAY
|
||||
|
||||
interface Ethernet2
|
||||
no switchport
|
||||
ip address 172.16.1.188/31
|
||||
|
||||
ip access-list VALIDATOR-RELAY-ACL
|
||||
10 permit udp any any eq 8001
|
||||
20 permit udp any any range 9000 9025
|
||||
30 permit tcp any any eq 8001
|
||||
|
||||
traffic-policy VALIDATOR-RELAY
|
||||
match VALIDATOR-RELAY-ACL
|
||||
set nexthop 172.16.1.189
|
||||
|
||||
system-rule overriding-action redirect
|
||||
|
|
@ -0,0 +1,113 @@
|
|||
# Contributing
|
||||
|
||||
Thank you for taking the time to make a contribution to Stack Orchestrator.
|
||||
|
||||
## Install (developer mode)
|
||||
|
||||
Suitable for developers either modifying or debugging the orchestrator Python code:
|
||||
|
||||
### Prerequisites
|
||||
|
||||
In addition to the pre-requisites listed in the [README](/README.md), the following are required:
|
||||
|
||||
1. Python venv package
|
||||
This may or may not be already installed depending on the host OS and version. Check by running:
|
||||
```
|
||||
$ python3 -m venv
|
||||
usage: venv [-h] [--system-site-packages] [--symlinks | --copies] [--clear] [--upgrade] [--without-pip] [--prompt PROMPT] ENV_DIR [ENV_DIR ...]
|
||||
venv: error: the following arguments are required: ENV_DIR
|
||||
```
|
||||
If the venv package is missing you should see a message indicating how to install it, for example with:
|
||||
```
|
||||
$ apt install python3.10-venv
|
||||
```
|
||||
|
||||
### Install
|
||||
|
||||
1. Clone this repository:
|
||||
```
|
||||
$ git clone https://git.vdb.to/cerc-io/stack-orchestrator.git
|
||||
```
|
||||
|
||||
2. Enter the project directory:
|
||||
```
|
||||
$ cd stack-orchestrator
|
||||
```
|
||||
|
||||
3. (This and the next step can be done by running `source ./scripts/developer-mode-setup.sh`)
|
||||
|
||||
Create and activate a venv:
|
||||
```
|
||||
$ python3 -m venv venv
|
||||
$ source ./venv/bin/activate
|
||||
(venv) $
|
||||
```
|
||||
|
||||
4. Install the cli in edit mode:
|
||||
```
|
||||
$ pip install --editable .
|
||||
```
|
||||
|
||||
5. Verify installation:
|
||||
```
|
||||
(venv) $ laconic-so
|
||||
Usage: laconic-so [OPTIONS] COMMAND [ARGS]...
|
||||
|
||||
Laconic Stack Orchestrator
|
||||
|
||||
Options:
|
||||
--quiet
|
||||
--verbose
|
||||
--dry-run
|
||||
-h, --help Show this message and exit.
|
||||
|
||||
Commands:
|
||||
build-containers build the set of containers required for a complete...
|
||||
deploy-system deploy a stack
|
||||
setup-repositories git clone the set of repositories required to build...
|
||||
```
|
||||
|
||||
## Build a zipapp (single file distributable script)
|
||||
|
||||
Use shiv to build a single file Python executable zip archive of laconic-so:
|
||||
|
||||
1. Install [shiv](https://github.com/linkedin/shiv):
|
||||
```
|
||||
$ (venv) pip install shiv
|
||||
$ (venv) pip install wheel
|
||||
```
|
||||
|
||||
2. Run shiv to create a zipapp file:
|
||||
```
|
||||
$ (venv) shiv -c laconic-so -o laconic-so .
|
||||
```
|
||||
This creates a file `./laconic-so` that is executable outside of any venv, on other machines, OSes and architectures, and requires only the system Python 3.
|
||||
|
||||
3. Verify it works:
|
||||
```
|
||||
$ cp stack-orchestrator/laconic-so ~/bin
|
||||
$ laconic-so
|
||||
Usage: laconic-so [OPTIONS] COMMAND [ARGS]...
|
||||
|
||||
Laconic Stack Orchestrator
|
||||
|
||||
Options:
|
||||
--stack TEXT specify a stack to build/deploy
|
||||
--quiet
|
||||
--verbose
|
||||
--dry-run
|
||||
--local-stack
|
||||
--debug
|
||||
--continue-on-error
|
||||
-h, --help Show this message and exit.
|
||||
|
||||
Commands:
|
||||
build-containers build the set of containers required for a complete...
|
||||
build-npms build the set of npm packages required for a...
|
||||
deploy deploy a stack
|
||||
deploy-system deploy a stack
|
||||
setup-repositories git clone the set of repositories required to build...
|
||||
version print tool version
|
||||
```
|
||||
|
||||
For cutting releases, use the [shiv build script](/scripts/build_shiv_package.sh).
|
||||
|
|
@ -0,0 +1,8 @@
|
|||
# Stack Orchestrator
|
||||
|
||||
Here you will find information about the design of stack orchestrator, contributing to it, and deploying services/applications that combine two or more "stacks".
|
||||
|
||||
Most "stacks" contain their own README which has plenty of information on deploying, but stacks can be combined in a variety of ways which are documented here, for example:
|
||||
|
||||
- [Gitea with Laconicd Fixturenet](./gitea-with-laconicd-fixturenet.md)
|
||||
- [Laconicd Registry with Console](./laconicd-with-console.md)
|
||||
|
|
@ -0,0 +1,71 @@
|
|||
# Adding a new stack
|
||||
|
||||
See [this PR](https://git.vdb.to/cerc-io/stack-orchestrator/pull/434) for an example of how to currently add a minimal stack to stack orchestrator. The [reth stack](https://git.vdb.to/cerc-io/stack-orchestrator/pull/435) is another good example.
|
||||
|
||||
For external developers, we recommend forking this repo and adding your stack directly to your fork. This initially requires running in "developer mode" as described [here](/docs/CONTRIBUTING.md). Check out the [Namada stack](https://github.com/vknowable/stack-orchestrator/blob/main/app/data/stacks/public-namada/digitalocean_quickstart.md) from Knowable to see how that is done.
|
||||
|
||||
Core to the feature completeness of stack orchestrator is to [decouple the tool functionality from payload](https://git.vdb.to/cerc-io/stack-orchestrator/issues/315) which will no longer require forking to add a stack.
|
||||
|
||||
## Example
|
||||
|
||||
- in `stack_orchestrator/data/stacks/my-new-stack/stack.yml` add:
|
||||
|
||||
```yaml
|
||||
version: "0.1"
|
||||
name: my-new-stack
|
||||
repos:
|
||||
- github.com/my-org/my-new-stack
|
||||
containers:
|
||||
- cerc/my-new-stack
|
||||
pods:
|
||||
- my-new-stack
|
||||
```
|
||||
|
||||
- in `stack_orchestrator/data/container-build/cerc-my-new-stack/build.sh` add:
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
# Build the my-new-stack image
|
||||
source ${CERC_CONTAINER_BASE_DIR}/build-base.sh
|
||||
docker build -t cerc/my-new-stack:local -f ${CERC_REPO_BASE_DIR}/my-new-stack/Dockerfile ${build_command_args} ${CERC_REPO_BASE_DIR}/my-new-stack
|
||||
```
|
||||
|
||||
- in `stack_orchestrator/data/compose/docker-compose-my-new-stack.yml` add:
|
||||
|
||||
```yaml
|
||||
version: "3.2"
|
||||
|
||||
services:
|
||||
my-new-stack:
|
||||
image: cerc/my-new-stack:local
|
||||
restart: always
|
||||
ports:
|
||||
- "0.0.0.0:3000:3000"
|
||||
```
|
||||
|
||||
- in `stack_orchestrator/data/repository-list.txt` add:
|
||||
|
||||
```bash
|
||||
github.com/my-org/my-new-stack
|
||||
```
|
||||
whereby that repository contains your source code and a `Dockerfile`, and matches the `repos:` field in the `stack.yml`.
|
||||
|
||||
- in `stack_orchestrator/data/container-image-list.txt` add:
|
||||
|
||||
```bash
|
||||
cerc/my-new-stack
|
||||
```
|
||||
|
||||
- in `stack_orchestrator/data/pod-list.txt` add:
|
||||
|
||||
```bash
|
||||
my-new-stack
|
||||
```
|
||||
|
||||
Now, the following commands will fetch, build, and deploy your app:
|
||||
|
||||
```bash
|
||||
laconic-so --stack my-new-stack setup-repositories
|
||||
laconic-so --stack my-new-stack build-containers
|
||||
laconic-so --stack my-new-stack deploy-system up
|
||||
```
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
# Arista EOS Reference Notes
|
||||
|
||||
Collected from live switch CLI (`?` help) and Arista documentation search
|
||||
results. Switch platform: 7280CR3A, EOS 4.34.0F.
|
||||
|
||||
## PBR (Policy-Based Routing)
|
||||
|
||||
EOS uses `policy-map type pbr` — NOT `traffic-policy` (which is a different
|
||||
feature for ASIC-level traffic policies, not available on all platforms/modes).
|
||||
|
||||
### Syntax
|
||||
|
||||
```
|
||||
! ACL to match traffic
|
||||
ip access-list <ACL-NAME>
|
||||
10 permit <proto> <src> <dst> [ports]
|
||||
|
||||
! Class-map referencing the ACL
|
||||
class-map type pbr match-any <CLASS-NAME>
|
||||
match ip access-group <ACL-NAME>
|
||||
|
||||
! Policy-map with nexthop redirect
|
||||
policy-map type pbr <POLICY-NAME>
|
||||
class <CLASS-NAME>
|
||||
set nexthop <A.B.C.D> ! direct nexthop IP
|
||||
set nexthop recursive <A.B.C.D> ! recursive resolution
|
||||
! set nexthop-group <NAME> ! nexthop group
|
||||
! set ttl <value> ! TTL override
|
||||
|
||||
! Apply on interface
|
||||
interface <INTF>
|
||||
service-policy type pbr input <POLICY-NAME>
|
||||
```
|
||||
|
||||
### PBR `set` options (from CLI `?`)
|
||||
|
||||
```
|
||||
set ?
|
||||
nexthop Next hop IP address for forwarding
|
||||
nexthop-group next hop group name
|
||||
ttl TTL effective with nexthop/nexthop-group
|
||||
```
|
||||
|
||||
```
|
||||
set nexthop ?
|
||||
A.B.C.D next hop IP address
|
||||
A:B:C:D:E:F:G:H next hop IPv6 address
|
||||
recursive Enable Recursive Next hop resolution
|
||||
```
|
||||
|
||||
**No VRF qualifier on `set nexthop`.** The nexthop must be reachable in the
|
||||
VRF where the policy is applied. For cross-VRF PBR, use a static inter-VRF
|
||||
route to make the nexthop reachable (see below).
|
||||
|
||||
## Static Inter-VRF Routes
|
||||
|
||||
Source: [EOS 4.34.0F - Static Inter-VRF Route](https://www.arista.com/en/um-eos/eos-static-inter-vrf-route)
|
||||
|
||||
Allows configuring a static route in one VRF with a nexthop evaluated in a
|
||||
different VRF. Uses the `egress-vrf` keyword.
|
||||
|
||||
### Syntax
|
||||
|
||||
```
|
||||
ip route vrf <ingress-vrf> <prefix>/<mask> egress-vrf <egress-vrf> <nexthop-ip>
|
||||
ip route vrf <ingress-vrf> <prefix>/<mask> egress-vrf <egress-vrf> <interface>
|
||||
```
|
||||
|
||||
### Examples (from Arista docs)
|
||||
|
||||
```
|
||||
! Route in vrf1 with nexthop resolved in default VRF
|
||||
ip route vrf vrf1 1.0.1.0/24 egress-vrf default 1.0.0.2
|
||||
|
||||
! show ip route vrf vrf1 output:
|
||||
! S 1.0.1.0/24 [1/0] via 1.0.0.2, Vlan2180 (egress VRF default)
|
||||
```
|
||||
|
||||
### Key points
|
||||
|
||||
- For bidirectional traffic, static inter-VRF routes must be configured in
|
||||
both VRFs.
|
||||
- ECMP next-hop sets across same or heterogeneous egress VRFs are supported.
|
||||
- The `show ip route vrf` output displays the egress VRF name when it differs
|
||||
from the source VRF.
|
||||
|
||||
## Inter-VRF Local Route Leaking
|
||||
|
||||
Source: [EOS 4.35.1F - Inter-VRF Local Route Leaking](https://www.arista.com/en/um-eos/eos-inter-vrf-local-route-leaking)
|
||||
|
||||
An alternative to static inter-VRF routes that leaks routes dynamically from
|
||||
one VRF (source) to another VRF (destination) on the same router.
|
||||
|
||||
## Config Sessions
|
||||
|
||||
```
|
||||
configure session <name> ! enter named session
|
||||
show session-config diffs ! MUST be run from inside the session
|
||||
commit timer HH:MM:SS ! commit with auto-revert timer
|
||||
abort ! discard session
|
||||
```
|
||||
|
||||
From enable mode:
|
||||
```
|
||||
configure session <name> commit ! finalize a pending session
|
||||
```
|
||||
|
||||
## Checkpoints and Rollback
|
||||
|
||||
```
|
||||
configure checkpoint save <name>
|
||||
rollback running-config checkpoint <name>
|
||||
write memory
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,181 +0,0 @@
|
|||
<!-- Source: https://www.arista.com/um-eos/eos-ingress-and-egress-per-port-for-ipv4-and-ipv6-counters -->
|
||||
<!-- Scraped: 2026-03-06T20:50:41.080Z -->
|
||||
|
||||
# Ingress and Egress Per-Port for IPv4 and IPv6 Counters
|
||||
|
||||
|
||||
This feature supports per-interface ingress and egress packet and byte counters for IPv4
|
||||
and IPv6.
|
||||
|
||||
|
||||
This section describes Ingress and Egress per-port for IPv4 and IPv6 counters, including
|
||||
configuration instructions and command descriptions.
|
||||
|
||||
|
||||
Topics covered by this chapter include:
|
||||
|
||||
|
||||
- Configuration
|
||||
|
||||
- Show commands
|
||||
|
||||
- Dedicated ARP Entry for TX IPv4 and IPv6 Counters
|
||||
|
||||
- Considerations
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
|
||||
IPv4 and IPv6 ingress counters (count **bridged and routed**
|
||||
traffic, supported only on front-panel ports) can be enabled and disabled using the
|
||||
**hardware counter feature ip in**
|
||||
command:
|
||||
|
||||
|
||||
```
|
||||
`**[no] hardware counter feature ip in**`
|
||||
```
|
||||
|
||||
|
||||
For IPv4 and IPv6 ingress and egress counters that include only
|
||||
**routed** traffic (supported on Layer3 interfaces such as
|
||||
routed ports and L3 subinterfaces only), use the following commands:
|
||||
|
||||
|
||||
Note: The DCS-7300X, DCS-7250X, DCS-7050X, and DCS-7060X platforms
|
||||
do not require configuration for IPv4 and IPv6 packet counters for only routed
|
||||
traffic. They are collected by default. Other platforms (DCS-7280SR, DCS-7280CR, and
|
||||
DCS-7500-R) need the feature enabled.
|
||||
|
||||
|
||||
```
|
||||
`**[no] hardware counter feature ip in layer3**`
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
`**[no] hardware counter feature ip out layer3**`
|
||||
```
|
||||
|
||||
|
||||
### hardware counter feature ip
|
||||
|
||||
|
||||
Use the **hardware counter feature ip** command to enable ingress
|
||||
and egress counters at Layer 3. The **no** and **default** forms of the command
|
||||
disable the feature. The feature is enabled by default.
|
||||
|
||||
|
||||
**Command Mode**
|
||||
|
||||
|
||||
Configuration mode
|
||||
|
||||
|
||||
**Command Syntax**
|
||||
|
||||
|
||||
**hardware counter feature ip in|out layer3**
|
||||
|
||||
|
||||
**no hardware counter feature ip in|out layer3**
|
||||
|
||||
|
||||
**default hardware counter feature ip in|out layer3**
|
||||
|
||||
|
||||
**Example**
|
||||
|
||||
|
||||
This example enables ingress and egress ip counters for Layer 3.
|
||||
```
|
||||
`**switch(config)# hardware counter feature ip in layer3**`
|
||||
```
|
||||
|
||||
|
||||
```
|
||||
`**switch(config)# hardware counter feature ip out layer3**`
|
||||
```
|
||||
|
||||
|
||||
## Show commands
|
||||
|
||||
|
||||
Use the [**show interfaces counters ip**](/um-eos/eos-ethernet-ports#xzx_RbdvgrfI6B) command to
|
||||
display IPv4, IPv6 packets, and octets.
|
||||
|
||||
|
||||
**Example**
|
||||
|
||||
|
||||
```
|
||||
`switch# **show interfaces counters ip**
|
||||
Interface IPv4InOctets IPv4InPkts IPv6InOctets IPv6InPkts
|
||||
Et1/1 0 0 0 0
|
||||
Et1/2 0 0 0 0
|
||||
Et1/3 0 0 0 0
|
||||
Et1/4 0 0 0 0
|
||||
...
|
||||
Interface IPv4OutOctets IPv4OutPkts IPv6OutOctets IPv6OutPkts
|
||||
Et1/1 0 0 0 0
|
||||
Et1/2 0 0 0 0
|
||||
Et1/3 0 0 0 0
|
||||
Et1/4 0 0 0 0
|
||||
...`
|
||||
```
|
||||
|
||||
|
||||
You can also query the output from the **show interfaces counters
|
||||
ip** command through snmp via the ARISTA-IP-MIB.
|
||||
|
||||
|
||||
To clear the IPv4 or IPv6 counters, use the [**clear
|
||||
counters**](/um-eos/eos-ethernet-ports#topic_dnd_1nm_vnb) command.
|
||||
|
||||
|
||||
**Example**
|
||||
```
|
||||
`switch# **clear counters**`
|
||||
```
|
||||
|
||||
|
||||
## Dedicated ARP Entry for TX IPv4 and IPv6 Counters
|
||||
|
||||
|
||||
IPv4/IPv6 egress Layer 3 (**hardware counter feature ip out layer3**)
|
||||
counting on DCS-7280SR, DCS-7280CR, and DCS-7500-R platforms work based on ARP entry of
|
||||
the next hop. By default, IPv4's next-hop and IPv6's next-hop resolve to the same MAC
|
||||
address and interface that shared the ARP entry.
|
||||
|
||||
|
||||
To differentiate the counters between IPv4 and IPv6, disable
|
||||
**arp** entry sharing with the following command:
|
||||
|
||||
|
||||
```
|
||||
`**ip hardware fib next-hop arp dedicated**`
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
Note: This command is required for IPv4 and IPv6 egress counters
|
||||
to operate on the DCS-7280SR, DCS-7280CR, and DCS-7500-R platforms.
|
||||
|
||||
|
||||
|
||||
|
||||
## Considerations
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
- Packet sizes greater than 9236 bytes are not counted by per-port IPv4 and IPv6 counters.
|
||||
|
||||
- Only the DCS-7260X3, DCS-7368, DCS-7300, DCS-7050SX3, DCS-7050CX3, DCS-7280SR,
|
||||
DCS-7280CR and DCS-7500-R platforms support the **hardware counter feature ip in** command.
|
||||
|
||||
- Only the DCS-7280SR, DCS-7280CR and DCS-7500-R platforms support the **hardware counter feature ip [in|out] layer3** command.
|
||||
|
|
@ -1,305 +0,0 @@
|
|||
<!-- Source: https://www.arista.com/en/um-eos/eos-inter-vrf-local-route-leaking -->
|
||||
<!-- Scraped: 2026-03-06T20:43:28.363Z -->
|
||||
|
||||
# Inter-VRF Local Route Leaking
|
||||
|
||||
|
||||
Inter-VRF local route leaking allows the leaking of routes from one VRF (the source VRF) to
|
||||
another VRF (the destination VRF) on the same router.
|
||||
Inter-VRF routes can exist in any VRF (including the
|
||||
default VRF) on the system. Routes can be leaked using the
|
||||
following methods:
|
||||
|
||||
- Inter-VRF Local Route Leaking using BGP
|
||||
VPN
|
||||
|
||||
- Inter-VRF Local Route Leaking using VRF-leak
|
||||
Agent
|
||||
|
||||
|
||||
## Inter-VRF Local Route Leaking using BGP VPN
|
||||
|
||||
|
||||
Inter-VRF local route leaking allows the user to export and import routes from one VRF to another
|
||||
on the same device. This is implemented by exporting routes from a VRF to the local VPN table
|
||||
using the route target extended community list and importing the same route target extended
|
||||
community lists from the local VPN table into the target VRF. VRF route leaking is supported
|
||||
on VPN-IPv4, VPN-IPv6, and EVPN types.
|
||||
|
||||
|
||||
Figure 1. Inter-VRF Local Route Leaking using Local VPN Table
|
||||
|
||||
|
||||
### Accessing Shared Resources Across VPNs
|
||||
|
||||
|
||||
To access shared resources across VPNs, all the routes from the shared services VRF must be
|
||||
leaked into each of the VPN VRFs, and customer routes must be leaked into the shared
|
||||
services VRF for return traffic. Accessing shared resources allows the route target of the
|
||||
shared services VRF to be exported into all customer VRFs, and allows the shared services
|
||||
VRF to import route targets from customers A and B. The following figure shows how to
|
||||
provide customers, corresponding to multiple VPN domains, access to services like DHCP
|
||||
available in the shared VRF.
|
||||
|
||||
|
||||
Route leaking across the VRFs is supported
|
||||
on VPN-IPv4, VPN-IPv6, and EVPN.
|
||||
|
||||
|
||||
Figure 2. Accessing Shared Resources Across VPNs
|
||||
|
||||
|
||||
### Configuring Inter-VRF Local Route Leaking
|
||||
|
||||
|
||||
Inter-VRF local route leaking is configured using VPN-IPv4, VPN-IPv6, and EVPN. Prefixes can be
|
||||
exported and imported using any of the configured VPN types. Ensure that the same VPN
|
||||
type that is exported is used while importing.
|
||||
|
||||
|
||||
Leaking unicast IPv4 or IPv6 prefixes is supported and achieved by exporting prefixes locally to
|
||||
the VPN table and importing locally from the VPN table into the target VRF on the same
|
||||
device as shown in the figure titled **Inter-VRF Local Route Leaking using Local VPN
|
||||
Table** using the **route-target** command.
|
||||
|
||||
|
||||
Exporting or importing the routes to or from the EVPN table is accomplished with the following
|
||||
two methods:
|
||||
|
||||
- Using VXLAN for encapsulation
|
||||
|
||||
- Using MPLS for encapsulation
|
||||
|
||||
|
||||
#### Using VXLAN for Encapsulation
|
||||
|
||||
|
||||
To use the VXLAN encapsulation type, make sure that the VRF-to-VNI mapping is present and the interface
|
||||
status for the VXLAN interface is up. This is the default encapsulation type for
|
||||
EVPN.
|
||||
|
||||
|
||||
**Example**
|
||||
|
||||
|
||||
The configuration for VXLAN encapsulation type is as
|
||||
follows:
|
||||
```
|
||||
`switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **address-family evpn**
|
||||
switch(config-router-bgp-af)# **neighbor default encapsulation VXLAN next-hop-self source-interface Loopback0**
|
||||
switch(config)# **hardware tcam**
|
||||
switch(config-hw-tcam)# **system profile VXLAN-routing**
|
||||
switch(config-hw-tcam)# **interface VXLAN1**
|
||||
switch(config-hw-tcam-if-Vx1)# **VXLAN source-interface Loopback0**
|
||||
switch(config-hw-tcam-if-Vx1)# **VXLAN udp-port 4789**
|
||||
switch(config-hw-tcam-if-Vx1)# **VXLAN vrf vrf-blue vni 20001**
|
||||
switch(config-hw-tcam-if-Vx1)# **VXLAN vrf vrf-red vni 10001**`
|
||||
```
|
||||
|
||||
|
||||
#### Using MPLS for Encapsulation
|
||||
|
||||
|
||||
To use the MPLS encapsulation type to export
|
||||
to the EVPN table, MPLS needs to be enabled globally on the device and
|
||||
the encapsulation method needs to be changed from the default type,
|
||||
VXLAN, to MPLS under the EVPN address-family sub-mode.
|
||||
|
||||
|
||||
**Example**
|
||||
```
|
||||
`switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **address-family evpn**
|
||||
switch(config-router-bgp-af)# **neighbor default encapsulation mpls next-hop-self source-interface Loopback0**`
|
||||
```
|
||||
|
||||
|
||||
### Route-Distinguisher
|
||||
|
||||
|
||||
Route-Distinguisher (RD) uniquely identifies routes from a particular VRF.
|
||||
Route-Distinguisher is configured for every VRF that routes are exported from or
|
||||
imported into.
|
||||
|
||||
|
||||
The following commands are used to configure Route-Distinguisher for a VRF.
|
||||
|
||||
|
||||
```
|
||||
`switch(config-router-bgp)# **vrf vrf-services**
|
||||
switch(config-router-bgp-vrf-vrf-services)# **rd 1.0.0.1:1**
|
||||
|
||||
switch(config-router-bgp)# **vrf vrf-blue**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **rd 2.0.0.1:2**`
|
||||
```
|
||||
|
||||
|
||||
### Exporting Routes from a VRF
|
||||
|
||||
|
||||
Use the **route-target export** command to export routes from a VRF to the
|
||||
local VPN or EVPN table using the route target
|
||||
extended community list.
|
||||
|
||||
|
||||
**Examples**
|
||||
|
||||
- These commands export routes from
|
||||
**vrf-red** to the local VPN
|
||||
table.
|
||||
```
|
||||
`switch(config)# **service routing protocols model multi-agent**
|
||||
switch(config)# **mpls ip**
|
||||
switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-red**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv4 10:10**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv6 10:20**`
|
||||
```
|
||||
|
||||
- These commands export routes from
|
||||
**vrf-red** to the EVPN
|
||||
table.
|
||||
```
|
||||
`switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-red**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export evpn 10:1**`
|
||||
```
|
||||
|
||||
|
||||
### Importing Routes into a VRF
|
||||
|
||||
|
||||
Use the **route-target import** command to import the exported routes from
|
||||
the local VPN or EVPN table to the target VRF
|
||||
using the route target extended community
|
||||
list.
|
||||
|
||||
|
||||
**Examples**
|
||||
|
||||
- These commands import routes from the VPN
|
||||
table to
|
||||
**vrf-blue**.
|
||||
```
|
||||
`switch(config)# **service routing protocols model multi-agent**
|
||||
switch(config)# **mpls ip**
|
||||
switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-blue**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **rd 2:2**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv4 10:10**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv6 10:20**`
|
||||
```
|
||||
|
||||
- These commands import routes from the EVPN
|
||||
table to
|
||||
**vrf-blue**.
|
||||
```
|
||||
`switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-blue**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **rd 2:2**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import evpn 10:1**`
|
||||
```
|
||||
|
||||
|
||||
### Exporting and Importing Routes using Route
|
||||
Map
|
||||
|
||||
|
||||
To manage VRF route leaking, control the export and import prefixes with route-map export or
|
||||
import commands. The route map is effective only if the VRF or the VPN
|
||||
paths are already candidates for export or import. The route-target
|
||||
export or import command must be configured first. Setting BGP
|
||||
attributes using route maps is effective only on the export end.
|
||||
|
||||
|
||||
Note: Prefixes that are leaked are not re-exported to the VPN table from the target VRF.
|
||||
|
||||
**Examples**
|
||||
|
||||
- These commands export routes from
|
||||
**vrf-red** to the local VPN
|
||||
table.
|
||||
```
|
||||
`switch(config)# **service routing protocols model multi-agent**
|
||||
switch(config)# **mpls ip**
|
||||
switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-red**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv4 10:10**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv6 10:20**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv4 route-map EXPORT_V4_ROUTES_T0_VPN_TABLE**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export vpn-ipv6 route-map EXPORT_V6_ROUTES_T0_VPN_TABLE**`
|
||||
```
|
||||
|
||||
- These commands export routes from
|
||||
**vrf-red** to the EVPN
|
||||
table.
|
||||
```
|
||||
`switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-red**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **rd 1:1**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export evpn 10:1**
|
||||
switch(config-router-bgp-vrf-vrf-red)# **route-target export evpn route-map EXPORT_ROUTES_T0_EVPN_TABLE**`
|
||||
```
|
||||
|
||||
- These commands import routes from the VPN table to
|
||||
**vrf-blue**.
|
||||
```
|
||||
`switch(config)# **service routing protocols model multi-agent**
|
||||
switch(config)# **mpls ip**
|
||||
switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-blue**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **rd 1:1**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv4 10:10**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv6 10:20**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv4 route-map IMPORT_V4_ROUTES_VPN_TABLE**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import vpn-ipv6 route-map IMPORT_V6_ROUTES_VPN_TABLE**`
|
||||
```
|
||||
|
||||
- These commands import routes from the EVPN table to
|
||||
**vrf-blue**.
|
||||
```
|
||||
`switch(config)# **router bgp 65001**
|
||||
switch(config-router-bgp)# **vrf vrf-blue**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **rd 2:2**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import evpn 10:1**
|
||||
switch(config-router-bgp-vrf-vrf-blue)# **route-target import evpn route-map IMPORT_ROUTES_FROM_EVPN_TABLE**`
|
||||
```
|
||||
|
||||
|
||||
## Inter-VRF Local Route Leaking using VRF-leak
|
||||
Agent
|
||||
|
||||
|
||||
Inter-VRF local route leaking allows routes to leak from one VRF to another using a route
|
||||
map as a VRF-leak agent. VRFs are leaked based on the preferences assigned to each
|
||||
VRF.
|
||||
|
||||
|
||||
### Configuring Route Maps
|
||||
|
||||
|
||||
To leak routes from one VRF to another using a route map, use the [router general](/um-eos/eos-evpn-and-vcs-commands#xx1351777) command to enter Router-General
|
||||
Configuration Mode, then enter the VRF submode for the destination VRF, and use the
|
||||
[leak routes](/um-eos/eos-evpn-and-vcs-commands#reference_g2h_2z3_hwb) command to specify the source
|
||||
VRF and the route map to be used. Routes in the source VRF that match the policy in the
|
||||
route map will then be considered for leaking into the configuration-mode VRF. If two or
|
||||
more policies specify leaking the same prefix to the same destination VRF, the route
|
||||
with a higher (post-set-clause) distance and preference is chosen.
|
||||
|
||||
|
||||
**Example**
|
||||
|
||||
|
||||
These commands configure a route map to leak routes from **VRF1**
|
||||
to **VRF2** using route map
|
||||
**RM1**.
|
||||
```
|
||||
`switch(config)# **router general**
|
||||
switch(config-router-general)# **vrf VRF2**
|
||||
switch(config-router-general-vrf-VRF2)# **leak routes source-vrf VRF1 subscribe-policy RM1**
|
||||
switch(config-router-general-vrf-VRF2)#`
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -1,82 +0,0 @@
|
|||
<!-- Source: https://www.arista.com/en/um-eos/eos-static-inter-vrf-route -->
|
||||
<!-- Scraped: 2026-03-06T20:43:17.977Z -->
|
||||
|
||||
# Static Inter-VRF Route
|
||||
|
||||
|
||||
The Static Inter-VRF Route feature adds support for static inter-VRF routes. This enables the configuration of routes to destinations in one ingress VRF with an ability to specify a next-hop in a different egress VRF through a static configuration.
|
||||
|
||||
|
||||
You can configure static inter-VRF routes in default and non-default VRFs. A different
|
||||
egress VRF is achieved by “tagging” the **next-hop** or **forwarding
|
||||
via** with a reference to an egress VRF (different from the source
|
||||
VRF) in which that next-hop should be evaluated. Static inter-VRF routes
|
||||
with ECMP next-hop sets in the same egress VRF or heterogeneous egress VRFs
|
||||
can be specified.
|
||||
|
||||
|
||||
The Static Inter-VRF Route feature is independent and complementary to other mechanisms that can be used to set up local inter-VRF routes. The other supported mechanisms in EOS and the broader use-cases they support are documented here:
|
||||
|
||||
- [Inter-VRF Local Route Leaking using BGP VPN](/um-eos/eos-inter-vrf-local-route-leaking#xx1348142)
|
||||
|
||||
- [Inter-VRF Local Route Leaking using VRF-leak Agent](/um-eos/eos-inter-vrf-local-route-leaking#xx1346287)
|
||||
|
||||
|
||||
## Configuration
|
||||
|
||||
|
||||
The configuration to set up static inter-VRF routes in an ingress (source) VRF to forward IP traffic to a different egress (target) VRF can be done in the following modes:
|
||||
|
||||
- This command creates a static route in one ingress VRF that points to a next-hop
|
||||
in a different egress VRF.
|
||||
ip | ipv6
|
||||
route [vrf
|
||||
vrf-name]
|
||||
destination-prefix [egress-vrf
|
||||
egress-next-hop-vrf-name]
|
||||
next-hop
|
||||
|
||||
|
||||
## Show Commands
|
||||
|
||||
|
||||
Use the **show ip route vrf** command to display the egress VRF name if it
|
||||
differs from the source VRF.
|
||||
|
||||
|
||||
**Example**
|
||||
```
|
||||
`switch# **show ip route vrf vrf1**
|
||||
|
||||
VRF: vrf1
|
||||
Codes: C - connected, S - static, K - kernel,
|
||||
O - OSPF, IA - OSPF inter area, E1 - OSPF external type 1,
|
||||
E2 - OSPF external type 2, N1 - OSPF NSSA external type 1,
|
||||
N2 - OSPF NSSA external type2, B - BGP, B I - iBGP, B E - eBGP,
|
||||
R - RIP, I L1 - IS-IS level 1, I L2 - IS-IS level 2,
|
||||
O3 - OSPFv3, A B - BGP Aggregate, A O - OSPF Summary,
|
||||
NG - Nexthop Group Static Route, V - VXLAN Control Service,
|
||||
DH - DHCP client installed default route, M - Martian,
|
||||
DP - Dynamic Policy Route, L - VRF Leaked
|
||||
|
||||
Gateway of last resort is not set
|
||||
|
||||
S 1.0.1.0/24 [1/0] via 1.0.0.2, Vlan2180 (egress VRF default)
|
||||
S 1.0.7.0/24 [1/0] via 1.0.6.2, Vlan2507 (egress VRF vrf3)`
|
||||
```
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
## Limitations
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
- For bidirectional traffic to work correctly between a pair of VRFs, static inter-VRF
|
||||
routes in both VRFs must be configured.
|
||||
|
||||
- Static Inter-VRF routing is supported only in multi-agent routing protocol mode.
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,168 +0,0 @@
|
|||
# Ashburn Relay / ip_echo Port Reachability Checklist
|
||||
|
||||
The validator exits when it can't verify UDP ports (8001, 9000, 9002, 9003) are
|
||||
reachable from entrypoint servers. The ip_echo protocol: validator TCP-connects
|
||||
to entrypoint on port 8001, entrypoint sees source IP, sends UDP probes back to
|
||||
that IP on the validator's ports. If probes don't arrive, validator crashes.
|
||||
|
||||
## Layer 1: Biscayne outbound path
|
||||
|
||||
Validator's outbound ip_echo TCP (dport 8001) must exit via GRE tunnel so
|
||||
entrypoints see `137.239.194.65`, not biscayne's real IP via Docker MASQUERADE.
|
||||
|
||||
```
|
||||
[ ] 1.1 Mangle rules (4 rules in mangle PREROUTING):
|
||||
- udp sport 8001 (gossip outbound)
|
||||
- udp sport 9000:9025 (TVU/repair outbound)
|
||||
- tcp sport 8001 (gossip TCP outbound)
|
||||
- tcp dport 8001 (ip_echo outbound — THE CRITICAL ONE)
|
||||
|
||||
[ ] 1.2 SNAT rule at position 1 (before Docker MASQUERADE):
|
||||
POSTROUTING -m mark --mark 100 -j SNAT --to-source 137.239.194.65
|
||||
|
||||
[ ] 1.3 Policy routing rule:
|
||||
fwmark 0x64 lookup ashburn
|
||||
|
||||
[ ] 1.4 Ashburn routing table default route:
|
||||
default via 169.254.100.0 dev gre-ashburn
|
||||
|
||||
[ ] 1.5 Mangle counters incrementing (pkts/bytes on tcp dport 8001 rule)
|
||||
```
|
||||
|
||||
## Layer 2: GRE tunnel (biscayne ↔ mia-sw01)
|
||||
|
||||
```
|
||||
[ ] 2.1 Tunnel exists and UP:
|
||||
gre-ashburn with 169.254.100.1/31
|
||||
|
||||
[ ] 2.2 Tunnel peer reachable:
|
||||
ping 169.254.100.0
|
||||
|
||||
[ ] 2.3 Ashburn IP on loopback:
|
||||
137.239.194.65/32 dev lo
|
||||
```
|
||||
|
||||
## Layer 3: Biscayne inbound path (DNAT + DOCKER-USER)
|
||||
|
||||
Entrypoint UDP probes arrive at `137.239.194.65` and must reach kind node
|
||||
`172.20.0.2`.
|
||||
|
||||
```
|
||||
[ ] 3.1 DNAT rules at position 1 in nat PREROUTING
|
||||
(before Docker's ADDRTYPE LOCAL rule):
|
||||
- udp dport 8001 → 172.20.0.2:8001
|
||||
- tcp dport 8001 → 172.20.0.2:8001
|
||||
- udp dport 9000:9025 → 172.20.0.2
|
||||
|
||||
[ ] 3.2 DOCKER-USER ACCEPT rules (3 rules):
|
||||
- udp dport 8001 → ACCEPT
|
||||
- tcp dport 8001 → ACCEPT
|
||||
- udp dport 9000:9025 → ACCEPT
|
||||
|
||||
[ ] 3.3 DNAT counters incrementing
|
||||
```
|
||||
|
||||
## Layer 4: mia-sw01
|
||||
|
||||
```
|
||||
[ ] 4.1 Tunnel100 UP in VRF relay
|
||||
src 209.42.167.137, dst 186.233.184.235, link 169.254.100.0/31
|
||||
|
||||
[ ] 4.2 VRF relay default route:
|
||||
0.0.0.0/0 egress-vrf default 172.16.1.188
|
||||
|
||||
[ ] 4.3 Default VRF route to relay IP:
|
||||
137.239.194.65/32 egress-vrf relay 169.254.100.1
|
||||
|
||||
[ ] 4.4 ACL SEC-VALIDATOR-100-IN permits all needed traffic
|
||||
|
||||
[ ] 4.5 Backbone Et4/1 UP (172.16.1.189/31)
|
||||
```
|
||||
|
||||
## Layer 5: was-sw01
|
||||
|
||||
```
|
||||
[ ] 5.1 Static route: 137.239.194.65/32 via 172.16.1.189
|
||||
|
||||
[ ] 5.2 Backbone Et4/1 UP (172.16.1.188/31)
|
||||
|
||||
[ ] 5.3 No Loopback101 (removed to avoid absorbing traffic locally)
|
||||
```
|
||||
|
||||
## Layer 6: Persistence
|
||||
|
||||
```
|
||||
[ ] 6.1 ashburn-relay.service enabled and active (runs After=docker.service)
|
||||
|
||||
[ ] 6.2 /usr/local/sbin/ashburn-relay-setup.sh exists
|
||||
```
|
||||
|
||||
## Layer 7: End-to-end tests
|
||||
|
||||
All tests run via Ansible playbooks. The test scripts in `scripts/` are
|
||||
utilities invoked by the playbooks — never run them manually via SSH.
|
||||
|
||||
```
|
||||
[ ] 7.1 relay-test-tcp-dport.py (via ashburn-relay-check.yml or ad-hoc play)
|
||||
Tests: outbound tcp dport 8001 mangle → SNAT → tunnel
|
||||
Pass: entrypoint sees 137.239.194.65
|
||||
Fail: entrypoint sees 186.233.184.235 (Docker MASQUERADE)
|
||||
|
||||
[ ] 7.2 relay-test-ip-echo.py (via ashburn-relay-check.yml or ad-hoc play)
|
||||
Tests: FULL END-TO-END (outbound SNAT + inbound DNAT + DOCKER-USER)
|
||||
Pass: UDP probe received from entrypoint
|
||||
Fail: no UDP probes — inbound path broken
|
||||
|
||||
[ ] 7.3 relay-inbound-udp-test.yml (cross-inventory: biscayne + kelce)
|
||||
Tests: inbound UDP from external host → DNAT → kind node
|
||||
Pass: UDP arrives in kind netns
|
||||
```
|
||||
|
||||
## Playbooks
|
||||
|
||||
```bash
|
||||
# Read-only check of all relay state (biscayne + both switches):
|
||||
ansible-playbook -i inventory-switches/switches.yml \
|
||||
-i inventory/biscayne.yml playbooks/ashburn-relay-check.yml
|
||||
|
||||
# Apply all biscayne relay rules (idempotent):
|
||||
ansible-playbook -i inventory/biscayne.yml playbooks/ashburn-relay-biscayne.yml
|
||||
|
||||
# Apply outbound only (the ip_echo fix):
|
||||
ansible-playbook -i inventory/biscayne.yml \
|
||||
playbooks/ashburn-relay-biscayne.yml -t outbound
|
||||
|
||||
# Apply inbound only (DNAT + DOCKER-USER):
|
||||
ansible-playbook -i inventory/biscayne.yml \
|
||||
playbooks/ashburn-relay-biscayne.yml -t inbound
|
||||
|
||||
# Apply mia-sw01 config:
|
||||
ansible-playbook -i inventory-switches/switches.yml \
|
||||
playbooks/ashburn-relay-mia-sw01.yml
|
||||
|
||||
# Apply was-sw01 config:
|
||||
ansible-playbook -i inventory-switches/switches.yml \
|
||||
playbooks/ashburn-relay-was-sw01.yml
|
||||
|
||||
# Cross-inventory inbound UDP test (biscayne + kelce):
|
||||
ansible-playbook -i inventory/biscayne.yml -i inventory/kelce.yml \
|
||||
playbooks/relay-inbound-udp-test.yml
|
||||
```
|
||||
|
||||
## Historical root causes
|
||||
|
||||
1. **TCP dport 8001 mangle rule missing** — ip_echo TCP exits via Docker
|
||||
MASQUERADE, entrypoint sees wrong IP, UDP probes go to wrong address.
|
||||
|
||||
2. **DOCKER-USER ACCEPT rules missing** — DNAT'd traffic hits Docker's FORWARD
|
||||
DROP policy, never reaches kind node.
|
||||
|
||||
3. **DNAT rule position wrong** — Docker's `ADDRTYPE LOCAL` rule in PREROUTING
|
||||
catches traffic to loopback IPs before our DNAT rules. Must use `-I
|
||||
PREROUTING 1`.
|
||||
|
||||
4. **mia-sw01 egress-vrf route with interface specified** — silently fails in
|
||||
EOS (accepted in config, never installed in RIB). Must use nexthop-only form.
|
||||
|
||||
5. **was-sw01 Loopback101 absorbing traffic** — local delivery instead of
|
||||
forwarding to mia-sw01 via backbone.
|
||||
|
|
@ -1,275 +0,0 @@
|
|||
# Ashburn Validator Relay — Full Traffic Redirect
|
||||
|
||||
## Overview
|
||||
|
||||
All validator traffic (gossip, repair, TVU, TPU) enters and exits from
|
||||
`137.239.194.65` (laconic-was-sw01, Ashburn). Peers see the validator as an
|
||||
Ashburn node. This improves repair peer count and slot catchup rate by reducing
|
||||
RTT to the TeraSwitch/Pittsburgh cluster from ~30ms (direct Miami) to ~5ms
|
||||
(Ashburn).
|
||||
|
||||
Supersedes the previous TVU-only shred relay (see `tvu-shred-relay.md`).
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
OUTBOUND (validator → peers)
|
||||
agave-validator (kind pod, ports 8001, 9000-9025)
|
||||
↓ Docker bridge → host FORWARD chain
|
||||
biscayne host (186.233.184.235)
|
||||
↓ mangle PREROUTING: fwmark 100 on sport 8001,9000-9025 from 172.20.0.0/16
|
||||
↓ nat POSTROUTING: SNAT → src 137.239.194.65
|
||||
↓ policy route: fwmark 100 → table ashburn → via 169.254.7.6 dev doublezero0
|
||||
laconic-mia-sw01 (209.42.167.133, Miami)
|
||||
↓ traffic-policy VALIDATOR-OUTBOUND: src 137.239.194.65 → nexthop 172.16.1.188
|
||||
↓ backbone Et4/1 (25.4ms)
|
||||
laconic-was-sw01 Et4/1 (Ashburn)
|
||||
↓ default route via 64.92.84.80 out Et1/1
|
||||
Internet (peers see src 137.239.194.65)
|
||||
|
||||
INBOUND (peers → validator)
|
||||
Solana peers → 137.239.194.65:8001,9000-9025
|
||||
↓ internet routing to was-sw01
|
||||
laconic-was-sw01 Et1/1 (Ashburn)
|
||||
↓ traffic-policy VALIDATOR-RELAY: ASIC redirect, line rate
|
||||
↓ nexthop 172.16.1.189 via Et4/1 backbone (25.4ms)
|
||||
laconic-mia-sw01 Et4/1 (Miami)
|
||||
↓ L3 forward → biscayne via doublezero0 GRE or ISP routing
|
||||
biscayne (186.233.184.235)
|
||||
↓ nat PREROUTING: DNAT dst 137.239.194.65:* → 172.20.0.2:* (kind node)
|
||||
↓ Docker bridge → validator pod
|
||||
agave-validator
|
||||
```
|
||||
|
||||
RPC traffic (port 8899) is NOT relayed — clients connect directly to biscayne.
|
||||
|
||||
## Switch Config: laconic-was-sw01
|
||||
|
||||
SSH: `install@137.239.200.198`
|
||||
|
||||
### Pre-change
|
||||
|
||||
```
|
||||
configure checkpoint save pre-validator-relay
|
||||
```
|
||||
|
||||
Rollback: `rollback running-config checkpoint pre-validator-relay` then `write memory`.
|
||||
|
||||
### Config session with auto-revert
|
||||
|
||||
```
|
||||
configure session validator-relay
|
||||
|
||||
! Loopback for 137.239.194.65 (do NOT touch Loopback100 which has .64)
|
||||
interface Loopback101
|
||||
ip address 137.239.194.65/32
|
||||
|
||||
! ACL covering all validator ports
|
||||
ip access-list VALIDATOR-RELAY-ACL
|
||||
10 permit udp any any eq 8001
|
||||
20 permit udp any any range 9000 9025
|
||||
30 permit tcp any any eq 8001
|
||||
|
||||
! Traffic-policy: ASIC redirect to backbone (mia-sw01)
|
||||
traffic-policy VALIDATOR-RELAY
|
||||
match VALIDATOR-RELAY-ACL
|
||||
set nexthop 172.16.1.189
|
||||
|
||||
! Replace old SHRED-RELAY on Et1/1
|
||||
interface Ethernet1/1
|
||||
no traffic-policy input SHRED-RELAY
|
||||
traffic-policy input VALIDATOR-RELAY
|
||||
|
||||
! system-rule overriding-action redirect (already present from SHRED-RELAY)
|
||||
|
||||
show session-config diffs
|
||||
commit timer 00:05:00
|
||||
```
|
||||
|
||||
After verification: `configure session validator-relay commit` then `write memory`.
|
||||
|
||||
### Cleanup (after stable)
|
||||
|
||||
Old SHRED-RELAY policy and ACL can be removed once VALIDATOR-RELAY is confirmed:
|
||||
|
||||
```
|
||||
configure session cleanup-shred-relay
|
||||
no traffic-policy SHRED-RELAY
|
||||
no ip access-list SHRED-RELAY-ACL
|
||||
show session-config diffs
|
||||
commit
|
||||
write memory
|
||||
```
|
||||
|
||||
## Switch Config: laconic-mia-sw01
|
||||
|
||||
### Pre-flight checks
|
||||
|
||||
Before applying config, verify:
|
||||
|
||||
1. Which EOS interface terminates the doublezero0 GRE from biscayne
|
||||
(endpoint 209.42.167.133). Check with `show interfaces tunnel` or
|
||||
`show ip interface brief | include Tunnel`.
|
||||
|
||||
2. Whether `system-rule overriding-action redirect` is already configured.
|
||||
Check with `show running-config | include system-rule`.
|
||||
|
||||
3. Whether EOS traffic-policy works on tunnel interfaces. If not, apply on
|
||||
the physical interface where GRE packets arrive (likely Et<X> facing
|
||||
biscayne's ISP network or the DZ infrastructure).
|
||||
|
||||
### Config session
|
||||
|
||||
```
|
||||
configure checkpoint save pre-validator-outbound
|
||||
|
||||
configure session validator-outbound
|
||||
|
||||
! ACL matching outbound validator traffic (source = Ashburn IP)
|
||||
ip access-list VALIDATOR-OUTBOUND-ACL
|
||||
10 permit ip 137.239.194.65/32 any
|
||||
|
||||
! Redirect to was-sw01 via backbone
|
||||
traffic-policy VALIDATOR-OUTBOUND
|
||||
match VALIDATOR-OUTBOUND-ACL
|
||||
set nexthop 172.16.1.188
|
||||
|
||||
! Apply on the interface where biscayne GRE traffic arrives
|
||||
! Replace Tunnel<X> with the actual interface from pre-flight check #1
|
||||
interface Tunnel<X>
|
||||
traffic-policy input VALIDATOR-OUTBOUND
|
||||
|
||||
! Add system-rule if not already present (pre-flight check #2)
|
||||
system-rule overriding-action redirect
|
||||
|
||||
show session-config diffs
|
||||
commit timer 00:05:00
|
||||
```
|
||||
|
||||
After verification: commit + `write memory`.
|
||||
|
||||
## Host Config: biscayne
|
||||
|
||||
Automated via ansible playbook `playbooks/ashburn-validator-relay.yml`.
|
||||
|
||||
### Manual equivalent
|
||||
|
||||
```bash
|
||||
# 1. Accept packets destined for 137.239.194.65
|
||||
sudo ip addr add 137.239.194.65/32 dev lo
|
||||
|
||||
# 2. Inbound DNAT to kind node (172.20.0.2)
|
||||
sudo iptables -t nat -A PREROUTING -p udp -d 137.239.194.65 --dport 8001 \
|
||||
-j DNAT --to-destination 172.20.0.2:8001
|
||||
sudo iptables -t nat -A PREROUTING -p tcp -d 137.239.194.65 --dport 8001 \
|
||||
-j DNAT --to-destination 172.20.0.2:8001
|
||||
sudo iptables -t nat -A PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 \
|
||||
-j DNAT --to-destination 172.20.0.2
|
||||
|
||||
# 3. Outbound: mark validator traffic
|
||||
sudo iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 \
|
||||
-j MARK --set-mark 100
|
||||
sudo iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 \
|
||||
-j MARK --set-mark 100
|
||||
sudo iptables -t mangle -A PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 \
|
||||
-j MARK --set-mark 100
|
||||
|
||||
# 4. Outbound: SNAT to Ashburn IP (INSERT before Docker MASQUERADE)
|
||||
sudo iptables -t nat -I POSTROUTING 1 -m mark --mark 100 \
|
||||
-j SNAT --to-source 137.239.194.65
|
||||
|
||||
# 5. Policy routing table
|
||||
echo "100 ashburn" | sudo tee -a /etc/iproute2/rt_tables
|
||||
sudo ip rule add fwmark 100 table ashburn
|
||||
sudo ip route add default via 169.254.7.6 dev doublezero0 table ashburn
|
||||
|
||||
# 6. Persist
|
||||
sudo netfilter-persistent save
|
||||
# ip rule + ip route persist via /etc/network/if-up.d/ashburn-routing
|
||||
```
|
||||
|
||||
### Docker NAT port preservation
|
||||
|
||||
**Must verify before going live:** Docker masquerade must preserve source ports
|
||||
for kind's hostNetwork pods. If Docker rewrites the source port, the mangle
|
||||
PREROUTING match on `--sport 8001,9000-9025` will miss traffic.
|
||||
|
||||
Test: `tcpdump -i br-cf46a62ab5b2 -nn 'udp src port 8001'` — if you see
|
||||
packets with sport 8001 from 172.20.0.2, port preservation works.
|
||||
|
||||
If Docker does NOT preserve ports, the mark must be set inside the kind node
|
||||
container (on the pod's veth) rather than on the host.
|
||||
|
||||
## Execution Order
|
||||
|
||||
1. **was-sw01**: checkpoint → config session with 5min auto-revert → verify counters → commit
|
||||
2. **biscayne**: add 137.239.194.65/32 to lo, add inbound DNAT rules
|
||||
3. **Verify inbound**: `ping 137.239.194.65` from external host, check DNAT counters
|
||||
4. **mia-sw01**: pre-flight checks → config session with 5min auto-revert → commit
|
||||
5. **biscayne**: add outbound fwmark + policy routing + SNAT rules
|
||||
6. **Test outbound**: from biscayne, send UDP from port 8001, verify src 137.239.194.65 on was-sw01
|
||||
7. **Verify**: traffic-policy counters on both switches, iptables hit counts on biscayne
|
||||
8. **Restart validator** if needed (gossip should auto-refresh, but restart ensures clean state)
|
||||
9. **was-sw01 + mia-sw01**: `write memory` to persist
|
||||
10. **Cleanup**: remove old SHRED-RELAY and 64.92.84.81:20000 DNAT after stable
|
||||
|
||||
## Verification
|
||||
|
||||
1. `show traffic-policy counters` on was-sw01 — VALIDATOR-RELAY-ACL matches
|
||||
2. `show traffic-policy counters` on mia-sw01 — VALIDATOR-OUTBOUND-ACL matches
|
||||
3. `sudo iptables -t nat -L -v -n` on biscayne — DNAT and SNAT hit counts
|
||||
4. `sudo iptables -t mangle -L -v -n` on biscayne — fwmark hit counts
|
||||
5. `ip rule show` on biscayne — fwmark 100 lookup ashburn
|
||||
6. Validator gossip ContactInfo shows 137.239.194.65 for ALL addresses (gossip, repair, TVU, TPU)
|
||||
7. Repair peer count increases (target: 20+ peers)
|
||||
8. Slot catchup rate improves from ~0.9 toward ~2.5 slots/sec
|
||||
9. `traceroute --sport=8001 <remote_peer>` from biscayne routes via doublezero0/was-sw01
|
||||
|
||||
## Rollback
|
||||
|
||||
### biscayne
|
||||
|
||||
```bash
|
||||
sudo ip addr del 137.239.194.65/32 dev lo
|
||||
sudo iptables -t nat -D PREROUTING -p udp -d 137.239.194.65 --dport 8001 -j DNAT --to-destination 172.20.0.2:8001
|
||||
sudo iptables -t nat -D PREROUTING -p tcp -d 137.239.194.65 --dport 8001 -j DNAT --to-destination 172.20.0.2:8001
|
||||
sudo iptables -t nat -D PREROUTING -p udp -d 137.239.194.65 --dport 9000:9025 -j DNAT --to-destination 172.20.0.2
|
||||
sudo iptables -t mangle -D PREROUTING -s 172.20.0.0/16 -p udp --sport 8001 -j MARK --set-mark 100
|
||||
sudo iptables -t mangle -D PREROUTING -s 172.20.0.0/16 -p udp --sport 9000:9025 -j MARK --set-mark 100
|
||||
sudo iptables -t mangle -D PREROUTING -s 172.20.0.0/16 -p tcp --sport 8001 -j MARK --set-mark 100
|
||||
sudo iptables -t nat -D POSTROUTING -m mark --mark 100 -j SNAT --to-source 137.239.194.65
|
||||
sudo ip rule del fwmark 100 table ashburn
|
||||
sudo ip route del default table ashburn
|
||||
sudo netfilter-persistent save
|
||||
```
|
||||
|
||||
### was-sw01
|
||||
|
||||
```
|
||||
rollback running-config checkpoint pre-validator-relay
|
||||
write memory
|
||||
```
|
||||
|
||||
### mia-sw01
|
||||
|
||||
```
|
||||
rollback running-config checkpoint pre-validator-outbound
|
||||
write memory
|
||||
```
|
||||
|
||||
## Key Details
|
||||
|
||||
| Item | Value |
|
||||
|------|-------|
|
||||
| Ashburn relay IP | `137.239.194.65` (Loopback101 on was-sw01) |
|
||||
| Ashburn LAN block | `137.239.194.64/29` on was-sw01 Et1/1 |
|
||||
| Biscayne IP | `186.233.184.235` |
|
||||
| Kind node IP | `172.20.0.2` (Docker bridge br-cf46a62ab5b2) |
|
||||
| Validator ports | 8001 (gossip), 9000-9025 (TVU/repair/TPU) |
|
||||
| Excluded ports | 8899 (RPC), 8900 (WebSocket) — direct to biscayne |
|
||||
| GRE tunnel | doublezero0: 169.254.7.7 ↔ 169.254.7.6, remote 209.42.167.133 |
|
||||
| Backbone | was-sw01 Et4/1 172.16.1.188/31 ↔ mia-sw01 Et4/1 172.16.1.189/31 |
|
||||
| Policy routing table | 100 ashburn |
|
||||
| Fwmark | 100 |
|
||||
| was-sw01 SSH | `install@137.239.200.198` |
|
||||
| EOS version | 4.34.0F |
|
||||
|
|
@ -1,416 +0,0 @@
|
|||
# Blue-Green Upgrades for Biscayne
|
||||
|
||||
Zero-downtime upgrade procedures for the agave-stack deployment on biscayne.
|
||||
Uses ZFS clones for instant data duplication, Caddy health-check routing for
|
||||
traffic shifting, and k8s native sidecars for independent container upgrades.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
Caddy ingress (biscayne.vaasl.io)
|
||||
├── upstream A: localhost:8899 ← health: /health
|
||||
└── upstream B: localhost:8897 ← health: /health
|
||||
│
|
||||
┌─────────────────┴──────────────────┐
|
||||
│ kind cluster │
|
||||
│ │
|
||||
│ Deployment A Deployment B │
|
||||
│ ┌─────────────┐ ┌─────────────┐ │
|
||||
│ │ agave :8899 │ │ agave :8897 │ │
|
||||
│ │ doublezerod │ │ doublezerod │ │
|
||||
│ └──────┬──────┘ └──────┬──────┘ │
|
||||
└─────────┼─────────────────┼─────────┘
|
||||
│ │
|
||||
ZFS dataset A ZFS clone B
|
||||
(original) (instant CoW copy)
|
||||
```
|
||||
|
||||
Both deployments run in the same kind cluster with `hostNetwork: true`.
|
||||
Caddy active health checks route traffic to whichever deployment has a
|
||||
healthy `/health` endpoint.
|
||||
|
||||
## Storage Layout
|
||||
|
||||
| Data | Path | Type | Survives restart? |
|
||||
|------|------|------|-------------------|
|
||||
| Ledger | `/srv/solana/ledger` | ZFS zvol (xfs) | Yes |
|
||||
| Snapshots | `/srv/solana/snapshots` | ZFS zvol (xfs) | Yes |
|
||||
| Accounts | `/srv/solana/ramdisk/accounts` | `/dev/ram0` (xfs) | Until host reboot |
|
||||
| Validator config | `/srv/deployments/agave/data/validator-config` | ZFS | Yes |
|
||||
| DZ config | `/srv/deployments/agave/data/doublezero-config` | ZFS | Yes |
|
||||
|
||||
The ZFS zvol `biscayne/DATA/volumes/solana` backs `/srv/solana` (ledger, snapshots).
|
||||
The ramdisk at `/dev/ram0` holds accounts — it's a block device, not tmpfs, so it
|
||||
survives process restarts but not host reboots.
|
||||
|
||||
---
|
||||
|
||||
## Procedure 1: DoubleZero Binary Upgrade (zero downtime, single pod)
|
||||
|
||||
The GRE tunnel (`doublezero0`) and BGP routes live in kernel space. They persist
|
||||
across doublezerod process restarts. Upgrading the DZ binary does not require
|
||||
tearing down the tunnel or restarting the validator.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- doublezerod is defined as a k8s native sidecar (`spec.initContainers` with
|
||||
`restartPolicy: Always`). See [Required Changes](#required-changes) below.
|
||||
- k8s 1.29+ (biscayne runs 1.35.1)
|
||||
|
||||
### Steps
|
||||
|
||||
1. Build or pull the new doublezero container image.
|
||||
|
||||
2. Patch the pod's sidecar image:
|
||||
```bash
|
||||
kubectl -n <ns> patch pod <pod> --type='json' -p='[
|
||||
{"op": "replace", "path": "/spec/initContainers/0/image",
|
||||
"value": "laconicnetwork/doublezero:new-version"}
|
||||
]'
|
||||
```
|
||||
|
||||
3. Only the doublezerod container restarts. The agave container is unaffected.
|
||||
The GRE tunnel interface and BGP routes remain in the kernel throughout.
|
||||
|
||||
4. Verify:
|
||||
```bash
|
||||
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero --version
|
||||
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero status
|
||||
ip route | grep doublezero0 # routes still present
|
||||
```
|
||||
|
||||
### Rollback
|
||||
|
||||
Patch the image back to the previous version. Same process, same zero downtime.
|
||||
|
||||
---
|
||||
|
||||
## Procedure 2: Agave Version Upgrade (zero RPC downtime, blue-green)
|
||||
|
||||
Agave is the main container and must be restarted for a version change. To maintain
|
||||
zero RPC downtime, we run two deployments simultaneously and let Caddy shift traffic
|
||||
based on health checks.
|
||||
|
||||
### Prerequisites
|
||||
|
||||
- Caddy ingress configured with dual upstreams and active health checks
|
||||
- A parameterized spec.yml that accepts alternate ports and volume paths
|
||||
- ZFS snapshot/clone scripts
|
||||
|
||||
### Steps
|
||||
|
||||
#### Phase 1: Prepare (no downtime, no risk)
|
||||
|
||||
1. **ZFS snapshot** for rollback safety:
|
||||
```bash
|
||||
zfs snapshot -r biscayne/DATA@pre-upgrade-$(date +%Y%m%d)
|
||||
```
|
||||
|
||||
2. **ZFS clone** the validator volumes:
|
||||
```bash
|
||||
zfs clone biscayne/DATA/volumes/solana@pre-upgrade-$(date +%Y%m%d) \
|
||||
biscayne/DATA/volumes/solana-blue
|
||||
```
|
||||
This is instant (copy-on-write). No additional storage until writes diverge.
|
||||
|
||||
3. **Clone the ramdisk accounts** (not on ZFS):
|
||||
```bash
|
||||
mkdir -p /srv/solana-blue/ramdisk/accounts
|
||||
cp -a /srv/solana/ramdisk/accounts/* /srv/solana-blue/ramdisk/accounts/
|
||||
```
|
||||
This is the slow step — 460GB on ramdisk. Consider `rsync` with `--inplace`
|
||||
to minimize copy time, or investigate whether the ramdisk can move to a ZFS
|
||||
dataset for instant cloning in future deployments.
|
||||
|
||||
4. **Build or pull** the new agave container image.
|
||||
|
||||
#### Phase 2: Start blue deployment (no downtime)
|
||||
|
||||
5. **Create Deployment B** in the same kind cluster, pointing at cloned volumes,
|
||||
with RPC on port 8897:
|
||||
```bash
|
||||
# Apply the blue deployment manifest (parameterized spec)
|
||||
kubectl apply -f deployment/k8s-manifests/agave-blue.yaml
|
||||
```
|
||||
|
||||
6. **Deployment B catches up.** It starts from the snapshot point and replays.
|
||||
Monitor progress:
|
||||
```bash
|
||||
kubectl -n <ns> exec <blue-pod> -c agave-validator -- \
|
||||
solana -u http://127.0.0.1:8897 slot
|
||||
```
|
||||
|
||||
7. **Validate** the new version works:
|
||||
- RPC responds: `curl -sf http://localhost:8897/health`
|
||||
- Correct version: `kubectl -n <ns> exec <blue-pod> -c agave-validator -- agave-validator --version`
|
||||
- doublezerod connected (if applicable)
|
||||
|
||||
Take as long as needed. Deployment A is still serving all traffic.
|
||||
|
||||
#### Phase 3: Traffic shift (zero downtime)
|
||||
|
||||
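8. **Caddy routes traffic to B.** Once B's `/health` returns 200, Caddy's active
|
||||
health check marks it as an available upstream. With `lb_policy first`, A (listed first) keeps receiving traffic while it stays healthy, so to shift traffic either stop A or update the
|
||||
Caddy upstream config to prefer B.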
|
||||
|
||||
9. **Verify** B is serving live traffic:
|
||||
```bash
|
||||
curl -sf https://biscayne.vaasl.io/health
|
||||
# Check Caddy access logs for requests hitting port 8897
|
||||
```
|
||||
|
||||
#### Phase 4: Cleanup
|
||||
|
||||
10. **Stop Deployment A:**
|
||||
```bash
|
||||
kubectl -n <ns> delete deployment agave-green
|
||||
```
|
||||
|
||||
11. **Reconfigure B to use standard port** (8899) if desired, or update Caddy
|
||||
to only route to 8897.
|
||||
|
||||
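12. **Clean up ZFS clone** (or keep as rollback) — only once B no longer runs from the cloned volumes, since destroying the clone removes the data B is using: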
|
||||
```bash
|
||||
zfs destroy biscayne/DATA/volumes/solana-blue
|
||||
```
|
||||
|
||||
### Rollback
|
||||
|
||||
At any point before Phase 4:
|
||||
- Deployment A is untouched and still serving traffic (or can be restarted)
|
||||
- Delete Deployment B: `kubectl -n <ns> delete deployment agave-blue`
|
||||
- Destroy the ZFS clone: `zfs destroy biscayne/DATA/volumes/solana-blue`
|
||||
|
||||
After Phase 4 (A already stopped):
|
||||
- `zfs rollback` to restore original data
|
||||
- Redeploy A with old image
|
||||
|
||||
---
|
||||
|
||||
## Required Changes to agave-stack
|
||||
|
||||
### 1. Move doublezerod to native sidecar
|
||||
|
||||
In the pod spec generation (laconic-so or compose override), doublezerod must be
|
||||
defined as a native sidecar container instead of a regular container:
|
||||
|
||||
```yaml
|
||||
spec:
|
||||
initContainers:
|
||||
- name: doublezerod
|
||||
image: laconicnetwork/doublezero:local
|
||||
restartPolicy: Always # makes it a native sidecar
|
||||
securityContext:
|
||||
privileged: true
|
||||
capabilities:
|
||||
add: [NET_ADMIN]
|
||||
env:
|
||||
- name: DOUBLEZERO_RPC_ENDPOINT
|
||||
value: https://api.mainnet-beta.solana.com
|
||||
volumeMounts:
|
||||
- name: doublezero-config
|
||||
mountPath: /root/.config/doublezero
|
||||
containers:
|
||||
- name: agave-validator
|
||||
image: laconicnetwork/agave:local
|
||||
# ... existing config
|
||||
```
|
||||
|
||||
This change means:
|
||||
- doublezerod starts before agave and stays running
|
||||
- Patching the doublezerod image restarts only that container
|
||||
- agave can be restarted independently without affecting doublezerod
|
||||
|
||||
This requires a laconic-so change to support `initContainers` with `restartPolicy`
|
||||
in compose-to-k8s translation — or a post-deployment patch.
|
||||
|
||||
### 2. Caddy dual-upstream config
|
||||
|
||||
Add health-checked upstreams for both blue and green deployments:
|
||||
|
||||
```caddyfile
|
||||
biscayne.vaasl.io {
|
||||
reverse_proxy {
|
||||
to localhost:8899 localhost:8897
|
||||
|
||||
health_uri /health
|
||||
health_interval 5s
|
||||
health_timeout 3s
|
||||
|
||||
lb_policy first
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
`lb_policy first` routes to the first healthy upstream in the order listed. While A is healthy,
|
||||
all traffic goes to :8899, even after B comes up healthy. Traffic shifts to :8897 only once A is stopped or fails its health check.
|
||||
|
||||
### 3. Parameterized deployment spec
|
||||
|
||||
Create a parameterized spec or kustomize overlay that accepts:
|
||||
- RPC port (8899 vs 8897)
|
||||
- Volume paths (original vs ZFS clone)
|
||||
- Deployment name suffix (green vs blue)
|
||||
|
||||
### 4. Delete DaemonSet workaround
|
||||
|
||||
Remove `deployment/k8s-manifests/doublezero-daemonset.yaml` from agave-stack.
|
||||
|
||||
### 5. Fix container DZ identity
|
||||
|
||||
Copy the registered identity into the container volume:
|
||||
```bash
|
||||
sudo cp /home/solana/.config/doublezero/id.json \
|
||||
/srv/deployments/agave/data/doublezero-config/id.json
|
||||
```
|
||||
|
||||
### 6. Disable host systemd doublezerod
|
||||
|
||||
After the container sidecar is working:
|
||||
```bash
|
||||
sudo systemctl stop doublezerod
|
||||
sudo systemctl disable doublezerod
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Implementation Order
|
||||
|
||||
This is a spec-driven, test-driven plan. Each step produces a testable artifact.
|
||||
|
||||
### Step 1: Fix existing DZ bugs (no code changes to laconic-so)
|
||||
|
||||
Fixes BUG-1 through BUG-5 from [doublezero-status.md](doublezero-status.md).
|
||||
|
||||
**Spec:** Container doublezerod shows correct identity, connects to laconic-mia-sw01,
|
||||
host systemd doublezerod is disabled.
|
||||
|
||||
**Test:**
|
||||
```bash
|
||||
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero address
|
||||
# assert: 3Bw6v7EruQvTwoY79h2QjQCs2KBQFzSneBdYUbcXK1Tr
|
||||
|
||||
kubectl -n <ns> exec <pod> -c doublezerod -- doublezero status
|
||||
# assert: BGP Session Up, laconic-mia-sw01
|
||||
|
||||
systemctl is-active doublezerod
|
||||
# assert: inactive
|
||||
```
|
||||
|
||||
**Changes:**
|
||||
- Copy `id.json` to container volume
|
||||
- Update `DOUBLEZERO_RPC_ENDPOINT` in spec.yml
|
||||
- Deploy with hostNetwork-enabled stack-orchestrator
|
||||
- Stop and disable host doublezerod
|
||||
- Delete DaemonSet manifest from agave-stack
|
||||
|
||||
### Step 2: Native sidecar for doublezerod
|
||||
|
||||
**Spec:** doublezerod image can be patched without restarting the agave container.
|
||||
GRE tunnel and routes persist across doublezerod restart.
|
||||
|
||||
**Test:**
|
||||
```bash
|
||||
# Record current agave container start time
|
||||
BEFORE=$(kubectl -n <ns> get pod <pod> -o jsonpath='{.status.containerStatuses[?(@.name=="agave-validator")].state.running.startedAt}')
|
||||
|
||||
# Patch DZ image
|
||||
kubectl -n <ns> patch pod <pod> --type='json' -p='[
|
||||
{"op":"replace","path":"/spec/initContainers/0/image","value":"laconicnetwork/doublezero:test"}
|
||||
]'
|
||||
|
||||
# Wait for DZ container to restart
|
||||
sleep 10
|
||||
|
||||
# Verify agave was NOT restarted
|
||||
AFTER=$(kubectl -n <ns> get pod <pod> -o jsonpath='{.status.containerStatuses[?(@.name=="agave-validator")].state.running.startedAt}')
|
||||
[ "$BEFORE" = "$AFTER" ] # assert: same start time
|
||||
|
||||
# Verify tunnel survived
|
||||
ip route | grep doublezero0 # assert: routes present
|
||||
```
|
||||
|
||||
**Changes:**
|
||||
- laconic-so: support `initContainers` with `restartPolicy: Always` in
|
||||
compose-to-k8s translation (or: define doublezerod as native sidecar in
|
||||
compose via `x-kubernetes-init-container` extension or equivalent)
|
||||
- Alternatively: post-deploy kubectl patch to move doublezerod to initContainers
|
||||
|
||||
### Step 3: Caddy dual-upstream routing
|
||||
|
||||
**Spec:** Caddy routes RPC traffic to whichever backend is healthy. Adding a second
|
||||
healthy backend on :8897 causes traffic to shift without configuration changes.
|
||||
|
||||
**Test:**
|
||||
```bash
|
||||
# Start a test HTTP server on :8897 with /health
|
||||
python3 -c "
|
||||
from http.server import HTTPServer, BaseHTTPRequestHandler
|
||||
class H(BaseHTTPRequestHandler):
|
||||
def do_GET(self):
|
||||
self.send_response(200); self.end_headers(); self.wfile.write(b'ok')
|
||||
HTTPServer(('', 8897), H).serve_forever()
|
||||
" &
|
||||
|
||||
# Verify Caddy discovers it
|
||||
sleep 10
|
||||
curl -sf https://biscayne.vaasl.io/health
|
||||
# assert: 200
|
||||
|
||||
kill %1
|
||||
```
|
||||
|
||||
**Changes:**
|
||||
- Update Caddy ingress config with dual upstreams and health checks
|
||||
|
||||
### Step 4: ZFS clone and blue-green tooling
|
||||
|
||||
**Spec:** A script creates a ZFS clone, starts a blue deployment on alternate ports
|
||||
using the cloned data, and the deployment catches up and becomes healthy.
|
||||
|
||||
**Test:**
|
||||
```bash
|
||||
# Run the clone + deploy script
|
||||
./scripts/blue-green-prepare.sh --target-version v2.2.1
|
||||
|
||||
# assert: ZFS clone exists
|
||||
zfs list biscayne/DATA/volumes/solana-blue
|
||||
|
||||
# assert: blue deployment exists and is catching up
|
||||
kubectl -n <ns> get deployment agave-blue
|
||||
|
||||
# assert: blue RPC eventually becomes healthy
|
||||
timeout 600 bash -c 'until curl -sf http://localhost:8897/health; do sleep 5; done'
|
||||
```
|
||||
|
||||
**Changes:**
|
||||
- `scripts/blue-green-prepare.sh` — ZFS snapshot, clone, deploy B
|
||||
- `scripts/blue-green-promote.sh` — tear down A, optional port swap
|
||||
- `scripts/blue-green-rollback.sh` — destroy B, restore A
|
||||
- Parameterized deployment spec (kustomize overlay or env-driven)
|
||||
|
||||
### Step 5: End-to-end upgrade test
|
||||
|
||||
**Spec:** Full upgrade cycle completes with zero dropped RPC requests.
|
||||
|
||||
**Test:**
|
||||
```bash
|
||||
# Start continuous health probe in background
|
||||
while true; do
|
||||
curl -sf -o /dev/null -w "%{http_code} %{time_total}\n" \
|
||||
https://biscayne.vaasl.io/health || echo "FAIL $(date)"
|
||||
sleep 0.5
|
||||
done > /tmp/health-probe.log &
|
||||
|
||||
# Execute full blue-green upgrade
|
||||
./scripts/blue-green-prepare.sh --target-version v2.2.1
|
||||
# wait for blue to sync...
|
||||
./scripts/blue-green-promote.sh
|
||||
|
||||
# Stop probe
|
||||
kill %1
|
||||
|
||||
# assert: no FAIL lines in probe log
|
||||
grep -c FAIL /tmp/health-probe.log
|
||||
# assert: 0
|
||||
```
|
||||
|
|
@ -1,61 +0,0 @@
|
|||
# Bug: Ashburn Relay — Outbound Gossip Dropped by DZ Agent ACL
|
||||
|
||||
## Summary
|
||||
|
||||
`--gossip-host 137.239.194.65` correctly advertises the Ashburn relay IP in
|
||||
ContactInfo for all sockets (gossip, TVU, repair, TPU). The inbound path
|
||||
works end-to-end (proven with kelce UDP tests through every hop). However,
|
||||
outbound gossip from biscayne (src 137.239.194.65) is dropped by the
|
||||
DoubleZero agent's ACL on mia-sw01's Tunnel500, preventing ContactInfo from
|
||||
propagating to the cluster. Peers never learn our TVU address.
|
||||
|
||||
## Evidence
|
||||
|
||||
- Inbound path confirmed hop by hop (kelce → was-sw01 → mia-sw01 → Tunnel500
|
||||
→ biscayne doublezero0 → DNAT → kind bridge → kind node eth0):
|
||||
```
|
||||
01:04:12.136633 IP 69.112.108.72.58856 > 172.20.0.2.9000: UDP, length 13
|
||||
```
|
||||
- Outbound gossip leaves biscayne correctly (src 137.239.194.65:8001 on
|
||||
doublezero0), enters mia-sw01 via Tunnel500, hits SEC-USER-500-IN ACL:
|
||||
```
|
||||
60 deny ip any any [match 26355968 packets, 0:00:02 ago]
|
||||
```
|
||||
The ACL only permits src 186.233.184.235 and 169.254.7.7 — not 137.239.194.65.
|
||||
- Validator not visible in public RPC getClusterNodes (gossip not propagating)
|
||||
- Validator sees 775 nodes vs 5,045 on public RPC
|
||||
|
||||
## Root Cause
|
||||
|
||||
The `doublezero-agent` daemon on mia-sw01 manages Tunnel500 and its ACL
|
||||
(SEC-USER-500-IN). The agent periodically reconciles the ACL to its expected
|
||||
state, overwriting any custom entries we add. We cannot modify the ACL
|
||||
without the agent reverting it.
|
||||
|
||||
137.239.194.65 is from the was-sw01 LAN block (137.239.194.64/29), routed
|
||||
by the ISP to was-sw01 via the WAN link. It IS publicly routable (confirmed
|
||||
by kelce ping/UDP tests). The earlier hypothesis that it was unroutable was
|
||||
wrong — the IP reaches was-sw01, gets forwarded to mia-sw01 via backbone,
|
||||
and reaches biscayne through Tunnel500 (inbound ACL direction is fine).
|
||||
|
||||
The problem is outbound only: the Tunnel500 ingress ACL (traffic FROM
|
||||
biscayne TO mia-sw01) drops src 137.239.194.65.
|
||||
|
||||
## Fix
|
||||
|
||||
Create a dedicated GRE tunnel (Tunnel100) between biscayne and mia-sw01
|
||||
that bypasses the DZ-managed Tunnel500 entirely:
|
||||
|
||||
- **mia-sw01 Tunnel100**: src 209.42.167.137 (free LAN IP), dst 186.233.184.235
|
||||
(biscayne), link 169.254.100.0/31, ACL SEC-VALIDATOR-100-IN (we control)
|
||||
- **biscayne gre-ashburn**: src 186.233.184.235, dst 209.42.167.137,
|
||||
link 169.254.100.1/31
|
||||
|
||||
Traffic flow unchanged except the tunnel:
|
||||
- Inbound: was-sw01 → backbone → mia-sw01 → Tunnel100 → biscayne → DNAT → agave
|
||||
- Outbound: agave → SNAT 137.239.194.65 → Tunnel100 → mia-sw01 → backbone → was-sw01
|
||||
|
||||
See:
|
||||
- `playbooks/ashburn-relay-mia-sw01.yml` (Tunnel100 + ACL + routes)
|
||||
- `playbooks/ashburn-relay-biscayne.yml` (gre-ashburn + DNAT + SNAT + policy routing)
|
||||
- `playbooks/ashburn-relay-was-sw01.yml` (static route, unchanged)
|
||||
|
|
@ -1,51 +0,0 @@
|
|||
# Bug: laconic-so etcd cleanup wipes core kubernetes service
|
||||
|
||||
## Summary
|
||||
|
||||
`_clean_etcd_keeping_certs()` in laconic-stack-orchestrator 1.1.0 deletes the `kubernetes` service from etcd, breaking cluster networking on restart.
|
||||
|
||||
## Component
|
||||
|
||||
`stack_orchestrator/deploy/k8s/helpers.py` — `_clean_etcd_keeping_certs()`
|
||||
|
||||
## Reproduction
|
||||
|
||||
1. Deploy with `laconic-so` to a k8s-kind target with persisted etcd (hostPath mount in kind-config.yml)
|
||||
2. `laconic-so deployment --dir <dir> stop` (destroys cluster)
|
||||
3. `laconic-so deployment --dir <dir> start` (recreates cluster with cleaned etcd)
|
||||
|
||||
## Symptoms
|
||||
|
||||
- `kindnet` pods enter CrashLoopBackOff with: `panic: unable to load in-cluster configuration, KUBERNETES_SERVICE_HOST and KUBERNETES_SERVICE_PORT must be defined`
|
||||
- `kubectl get svc kubernetes -n default` returns `NotFound`
|
||||
- coredns, caddy, local-path-provisioner stuck in Pending (no CNI without kindnet)
|
||||
- No pods can be scheduled
|
||||
|
||||
## Root Cause
|
||||
|
||||
`_clean_etcd_keeping_certs()` uses a whitelist that only preserves `/registry/secrets/caddy-system` keys. All other etcd keys are deleted, including `/registry/services/specs/default/kubernetes` — the core `kubernetes` ClusterIP service that kube-apiserver auto-creates.
|
||||
|
||||
When the kind cluster starts with the cleaned etcd, kube-apiserver sees the existing etcd data and does not re-create the `kubernetes` service. kindnet depends on the `KUBERNETES_SERVICE_HOST` environment variable which is injected by the kubelet from this service — without it, kindnet panics.
|
||||
|
||||
## Fix Options
|
||||
|
||||
1. **Expand the whitelist** to include `/registry/services/specs/default/kubernetes` and other core cluster resources
|
||||
2. **Fully wipe etcd** instead of selective cleanup — let the cluster bootstrap fresh (simpler, but loses Caddy TLS certs)
|
||||
3. **Don't persist etcd at all** — ephemeral etcd means clean state every restart (recommended for kind deployments)
|
||||
|
||||
## Workaround
|
||||
|
||||
Fully delete the kind cluster before `start`:
|
||||
|
||||
```bash
|
||||
kind delete cluster --name <cluster-name>
|
||||
laconic-so deployment --dir <dir> start
|
||||
```
|
||||
|
||||
This forces a fresh etcd bootstrap. Downside: all other services deployed to the cluster (DaemonSets, other namespaces) are destroyed.
|
||||
|
||||
## Impact
|
||||
|
||||
- Affects any k8s-kind deployment with persisted etcd
|
||||
- Cluster is unrecoverable without full destroy+recreate
|
||||
- All non-laconic-so-managed workloads in the cluster are lost
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
# Bug: laconic-so crashes on re-deploy when caddy ingress already exists
|
||||
|
||||
## Summary
|
||||
|
||||
`laconic-so deployment start` crashes with `FailToCreateError` when the kind cluster already has caddy ingress resources installed. The deployer uses `create_from_yaml()` which fails on `AlreadyExists` conflicts instead of applying idempotently. This prevents the application deployment from ever being reached — the crash happens before any app manifests are applied.
|
||||
|
||||
## Component
|
||||
|
||||
`stack_orchestrator/deploy/k8s/deploy_k8s.py:366` — `up()` method
|
||||
`stack_orchestrator/deploy/k8s/helpers.py:369` — `install_ingress_for_kind()`
|
||||
|
||||
## Reproduction
|
||||
|
||||
1. `kind delete cluster --name laconic-70ce4c4b47e23b85`
|
||||
2. `laconic-so deployment --dir /srv/deployments/agave start` — creates cluster, loads images, installs caddy ingress, but times out or is interrupted before app deployment completes
|
||||
3. `laconic-so deployment --dir /srv/deployments/agave start` — crashes immediately after image loading
|
||||
|
||||
## Symptoms
|
||||
|
||||
- Traceback ending in:
|
||||
```
|
||||
kubernetes.utils.create_from_yaml.FailToCreateError:
|
||||
Error from server (Conflict): namespaces "caddy-system" already exists
|
||||
Error from server (Conflict): serviceaccounts "caddy-ingress-controller" already exists
|
||||
Error from server (Conflict): clusterroles.rbac.authorization.k8s.io "caddy-ingress-controller" already exists
|
||||
...
|
||||
```
|
||||
- Namespace `laconic-laconic-70ce4c4b47e23b85` exists but is empty — no pods, no deployments, no events
|
||||
- Cluster is healthy, images are loaded, but no app manifests are applied
|
||||
|
||||
## Root Cause
|
||||
|
||||
`install_ingress_for_kind()` calls `kubernetes.utils.create_from_yaml()` which uses `POST` (create) semantics. If the resources already exist (from a previous partial run), every resource returns `409 Conflict` and `create_from_yaml` raises `FailToCreateError`, aborting the entire `up()` method before the app deployment step.
|
||||
|
||||
The first `laconic-so start` after a fresh `kind delete` works because:
|
||||
1. Image loading into the kind node takes 5-10 minutes (images are ~10GB+)
|
||||
2. Caddy ingress is installed successfully
|
||||
3. App deployment begins
|
||||
|
||||
But if that first run is interrupted (timeout, Ctrl-C, ansible timeout), the second run finds caddy already installed and crashes.
|
||||
|
||||
## Fix Options
|
||||
|
||||
1. **Use server-side apply** instead of `create_from_yaml()` — `kubectl apply` is idempotent
|
||||
2. **Check if ingress exists before installing** — skip `install_ingress_for_kind()` if caddy-system namespace exists
|
||||
3. **Catch `AlreadyExists` and continue** — treat 409 as success for infrastructure resources
|
||||
|
||||
## Workaround
|
||||
|
||||
Delete the caddy ingress resources before re-running:
|
||||
|
||||
```bash
|
||||
kubectl delete namespace caddy-system
|
||||
kubectl delete clusterrole caddy-ingress-controller
|
||||
kubectl delete clusterrolebinding caddy-ingress-controller
|
||||
kubectl delete ingressclass caddy
|
||||
laconic-so deployment --dir /srv/deployments/agave start
|
||||
```
|
||||
|
||||
Or nuke the entire cluster and start fresh:
|
||||
|
||||
```bash
|
||||
kind delete cluster --name laconic-70ce4c4b47e23b85
|
||||
laconic-so deployment --dir /srv/deployments/agave start
|
||||
```
|
||||
|
||||
## Interaction with ansible timeout
|
||||
|
||||
The `biscayne-redeploy.yml` playbook sets a 600s timeout on the `laconic-so deployment start` task. Image loading alone can exceed this on a fresh cluster (images must be re-loaded into the new kind node). When ansible kills the process at 600s, the caddy ingress is already installed but the app is not — putting the cluster into the broken state described above. Subsequent playbook runs hit this bug on every attempt.
|
||||
|
||||
## Impact
|
||||
|
||||
- Blocks all re-deploys on biscayne without manual cleanup
|
||||
- The playbook cannot recover automatically — every retry hits the same conflict
|
||||
- Discovered 2026-03-05 during full wipe redeploy of biscayne validator
|
||||
|
|
@ -0,0 +1,135 @@
|
|||
# laconic-so
|
||||
|
||||
Sub-commands and flags
|
||||
|
||||
## setup-repositories
|
||||
|
||||
Clone a single repository:
|
||||
```
|
||||
$ laconic-so setup-repositories --include github.com/cerc-io/go-ethereum
|
||||
```
|
||||
Clone the repositories for a stack:
|
||||
```
|
||||
$ laconic-so --stack fixturenet-eth setup-repositories
|
||||
```
|
||||
Pull latest commits from origin:
|
||||
```
|
||||
$ laconic-so --stack fixturenet-eth setup-repositories --pull
|
||||
```
|
||||
Use SSH rather than HTTPS:
|
||||
```
|
||||
$ laconic-so --stack fixturenet-eth setup-repositories --git-ssh
|
||||
```
|
||||
|
||||
## build-containers
|
||||
|
||||
Build a single container:
|
||||
```
|
||||
$ laconic-so build-containers --include <container-name>
|
||||
```
|
||||
e.g.
|
||||
```
|
||||
$ laconic-so build-containers --include cerc/go-ethereum
|
||||
```
|
||||
Build the containers for a stack:
|
||||
```
|
||||
$ laconic-so --stack <stack-name> build-containers
|
||||
```
|
||||
e.g.
|
||||
```
|
||||
$ laconic-so --stack fixturenet-eth build-containers
|
||||
```
|
||||
Force full rebuild of container images:
|
||||
```
|
||||
$ laconic-so build-containers --include <container-name> --force-rebuild
|
||||
```
|
||||
## build-npms
|
||||
|
||||
Build a single package:
|
||||
```
|
||||
$ laconic-so build-npms --include <package-name>
|
||||
```
|
||||
e.g.
|
||||
```
|
||||
$ laconic-so build-npms --include registry-sdk
|
||||
```
|
||||
Build the packages for a stack:
|
||||
```
|
||||
$ laconic-so --stack <stack-name> build-npms
|
||||
```
|
||||
e.g.
|
||||
```
|
||||
$ laconic-so --stack fixturenet-laconicd build-npms
|
||||
```
|
||||
Force full rebuild of packages:
|
||||
```
|
||||
$ laconic-so build-npms --include <package-name> --force-rebuild
|
||||
```
|
||||
|
||||
## deploy
|
||||
|
||||
The `deploy` command group manages persistent deployments. The general workflow is `deploy init` to generate a spec file, then `deploy create` to create a deployment directory from the spec, then runtime commands like `up` and `down`, which are run against the deployment directory as `laconic-so deployment --dir <deployment-dir> <command>`.
|
||||
|
||||
### deploy init
|
||||
|
||||
Generate a deployment spec file from a stack definition:
|
||||
```
|
||||
$ laconic-so --stack <stack-name> deploy init --output <spec-file>
|
||||
```
|
||||
|
||||
Options:
|
||||
- `--output` (required): write spec file here
|
||||
- `--config`: provide config variables for the deployment
|
||||
- `--config-file`: provide config variables in a file
|
||||
- `--kube-config`: provide a config file for a k8s deployment
|
||||
- `--image-registry`: provide a container image registry url for this k8s cluster
|
||||
- `--map-ports-to-host`: map ports to the host (`any-variable-random`, `localhost-same`, `any-same`, `localhost-fixed-random`, `any-fixed-random`)
|
||||
|
||||
### deploy create
|
||||
|
||||
Create a deployment directory from a spec file:
|
||||
```
|
||||
$ laconic-so --stack <stack-name> deploy create --spec-file <spec-file> --deployment-dir <dir>
|
||||
```
|
||||
|
||||
Update an existing deployment in-place (preserving data volumes and env file):
|
||||
```
|
||||
$ laconic-so --stack <stack-name> deploy create --spec-file <spec-file> --deployment-dir <dir> --update
|
||||
```
|
||||
|
||||
Options:
|
||||
- `--spec-file` (required): spec file to use
|
||||
- `--deployment-dir`: target directory for deployment files
|
||||
- `--update`: update an existing deployment directory, preserving data volumes and env file. Changed files are backed up with a `.bak` suffix. The deployment's `config.env` and `deployment.yml` are also preserved.
|
||||
- `--network-dir`: network configuration supplied in this directory
|
||||
- `--initial-peers`: initial set of persistent peers
|
||||
|
||||
### deploy up
|
||||
|
||||
Start a deployment:
|
||||
```
|
||||
$ laconic-so deployment --dir <deployment-dir> up
|
||||
```
|
||||
|
||||
### deploy down
|
||||
|
||||
Stop a deployment:
|
||||
```
|
||||
$ laconic-so deployment --dir <deployment-dir> down
|
||||
```
|
||||
Use `--delete-volumes` to also remove data volumes.
|
||||
|
||||
### deploy ps
|
||||
|
||||
Show running services:
|
||||
```
|
||||
$ laconic-so deployment --dir <deployment-dir> ps
|
||||
```
|
||||
|
||||
### deploy logs
|
||||
|
||||
View service logs:
|
||||
```
|
||||
$ laconic-so deployment --dir <deployment-dir> logs
|
||||
```
|
||||
Use `-f` to follow and `-n <count>` to tail.
|
||||
|
|
@ -0,0 +1,202 @@
|
|||
# Deployment Patterns
|
||||
|
||||
## GitOps Pattern
|
||||
|
||||
For production deployments, we recommend a GitOps approach where your deployment configuration is tracked in version control.
|
||||
|
||||
### Overview
|
||||
|
||||
- **spec.yml is your source of truth**: Maintain it in your operator repository
|
||||
- **Don't regenerate on every restart**: Run `deploy init` once, then customize and commit
|
||||
- **Use restart for updates**: The restart command respects your git-tracked spec.yml
|
||||
|
||||
### Workflow
|
||||
|
||||
1. **Initial setup**: Run `deploy init` once to generate a spec.yml template
|
||||
2. **Customize and commit**: Edit spec.yml with your configuration (hostnames, resources, etc.) and commit to your operator repo
|
||||
3. **Deploy from git**: Use the committed spec.yml for deployments
|
||||
4. **Update via git**: Make changes in git, then restart to apply
|
||||
|
||||
```bash
|
||||
# Initial setup (run once)
|
||||
laconic-so --stack my-stack deploy init --output spec.yml
|
||||
|
||||
# Customize for your environment
|
||||
vim spec.yml # Set hostname, resources, etc.
|
||||
|
||||
# Commit to your operator repository
|
||||
git add spec.yml
|
||||
git commit -m "Add my-stack deployment configuration"
|
||||
git push
|
||||
|
||||
# On deployment server: deploy from git-tracked spec
|
||||
laconic-so deploy create \
|
||||
--spec-file /path/to/operator-repo/spec.yml \
|
||||
--deployment-dir my-deployment
|
||||
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
### Updating Deployments
|
||||
|
||||
When you need to update a deployment:
|
||||
|
||||
```bash
|
||||
# 1. Make changes in your operator repo
|
||||
vim /path/to/operator-repo/spec.yml
|
||||
git commit -am "Update configuration"
|
||||
git push
|
||||
|
||||
# 2. On deployment server: pull and restart
|
||||
cd /path/to/operator-repo && git pull
|
||||
laconic-so deployment --dir my-deployment restart
|
||||
```
|
||||
|
||||
The `restart` command:
|
||||
- Pulls latest code from the stack repository
|
||||
- Uses your git-tracked spec.yml (does NOT regenerate from defaults)
|
||||
- Syncs the deployment directory
|
||||
- Restarts services
|
||||
|
||||
### Anti-patterns
|
||||
|
||||
**Don't do this:**
|
||||
```bash
|
||||
# BAD: Regenerating spec on every deployment
|
||||
laconic-so --stack my-stack deploy init --output spec.yml
|
||||
laconic-so deploy create --spec-file spec.yml ...
|
||||
```
|
||||
|
||||
This overwrites your customizations with defaults from the stack's `commands.py`.
|
||||
|
||||
**Do this instead:**
|
||||
```bash
|
||||
# GOOD: Use your git-tracked spec
|
||||
git pull # Get latest spec.yml from your operator repo
|
||||
laconic-so deployment --dir my-deployment restart
|
||||
```
|
||||
|
||||
## Private Registry Authentication
|
||||
|
||||
For deployments using images from private container registries (e.g., GitHub Container Registry), configure authentication in your spec.yml:
|
||||
|
||||
### Configuration
|
||||
|
||||
Add a `registry-credentials` section to your spec.yml:
|
||||
|
||||
```yaml
|
||||
registry-credentials:
|
||||
server: ghcr.io
|
||||
username: your-org-or-username
|
||||
token-env: REGISTRY_TOKEN
|
||||
```
|
||||
|
||||
**Fields:**
|
||||
- `server`: The registry hostname (e.g., `ghcr.io`, `docker.io`, `gcr.io`)
|
||||
- `username`: Registry username (for GHCR, use your GitHub username or org name)
|
||||
- `token-env`: Name of the environment variable containing your API token/PAT
|
||||
|
||||
### Token Environment Variable
|
||||
|
||||
The `token-env` pattern keeps credentials out of version control. Set the environment variable when running `deployment start`:
|
||||
|
||||
```bash
|
||||
export REGISTRY_TOKEN="your-personal-access-token"
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
For GHCR, create a Personal Access Token (PAT) with `read:packages` scope.
|
||||
|
||||
### Ansible Integration
|
||||
|
||||
When using Ansible for deployments, pass the token from a credentials file:
|
||||
|
||||
```yaml
|
||||
- name: Start deployment
|
||||
ansible.builtin.command:
|
||||
cmd: laconic-so deployment --dir {{ deployment_dir }} start
|
||||
environment:
|
||||
REGISTRY_TOKEN: "{{ lookup('file', '~/.credentials/ghcr_token') }}"
|
||||
```
|
||||
|
||||
### How It Works
|
||||
|
||||
1. laconic-so reads the `registry-credentials` config from spec.yml
|
||||
2. Creates a Kubernetes `docker-registry` secret named `{deployment}-registry`
|
||||
3. The deployment's pods reference this secret for image pulls
|
||||
|
||||
## Cluster and Volume Management
|
||||
|
||||
### Stopping Deployments
|
||||
|
||||
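The `deployment stop` command has two important flags: `--delete-volumes` (shown below) and `--skip-cluster-management` (see "Shared Cluster Architecture"):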
|
||||
|
||||
```bash
|
||||
# Default: stops deployment, deletes cluster, PRESERVES volumes
|
||||
laconic-so deployment --dir my-deployment stop
|
||||
|
||||
# Explicitly delete volumes (USE WITH CAUTION)
|
||||
laconic-so deployment --dir my-deployment stop --delete-volumes
|
||||
```
|
||||
|
||||
### Volume Persistence
|
||||
|
||||
Volumes persist across cluster deletion by design. This is important because:
|
||||
- **Data survives cluster recreation**: Ledger data, databases, and other state are preserved
|
||||
- **Faster recovery**: No need to re-sync or rebuild data after cluster issues
|
||||
- **Safe cluster upgrades**: Delete and recreate the cluster without data loss
|
||||
|
||||
**Only use `--delete-volumes` when:**
|
||||
- You explicitly want to start fresh with no data
|
||||
- The user specifically requests volume deletion
|
||||
- You're cleaning up a test/dev environment completely
|
||||
|
||||
### Shared Cluster Architecture
|
||||
|
||||
In kind deployments, multiple stacks share a single cluster:
|
||||
- First `deployment start` creates the cluster
|
||||
- Subsequent deployments reuse the existing cluster
|
||||
- `deployment stop` on ANY deployment deletes the shared cluster
|
||||
- Other deployments will fail until the cluster is recreated
|
||||
|
||||
To stop a single deployment without affecting the cluster:
|
||||
```bash
|
||||
laconic-so deployment --dir my-deployment stop --skip-cluster-management
|
||||
```
|
||||
|
||||
## Volume Persistence in k8s-kind
|
||||
|
||||
k8s-kind has 3 storage layers:
|
||||
|
||||
- **Docker Host**: The physical server running Docker
|
||||
- **Kind Node**: A Docker container simulating a k8s node
|
||||
- **Pod Container**: Your workload
|
||||
|
||||
For k8s-kind, volumes with paths are mounted from Docker Host → Kind Node → Pod via extraMounts.
|
||||
|
||||
| spec.yml volume | Storage Location | Survives Pod Restart | Survives Cluster Restart |
|
||||
|-----------------|------------------|---------------------|-------------------------|
|
||||
| `vol:` (empty) | Kind Node PVC | ✅ | ❌ |
|
||||
| `vol: ./data/x` | Docker Host | ✅ | ✅ |
|
||||
| `vol: /abs/path`| Docker Host | ✅ | ✅ |
|
||||
|
||||
**Recommendation**: Always use paths for data you want to keep. Relative paths
|
||||
(e.g., `./data/rpc-config`) resolve to `$DEPLOYMENT_DIR/data/rpc-config` on the
|
||||
Docker Host.
|
||||
|
||||
### Example
|
||||
|
||||
```yaml
|
||||
# In spec.yml
|
||||
volumes:
|
||||
rpc-config: ./data/rpc-config # Persists to $DEPLOYMENT_DIR/data/rpc-config
|
||||
chain-data: ./data/chain # Persists to $DEPLOYMENT_DIR/data/chain
|
||||
temp-cache: # Empty = Kind Node PVC (lost on cluster delete)
|
||||
```
|
||||
|
||||
### The Antipattern
|
||||
|
||||
Empty-path volumes appear persistent because they survive pod restarts (data lives
|
||||
in the Kind Node container). However, this data is lost when the kind cluster is
|
||||
recreated. This "false persistence" has caused data loss when operators assumed
|
||||
their data was safe.
|
||||
|
|
@ -0,0 +1,550 @@
|
|||
# Docker Compose Deployment Guide
|
||||
|
||||
## Introduction
|
||||
|
||||
### What is a Deployer?
|
||||
|
||||
In stack-orchestrator, a **deployer** provides a uniform interface for orchestrating containerized applications. This guide focuses on Docker Compose deployments, which is the default and recommended deployment mode.
|
||||
|
||||
While stack-orchestrator also supports Kubernetes (`k8s`) and Kind (`k8s-kind`) deployments, those are out of scope for this guide. See the [Kubernetes Enhancements](./k8s-deployment-enhancements.md) documentation for advanced deployment options.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
To deploy stacks using Docker Compose, you need:
|
||||
|
||||
- Docker Engine (20.10+)
|
||||
- Docker Compose plugin (v2.0+)
|
||||
- Python 3.8+
|
||||
- stack-orchestrator installed (`laconic-so`)
|
||||
|
||||
**That's it!** No additional infrastructure is required. If you have Docker installed, you're ready to deploy.
|
||||
|
||||
## Deployment Workflow
|
||||
|
||||
The typical deployment workflow consists of four main steps:
|
||||
|
||||
1. **Setup repositories and build containers** (first time only)
|
||||
2. **Initialize deployment specification**
|
||||
3. **Create deployment directory**
|
||||
4. **Start and manage services**
|
||||
|
||||
## Quick Start Example
|
||||
|
||||
Here's a complete example using the built-in `test` stack:
|
||||
|
||||
```bash
|
||||
# Step 1: Setup (first time only)
|
||||
laconic-so --stack test setup-repositories
|
||||
laconic-so --stack test build-containers
|
||||
|
||||
# Step 2: Initialize deployment spec
|
||||
laconic-so --stack test deploy init --output test-spec.yml
|
||||
|
||||
# Step 3: Create deployment directory
|
||||
laconic-so --stack test deploy create \
|
||||
--spec-file test-spec.yml \
|
||||
--deployment-dir test-deployment
|
||||
|
||||
# Step 4: Start services
|
||||
laconic-so deployment --dir test-deployment start
|
||||
|
||||
# View running services
|
||||
laconic-so deployment --dir test-deployment ps
|
||||
|
||||
# View logs
|
||||
laconic-so deployment --dir test-deployment logs
|
||||
|
||||
# Stop services (preserves data)
|
||||
laconic-so deployment --dir test-deployment stop
|
||||
```
|
||||
|
||||
## Deployment Workflows
|
||||
|
||||
Stack-orchestrator supports two deployment workflows:
|
||||
|
||||
### 1. Deployment Directory Workflow (Recommended)
|
||||
|
||||
This workflow creates a persistent deployment directory that contains all configuration and data.
|
||||
|
||||
**When to use:**
|
||||
- Production deployments
|
||||
- When you need to preserve configuration
|
||||
- When you want to manage multiple deployments
|
||||
- When you need persistent volume data
|
||||
|
||||
**Example:**
|
||||
|
||||
```bash
|
||||
# Initialize deployment spec
|
||||
laconic-so --stack fixturenet-eth deploy init --output eth-spec.yml
|
||||
|
||||
# Optionally edit eth-spec.yml to customize configuration
|
||||
|
||||
# Create deployment directory
|
||||
laconic-so --stack fixturenet-eth deploy create \
|
||||
--spec-file eth-spec.yml \
|
||||
--deployment-dir my-eth-deployment
|
||||
|
||||
# Start the deployment
|
||||
laconic-so deployment --dir my-eth-deployment start
|
||||
|
||||
# Manage the deployment
|
||||
laconic-so deployment --dir my-eth-deployment ps
|
||||
laconic-so deployment --dir my-eth-deployment logs
|
||||
laconic-so deployment --dir my-eth-deployment stop
|
||||
```
|
||||
|
||||
### 2. Quick Deploy Workflow
|
||||
|
||||
This workflow deploys directly without creating a persistent deployment directory.
|
||||
|
||||
**When to use:**
|
||||
- Quick testing
|
||||
- Temporary deployments
|
||||
- Simple stacks that don't require customization
|
||||
|
||||
**Example:**
|
||||
|
||||
```bash
|
||||
# Start the stack directly
|
||||
laconic-so --stack test deploy up
|
||||
|
||||
# Show the host port mapped to container port 80 of the 'test' service
|
||||
laconic-so --stack test deploy port test 80
|
||||
|
||||
# View logs
|
||||
laconic-so --stack test deploy logs
|
||||
|
||||
# Stop (preserves volumes)
|
||||
laconic-so --stack test deploy down
|
||||
|
||||
# Stop and remove volumes
|
||||
laconic-so --stack test deploy down --delete-volumes
|
||||
```
|
||||
|
||||
## Real-World Example: Ethereum Fixturenet
|
||||
|
||||
Deploy a local Ethereum testnet with Geth and Lighthouse:
|
||||
|
||||
```bash
|
||||
# Setup (first time only)
|
||||
laconic-so --stack fixturenet-eth setup-repositories
|
||||
laconic-so --stack fixturenet-eth build-containers
|
||||
|
||||
# Initialize with default configuration
|
||||
laconic-so --stack fixturenet-eth deploy init --output eth-spec.yml
|
||||
|
||||
# Create deployment
|
||||
laconic-so --stack fixturenet-eth deploy create \
|
||||
--spec-file eth-spec.yml \
|
||||
--deployment-dir fixturenet-eth-deployment
|
||||
|
||||
# Start the network
|
||||
laconic-so deployment --dir fixturenet-eth-deployment start
|
||||
|
||||
# Check status
|
||||
laconic-so deployment --dir fixturenet-eth-deployment ps
|
||||
|
||||
# Access logs from specific service
|
||||
laconic-so deployment --dir fixturenet-eth-deployment logs fixturenet-eth-geth-1
|
||||
|
||||
# Stop the network (preserves blockchain data)
|
||||
laconic-so deployment --dir fixturenet-eth-deployment stop
|
||||
|
||||
# Start again - blockchain data is preserved
|
||||
laconic-so deployment --dir fixturenet-eth-deployment start
|
||||
|
||||
# Clean up everything including data
|
||||
laconic-so deployment --dir fixturenet-eth-deployment stop --delete-volumes
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### Passing Configuration Parameters
|
||||
|
||||
Configuration can be passed in three ways:
|
||||
|
||||
**1. At init time via `--config` flag:**
|
||||
|
||||
```bash
|
||||
laconic-so --stack test deploy init --output spec.yml \
|
||||
--config PARAM1=value1,PARAM2=value2
|
||||
```
|
||||
|
||||
**2. Edit the spec file after init:**
|
||||
|
||||
```bash
|
||||
# Initialize
|
||||
laconic-so --stack test deploy init --output spec.yml
|
||||
|
||||
# Edit spec.yml
|
||||
vim spec.yml
|
||||
```
|
||||
|
||||
Example spec.yml:
|
||||
```yaml
|
||||
stack: test
|
||||
config:
|
||||
PARAM1: value1
|
||||
PARAM2: value2
|
||||
```
|
||||
|
||||
**3. Docker Compose defaults:**
|
||||
|
||||
Environment variables defined in the stack's `docker-compose-*.yml` files are used as defaults. Configuration from the spec file overrides these defaults.
|
||||
|
||||
### Port Mapping
|
||||
|
||||
By default, services are accessible on randomly assigned host ports. To find the mapped port:
|
||||
|
||||
```bash
|
||||
# Find the host port for container port 80 on service 'webapp'
|
||||
laconic-so deployment --dir my-deployment port webapp 80
|
||||
|
||||
# Output example: 0.0.0.0:32768
|
||||
```
|
||||
|
||||
To configure fixed ports, edit the spec file before creating the deployment:
|
||||
|
||||
```yaml
|
||||
network:
|
||||
ports:
|
||||
webapp:
|
||||
- '8080:80' # Maps host port 8080 to container port 80
|
||||
api:
|
||||
- '3000:3000'
|
||||
```
|
||||
|
||||
Then create the deployment:
|
||||
|
||||
```bash
|
||||
laconic-so --stack my-stack deploy create \
|
||||
--spec-file spec.yml \
|
||||
--deployment-dir my-deployment
|
||||
```
|
||||
|
||||
### Volume Persistence
|
||||
|
||||
Volumes are preserved between stop/start cycles by default:
|
||||
|
||||
```bash
|
||||
# Stop but keep data
|
||||
laconic-so deployment --dir my-deployment stop
|
||||
|
||||
# Start again - data is still there
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
To completely remove all data:
|
||||
|
||||
```bash
|
||||
# Stop and delete all volumes
|
||||
laconic-so deployment --dir my-deployment stop --delete-volumes
|
||||
```
|
||||
|
||||
Volume data is stored in `<deployment-dir>/data/`.
|
||||
|
||||
## Common Operations
|
||||
|
||||
### Viewing Logs
|
||||
|
||||
```bash
|
||||
# All services, continuous follow
|
||||
laconic-so deployment --dir my-deployment logs --follow
|
||||
|
||||
# Last 100 lines from all services
|
||||
laconic-so deployment --dir my-deployment logs --tail 100
|
||||
|
||||
# Specific service only
|
||||
laconic-so deployment --dir my-deployment logs webapp
|
||||
|
||||
# Combine options
|
||||
laconic-so deployment --dir my-deployment logs --tail 50 --follow webapp
|
||||
```
|
||||
|
||||
### Executing Commands in Containers
|
||||
|
||||
```bash
|
||||
# Execute a command in a running service
|
||||
laconic-so deployment --dir my-deployment exec webapp ls -la
|
||||
|
||||
# Interactive shell
|
||||
laconic-so deployment --dir my-deployment exec webapp /bin/bash
|
||||
|
||||
# Run command with specific environment variables
|
||||
laconic-so deployment --dir my-deployment exec webapp env VAR=value command
|
||||
```
|
||||
|
||||
### Checking Service Status
|
||||
|
||||
```bash
|
||||
# List all running services
|
||||
laconic-so deployment --dir my-deployment ps
|
||||
|
||||
# Check using Docker directly
|
||||
docker ps
|
||||
```
|
||||
|
||||
### Updating a Running Deployment
|
||||
|
||||
If you need to change configuration after deployment:
|
||||
|
||||
```bash
|
||||
# 1. Edit the spec file
|
||||
vim my-deployment/spec.yml
|
||||
|
||||
# 2. Regenerate configuration
|
||||
laconic-so deployment --dir my-deployment update
|
||||
|
||||
# 3. Restart services to apply changes
|
||||
laconic-so deployment --dir my-deployment stop
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
## Multi-Service Deployments
|
||||
|
||||
Many stacks deploy multiple services that work together:
|
||||
|
||||
```bash
|
||||
# Deploy a stack with multiple services
|
||||
laconic-so --stack laconicd-with-console deploy init --output spec.yml
|
||||
laconic-so --stack laconicd-with-console deploy create \
|
||||
--spec-file spec.yml \
|
||||
--deployment-dir laconicd-deployment
|
||||
|
||||
laconic-so deployment --dir laconicd-deployment start
|
||||
|
||||
# View all services
|
||||
laconic-so deployment --dir laconicd-deployment ps
|
||||
|
||||
# View logs from specific services
|
||||
laconic-so deployment --dir laconicd-deployment logs laconicd
|
||||
laconic-so deployment --dir laconicd-deployment logs console
|
||||
```
|
||||
|
||||
## ConfigMaps
|
||||
|
||||
ConfigMaps allow you to mount configuration files into containers:
|
||||
|
||||
```bash
|
||||
# 1. Create the config directory in your deployment
|
||||
mkdir -p my-deployment/data/my-config
|
||||
echo "database_url=postgres://localhost" > my-deployment/data/my-config/app.conf
|
||||
|
||||
# 2. Reference in spec file
|
||||
vim my-deployment/spec.yml
|
||||
```
|
||||
|
||||
Add to spec.yml:
|
||||
```yaml
|
||||
configmaps:
|
||||
my-config: ./data/my-config
|
||||
```
|
||||
|
||||
```bash
|
||||
# 3. Restart to apply
|
||||
laconic-so deployment --dir my-deployment stop
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
The files will be mounted in the container at `/config/` (or as specified by the stack).
|
||||
|
||||
## Deployment Directory Structure
|
||||
|
||||
A typical deployment directory contains:
|
||||
|
||||
```text
|
||||
my-deployment/
|
||||
├── compose/
|
||||
│ └── docker-compose-*.yml # Generated compose files
|
||||
├── config.env # Environment variables
|
||||
├── deployment.yml # Deployment metadata
|
||||
├── spec.yml # Deployment specification
|
||||
└── data/ # Volume mounts and configs
|
||||
├── service-data/ # Persistent service data
|
||||
└── config-maps/ # ConfigMap files
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Common Issues
|
||||
|
||||
**Problem: "Cannot connect to Docker daemon"**
|
||||
|
||||
```bash
|
||||
# Ensure Docker is running
|
||||
docker ps
|
||||
|
||||
# Start Docker if needed (macOS)
|
||||
open -a Docker
|
||||
|
||||
# Start Docker (Linux)
|
||||
sudo systemctl start docker
|
||||
```
|
||||
|
||||
**Problem: "Port already in use"**
|
||||
|
||||
```yaml
|
||||
# Either stop the conflicting service or use different ports
|
||||
# Edit spec.yml before creating deployment:
|
||||
|
||||
network:
|
||||
ports:
|
||||
webapp:
|
||||
- '8081:80' # Use 8081 instead of 8080
|
||||
```
|
||||
|
||||
**Problem: "Image not found"**
|
||||
|
||||
```bash
|
||||
# Build containers first
|
||||
laconic-so --stack your-stack build-containers
|
||||
```
|
||||
|
||||
**Problem: Volumes not persisting**
|
||||
|
||||
```bash
|
||||
# Check if you used --delete-volumes when stopping
|
||||
# Volume data is in: <deployment-dir>/data/
|
||||
|
||||
# Don't use --delete-volumes if you want to keep data:
|
||||
laconic-so deployment --dir my-deployment stop
|
||||
|
||||
# Only use --delete-volumes when you want to reset completely:
|
||||
laconic-so deployment --dir my-deployment stop --delete-volumes
|
||||
```
|
||||
|
||||
**Problem: Services not starting**
|
||||
|
||||
```bash
|
||||
# Check logs for errors
|
||||
laconic-so deployment --dir my-deployment logs
|
||||
|
||||
# Check Docker container status
|
||||
docker ps -a
|
||||
|
||||
# Try stopping and starting again
|
||||
laconic-so deployment --dir my-deployment stop
|
||||
laconic-so deployment --dir my-deployment start
|
||||
```
|
||||
|
||||
### Inspecting Deployment State
|
||||
|
||||
```bash
|
||||
# Check deployment directory structure
|
||||
ls -la my-deployment/
|
||||
|
||||
# Check running containers
|
||||
docker ps
|
||||
|
||||
# Check container details
|
||||
docker inspect <container-name>
|
||||
|
||||
# Check networks
|
||||
docker network ls
|
||||
|
||||
# Check volumes
|
||||
docker volume ls
|
||||
```
|
||||
|
||||
## CLI Commands Reference
|
||||
|
||||
### Stack Operations
|
||||
|
||||
```bash
|
||||
# Clone required repositories
|
||||
laconic-so --stack <name> setup-repositories
|
||||
|
||||
# Build container images
|
||||
laconic-so --stack <name> build-containers
|
||||
```
|
||||
|
||||
### Deployment Initialization
|
||||
|
||||
```bash
|
||||
# Initialize deployment spec with defaults
|
||||
laconic-so --stack <name> deploy init --output <spec-file>
|
||||
|
||||
# Initialize with configuration
|
||||
laconic-so --stack <name> deploy init --output <spec-file> \
|
||||
--config PARAM1=value1,PARAM2=value2
|
||||
```
|
||||
|
||||
### Deployment Creation
|
||||
|
||||
```bash
|
||||
# Create deployment directory from spec
|
||||
laconic-so --stack <name> deploy create \
|
||||
--spec-file <spec-file> \
|
||||
--deployment-dir <dir>
|
||||
```
|
||||
|
||||
### Deployment Management
|
||||
|
||||
```bash
|
||||
# Start all services
|
||||
laconic-so deployment --dir <dir> start
|
||||
|
||||
# Stop services (preserves volumes)
|
||||
laconic-so deployment --dir <dir> stop
|
||||
|
||||
# Stop and remove volumes
|
||||
laconic-so deployment --dir <dir> stop --delete-volumes
|
||||
|
||||
# List running services
|
||||
laconic-so deployment --dir <dir> ps
|
||||
|
||||
# View logs
|
||||
laconic-so deployment --dir <dir> logs [--tail N] [--follow] [service]
|
||||
|
||||
# Show mapped port
|
||||
laconic-so deployment --dir <dir> port <service> <private-port>
|
||||
|
||||
# Execute command in service
|
||||
laconic-so deployment --dir <dir> exec <service> <command>
|
||||
|
||||
# Update configuration
|
||||
laconic-so deployment --dir <dir> update
|
||||
```
|
||||
|
||||
### Quick Deploy Commands
|
||||
|
||||
```bash
|
||||
# Start stack directly
|
||||
laconic-so --stack <name> deploy up
|
||||
|
||||
# Stop stack
|
||||
laconic-so --stack <name> deploy down [--delete-volumes]
|
||||
|
||||
# View logs
|
||||
laconic-so --stack <name> deploy logs
|
||||
|
||||
# Show port mapping
|
||||
laconic-so --stack <name> deploy port <service> <port>
|
||||
```
|
||||
|
||||
## Related Documentation
|
||||
|
||||
- [CLI Reference](./cli.md) - Complete CLI command documentation
|
||||
- [Adding a New Stack](./adding-a-new-stack.md) - Creating custom stacks
|
||||
- [Specification](./spec.md) - Internal structure and design
|
||||
- [Kubernetes Enhancements](./k8s-deployment-enhancements.md) - Advanced K8s deployment options
|
||||
- [Web App Deployment](./webapp.md) - Deploying web applications
|
||||
|
||||
## Examples
|
||||
|
||||
For more examples, see the test scripts:
|
||||
- `scripts/quick-deploy-test.sh` - Quick deployment example
|
||||
- `tests/deploy/run-deploy-test.sh` - Comprehensive test showing all features
|
||||
|
||||
## Summary
|
||||
|
||||
- Docker Compose is the default and recommended deployment mode
|
||||
- Two workflows: deployment directory (recommended) or quick deploy
|
||||
- The standard workflow is: setup → build → init → create → start
|
||||
- Configuration is flexible with multiple override layers
|
||||
- Volume persistence is automatic unless explicitly deleted
|
||||
- All deployment state is contained in the deployment directory
|
||||
- For Kubernetes deployments, see separate K8s documentation
|
||||
|
||||
You're now ready to deploy stacks using stack-orchestrator with Docker Compose!
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
# DoubleZero Agent — Managed Configuration
|
||||
|
||||
The `doublezero-agent` daemon runs on both mia-sw01 and was-sw01. It manages
|
||||
GRE tunnels, ACLs, BGP neighbors, and route-maps via EOS config sessions
|
||||
(named `doublezero-agent-<timestamp>`). It periodically creates pending
|
||||
sessions and commits them, overwriting any manual changes to the objects
|
||||
it manages.
|
||||
|
||||
**Do NOT modify any of the items listed below.** The agent will silently
|
||||
overwrite your changes.
|
||||
|
||||
## mia-sw01
|
||||
|
||||
### Tunnel interfaces (all DZ-managed)
|
||||
|
||||
| Interface | Description | VRF | Peer | ACL |
|
||||
|------------|-----------------|---------|-----------------|------------------------------|
|
||||
| Tunnel500 | USER-UCAST-500 | vrf1 | 186.233.184.235 | SEC-USER-500-IN |
|
||||
| Tunnel501 | USER-MCAST-501 | default | 186.233.185.50 | SEC-USER-SUB-MCAST-IN |
|
||||
| Tunnel502 | USER-UCAST-502 | vrf1 | 155.138.213.71 | SEC-USER-502-IN |
|
||||
| Tunnel503 | USER-MCAST-503 | default | 155.138.213.71 | SEC-USER-PUB-MCAST-IN |
|
||||
| Tunnel504 | (empty) | | | |
|
||||
| Tunnel505 | USER-UCAST-505 | vrf1 | 186.233.185.50 | SEC-USER-505-IN |
|
||||
| Tunnel506 | (exists) | | | |
|
||||
|
||||
### ACLs (DZ-managed — do NOT modify)
|
||||
|
||||
- `SEC-DIA-IN` — ingress ACL on Et1/1 (bogon/RFC1918 filter)
|
||||
- `SEC-USER-500-IN` — ingress ACL on Tunnel500
|
||||
- `SEC-USER-502-IN` — ingress ACL on Tunnel502
|
||||
- `SEC-USER-505-IN` — ingress ACL on Tunnel505
|
||||
- `SEC-USER-SUB-MCAST-IN` — ingress ACL on Tunnel501
|
||||
- `SEC-USER-PUB-MCAST-IN` — ingress ACL on Tunnel503
|
||||
- `SEC-USER-MCAST-BOUNDARY-501-OUT` — multicast boundary on Tunnel501
|
||||
- `SEC-USER-MCAST-BOUNDARY-503-OUT` — multicast boundary on Tunnel503
|
||||
|
||||
### VRF (DZ-managed)
|
||||
|
||||
- `vrf1` — used by Tunnel500, Tunnel502, Tunnel505 (unicast tunnels)
|
||||
- `ip route vrf vrf1 0.0.0.0/0 egress-vrf default Ethernet4/1 172.16.1.188`
|
||||
|
||||
### BGP (DZ-managed)
|
||||
|
||||
- `router bgp 65342` — iBGP mesh with DZ fabric switches (ny7, sea001, ld4, etc.)
|
||||
- BGP neighbors on tunnel link IPs (169.254.x.x) with `RM-USER-*` route-maps
|
||||
- All `RM-USER-*-IN` and `RM-USER-*-OUT` route-maps
|
||||
|
||||
### Loopbacks (DZ-managed)
|
||||
|
||||
- `Loopback255`, `Loopback256` — BGP update sources for iBGP mesh
|
||||
|
||||
## was-sw01
|
||||
|
||||
### ACLs (DZ-managed)
|
||||
|
||||
- `SEC-DIA-IN` — ingress ACL on Et1/1
|
||||
- `SEC-USER-PUB-MCAST-IN`
|
||||
- `SEC-USER-SUB-MCAST-IN`
|
||||
|
||||
### Daemons
|
||||
|
||||
- `doublezero-agent` — config management
|
||||
- `doublezero-telemetry` — metrics (writes to influxdb `doublezero-mainnet-beta`)
|
||||
|
||||
## Safe to modify (NOT managed by DZ agent)
|
||||
|
||||
### mia-sw01
|
||||
|
||||
- `Tunnel100` — our dedicated validator relay tunnel (VRF relay)
|
||||
- `SEC-VALIDATOR-100-IN` — our ACL on Tunnel100
|
||||
- `Loopback101` — tunnel source IP (209.42.167.137)
|
||||
- VRF `relay` — our outbound isolation VRF
|
||||
- `ip route 137.239.194.65/32 egress-vrf relay 169.254.100.1`
|
||||
- `ip route vrf relay 0.0.0.0/0 egress-vrf default 172.16.1.188`
|
||||
- Backbone `Ethernet4/1` — physical interface, not DZ-managed
|
||||
|
||||
### was-sw01
|
||||
|
||||
- `ip route 137.239.194.65/32 172.16.1.189` — our static route
|
||||
- Backbone `Ethernet4/1` — physical interface, not DZ-managed
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue