feat: ip_echo preflight tool + relay post-mortem and checklist
ip_echo_preflight.py: reimplements Solana ip_echo client protocol in Python. Verifies UDP port reachability before snapshot download, called from entrypoint.py. Prevents wasting hours on a snapshot only to crash-loop on port reachability. docs/postmortem-ashburn-relay-outbound.md: root cause analysis of the firewalld nftables FORWARD chain blocking outbound relay traffic. docs/ashburn-relay-checklist.md: 7-layer verification checklist for relay path debugging. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>fix/kind-mount-propagation
parent
68edcc60c7
commit
61b7f6a236
|
|
@ -0,0 +1,168 @@
|
||||||
|
# Ashburn Relay / ip_echo Port Reachability Checklist
|
||||||
|
|
||||||
|
The validator exits when it can't verify UDP ports (8001, 9000, 9002, 9003) are
|
||||||
|
reachable from entrypoint servers. The ip_echo protocol: validator TCP-connects
|
||||||
|
to entrypoint on port 8001, entrypoint sees source IP, sends UDP probes back to
|
||||||
|
that IP on the validator's ports. If probes don't arrive, validator crashes.
|
||||||
|
|
||||||
|
## Layer 1: Biscayne outbound path
|
||||||
|
|
||||||
|
Validator's outbound ip_echo TCP (dport 8001) must exit via GRE tunnel so
|
||||||
|
entrypoints see `137.239.194.65`, not biscayne's real IP via Docker MASQUERADE.
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 1.1 Mangle rules (4 rules in mangle PREROUTING):
|
||||||
|
- udp sport 8001 (gossip outbound)
|
||||||
|
- udp sport 9000:9025 (TVU/repair outbound)
|
||||||
|
- tcp sport 8001 (gossip TCP outbound)
|
||||||
|
- tcp dport 8001 (ip_echo outbound — THE CRITICAL ONE)
|
||||||
|
|
||||||
|
[ ] 1.2 SNAT rule at position 1 (before Docker MASQUERADE):
|
||||||
|
POSTROUTING -m mark --mark 100 -j SNAT --to-source 137.239.194.65
|
||||||
|
|
||||||
|
[ ] 1.3 Policy routing rule:
|
||||||
|
fwmark 0x64 lookup ashburn
|
||||||
|
|
||||||
|
[ ] 1.4 Ashburn routing table default route:
|
||||||
|
default via 169.254.100.0 dev gre-ashburn
|
||||||
|
|
||||||
|
[ ] 1.5 Mangle counters incrementing (pkts/bytes on tcp dport 8001 rule)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Layer 2: GRE tunnel (biscayne ↔ mia-sw01)
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 2.1 Tunnel exists and UP:
|
||||||
|
gre-ashburn with 169.254.100.1/31
|
||||||
|
|
||||||
|
[ ] 2.2 Tunnel peer reachable:
|
||||||
|
ping 169.254.100.0
|
||||||
|
|
||||||
|
[ ] 2.3 Ashburn IP on loopback:
|
||||||
|
137.239.194.65/32 dev lo
|
||||||
|
```
|
||||||
|
|
||||||
|
## Layer 3: Biscayne inbound path (DNAT + DOCKER-USER)
|
||||||
|
|
||||||
|
Entrypoint UDP probes arrive at `137.239.194.65` and must reach kind node
|
||||||
|
`172.20.0.2`.
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 3.1 DNAT rules at position 1 in nat PREROUTING
|
||||||
|
(before Docker's ADDRTYPE LOCAL rule):
|
||||||
|
- udp dport 8001 → 172.20.0.2:8001
|
||||||
|
- tcp dport 8001 → 172.20.0.2:8001
|
||||||
|
- udp dport 9000:9025 → 172.20.0.2
|
||||||
|
|
||||||
|
[ ] 3.2 DOCKER-USER ACCEPT rules (3 rules):
|
||||||
|
- udp dport 8001 → ACCEPT
|
||||||
|
- tcp dport 8001 → ACCEPT
|
||||||
|
- udp dport 9000:9025 → ACCEPT
|
||||||
|
|
||||||
|
[ ] 3.3 DNAT counters incrementing
|
||||||
|
```
|
||||||
|
|
||||||
|
## Layer 4: mia-sw01
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 4.1 Tunnel100 UP in VRF relay
|
||||||
|
src 209.42.167.137, dst 186.233.184.235, link 169.254.100.0/31
|
||||||
|
|
||||||
|
[ ] 4.2 VRF relay default route:
|
||||||
|
0.0.0.0/0 egress-vrf default 172.16.1.188
|
||||||
|
|
||||||
|
[ ] 4.3 Default VRF route to relay IP:
|
||||||
|
137.239.194.65/32 egress-vrf relay 169.254.100.1
|
||||||
|
|
||||||
|
[ ] 4.4 ACL SEC-VALIDATOR-100-IN permits all needed traffic
|
||||||
|
|
||||||
|
[ ] 4.5 Backbone Et4/1 UP (172.16.1.189/31)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Layer 5: was-sw01
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 5.1 Static route: 137.239.194.65/32 via 172.16.1.189
|
||||||
|
|
||||||
|
[ ] 5.2 Backbone Et4/1 UP (172.16.1.188/31)
|
||||||
|
|
||||||
|
[ ] 5.3 No Loopback101 (removed to avoid absorbing traffic locally)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Layer 6: Persistence
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 6.1 ashburn-relay.service enabled and active (runs After=docker.service and firewalld.service)
|
||||||
|
|
||||||
|
[ ] 6.2 /usr/local/sbin/ashburn-relay-setup.sh exists
|
||||||
|
```
|
||||||
|
|
||||||
|
## Layer 7: End-to-end tests
|
||||||
|
|
||||||
|
All tests run via Ansible playbooks. The test scripts in `scripts/` are
|
||||||
|
utilities invoked by the playbooks — never run them manually via SSH.
|
||||||
|
|
||||||
|
```
|
||||||
|
[ ] 7.1 relay-test-tcp-dport.py (via ashburn-relay-check.yml or ad-hoc play)
|
||||||
|
Tests: outbound tcp dport 8001 mangle → SNAT → tunnel
|
||||||
|
Pass: entrypoint sees 137.239.194.65
|
||||||
|
Fail: entrypoint sees 186.233.184.235 (Docker MASQUERADE)
|
||||||
|
|
||||||
|
[ ] 7.2 relay-test-ip-echo.py (via ashburn-relay-check.yml or ad-hoc play)
|
||||||
|
Tests: FULL END-TO-END (outbound SNAT + inbound DNAT + DOCKER-USER)
|
||||||
|
Pass: UDP probe received from entrypoint
|
||||||
|
Fail: no UDP probes — inbound path broken
|
||||||
|
|
||||||
|
[ ] 7.3 relay-inbound-udp-test.yml (cross-inventory: biscayne + kelce)
|
||||||
|
Tests: inbound UDP from external host → DNAT → kind node
|
||||||
|
Pass: UDP arrives in kind netns
|
||||||
|
```
|
||||||
|
|
||||||
|
## Playbooks
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Read-only check of all relay state (biscayne + both switches):
|
||||||
|
ansible-playbook -i inventory-switches/switches.yml \
|
||||||
|
-i inventory/biscayne.yml playbooks/ashburn-relay-check.yml
|
||||||
|
|
||||||
|
# Apply all biscayne relay rules (idempotent):
|
||||||
|
ansible-playbook -i inventory/biscayne.yml playbooks/ashburn-relay-biscayne.yml
|
||||||
|
|
||||||
|
# Apply outbound only (the ip_echo fix):
|
||||||
|
ansible-playbook -i inventory/biscayne.yml \
|
||||||
|
playbooks/ashburn-relay-biscayne.yml -t outbound
|
||||||
|
|
||||||
|
# Apply inbound only (DNAT + DOCKER-USER):
|
||||||
|
ansible-playbook -i inventory/biscayne.yml \
|
||||||
|
playbooks/ashburn-relay-biscayne.yml -t inbound
|
||||||
|
|
||||||
|
# Apply mia-sw01 config:
|
||||||
|
ansible-playbook -i inventory-switches/switches.yml \
|
||||||
|
playbooks/ashburn-relay-mia-sw01.yml
|
||||||
|
|
||||||
|
# Apply was-sw01 config:
|
||||||
|
ansible-playbook -i inventory-switches/switches.yml \
|
||||||
|
playbooks/ashburn-relay-was-sw01.yml
|
||||||
|
|
||||||
|
# Cross-inventory inbound UDP test (biscayne + kelce):
|
||||||
|
ansible-playbook -i inventory/biscayne.yml -i inventory/kelce.yml \
|
||||||
|
playbooks/relay-inbound-udp-test.yml
|
||||||
|
```
|
||||||
|
|
||||||
|
## Historical root causes
|
||||||
|
|
||||||
|
1. **TCP dport 8001 mangle rule missing** — ip_echo TCP exits via Docker
|
||||||
|
MASQUERADE, entrypoint sees wrong IP, UDP probes go to wrong address.
|
||||||
|
|
||||||
|
2. **DOCKER-USER ACCEPT rules missing** — DNAT'd traffic hits Docker's FORWARD
|
||||||
|
DROP policy, never reaches kind node.
|
||||||
|
|
||||||
|
3. **DNAT rule position wrong** — Docker's `ADDRTYPE LOCAL` rule in PREROUTING
|
||||||
|
catches traffic to loopback IPs before our DNAT rules. Must use `-I
|
||||||
|
PREROUTING 1`.
|
||||||
|
|
||||||
|
4. **mia-sw01 egress-vrf route with interface specified** — silently fails in
|
||||||
|
EOS (accepted in config, never installed in RIB). Must use nexthop-only form.
|
||||||
|
|
||||||
|
5. **was-sw01 Loopback101 absorbing traffic** — local delivery instead of
|
||||||
|
forwarding to mia-sw01 via backbone.
|
||||||
|
|
@ -0,0 +1,190 @@
|
||||||
|
# Post-Mortem: Ashburn Relay Outbound Path Failure
|
||||||
|
|
||||||
|
**Date resolved**: 2026-03-10
|
||||||
|
**Duration of impact**: Unknown — likely since firewalld was enabled (post-reboot
|
||||||
|
2026-03-09 ~21:24 UTC). The relay worked before this with firewalld disabled.
|
||||||
|
**Symptoms**: Validator CrashLoopBackOff on ip_echo port reachability check.
|
||||||
|
Entrypoint never receives the validator's outbound TCP connection, so it can't
|
||||||
|
verify UDP port reachability and the validator refuses to start.
|
||||||
|
|
||||||
|
## Timeline
|
||||||
|
|
||||||
|
### Session d02959a7 (2026-03-06 to 2026-03-08)
|
||||||
|
|
||||||
|
Initial relay infrastructure build-out. Multi-day effort across three repos.
|
||||||
|
|
||||||
|
1. **Validator deployed**, replaying at 0.24 slots/sec. RTT between Miami and
|
||||||
|
peers (~150ms per repair round-trip) identified as the bottleneck. Ashburn
|
||||||
|
relay identified as the fix.
|
||||||
|
|
||||||
|
2. **GRE tunnel created** (gre-ashburn: biscayne 186.233.184.235 ↔ mia-sw01
|
||||||
|
209.42.167.137). Tunnel100 on mia-sw01 in VRF relay. Policy routing with
|
||||||
|
fwmark 0x64 routes validator traffic through the tunnel.
|
||||||
|
|
||||||
|
3. **Inbound path debugged end-to-end**:
|
||||||
|
- Cross-VRF routing on mia-sw01 investigated (egress-vrf route form, hardware
|
||||||
|
FIB programming, TCAM profile).
|
||||||
|
- GRE decapsulation on biscayne verified (kernel source read to understand
|
||||||
|
ip_tunnel_lookup matching logic).
|
||||||
|
- **DOCKER chain drop rule found**: Docker's FORWARD chain only had ACCEPT
|
||||||
|
for TCP 6443/443/80. DNAT'd relay UDP was dropped. Fix: DOCKER-USER
|
||||||
|
ACCEPT rules for UDP 8001 and 9000-9025.
|
||||||
|
- Inbound UDP relay test passed (kelce → was-sw01 → mia-sw01 → Tunnel100 →
|
||||||
|
biscayne → DNAT → kind node).
|
||||||
|
|
||||||
|
4. **Outbound path partially verified**: Relay test scripts confirmed TCP and
|
||||||
|
UDP traffic from the kind container exits via gre-ashburn with correct SNAT.
|
||||||
|
But the **validator's own ip_echo check was never end-to-end verified** with
|
||||||
|
a successful startup. The validator entered CrashLoopBackOff after the
|
||||||
|
DOCKER-USER fix for unrelated reasons (monitoring container crashes, log path
|
||||||
|
issues).
|
||||||
|
|
||||||
|
5. **Ashburn relay checklist** written at `docs/ashburn-relay-checklist.md` —
|
||||||
|
7 layers covering the full path. All items remained unchecked.
|
||||||
|
|
||||||
|
### Session 0b5908a4 (2026-03-09)
|
||||||
|
|
||||||
|
Container rebuild, graceful shutdown implementation, ZFS upgrade, storage
|
||||||
|
migration. The validator was **running and catching up from a ~5,649 slot gap**,
|
||||||
|
confirming the relay was working. Then:
|
||||||
|
|
||||||
|
- io_uring/ZFS deadlock from ungraceful shutdown (ZFS 2.2.2, fixed in 2.2.8+)
|
||||||
|
- Reboot required to clear zombie processes
|
||||||
|
- **Firewalld was enabled/started on the reboot** (previously disabled)
|
||||||
|
|
||||||
|
### Session cc6c8c55 (2026-03-10, this session)
|
||||||
|
|
||||||
|
User asked to review session d02959a7 to confirm the ip_echo problem was
|
||||||
|
actually solved. It wasn't.
|
||||||
|
|
||||||
|
1. **ip_echo preflight tool written** (`scripts/agave-container/ip_echo_preflight.py`)
|
||||||
|
— reimplements the Solana ip_echo client protocol in Python, called from
|
||||||
|
`entrypoint.py` before snapshot download. Tested successfully against live
|
||||||
|
entrypoints from the host.
|
||||||
|
|
||||||
|
2. **Tested from kind netns** — TCP to entrypoint:8001 returns "No route to
|
||||||
|
host". Mangle PREROUTING counter increments (marking works) but SNAT
|
||||||
|
POSTROUTING counter stays at 0 (packets never reach POSTROUTING).
|
||||||
|
|
||||||
|
3. **Misdiagnoses**:
|
||||||
|
- `src_valid_mark=0` suspected as root cause. Set to 1, no change. The
|
||||||
|
`ip route get X from Y mark Z` command was misleading — it simulates
|
||||||
|
locally-originated traffic, not forwarded. The correct test is
|
||||||
|
`ip route get X from Y iif <iface> mark Z`, which showed routing works.
|
||||||
|
- Firewalld nftables backend not setting `src_valid_mark` was a red herring.
|
||||||
|
|
||||||
|
4. **Root cause found**: Firewalld's nftables `filter_FORWARD` chain (priority
|
||||||
|
filter+10) rejects forwarded traffic between interfaces not in known zones.
|
||||||
|
Docker bridges and gre-ashburn were not in any firewalld zone. The chain's
|
||||||
|
`filter_FORWARD_POLICIES` only had rules for eno1, eno2, and mesh.
|
||||||
|
Traffic from br-cf46a62ab5b2 to gre-ashburn fell through to
|
||||||
|
`reject with icmpx admin-prohibited`.
|
||||||
|
|
||||||
|
```
|
||||||
|
# The reject that was killing outbound relay traffic:
|
||||||
|
chain filter_FORWARD {
|
||||||
|
...
|
||||||
|
jump filter_FORWARD_POLICIES
|
||||||
|
reject with icmpx admin-prohibited ← packets from unknown interfaces
|
||||||
|
}
|
||||||
|
```
|
||||||
|
|
||||||
|
5. **Fix applied**:
|
||||||
|
- Docker bridges (br-cf46a62ab5b2, docker0, br-4fb6f6795448) → `docker` zone
|
||||||
|
- gre-ashburn → `trusted` zone
|
||||||
|
- New `docker-to-relay` policy: docker → trusted, ACCEPT
|
||||||
|
- All permanent (`firewall-cmd --permanent` + reload)
|
||||||
|
|
||||||
|
6. **Verified**: ip_echo from kind netns returns `seen_ip=137.239.194.65
|
||||||
|
shred_version=50093`. Full outbound path works.
|
||||||
|
|
||||||
|
## Root Cause
|
||||||
|
|
||||||
|
**Firewalld was enabled on biscayne after a reboot. Its nftables FORWARD chain
|
||||||
|
rejected forwarded traffic from Docker bridges to gre-ashburn because neither
|
||||||
|
interface was assigned to a firewalld zone.**
|
||||||
|
|
||||||
|
The relay worked before because firewalld was disabled. The iptables rules
|
||||||
|
(mangle marks, SNAT, DNAT, DOCKER-USER) operated without interference. When
|
||||||
|
firewalld was enabled, its nftables filter_FORWARD chain (priority filter+10)
|
||||||
|
added a second layer of forwarding policy enforcement that the iptables rules
|
||||||
|
couldn't bypass.
|
||||||
|
|
||||||
|
### Why Docker outbound to the internet still worked
|
||||||
|
|
||||||
|
Docker's outbound traffic to eno1 was accepted by firewalld because eno1 IS in
|
||||||
|
the `public` zone. The `filter_FWD_public_allow` chain has `oifname "eno1"
|
||||||
|
accept`. Only traffic to gre-ashburn (not in any zone) was rejected.
|
||||||
|
|
||||||
|
### Why iptables rules alone weren't enough
|
||||||
|
|
||||||
|
Linux netfilter processes hooks in priority order. At the FORWARD hook:
|
||||||
|
|
||||||
|
1. **Priority filter (0)**: iptables `FORWARD` chain — Docker's DOCKER-USER
|
||||||
|
and DOCKER-FORWARD chains. These accept the traffic.
|
||||||
|
2. **Priority filter+10**: nftables `filter_FORWARD` chain — firewalld's zone
|
||||||
|
policies. These reject the traffic if interfaces aren't in known zones.
|
||||||
|
|
||||||
|
Both chains must accept for the packet to pass. The iptables acceptance at
|
||||||
|
priority 0 is overridden by the nftables rejection at priority filter+10.
|
||||||
|
|
||||||
|
## Architecture After Fix
|
||||||
|
|
||||||
|
Firewalld manages forwarding policy. Iptables handles Docker-specific rules
|
||||||
|
that firewalld can't replace (DNAT ordering, DOCKER-USER chain, mangle marks,
|
||||||
|
SNAT). Both coexist because they operate at different netfilter priorities.
|
||||||
|
|
||||||
|
```
|
||||||
|
Firewalld (permanent, survives reboots):
|
||||||
|
docker zone: br-cf46a62ab5b2, docker0, br-4fb6f6795448
|
||||||
|
trusted zone: mesh, gre-ashburn
|
||||||
|
docker-forwarding policy: ANY → docker, ACCEPT (existing)
|
||||||
|
docker-to-relay policy: docker → trusted, ACCEPT (new)
|
||||||
|
|
||||||
|
Systemd service (ashburn-relay.service, After=docker+firewalld):
|
||||||
|
GRE tunnel creation (iproute2)
|
||||||
|
Ashburn IP on loopback (iproute2)
|
||||||
|
DNAT rules at PREROUTING position 1 (iptables, before Docker's chain)
|
||||||
|
DOCKER-USER ACCEPT rules (iptables, for Docker's FORWARD chain)
|
||||||
|
Mangle marks for policy routing (iptables)
|
||||||
|
SNAT for marked traffic (iptables)
|
||||||
|
ip rule + ip route for ashburn table (iproute2)
|
||||||
|
```
|
||||||
|
|
||||||
|
## Lessons
|
||||||
|
|
||||||
|
1. **Firewalld with nftables backend and Docker iptables coexist but don't
|
||||||
|
coordinate.** Adding an interface that Docker uses to forward traffic
|
||||||
|
requires explicitly assigning it to a firewalld zone. Docker's iptables
|
||||||
|
ACCEPT is necessary but not sufficient.
|
||||||
|
|
||||||
|
2. **`ip route get X from Y mark Z` is misleading for forwarded traffic.**
|
||||||
|
It simulates local origination and fails on source address validation. Use
|
||||||
|
`ip route get X from Y iif <iface> mark Z` to simulate forwarded packets.
|
||||||
|
This wasted significant debugging time.
|
||||||
|
|
||||||
|
3. **SNAT counter = 0 means packets die before POSTROUTING, but the cause
|
||||||
|
could be in either the routing decision OR a filter chain between PREROUTING
|
||||||
|
and POSTROUTING.** The nftables filter_FORWARD chain was invisible when only
|
||||||
|
checking iptables rules.
|
||||||
|
|
||||||
|
4. **The validator passed ip_echo and ran successfully before.** That prior
|
||||||
|
success was the strongest evidence that the infrastructure was correct and
|
||||||
|
something changed. The change was firewalld being enabled.
|
||||||
|
|
||||||
|
## Related Documents
|
||||||
|
|
||||||
|
- `docs/ashburn-relay-checklist.md` — 7-layer checklist for relay verification
|
||||||
|
- `docs/bug-ashburn-tunnel-port-filtering.md` — prior DOCKER chain drop bug
|
||||||
|
- `.claude/skills/biscayne-relay-debugging/SKILL.md` — debugging skill
|
||||||
|
- `playbooks/ashburn-relay-biscayne.yml` — migrated playbook (firewalld + iptables)
|
||||||
|
- `scripts/agave-container/ip_echo_preflight.py` — preflight diagnostic tool
|
||||||
|
|
||||||
|
## Related Sessions
|
||||||
|
|
||||||
|
- `d02959a7-2ec6-4d27-8326-1bc4aaf3ebf1` (2026-03-06): Initial relay build,
|
||||||
|
DOCKER-USER fix, inbound path verified, outbound not end-to-end verified
|
||||||
|
- `0b5908a4-eff7-46de-9024-a11440bd68a8` (2026-03-09): Relay working (validator
|
||||||
|
catching up), then reboot introduced firewalld
|
||||||
|
- `cc6c8c55-fb4c-4482-b161-332ddf175300` (2026-03-10): Root cause found and
|
||||||
|
fixed (firewalld zone assignment)
|
||||||
|
|
@ -0,0 +1,249 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""ip_echo preflight — verify UDP port reachability before starting the validator.
|
||||||
|
|
||||||
|
Implements the Solana ip_echo client protocol exactly:
|
||||||
|
1. Bind UDP sockets on the ports the validator will use
|
||||||
|
2. TCP connect to entrypoint gossip port, send IpEchoServerMessage
|
||||||
|
3. Parse IpEchoServerResponse (our IP as seen by entrypoint)
|
||||||
|
4. Wait for entrypoint's UDP probes on each port
|
||||||
|
5. Exit 0 if all ports reachable, exit 1 if any fail
|
||||||
|
|
||||||
|
Wire format (from agave net-utils/src/):
|
||||||
|
Request: 4 null bytes + [u16; 4] tcp_ports LE + [u16; 4] udp_ports LE + \n
|
||||||
|
Response: 4 null bytes + bincode IpAddr (u32 LE variant tag + addr octets) + optional shred_version
|
||||||
|
|
||||||
|
Called from entrypoint.py before snapshot download. Prevents wasting hours
|
||||||
|
downloading a snapshot only to crash-loop on port reachability.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import struct
|
||||||
|
import sys
|
||||||
|
import threading
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Module-level logger; handlers and level are configured in main().
log = logging.getLogger("ip_echo_preflight")

# Four zero bytes that prefix both the request and the response.
HEADER = b"\x00\x00\x00\x00"
# Newline byte that terminates the request.
TERMINUS = b"\x0a"
# Max bytes read for the response: 4 header + 4 variant tag + 16 (IPv6
# address) + 1 Option tag + 2 shred_version.
RESPONSE_BUF = 27
# Timeout in seconds applied to the TCP socket (connect, send, recv).
IO_TIMEOUT = 5.0
# Seconds to wait for UDP probes after the TCP exchange completes.
PROBE_TIMEOUT = 10.0
# Number of attempts made by run_preflight().
MAX_RETRIES = 3
# Seconds slept between attempts.
RETRY_DELAY = 2.0
|
||||||
|
|
||||||
|
|
||||||
|
def build_request(tcp_ports: list[int], udp_ports: list[int]) -> bytes:
    """Serialize an IpEchoServerMessage for the wire.

    Layout: HEADER, four little-endian u16 TCP ports, four little-endian
    u16 UDP ports, TERMINUS. Each port list is zero-padded to four entries;
    entries beyond the fourth are dropped.
    """
    padding = [0, 0, 0, 0]
    tcp_slots = (tcp_ports + padding)[:4]
    udp_slots = (udp_ports + padding)[:4]
    # One pack call for all eight slots; "<" means no alignment padding, so
    # the bytes equal two consecutive "<4H" packs.
    body = struct.pack("<8H", *tcp_slots, *udp_slots)
    return HEADER + body + TERMINUS
|
||||||
|
|
||||||
|
|
||||||
|
def parse_response(data: bytes) -> tuple[str, int | None]:
    """Decode an IpEchoServerResponse into (ip_string, shred_version | None).

    Wire format (bincode):
      4 bytes     header (\\0\\0\\0\\0)
      4 bytes     IpAddr enum variant (u32 LE: 0=IPv4, 1=IPv6)
      4|16 bytes  address octets
      1 byte      Option tag (0=None, 1=Some)
      2 bytes     shred_version (u16 LE, only if Some)

    Raises:
        ValueError: the payload is shorter than 8 bytes, looks like an HTTP
            reply, has a wrong header, has an unknown variant, or is cut off
            before the end of the address.
    """
    if len(data) < 8:
        raise ValueError(f"response too short: {len(data)} bytes")

    magic = data[:4]
    if magic == b"HTTP":
        raise ValueError("got HTTP response — not an ip_echo server")
    if magic != HEADER:
        raise ValueError(f"unexpected header: {data[:4].hex()}")

    (variant,) = struct.unpack_from("<I", data, 4)
    if variant not in (0, 1):
        raise ValueError(f"unknown IpAddr variant: {variant}")

    if variant == 0:  # IPv4
        addr_end = 12
        if len(data) < addr_end:
            raise ValueError(f"IPv4 response truncated: {len(data)} bytes")
        ip = socket.inet_ntoa(data[8:addr_end])
    else:  # IPv6
        addr_end = 24
        if len(data) < addr_end:
            raise ValueError(f"IPv6 response truncated: {len(data)} bytes")
        ip = socket.inet_ntop(socket.AF_INET6, data[8:addr_end])

    # Whatever follows the address is the optional shred_version; it is only
    # decoded when the Option tag is 1 and both value bytes are present.
    tail = data[addr_end:]
    has_shred = len(tail) >= 3 and tail[0] == 1
    shred_version = struct.unpack_from("<H", tail, 1)[0] if has_shred else None
    return ip, shred_version
|
||||||
|
|
||||||
|
|
||||||
|
def _listen_udp(port: int, results: dict, stop: threading.Event) -> None:
    """Bind a UDP socket on *port* and wait for a single probe packet.

    Intended to run in its own thread. Outcomes are reported through
    *results*, keyed by port:

      * ``("ok", addr)`` — a datagram arrived; *addr* is the sender address
        returned by ``recvfrom``.
      * ``("bind_error", message)`` — an ``OSError`` was raised while setting
        up or reading the socket (the label is kept for compatibility even
        though a receive error lands here too).
      * no entry — *stop* was set before any datagram arrived.

    Args:
        port: UDP port to bind on 0.0.0.0.
        results: shared dict the outcome is written into.
        stop: event polled between receive timeouts; setting it ends the wait.
    """
    try:
        # The context manager closes the socket on every exit path, including
        # a failed setsockopt/bind, which would otherwise leak the descriptor.
        with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as sock:
            sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            sock.bind(("0.0.0.0", port))
            # Short timeout so the stop event is re-checked twice a second.
            sock.settimeout(0.5)
            while not stop.is_set():
                try:
                    _data, addr = sock.recvfrom(64)
                except socket.timeout:
                    continue
                results[port] = ("ok", addr)
                return
    except OSError as exc:
        results[port] = ("bind_error", str(exc))
|
||||||
|
|
||||||
|
|
||||||
|
def ip_echo_check(
    entrypoint_host: str,
    entrypoint_port: int,
    udp_ports: list[int],
) -> tuple[str, dict[int, bool]]:
    """Run one ip_echo exchange and return (seen_ip, {port: reachable}).

    Starts one UDP listener thread per port, sends the ip_echo request over
    TCP, parses the response, then waits up to PROBE_TIMEOUT seconds for a
    probe on every port.

    Args:
        entrypoint_host: host of the entrypoint to contact.
        entrypoint_port: TCP port of the entrypoint.
        udp_ports: ports to probe; zeros are dropped and at most the first
            four remaining ports are used.

    Returns:
        The IP address reported by the entrypoint and a mapping of each
        tested port to True (probe received) or False (no probe, or the
        listener reported an error).

    Raises:
        OSError: the TCP connect/send/recv failed (caller retries).
        ValueError: the response could not be parsed.
    """
    udp_ports = [p for p in udp_ports if p != 0][:4]

    # Start UDP listeners before sending the TCP request
    results: dict[int, tuple] = {}
    stop = threading.Event()
    threads = []
    for port in udp_ports:
        t = threading.Thread(target=_listen_udp, args=(port, results, stop), daemon=True)
        t.start()
        threads.append(t)

    try:
        time.sleep(0.1)  # let listeners bind

        # TCP: send request, read response (at most RESPONSE_BUF bytes)
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
            sock.settimeout(IO_TIMEOUT)
            sock.connect((entrypoint_host, entrypoint_port))
            sock.sendall(build_request([], udp_ports))
            resp = sock.recv(RESPONSE_BUF)

        seen_ip, shred_version = parse_response(resp)
        log.info(
            "entrypoint %s:%d sees us as %s (shred_version=%s)",
            entrypoint_host, entrypoint_port, seen_ip, shred_version,
        )

        # Wait for UDP probes; stop early once every port has an outcome
        deadline = time.monotonic() + PROBE_TIMEOUT
        while time.monotonic() < deadline:
            if all(p in results for p in udp_ports):
                break
            time.sleep(0.2)
    finally:
        # Always stop the listeners, including when the TCP exchange or the
        # parse raised. Otherwise the threads keep their sockets bound and
        # compete with the listeners started by the caller's next retry.
        stop.set()
        for t in threads:
            t.join(timeout=1)

    port_ok: dict[int, bool] = {}
    for port in udp_ports:
        if port not in results:
            log.error("port %d: no probe received within %.0fs", port, PROBE_TIMEOUT)
            port_ok[port] = False
        else:
            status, detail = results[port]
            if status == "ok":
                log.info("port %d: probe received from %s", port, detail)
                port_ok[port] = True
            else:
                log.error("port %d: %s: %s", port, status, detail)
                port_ok[port] = False

    return seen_ip, port_ok
|
||||||
|
|
||||||
|
|
||||||
|
def run_preflight(
    entrypoint_host: str,
    entrypoint_port: int,
    udp_ports: list[int],
    expected_ip: str = "",
) -> bool:
    """Repeat the ip_echo check up to MAX_RETRIES times.

    An attempt passes only when the exchange completes, the address seen by
    the entrypoint equals *expected_ip* (skipped when *expected_ip* is empty)
    and every tested port received a probe. Failed attempts are followed by a
    RETRY_DELAY pause, except the last one.

    Returns:
        True as soon as one attempt passes, False once all attempts failed.
    """
    for attempt in range(1, MAX_RETRIES + 1):
        log.info("ip_echo attempt %d/%d → %s:%d, ports %s",
                 attempt, MAX_RETRIES, entrypoint_host, entrypoint_port, udp_ports)
        try:
            seen_ip, port_ok = ip_echo_check(entrypoint_host, entrypoint_port, udp_ports)
        except Exception as exc:
            log.error("attempt %d TCP failed: %s", attempt, exc)
        else:
            if expected_ip and seen_ip != expected_ip:
                log.error(
                    "IP MISMATCH: entrypoint sees %s, expected %s (GOSSIP_HOST). "
                    "Outbound mangle/SNAT path is broken.",
                    seen_ip, expected_ip,
                )
            else:
                reachable = [p for p, ok in port_ok.items() if ok]
                unreachable = [p for p, ok in port_ok.items() if not ok]
                if not unreachable:
                    log.info("PASS: all ports reachable %s, seen as %s", reachable, seen_ip)
                    return True
                log.error(
                    "attempt %d: unreachable %s, reachable %s, seen as %s",
                    attempt, unreachable, reachable, seen_ip,
                )

        # Every failure path lands here; pause unless this was the last try.
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)

    log.error("FAIL: ip_echo preflight exhausted %d attempts", MAX_RETRIES)
    return False
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Command-line entry point; returns the process exit code.

    Configuration:
      * VALIDATOR_ENTRYPOINT (or, if unset, the first CLI argument):
        "host:port" or just "host" (port then defaults to 8001).
      * GOSSIP_PORT: UDP gossip port, default 8001.
      * DYNAMIC_PORT_RANGE: "start-end", default "9000-10000"; only the
        start is used.
      * GOSSIP_HOST: if set, the IP the entrypoint is expected to see.

    Returns:
        0 if the preflight passed, 1 on failure or invalid configuration.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s [%(name)s] %(message)s",
        datefmt="%H:%M:%S",
    )

    # Parse entrypoint — VALIDATOR_ENTRYPOINT is "host:port"
    raw = os.environ.get("VALIDATOR_ENTRYPOINT", "")
    if not raw and len(sys.argv) > 1:
        raw = sys.argv[1]
    if not raw:
        log.error("set VALIDATOR_ENTRYPOINT or pass host:port as argument")
        return 1

    # Non-numeric ports are reported like any other configuration error
    # instead of escaping as an uncaught ValueError traceback.
    try:
        if ":" in raw:
            host, port_str = raw.rsplit(":", 1)
            ep_port = int(port_str)
        else:
            host = raw
            ep_port = 8001

        gossip_port = int(os.environ.get("GOSSIP_PORT", "8001"))
        dynamic_range = os.environ.get("DYNAMIC_PORT_RANGE", "9000-10000")
        range_start = int(dynamic_range.split("-")[0])
    except ValueError as exc:
        log.error("invalid port configuration: %s", exc)
        return 1
    expected_ip = os.environ.get("GOSSIP_HOST", "")

    # Test gossip + dynamic-range offsets 0, 2 and 3 (4 max per ip_echo
    # message). With the defaults that is 8001, 9000, 9002, 9003.
    # NOTE(review): offset +1 is deliberately not probed here — presumably it
    # is not a UDP port the validator verifies; confirm against the validator.
    udp_ports = [gossip_port, range_start, range_start + 2, range_start + 3]

    ok = run_preflight(host, ep_port, udp_ports, expected_ip)
    return 0 if ok else 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
Loading…
Reference in New Issue