fix: DOCKER-USER rules for inbound relay, add UDP test playbooks

Root cause: Docker FORWARD chain policy DROP blocked all DNAT'd relay
traffic (UDP/TCP 8001, UDP 9000-9025) to the kind node. The DOCKER
chain only ACCEPTs specific TCP ports (6443, 443, 80). Added ACCEPT
rules in DOCKER-USER chain which runs before all Docker chains.

Changes:
- ashburn-relay-biscayne.yml: add DOCKER-USER ACCEPT rules (inbound
  tag) and rollback cleanup
- ashburn-relay-setup.sh.j2: persist DOCKER-USER rules across reboot
- relay-inbound-udp-test.yml: controlled e2e test — listener in kind
  netns, sender from kelce, assert arrival
- relay-link-test.yml: link-by-link tcpdump captures at each hop
- relay-test-udp-listen.py, relay-test-udp-send.py: test helpers
- relay-test-ip-echo.py: full ip_echo protocol test
- inventory/kelce.yml, inventory/panic.yml: test host inventories
- test-ashburn-relay.sh: add ip_echo UDP reachability test

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix/kind-mount-propagation
A. F. Dudley 2026-03-08 02:43:31 +00:00
parent 496c7982cb
commit 05f9acf8a0
10 changed files with 470 additions and 1 deletions

View File

@ -0,0 +1,6 @@
# Inventory for kelce, the host that sends the probe in
# relay-inbound-udp-test.yml.
all:
  hosts:
    kelce:
      ansible_host: kelce
      ansible_user: rix
      ansible_python_interpreter: /usr/bin/python3

View File

@ -0,0 +1,7 @@
# Inventory for panic, the host that sends the probes in
# relay-link-test.yml.
all:
  hosts:
    panic:
      ansible_host: panic
      ansible_user: rix
      ansible_become: false
      ansible_python_interpreter: /usr/bin/python3

View File

@ -87,6 +87,20 @@
executable: /bin/bash executable: /bin/bash
changed_when: false changed_when: false
- name: Remove DOCKER-USER relay rules
  ansible.builtin.shell:
    executable: /bin/bash
    cmd: |
      set -o pipefail
      # Best-effort rollback: deleting a rule that is already gone is
      # not an error, so every delete is allowed to fail quietly.
      for rule in \
        "-p udp -d {{ kind_node_ip }} --dport {{ gossip_port }} -j ACCEPT" \
        "-p tcp -d {{ kind_node_ip }} --dport {{ gossip_port }} -j ACCEPT" \
        "-p udp -d {{ kind_node_ip }} --dport {{ dynamic_port_range_start }}:{{ dynamic_port_range_end }} -j ACCEPT" \
      ; do
        # $rule is intentionally unquoted so it splits into iptables args.
        iptables -D DOCKER-USER $rule 2>/dev/null || true
      done
  changed_when: false
- name: Remove outbound mangle rules - name: Remove outbound mangle rules
ansible.builtin.shell: ansible.builtin.shell:
cmd: | cmd: |
@ -253,6 +267,36 @@
var: dnat_result.stdout_lines var: dnat_result.stdout_lines
tags: [inbound] tags: [inbound]
- name: Allow DNAT'd relay traffic through DOCKER-USER
  ansible.builtin.shell:
    cmd: |
      # -e is required: without it a failed `iptables -I` would be
      # followed by a successful `echo "added"`, and the task would
      # report a change that never happened.
      set -euo pipefail
      # Docker's FORWARD chain drops traffic to bridge networks unless
      # explicitly accepted. DOCKER-USER runs first and is the correct
      # place for user rules. These ACCEPT rules let DNAT'd relay
      # traffic reach the kind node (kind_node_ip).
      for rule in \
        "-p udp -d {{ kind_node_ip }} --dport {{ gossip_port }} -j ACCEPT" \
        "-p tcp -d {{ kind_node_ip }} --dport {{ gossip_port }} -j ACCEPT" \
        "-p udp -d {{ kind_node_ip }} --dport {{ dynamic_port_range_start }}:{{ dynamic_port_range_end }} -j ACCEPT" \
      ; do
        # $rule is intentionally unquoted so it splits into iptables args.
        # The -C check sits in an `if` condition, so its non-zero exit
        # (rule absent) does not trip `set -e`.
        if iptables -C DOCKER-USER $rule 2>/dev/null; then
          echo "exists: $rule"
        else
          iptables -I DOCKER-USER 1 $rule
          echo "added: $rule"
        fi
      done
    executable: /bin/bash
  register: forward_result
  # Only report "changed" when at least one rule was actually inserted.
  changed_when: "'added: ' in forward_result.stdout"
  tags: [inbound]

- name: Show DOCKER-USER result
  ansible.builtin.debug:
    var: forward_result.stdout_lines
  tags: [inbound]
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Outbound: fwmark + SNAT + policy routing via new tunnel # Outbound: fwmark + SNAT + policy routing via new tunnel
# ------------------------------------------------------------------ # ------------------------------------------------------------------

View File

@ -35,6 +35,21 @@ for rule in \
fi fi
done done
# FORWARD path: let DNAT'd relay traffic through Docker's FORWARD chain.
# Docker drops traffic to bridge networks unless explicitly accepted.
# DOCKER-USER runs before all Docker chains and survives daemon restarts.
# Each rule is inserted only if an identical one is not already present.
for rule in \
    "-p udp -d {{ kind_node_ip }} --dport {{ gossip_port }} -j ACCEPT" \
    "-p tcp -d {{ kind_node_ip }} --dport {{ gossip_port }} -j ACCEPT" \
    "-p udp -d {{ kind_node_ip }} --dport {{ dynamic_port_range_start }}:{{ dynamic_port_range_end }} -j ACCEPT" \
; do
    # $rule is intentionally unquoted so it splits into iptables args.
    iptables -C DOCKER-USER $rule 2>/dev/null \
        || iptables -I DOCKER-USER 1 $rule
done
# Outbound mangle (fwmark for policy routing) # Outbound mangle (fwmark for policy routing)
# sport rules: gossip/repair/TVU traffic FROM validator well-known ports # sport rules: gossip/repair/TVU traffic FROM validator well-known ports
# dport rule: ip_echo TCP TO entrypoint port 8001 (ephemeral sport, # dport rule: ip_echo TCP TO entrypoint port 8001 (ephemeral sport,

View File

@ -0,0 +1,95 @@
---
# Test inbound UDP through the Ashburn relay.
#
# Sends a UDP packet from kelce to 137.239.194.65:8001 and checks
# whether it arrives inside the kind node's network namespace.
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml -i inventory/kelce.yml \
#     playbooks/relay-inbound-udp-test.yml
#
- name: Inbound UDP relay test — listener
  hosts: biscayne
  gather_facts: false
  become: true
  vars:
    relay_ip: 137.239.194.65
    gossip_port: 8001
    kind_node: laconic-70ce4c4b47e23b85-control-plane
  tasks:
    - name: Copy listener script
      ansible.builtin.copy:
        src: ../scripts/relay-test-udp-listen.py
        dest: /tmp/relay-test-udp-listen.py
        mode: "0755"

    # No shell features are needed here, so use command rather than shell.
    - name: Get kind node PID
      ansible.builtin.command:
        cmd: >-
          docker inspect --format '{%raw%}{{.State.Pid}}{%endraw%}' {{ kind_node }}
      register: kind_pid_result
      changed_when: false

    - name: Set kind PID fact
      ansible.builtin.set_fact:
        kind_pid: "{{ kind_pid_result.stdout | trim }}"

    # Fail early with a clear message instead of letting nsenter fail
    # on an empty or zero PID (container not running).
    - name: Verify kind node PID is usable
      ansible.builtin.assert:
        that:
          - kind_pid | int > 0
        fail_msg: "kind node {{ kind_node }} has no running PID (got '{{ kind_pid }}')"

    # Runs in the background (poll: 0); the job id in listener_result is
    # picked up by the "collect results" play. The script's own timeout
    # (15s) is shorter than the async limit (20s).
    - name: Start UDP listener in kind netns
      ansible.builtin.command:
        cmd: >-
          nsenter --net --target {{ kind_pid }}
          python3 /tmp/relay-test-udp-listen.py {{ gossip_port }} 15
      register: listener_result
      async: 20
      poll: 0

    - name: Wait for listener to bind
      ansible.builtin.pause:
        seconds: 2
# Second play: push the sender script to kelce and fire one UDP probe
# at the relay address.
- name: Inbound UDP relay test — sender
  hosts: kelce
  gather_facts: false
  vars:
    gossip_port: 8001
    relay_ip: 137.239.194.65
  tasks:
    - name: Copy sender script
      ansible.builtin.copy:
        dest: /tmp/relay-test-udp-send.py
        src: ../scripts/relay-test-udp-send.py
        mode: "0755"

    - name: Send UDP probe to relay IP
      ansible.builtin.command:
        cmd: >-
          python3 /tmp/relay-test-udp-send.py
          {{ relay_ip }} {{ gossip_port }}
      changed_when: false
      register: send_result

    - name: Show send result
      ansible.builtin.debug:
        var: send_result.stdout
# Third play: wait for the background listener started in the first
# play and assert that it saw the probe.
- name: Inbound UDP relay test — collect results
  hosts: biscayne
  gather_facts: false
  become: true
  tasks:
    - name: Wait for listener to complete
      ansible.builtin.async_status:
        jid: "{{ listener_result.ansible_job_id }}"
      register: listener_final
      until: listener_final.finished | default(false)
      retries: 10
      delay: 2
      # The listener exits non-zero on timeout. Do not fail here, so the
      # assert below can report the outcome with a descriptive message.
      failed_when: false

    - name: Show listener result
      ansible.builtin.debug:
        var: listener_final.stdout

    - name: Assert UDP arrived
      ansible.builtin.assert:
        that:
          - "'OK' in (listener_final.stdout | default(''))"
        fail_msg: "Inbound UDP did not arrive at kind node: {{ listener_final.stdout | default('') }}"
        success_msg: "Inbound UDP reached kind node: {{ listener_final.stdout | default('') }}"

View File

@ -0,0 +1,135 @@
---
# Link-by-link test for inbound UDP through the Ashburn relay.
#
# Tests whether a UDP packet sent from panic to 137.239.194.65:8001
# arrives at each hop along the inbound path:
#   1. biscayne gre-ashburn (post-tunnel decap)
#   2. biscayne DNAT counter
#   3. biscayne Docker bridge (kind_bridge)
#   4. kind node network namespace
#
# Usage:
#   ansible-playbook -i inventory/biscayne.yml -i inventory/panic.yml \
#     playbooks/relay-link-test.yml
#
- name: Link test — start captures on biscayne
  hosts: biscayne
  gather_facts: false
  become: true
  vars:
    relay_ip: 137.239.194.65
    gossip_port: 8001
    kind_node: laconic-70ce4c4b47e23b85-control-plane
    # Bridge interface to capture on; override with -e kind_bridge=...
    # if the bridge name differs on this host.
    kind_bridge: br-cf46a62ab5b2
    panic_ip: 166.84.136.68
  tasks:
    - name: Get kind node PID
      ansible.builtin.command:
        cmd: >-
          docker inspect --format '{%raw%}{{.State.Pid}}{%endraw%}' {{ kind_node }}
      register: kind_pid_result
      changed_when: false

    # Fail early with a clear message instead of letting nsenter fail
    # on an empty or zero PID (container not running).
    - name: Verify kind node PID is usable
      ansible.builtin.assert:
        that:
          - kind_pid_result.stdout | trim | int > 0
        fail_msg: "kind node {{ kind_node }} has no running PID (got '{{ kind_pid_result.stdout | trim }}')"

    # First column of `iptables -L -v` is the packet counter.
    - name: Get DNAT counter before
      ansible.builtin.shell:
        cmd: >-
          iptables -t nat -L PREROUTING -v -n | grep 'udp dpt:{{ gossip_port }}' | awk '{print $1}'
      register: dnat_before
      changed_when: false

    # Each capture runs in the background (poll: 0), stops after one
    # matching packet (-c 1) or after 15s, and writes to a file that the
    # "collect results" play reads back.
    - name: Start tcpdump on gre-ashburn
      ansible.builtin.shell:
        cmd: >-
          timeout 15 tcpdump -c 1 -nn -i gre-ashburn
          'src host {{ panic_ip }} and udp dst port {{ gossip_port }}'
          > /tmp/link-test-gre.txt 2>&1
      async: 20
      poll: 0
      register: tcpdump_gre

    - name: Start tcpdump on bridge
      ansible.builtin.shell:
        cmd: >-
          timeout 15 tcpdump -c 1 -nn -i {{ kind_bridge }}
          'udp dst port {{ gossip_port }}'
          > /tmp/link-test-br.txt 2>&1
      async: 20
      poll: 0
      register: tcpdump_br

    - name: Start tcpdump in kind netns
      ansible.builtin.shell:
        cmd: >-
          nsenter --net --target {{ kind_pid_result.stdout | trim }}
          timeout 15 tcpdump -c 1 -nn -i eth0
          'udp dst port {{ gossip_port }}'
          > /tmp/link-test-kind.txt 2>&1
      async: 20
      poll: 0
      register: tcpdump_kind

    - name: Wait for captures to start
      ansible.builtin.pause:
        seconds: 2
# Second play: send the probes from panic while the captures are running.
- name: Link test — send from panic
  hosts: panic
  gather_facts: false
  vars:
    relay_ip: 137.239.194.65
    gossip_port: 8001
  tasks:
    # The (sendto, sleep) tuple forces both calls to run on every
    # iteration. `sendto(...) or time.sleep(1)` never sleeps, because
    # sendto returns the (truthy) number of bytes sent.
    - name: Send 3 UDP probes with 1s interval
      ansible.builtin.raw: >-
        python3 -c "import socket,time;
        s=socket.socket(socket.AF_INET,socket.SOCK_DGRAM);
        [(s.sendto(b'PROBE',('{{ relay_ip }}',{{ gossip_port }})),time.sleep(1)) for _ in range(3)];
        print('OK sent 3 probes to {{ relay_ip }}:{{ gossip_port }}');
        s.close()"
      register: send_result
      changed_when: false

    - name: Show send result
      ansible.builtin.debug:
        var: send_result.stdout
# Third play: wait for the background captures started in the first
# play, then report what each hop saw.
- name: Link test — collect results
  hosts: biscayne
  gather_facts: false
  become: true
  vars:
    gossip_port: 8001
  tasks:
    # Wait on the actual capture jobs instead of sleeping a fixed time,
    # so the files are only read once tcpdump has exited. A capture that
    # saw nothing ends via `timeout` with a non-zero status; that is a
    # valid test outcome, so it must not fail this task.
    - name: Wait for captures to finish
      ansible.builtin.async_status:
        jid: "{{ item }}"
      loop:
        - "{{ tcpdump_gre.ansible_job_id }}"
        - "{{ tcpdump_br.ansible_job_id }}"
        - "{{ tcpdump_kind.ansible_job_id }}"
      register: capture_wait
      until: capture_wait.finished | default(false)
      retries: 10
      delay: 2
      failed_when: false

    # First column of `iptables -L -v` is the packet counter.
    - name: Get DNAT counter after
      ansible.builtin.shell:
        cmd: >-
          iptables -t nat -L PREROUTING -v -n | grep 'udp dpt:{{ gossip_port }}' | awk '{print $1}'
      register: dnat_after
      changed_when: false

    - name: Read gre-ashburn capture
      ansible.builtin.command:
        cmd: cat /tmp/link-test-gre.txt
      register: cap_gre
      changed_when: false

    - name: Read bridge capture
      ansible.builtin.command:
        cmd: cat /tmp/link-test-br.txt
      register: cap_br
      changed_when: false

    - name: Read kind netns capture
      ansible.builtin.command:
        cmd: cat /tmp/link-test-kind.txt
      register: cap_kind
      changed_when: false

    - name: Report results
      ansible.builtin.debug:
        msg: |
          === Link-by-link results ===
          DNAT counter: {{ dnat_before.stdout }} → {{ dnat_after.stdout }}
          --- gre-ashburn ---
          {{ cap_gre.stdout }}
          --- bridge ---
          {{ cap_br.stdout }}
          --- kind netns ---
          {{ cap_kind.stdout }}

View File

@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""Full ip_echo protocol test with UDP probe listener.

Sends the correct ip_echo protocol message to a Solana entrypoint,
which triggers the entrypoint to probe our UDP ports. Then listens
for those probe datagrams to verify inbound UDP reachability.

Protocol (from agave source):
  Request:  4 null bytes + bincode(IpEchoServerMessage) + '\n'
  Response: 4 null bytes + bincode(IpEchoServerResponse)
  IpEchoServerMessage  { tcp_ports: [u16; 4], udp_ports: [u16; 4] }
  IpEchoServerResponse { address: IpAddr, shred_version: Option<u16> }

The entrypoint sends a single [0] byte to peer_addr.ip() on each
non-zero UDP port, then responds AFTER all probes complete (5s timeout).

Usage: relay-test-ip-echo.py [ENTRYPOINT_IP] [GOSSIP_PORT]

Exit status is 0 only when at least one UDP probe was received.
"""
import socket
import struct
import sys
import threading

DEFAULT_ENTRYPOINT_IP = "34.83.231.102"
DEFAULT_GOSSIP_PORT = 8001

HEADER = b"\x00" * 4
# Smallest response from which an IPv4 address can be parsed:
# header + u32 enum tag + 4 address bytes.
MIN_V4_RESPONSE = len(HEADER) + 4 + 4


def build_request(udp_port):
    """Return the ip_echo request asking for a UDP probe on `udp_port`.

    bincode encodes [u16; 4] as four little-endian u16 values with no
    length prefix. No TCP probes are requested.
    """
    tcp_ports = struct.pack("<4H", 0, 0, 0, 0)
    udp_ports = struct.pack("<4H", udp_port, 0, 0, 0)
    return HEADER + tcp_ports + udp_ports + b"\n"


def read_response(sock, want=MIN_V4_RESPONSE):
    """Read from `sock` until `want` bytes have arrived or the peer closes.

    A single recv() may return only part of the response, so keep reading
    until there is enough to parse the address. If the socket times out
    after at least the header has arrived, the partial data is returned
    instead of raising, so the caller can still report it.
    """
    resp = b""
    while len(resp) < want:
        try:
            chunk = sock.recv(256)
        except socket.timeout:
            if len(resp) >= len(HEADER):
                break
            raise
        if not chunk:
            break
        resp += chunk
    return resp


def describe_response(resp):
    """Return the status lines to print for a raw ip_echo response."""
    if len(resp) < len(HEADER):
        return [f"ERROR incomplete response: {len(resp)} bytes: {resp.hex()}"]
    lines = [f"OK ip_echo response: {len(resp)} bytes: {resp.hex()}"]
    if len(resp) >= MIN_V4_RESPONSE:
        # bincode IpAddr: enum tag (u32) followed by the address bytes.
        payload = resp[len(HEADER):]
        ip_enum = struct.unpack("<I", payload[:4])[0]
        if ip_enum == 0:  # V4
            lines.append(f"OK entrypoint sees us as {socket.inet_ntoa(payload[4:8])}")
        else:
            lines.append(f"OK ip_enum={ip_enum} (IPv6?)")
    return lines


def udp_listener(port, received, ready, errors):
    """Collect UDP datagrams arriving on `port` into `received`.

    Sets `ready` once the bind attempt has finished (successfully or
    not); a bind failure is appended to `errors`. Stops after 10 seconds
    without a datagram.
    """
    us = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        us.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            us.bind(("0.0.0.0", port))
        except OSError as exc:
            errors.append(exc)
            return
        finally:
            ready.set()
        us.settimeout(10)
        while True:
            data, addr = us.recvfrom(64)
            received.append((data, addr))
            print(f"UDP PROBE received: {len(data)} bytes from {addr[0]}:{addr[1]}")
    except socket.timeout:
        pass
    finally:
        us.close()


def main(argv):
    """Run the ip_echo test; return the process exit status."""
    entrypoint_ip = argv[1] if len(argv) > 1 else DEFAULT_ENTRYPOINT_IP
    gossip_port = int(argv[2]) if len(argv) > 2 else DEFAULT_GOSSIP_PORT

    message = build_request(gossip_port)
    print(f"Connecting to {entrypoint_ip}:{gossip_port} for ip_echo")
    print(f"Request: {message.hex()} ({len(message)} bytes)")

    # Start the UDP listener BEFORE sending ip_echo, and wait until it
    # has actually bound so no probe can arrive before the socket exists.
    udp_received = []
    bind_errors = []
    bound = threading.Event()
    listener = threading.Thread(
        target=udp_listener,
        args=(gossip_port, udp_received, bound, bind_errors),
        daemon=True,
    )
    listener.start()
    bound.wait(timeout=2)
    if bind_errors:
        print(f"ERROR cannot bind UDP port {gossip_port}: {bind_errors[0]}")
        return 1

    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.settimeout(15)  # entrypoint probes take up to 5s each
    try:
        s.connect((entrypoint_ip, gossip_port))
        print(f"OK TCP connected to {entrypoint_ip}:{gossip_port}")
        s.sendall(message)
        print("OK ip_echo request sent, waiting for probes + response...")
        # The response comes AFTER the entrypoint's probes complete.
        for line in describe_response(read_response(s)):
            print(line)
    except socket.timeout:
        print("TIMEOUT waiting for ip_echo response")
        return 1
    except ConnectionRefusedError:
        print(f"REFUSED by {entrypoint_ip}:{gossip_port}")
        return 1
    except Exception as e:
        print(f"ERROR {e}")
        return 1
    finally:
        s.close()

    # Give the listener a moment to record probes still in flight.
    listener.join(timeout=2)

    print(f"\nUDP probes received: {len(udp_received)}")
    if udp_received:
        print("OK inbound UDP reachability CONFIRMED")
        return 0
    print("FAIL no UDP probes received — inbound UDP is broken")
    return 1


if __name__ == "__main__":
    sys.exit(main(sys.argv))

View File

@ -0,0 +1,22 @@
#!/usr/bin/env python3
"""Wait for one UDP datagram on a port and report where it came from.

Usage: relay-test-udp-listen.py [PORT] [TIMEOUT_SECONDS]

Prints an "OK ..." line for the first datagram received, or a
"TIMEOUT ..." line and exits with status 1 if none arrives in time.
"""
import socket
import sys

args = sys.argv[1:]
PORT = int(args[0]) if len(args) >= 1 else 8001
TIMEOUT = int(args[1]) if len(args) >= 2 else 15

# The context manager closes the socket on every exit path.
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
    s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    s.bind(("0.0.0.0", PORT))
    s.settimeout(TIMEOUT)
    # Flushed immediately so a caller watching stdout sees the bind
    # before any packet arrives.
    print(f"LISTENING on UDP {PORT}", flush=True)
    try:
        data, addr = s.recvfrom(256)
    except socket.timeout:
        print("TIMEOUT no UDP received")
        sys.exit(1)
    print(f"OK {len(data)} bytes from {addr[0]}:{addr[1]}: {data!r}")

View File

@ -0,0 +1,12 @@
#!/usr/bin/env python3
"""Fire a single UDP probe datagram at a target host and port.

Usage: relay-test-udp-send.py [HOST] [PORT]
"""
import socket
import sys

args = sys.argv[1:]
HOST = args[0] if len(args) >= 1 else "137.239.194.65"
PORT = int(args[1]) if len(args) >= 2 else 8001

# The context manager closes the socket once the probe has been sent.
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
    s.sendto(b"PROBE", (HOST, PORT))
    print(f"OK sent 5 bytes to {HOST}:{PORT}")

View File

@ -59,7 +59,7 @@ run_test() {
shift shift
ansible biscayne -i "$BISCAYNE_INV" -m ansible.builtin.shell \ ansible biscayne -i "$BISCAYNE_INV" -m ansible.builtin.shell \
-a "nsenter --net --target $KIND_PID python3 /tmp/$name $*" \ -a "nsenter --net --target $KIND_PID python3 /tmp/$name $*" \
--become 2>&1 | grep -E '^OK|^TIMEOUT|^ERROR|^REFUSED|^NOTE' || echo "NO OUTPUT" --become 2>&1 | grep -E '^OK|^TIMEOUT|^ERROR|^REFUSED|^NOTE|^FAIL' || echo "NO OUTPUT"
} }
echo "=== Ashburn Relay End-to-End Test ===" echo "=== Ashburn Relay End-to-End Test ==="
@ -102,6 +102,23 @@ else
fi fi
echo "" echo ""
# Test 4: ip_echo UDP reachability — the actual validator startup check.
# Speaks the ip_echo protocol to the entrypoint, which then probes our
# UDP port. A broken path here is what causes CrashLoopBackOff.
# Exercises:
#   outbound TCP dport 8001 (mangle mark → tunnel → SNAT)
#   inbound UDP dport 8001 (was-sw01 → backbone → mia-sw01 → tunnel → DNAT)
echo "--- Test 4: ip_echo UDP reachability (inbound UDP probe) ---"
result=$(run_test relay-test-ip-echo.py 34.83.231.102 "$GOSSIP_PORT")
if grep -q "^OK inbound UDP" <<<"$result"; then
    pass "ip_echo UDP reachability: $result"
elif grep -q "^OK" <<<"$result"; then
    # Some OK lines but no confirmation: TCP worked, no UDP probe arrived.
    fail "ip_echo partial — no inbound UDP: $result"
else
    fail "ip_echo: $result"
fi
echo ""
# Summary # Summary
echo "=== Results: $PASS passed, $FAIL failed ===" echo "=== Results: $PASS passed, $FAIL failed ==="
if [[ $FAIL -gt 0 ]]; then if [[ $FAIL -gt 0 ]]; then