feat: rolling incremental snapshot download loop

After the full snapshot downloads, continuously re-probe all fast sources
for newer incrementals until the best available is within convergence_slots
(default 500) of head. Each iteration finds the highest-slot incremental
matching our full snapshot's base slot, downloads it (replacing any previous),
and checks the gap to mainnet head.

- Extract probe_incremental() from inline re-probe code
- Add convergence_slots param to download_best_snapshot() (default 500)
- Add --convergence-slots CLI arg
- Pass SNAPSHOT_CONVERGENCE_SLOTS env var from entrypoint.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix/kind-mount-propagation
A. F. Dudley 2026-03-09 05:33:47 +00:00
parent bd38c1b791
commit bfde58431e
2 changed files with 122 additions and 33 deletions

View File

@@ -158,7 +158,8 @@ def maybe_download_snapshot(snapshots_dir: str) -> None:
sys.path.insert(0, str(script_dir))
from snapshot_download import download_best_snapshot
ok = download_best_snapshot(snapshots_dir)
convergence = int(env("SNAPSHOT_CONVERGENCE_SLOTS", "500"))
ok = download_best_snapshot(snapshots_dir, convergence_slots=convergence)
if not ok:
log.error("Snapshot download failed — starting without fresh snapshot")

View File

@@ -343,6 +343,61 @@ def measure_speed(rpc_address: str, measure_time: int = 7) -> float:
return 0.0
# -- Incremental probing -------------------------------------------------------
def probe_incremental(
    fast_sources: list[SnapshotSource],
    full_snap_slot: int,
) -> tuple[str | None, list[str]]:
    """Probe fast sources for the best incremental matching full_snap_slot.

    Issues one HEAD request per source, keeps the incremental whose base
    slot equals *full_snap_slot* and whose own slot is highest (i.e. closest
    to head), then builds a mirror list from every source that advertised
    that same filename.

    Args:
        fast_sources: Candidate RPC sources to probe.
        full_snap_slot: Base slot of the already-downloaded full snapshot;
            an incremental is only usable if its base slot matches.

    Returns:
        (filename, mirror_urls) for the best match — the winning source's
        URL first, followed by any other sources serving the same file —
        or (None, []) if no source advertises a matching incremental.
    """
    best_filename: str | None = None
    best_slot: int = 0
    best_source: SnapshotSource | None = None
    best_path: str | None = None
    # Cache each source's probe result (rpc_address -> (filename, path)) so
    # the mirror pass below reuses it instead of issuing a second round of
    # HEAD requests — the original re-probed every source, doubling network
    # calls and risking the advertised filename changing between passes.
    probed: dict[str, tuple[str, str]] = {}
    for source in fast_sources:
        inc_url: str = f"http://{source.rpc_address}/incremental-snapshot.tar.bz2"
        inc_location, _ = head_no_follow(inc_url, timeout=2)
        if not inc_location:
            continue
        inc_fn, inc_fp = _parse_snapshot_filename(inc_location)
        m: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
        if not m:
            continue
        if int(m.group(1)) != full_snap_slot:
            log.debug(" %s: incremental base slot %s != full %d, skipping",
                      source.rpc_address, m.group(1), full_snap_slot)
            continue
        # Only slot-matching incrementals enter the cache, so the mirror
        # pass below needs just a filename-equality check.
        probed[source.rpc_address] = (inc_fn, inc_fp)
        inc_slot: int = int(m.group(2))
        if inc_slot > best_slot:
            best_slot = inc_slot
            best_filename = inc_fn
            best_source = source
            best_path = inc_fp
    if best_filename is None or best_source is None or best_path is None:
        return None, []
    # Winning source first, then every other source advertising the same
    # filename (the filename encodes both slots, so equality implies the
    # mirror serves the identical snapshot).
    mirror_urls: list[str] = [f"http://{best_source.rpc_address}{best_path}"]
    for other in fast_sources:
        if other.rpc_address == best_source.rpc_address:
            continue
        entry = probed.get(other.rpc_address)
        if entry is not None and entry[0] == best_filename:
            mirror_urls.append(f"http://{other.rpc_address}{entry[1]}")
    return best_filename, mirror_urls
# -- Download ------------------------------------------------------------------
@@ -423,6 +478,7 @@ def download_best_snapshot(
max_speed_checks: int = 15,
version_filter: str | None = None,
full_only: bool = False,
convergence_slots: int = 500,
) -> bool:
"""Download the best available snapshot to output_dir.
@@ -534,46 +590,75 @@ def download_best_snapshot(
log.error("Failed to download %s", filename)
return False
# After full snapshot download, re-probe for a fresh incremental.
# After full snapshot download, rolling incremental download loop.
# The initial incremental is stale by now (full download takes 10+ min).
# Re-probe repeatedly until we find one close enough to head.
if not full_only:
# Get the full snapshot slot from the filename we just downloaded
full_filename: str = full_downloads[0][0]
fm_post: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
if fm_post:
full_snap_slot: int = int(fm_post.group(1))
log.info("Re-probing for fresh incremental based on slot %d...", full_snap_slot)
inc_downloaded: bool = False
for source in fast_sources:
inc_url_re: str = f"http://{source.rpc_address}/incremental-snapshot.tar.bz2"
inc_location, _ = head_no_follow(inc_url_re, timeout=2)
if not inc_location:
continue
inc_fn, inc_fp = _parse_snapshot_filename(inc_location)
log.info("Rolling incremental download (base slot %d, convergence %d slots)...",
full_snap_slot, convergence_slots)
prev_inc_filename: str | None = None
while True:
inc_fn, inc_mirrors = probe_incremental(fast_sources, full_snap_slot)
if inc_fn is None:
if prev_inc_filename is None:
log.error("No matching incremental found for base slot %d "
"— validator will replay from full snapshot", full_snap_slot)
else:
log.info("No newer incremental available, using %s", prev_inc_filename)
break
# Parse the incremental slot from the filename
m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
if not m_inc:
assert m_inc is not None # probe_incremental already validated
inc_slot: int = int(m_inc.group(2))
# Check convergence against current mainnet slot
head_slot: int | None = get_current_slot(resolved_rpc)
if head_slot is None:
log.warning("Cannot get current slot — downloading best available incremental")
gap: int = convergence_slots + 1 # force download, then break
else:
gap = head_slot - inc_slot
# Skip download if we already have this exact incremental
if inc_fn == prev_inc_filename:
if gap <= convergence_slots:
log.info("Incremental %s already downloaded (gap %d slots, converged)", inc_fn, gap)
break
log.info("No newer incremental yet (slot %d, gap %d slots), waiting...",
inc_slot, gap)
time.sleep(10)
continue
if int(m_inc.group(1)) != full_snap_slot:
log.debug(" %s: incremental base slot %s != full %d, skipping",
source.rpc_address, m_inc.group(1), full_snap_slot)
continue
# Found a matching incremental — build mirror list and download
inc_mirrors: list[str] = [f"http://{source.rpc_address}{inc_fp}"]
for other in fast_sources:
if other.rpc_address == source.rpc_address:
continue
other_loc, _ = head_no_follow(
f"http://{other.rpc_address}/incremental-snapshot.tar.bz2", timeout=2)
if other_loc:
other_fn, other_fp = _parse_snapshot_filename(other_loc)
if other_fn == inc_fn:
inc_mirrors.append(f"http://{other.rpc_address}{other_fp}")
log.info(" Found incremental %s (%d mirrors)", inc_fn, len(inc_mirrors))
if download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
inc_downloaded = True
break
if not inc_downloaded:
log.info("No matching incremental found — validator will replay from full snapshot")
# Delete previous incremental before downloading the new one
if prev_inc_filename is not None:
old_path: Path = Path(output_dir) / prev_inc_filename
if old_path.exists():
log.info("Removing superseded incremental %s", prev_inc_filename)
old_path.unlink()
log.info("Downloading incremental %s (%d mirrors, slot %d, gap %d slots)",
inc_fn, len(inc_mirrors), inc_slot, gap)
if not download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
log.error("Failed to download incremental %s", inc_fn)
break
prev_inc_filename = inc_fn
if gap <= convergence_slots:
log.info("Converged: incremental slot %d is %d slots behind head", inc_slot, gap)
break
if head_slot is None:
break
log.info("Not converged (gap %d > %d), re-probing in 10s...", gap, convergence_slots)
time.sleep(10)
total_elapsed: float = time.monotonic() - total_start
log.info("All downloads complete in %.0fs", total_elapsed)
@@ -615,6 +700,8 @@ def main() -> int:
help="Max nodes to benchmark before giving up (default: 15)")
p.add_argument("--version", default=None,
help="Filter nodes by version prefix (e.g. '2.2')")
p.add_argument("--convergence-slots", type=int, default=500,
help="Max slot gap for incremental convergence (default: 500)")
p.add_argument("--full-only", action="store_true",
help="Download only full snapshot, skip incremental")
p.add_argument("--dry-run", action="store_true",
@@ -669,6 +756,7 @@ def main() -> int:
max_speed_checks=args.max_speed_checks,
version_filter=args.version,
full_only=args.full_only,
convergence_slots=args.convergence_slots,
)
if ok and args.post_cmd: