feat: snapshot leapfrog — auto-recovery when validator falls behind

Entrypoint changes:
- Always require full + incremental before starting (retry until found)
- Check incremental freshness against convergence threshold (500 slots)
- Gap monitor thread: if validator falls >5000 slots behind for 3
  consecutive checks, graceful stop + restart with fresh incremental
- cmd_serve is now a loop: download → run → monitor → leapfrog → repeat
- --no-snapshot-fetch moved to common args (both RPC and validator modes)
- --maximum-full-snapshots-to-retain default 1 (validator deletes
  downloaded full after generating its own)
- SNAPSHOT_MAX_AGE_SLOTS default 100000 (one full snapshot generation)

snapshot_download.py refactoring:
- Extract _discover_and_benchmark() and _rolling_incremental_download()
  as shared helpers
- Restore download_incremental_for_slot() using shared helpers (downloads
  only an incremental for an existing full snapshot)
- download_best_snapshot() uses shared helpers, downloads full then
  incremental as separate operations

The leapfrog cycle: validator generates full snapshots at standard 100k
block height intervals (same slots as the rest of the network). When the
gap monitor triggers, the entrypoint loops back to maybe_download_snapshot
which finds the validator's local full, downloads a fresh network
incremental (generated every ~40s, converges within the ~11hr full
generation window), and restarts.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
fix/kind-mount-propagation
A. F. Dudley 2026-03-10 05:53:56 +00:00
parent cd36bfe5ee
commit 3bf87a2e9b
2 changed files with 415 additions and 212 deletions

View File

@ -43,10 +43,13 @@ SNAPSHOTS_DIR = "/data/snapshots"
LOG_DIR = "/data/log" LOG_DIR = "/data/log"
IDENTITY_FILE = f"{CONFIG_DIR}/validator-identity.json" IDENTITY_FILE = f"{CONFIG_DIR}/validator-identity.json"
# Snapshot filename pattern # Snapshot filename patterns
FULL_SNAP_RE: re.Pattern[str] = re.compile( FULL_SNAP_RE: re.Pattern[str] = re.compile(
r"^snapshot-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$" r"^snapshot-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$"
) )
# Matches "incremental-snapshot-<base>-<slot>-<hash>.tar.zst|.tar.bz2";
# group(1) = base full-snapshot slot, group(2) = incremental slot.
INCR_SNAP_RE: re.Pattern[str] = re.compile(
    r"^incremental-snapshot-(\d+)-(\d+)-[A-Za-z0-9]+\.tar\.(zst|bz2)$"
)
MAINNET_RPC = "https://api.mainnet-beta.solana.com" MAINNET_RPC = "https://api.mainnet-beta.solana.com"
@ -124,51 +127,89 @@ def clean_snapshots(snapshots_dir: str) -> None:
entry.unlink(missing_ok=True) entry.unlink(missing_ok=True)
def get_incremental_slot(snapshots_dir: str, full_slot: int | None) -> int | None:
    """Return the newest incremental snapshot slot based on *full_slot*.

    Scans *snapshots_dir* for filenames matching INCR_SNAP_RE, keeps only
    those whose base slot (group 1) equals *full_slot*, and returns the
    maximum incremental slot (group 2).

    Returns None when *full_slot* is None, when the directory does not
    exist, or when no matching incremental is present.
    """
    if full_slot is None:
        return None
    snap_dir = Path(snapshots_dir)
    if not snap_dir.is_dir():
        return None
    matches = (INCR_SNAP_RE.match(entry.name) for entry in snap_dir.iterdir())
    candidate_slots = [
        int(m.group(2))
        for m in matches
        if m is not None and int(m.group(1)) == full_slot
    ]
    return max(candidate_slots, default=None)
def maybe_download_snapshot(snapshots_dir: str) -> None: def maybe_download_snapshot(snapshots_dir: str) -> None:
"""Check snapshot freshness and download if needed. """Ensure full + incremental snapshots exist before starting.
The validator should always start from a full + incremental pair to
minimize replay time. If either is missing or the full is too old,
download fresh ones via download_best_snapshot (which does rolling
incremental convergence after downloading the full).
Controlled by env vars: Controlled by env vars:
SNAPSHOT_AUTO_DOWNLOAD (default: true) enable/disable SNAPSHOT_AUTO_DOWNLOAD (default: true) enable/disable
SNAPSHOT_MAX_AGE_SLOTS (default: 20000) staleness threshold SNAPSHOT_MAX_AGE_SLOTS (default: 100000) full snapshot staleness threshold
(one full snapshot generation, ~11 hours)
""" """
if not env_bool("SNAPSHOT_AUTO_DOWNLOAD", default=True): if not env_bool("SNAPSHOT_AUTO_DOWNLOAD", default=True):
log.info("Snapshot auto-download disabled") log.info("Snapshot auto-download disabled")
return return
max_age = int(env("SNAPSHOT_MAX_AGE_SLOTS", "20000")) max_age = int(env("SNAPSHOT_MAX_AGE_SLOTS", "100000"))
# Get mainnet current slot
mainnet_slot = rpc_get_slot(MAINNET_RPC) mainnet_slot = rpc_get_slot(MAINNET_RPC)
if mainnet_slot is None: if mainnet_slot is None:
log.warning("Cannot reach mainnet RPC — skipping snapshot check") log.warning("Cannot reach mainnet RPC — skipping snapshot check")
return return
# Check local snapshot
local_slot = get_local_snapshot_slot(snapshots_dir)
if local_slot is not None:
age = mainnet_slot - local_slot
log.info("Local snapshot at slot %d (mainnet: %d, age: %d slots)",
local_slot, mainnet_slot, age)
if age <= max_age:
log.info("Snapshot is fresh enough (age %d <= %d), skipping download", age, max_age)
return
log.info("Snapshot is stale (age %d > %d), downloading fresh", age, max_age)
else:
log.info("No local snapshot found, downloading")
# Clean old snapshots before downloading
clean_snapshots(snapshots_dir)
# Import and call snapshot download
# snapshot_download.py is installed alongside this file in /usr/local/bin/
script_dir = Path(__file__).resolve().parent script_dir = Path(__file__).resolve().parent
sys.path.insert(0, str(script_dir)) sys.path.insert(0, str(script_dir))
from snapshot_download import download_best_snapshot from snapshot_download import download_best_snapshot, download_incremental_for_slot
convergence = int(env("SNAPSHOT_CONVERGENCE_SLOTS", "500")) convergence = int(env("SNAPSHOT_CONVERGENCE_SLOTS", "500"))
ok = download_best_snapshot(snapshots_dir, convergence_slots=convergence) retry_delay = int(env("SNAPSHOT_RETRY_DELAY", "60"))
if not ok:
log.error("Snapshot download failed — starting without fresh snapshot") # Check local full snapshot
local_slot = get_local_snapshot_slot(snapshots_dir)
have_fresh_full = (local_slot is not None
and (mainnet_slot - local_slot) <= max_age)
if have_fresh_full:
assert local_slot is not None
inc_slot = get_incremental_slot(snapshots_dir, local_slot)
if inc_slot is not None:
inc_gap = mainnet_slot - inc_slot
if inc_gap <= convergence:
log.info("Full (slot %d) + incremental (slot %d, gap %d) "
"within convergence, starting",
local_slot, inc_slot, inc_gap)
return
log.info("Incremental too stale (slot %d, gap %d > %d)",
inc_slot, inc_gap, convergence)
# Fresh full, need a fresh incremental
log.info("Downloading incremental for full at slot %d", local_slot)
while True:
if download_incremental_for_slot(snapshots_dir, local_slot,
convergence_slots=convergence):
return
log.warning("Incremental download failed — retrying in %ds",
retry_delay)
time.sleep(retry_delay)
# No full or full too old — download both
log.info("Downloading full + incremental")
clean_snapshots(snapshots_dir)
while True:
if download_best_snapshot(snapshots_dir, convergence_slots=convergence):
return
log.warning("Snapshot download failed — retrying in %ds", retry_delay)
time.sleep(retry_delay)
# -- Directory and identity setup ---------------------------------------------- # -- Directory and identity setup ----------------------------------------------
@ -230,6 +271,7 @@ def build_common_args() -> list[str]:
"--no-os-network-limits-test", "--no-os-network-limits-test",
"--wal-recovery-mode", "skip_any_corrupted_record", "--wal-recovery-mode", "skip_any_corrupted_record",
"--limit-ledger-size", env("LIMIT_LEDGER_SIZE", "50000000"), "--limit-ledger-size", env("LIMIT_LEDGER_SIZE", "50000000"),
"--no-snapshot-fetch", # entrypoint handles snapshot download
] ]
# Snapshot generation # Snapshot generation
@ -238,7 +280,7 @@ def build_common_args() -> list[str]:
else: else:
args += [ args += [
"--full-snapshot-interval-slots", env("SNAPSHOT_INTERVAL_SLOTS", "100000"), "--full-snapshot-interval-slots", env("SNAPSHOT_INTERVAL_SLOTS", "100000"),
"--maximum-full-snapshots-to-retain", env("MAXIMUM_SNAPSHOTS_TO_RETAIN", "5"), "--maximum-full-snapshots-to-retain", env("MAXIMUM_SNAPSHOTS_TO_RETAIN", "1"),
] ]
if env("NO_INCREMENTAL_SNAPSHOTS") != "true": if env("NO_INCREMENTAL_SNAPSHOTS") != "true":
args += ["--maximum-incremental-snapshots-to-retain", "2"] args += ["--maximum-incremental-snapshots-to-retain", "2"]
@ -309,7 +351,6 @@ def build_rpc_args() -> list[str]:
"--rpc-pubsub-enable-block-subscription", "--rpc-pubsub-enable-block-subscription",
"--enable-extended-tx-metadata-storage", "--enable-extended-tx-metadata-storage",
"--no-wait-for-vote-to-start-leader", "--no-wait-for-vote-to-start-leader",
"--no-snapshot-fetch",
] ]
# Public vs private RPC # Public vs private RPC
@ -379,7 +420,7 @@ def append_extra_args(args: list[str]) -> list[str]:
GRACEFUL_EXIT_TIMEOUT = 270 GRACEFUL_EXIT_TIMEOUT = 270
def graceful_exit(child: subprocess.Popen[bytes]) -> None: def graceful_exit(child: subprocess.Popen[bytes], reason: str = "SIGTERM") -> None:
"""Request graceful shutdown via the admin RPC Unix socket. """Request graceful shutdown via the admin RPC Unix socket.
Runs ``agave-validator exit --force --ledger /data/ledger`` which connects Runs ``agave-validator exit --force --ledger /data/ledger`` which connects
@ -390,7 +431,7 @@ def graceful_exit(child: subprocess.Popen[bytes]) -> None:
If the admin RPC exit fails or the child doesn't exit within the timeout, If the admin RPC exit fails or the child doesn't exit within the timeout,
falls back to SIGTERM then SIGKILL. falls back to SIGTERM then SIGKILL.
""" """
log.info("SIGTERM received — requesting graceful exit via admin RPC") log.info("%s — requesting graceful exit via admin RPC", reason)
try: try:
result = subprocess.run( result = subprocess.run(
["agave-validator", "exit", "--force", "--ledger", LEDGER_DIR], ["agave-validator", "exit", "--force", "--ledger", LEDGER_DIR],
@ -437,16 +478,69 @@ def graceful_exit(child: subprocess.Popen[bytes]) -> None:
# -- Serve subcommand --------------------------------------------------------- # -- Serve subcommand ---------------------------------------------------------
def cmd_serve() -> None: def _gap_monitor(
"""Main serve flow: snapshot check, setup, run agave-validator as child. child: subprocess.Popen[bytes],
leapfrog: threading.Event,
shutting_down: threading.Event,
) -> None:
"""Background thread: poll slot gap and trigger leapfrog if too far behind.
Python stays as PID 1 and traps SIGTERM to perform graceful shutdown Waits for a grace period (SNAPSHOT_MONITOR_GRACE, default 600s) before
via the admin RPC Unix socket. monitoring the validator needs time to extract snapshots and catch up.
Then polls every SNAPSHOT_MONITOR_INTERVAL (default 30s). If the gap
exceeds SNAPSHOT_LEAPFROG_SLOTS (default 5000) for SNAPSHOT_LEAPFROG_CHECKS
(default 3) consecutive checks, triggers graceful shutdown and sets the
leapfrog event so cmd_serve loops back to download a fresh incremental.
"""
threshold = int(env("SNAPSHOT_LEAPFROG_SLOTS", "5000"))
required_checks = int(env("SNAPSHOT_LEAPFROG_CHECKS", "3"))
interval = int(env("SNAPSHOT_MONITOR_INTERVAL", "30"))
grace = int(env("SNAPSHOT_MONITOR_GRACE", "600"))
rpc_port = env("RPC_PORT", "8899")
local_url = f"http://127.0.0.1:{rpc_port}"
# Grace period — don't monitor during initial catch-up
if shutting_down.wait(grace):
return
consecutive = 0
while not shutting_down.is_set():
local_slot = rpc_get_slot(local_url, timeout=5)
mainnet_slot = rpc_get_slot(MAINNET_RPC, timeout=10)
if local_slot is not None and mainnet_slot is not None:
gap = mainnet_slot - local_slot
if gap > threshold:
consecutive += 1
log.warning("Gap %d > %d (%d/%d consecutive)",
gap, threshold, consecutive, required_checks)
if consecutive >= required_checks:
log.warning("Leapfrog triggered: gap %d", gap)
leapfrog.set()
graceful_exit(child, reason="Leapfrog")
return
else:
if consecutive > 0:
log.info("Gap %d within threshold, resetting counter", gap)
consecutive = 0
shutting_down.wait(interval)
def cmd_serve() -> None:
"""Main serve flow: snapshot download, run validator, monitor gap, leapfrog.
Python stays as PID 1. On each iteration:
1. Download full + incremental snapshots (if needed)
2. Start agave-validator as child process
3. Monitor slot gap in background thread
4. If gap exceeds threshold → graceful stop → loop back to step 1 5. If SIGTERM → graceful stop → exit
6. If validator crashes → exit with its return code
"""
""" """
mode = env("AGAVE_MODE", "test") mode = env("AGAVE_MODE", "test")
log.info("AGAVE_MODE=%s", mode) log.info("AGAVE_MODE=%s", mode)
# Test mode dispatches to start-test.sh
if mode == "test": if mode == "test":
os.execvp("start-test.sh", ["start-test.sh"]) os.execvp("start-test.sh", ["start-test.sh"])
@ -454,47 +548,66 @@ def cmd_serve() -> None:
log.error("Unknown AGAVE_MODE: %s (valid: test, rpc, validator)", mode) log.error("Unknown AGAVE_MODE: %s (valid: test, rpc, validator)", mode)
sys.exit(1) sys.exit(1)
# Ensure directories # One-time setup
dirs = [CONFIG_DIR, LEDGER_DIR, ACCOUNTS_DIR, SNAPSHOTS_DIR] dirs = [CONFIG_DIR, LEDGER_DIR, ACCOUNTS_DIR, SNAPSHOTS_DIR]
if mode == "rpc": if mode == "rpc":
dirs.append(LOG_DIR) dirs.append(LOG_DIR)
ensure_dirs(*dirs) ensure_dirs(*dirs)
# Snapshot freshness check and auto-download if not env_bool("SKIP_IP_ECHO_PREFLIGHT"):
maybe_download_snapshot(SNAPSHOTS_DIR) script_dir = Path(__file__).resolve().parent
sys.path.insert(0, str(script_dir))
from ip_echo_preflight import main as ip_echo_main
if ip_echo_main() != 0:
sys.exit(1)
# Identity setup
if mode == "rpc": if mode == "rpc":
ensure_identity_rpc() ensure_identity_rpc()
print_identity() print_identity()
# Build args
if mode == "rpc": if mode == "rpc":
args = build_rpc_args() args = build_rpc_args()
else: else:
args = build_validator_args() args = build_validator_args()
args = append_extra_args(args) args = append_extra_args(args)
# Write startup timestamp for probe grace period # Main loop: download → run → monitor → leapfrog if needed
Path("/tmp/entrypoint-start").write_text(str(time.time())) while True:
maybe_download_snapshot(SNAPSHOTS_DIR)
log.info("Starting agave-validator with %d arguments", len(args)) Path("/tmp/entrypoint-start").write_text(str(time.time()))
child = subprocess.Popen(["agave-validator"] + args) log.info("Starting agave-validator with %d arguments", len(args))
child = subprocess.Popen(["agave-validator"] + args)
# Forward SIGUSR1 to child (log rotation) shutting_down = threading.Event()
signal.signal(signal.SIGUSR1, lambda _sig, _frame: child.send_signal(signal.SIGUSR1)) leapfrog = threading.Event()
# Trap SIGTERM — run graceful_exit in a thread so the signal handler returns signal.signal(signal.SIGUSR1,
# immediately and child.wait() in the main thread can observe the exit. lambda _sig, _frame: child.send_signal(signal.SIGUSR1))
def _on_sigterm(_sig: int, _frame: object) -> None:
threading.Thread(target=graceful_exit, args=(child,), daemon=True).start()
signal.signal(signal.SIGTERM, _on_sigterm) def _on_sigterm(_sig: int, _frame: object) -> None:
shutting_down.set()
threading.Thread(
target=graceful_exit, args=(child,), daemon=True,
).start()
# Wait for child — if it exits on its own (crash, normal exit), propagate code signal.signal(signal.SIGTERM, _on_sigterm)
child.wait()
sys.exit(child.returncode) # Start gap monitor
monitor = threading.Thread(
target=_gap_monitor,
args=(child, leapfrog, shutting_down),
daemon=True,
)
monitor.start()
child.wait()
if leapfrog.is_set():
log.info("Leapfrog: restarting with fresh incremental")
continue
sys.exit(child.returncode)
# -- Probe subcommand --------------------------------------------------------- # -- Probe subcommand ---------------------------------------------------------

View File

@ -461,9 +461,214 @@ def download_aria2c(
return True return True
# -- Shared helpers ------------------------------------------------------------
def _discover_and_benchmark(
    rpc_url: str,
    current_slot: int,
    *,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    threads: int = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
) -> list[SnapshotSource]:
    """Discover snapshot sources and benchmark their download speed.

    Candidates come from discover_sources(); they are ordered by latency
    and up to *max_speed_checks* of them are speed-tested. Only sources
    reaching *min_download_speed* MiB/s are returned, in the latency
    order they were checked.
    """
    candidates: list[SnapshotSource] = discover_sources(
        rpc_url, current_slot,
        max_age_slots=max_snapshot_age,
        max_latency_ms=max_latency,
        threads=threads,
        version_filter=version_filter,
    )
    if not candidates:
        return []

    # Benchmark the lowest-latency candidates first.
    candidates.sort(key=lambda src: src.latency_ms)

    log.info("Benchmarking download speed on top %d sources...", max_speed_checks)
    floor_bytes: int = min_download_speed * 1024 * 1024
    fast: list[SnapshotSource] = []
    for src in candidates[:max_speed_checks]:
        rate: float = measure_speed(src.rpc_address, measurement_time)
        src.download_speed = rate
        rate_mib: float = rate / (1024 ** 2)
        if rate < floor_bytes:
            log.info(" %s: %.1f MiB/s (too slow, need >=%d MiB/s)",
                     src.rpc_address, rate_mib, min_download_speed)
        else:
            log.info(" %s: %.1f MiB/s (latency: %.0fms, age: %d slots)",
                     src.rpc_address, rate_mib,
                     src.latency_ms, src.slots_diff)
            fast.append(src)
    return fast
def _rolling_incremental_download(
    fast_sources: list[SnapshotSource],
    full_snap_slot: int,
    output_dir: str,
    convergence_slots: int,
    connections: int,
    rpc_url: str,
) -> str | None:
    """Download incrementals in a loop until converged.

    Probes fast_sources for incrementals matching full_snap_slot, downloads
    the freshest one, then re-probes until the gap to head is within
    convergence_slots. Returns the filename of the final incremental,
    or None if no incremental was found.
    """
    # Filename of the most recently downloaded incremental; stays None
    # until at least one download succeeds.
    prev_inc_filename: str | None = None
    loop_start: float = time.monotonic()
    max_convergence_time: float = 1800.0  # 30 min wall-clock limit
    while True:
        # Hard wall-clock cap: keep whatever we already have instead of
        # chasing convergence forever.
        if time.monotonic() - loop_start > max_convergence_time:
            if prev_inc_filename:
                log.warning("Convergence timeout (%.0fs) — using %s",
                            max_convergence_time, prev_inc_filename)
            else:
                log.warning("Convergence timeout (%.0fs) — no incremental downloaded",
                            max_convergence_time)
            break
        inc_fn, inc_mirrors = probe_incremental(fast_sources, full_snap_slot)
        if inc_fn is None:
            # Sources stopped advertising a matching incremental; succeed
            # with the previous one if we have it, otherwise give up.
            if prev_inc_filename is None:
                log.error("No matching incremental found for base slot %d",
                          full_snap_slot)
            else:
                log.info("No newer incremental available, using %s", prev_inc_filename)
            break
        # probe_incremental only returns names that matched this pattern,
        # so the match cannot fail here.
        m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
        assert m_inc is not None
        inc_slot: int = int(m_inc.group(2))
        head_slot: int | None = get_current_slot(rpc_url)
        if head_slot is None:
            # Can't measure the gap — force one download, then bail out
            # below (see the `head_slot is None: break` after download).
            log.warning("Cannot get current slot — downloading best available incremental")
            gap: int = convergence_slots + 1
        else:
            gap = head_slot - inc_slot
        if inc_fn == prev_inc_filename:
            # Probe returned the file we already have.
            if gap <= convergence_slots:
                log.info("Incremental %s already downloaded (gap %d slots, converged)",
                         inc_fn, gap)
                break
            # NOTE(review): if head_slot keeps coming back None while the
            # probe keeps returning the same file, this path re-polls every
            # 10s until the 30-min timeout fires — confirm that is intended.
            log.info("No newer incremental yet (slot %d, gap %d slots), waiting...",
                     inc_slot, gap)
            time.sleep(10)
            continue
        # A newer incremental supersedes the previous one on disk.
        if prev_inc_filename is not None:
            old_path: Path = Path(output_dir) / prev_inc_filename
            if old_path.exists():
                log.info("Removing superseded incremental %s", prev_inc_filename)
                old_path.unlink()
        log.info("Downloading incremental %s (%d mirrors, slot %d, gap %d slots)",
                 inc_fn, len(inc_mirrors), inc_slot, gap)
        if not download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
            # Transient failure: keep prev_inc_filename and re-probe.
            log.warning("Failed to download incremental %s — re-probing in 10s", inc_fn)
            time.sleep(10)
            continue
        prev_inc_filename = inc_fn
        if gap <= convergence_slots:
            log.info("Converged: incremental slot %d is %d slots behind head",
                     inc_slot, gap)
            break
        if head_slot is None:
            # Gap was unmeasurable; we downloaded the best available, stop.
            break
        log.info("Not converged (gap %d > %d), re-probing in 10s...",
                 gap, convergence_slots)
        time.sleep(10)
    return prev_inc_filename
# -- Public API ---------------------------------------------------------------- # -- Public API ----------------------------------------------------------------
def download_incremental_for_slot(
    output_dir: str,
    full_snap_slot: int,
    *,
    cluster: str = "mainnet-beta",
    rpc_url: str | None = None,
    connections: int = 16,
    threads: int = 500,
    max_snapshot_age: int = 10000,
    max_latency: float = 500,
    min_download_speed: int = 20,
    measurement_time: int = 7,
    max_speed_checks: int = 15,
    version_filter: str | None = None,
    convergence_slots: int = 500,
) -> bool:
    """Download an incremental snapshot for an existing full snapshot.

    Discovers sources, benchmarks their speed, then runs the rolling
    incremental download loop against *full_snap_slot*. A full snapshot
    is never downloaded by this function.

    Returns True if an incremental was downloaded, False otherwise.
    """
    resolved_rpc: str = rpc_url if rpc_url else CLUSTER_RPC[cluster]

    # aria2c does the actual transfer; fail fast if it is missing.
    if not shutil.which("aria2c"):
        log.error("aria2c not found. Install with: apt install aria2")
        return False

    log.info("Incremental download for base slot %d", full_snap_slot)

    current_slot: int | None = get_current_slot(resolved_rpc)
    if current_slot is None:
        log.error("Cannot get current slot from %s", resolved_rpc)
        return False

    sources: list[SnapshotSource] = _discover_and_benchmark(
        resolved_rpc, current_slot,
        max_snapshot_age=max_snapshot_age,
        max_latency=max_latency,
        threads=threads,
        min_download_speed=min_download_speed,
        measurement_time=measurement_time,
        max_speed_checks=max_speed_checks,
        version_filter=version_filter,
    )
    if not sources:
        log.error("No fast sources found")
        return False

    os.makedirs(output_dir, exist_ok=True)
    downloaded: str | None = _rolling_incremental_download(
        sources, full_snap_slot, output_dir,
        convergence_slots, connections, resolved_rpc,
    )
    return downloaded is not None
def download_best_snapshot( def download_best_snapshot(
output_dir: str, output_dir: str,
*, *,
@ -500,183 +705,68 @@ def download_best_snapshot(
return False return False
log.info("Current slot: %d", current_slot) log.info("Current slot: %d", current_slot)
sources: list[SnapshotSource] = discover_sources( fast_sources: list[SnapshotSource] = _discover_and_benchmark(
resolved_rpc, current_slot, resolved_rpc, current_slot,
max_age_slots=max_snapshot_age, max_snapshot_age=max_snapshot_age,
max_latency_ms=max_latency, max_latency=max_latency,
threads=threads, threads=threads,
min_download_speed=min_download_speed,
measurement_time=measurement_time,
max_speed_checks=max_speed_checks,
version_filter=version_filter, version_filter=version_filter,
) )
if not sources:
log.error("No snapshot sources found")
return False
# Sort by latency (lowest first) for speed benchmarking
sources.sort(key=lambda s: s.latency_ms)
# Benchmark top candidates
log.info("Benchmarking download speed on top %d sources...", max_speed_checks)
fast_sources: list[SnapshotSource] = []
checked: int = 0
min_speed_bytes: int = min_download_speed * 1024 * 1024
for source in sources:
if checked >= max_speed_checks:
break
checked += 1
speed: float = measure_speed(source.rpc_address, measurement_time)
source.download_speed = speed
speed_mib: float = speed / (1024 ** 2)
if speed < min_speed_bytes:
log.info(" %s: %.1f MiB/s (too slow, need >=%d MiB/s)",
source.rpc_address, speed_mib, min_download_speed)
continue
log.info(" %s: %.1f MiB/s (latency: %.0fms, age: %d slots)",
source.rpc_address, speed_mib,
source.latency_ms, source.slots_diff)
fast_sources.append(source)
if not fast_sources: if not fast_sources:
log.error("No source met minimum speed requirement (%d MiB/s)", log.error("No fast sources found")
min_download_speed)
return False return False
# Use the fastest source as primary, collect mirrors for each file # Use the fastest source as primary, build full snapshot download plan
best: SnapshotSource = fast_sources[0] best: SnapshotSource = fast_sources[0]
file_paths: list[str] = best.file_paths full_paths: list[str] = [fp for fp in best.file_paths
if full_only: if fp.rsplit("/", 1)[-1].startswith("snapshot-")]
file_paths = [fp for fp in file_paths if not full_paths:
if fp.rsplit("/", 1)[-1].startswith("snapshot-")] log.error("Best source has no full snapshot")
return False
# Build mirror URL lists # Build mirror URLs for the full snapshot
download_plan: list[tuple[str, list[str]]] = [] full_filename: str = full_paths[0].rsplit("/", 1)[-1]
for fp in file_paths: full_mirrors: list[str] = [f"http://{best.rpc_address}{full_paths[0]}"]
filename: str = fp.rsplit("/", 1)[-1] for other in fast_sources[1:]:
mirror_urls: list[str] = [f"http://{best.rpc_address}{fp}"] for other_fp in other.file_paths:
for other in fast_sources[1:]: if other_fp.rsplit("/", 1)[-1] == full_filename:
for other_fp in other.file_paths: full_mirrors.append(f"http://{other.rpc_address}{other_fp}")
if other_fp.rsplit("/", 1)[-1] == filename: break
mirror_urls.append(f"http://{other.rpc_address}{other_fp}")
break
download_plan.append((filename, mirror_urls))
speed_mib: float = best.download_speed / (1024 ** 2) speed_mib: float = best.download_speed / (1024 ** 2)
log.info("Best source: %s (%.1f MiB/s), %d mirrors total", log.info("Best source: %s (%.1f MiB/s), %d mirrors",
best.rpc_address, speed_mib, len(fast_sources)) best.rpc_address, speed_mib, len(full_mirrors))
for filename, mirror_urls in download_plan:
log.info(" %s (%d mirrors)", filename, len(mirror_urls))
# Download full snapshot first, then re-probe for fresh incremental # Download full snapshot
os.makedirs(output_dir, exist_ok=True) os.makedirs(output_dir, exist_ok=True)
total_start: float = time.monotonic() total_start: float = time.monotonic()
# Separate full and incremental from the initial plan filepath: Path = Path(output_dir) / full_filename
full_downloads: list[tuple[str, list[str]]] = [] if filepath.exists() and filepath.stat().st_size > 0:
for filename, mirror_urls in download_plan: log.info("Skipping %s (already exists: %.1f GB)",
if filename.startswith("snapshot-"): full_filename, filepath.stat().st_size / (1024 ** 3))
full_downloads.append((filename, mirror_urls)) else:
if not download_aria2c(full_mirrors, output_dir, full_filename, connections):
# Download full snapshot(s) log.error("Failed to download %s", full_filename)
for filename, mirror_urls in full_downloads:
filepath: Path = Path(output_dir) / filename
if filepath.exists() and filepath.stat().st_size > 0:
log.info("Skipping %s (already exists: %.1f GB)",
filename, filepath.stat().st_size / (1024 ** 3))
continue
if not download_aria2c(mirror_urls, output_dir, filename, connections):
log.error("Failed to download %s", filename)
return False return False
# After full snapshot download, rolling incremental download loop. # Download incremental separately — the full download took minutes,
# The initial incremental is stale by now (full download takes 10+ min). # so any incremental from discovery is stale. Re-probe for fresh ones.
# Re-probe repeatedly until we find one close enough to head.
if not full_only: if not full_only:
full_filename: str = full_downloads[0][0] fm: re.Match[str] | None = FULL_SNAP_RE.match(full_filename)
fm_post: re.Match[str] | None = FULL_SNAP_RE.match(full_filename) if fm:
if fm_post: full_snap_slot: int = int(fm.group(1))
full_snap_slot: int = int(fm_post.group(1)) log.info("Downloading incremental for base slot %d...", full_snap_slot)
log.info("Rolling incremental download (base slot %d, convergence %d slots)...", _rolling_incremental_download(
full_snap_slot, convergence_slots) fast_sources, full_snap_slot, output_dir,
prev_inc_filename: str | None = None convergence_slots, connections, resolved_rpc,
loop_start: float = time.monotonic() )
max_convergence_time: float = 1800.0 # 30 min wall-clock limit
while True:
if time.monotonic() - loop_start > max_convergence_time:
if prev_inc_filename:
log.warning("Convergence timeout (%.0fs) — using %s",
max_convergence_time, prev_inc_filename)
else:
log.warning("Convergence timeout (%.0fs) — no incremental downloaded",
max_convergence_time)
break
inc_fn, inc_mirrors = probe_incremental(fast_sources, full_snap_slot)
if inc_fn is None:
if prev_inc_filename is None:
log.error("No matching incremental found for base slot %d "
"— validator will replay from full snapshot", full_snap_slot)
else:
log.info("No newer incremental available, using %s", prev_inc_filename)
break
# Parse the incremental slot from the filename
m_inc: re.Match[str] | None = INCR_SNAP_RE.match(inc_fn)
assert m_inc is not None # probe_incremental already validated
inc_slot: int = int(m_inc.group(2))
# Check convergence against current mainnet slot
head_slot: int | None = get_current_slot(resolved_rpc)
if head_slot is None:
log.warning("Cannot get current slot — downloading best available incremental")
gap: int = convergence_slots + 1 # force download, then break
else:
gap = head_slot - inc_slot
# Skip download if we already have this exact incremental
if inc_fn == prev_inc_filename:
if gap <= convergence_slots:
log.info("Incremental %s already downloaded (gap %d slots, converged)", inc_fn, gap)
break
log.info("No newer incremental yet (slot %d, gap %d slots), waiting...",
inc_slot, gap)
time.sleep(10)
continue
# Delete previous incremental before downloading the new one
if prev_inc_filename is not None:
old_path: Path = Path(output_dir) / prev_inc_filename
if old_path.exists():
log.info("Removing superseded incremental %s", prev_inc_filename)
old_path.unlink()
log.info("Downloading incremental %s (%d mirrors, slot %d, gap %d slots)",
inc_fn, len(inc_mirrors), inc_slot, gap)
if not download_aria2c(inc_mirrors, output_dir, inc_fn, connections):
log.warning("Failed to download incremental %s — re-probing in 10s", inc_fn)
time.sleep(10)
continue
prev_inc_filename = inc_fn
if gap <= convergence_slots:
log.info("Converged: incremental slot %d is %d slots behind head", inc_slot, gap)
break
if head_slot is None:
break
log.info("Not converged (gap %d > %d), re-probing in 10s...", gap, convergence_slots)
time.sleep(10)
total_elapsed: float = time.monotonic() - total_start total_elapsed: float = time.monotonic() - total_start
log.info("All downloads complete in %.0fs", total_elapsed) log.info("All downloads complete in %.0fs", total_elapsed)
for filename, _ in download_plan:
fp_path: Path = Path(output_dir) / filename
if fp_path.exists():
log.info(" %s (%.1f GB)", fp_path.name, fp_path.stat().st_size / (1024 ** 3))
return True return True