diff --git a/packages/beacon-node/src/sync/range/chain.ts b/packages/beacon-node/src/sync/range/chain.ts index 3d390c718885..5266189aac3b 100644 --- a/packages/beacon-node/src/sync/range/chain.ts +++ b/packages/beacon-node/src/sync/range/chain.ts @@ -506,14 +506,23 @@ export class SyncChain { // if last processed epoch is n, we don't want to request batches with epoch > n + MAX_LOOK_AHEAD_EPOCHS // we should have enough batches to process in the buffer: n + 1, ..., n + MAX_LOOK_AHEAD_EPOCHS // let's focus on redownloading these batches first because it may have to reach different peers to get enough sampled columns + // + // Only pending (not-yet-validated) batches count toward the look-ahead window. AwaitingValidation + // batches (e.g. already-processed empty/skip-slot batches) cannot advance lastEpochWithProcessBlocks + // on their own, so counting them would let a run of empty epochs longer than MAX_LOOK_AHEAD_EPOCHS + // fill the window and stall sync. Mirrors the BATCH_BUFFER_SIZE carve-out above. + const pendingBatches = batches.filter((batch) => batch.state.status !== BatchStatus.AwaitingValidation); if ( - batches.length > 0 && - Math.max(...batches.map((b) => b.startEpoch)) >= this.lastEpochWithProcessBlocks + MAX_LOOK_AHEAD_EPOCHS + pendingBatches.length > 0 && + Math.max(...pendingBatches.map((b) => b.startEpoch)) >= this.lastEpochWithProcessBlocks + MAX_LOOK_AHEAD_EPOCHS ) { return null; } - // This line decides the starting epoch of the next batch. MUST ensure no duplicate batch for the same startEpoch + // This line decides the starting epoch of the next batch. MUST ensure no duplicate batch for the same startEpoch. + // Note: intentionally considers ALL batches (including AwaitingValidation), unlike the pending-only + // look-ahead check above — the next epoch must follow the last batch we hold to avoid gaps/duplicates, + // regardless of validation state. const startEpoch = toBeDownloadedStartEpoch(batches, this.lastEpochWithProcessBlocks); // Don't request batches beyond the target head slot. The to-be-downloaded batch must be strictly after target.slot diff --git a/packages/beacon-node/src/sync/utils/downloadByRange.ts b/packages/beacon-node/src/sync/utils/downloadByRange.ts index 8fe76e5bd54e..24d9b5a81681 100644 --- a/packages/beacon-node/src/sync/utils/downloadByRange.ts +++ b/packages/beacon-node/src/sync/utils/downloadByRange.ts @@ -514,44 +514,21 @@ export async function validateResponses({ return {result: {responses: validatedResponses, payloadEnvelopes: null}, warnings}; } - if (!dataRequest) { - // Only envelope and/or parent-by-root validation needed - if (parentPayloadCommitments !== undefined) { - const parentValidated = await validateParentPayloadColumns( - parentPayloadCommitments, - columnSidecars ?? [], - peerDasMetrics - ); - if (parentValidated) { - validatedResponses.validatedColumnSidecars = [parentValidated]; - } - } - const validatedPayloadEnvelopes = validateEnvelopesByRangeResponse( - validatedResponses.validatedBlocks ?? [], - batchBlocks, - payloadEnvelopes ?? [], - parentPayloadCommitments - ); - return {result: {responses: validatedResponses, payloadEnvelopes: validatedPayloadEnvelopes}, warnings}; - } - - const blocksForDataValidation = getBlocksForDataValidation( - dataRequest, - batchBlocks, - validatedResponses.validatedBlocks?.length ? validatedResponses.validatedBlocks : undefined - ); - - if (!blocksForDataValidation.length) { - throw new DownloadByRangeError( - { - code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE, - ...requestsLogMeta({blobsRequest, columnsRequest}), - }, - "No blocks in data request slot range to validate data response against" - ); - } - - if (blobsRequest) { + // Resolve the in-range blocks the data sidecars are validated against. Empty when there is no data + // request (envelopes-only / parent-payload-only) or when the data request's slot range contains no + // blocks (an empty epoch — valid during poor chain liveness; the empty batch is confirmed at the + // chain level once a later batch imports a block, so a peer cannot stall sync by falsely claiming an + // epoch is empty). getBlocksForDataValidation treats `undefined` and `[]` the same, so pass directly. + const blocksForDataValidation = dataRequest + ? getBlocksForDataValidation(dataRequest, batchBlocks, validatedResponses.validatedBlocks) + : []; + + // When the range has no blocks we neither validate nor reject the sidecars a peer returned. We can't + // confirm the range is genuinely empty at this layer (the empty blocks response is itself unverified + // until a later batch extends the chain), so there is no peer we could safely penalize — orphan + // sidecars are dropped. The per-request validators below only run when there are blocks to validate + // against. + if (blobsRequest && blocksForDataValidation.length > 0) { if (!blobSidecars) { throw new DownloadByRangeError( { @@ -568,7 +545,7 @@ export async function validateResponses({ ); } - if (columnsRequest) { + if (columnsRequest && blocksForDataValidation.length > 0) { if (!columnSidecars) { throw new DownloadByRangeError( { @@ -664,26 +641,23 @@ export function validateBlockByRangeResponse( ): WarnResult { const {startSlot, count} = blocksRequest; - // An error was thrown here by @twoeths in #8150 but it breaks for epochs with 0 blocks during chain - // liveness issues. See comment https://github.com/ChainSafe/lodestar/issues/8147#issuecomment-3246434697 - // There are instances where clients return no blocks though. Need to monitor this via the warns to see - // if what the correct behavior should be + // A peer returning zero blocks for an epoch is the correct response when every slot in the epoch + // was skipped (an "empty epoch"), which happens during periods of poor chain liveness. Return the + // empty result with a warning — rather than throwing — so the SyncChain can process through the + // empty epoch. Throwing here previously killed the chain after MAX_BATCH_DOWNLOAD_ATTEMPTS + // (https://github.com/ChainSafe/lodestar/issues/8147). The empty batch is held in + // AwaitingValidation and only confirmed once a later batch imports a block, so a peer cannot stall + // sync by falsely claiming an epoch is empty. if (!blocks.length) { - throw new DownloadByRangeError({ - code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE, - ...requestsLogMeta({blocksRequest}), - }); - // TODO: this was causing deadlock again. need to come back and fix this so that its possible to process through - // an empty epoch for periods with poor liveness - // return { - // result: [], - // warnings: [ - // new DownloadByRangeError({ - // code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE, - // ...requestsLogMeta({blocksRequest}), - // }), - // ], - // }; + return { + result: [], + warnings: [ + new DownloadByRangeError({ + code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE, + ...requestsLogMeta({blocksRequest}), + }), + ], + }; } if (blocks.length > count) { diff --git a/packages/beacon-node/test/e2e/sync/emptyEpochSync.test.ts b/packages/beacon-node/test/e2e/sync/emptyEpochSync.test.ts new file mode 100644 index 000000000000..2a2254f8bd5a --- /dev/null +++ b/packages/beacon-node/test/e2e/sync/emptyEpochSync.test.ts @@ -0,0 +1,223 @@ +import {afterEach, describe, expect, it, vi} from "vitest"; +import {routes} from "@lodestar/api"; +import {ChainConfig} from "@lodestar/config"; +import {IForkChoice, ProtoBlock} from "@lodestar/fork-choice"; +import {TimestampFormatCode} from "@lodestar/logger"; +import {LogLevel, TestLoggerOpts, testLogger} from "@lodestar/logger/test-utils"; +import {SLOTS_PER_EPOCH} from "@lodestar/params"; +import {waitForEvent} from "../../utils/events/resolver.js"; +import {connect, onPeerConnect} from "../../utils/network.js"; +import {getDevBeaconNode} from "../../utils/node/beacon.js"; +import {getAndInitDevValidators} from "../../utils/node/validator.js"; + +/** + * Return the length of the longest run of consecutive empty epochs (epochs containing zero blocks) + * on a node's canonical chain. + */ +function longestEmptyEpochRun(forkChoice: IForkChoice, head: ProtoBlock): number { + let maxRun = 0; + let prevEpoch = Math.floor(head.slot / SLOTS_PER_EPOCH); + for (const block of forkChoice.getAllAncestorBlocks(head.blockRoot, head.payloadStatus)) { + const epoch = Math.floor(block.slot / SLOTS_PER_EPOCH); + maxRun = Math.max(maxRun, prevEpoch - epoch - 1); + prevEpoch = epoch; + } + return maxRun; +} + +// Regression test for https://github.com/ChainSafe/lodestar/pull/9417 (issue #8147). +// +// During periods of poor chain liveness an entire epoch can be empty: every slot is skipped, so +// a BeaconBlocksByRange request for that epoch correctly returns zero blocks. Range sync used to +// throw MISSING_BLOCKS_RESPONSE on a zero-block response, which — after MAX_BATCH_DOWNLOAD_ATTEMPTS +// — killed the SyncChain and deadlocked a node trying to sync past the empty epoch. A run of empty +// epochs longer than MAX_LOOK_AHEAD_EPOCHS (= 2) additionally stalled the look-ahead window. +// +// This test drives the real end-to-end path: Node A builds a chain with a run of >= 3 empty epochs +// (created by taking all validators offline for several epochs, i.e. simulating offline proposers), +// then Node B range-syncs from genesis and must catch up to Node A's head — crossing the gap. With +// the fix Node B reaches the head; without it Node B deadlocks at the first empty epoch. +// +// ~60 slots @ 2s/slot (minimal preset, SLOTS_PER_EPOCH = 8) ⇒ Node A is ready in ~125s, leaving +// ample room under the timeout for Node B's sync (60s budget) before the outer timeout fires. +describe("sync / empty epoch range sync", () => { + vi.setConfig({testTimeout: 300_000}); + + const validatorCount = 8; + const SLOT_DURATION_MS = 2000; + const ELECTRA_FORK_EPOCH = 0; + const FULU_FORK_EPOCH = 1; + const GLOAS_FORK_EPOCH = 2; + const testParams: Partial = { + SLOT_DURATION_MS, + ALTAIR_FORK_EPOCH: ELECTRA_FORK_EPOCH, + BELLATRIX_FORK_EPOCH: ELECTRA_FORK_EPOCH, + CAPELLA_FORK_EPOCH: ELECTRA_FORK_EPOCH, + DENEB_FORK_EPOCH: ELECTRA_FORK_EPOCH, + ELECTRA_FORK_EPOCH, + FULU_FORK_EPOCH, + GLOAS_FORK_EPOCH, + BLOB_SCHEDULE: [{EPOCH: 1, MAX_BLOBS_PER_BLOCK: 3}], + }; + + // Empty-epoch window. Produce blocks through epoch 2 (so the electra→fulu→gloas transitions all + // happen with blocks present), take the validators offline for the next four epochs, then resume. + // Even if a single in-flight proposal lands right after we go offline, epochs 4–6 are guaranteed + // fully empty (>= 3, exceeding MAX_LOOK_AHEAD_EPOCHS), exercising the whole fix. + const STOP_SLOT = 3 * SLOTS_PER_EPOCH; // 24 — first slot with no proposer + const RESUME_SLOT = 7 * SLOTS_PER_EPOCH; // 56 — proposers come back online + const TARGET_HEAD_MIN_SLOT = RESUME_SLOT + Math.floor(SLOTS_PER_EPOCH / 2); // 60 — at least a full batch past the gap + + const afterEachCallbacks: (() => Promise | void)[] = []; + afterEach(async () => { + while (afterEachCallbacks.length > 0) { + const callback = afterEachCallbacks.pop(); + if (callback) await callback(); + } + }); + + it("syncs through a run of empty epochs during poor liveness", async () => { + const genesisSlotsDelay = 4; + const genesisTime = Math.floor(Date.now() / 1000) + genesisSlotsDelay * (SLOT_DURATION_MS / 1000); + + const testLoggerOpts: TestLoggerOpts = { + level: LogLevel.info, + timestampFormat: { + format: TimestampFormatCode.EpochSlot, + genesisTime, + slotsPerEpoch: SLOTS_PER_EPOCH, + secondsPerSlot: SLOT_DURATION_MS / 1000, + }, + }; + + const loggerNodeA = testLogger("EmptyEpochSync-Node-A", testLoggerOpts); + const loggerNodeB = testLogger("EmptyEpochSync-Node-B", testLoggerOpts); + + // Node A: the lone block producer. `isSingleNode` + `allowPublishToZeroPeers` let it build a + // chain without any peers connected. + const bn = await getDevBeaconNode({ + params: testParams, + options: { + // Large slotImportTolerance so Node A keeps considering itself "synced" — and keeps + // producing — while its head sits several epochs behind the wall clock during the offline + // window. A lone node has no peers to range-sync from, so with the default tolerance it + // would go Stalled once the head fell > 1 epoch behind and never resume block production. + sync: {isSingleNode: true, slotImportTolerance: 12 * SLOTS_PER_EPOCH}, + network: {allowPublishToZeroPeers: true, useWorker: false}, + chain: {blsVerifyAllMainThread: true}, + }, + validatorCount, + genesisTime, + logger: loggerNodeA, + }); + afterEachCallbacks.push(() => bn.close()); + + // Produce blocks through epoch 2 (covering the electra→fulu→gloas transitions). + let validators: Awaited>["validators"] = ( + await getAndInitDevValidators({ + node: bn, + logPrefix: "EmptyEpochSyncVc-1", + validatorsPerClient: validatorCount, + validatorClientCount: 1, + startIndex: 0, + useRestApi: false, + testLoggerOpts, + }) + ).validators; + afterEachCallbacks.push(() => Promise.all(validators.map((v) => v.close().catch(() => {})))); + + await bn.chain.clock.waitForSlot(STOP_SLOT); + + // Sanity-check that Node A actually built a chain before we create the gap. + expect( + bn.chain.forkChoice.getHead().slot, + "Node A should have produced blocks before going offline" + ).toBeGreaterThanOrEqual(STOP_SLOT - SLOTS_PER_EPOCH); + + // Take every validator offline. With no proposer online, every slot from here until + // the validators come back is skipped, producing a run of empty epochs on Node A's chain. + await Promise.all(validators.map((v) => v.close())); + loggerNodeA.info("Validators offline — empty epochs begin", {stopSlot: STOP_SLOT}); + + await bn.chain.clock.waitForSlot(RESUME_SLOT); + + // Bring validators back online (fresh clients, same keys). The chain resumes building + // on the pre-gap head, leaving the skipped slots permanently empty. + validators = ( + await getAndInitDevValidators({ + node: bn, + logPrefix: "EmptyEpochSyncVc-2", + validatorsPerClient: validatorCount, + validatorClientCount: 1, + startIndex: 0, + useRestApi: false, + testLoggerOpts, + }) + ).validators; + afterEachCallbacks.push(() => Promise.all(validators.map((v) => v.close().catch(() => {})))); + loggerNodeA.info("Validators back online — empty epochs end", {resumeSlot: RESUME_SLOT}); + + // Wait until Node A has built at least a full epoch of blocks past the gap. Range sync confirms + // an empty (AwaitingValidation) batch only once a *later* batch imports a block, so the synced + // chain must extend past the empty run. + await waitForEvent( + bn.chain.emitter, + routes.events.EventType.head, + 120_000, + ({slot}) => slot >= TARGET_HEAD_MIN_SLOT + ); + + // Capture Node A's head past the gap as Node B's sync target. Node A keeps producing, but this + // block stays on its canonical chain, so Node B reaching it proves it synced across the gap. + const headSummary = bn.chain.forkChoice.getHead(); + const headRootHex = headSummary.blockRoot; + const emptyRunNodeA = longestEmptyEpochRun(bn.chain.forkChoice, headSummary); + loggerNodeA.info("Node A built chain past the empty epochs", { + headSlot: headSummary.slot, + headRoot: headRootHex, + longestEmptyEpochRun: emptyRunNodeA, + }); + + // Precondition: the scenario we mean to exercise must actually exist on Node A's chain. + expect( + emptyRunNodeA, + `Node A chain must contain a run of >= 3 empty epochs to exercise the regression (got ${emptyRunNodeA})` + ).toBeGreaterThanOrEqual(3); + + // Node B: starts from genesis and must reach Node A's head purely via range sync, which has to + // traverse the empty epochs. + const bn2 = await getDevBeaconNode({ + params: testParams, + options: { + api: {rest: {enabled: false}}, + network: {useWorker: false}, + chain: {blsVerifyAllMainThread: true}, + }, + validatorCount, + genesisTime, + logger: loggerNodeB, + }); + afterEachCallbacks.push(() => bn2.close()); + + // Attach the listener before connecting so the head event can't be missed. + const waitForSynced = waitForEvent( + bn2.chain.emitter, + routes.events.EventType.head, + 60_000, + ({block}) => block === headRootHex + ); + + await Promise.all([connect(bn2.network, bn.network), onPeerConnect(bn2.network), onPeerConnect(bn.network)]); + loggerNodeB.info("Node B connected to Node A"); + + try { + await waitForSynced; + } catch (_e) { + expect.fail( + `Node B failed to range-sync through the empty epochs to Node A's head (slot ${headSummary.slot}). ` + + "Before the fix this deadlocks: a zero-block response for an empty epoch threw MISSING_BLOCKS_RESPONSE, " + + "killing the SyncChain after MAX_BATCH_DOWNLOAD_ATTEMPTS." + ); + } + }); +}); diff --git a/packages/beacon-node/test/unit/sync/range/chain.test.ts b/packages/beacon-node/test/unit/sync/range/chain.test.ts index 3ceed42a4fc0..4cdbb4541d8f 100644 --- a/packages/beacon-node/test/unit/sync/range/chain.test.ts +++ b/packages/beacon-node/test/unit/sync/range/chain.test.ts @@ -30,14 +30,23 @@ describe("sync / range / chain", () => { targetEpoch: 16, }, { - // due to BATCH_BUFFER_SIZE and MAX_LOOK_AHEAD_EPOCHS, lodestar cannot deal with unlimited skipped slots - // having a test with 2 epochs of skipped slots is enough to test the logic - // this hasn't happened in any networks as of Aug 2025 + // A run of skipped slots that fits within the MAX_LOOK_AHEAD_EPOCHS window. The longer-run case + // (a run of empty epochs exceeding the look-ahead window, which previously deadlocked) is + // covered by "Simulate sync with 3 epochs of skipped slots" below. id: "Simulate sync with 2 epochs of skipped slots", startEpoch: 0, targetEpoch: 16, skippedSlots: new Set(linspace(3 * SLOTS_PER_EPOCH, 4 * SLOTS_PER_EPOCH)), }, + { + // Regression: a run of empty epochs longer than MAX_LOOK_AHEAD_EPOCHS must not deadlock the + // chain. Empty batches sit in AwaitingValidation and never advance lastEpochWithProcessBlocks, + // so the look-ahead window must not count them or no further batches are ever downloaded. + id: "Simulate sync with 3 epochs of skipped slots", + startEpoch: 0, + targetEpoch: 16, + skippedSlots: new Set(linspace(3 * SLOTS_PER_EPOCH, 6 * SLOTS_PER_EPOCH)), + }, // As of https://github.com/ChainSafe/lodestar/pull/8150, we abort the batch after a single processing error // { // id: "Simulate sync with multiple ranges of bad blocks", diff --git a/packages/beacon-node/test/unit/sync/utils/downloadByRange.test.ts b/packages/beacon-node/test/unit/sync/utils/downloadByRange.test.ts index 29b447d717ce..41e1914960c5 100644 --- a/packages/beacon-node/test/unit/sync/utils/downloadByRange.test.ts +++ b/packages/beacon-node/test/unit/sync/utils/downloadByRange.test.ts @@ -1,16 +1,21 @@ import {beforeEach, describe, expect, it} from "vitest"; -import {ForkName} from "@lodestar/params"; +import {ForkName, SLOTS_PER_EPOCH} from "@lodestar/params"; import {BlockInputPreData} from "../../../../src/chain/blocks/blockInput/blockInput.js"; import {BlockInputSource, IBlockInput} from "../../../../src/chain/blocks/blockInput/types.js"; -import {ValidatedBlock, getBlocksForDataValidation} from "../../../../src/sync/utils/downloadByRange.js"; -import {generateChainOfBlockMaybeSidecars} from "../../../utils/blocksAndData.js"; +import { + DownloadByRangeErrorCode, + ValidatedBlock, + getBlocksForDataValidation, + validateBlockByRangeResponse, + validateResponses, +} from "../../../../src/sync/utils/downloadByRange.js"; +import {config, generateChainOfBlockMaybeSidecars, slots} from "../../../utils/blocksAndData.js"; /** * Logic errors and gaps identified during test case creation: * * INSERT_LOGIC_ERROR_BULLET_POINTS_HERE * - * - validateBlockByRangeResponse: Commented out zero blocks check breaks during chain liveness issues (line 445-453) * - validateBlobsByRangeResponse: Missing validation that blob sidecars are in consecutive (slot, index) order as per spec * - validateColumnsByRangeResponse: Missing validation that column sidecars are in consecutive (slot, index) order * - cacheByRangeResponses: Error handling for wrong chain only breaks loop but doesn't throw/propagate error properly @@ -164,6 +169,59 @@ import {generateChainOfBlockMaybeSidecars} from "../../../utils/blocksAndData.js // it("should propagate validation errors from validateBlockDataColumnSidecars"); // }); +describe("validateBlockByRangeResponse", () => { + // Regression: a peer returning zero blocks for an all-skipped epoch is a valid response during + // poor chain liveness. It must be accepted (with a warning), not thrown — otherwise the SyncChain + // dies after MAX_BATCH_DOWNLOAD_ATTEMPTS and the node cannot sync past an empty epoch. + it("should accept empty response during chain liveness issues", () => { + const blocksRequest = {startSlot: slots.deneb, count: SLOTS_PER_EPOCH, step: 1}; + + const {result, warnings} = validateBlockByRangeResponse(config, blocksRequest, []); + + expect(result).toEqual([]); + expect(warnings?.map((w) => w.type.code)).toContain(DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE); + }); +}); + +describe("validateResponses", () => { + // Regression: an empty epoch that also carries a data (blobs/columns) request must not throw. + // Both the block validator and the "no blocks in data range" guard previously threw + // MISSING_BLOCKS_RESPONSE for a legitimately empty epoch, killing the SyncChain. + it("should accept an empty epoch when a data request is present", async () => { + const startSlot = slots.deneb; + const blocksRequest = {startSlot, count: SLOTS_PER_EPOCH, step: 1}; + const blobsRequest = {startSlot, count: SLOTS_PER_EPOCH}; + + const {result} = await validateResponses({ + config, + blocksRequest, + blobsRequest, + blocks: [], + blobSidecars: [], + }); + + expect(result.responses.validatedBlocks).toEqual([]); + expect(result.responses.validatedBlobSidecars ?? []).toEqual([]); + }); + + // Characterization test for the no-data-request path (envelopes-only). With no blocks/blobs/columns + // request, data-sidecar validation must be skipped while envelope validation still runs and returns + // a (possibly empty) map — never null. This pins the behavior that the data-validation refactor + // must preserve. + it("validates envelopes and skips data validation when only an envelopes request is present", async () => { + const {result} = await validateResponses({ + config, + envelopesRequest: {startSlot: slots.gloas, count: SLOTS_PER_EPOCH}, + payloadEnvelopes: [], + }); + + expect(result.responses.validatedBlocks).toBeUndefined(); + expect(result.responses.validatedColumnSidecars ?? []).toEqual([]); + expect(result.payloadEnvelopes).toBeInstanceOf(Map); + expect(result.payloadEnvelopes?.size).toBe(0); + }); +}); + describe("getBlocksForDataValidation", () => { const forkName = ForkName.capella; let chainOfBlocks: ReturnType;