Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions packages/beacon-node/src/sync/range/chain.ts
Original file line number Diff line number Diff line change
Expand Up @@ -506,14 +506,23 @@ export class SyncChain {
// if last processed epoch is n, we don't want to request batches with epoch > n + MAX_LOOK_AHEAD_EPOCHS
// we should have enough batches to process in the buffer: n + 1, ..., n + MAX_LOOK_AHEAD_EPOCHS
// let's focus on redownloading these batches first because it may have to reach different peers to get enough sampled columns
//
// Only pending (not-yet-validated) batches count toward the look-ahead window. AwaitingValidation
// batches (e.g. already-processed empty/skip-slot batches) cannot advance lastEpochWithProcessBlocks
// on their own, so counting them would let a run of empty epochs longer than MAX_LOOK_AHEAD_EPOCHS
// fill the window and stall sync. Mirrors the BATCH_BUFFER_SIZE carve-out above.
const pendingBatches = batches.filter((batch) => batch.state.status !== BatchStatus.AwaitingValidation);
if (
batches.length > 0 &&
Math.max(...batches.map((b) => b.startEpoch)) >= this.lastEpochWithProcessBlocks + MAX_LOOK_AHEAD_EPOCHS
pendingBatches.length > 0 &&
Math.max(...pendingBatches.map((b) => b.startEpoch)) >= this.lastEpochWithProcessBlocks + MAX_LOOK_AHEAD_EPOCHS
) {
return null;
}

// This line decides the starting epoch of the next batch. MUST ensure no duplicate batch for the same startEpoch
// This line decides the starting epoch of the next batch. MUST ensure no duplicate batch for the same startEpoch.
// Note: intentionally considers ALL batches (including AwaitingValidation), unlike the pending-only
// look-ahead check above — the next epoch must follow the last batch we hold to avoid gaps/duplicates,
// regardless of validation state.
const startEpoch = toBeDownloadedStartEpoch(batches, this.lastEpochWithProcessBlocks);

// Don't request batches beyond the target head slot. The to-be-downloaded batch must be strictly after target.slot
Expand Down
90 changes: 32 additions & 58 deletions packages/beacon-node/src/sync/utils/downloadByRange.ts
Original file line number Diff line number Diff line change
Expand Up @@ -514,44 +514,21 @@ export async function validateResponses({
return {result: {responses: validatedResponses, payloadEnvelopes: null}, warnings};
}

if (!dataRequest) {
// Only envelope and/or parent-by-root validation needed
if (parentPayloadCommitments !== undefined) {
const parentValidated = await validateParentPayloadColumns(
parentPayloadCommitments,
columnSidecars ?? [],
peerDasMetrics
);
if (parentValidated) {
validatedResponses.validatedColumnSidecars = [parentValidated];
}
}
const validatedPayloadEnvelopes = validateEnvelopesByRangeResponse(
validatedResponses.validatedBlocks ?? [],
batchBlocks,
payloadEnvelopes ?? [],
parentPayloadCommitments
);
return {result: {responses: validatedResponses, payloadEnvelopes: validatedPayloadEnvelopes}, warnings};
}

const blocksForDataValidation = getBlocksForDataValidation(
dataRequest,
batchBlocks,
validatedResponses.validatedBlocks?.length ? validatedResponses.validatedBlocks : undefined
);

if (!blocksForDataValidation.length) {
throw new DownloadByRangeError(
{
code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE,
...requestsLogMeta({blobsRequest, columnsRequest}),
},
"No blocks in data request slot range to validate data response against"
);
}

if (blobsRequest) {
// Resolve the in-range blocks the data sidecars are validated against. Empty when there is no data
// request (envelopes-only / parent-payload-only) or when the data request's slot range contains no
// blocks (an empty epoch — valid during poor chain liveness; the empty batch is confirmed at the
// chain level once a later batch imports a block, so a peer cannot stall sync by falsely claiming an
// epoch is empty). getBlocksForDataValidation treats `undefined` and `[]` the same, so pass directly.
const blocksForDataValidation = dataRequest
? getBlocksForDataValidation(dataRequest, batchBlocks, validatedResponses.validatedBlocks)
: [];

// When the range has no blocks we neither validate nor reject the sidecars a peer returned. We can't
// confirm the range is genuinely empty at this layer (the empty blocks response is itself unverified
// until a later batch extends the chain), so there is no peer we could safely penalize — orphan
// sidecars are dropped. The per-request validators below only run when there are blocks to validate
// against.
if (blobsRequest && blocksForDataValidation.length > 0) {
Comment thread
wemeetagain marked this conversation as resolved.
if (!blobSidecars) {
throw new DownloadByRangeError(
{
Expand All @@ -568,7 +545,7 @@ export async function validateResponses({
);
}

if (columnsRequest) {
if (columnsRequest && blocksForDataValidation.length > 0) {
if (!columnSidecars) {
throw new DownloadByRangeError(
{
Expand Down Expand Up @@ -664,26 +641,23 @@ export function validateBlockByRangeResponse(
): WarnResult<ValidatedBlock[], DownloadByRangeError> {
const {startSlot, count} = blocksRequest;

// An error was thrown here by @twoeths in #8150 but it breaks for epochs with 0 blocks during chain
// liveness issues. See comment https://github.com/ChainSafe/lodestar/issues/8147#issuecomment-3246434697
// There are instances where clients return no blocks though. Need to monitor this via the warns to see
// if what the correct behavior should be
// A peer returning zero blocks for an epoch is the correct response when every slot in the epoch
// was skipped (an "empty epoch"), which happens during periods of poor chain liveness. Return the
// empty result with a warning — rather than throwing — so the SyncChain can process through the
// empty epoch. Throwing here previously killed the chain after MAX_BATCH_DOWNLOAD_ATTEMPTS
// (https://github.com/ChainSafe/lodestar/issues/8147). The empty batch is held in
// AwaitingValidation and only confirmed once a later batch imports a block, so a peer cannot stall
// sync by falsely claiming an epoch is empty.
if (!blocks.length) {
throw new DownloadByRangeError({
code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE,
...requestsLogMeta({blocksRequest}),
});
// TODO: this was causing deadlock again. need to come back and fix this so that its possible to process through
// an empty epoch for periods with poor liveness
// return {
// result: [],
// warnings: [
// new DownloadByRangeError({
// code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE,
// ...requestsLogMeta({blocksRequest}),
// }),
// ],
// };
return {
result: [],
warnings: [
new DownloadByRangeError({
code: DownloadByRangeErrorCode.MISSING_BLOCKS_RESPONSE,
...requestsLogMeta({blocksRequest}),
}),
],
};
}

if (blocks.length > count) {
Expand Down
223 changes: 223 additions & 0 deletions packages/beacon-node/test/e2e/sync/emptyEpochSync.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,223 @@
import {afterEach, describe, expect, it, vi} from "vitest";
import {routes} from "@lodestar/api";
import {ChainConfig} from "@lodestar/config";
import {IForkChoice, ProtoBlock} from "@lodestar/fork-choice";
import {TimestampFormatCode} from "@lodestar/logger";
import {LogLevel, TestLoggerOpts, testLogger} from "@lodestar/logger/test-utils";
import {SLOTS_PER_EPOCH} from "@lodestar/params";
import {waitForEvent} from "../../utils/events/resolver.js";
import {connect, onPeerConnect} from "../../utils/network.js";
import {getDevBeaconNode} from "../../utils/node/beacon.js";
import {getAndInitDevValidators} from "../../utils/node/validator.js";

/**
* Return the length of the longest run of consecutive empty epochs (epochs containing zero blocks)
* on a node's canonical chain.
*/
function longestEmptyEpochRun(forkChoice: IForkChoice, head: ProtoBlock): number {
let maxRun = 0;
let prevEpoch = Math.floor(head.slot / SLOTS_PER_EPOCH);
for (const block of forkChoice.getAllAncestorBlocks(head.blockRoot, head.payloadStatus)) {
const epoch = Math.floor(block.slot / SLOTS_PER_EPOCH);
maxRun = Math.max(maxRun, prevEpoch - epoch - 1);
prevEpoch = epoch;
}
return maxRun;
}

// Regression test for https://github.com/ChainSafe/lodestar/pull/9417 (issue #8147).
//
// During periods of poor chain liveness an entire epoch can be empty: every slot is skipped, so
// a BeaconBlocksByRange request for that epoch correctly returns zero blocks. Range sync used to
// throw MISSING_BLOCKS_RESPONSE on a zero-block response, which — after MAX_BATCH_DOWNLOAD_ATTEMPTS
// — killed the SyncChain and deadlocked a node trying to sync past the empty epoch. A run of empty
// epochs longer than MAX_LOOK_AHEAD_EPOCHS (= 2) additionally stalled the look-ahead window.
//
// This test drives the real end-to-end path: Node A builds a chain with a run of >= 3 empty epochs
// (created by taking all validators offline for several epochs, i.e. simulating offline proposers),
// then Node B range-syncs from genesis and must catch up to Node A's head — crossing the gap. With
// the fix Node B reaches the head; without it Node B deadlocks at the first empty epoch.
//
// ~60 slots @ 2s/slot (minimal preset, SLOTS_PER_EPOCH = 8) ⇒ Node A is ready in ~125s, leaving
// ample room under the timeout for Node B's sync (60s budget) before the outer timeout fires.
describe("sync / empty epoch range sync", () => {
vi.setConfig({testTimeout: 300_000});

const validatorCount = 8;
const SLOT_DURATION_MS = 2000;
const ELECTRA_FORK_EPOCH = 0;
const FULU_FORK_EPOCH = 1;
const GLOAS_FORK_EPOCH = 2;
const testParams: Partial<ChainConfig> = {
SLOT_DURATION_MS,
ALTAIR_FORK_EPOCH: ELECTRA_FORK_EPOCH,
BELLATRIX_FORK_EPOCH: ELECTRA_FORK_EPOCH,
CAPELLA_FORK_EPOCH: ELECTRA_FORK_EPOCH,
DENEB_FORK_EPOCH: ELECTRA_FORK_EPOCH,
ELECTRA_FORK_EPOCH,
FULU_FORK_EPOCH,
GLOAS_FORK_EPOCH,
BLOB_SCHEDULE: [{EPOCH: 1, MAX_BLOBS_PER_BLOCK: 3}],
};

// Empty-epoch window. Produce blocks through epoch 2 (so the electra→fulu→gloas transitions all
// happen with blocks present), take the validators offline for the next four epochs, then resume.
// Even if a single in-flight proposal lands right after we go offline, epochs 4–6 are guaranteed
// fully empty (>= 3, exceeding MAX_LOOK_AHEAD_EPOCHS), exercising the whole fix.
const STOP_SLOT = 3 * SLOTS_PER_EPOCH; // 24 — first slot with no proposer
const RESUME_SLOT = 7 * SLOTS_PER_EPOCH; // 56 — proposers come back online
const TARGET_HEAD_MIN_SLOT = RESUME_SLOT + Math.floor(SLOTS_PER_EPOCH / 2); // 60 — at least a full batch past the gap

const afterEachCallbacks: (() => Promise<unknown> | void)[] = [];
afterEach(async () => {
while (afterEachCallbacks.length > 0) {
const callback = afterEachCallbacks.pop();
if (callback) await callback();
}
});

it("syncs through a run of empty epochs during poor liveness", async () => {
const genesisSlotsDelay = 4;
const genesisTime = Math.floor(Date.now() / 1000) + genesisSlotsDelay * (SLOT_DURATION_MS / 1000);

const testLoggerOpts: TestLoggerOpts = {
level: LogLevel.info,
timestampFormat: {
format: TimestampFormatCode.EpochSlot,
genesisTime,
slotsPerEpoch: SLOTS_PER_EPOCH,
secondsPerSlot: SLOT_DURATION_MS / 1000,
},
};

const loggerNodeA = testLogger("EmptyEpochSync-Node-A", testLoggerOpts);
const loggerNodeB = testLogger("EmptyEpochSync-Node-B", testLoggerOpts);

// Node A: the lone block producer. `isSingleNode` + `allowPublishToZeroPeers` let it build a
// chain without any peers connected.
const bn = await getDevBeaconNode({
params: testParams,
options: {
// Large slotImportTolerance so Node A keeps considering itself "synced" — and keeps
// producing — while its head sits several epochs behind the wall clock during the offline
// window. A lone node has no peers to range-sync from, so with the default tolerance it
// would go Stalled once the head fell > 1 epoch behind and never resume block production.
sync: {isSingleNode: true, slotImportTolerance: 12 * SLOTS_PER_EPOCH},
network: {allowPublishToZeroPeers: true, useWorker: false},
chain: {blsVerifyAllMainThread: true},
},
validatorCount,
genesisTime,
logger: loggerNodeA,
});
afterEachCallbacks.push(() => bn.close());

// Produce blocks through epoch 2 (covering the electra→fulu→gloas transitions).
let validators: Awaited<ReturnType<typeof getAndInitDevValidators>>["validators"] = (
await getAndInitDevValidators({
node: bn,
logPrefix: "EmptyEpochSyncVc-1",
validatorsPerClient: validatorCount,
validatorClientCount: 1,
startIndex: 0,
useRestApi: false,
testLoggerOpts,
})
).validators;
afterEachCallbacks.push(() => Promise.all(validators.map((v) => v.close().catch(() => {}))));

await bn.chain.clock.waitForSlot(STOP_SLOT);

// Sanity-check that Node A actually built a chain before we create the gap.
expect(
bn.chain.forkChoice.getHead().slot,
"Node A should have produced blocks before going offline"
).toBeGreaterThanOrEqual(STOP_SLOT - SLOTS_PER_EPOCH);

// Take every validator offline. With no proposer online, every slot from here until
// the validators come back is skipped, producing a run of empty epochs on Node A's chain.
await Promise.all(validators.map((v) => v.close()));
loggerNodeA.info("Validators offline — empty epochs begin", {stopSlot: STOP_SLOT});

await bn.chain.clock.waitForSlot(RESUME_SLOT);

// Bring validators back online (fresh clients, same keys). The chain resumes building
// on the pre-gap head, leaving the skipped slots permanently empty.
validators = (
await getAndInitDevValidators({
node: bn,
logPrefix: "EmptyEpochSyncVc-2",
validatorsPerClient: validatorCount,
validatorClientCount: 1,
startIndex: 0,
useRestApi: false,
testLoggerOpts,
})
).validators;
afterEachCallbacks.push(() => Promise.all(validators.map((v) => v.close().catch(() => {}))));
loggerNodeA.info("Validators back online — empty epochs end", {resumeSlot: RESUME_SLOT});

// Wait until Node A has built at least a full epoch of blocks past the gap. Range sync confirms
// an empty (AwaitingValidation) batch only once a *later* batch imports a block, so the synced
// chain must extend past the empty run.
await waitForEvent<routes.events.EventData[routes.events.EventType.head]>(
bn.chain.emitter,
routes.events.EventType.head,
120_000,
({slot}) => slot >= TARGET_HEAD_MIN_SLOT
);

// Capture Node A's head past the gap as Node B's sync target. Node A keeps producing, but this
// block stays on its canonical chain, so Node B reaching it proves it synced across the gap.
const headSummary = bn.chain.forkChoice.getHead();
const headRootHex = headSummary.blockRoot;
const emptyRunNodeA = longestEmptyEpochRun(bn.chain.forkChoice, headSummary);
loggerNodeA.info("Node A built chain past the empty epochs", {
headSlot: headSummary.slot,
headRoot: headRootHex,
longestEmptyEpochRun: emptyRunNodeA,
});

// Precondition: the scenario we mean to exercise must actually exist on Node A's chain.
expect(
emptyRunNodeA,
`Node A chain must contain a run of >= 3 empty epochs to exercise the regression (got ${emptyRunNodeA})`
).toBeGreaterThanOrEqual(3);

// Node B: starts from genesis and must reach Node A's head purely via range sync, which has to
// traverse the empty epochs.
const bn2 = await getDevBeaconNode({
params: testParams,
options: {
api: {rest: {enabled: false}},
network: {useWorker: false},
chain: {blsVerifyAllMainThread: true},
},
validatorCount,
genesisTime,
logger: loggerNodeB,
});
afterEachCallbacks.push(() => bn2.close());

// Attach the listener before connecting so the head event can't be missed.
const waitForSynced = waitForEvent<routes.events.EventData[routes.events.EventType.head]>(
bn2.chain.emitter,
routes.events.EventType.head,
60_000,
({block}) => block === headRootHex
);

await Promise.all([connect(bn2.network, bn.network), onPeerConnect(bn2.network), onPeerConnect(bn.network)]);
loggerNodeB.info("Node B connected to Node A");

try {
await waitForSynced;
} catch (_e) {
expect.fail(
`Node B failed to range-sync through the empty epochs to Node A's head (slot ${headSummary.slot}). ` +
"Before the fix this deadlocks: a zero-block response for an empty epoch threw MISSING_BLOCKS_RESPONSE, " +
"killing the SyncChain after MAX_BATCH_DOWNLOAD_ATTEMPTS."
);
}
});
});
15 changes: 12 additions & 3 deletions packages/beacon-node/test/unit/sync/range/chain.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -30,14 +30,23 @@ describe("sync / range / chain", () => {
targetEpoch: 16,
},
{
// due to BATCH_BUFFER_SIZE and MAX_LOOK_AHEAD_EPOCHS, lodestar cannot deal with unlimited skipped slots
// having a test with 2 epochs of skipped slots is enough to test the logic
// this hasn't happened in any networks as of Aug 2025
// A run of skipped slots that fits within the MAX_LOOK_AHEAD_EPOCHS window. The longer-run case
// (a run of empty epochs exceeding the look-ahead window, which previously deadlocked) is
// covered by "Simulate sync with 3 epochs of skipped slots" below.
id: "Simulate sync with 2 epochs of skipped slots",
startEpoch: 0,
targetEpoch: 16,
skippedSlots: new Set(linspace(3 * SLOTS_PER_EPOCH, 4 * SLOTS_PER_EPOCH)),
},
{
// Regression: a run of empty epochs longer than MAX_LOOK_AHEAD_EPOCHS must not deadlock the
// chain. Empty batches sit in AwaitingValidation and never advance lastEpochWithProcessBlocks,
// so the look-ahead window must not count them or no further batches are ever downloaded.
id: "Simulate sync with 3 epochs of skipped slots",
startEpoch: 0,
targetEpoch: 16,
skippedSlots: new Set(linspace(3 * SLOTS_PER_EPOCH, 6 * SLOTS_PER_EPOCH)),
},
// As of https://github.com/ChainSafe/lodestar/pull/8150, we abort the batch after a single processing error
// {
// id: "Simulate sync with multiple ranges of bad blocks",
Expand Down
Loading
Loading