Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions raft/src/main/java/org/apache/kafka/raft/FollowerState.java
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,14 @@ public void resetFetchTimeoutForSuccessfulFetch(long currentTimeMs) {
hasFetchedFromLeader = true;
}

/**
* Reset the fetch timeout after successful fetch from the bootstrap servers.
* This should only be called by observers who fetched from a non-leader bootstrap server.
*/
public void resetFetchTimeoutForBootstrapServers(long currentTimeMs) {
overrideFetchTimeout(currentTimeMs, fetchTimeoutMs);
}

/**
* Override the fetch timeout to a specific value. This is useful for short-circuiting followers' timeouts after
* they receive end quorum requests
Expand Down
30 changes: 29 additions & 1 deletion raft/src/main/java/org/apache/kafka/raft/KafkaRaftClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -1725,6 +1725,8 @@ private boolean handleFetchResponse(
leaderEndpoints = Endpoints.empty();
}

maybeSwitchObserverFetchToLeader(responseMetadata.source(), responseLeaderId, leaderEndpoints, currentTimeMs);

Optional<Boolean> handled = maybeHandleCommonResponse(
error,
responseLeaderId,
Expand Down Expand Up @@ -2608,6 +2610,26 @@ private boolean hasConsistentLeader(int epoch, OptionalInt leaderId) {
}
}

/**
* If the local replica is an observer currently routing fetches to bootstrap servers,
* and a non-leader source's fetch response advertises the leader's endpoints, switch
* the observer back to fetching from the leader.
*/
private void maybeSwitchObserverFetchToLeader(
Node source,
OptionalInt responseLeaderId,
Endpoints leaderEndpoints,
long currentTimeMs
) {
if (!quorum.isVoter()
&& quorum.isFollower()
&& responseLeaderId.isPresent()
&& source.id() != responseLeaderId.getAsInt()
&& !leaderEndpoints.isEmpty()) {
quorum.followerStateOrThrow().resetFetchTimeoutForBootstrapServers(currentTimeMs);
}
}

/**
* Handle response errors that are common across request types.
*
Expand Down Expand Up @@ -3391,7 +3413,13 @@ private long pollFollowerAsObserver(FollowerState state, long currentTimeMs) {
}
return sendResult.timeToWaitMs();
} else {
return maybeSendFetchToBestNode(state, currentTimeMs);
final long backoffMs;
if (state.hasFetchTimeoutExpired(currentTimeMs)) {
backoffMs = maybeSendFetchToAnyBootstrap(currentTimeMs);
} else {
backoffMs = maybeSendFetchToBestNode(state, currentTimeMs);
}
return Math.min(backoffMs, state.remainingFetchTimeMs(currentTimeMs));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@

import java.nio.ByteBuffer;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.OptionalInt;
import java.util.OptionalLong;
Expand Down Expand Up @@ -765,4 +766,75 @@ void testUpdatedHighWatermarkCompleted() throws Exception {
assertEquals(localLogEndOffset, partitionResponse.highWatermark());
}
}

@Test
void testObserverFetchesBetweenLeaderAndBootstrapServers() throws Exception {
final var epoch = 2;
final var local = KafkaRaftClientTest.replicaKey(
KafkaRaftClientTest.randomReplicaId(),
true
);
final var leader = KafkaRaftClientTest.replicaKey(local.id() + 1, true);
final var otherVoter = KafkaRaftClientTest.replicaKey(local.id() + 2, true);

final var voters = VoterSet.fromMap(
Map.of(
leader.id(), VoterSetTest.voterNode(leader),
otherVoter.id(), VoterSetTest.voterNode(otherVoter)
)
);

final var context = new RaftClientTestContext.Builder(
local.id(),
local.directoryId().get()
)
.withStaticVoters(voters)
.withBootstrapServers(Optional.of(List.of(RaftClientTestContext.mockAddress(otherVoter.id()))))
.withRaftProtocol(RaftClientTestContext.RaftProtocol.KIP_1166_PROTOCOL)
.build();

for (int i = 0; i < 10; ++i) {
// The observer initially fetches from the bootstrap servers, where it will discover the leader's endpoints.
context.pollUntilRequest();
final var bootstrapFetch = context.assertSentFetchRequest();
assertEquals(-2, bootstrapFetch.destination().id());
assertEquals(RaftClientTestContext.mockAddress(otherVoter.id()).getHostName(), bootstrapFetch.destination().host());
assertEquals(RaftClientTestContext.mockAddress(otherVoter.id()).getPort(), bootstrapFetch.destination().port());

context.deliverResponse(
bootstrapFetch.correlationId(),
bootstrapFetch.destination(),
context.fetchResponse(
epoch,
leader.id(),
MemoryRecords.EMPTY,
0L,
Errors.NOT_LEADER_OR_FOLLOWER
)
);

// Subsequent fetch from the observer is sent to the leader
context.pollUntilRequest();
final var leaderFetch = context.assertSentFetchRequest();
assertEquals(leader.id(), leaderFetch.destination().id());
assertEquals(RaftClientTestContext.mockAddress(leader.id()).getHostName(), leaderFetch.destination().host());
assertEquals(RaftClientTestContext.mockAddress(leader.id()).getPort(), leaderFetch.destination().port());

// Return a BROKER_NOT_AVAILABLE error, and then advance time past the fetch timeout,
// which should cause the observer to fetch from the bootstrap servers on the next fetch.

// The fetch timeout is much greater than the request manager's configured backoff, so the
// current unreachable connection will no longer be backing off when the next fetch is sent.
context.deliverResponse(
leaderFetch.correlationId(),
leaderFetch.destination(),
RaftUtil.errorResponse(
ApiKeys.FETCH,
Errors.BROKER_NOT_AVAILABLE
)
);
context.client.poll();
context.time.sleep(context.fetchTimeoutMs + 1);
}
}
}