use of com.hazelcast.cp.internal.raft.impl.dto.AppendRequest in project hazelcast by hazelcast.
the class SnapshotTest method testMembershipChangeBlocksSnapshotBug.
/**
 * Drives a 3-node group into a state where the leader's log is filled up with
 * uncommitted entries and a membership change entry, then checks that a further
 * append is put into the log but its future is not completed.
 * <p>
 * The inline LOG comments show how the code behaves before the mentioned bug is fixed.
 *
 * @throws ExecutionException   if waiting for a replicated entry fails
 * @throws InterruptedException if the test thread is interrupted while waiting
 */
@Test
public void testMembershipChangeBlocksSnapshotBug() throws ExecutionException, InterruptedException {
    // The comments below show how the code behaves before the mentioned bug is fixed.
    int commitIndexAdvanceCount = 50;
    int uncommittedEntryCount = 10;
    RaftAlgorithmConfig config = new RaftAlgorithmConfig()
            .setCommitIndexAdvanceCountToSnapshot(commitIndexAdvanceCount)
            .setUncommittedEntryCountToRejectNewAppends(uncommittedEntryCount);
    group = newGroup(3, config);
    group.start();

    RaftNodeImpl leader = group.waitUntilLeaderElected();
    RaftNodeImpl[] followers = group.getNodesExcept(leader.getLocalMember());

    // followers[0] stops receiving append requests, so it falls behind from here on.
    group.dropMessagesToMember(leader.getLocalMember(), followers[0].getLocalMember(), AppendRequest.class);

    while (getSnapshotEntry(leader).index() == 0) {
        leader.replicate(new ApplyRaftRunnable("into_snapshot")).get();
    }

    // now, the leader has taken a snapshot.
    // It also keeps some already committed entries in the log because followers[0] hasn't appended them.
    // LOG: [ <46 - 49>, <50>], SNAPSHOT INDEX: 50, COMMIT INDEX: 50

    long leaderCommitIndex = getCommitIndex(leader);
    do {
        leader.replicate(new ApplyRaftRunnable("committed_after_snapshot")).get();
    } while (getCommitIndex(leader) < leaderCommitIndex + commitIndexAdvanceCount - 1);

    // committing new entries.
    // one more entry is needed to take the next snapshot.
    // LOG: [ <46 - 49>, <50>, <51 - 99 (committed)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 99

    group.dropMessagesToMember(leader.getLocalMember(), followers[1].getLocalMember(), AppendRequest.class);

    for (int i = 0; i < uncommittedEntryCount - 1; i++) {
        leader.replicate(new ApplyRaftRunnable("uncommitted_after_snapshot"));
    }

    // appended some more entries which will not be committed because the leader has no majority.
    // the last uncommitted index is reserved for the membership change.
    // LOG: [ <46 - 49>, <50>, <51 - 99 (committed)>, <100 - 108 (uncommitted)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 99
    // There are only 2 empty indices in the log.

    RaftNodeImpl newRaftNode = group.createNewRaftNode();

    // Strips the membership change entry from append requests:
    //  - if the last entry is a membership change, only that last entry is removed;
    //  - otherwise, if the first entry is a membership change, all entries are removed.
    // Returns null for everything else (presumably meaning "leave the message as is" -- confirm
    // against alterMessagesToMember).
    Function<Object, Object> alterFunc = o -> {
        if (!(o instanceof AppendRequest)) {
            return null;
        }
        AppendRequest request = (AppendRequest) o;
        LogEntry[] entries = request.entries();
        if (entries.length == 0) {
            return null;
        }

        LogEntry[] trimmed;
        if (entries[entries.length - 1].operation() instanceof UpdateRaftGroupMembersCmd) {
            trimmed = Arrays.copyOf(entries, entries.length - 1);
        } else if (entries[0].operation() instanceof UpdateRaftGroupMembersCmd) {
            trimmed = new LogEntry[0];
        } else {
            return null;
        }
        return new AppendRequest(request.leader(), request.term(), request.prevLogTerm(), request.prevLogIndex(),
                request.leaderCommitIndex(), trimmed, request.queryRound());
    };

    group.alterMessagesToMember(leader.getLocalMember(), followers[1].getLocalMember(), alterFunc);
    group.alterMessagesToMember(leader.getLocalMember(), newRaftNode.getLocalMember(), alterFunc);

    long lastLogIndex1 = getLastLogOrSnapshotEntry(leader).index();

    leader.replicateMembershipChange(newRaftNode.getLocalMember(), MembershipChangeMode.ADD);

    // When the membership change entry is appended, the leader's log will be as follows:
    // LOG: [ <46 - 49>, <50>, <51 - 99 (committed)>, <100 - 108 (uncommitted)>, <109 (membership change)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 99

    assertTrueEventually(() -> assertTrue(getLastLogOrSnapshotEntry(leader).index() > lastLogIndex1));

    group.allowMessagesToMember(leader.getLocalMember(), followers[1].getLocalMember(), AppendRequest.class);

    // Then, only the entries before the membership change will be committed because we alter the append request. The log will be:
    // LOG: [ <46 - 49>, <50>, <51 - 108 (committed)>, <109 (membership change)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 108
    // There is only 1 empty index in the log.

    assertTrueEventually(() -> {
        assertEquals(lastLogIndex1, getCommitIndex(leader));
        assertEquals(lastLogIndex1, getCommitIndex(followers[1]));
    });

    // NOTE(review): alternative expectation left disabled by the original author; confirm its
    // purpose before enabling or removing it.
    // assertTrueEventually(() -> {
    // assertEquals(lastLogIndex1 + 1, getCommitIndex(leader));
    // assertEquals(lastLogIndex1 + 1, getCommitIndex(followers[1]));
    // });

    long lastLogIndex2 = getLastLogOrSnapshotEntry(leader).index();

    leader.replicate(new ApplyRaftRunnable("after_membership_change_append"));

    assertTrueEventually(() -> assertTrue(getLastLogOrSnapshotEntry(leader).index() > lastLogIndex2));

    // Now the log is full. There is no empty space left.
    // LOG: [ <46 - 49>, <50>, <51 - 108 (committed)>, <109 (membership change)>, <110 (uncommitted)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 108

    long lastLogIndex3 = getLastLogOrSnapshotEntry(leader).index();

    // Wildcard instead of a raw Future: the result value is never read, only the completion state.
    Future<?> pendingAppend = leader.replicate(new ApplyRaftRunnable("after_membership_change_append"));

    assertTrueEventually(() -> assertTrue(getLastLogOrSnapshotEntry(leader).index() > lastLogIndex3));

    // The entry made it into the leader's log, but its future must still be pending.
    assertFalse(pendingAppend.isDone());
}
use of com.hazelcast.cp.internal.raft.impl.dto.AppendRequest in project hazelcast by hazelcast.
the class RaftNodeImpl method sendAppendRequest.
/**
 * Sends either an append-entries request or an {@link InstallSnapshot} request
 * to the given follower.
 * <p>
 * Nothing is sent when the follower is not reachable or when an append request
 * back-off is currently set for it.
 * <p>
 * An {@link InstallSnapshot} request is sent when the follower's nextIndex is not
 * beyond the latest snapshot index and the log no longer contains the entry at
 * nextIndex (or the entry right before it).
 * <p>
 * Otherwise, log entries starting at the follower's nextIndex are sent in a batch
 * whose upper bound is derived from
 * {@link RaftAlgorithmConfig#getAppendRequestMaxEntryCount()}.
 * <p>
 * While the leader does not know the follower's matchIndex
 * ({@code matchIndex == 0}), an empty append-entries request is sent to save
 * bandwidth until the matchIndex is learnt.
 *
 * @param follower the follower endpoint to send the request to
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength" })
public void sendAppendRequest(RaftEndpoint follower) {
    if (!raftIntegration.isReachable(follower)) {
        return;
    }

    RaftLog entryLog = state.log();
    LeaderState leaderCtx = state.leaderState();
    FollowerState followerCtx = leaderCtx.getFollowerState(follower);
    if (followerCtx.isAppendRequestBackoffSet()) {
        // A back-off is already set for this follower, so no new request is sent now.
        // Presumably a new request goes out once the follower responds
        // or a back-off timeout occurs.
        return;
    }

    long nextIndex = followerCtx.nextIndex();

    // The follower needs the snapshot when its next entry (or that entry's predecessor)
    // is covered by the snapshot and is no longer present in the log.
    boolean snapshotRequired = nextIndex <= entryLog.snapshotIndex()
            && (!entryLog.containsLogEntry(nextIndex) || (nextIndex > 1 && !entryLog.containsLogEntry(nextIndex - 1)));
    if (snapshotRequired) {
        InstallSnapshot installSnapshot = new InstallSnapshot(state.localEndpoint(), state.term(), entryLog.snapshot(),
                leaderCtx.queryRound());
        if (logger.isFineEnabled()) {
            logger.fine("Sending " + installSnapshot + " to " + follower + " since next index: " + nextIndex
                    + " <= snapshot index: " + entryLog.snapshotIndex());
        }
        // No flush task is submitted on this path; the snapshot carries committed state.
        raftIntegration.send(installSnapshot, follower);
        followerCtx.setMaxAppendRequestBackoff();
        scheduleAppendAckResetTask();
        return;
    }

    // Term and index of the entry right before nextIndex; both stay 0 when nextIndex <= 1.
    long prevIndex = 0;
    int prevTerm = 0;
    if (nextIndex > 1) {
        prevIndex = nextIndex - 1;
        LogEntry prev;
        if (entryLog.snapshotIndex() == prevIndex) {
            prev = entryLog.snapshot();
        } else {
            prev = entryLog.getLogEntry(prevIndex);
        }
        assert prev != null : "Prev entry index: " + prevIndex + ", snapshot: " + entryLog.snapshotIndex();
        prevTerm = prev.term();
    }

    LogEntry[] batch;
    boolean backoff;
    if (nextIndex > 1 && followerCtx.matchIndex() == 0) {
        // The point where the leader's and the follower's logs match is not known yet:
        // send no entries (like a heartbeat) to save bandwidth, but still back off so the
        // follower is not bombarded before its match index is learnt.
        batch = new LogEntry[0];
        backoff = true;
    } else if (nextIndex >= 1 && nextIndex <= entryLog.lastLogOrSnapshotIndex()) {
        // There are entries at or after nextIndex: send the actual entries.
        // NOTE(review): if getEntriesBetween treats both bounds as inclusive, a batch can hold
        // appendRequestMaxEntryCount + 1 entries -- confirm against RaftLog.
        long end = min(nextIndex + appendRequestMaxEntryCount, entryLog.lastLogOrSnapshotIndex());
        batch = entryLog.getEntriesBetween(nextIndex, end);
        backoff = true;
    } else {
        // Either the follower has caught up with the leader or the Raft log has no entry:
        // send an empty append request as a heartbeat, without back-off.
        batch = new LogEntry[0];
        backoff = false;
    }

    AppendRequest request = new AppendRequest(getLocalMember(), state.term(), prevTerm, prevIndex, state.commitIndex(),
            batch, leaderCtx.queryRound());
    if (logger.isFineEnabled()) {
        logger.fine("Sending " + request + " to " + follower + " with next index: " + nextIndex);
    }
    raftIntegration.send(request, follower);

    boolean sendsUnflushedEntry = batch.length > 0 && batch[batch.length - 1].index() > leaderCtx.flushedLogIndex();
    if (sendsUnflushedEntry) {
        // An entry beyond the leader's flushed log index has just been sent, so trigger the
        // flush task now. The intent is that the leader flushes in parallel with the
        // followers instead of after their append responses arrive.
        submitFlushTask();
    }

    if (backoff) {
        followerCtx.setAppendRequestBackoff();
        scheduleAppendAckResetTask();
    }
}
Aggregations