Search in sources :

Example 1 with AppendRequest

use of com.hazelcast.cp.internal.raft.impl.dto.AppendRequest in project hazelcast by hazelcast.

The method testMembershipChangeBlocksSnapshotBug of the class SnapshotTest.

/**
 * Drives a 3-node Raft group into a state where the leader's log holds committed entries past its
 * snapshot, a batch of uncommitted entries, a membership change entry and one more uncommitted entry,
 * and then verifies that a further {@code replicate} call is appended to the leader's log while
 * its future is not completed.
 * <p>
 * The {@code LOG: [...]} comments document the expected content of the leader's log at each step.
 *
 * @throws ExecutionException   if one of the awaited replications fails
 * @throws InterruptedException if the test thread is interrupted while waiting for a replication
 */
@Test
public void testMembershipChangeBlocksSnapshotBug() throws ExecutionException, InterruptedException {
    // The comments below show how the code behaves before the mentioned bug is fixed.
    int commitIndexAdvanceCount = 50;
    int uncommittedEntryCount = 10;
    RaftAlgorithmConfig config = new RaftAlgorithmConfig()
            .setCommitIndexAdvanceCountToSnapshot(commitIndexAdvanceCount)
            .setUncommittedEntryCountToRejectNewAppends(uncommittedEntryCount);
    group = newGroup(3, config);
    group.start();
    RaftNodeImpl leader = group.waitUntilLeaderElected();
    RaftNodeImpl[] followers = group.getNodesExcept(leader.getLocalMember());

    // followers[0] no longer receives append requests from the leader.
    group.dropMessagesToMember(leader.getLocalMember(), followers[0].getLocalMember(), AppendRequest.class);

    // Keep committing entries until the leader takes its first snapshot.
    while (getSnapshotEntry(leader).index() == 0) {
        leader.replicate(new ApplyRaftRunnable("into_snapshot")).get();
    }
    // Now, the leader has taken a snapshot.
    // It also keeps some already committed entries in the log because followers[0] hasn't appended them.
    // LOG: [ <46 - 49>, <50>], SNAPSHOT INDEX: 50, COMMIT INDEX: 50

    // Commit new entries until exactly one more entry is needed to take the next snapshot.
    long leaderCommitIndex = getCommitIndex(leader);
    do {
        leader.replicate(new ApplyRaftRunnable("committed_after_snapshot")).get();
    } while (getCommitIndex(leader) < leaderCommitIndex + commitIndexAdvanceCount - 1);
    // LOG: [ <46 - 49>, <50>, <51 - 99 (committed)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 99

    // Cut off followers[1] as well, so the leader has no majority any more.
    group.dropMessagesToMember(leader.getLocalMember(), followers[1].getLocalMember(), AppendRequest.class);
    for (int i = 0; i < uncommittedEntryCount - 1; i++) {
        // The returned future is intentionally not awaited: these entries cannot be committed yet.
        leader.replicate(new ApplyRaftRunnable("uncommitted_after_snapshot"));
    }
    // Appended some more entries which will not be committed because the leader has no majority.
    // The last uncommitted index is reserved for the membership change.
    // LOG: [ <46 - 49>, <50>, <51 - 99 (committed)>, <100 - 108 (uncommitted)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 99
    // There are only 2 empty indices in the log.

    RaftNodeImpl newRaftNode = group.createNewRaftNode();

    // Rewrites append requests so that the membership change entry never reaches the receiver:
    //  - if the batch ends with the membership change entry, that last entry is cut off;
    //  - if the batch starts with the membership change entry, the whole batch is emptied.
    // Any other message yields null (the same result the original branches produced).
    Function<Object, Object> alterFunc = message -> {
        if (!(message instanceof AppendRequest)) {
            return null;
        }
        AppendRequest request = (AppendRequest) message;
        LogEntry[] entries = request.entries();
        if (entries.length == 0) {
            return null;
        }

        LogEntry[] keptEntries;
        if (entries[entries.length - 1].operation() instanceof UpdateRaftGroupMembersCmd) {
            keptEntries = Arrays.copyOf(entries, entries.length - 1);
        } else if (entries[0].operation() instanceof UpdateRaftGroupMembersCmd) {
            keptEntries = new LogEntry[0];
        } else {
            return null;
        }
        return new AppendRequest(request.leader(), request.term(), request.prevLogTerm(), request.prevLogIndex(),
                request.leaderCommitIndex(), keptEntries, request.queryRound());
    };
    group.alterMessagesToMember(leader.getLocalMember(), followers[1].getLocalMember(), alterFunc);
    group.alterMessagesToMember(leader.getLocalMember(), newRaftNode.getLocalMember(), alterFunc);

    long lastLogIndex1 = getLastLogOrSnapshotEntry(leader).index();
    leader.replicateMembershipChange(newRaftNode.getLocalMember(), MembershipChangeMode.ADD);
    // When the membership change entry is appended, the leader's log will be as follows:
    // LOG: [ <46 - 49>, <50>, <51 - 99 (committed)>, <100 - 108 (uncommitted)>, <109 (membership change)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 99
    assertTrueEventually(() -> assertTrue(getLastLogOrSnapshotEntry(leader).index() > lastLogIndex1));

    group.allowMessagesToMember(leader.getLocalMember(), followers[1].getLocalMember(), AppendRequest.class);
    // Then, only the entries before the membership change will be committed because we alter the append request. The log will be:
    // LOG: [ <46 - 49>, <50>, <51 - 108 (committed)>, <109 (membership change)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 108
    // There is only 1 empty index in the log.
    assertTrueEventually(() -> {
        assertEquals(lastLogIndex1, getCommitIndex(leader));
        assertEquals(lastLogIndex1, getCommitIndex(followers[1]));
    });
    // assertTrueEventually(() -> {
    // assertEquals(lastLogIndex1 + 1, getCommitIndex(leader));
    // assertEquals(lastLogIndex1 + 1, getCommitIndex(followers[1]));
    // });

    long lastLogIndex2 = getLastLogOrSnapshotEntry(leader).index();
    leader.replicate(new ApplyRaftRunnable("after_membership_change_append"));
    assertTrueEventually(() -> assertTrue(getLastLogOrSnapshotEntry(leader).index() > lastLogIndex2));
    // Now the log is full. There is no empty space left.
    // LOG: [ <46 - 49>, <50>, <51 - 108 (committed)>, <109 (membership change)>, <110 (uncommitted)> ], SNAPSHOT INDEX: 50, COMMIT INDEX: 108

    long lastLogIndex3 = getLastLogOrSnapshotEntry(leader).index();
    // Wildcard instead of the raw Future type: only isDone() is used, the result type is irrelevant.
    Future<?> f = leader.replicate(new ApplyRaftRunnable("after_membership_change_append"));
    // The new entry is appended to the leader's log, but its future must not be completed.
    assertTrueEventually(() -> assertTrue(getLastLogOrSnapshotEntry(leader).index() > lastLogIndex3));
    assertFalse(f.isDone());
}
Also used : ParallelJVMTest(com.hazelcast.test.annotation.ParallelJVMTest) AppendSuccessResponse(com.hazelcast.cp.internal.raft.impl.dto.AppendSuccessResponse) Arrays(java.util.Arrays) StaleAppendRequestException(com.hazelcast.cp.exception.StaleAppendRequestException) MembershipChangeMode(com.hazelcast.cp.internal.raft.MembershipChangeMode) QuickTest(com.hazelcast.test.annotation.QuickTest) RunWith(org.junit.runner.RunWith) RaftUtil.getLeaderMember(com.hazelcast.cp.internal.raft.impl.RaftUtil.getLeaderMember) InstallSnapshot(com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot) Function(java.util.function.Function) ArrayList(java.util.ArrayList) InternalCompletableFuture(com.hazelcast.spi.impl.InternalCompletableFuture) Assert.assertThat(org.junit.Assert.assertThat) AppendRequest(com.hazelcast.cp.internal.raft.impl.dto.AppendRequest) RaftUtil.getMatchIndex(com.hazelcast.cp.internal.raft.impl.RaftUtil.getMatchIndex) Future(java.util.concurrent.Future) RaftUtil.getCommitIndex(com.hazelcast.cp.internal.raft.impl.RaftUtil.getCommitIndex) AppendFailureResponse(com.hazelcast.cp.internal.raft.impl.dto.AppendFailureResponse) After(org.junit.After) RaftAlgorithmConfig(com.hazelcast.config.cp.RaftAlgorithmConfig) Assert.fail(org.junit.Assert.fail) ACTIVE(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.ACTIVE) RaftUtil.getCommittedGroupMembers(com.hazelcast.cp.internal.raft.impl.RaftUtil.getCommittedGroupMembers) Matchers.greaterThanOrEqualTo(org.hamcrest.Matchers.greaterThanOrEqualTo) Assert.assertNotNull(org.junit.Assert.assertNotNull) ApplyRaftRunnable(com.hazelcast.cp.internal.raft.impl.dataservice.ApplyRaftRunnable) HazelcastTestSupport(com.hazelcast.test.HazelcastTestSupport) RaftUtil.getStatus(com.hazelcast.cp.internal.raft.impl.RaftUtil.getStatus) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) LocalRaftGroup(com.hazelcast.cp.internal.raft.impl.testing.LocalRaftGroup) Category(org.junit.experimental.categories.Category) 
Assert.assertNotEquals(org.junit.Assert.assertNotEquals) UpdateRaftGroupMembersCmd(com.hazelcast.cp.internal.raft.impl.command.UpdateRaftGroupMembersCmd) ExecutionException(java.util.concurrent.ExecutionException) RaftUtil.getSnapshotEntry(com.hazelcast.cp.internal.raft.impl.RaftUtil.getSnapshotEntry) RaftDataService(com.hazelcast.cp.internal.raft.impl.dataservice.RaftDataService) List(java.util.List) LocalRaftGroupBuilder.newGroup(com.hazelcast.cp.internal.raft.impl.testing.LocalRaftGroup.LocalRaftGroupBuilder.newGroup) Assert.assertFalse(org.junit.Assert.assertFalse) HazelcastParallelClassRunner(com.hazelcast.test.HazelcastParallelClassRunner) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) RaftUtil.getLastLogOrSnapshotEntry(com.hazelcast.cp.internal.raft.impl.RaftUtil.getLastLogOrSnapshotEntry) Assert.assertEquals(org.junit.Assert.assertEquals) RaftAlgorithmConfig(com.hazelcast.config.cp.RaftAlgorithmConfig) UpdateRaftGroupMembersCmd(com.hazelcast.cp.internal.raft.impl.command.UpdateRaftGroupMembersCmd) ApplyRaftRunnable(com.hazelcast.cp.internal.raft.impl.dataservice.ApplyRaftRunnable) InternalCompletableFuture(com.hazelcast.spi.impl.InternalCompletableFuture) Future(java.util.concurrent.Future) AppendRequest(com.hazelcast.cp.internal.raft.impl.dto.AppendRequest) ParallelJVMTest(com.hazelcast.test.annotation.ParallelJVMTest) QuickTest(com.hazelcast.test.annotation.QuickTest) Test(org.junit.Test)

Example 2 with AppendRequest

use of com.hazelcast.cp.internal.raft.impl.dto.AppendRequest in project hazelcast by hazelcast.

The method sendAppendRequest of the class RaftNodeImpl.

/**
 * Sends either an append-entries request or an {@link InstallSnapshot} request to the given follower.
 * <p>
 * Nothing is sent if the follower is not reachable or if its append-request back-off is currently set.
 * <p>
 * An {@link InstallSnapshot} request is sent when the follower's nextIndex is not beyond the snapshot index
 * and the log does not contain the entry at nextIndex (or, for {@code nextIndex > 1}, the entry right before it).
 * <p>
 * Otherwise an append-entries request is sent. It carries the log entries starting at the follower's nextIndex,
 * with the batch bounded through {@code appendRequestMaxEntryCount}
 * (see {@link RaftAlgorithmConfig#getAppendRequestMaxEntryCount()}). The request carries no entries when
 * the follower's matchIndex is not known yet ({@code matchIndex == 0}), when the follower has already
 * caught up, or when the log is empty.
 *
 * @param follower the member the request is sent to
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength" })
public void sendAppendRequest(RaftEndpoint follower) {
    if (!raftIntegration.isReachable(follower)) {
        return;
    }

    RaftLog localLog = state.log();
    LeaderState leader = state.leaderState();
    FollowerState target = leader.getFollowerState(follower);
    if (target.isAppendRequestBackoffSet()) {
        // Back-off is set for this follower, so no new request is sent right now.
        return;
    }

    long nextIndex = target.nextIndex();
    if (nextIndex <= localLog.snapshotIndex()
            && (!localLog.containsLogEntry(nextIndex)
                    || (nextIndex > 1 && !localLog.containsLogEntry(nextIndex - 1)))) {
        // The entries the follower needs are not in the log any more: ship the snapshot instead.
        InstallSnapshot snapshotRequest = new InstallSnapshot(state.localEndpoint(), state.term(),
                localLog.snapshot(), leader.queryRound());
        if (logger.isFineEnabled()) {
            logger.fine("Sending " + snapshotRequest + " to " + follower + " since next index: " + nextIndex
                    + " <= snapshot index: " + localLog.snapshotIndex());
        }
        // The flush task is not submitted on this path, since a snapshot holds committed state.
        raftIntegration.send(snapshotRequest, follower);
        target.setMaxAppendRequestBackoff();
        scheduleAppendAckResetTask();
        return;
    }

    LogEntry[] batch;
    int prevTerm = 0;
    long prevIndex = 0;
    boolean backoffAfterSend = true;

    if (nextIndex > 1) {
        prevIndex = nextIndex - 1;
        // The entry preceding nextIndex is either the snapshot itself or a regular log entry.
        LogEntry prev;
        if (localLog.snapshotIndex() == prevIndex) {
            prev = localLog.snapshot();
        } else {
            prev = localLog.getLogEntry(prevIndex);
        }
        assert prev != null : "Prev entry index: " + prevIndex + ", snapshot: " + localLog.snapshotIndex();
        prevTerm = prev.term();

        if (target.matchIndex() == 0) {
            // The follower's match index is still unknown: send a request without entries
            // (heartbeat-like) to save bandwidth. Back-off stays enabled so the follower
            // is not bombarded before its match index is learnt.
            batch = new LogEntry[0];
        } else if (nextIndex > localLog.lastLogOrSnapshotIndex()) {
            // The follower has caught up: an empty request serves as a heartbeat, no back-off.
            batch = new LogEntry[0];
            backoffAfterSend = false;
        } else {
            // The match index is known and there are entries to ship: send the actual batch.
            long lastToSend = min(nextIndex + appendRequestMaxEntryCount, localLog.lastLogOrSnapshotIndex());
            batch = localLog.getEntriesBetween(nextIndex, lastToSend);
        }
    } else if (nextIndex == 1 && localLog.lastLogOrSnapshotIndex() > 0) {
        // First batch of entries ever sent to this follower.
        long lastToSend = min(nextIndex + appendRequestMaxEntryCount, localLog.lastLogOrSnapshotIndex());
        batch = localLog.getEntriesBetween(nextIndex, lastToSend);
    } else {
        // The Raft log has no entry at all: an empty request serves as a heartbeat, no back-off.
        batch = new LogEntry[0];
        backoffAfterSend = false;
    }

    AppendRequest request = new AppendRequest(getLocalMember(), state.term(), prevTerm, prevIndex,
            state.commitIndex(), batch, leader.queryRound());
    if (logger.isFineEnabled()) {
        logger.fine("Sending " + request + " to " + follower + " with next index: " + nextIndex);
    }
    raftIntegration.send(request, follower);

    boolean carriesEntries = batch.length > 0;
    if (carriesEntries && batch[batch.length - 1].index() > leader.flushedLogIndex()) {
        // At least one entry in the batch is beyond the flushed log index, so the flush task is
        // triggered now; the intent is for the leader and the followers to flush in parallel.
        submitFlushTask();
    }

    if (backoffAfterSend) {
        target.setAppendRequestBackoff();
        scheduleAppendAckResetTask();
    }
}
Also used : FollowerState(com.hazelcast.cp.internal.raft.impl.state.FollowerState) AppendRequest(com.hazelcast.cp.internal.raft.impl.dto.AppendRequest) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog) LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState) InstallSnapshot(com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot)

Aggregations

AppendRequest (com.hazelcast.cp.internal.raft.impl.dto.AppendRequest)2 InstallSnapshot (com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot)2 LogEntry (com.hazelcast.cp.internal.raft.impl.log.LogEntry)2 RaftAlgorithmConfig (com.hazelcast.config.cp.RaftAlgorithmConfig)1 StaleAppendRequestException (com.hazelcast.cp.exception.StaleAppendRequestException)1 MembershipChangeMode (com.hazelcast.cp.internal.raft.MembershipChangeMode)1 ACTIVE (com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.ACTIVE)1 RaftUtil.getCommitIndex (com.hazelcast.cp.internal.raft.impl.RaftUtil.getCommitIndex)1 RaftUtil.getCommittedGroupMembers (com.hazelcast.cp.internal.raft.impl.RaftUtil.getCommittedGroupMembers)1 RaftUtil.getLastLogOrSnapshotEntry (com.hazelcast.cp.internal.raft.impl.RaftUtil.getLastLogOrSnapshotEntry)1 RaftUtil.getLeaderMember (com.hazelcast.cp.internal.raft.impl.RaftUtil.getLeaderMember)1 RaftUtil.getMatchIndex (com.hazelcast.cp.internal.raft.impl.RaftUtil.getMatchIndex)1 RaftUtil.getSnapshotEntry (com.hazelcast.cp.internal.raft.impl.RaftUtil.getSnapshotEntry)1 RaftUtil.getStatus (com.hazelcast.cp.internal.raft.impl.RaftUtil.getStatus)1 UpdateRaftGroupMembersCmd (com.hazelcast.cp.internal.raft.impl.command.UpdateRaftGroupMembersCmd)1 ApplyRaftRunnable (com.hazelcast.cp.internal.raft.impl.dataservice.ApplyRaftRunnable)1 RaftDataService (com.hazelcast.cp.internal.raft.impl.dataservice.RaftDataService)1 AppendFailureResponse (com.hazelcast.cp.internal.raft.impl.dto.AppendFailureResponse)1 AppendSuccessResponse (com.hazelcast.cp.internal.raft.impl.dto.AppendSuccessResponse)1 RaftLog (com.hazelcast.cp.internal.raft.impl.log.RaftLog)1