Search in sources :

Example 6 with LeaderState

use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.

The method findQuorumMatchIndex of the class RaftNodeImpl.

/**
 * Picks the match index that a majority of the counted members has reached.
 * <p>
 * The match indices tracked by the leader state are sorted in ascending order and
 * the element at position {@code (length - 1) / 2} is returned, so more than half
 * of the counted indices are greater than or equal to the result.
 *
 * @return the quorum match index
 */
private long findQuorumMatchIndex() {
    LeaderState leader = state.leaderState();
    long[] matchIndices = leader.matchIndices();
    boolean localEndpointIsMember = state.isKnownMember(state.localEndpoint());
    if (!localEndpointIsMember) {
        // The leader is leaving, so it must not count its own vote for the quorum:
        // drop the trailing slot that is reserved for the leader index.
        matchIndices = Arrays.copyOf(matchIndices, matchIndices.length - 1);
    } else {
        // Raft dissertation Section 10.2.1:
        // The leader may even commit an entry before it has been written to its own disk,
        // if a majority of followers have written it to their disks; this is still safe.
        long ownIndex;
        if (flushTask == null) {
            ownIndex = state.log().lastLogOrSnapshotIndex();
        } else {
            ownIndex = leader.flushedLogIndex();
        }
        // The last slot of the array holds the leader's own index.
        matchIndices[matchIndices.length - 1] = ownIndex;
    }
    Arrays.sort(matchIndices);
    int quorumSlot = (matchIndices.length - 1) / 2;
    long quorumMatchIndex = matchIndices[quorumSlot];
    if (logger.isFineEnabled()) {
        logger.fine("Quorum match index: " + quorumMatchIndex + ", indices: " + Arrays.toString(matchIndices));
    }
    return quorumMatchIndex;
}
Also used : LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState)

Example 7 with LeaderState

use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.

The method forceSetTerminatedStatus of the class RaftNodeImpl.

/**
 * Forces this node into the {@code TERMINATED} status.
 * <p>
 * If the node is already terminated or stepped down, the returned future is completed
 * with {@code null} right away. Otherwise the termination work is handed to
 * {@code execute(...)}; the returned future is completed with {@code null} when that
 * work finishes normally, or exceptionally with the failure it raised.
 *
 * @return a future that is completed when the termination attempt has finished
 */
@Override
public InternalCompletableFuture forceSetTerminatedStatus() {
    InternalCompletableFuture terminationFuture = raftIntegration.newCompletableFuture();
    if (isTerminatedOrSteppedDown()) {
        if (logger.isFineEnabled()) {
            logger.fine("Already stepped down or terminated, not setting `TERMINATED` status.");
        }
        terminationFuture.complete(null);
        return terminationFuture;
    }
    execute(() -> {
        Throwable error = null;
        try {
            // The status may have changed between the check above and the execution of this task.
            if (isTerminatedOrSteppedDown()) {
                return;
            }
            if (status == INITIAL) {
                setStatus(TERMINATED);
                return;
            }
            invalidateFuturesFrom(state.commitIndex() + 1);
            LeaderState leader = state.leaderState();
            if (leader != null) {
                // Fail every query operation that is still registered in the leader's query state.
                for (BiTuple<Object, InternalCompletableFuture> pendingQuery : leader.queryState().operations()) {
                    pendingQuery.element2.completeExceptionally(new LeaderDemotedException(state.localEndpoint(), null));
                }
            }
            state.completeLeadershipTransfer(new LeaderDemotedException(state.localEndpoint(), null));
            setStatus(TERMINATED);
        } catch (Throwable t) {
            error = t;
            logger.severe("Failure during force-termination", t);
            // Even after a failure the node must end up in a final status.
            if (!(status == TERMINATED || status == STEPPED_DOWN)) {
                setStatus(TERMINATED);
            }
        } finally {
            // Runs for the early returns above as well, so the future is always completed.
            if (error != null) {
                terminationFuture.completeExceptionally(error);
            } else {
                terminationFuture.complete(null);
            }
        }
    });
    return terminationFuture;
}
Also used : LeaderDemotedException(com.hazelcast.cp.exception.LeaderDemotedException) InternalCompletableFuture(com.hazelcast.spi.impl.InternalCompletableFuture) LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState)

Example 8 with LeaderState

use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.

The method takeSnapshotIfCommitIndexAdvanced of the class RaftNodeImpl.

/**
 * Takes a snapshot at the current {@code commitIndex} once it is ahead of the latest
 * snapshot index by at least
 * {@link RaftAlgorithmConfig#getCommitIndexAdvanceCountToSnapshot()} entries.
 * <p>
 * Nothing is done if the node is terminated or stepped down, or if the Raft integration
 * returns a {@link Throwable} instead of a snapshot object (the failure is logged).
 * <p>
 * After the snapshot entry is built, the log is asked to install it and to truncate
 * entries up to an index that depends on the followers' match indices when this node
 * is the leader.
 */
@SuppressWarnings("checkstyle:npathcomplexity")
private void takeSnapshotIfCommitIndexAdvanced() {
    long commitIndex = state.commitIndex();
    boolean advancedEnough = (commitIndex - state.log().snapshotIndex()) >= commitIndexAdvanceCountToSnapshot;
    // If the status is TERMINATED or STEPPED_DOWN, then there will not be any new appends.
    if (!advancedEnough || isTerminatedOrSteppedDown()) {
        return;
    }
    RaftLog raftLog = state.log();
    Object snapshotOrFailure = raftIntegration.takeSnapshot(commitIndex);
    if (snapshotOrFailure instanceof Throwable) {
        logger.severe("Could not take snapshot at commit index: " + commitIndex, (Throwable) snapshotOrFailure);
        return;
    }
    int snapshotTerm = raftLog.getLogEntry(commitIndex).term();
    RaftGroupMembers committedMembers = state.committedGroupMembers();
    SnapshotEntry snapshotEntry = new SnapshotEntry(snapshotTerm, commitIndex, snapshotOrFailure,
            committedMembers.index(), committedMembers.members());
    // Default: keep the most recent maxNumberOfLogsToKeepAfterSnapshot entries below the commit index.
    long truncationFloor = commitIndex - maxNumberOfLogsToKeepAfterSnapshot;
    long highestLogIndexToTruncate = truncationFloor;
    LeaderState leader = state.leaderState();
    if (leader != null) {
        long[] matchIndices = leader.matchIndices();
        // Last slot is reserved for leader index and always zero, so it is excluded here.
        // If there is at least one follower with unknown match index, its log can be close
        // to the leader's log, so the default amount of old log entries is kept.
        boolean anyMatchIndexUnknown = Arrays.stream(matchIndices, 0, matchIndices.length - 1)
                .anyMatch(index -> index == 0);
        if (!anyMatchIndexUnknown) {
            // Keep the log entries starting from the minimum match index that lies strictly
            // between (commitIndex - maxNumberOfLogsToKeepAfterSnapshot) and commitIndex.
            // If there is no such follower, there is no need to keep the old log entries.
            highestLogIndexToTruncate = Arrays.stream(matchIndices)
                    .filter(index -> index < commitIndex && index > truncationFloor)
                    .map(index -> index - 1)
                    .min()
                    .orElse(commitIndex);
        }
    }
    int truncatedEntryCount = raftLog.setSnapshot(snapshotEntry, highestLogIndexToTruncate);
    if (logger.isFineEnabled()) {
        logger.fine(snapshotEntry + " is taken, " + truncatedEntryCount + " entries are truncated.");
    }
}
Also used : RaftStateStore(com.hazelcast.cp.internal.raft.impl.persistence.RaftStateStore) AppendSuccessResponse(com.hazelcast.cp.internal.raft.impl.dto.AppendSuccessResponse) QueryTask(com.hazelcast.cp.internal.raft.impl.task.QueryTask) Arrays(java.util.Arrays) StaleAppendRequestException(com.hazelcast.cp.exception.StaleAppendRequestException) Clock(com.hazelcast.internal.util.Clock) MembershipChangeTask(com.hazelcast.cp.internal.raft.impl.task.MembershipChangeTask) PreVoteRequest(com.hazelcast.cp.internal.raft.impl.dto.PreVoteRequest) InitLeadershipTransferTask(com.hazelcast.cp.internal.raft.impl.task.InitLeadershipTransferTask) InstallSnapshot(com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot) LeaderDemotedException(com.hazelcast.cp.exception.LeaderDemotedException) Arrays.sort(java.util.Arrays.sort) VoteResponse(com.hazelcast.cp.internal.raft.impl.dto.VoteResponse) RaftState.restoreRaftState(com.hazelcast.cp.internal.raft.impl.state.RaftState.restoreRaftState) InternalCompletableFuture(com.hazelcast.spi.impl.InternalCompletableFuture) AppendFailureResponse(com.hazelcast.cp.internal.raft.impl.dto.AppendFailureResponse) Map(java.util.Map) UPDATING_GROUP_MEMBER_LIST(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.UPDATING_GROUP_MEMBER_LIST) RaftAlgorithmConfig(com.hazelcast.config.cp.RaftAlgorithmConfig) PreVoteRequestHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.PreVoteRequestHandlerTask) SnapshotEntry.isNonInitial(com.hazelcast.cp.internal.raft.impl.log.SnapshotEntry.isNonInitial) VoteRequestHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.VoteRequestHandlerTask) Collection(java.util.Collection) TriggerLeaderElection(com.hazelcast.cp.internal.raft.impl.dto.TriggerLeaderElection) RandomPicker(com.hazelcast.internal.util.RandomPicker) Math.min(java.lang.Math.min) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) TERMINATING(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.TERMINATING) 
VoteResponseHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.VoteResponseHandlerTask) NopRaftStateStore(com.hazelcast.cp.internal.raft.impl.persistence.NopRaftStateStore) QueryState(com.hazelcast.cp.internal.raft.impl.state.QueryState) CPMember(com.hazelcast.cp.CPMember) RaftGroupCmd(com.hazelcast.cp.internal.raft.command.RaftGroupCmd) Long2ObjectHashMap(com.hazelcast.internal.util.collection.Long2ObjectHashMap) Entry(java.util.Map.Entry) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) STEPPED_DOWN(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.STEPPED_DOWN) SnapshotEntry(com.hazelcast.cp.internal.raft.impl.log.SnapshotEntry) PreVoteResponseHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.PreVoteResponseHandlerTask) MembershipChangeMode(com.hazelcast.cp.internal.raft.MembershipChangeMode) VoteRequest(com.hazelcast.cp.internal.raft.impl.dto.VoteRequest) InstallSnapshotHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.InstallSnapshotHandlerTask) LEADER(com.hazelcast.cp.internal.raft.impl.RaftRole.LEADER) RaftState.newRaftState(com.hazelcast.cp.internal.raft.impl.state.RaftState.newRaftState) Level(java.util.logging.Level) AppendRequest(com.hazelcast.cp.internal.raft.impl.dto.AppendRequest) FollowerState(com.hazelcast.cp.internal.raft.impl.state.FollowerState) TriggerLeaderElectionHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.TriggerLeaderElectionHandlerTask) RestoredRaftState(com.hazelcast.cp.internal.raft.impl.persistence.RestoredRaftState) TERMINATED(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.TERMINATED) DestroyRaftGroupCmd(com.hazelcast.cp.internal.raft.command.DestroyRaftGroupCmd) BiTuple(com.hazelcast.internal.util.BiTuple) ILogger(com.hazelcast.logging.ILogger) RaftGroupMembers(com.hazelcast.cp.internal.raft.impl.state.RaftGroupMembers) CPGroupId(com.hazelcast.cp.CPGroupId) ACTIVE(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.ACTIVE) RaftState(com.hazelcast.cp.internal.raft.impl.state.RaftState) 
LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState) Iterator(java.util.Iterator) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog) AppendRequestHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.AppendRequestHandlerTask) PreVoteTask(com.hazelcast.cp.internal.raft.impl.task.PreVoteTask) IOException(java.io.IOException) QueryPolicy(com.hazelcast.cp.internal.raft.QueryPolicy) AppendFailureResponseHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.AppendFailureResponseHandlerTask) Preconditions.checkNotNull(com.hazelcast.internal.util.Preconditions.checkNotNull) AppendSuccessResponseHandlerTask(com.hazelcast.cp.internal.raft.impl.handler.AppendSuccessResponseHandlerTask) FOLLOWER(com.hazelcast.cp.internal.raft.impl.RaftRole.FOLLOWER) UpdateRaftGroupMembersCmd(com.hazelcast.cp.internal.raft.impl.command.UpdateRaftGroupMembersCmd) TimeUnit(java.util.concurrent.TimeUnit) PreVoteResponse(com.hazelcast.cp.internal.raft.impl.dto.PreVoteResponse) RaftNodeStatusAwareTask(com.hazelcast.cp.internal.raft.impl.task.RaftNodeStatusAwareTask) PostponedResponse(com.hazelcast.cp.internal.raft.impl.util.PostponedResponse) ReplicateTask(com.hazelcast.cp.internal.raft.impl.task.ReplicateTask) INITIAL(com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.INITIAL) RaftGroupMembers(com.hazelcast.cp.internal.raft.impl.state.RaftGroupMembers) SnapshotEntry(com.hazelcast.cp.internal.raft.impl.log.SnapshotEntry) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog) LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState)

Example 9 with LeaderState

use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.

The method sendAppendRequest of the class RaftNodeImpl.

/**
 * Sends an append-entries request to the follower member.
 * <p>
 * Nothing is sent if the follower is not reachable or if an append request back-off
 * is currently set for it.
 * <p>
 * Log entries between follower's known nextIndex and latest appended entry index are sent in a batch.
 * Batch size can be {@link RaftAlgorithmConfig#getAppendRequestMaxEntryCount()} at most.
 * <p>
 * If follower's nextIndex is behind the latest snapshot index, then {@link InstallSnapshot} request is sent.
 * <p>
 * If leader doesn't know follower's matchIndex (if {@code matchIndex == 0}), then an empty append-entries is sent
 * to save bandwidth until leader learns the matchIndex of the follower.
 *
 * @param follower the follower endpoint to which the request is sent
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength" })
public void sendAppendRequest(RaftEndpoint follower) {
    if (!raftIntegration.isReachable(follower)) {
        return;
    }
    RaftLog raftLog = state.log();
    // NOTE(review): leaderState is dereferenced without a null check here; presumably this
    // method is only invoked while this node is the leader -- confirm against callers.
    LeaderState leaderState = state.leaderState();
    FollowerState followerState = leaderState.getFollowerState(follower);
    if (followerState.isAppendRequestBackoffSet()) {
        // A back-off is still active for this follower, so nothing is sent now.
        // Presumably a new append request goes out either when the follower sends a response
        // or a back-off timeout occurs.
        return;
    }
    long nextIndex = followerState.nextIndex();
    // The follower's next index is at or behind the snapshot index and the log no longer contains
    // the next entry (or the entry preceding it), so the snapshot is sent instead of log entries.
    if (nextIndex <= raftLog.snapshotIndex() && (!raftLog.containsLogEntry(nextIndex) || (nextIndex > 1 && !raftLog.containsLogEntry(nextIndex - 1)))) {
        InstallSnapshot installSnapshot = new InstallSnapshot(state.localEndpoint(), state.term(), raftLog.snapshot(), leaderState.queryRound());
        if (logger.isFineEnabled()) {
            logger.fine("Sending " + installSnapshot + " to " + follower + " since next index: " + nextIndex + " <= snapshot index: " + raftLog.snapshotIndex());
        }
        // no need to submit the flush task here because we send committed state...
        raftIntegration.send(installSnapshot, follower);
        followerState.setMaxAppendRequestBackoff();
        scheduleAppendAckResetTask();
        return;
    }
    // Defaults describe an empty log: no previous entry exists, so term and index stay 0.
    int prevEntryTerm = 0;
    long prevEntryIndex = 0;
    LogEntry[] entries;
    // Back-off is enabled by default and only switched off for the pure heartbeat cases below.
    boolean shouldBackoff = true;
    if (nextIndex > 1) {
        prevEntryIndex = nextIndex - 1;
        // The previous entry is the snapshot itself when its index equals the snapshot index.
        LogEntry prevEntry = (raftLog.snapshotIndex() == prevEntryIndex) ? raftLog.snapshot() : raftLog.getLogEntry(prevEntryIndex);
        assert prevEntry != null : "Prev entry index: " + prevEntryIndex + ", snapshot: " + raftLog.snapshotIndex();
        prevEntryTerm = prevEntry.term();
        long matchIndex = followerState.matchIndex();
        if (matchIndex == 0) {
            // Until the leader has discovered where it and the follower's logs match,
            // the leader can send AppendEntries with no entries (like heartbeats) to save bandwidth.
            // We still need to enable append request backoff here because we do not want to bombard
            // the follower before we learn its match index
            entries = new LogEntry[0];
        } else if (nextIndex <= raftLog.lastLogOrSnapshotIndex()) {
            // Then, once the matchIndex immediately precedes the nextIndex,
            // the leader should begin to send the actual entries
            long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
            entries = raftLog.getEntriesBetween(nextIndex, end);
        } else {
            // The follower has caught up with the leader. Sending an empty append request as a heartbeat...
            entries = new LogEntry[0];
            shouldBackoff = false;
        }
    } else if (nextIndex == 1 && raftLog.lastLogOrSnapshotIndex() > 0) {
        // Entries will be sent to the follower for the first time...
        long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
        entries = raftLog.getEntriesBetween(nextIndex, end);
    } else {
        // There is no entry in the Raft log. Sending an empty append request as a heartbeat...
        entries = new LogEntry[0];
        shouldBackoff = false;
    }
    AppendRequest request = new AppendRequest(getLocalMember(), state.term(), prevEntryTerm, prevEntryIndex, state.commitIndex(), entries, leaderState.queryRound());
    if (logger.isFineEnabled()) {
        logger.fine("Sending " + request + " to " + follower + " with next index: " + nextIndex);
    }
    raftIntegration.send(request, follower);
    // The request is sent before the flush is triggered, see the rationale below.
    if (entries.length > 0 && entries[entries.length - 1].index() > leaderState.flushedLogIndex()) {
        // if I am sending any non-flushed entry to the follower, I should trigger the flush task.
        // I hope that I will flush before receiving append responses from half of the followers...
        // This is a very critical optimization because
        // it makes the leader and followers flush in parallel...
        submitFlushTask();
    }
    if (shouldBackoff) {
        followerState.setAppendRequestBackoff();
        scheduleAppendAckResetTask();
    }
}
Also used : FollowerState(com.hazelcast.cp.internal.raft.impl.state.FollowerState) AppendRequest(com.hazelcast.cp.internal.raft.impl.dto.AppendRequest) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog) LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState) InstallSnapshot(com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot)

Aggregations

LeaderState (com.hazelcast.cp.internal.raft.impl.state.LeaderState)9 FollowerState (com.hazelcast.cp.internal.raft.impl.state.FollowerState)4 LeaderDemotedException (com.hazelcast.cp.exception.LeaderDemotedException)3 LogEntry (com.hazelcast.cp.internal.raft.impl.log.LogEntry)3 RaftEndpoint (com.hazelcast.cp.internal.raft.impl.RaftEndpoint)2 AppendRequest (com.hazelcast.cp.internal.raft.impl.dto.AppendRequest)2 InstallSnapshot (com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot)2 TriggerLeaderElection (com.hazelcast.cp.internal.raft.impl.dto.TriggerLeaderElection)2 RaftLog (com.hazelcast.cp.internal.raft.impl.log.RaftLog)2 RaftState (com.hazelcast.cp.internal.raft.impl.state.RaftState)2 InternalCompletableFuture (com.hazelcast.spi.impl.InternalCompletableFuture)2 RaftAlgorithmConfig (com.hazelcast.config.cp.RaftAlgorithmConfig)1 CPGroupId (com.hazelcast.cp.CPGroupId)1 CPMember (com.hazelcast.cp.CPMember)1 StaleAppendRequestException (com.hazelcast.cp.exception.StaleAppendRequestException)1 MembershipChangeMode (com.hazelcast.cp.internal.raft.MembershipChangeMode)1 QueryPolicy (com.hazelcast.cp.internal.raft.QueryPolicy)1 DestroyRaftGroupCmd (com.hazelcast.cp.internal.raft.command.DestroyRaftGroupCmd)1 RaftGroupCmd (com.hazelcast.cp.internal.raft.command.RaftGroupCmd)1 ACTIVE (com.hazelcast.cp.internal.raft.impl.RaftNodeStatus.ACTIVE)1