Use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.
The class RaftNodeImpl, method findQuorumMatchIndex:
private long findQuorumMatchIndex() {
    LeaderState leaderState = state.leaderState();
    long[] indices = leaderState.matchIndices();

    // If the leader is leaving, it should not count its vote for quorum...
    if (state.isKnownMember(state.localEndpoint())) {
        // Raft dissertation Section 10.2.1:
        // The leader may even commit an entry before it has been written to its own disk,
        // if a majority of followers have written it to their disks; this is still safe.
        long leaderIndex = flushTask == null ? state.log().lastLogOrSnapshotIndex() : leaderState.flushedLogIndex();
        indices[indices.length - 1] = leaderIndex;
    } else {
        // Remove the last empty slot reserved for the leader index
        indices = Arrays.copyOf(indices, indices.length - 1);
    }

    sort(indices);

    long quorumMatchIndex = indices[(indices.length - 1) / 2];
    if (logger.isFineEnabled()) {
        logger.fine("Quorum match index: " + quorumMatchIndex + ", indices: " + Arrays.toString(indices));
    }

    return quorumMatchIndex;
}
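Why the middle element of the sorted array is the quorum match index: every slot at or above the middle position holds an index replicated by a majority of the members, so the median is the highest index that is safe to commit. A minimal standalone sketch with hypothetical match indices (illustration only, not Hazelcast API):

import java.util.Arrays;

public class QuorumMatchIndexDemo {
    public static void main(String[] args) {
        // Hypothetical match indices of a 5-member group, leader's index in the last slot.
        long[] indices = {3, 9, 7, 5, 8};
        Arrays.sort(indices); // -> [3, 5, 7, 8, 9]

        // The median: the highest index known to be replicated on a majority.
        long quorumMatchIndex = indices[(indices.length - 1) / 2]; // -> 7

        // The members with match indices 7, 8 and 9 (3 of 5, a majority) all hold
        // entry 7, so every entry up to index 7 is safe to commit.
        System.out.println("Quorum match index: " + quorumMatchIndex);
    }
}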
Use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.
The class RaftNodeImpl, method forceSetTerminatedStatus:
@Override
public InternalCompletableFuture forceSetTerminatedStatus() {
    InternalCompletableFuture resultFuture = raftIntegration.newCompletableFuture();
    if (isTerminatedOrSteppedDown()) {
        if (logger.isFineEnabled()) {
            logger.fine("Already stepped down or terminated, not setting `TERMINATED` status.");
        }
        resultFuture.complete(null);
        return resultFuture;
    }

    execute(() -> {
        Throwable failure = null;
        try {
            if (isTerminatedOrSteppedDown()) {
                return;
            } else if (status == INITIAL) {
                setStatus(TERMINATED);
                return;
            }

            invalidateFuturesFrom(state.commitIndex() + 1);
            LeaderState leaderState = state.leaderState();
            if (leaderState != null) {
                for (BiTuple<Object, InternalCompletableFuture> t : leaderState.queryState().operations()) {
                    t.element2.completeExceptionally(new LeaderDemotedException(state.localEndpoint(), null));
                }
            }
            state.completeLeadershipTransfer(new LeaderDemotedException(state.localEndpoint(), null));
            setStatus(TERMINATED);
        } catch (Throwable t) {
            failure = t;
            logger.severe("Failure during force-termination", t);
            if (status != TERMINATED && status != STEPPED_DOWN) {
                setStatus(TERMINATED);
            }
        } finally {
            if (failure == null) {
                resultFuture.complete(null);
            } else {
                resultFuture.completeExceptionally(failure);
            }
        }
    });

    return resultFuture;
}
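The try/catch/finally shape above guarantees the returned future is completed exactly once on every path, so callers never block forever. A minimal sketch of the same complete-once pattern using plain java.util.concurrent.CompletableFuture (the runGuarded method and its task are hypothetical, not Hazelcast API):

import java.util.concurrent.CompletableFuture;

public class CompleteOncePattern {
    static CompletableFuture<Void> runGuarded(Runnable task) {
        CompletableFuture<Void> result = new CompletableFuture<>();
        Throwable failure = null;
        try {
            task.run();
        } catch (Throwable t) {
            failure = t; // remember the failure instead of rethrowing
        } finally {
            // Complete the future on every path so callers never hang.
            if (failure == null) {
                result.complete(null);
            } else {
                result.completeExceptionally(failure);
            }
        }
        return result;
    }

    public static void main(String[] args) {
        runGuarded(() -> { throw new IllegalStateException("boom"); })
                .whenComplete((v, t) -> System.out.println("completed, failure: " + t));
    }
}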
Use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.
The class RaftNodeImpl, method takeSnapshotIfCommitIndexAdvanced:
/**
 * Takes a snapshot if the advance in {@code commitIndex} is at least
 * {@link RaftAlgorithmConfig#getCommitIndexAdvanceCountToSnapshot()}.
 * <p>
 * No snapshot is created if the Raft group is being destroyed.
 */
@SuppressWarnings("checkstyle:npathcomplexity")
private void takeSnapshotIfCommitIndexAdvanced() {
    long commitIndex = state.commitIndex();
    if ((commitIndex - state.log().snapshotIndex()) < commitIndexAdvanceCountToSnapshot) {
        return;
    }

    if (isTerminatedOrSteppedDown()) {
        // If the status is TERMINATED or STEPPED_DOWN, there will not be any new appends.
        return;
    }

    RaftLog log = state.log();
    Object snapshot = raftIntegration.takeSnapshot(commitIndex);
    if (snapshot instanceof Throwable) {
        Throwable t = (Throwable) snapshot;
        logger.severe("Could not take snapshot at commit index: " + commitIndex, t);
        return;
    }

    int snapshotTerm = log.getLogEntry(commitIndex).term();
    RaftGroupMembers members = state.committedGroupMembers();
    SnapshotEntry snapshotEntry = new SnapshotEntry(snapshotTerm, commitIndex, snapshot, members.index(), members.members());

    long highestLogIndexToTruncate = commitIndex - maxNumberOfLogsToKeepAfterSnapshot;
    LeaderState leaderState = state.leaderState();
    if (leaderState != null) {
        long[] matchIndices = leaderState.matchIndices();
        // The last slot is reserved for the leader index and is always zero.

        // If there is at least one follower with an unknown match index,
        // its log can be close to the leader's log, so we keep the old log entries.
        boolean allMatchIndicesKnown = Arrays.stream(matchIndices, 0, matchIndices.length - 1)
                                             .noneMatch(i -> i == 0);

        if (allMatchIndicesKnown) {
            // Otherwise, we keep the log entries until the minimum match index
            // that is bigger than (commitIndex - maxNumberOfLogsToKeepAfterSnapshot).
            // If there is no such follower (all of the minority followers are far behind),
            // then there is no need to keep the old log entries.
            highestLogIndexToTruncate = Arrays.stream(matchIndices)
                    .filter(i -> i < commitIndex)
                    .filter(i -> i > commitIndex - maxNumberOfLogsToKeepAfterSnapshot)
                    // keep the entry at the smallest qualifying match index itself
                    .map(i -> i - 1)
                    .sorted()
                    .findFirst()
                    .orElse(commitIndex);
        }
    }

    int truncatedEntryCount = log.setSnapshot(snapshotEntry, highestLogIndexToTruncate);
    if (logger.isFineEnabled()) {
        logger.fine(snapshotEntry + " is taken, " + truncatedEntryCount + " entries are truncated.");
    }
}
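The truncation-point selection can be traced in isolation. A standalone sketch of the same stream pipeline with hypothetical numbers (illustration only, not Hazelcast API):

import java.util.Arrays;

public class TruncationPointDemo {
    public static void main(String[] args) {
        long commitIndex = 100;
        long maxNumberOfLogsToKeepAfterSnapshot = 20;
        // Hypothetical follower match indices; the last slot is the reserved (zero) leader slot.
        long[] matchIndices = {95, 88, 70, 0};

        long highestLogIndexToTruncate = Arrays.stream(matchIndices)
                .filter(i -> i < commitIndex)                                      // drop fully caught-up followers
                .filter(i -> i > commitIndex - maxNumberOfLogsToKeepAfterSnapshot) // drop followers too far behind
                .map(i -> i - 1)                                                   // keep the entry at the match index itself
                .sorted()
                .findFirst()
                .orElse(commitIndex);

        // 70 and 0 fail the second filter (not > 80), leaving 95 and 88 -> min(94, 87) = 87.
        // Entries up to 87 are truncated; 88..100 survive, so the follower at 88
        // can still be caught up from the log instead of needing a snapshot.
        System.out.println("highestLogIndexToTruncate = " + highestLogIndexToTruncate);
    }
}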
Use of com.hazelcast.cp.internal.raft.impl.state.LeaderState in project hazelcast by hazelcast.
The class RaftNodeImpl, method sendAppendRequest:
/**
 * Sends an append-entries request to the follower member.
 * <p>
 * Log entries between the follower's known nextIndex and the latest appended entry index are sent in a batch.
 * The batch size can be {@link RaftAlgorithmConfig#getAppendRequestMaxEntryCount()} at most.
 * <p>
 * If the follower's nextIndex falls behind the latest snapshot index, an {@link InstallSnapshot} request is sent instead.
 * <p>
 * If the leader doesn't know the follower's matchIndex yet (i.e., {@code matchIndex == 0}), an empty append-entries
 * request is sent to save bandwidth until the leader learns the follower's matchIndex.
 */
@SuppressWarnings({"checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength"})
public void sendAppendRequest(RaftEndpoint follower) {
    if (!raftIntegration.isReachable(follower)) {
        return;
    }

    RaftLog raftLog = state.log();
    LeaderState leaderState = state.leaderState();

    FollowerState followerState = leaderState.getFollowerState(follower);
    if (followerState.isAppendRequestBackoffSet()) {
        // The follower has not yet responded to the last append request.
        // A new append request will be sent either when the follower responds
        // or a back-off timeout occurs.
        return;
    }

    long nextIndex = followerState.nextIndex();

    if (nextIndex <= raftLog.snapshotIndex()
            && (!raftLog.containsLogEntry(nextIndex) || (nextIndex > 1 && !raftLog.containsLogEntry(nextIndex - 1)))) {
        InstallSnapshot installSnapshot = new InstallSnapshot(state.localEndpoint(), state.term(), raftLog.snapshot(),
                leaderState.queryRound());
        if (logger.isFineEnabled()) {
            logger.fine("Sending " + installSnapshot + " to " + follower + " since next index: " + nextIndex
                    + " <= snapshot index: " + raftLog.snapshotIndex());
        }

        // No need to submit the flush task here because we send committed state...
        raftIntegration.send(installSnapshot, follower);
        followerState.setMaxAppendRequestBackoff();
        scheduleAppendAckResetTask();
        return;
    }

    int prevEntryTerm = 0;
    long prevEntryIndex = 0;
    LogEntry[] entries;
    boolean shouldBackoff = true;

    if (nextIndex > 1) {
        prevEntryIndex = nextIndex - 1;
        LogEntry prevEntry = (raftLog.snapshotIndex() == prevEntryIndex)
                ? raftLog.snapshot() : raftLog.getLogEntry(prevEntryIndex);
        assert prevEntry != null : "Prev entry index: " + prevEntryIndex + ", snapshot: " + raftLog.snapshotIndex();
        prevEntryTerm = prevEntry.term();

        long matchIndex = followerState.matchIndex();
        if (matchIndex == 0) {
            // Until the leader has discovered where it and the follower's logs match,
            // the leader can send AppendEntries with no entries (like heartbeats) to save bandwidth.
            // We still need to enable append request backoff here because we do not want to bombard
            // the follower before we learn its match index.
            entries = new LogEntry[0];
        } else if (nextIndex <= raftLog.lastLogOrSnapshotIndex()) {
            // Once the matchIndex immediately precedes the nextIndex,
            // the leader should begin to send the actual entries.
            long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
            entries = raftLog.getEntriesBetween(nextIndex, end);
        } else {
            // The follower has caught up with the leader. Sending an empty append request as a heartbeat...
            entries = new LogEntry[0];
            shouldBackoff = false;
        }
    } else if (nextIndex == 1 && raftLog.lastLogOrSnapshotIndex() > 0) {
        // Entries will be sent to the follower for the first time...
        long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
        entries = raftLog.getEntriesBetween(nextIndex, end);
    } else {
        // There is no entry in the Raft log. Sending an empty append request as a heartbeat...
        entries = new LogEntry[0];
        shouldBackoff = false;
    }

    AppendRequest request = new AppendRequest(getLocalMember(), state.term(), prevEntryTerm, prevEntryIndex,
            state.commitIndex(), entries, leaderState.queryRound());

    if (logger.isFineEnabled()) {
        logger.fine("Sending " + request + " to " + follower + " with next index: " + nextIndex);
    }

    raftIntegration.send(request, follower);

    if (entries.length > 0 && entries[entries.length - 1].index() > leaderState.flushedLogIndex()) {
        // If we are sending any non-flushed entry to the follower, we should trigger the flush task,
        // hoping to flush before receiving append responses from half of the followers...
        // This is a very critical optimization because it makes the leader and followers flush in parallel.
        submitFlushTask();
    }

    if (shouldBackoff) {
        followerState.setAppendRequestBackoff();
        scheduleAppendAckResetTask();
    }
}
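The batch window used in both getEntriesBetween calls is a simple clamped range. A standalone sketch with hypothetical values (the batchEnd helper is illustrative, not Hazelcast API):

public class AppendBatchWindowDemo {
    // Inclusive end index of a batch that starts at nextIndex,
    // clamped so it never runs past the last appended index.
    static long batchEnd(long nextIndex, int appendRequestMaxEntryCount, long lastLogOrSnapshotIndex) {
        return Math.min(nextIndex + appendRequestMaxEntryCount, lastLogOrSnapshotIndex);
    }

    public static void main(String[] args) {
        // A lagging follower receives a full batch of appendRequestMaxEntryCount entries...
        System.out.println(batchEnd(50, 100, 1000));  // 150
        // ...while a nearly caught-up follower's batch is clamped at the log end.
        System.out.println(batchEnd(990, 100, 1000)); // 1000
    }
}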