use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.
the class RaftNodeImpl method canQueryLinearizable.
/**
 * Checks whether the leader may accept another linearizable query that is
 * served without appending an entry to the Raft log. Must be called only
 * while the local Raft node is the leader.
 * <p>
 * The query is refused when any of the following holds:
 * <ul>
 * <li>the node is terminating, terminated or stepped down
 * (see {@link RaftNodeStatus});</li>
 * <li>the leader has not yet marked an entry from its current term as
 * committed (Section 6.4 of the Raft dissertation);</li>
 * <li>the number of queries already waiting has reached the limit taken from
 * {@link RaftAlgorithmConfig#getUncommittedEntryCountToRejectNewAppends()}.</li>
 * </ul>
 *
 * @return {@code true} if a new linearizable query can be accepted right now
 */
public boolean canQueryLinearizable() {
    if (isTerminatedOrSteppedDown()) {
        return false;
    }

    long committedIndex = state.commitIndex();
    RaftLog raftLog = state.log();

    // The most recently committed entry lives either in the latest snapshot
    // or is still present in the log itself.
    LogEntry committedEntry;
    if (committedIndex == raftLog.snapshotIndex()) {
        committedEntry = raftLog.snapshot();
    } else {
        committedEntry = raftLog.getLogEntry(committedIndex);
    }
    assert committedEntry != null;

    // (§6.4) The leader must first commit an entry of its own term before
    // it is allowed to answer queries without going through the log.
    if (committedEntry.term() != state.term()) {
        return false;
    }

    // Several queries can be answered in a single heartbeat round; the
    // maxUncommittedEntryCount setting caps how many may pile up until
    // that round completes.
    return state.leaderState().queryState().queryCount() < maxUncommittedEntryCount;
}
use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.
the class RaftNodeImpl method sendAppendRequest.
/**
 * Sends an append-entries request to the follower member.
 * <p>
 * Nothing is sent if the follower is not reachable or if an append request
 * back-off is currently set for it.
 * <p>
 * Log entries between follower's known nextIndex and latest appended entry index are sent in a batch.
 * The batch size is bounded via {@link RaftAlgorithmConfig#getAppendRequestMaxEntryCount()}.
 * <p>
 * If follower's nextIndex is not ahead of the latest snapshot index and the log no longer contains
 * the entry at nextIndex (or, for {@code nextIndex > 1}, the entry at {@code nextIndex - 1}),
 * then an {@link InstallSnapshot} request is sent instead.
 * <p>
 * If leader doesn't know follower's matchIndex (if {@code matchIndex == 0}) while {@code nextIndex > 1},
 * then an empty append-entries is sent to save bandwidth until leader learns the matchIndex of the follower.
 * <p>
 * After the request is sent, the flush task is submitted if the request carries an entry beyond
 * the leader's flushed log index, and the append request back-off is set unless the request was
 * a plain heartbeat (follower caught up, or the log is empty).
 *
 * @param follower the follower endpoint to send the request to
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength" })
public void sendAppendRequest(RaftEndpoint follower) {
    // Do not bother sending anything to a follower the integration layer reports as unreachable.
    if (!raftIntegration.isReachable(follower)) {
        return;
    }
    RaftLog raftLog = state.log();
    LeaderState leaderState = state.leaderState();
    FollowerState followerState = leaderState.getFollowerState(follower);
    if (followerState.isAppendRequestBackoffSet()) {
        // An append request back-off is active for this follower, so no new request is sent now.
        // Presumably a new request goes out once the follower responds
        // or a back-off timeout occurs (see scheduleAppendAckResetTask) - confirm.
        return;
    }
    long nextIndex = followerState.nextIndex();
    // The follower needs entries that have been compacted into the snapshot:
    // ship the snapshot instead of individual log entries.
    if (nextIndex <= raftLog.snapshotIndex() && (!raftLog.containsLogEntry(nextIndex) || (nextIndex > 1 && !raftLog.containsLogEntry(nextIndex - 1)))) {
        InstallSnapshot installSnapshot = new InstallSnapshot(state.localEndpoint(), state.term(), raftLog.snapshot(), leaderState.queryRound());
        if (logger.isFineEnabled()) {
            logger.fine("Sending " + installSnapshot + " to " + follower + " since next index: " + nextIndex + " <= snapshot index: " + raftLog.snapshotIndex());
        }
        // no need to submit the flush task here because we send committed state...
        raftIntegration.send(installSnapshot, follower);
        // Snapshot transfers use the maximum back-off before another request is sent to this follower.
        followerState.setMaxAppendRequestBackoff();
        scheduleAppendAckResetTask();
        return;
    }
    // Defaults describe "no previous entry", used when nextIndex == 1 or the log is empty.
    int prevEntryTerm = 0;
    long prevEntryIndex = 0;
    LogEntry[] entries;
    // Back-off is applied after sending unless the request turns out to be a plain heartbeat.
    boolean shouldBackoff = true;
    if (nextIndex > 1) {
        prevEntryIndex = nextIndex - 1;
        // The previous entry is either the snapshot itself or still available in the log.
        LogEntry prevEntry = (raftLog.snapshotIndex() == prevEntryIndex) ? raftLog.snapshot() : raftLog.getLogEntry(prevEntryIndex);
        assert prevEntry != null : "Prev entry index: " + prevEntryIndex + ", snapshot: " + raftLog.snapshotIndex();
        prevEntryTerm = prevEntry.term();
        long matchIndex = followerState.matchIndex();
        if (matchIndex == 0) {
            // Until the leader has discovered where it and the follower's logs match,
            // the leader can send AppendEntries with no entries (like heartbeats) to save bandwidth.
            // We still need to enable append request backoff here because we do not want to bombard
            // the follower before we learn its match index
            entries = new LogEntry[0];
        } else if (nextIndex <= raftLog.lastLogOrSnapshotIndex()) {
            // Then, once the matchIndex immediately precedes the nextIndex,
            // the leader should begin to send the actual entries
            // NOTE(review): if getEntriesBetween treats both bounds as inclusive, this range can cover
            // appendRequestMaxEntryCount + 1 entries - confirm against the documented batch limit.
            long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
            entries = raftLog.getEntriesBetween(nextIndex, end);
        } else {
            // The follower has caught up with the leader. Sending an empty append request as a heartbeat...
            entries = new LogEntry[0];
            shouldBackoff = false;
        }
    } else if (nextIndex == 1 && raftLog.lastLogOrSnapshotIndex() > 0) {
        // Entries will be sent to the follower for the first time...
        // NOTE(review): same possible off-by-one on the batch size as in the branch above - confirm.
        long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
        entries = raftLog.getEntriesBetween(nextIndex, end);
    } else {
        // There is no entry in the Raft log. Sending an empty append request as a heartbeat...
        entries = new LogEntry[0];
        shouldBackoff = false;
    }
    AppendRequest request = new AppendRequest(getLocalMember(), state.term(), prevEntryTerm, prevEntryIndex, state.commitIndex(), entries, leaderState.queryRound());
    if (logger.isFineEnabled()) {
        logger.fine("Sending " + request + " to " + follower + " with next index: " + nextIndex);
    }
    raftIntegration.send(request, follower);
    // The flush task is submitted only after the request has been handed to the integration layer.
    if (entries.length > 0 && entries[entries.length - 1].index() > leaderState.flushedLogIndex()) {
        // if I am sending any non-flushed entry to the follower, I should trigger the flush task.
        // I hope that I will flush before receiving append responses from half of the followers...
        // This is a very critical optimization because
        // it makes the leader and followers flush in parallel...
        submitFlushTask();
    }
    if (shouldBackoff) {
        followerState.setAppendRequestBackoff();
        scheduleAppendAckResetTask();
    }
}
use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.
the class AppendRequestHandlerTask method innerRun.
/**
 * Processes the received append-entries request ({@code req}) on the local node.
 * <p>
 * The steps, in order:
 * <ol>
 * <li>Reply with a failure response if the request's term is older than the local term.</li>
 * <li>Switch to follower if the request carries a newer term or the local role is not FOLLOWER,
 * and record the sender as the leader if it differs from the known leader.</li>
 * <li>If {@code prevLogIndex > 0}, reply with a failure response when the local log has no entry
 * at {@code prevLogIndex} or its term differs from {@code prevLogTerm}.</li>
 * <li>Skip entries that are already in the local log, truncate the local log from the first
 * conflicting entry, and append the remaining entries, limited by the available log capacity.</li>
 * <li>Advance the commit index to {@code min(leaderCommitIndex, index of last entry covered by this request)}
 * when the leader's commit index is ahead of the local one.</li>
 * <li>Record the append-entries timestamp, send a success response, then apply newly committed
 * entries and pre-apply Raft group commands of the appended entries.</li>
 * </ol>
 */
@Override
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength", "checkstyle:nestedifdepth" })
protected // Justification: It is easier to follow the AppendEntriesRPC logic in a single method
void innerRun() {
    if (logger.isFineEnabled()) {
        logger.fine("Received " + req);
    }
    RaftState state = raftNode.state();
    // Reply false if term < currentTerm (§5.1)
    if (req.term() < state.term()) {
        // NOTE(review): here and below, messages are emitted at WARNING level but only when FINE
        // logging is enabled - presumably intentional to limit log noise; confirm.
        if (logger.isFineEnabled()) {
            logger.warning("Stale " + req + " received in current term: " + state.term());
        }
        // The failure response carries our (newer) term.
        raftNode.send(createFailureResponse(state.term()), req.leader());
        return;
    }
    // Transform into follower if a newer term is seen or another node wins the election of the current term
    if (req.term() > state.term() || state.role() != FOLLOWER) {
        // If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (§5.1)
        logger.info("Demoting to FOLLOWER from current role: " + state.role() + ", term: " + state.term() + " to new term: " + req.term() + " and leader: " + req.leader());
        raftNode.toFollower(req.term());
    }
    // Record the sender as the leader if it is not the leader we currently know.
    if (!req.leader().equals(state.leader())) {
        logger.info("Setting leader: " + req.leader());
        raftNode.leader(req.leader());
    }
    RaftLog raftLog = state.log();
    // Verify that the local log matches the leader's log at prevLogIndex (index and term)
    if (req.prevLogIndex() > 0) {
        long lastLogIndex = raftLog.lastLogOrSnapshotIndex();
        int lastLogTerm = raftLog.lastLogOrSnapshotTerm();
        int prevLogTerm;
        if (req.prevLogIndex() == lastLogIndex) {
            // prevLogIndex is our last log (or snapshot) index, so its term is already at hand.
            prevLogTerm = lastLogTerm;
        } else {
            // Reply false if log does not contain an entry at prevLogIndex whose term matches prevLogTerm (§5.3)
            LogEntry prevLog = raftLog.getLogEntry(req.prevLogIndex());
            if (prevLog == null) {
                if (logger.isFineEnabled()) {
                    logger.warning("Failed to get previous log index for " + req + ", last log index: " + lastLogIndex);
                }
                raftNode.send(createFailureResponse(req.term()), req.leader());
                return;
            }
            prevLogTerm = prevLog.term();
        }
        if (req.prevLogTerm() != prevLogTerm) {
            if (logger.isFineEnabled()) {
                logger.warning("Previous log term of " + req + " is different than ours: " + prevLogTerm);
            }
            raftNode.send(createFailureResponse(req.term()), req.leader());
            return;
        }
    }
    // Number of request entries dropped because the local log ran out of capacity.
    int truncatedAppendRequestEntryCount = 0;
    // Entries of the request that are to be appended; stays null if every entry is already in the log.
    LogEntry[] newEntries = null;
    // Process any new entries
    if (req.entryCount() > 0) {
        // Delete any conflicting entries, skip any duplicates
        long lastLogIndex = raftLog.lastLogOrSnapshotIndex();
        for (int i = 0; i < req.entryCount(); i++) {
            LogEntry reqEntry = req.entries()[i];
            if (reqEntry.index() > lastLogIndex) {
                // This entry and all following ones are beyond our log: they are all new.
                newEntries = Arrays.copyOfRange(req.entries(), i, req.entryCount());
                break;
            }
            LogEntry localEntry = raftLog.getLogEntry(reqEntry.index());
            assert localEntry != null : "Entry not found on log index: " + reqEntry.index() + " for " + req;
            // If an existing entry conflicts with a new one (same index but different terms),
            // delete the existing entry and all that follow it (§5.3)
            if (reqEntry.term() != localEntry.term()) {
                List<LogEntry> truncatedEntries = raftLog.deleteEntriesFrom(reqEntry.index());
                if (logger.isFineEnabled()) {
                    logger.warning("Truncated " + truncatedEntries.size() + " entries from entry index: " + reqEntry.index() + " => " + truncatedEntries);
                } else {
                    logger.warning("Truncated " + truncatedEntries.size() + " entries from entry index: " + reqEntry.index());
                }
                // Futures and pre-applied group commands tied to the removed entries are no longer valid.
                raftNode.invalidateFuturesFrom(reqEntry.index());
                revertPreAppliedRaftGroupCmd(truncatedEntries);
                newEntries = Arrays.copyOfRange(req.entries(), i, req.entryCount());
                // Flush right after the truncation, before any new entry is appended.
                raftLog.flush();
                break;
            }
            // Same index and same term: the entry is a duplicate, move on to the next one.
        }
        if (newEntries != null && newEntries.length > 0) {
            // Append only as many entries as the log can hold; the rest is reported as not appended
            // through the last log index put into the response below.
            if (raftLog.availableCapacity() < newEntries.length) {
                if (logger.isFineEnabled()) {
                    logger.warning("Truncating " + newEntries.length + " entries to " + raftLog.availableCapacity() + " to fit into the available capacity of the Raft log");
                }
                truncatedAppendRequestEntryCount = newEntries.length - raftLog.availableCapacity();
                newEntries = Arrays.copyOf(newEntries, raftLog.availableCapacity());
            }
            // Append any new entries not already in the log
            if (logger.isFineEnabled()) {
                logger.fine("Appending " + newEntries.length + " entries: " + Arrays.toString(newEntries));
            }
            raftLog.appendEntries(newEntries);
            raftLog.flush();
        }
    }
    // I cannot use raftLog.lastLogOrSnapshotIndex() for lastLogIndex because my log may contain
    // some uncommitted entries from the previous leader and those entries will be truncated soon
    // I can only send a response based on how many entries I have appended from this append request
    long lastLogIndex = req.prevLogIndex() + req.entryCount() - truncatedAppendRequestEntryCount;
    long oldCommitIndex = state.commitIndex();
    // Update the commit index
    if (req.leaderCommitIndex() > oldCommitIndex) {
        // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)
        long newCommitIndex = min(req.leaderCommitIndex(), lastLogIndex);
        if (logger.isFineEnabled()) {
            logger.fine("Setting commit index: " + newCommitIndex);
        }
        state.commitIndex(newCommitIndex);
    }
    raftNode.updateLastAppendEntriesTimestamp();
    try {
        AppendSuccessResponse resp = new AppendSuccessResponse(localMember(), state.term(), lastLogIndex, req.queryRound());
        raftNode.send(resp, req.leader());
    } finally {
        // The response is sent first; the steps below run even if sending the response throws.
        if (state.commitIndex() > oldCommitIndex) {
            raftNode.applyLogEntries();
        }
        if (newEntries != null) {
            preApplyRaftGroupCmd(newEntries, state.commitIndex());
        }
    }
}
use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.
the class PreVoteRequestHandlerTask method innerRun.
/**
 * Answers the received pre-vote request ({@code req}).
 * <p>
 * The pre-vote is rejected when the local term is larger than the requested next term,
 * when an append-entries call was received within the last leader election timeout,
 * or when the local log is more up to date than the candidate's (larger last term,
 * or the same last term with a larger last index). Otherwise the pre-vote is granted.
 */
@Override
protected void innerRun() {
    RaftState state = raftNode.state();
    RaftEndpoint localEndpoint = localMember();

    // (§5.1) The candidate is behind: the term it wants to move to is already older than ours.
    int ourTerm = state.term();
    if (ourTerm > req.nextTerm()) {
        logger.info("Rejecting " + req + " since current term: " + ourTerm + " is bigger");
        raftNode.send(new PreVoteResponse(localEndpoint, ourTerm, false), req.candidate());
        return;
    }

    // Leader stickiness: we heard from a leader less than one election timeout ago.
    long lastAppendEntriesAt = raftNode.lastAppendEntriesTimestamp();
    long stickinessThreshold = Clock.currentTimeMillis() - raftNode.getLeaderElectionTimeoutInMillis();
    if (lastAppendEntriesAt > stickinessThreshold) {
        logger.info("Rejecting " + req + " since received append entries recently.");
        raftNode.send(new PreVoteResponse(localEndpoint, ourTerm, false), req.candidate());
        return;
    }

    RaftLog raftLog = state.log();
    int ourLastLogTerm = raftLog.lastLogOrSnapshotTerm();

    // Our log ends in a later term than the candidate's log.
    if (ourLastLogTerm > req.lastLogTerm()) {
        logger.info("Rejecting " + req + " since our last log term: " + ourLastLogTerm + " is greater");
        raftNode.send(new PreVoteResponse(localEndpoint, req.nextTerm(), false), req.candidate());
        return;
    }

    // Same last term: the longer log wins.
    if (ourLastLogTerm == req.lastLogTerm()) {
        long ourLastLogIndex = raftLog.lastLogOrSnapshotIndex();
        if (ourLastLogIndex > req.lastLogIndex()) {
            logger.info("Rejecting " + req + " since our last log index: " + ourLastLogIndex + " is greater");
            raftNode.send(new PreVoteResponse(localEndpoint, req.nextTerm(), false), req.candidate());
            return;
        }
    }

    logger.info("Granted pre-vote for " + req);
    raftNode.send(new PreVoteResponse(localEndpoint, req.nextTerm(), true), req.candidate());
}
use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.
the class PreVoteTask method innerRun.
/**
 * Starts a pre-vote round for the term following the current one.
 * <p>
 * The round is skipped when a leader is already known, when the local term no longer
 * equals the term this task was created for, or when there are no remote members.
 * Otherwise the pre-candidate state is initialized, a {@link PreVoteRequest} built from
 * the last log (or snapshot) term and index is sent to every remote member, and
 * the pre-vote timeout is scheduled.
 */
@Override
protected void innerRun() {
    RaftState state = raftNode.state();

    // A leader is already known: nothing to elect.
    if (state.leader() != null) {
        logger.fine("No new pre-vote phase, we already have a LEADER: " + state.leader());
        return;
    }

    // The term moved on since this task was created.
    if (state.term() != term) {
        logger.fine("No new pre-vote phase for term= " + term + " because of new term: " + state.term());
        return;
    }

    Collection<RaftEndpoint> peers = state.remoteMembers();
    if (peers.isEmpty()) {
        logger.fine("Remote members is empty. No need for pre-voting.");
        return;
    }

    state.initPreCandidateState();

    int nextTerm = state.term() + 1;
    RaftLog raftLog = state.log();
    PreVoteRequest preVote = new PreVoteRequest(localMember(), nextTerm, raftLog.lastLogOrSnapshotTerm(), raftLog.lastLogOrSnapshotIndex());

    logger.info("Pre-vote started for next term: " + preVote.nextTerm() + ", last log index: " + preVote.lastLogIndex() + ", last log term: " + preVote.lastLogTerm());
    raftNode.printMemberState();

    // Ask every remote member for its pre-vote.
    for (RaftEndpoint peer : peers) {
        raftNode.send(preVote, peer);
    }

    schedulePreVoteTimeout();
}
Aggregations