Search in sources :

Example 11 with RaftLog

use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.

the class RaftNodeImpl method canQueryLinearizable.

/**
 * Tells whether the leader may accept one more linearizable query that is
 * served without appending an entry to the Raft log. Intended to be called
 * only while the local Raft node is the leader.
 * <p>
 * The answer is {@code false} in any of the following cases:
 * <ul>
 * <li>The node is terminating, terminated or stepped down
 * (see {@link RaftNodeStatus}).</li>
 * <li>The last committed entry does not belong to the current term, i.e.
 * the leader has not yet committed an entry from its own term
 * (Section 6.4 of the Raft dissertation).</li>
 * <li>The number of queries already waiting has reached the limit derived from
 * {@link RaftAlgorithmConfig#getUncommittedEntryCountToRejectNewAppends()}.</li>
 * </ul>
 *
 * @return {@code true} if a new linearizable query can be queued right now
 */
public boolean canQueryLinearizable() {
    if (isTerminatedOrSteppedDown()) {
        return false;
    }

    long committedIndex = state.commitIndex();
    RaftLog raftLog = state.log();

    // The last committed entry lives either in the latest snapshot or in the log itself.
    LogEntry committedEntry;
    if (committedIndex == raftLog.snapshotIndex()) {
        committedEntry = raftLog.snapshot();
    } else {
        committedEntry = raftLog.getLogEntry(committedIndex);
    }
    assert committedEntry != null;

    // Two conditions must hold, checked in this order:
    //  1. an entry of the current term is already committed (§6.4), and
    //  2. fewer than maxUncommittedEntryCount queries are collected for the ongoing
    //     heartbeat round; queries are batched and executed at one shot without
    //     touching the Raft log, and this bound keeps that batch limited.
    return committedEntry.term() == state.term()
            && state.leaderState().queryState().queryCount() < maxUncommittedEntryCount;
}
Also used : QueryState(com.hazelcast.cp.internal.raft.impl.state.QueryState) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog)

Example 12 with RaftLog

use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.

the class RaftNodeImpl method sendAppendRequest.

/**
 * Sends an append-entries request to the follower member.
 * <p>
 * Log entries between follower's known nextIndex and latest appended entry index are sent in a batch.
 * Batch size can be {@link RaftAlgorithmConfig#getAppendRequestMaxEntryCount()} at most.
 * <p>
 * If follower's nextIndex is behind the latest snapshot index, then {@link InstallSnapshot} request is sent.
 * <p>
 * If leader doesn't know follower's matchIndex (if {@code matchIndex == 0}), then an empty append-entries is sent
 * to save bandwidth until leader learns the matchIndex of the follower.
 * <p>
 * Nothing is sent when the follower is not reachable or while an append request back-off
 * is still set for it.
 *
 * @param follower the endpoint the request is sent to
 */
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength" })
public void sendAppendRequest(RaftEndpoint follower) {
    // Do not bother sending anything to a follower the integration layer reports as unreachable.
    if (!raftIntegration.isReachable(follower)) {
        return;
    }
    RaftLog raftLog = state.log();
    LeaderState leaderState = state.leaderState();
    FollowerState followerState = leaderState.getFollowerState(follower);
    if (followerState.isAppendRequestBackoffSet()) {
        // A back-off is still set for this follower (it is set at the end of this method
        // after a request has been sent), so no new request is sent now.
        // Presumably a new append request is sent either when the follower responds
        // or a back-off timeout occurs.
        return;
    }
    long nextIndex = followerState.nextIndex();
    // The follower needs a snapshot when its nextIndex is covered by the snapshot and the log
    // no longer contains the entry at nextIndex, or the entry right before it (needed as the
    // "previous entry" of an append request).
    if (nextIndex <= raftLog.snapshotIndex() && (!raftLog.containsLogEntry(nextIndex) || (nextIndex > 1 && !raftLog.containsLogEntry(nextIndex - 1)))) {
        InstallSnapshot installSnapshot = new InstallSnapshot(state.localEndpoint(), state.term(), raftLog.snapshot(), leaderState.queryRound());
        if (logger.isFineEnabled()) {
            logger.fine("Sending " + installSnapshot + " to " + follower + " since next index: " + nextIndex + " <= snapshot index: " + raftLog.snapshotIndex());
        }
        // no need to submit the flush task here because we send committed state...
        raftIntegration.send(installSnapshot, follower);
        // Snapshot installation uses the "max" back-off variant instead of the regular one.
        followerState.setMaxAppendRequestBackoff();
        scheduleAppendAckResetTask();
        return;
    }
    // Defaults describe an empty log: no previous entry (term 0, index 0).
    int prevEntryTerm = 0;
    long prevEntryIndex = 0;
    LogEntry[] entries;
    // Back-off is enabled unless the request turns out to be a plain heartbeat.
    boolean shouldBackoff = true;
    if (nextIndex > 1) {
        prevEntryIndex = nextIndex - 1;
        // The previous entry is either the snapshot itself or a regular log entry.
        LogEntry prevEntry = (raftLog.snapshotIndex() == prevEntryIndex) ? raftLog.snapshot() : raftLog.getLogEntry(prevEntryIndex);
        assert prevEntry != null : "Prev entry index: " + prevEntryIndex + ", snapshot: " + raftLog.snapshotIndex();
        prevEntryTerm = prevEntry.term();
        long matchIndex = followerState.matchIndex();
        if (matchIndex == 0) {
            // Until the leader has discovered where it and the follower's logs match,
            // the leader can send AppendEntries with no entries (like heartbeats) to save bandwidth.
            // We still need to enable append request backoff here because we do not want to bombard
            // the follower before we learn its match index
            entries = new LogEntry[0];
        } else if (nextIndex <= raftLog.lastLogOrSnapshotIndex()) {
            // Then, once the matchIndex immediately precedes the nextIndex,
            // the leader should begin to send the actual entries
            // NOTE(review): if getEntriesBetween() treats `end` as inclusive, this batch can hold
            // appendRequestMaxEntryCount + 1 entries - confirm against RaftLog.
            long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
            entries = raftLog.getEntriesBetween(nextIndex, end);
        } else {
            // The follower has caught up with the leader. Sending an empty append request as a heartbeat...
            entries = new LogEntry[0];
            shouldBackoff = false;
        }
    } else if (nextIndex == 1 && raftLog.lastLogOrSnapshotIndex() > 0) {
        // Entries will be sent to the follower for the first time...
        // NOTE(review): same inclusive/exclusive question for `end` as in the branch above.
        long end = min(nextIndex + appendRequestMaxEntryCount, raftLog.lastLogOrSnapshotIndex());
        entries = raftLog.getEntriesBetween(nextIndex, end);
    } else {
        // There is no entry in the Raft log. Sending an empty append request as a heartbeat...
        entries = new LogEntry[0];
        shouldBackoff = false;
    }
    AppendRequest request = new AppendRequest(getLocalMember(), state.term(), prevEntryTerm, prevEntryIndex, state.commitIndex(), entries, leaderState.queryRound());
    if (logger.isFineEnabled()) {
        logger.fine("Sending " + request + " to " + follower + " with next index: " + nextIndex);
    }
    raftIntegration.send(request, follower);
    // The flush is triggered only after the request has been handed over for sending.
    if (entries.length > 0 && entries[entries.length - 1].index() > leaderState.flushedLogIndex()) {
        // if I am sending any non-flushed entry to the follower, I should trigger the flush task.
        // I hope that I will flush before receiving append responses from half of the followers...
        // This is a very critical optimization because
        // it makes the leader and followers flush in parallel...
        submitFlushTask();
    }
    if (shouldBackoff) {
        // Block further append requests to this follower until the back-off is cleared.
        followerState.setAppendRequestBackoff();
        scheduleAppendAckResetTask();
    }
}
Also used : FollowerState(com.hazelcast.cp.internal.raft.impl.state.FollowerState) AppendRequest(com.hazelcast.cp.internal.raft.impl.dto.AppendRequest) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog) LeaderState(com.hazelcast.cp.internal.raft.impl.state.LeaderState) InstallSnapshot(com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot)

Example 13 with RaftLog

use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.

the class AppendRequestHandlerTask method innerRun.

/**
 * Handles the append-entries request held in {@code req} on the receiving node.
 * <p>
 * The steps, in order:
 * <ol>
 * <li>A request from an older term is answered with a failure response and dropped.</li>
 * <li>On a newer term, or when this node is not a follower, the node switches to follower;
 * the sender is recorded as leader if it differs from the known leader.</li>
 * <li>If the request names a previous entry ({@code prevLogIndex > 0}), its term must match
 * the local entry at that index, otherwise a failure response is sent.</li>
 * <li>Request entries are compared with the local log: entries already present with the same
 * term are skipped, a term mismatch truncates the local log from that index, and the
 * remaining entries are appended (cut down to the log's available capacity) and flushed.</li>
 * <li>The commit index is advanced to {@code min(leaderCommitIndex, last index covered by this request)}.</li>
 * <li>A success response is sent; newly committed entries are applied afterwards.</li>
 * </ol>
 * NOTE(review): several {@code logger.warning(...)} calls below are wrapped in
 * {@code logger.isFineEnabled()}, so they are emitted only when fine logging is enabled -
 * confirm this is intended.
 */
@Override
@SuppressWarnings({ "checkstyle:npathcomplexity", "checkstyle:cyclomaticcomplexity", "checkstyle:methodlength", "checkstyle:nestedifdepth" })
protected // Justification: It is easier to follow the AppendEntriesRPC logic in a single method
void innerRun() {
    if (logger.isFineEnabled()) {
        logger.fine("Received " + req);
    }
    RaftState state = raftNode.state();
    // Reply false if term < currentTerm (§5.1)
    if (req.term() < state.term()) {
        if (logger.isFineEnabled()) {
            logger.warning("Stale " + req + " received in current term: " + state.term());
        }
        // The failure response carries our (newer) term.
        raftNode.send(createFailureResponse(state.term()), req.leader());
        return;
    }
    // Transform into follower if a newer term is seen or another node wins the election of the current term
    if (req.term() > state.term() || state.role() != FOLLOWER) {
        // If RPC request or response contains term T > currentTerm: set currentTerm = T, convert to follower (§5.1)
        logger.info("Demoting to FOLLOWER from current role: " + state.role() + ", term: " + state.term() + " to new term: " + req.term() + " and leader: " + req.leader());
        raftNode.toFollower(req.term());
    }
    // Record the sender as leader if it is not the leader we already know.
    if (!req.leader().equals(state.leader())) {
        logger.info("Setting leader: " + req.leader());
        raftNode.leader(req.leader());
    }
    RaftLog raftLog = state.log();
    // Verify the last log entry
    if (req.prevLogIndex() > 0) {
        long lastLogIndex = raftLog.lastLogOrSnapshotIndex();
        int lastLogTerm = raftLog.lastLogOrSnapshotTerm();
        int prevLogTerm;
        if (req.prevLogIndex() == lastLogIndex) {
            // The previous entry is our last entry (or snapshot); its term is already at hand.
            prevLogTerm = lastLogTerm;
        } else {
            // Reply false if log does not contain an entry at prevLogIndex whose term matches prevLogTerm (§5.3)
            LogEntry prevLog = raftLog.getLogEntry(req.prevLogIndex());
            if (prevLog == null) {
                if (logger.isFineEnabled()) {
                    logger.warning("Failed to get previous log index for " + req + ", last log index: " + lastLogIndex);
                }
                raftNode.send(createFailureResponse(req.term()), req.leader());
                return;
            }
            prevLogTerm = prevLog.term();
        }
        if (req.prevLogTerm() != prevLogTerm) {
            if (logger.isFineEnabled()) {
                logger.warning("Previous log term of " + req + " is different than ours: " + prevLogTerm);
            }
            raftNode.send(createFailureResponse(req.term()), req.leader());
            return;
        }
    }
    // Number of request entries dropped because the local log had no room for them.
    int truncatedAppendRequestEntryCount = 0;
    // Stays null when every request entry is already present in the local log.
    LogEntry[] newEntries = null;
    // Process any new entries
    if (req.entryCount() > 0) {
        // Delete any conflicting entries, skip any duplicates
        long lastLogIndex = raftLog.lastLogOrSnapshotIndex();
        for (int i = 0; i < req.entryCount(); i++) {
            LogEntry reqEntry = req.entries()[i];
            if (reqEntry.index() > lastLogIndex) {
                // Everything from here on is beyond our log: these are the entries to append.
                newEntries = Arrays.copyOfRange(req.entries(), i, req.entryCount());
                break;
            }
            LogEntry localEntry = raftLog.getLogEntry(reqEntry.index());
            assert localEntry != null : "Entry not found on log index: " + reqEntry.index() + " for " + req;
            // delete the existing entry and all that follow it (§5.3)
            if (reqEntry.term() != localEntry.term()) {
                List<LogEntry> truncatedEntries = raftLog.deleteEntriesFrom(reqEntry.index());
                if (logger.isFineEnabled()) {
                    logger.warning("Truncated " + truncatedEntries.size() + " entries from entry index: " + reqEntry.index() + " => " + truncatedEntries);
                } else {
                    logger.warning("Truncated " + truncatedEntries.size() + " entries from entry index: " + reqEntry.index());
                }
                // Follow-up work for the removed entries: futures from this index are
                // invalidated and pre-applied group commands are reverted.
                raftNode.invalidateFuturesFrom(reqEntry.index());
                revertPreAppliedRaftGroupCmd(truncatedEntries);
                newEntries = Arrays.copyOfRange(req.entries(), i, req.entryCount());
                // Flush right after the truncation, before the replacement entries are appended.
                raftLog.flush();
                break;
            }
        }
        if (newEntries != null && newEntries.length > 0) {
            // Keep only as many entries as the log can hold and remember how many were dropped,
            // so that the reported last log index below excludes them.
            if (raftLog.availableCapacity() < newEntries.length) {
                if (logger.isFineEnabled()) {
                    logger.warning("Truncating " + newEntries.length + " entries to " + raftLog.availableCapacity() + " to fit into the available capacity of the Raft log");
                }
                truncatedAppendRequestEntryCount = newEntries.length - raftLog.availableCapacity();
                newEntries = Arrays.copyOf(newEntries, raftLog.availableCapacity());
            }
            // Append any new entries not already in the log
            if (logger.isFineEnabled()) {
                logger.fine("Appending " + newEntries.length + " entries: " + Arrays.toString(newEntries));
            }
            raftLog.appendEntries(newEntries);
            raftLog.flush();
        }
    }
    // I cannot use raftLog.lastLogOrSnapshotIndex() for lastLogIndex because my log may contain
    // some uncommitted entries from the previous leader and those entries will be truncated soon
    // I can only send a response based on how many entries I have appended from this append request
    long lastLogIndex = req.prevLogIndex() + req.entryCount() - truncatedAppendRequestEntryCount;
    long oldCommitIndex = state.commitIndex();
    // Update the commit index
    if (req.leaderCommitIndex() > oldCommitIndex) {
        // If leaderCommit > commitIndex, set commitIndex = min(leaderCommit, index of last new entry)
        long newCommitIndex = min(req.leaderCommitIndex(), lastLogIndex);
        if (logger.isFineEnabled()) {
            logger.fine("Setting commit index: " + newCommitIndex);
        }
        state.commitIndex(newCommitIndex);
    }
    raftNode.updateLastAppendEntriesTimestamp();
    try {
        AppendSuccessResponse resp = new AppendSuccessResponse(localMember(), state.term(), lastLogIndex, req.queryRound());
        raftNode.send(resp, req.leader());
    } finally {
        // Runs even if building or sending the response throws: the response goes out first,
        // local application of committed entries follows.
        if (state.commitIndex() > oldCommitIndex) {
            raftNode.applyLogEntries();
        }
        if (newEntries != null) {
            preApplyRaftGroupCmd(newEntries, state.commitIndex());
        }
    }
}
Also used : AppendSuccessResponse(com.hazelcast.cp.internal.raft.impl.dto.AppendSuccessResponse) RaftState(com.hazelcast.cp.internal.raft.impl.state.RaftState) LogEntry(com.hazelcast.cp.internal.raft.impl.log.LogEntry) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog)

Example 14 with RaftLog

use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.

the class PreVoteRequestHandlerTask method innerRun.

/**
 * Answers the pre-vote request held in {@code req}.
 * <p>
 * The request is rejected when its next term is behind the local term, when an
 * append-entries call was received within the last leader election timeout, or when
 * the local log is more up to date than the candidate's (greater last term, or equal
 * last term with a greater last index). Otherwise the pre-vote is granted.
 */
@Override
protected void innerRun() {
    RaftState raftState = raftNode.state();
    RaftEndpoint self = localMember();

    // Reply false if term < currentTerm (§5.1); the rejection carries our own term.
    if (raftState.term() > req.nextTerm()) {
        logger.info("Rejecting " + req + " since current term: " + raftState.term() + " is bigger");
        raftNode.send(new PreVoteResponse(self, raftState.term(), false), req.candidate());
        return;
    }

    // Leader stickiness: reject while the last AppendEntries call is younger than the election timeout.
    long lastAppendEntries = raftNode.lastAppendEntriesTimestamp();
    long stickinessDeadline = Clock.currentTimeMillis() - raftNode.getLeaderElectionTimeoutInMillis();
    if (lastAppendEntries > stickinessDeadline) {
        logger.info("Rejecting " + req + " since received append entries recently.");
        raftNode.send(new PreVoteResponse(self, raftState.term(), false), req.candidate());
        return;
    }

    // From here on the outcome depends only on whose log is more up to date;
    // every such response carries the requested next term.
    RaftLog log = raftState.log();
    boolean granted;
    if (log.lastLogOrSnapshotTerm() > req.lastLogTerm()) {
        logger.info("Rejecting " + req + " since our last log term: " + log.lastLogOrSnapshotTerm() + " is greater");
        granted = false;
    } else if (log.lastLogOrSnapshotTerm() == req.lastLogTerm() && log.lastLogOrSnapshotIndex() > req.lastLogIndex()) {
        logger.info("Rejecting " + req + " since our last log index: " + log.lastLogOrSnapshotIndex() + " is greater");
        granted = false;
    } else {
        logger.info("Granted pre-vote for " + req);
        granted = true;
    }
    raftNode.send(new PreVoteResponse(self, req.nextTerm(), granted), req.candidate());
}
Also used : RaftState(com.hazelcast.cp.internal.raft.impl.state.RaftState) RaftEndpoint(com.hazelcast.cp.internal.raft.impl.RaftEndpoint) PreVoteResponse(com.hazelcast.cp.internal.raft.impl.dto.PreVoteResponse) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog)

Example 15 with RaftLog

use of com.hazelcast.cp.internal.raft.impl.log.RaftLog in project hazelcast by hazelcast.

the class PreVoteTask method innerRun.

/**
 * Starts a pre-vote round for the term following the current one.
 * <p>
 * Nothing happens when a leader is already known, when the local term differs from
 * the term this task was created for, or when there are no remote members. Otherwise
 * the pre-candidate state is initialized, a {@link PreVoteRequest} describing the
 * local log is sent to every remote member and the pre-vote timeout is scheduled.
 */
@Override
protected void innerRun() {
    RaftState raftState = raftNode.state();

    if (raftState.leader() != null) {
        logger.fine("No new pre-vote phase, we already have a LEADER: " + raftState.leader());
        return;
    }
    if (raftState.term() != term) {
        logger.fine("No new pre-vote phase for term= " + term + " because of new term: " + raftState.term());
        return;
    }

    Collection<RaftEndpoint> peers = raftState.remoteMembers();
    if (peers.isEmpty()) {
        logger.fine("Remote members is empty. No need for pre-voting.");
        return;
    }

    raftState.initPreCandidateState();

    // The request advertises the term we would move to and the tail of our log.
    int upcomingTerm = raftState.term() + 1;
    RaftLog raftLog = raftState.log();
    PreVoteRequest preVote = new PreVoteRequest(localMember(), upcomingTerm,
            raftLog.lastLogOrSnapshotTerm(), raftLog.lastLogOrSnapshotIndex());

    String startMessage = "Pre-vote started for next term: " + preVote.nextTerm()
            + ", last log index: " + preVote.lastLogIndex()
            + ", last log term: " + preVote.lastLogTerm();
    logger.info(startMessage);
    raftNode.printMemberState();

    for (RaftEndpoint peer : peers) {
        raftNode.send(preVote, peer);
    }
    schedulePreVoteTimeout();
}
Also used : RaftState(com.hazelcast.cp.internal.raft.impl.state.RaftState) RaftEndpoint(com.hazelcast.cp.internal.raft.impl.RaftEndpoint) PreVoteRequest(com.hazelcast.cp.internal.raft.impl.dto.PreVoteRequest) RaftLog(com.hazelcast.cp.internal.raft.impl.log.RaftLog)

Aggregations

RaftLog (com.hazelcast.cp.internal.raft.impl.log.RaftLog)16 LogEntry (com.hazelcast.cp.internal.raft.impl.log.LogEntry)11 RaftEndpoint (com.hazelcast.cp.internal.raft.impl.RaftEndpoint)6 RaftState (com.hazelcast.cp.internal.raft.impl.state.RaftState)6 DestroyRaftGroupCmd (com.hazelcast.cp.internal.raft.command.DestroyRaftGroupCmd)3 RaftGroupCmd (com.hazelcast.cp.internal.raft.command.RaftGroupCmd)3 UpdateRaftGroupMembersCmd (com.hazelcast.cp.internal.raft.impl.command.UpdateRaftGroupMembersCmd)3 StaleAppendRequestException (com.hazelcast.cp.exception.StaleAppendRequestException)2 AppendRequest (com.hazelcast.cp.internal.raft.impl.dto.AppendRequest)2 AppendSuccessResponse (com.hazelcast.cp.internal.raft.impl.dto.AppendSuccessResponse)2 InstallSnapshot (com.hazelcast.cp.internal.raft.impl.dto.InstallSnapshot)2 PreVoteRequest (com.hazelcast.cp.internal.raft.impl.dto.PreVoteRequest)2 PreVoteResponse (com.hazelcast.cp.internal.raft.impl.dto.PreVoteResponse)2 VoteResponse (com.hazelcast.cp.internal.raft.impl.dto.VoteResponse)2 FollowerState (com.hazelcast.cp.internal.raft.impl.state.FollowerState)2 LeaderState (com.hazelcast.cp.internal.raft.impl.state.LeaderState)2 QueryState (com.hazelcast.cp.internal.raft.impl.state.QueryState)2 RaftAlgorithmConfig (com.hazelcast.config.cp.RaftAlgorithmConfig)1 CPGroupId (com.hazelcast.cp.CPGroupId)1 CPMember (com.hazelcast.cp.CPMember)1