use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class LearnerHandler method syncFollower.
/**
 * Determine if we need to sync with follower using DIFF/TRUNC/SNAP
 * and setup follower to receive packets from commit processor.
 *
 * <p>The decision and the call to {@code leader.startForwarding} are made
 * while holding the read lock obtained from {@code db.getLogLock()}.
 *
 * @param peerLastZxid last zxid reported by the peer being synchronized
 * @param db database supplying the committed log, the on-disk txn log and
 *           the leader's last processed zxid
 * @param leader leader on which forwarding to this handler is started
 * @return true if snapshot transfer is needed.
 */
public boolean syncFollower(long peerLastZxid, ZKDatabase db, Leader leader) {
    /*
     * When leader election is completed, the leader will set its
     * lastProcessedZxid to be (epoch << 32). There will be no txn associated
     * with this zxid.
     *
     * The learner will set its lastProcessedZxid to the same value if
     * it get DIFF or SNAP from the leader. If the same learner come
     * back to sync with leader using this zxid, we will never find this
     * zxid in our history. In this case, we will ignore TRUNC logic and
     * always send DIFF if we have old enough history
     */
    boolean isPeerNewEpochZxid = (peerLastZxid & 0xffffffffL) == 0;
    // Keep track of the latest zxid which already queued
    long currentZxid = peerLastZxid;
    boolean needSnap = true;
    boolean txnLogSyncEnabled = db.isTxnLogSyncEnabled();
    ReentrantReadWriteLock lock = db.getLogLock();
    ReadLock rl = lock.readLock();
    // Acquire the lock before entering the try block: if acquisition itself
    // failed, the finally clause must not attempt to unlock a lock we do not hold.
    rl.lock();
    try {
        long maxCommittedLog = db.getmaxCommittedLog();
        long minCommittedLog = db.getminCommittedLog();
        long lastProcessedZxid = db.getDataTreeLastProcessedZxid();
        LOG.info("Synchronizing with Follower sid: {} maxCommittedLog=0x{}" + " minCommittedLog=0x{} lastProcessedZxid=0x{}" + " peerLastZxid=0x{}", getSid(), Long.toHexString(maxCommittedLog), Long.toHexString(minCommittedLog), Long.toHexString(lastProcessedZxid), Long.toHexString(peerLastZxid));
        if (db.getCommittedLog().isEmpty()) {
            /*
             * It is possible that committedLog is empty. In that case
             * setting these value to the latest txn in leader db
             * will reduce the case that we need to handle
             *
             * Here is how each case handle by the if block below
             * 1. lastProcessZxid == peerZxid -> Handle by (2)
             * 2. lastProcessZxid < peerZxid -> Handle by (3)
             * 3. lastProcessZxid > peerZxid -> Handle by (5)
             */
            minCommittedLog = lastProcessedZxid;
            maxCommittedLog = lastProcessedZxid;
        }
        if (forceSnapSync) {
            // (1) Force leader to use snapshot to sync with follower
            LOG.warn("Forcing snapshot sync - should not see this in production");
        } else if (lastProcessedZxid == peerLastZxid) {
            // (2) Follower is already sync with us, send empty diff
            LOG.info("Sending DIFF zxid=0x{} for peer sid: {}", Long.toHexString(peerLastZxid), getSid());
            queueOpPacket(Leader.DIFF, peerLastZxid);
            needOpPacket = false;
            needSnap = false;
        } else if (peerLastZxid > maxCommittedLog && !isPeerNewEpochZxid) {
            // (3) Newer than committedLog, send trunc and done
            LOG.debug("Sending TRUNC to follower zxidToSend=0x{} for peer sid:{}", Long.toHexString(maxCommittedLog), getSid());
            queueOpPacket(Leader.TRUNC, maxCommittedLog);
            currentZxid = maxCommittedLog;
            needOpPacket = false;
            needSnap = false;
        } else if ((maxCommittedLog >= peerLastZxid) && (minCommittedLog <= peerLastZxid)) {
            // (4) Follower is within commitLog range
            LOG.info("Using committedLog for peer sid: {}", getSid());
            Iterator<Proposal> itr = db.getCommittedLog().iterator();
            currentZxid = queueCommittedProposals(itr, peerLastZxid, null, maxCommittedLog);
            needSnap = false;
        } else if (peerLastZxid < minCommittedLog && txnLogSyncEnabled) {
            // (5) Use txnlog and committedLog to sync
            // Calculate sizeLimit that we allow to retrieve txnlog from disk
            long sizeLimit = db.calculateTxnLogSizeLimit();
            // This method can return empty iterator if the requested zxid
            // is older than on-disk txnlog
            Iterator<Proposal> txnLogItr = db.getProposalsFromTxnLog(peerLastZxid, sizeLimit);
            try {
                if (txnLogItr.hasNext()) {
                    LOG.info("Use txnlog and committedLog for peer sid: {}", getSid());
                    currentZxid = queueCommittedProposals(txnLogItr, peerLastZxid, minCommittedLog, maxCommittedLog);
                    LOG.debug("Queueing committedLog 0x{}", Long.toHexString(currentZxid));
                    Iterator<Proposal> committedLogItr = db.getCommittedLog().iterator();
                    currentZxid = queueCommittedProposals(committedLogItr, currentZxid, null, maxCommittedLog);
                    needSnap = false;
                }
            } finally {
                // closing the resources, also when queueing fails part-way
                if (txnLogItr instanceof TxnLogProposalIterator) {
                    TxnLogProposalIterator txnProposalItr = (TxnLogProposalIterator) txnLogItr;
                    txnProposalItr.close();
                }
            }
        } else {
            LOG.warn("Unhandled scenario for peer sid: {}", getSid());
        }
        LOG.debug("Start forwarding 0x{} for peer sid: {}", Long.toHexString(currentZxid), getSid());
        leaderLastZxid = leader.startForwarding(this, currentZxid);
    } finally {
        rl.unlock();
    }
    if (needOpPacket && !needSnap) {
        // This should never happen, but we should fall back to sending
        // snapshot just in case.
        LOG.error("Unhandled scenario for peer sid: {} fall back to use snapshot", getSid());
        needSnap = true;
    }
    return needSnap;
}
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class LearnerHandler method queueCommittedProposals.
/**
 * Queue committed proposals into packet queue. The range of packets which
 * is going to be queued are (peerLastZxid, maxZxid]
 *
 * <p>While {@code needOpPacket} is still set, the first relevant proposal
 * decides whether a DIFF or a TRUNC op packet is queued ahead of the
 * proposals. If a TRUNC would have to cross an epoch boundary, nothing is
 * queued and {@code needOpPacket} is left set.
 *
 * @param itr iterator point to the proposals
 * @param peerLastZxid last zxid seen by the follower
 * @param maxZxid max zxid of the proposal to queue, null if no limit
 * @param lastCommittedZxid when sending diff, we need to send lastCommittedZxid
 *        on the leader to follow Zab 1.0 protocol. It is unboxed whenever a
 *        DIFF is sent, so it must not be null on those paths.
 * @return last zxid of the queued proposal, or {@code peerLastZxid} if no
 *         proposal was queued
 */
protected long queueCommittedProposals(Iterator<Proposal> itr, long peerLastZxid, Long maxZxid, Long lastCommittedZxid) {
    boolean isPeerNewEpochZxid = (peerLastZxid & 0xffffffffL) == 0;
    long queuedZxid = peerLastZxid;
    // as we look through proposals, this variable keeps track of previous
    // proposal Id.
    long prevProposalZxid = -1;
    while (itr.hasNext()) {
        Proposal propose = itr.next();
        long packetZxid = propose.packet.getZxid();
        // abort if we hit the limit
        if ((maxZxid != null) && (packetZxid > maxZxid)) {
            break;
        }
        // skip the proposals the peer already has
        if (packetZxid < peerLastZxid) {
            prevProposalZxid = packetZxid;
            continue;
        }
        // If no op packet has been queued yet, figure out whether to trunc
        // or diff
        if (needOpPacket) {
            // Send diff when we see the follower's zxid in our history
            if (packetZxid == peerLastZxid) {
                LOG.info("Sending DIFF zxid=0x{} for peer sid: {}", Long.toHexString(lastCommittedZxid), getSid());
                queueOpPacket(Leader.DIFF, lastCommittedZxid);
                needOpPacket = false;
                continue;
            }
            if (isPeerNewEpochZxid) {
                // Send diff and fall through if zxid is of a new-epoch
                LOG.info("Sending DIFF zxid=0x{} for peer sid: {}", Long.toHexString(lastCommittedZxid), getSid());
                queueOpPacket(Leader.DIFF, lastCommittedZxid);
                needOpPacket = false;
            } else if (packetZxid > peerLastZxid) {
                // Peer has some proposals that the leader hasn't seen yet
                // it may used to be a leader
                if (ZxidUtils.getEpochFromZxid(packetZxid) != ZxidUtils.getEpochFromZxid(peerLastZxid)) {
                    // We cannot send TRUNC that cross epoch boundary.
                    // The learner will crash if it is asked to do so.
                    // We will send snapshot in those cases.
                    LOG.warn("Cannot send TRUNC to peer sid: {} peer zxid is from different epoch", getSid());
                    return queuedZxid;
                }
                LOG.info("Sending TRUNC zxid=0x{} for peer sid: {}", Long.toHexString(prevProposalZxid), getSid());
                queueOpPacket(Leader.TRUNC, prevProposalZxid);
                needOpPacket = false;
            }
        }
        if (packetZxid <= queuedZxid) {
            // We can get here if the op packet was already queued
            // or there is a duplicate txn in a given iterator
            continue;
        }
        // Since this is already a committed proposal, we need to follow
        // it by a commit packet
        queuePacket(propose.packet);
        queueOpPacket(Leader.COMMIT, packetZxid);
        queuedZxid = packetZxid;
    }
    if (needOpPacket && isPeerNewEpochZxid) {
        // We will send DIFF for this kind of zxid in any case. This if-block
        // is the catch when our history older than learner and there is
        // no new txn since then. So we need an empty diff
        LOG.info("Sending DIFF zxid=0x{} for peer sid: {}", Long.toHexString(lastCommittedZxid), getSid());
        queueOpPacket(Leader.DIFF, lastCommittedZxid);
        needOpPacket = false;
    }
    return queuedZxid;
}
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class QuorumMajorityTest method testMajQuorums.
/**
 * Test that the majority quorum verifier only counts votes from
 * followers in its view.
 *
 * <p>Before the quorum checks, the test reads the ElectionTimeTaken JMX
 * attribute of every peer returned by {@code getPeerList()} and asserts
 * that it is non-negative.
 */
@Test
public void testMajQuorums() throws Throwable {
    LOG.info("Verify QuorumPeer#electionTimeTaken jmx bean attribute");
    ArrayList<QuorumPeer> quorumPeers = getPeerList();
    // Server ids are 1-based and follow the order of the peer list.
    int serverId = 0;
    for (QuorumPeer peer : quorumPeers) {
        serverId++;
        // Stays empty when the peer is neither following nor leading.
        String beanName = "";
        if (peer.getPeerState() == ServerState.FOLLOWING) {
            beanName = String.format("%s:name0=ReplicatedServer_id%d,name1=replica.%d,name2=Follower", MBeanRegistry.DOMAIN, serverId, serverId);
        } else if (peer.getPeerState() == ServerState.LEADING) {
            beanName = String.format("%s:name0=ReplicatedServer_id%d,name1=replica.%d,name2=Leader", MBeanRegistry.DOMAIN, serverId, serverId);
        }
        Long electionTimeTaken = (Long) JMXEnv.ensureBeanAttribute(beanName, "ElectionTimeTaken");
        assertTrue(electionTimeTaken >= 0, "Wrong electionTimeTaken value!");
    }

    // setup servers 1-5 to be followers
    setUp(false, true);
    Proposal allFollowers = new Proposal();
    allFollowers.addQuorumVerifier(s1.getQuorumVerifier());

    // 2 followers out of 5 is not a majority
    allFollowers.addAck(Long.valueOf(1));
    allFollowers.addAck(Long.valueOf(2));
    assertEquals(false, allFollowers.hasAllQuorums());

    // 6 is not in the view - its vote shouldn't count
    allFollowers.addAck(Long.valueOf(6));
    assertEquals(false, allFollowers.hasAllQuorums());

    // 3 followers out of 5 are a majority of the voting view
    allFollowers.addAck(Long.valueOf(3));
    assertEquals(true, allFollowers.hasAllQuorums());

    // setup servers 1-3 to be followers and 4 and 5 to be observers
    setUp(true, true);
    Proposal withObservers = new Proposal();
    withObservers.addQuorumVerifier(s1.getQuorumVerifier());

    // 1 follower out of 3 is not a majority
    withObservers.addAck(Long.valueOf(1));
    assertEquals(false, withObservers.hasAllQuorums());

    // 4 and 5 are observers, their vote shouldn't count
    withObservers.addAck(Long.valueOf(4));
    withObservers.addAck(Long.valueOf(5));
    assertEquals(false, withObservers.hasAllQuorums());

    // 6 is not in the view - its vote shouldn't count
    withObservers.addAck(Long.valueOf(6));
    assertEquals(false, withObservers.hasAllQuorums());

    // 2 followers out of 3 are a majority of the voting view
    withObservers.addAck(Long.valueOf(2));
    assertEquals(true, withObservers.hasAllQuorums());
}
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class ZKDatabase method addCommittedProposal.
/**
 * Maintains a list of the last <i>commitLogCount</i>
 * or so committed requests. This is used for
 * fast follower synchronization.
 *
 * <p>Runs under the write lock of {@code logLock}. When the log has grown
 * beyond {@code commitLogCount} the oldest entry is dropped before the new
 * proposal is appended, and {@code minCommittedLog}/{@code maxCommittedLog}
 * are updated to match the log's new bounds.
 *
 * @param request committed request
 */
public void addCommittedProposal(Request request) {
    WriteLock wl = logLock.writeLock();
    // Acquire the lock before entering the try block so that the finally
    // clause never tries to release a lock that was not obtained.
    wl.lock();
    try {
        if (committedLog.size() > commitLogCount) {
            committedLog.remove();
            // The removal may have emptied the log (peek() then returns
            // null); in that case the isEmpty() branch below resets the bounds.
            Proposal oldest = committedLog.peek();
            if (oldest != null) {
                minCommittedLog = oldest.packet.getZxid();
            }
        }
        if (committedLog.isEmpty()) {
            minCommittedLog = request.zxid;
            maxCommittedLog = request.zxid;
        }
        // Wrap the serialized request in a PROPOSAL packet carrying its zxid.
        byte[] data = SerializeUtils.serializeRequest(request);
        QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid, data, null);
        Proposal p = new Proposal();
        p.packet = pp;
        p.request = request;
        committedLog.add(p);
        maxCommittedLog = p.packet.getZxid();
    } finally {
        wl.unlock();
    }
}
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class QuorumPeerMainTest method testEarlyLeaderAbandonment.
/**
 * Test early leader abandonment.
 *
 * <p>Scenario exercised by this test:
 * <ol>
 * <li>Start a three-server ensemble, then shut it down and start it again.</li>
 * <li>Shut down every server that is not the leader and issue a create
 * against the leader; the create is expected to fail with a
 * {@code KeeperException} while leaving exactly one outstanding create
 * proposal on the leader.</li>
 * <li>Shut down the leader, restart the other servers and create one znode
 * through each of them.</li>
 * <li>Restart the old leader and verify that the znode it attempted to
 * create exists on no server, while the znodes created in step 3 exist on
 * every server.</li>
 * </ol>
 */
@Test
public void testEarlyLeaderAbandonment() throws Exception {
ClientBase.setupTestEnv();
final int SERVER_COUNT = 3;
final int[] clientPorts = new int[SERVER_COUNT];
// Build the "server.N=..." quorum configuration section, one line per server,
// all on 127.0.0.1 with uniquely assigned ports.
StringBuilder sb = new StringBuilder();
for (int i = 0; i < SERVER_COUNT; i++) {
clientPorts[i] = PortAssignment.unique();
sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n");
}
String quorumCfgSection = sb.toString();
MainThread[] mt = new MainThread[SERVER_COUNT];
ZooKeeper[] zk = new ZooKeeper[SERVER_COUNT];
// Start every server and connect one client to each of them.
for (int i = 0; i < SERVER_COUNT; i++) {
mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection);
mt[i].start();
zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
}
waitForAll(zk, States.CONNECTED);
// Shut the whole ensemble down and start it back up before the real test.
// NOTE(review): presumably this ensures the session creation is not the
// first transaction, since that is rather innocuous - confirm against upstream.
for (int i = 0; i < SERVER_COUNT; i++) {
mt[i].shutdown();
}
waitForAll(zk, States.CONNECTING);
for (int i = 0; i < SERVER_COUNT; i++) {
mt[i].start();
// Recreate a client session since the previous session was not persisted.
zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
}
waitForAll(zk, States.CONNECTED);
// ok lets find the leader and kill everything else, we have a few
// seconds, so it should be plenty of time
// NOTE(review): if no server reports a non-null leader, `leader` stays -1 and
// `outstanding` stays null, so the statements below would throw instead of
// failing with a clear assertion message.
int leader = -1;
Map<Long, Proposal> outstanding = null;
for (int i = 0; i < SERVER_COUNT; i++) {
if (mt[i].main.quorumPeer.leader == null) {
mt[i].shutdown();
} else {
leader = i;
outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals;
}
}
// With every other server shut down, the create on the leader must fail.
try {
zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
fail("create /zk" + leader + " should have failed");
} catch (KeeperException e) {
// expected: the failure is the behavior under test, so it is ignored here
}
// just make sure that we actually did get it in process at the
// leader
assertTrue(outstanding.size() == 1);
assertTrue(outstanding.values().iterator().next().request.getHdr().getType() == OpCode.create);
// make sure it has a chance to write it to disk
Thread.sleep(1000);
mt[leader].shutdown();
waitForAll(zk, States.CONNECTING);
// Bring back every server except the old leader.
for (int i = 0; i < SERVER_COUNT; i++) {
if (i != leader) {
mt[i].start();
}
}
// Create one znode ("/zk" + i) through each restarted server.
for (int i = 0; i < SERVER_COUNT; i++) {
if (i != leader) {
// Recreate a client session since the previous session was not persisted.
zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
waitForOne(zk[i], States.CONNECTED);
zk[i].create("/zk" + i, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
}
}
// Finally restart the old leader so it rejoins the ensemble.
mt[leader].start();
waitForAll(zk, States.CONNECTED);
// make sure everything is consistent
// i selects the znode ("/zk" + i), j selects the server being queried: the
// znode the old leader tried to create must be absent everywhere, and every
// other znode must be present everywhere.
for (int i = 0; i < SERVER_COUNT; i++) {
for (int j = 0; j < SERVER_COUNT; j++) {
if (i == leader) {
assertTrue(zk[j].exists("/zk" + i, false) == null, (j == leader ? ("Leader (" + leader + ")") : ("Follower " + j)) + " should not have /zk" + i);
} else {
assertTrue(zk[j].exists("/zk" + i, false) != null, (j == leader ? ("Leader (" + leader + ")") : ("Follower " + j)) + " does not have /zk" + i);
}
}
}
// Close all clients, then stop all servers.
for (int i = 0; i < SERVER_COUNT; i++) {
zk[i].close();
}
for (int i = 0; i < SERVER_COUNT; i++) {
mt[i].shutdown();
}
}
Aggregations