Search in sources :

Example 1 with Proposal

use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.

the class LearnerHandler method syncFollower.

/**
 * Determine if we need to sync with follower using DIFF/TRUNC/SNAP
 * and setup follower to receive packets from commit processor.
 *
 * <p>The decision is made while holding the read lock of the database's
 * log lock, so the committed log cannot change between choosing the sync
 * strategy and the call to {@code leader.startForwarding}.
 *
 * @param peerLastZxid last zxid reported by the follower
 * @param db database supplying the committed log, the on-disk txn log
 *        and the last processed zxid
 * @param leader leader on which forwarding to this handler is started
 * @return true if snapshot transfer is needed.
 */
public boolean syncFollower(long peerLastZxid, ZKDatabase db, Leader leader) {
    /*
     * When leader election is completed, the leader will set its
     * lastProcessedZxid to be (epoch << 32). There will be no txn associated
     * with this zxid.
     *
     * The learner will set its lastProcessedZxid to the same value if
     * it get DIFF or SNAP from the leader. If the same learner come
     * back to sync with leader using this zxid, we will never find this
     * zxid in our history. In this case, we will ignore TRUNC logic and
     * always send DIFF if we have old enough history
     */
    boolean isPeerNewEpochZxid = (peerLastZxid & 0xffffffffL) == 0;
    // Keep track of the latest zxid which already queued
    long currentZxid = peerLastZxid;
    boolean needSnap = true;
    boolean txnLogSyncEnabled = db.isTxnLogSyncEnabled();
    ReentrantReadWriteLock lock = db.getLogLock();
    ReadLock rl = lock.readLock();
    // Acquire the lock before entering the try block so that the finally
    // clause can never attempt to release a lock that was not obtained.
    rl.lock();
    try {
        long maxCommittedLog = db.getmaxCommittedLog();
        long minCommittedLog = db.getminCommittedLog();
        long lastProcessedZxid = db.getDataTreeLastProcessedZxid();
        LOG.info("Synchronizing with Follower sid: {} maxCommittedLog=0x{}" + " minCommittedLog=0x{} lastProcessedZxid=0x{}" + " peerLastZxid=0x{}", getSid(), Long.toHexString(maxCommittedLog), Long.toHexString(minCommittedLog), Long.toHexString(lastProcessedZxid), Long.toHexString(peerLastZxid));
        if (db.getCommittedLog().isEmpty()) {
            /*
             * It is possible that committedLog is empty. In that case
             * setting these value to the latest txn in leader db
             * will reduce the case that we need to handle
             *
             * Here is how each case handle by the if block below
             * 1. lastProcessZxid == peerZxid -> Handle by (2)
             * 2. lastProcessZxid < peerZxid -> Handle by (3)
             * 3. lastProcessZxid > peerZxid -> Handle by (5)
             */
            minCommittedLog = lastProcessedZxid;
            maxCommittedLog = lastProcessedZxid;
        }
        if (forceSnapSync) {
            // (1) Force leader to use snapshot to sync with follower
            LOG.warn("Forcing snapshot sync - should not see this in production");
        } else if (lastProcessedZxid == peerLastZxid) {
            // (2) Follower is already sync with us, send empty diff
            LOG.info("Sending DIFF zxid=0x{} for peer sid: {}", Long.toHexString(peerLastZxid), getSid());
            queueOpPacket(Leader.DIFF, peerLastZxid);
            needOpPacket = false;
            needSnap = false;
        } else if (peerLastZxid > maxCommittedLog && !isPeerNewEpochZxid) {
            // (3) Newer than committedLog, send trunc and done
            LOG.debug("Sending TRUNC to follower zxidToSend=0x{} for peer sid:{}", Long.toHexString(maxCommittedLog), getSid());
            queueOpPacket(Leader.TRUNC, maxCommittedLog);
            currentZxid = maxCommittedLog;
            needOpPacket = false;
            needSnap = false;
        } else if ((maxCommittedLog >= peerLastZxid) && (minCommittedLog <= peerLastZxid)) {
            // (4) Follower is within commitLog range
            LOG.info("Using committedLog for peer sid: {}", getSid());
            Iterator<Proposal> itr = db.getCommittedLog().iterator();
            currentZxid = queueCommittedProposals(itr, peerLastZxid, null, maxCommittedLog);
            needSnap = false;
        } else if (peerLastZxid < minCommittedLog && txnLogSyncEnabled) {
            // (5) Use txnlog and committedLog to sync
            // Calculate sizeLimit that we allow to retrieve txnlog from disk
            long sizeLimit = db.calculateTxnLogSizeLimit();
            // This method can return empty iterator if the requested zxid
            // is older than on-disk txnlog
            Iterator<Proposal> txnLogItr = db.getProposalsFromTxnLog(peerLastZxid, sizeLimit);
            try {
                if (txnLogItr.hasNext()) {
                    LOG.info("Use txnlog and committedLog for peer sid: {}", getSid());
                    currentZxid = queueCommittedProposals(txnLogItr, peerLastZxid, minCommittedLog, maxCommittedLog);
                    LOG.debug("Queueing committedLog 0x{}", Long.toHexString(currentZxid));
                    Iterator<Proposal> committedLogItr = db.getCommittedLog().iterator();
                    currentZxid = queueCommittedProposals(committedLogItr, currentZxid, null, maxCommittedLog);
                    needSnap = false;
                }
            } finally {
                // Release the txn log resources even if queueing failed
                if (txnLogItr instanceof TxnLogProposalIterator) {
                    TxnLogProposalIterator txnProposalItr = (TxnLogProposalIterator) txnLogItr;
                    txnProposalItr.close();
                }
            }
        } else {
            LOG.warn("Unhandled scenario for peer sid: {}", getSid());
        }
        LOG.debug("Start forwarding 0x{} for peer sid: {}", Long.toHexString(currentZxid), getSid());
        leaderLastZxid = leader.startForwarding(this, currentZxid);
    } finally {
        rl.unlock();
    }
    if (needOpPacket && !needSnap) {
        // This should never happen, but we should fall back to sending
        // snapshot just in case.
        LOG.error("Unhandled scenario for peer sid: {} fall back to use snapshot", getSid());
        needSnap = true;
    }
    return needSnap;
}
Also used : ReadLock(java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock) TxnLogProposalIterator(org.apache.zookeeper.server.TxnLogProposalIterator) Iterator(java.util.Iterator) ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) Proposal(org.apache.zookeeper.server.quorum.Leader.Proposal)

Example 2 with Proposal

use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.

the class LearnerHandler method queueCommittedProposals.

/**
 * Queue committed proposals into packet queue. The range of packets which
 * is going to be queued are (peerLastZxid, maxZxid]
 *
 * <p>While {@code needOpPacket} is still set, the first relevant proposal
 * also decides whether a DIFF or a TRUNC op packet is queued ahead of the
 * proposals; the flag is cleared once that op packet has been queued.
 *
 * @param itr  iterator point to the proposals
 * @param peerLastZxid  last zxid seen by the follower
 * @param maxZxid  max zxid of the proposal to queue, null if no limit
 * @param lastCommittedZxid when sending diff, we need to send lastCommittedZxid
 *        on the leader to follow Zab 1.0 protocol.
 * @return last zxid of the queued proposal
 */
protected long queueCommittedProposals(Iterator<Proposal> itr, long peerLastZxid, Long maxZxid, Long lastCommittedZxid) {
    // A zxid whose low 32 bits are zero has no txn associated with it
    boolean isPeerNewEpochZxid = (peerLastZxid & 0xffffffffL) == 0;
    long queuedZxid = peerLastZxid;
    // as we look through proposals, this variable keeps track of previous
    // proposal Id.
    long prevProposalZxid = -1;
    while (itr.hasNext()) {
        Proposal propose = itr.next();
        long packetZxid = propose.packet.getZxid();
        // abort if we hit the limit
        if ((maxZxid != null) && (packetZxid > maxZxid)) {
            break;
        }
        // skip the proposals the peer already has
        if (packetZxid < peerLastZxid) {
            prevProposalZxid = packetZxid;
            continue;
        }
        // No op packet queued yet: decide whether to send trunc or diff
        if (needOpPacket) {
            // Send diff when we see the follower's zxid in our history
            if (packetZxid == peerLastZxid) {
                LOG.info("Sending DIFF zxid=0x{}  for peer sid: {}", Long.toHexString(lastCommittedZxid), getSid());
                queueOpPacket(Leader.DIFF, lastCommittedZxid);
                needOpPacket = false;
                continue;
            }
            if (isPeerNewEpochZxid) {
                // Send diff and fall through if zxid is of a new-epoch
                LOG.info("Sending DIFF zxid=0x{}  for peer sid: {}", Long.toHexString(lastCommittedZxid), getSid());
                queueOpPacket(Leader.DIFF, lastCommittedZxid);
                needOpPacket = false;
            } else if (packetZxid > peerLastZxid) {
                // Peer has a zxid that is missing from our history,
                // it may used to be a leader
                if (ZxidUtils.getEpochFromZxid(packetZxid) != ZxidUtils.getEpochFromZxid(peerLastZxid)) {
                    // We cannot send TRUNC that cross epoch boundary.
                    // The learner will crash if it is asked to do so.
                    // We will send snapshot in those cases.
                    LOG.warn("Cannot send TRUNC to peer sid: {} peer zxid is from different epoch", getSid());
                    return queuedZxid;
                }
                LOG.info("Sending TRUNC zxid=0x{}  for peer sid: {}", Long.toHexString(prevProposalZxid), getSid());
                queueOpPacket(Leader.TRUNC, prevProposalZxid);
                needOpPacket = false;
            }
        }
        if (packetZxid <= queuedZxid) {
            // Already queued, or there is a duplicate txn in a given iterator
            continue;
        }
        // Since this is already a committed proposal, we need to follow
        // it by a commit packet
        queuePacket(propose.packet);
        queueOpPacket(Leader.COMMIT, packetZxid);
        queuedZxid = packetZxid;
    }
    if (needOpPacket && isPeerNewEpochZxid) {
        // We will send DIFF for this kind of zxid in any case. This if-block
        // is the catch when our history older than learner and there is
        // no new txn since then. So we need an empty diff
        LOG.info("Sending DIFF zxid=0x{}  for peer sid: {}", Long.toHexString(lastCommittedZxid), getSid());
        queueOpPacket(Leader.DIFF, lastCommittedZxid);
        needOpPacket = false;
    }
    return queuedZxid;
}
Also used : Proposal(org.apache.zookeeper.server.quorum.Leader.Proposal)

Example 3 with Proposal

use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.

the class QuorumMajorityTest method testMajQuorums.

/**
 * Test that the majority quorum verifier only counts votes from
 * followers in its view.
 *
 * <p>Before exercising the verifier, the ElectionTimeTaken JMX attribute
 * of every peer is checked to be non-negative.
 */
@Test
public void testMajQuorums() throws Throwable {
    LOG.info("Verify QuorumPeer#electionTimeTaken jmx bean attribute");
    ArrayList<QuorumPeer> peerList = getPeerList();
    // Server ids are 1-based while the peer list is 0-based
    int serverId = 0;
    for (QuorumPeer peer : peerList) {
        serverId++;
        String beanName = "";
        if (peer.getPeerState() == ServerState.FOLLOWING) {
            beanName = String.format("%s:name0=ReplicatedServer_id%d,name1=replica.%d,name2=Follower", MBeanRegistry.DOMAIN, serverId, serverId);
        } else if (peer.getPeerState() == ServerState.LEADING) {
            beanName = String.format("%s:name0=ReplicatedServer_id%d,name1=replica.%d,name2=Leader", MBeanRegistry.DOMAIN, serverId, serverId);
        }
        Long elapsed = (Long) JMXEnv.ensureBeanAttribute(beanName, "ElectionTimeTaken");
        assertTrue(elapsed >= 0, "Wrong electionTimeTaken value!");
    }

    // Scenario A: servers 1-5 are all followers
    setUp(false, true);
    Proposal proposal = new Proposal();
    proposal.addQuorumVerifier(s1.getQuorumVerifier());
    // acks from 1 and 2: two followers out of five is not a majority
    proposal.addAck(1L);
    proposal.addAck(2L);
    assertEquals(false, proposal.hasAllQuorums());
    // server 6 is outside the view, so its ack must be ignored
    proposal.addAck(6L);
    assertEquals(false, proposal.hasAllQuorums());
    // a third follower ack reaches a majority of the voting view
    proposal.addAck(3L);
    assertEquals(true, proposal.hasAllQuorums());

    // Scenario B: servers 1-3 are followers, 4 and 5 are observers
    setUp(true, true);
    proposal = new Proposal();
    proposal.addQuorumVerifier(s1.getQuorumVerifier());
    // a single follower out of three is not a majority
    proposal.addAck(1L);
    assertEquals(false, proposal.hasAllQuorums());
    // observers 4 and 5 do not vote
    proposal.addAck(4L);
    proposal.addAck(5L);
    assertEquals(false, proposal.hasAllQuorums());
    // server 6 is outside the view, so its ack must be ignored
    proposal.addAck(6L);
    assertEquals(false, proposal.hasAllQuorums());
    // two followers out of three form a majority of the voting view
    proposal.addAck(2L);
    assertEquals(true, proposal.hasAllQuorums());
}
Also used : QuorumPeer(org.apache.zookeeper.server.quorum.QuorumPeer) Proposal(org.apache.zookeeper.server.quorum.Leader.Proposal) Test(org.junit.jupiter.api.Test)

Example 4 with Proposal

use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.

the class ZKDatabase method addCommittedProposal.

/**
 * Maintains a list of last <i>committedLog</i>
 * or so committed requests. This is used for
 * fast follower synchronization.
 *
 * <p>The request is serialized into a {@code Leader.PROPOSAL} packet and
 * appended to the log under the write lock; {@code minCommittedLog} and
 * {@code maxCommittedLog} are updated to match the log's contents.
 *
 * @param request committed request
 */
public void addCommittedProposal(Request request) {
    WriteLock wl = logLock.writeLock();
    // Acquire the lock before entering the try block so that the finally
    // clause can never attempt to release a lock that was not obtained.
    wl.lock();
    try {
        if (committedLog.size() > commitLogCount) {
            // Drop the oldest entry and advance the lower bound to the new head
            committedLog.remove();
            Proposal oldest = committedLog.peek();
            // If the log is now empty, the block below resets both bounds
            if (oldest != null) {
                minCommittedLog = oldest.packet.getZxid();
            }
        }
        if (committedLog.isEmpty()) {
            minCommittedLog = request.zxid;
            maxCommittedLog = request.zxid;
        }
        byte[] data = SerializeUtils.serializeRequest(request);
        QuorumPacket pp = new QuorumPacket(Leader.PROPOSAL, request.zxid, data, null);
        Proposal p = new Proposal();
        p.packet = pp;
        p.request = request;
        committedLog.add(p);
        maxCommittedLog = p.packet.getZxid();
    } finally {
        wl.unlock();
    }
}
Also used : ReentrantReadWriteLock(java.util.concurrent.locks.ReentrantReadWriteLock) WriteLock(java.util.concurrent.locks.ReentrantReadWriteLock.WriteLock) QuorumPacket(org.apache.zookeeper.server.quorum.QuorumPacket) Proposal(org.apache.zookeeper.server.quorum.Leader.Proposal)

Example 5 with Proposal

use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.

the class QuorumPeerMainTest method testEarlyLeaderAbandonment.

/**
 * Test early leader abandonment.
 *
 * <p>Starts a three-server ensemble, restarts it, then shuts down every
 * server except the leader and issues a create against the leader, which
 * is expected to fail while leaving one outstanding create proposal. The
 * leader is then stopped, the other servers are restarted and each creates
 * its own node. After the old leader rejoins, every server must see the
 * nodes created by the other servers and must not see the node whose
 * create was attempted on the abandoned leader.
 */
@Test
public void testEarlyLeaderAbandonment() throws Exception {
    ClientBase.setupTestEnv();
    final int SERVER_COUNT = 3;
    final int[] clientPorts = new int[SERVER_COUNT];
    // Build the quorum config section: one "server.N=..." line per server
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < SERVER_COUNT; i++) {
        clientPorts[i] = PortAssignment.unique();
        sb.append("server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ";" + clientPorts[i] + "\n");
    }
    String quorumCfgSection = sb.toString();
    MainThread[] mt = new MainThread[SERVER_COUNT];
    ZooKeeper[] zk = new ZooKeeper[SERVER_COUNT];
    // Start every server and connect one client to each
    for (int i = 0; i < SERVER_COUNT; i++) {
        mt[i] = new MainThread(i, clientPorts[i], quorumCfgSection);
        mt[i].start();
        zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
    }
    waitForAll(zk, States.CONNECTED);
    // Shut down and start back up every server.
    // NOTE(review): the original comment here appears truncated; presumably
    // the restart makes sure that the session creation is not the first
    // transaction, since that is rather innocuous.
    for (int i = 0; i < SERVER_COUNT; i++) {
        mt[i].shutdown();
    }
    waitForAll(zk, States.CONNECTING);
    for (int i = 0; i < SERVER_COUNT; i++) {
        mt[i].start();
        // Recreate a client session since the previous session was not persisted.
        zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
    }
    waitForAll(zk, States.CONNECTED);
    // ok lets find the leader and kill everything else, we have a few
    // seconds, so it should be plenty of time
    int leader = -1;
    Map<Long, Proposal> outstanding = null;
    for (int i = 0; i < SERVER_COUNT; i++) {
        if (mt[i].main.quorumPeer.leader == null) {
            mt[i].shutdown();
        } else {
            leader = i;
            // Keep a reference to the leader's outstanding proposals for the checks below
            outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals;
        }
    }
    try {
        zk[leader].create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        fail("create /zk" + leader + " should have failed");
    } catch (KeeperException e) {
        // Expected: the other servers were shut down above, so the create must fail
    }
    // just make sure that we actually did get it in process at the
    // leader
    assertTrue(outstanding.size() == 1);
    assertTrue(outstanding.values().iterator().next().request.getHdr().getType() == OpCode.create);
    // make sure it has a chance to write it to disk
    Thread.sleep(1000);
    mt[leader].shutdown();
    waitForAll(zk, States.CONNECTING);
    // Bring back every server except the old leader
    for (int i = 0; i < SERVER_COUNT; i++) {
        if (i != leader) {
            mt[i].start();
        }
    }
    // Each restarted server creates its own node /zk<i>
    for (int i = 0; i < SERVER_COUNT; i++) {
        if (i != leader) {
            // Recreate a client session since the previous session was not persisted.
            zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
            waitForOne(zk[i], States.CONNECTED);
            zk[i].create("/zk" + i, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
        }
    }
    // Finally let the old leader rejoin the ensemble
    mt[leader].start();
    waitForAll(zk, States.CONNECTED);
    // make sure everything is consistent
    for (int i = 0; i < SERVER_COUNT; i++) {
        for (int j = 0; j < SERVER_COUNT; j++) {
            if (i == leader) {
                // The node whose create failed on the old leader must not exist on any server
                assertTrue(zk[j].exists("/zk" + i, false) == null, (j == leader ? ("Leader (" + leader + ")") : ("Follower " + j)) + " should not have /zk" + i);
            } else {
                // Nodes created after the restart must be visible on every server
                assertTrue(zk[j].exists("/zk" + i, false) != null, (j == leader ? ("Leader (" + leader + ")") : ("Follower " + j)) + " does not have /zk" + i);
            }
        }
    }
    // Tear down: close all clients, then stop all servers
    for (int i = 0; i < SERVER_COUNT; i++) {
        zk[i].close();
    }
    for (int i = 0; i < SERVER_COUNT; i++) {
        mt[i].shutdown();
    }
}
Also used : ZooKeeper(org.apache.zookeeper.ZooKeeper) Proposal(org.apache.zookeeper.server.quorum.Leader.Proposal) KeeperException(org.apache.zookeeper.KeeperException) Test(org.junit.jupiter.api.Test)

Aggregations

Proposal (org.apache.zookeeper.server.quorum.Leader.Proposal)16 ZooKeeper (org.apache.zookeeper.ZooKeeper)6 Test (org.junit.jupiter.api.Test)6 File (java.io.File)4 KeeperException (org.apache.zookeeper.KeeperException)4 ReentrantReadWriteLock (java.util.concurrent.locks.ReentrantReadWriteLock)3 ReadLock (java.util.concurrent.locks.ReentrantReadWriteLock.ReadLock)3 ServerCnxnFactory (org.apache.zookeeper.server.ServerCnxnFactory)3 ZKDatabase (org.apache.zookeeper.server.ZKDatabase)3 ZooKeeperServer (org.apache.zookeeper.server.ZooKeeperServer)3 IOException (java.io.IOException)2 Iterator (java.util.Iterator)2 Stat (org.apache.zookeeper.data.Stat)2 QuorumPeer (org.apache.zookeeper.server.quorum.QuorumPeer)2 ByteArrayOutputStream (java.io.ByteArrayOutputStream)1 LineNumberReader (java.io.LineNumberReader)1 StringReader (java.io.StringReader)1 InetSocketAddress (java.net.InetSocketAddress)1 ByteBuffer (java.nio.ByteBuffer)1 SocketChannel (java.nio.channels.SocketChannel)1