use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class QuorumPeerMainTest method testInconsistentDueToNewLeaderOrder.
/**
* Currently, in SNAP sync, the leader starts queuing the
* proposal/commits and the NEWLEADER packet before sending
* the snapshot over the wire. So it's possible that the zxid
* associated with the snapshot is higher than all the
* packets queued before NEWLEADER.
*
* When the follower receives the snapshot, it applies all
* the txns queued before NEWLEADER, which may not cover all
* the txns up to the zxid in the snapshot. After that, it
* writes the snapshot out to disk with the zxid associated
* with the snapshot. If the server crashes after writing
* this out, then when loading the data from disk it will use the
* zxid of the snapshot file to sync with the leader, which can cause
* data inconsistency, because we only replayed part of the
* historical data during the previous sync.
*
* This test case simulates this scenario and makes sure
* there is no data inconsistency issue after the fix.
*/
@Test
public void testInconsistentDueToNewLeaderOrder() throws Exception {
// 1. set up an ensemble with 3 servers
final int ENSEMBLE_SERVERS = 3;
final int[] clientPorts = new int[ENSEMBLE_SERVERS];
StringBuilder sb = new StringBuilder();
String server;
for (int i = 0; i < ENSEMBLE_SERVERS; i++) {
clientPorts[i] = PortAssignment.unique();
server = "server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ":participant;127.0.0.1:" + clientPorts[i];
sb.append(server + "\n");
}
String currentQuorumCfgSection = sb.toString();
// start servers
MainThread[] mt = new MainThread[ENSEMBLE_SERVERS];
ZooKeeper[] zk = new ZooKeeper[ENSEMBLE_SERVERS];
Context[] contexts = new Context[ENSEMBLE_SERVERS];
for (int i = 0; i < ENSEMBLE_SERVERS; i++) {
final Context context = new Context();
contexts[i] = context;
mt[i] = new MainThread(i, clientPorts[i], currentQuorumCfgSection, false) {
@Override
public TestQPMain getTestQPMain() {
return new CustomizedQPMain(context);
}
};
mt[i].start();
zk[i] = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, this);
}
waitForAll(zk, States.CONNECTED);
LOG.info("all servers started");
String nodePath = "/testInconsistentDueToNewLeader";
int leaderId = -1;
int followerA = -1;
for (int i = 0; i < ENSEMBLE_SERVERS; i++) {
if (mt[i].main.quorumPeer.leader != null) {
leaderId = i;
} else if (followerA == -1) {
followerA = i;
}
}
LOG.info("shutdown follower {}", followerA);
mt[followerA].shutdown();
waitForOne(zk[followerA], States.CONNECTING);
try {
// 2. set force snapshot to be true
LOG.info("force snapshot sync");
System.setProperty(LearnerHandler.FORCE_SNAP_SYNC, "true");
// 3. create a node
String initialValue = "1";
final ZooKeeper leaderZk = zk[leaderId];
leaderZk.create(nodePath, initialValue.getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
LOG.info("created node {} with value {}", nodePath, initialValue);
CustomQuorumPeer leaderQuorumPeer = (CustomQuorumPeer) mt[leaderId].main.quorumPeer;
// 4. on the customized leader, catch the startForwarding call
// (outside the synchronized block), set the node to value v1, then
// call super.startForwarding to generate the in-flight
// txn proposal and commit for the v1 value update
leaderQuorumPeer.setStartForwardingListener(new StartForwardingListener() {
@Override
public void start() {
if (!Boolean.getBoolean(LearnerHandler.FORCE_SNAP_SYNC)) {
return;
}
final String value = "2";
LOG.info("start forwarding, set {} to {}", nodePath, value);
// use async setData here, otherwise it would block on the logLock in
// ZKDatabase and the setData request would time out
try {
leaderZk.setData(nodePath, value.getBytes(), -1, (rc, path, ctx, stat) -> {
}, null);
// wait for the setData txn being populated
Thread.sleep(1000);
} catch (Exception e) {
LOG.error("error when set {} to {}", nodePath, value, e);
}
}
});
// 5. on the customized leader, catch the beginSnapshot call in
// LearnerSyncThrottler, set the node to value v2, and
// wait until it hits the data tree
leaderQuorumPeer.setBeginSnapshotListener(new BeginSnapshotListener() {
@Override
public void start() {
String value = "3";
LOG.info("before sending snapshot, set {} to {}", nodePath, value);
try {
leaderZk.setData(nodePath, value.getBytes(), -1);
LOG.info("successfully set {} to {}", nodePath, value);
} catch (Exception e) {
LOG.error("error when set {} to {}, {}", nodePath, value, e);
}
}
});
// 6. exit follower A after taking snapshot
CustomQuorumPeer followerAQuorumPeer = ((CustomQuorumPeer) mt[followerA].main.quorumPeer);
LOG.info("set exit when ack new leader packet on {}", followerA);
contexts[followerA].exitWhenAckNewLeader = true;
CountDownLatch latch = new CountDownLatch(1);
final MainThread followerAMT = mt[followerA];
contexts[followerA].newLeaderAckCallback = new NewLeaderAckCallback() {
@Override
public void start() {
try {
latch.countDown();
followerAMT.shutdown();
} catch (Exception e) {
}
}
};
// 7. start follower A to do snapshot sync
LOG.info("starting follower {}", followerA);
mt[followerA].start();
assertTrue(latch.await(30, TimeUnit.SECONDS));
// 8. now we have invalid data on disk, let's load it and verify
LOG.info("disable exit when ack new leader packet on {}", followerA);
System.setProperty(LearnerHandler.FORCE_SNAP_SYNC, "false");
contexts[followerA].exitWhenAckNewLeader = true;
contexts[followerA].newLeaderAckCallback = null;
LOG.info("restarting follower {}", followerA);
mt[followerA].start();
zk[followerA].close();
zk[followerA] = new ZooKeeper("127.0.0.1:" + clientPorts[followerA], ClientBase.CONNECTION_TIMEOUT, this);
// 9. after follower A reaches broadcast state, make sure
// the node value is the same as what we have on the leader
waitForOne(zk[followerA], States.CONNECTED);
assertEquals(new String(zk[followerA].getData(nodePath, null, null)), new String(zk[leaderId].getData(nodePath, null, null)));
} finally {
System.clearProperty(LearnerHandler.FORCE_SNAP_SYNC);
for (int i = 0; i < ENSEMBLE_SERVERS; i++) {
mt[i].shutdown();
zk[i].close();
}
}
}
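The test above relies on several hook types (Context, StartForwardingListener, BeginSnapshotListener, NewLeaderAckCallback) plus the CustomQuorumPeer and CustomizedQPMain classes that are defined elsewhere in QuorumPeerMainTest and not included in this excerpt. A minimal sketch of what those hooks likely look like, inferred only from how the test uses them (hedged; not the verbatim ZooKeeper source):
// Hedged sketch of the helper hooks; names mirror the usage above, bodies are assumptions.
interface NewLeaderAckCallback {
    void start();
}

interface StartForwardingListener {
    void start();
}

interface BeginSnapshotListener {
    void start();
}

class Context {
    // When true, the follower's ACK of the NEWLEADER packet triggers the callback below,
    // which the test uses to shut the follower down at exactly that point.
    volatile boolean exitWhenAckNewLeader = false;
    volatile NewLeaderAckCallback newLeaderAckCallback = null;
}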
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class DIFFSyncConsistencyTest method testInconsistentDueToUncommittedLog.
@Test
@Timeout(value = 120)
public void testInconsistentDueToUncommittedLog() throws Exception {
final int LEADER_TIMEOUT_MS = 10_000;
final int[] clientPorts = new int[SERVER_COUNT];
StringBuilder sb = new StringBuilder();
String server;
for (int i = 0; i < SERVER_COUNT; i++) {
clientPorts[i] = PortAssignment.unique();
server = "server." + i + "=127.0.0.1:" + PortAssignment.unique() + ":" + PortAssignment.unique() + ":participant;127.0.0.1:" + clientPorts[i];
sb.append(server + "\n");
}
String currentQuorumCfgSection = sb.toString();
for (int i = 0; i < SERVER_COUNT; i++) {
mt[i] = new MainThread(i, clientPorts[i], currentQuorumCfgSection, false) {
@Override
public TestQPMain getTestQPMain() {
return new MockTestQPMain();
}
};
mt[i].start();
}
for (int i = 0; i < SERVER_COUNT; i++) {
assertTrue(ClientBase.waitForServerUp("127.0.0.1:" + clientPorts[i], CONNECTION_TIMEOUT), "waiting for server " + i + " being up");
}
int leader = findLeader(mt);
CountdownWatcher watch = new CountdownWatcher();
ZooKeeper zk = new ZooKeeper("127.0.0.1:" + clientPorts[leader], ClientBase.CONNECTION_TIMEOUT, watch);
watch.waitForConnected(ClientBase.CONNECTION_TIMEOUT);
Map<Long, Proposal> outstanding = mt[leader].main.quorumPeer.leader.outstandingProposals;
// Increase the tick time to delay the leader from going into LOOKING, so we can propose a transaction while the other
// followers are offline.
int previousTick = mt[leader].main.quorumPeer.tickTime;
mt[leader].main.quorumPeer.tickTime = LEADER_TIMEOUT_MS;
// Let the previous tick on the leader exhaust itself so the new tick time takes effect
Thread.sleep(previousTick);
LOG.info("LEADER ELECTED {}", leader);
// Shut down the followers so they don't receive this proposal now.
// In other words, we want to make sure the followers get the proposal later through DIFF sync.
for (int i = 0; i < SERVER_COUNT; i++) {
if (i != leader) {
mt[i].shutdown();
}
}
// Send a create request to the old leader and make sure it's synced to disk.
try {
zk.create("/zk" + leader, "zk".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
fail("create /zk" + leader + " should have failed");
} catch (KeeperException e) {
}
// Make sure that we actually did get it in process at the leader; there can be extra sessionClose proposals.
assertTrue(outstanding.size() > 0);
Proposal p = findProposalOfType(outstanding, OpCode.create);
LOG.info("Old leader id: {}. All proposals: {}", leader, outstanding);
assertNotNull(p, "Old leader doesn't have 'create' proposal");
// Make sure the leader has synced the proposal to disk.
int sleepTime = 0;
Long longLeader = (long) leader;
while (!p.qvAcksetPairs.get(0).getAckset().contains(longLeader)) {
if (sleepTime > 2000) {
fail("Transaction not synced to disk within 2 seconds " + p.qvAcksetPairs.get(0).getAckset() + " expected " + leader);
}
Thread.sleep(100);
sleepTime += 100;
}
// Restart the followers with error injection enabled, so each follower fails at UPTODATE before
// it can sync to disk the uncommitted proposal it receives from DIFF sync.
for (int i = 0; i < SERVER_COUNT; i++) {
if (i == leader) {
continue;
}
mt[i].start();
int sleepCount = 0;
while (mt[i].getQuorumPeer() == null) {
++sleepCount;
if (sleepCount > 100) {
fail("Can't start follower " + i + " !");
}
Thread.sleep(100);
}
((CustomQuorumPeer) mt[i].getQuorumPeer()).setInjectError(true);
LOG.info("Follower {} started.", i);
}
// Verify the leader can see the node. The fact that the leader can see it implies that, at this
// point in time, it has received a quorum of NEWLEADER ACKs from the two followers, so it can
// start serving requests; it also implies that the DIFF sync from leader to followers has
// finished. We verify later that the followers have the same view after we shut down this
// leader; otherwise it's a violation of ZAB / sequential consistency.
int c = 0;
while (c < 100) {
++c;
try {
Stat stat = zk.exists("/zk" + leader, false);
assertNotNull(stat, "server " + leader + " should have /zk");
break;
} catch (KeeperException.ConnectionLossException e) {
}
Thread.sleep(100);
}
// Shutdown all servers
for (int i = 0; i < SERVER_COUNT; i++) {
mt[i].shutdown();
}
waitForOne(zk, States.CONNECTING);
// Restart the followers with error injection disabled. The followers do not have the proposal
// on disk: it never got a chance to sync to disk because we made them fail at UPTODATE.
for (int i = 0; i < SERVER_COUNT; i++) {
if (i == leader) {
continue;
}
mt[i].start();
int sleepCount = 0;
while (mt[i].getQuorumPeer() == null) {
++sleepCount;
if (sleepCount > 100) {
fail("Can't start follower " + i + " !");
}
Thread.sleep(100);
}
((CustomQuorumPeer) mt[i].getQuorumPeer()).setInjectError(false);
LOG.info("Follower {} started again.", i);
}
int newLeader = findLeader(mt);
assertNotEquals(newLeader, leader, "new leader is still the old leader " + leader + " !!");
// Verify the new leader still has the node created under the old leader. An inconsistent view
// of the quorum exposed from leaders is a violation of ZAB.
for (int i = 0; i < SERVER_COUNT; i++) {
if (i != newLeader) {
continue;
}
zk.close();
zk = new ZooKeeper("127.0.0.1:" + clientPorts[i], ClientBase.CONNECTION_TIMEOUT, watch);
watch.waitForConnected(ClientBase.CONNECTION_TIMEOUT);
Stat val = zk.exists("/zk" + leader, false);
assertNotNull(val, "Data inconsistency detected! " + "Server " + i + " should have a view of /zk" + leader + "!");
}
zk.close();
}
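The test calls two helpers, findLeader and findProposalOfType, that are not part of this excerpt. A plausible sketch of what they do, assuming Leader.Proposal exposes its originating Request via the public request field and that the TxnHeader carries the op type (assumptions, not verified from this excerpt):
// Hedged sketch of the helpers used above; field and accessor names are assumptions.
private int findLeader(MainThread[] mt) {
    for (int i = 0; i < mt.length; i++) {
        if (mt[i].main.quorumPeer.leader != null) {
            return i;
        }
    }
    return -1;
}

private Proposal findProposalOfType(Map<Long, Proposal> proposals, int type) {
    for (Proposal proposal : proposals.values()) {
        // Assumes the Proposal keeps the original Request, whose TxnHeader holds the op type.
        if (proposal.request != null && proposal.request.getHdr() != null
                && proposal.request.getHdr().getType() == type) {
            return proposal;
        }
    }
    return null;
}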
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class LearnerHandlerTest method createProposal.
Proposal createProposal(long zxid) {
Proposal p = new Proposal();
p.packet = new QuorumPacket();
p.packet.setZxid(zxid);
p.packet.setType(Leader.PROPOSAL);
return p;
}
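A factory like this is typically used to build a run of consecutive proposals when exercising sync-queueing logic (for example the queueCommittedProposals path shown further below). A small hedged usage sketch:
// Hedged usage sketch: five consecutive PROPOSAL packets in epoch 1 (zxid = epoch << 32 | counter).
List<Proposal> proposals = new ArrayList<>();
for (long zxid = 0x100000001L; zxid <= 0x100000005L; zxid++) {
    proposals.add(createProposal(zxid));
}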
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class GetProposalFromTxnTest method testGetProposalFromTxn.
/**
* Test loading proposals from the txnlog.
*
* @throws Exception
* if the test fails
*/
@Test
public void testGetProposalFromTxn() throws Exception {
File tmpDir = ClientBase.createTmpDir();
ClientBase.setupTestEnv();
ZooKeeperServer zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
SyncRequestProcessor.setSnapCount(100);
final int PORT = Integer.parseInt(HOSTPORT.split(":")[1]);
ServerCnxnFactory f = ServerCnxnFactory.createFactory(PORT, -1);
f.startup(zks);
assertTrue(ClientBase.waitForServerUp(HOSTPORT, CONNECTION_TIMEOUT), "waiting for server being up ");
ZooKeeper zk = ClientBase.createZKClient(HOSTPORT);
// Generate transactions so we will have some txnlog to read
Long[] zxids = new Long[MSG_COUNT];
try {
String data = "data";
byte[] bytes = data.getBytes();
for (int i = 0; i < MSG_COUNT; i++) {
Stat stat = new Stat();
zk.create("/invalidsnap-" + i, bytes, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
zk.getData("/invalidsnap-" + i, null, stat);
zxids[i] = stat.getCzxid();
}
} finally {
zk.close();
}
// shutdown and start zookeeper again
f.shutdown();
zks.shutdown();
assertTrue(ClientBase.waitForServerDown(HOSTPORT, CONNECTION_TIMEOUT), "waiting for server to shutdown");
zks = new ZooKeeperServer(tmpDir, tmpDir, 3000);
zks.startdata();
ZKDatabase db = zks.getZKDatabase();
// Set sizeLimit to a very high number so we can pull all transactions
// from the txnlog
Iterator<Proposal> itr = db.getProposalsFromTxnLog(zxids[0], 10000000);
int createCount = 0;
ArrayList<Long> retrievedZxids = new ArrayList<Long>(MSG_COUNT);
// Get zxid of create requests
while (itr.hasNext()) {
Proposal proposal = itr.next();
TxnLogEntry logEntry = SerializeUtils.deserializeTxn(proposal.packet.getData());
TxnHeader hdr = logEntry.getHeader();
Record rec = logEntry.getTxn();
if (hdr.getType() == OpCode.create) {
retrievedZxids.add(hdr.getZxid());
createCount++;
}
}
// All zxids should match what we created
assertTrue(Arrays.equals(zxids, retrievedZxids.toArray(new Long[0])), "Zxid mismatch");
// There should be MSG_COUNT create requests
assertTrue((createCount == MSG_COUNT), "create proposal count == " + MSG_COUNT);
// We are requesting the second half of the transactions; the required txnlog size should exceed
// the threshold derived from ZKDatabase.snapshotSizeFactor
db.setSnapshotSizeFactor(0.33);
long sizeLimit = db.calculateTxnLogSizeLimit();
itr = db.getProposalsFromTxnLog(zxids[MSG_COUNT / 2], sizeLimit);
assertFalse((itr.hasNext()), "Expect empty proposal");
f.shutdown();
zks.shutdown();
}
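The final assertion hinges on how ZKDatabase derives the txnlog size limit. Assuming calculateTxnLogSizeLimit is roughly the size of the most recent snapshot multiplied by snapshotSizeFactor (an assumption; the excerpt does not show that method), the reasoning behind the empty iterator looks approximately like this:
// Hedged sketch of the size-limit reasoning; the numbers are illustrative only.
double snapshotSizeFactor = 0.33;
long latestSnapshotBytes = 3_000_000L;                               // size of the most recent snapshot file
long sizeLimit = (long) (latestSnapshotBytes * snapshotSizeFactor);  // ~1 MB budget for txnlog sync
long txnLogBytesFromMiddle = 1_500_000L;                             // txnlog needed to replay from zxids[MSG_COUNT / 2]
// When the txnlog that would have to be replayed exceeds the budget, getProposalsFromTxnLog
// returns an empty iterator and the leader would fall back to a snapshot sync instead.
boolean useTxnLog = txnLogBytesFromMiddle <= sizeLimit;              // false in this scenario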
use of org.apache.zookeeper.server.quorum.Leader.Proposal in project zookeeper by apache.
the class LearnerHandler method syncFollower.
/**
* Determine if we need to sync with the follower using DIFF/TRUNC/SNAP,
* and set up the follower to receive packets from the commit processor.
*
* @param peerLastZxid
* @param learnerMaster
* @return true if snapshot transfer is needed.
*/
boolean syncFollower(long peerLastZxid, LearnerMaster learnerMaster) {
/*
* When leader election is completed, the leader will set its
* lastProcessedZxid to (epoch << 32). There will be no txn associated
* with this zxid.
*
* The learner will set its lastProcessedZxid to the same value if
* it gets DIFF or SNAP from the learnerMaster. If the same learner comes
* back to sync with the learnerMaster using this zxid, we will never find this
* zxid in our history. In this case, we will ignore the TRUNC logic and
* always send DIFF if we have old enough history.
*/
boolean isPeerNewEpochZxid = (peerLastZxid & 0xffffffffL) == 0;
// Keep track of the latest zxid which has already been queued
long currentZxid = peerLastZxid;
boolean needSnap = true;
ZKDatabase db = learnerMaster.getZKDatabase();
boolean txnLogSyncEnabled = db.isTxnLogSyncEnabled();
ReentrantReadWriteLock lock = db.getLogLock();
ReadLock rl = lock.readLock();
try {
rl.lock();
long maxCommittedLog = db.getmaxCommittedLog();
long minCommittedLog = db.getminCommittedLog();
long lastProcessedZxid = db.getDataTreeLastProcessedZxid();
LOG.info("Synchronizing with Learner sid: {} maxCommittedLog=0x{}" + " minCommittedLog=0x{} lastProcessedZxid=0x{}" + " peerLastZxid=0x{}", getSid(), Long.toHexString(maxCommittedLog), Long.toHexString(minCommittedLog), Long.toHexString(lastProcessedZxid), Long.toHexString(peerLastZxid));
if (db.getCommittedLog().isEmpty()) {
/*
* It is possible that committedLog is empty. In that case,
* setting these values to the latest txn in the learnerMaster db
* reduces the number of cases we need to handle.
*
* Here is how each case is handled by the if block below:
* 1. lastProcessedZxid == peerLastZxid -> handled by (2)
* 2. lastProcessedZxid < peerLastZxid -> handled by (3)
* 3. lastProcessedZxid > peerLastZxid -> handled by (5)
*/
minCommittedLog = lastProcessedZxid;
maxCommittedLog = lastProcessedZxid;
}
if (forceSnapSync) {
// Force learnerMaster to use snapshot to sync with follower
LOG.warn("Forcing snapshot sync - should not see this in production");
} else if (lastProcessedZxid == peerLastZxid) {
// Follower is already in sync with us, send empty diff
LOG.info("Sending DIFF zxid=0x{} for peer sid: {}", Long.toHexString(peerLastZxid), getSid());
queueOpPacket(Leader.DIFF, peerLastZxid);
needOpPacket = false;
needSnap = false;
} else if (peerLastZxid > maxCommittedLog && !isPeerNewEpochZxid) {
// Newer than committedLog, send trunc and done
LOG.debug("Sending TRUNC to follower zxidToSend=0x{} for peer sid:{}", Long.toHexString(maxCommittedLog), getSid());
queueOpPacket(Leader.TRUNC, maxCommittedLog);
currentZxid = maxCommittedLog;
needOpPacket = false;
needSnap = false;
} else if ((maxCommittedLog >= peerLastZxid) && (minCommittedLog <= peerLastZxid)) {
// Follower is within commitLog range
LOG.info("Using committedLog for peer sid: {}", getSid());
Iterator<Proposal> itr = db.getCommittedLog().iterator();
currentZxid = queueCommittedProposals(itr, peerLastZxid, null, maxCommittedLog);
needSnap = false;
} else if (peerLastZxid < minCommittedLog && txnLogSyncEnabled) {
// Use txnlog and committedLog to sync
// Calculate sizeLimit that we allow to retrieve txnlog from disk
long sizeLimit = db.calculateTxnLogSizeLimit();
// This method can return an empty iterator if the requested zxid
// is older than what is available in the on-disk txnlog
Iterator<Proposal> txnLogItr = db.getProposalsFromTxnLog(peerLastZxid, sizeLimit);
if (txnLogItr.hasNext()) {
LOG.info("Use txnlog and committedLog for peer sid: {}", getSid());
currentZxid = queueCommittedProposals(txnLogItr, peerLastZxid, minCommittedLog, maxCommittedLog);
if (currentZxid < minCommittedLog) {
LOG.info("Detected gap between end of txnlog: 0x{} and start of committedLog: 0x{}", Long.toHexString(currentZxid), Long.toHexString(minCommittedLog));
currentZxid = peerLastZxid;
// Clear out currently queued requests and revert
// to sending a snapshot.
queuedPackets.clear();
needOpPacket = true;
} else {
LOG.debug("Queueing committedLog 0x{}", Long.toHexString(currentZxid));
Iterator<Proposal> committedLogItr = db.getCommittedLog().iterator();
currentZxid = queueCommittedProposals(committedLogItr, currentZxid, null, maxCommittedLog);
needSnap = false;
}
}
// closing the resources
if (txnLogItr instanceof TxnLogProposalIterator) {
TxnLogProposalIterator txnProposalItr = (TxnLogProposalIterator) txnLogItr;
txnProposalItr.close();
}
} else {
LOG.warn("Unhandled scenario for peer sid: {} maxCommittedLog=0x{}" + " minCommittedLog=0x{} lastProcessedZxid=0x{}" + " peerLastZxid=0x{} txnLogSyncEnabled={}", getSid(), Long.toHexString(maxCommittedLog), Long.toHexString(minCommittedLog), Long.toHexString(lastProcessedZxid), Long.toHexString(peerLastZxid), txnLogSyncEnabled);
}
if (needSnap) {
currentZxid = db.getDataTreeLastProcessedZxid();
}
LOG.debug("Start forwarding 0x{} for peer sid: {}", Long.toHexString(currentZxid), getSid());
leaderLastZxid = learnerMaster.startForwarding(this, currentZxid);
} finally {
rl.unlock();
}
if (needOpPacket && !needSnap) {
// This should never happen, but we should fall back to sending
// snapshot just in case.
LOG.error("Unhandled scenario for peer sid: {} fall back to use snapshot", getSid());
needSnap = true;
}
return needSnap;
}
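To summarize the branching above, here is a minimal, self-contained simplification of the DIFF/TRUNC/SNAP decision (for illustration only; it is not the real LearnerHandler API and omits forceSnapSync, the txnlog/committedLog gap fallback, and the queueing itself):
// Hedged simplification of syncFollower's decision; parameter names are assumptions.
enum SyncMode { DIFF, TRUNC, SNAP }

SyncMode chooseSync(long peerLastZxid, long lastProcessedZxid,
                    long minCommittedLog, long maxCommittedLog,
                    boolean txnLogSyncEnabled, boolean txnLogCoversPeerZxid) {
    boolean isPeerNewEpochZxid = (peerLastZxid & 0xffffffffL) == 0;
    if (peerLastZxid == lastProcessedZxid) {
        return SyncMode.DIFF;   // already in sync: send an empty DIFF
    } else if (peerLastZxid > maxCommittedLog && !isPeerNewEpochZxid) {
        return SyncMode.TRUNC;  // peer is ahead: truncate it back to maxCommittedLog
    } else if (minCommittedLog <= peerLastZxid && peerLastZxid <= maxCommittedLog) {
        return SyncMode.DIFF;   // replay proposals from the in-memory committedLog
    } else if (peerLastZxid < minCommittedLog && txnLogSyncEnabled && txnLogCoversPeerZxid) {
        return SyncMode.DIFF;   // replay from the on-disk txnlog, then the committedLog
    }
    return SyncMode.SNAP;       // anything else falls back to a full snapshot
}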