Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project helios by spotify: the class TaskHistoryWriter, method run().
@Override
public void run() {
  while (true) {
    final TaskStatusEvent item = getNext();
    if (item == null) {
      return;
    }
    final JobId jobId = item.getStatus().getJob().getId();
    final String historyPath = Paths.historyJobHostEventsTimestamp(jobId, hostname, item.getTimestamp());
    try {
      log.debug("writing queued item to zookeeper {} {}", item.getStatus().getJob().getId(), item.getTimestamp());
      client.ensurePath(historyPath, true);
      client.createAndSetData(historyPath, item.getStatus().toJsonBytes());
      // See if we have retained too many events and need to trim
      final List<String> events = client.getChildren(Paths.historyJobHostEvents(jobId, hostname));
      if (events.size() > MAX_NUMBER_STATUS_EVENTS_TO_RETAIN) {
        trimStatusEvents(events, jobId);
      }
    } catch (NodeExistsException e) {
      // Ahh, the two generals problem... We handle it by doing nothing, since the thing
      // we wanted in is already in.
      log.debug("item we wanted in is already there");
    } catch (ConnectionLossException e) {
      log.warn("Connection lost while putting item into zookeeper, will retry");
      putBack(item);
      break;
    } catch (KeeperException e) {
      log.error("Error putting item into zookeeper, will retry", e);
      putBack(item);
      break;
    }
  }
}
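The snippet treats ConnectionLossException as retriable (the item is put back on the queue and the loop exits so a later run can retry) and NodeExistsException as success. A minimal, self-contained sketch of the same pattern against the plain ZooKeeper client, independent of Helios (the class and method names here are illustrative, not Helios APIs):

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.KeeperException.NodeExistsException;
import org.apache.zookeeper.ZooDefs.Ids;
import org.apache.zookeeper.ZooKeeper;

public final class RetryingEventWriter {

  private final ZooKeeper zk;

  public RetryingEventWriter(ZooKeeper zk) {
    this.zk = zk;
  }

  /** Returns true if the node is known to exist afterwards, false if the caller should re-queue and retry. */
  public boolean writeOnce(String path, byte[] data) throws KeeperException, InterruptedException {
    try {
      zk.create(path, data, Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
      return true;
    } catch (NodeExistsException e) {
      // A previous, seemingly failed attempt (or another writer) already created the node: treat as success.
      return true;
    } catch (ConnectionLossException e) {
      // The request may or may not have been applied; report failure so the caller re-queues the item.
      return false;
    }
  }
}

Note that on connection loss the create may in fact have been applied on the server, which is exactly why a NodeExistsException on the retry has to be treated as success (the "two generals" comment above).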
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project zookeeper by apache: the class SimpleSysTest, method testSimpleCase().
/**
 * This test checks the following:
 * 1) All clients connect successfully
 * 2) Half of the servers die (assuming odd number) and a write succeeds
 * 3) All servers are restarted and cluster stays alive
 * 4) Clients see a change by the server
 * 5) Clients' ephemeral nodes are cleaned up
 *
 * @throws Exception
 */
@Test
public void testSimpleCase() throws Exception {
    configureServers(serverCount);
    configureClients(clientCount, SimpleClient.class, getHostPort());
    Stat stat = new Stat();
    startServers();
    LOG.debug("Connecting to " + getHostPort());
    ZooKeeper zk = new ZooKeeper(getHostPort(), 15000, this);
    waitForConnect(zk, 10000);
    zk.create("/simpleCase", "orig".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    startClients();
    // Check that all clients connect properly
    for (int i = 0; i < getClientCount(); i++) {
        for (int j = 0; j < maxTries; j++) {
            try {
                byte[] b = zk.getData("/simpleCase/" + i, false, stat);
                Assert.assertEquals("orig", new String(b));
            } catch (NoNodeException e) {
                if (j + 1 == maxTries) {
                    Assert.fail("Max tries exceeded on client " + i);
                }
                Thread.sleep(1000);
            }
        }
    }
    // Stop the first half of the servers, verify that the surviving quorum still accepts a write,
    // bring the stopped servers back, and then bounce the other servers one by one
    for (int i = 0; i < getServerCount(); i++) {
        stopServer(i);
        if (i + 1 > getServerCount() / 2) {
            startServer(i);
        } else if (i + 1 == getServerCount() / 2) {
            Assert.assertTrue("Connection didn't recover", waitForConnect(zk, 10000));
            try {
                zk.setData("/simpleCase", "new".getBytes(), -1);
            } catch (ConnectionLossException e) {
                Assert.assertTrue("Connection didn't recover", waitForConnect(zk, 10000));
                zk.setData("/simpleCase", "new".getBytes(), -1);
            }
            for (int j = 0; j < i; j++) {
                LOG.info("Starting server " + j);
                startServer(j);
            }
        }
    }
    // wait for things to stabilize
    Thread.sleep(100);
    Assert.assertTrue("Servers didn't bounce", waitForConnect(zk, 15000));
    try {
        zk.getData("/simpleCase", false, stat);
    } catch (ConnectionLossException e) {
        Assert.assertTrue("Servers didn't bounce", waitForConnect(zk, 15000));
    }
    // check that the change has propagated to everyone
    for (int i = 0; i < getClientCount(); i++) {
        for (int j = 0; j < maxTries; j++) {
            byte[] data = zk.getData("/simpleCase/" + i, false, stat);
            if (new String(data).equals("new")) {
                break;
            }
            if (j + 1 == maxTries) {
                Assert.fail("max tries exceeded for " + i);
            }
            Thread.sleep(1000);
        }
    }
    // send out the kill signal
    zk.setData("/simpleCase", "die".getBytes(), -1);
    // watch for everyone to die
    for (int i = 0; i < getClientCount(); i++) {
        try {
            for (int j = 0; j < maxTries; j++) {
                zk.getData("/simpleCase/" + i, false, stat);
                if (j + 1 == maxTries) {
                    Assert.fail("max tries exceeded waiting for child " + i + " to die");
                }
                Thread.sleep(200);
            }
        } catch (NoNodeException e) {
            // Great, this is what we were hoping for!
        }
    }
    stopClients();
    stopServers();
}
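The test repeatedly calls waitForConnect(zk, timeout), a helper from its base class that is not shown here. A minimal sketch of what such a helper can look like, assuming the ZooKeeper handle's default watcher forwards connection-state events to it (an illustrative stand-in, not the actual ZooKeeper test base class):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;
import org.apache.zookeeper.Watcher.Event.KeeperState;
import org.apache.zookeeper.ZooKeeper;

final class ConnectionWaiter implements Watcher {

    private volatile CountDownLatch connected = new CountDownLatch(1);

    @Override
    public void process(WatchedEvent event) {
        if (event.getState() == KeeperState.SyncConnected) {
            connected.countDown();
        } else if (event.getState() == KeeperState.Disconnected) {
            // arm a fresh latch so the next waitForConnect call blocks until we reconnect
            connected = new CountDownLatch(1);
        }
    }

    /** Returns true if the session is (or becomes) connected within the timeout. */
    boolean waitForConnect(ZooKeeper zk, long timeoutMs) throws InterruptedException {
        if (zk.getState().isConnected()) {
            return true;
        }
        return connected.await(timeoutMs, TimeUnit.MILLISECONDS);
    }
}

In the test, `this` is passed as the watcher when each ZooKeeper handle is constructed, so connection-state notifications (SyncConnected, Disconnected) reach the test class in the same way.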
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project zookeeper by apache: the class InstanceManager, method getStatus().
public String getStatus(String name, long timeout) throws KeeperException, InterruptedException {
    Stat stat = new Stat();
    byte[] data = null;
    long endTime = Time.currentElapsedTime() + timeout;
    KeeperException lastException = null;
    for (int i = 0; i < maxTries && endTime > Time.currentElapsedTime(); i++) {
        try {
            data = zk.getData(reportsNode + '/' + name, false, stat);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Got Data: " + ((data == null) ? "null" : new String(data)));
            }
            lastException = null;
            break;
        } catch (ConnectionLossException e) {
            lastException = e;
        } catch (NoNodeException e) {
            final Object eventObj = new Object();
            synchronized (eventObj) {
                // wait for the node to appear
                Stat eStat = zk.exists(reportsNode + '/' + name, new Watcher() {
                    public void process(WatchedEvent event) {
                        synchronized (eventObj) {
                            eventObj.notifyAll();
                        }
                    }
                });
                if (eStat == null) {
                    eventObj.wait(endTime - Time.currentElapsedTime());
                }
            }
            lastException = e;
        }
    }
    if (lastException != null) {
        throw lastException;
    }
    return new String(data);
}
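getStatus combines two recovery strategies: a ConnectionLossException is simply remembered and retried on the next loop iteration, while a NoNodeException sets a watch with exists() and blocks on a plain object monitor until the node shows up or the deadline passes. The same wait-for-node idea can be written as a small standalone helper; the sketch below uses a CountDownLatch instead of wait/notify (the class and method names are illustrative, not ZooKeeper API):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

final class ZkWait {

    /** Blocks until {@code path} exists or the timeout elapses; returns true if the node exists. */
    static boolean awaitNode(ZooKeeper zk, String path, long timeoutMs)
            throws KeeperException, InterruptedException {
        final CountDownLatch created = new CountDownLatch(1);
        // exists() registers the watch atomically with the existence check, so a creation that
        // happens between the check and the await still triggers the latch.
        if (zk.exists(path, event -> created.countDown()) != null) {
            return true;
        }
        return created.await(timeoutMs, TimeUnit.MILLISECONDS);
    }
}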
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project zookeeper by apache: the class ObserverMasterTest, method testObserver().
/**
 * This test ensures two things:
 * 1. That Observers can successfully proxy requests to the ensemble.
 * 2. That Observers don't participate in leader elections.
 * The second is tested by constructing an ensemble where a leader would
 * be elected if and only if an Observer voted.
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testObserver(boolean testObserverMaster) throws Exception {
    // We expect two notifications before we want to continue
    latch = new CountDownLatch(2);
    setUp(-1, testObserverMaster);
    q3.start();
    assertTrue(ClientBase.waitForServerUp("127.0.0.1:" + CLIENT_PORT_OBS, CONNECTION_TIMEOUT), "waiting for server 3 being up");
    validateObserverSyncTimeMetrics();
    if (testObserverMaster) {
        int masterPort = q3.getQuorumPeer().observer.getSocket().getPort();
        LOG.info("port {} {}", masterPort, OM_PORT);
        assertEquals(masterPort, OM_PORT, "observer failed to connect to observer master");
    }
    zk = new ZooKeeper("127.0.0.1:" + CLIENT_PORT_OBS, ClientBase.CONNECTION_TIMEOUT, this);
    zk.create("/obstest", "test".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);
    // Assert that commands are getting forwarded correctly
    assertEquals(new String(zk.getData("/obstest", null, null)), "test");
    // Now check that other commands don't blow everything up
    zk.sync("/", null, null);
    zk.setData("/obstest", "test2".getBytes(), -1);
    zk.getChildren("/", false);
    assertEquals(zk.getState(), States.CONNECTED);
    LOG.info("Shutting down server 2");
    // Now kill one of the other real servers
    q2.shutdown();
    assertTrue(ClientBase.waitForServerDown("127.0.0.1:" + CLIENT_PORT_QP2, ClientBase.CONNECTION_TIMEOUT), "Waiting for server 2 to shut down");
    LOG.info("Server 2 down");
    // Now the resulting ensemble shouldn't be quorate
    latch.await();
    assertNotSame(KeeperState.SyncConnected, lastEvent.getState(), "Client is still connected to non-quorate cluster");
    LOG.info("Latch returned");
    try {
assertNotEquals("Shouldn't get a response when cluster not quorate!", "test", new String(zk.getData("/obstest", null, null)));
    } catch (ConnectionLossException c) {
        LOG.info("Connection loss exception caught - ensemble not quorate (this is expected)");
    }
    latch = new CountDownLatch(1);
    LOG.info("Restarting server 2");
    // Bring it back
    // q2 = new MainThread(2, CLIENT_PORT_QP2, quorumCfgSection, extraCfgs);
    q2.start();
    LOG.info("Waiting for server 2 to come up");
    assertTrue(ClientBase.waitForServerUp("127.0.0.1:" + CLIENT_PORT_QP2, CONNECTION_TIMEOUT), "waiting for server 2 being up");
    LOG.info("Server 2 started, waiting for latch");
    latch.await();
    // It's possible our session expired - but this is ok, shows we
    // were able to talk to the ensemble
    assertTrue((KeeperState.SyncConnected == lastEvent.getState() || KeeperState.Expired == lastEvent.getState()), "Client didn't reconnect to quorate ensemble (state was " + lastEvent.getState() + ")");
LOG.info("perform a revalidation test");
int leaderProxyPort = PortAssignment.unique();
int obsProxyPort = PortAssignment.unique();
int leaderPort = q1.getQuorumPeer().leader == null ? CLIENT_PORT_QP2 : CLIENT_PORT_QP1;
PortForwarder leaderPF = new PortForwarder(leaderProxyPort, leaderPort);
latch = new CountDownLatch(1);
ZooKeeper client = new ZooKeeper(String.format("127.0.0.1:%d,127.0.0.1:%d", leaderProxyPort, obsProxyPort), ClientBase.CONNECTION_TIMEOUT, this);
latch.await();
client.create("/revalidtest", "test".getBytes(), Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
assertNotNull(client.exists("/revalidtest", null), "Read-after write failed");
latch = new CountDownLatch(2);
PortForwarder obsPF = new PortForwarder(obsProxyPort, CLIENT_PORT_OBS);
try {
leaderPF.shutdown();
} catch (Exception e) {
// ignore?
}
latch.await();
assertEquals(new String(client.getData("/revalidtest", null, null)), "test");
client.close();
obsPF.shutdown();
shutdown();
}
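Throughout the test, `this` is passed as the Watcher for each ZooKeeper handle, and the `latch` and `lastEvent` fields it waits on are updated in the test class's watcher callback, which is not shown above. A sketch of what such a callback typically looks like (an illustrative stand-in, not the actual ObserverMasterTest code):

import java.util.concurrent.CountDownLatch;
import org.apache.zookeeper.WatchedEvent;
import org.apache.zookeeper.Watcher;

// Hypothetical stand-in for the test class acting as its own default watcher.
class EventCountingWatcher implements Watcher {

    volatile CountDownLatch latch = new CountDownLatch(2);  // reset by the test before each wait
    volatile WatchedEvent lastEvent;

    @Override
    public void process(WatchedEvent event) {
        lastEvent = event;          // assertions later inspect lastEvent.getState()
        CountDownLatch l = latch;
        if (l != null) {
            l.countDown();          // each connection-state notification releases one waiter count
        }
    }
}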