Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project lucene-solr by apache.
Class LeaderElector, method joinElection:
/**
* Begin participating in the election process. Gets a new sequential number
* and begins watching the node with the sequence number before it, unless it
* is the lowest number, in which case it initiates the leader process. If the
* node that is watched goes down, check if we are the new lowest node, else
* watch the next lowest numbered node.
*
* @return sequential node number
*/
public int joinElection(ElectionContext context, boolean replacement, boolean joinAtHead) throws KeeperException, InterruptedException, IOException {
  context.joinedElectionFired();
  final String shardsElectZkPath = context.electionPath + LeaderElector.ELECTION_NODE;
  long sessionId = zkClient.getSolrZooKeeper().getSessionId();
  String id = sessionId + "-" + context.id;
  String leaderSeqPath = null;
  boolean cont = true;
  int tries = 0;
  while (cont) {
    try {
      if (joinAtHead) {
        log.debug("Node {} trying to join election at the head", id);
        List<String> nodes = OverseerTaskProcessor.getSortedElectionNodes(zkClient, shardsElectZkPath);
        if (nodes.size() < 2) {
          leaderSeqPath = zkClient.create(shardsElectZkPath + "/" + id + "-n_", null, CreateMode.EPHEMERAL_SEQUENTIAL, false);
        } else {
          String firstInLine = nodes.get(1);
          log.debug("The current head: {}", firstInLine);
          Matcher m = LEADER_SEQ.matcher(firstInLine);
          if (!m.matches()) {
            throw new IllegalStateException("Could not find regex match in:" + firstInLine);
          }
          leaderSeqPath = shardsElectZkPath + "/" + id + "-n_" + m.group(1);
          zkClient.create(leaderSeqPath, null, CreateMode.EPHEMERAL, false);
        }
      } else {
        leaderSeqPath = zkClient.create(shardsElectZkPath + "/" + id + "-n_", null, CreateMode.EPHEMERAL_SEQUENTIAL, false);
      }
      log.debug("Joined leadership election with path: {}", leaderSeqPath);
      context.leaderSeqPath = leaderSeqPath;
      cont = false;
    } catch (ConnectionLossException e) {
      // we don't know if we made our node or not...
      List<String> entries = zkClient.getChildren(shardsElectZkPath, null, true);
      boolean foundId = false;
      for (String entry : entries) {
        String nodeId = getNodeId(entry);
        if (id.equals(nodeId)) {
          // we did create our node...
          foundId = true;
          break;
        }
      }
      if (!foundId) {
        cont = true;
        if (tries++ > 20) {
          throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
        }
        try {
          Thread.sleep(50);
        } catch (InterruptedException e2) {
          Thread.currentThread().interrupt();
        }
      }
    } catch (KeeperException.NoNodeException e) {
      // the parent election node may not exist yet; someone else may still
      // be working on it, so let's try again
      if (tries++ > 20) {
        context = null;
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
      }
      cont = true;
      try {
        Thread.sleep(50);
      } catch (InterruptedException e2) {
        Thread.currentThread().interrupt();
      }
    }
  }
  checkIfIamLeader(context, replacement);
  return getSeq(context.leaderSeqPath);
}
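Because a create that dies mid-flight may or may not have reached the server, the ConnectionLossException handler above lists the election children and looks for its own id before deciding whether to retry. A minimal, self-contained sketch of that disambiguation pattern against the plain ZooKeeper API (the path layout and the 20-try limit mirror the snippet, but the helper name and the startsWith check are illustrative, not Solr code) might look like:

import org.apache.zookeeper.CreateMode;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.ZooDefs;
import org.apache.zookeeper.ZooKeeper;

class ElectionJoin {
  /** Creates an ephemeral-sequential election node, resolving the "did my create land?" ambiguity. */
  static String createElectionNode(ZooKeeper zk, String electionPath, String id)
      throws KeeperException, InterruptedException {
    for (int tries = 0; tries < 20; tries++) {
      try {
        return zk.create(electionPath + "/" + id + "-n_", null,
            ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL_SEQUENTIAL);
      } catch (ConnectionLossException e) {
        // The create may have succeeded before the connection dropped:
        // look for a child carrying our id before trying again.
        for (String child : zk.getChildren(electionPath, false)) {
          if (child.startsWith(id + "-n_")) {
            return electionPath + "/" + child;
          }
        }
        Thread.sleep(50);
      }
    }
    throw new IllegalStateException("Could not join election under " + electionPath);
  }
}

The retry is only safe because the presence check makes the create effectively idempotent: without it, a lost reply could leave two election nodes for the same participant.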
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project lucene-solr by apache.
Class ZkController, method getLeaderInitiatedRecoveryStateObject:
public Map<String, Object> getLeaderInitiatedRecoveryStateObject(String collection, String shardId, String coreNodeName) {
  if (collection == null || shardId == null || coreNodeName == null)
    // if we don't have complete data about a core in cloud mode, return null
    return null;
  String znodePath = getLeaderInitiatedRecoveryZnodePath(collection, shardId, coreNodeName);
  byte[] stateData = null;
  try {
    stateData = zkClient.getData(znodePath, null, new Stat(), false);
  } catch (NoNodeException ignoreMe) {
    // safe to ignore as this znode will only exist if the leader initiated recovery
  } catch (ConnectionLossException | SessionExpiredException cle) {
    // sort of safe to ignore ??? Usually these are seen when the core is going down
    // or there are bigger issues to deal with than reading this znode
    log.warn("Unable to read " + znodePath + " due to: " + cle);
  } catch (Exception exc) {
    log.error("Failed to read data from znode " + znodePath + " due to: " + exc);
    if (exc instanceof SolrException) {
      throw (SolrException) exc;
    } else {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Failed to read data from znodePath: " + znodePath, exc);
    }
  }
  Map<String, Object> stateObj = null;
  if (stateData != null && stateData.length > 0) {
    // TODO: Remove later ... this is for upgrading from 4.8.x to 4.10.3 (see: SOLR-6732)
    if (stateData[0] == (byte) '{') {
      Object parsedJson = Utils.fromJSON(stateData);
      if (parsedJson instanceof Map) {
        stateObj = (Map<String, Object>) parsedJson;
      } else {
        throw new SolrException(ErrorCode.SERVER_ERROR, "Leader-initiated recovery state data is invalid! " + parsedJson);
      }
    } else {
      // old format still in ZK
      stateObj = Utils.makeMap("state", new String(stateData, StandardCharsets.UTF_8));
    }
  }
  return stateObj;
}
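The error handling above sorts failures into three buckets: NoNodeException means no leader-initiated recovery state was ever written, ConnectionLossException and SessionExpiredException are treated as transient and only logged, and anything else is rethrown as a SolrException. A hedged, Solr-free sketch of that tolerant-read shape against the raw ZooKeeper client (the helper name and signature are illustrative) could be:

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.KeeperException.SessionExpiredException;
import org.apache.zookeeper.ZooKeeper;
import org.apache.zookeeper.data.Stat;

class TolerantRead {
  /** Returns the znode's data, or null when the node is missing or ZK is briefly unreachable. */
  static byte[] readOrNull(ZooKeeper zk, String path) throws KeeperException, InterruptedException {
    try {
      return zk.getData(path, false, new Stat());
    } catch (NoNodeException e) {
      return null;      // absence is an expected, non-error outcome
    } catch (ConnectionLossException | SessionExpiredException e) {
      // transient connectivity trouble: report "no data" rather than fail the caller
      return null;
    }
  }
}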
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project commons by twitter.
Class ZooKeeperClientTest, method testGet:
@Test
public void testGet() throws Exception {
  final ZooKeeperClient zkClient = createZkClient();
  shutdownNetwork();
  try {
    zkClient.get(Amount.of(50L, Time.MILLISECONDS));
    fail("Expected client connection to timeout while network down");
  } catch (TimeoutException e) {
    assertTrue(zkClient.isClosed());
  }
  assertNull(zkClient.getZooKeeperClientForTests());
  final CountDownLatch blockingGetComplete = new CountDownLatch(1);
  final AtomicReference<ZooKeeper> client = new AtomicReference<ZooKeeper>();
  new Thread(new Runnable() {
    @Override
    public void run() {
      try {
        client.set(zkClient.get());
      } catch (ZooKeeperConnectionException e) {
        throw new RuntimeException(e);
      } catch (InterruptedException e) {
        throw new RuntimeException(e);
      } finally {
        blockingGetComplete.countDown();
      }
    }
  }).start();
  restartNetwork();
  // Hung blocking connects should succeed when server connection comes up
  blockingGetComplete.await();
  assertNotNull(client.get());
  // New connections should succeed now that network is back up
  long sessionId = zkClient.get().getSessionId();
  // While connected the same client should be reused (no new connections while healthy)
  assertSame(client.get(), zkClient.get());
  shutdownNetwork();
  // Our client doesn't know the network is down yet so we should be able to get()
  ZooKeeper zooKeeper = zkClient.get();
  try {
    zooKeeper.exists("/", false);
    fail("Expected client operation to fail while network down");
  } catch (ConnectionLossException e) {
    // expected
  }
  restartNetwork();
  assertEquals("Expected connection to be re-established with existing session", sessionId, zkClient.get().getSessionId());
}
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project helios by spotify.
Class TaskHistoryWriterTest, method testZooKeeperErrorDoesntLoseItemsReally:
@Test
public void testZooKeeperErrorDoesntLoseItemsReally() throws Exception {
  final ZooKeeperClient mockClient = mock(ZooKeeperClient.class, delegatesTo(client));
  final String path = Paths.historyJobHostEventsTimestamp(JOB_ID, HOSTNAME, TIMESTAMP);
  // make save operations fail
  final AtomicBoolean throwExceptionOnCreateAndSet = new AtomicBoolean(true);
  final KeeperException exc = new ConnectionLossException();
  doAnswer(new Answer<Void>() {
    @Override
    public Void answer(InvocationOnMock invocation) throws Throwable {
      if (throwExceptionOnCreateAndSet.get()) {
        throw exc;
      } else {
        client.createAndSetData((String) invocation.getArguments()[0], (byte[]) invocation.getArguments()[1]);
        return null;
      }
    }
  }).when(mockClient).createAndSetData(path, TASK_STATUS.toJsonBytes());
  makeWriter(mockClient);
  writer.saveHistoryItem(TASK_STATUS, TIMESTAMP);
  // wait up to 10s for it to fail twice -- and make sure I mocked it correctly.
  verify(mockClient, timeout(10000).atLeast(2)).createAndSetData(path, TASK_STATUS.toJsonBytes());
  // now make the client work
  throwExceptionOnCreateAndSet.set(false);
  awaitHistoryItems();
}
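The test above combines a delegating mock, a Mockito Answer, and an AtomicBoolean so that createAndSetData first throws ConnectionLossException and later forwards to the real client. When delegation is not required, Mockito's consecutive stubbing can express "fail a couple of times, then succeed" more compactly; a hedged sketch against a hypothetical ZkWriter interface (not a Helios type; assumes Mockito 2+ for ArgumentMatchers) looks like this:

import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.mock;

import org.apache.zookeeper.KeeperException.ConnectionLossException;

class FlakyZkStubbing {
  /** Hypothetical ZooKeeper-backed writer, standing in for a real client. */
  interface ZkWriter {
    void createAndSetData(String path, byte[] data) throws Exception;
  }

  /** Stubs a writer whose first two calls lose the connection and whose later calls succeed. */
  static ZkWriter failTwiceThenSucceed() throws Exception {
    ZkWriter writer = mock(ZkWriter.class);
    doThrow(new ConnectionLossException())
        .doThrow(new ConnectionLossException())
        .doNothing()
        .when(writer).createAndSetData(anyString(), any(byte[].class));
    return writer;
  }
}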
Use of org.apache.zookeeper.KeeperException.ConnectionLossException in project zookeeper by apache.
Class InstanceManager, method resetStatus:
public void resetStatus(String name) throws InterruptedException, KeeperException {
  KeeperException lastException = null;
  for (int i = 0; i < maxTries; i++) {
    try {
      zk.delete(reportsNode + '/' + name, -1);
      lastException = null;
      break;
    } catch (ConnectionLossException e) {
      lastException = e;
    } catch (NoNodeException e) {
      // great this is what we want!
    }
  }
  if (lastException != null) {
    throw lastException;
  }
}
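One subtlety in the loop above: if the delete reaches the server just as the connection drops, the retry sees NoNodeException, but lastException is never cleared, so the stale ConnectionLossException is still thrown after maxTries attempts. A hedged sketch of an idempotent delete helper that treats that case as success (the class and method names are illustrative, not part of InstanceManager, and maxTries > 0 is assumed) might be:

import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.KeeperException.ConnectionLossException;
import org.apache.zookeeper.KeeperException.NoNodeException;
import org.apache.zookeeper.ZooKeeper;

class IdempotentDelete {
  /** Deletes a znode, retrying on connection loss and treating "already gone" as success. */
  static void deleteIfExists(ZooKeeper zk, String path, int maxTries)
      throws KeeperException, InterruptedException {
    ConnectionLossException last = null;
    for (int i = 0; i < maxTries; i++) {
      try {
        zk.delete(path, -1);
        return;                      // deleted on this attempt
      } catch (NoNodeException e) {
        return;                      // node absent: either never existed or a previous attempt won
      } catch (ConnectionLossException e) {
        last = e;                    // transient: remember and retry
      }
    }
    throw last;                      // every attempt lost the connection
  }
}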