use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.
the class ChaosMonkey method getRandomJetty.
/**
 * Picks a jetty from the given slice for the monkey to act on.
 *
 * <p>Returns {@code null} whenever no suitable victim is found: fewer than two active
 * nodes (as reported by {@code checkIfKillIsLegal}), fewer than two jetties outside the
 * dead pool, no leader jetty registered for the slice, no PULL replica found when indexers
 * must be spared, the leader cannot be looked up, the selected jetty has no usable core,
 * the selected jetty is a leader while leaders are off limits, or the selected jetty
 * reports a local port of -1.
 *
 * @param slice name of the shard to choose a jetty from
 * @param aggressivelyKillLeaders when {@code true}, leaders may be chosen, and six of the
 *     ten possible random draws go straight for the shard leader (provided an indexer may
 *     be killed); when {@code false}, a randomly chosen jetty that turns out to be the
 *     leader is rejected
 * @return the chosen jetty, or {@code null} if nothing can be killed right now
 * @throws KeeperException declared by this method; presumably propagated from the
 *     cluster-state helpers it calls
 * @throws InterruptedException declared by this method; presumably propagated from the
 *     cluster-state helpers it calls
 */
public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLeaders) throws KeeperException, InterruptedException {
  int numActive = checkIfKillIsLegal(slice, 0);

  // TODO: stale state makes this a tough call
  if (numActive < 2) {
    // we cannot kill anyone
    monkeyLog("only one active node in shard - monkey cannot kill :(");
    return null;
  }

  // let's check the deadpool count
  int numRunning = 0;
  for (CloudJettyRunner candidate : shardToJetty.get(slice)) {
    if (!deadPool.contains(candidate)) {
      numRunning++;
    }
  }

  if (numRunning < 2) {
    // we cannot kill anyone
    monkeyLog("only one active node in shard - monkey cannot kill :(");
    return null;
  }

  boolean canKillIndexer = canKillIndexer(slice);
  if (!canKillIndexer) {
    monkeyLog("Number of indexer nodes (nrt or tlog replicas) is not enough to kill one of them, Will only choose a pull replica to kill");
  }

  int chance = chaosRandom.nextInt(10);
  CloudJettyRunner cjetty = null;
  if (chance <= 5 && aggressivelyKillLeaders && canKillIndexer) {
    // if killLeader, really aggressively go after leaders
    cjetty = shardToLeaderJetty.get(slice);
    if (cjetty == null) {
      // No leader jetty is registered for this shard; without this guard the
      // port check further down would fail with a NullPointerException.
      monkeyLog("abort! no leader jetty known for shard " + slice);
      return null;
    }
  } else {
    List<CloudJettyRunner> jetties = shardToJetty.get(slice);
    // get random node; when indexers must be spared, keep drawing until a PULL
    // replica comes up or the attempt budget is exhausted
    int attempt = 0;
    while (true) {
      attempt++;
      int index = chaosRandom.nextInt(jetties.size());
      cjetty = jetties.get(index);
      if (canKillIndexer || getTypeForJetty(slice, cjetty) == Replica.Type.PULL) {
        break;
      } else if (attempt > 20) {
        monkeyLog("Can't kill indexer nodes (nrt or tlog replicas) and couldn't find a random pull node after 20 attempts - monkey cannot kill :(");
        return null;
      }
    }

    ZkNodeProps leader = null;
    try {
      leader = zkStateReader.getLeaderRetry(collection, slice);
    } catch (Throwable t) {
      if (t instanceof InterruptedException) {
        // keep the best-effort "return null" but do not lose the interrupt request
        Thread.currentThread().interrupt();
      }
      log.error("Could not get leader", t);
      return null;
    }

    // cluster state can be stale - also go by our 'near real-time' is leader prop
    boolean rtIsLeader;
    CoreContainer cc = cjetty.jetty.getCoreContainer();
    if (cc != null) {
      // the core is looked up by the leader's core name in the selected jetty's container
      try (SolrCore core = cc.getCore(leader.getStr(ZkStateReader.CORE_NAME_PROP))) {
        if (core == null) {
          monkeyLog("selected jetty not running correctly - skip");
          return null;
        }
        rtIsLeader = core.getCoreDescriptor().getCloudDescriptor().isLeader();
      }
    } else {
      return null;
    }

    boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(cjetty.nodeName) || rtIsLeader;
    if (!aggressivelyKillLeaders && isLeader) {
      // we don't kill leaders...
      monkeyLog("abort! I don't kill leaders");
      return null;
    }
  }

  if (cjetty.jetty.getLocalPort() == -1) {
    // we can't kill the dead
    monkeyLog("abort! This guy is already dead");
    return null;
  }

  monkeyLog("chose a victim! " + cjetty.jetty.getLocalPort());
  return cjetty;
}
use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.
the class ChaosMonkey method canKillIndexer.
/**
 * Reports whether the given shard currently has more than one indexing replica that could
 * be counted as healthy.
 *
 * <p>A jetty of the shard is counted when its jetty is running, its replica state is
 * ACTIVE, its replica type is TLOG or NRT, and its node name is among the live nodes of
 * the current cluster state.
 *
 * @param sliceName name of the shard to inspect
 * @return {@code true} if more than one such replica was found
 * @throws RuntimeException if a jetty's core node name has no entry in the slice's replica map
 * @throws KeeperException declared by this method; presumably raised by the forced
 *     cluster-state refresh
 * @throws InterruptedException declared by this method; presumably raised by the forced
 *     cluster-state refresh
 */
private boolean canKillIndexer(String sliceName) throws KeeperException, InterruptedException {
  // get latest cloud state once up front: a single refresh avoids one forced update per
  // jetty and lets every replica be judged against the same collection snapshot
  zkStateReader.forceUpdateCollection(collection);
  DocCollection docCollection = zkStateReader.getClusterState().getCollection(collection);
  Slice slice = docCollection.getSlice(sliceName);

  int numIndexersFoundInShard = 0;
  for (CloudJettyRunner cloudJetty : shardToJetty.get(sliceName)) {
    ZkNodeProps props = slice.getReplicasMap().get(cloudJetty.coreNodeName);
    if (props == null) {
      throw new RuntimeException("core node name " + cloudJetty.coreNodeName + " not found in " + slice.getReplicasMap().keySet());
    }

    final Replica.State state = Replica.State.getState(props.getStr(ZkStateReader.STATE_PROP));
    final Replica.Type replicaType = Replica.Type.valueOf(props.getStr(ZkStateReader.REPLICA_TYPE));
    final String nodeName = props.getStr(ZkStateReader.NODE_NAME_PROP);

    if (cloudJetty.jetty.isRunning()
        && state == Replica.State.ACTIVE
        && (replicaType == Replica.Type.TLOG || replicaType == Replica.Type.NRT)
        && zkStateReader.getClusterState().liveNodesContain(nodeName)) {
      numIndexersFoundInShard++;
    }
  }
  // killing one indexer is only acceptable if at least one other remains
  return numIndexersFoundInShard > 1;
}
use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.
the class ChaosMonkey method randomConnectionLoss.
/**
 * Picks a jetty from a random slice and, if one was selected, passes it to
 * {@code causeConnectionLoss} and increments the {@code connloss} counter.
 *
 * <p>Does nothing beyond logging when {@code getRandomJetty} yields no victim.
 *
 * @throws KeeperException declared by this method; propagated from jetty selection
 * @throws InterruptedException declared by this method; propagated from jetty selection
 */
public void randomConnectionLoss() throws KeeperException, InterruptedException {
  monkeyLog("Will cause connection loss!");

  final CloudJettyRunner victim = getRandomJetty(getRandomSlice(), aggressivelyKillLeaders);
  if (victim == null) {
    // nothing eligible this round
    return;
  }

  causeConnectionLoss(victim.jetty);
  connloss.incrementAndGet();
}
use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.
the class ChaosMonkey method expireRandomSession.
/**
 * Picks a jetty from a random slice and, if one was selected, passes it to
 * {@code expireSession} and increments the {@code expires} counter.
 *
 * <p>Does nothing when {@code getRandomJetty} yields no victim.
 *
 * @throws KeeperException declared by this method; propagated from jetty selection
 * @throws InterruptedException declared by this method; propagated from jetty selection
 */
public void expireRandomSession() throws KeeperException, InterruptedException {
  final CloudJettyRunner victim = getRandomJetty(getRandomSlice(), aggressivelyKillLeaders);
  if (victim == null) {
    // nothing eligible this round
    return;
  }

  expireSession(victim.jetty);
  expires.incrementAndGet();
}
use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.
the class ChaosMonkey method stopShard.
/**
 * Stops the jetty found at the given position in the slice's jetty list.
 *
 * @param slice name of the shard whose jetty list is consulted
 * @param index position of the jetty within that list
 * @return the jetty that was handed to {@code stopJetty}
 * @throws Exception declared by this method; presumably raised while stopping the jetty
 */
public CloudJettyRunner stopShard(String slice, int index) throws Exception {
  final List<CloudJettyRunner> sliceJetties = shardToJetty.get(slice);
  final CloudJettyRunner target = sliceJetties.get(index);
  stopJetty(target);
  return target;
}
Aggregations