Search in sources :

Example 1 with CloudJettyRunner

use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.

the class ChaosMonkey method getRandomJetty.

/**
 * Selects a jetty from the given slice as the next chaos-monkey victim, or returns
 * {@code null} when no jetty may (or can) be chosen.
 *
 * <p>The selection is abandoned (returning {@code null}) when:
 * <ul>
 *   <li>{@code checkIfKillIsLegal} reports fewer than two nodes for the slice;</li>
 *   <li>fewer than two jetties of the slice are outside the dead pool;</li>
 *   <li>indexer replicas may not be killed and no PULL replica is found after 20 random picks;</li>
 *   <li>the leader cannot be resolved, or no leader jetty is recorded for the slice;</li>
 *   <li>the chosen jetty has no core container or no core with the leader's core name;</li>
 *   <li>the chosen jetty is the leader while {@code aggressivelyKillLeaders} is {@code false};</li>
 *   <li>the chosen jetty reports a local port of {@code -1}.</li>
 * </ul>
 *
 * @param slice name of the slice (shard) to pick a jetty from
 * @param aggressivelyKillLeaders when {@code true}, leaders are eligible and are targeted
 *        directly in roughly 6 out of 10 calls (provided an indexer may be killed)
 * @return the chosen jetty, or {@code null} if nothing should be killed
 * @throws KeeperException propagated from the ZooKeeper-backed helper calls
 * @throws InterruptedException propagated from the ZooKeeper-backed helper calls
 */
public CloudJettyRunner getRandomJetty(String slice, boolean aggressivelyKillLeaders) throws KeeperException, InterruptedException {
    int numActive = checkIfKillIsLegal(slice, 0);
    // TODO: stale state makes this a tough call
    if (numActive < 2) {
        // we cannot kill anyone
        monkeyLog("only one active node in shard - monkey cannot kill :(");
        return null;
    }
    // let's check the deadpool count
    int numRunning = 0;
    for (CloudJettyRunner cjetty : shardToJetty.get(slice)) {
        if (!deadPool.contains(cjetty)) {
            numRunning++;
        }
    }
    if (numRunning < 2) {
        // we cannot kill anyone
        monkeyLog("only one active node in shard - monkey cannot kill :(");
        return null;
    }
    boolean canKillIndexer = canKillIndexer(slice);
    if (!canKillIndexer) {
        monkeyLog("Number of indexer nodes (nrt or tlog replicas) is not enough to kill one of them, Will only choose a pull replica to kill");
    }
    int chance = chaosRandom.nextInt(10);
    CloudJettyRunner cjetty = null;
    if (chance <= 5 && aggressivelyKillLeaders && canKillIndexer) {
        // if killLeader, really aggressively go after leaders
        cjetty = shardToLeaderJetty.get(slice);
        if (cjetty == null) {
            // No leader jetty is recorded for this slice; bail out like every other
            // "cannot pick" path instead of failing with a NullPointerException below.
            monkeyLog("no leader jetty known for " + slice + " - monkey cannot kill :(");
            return null;
        }
    } else {
        List<CloudJettyRunner> jetties = shardToJetty.get(slice);
        // get random node; when indexers are off-limits keep drawing until a PULL replica shows up
        int attempt = 0;
        while (true) {
            attempt++;
            int index = chaosRandom.nextInt(jetties.size());
            cjetty = jetties.get(index);
            if (canKillIndexer || getTypeForJetty(slice, cjetty) == Replica.Type.PULL) {
                break;
            } else if (attempt > 20) {
                monkeyLog("Can't kill indexer nodes (nrt or tlog replicas) and couldn't find a random pull node after 20 attempts - monkey cannot kill :(");
                return null;
            }
        }
        ZkNodeProps leader = null;
        try {
            leader = zkStateReader.getLeaderRetry(collection, slice);
        } catch (Throwable t) {
            log.error("Could not get leader", t);
            return null;
        }
        // cluster state can be stale - also go by our 'near real-time' is leader prop
        boolean rtIsLeader;
        CoreContainer cc = cjetty.jetty.getCoreContainer();
        if (cc != null) {
            // the core is looked up by the leader's core name inside the chosen jetty's container
            try (SolrCore core = cc.getCore(leader.getStr(ZkStateReader.CORE_NAME_PROP))) {
                if (core == null) {
                    monkeyLog("selected jetty not running correctly - skip");
                    return null;
                }
                rtIsLeader = core.getCoreDescriptor().getCloudDescriptor().isLeader();
            }
        } else {
            return null;
        }
        boolean isLeader = leader.getStr(ZkStateReader.NODE_NAME_PROP).equals(cjetty.nodeName) || rtIsLeader;
        if (!aggressivelyKillLeaders && isLeader) {
            // we don't kill leaders...
            monkeyLog("abort! I don't kill leaders");
            return null;
        }
    }
    if (cjetty.jetty.getLocalPort() == -1) {
        // we can't kill the dead
        monkeyLog("abort! This guy is already dead");
        return null;
    }
    monkeyLog("chose a victim! " + cjetty.jetty.getLocalPort());
    return cjetty;
}
Also used : CoreContainer(org.apache.solr.core.CoreContainer) SolrCore(org.apache.solr.core.SolrCore) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) CloudJettyRunner(org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner)

Example 2 with CloudJettyRunner

use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.

the class ChaosMonkey method canKillIndexer.

/**
 * Tells whether the given shard has more than one indexing replica (TLOG or NRT) that is
 * running, ACTIVE in the cluster state, and hosted on a live node.
 *
 * @param sliceName name of the shard to inspect
 * @return {@code true} if at least two such replicas were counted
 * @throws KeeperException propagated from the collection state refresh
 * @throws InterruptedException propagated from the collection state refresh
 * @throws RuntimeException if a jetty's core node name is missing from the shard's replica map
 */
private boolean canKillIndexer(String sliceName) throws KeeperException, InterruptedException {
    int liveIndexers = 0;
    for (CloudJettyRunner runner : shardToJetty.get(sliceName)) {
        // refresh the collection state before looking at each replica
        zkStateReader.forceUpdateCollection(collection);
        Slice shard = zkStateReader.getClusterState().getCollection(collection).getSlice(sliceName);
        ZkNodeProps replicaProps = shard.getReplicasMap().get(runner.coreNodeName);
        if (replicaProps == null) {
            throw new RuntimeException("shard name " + runner.coreNodeName + " not found in " + shard.getReplicasMap().keySet());
        }
        Replica.State replicaState = Replica.State.getState(replicaProps.getStr(ZkStateReader.STATE_PROP));
        Replica.Type kind = Replica.Type.valueOf(replicaProps.getStr(ZkStateReader.REPLICA_TYPE));
        String hostNode = replicaProps.getStr(ZkStateReader.NODE_NAME_PROP);
        boolean indexes = kind == Replica.Type.TLOG || kind == Replica.Type.NRT;
        // skip anything that is not a running, active indexer
        if (!runner.jetty.isRunning() || replicaState != Replica.State.ACTIVE || !indexes) {
            continue;
        }
        if (zkStateReader.getClusterState().liveNodesContain(hostNode)) {
            liveIndexers++;
        }
    }
    return liveIndexers >= 2;
}
Also used : Slice(org.apache.solr.common.cloud.Slice) ZkNodeProps(org.apache.solr.common.cloud.ZkNodeProps) CloudJettyRunner(org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner) DocCollection(org.apache.solr.common.cloud.DocCollection) Type(org.apache.solr.common.cloud.Replica.Type) Replica(org.apache.solr.common.cloud.Replica)

Example 3 with CloudJettyRunner

use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.

the class ChaosMonkey method randomConnectionLoss.

/**
 * Picks a random slice and a jetty from it, then causes a connection loss on that jetty.
 * The {@code connloss} counter is incremented only when a jetty was actually chosen.
 *
 * @throws KeeperException propagated from the jetty selection
 * @throws InterruptedException propagated from the jetty selection
 */
public void randomConnectionLoss() throws KeeperException, InterruptedException {
    monkeyLog("Will cause connection loss!");
    final CloudJettyRunner victim = getRandomJetty(getRandomSlice(), aggressivelyKillLeaders);
    if (victim == null) {
        // nothing eligible this round
        return;
    }
    causeConnectionLoss(victim.jetty);
    connloss.incrementAndGet();
}
Also used : CloudJettyRunner(org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner)

Example 4 with CloudJettyRunner

use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.

the class ChaosMonkey method expireRandomSession.

/**
 * Picks a random slice and a jetty from it, then expires that jetty's session.
 * The {@code expires} counter is incremented only when a jetty was actually chosen.
 *
 * @throws KeeperException propagated from the jetty selection
 * @throws InterruptedException propagated from the jetty selection
 */
public void expireRandomSession() throws KeeperException, InterruptedException {
    final CloudJettyRunner victim = getRandomJetty(getRandomSlice(), aggressivelyKillLeaders);
    if (victim == null) {
        // nothing eligible this round
        return;
    }
    expireSession(victim.jetty);
    expires.incrementAndGet();
}
Also used : CloudJettyRunner(org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner)

Example 5 with CloudJettyRunner

use of org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner in project lucene-solr by apache.

the class ChaosMonkey method stopShard.

/**
 * Stops the jetty registered at the given position for the given slice.
 *
 * @param slice name of the slice whose jetty list is consulted
 * @param index position of the jetty within that slice's list
 * @return the jetty that was passed to {@code stopJetty}
 * @throws Exception propagated from {@code stopJetty}
 */
public CloudJettyRunner stopShard(String slice, int index) throws Exception {
    final CloudJettyRunner target = shardToJetty.get(slice).get(index);
    stopJetty(target);
    return target;
}
Also used : CloudJettyRunner(org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner)

Aggregations

CloudJettyRunner (org.apache.solr.cloud.AbstractFullDistribZkTestBase.CloudJettyRunner): 8, ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps): 3, DocCollection (org.apache.solr.common.cloud.DocCollection): 2, Replica (org.apache.solr.common.cloud.Replica): 2, Slice (org.apache.solr.common.cloud.Slice): 2, ArrayList (java.util.ArrayList): 1, JettySolrRunner (org.apache.solr.client.solrj.embedded.JettySolrRunner): 1, Type (org.apache.solr.common.cloud.Replica.Type): 1, CoreContainer (org.apache.solr.core.CoreContainer): 1, SolrCore (org.apache.solr.core.SolrCore): 1, KeeperException (org.apache.zookeeper.KeeperException): 1