Use of org.apache.solr.common.cloud.ZooKeeperException in the project lucene-solr by Apache.
Class ZkContainer, method initZooKeeper.
/**
 * Sets up ZooKeeper integration for this container.
 *
 * <p>Steps, in order:
 * <ol>
 *   <li>If the {@code zkRun} system property is set, start an embedded {@link SolrZkServer}
 *       and, when the cloud config names no ZooKeeper host, use the embedded server's client
 *       string instead.</li>
 *   <li>If a ZooKeeper host is known, verify the chroot path, create the {@link ZkController},
 *       and optionally upload configuration ({@code bootstrap_confdir}) and/or bootstrap all
 *       core configs ({@code bootstrap_conf}).</li>
 *   <li>Store the resulting controller (possibly {@code null}) in {@code this.zkController}.</li>
 * </ol>
 *
 * <p>When {@code config} is {@code null} and {@code zkRun} is not set, the method returns
 * immediately and leaves {@code this.zkController} untouched.
 *
 * @param cc       core container handed to the controller; also queried for its loaded cores
 * @param solrHome Solr home directory; default base for the embedded server's data and config dirs
 * @param config   cloud configuration, or {@code null} when not running in ZooKeeper mode
 * @throws SolrException            if {@code zkRun} is set but {@code config} is {@code null}
 * @throws ZooKeeperException       if the chroot check fails, or on interruption, connect timeout,
 *                                  I/O or ZooKeeper errors during initialization
 * @throws IllegalArgumentException if {@code bootstrap_confdir} does not point to a directory
 */
public void initZooKeeper(final CoreContainer cc, String solrHome, CloudConfig config) {
  ZkController zkController = null;

  String zkRun = System.getProperty("zkRun");

  if (zkRun != null && config == null) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Cannot start Solr in cloud mode - no cloud config provided");
  }

  if (config == null) {
    // not in zk mode
    return;
  }

  String zookeeperHost = config.getZkHost();

  // zookeeper in quorum mode currently causes a failure when trying to
  // register log4j mbeans. See SOLR-2369
  // TODO: remove after updating to an slf4j based zookeeper
  System.setProperty("zookeeper.jmx.log4j.disable", "true");

  if (zkRun != null) {
    String zkDataHome = System.getProperty("zkServerDataDir", Paths.get(solrHome).resolve("zoo_data").toString());
    String zkConfHome = System.getProperty("zkServerConfDir", solrHome);
    zkServer = new SolrZkServer(stripChroot(zkRun), stripChroot(config.getZkHost()), zkDataHome, zkConfHome, config.getSolrHostPort());
    zkServer.parseConfig();
    zkServer.start();

    // set client from server config if not already set
    if (zookeeperHost == null) {
      zookeeperHost = zkServer.getClientString();
    }
  }

  int zkClientConnectTimeout = 30000;

  if (zookeeperHost != null) {
    // we are ZooKeeper enabled
    try {
      // If this is an ensemble, allow for a long connect time for other servers to come up
      if (zkRun != null && zkServer.getServers().size() > 1) {
        // 1 day for embedded ensemble
        zkClientConnectTimeout = 24 * 60 * 60 * 1000;
        log.info("Zookeeper client=" + zookeeperHost + " Waiting for a quorum.");
      } else {
        log.info("Zookeeper client=" + zookeeperHost);
      }

      String confDir = System.getProperty("bootstrap_confdir");
      boolean bootstrapConf = Boolean.getBoolean("bootstrap_conf");

      // The second argument is true when this node is uploading config or only running ZooKeeper.
      if (!ZkController.checkChrootPath(zookeeperHost, (confDir != null) || bootstrapConf || zkRunOnly)) {
        throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "A chroot was specified in ZkHost but the znode doesn't exist. " + zookeeperHost);
      }

      zkController = new ZkController(cc, zookeeperHost, zkClientConnectTimeout, config, new CurrentCoreDescriptorProvider() {
        @Override
        public List<CoreDescriptor> getCurrentDescriptors() {
          List<CoreDescriptor> descriptors = new ArrayList<>(cc.getLoadedCoreNames().size());
          Collection<SolrCore> cores = cc.getCores();
          for (SolrCore core : cores) {
            descriptors.add(core.getCoreDescriptor());
          }
          return descriptors;
        }
      });

      if (zkRun != null && zkServer.getServers().size() > 1 && confDir == null && !bootstrapConf) {
        // we are part of an ensemble and we are not uploading the config - pause to give the config time
        // to get up
        Thread.sleep(10000);
      }

      if (confDir != null) {
        Path configPath = Paths.get(confDir);
        if (!Files.isDirectory(configPath)) {
          throw new IllegalArgumentException("bootstrap_confdir must be a directory of configuration files");
        }

        String confName = System.getProperty(ZkController.COLLECTION_PARAM_PREFIX + ZkController.CONFIGNAME_PROP, "configuration1");
        ZkConfigManager configManager = new ZkConfigManager(zkController.getZkClient());
        configManager.uploadConfigDir(configPath, confName);
      }

      if (bootstrapConf) {
        ZkController.bootstrapConf(zkController.getZkClient(), cc, solrHome);
      }

    } catch (InterruptedException e) {
      // Restore the interrupted status
      Thread.currentThread().interrupt();
      log.error("Interrupted while initializing ZooKeeper", e);
      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "Interrupted while initializing ZooKeeper", e);
    } catch (TimeoutException e) {
      log.error("Could not connect to ZooKeeper", e);
      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "Could not connect to ZooKeeper " + zookeeperHost, e);
    } catch (IOException | KeeperException e) {
      log.error("Error while initializing ZooKeeper", e);
      throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "Error while initializing ZooKeeper", e);
    }
  }

  // May be null: a cloud config was supplied but no ZooKeeper host could be determined.
  this.zkController = zkController;
}
Use of org.apache.solr.common.cloud.ZooKeeperException in the project lucene-solr by Apache.
Class DistributedUpdateProcessor, method getCollectionUrls.
/**
 * Collects one {@link StdNode} for every replica of {@code collection} whose type is in
 * {@code types} and whose node is currently listed as live in the cluster state.
 *
 * @param req        current request; the cluster state is read from its core's ZkController
 * @param collection name of the collection whose slices are scanned
 * @param types      replica types to include; replicas of any other type are skipped
 * @return the matching nodes, or {@code null} (not an empty list) when no replica matches
 * @throws ZooKeeperException with {@code BAD_REQUEST} if the cluster state has no slices map
 *                            for {@code collection}
 */
private List<Node> getCollectionUrls(SolrQueryRequest req, String collection, EnumSet<Replica.Type> types) {
  ClusterState clusterState = req.getCore().getCoreContainer().getZkController().getClusterState();
  Map<String, Slice> slices = clusterState.getSlicesMap(collection);
  if (slices == null) {
    throw new ZooKeeperException(ErrorCode.BAD_REQUEST, "Could not find collection in zk: " + clusterState);
  }

  final List<Node> urls = new ArrayList<>(slices.size());
  // Only the values are needed, so iterate them directly instead of re-looking up each key.
  for (Slice slice : slices.values()) {
    for (Replica replica : slice.getReplicasMap().values()) {
      if (!types.contains(replica.getType())) {
        continue;
      }
      ZkCoreNodeProps nodeProps = new ZkCoreNodeProps(replica);
      if (clusterState.liveNodesContain(nodeProps.getNodeName())) {
        urls.add(new StdNode(nodeProps, collection, slice.getName()));
      }
    }
  }

  // Existing contract: "nothing found" is signalled with null rather than an empty list.
  if (urls.isEmpty()) {
    return null;
  }
  return urls;
}
Use of org.apache.solr.common.cloud.ZooKeeperException in the project lucene-solr by Apache.
Class DistributedUpdateProcessor, method setupRequestForDBQ.
/**
 * Used for deleteByQuery to get the list of nodes this leader should forward to.
 *
 * <p>Looks up the leader of this core's shard, records in {@code isLeader} whether this core is
 * that leader, clears {@code forwardToLeader}, and wraps the shard's NRT and TLOG replica
 * properties in {@link StdNode}s.
 *
 * @return the nodes to forward to, or {@code null} when the state reader returns no replica props
 * @throws ZooKeeperException if the thread is interrupted during the lookup
 */
private List<Node> setupRequestForDBQ() {
  final String ownShardId = cloudDesc.getShardId();
  final List<ZkCoreNodeProps> followerProps;

  try {
    final Replica shardLeader = zkController.getZkStateReader().getLeaderRetry(collection, ownShardId);
    final String ownCoreNodeName = req.getCore().getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
    isLeader = shardLeader.getName().equals(ownCoreNodeName);

    // TODO: what if we are no longer the leader?

    forwardToLeader = false;
    followerProps = zkController.getZkStateReader().getReplicaProps(
        collection,
        ownShardId,
        shardLeader.getName(),
        null,
        Replica.State.DOWN,
        EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
  }

  if (followerProps == null) {
    return null;
  }

  final List<Node> targets = new ArrayList<>(followerProps.size());
  for (ZkCoreNodeProps follower : followerProps) {
    targets.add(new StdNode(follower, collection, ownShardId));
  }
  return targets;
}
Use of org.apache.solr.common.cloud.ZooKeeperException in the project lucene-solr by Apache.
Class DistributedUpdateProcessor, method setupRequest.
/**
 * Decides where an update for the given document must be sent and updates the routing flags
 * {@code isLeader}, {@code isSubShardLeader} and {@code forwardToLeader} accordingly.
 *
 * <p>Outcomes:
 * <ul>
 *   <li>Not in ZooKeeper mode: nothing is changed and {@code null} is returned.</li>
 *   <li>Replay / peer-sync updates, and FROMLEADER updates received by a core that does not
 *       consider itself leader (and could not be a sub-shard leader): leader logic is switched
 *       off and {@code null} is returned.</li>
 *   <li>FROMLEADER update not coming from another collection and not handled as sub-shard
 *       leader: processed locally, {@code null} is returned.</li>
 *   <li>This core is the leader or a sub-shard leader: the shard's replicas are returned (minus
 *       any listed in the {@code TEST_DISTRIB_SKIP_SERVERS} test parameter), or {@code null} if
 *       the state reader returns no replica props.</li>
 *   <li>Otherwise: a single {@link RetryNode} pointing at the leader is returned and
 *       {@code forwardToLeader} is set.</li>
 * </ul>
 *
 * @param id    document id handed to the router
 * @param doc   the document being routed (passed through to the router as-is)
 * @param route explicit route value (passed through to the router as-is)
 * @return nodes to forward the update to, or {@code null} when nothing must be forwarded
 * @throws SolrException      with {@code BAD_REQUEST} if the router yields no slice and this
 *                            core's own shard does not exist in the collection
 * @throws ZooKeeperException if the thread is interrupted while looking up the leader
 */
private List<Node> setupRequest(String id, SolrInputDocument doc, String route) {
  // if we are not in zk mode there is nothing to route
  if (!zkEnabled) {
    return null;
  }

  assert TestInjection.injectUpdateRandomPause();

  if ((updateCommand.getFlags() & (UpdateCommand.REPLAY | UpdateCommand.PEER_SYNC)) != 0) {
    // we actually might be the leader, but we don't want leader-logic for these types of updates anyway.
    isLeader = false;
    forwardToLeader = false;
    return null;
  }

  ClusterState cstate = zkController.getClusterState();
  DocCollection coll = cstate.getCollection(collection);
  Slice slice = coll.getRouter().getTargetSlice(id, doc, route, req.getParams(), coll);

  if (slice == null) {
    // No slice found. Most strict routers will have already thrown an exception, so a null return is
    // a signal to use the slice of this core.
    // TODO: what if this core is not in the targeted collection?
    String ownShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
    slice = coll.getSlice(ownShardId);
    if (slice == null) {
      throw new SolrException(ErrorCode.BAD_REQUEST, "No shard " + ownShardId + " in " + coll);
    }
  }

  DistribPhase phase = DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));

  if (DistribPhase.FROMLEADER == phase && !couldIbeSubShardLeader(coll)) {
    // If locally we think we are leader but the request says it came FROMLEADER, that could
    // indicate a problem; in that case fall through and let the full logic below figure it out.
    if (!req.getCore().getCoreDescriptor().getCloudDescriptor().isLeader()) {
      assert TestInjection.injectFailReplicaRequests();
      // we actually might be the leader, but we don't want leader-logic for these types of updates anyway.
      isLeader = false;
      forwardToLeader = false;
      return null;
    }
  }

  List<Node> nodes = null;
  String shardId = slice.getName();

  try {
    // slice.getLeader() is not used here; per the original note it is not equivalent to a
    // leader lookup that retries until a leader is found.
    Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, shardId);
    isLeader = leaderReplica.getName().equals(req.getCore().getCoreDescriptor().getCloudDescriptor().getCoreNodeName());

    if (!isLeader) {
      isSubShardLeader = amISubShardLeader(coll, slice, id, doc);
      if (isSubShardLeader) {
        // Act on behalf of our own (sub-)shard from here on.
        String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
        slice = coll.getSlice(myShardId);
        shardId = myShardId;
        leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, myShardId);
      }
    }

    doDefensiveChecks(phase);

    // if request is coming from another collection then we want it to be sent to all replicas
    // even if its phase is FROMLEADER
    String fromCollection = updateCommand.getReq().getParams().get(DISTRIB_FROM_COLLECTION);

    if (DistribPhase.FROMLEADER == phase && !isSubShardLeader && fromCollection == null) {
      // we are coming from the leader, just go local - add no urls
      forwardToLeader = false;
    } else if (isLeader || isSubShardLeader) {
      // that means I want to forward onto my replicas...
      // so get the replicas...
      forwardToLeader = false;
      List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader().getReplicaProps(collection, shardId, leaderReplica.getName(), null, Replica.State.DOWN);

      if (replicaProps != null) {
        nodes = new ArrayList<>(replicaProps.size());

        // check for test param that lets us miss replicas
        String[] skipList = req.getParams().getParams(TEST_DISTRIB_SKIP_SERVERS);
        Set<String> skipListSet = null;
        if (skipList != null) {
          skipListSet = new HashSet<>(Arrays.asList(skipList));
          log.info("test.distrib.skip.servers was found and contains:" + skipListSet);
        }

        for (ZkCoreNodeProps props : replicaProps) {
          if (skipListSet != null) {
            boolean skip = skipListSet.contains(props.getCoreUrl());
            log.info("check url:" + props.getCoreUrl() + " against:" + skipListSet + " result:" + skip);
            if (skip) {
              continue;
            }
          }
          nodes.add(new StdNode(props, collection, shardId));
        }
      }
    } else {
      // I need to forward onto the leader...
      nodes = new ArrayList<>(1);
      nodes.add(new RetryNode(new ZkCoreNodeProps(leaderReplica), zkController.getZkStateReader(), collection, shardId));
      forwardToLeader = true;
    }
  } catch (InterruptedException e) {
    Thread.currentThread().interrupt();
    throw new ZooKeeperException(SolrException.ErrorCode.SERVER_ERROR, "", e);
  }

  return nodes;
}
Use of org.apache.solr.common.cloud.ZooKeeperException in the project lucene-solr by Apache.
Class DistributedUpdateProcessor, method doDeleteByQuery.
/**
 * Processes a delete-by-query (DBQ) command, distributing it according to the request's
 * distribution phase:
 *
 * <ul>
 *   <li>NONE: we are the first to receive this deleteByQuery - it must be forwarded to the
 *       leader of every shard. If this core leads none of the targeted shards, processing ends
 *       after forwarding; otherwise it continues as TOLEADER.</li>
 *   <li>TOLEADER: we are a leader receiving a forwarded deleteByQuery... we must:
 *     <ul>
 *       <li>block all updates (use VersionInfo)</li>
 *       <li>flush *all* updates going to our replicas</li>
 *       <li>forward the DBQ to our replicas and wait for the response</li>
 *       <li>log + execute the local DBQ</li>
 *     </ul>
 *   </li>
 *   <li>FROMLEADER: we are a replica receiving a DBQ from our leader - log + execute the local
 *       DBQ.</li>
 * </ul>
 *
 * <p>When {@code vinfo} is {@code null} the command is simply passed to the superclass.
 *
 * @param cmd the delete-by-query command to apply and distribute
 * @throws IOException        declared by the method signature; which of the calls made here
 *                            raise it is not visible from this method alone
 * @throws SolrException      with {@code SERVICE_UNAVAILABLE} if interrupted while looking up a
 *                            shard leader in the NONE phase
 * @throws ZooKeeperException if interrupted while resolving sub-shard replicas
 */
public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException {
  // even in non zk mode, tests simulate updates from a leader
  if (!zkEnabled) {
    isLeader = getNonZkLeaderAssumption(req);
  } else {
    zkCheck();
  }

  DistribPhase phase = DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));

  DocCollection coll = zkEnabled ? zkController.getClusterState().getCollection(collection) : null;

  if (zkEnabled && DistribPhase.NONE == phase) {
    // start off by assuming we are not a leader for any shard
    boolean leaderForAnyShard = false;

    ModifiableSolrParams outParams = new ModifiableSolrParams(filterParams(req.getParams()));
    outParams.set(DISTRIB_UPDATE_PARAM, DistribPhase.TOLEADER.toString());
    outParams.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));

    SolrParams params = req.getParams();
    String route = params.get(ShardParams._ROUTE_);
    Collection<Slice> slices = coll.getRouter().getSearchSlices(route, params, coll);

    List<Node> leaders = new ArrayList<>(slices.size());
    for (Slice slice : slices) {
      String sliceName = slice.getName();
      Replica leader;
      try {
        leader = zkController.getZkStateReader().getLeaderRetry(collection, sliceName);
      } catch (InterruptedException e) {
        // Restore the interrupt status so callers up the stack can still observe it,
        // consistent with the other InterruptedException handlers in this class.
        Thread.currentThread().interrupt();
        throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + sliceName, e);
      }

      // TODO: What if leaders changed in the meantime?
      // should we send out slice-at-a-time and if a node returns "hey, I'm not a leader" (or we get an error because it went down) then look up the new leader?

      // Am I the leader for this slice?
      ZkCoreNodeProps coreLeaderProps = new ZkCoreNodeProps(leader);
      String leaderCoreNodeName = leader.getName();
      String coreNodeName = req.getCore().getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
      isLeader = coreNodeName.equals(leaderCoreNodeName);

      if (isLeader) {
        // don't forward to ourself
        leaderForAnyShard = true;
      } else {
        leaders.add(new RetryNode(coreLeaderProps, zkController.getZkStateReader(), collection, sliceName));
      }
    }

    // this will be distributed from the local commit
    outParams.remove("commit");
    cmdDistrib.distribDelete(cmd, leaders, outParams);

    if (!leaderForAnyShard) {
      return;
    }

    // change the phase to TOLEADER so we look up and forward to our own replicas (if any)
    phase = DistribPhase.TOLEADER;
  }

  List<Node> replicas = null;

  if (zkEnabled && DistribPhase.TOLEADER == phase) {
    // This core should be a leader
    isLeader = true;
    replicas = setupRequestForDBQ();
  } else if (DistribPhase.FROMLEADER == phase) {
    isLeader = false;
  }

  if (vinfo == null) {
    super.processDelete(cmd);
    return;
  }

  // at this point, there is an update we need to try and apply.
  // we may or may not be the leader.

  boolean isReplayOrPeersync = (cmd.getFlags() & (UpdateCommand.REPLAY | UpdateCommand.PEER_SYNC)) != 0;
  boolean leaderLogic = isLeader && !isReplayOrPeersync;

  versionDeleteByQuery(cmd);

  if (zkEnabled) {
    // forward to all replicas
    ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
    params.set(CommonParams.VERSION_FIELD, Long.toString(cmd.getVersion()));
    params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
    params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));

    boolean someReplicas = false;
    boolean subShardLeader = false;
    try {
      subShardLeader = amISubShardLeader(coll, null, null, null);
      if (subShardLeader) {
        String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
        Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, myShardId);
        // DBQ forwarded to NRT and TLOG replicas
        List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader().getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
        if (replicaProps != null) {
          final List<Node> myReplicas = new ArrayList<>(replicaProps.size());
          for (ZkCoreNodeProps replicaProp : replicaProps) {
            myReplicas.add(new StdNode(replicaProp, collection, myShardId));
          }
          cmdDistrib.distribDelete(cmd, myReplicas, params);
          someReplicas = true;
        }
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new ZooKeeperException(ErrorCode.SERVER_ERROR, "", e);
    }

    if (leaderLogic) {
      List<Node> subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), null, null);
      if (subShardLeaders != null) {
        cmdDistrib.distribDelete(cmd, subShardLeaders, params, true);
      }

      final List<Node> nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, null, null);
      if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) {
        // NOTE: params is replaced here, so the forward to our own replicas below uses the
        // rebuilt params (no version field, but with the from-collection/from-shard markers).
        params = new ModifiableSolrParams(filterParams(req.getParams()));
        params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
        params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        params.set(DISTRIB_FROM_COLLECTION, req.getCore().getCoreDescriptor().getCloudDescriptor().getCollectionName());
        params.set(DISTRIB_FROM_SHARD, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
        cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true);
      }

      if (replicas != null) {
        cmdDistrib.distribDelete(cmd, replicas, params);
        someReplicas = true;
      }
    }

    if (someReplicas) {
      cmdDistrib.blockAndDoRetries();
    }
  }

  if (returnVersions && rsp != null) {
    // Lazily create the per-request "deleteByQuery" section of the response.
    if (deleteByQueryResponse == null) {
      deleteByQueryResponse = new NamedList<String>(1);
      rsp.add("deleteByQuery", deleteByQueryResponse);
    }
    deleteByQueryResponse.add(cmd.getQuery(), cmd.getVersion());
  }
}
Aggregations