Use of org.apache.solr.util.TimeOut in project lucene-solr by apache.
The class BaseCdcrDistributedZkTest, method waitForBootstrapToComplete.
protected void waitForBootstrapToComplete(String collectionName, String shardId) throws Exception {
  // we need to wait until bootstrap is complete, otherwise the replicator thread will never start
  NamedList rsp;
  TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    rsp = invokeCdcrAction(shardToLeaderJetty.get(collectionName).get(shardId), CdcrParams.CdcrAction.BOOTSTRAP_STATUS);
    if (rsp.get(RESPONSE_STATUS).toString().equals(COMPLETED)) {
      break;
    }
    Thread.sleep(1000);
  }
}
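The examples on this page all follow the same idiom: construct a TimeOut, poll a condition in a loop, sleep between checks, and afterwards decide whether the deadline expired. A minimal sketch of that idiom as a standalone helper, using only the TimeOut constructor and hasTimedOut() exercised in these snippets; PollingUtil and waitFor are illustrative names, not part of lucene-solr.
import java.util.concurrent.TimeUnit;
import java.util.function.Supplier;

import org.apache.solr.util.TimeOut;

// Hypothetical helper distilling the polling pattern used in the snippets on this page.
public final class PollingUtil {

  /** Polls {@code condition} every {@code intervalMs} until it holds or {@code seconds} elapse. */
  public static boolean waitFor(Supplier<Boolean> condition, long seconds, long intervalMs)
      throws InterruptedException {
    TimeOut timeOut = new TimeOut(seconds, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
      if (condition.get()) {
        return true; // condition met before the deadline
      }
      Thread.sleep(intervalMs); // back off before the next check
    }
    return false; // deadline expired; the caller decides whether that is fatal
  }

  private PollingUtil() {
  }
}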
Use of org.apache.solr.util.TimeOut in project lucene-solr by apache.
The class CreateCollectionCmd, method call.
@Override
public void call(ClusterState clusterState, ZkNodeProps message, NamedList results) throws Exception {
  final String collectionName = message.getStr(NAME);
  log.info("Create collection {}", collectionName);
  if (clusterState.hasCollection(collectionName)) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "collection already exists: " + collectionName);
  }
  String configName = getConfigName(collectionName, message);
  if (configName == null) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No config set found to associate with the collection.");
  }
  ocmh.validateConfigOrThrowSolrException(configName);
  try {
    // look at the replication factor and see if it matches reality;
    // if it does not, find the best nodes to create more cores
    int numTlogReplicas = message.getInt(TLOG_REPLICAS, 0);
    int numNrtReplicas = message.getInt(NRT_REPLICAS, message.getInt(REPLICATION_FACTOR, numTlogReplicas > 0 ? 0 : 1));
    int numPullReplicas = message.getInt(PULL_REPLICAS, 0);
    ShardHandler shardHandler = ocmh.shardHandlerFactory.getShardHandler();
    final String async = message.getStr(ASYNC);
    Integer numSlices = message.getInt(NUM_SLICES, null);
    String router = message.getStr("router.name", DocRouter.DEFAULT_NAME);
    List<String> shardNames = new ArrayList<>();
    if (ImplicitDocRouter.NAME.equals(router)) {
      ClusterStateMutator.getShardNames(shardNames, message.getStr("shards", null));
      numSlices = shardNames.size();
    } else {
      if (numSlices == null) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NUM_SLICES + " is a required param (when using CompositeId router).");
      }
      ClusterStateMutator.getShardNames(numSlices, shardNames);
    }
    int maxShardsPerNode = message.getInt(MAX_SHARDS_PER_NODE, 1);
    if (numNrtReplicas + numTlogReplicas <= 0) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NRT_REPLICAS + " + " + TLOG_REPLICAS + " must be greater than 0");
    }
    if (numSlices <= 0) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, NUM_SLICES + " must be > 0");
    }
    // we need to look at every node and see how many cores it serves;
    // add our new cores to existing nodes serving the least number of cores,
    // but (for now) require that each core goes on a distinct node
    final List<String> nodeList = OverseerCollectionMessageHandler.getLiveOrLiveAndCreateNodeSetList(clusterState.getLiveNodes(), message, RANDOM);
    Map<ReplicaAssigner.Position, String> positionVsNodes;
    if (nodeList.isEmpty()) {
      log.warn("It is unusual to create a collection (" + collectionName + ") without cores.");
      positionVsNodes = new HashMap<>();
    } else {
      int totalNumReplicas = numNrtReplicas + numTlogReplicas + numPullReplicas;
      if (totalNumReplicas > nodeList.size()) {
        log.warn("Specified number of replicas of " + totalNumReplicas + " on collection " + collectionName + " is higher than the number of Solr instances currently live or live and part of your " + CREATE_NODE_SET + "(" + nodeList.size() + "). It's unusual to run two replica of the same slice on the same Solr-instance.");
      }
      int maxShardsAllowedToCreate = maxShardsPerNode * nodeList.size();
      int requestedShardsToCreate = numSlices * totalNumReplicas;
      if (maxShardsAllowedToCreate < requestedShardsToCreate) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "Cannot create collection " + collectionName + ". Value of " + MAX_SHARDS_PER_NODE + " is " + maxShardsPerNode + ", and the number of nodes currently live or live and part of your " + CREATE_NODE_SET + " is " + nodeList.size() + ". This allows a maximum of " + maxShardsAllowedToCreate + " to be created. Value of " + NUM_SLICES + " is " + numSlices + ", value of " + NRT_REPLICAS + " is " + numNrtReplicas + ", value of " + TLOG_REPLICAS + " is " + numTlogReplicas + " and value of " + PULL_REPLICAS + " is " + numPullReplicas + ". This requires " + requestedShardsToCreate + " shards to be created (higher than the allowed number)");
      }
      positionVsNodes = ocmh.identifyNodes(clusterState, nodeList, message, shardNames, numNrtReplicas, numTlogReplicas, numPullReplicas);
    }
    ZkStateReader zkStateReader = ocmh.zkStateReader;
    boolean isLegacyCloud = Overseer.isLegacy(zkStateReader);
    ocmh.createConfNode(configName, collectionName, isLegacyCloud);
    Map<String, String> collectionParams = new HashMap<>();
    Map<String, Object> collectionProps = message.getProperties();
    for (String propName : collectionProps.keySet()) {
      if (propName.startsWith(ZkController.COLLECTION_PARAM_PREFIX)) {
        collectionParams.put(propName.substring(ZkController.COLLECTION_PARAM_PREFIX.length()), (String) collectionProps.get(propName));
      }
    }
    createCollectionZkNode(zkClient, collectionName, collectionParams);
    Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(message));
    // wait for a while until we see the collection appear in the cluster state
    TimeOut waitUntil = new TimeOut(30, TimeUnit.SECONDS);
    boolean created = false;
    while (!waitUntil.hasTimedOut()) {
      Thread.sleep(100);
      created = zkStateReader.getClusterState().hasCollection(collectionName);
      if (created)
        break;
    }
    if (!created)
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not fully create collection: " + collectionName);
    if (nodeList.isEmpty()) {
      log.debug("Finished create command for collection: {}", collectionName);
      return;
    }
    // For tracking async calls.
    Map<String, String> requestMap = new HashMap<>();
    log.debug(formatString("Creating SolrCores for new collection {0}, shardNames {1} , nrtReplicas : {2}, tlogReplicas: {3}, pullReplicas: {4}", collectionName, shardNames, numNrtReplicas, numTlogReplicas, numPullReplicas));
    Map<String, ShardRequest> coresToCreate = new LinkedHashMap<>();
    for (Map.Entry<ReplicaAssigner.Position, String> e : positionVsNodes.entrySet()) {
      ReplicaAssigner.Position position = e.getKey();
      String nodeName = e.getValue();
      String coreName = Assign.buildCoreName(collectionName, position.shard, position.type, position.index + 1);
      log.debug(formatString("Creating core {0} as part of shard {1} of collection {2} on {3}", coreName, position.shard, collectionName, nodeName));
      String baseUrl = zkStateReader.getBaseUrlForNodeName(nodeName);
      // In the new (non-legacy) mode, publish the replica to the cluster state before
      // creating the core. Otherwise the core creation fails.
      if (!isLegacyCloud) {
        ZkNodeProps props = new ZkNodeProps(Overseer.QUEUE_OPERATION, ADDREPLICA.toString(), ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.SHARD_ID_PROP, position.shard, ZkStateReader.CORE_NAME_PROP, coreName, ZkStateReader.STATE_PROP, Replica.State.DOWN.toString(), ZkStateReader.BASE_URL_PROP, baseUrl, ZkStateReader.REPLICA_TYPE, position.type.name());
        Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(props));
      }
      // Need to create new params for each request
      ModifiableSolrParams params = new ModifiableSolrParams();
      params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.CREATE.toString());
      params.set(CoreAdminParams.NAME, coreName);
      params.set(COLL_CONF, configName);
      params.set(CoreAdminParams.COLLECTION, collectionName);
      params.set(CoreAdminParams.SHARD, position.shard);
      params.set(ZkStateReader.NUM_SHARDS_PROP, numSlices);
      params.set(CoreAdminParams.NEW_COLLECTION, "true");
      params.set(CoreAdminParams.REPLICA_TYPE, position.type.name());
      if (async != null) {
        String coreAdminAsyncId = async + Math.abs(System.nanoTime());
        params.add(ASYNC, coreAdminAsyncId);
        requestMap.put(nodeName, coreAdminAsyncId);
      }
      ocmh.addPropertyParams(message, params);
      ShardRequest sreq = new ShardRequest();
      sreq.nodeName = nodeName;
      params.set("qt", ocmh.adminPath);
      sreq.purpose = 1;
      sreq.shards = new String[] { baseUrl };
      sreq.actualShards = sreq.shards;
      sreq.params = params;
      if (isLegacyCloud) {
        shardHandler.submit(sreq, sreq.shards[0], sreq.params);
      } else {
        coresToCreate.put(coreName, sreq);
      }
    }
    if (!isLegacyCloud) {
      // wait for all replica entries to be created
      Map<String, Replica> replicas = ocmh.waitToSeeReplicasInState(collectionName, coresToCreate.keySet());
      for (Map.Entry<String, ShardRequest> e : coresToCreate.entrySet()) {
        ShardRequest sreq = e.getValue();
        sreq.params.set(CoreAdminParams.CORE_NODE_NAME, replicas.get(e.getKey()).getName());
        shardHandler.submit(sreq, sreq.shards[0], sreq.params);
      }
    }
    ocmh.processResponses(results, shardHandler, false, null, async, requestMap, Collections.emptySet());
    if (results.get("failure") != null && ((SimpleOrderedMap) results.get("failure")).size() > 0) {
      // Let's clean up, as we hit an exception.
      // We shouldn't be passing 'results' here for the cleanup, as the response would then contain a
      // 'success' element, which may be interpreted by the user as a positive ack.
      ocmh.cleanupCollection(collectionName, new NamedList());
      log.info("Cleaned up artifacts for failed create collection for [{}]", collectionName);
    } else {
      log.debug("Finished create command on all shards for collection: {}", collectionName);
    }
  } catch (SolrException ex) {
    throw ex;
  } catch (Exception ex) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, null, ex);
  }
}
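For context, this Overseer command is normally reached through the Collections API. A hedged SolrJ sketch of the client-side trigger follows; the ZooKeeper address, collection, and config-set names are placeholders, and the exact Builder API varies across Solr versions.
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;

public class CreateCollectionExample {
  public static void main(String[] args) throws Exception {
    // Placeholder ZooKeeper address; adjust for your cluster.
    try (CloudSolrClient client = new CloudSolrClient.Builder()
        .withZkHost("localhost:9983").build()) {
      // Requests 2 shards with 1 NRT replica each; the Overseer routes the
      // resulting CREATE message to CreateCollectionCmd.call shown above.
      CollectionAdminRequest.createCollection("myCollection", "conf1", 2, 1)
          .process(client);
    }
  }
}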
Use of org.apache.solr.util.TimeOut in project lucene-solr by apache.
The class DeleteCollectionCmd, method call.
@Override
public void call(ClusterState state, ZkNodeProps message, NamedList results) throws Exception {
  ZkStateReader zkStateReader = ocmh.zkStateReader;
  final String collection = message.getStr(NAME);
  try {
    // Remove the snapshots meta-data for this collection in ZK. Deleting actual index files
    // should be taken care of as part of collection delete operation.
    SolrZkClient zkClient = zkStateReader.getZkClient();
    SolrSnapshotManager.cleanupCollectionLevelSnapshots(zkClient, collection);
    if (zkStateReader.getClusterState().getCollectionOrNull(collection) == null) {
      if (zkStateReader.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) {
        // the collection is not in the clusterstate, but its znode still exists;
        // there is nothing to unload here, and the finally block below cleans up the znode
        return;
      }
    }
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(CoreAdminParams.ACTION, CoreAdminParams.CoreAdminAction.UNLOAD.toString());
    params.set(CoreAdminParams.DELETE_INSTANCE_DIR, true);
    params.set(CoreAdminParams.DELETE_DATA_DIR, true);
    String asyncId = message.getStr(ASYNC);
    Map<String, String> requestMap = null;
    if (asyncId != null) {
      requestMap = new HashMap<>();
    }
    Set<String> okayExceptions = new HashSet<>(1);
    okayExceptions.add(NonExistentCoreException.class.getName());
    ocmh.collectionCmd(message, params, results, null, asyncId, requestMap, okayExceptions);
    ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETE.toLower(), NAME, collection);
    Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
    // wait for a while until we don't see the collection
    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
    boolean removed = false;
    while (!timeout.hasTimedOut()) {
      Thread.sleep(100);
      removed = !zkStateReader.getClusterState().hasCollection(collection);
      if (removed) {
        // just a bit of time so it's more likely other readers see on return
        Thread.sleep(500);
        break;
      }
    }
    if (!removed) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not fully remove collection: " + collection);
    }
  } finally {
    try {
      if (zkStateReader.getZkClient().exists(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection, true)) {
        zkStateReader.getZkClient().clean(ZkStateReader.COLLECTIONS_ZKNODE + "/" + collection);
      }
    } catch (InterruptedException e) {
      SolrException.log(log, "Cleaning up collection in zk was interrupted:" + collection, e);
      Thread.currentThread().interrupt();
    } catch (KeeperException e) {
      SolrException.log(log, "Problem cleaning up collection in zk:" + collection, e);
    }
  }
}
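The delete path is entered the same way. A hedged SolrJ sketch, with client setup as in the creation example above and placeholder names:
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;

public class DeleteCollectionExample {
  public static void main(String[] args) throws Exception {
    // Placeholder ZooKeeper address; adjust for your cluster.
    try (CloudSolrClient client = new CloudSolrClient.Builder()
        .withZkHost("localhost:9983").build()) {
      // The Overseer dispatches the DELETE message to DeleteCollectionCmd.call shown above.
      CollectionAdminRequest.deleteCollection("myCollection").process(client);
    }
  }
}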
Use of org.apache.solr.util.TimeOut in project lucene-solr by apache.
The class DeleteShardCmd, method call.
@Override
public void call(ClusterState clusterState, ZkNodeProps message, NamedList results) throws Exception {
  String collectionName = message.getStr(ZkStateReader.COLLECTION_PROP);
  String sliceId = message.getStr(ZkStateReader.SHARD_ID_PROP);
  log.info("Delete shard invoked");
  Slice slice = clusterState.getSlice(collectionName, sliceId);
  if (slice == null) {
    if (clusterState.hasCollection(collectionName)) {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No shard with name " + sliceId + " exists for collection " + collectionName);
    } else {
      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "No collection with the specified name exists: " + collectionName);
    }
  }
  // For now, only allow for deletions of Inactive slices or custom hashes (range==null).
  // TODO: Add check for range gaps on Slice deletion
  final Slice.State state = slice.getState();
  if (!(slice.getRange() == null || state == Slice.State.INACTIVE || state == Slice.State.RECOVERY || state == Slice.State.CONSTRUCTION) || state == Slice.State.RECOVERY_FAILED) {
    throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "The slice: " + slice.getName() + " is currently " + state + ". Only non-active (or custom-hashed) slices can be deleted.");
  }
  if (state == Slice.State.RECOVERY) {
    // mark the slice as 'construction' and only then try to delete the cores
    // see SOLR-9455
    DistributedQueue inQueue = Overseer.getStateUpdateQueue(ocmh.zkStateReader.getZkClient());
    Map<String, Object> propMap = new HashMap<>();
    propMap.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower());
    propMap.put(sliceId, Slice.State.CONSTRUCTION.toString());
    propMap.put(ZkStateReader.COLLECTION_PROP, collectionName);
    ZkNodeProps m = new ZkNodeProps(propMap);
    inQueue.offer(Utils.toJSON(m));
  }
  String asyncId = message.getStr(ASYNC);
  try {
    List<ZkNodeProps> replicas = getReplicasForSlice(collectionName, slice);
    CountDownLatch cleanupLatch = new CountDownLatch(replicas.size());
    for (ZkNodeProps r : replicas) {
      final ZkNodeProps replica = r.plus(message.getProperties()).plus("parallel", "true").plus(ASYNC, asyncId);
      log.info("Deleting replica for collection={} shard={} on node={}", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(CoreAdminParams.NODE));
      NamedList deleteResult = new NamedList();
      try {
        ((DeleteReplicaCmd) ocmh.commandMap.get(DELETEREPLICA)).deleteReplica(clusterState, replica, deleteResult, () -> {
          cleanupLatch.countDown();
          if (deleteResult.get("failure") != null) {
            synchronized (results) {
              results.add("failure", String.format(Locale.ROOT, "Failed to delete replica for collection=%s shard=%s on node=%s", replica.getStr(COLLECTION_PROP), replica.getStr(SHARD_ID_PROP), replica.getStr(NODE_NAME_PROP)));
            }
          }
          SimpleOrderedMap success = (SimpleOrderedMap) deleteResult.get("success");
          if (success != null) {
            synchronized (results) {
              results.add("success", success);
            }
          }
        });
      } catch (KeeperException e) {
        log.warn("Error deleting replica: " + r, e);
        cleanupLatch.countDown();
      } catch (Exception e) {
        log.warn("Error deleting replica: " + r, e);
        cleanupLatch.countDown();
        throw e;
      }
    }
    log.debug("Waiting for delete shard action to complete");
    cleanupLatch.await(5, TimeUnit.MINUTES);
    ZkNodeProps m = new ZkNodeProps(Overseer.QUEUE_OPERATION, DELETESHARD.toLower(), ZkStateReader.COLLECTION_PROP, collectionName, ZkStateReader.SHARD_ID_PROP, sliceId);
    ZkStateReader zkStateReader = ocmh.zkStateReader;
    Overseer.getStateUpdateQueue(zkStateReader.getZkClient()).offer(Utils.toJSON(m));
    // wait for a while until we don't see the shard
    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
    boolean removed = false;
    while (!timeout.hasTimedOut()) {
      Thread.sleep(100);
      DocCollection collection = zkStateReader.getClusterState().getCollection(collectionName);
      removed = collection.getSlice(sliceId) == null;
      if (removed) {
        // just a bit of time so it's more likely other readers see on return
        Thread.sleep(100);
        break;
      }
    }
    if (!removed) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Could not fully remove collection: " + collectionName + " shard: " + sliceId);
    }
    log.info("Successfully deleted collection: " + collectionName + ", shard: " + sliceId);
  } catch (SolrException e) {
    throw e;
  } catch (Exception e) {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "Error executing delete operation for collection: " + collectionName + " shard: " + sliceId, e);
  }
}
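A matching hedged SolrJ sketch for the shard-delete trigger; note that per the state check above, the request only succeeds for inactive (or custom-hashed) slices. Names are placeholders.
import org.apache.solr.client.solrj.impl.CloudSolrClient;
import org.apache.solr.client.solrj.request.CollectionAdminRequest;

public class DeleteShardExample {
  public static void main(String[] args) throws Exception {
    // Placeholder ZooKeeper address; adjust for your cluster.
    try (CloudSolrClient client = new CloudSolrClient.Builder()
        .withZkHost("localhost:9983").build()) {
      // Dispatched by the Overseer to DeleteShardCmd.call shown above;
      // fails with BAD_REQUEST if the slice is still active.
      CollectionAdminRequest.deleteShard("myCollection", "shard1").process(client);
    }
  }
}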
Use of org.apache.solr.util.TimeOut in project lucene-solr by apache.
The class OverseerCollectionMessageHandler, method waitToSeeReplicasInState.
Map<String, Replica> waitToSeeReplicasInState(String collectionName, Collection<String> coreNames) throws InterruptedException {
  Map<String, Replica> result = new HashMap<>();
  TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
  while (true) {
    DocCollection coll = zkStateReader.getClusterState().getCollection(collectionName);
    for (String coreName : coreNames) {
      if (result.containsKey(coreName))
        continue;
      for (Slice slice : coll.getSlices()) {
        for (Replica replica : slice.getReplicas()) {
          if (coreName.equals(replica.getStr(ZkStateReader.CORE_NAME_PROP))) {
            result.put(coreName, replica);
            break;
          }
        }
      }
    }
    if (result.size() == coreNames.size()) {
      return result;
    } else {
      log.debug("Expecting {} cores but found {}", coreNames.size(), result.size());
    }
    if (timeout.hasTimedOut()) {
      throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out waiting to see all replicas: " + coreNames + " in cluster state.");
    }
    Thread.sleep(100);
  }
}
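Two variants of the TimeOut idiom appear on this page. The create and delete commands poll, break, and then test a flag, so the final decision happens outside the loop; waitToSeeReplicasInState instead loops forever and only throws after re-checking the condition, which guarantees the condition is evaluated at least once even if the deadline has already passed. A compact sketch of the strict variant; waitOrThrow and conditionHolds are hypothetical names:
import java.util.concurrent.TimeUnit;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrException.ErrorCode;
import org.apache.solr.util.TimeOut;

public class StrictWaitSketch {

  static void waitOrThrow() throws InterruptedException {
    TimeOut timeout = new TimeOut(30, TimeUnit.SECONDS);
    while (true) {
      if (conditionHolds()) {
        return; // success: condition observed before (or just after) the deadline
      }
      if (timeout.hasTimedOut()) {
        // Tested only after the check above, so the condition is always evaluated at least once.
        throw new SolrException(ErrorCode.SERVER_ERROR, "Timed out waiting for condition");
      }
      Thread.sleep(100);
    }
  }

  // Hypothetical condition standing in for the cluster-state check.
  static boolean conditionHolds() {
    return false;
  }
}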