Use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.
Class DistributedUpdateProcessor, method doFinish.
// TODO: optionally fail if n replicas are not reached...
private void doFinish() {
  // TODO: if not a forward and replication req is not specified, we could
  // send in a background thread
  cmdDistrib.finish();
  List<Error> errors = cmdDistrib.getErrors();
  // TODO - we may need to tell about more than one error...
  List<Error> errorsForClient = new ArrayList<>(errors.size());
  for (final SolrCmdDistributor.Error error : errors) {
    if (error.req.node instanceof RetryNode) {
      // if it's a forward, any fail is a problem -
      // otherwise we assume things are fine if we got it locally
      // until we start allowing min replication param
      errorsForClient.add(error);
      continue;
    }
    // for now we don't error - we assume if it was added locally, we succeeded
    if (log.isWarnEnabled()) {
      log.warn("Error sending update to " + error.req.node.getBaseUrl(), error.e);
    }
    // Since it is not a forward request, for each fail, try to tell them to
    // recover - the doc was already added locally, so it should have been
    // legit
    DistribPhase phase = DistribPhase.parseParam(error.req.uReq.getParams().get(DISTRIB_UPDATE_PARAM));
    if (phase != DistribPhase.FROMLEADER)
      // don't have non-leaders try to recover other nodes
      continue;
    // we don't want to run recovery on a node which missed a commit command
    if (error.req.uReq.getParams().get(COMMIT_END_POINT) != null)
      continue;
    final String replicaUrl = error.req.node.getUrl();
    // if the remote replica failed the request because of leader change (SOLR-6511), then fail the request
    String cause = (error.e instanceof SolrException) ? ((SolrException) error.e).getMetadata("cause") : null;
    if ("LeaderChanged".equals(cause)) {
      // let's just fail this request and let the client retry? or just call processAdd again?
      log.error("On " + cloudDesc.getCoreNodeName() + ", replica " + replicaUrl + " now thinks it is the leader! Failing the request to let the client retry! " + error.e);
      errorsForClient.add(error);
      continue;
    }
    String collection = null;
    String shardId = null;
    if (error.req.node instanceof StdNode) {
      StdNode stdNode = (StdNode) error.req.node;
      collection = stdNode.getCollection();
      shardId = stdNode.getShardId();
      // before we go setting other replicas to down, make sure we're still the leader!
      String leaderCoreNodeName = null;
      Exception getLeaderExc = null;
      Replica leaderProps = null;
      try {
        leaderProps = zkController.getZkStateReader().getLeader(collection, shardId);
        if (leaderProps != null) {
          leaderCoreNodeName = leaderProps.getName();
        }
      } catch (Exception exc) {
        getLeaderExc = exc;
      }
      if (leaderCoreNodeName == null) {
        log.warn("Failed to determine if {} is still the leader for collection={} shardId={} before putting {} into leader-initiated recovery", cloudDesc.getCoreNodeName(), collection, shardId, replicaUrl, getLeaderExc);
      }
      List<ZkCoreNodeProps> myReplicas = zkController.getZkStateReader().getReplicaProps(collection, cloudDesc.getShardId(), cloudDesc.getCoreNodeName());
      boolean foundErrorNodeInReplicaList = false;
      if (myReplicas != null) {
        for (ZkCoreNodeProps replicaProp : myReplicas) {
          if (((Replica) replicaProp.getNodeProps()).getName().equals(((Replica) stdNode.getNodeProps().getNodeProps()).getName())) {
            foundErrorNodeInReplicaList = true;
            break;
          }
        }
      }
      // If the client specified minRf and we didn't achieve the minRf, don't send recovery and let the client retry
      if (replicationTracker != null && replicationTracker.getAchievedRf() < replicationTracker.minRf) {
        continue;
      }
      if (leaderCoreNodeName != null && cloudDesc.getCoreNodeName().equals(leaderCoreNodeName) // we are still the same leader
          && foundErrorNodeInReplicaList // we found an error for one of our replicas
          && !stdNode.getNodeProps().getCoreUrl().equals(leaderProps.getCoreUrl())) { // we do not want to put ourself into LIR
        try {
          // if false, then the node is probably not "live" anymore
          // and we do not need to send a recovery message
          Throwable rootCause = SolrException.getRootCause(error.e);
          log.error("Setting up to try to start recovery on replica {}", replicaUrl, rootCause);
          zkController.ensureReplicaInLeaderInitiatedRecovery(req.getCore().getCoreContainer(), collection, shardId, stdNode.getNodeProps(), req.getCore().getCoreDescriptor(), false);
        } catch (Exception exc) {
          Throwable setLirZnodeFailedCause = SolrException.getRootCause(exc);
          log.error("Leader failed to set replica " + error.req.node.getUrl() + " state to DOWN due to: " + setLirZnodeFailedCause, setLirZnodeFailedCause);
        }
      } else {
        // not the leader anymore maybe, or the error'd node is not my replica?
        if (!foundErrorNodeInReplicaList) {
          log.warn("Core " + cloudDesc.getCoreNodeName() + " belonging to " + collection + " " + shardId + ", does not have error'd node " + stdNode.getNodeProps().getCoreUrl() + " as a replica. No request recovery command will be sent!");
        } else {
          log.warn("Core " + cloudDesc.getCoreNodeName() + " is no longer the leader for " + collection + " " + shardId + ", or we tried to put ourself into LIR; no request recovery command will be sent!");
        }
      }
    }
  }
  if (replicationTracker != null) {
    rsp.getResponseHeader().add(UpdateRequest.REPFACT, replicationTracker.getAchievedRf());
    rsp.getResponseHeader().add(UpdateRequest.MIN_REPFACT, replicationTracker.minRf);
    replicationTracker = null;
  }
  if (0 < errorsForClient.size()) {
    throw new DistributedUpdatesAsyncException(errorsForClient);
  }
}
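For orientation, here is a minimal hedged sketch (not from the Solr source; the collection and shard names are illustrative) showing what ZkCoreNodeProps adds on top of a raw Replica, which the method above leans on via getReplicaProps and the node's getUrl/getBaseUrl:

  import org.apache.solr.common.cloud.ClusterState;
  import org.apache.solr.common.cloud.Replica;
  import org.apache.solr.common.cloud.ZkCoreNodeProps;

  static void printReplicaUrls(ClusterState clusterState) {
    // pick any replica of shard1 of "collection1" (names are assumptions for this sketch)
    Replica replica = clusterState.getCollection("collection1").getSlice("shard1").getReplicas().iterator().next();
    ZkCoreNodeProps props = new ZkCoreNodeProps(replica); // Replica extends ZkNodeProps, so it can be wrapped directly
    System.out.println(props.getBaseUrl());  // e.g. http://host:8983/solr
    System.out.println(props.getCoreUrl());  // base URL + "/" + core name, the form used for replicaUrl above
    System.out.println(props.getNodeName()); // e.g. host:8983_solr
  }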
Use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.
Class DistributedUpdateProcessor, method getNodesByRoutingRules.
private List<Node> getNodesByRoutingRules(ClusterState cstate, DocCollection coll, String id, SolrInputDocument doc) {
  DocRouter router = coll.getRouter();
  List<Node> nodes = null;
  if (router instanceof CompositeIdRouter) {
    CompositeIdRouter compositeIdRouter = (CompositeIdRouter) router;
    String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
    Slice slice = coll.getSlice(myShardId);
    Map<String, RoutingRule> routingRules = slice.getRoutingRules();
    if (routingRules != null) {
      // delete by query case
      if (id == null) {
        for (Entry<String, RoutingRule> entry : routingRules.entrySet()) {
          String targetCollectionName = entry.getValue().getTargetCollectionName();
          Collection<Slice> activeSlices = cstate.getActiveSlices(targetCollectionName);
          if (activeSlices != null && !activeSlices.isEmpty()) {
            Slice any = activeSlices.iterator().next();
            if (nodes == null)
              nodes = new ArrayList<>();
            nodes.add(new StdNode(new ZkCoreNodeProps(any.getLeader())));
          }
        }
        return nodes;
      }
      String routeKey = SolrIndexSplitter.getRouteKey(id);
      if (routeKey != null) {
        RoutingRule rule = routingRules.get(routeKey + "!");
        if (rule != null) {
          if (!rule.isExpired()) {
            List<DocRouter.Range> ranges = rule.getRouteRanges();
            if (ranges != null && !ranges.isEmpty()) {
              int hash = compositeIdRouter.sliceHash(id, doc, null, coll);
              for (DocRouter.Range range : ranges) {
                if (range.includes(hash)) {
                  DocCollection targetColl = cstate.getCollection(rule.getTargetCollectionName());
                  Collection<Slice> activeSlices = targetColl.getRouter().getSearchSlicesSingle(id, null, targetColl);
                  if (activeSlices == null || activeSlices.isEmpty()) {
                    throw new SolrException(ErrorCode.SERVER_ERROR, "No active slices serving " + id + " found for target collection: " + rule.getTargetCollectionName());
                  }
                  Replica targetLeader = targetColl.getLeader(activeSlices.iterator().next().getName());
                  nodes = new ArrayList<>(1);
                  nodes.add(new StdNode(new ZkCoreNodeProps(targetLeader)));
                  break;
                }
              }
            }
          } else {
            ReentrantLock ruleExpiryLock = req.getCore().getRuleExpiryLock();
            if (!ruleExpiryLock.isLocked()) {
              try {
                if (ruleExpiryLock.tryLock(10, TimeUnit.MILLISECONDS)) {
                  log.info("Going to expire routing rule");
                  try {
                    Map<String, Object> map = Utils.makeMap(Overseer.QUEUE_OPERATION, OverseerAction.REMOVEROUTINGRULE.toLower(), ZkStateReader.COLLECTION_PROP, collection, ZkStateReader.SHARD_ID_PROP, myShardId, "routeKey", routeKey + "!");
                    SolrZkClient zkClient = req.getCore().getCoreContainer().getZkController().getZkClient();
                    DistributedQueue queue = Overseer.getStateUpdateQueue(zkClient);
                    queue.offer(Utils.toJSON(map));
                  } catch (KeeperException e) {
                    log.warn("Exception while removing routing rule for route key: " + routeKey, e);
                  } catch (Exception e) {
                    log.error("Exception while removing routing rule for route key: " + routeKey, e);
                  } finally {
                    ruleExpiryLock.unlock();
                  }
                }
              } catch (InterruptedException e) {
                Thread.currentThread().interrupt();
              }
            }
          }
        }
      }
    }
  }
  return nodes;
}
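The route-key branch above only fires for CompositeIdRouter ids of the form "shardKey!docId". As a hedged sketch with illustrative values (it assumes sliceHash tolerates null doc/params/collection when no route field is involved, which holds for plain composite ids):

  import org.apache.solr.common.cloud.CompositeIdRouter;

  CompositeIdRouter router = new CompositeIdRouter();
  // "tenantA!doc42" and "tenantA!doc43" share the high hash bits contributed by "tenantA",
  // so a routing rule stored under the key "tenantA!" can cover both via its hash ranges
  int h1 = router.sliceHash("tenantA!doc42", null, null, null);
  int h2 = router.sliceHash("tenantA!doc43", null, null, null);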
Use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.
Class DistributedUpdateProcessor, method doDeleteByQuery.
public void doDeleteByQuery(DeleteUpdateCommand cmd) throws IOException {
  // even in non zk mode, tests simulate updates from a leader
  if (!zkEnabled) {
    isLeader = getNonZkLeaderAssumption(req);
  } else {
    zkCheck();
  }
  // NONE: we are the first to receive this deleteByQuery
  //       - it must be forwarded to the leader of every shard
  // TO:   we are a leader receiving a forwarded deleteByQuery... we must:
  //       - block all updates (use VersionInfo)
  //       - flush *all* updates going to our replicas
  //       - forward the DBQ to our replicas and wait for the response
  //       - log + execute the local DBQ
  // FROM: we are a replica receiving a DBQ from our leader
  //       - log + execute the local DBQ
  DistribPhase phase = DistribPhase.parseParam(req.getParams().get(DISTRIB_UPDATE_PARAM));
  DocCollection coll = zkEnabled ? zkController.getClusterState().getCollection(collection) : null;
  if (zkEnabled && DistribPhase.NONE == phase) {
    // start off by assuming we are not a leader for any shard
    boolean leaderForAnyShard = false;
    ModifiableSolrParams outParams = new ModifiableSolrParams(filterParams(req.getParams()));
    outParams.set(DISTRIB_UPDATE_PARAM, DistribPhase.TOLEADER.toString());
    outParams.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
    SolrParams params = req.getParams();
    String route = params.get(ShardParams._ROUTE_);
    Collection<Slice> slices = coll.getRouter().getSearchSlices(route, params, coll);
    List<Node> leaders = new ArrayList<>(slices.size());
    for (Slice slice : slices) {
      String sliceName = slice.getName();
      Replica leader;
      try {
        leader = zkController.getZkStateReader().getLeaderRetry(collection, sliceName);
      } catch (InterruptedException e) {
        throw new SolrException(ErrorCode.SERVICE_UNAVAILABLE, "Exception finding leader for shard " + sliceName, e);
      }
      // TODO: What if leaders changed in the meantime?
      // Should we send out slice-at-a-time, and if a node returns "hey, I'm not a leader"
      // (or we get an error because it went down) then look up the new leader?
      // Am I the leader for this slice?
      ZkCoreNodeProps coreLeaderProps = new ZkCoreNodeProps(leader);
      String leaderCoreNodeName = leader.getName();
      String coreNodeName = req.getCore().getCoreDescriptor().getCloudDescriptor().getCoreNodeName();
      isLeader = coreNodeName.equals(leaderCoreNodeName);
      if (isLeader) {
        // don't forward to ourself
        leaderForAnyShard = true;
      } else {
        leaders.add(new RetryNode(coreLeaderProps, zkController.getZkStateReader(), collection, sliceName));
      }
    }
    // this will be distributed from the local commit
    outParams.remove("commit");
    cmdDistrib.distribDelete(cmd, leaders, outParams);
    if (!leaderForAnyShard) {
      return;
    }
    // change the phase to TOLEADER so we look up and forward to our own replicas (if any)
    phase = DistribPhase.TOLEADER;
  }
  List<Node> replicas = null;
  if (zkEnabled && DistribPhase.TOLEADER == phase) {
    // This core should be a leader
    isLeader = true;
    replicas = setupRequestForDBQ();
  } else if (DistribPhase.FROMLEADER == phase) {
    isLeader = false;
  }
  if (vinfo == null) {
    super.processDelete(cmd);
    return;
  }
  // at this point, there is an update we need to try and apply.
  // we may or may not be the leader.
  boolean isReplayOrPeersync = (cmd.getFlags() & (UpdateCommand.REPLAY | UpdateCommand.PEER_SYNC)) != 0;
  boolean leaderLogic = isLeader && !isReplayOrPeersync;
  versionDeleteByQuery(cmd);
  if (zkEnabled) {
    // forward to all replicas
    ModifiableSolrParams params = new ModifiableSolrParams(filterParams(req.getParams()));
    params.set(CommonParams.VERSION_FIELD, Long.toString(cmd.getVersion()));
    params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
    params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
    boolean someReplicas = false;
    boolean subShardLeader = false;
    try {
      subShardLeader = amISubShardLeader(coll, null, null, null);
      if (subShardLeader) {
        String myShardId = req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId();
        Replica leaderReplica = zkController.getZkStateReader().getLeaderRetry(collection, myShardId);
        // DBQ forwarded to NRT and TLOG replicas
        List<ZkCoreNodeProps> replicaProps = zkController.getZkStateReader().getReplicaProps(collection, myShardId, leaderReplica.getName(), null, Replica.State.DOWN, EnumSet.of(Replica.Type.NRT, Replica.Type.TLOG));
        if (replicaProps != null) {
          final List<Node> myReplicas = new ArrayList<>(replicaProps.size());
          for (ZkCoreNodeProps replicaProp : replicaProps) {
            myReplicas.add(new StdNode(replicaProp, collection, myShardId));
          }
          cmdDistrib.distribDelete(cmd, myReplicas, params);
          someReplicas = true;
        }
      }
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new ZooKeeperException(ErrorCode.SERVER_ERROR, "", e);
    }
    if (leaderLogic) {
      List<Node> subShardLeaders = getSubShardLeaders(coll, cloudDesc.getShardId(), null, null);
      if (subShardLeaders != null) {
        cmdDistrib.distribDelete(cmd, subShardLeaders, params, true);
      }
      final List<Node> nodesByRoutingRules = getNodesByRoutingRules(zkController.getClusterState(), coll, null, null);
      if (nodesByRoutingRules != null && !nodesByRoutingRules.isEmpty()) {
        params = new ModifiableSolrParams(filterParams(req.getParams()));
        params.set(DISTRIB_UPDATE_PARAM, DistribPhase.FROMLEADER.toString());
        params.set(DISTRIB_FROM, ZkCoreNodeProps.getCoreUrl(zkController.getBaseUrl(), req.getCore().getName()));
        params.set(DISTRIB_FROM_COLLECTION, req.getCore().getCoreDescriptor().getCloudDescriptor().getCollectionName());
        params.set(DISTRIB_FROM_SHARD, req.getCore().getCoreDescriptor().getCloudDescriptor().getShardId());
        cmdDistrib.distribDelete(cmd, nodesByRoutingRules, params, true);
      }
      if (replicas != null) {
        cmdDistrib.distribDelete(cmd, replicas, params);
        someReplicas = true;
      }
    }
    if (someReplicas) {
      cmdDistrib.blockAndDoRetries();
    }
  }
  if (returnVersions && rsp != null) {
    if (deleteByQueryResponse == null) {
      deleteByQueryResponse = new NamedList<String>(1);
      rsp.add("deleteByQuery", deleteByQueryResponse);
    }
    deleteByQueryResponse.add(cmd.getQuery(), cmd.getVersion());
  }
}
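A note on the DISTRIB_FROM value set repeatedly above: it is built with the static ZkCoreNodeProps.getCoreUrl helper, which joins a node's base URL and a core name. A minimal sketch (URL and core name are illustrative):

  String from = ZkCoreNodeProps.getCoreUrl("http://host:8983/solr", "collection1_shard1_replica_n1");
  // from == "http://host:8983/solr/collection1_shard1_replica_n1"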
Use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.
Class TestLeaderInitiatedRecoveryThread, method testPublishDownState.
@Test
public void testPublishDownState() throws Exception {
  waitForRecoveriesToFinish(true);
  final String leaderCoreNodeName = shardToLeaderJetty.get(SHARD1).coreNodeName;
  final CloudJettyRunner leaderRunner = shardToLeaderJetty.get(SHARD1);
  CoreContainer coreContainer = leaderRunner.jetty.getCoreContainer();
  ZkController zkController = coreContainer.getZkController();
  CloudJettyRunner notLeader = null;
  for (CloudJettyRunner cloudJettyRunner : shardToJetty.get(SHARD1)) {
    if (cloudJettyRunner != leaderRunner) {
      notLeader = cloudJettyRunner;
      break;
    }
  }
  assertNotNull(notLeader);
  Replica replica = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, notLeader.coreNodeName);
  ZkCoreNodeProps replicaCoreNodeProps = new ZkCoreNodeProps(replica);
  MockCoreDescriptor cd = new MockCoreDescriptor() {
    @Override
    public CloudDescriptor getCloudDescriptor() {
      return new CloudDescriptor(shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NAME_PROP), new Properties(), this) {
        @Override
        public String getCoreNodeName() {
          return shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NODE_NAME_PROP);
        }
        @Override
        public boolean isLeader() {
          return true;
        }
      };
    }
  };
  /*
   1. Test that publishDownState throws an exception when zkController.isReplicaInRecoveryHandling == false
   */
  try {
    LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
    assertFalse(zkController.isReplicaInRecoveryHandling(replicaCoreNodeProps.getCoreUrl()));
    thread.run();
    fail("publishDownState should not have succeeded because the replica url is not marked as in leader-initiated recovery in ZkController");
  } catch (SolrException e) {
    assertTrue(e.code() == SolrException.ErrorCode.INVALID_STATE.code);
  }
  /*
   2. Test that a non-live replica cannot be put into LIR or down state
   */
  LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
  // kill the replica
  int children = cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size();
  ChaosMonkey.stop(notLeader.jetty);
  TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    if (children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size()) {
      break;
    }
    Thread.sleep(500);
  }
  assertTrue(children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size());
  int cversion = getOverseerCversion();
  // the thread should not publish LIR and down state for a node which is not live, regardless of whether forcePublish is true or false
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  // assert that we did not publish anything to the overseer queue; the simplest way is to assert that the cversion of the overseer queue znode is still the same
  assertEquals(cversion, getOverseerCversion());
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  // assert that we did not publish anything to the overseer queue
  assertEquals(cversion, getOverseerCversion());
  /*
   3. Test that on ZK connection loss the thread does not attempt to publish the down state, even if forcePublish=true
   */
  ChaosMonkey.start(notLeader.jetty);
  waitForRecoveriesToFinish(true);
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.ConnectionLossException());
    }
  };
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  /*
   4. Test that on ZK session expiry the thread does not attempt to publish the down state, even if forcePublish=true
   */
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.SessionExpiredException());
    }
  };
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  /*
   5. Test that any exception other than ZK connection loss or session expiry publishes the down state only if forcePublish=true
   */
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "bogus exception");
    }
  };
  // the following should return true because, regardless of the bogus exception in setting LIR state, we still want recovery commands to be sent;
  // however, the following will not publish a down state
  cversion = getOverseerCversion();
  assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  // assert that we did not publish anything to the overseer queue; the simplest way is to assert that the cversion of the overseer queue znode is still the same
  assertEquals(cversion, getOverseerCversion());
  assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  // this should have published a down state, so assert that cversion has incremented
  assertTrue(getOverseerCversion() > cversion);
  timeOut = new TimeOut(30, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    Replica r = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName());
    if (r.getState() == Replica.State.DOWN) {
      break;
    }
    Thread.sleep(500);
  }
  assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  assertEquals(Replica.State.DOWN, cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName()).getState());
  /*
   6. Test that a non-leader cannot set LIR nodes
   */
  coreContainer = notLeader.jetty.getCoreContainer();
  zkController = coreContainer.getZkController();
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor()) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      try {
        super.updateLIRState(replicaCoreNodeName);
      } catch (Exception e) {
        assertTrue(e instanceof ZkController.NotLeaderException);
        throw e;
      }
    }
  };
  cversion = getOverseerCversion();
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  assertEquals(cversion, getOverseerCversion());
  /*
   7. Assert that we can write a LIR state if everything else is fine
   */
  // reset the zkController to the one from the leader
  coreContainer = leaderRunner.jetty.getCoreContainer();
  zkController = coreContainer.getZkController();
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor());
  thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false);
  timeOut = new TimeOut(30, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    Replica.State state = zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName());
    if (state == Replica.State.DOWN) {
      break;
    }
    Thread.sleep(500);
  }
  assertNotNull(zkController.getLeaderInitiatedRecoveryStateObject(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  assertEquals(Replica.State.DOWN, zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
}
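The getOverseerCversion helper used throughout is test scaffolding; conceptually it reads the cversion (the child-change counter) of the overseer state-update queue znode, so an unchanged cversion means nothing was offered to the queue. A hedged sketch of the underlying read, assuming a SolrZkClient handle and the standard /overseer/queue path:

  import org.apache.zookeeper.data.Stat;

  Stat stat = new Stat();
  zkClient.getData("/overseer/queue", null, stat, true); // fills stat as a side effect
  int cversion = stat.getCversion(); // bumps whenever children are added or removed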
Use of org.apache.solr.common.cloud.ZkCoreNodeProps in project lucene-solr by apache.
Class SyncStrategy, method syncWithReplicas.
private PeerSync.PeerSyncResult syncWithReplicas(ZkController zkController, SolrCore core, ZkNodeProps props, String collection, String shardId, boolean peerSyncOnlyWithActive) {
  List<ZkCoreNodeProps> nodes = zkController.getZkStateReader().getReplicaProps(collection, shardId, core.getCoreDescriptor().getCloudDescriptor().getCoreNodeName());
  if (nodes == null) {
    // I have no replicas
    return PeerSync.PeerSyncResult.success();
  }
  List<String> syncWith = new ArrayList<>(nodes.size());
  for (ZkCoreNodeProps node : nodes) {
    syncWith.add(node.getCoreUrl());
  }
  // if we can't reach a replica for sync, we still consider the overall sync a success
  // TODO: as an assurance, should we still try to tell the sync nodes that we couldn't reach
  // to recover once more?
  // Fingerprinting here is off because we currently rely on having at least one of the nodes return "true",
  // and if replicas are out of sync we still need to pick one as leader. A follow-up sync from the replica
  // to the new leader (with fingerprinting on) should then fail and initiate recovery-by-replication.
  PeerSync peerSync = new PeerSync(core, syncWith, core.getUpdateHandler().getUpdateLog().getNumRecordsToKeep(), true, true, peerSyncOnlyWithActive, false);
  return peerSync.sync();
}
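Callers typically branch on the returned PeerSyncResult; a minimal sketch of the consuming side (the fallback actions are illustrative, not SyncStrategy's actual logic):

  PeerSync.PeerSyncResult result = syncWithReplicas(zkController, core, props, collection, shardId, false);
  if (result.isSuccess()) {
    // proceed as leader: replicas either matched our updates or were unreachable
  } else {
    // e.g. rejoin the election or trigger recovery-by-replication
  }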