Use of org.apache.solr.util.TimeOut in the lucene-solr project by Apache.
Class DistributedUpdateProcessor, method waitForDependentUpdates.
/**
 * Waits for the update that the current in-place update depends on (the "previous update",
 * identified by {@code cmd.prevVersion}) and, if it does not show up in time, fetches the full
 * document from the leader.
 *
 * <p>The last known version of the document is looked up through {@code vinfo} while holding the
 * update lock and the {@code bucket} monitor. As long as the absolute value of that version is
 * smaller than {@code cmd.prevVersion}, this method waits on {@code bucket} (at most 5 seconds in
 * total) and looks the version up again after every wake-up.
 *
 * @param cmd the in-place update whose dependency is being resolved
 * @param versionOnUpdate version that is logged when {@code cmd.getVersion()} is 0, and that is
 *        passed on to {@code fetchFullUpdateFromLeader}
 * @param isReplayOrPeersync only used in the debug log message
 * @param bucket the version bucket whose monitor is synchronized on and waited on
 * @return {@code -1} if the current in-place update should be dropped (a newer version is already
 *         present, or the leader reported the document as deleted); the last found version if the
 *         previous update has already been indexed; otherwise the version of the update that was
 *         fetched from the leader and applied here.
 * @throws IOException declared by this method; presumably propagated from the fetch/apply helpers
 * @throws RuntimeException wrapping the {@link InterruptedException} if the thread is interrupted
 *         while waiting; the thread's interrupt status is restored before throwing
 */
private long waitForDependentUpdates(AddUpdateCommand cmd, long versionOnUpdate, boolean isReplayOrPeersync, VersionBucket bucket) throws IOException {
  long lastFoundVersion = 0;
  TimeOut waitTimeout = new TimeOut(5, TimeUnit.SECONDS);

  vinfo.lockForUpdate();
  try {
    synchronized (bucket) {
      Long lookedUpVersion = vinfo.lookupVersion(cmd.getIndexedId());
      lastFoundVersion = lookedUpVersion == null ? 0L : lookedUpVersion;

      if (Math.abs(lastFoundVersion) < cmd.prevVersion) {
        log.debug("Re-ordered inplace update. version={}, prevVersion={}, lastVersion={}, replayOrPeerSync={}, id={}",
            (cmd.getVersion() == 0 ? versionOnUpdate : cmd.getVersion()), cmd.prevVersion, lastFoundVersion,
            isReplayOrPeersync, cmd.getPrintableId());
      }

      while (Math.abs(lastFoundVersion) < cmd.prevVersion && !waitTimeout.hasTimedOut()) {
        try {
          long timeLeft = waitTimeout.timeLeft(TimeUnit.MILLISECONDS);
          if (timeLeft > 0) {
            // wait(0) waits forever until notified, but we don't want that.
            bucket.wait(timeLeft);
          }
        } catch (InterruptedException ie) {
          // Restore the interrupt status so code further up the stack can still observe it;
          // wrapping the exception alone would silently clear the flag.
          Thread.currentThread().interrupt();
          throw new RuntimeException(ie);
        }
        lookedUpVersion = vinfo.lookupVersion(cmd.getIndexedId());
        lastFoundVersion = lookedUpVersion == null ? 0L : lookedUpVersion;
      }
    }
  } finally {
    vinfo.unlockForUpdate();
  }

  if (Math.abs(lastFoundVersion) > cmd.prevVersion) {
    // A newer version than the one this update was based on is already here: drop the current update.
    if (log.isDebugEnabled()) {
      log.debug("Update was applied on version: {}, but last version I have is: {}. Current update should be dropped. id={}",
          cmd.prevVersion, lastFoundVersion, cmd.getPrintableId());
    }
    return -1;
  } else if (Math.abs(lastFoundVersion) == cmd.prevVersion) {
    // Absolute values match; a non-positive stored version here would mean the match is a delete.
    assert 0 < lastFoundVersion : "prevVersion " + cmd.prevVersion + " found but is a delete!";
    if (log.isDebugEnabled()) {
      log.debug("Dependent update found. id={}", cmd.getPrintableId());
    }
    return lastFoundVersion;
  }

  // We have waited long enough, but the dependent update didn't arrive. It's time to actively fetch it from the leader.
  log.info("Missing update, on which current in-place update depends on, hasn't arrived. id={}, looking for version={}, last found version={}",
      cmd.getPrintableId(), cmd.prevVersion, lastFoundVersion);
  UpdateCommand missingUpdate = fetchFullUpdateFromLeader(cmd, versionOnUpdate);

  if (missingUpdate instanceof DeleteUpdateCommand) {
    log.info("Tried to fetch document {} from the leader, but the leader says document has been deleted. "
        + "Deleting the document here and skipping this update: Last found version: {}, was looking for: {}",
        cmd.getPrintableId(), lastFoundVersion, cmd.prevVersion);
    versionDelete((DeleteUpdateCommand) missingUpdate);
    return -1;
  }

  assert missingUpdate instanceof AddUpdateCommand;
  AddUpdateCommand fetchedAdd = (AddUpdateCommand) missingUpdate;
  log.debug("Fetched the document: {}", fetchedAdd.getSolrInputDocument());
  versionAdd(fetchedAdd);
  log.info("Added the fetched document, id={}, version={}", fetchedAdd.getPrintableId(), missingUpdate.getVersion());
  return missingUpdate.getVersion();
}
Use of org.apache.solr.util.TimeOut in the lucene-solr project by Apache.
Class TestLeaderInitiatedRecoveryThread, method testPublishDownState.
/**
 * Exercises {@code LeaderInitiatedRecoveryThread} (mostly its {@code publishDownState} method) for a
 * non-leader replica of SHARD1 in DEFAULT_COLLECTION under a sequence of numbered scenarios.
 *
 * <p>The scenarios share state (the replica is stopped and restarted, {@code thread},
 * {@code coreContainer} and {@code zkController} are reassigned along the way), so they depend on
 * running in exactly this order.
 */
public void testPublishDownState() throws Exception {
  waitForRecoveriesToFinish(true);
  // NOTE(review): leaderCoreNodeName is not referenced again in this method.
  final String leaderCoreNodeName = shardToLeaderJetty.get(SHARD1).coreNodeName;
  final CloudJettyRunner leaderRunner = shardToLeaderJetty.get(SHARD1);
  CoreContainer coreContainer = leaderRunner.jetty.getCoreContainer();
  ZkController zkController = coreContainer.getZkController();
  // Pick the first SHARD1 runner that is not the leader; it is the replica under test.
  CloudJettyRunner notLeader = null;
  for (CloudJettyRunner cloudJettyRunner : shardToJetty.get(SHARD1)) {
    if (cloudJettyRunner != leaderRunner) {
      notLeader = cloudJettyRunner;
      break;
    }
  }
  assertNotNull(notLeader);
  Replica replica = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, notLeader.coreNodeName);
  ZkCoreNodeProps replicaCoreNodeProps = new ZkCoreNodeProps(replica);
  // Core descriptor stub: its cloud descriptor takes core name and core node name from the
  // SHARD1 leader's info and always reports itself as leader.
  MockCoreDescriptor cd = new MockCoreDescriptor() {
    public CloudDescriptor getCloudDescriptor() {
      return new CloudDescriptor(shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NAME_PROP), new Properties(), this) {
        @Override
        public String getCoreNodeName() {
          return shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NODE_NAME_PROP);
        }
        @Override
        public boolean isLeader() {
          return true;
        }
      };
    }
  };
  /*
  1. Test that publishDownState throws exception when zkController.isReplicaInRecoveryHandling == false
  */
  try {
    LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
    assertFalse(zkController.isReplicaInRecoveryHandling(replicaCoreNodeProps.getCoreUrl()));
    // run() is invoked directly on the calling thread here, so the exception surfaces in this try block.
    thread.run();
    fail("publishDownState should not have succeeded because replica url is not marked in leader initiated recovery in ZkController");
  } catch (SolrException e) {
    assertTrue(e.code() == SolrException.ErrorCode.INVALID_STATE.code);
  }
  /*
  2. Test that a non-live replica cannot be put into LIR or down state
  */
  LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
  // kill the replica
  // Remember how many entries /live_nodes had before the stop so the drop can be detected below.
  int children = cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size();
  ChaosMonkey.stop(notLeader.jetty);
  // Poll for up to 60 seconds until /live_nodes has fewer children than before the stop.
  TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    if (children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size()) {
      break;
    }
    Thread.sleep(500);
  }
  // Fails the test if the loop above ended by timeout rather than by observing the drop.
  assertTrue(children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size());
  int cversion = getOverseerCversion();
  // Thread should not publish LIR and down state for node which is not live, regardless of whether forcePublish is true or false
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  // lets assert that we did not publish anything to overseer queue, simplest way is to assert that cversion of overseer queue zk node is still the same
  assertEquals(cversion, getOverseerCversion());
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  // lets assert that we did not publish anything to overseer queue
  assertEquals(cversion, getOverseerCversion());
  /*
  3. Test that if ZK connection loss then thread should not attempt to publish down state even if forcePublish=true
  */
  // Bring the replica back and wait for recoveries before the remaining scenarios.
  ChaosMonkey.start(notLeader.jetty);
  waitForRecoveriesToFinish(true);
  // updateLIRState is overridden to throw a SolrException caused by ConnectionLossException.
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.ConnectionLossException());
    }
  };
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  /*
  4. Test that if ZK connection loss or session expired then thread should not attempt to publish down state even if forcePublish=true
  */
  // Same as scenario 3, but the cause is SessionExpiredException.
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.SessionExpiredException());
    }
  };
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  /*
  5. Test that any exception other then ZK connection loss or session expired should publish down state only if forcePublish=true
  */
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "bogus exception");
    }
  };
  // the following should return true because regardless of the bogus exception in setting LIR state, we still want recovery commands to be sent,
  // however the following will not publish a down state
  cversion = getOverseerCversion();
  assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  // lets assert that we did not publish anything to overseer queue, simplest way is to assert that cversion of overseer queue zk node is still the same
  assertEquals(cversion, getOverseerCversion());
  assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
  // this should have published a down state so assert that cversion has incremented
  assertTrue(getOverseerCversion() > cversion);
  // Poll the cluster state for up to 30 seconds until the replica is reported DOWN.
  timeOut = new TimeOut(30, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    Replica r = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName());
    if (r.getState() == Replica.State.DOWN) {
      break;
    }
    Thread.sleep(500);
  }
  assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  assertEquals(Replica.State.DOWN, cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName()).getState());
  /*
  6. Test that non-leader cannot set LIR nodes
  */
  // Switch to the non-leader's core container / ZkController and use the descriptor of the
  // first core returned by that container.
  coreContainer = notLeader.jetty.getCoreContainer();
  zkController = coreContainer.getZkController();
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor()) {
    @Override
    protected void updateLIRState(String replicaCoreNodeName) {
      try {
        super.updateLIRState(replicaCoreNodeName);
      } catch (Exception e) {
        // Any exception thrown by the real implementation must be a NotLeaderException; it is rethrown unchanged.
        assertTrue(e instanceof ZkController.NotLeaderException);
        throw e;
      }
    }
  };
  cversion = getOverseerCversion();
  assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
  assertEquals(cversion, getOverseerCversion());
  /*
  7. assert that we can write a LIR state if everything else is fine
  */
  // reset the zkcontroller to the one from the leader
  coreContainer = leaderRunner.jetty.getCoreContainer();
  zkController = coreContainer.getZkController();
  thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor());
  // The return value is not checked here; the LIR state is verified by polling below.
  thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false);
  // Poll for up to 30 seconds until the LIR state for the replica reads DOWN.
  timeOut = new TimeOut(30, TimeUnit.SECONDS);
  while (!timeOut.hasTimedOut()) {
    Replica.State state = zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName());
    if (state == Replica.State.DOWN) {
      break;
    }
    Thread.sleep(500);
  }
  assertNotNull(zkController.getLeaderInitiatedRecoveryStateObject(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  assertEquals(Replica.State.DOWN, zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
  /*
  8. Unfinished placeholder: the original comment here read "7. Test that" and no further
     scenario is implemented.
  */
}
Use of org.apache.solr.util.TimeOut in the lucene-solr project by Apache.
Class TestPullReplica, method testAddDocs.
/**
 * Creates a single-shard collection with one NRT replica and between 1 and 3 PULL replicas, adds
 * and commits one document, and checks that:
 * <ul>
 *   <li>the leader sees exactly one document,</li>
 *   <li>every PULL replica sees exactly one document before the shared
 *       {@code REPLICATION_TIMEOUT_SECS} budget expires, and</li>
 *   <li>the update handler stats of every PULL replica report zero adds.</li>
 * </ul>
 *
 * @throws Exception on any failure talking to the cluster, or if interrupted while polling
 */
@SuppressWarnings("unchecked")
public void testAddDocs() throws Exception {
  int numReadOnlyReplicas = 1 + random().nextInt(3);
  CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numReadOnlyReplicas)
      .setMaxShardsPerNode(100)
      .process(cluster.getSolrClient());
  waitForState("Expected collection to be created with 1 shard and " + (numReadOnlyReplicas + 1) + " replicas",
      collectionName, clusterShape(1, numReadOnlyReplicas + 1));
  DocCollection docCollection = assertNumberOfReplicas(1, 0, numReadOnlyReplicas, false, true);
  assertEquals(1, docCollection.getSlices().size());

  cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar"));
  cluster.getSolrClient().commit(collectionName);

  Slice s = docCollection.getSlices().iterator().next();
  try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) {
    assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound());
  }

  // A single timeout is shared by all PULL replicas: the budget covers the whole loop, not each replica.
  TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS);
  for (Replica r : s.getReplicas(EnumSet.of(Replica.Type.PULL))) {
    //TODO: assert replication < REPLICATION_TIMEOUT_SECS
    try (HttpSolrClient readOnlyReplicaClient = getHttpSolrClient(r.getCoreUrl())) {
      // Poll the replica directly instead of catching AssertionError as a retry signal; the
      // single assertion below reports the last observed count once the document is visible
      // or the timeout has elapsed.
      long numFound = readOnlyReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound();
      while (numFound != 1 && !t.hasTimedOut()) {
        Thread.sleep(100);
        numFound = readOnlyReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound();
      }
      // The message reports the configured timeout rather than a hard-coded "10 seconds".
      assertEquals("Replica " + r.getName() + " not up to date after " + REPLICATION_TIMEOUT_SECS + " seconds", 1, numFound);

      SolrQuery req = new SolrQuery("qt", "/admin/plugins", "stats", "true");
      QueryResponse statsResponse = readOnlyReplicaClient.query(req);
      assertEquals("Replicas shouldn't process the add document request: " + statsResponse,
          0L,
          ((Map<String, Object>) ((NamedList<Object>) statsResponse.getResponse())
              .findRecursive("plugins", "UPDATE", "updateHandler", "stats")).get("UPDATE.updateHandler.adds"));
    }
  }
  assertUlogPresence(docCollection);
}
Use of org.apache.solr.util.TimeOut in the lucene-solr project by Apache.
Class TestPullReplicaErrorHandling, method setupCluster.
/**
 * One-time cluster setup: configures a 4-node cluster with the "cloud-minimal" configset, puts a
 * {@code SocketProxy} in front of every Jetty (restarting each Jetty after its proxy port is set),
 * and then sets the {@code LEGACY_CLOUD} cluster property to "false", retrying on
 * {@code SolrServerException} until a 10 second timeout has passed.
 *
 * @throws Exception on setup failure, or the last {@code SolrServerException} once the timeout expired
 */
@BeforeClass
public static void setupCluster() throws Exception {
  // We'll be explicit about this in this test
  TestInjection.waitForReplicasInSync = null;
  configureCluster(4)
      .addConfig("conf", configset("cloud-minimal"))
      .configure();

  // Add proxies; both maps are keyed by the proxy URL.
  proxies = new HashMap<>(cluster.getJettySolrRunners().size());
  jettys = new HashMap<>(cluster.getJettySolrRunners().size());
  for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
    SocketProxy socketProxy = new SocketProxy();
    runner.setProxyPort(socketProxy.getListenPort());
    //TODO: Can we avoid this restart
    cluster.stopJettySolrRunner(runner);
    cluster.startJettySolrRunner(runner);
    socketProxy.open(runner.getBaseUrl().toURI());
    LOG.info("Adding proxy for URL: " + runner.getBaseUrl() + ". Proxy: " + socketProxy.getUrl());
    proxies.put(socketProxy.getUrl(), socketProxy);
    jettys.put(socketProxy.getUrl(), runner);
  }

  // Keep retrying the cluster property request until it goes through or the deadline passes.
  TimeOut deadline = new TimeOut(10, TimeUnit.SECONDS);
  boolean propertySet = false;
  while (!propertySet) {
    try {
      CollectionAdminRequest.ClusterProp setLegacyCloud =
          CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false");
      CollectionAdminResponse rsp = setLegacyCloud.process(cluster.getSolrClient());
      assertEquals(0, rsp.getStatus());
      propertySet = true;
    } catch (SolrServerException e) {
      // Back off first, then give up with the original exception if we are out of time.
      Thread.sleep(50);
      if (deadline.hasTimedOut()) {
        throw e;
      }
    }
  }
}
Use of org.apache.solr.util.TimeOut in the lucene-solr project by Apache.
Class TestPullReplicaErrorHandling, method testCantConnectToPullReplica.
/**
 * Closes the socket proxy in front of a PULL replica of the first slice, keeps indexing through
 * the cluster while checking the document count on the leader, and verifies that the PULL replica
 * cannot be queried directly but is still counted as an active replica. After the proxy is
 * reopened, the PULL replica is expected to report 20 documents.
 *
 * @throws Exception on any failure talking to the cluster, or if interrupted while polling
 */
// @Repeat(iterations=10)
public void testCantConnectToPullReplica() throws Exception {
  final int numShards = 2;
  CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1)
      .setMaxShardsPerNode(1)
      .process(cluster.getSolrClient());
  addDocs(10);
  DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true);
  Slice s = docCollection.getSlices().iterator().next();
  SocketProxy pullReplicaProxy = getProxyForReplica(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0));
  try {
    pullReplicaProxy.close();

    // addDocs is called with 11, 12, ..., 20; after each call the leader must report that many docs.
    for (int expectedDocs = 11; expectedDocs <= 20; expectedDocs++) {
      addDocs(expectedDocs);
      try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) {
        assertNumDocs(expectedDocs, leaderClient);
      }
    }

    try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) {
      pullReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound();
      fail("Shouldn't be able to query the pull replica");
    } catch (SolrServerException expected) {
      // expected: the proxy in front of the pull replica is closed
    }

    // Replica should still be active, since it doesn't disconnect from ZooKeeper
    assertNumberOfReplicas(numShards, 0, numShards, true, true);

    // Best-effort wait: poll through the cluster client until at least 20 docs are reported or
    // the timeout expires. The outcome is intentionally not asserted here.
    TimeOut deadline = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS);
    for (long docsSeen = 0; docsSeen < 20 && !deadline.hasTimedOut(); ) {
      Thread.sleep(200);
      docsSeen = cluster.getSolrClient().query(collectionName, new SolrQuery("*:*")).getResults().getNumFound();
    }
  } finally {
    pullReplicaProxy.reopen();
  }

  try (HttpSolrClient pullReplicaClient = getHttpSolrClient(s.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) {
    assertNumDocs(20, pullReplicaClient);
  }
}
Aggregations