Search in sources :

Example 41 with TimeOut

use of org.apache.solr.util.TimeOut in project lucene-solr by apache.

the class DistributedUpdateProcessor method waitForDependentUpdates.

/**
 * Checks the update/transaction logs and index to find out whether the update ("previous update") that the
 * current in-place update depends on has already been applied. If it has not, this method waits on the
 * version bucket (bounded by a 5 second timeout) for the missing update to arrive. If the update still has
 * not arrived once the timeout expires, the full update is actively fetched from the leader and applied.
 *
 * @param cmd the in-place update whose {@code prevVersion} is being waited for
 * @param versionOnUpdate logged in place of {@code cmd.getVersion()} when that is 0, and passed along when
 *        fetching the full update from the leader
 * @param isReplayOrPeersync only used for debug logging in this method
 * @param bucket the version bucket whose monitor is held, and waited on, while versions are looked up
 * @return -1 if the current in-place update should be dropped; otherwise the last found version if the
 *         previous update has been indexed, or the version of the update fetched from the leader
 * @throws IOException declared by this method; presumably raised while fetching or applying the missing
 *         update (not established by the code visible here)
 * @throws RuntimeException wrapping the {@link InterruptedException} if the thread is interrupted while
 *         waiting; the thread's interrupt status is restored before throwing
 */
private long waitForDependentUpdates(AddUpdateCommand cmd, long versionOnUpdate, boolean isReplayOrPeersync, VersionBucket bucket) throws IOException {
    long lastFoundVersion = 0;
    TimeOut waitTimeout = new TimeOut(5, TimeUnit.SECONDS);
    vinfo.lockForUpdate();
    try {
        synchronized (bucket) {
            Long lookedUpVersion = vinfo.lookupVersion(cmd.getIndexedId());
            lastFoundVersion = lookedUpVersion == null ? 0L : lookedUpVersion;
            if (Math.abs(lastFoundVersion) < cmd.prevVersion) {
                log.debug("Re-ordered inplace update. version={}, prevVersion={}, lastVersion={}, replayOrPeerSync={}, id={}", (cmd.getVersion() == 0 ? versionOnUpdate : cmd.getVersion()), cmd.prevVersion, lastFoundVersion, isReplayOrPeersync, cmd.getPrintableId());
            }
            // Keep re-checking the version until the dependent update shows up or the timeout expires.
            while (Math.abs(lastFoundVersion) < cmd.prevVersion && !waitTimeout.hasTimedOut()) {
                try {
                    long timeLeft = waitTimeout.timeLeft(TimeUnit.MILLISECONDS);
                    if (timeLeft > 0) {
                        // wait(0) waits forever until notified, but we don't want that.
                        bucket.wait(timeLeft);
                    }
                } catch (InterruptedException ie) {
                    // Restore the interrupt status so code further up the stack can still observe it.
                    Thread.currentThread().interrupt();
                    throw new RuntimeException(ie);
                }
                lookedUpVersion = vinfo.lookupVersion(cmd.getIndexedId());
                lastFoundVersion = lookedUpVersion == null ? 0L : lookedUpVersion;
            }
        }
    } finally {
        vinfo.unlockForUpdate();
    }
    if (Math.abs(lastFoundVersion) > cmd.prevVersion) {
        // A newer version is already present locally, so we can drop the current update.
        if (log.isDebugEnabled()) {
            log.debug("Update was applied on version: {}, but last version I have is: {}. Current update should be dropped. id={}", cmd.prevVersion, lastFoundVersion, cmd.getPrintableId());
        }
        return -1;
    } else if (Math.abs(lastFoundVersion) == cmd.prevVersion) {
        // A non-positive version with a matching magnitude is treated as a delete, which must not happen here.
        assert 0 < lastFoundVersion : "prevVersion " + cmd.prevVersion + " found but is a delete!";
        if (log.isDebugEnabled()) {
            log.debug("Dependent update found. id={}", cmd.getPrintableId());
        }
        return lastFoundVersion;
    }
    // We have waited enough, but the dependent update didn't arrive. It's time to actively fetch it from the leader.
    log.info("Missing update, on which current in-place update depends on, hasn't arrived. id={}, looking for version={}, last found version={}", cmd.getPrintableId(), cmd.prevVersion, lastFoundVersion);
    UpdateCommand missingUpdate = fetchFullUpdateFromLeader(cmd, versionOnUpdate);
    if (missingUpdate instanceof DeleteUpdateCommand) {
        log.info("Tried to fetch document {} from the leader, but the leader says document has been deleted. Deleting the document here and skipping this update: Last found version: {}, was looking for: {}", cmd.getPrintableId(), lastFoundVersion, cmd.prevVersion);
        versionDelete((DeleteUpdateCommand) missingUpdate);
        return -1;
    } else {
        assert missingUpdate instanceof AddUpdateCommand;
        AddUpdateCommand fetchedAdd = (AddUpdateCommand) missingUpdate;
        log.debug("Fetched the document: {}", fetchedAdd.getSolrInputDocument());
        versionAdd(fetchedAdd);
        log.info("Added the fetched document, id={}, version={}", fetchedAdd.getPrintableId(), missingUpdate.getVersion());
    }
    return missingUpdate.getVersion();
}
Also used : TimeOut(org.apache.solr.util.TimeOut) DeleteUpdateCommand(org.apache.solr.update.DeleteUpdateCommand) CommitUpdateCommand(org.apache.solr.update.CommitUpdateCommand) AddUpdateCommand(org.apache.solr.update.AddUpdateCommand) UpdateCommand(org.apache.solr.update.UpdateCommand) DeleteUpdateCommand(org.apache.solr.update.DeleteUpdateCommand) AddUpdateCommand(org.apache.solr.update.AddUpdateCommand)

Example 42 with TimeOut

use of org.apache.solr.util.TimeOut in project lucene-solr by apache.

the class TestLeaderInitiatedRecoveryThread method testPublishDownState.

/**
 * Exercises {@code LeaderInitiatedRecoveryThread.publishDownState} (and, in scenario 1, {@code run()})
 * against a live cluster through a sequence of numbered scenarios. The scenarios share state (the chosen
 * non-leader replica, the current {@code zkController}/{@code coreContainer}, and the overseer queue
 * cversion), so their order matters.
 *
 * @throws Exception if any cluster, ZooKeeper or recovery operation fails
 */
public void testPublishDownState() throws Exception {
    waitForRecoveriesToFinish(true);
    final String leaderCoreNodeName = shardToLeaderJetty.get(SHARD1).coreNodeName;
    final CloudJettyRunner leaderRunner = shardToLeaderJetty.get(SHARD1);
    CoreContainer coreContainer = leaderRunner.jetty.getCoreContainer();
    ZkController zkController = coreContainer.getZkController();
    // Pick any SHARD1 runner that is not the leader; it is the replica targeted by every scenario below.
    CloudJettyRunner notLeader = null;
    for (CloudJettyRunner cloudJettyRunner : shardToJetty.get(SHARD1)) {
        if (cloudJettyRunner != leaderRunner) {
            notLeader = cloudJettyRunner;
            break;
        }
    }
    assertNotNull(notLeader);
    Replica replica = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, notLeader.coreNodeName);
    ZkCoreNodeProps replicaCoreNodeProps = new ZkCoreNodeProps(replica);
    // Mock core descriptor whose cloud descriptor reports the SHARD1 leader's core node name and isLeader() == true.
    MockCoreDescriptor cd = new MockCoreDescriptor() {

        public CloudDescriptor getCloudDescriptor() {
            return new CloudDescriptor(shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NAME_PROP), new Properties(), this) {

                @Override
                public String getCoreNodeName() {
                    return shardToLeaderJetty.get(SHARD1).info.getStr(ZkStateReader.CORE_NODE_NAME_PROP);
                }

                @Override
                public boolean isLeader() {
                    return true;
                }
            };
        }
    };
    /*
     1. Test that publishDownState throws exception when zkController.isReplicaInRecoveryHandling == false
      */
    try {
        LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
        assertFalse(zkController.isReplicaInRecoveryHandling(replicaCoreNodeProps.getCoreUrl()));
        thread.run();
        fail("publishDownState should not have succeeded because replica url is not marked in leader initiated recovery in ZkController");
    } catch (SolrException e) {
        assertTrue(e.code() == SolrException.ErrorCode.INVALID_STATE.code);
    }
    /*
     2. Test that a non-live replica cannot be put into LIR or down state
      */
    LeaderInitiatedRecoveryThread thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd);
    // kill the replica
    int children = cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size();
    ChaosMonkey.stop(notLeader.jetty);
    // Poll /live_nodes every 500 ms, for up to 60 seconds, until its child count drops below the count taken before the stop.
    TimeOut timeOut = new TimeOut(60, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
        if (children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size()) {
            break;
        }
        Thread.sleep(500);
    }
    assertTrue(children > cloudClient.getZkStateReader().getZkClient().getChildren("/live_nodes", null, true).size());
    int cversion = getOverseerCversion();
    // Thread should not publish LIR and down state for node which is not live, regardless of whether forcePublish is true or false
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    // lets assert that we did not publish anything to overseer queue, simplest way is to assert that cversion of overseer queue zk node is still the same
    assertEquals(cversion, getOverseerCversion());
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    // lets assert that we did not publish anything to overseer queue
    assertEquals(cversion, getOverseerCversion());
    /*
    3. Test that if ZK connection loss then thread should not attempt to publish down state even if forcePublish=true
     */
    // Bring the replica back before simulating failures inside updateLIRState.
    ChaosMonkey.start(notLeader.jetty);
    waitForRecoveriesToFinish(true);
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.ConnectionLossException());
        }
    };
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    /*
     4. Test that if ZK session expired then thread should not attempt to publish down state even if forcePublish=true
      */
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "", new KeeperException.SessionExpiredException());
        }
    };
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    /*
     5. Test that any exception other than ZK connection loss or session expired should publish down state only if forcePublish=true
      */
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, cd) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, "bogus exception");
        }
    };
    // the following should return true because regardless of the bogus exception in setting LIR state, we still want recovery commands to be sent,
    // however the following will not publish a down state
    cversion = getOverseerCversion();
    assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    // lets assert that we did not publish anything to overseer queue, simplest way is to assert that cversion of overseer queue zk node is still the same
    assertEquals(cversion, getOverseerCversion());
    assertTrue(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), true));
    // this should have published a down state so assert that cversion has incremented
    assertTrue(getOverseerCversion() > cversion);
    // Poll the cluster state every 500 ms, for up to 30 seconds, until the replica is reported DOWN.
    timeOut = new TimeOut(30, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
        Replica r = cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName());
        if (r.getState() == Replica.State.DOWN) {
            break;
        }
        Thread.sleep(500);
    }
    assertNull(zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    assertEquals(Replica.State.DOWN, cloudClient.getZkStateReader().getClusterState().getReplica(DEFAULT_COLLECTION, replica.getName()).getState());
    /*
    6. Test that non-leader cannot set LIR nodes
     */
    // Switch to the non-leader's container and controller, and use one of its real core descriptors instead of the mock.
    coreContainer = notLeader.jetty.getCoreContainer();
    zkController = coreContainer.getZkController();
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor()) {

        @Override
        protected void updateLIRState(String replicaCoreNodeName) {
            try {
                super.updateLIRState(replicaCoreNodeName);
            } catch (Exception e) {
                // Any failure here must be a NotLeaderException; it is rethrown unchanged after the check.
                assertTrue(e instanceof ZkController.NotLeaderException);
                throw e;
            }
        }
    };
    cversion = getOverseerCversion();
    assertFalse(thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false));
    assertEquals(cversion, getOverseerCversion());
    /*
     7. assert that we can write a LIR state if everything else is fine
      */
    // reset the zkcontroller to the one from the leader
    coreContainer = leaderRunner.jetty.getCoreContainer();
    zkController = coreContainer.getZkController();
    thread = new LeaderInitiatedRecoveryThread(zkController, coreContainer, DEFAULT_COLLECTION, SHARD1, replicaCoreNodeProps, 1, coreContainer.getCores().iterator().next().getCoreDescriptor());
    thread.publishDownState(replicaCoreNodeProps.getCoreName(), replica.getName(), replica.getNodeName(), replicaCoreNodeProps.getCoreUrl(), false);
    // Poll the LIR state every 500 ms, for up to 30 seconds, until it becomes DOWN.
    timeOut = new TimeOut(30, TimeUnit.SECONDS);
    while (!timeOut.hasTimedOut()) {
        Replica.State state = zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName());
        if (state == Replica.State.DOWN) {
            break;
        }
        Thread.sleep(500);
    }
    assertNotNull(zkController.getLeaderInitiatedRecoveryStateObject(DEFAULT_COLLECTION, SHARD1, replica.getName()));
    assertEquals(Replica.State.DOWN, zkController.getLeaderInitiatedRecoveryState(DEFAULT_COLLECTION, SHARD1, replica.getName()));
/*
    NOTE(review): an unfinished, duplicate-numbered "7. Test that" placeholder was left here; no further scenario is implemented.
     */
}
Also used : ZkCoreNodeProps(org.apache.solr.common.cloud.ZkCoreNodeProps) TimeOut(org.apache.solr.util.TimeOut) Properties(java.util.Properties) Replica(org.apache.solr.common.cloud.Replica) SolrException(org.apache.solr.common.SolrException) KeeperException(org.apache.zookeeper.KeeperException) CoreContainer(org.apache.solr.core.CoreContainer) MockCoreDescriptor(org.apache.solr.util.MockCoreContainer.MockCoreDescriptor) SolrException(org.apache.solr.common.SolrException)

Example 43 with TimeOut

use of org.apache.solr.util.TimeOut in project lucene-solr by apache.

the class TestPullReplica method testAddDocs.

/**
 * Creates a one-shard collection with between one and three PULL replicas, adds and commits a single
 * document, and verifies that:
 * <ul>
 *   <li>the leader sees exactly one document,</li>
 *   <li>every PULL replica eventually sees exactly one document, and</li>
 *   <li>each PULL replica's update handler reports zero processed adds.</li>
 * </ul>
 * One {@code TimeOut} of {@code REPLICATION_TIMEOUT_SECS} seconds is shared by all replicas, so the bound
 * applies to the whole replica loop rather than to each replica individually.
 *
 * @throws Exception if any cluster or client operation fails
 */
@SuppressWarnings("unchecked")
public void testAddDocs() throws Exception {
    int numReadOnlyReplicas = 1 + random().nextInt(3);
    CollectionAdminRequest.createCollection(collectionName, "conf", 1, 1, 0, numReadOnlyReplicas).setMaxShardsPerNode(100).process(cluster.getSolrClient());
    waitForState("Expected collection to be created with 1 shard and " + (numReadOnlyReplicas + 1) + " replicas", collectionName, clusterShape(1, numReadOnlyReplicas + 1));
    DocCollection docCollection = assertNumberOfReplicas(1, 0, numReadOnlyReplicas, false, true);
    assertEquals(1, docCollection.getSlices().size());
    cluster.getSolrClient().add(collectionName, new SolrInputDocument("id", "1", "foo", "bar"));
    cluster.getSolrClient().commit(collectionName);
    Slice s = docCollection.getSlices().iterator().next();
    try (HttpSolrClient leaderClient = getHttpSolrClient(s.getLeader().getCoreUrl())) {
        assertEquals(1, leaderClient.query(new SolrQuery("*:*")).getResults().getNumFound());
    }
    // Started once, before the loop: the timeout covers all replicas together.
    TimeOut t = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS);
    for (Replica r : s.getReplicas(EnumSet.of(Replica.Type.PULL))) {
        //TODO: assert replication < REPLICATION_TIMEOUT_SECS
        try (HttpSolrClient readOnlyReplicaClient = getHttpSolrClient(r.getCoreUrl())) {
            // Retry the count assertion every 100 ms until it passes; once the timeout has expired the failure is rethrown.
            while (true) {
                try {
                    // The message reports the configured timeout instead of a hard-coded "10 seconds".
                    assertEquals("Replica " + r.getName() + " not up to date after " + REPLICATION_TIMEOUT_SECS + " seconds", 1, readOnlyReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound());
                    break;
                } catch (AssertionError e) {
                    if (t.hasTimedOut()) {
                        throw e;
                    } else {
                        Thread.sleep(100);
                    }
                }
            }
            SolrQuery req = new SolrQuery("qt", "/admin/plugins", "stats", "true");
            QueryResponse statsResponse = readOnlyReplicaClient.query(req);
            // Drill down to the update handler statistics and check the replica never handled the add itself.
            NamedList<Object> pluginStats = (NamedList<Object>) statsResponse.getResponse();
            Map<String, Object> updateHandlerStats = (Map<String, Object>) pluginStats.findRecursive("plugins", "UPDATE", "updateHandler", "stats");
            assertEquals("Replicas shouldn't process the add document request: " + statsResponse, 0L, updateHandlerStats.get("UPDATE.updateHandler.adds"));
        }
    }
    assertUlogPresence(docCollection);
}
Also used : TimeOut(org.apache.solr.util.TimeOut) NamedList(org.apache.solr.common.util.NamedList) Replica(org.apache.solr.common.cloud.Replica) SolrQuery(org.apache.solr.client.solrj.SolrQuery) HttpSolrClient(org.apache.solr.client.solrj.impl.HttpSolrClient) SolrInputDocument(org.apache.solr.common.SolrInputDocument) Slice(org.apache.solr.common.cloud.Slice) QueryResponse(org.apache.solr.client.solrj.response.QueryResponse) DocCollection(org.apache.solr.common.cloud.DocCollection)

Example 44 with TimeOut

use of org.apache.solr.util.TimeOut in project lucene-solr by apache.

the class TestPullReplicaErrorHandling method setupCluster.

/**
 * One-time setup for the test class: configures a 4-node cluster with the "cloud-minimal" configset,
 * puts a {@code SocketProxy} in front of every Jetty runner (restarting each runner so the proxy port is
 * applied), and then sets the {@code LEGACY_CLOUD} cluster property to "false", retrying on
 * {@code SolrServerException} until a 10 second timeout expires.
 *
 * @throws Exception if the cluster cannot be configured or the cluster property cannot be set in time
 */
@BeforeClass
public static void setupCluster() throws Exception {
    // We'll be explicit about this in this test
    TestInjection.waitForReplicasInSync = null;
    configureCluster(4).addConfig("conf", configset("cloud-minimal")).configure();

    // Add proxies: both lookup maps are keyed by the proxy URL.
    final int runnerCount = cluster.getJettySolrRunners().size();
    proxies = new HashMap<>(runnerCount);
    jettys = new HashMap<>(runnerCount);
    for (JettySolrRunner runner : cluster.getJettySolrRunners()) {
        SocketProxy socketProxy = new SocketProxy();
        runner.setProxyPort(socketProxy.getListenPort());
        //TODO: Can we avoid this restart
        cluster.stopJettySolrRunner(runner);
        cluster.startJettySolrRunner(runner);
        socketProxy.open(runner.getBaseUrl().toURI());
        LOG.info("Adding proxy for URL: " + runner.getBaseUrl() + ". Proxy: " + socketProxy.getUrl());
        proxies.put(socketProxy.getUrl(), socketProxy);
        jettys.put(socketProxy.getUrl(), runner);
    }

    // Keep retrying the cluster property request; a failure is only rethrown once the timeout has expired.
    final TimeOut retryDeadline = new TimeOut(10, TimeUnit.SECONDS);
    boolean legacyCloudDisabled = false;
    while (!legacyCloudDisabled) {
        try {
            CollectionAdminResponse response = CollectionAdminRequest.setClusterProperty(ZkStateReader.LEGACY_CLOUD, "false").process(cluster.getSolrClient());
            assertEquals(0, response.getStatus());
            legacyCloudDisabled = true;
        } catch (SolrServerException e) {
            Thread.sleep(50);
            if (retryDeadline.hasTimedOut()) {
                throw e;
            }
        }
    }
}
Also used : CollectionAdminResponse(org.apache.solr.client.solrj.response.CollectionAdminResponse) JettySolrRunner(org.apache.solr.client.solrj.embedded.JettySolrRunner) TimeOut(org.apache.solr.util.TimeOut) SolrServerException(org.apache.solr.client.solrj.SolrServerException) CollectionAdminRequest(org.apache.solr.client.solrj.request.CollectionAdminRequest) BeforeClass(org.junit.BeforeClass)

Example 45 with TimeOut

use of org.apache.solr.util.TimeOut in project lucene-solr by apache.

the class TestPullReplicaErrorHandling method testCantConnectToPullReplica.

//  @Repeat(iterations=10)
/**
 * Closes the proxy in front of the first slice's PULL replica, keeps adding documents while checking the
 * count on the leader, verifies that querying the PULL replica directly fails, and checks that the replica
 * counts are still as expected. After the proxy is reopened, the PULL replica must report 20 documents.
 *
 * @throws Exception if any cluster or client operation fails
 */
public void testCantConnectToPullReplica() throws Exception {
    final int numShards = 2;
    CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1, 0, 1).setMaxShardsPerNode(1).process(cluster.getSolrClient());
    addDocs(10);
    DocCollection docCollection = assertNumberOfReplicas(numShards, 0, numShards, false, true);
    Slice firstSlice = docCollection.getSlices().iterator().next();
    SocketProxy pullReplicaProxy = getProxyForReplica(firstSlice.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0));
    try {
        pullReplicaProxy.close();
        for (int extra = 1; extra <= 10; extra++) {
            int expectedDocs = 10 + extra;
            addDocs(expectedDocs);
            try (HttpSolrClient leaderClient = getHttpSolrClient(firstSlice.getLeader().getCoreUrl())) {
                assertNumDocs(expectedDocs, leaderClient);
            }
        }
        try (HttpSolrClient pullReplicaClient = getHttpSolrClient(firstSlice.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) {
            // The result is deliberately ignored: the query itself is expected to fail while the proxy is closed.
            pullReplicaClient.query(new SolrQuery("*:*")).getResults().getNumFound();
            fail("Shouldn't be able to query the pull replica");
        } catch (SolrServerException e) {
            //expected
        }
        // Replica should still be active, since it doesn't disconnect from ZooKeeper
        assertNumberOfReplicas(numShards, 0, numShards, true, true);

        // Poll the collection every 200 ms until at least 20 documents are visible or the timeout expires.
        long visibleDocs = 0;
        TimeOut replicationDeadline = new TimeOut(REPLICATION_TIMEOUT_SECS, TimeUnit.SECONDS);
        for (;;) {
            if (visibleDocs >= 20 || replicationDeadline.hasTimedOut()) {
                break;
            }
            Thread.sleep(200);
            visibleDocs = cluster.getSolrClient().query(collectionName, new SolrQuery("*:*")).getResults().getNumFound();
        }
    } finally {
        pullReplicaProxy.reopen();
    }
    try (HttpSolrClient pullReplicaClient = getHttpSolrClient(firstSlice.getReplicas(EnumSet.of(Replica.Type.PULL)).get(0).getCoreUrl())) {
        assertNumDocs(20, pullReplicaClient);
    }
}
Also used : HttpSolrClient(org.apache.solr.client.solrj.impl.HttpSolrClient) Slice(org.apache.solr.common.cloud.Slice) TimeOut(org.apache.solr.util.TimeOut) SolrServerException(org.apache.solr.client.solrj.SolrServerException) DocCollection(org.apache.solr.common.cloud.DocCollection) SolrQuery(org.apache.solr.client.solrj.SolrQuery)

Aggregations

TimeOut (org.apache.solr.util.TimeOut)48 SolrException (org.apache.solr.common.SolrException)15 Slice (org.apache.solr.common.cloud.Slice)15 DocCollection (org.apache.solr.common.cloud.DocCollection)14 Replica (org.apache.solr.common.cloud.Replica)13 SolrQuery (org.apache.solr.client.solrj.SolrQuery)11 ZkStateReader (org.apache.solr.common.cloud.ZkStateReader)8 ModifiableSolrParams (org.apache.solr.common.params.ModifiableSolrParams)8 HashMap (java.util.HashMap)7 Test (org.junit.Test)7 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)6 SolrInputDocument (org.apache.solr.common.SolrInputDocument)6 ZkNodeProps (org.apache.solr.common.cloud.ZkNodeProps)6 NamedList (org.apache.solr.common.util.NamedList)6 HttpSolrClient (org.apache.solr.client.solrj.impl.HttpSolrClient)5 Map (java.util.Map)4 SolrServerException (org.apache.solr.client.solrj.SolrServerException)4 Collections.singletonList (java.util.Collections.singletonList)3 HashSet (java.util.HashSet)3