Use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
The class AbstractDistribZkTestBase, method assertAllActive:
protected static void assertAllActive(String collection, ZkStateReader zkStateReader)
    throws KeeperException, InterruptedException {
  zkStateReader.forceUpdateCollection(collection);
  ClusterState clusterState = zkStateReader.getClusterState();
  Map<String, Slice> slices = clusterState.getSlicesMap(collection);
  if (slices == null) {
    throw new IllegalArgumentException("Cannot find collection:" + collection);
  }
  for (Map.Entry<String, Slice> entry : slices.entrySet()) {
    Slice slice = entry.getValue();
    if (slice.getState() != Slice.State.ACTIVE) {
      fail("Not all shards are ACTIVE - found a shard " + slice.getName() + " that is: " + slice.getState());
    }
    Map<String, Replica> shards = slice.getReplicasMap();
    for (Map.Entry<String, Replica> shard : shards.entrySet()) {
      Replica replica = shard.getValue();
      if (replica.getState() != Replica.State.ACTIVE) {
        fail("Not all replicas are ACTIVE - found a replica " + replica.getName() + " that is: " + replica.getState());
      }
    }
  }
}
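The assertion walks the ClusterState top-down: collection to Slice (shard) to Replica, failing on the first non-ACTIVE state it meets. The same traversal can be packaged as a helper that gathers all offenders before failing, which gives a more useful message when several replicas are down at once. A minimal sketch using the same API; the helper name collectNonActiveReplicas is ours, not Solr's:

protected static List<String> collectNonActiveReplicas(String collection, ZkStateReader zkStateReader)
    throws KeeperException, InterruptedException {
  zkStateReader.forceUpdateCollection(collection);
  List<String> nonActive = new ArrayList<>();
  Map<String, Slice> slices = zkStateReader.getClusterState().getSlicesMap(collection);
  if (slices == null) {
    throw new IllegalArgumentException("Cannot find collection:" + collection);
  }
  for (Slice slice : slices.values()) {
    for (Replica replica : slice.getReplicas()) {
      if (replica.getState() != Replica.State.ACTIVE) {
        // record "replicaName (state)" so the caller can make one aggregate assertion
        nonActive.add(replica.getName() + " (" + replica.getState() + ")");
      }
    }
  }
  return nonActive;
}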
Use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
The class DistributedVersionInfoTest, method testReplicaVersionHandling:
@Test
public void testReplicaVersionHandling() throws Exception {
  final String shardId = "shard1";
  CollectionAdminRequest.createCollection(COLLECTION, "conf", 1, 3)
      .processAndWait(cluster.getSolrClient(), DEFAULT_TIMEOUT);
  final ZkStateReader stateReader = cluster.getSolrClient().getZkStateReader();
  stateReader.waitForState(COLLECTION, DEFAULT_TIMEOUT, TimeUnit.SECONDS,
      (n, c) -> DocCollection.isFullyActive(n, c, 1, 3));
  final Replica leader = stateReader.getLeaderRetry(COLLECTION, shardId);
  // start by reloading the empty collection so we try to calculate the max from an empty index
  reloadCollection(leader, COLLECTION);
  sendDoc(1);
  cluster.getSolrClient().commit(COLLECTION);
  // verify doc is on the leader and replica
  final List<Replica> notLeaders = stateReader.getClusterState().getCollection(COLLECTION)
      .getReplicas().stream()
      .filter(r -> r.getCoreName().equals(leader.getCoreName()) == false)
      .collect(Collectors.toList());
  assertDocsExistInAllReplicas(leader, notLeaders, COLLECTION, 1, 1, null);
  // get max version from the leader and replica
  Replica replica = notLeaders.get(0);
  Long maxOnLeader = getMaxVersionFromIndex(leader);
  Long maxOnReplica = getMaxVersionFromIndex(replica);
  assertEquals("leader and replica should have same max version: " + maxOnLeader, maxOnLeader, maxOnReplica);
  // send the same doc but with a lower version than the max in the index
  try (SolrClient client = getHttpSolrClient(replica.getCoreUrl())) {
    String docId = String.valueOf(1);
    SolrInputDocument doc = new SolrInputDocument();
    doc.setField("id", docId);
    // bad version!!!
    doc.setField("_version_", maxOnReplica - 1);
    // simulate what the leader does when sending a doc to a replica
    ModifiableSolrParams params = new ModifiableSolrParams();
    params.set(DISTRIB_UPDATE_PARAM, DistributedUpdateProcessor.DistribPhase.FROMLEADER.toString());
    params.set(DISTRIB_FROM, leader.getCoreUrl());
    UpdateRequest req = new UpdateRequest();
    req.setParams(params);
    req.add(doc);
    log.info("Sending doc with out-of-date version (" + (maxOnReplica - 1) + ") directly to replica");
    client.request(req);
    client.commit();
    Long docVersion = getVersionFromIndex(replica, docId);
    assertEquals("older version should have been thrown away", maxOnReplica, docVersion);
  }
  reloadCollection(leader, COLLECTION);
  maxOnLeader = getMaxVersionFromIndex(leader);
  maxOnReplica = getMaxVersionFromIndex(replica);
  assertEquals("leader and replica should have same max version after reload", maxOnLeader, maxOnReplica);
  // now start sending docs while collection is reloading
  delQ("*:*");
  commit();
  final Set<Integer> deletedDocs = new HashSet<>();
  final AtomicInteger docsSent = new AtomicInteger(0);
  final Random rand = new Random(5150);
  Thread docSenderThread = new Thread() {
    public void run() {
      // brief delay before sending docs
      try {
        Thread.sleep(rand.nextInt(30) + 1);
      } catch (InterruptedException e) {
      }
      for (int i = 0; i < 1000; i++) {
        if (i % (rand.nextInt(20) + 1) == 0) {
          try {
            Thread.sleep(rand.nextInt(50) + 1);
          } catch (InterruptedException e) {
          }
        }
        int docId = i + 1;
        try {
          sendDoc(docId);
          docsSent.incrementAndGet();
        } catch (Exception e) {
        }
      }
    }
  };
  Thread reloaderThread = new Thread() {
    public void run() {
      try {
        Thread.sleep(rand.nextInt(300) + 1);
      } catch (InterruptedException e) {
      }
      for (int i = 0; i < 3; i++) {
        try {
          reloadCollection(leader, COLLECTION);
        } catch (Exception e) {
        }
        try {
          Thread.sleep(rand.nextInt(300) + 300);
        } catch (InterruptedException e) {
        }
      }
    }
  };
  Thread deleteThread = new Thread() {
    public void run() {
      // brief delay before deleting docs
      try {
        Thread.sleep(500);
      } catch (InterruptedException e) {
      }
      for (int i = 0; i < 200; i++) {
        try {
          Thread.sleep(rand.nextInt(50) + 1);
        } catch (InterruptedException e) {
        }
        int ds = docsSent.get();
        if (ds > 0) {
          int docToDelete = rand.nextInt(ds) + 1;
          if (!deletedDocs.contains(docToDelete)) {
            delI(String.valueOf(docToDelete));
            deletedDocs.add(docToDelete);
          }
        }
      }
    }
  };
  Thread committerThread = new Thread() {
    public void run() {
      try {
        Thread.sleep(rand.nextInt(200) + 1);
      } catch (InterruptedException e) {
      }
      for (int i = 0; i < 20; i++) {
        try {
          cluster.getSolrClient().commit(COLLECTION);
        } catch (Exception e) {
        }
        try {
          Thread.sleep(rand.nextInt(100) + 100);
        } catch (InterruptedException e) {
        }
      }
    }
  };
  docSenderThread.start();
  reloaderThread.start();
  committerThread.start();
  deleteThread.start();
  docSenderThread.join();
  reloaderThread.join();
  committerThread.join();
  deleteThread.join();
  cluster.getSolrClient().commit(COLLECTION);
  log.info("Total of " + deletedDocs.size() + " docs deleted");
  maxOnLeader = getMaxVersionFromIndex(leader);
  maxOnReplica = getMaxVersionFromIndex(replica);
  assertEquals("leader and replica should have same max version before reload", maxOnLeader, maxOnReplica);
  reloadCollection(leader, COLLECTION);
  maxOnLeader = getMaxVersionFromIndex(leader);
  maxOnReplica = getMaxVersionFromIndex(replica);
  assertEquals("leader and replica should have same max version after reload", maxOnLeader, maxOnReplica);
  assertDocsExistInAllReplicas(leader, notLeaders, COLLECTION, 1, 1000, deletedDocs);
}
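The key move above is impersonating the leader: setting DISTRIB_UPDATE_PARAM to FROMLEADER makes the replica apply leader-to-replica version checks, so the stale _version_ is silently discarded rather than rejected. The contrast is ordinary client traffic, where a positive _version_ that does not exactly match the stored one triggers Solr's optimistic concurrency and the update fails with a version conflict. A minimal sketch of that client-side behavior, assuming a SolrClient named client pointed at the same collection (the literal version value is illustrative):

// Sketch: the same stale _version_ sent as normal client traffic is rejected, not dropped.
SolrInputDocument doc = new SolrInputDocument();
doc.setField("id", "1");
doc.setField("_version_", 1L); // positive but wrong: optimistic concurrency requires an exact match
try {
  client.add(COLLECTION, doc);
  fail("expected a version conflict");
} catch (SolrException e) {
  assertEquals(SolrException.ErrorCode.CONFLICT.code, e.code()); // HTTP 409
}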
Use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
The class HttpPartitionTest, method testMinRf:
protected void testMinRf() throws Exception {
  // create a collection that has 1 shard and 3 replicas
  String testCollectionName = "collMinRf_1x3";
  createCollection(testCollectionName, 1, 3, 1);
  cloudClient.setDefaultCollection(testCollectionName);
  sendDoc(1, 2);
  List<Replica> notLeaders = ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
  assertTrue("Expected 2 non-leader replicas for collection " + testCollectionName + " but found "
      + notLeaders.size() + "; clusterState: " + printClusterStateInfo(testCollectionName),
      notLeaders.size() == 2);
  assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 1);
  // Now introduce a network partition between the leader and 1 replica, so a minRf of 2 is still achieved
  SocketProxy proxy0 = getProxyForReplica(notLeaders.get(0));
  proxy0.close();
  // indexing during a partition
  int achievedRf = sendDoc(2, 2);
  assertEquals("Unexpected achieved replication factor", 2, achievedRf);
  Thread.sleep(sleepMsBeforeHealPartition);
  // Verify that the partitioned replica is DOWN
  ZkStateReader zkr = cloudClient.getZkStateReader();
  // force the state to be fresh
  zkr.forceUpdateCollection(testCollectionName);
  ClusterState cs = zkr.getClusterState();
  Collection<Slice> slices = cs.getActiveSlices(testCollectionName);
  Slice slice = slices.iterator().next();
  Replica partitionedReplica = slice.getReplica(notLeaders.get(0).getName());
  assertEquals("The partitioned replica did not get marked down",
      Replica.State.DOWN.toString(), partitionedReplica.getStr(ZkStateReader.STATE_PROP));
  proxy0.reopen();
  notLeaders = ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
  // Since minRf is achieved, we expect recovery, so we expect to see 2 documents
  assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 2);
  // Now introduce a network partition between the leader and both of its replicas, so a minRf of 2 is NOT achieved
  proxy0 = getProxyForReplica(notLeaders.get(0));
  proxy0.close();
  SocketProxy proxy1 = getProxyForReplica(notLeaders.get(1));
  proxy1.close();
  achievedRf = sendDoc(3, 2);
  assertEquals("Unexpected achieved replication factor", 1, achievedRf);
  Thread.sleep(sleepMsBeforeHealPartition);
  // Verify that the partitioned replicas are NOT DOWN since minRf wasn't achieved
  ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, 1);
  proxy0.reopen();
  proxy1.reopen();
  notLeaders = ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 3, maxWaitSecsToSeeAllActive);
  // Check that doc 3 is on the leader but not on the notLeaders
  Replica leader = cloudClient.getZkStateReader().getLeaderRetry(testCollectionName, "shard1", 10000);
  try (HttpSolrClient leaderSolr = getHttpSolrClient(leader, testCollectionName)) {
    assertDocExists(leaderSolr, testCollectionName, "3");
  }
  for (Replica notLeader : notLeaders) {
    try (HttpSolrClient notLeaderSolr = getHttpSolrClient(notLeader, testCollectionName)) {
      assertDocNotExists(notLeaderSolr, testCollectionName, "3");
    }
  }
  // Retry sending doc 3
  achievedRf = sendDoc(3, 2);
  assertEquals("Unexpected achieved replication factor", 3, achievedRf);
  // Now doc 3 should be on all replicas
  assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 3);
}
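Here sendDoc(docId, minRf) asks Solr to report the achieved replication factor alongside the update, which is what the achievedRf assertions check. A minimal sketch of what such a helper can look like, assuming a CloudSolrClient named cloudClient with a default collection set; the actual helper lives in the test's base class and may differ:

int sendDoc(int docId, int minRf) throws Exception {
  UpdateRequest up = new UpdateRequest();
  up.setParam(UpdateRequest.MIN_REPFACT, String.valueOf(minRf)); // ask Solr to report the achieved rf
  SolrInputDocument doc = new SolrInputDocument();
  doc.setField("id", String.valueOf(docId));
  up.add(doc);
  NamedList<Object> response = cloudClient.request(up);
  // CloudSolrClient reduces the per-shard rf values in the response to a single minimum
  return cloudClient.getMinAchievedReplicationFactor(cloudClient.getDefaultCollection(), response);
}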
Use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
The class HttpPartitionTest, method testRf2:
protected void testRf2() throws Exception {
  // create a collection that has 1 shard but 2 replicas
  String testCollectionName = "c8n_1x2";
  createCollectionRetry(testCollectionName, 1, 2, 1);
  cloudClient.setDefaultCollection(testCollectionName);
  sendDoc(1);
  Replica notLeader = ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 2, maxWaitSecsToSeeAllActive).get(0);
  // ok, now introduce a network partition between the leader and the replica
  SocketProxy proxy = getProxyForReplica(notLeader);
  proxy.close();
  // indexing during a partition
  sendDoc(2);
  // Have the partition last at least 1 sec. While this gives the impression that recovery
  // is timing related, it is really only to give time for the state to be written to ZK
  // before the test completes. In other words, without a brief pause the test finishes so
  // quickly that it doesn't give the recovery process time to kick in.
  Thread.sleep(sleepMsBeforeHealPartition);
  proxy.reopen();
  List<Replica> notLeaders = ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 2, maxWaitSecsToSeeAllActive);
  sendDoc(3);
  // sent 3 docs in so far, verify they are on the leader and replica
  assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, 3);
  // Get the max version from the replica core to make sure it gets updated after recovery (see SOLR-7625)
  JettySolrRunner replicaJetty = getJettyOnPort(getReplicaPort(notLeader));
  CoreContainer coreContainer = replicaJetty.getCoreContainer();
  ZkCoreNodeProps replicaCoreNodeProps = new ZkCoreNodeProps(notLeader);
  String coreName = replicaCoreNodeProps.getCoreName();
  Long maxVersionBefore = null;
  try (SolrCore core = coreContainer.getCore(coreName)) {
    assertNotNull("Core '" + coreName + "' not found for replica: " + notLeader.getName(), core);
    UpdateLog ulog = core.getUpdateHandler().getUpdateLog();
    maxVersionBefore = ulog.getCurrentMaxVersion();
  }
  assertNotNull("max version bucket seed not set for core " + coreName, maxVersionBefore);
  log.info("Looked up max version bucket seed " + maxVersionBefore + " for core " + coreName);
  // now up the stakes and do more docs
  int numDocs = TEST_NIGHTLY ? 1000 : 100;
  boolean hasPartition = false;
  for (int d = 0; d < numDocs; d++) {
    // create / restore partition every 10 docs
    if (d % 10 == 0) {
      if (hasPartition) {
        proxy.reopen();
        hasPartition = false;
      } else {
        if (d >= 10) {
          proxy.close();
          hasPartition = true;
          Thread.sleep(sleepMsBeforeHealPartition);
        }
      }
    }
    // 4 is the offset as we've already indexed docs 1-3
    sendDoc(d + 4);
  }
  // restore connectivity if lost
  if (hasPartition) {
    proxy.reopen();
  }
  notLeaders = ensureAllReplicasAreActive(testCollectionName, "shard1", 1, 2, maxWaitSecsToSeeAllActive);
  try (SolrCore core = coreContainer.getCore(coreName)) {
    assertNotNull("Core '" + coreName + "' not found for replica: " + notLeader.getName(), core);
    Long currentMaxVersion = core.getUpdateHandler().getUpdateLog().getCurrentMaxVersion();
    log.info("After recovery, looked up NEW max version bucket seed " + currentMaxVersion
        + " for core " + coreName + ", was: " + maxVersionBefore);
    assertTrue("max version bucket seed not updated after recovery!", currentMaxVersion > maxVersionBefore);
  }
  // verify all docs received
  assertDocsExistInAllReplicas(notLeaders, testCollectionName, 1, numDocs + 3);
  log.info("testRf2 succeeded ... deleting the " + testCollectionName + " collection");
  // try to clean up
  attemptCollectionDelete(cloudClient, testCollectionName);
}
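ZkCoreNodeProps is the standard way to turn a Replica, which is a property map under the hood, into addressable pieces like a core name or URL. A short sketch of the accessors involved, assuming any Replica instance; the example values in comments are illustrative:

ZkCoreNodeProps props = new ZkCoreNodeProps(replica);
String coreName = props.getCoreName();  // e.g. "c8n_1x2_shard1_replica2"
String baseUrl  = props.getBaseUrl();   // the node's Solr base URL
String coreUrl  = props.getCoreUrl();   // base URL plus the core name
String nodeName = props.getNodeName();  // the live_nodes entry this replica lives on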
Use of org.apache.solr.common.cloud.Replica in project lucene-solr by apache.
The class DeleteInactiveReplicaTest, method deleteInactiveReplicaTest:
@Test
public void deleteInactiveReplicaTest() throws Exception {
  String collectionName = "delDeadColl";
  int replicationFactor = 2;
  int numShards = 2;
  int maxShardsPerNode = (((numShards + 1) * replicationFactor) / cluster.getJettySolrRunners().size()) + 1;
  CollectionAdminRequest.createCollection(collectionName, "conf", numShards, replicationFactor)
      .setMaxShardsPerNode(maxShardsPerNode)
      .process(cluster.getSolrClient());
  waitForState("Expected a cluster of 2 shards and 2 replicas", collectionName, (n, c) -> {
    return DocCollection.isFullyActive(n, c, numShards, replicationFactor);
  });
  DocCollection collectionState = getCollectionState(collectionName);
  Slice shard = getRandomShard(collectionState);
  Replica replica = getRandomReplica(shard);
  JettySolrRunner jetty = cluster.getReplicaJetty(replica);
  cluster.stopJettySolrRunner(jetty);
  waitForState("Expected replica " + replica.getName() + " on down node to be removed from cluster state",
      collectionName, (n, c) -> {
        Replica r = c.getReplica(replica.getCoreName());
        return r == null || r.getState() != Replica.State.ACTIVE;
      });
  log.info("Removing replica {}/{} ", shard.getName(), replica.getName());
  CollectionAdminRequest.deleteReplica(collectionName, shard.getName(), replica.getName())
      .process(cluster.getSolrClient());
  waitForState("Expected deleted replica " + replica.getName() + " to be removed from cluster state",
      collectionName, (n, c) -> {
        return c.getReplica(replica.getCoreName()) == null;
      });
  cluster.startJettySolrRunner(jetty);
  log.info("restarted jetty");
  CoreContainer cc = jetty.getCoreContainer();
  CoreContainer.CoreLoadFailure loadFailure = cc.getCoreInitFailures().get(replica.getCoreName());
  assertNotNull("Deleted core was still loaded!", loadFailure);
  assertTrue("Unexpected load failure message: " + loadFailure.exception.getMessage(),
      loadFailure.exception.getMessage().contains("does not exist in shard"));
  // Check that we can't create a core with no coreNodeName
  try (SolrClient queryClient = getHttpSolrClient(jetty.getBaseUrl().toString())) {
    Exception e = expectThrows(Exception.class, () -> {
      CoreAdminRequest.Create createRequest = new CoreAdminRequest.Create();
      createRequest.setCoreName("testcore");
      createRequest.setCollection(collectionName);
      createRequest.setShardId("shard2");
      queryClient.request(createRequest);
    });
    assertTrue("Unexpected error message: " + e.getMessage(), e.getMessage().contains("coreNodeName missing"));
  }
}
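The waitForState calls above take a predicate over (liveNodes, DocCollection) that is re-evaluated on every cluster-state change until it returns true or the wait times out. The same pattern is available directly on ZkStateReader. A minimal standalone sketch, assuming a MiniSolrCloudCluster named cluster; the core node name "core_node2" is illustrative:

ZkStateReader reader = cluster.getSolrClient().getZkStateReader();
// Block until the named replica is gone from the collection's state, or throw on timeout.
reader.waitForState("delDeadColl", 30, TimeUnit.SECONDS,
    (liveNodes, collectionState) -> collectionState != null
        && collectionState.getReplica("core_node2") == null);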