Use of com.codahale.metrics.Metric in project lucene-solr by apache.
The class TestRecovery, method testLogReplay.
@Test
public void testLogReplay() throws Exception {
  try {
    DirectUpdateHandler2.commitOnClose = false;
    final Semaphore logReplay = new Semaphore(0);
    final Semaphore logReplayFinish = new Semaphore(0);
    UpdateLog.testing_logReplayHook = () -> {
      try {
        assertTrue(logReplay.tryAcquire(timeout, TimeUnit.SECONDS));
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    };
    UpdateLog.testing_logReplayFinishHook = () -> logReplayFinish.release();
    clearIndex();
    assertU(commit());
    Deque<Long> versions = new ArrayDeque<>();
    versions.addFirst(addAndGetVersion(sdoc("id", "A1"), null));
    versions.addFirst(addAndGetVersion(sdoc("id", "A11"), null));
    versions.addFirst(addAndGetVersion(sdoc("id", "A12"), null));
    versions.addFirst(deleteByQueryAndGetVersion("id:A11", null));
    versions.addFirst(addAndGetVersion(sdoc("id", "A13"), null));
    // atomic update
    versions.addFirst(addAndGetVersion(sdoc("id", "A12", "val_i_dvo", map("set", 1)), null));
    // in-place update
    versions.addFirst(addAndGetVersion(sdoc("id", "A12", "val_i_dvo", map("set", 2)), null));
    assertJQ(req("q", "*:*"), "/response/numFound==0");
    assertJQ(req("qt", "/get", "getVersions", "" + versions.size()), "/versions==" + versions);
    h.close();
    createCore();
    // live map view
    Map<String, Metric> metrics = getMetrics();
    // Solr should kick this off now
    // h.getCore().getUpdateHandler().getUpdateLog().recoverFromLog();
    // verify that previous close didn't do a commit
    // recovery should be blocked by our hook
    assertJQ(req("q", "*:*"), "/response/numFound==0");
    // make sure we can still access versions after a restart
    assertJQ(req("qt", "/get", "getVersions", "" + versions.size()), "/versions==" + versions);
    assertEquals(UpdateLog.State.REPLAYING, h.getCore().getUpdateHandler().getUpdateLog().getState());
    // check metrics
    Gauge<Integer> state = (Gauge<Integer>) metrics.get("TLOG.state");
    assertEquals(UpdateLog.State.REPLAYING.ordinal(), state.getValue().intValue());
    Gauge<Integer> replayingLogs = (Gauge<Integer>) metrics.get("TLOG.replay.remaining.logs");
    assertTrue(replayingLogs.getValue().intValue() > 0);
    Gauge<Long> replayingDocs = (Gauge<Long>) metrics.get("TLOG.replay.remaining.bytes");
    assertTrue(replayingDocs.getValue().longValue() > 0);
    Meter replayDocs = (Meter) metrics.get("TLOG.replay.ops");
    long initialOps = replayDocs.getCount();
    // unblock recovery
    logReplay.release(1000);
    // make sure we can still access versions during recovery
    assertJQ(req("qt", "/get", "getVersions", "" + versions.size()), "/versions==" + versions);
    // wait until recovery has finished
    assertTrue(logReplayFinish.tryAcquire(timeout, TimeUnit.SECONDS));
    // assert that in-place update is retained
    assertJQ(req("q", "val_i_dvo:2"), "/response/numFound==1");
    assertJQ(req("q", "*:*"), "/response/numFound==3");
    assertEquals(7L, replayDocs.getCount() - initialOps);
    assertEquals(UpdateLog.State.ACTIVE.ordinal(), state.getValue().intValue());
    // make sure we can still access versions after recovery
    assertJQ(req("qt", "/get", "getVersions", "" + versions.size()), "/versions==" + versions);
    assertU(adoc("id", "A2"));
    assertU(adoc("id", "A3"));
    assertU(delI("A2"));
    assertU(adoc("id", "A4"));
    assertJQ(req("q", "*:*"), "/response/numFound==3");
    // assert that in-place update is retained
    assertJQ(req("q", "val_i_dvo:2"), "/response/numFound==1");
    h.close();
    createCore();
    // Solr should kick this off now
    // h.getCore().getUpdateHandler().getUpdateLog().recoverFromLog();
    // wait until recovery has finished
    assertTrue(logReplayFinish.tryAcquire(timeout, TimeUnit.SECONDS));
    assertJQ(req("q", "*:*"), "/response/numFound==5");
    assertJQ(req("q", "id:A2"), "/response/numFound==0");
    // no updates, so ensure that recovery does not run
    h.close();
    int permits = logReplay.availablePermits();
    createCore();
    // Solr should kick this off now
    // h.getCore().getUpdateHandler().getUpdateLog().recoverFromLog();
    assertJQ(req("q", "*:*"), "/response/numFound==5");
    // assert that in-place update is retained
    assertJQ(req("q", "val_i_dvo:2"), "/response/numFound==1");
    Thread.sleep(100);
    // no updates, so ensure that recovery didn't run
    assertEquals(permits, logReplay.availablePermits());
    assertEquals(UpdateLog.State.ACTIVE, h.getCore().getUpdateHandler().getUpdateLog().getState());
  } finally {
    DirectUpdateHandler2.commitOnClose = true;
    UpdateLog.testing_logReplayHook = null;
    UpdateLog.testing_logReplayFinishHook = null;
  }
}
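The assertions above read replay progress straight off Dropwizard gauge and meter objects. A minimal standalone sketch of that lookup pattern, assuming only a MetricRegistry that exposes the same "TLOG.*" names; the probe class itself is hypothetical, not part of Solr:

import com.codahale.metrics.Gauge;
import com.codahale.metrics.Meter;
import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricRegistry;
import java.util.Map;

public class TlogReplayProbe {
  // Returns remaining replay bytes, or -1 if the gauge has not been registered yet.
  static long remainingReplayBytes(MetricRegistry registry) {
    Map<String, Metric> metrics = registry.getMetrics();
    @SuppressWarnings("unchecked")
    Gauge<Long> remaining = (Gauge<Long>) metrics.get("TLOG.replay.remaining.bytes");
    return remaining == null ? -1L : remaining.getValue();
  }

  // Returns the number of replayed operations so far, or 0 if the meter is absent.
  static long replayedOps(MetricRegistry registry) {
    Meter ops = (Meter) registry.getMetrics().get("TLOG.replay.ops");
    return ops == null ? 0L : ops.getCount();
  }
}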
Use of com.codahale.metrics.Metric in project lucene-solr by apache.
The class TestRecovery, method testBuffering.
@Test
public void testBuffering() throws Exception {
  DirectUpdateHandler2.commitOnClose = false;
  final Semaphore logReplay = new Semaphore(0);
  final Semaphore logReplayFinish = new Semaphore(0);
  UpdateLog.testing_logReplayHook = () -> {
    try {
      assertTrue(logReplay.tryAcquire(timeout, TimeUnit.SECONDS));
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  };
  UpdateLog.testing_logReplayFinishHook = logReplayFinish::release;
  SolrQueryRequest req = req();
  UpdateHandler uhandler = req.getCore().getUpdateHandler();
  UpdateLog ulog = uhandler.getUpdateLog();
  try {
    clearIndex();
    assertU(commit());
    Map<String, Metric> metrics = getMetrics();
    assertEquals(UpdateLog.State.ACTIVE, ulog.getState());
    ulog.bufferUpdates();
    assertEquals(UpdateLog.State.BUFFERING, ulog.getState());
    Future<UpdateLog.RecoveryInfo> rinfoFuture = ulog.applyBufferedUpdates();
    assertNull(rinfoFuture);
    assertEquals(UpdateLog.State.ACTIVE, ulog.getState());
    ulog.bufferUpdates();
    assertEquals(UpdateLog.State.BUFFERING, ulog.getState());
    Gauge<Integer> state = (Gauge<Integer>) metrics.get("TLOG.state");
    assertEquals(UpdateLog.State.BUFFERING.ordinal(), state.getValue().intValue());
    Gauge<Integer> bufferedOps = (Gauge<Integer>) metrics.get("TLOG.buffered.ops");
    int initialOps = bufferedOps.getValue();
    Meter applyingBuffered = (Meter) metrics.get("TLOG.applyingBuffered.ops");
    long initialApplyingOps = applyingBuffered.getCount();
    String v3 = getNextVersion();
    String v940_del = "-" + getNextVersion();
    String v950_del = "-" + getNextVersion();
    String v1010 = getNextVersion();
    String v1015 = getNextVersion();
    String v1017_del = "-" + getNextVersion();
    String v1020 = getNextVersion();
    String v1030 = getNextVersion();
    String v1040 = getNextVersion();
    String v1050 = getNextVersion();
    String v1060 = getNextVersion();
    String v1070 = getNextVersion();
    String v1080 = getNextVersion();
    String v2010_del = "-" + getNextVersion();
    String v2060_del = "-" + getNextVersion();
    String v3000_del = "-" + getNextVersion();
    String versionListFirstCheck = String.join(",", v2010_del, v1030, v1020, v1017_del, v1015, v1010);
    String versionListSecondCheck = String.join(",", v3000_del, v1080, v1050, v1060, v940_del, v1040, v3, v2010_del, v1030, v1020, v1017_del, v1015, v1010);
    // simulate updates from a leader
    updateJ(jsonAdd(sdoc("id", "B1", "_version_", v1010)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    updateJ(jsonAdd(sdoc("id", "B11", "_version_", v1015)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    updateJ(jsonDelQ("id:B1 id:B11 id:B2 id:B3"), params(DISTRIB_UPDATE_PARAM, FROM_LEADER, "_version_", v1017_del));
    updateJ(jsonAdd(sdoc("id", "B2", "_version_", v1020)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    updateJ(jsonAdd(sdoc("id", "B3", "_version_", v1030)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    deleteAndGetVersion("B1", params(DISTRIB_UPDATE_PARAM, FROM_LEADER, "_version_", v2010_del));
    assertJQ(req("qt", "/get", "getVersions", "6"), "=={'versions':[" + versionListFirstCheck + "]}");
    assertU(commit());
    assertJQ(req("qt", "/get", "getVersions", "6"), "=={'versions':[" + versionListFirstCheck + "]}");
    // updates should be buffered, so we should not see any results yet.
    assertJQ(req("q", "*:*"), "/response/numFound==0");
    // real-time get should also not show anything (this could change in the future,
    // but it's currently used for validating version numbers too, so it would
    // be bad for updates to be visible if we're just buffering).
    assertJQ(req("qt", "/get", "id", "B3"), "=={'doc':null}");
    assertEquals(6, bufferedOps.getValue().intValue() - initialOps);
    rinfoFuture = ulog.applyBufferedUpdates();
    assertNotNull(rinfoFuture);
    assertEquals(UpdateLog.State.APPLYING_BUFFERED, ulog.getState());
    logReplay.release(1000);
    UpdateLog.RecoveryInfo rinfo = rinfoFuture.get();
    assertEquals(UpdateLog.State.ACTIVE, ulog.getState());
    assertEquals(6L, applyingBuffered.getCount() - initialApplyingOps);
    assertJQ(req("qt", "/get", "getVersions", "6"), "=={'versions':[" + versionListFirstCheck + "]}");
    assertJQ(req("q", "*:*"), "/response/numFound==2");
    // move back to recovering
    ulog.bufferUpdates();
    assertEquals(UpdateLog.State.BUFFERING, ulog.getState());
    Long ver = getVer(req("qt", "/get", "id", "B3"));
    assertEquals(Long.valueOf(v1030), ver);
    // add a reordered doc that shouldn't overwrite one in the index
    updateJ(jsonAdd(sdoc("id", "B3", "_version_", v3)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    // reorder two buffered updates
    updateJ(jsonAdd(sdoc("id", "B4", "_version_", v1040)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    // this update should not take effect
    deleteAndGetVersion("B4", params(DISTRIB_UPDATE_PARAM, FROM_LEADER, "_version_", v940_del));
    updateJ(jsonAdd(sdoc("id", "B6", "_version_", v1060)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    updateJ(jsonAdd(sdoc("id", "B5", "_version_", v1050)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    updateJ(jsonAdd(sdoc("id", "B8", "_version_", v1080)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
    // test that delete-by-query is at least buffered along with everything else, so it will delete the
    // currently buffered id:B8 (even if it doesn't currently support versioning)
    updateJ("{\"delete\": { \"query\":\"id:B2 OR id:B8\" }}", params(DISTRIB_UPDATE_PARAM, FROM_LEADER, "_version_", v3000_del));
assertJQ(req("qt", "/get", "getVersions", "13"), // the "3" appears because versions aren't checked while buffering
"=={'versions':[" + versionListSecondCheck + "]}");
logReplay.drainPermits();
rinfoFuture = ulog.applyBufferedUpdates();
assertTrue(rinfoFuture != null);
assertEquals(UpdateLog.State.APPLYING_BUFFERED, ulog.getState());
// apply a single update
logReplay.release(1);
// now add another update
updateJ(jsonAdd(sdoc("id", "B7", "_version_", v1070)), params(DISTRIB_UPDATE_PARAM, FROM_LEADER));
// a reordered update that should be dropped
deleteAndGetVersion("B5", params(DISTRIB_UPDATE_PARAM, FROM_LEADER, "_version_", v950_del));
deleteAndGetVersion("B6", params(DISTRIB_UPDATE_PARAM, FROM_LEADER, "_version_", v2060_del));
logReplay.release(1000);
UpdateLog.RecoveryInfo recInfo = rinfoFuture.get();
assertJQ(req("q", "*:*", "sort", "id asc", "fl", "id,_version_"), "/response/docs==[" + "{'id':'B3','_version_':" + v1030 + "}" + ",{'id':'B4','_version_':" + v1040 + "}" + ",{'id':'B5','_version_':" + v1050 + "}" + ",{'id':'B7','_version_':" + v1070 + "}" + "]");
assertEquals(1, recInfo.deleteByQuery);
// leave each test method in a good state
assertEquals(UpdateLog.State.ACTIVE, ulog.getState());
assertEquals(0, bufferedOps.getValue().intValue());
  } finally {
    DirectUpdateHandler2.commitOnClose = true;
    UpdateLog.testing_logReplayHook = null;
    UpdateLog.testing_logReplayFinishHook = null;
    req().close();
  }
}
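The state machine this test walks through is ACTIVE -> BUFFERING -> APPLYING_BUFFERED -> ACTIVE. A condensed sketch of that lifecycle, assuming a live UpdateLog obtained from the core as above; the helper method is illustrative, not Solr API:

import java.util.concurrent.Future;
import org.apache.solr.update.UpdateLog;

static void bufferThenApply(UpdateLog ulog) throws Exception {
  ulog.bufferUpdates(); // ACTIVE -> BUFFERING: updates are written to the tlog but not indexed
  // ... replicated updates arrive here and are buffered ...
  Future<UpdateLog.RecoveryInfo> future = ulog.applyBufferedUpdates(); // BUFFERING -> APPLYING_BUFFERED
  if (future != null) { // null means nothing was buffered and the state is already back to ACTIVE
    UpdateLog.RecoveryInfo info = future.get(); // block until replay finishes; state returns to ACTIVE
    // info.deleteByQuery and friends summarize what was replayed, as the test's recInfo check shows
  }
}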
Use of com.codahale.metrics.Metric in project lucene-solr by apache.
The class PeerSyncReplicationTest, method test.
@Test
public void test() throws Exception {
  handle.clear();
  handle.put("timestamp", SKIPVAL);
  waitForThingsToLevelOut(30);
  del("*:*");
  // index enough docs and commit to establish frame of reference for PeerSync
  for (int i = 0; i < 100; i++) {
    indexDoc(id, docId, i1, 50, tlong, 50, t1, "document number " + docId++);
  }
  commit();
  waitForThingsToLevelOut(30);
  try {
    checkShardConsistency(false, true);
    long cloudClientDocs = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
    assertEquals(docId, cloudClientDocs);
    CloudJettyRunner initialLeaderJetty = shardToLeaderJetty.get("shard1");
    List<CloudJettyRunner> otherJetties = getOtherAvailableJetties(initialLeaderJetty);
    CloudJettyRunner neverLeader = otherJetties.get(otherJetties.size() - 1);
    otherJetties.remove(neverLeader);
    // first shutdown a node that will never be a leader
    forceNodeFailures(singletonList(neverLeader));
    // node failure and recovery via PeerSync
    log.info("Forcing PeerSync");
    CloudJettyRunner nodePeerSynced = forceNodeFailureAndDoPeerSync(false);
    // add a few more docs
    indexDoc(id, docId, i1, 50, tlong, 50, t1, "document number " + docId++);
    indexDoc(id, docId, i1, 50, tlong, 50, t1, "document number " + docId++);
    commit();
    cloudClientDocs = cloudClient.query(new SolrQuery("*:*")).getResults().getNumFound();
    assertEquals(docId, cloudClientDocs);
    // now shutdown all other nodes except for 'nodeShutDownForFailure'
    otherJetties.remove(nodePeerSynced);
    forceNodeFailures(otherJetties);
    waitForThingsToLevelOut(30);
    checkShardConsistency(false, true);
    // now shutdown the original leader
    log.info("Now shutting down initial leader");
    forceNodeFailures(singletonList(initialLeaderJetty));
    log.info("Updating mappings from zk");
    waitForNewLeader(cloudClient, "shard1", (Replica) initialLeaderJetty.client.info, new TimeOut(15, SECONDS));
    updateMappingsFromZk(jettys, clients, true);
    assertEquals("PeerSynced node did not become leader", nodePeerSynced, shardToLeaderJetty.get("shard1"));
    // bring up the node that was down all along, and let it PeerSync from the node that was forced to PeerSync
    bringUpDeadNodeAndEnsureNoReplication(neverLeader, false);
    waitTillNodesActive();
    checkShardConsistency(false, true);
    // bring back all the nodes including the initial leader
    // (commented out because it reports "Maximum concurrent create/delete watches above limit" violations and thread leaks)
    /*for(int i = 0 ; i < nodesDown.size(); i++) {
      bringUpDeadNodeAndEnsureNoReplication(shardToLeaderJetty.get("shard1"), neverLeader, false);
    }
    checkShardConsistency(false, true);*/
    // make sure leader has not changed after bringing initial leader back
    assertEquals(nodePeerSynced, shardToLeaderJetty.get("shard1"));
    // assert metrics
    MetricRegistry registry = nodePeerSynced.jetty.getCoreContainer().getMetricManager().registry("solr.core.collection1");
    Map<String, Metric> metrics = registry.getMetrics();
    assertTrue("REPLICATION.time present", metrics.containsKey("REPLICATION.time"));
    assertTrue("REPLICATION.errors present", metrics.containsKey("REPLICATION.errors"));
    Timer timer = (Timer) metrics.get("REPLICATION.time");
    assertEquals(1L, timer.getCount());
    Counter counter = (Counter) metrics.get("REPLICATION.errors");
    assertEquals(0L, counter.getCount());
    success = true;
  } finally {
    System.clearProperty("solr.disableFingerprint");
  }
}
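The same two replication metrics can be checked without the test scaffolding. A minimal sketch, assuming only a Dropwizard MetricRegistry populated with Solr's "REPLICATION.*" entries; the helper is illustrative:

import com.codahale.metrics.Counter;
import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricRegistry;
import com.codahale.metrics.Timer;
import java.util.Map;

static boolean replicatedOnceWithoutErrors(MetricRegistry registry) {
  Map<String, Metric> metrics = registry.getMetrics();
  Timer time = (Timer) metrics.get("REPLICATION.time");
  Counter errors = (Counter) metrics.get("REPLICATION.errors");
  if (time == null || errors == null) {
    return false; // replication handler metrics not registered on this core
  }
  // exactly one completed replication and no failures, mirroring the assertions above
  return time.getCount() == 1L && errors.getCount() == 0L;
}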
Use of com.codahale.metrics.Metric in project lucene-solr by apache.
The class HdfsCollectionsAPIDistributedZkTest, method moveReplicaTest.
@Test
public void moveReplicaTest() throws Exception {
  cluster.waitForAllNodes(5000);
  String coll = "movereplicatest_coll";
  CloudSolrClient cloudClient = cluster.getSolrClient();
  CollectionAdminRequest.Create create = CollectionAdminRequest.createCollection(coll, "conf", 2, 2);
  create.setMaxShardsPerNode(2);
  cloudClient.request(create);
  for (int i = 0; i < 10; i++) {
    cloudClient.add(coll, sdoc("id", String.valueOf(i)));
    cloudClient.commit(coll);
  }
  List<Slice> slices = new ArrayList<>(cloudClient.getZkStateReader().getClusterState().getCollection(coll).getSlices());
  Collections.shuffle(slices, random());
  Slice slice = null;
  Replica replica = null;
  for (Slice s : slices) {
    slice = s;
    for (Replica r : s.getReplicas()) {
      if (s.getLeader() != r) {
        replica = r;
      }
    }
  }
  String dataDir = getDataDir(replica);
  Set<String> liveNodes = cloudClient.getZkStateReader().getClusterState().getLiveNodes();
  ArrayList<String> l = new ArrayList<>(liveNodes);
  Collections.shuffle(l, random());
  String targetNode = null;
  // pick the first shuffled node that doesn't already host the replica
  for (String node : l) {
    if (!replica.getNodeName().equals(node)) {
      targetNode = node;
      break;
    }
  }
  assertNotNull(targetNode);
  CollectionAdminRequest.MoveReplica moveReplica = new CollectionAdminRequest.MoveReplica(coll, replica.getName(), targetNode);
  moveReplica.process(cloudClient);
  checkNumOfCores(cloudClient, replica.getNodeName(), 0);
  checkNumOfCores(cloudClient, targetNode, 2);
  waitForState("Wait for recovery finish failed", coll, clusterShape(2, 2));
  slice = cloudClient.getZkStateReader().getClusterState().getCollection(coll).getSlice(slice.getName());
  boolean found = false;
  for (Replica newReplica : slice.getReplicas()) {
    if (getDataDir(newReplica).equals(dataDir)) {
      found = true;
    }
  }
  assertTrue(found);
  // data dir is reused so replication will be skipped
  for (JettySolrRunner jetty : cluster.getJettySolrRunners()) {
    SolrMetricManager manager = jetty.getCoreContainer().getMetricManager();
    List<String> registryNames = manager.registryNames().stream().filter(s -> s.startsWith("solr.core.")).collect(Collectors.toList());
    for (String registry : registryNames) {
      Map<String, Metric> metrics = manager.registry(registry).getMetrics();
      Counter counter = (Counter) metrics.get("REPLICATION./replication.requests");
      if (counter != null) {
        assertEquals(0, counter.getCount());
      }
    }
  }
}
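The final loop above scans every per-core registry for a single counter. The same scan, condensed into a reusable sketch; the helper name is illustrative, while the SolrMetricManager calls are the ones used in the test:

import com.codahale.metrics.Counter;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.solr.metrics.SolrMetricManager;

// Sums REPLICATION./replication.requests across all solr.core.* registries;
// cores without a replication handler simply contribute nothing.
static long totalReplicationRequests(SolrMetricManager manager) {
  List<String> coreRegistries = manager.registryNames().stream()
      .filter(name -> name.startsWith("solr.core."))
      .collect(Collectors.toList());
  long total = 0;
  for (String registry : coreRegistries) {
    Counter counter = (Counter) manager.registry(registry).getMetrics().get("REPLICATION./replication.requests");
    if (counter != null) {
      total += counter.getCount();
    }
  }
  return total;
}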
Use of com.codahale.metrics.Metric in project lucene-solr by apache.
The class SolrMetricManager, method registerAll.
/**
 * Register all metrics in the provided {@link MetricSet}, optionally skipping those that
 * already exist.
 * @param registry registry name
 * @param metrics metric set to register
 * @param force if true then already existing metrics with the same name will be replaced.
 *              When false and a metric with the same name already exists an exception
 *              will be thrown.
 * @param metricPath (optional) additional top-most metric name path elements
 * @throws Exception if force is false and a metric with the same name already exists.
 */
public void registerAll(String registry, MetricSet metrics, boolean force, String... metricPath) throws Exception {
  MetricRegistry metricRegistry = registry(registry);
  synchronized (metricRegistry) {
    Map<String, Metric> existingMetrics = metricRegistry.getMetrics();
    for (Map.Entry<String, Metric> entry : metrics.getMetrics().entrySet()) {
      String fullName = mkName(entry.getKey(), metricPath);
      if (force && existingMetrics.containsKey(fullName)) {
        metricRegistry.remove(fullName);
      }
      metricRegistry.register(fullName, entry.getValue());
    }
  }
}
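A short caller sketch for registerAll, assuming an existing SolrMetricManager instance; the registry name, path elements, and gauges are illustrative values, not fixed Solr conventions:

import com.codahale.metrics.Gauge;
import com.codahale.metrics.Metric;
import com.codahale.metrics.MetricSet;
import java.util.HashMap;
import java.util.Map;
import org.apache.solr.metrics.SolrMetricManager;

static void registerCacheGauges(SolrMetricManager manager) throws Exception {
  // MetricSet has a single abstract method (getMetrics), so a lambda works
  MetricSet set = () -> {
    Map<String, Metric> m = new HashMap<>();
    m.put("size", (Gauge<Integer>) () -> 42); // illustrative gauge
    m.put("hitratio", (Gauge<Double>) () -> 0.9); // illustrative gauge
    return m;
  };
  // with force=true, metrics already registered under the same full name are replaced;
  // mkName builds the full names from the path elements plus each key
  // (e.g. CACHE.queryResultCache.size)
  manager.registerAll("solr.core.collection1", set, true, "CACHE", "queryResultCache");
}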