Use of org.apache.cassandra.distributed.Cluster in project cassandra by Apache.
The class RepairErrorsTest, method testRemoteValidationFailure.
@Test
public void testRemoteValidationFailure() throws IOException {
    Cluster.Builder builder = Cluster.build(2).withConfig(config -> config.with(GOSSIP).with(NETWORK)).withInstanceInitializer(ByteBuddyHelper::install);
    try (Cluster cluster = builder.createWithoutStarting()) {
        // node2 will throw the injected validation exception; ignore it so the harness doesn't flag it
        cluster.setUncaughtExceptionsFilter((i, throwable) -> {
            if (i == 2)
                return throwable.getMessage() != null && throwable.getMessage().contains("IGNORE");
            return false;
        });
        cluster.startup();
        init(cluster);
        cluster.schemaChange("create table " + KEYSPACE + ".tbl (id int primary key, x int)");
        for (int i = 0; i < 10; i++)
            cluster.coordinator(1).execute("insert into " + KEYSPACE + ".tbl (id, x) VALUES (?,?)", ConsistencyLevel.ALL, i, i);
        cluster.forEach(i -> i.flush(KEYSPACE));
        long mark = cluster.get(1).logs().mark();
        // a full repair from either node fails because of the injected remote validation failure...
        cluster.forEach(i -> i.nodetoolResult("repair", "--full").asserts().failure());
        // ...but node1 must not log any ERROR lines for it
        Assertions.assertThat(cluster.get(1).logs().grep(mark, "^ERROR").getResult()).isEmpty();
    }
}
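The ByteBuddyHelper::install initializer passed to withInstanceInitializer above is defined elsewhere in RepairErrorsTest; it is what makes validation fail on node2. Below is a minimal sketch of what such an initializer can look like, assuming the target is ValidationManager#doValidation and using ByteBuddy's ExceptionMethod to make it throw; the class and method the real helper instruments may differ.

// Assumed imports: net.bytebuddy.ByteBuddy, net.bytebuddy.dynamic.loading.ClassLoadingStrategy,
// net.bytebuddy.implementation.ExceptionMethod and static net.bytebuddy.matcher.ElementMatchers.named.
public static class ByteBuddyHelper {
    // called once per node while its isolated classloader is being set up
    static void install(ClassLoader cl, int nodeNumber) {
        if (nodeNumber != 2)
            return; // only break validation on node2
        new ByteBuddy().rebase(ValidationManager.class) // assumed target class/method
                       .method(named("doValidation"))
                       // the message contains "IGNORE" so the uncaught-exceptions filter above accepts it
                       .intercept(ExceptionMethod.throwing(RuntimeException.class, "IGNORE: injected validation failure"))
                       .make()
                       .load(cl, ClassLoadingStrategy.Default.INJECTION);
    }
}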
Use of org.apache.cassandra.distributed.Cluster in project cassandra by Apache.
The class PreviewRepairTest, method snapshotTest.
@Test
public void snapshotTest() throws IOException, InterruptedException {
    try (Cluster cluster = init(Cluster.build(3).withConfig(config -> config.set("snapshot_on_repaired_data_mismatch", true).with(GOSSIP).with(NETWORK)).start())) {
        cluster.schemaChange("create table " + KEYSPACE + ".tbl (id int primary key, t int)");
        cluster.schemaChange("create table " + KEYSPACE + ".tbl2 (id int primary key, t int)");
        // populate 2 tables
        insert(cluster.coordinator(1), 0, 100, "tbl");
        insert(cluster.coordinator(1), 0, 100, "tbl2");
        cluster.forEach((n) -> n.flush(KEYSPACE));
        // make sure everything is marked repaired
        cluster.get(1).callOnInstance(repair(options(false, false)));
        waitMarkedRepaired(cluster);
        // make node2 mismatch
        unmarkRepaired(cluster.get(2), "tbl");
        verifySnapshots(cluster, "tbl", true);
        verifySnapshots(cluster, "tbl2", true);
        AtomicInteger snapshotMessageCounter = new AtomicInteger();
        cluster.filters().verbs(Verb.SNAPSHOT_REQ.id).messagesMatching((from, to, message) -> {
            snapshotMessageCounter.incrementAndGet();
            return false;
        }).drop();
        cluster.get(1).callOnInstance(repair(options(true, true)));
        verifySnapshots(cluster, "tbl", false);
        // tbl2 should not have a mismatch, so the snapshots should be empty here
        verifySnapshots(cluster, "tbl2", true);
        assertEquals(3, snapshotMessageCounter.get());
        // and make sure that we don't try to snapshot again
        snapshotMessageCounter.set(0);
        cluster.get(3).callOnInstance(repair(options(true, true)));
        assertEquals(0, snapshotMessageCounter.get());
    }
}
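The insert(...) helper used here (and in the following PreviewRepairTest examples) is defined elsewhere in the test class. A plausible reconstruction, assuming it writes count rows starting at start into the given table at ConsistencyLevel.ALL; the real helper may differ, and the three-argument calls used later would simply default the table name to "tbl".

// hypothetical reconstruction of PreviewRepairTest's insert(...) helper
static void insert(ICoordinator coordinator, int start, int count, String table) {
    for (int i = start; i < start + count; i++)
        coordinator.execute("insert into " + KEYSPACE + "." + table + " (id, t) values (?, ?)", ConsistencyLevel.ALL, i, i);
}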
Use of org.apache.cassandra.distributed.Cluster in project cassandra by Apache.
The class PreviewRepairTest, method testFinishingNonIntersectingIncRepairDuringPreview.
/**
* Same as testFinishingIncRepairDuringPreview but the previewed range does not intersect the incremental repair
* so both preview and incremental repair should finish fine (without any mismatches)
*/
@Test
public void testFinishingNonIntersectingIncRepairDuringPreview() throws IOException, InterruptedException, ExecutionException {
    ExecutorService es = Executors.newSingleThreadExecutor();
    try (Cluster cluster = init(Cluster.build(2).withConfig(config -> config.with(GOSSIP).with(NETWORK)).start())) {
        cluster.schemaChange("create table " + KEYSPACE + ".tbl (id int primary key, t int)");
        insert(cluster.coordinator(1), 0, 100);
        cluster.forEach((node) -> node.flush(KEYSPACE));
        assertTrue(cluster.get(1).callOnInstance(repair(options(false, false))).success);
        insert(cluster.coordinator(1), 100, 100);
        cluster.forEach((node) -> node.flush(KEYSPACE));
        // pause preview repair validation messages on node2 until node1 has finished
        Condition previewRepairStarted = newOneTimeCondition();
        Condition continuePreviewRepair = newOneTimeCondition();
        DelayFirstRepairTypeMessageFilter filter = validationRequest(previewRepairStarted, continuePreviewRepair);
        cluster.filters().outbound().verbs(VALIDATION_REQ.id).from(1).to(2).messagesMatching(filter).drop();
        // get local ranges to repair two separate ranges:
        List<String> localRanges = cluster.get(1).callOnInstance(() -> {
            List<String> res = new ArrayList<>();
            for (Range<Token> r : instance.getLocalReplicas(KEYSPACE).ranges())
                res.add(r.left.getTokenValue() + ":" + r.right.getTokenValue());
            return res;
        });
        assertEquals(2, localRanges.size());
        Future<RepairResult> repairStatusFuture = es.submit(() -> cluster.get(1).callOnInstance(repair(options(true, false, localRanges.get(0)))));
        // wait for node1 to start validation compaction
        previewRepairStarted.await();
        // this needs to finish before the preview repair is unpaused on node2
        assertTrue(cluster.get(1).callOnInstance(repair(options(false, false, localRanges.get(1)))).success);
        continuePreviewRepair.signalAll();
        RepairResult rs = repairStatusFuture.get();
        // repair should succeed
        assertTrue(rs.success);
        // and no mismatches
        assertFalse(rs.wasInconsistent);
    } finally {
        es.shutdown();
    }
}
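DelayFirstRepairTypeMessageFilter and its validationRequest(...) factory also live elsewhere in PreviewRepairTest. A rough sketch follows, assuming the IMessageFilters.Matcher shape used by messagesMatching(...) above; the real filter additionally checks the repair message type, which is omitted here because the verb filter already restricts the match to VALIDATION_REQ.

// sketch: block only the first intercepted validation request, then let everything through
static class DelayFirstRepairTypeMessageFilter implements IMessageFilters.Matcher {
    private final Condition started;
    private final Condition allowed;
    private final AtomicBoolean firstSeen = new AtomicBoolean(false);

    DelayFirstRepairTypeMessageFilter(Condition started, Condition allowed) {
        this.started = started;
        this.allowed = allowed;
    }

    static DelayFirstRepairTypeMessageFilter validationRequest(Condition started, Condition allowed) {
        return new DelayFirstRepairTypeMessageFilter(started, allowed);
    }

    public boolean matches(int from, int to, IMessage message) {
        // on the first matching message, tell the test that validation has started,
        // then wait until the test signals that delivery may continue
        if (firstSeen.compareAndSet(false, true)) {
            started.signalAll();
            try {
                allowed.await();
            } catch (InterruptedException e) {
                throw new RuntimeException(e);
            }
        }
        return false; // never drop the message, only delay it
    }
}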
Use of org.apache.cassandra.distributed.Cluster in project cassandra by Apache.
The class PreviewRepairTest, method testWithMismatchingPending.
/**
* makes sure that the repaired sstables are not matching on the two
* nodes by disabling autocompaction on node2 and then running an
* incremental repair
*/
@Test
public void testWithMismatchingPending() throws Throwable {
    try (Cluster cluster = init(Cluster.build(2).withConfig(config -> config.with(GOSSIP).with(NETWORK)).start())) {
        cluster.schemaChange("create table " + KEYSPACE + ".tbl (id int primary key, t int)");
        insert(cluster.coordinator(1), 0, 100);
        cluster.forEach((node) -> node.flush(KEYSPACE));
        cluster.get(1).callOnInstance(repair(options(false, false)));
        insert(cluster.coordinator(1), 100, 100);
        cluster.forEach((node) -> node.flush(KEYSPACE));
        // make sure that all sstables have moved to repaired by triggering a compaction
        // also disables autocompaction on the nodes
        cluster.forEach((node) -> node.runOnInstance(() -> {
            ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore("tbl");
            FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(cfs));
            cfs.disableAutoCompaction();
        }));
        long[] marks = logMark(cluster);
        cluster.get(1).callOnInstance(repair(options(false, false)));
        // now re-enable autocompaction on node1; this moves the sstables for the new repair to repaired
        cluster.get(1).runOnInstance(() -> {
            ColumnFamilyStore cfs = Keyspace.open(KEYSPACE).getColumnFamilyStore("tbl");
            cfs.enableAutoCompaction();
            FBUtilities.waitOnFutures(CompactionManager.instance.submitBackground(cfs));
        });
        waitLogsRepairFullyFinished(cluster, marks);
        RepairResult rs = cluster.get(1).callOnInstance(repair(options(true, false)));
        // preview repair should succeed
        assertTrue(rs.success);
        // and we should see no mismatches
        assertFalse(rs.wasInconsistent);
    }
}
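logMark(...) and waitLogsRepairFullyFinished(...) are helpers from the test class that bracket the repair with per-node log marks and then wait until the repair is fully finalized on every node. Below is a sketch built only from the log API already shown in the first example on this page (logs().mark() and logs().grep(mark, pattern)); the exact log line waited for ("Finalized local repair session") is an assumption, and the real helpers may differ.

// sketch: record a log mark per node so we can later grep only new log lines
static long[] logMark(Cluster cluster) {
    long[] marks = new long[cluster.size()];
    for (int i = 1; i <= cluster.size(); i++)
        marks[i - 1] = cluster.get(i).logs().mark();
    return marks;
}

// sketch: poll each node's log from its mark until the (assumed) finalize message appears
static void waitLogsRepairFullyFinished(Cluster cluster, long[] marks) throws InterruptedException {
    for (int i = 1; i <= cluster.size(); i++) {
        while (cluster.get(i).logs().grep(marks[i - 1], "Finalized local repair session").getResult().isEmpty())
            Thread.sleep(100);
    }
}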
Use of org.apache.cassandra.distributed.Cluster in project cassandra by Apache.
The class HintsServiceMetricsTest, method testHintsServiceMetrics.
@Test
public void testHintsServiceMetrics() throws Exception {
    // set up a 3-node cluster with a ByteBuddy injection that makes the writing of some hints fail
    try (Cluster cluster = builder().withNodes(3).withConfig(config -> config.with(NETWORK, GOSSIP, NATIVE_PROTOCOL)).withInstanceInitializer(FailHints::install).start()) {
        // set up a message filter to drop some of the hint request messages from node1
        AtomicInteger hintsNode2 = new AtomicInteger();
        AtomicInteger hintsNode3 = new AtomicInteger();
        cluster.filters().verbs(Verb.HINT_REQ.id).from(1).messagesMatching((from, to, message) -> (to == 2 && hintsNode2.incrementAndGet() <= NUM_TIMEOUTS_PER_NODE) || (to == 3 && hintsNode3.incrementAndGet() <= NUM_TIMEOUTS_PER_NODE)).drop();
        // set up a message filter to drop mutation requests from node1, so it creates hints for those mutations
        AtomicBoolean dropWritesForNode2 = new AtomicBoolean(false);
        AtomicBoolean dropWritesForNode3 = new AtomicBoolean(false);
        cluster.filters().verbs(Verb.MUTATION_REQ.id).from(1).messagesMatching((from, to, message) -> (to == 2 && dropWritesForNode2.get()) || (to == 3 && dropWritesForNode3.get())).drop();
        // fix under-replicated keyspaces so they don't produce hint requests while we are dropping mutations
        fixDistributedSchemas(cluster);
        cluster.schemaChange(withKeyspace("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}"));
        cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int PRIMARY KEY, v int)"));
        ICoordinator coordinator = cluster.coordinator(1);
        IInvokableInstance node1 = cluster.get(1);
        IInvokableInstance node2 = cluster.get(2);
        IInvokableInstance node3 = cluster.get(3);
        // write the first half of the rows with the second node dropping mutation requests,
        // so some hints will be created for that node
        dropWritesForNode2.set(true);
        for (int i = 0; i < NUM_ROWS / 2; i++)
            coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v) VALUES (?, ?)"), QUORUM, i, i);
        dropWritesForNode2.set(false);
        // write the second half of the rows with the third node dropping mutation requests,
        // so some hints will be created for that node
        dropWritesForNode3.set(true);
        for (int i = NUM_ROWS / 2; i < NUM_ROWS; i++)
            coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v) VALUES (?, ?)"), QUORUM, i, i);
        dropWritesForNode3.set(false);
        // wait until all the hints have been successfully applied to the nodes that have been dropping mutations
        waitUntilAsserted(() -> assertThat(countRows(node2)).isEqualTo(countRows(node3)).isEqualTo(NUM_ROWS));
        // Verify the metrics for the coordinator node, which is the only one actually sending hints.
        // The hint delivery errors that we have injected should have made the service try to send them again.
        // These retries are done periodically and in pages, so they may resend some hints that were already
        // delivered successfully. As a result, there can be more succeeded hints than actual hints/rows.
        waitUntilAsserted(() -> assertThat(countHintsSucceeded(node1)).isGreaterThanOrEqualTo(NUM_ROWS));
        waitUntilAsserted(() -> assertThat(countHintsFailed(node1)).isEqualTo(NUM_FAILURES_PER_NODE * 2));
        waitUntilAsserted(() -> assertThat(countHintsTimedOut(node1)).isEqualTo(NUM_TIMEOUTS_PER_NODE * 2));
        // verify delay metrics
        long numGlobalDelays = countGlobalDelays(node1);
        assertThat(numGlobalDelays).isGreaterThanOrEqualTo(NUM_ROWS);
        assertThat(countEndpointDelays(node1, node1)).isEqualTo(0);
        assertThat(countEndpointDelays(node1, node2)).isGreaterThan(0).isLessThanOrEqualTo(numGlobalDelays);
        assertThat(countEndpointDelays(node1, node3)).isGreaterThan(0).isLessThanOrEqualTo(numGlobalDelays);
        assertThat(countEndpointDelays(node1, node2) + countEndpointDelays(node1, node3)).isGreaterThanOrEqualTo(numGlobalDelays);
        // verify that the metrics for the non-coordinator nodes are zero
        for (IInvokableInstance node : Arrays.asList(node2, node3)) {
            assertThat(countHintsSucceeded(node)).isEqualTo(0);
            assertThat(countHintsFailed(node)).isEqualTo(0);
            assertThat(countHintsTimedOut(node)).isEqualTo(0);
            assertThat(countGlobalDelays(node)).isEqualTo(0);
            cluster.forEach(target -> assertThat(countEndpointDelays(node, target)).isEqualTo(0));
        }
    }
}
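The countHintsSucceeded/countHintsFailed/countHintsTimedOut/countGlobalDelays helpers read hint metrics inside the target instance. A minimal sketch of one of them is shown below, assuming the counters are exposed as static meters on org.apache.cassandra.metrics.HintsServiceMetrics; the field name is an assumption, and the real helpers may read the metrics registry instead.

// assumed field: HintsServiceMetrics.hintsSucceeded; countHintsFailed and
// countHintsTimedOut would read their respective meters the same way
private static long countHintsSucceeded(IInvokableInstance node) {
    // callOnInstance runs the lambda inside the node's classloader, so we read
    // that node's own metric values
    return node.callOnInstance(() -> HintsServiceMetrics.hintsSucceeded.getCount());
}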