use of org.apache.cassandra.distributed.api.ICoordinator in project cassandra by apache.
the class CASTest method consistencyAfterWriteTimeoutTest.
/**
 * Base test ensuring that if a write times out after its proposal has been accepted by only some nodes (fewer than
 * a quorum), and a following SERIAL operation does not observe that write (because the nodes that accepted it do
 * not participate in that following operation), then that write is never applied, even once the nodes that accepted
 * the original proposal participate again.
 *
 * <p>In other words, when an operation times out it may or may not be applied, but that "fate" is persistently
 * decided by the very SERIAL operation that "succeeds" (in the sense of 'not timing out or throwing some other
 * exception').
 *
 * @param postTimeoutOperation1 a SERIAL operation executed after an initial write, which inserts the row [0, 0],
 *                              has timed out. It is executed with a QUORUM of nodes that have <em>not</em> seen the
 *                              timed-out proposal, so it should expect that the [0, 0] write has not taken place.
 * @param postTimeoutOperation2 a 2nd SERIAL operation executed <em>after</em> {@code postTimeoutOperation1}, with
 *                              no write executed between the two operations. Contrary to the 1st operation, the
 *                              QUORUM for this operation <em>will</em> include the node that received the proposal
 *                              for the [0, 0] insert but did not participate in {@code postTimeoutOperation1}.
 *                              That operation should also not witness the [0, 0] write (since
 *                              {@code postTimeoutOperation1} did not).
 * @param loseCommitOfOperation1 if {@code true}, the test also drops the "commit" messages of
 *                               {@code postTimeoutOperation1}. In general, the test should behave the same with or
 *                               without that flag, since a value is decided as soon as it has been "accepted by
 *                               a quorum" and the commits should always be properly replayed.
 */
private void consistencyAfterWriteTimeoutTest(BiConsumer<String, ICoordinator> postTimeoutOperation1, BiConsumer<String, ICoordinator> postTimeoutOperation2, boolean loseCommitOfOperation1) throws IOException {
    // NOTE(review): the write timeout is set to a generous 4000ms, presumably on purpose. This test is not about
    // performance, so this is probably OK, even if the underlying reason should ideally be dug into.
    try (Cluster cluster = init(Cluster.create(3, config -> config.set("write_request_timeout", "4000ms").set("cas_contention_timeout", CONTENTION_TIMEOUT)))) {
        String tableName = KEYSPACE + ".t";
        cluster.schemaChange("CREATE TABLE " + tableName + " (k int PRIMARY KEY, v int)");

        // Step 1: a CAS insertion whose PROPOSE messages are dropped on nodes 1 and 2. The CAS cannot get through
        // and should time out. Importantly, node 3 does receive and answer the PROPOSE.
        IMessageFilters.Filter proposeDropper = cluster.filters().inbound().verbs(PAXOS_PROPOSE_REQ.id).from(3).to(1, 2).drop();
        try {
            // NOTE: the consistency level below is the "commit" one, so it does not matter at all here.
            // NOTE 2: node 3 is the coordinator because message filters don't currently work for locally
            // delivered messages, and since messages to nodes 1 and 2 must be dropped, neither of them can be used.
            cluster.coordinator(3).execute("INSERT INTO " + tableName + "(k, v) VALUES (0, 0) IF NOT EXISTS", ConsistencyLevel.ONE);
            fail("The insertion should have timed-out");
        } catch (Exception e) {
            // Only a CAS write timeout is expected here; anything else is rethrown. The exception is recognised by
            // its simple class name rather than by its type, which should be improved at the dtest API level.
            boolean isCasWriteTimeout = "CasWriteTimeoutException".equals(e.getClass().getSimpleName());
            if (!isCasWriteTimeout) {
                throw e;
            }
        } finally {
            proposeDropper.off();
        }

        // Step 2: isolate node 3 and execute the first SERIAL operation. As neither node 1 nor node 2 got the
        // initial insert proposal, there is nothing to "replay" and the operation should assert that the table is
        // still empty.
        IMessageFilters.Filter node3Isolation = cluster.filters().verbs(paxosAndReadVerbs()).to(3).drop();
        IMessageFilters.Filter commitDropper = loseCommitOfOperation1
                                               ? cluster.filters().verbs(PAXOS_COMMIT_REQ.id).to(1, 2).drop()
                                               : null;
        try {
            postTimeoutOperation1.accept(tableName, cluster.coordinator(1));
        } finally {
            node3Isolation.off();
            if (commitDropper != null) {
                commitDropper.off();
            }
        }

        // Step 3: node 3 is now back, and node 2 is isolated to ensure the next operation hits nodes 1 and 3.
        // Despite node 3 having the initial insert in its paxos state, in a position to be replayed, that insert
        // must _not_ be replayed (it would contradict serializability, since the previous operation asserted that
        // nothing was inserted). It is this execution that failed before CASSANDRA-12126.
        IMessageFilters.Filter node2Isolation = cluster.filters().verbs(paxosAndReadVerbs()).to(2).drop();
        try {
            postTimeoutOperation2.accept(tableName, cluster.coordinator(1));
        } finally {
            node2Isolation.off();
        }
    }
}
use of org.apache.cassandra.distributed.api.ICoordinator in project cassandra by apache.
the class SimpleReadWriteTest method writeRows.
/**
 * Writes {@code numPartitions} partitions with {@code rowsPerPartition} rows each, and then overrides some of
 * those rows so that the data is spread across different sstables and memtables.
 *
 * @param numPartitions the number of partitions to write
 * @param rowsPerPartition the number of rows written in each partition
 */
private void writeRows(int numPartitions, int rowsPerPartition) {
    String updateQuery = withTable("UPDATE %s SET v=? WHERE k=? AND c=?");
    ICoordinator node1 = cluster.coordinator(1);

    // first pass: write every row of every partition, then flush so they all land in a single sstable
    for (int clustering = 0; clustering < rowsPerPartition; clustering++) {
        for (int partition = 0; partition < numPartitions; partition++) {
            node1.execute(updateQuery, QUORUM, clustering, partition, clustering);
        }
    }
    cluster.forEach(instance -> instance.flush(KEYSPACE));

    // second pass: override every SECOND_SSTABLE_INTERVAL-th row, then flush into a second sstable
    for (int clustering = 0; clustering < rowsPerPartition; clustering += SECOND_SSTABLE_INTERVAL) {
        for (int partition = 0; partition < numPartitions; partition++) {
            node1.execute(updateQuery, QUORUM, clustering + rowsPerPartition, partition, clustering);
        }
    }
    cluster.forEach(instance -> instance.flush(KEYSPACE));

    // third pass: override every MEMTABLE_INTERVAL-th row without flushing, so the overrides stay in memtable
    for (int clustering = 0; clustering < rowsPerPartition; clustering += MEMTABLE_INTERVAL) {
        for (int partition = 0; partition < numPartitions; partition++) {
            node1.execute(updateQuery, QUORUM, clustering + rowsPerPartition * 2, partition, clustering);
        }
    }
}
use of org.apache.cassandra.distributed.api.ICoordinator in project cassandra by apache.
the class SimpleReadWriteTest method readRows.
/**
 * Runs the specified query in all coordinators, with and without paging, verifying that every execution returns
 * the same rows as the execution immediately before it.
 *
 * @param query the query to run; it is passed through {@code withTable} before being executed
 * @param boundValues the values to bind to the query
 * @return the rows returned by the last execution, with the clustering reverse sorting undone when
 * {@code reverse} is set, and sorted by their first column (the partition key)
 * @throws AssertionError if two consecutive executions return different rows; the message names both executions
 * and the original comparison failure is kept as the cause
 */
private Object[][] readRows(String query, Object... boundValues) {
    query = withTable(query);
    // verify that all coordinators return the same results for the query, regardless of paging
    Object[][] lastRows = null;
    int lastNode = 1;
    boolean lastPaging = false;
    for (int node = 1; node <= NUM_NODES; node++) {
        ICoordinator coordinator = cluster.coordinator(node);
        for (boolean paging : BOOLEANS) {
            Object[][] rows = paging
                              ? Iterators.toArray(coordinator.executeWithPaging(query, QUORUM, 1, boundValues), Object[].class)
                              : coordinator.execute(query, QUORUM, boundValues);
            if (lastRows != null) {
                try {
                    assertRows(lastRows, rows);
                } catch (AssertionError e) {
                    String message = String.format("Node %d %s paging has returned different results " + "for the same query than node %d %s paging:\n%s", node, paging ? "with" : "without", lastNode, lastPaging ? "with" : "without", e.getMessage());
                    // keep the original comparison failure as the cause so its stack trace is not lost
                    throw new AssertionError(message, e);
                }
            }
            // Remember which execution produced the reference rows. The node must be recorded here, together
            // with the rows and the paging flag; recording it only after the inner loop would make a mismatch
            // between the two paging modes of the same node be reported against the previous node.
            lastRows = rows;
            lastNode = node;
            lastPaging = paging;
        }
    }
    Assert.assertNotNull(lastRows);
    // undo the clustering reverse sorting to ease validation
    if (reverse) {
        ArrayUtils.reverse(lastRows);
    }
    // sort by partition key to ease validation
    Arrays.sort(lastRows, Comparator.comparing(row -> (int) row[0]));
    return lastRows;
}
use of org.apache.cassandra.distributed.api.ICoordinator in project cassandra by apache.
the class ReadDigestConsistencyTest method checkTraceForDigestMismatch.
/**
 * Executes the given query with tracing at consistency level ALL through the given coordinator, then reads the
 * trace back with {@code SELECT_TRACE} and fails if any returned activity mentions a digest mismatch.
 *
 * <p>A {@code SyntaxException} is tolerated (the method just returns) when the coordinator's release version
 * starts with "3." and the query contains a '[' character.
 *
 * @param coordinator the coordinator used both to run the traced query and to read the trace
 * @param query the query to execute with tracing
 * @param boundValues the values to bind to the query
 * @throws RuntimeException any failure of the traced execution other than the tolerated syntax error
 */
public static void checkTraceForDigestMismatch(ICoordinator coordinator, String query, Object... boundValues) {
    UUID sessionId = UUID.randomUUID();
    try {
        coordinator.executeWithTracing(sessionId, query, ConsistencyLevel.ALL, boundValues);
    } catch (RuntimeException ex) {
        // the exception class is compared by name rather than by type
        boolean isSyntaxError = Throwables.isCausedBy(ex, t -> t.getClass().getName().equals(SyntaxException.class.getName()));
        if (isSyntaxError && coordinator.instance().getReleaseVersionString().startsWith("3.") && query.contains("[")) {
            logger.warn("Query {} is not supported on node {} version {}", query, coordinator.instance().broadcastAddress().getAddress().getHostAddress(), coordinator.instance().getReleaseVersionString());
            // we can forgive SyntaxException for C* < 4.0 if the query contains collection element selection
            return;
        }
        logger.error("Failing for coordinator {} and query {}", coordinator.instance().getReleaseVersionString(), query);
        throw ex;
    }
    // read back the trace of the session, binding the session id and the coordinator's broadcast address
    Object[][] results = coordinator.execute(SELECT_TRACE, ConsistencyLevel.ALL, sessionId, coordinator.instance().broadcastAddress().getAddress());
    for (Object[] result : results) {
        String activity = (String) result[0];
        // lower-case with Locale.ROOT so the match does not depend on the JVM's default locale
        Assert.assertFalse(String.format("Found Digest Mismatch while executing query: %s with bound values %s on %s/%s", query, Arrays.toString(boundValues), coordinator.instance().broadcastAddress(), coordinator.instance().getReleaseVersionString()), activity.toLowerCase(java.util.Locale.ROOT).contains("mismatch for key"));
    }
}
use of org.apache.cassandra.distributed.api.ICoordinator in project cassandra by apache.
the class ReadRepairTest method testRangeSliceQueryWithTombstones.
/**
 * Checks that range queries with CL>ONE do not issue unnecessary read-repairs when tombstones are present.
 * <p>
 * See CASSANDRA-8989 and CASSANDRA-9502.
 * <p>
 * Migrated from Python dtest read_repair_test.py:TestReadRepair.test_range_slice_query_with_tombstones()
 *
 * @param flush whether every node is flushed after the writes and deletions, before the queries are run
 */
private void testRangeSliceQueryWithTombstones(boolean flush) throws Throwable {
    try (Cluster cluster = init(Cluster.create(2))) {
        cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c int, v int, PRIMARY KEY(k, c))"));
        ICoordinator coordinator = cluster.coordinator(1);

        // write 10 partitions of 10 rows each on all nodes
        String insertQuery = withKeyspace("INSERT INTO %s.t (k, c, v) VALUES (?, ?, ?)");
        for (int pk = 0; pk < 10; pk++) {
            for (int ck = 0; ck < 10; ck++) {
                coordinator.execute(insertQuery, ALL, pk, ck, pk * ck);
            }
        }

        // partition deletions: cover a subset of the written partitions, plus some that were never written
        String deletePartitionQuery = withKeyspace("DELETE FROM %s.t WHERE k = ?");
        for (int pk = 5; pk < 15; pk++) {
            coordinator.execute(deletePartitionQuery, ALL, pk);
        }

        // row deletions in written partitions, both deleted and not deleted ones
        String deleteRowQuery = withKeyspace("DELETE FROM %s.t WHERE k = ? AND c = ?");
        for (int pk = 2; pk < 7; pk++) {
            for (int ck = 0; ck < 5; ck++) {
                coordinator.execute(deleteRowQuery, ALL, pk, ck);
            }
        }

        // row deletions in not-existent partitions, both deleted and never-written ones
        for (int pk = 12; pk < 17; pk++) {
            for (int ck = 0; ck < 5; ck++) {
                coordinator.execute(deleteRowQuery, ALL, pk, ck);
            }
        }

        // optionally flush every node
        if (flush) {
            for (int node = 1; node <= cluster.size(); node++) {
                cluster.get(node).flush(KEYSPACE);
            }
        }

        // run range, partition, row and slice queries; none of them should trigger read repair
        coordinator.execute(withKeyspace("SELECT * FROM %s.t LIMIT 100"), QUORUM);
        for (int pk = 0; pk < 15; pk++) {
            coordinator.execute(withKeyspace("SELECT * FROM %s.t WHERE k=?"), QUORUM, pk);
            for (int ck = 0; ck < 10; ck++) {
                coordinator.execute(withKeyspace("SELECT * FROM %s.t WHERE k=? AND c=?"), QUORUM, pk, ck);
                coordinator.execute(withKeyspace("SELECT * FROM %s.t WHERE k=? AND c>?"), QUORUM, pk, ck);
                coordinator.execute(withKeyspace("SELECT * FROM %s.t WHERE k=? AND c<?"), QUORUM, pk, ck);
            }
        }

        // the read repair request count reported for node 1 and table "t" must still be zero
        long repairRequests = ReadRepairTester.readRepairRequestsCount(cluster.get(1), "t");
        assertEquals("No read repair requests were expected, found " + repairRequests, 0, repairRequests);
    }
}
Aggregations