use of org.apache.cassandra.hints.Hint in project cassandra by apache.
the class StorageProxy method shouldHint.
/**
* Determines whether a hint should be stored or not.
* It rejects early if any of the following conditions is met:
* - Hints are disabled entirely or for the datacenter the replica belongs to
* - The replica is transient or is the local node itself
* - The replica is no longer part of the ring
* - The hint window has expired
* - The hints for the node have reached the size limit
* Otherwise, it permits the hint.
*
* @param replica the replica for the hint
* @param tryEnablePersistentWindow true to consider hint_window_persistent_enabled; otherwise it is ignored
* @return true to permit the hint, or false to reject it
*/
public static boolean shouldHint(Replica replica, boolean tryEnablePersistentWindow) {
if (!DatabaseDescriptor.hintedHandoffEnabled() || replica.isTransient() || replica.isSelf())
return false;
Set<String> disabledDCs = DatabaseDescriptor.hintedHandoffDisabledDCs();
if (!disabledDCs.isEmpty()) {
final String dc = DatabaseDescriptor.getEndpointSnitch().getDatacenter(replica);
if (disabledDCs.contains(dc)) {
Tracing.trace("Not hinting {} since its data center {} has been disabled {}", replica, dc, disabledDCs);
return false;
}
}
InetAddressAndPort endpoint = replica.endpoint();
int maxHintWindow = DatabaseDescriptor.getMaxHintWindow();
long endpointDowntime = Gossiper.instance.getEndpointDowntime(endpoint);
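// a node whose gossip-reported downtime exceeds the configured max hint window no longer qualifies for hints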
boolean hintWindowExpired = endpointDowntime > maxHintWindow;
UUID hostIdForEndpoint = StorageService.instance.getHostIdForEndpoint(endpoint);
if (hostIdForEndpoint == null) {
Tracing.trace("Discarding hint for endpoint not part of ring: {}", endpoint);
return false;
}
// if the hint window is persistent, hintWindowExpired may be updated based on the timestamp of the earliest stored hint
if (tryEnablePersistentWindow && !hintWindowExpired && DatabaseDescriptor.hintWindowPersistentEnabled()) {
long earliestHint = HintsService.instance.getEarliestHintForHost(hostIdForEndpoint);
hintWindowExpired = Clock.Global.currentTimeMillis() - maxHintWindow > earliestHint;
if (hintWindowExpired)
Tracing.trace("Not hinting {} for which there is the earliest hint stored at {}", replica, earliestHint);
}
if (hintWindowExpired) {
HintsService.instance.metrics.incrPastWindow(endpoint);
Tracing.trace("Not hinting {} which has been down {} ms", endpoint, endpointDowntime);
return false;
}
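// finally, reject if the hints already stored on disk for this host exceed the configured per-host cap (a cap of 0 disables this check)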
long maxHintsSize = DatabaseDescriptor.getMaxHintsSizePerHost();
long actualTotalHintsSize = HintsService.instance.getTotalHintsSize(hostIdForEndpoint);
boolean hasHintsReachedMaxSize = maxHintsSize > 0 && actualTotalHintsSize > maxHintsSize;
if (hasHintsReachedMaxSize) {
Tracing.trace("Not hinting {} which has reached to the max hints size {} bytes on disk. The actual hints size on disk: {}", endpoint, maxHintsSize, actualTotalHintsSize);
return false;
}
return true;
}
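For context, a minimal caller sketch for the check above; the helper and its surrounding names are illustrative stand-ins, not actual StorageProxy code:
import java.util.ArrayList;
import java.util.List;

// Illustrative only: filter candidate replicas down to those that still qualify for
// hinting, using the same check the write path performs before storing a hint.
static List<Replica> hintableReplicas(Iterable<Replica> candidates) {
    List<Replica> hintable = new ArrayList<>();
    for (Replica replica : candidates) {
        if (shouldHint(replica, true)) // true: honor hint_window_persistent_enabled
            hintable.add(replica);
    }
    return hintable;
}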
use of org.apache.cassandra.hints.Hint in project cassandra by apache.
the class StorageProxy method sendToHintedEndpoints.
/**
* Sends the mutation to the right targets, applying it locally if this node is a target
* and writing a hint when a target node is not available.
*
* Note about hints:
* <pre>
* {@code
* | Hinted Handoff | Consist. Level |
* | on | >=1 | --> wait for hints. We DO NOT notify the handler with handler.response() for hints;
* | on | ANY | --> wait for hints. Responses count towards consistency.
* | off | >=1 | --> DO NOT fire hints. And DO NOT wait for them to complete.
* | off | ANY | --> DO NOT fire hints. And DO NOT wait for them to complete.
* }
* </pre>
*
* @throws OverloadedException if the hints cannot be written/enqueued
*/
public static void sendToHintedEndpoints(final Mutation mutation, Iterable<InetAddress> targets, AbstractWriteResponseHandler<IMutation> responseHandler, String localDataCenter, Stage stage) throws OverloadedException {
int targetsSize = Iterables.size(targets);
// replicas in the local datacenter:
Collection<InetAddress> localDc = null;
// extra-datacenter replicas, grouped by dc
Map<String, Collection<InetAddress>> dcGroups = null;
// only need to create a Message for non-local writes
MessageOut<Mutation> message = null;
boolean insertLocal = false;
ArrayList<InetAddress> endpointsToHint = null;
List<InetAddress> backPressureHosts = null;
for (InetAddress destination : targets) {
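// may throw OverloadedException if too many hints are already in flight for this destination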
checkHintOverload(destination);
if (FailureDetector.instance.isAlive(destination)) {
if (canDoLocalRequest(destination)) {
insertLocal = true;
} else {
// belongs on a different server
if (message == null)
message = mutation.createMessage();
String dc = DatabaseDescriptor.getEndpointSnitch().getDatacenter(destination);
// (1.1 knows how to forward old-style String message IDs; updated to int in 2.0)
if (localDataCenter.equals(dc)) {
if (localDc == null)
localDc = new ArrayList<>(targetsSize);
localDc.add(destination);
} else {
Collection<InetAddress> messages = (dcGroups != null) ? dcGroups.get(dc) : null;
if (messages == null) {
// most DCs will have <= 3 replicas
messages = new ArrayList<>(3);
if (dcGroups == null)
dcGroups = new HashMap<>();
dcGroups.put(dc, messages);
}
messages.add(destination);
}
if (backPressureHosts == null)
backPressureHosts = new ArrayList<>(targetsSize);
backPressureHosts.add(destination);
}
} else {
if (shouldHint(destination)) {
if (endpointsToHint == null)
endpointsToHint = new ArrayList<>(targetsSize);
endpointsToHint.add(destination);
}
}
}
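// let slow or overwhelmed remote replicas throttle the coordinator before the messages are actually sent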
if (backPressureHosts != null)
MessagingService.instance().applyBackPressure(backPressureHosts, responseHandler.currentTimeout());
if (endpointsToHint != null)
submitHint(mutation, endpointsToHint, responseHandler);
if (insertLocal)
performLocally(stage, Optional.of(mutation), mutation::apply, responseHandler);
if (localDc != null) {
for (InetAddress destination : localDc) MessagingService.instance().sendRR(message, destination, responseHandler, true);
}
if (dcGroups != null) {
// for each datacenter, send the message to one node to relay the write to other replicas
for (Collection<InetAddress> dcTargets : dcGroups.values()) sendMessagesToNonlocalDC(message, dcTargets, responseHandler);
}
}
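The datacenter bucketing inside the loop above is easier to see in isolation; this sketch repeats the same pattern with a plain Function standing in for the snitch lookup (illustrative, not StorageProxy code):
import java.net.InetAddress;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.function.Function;

// Same bucketing pattern as above: group remote write targets by datacenter so that
// each remote DC can later be reached through a single forwarding node.
static Map<String, Collection<InetAddress>> groupByDatacenter(Iterable<InetAddress> targets, Function<InetAddress, String> datacenterOf) {
    Map<String, Collection<InetAddress>> groups = new HashMap<>();
    for (InetAddress target : targets)
        groups.computeIfAbsent(datacenterOf.apply(target), dc -> new ArrayList<>(3)).add(target); // most DCs have <= 3 replicas
    return groups;
}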
use of org.apache.cassandra.hints.Hint in project cassandra by apache.
the class HintsServiceMetricsTest method testHintsServiceMetrics.
@Test
public void testHintsServiceMetrics() throws Exception {
// set up a 3-node cluster with a ByteBuddy injection that makes the writing of some hints fail
try (Cluster cluster = builder().withNodes(3).withConfig(config -> config.with(NETWORK, GOSSIP, NATIVE_PROTOCOL)).withInstanceInitializer(FailHints::install).start()) {
// set up a message filter to drop some of the hint request messages from node1
AtomicInteger hintsNode2 = new AtomicInteger();
AtomicInteger hintsNode3 = new AtomicInteger();
cluster.filters().verbs(Verb.HINT_REQ.id).from(1).messagesMatching((from, to, message) -> (to == 2 && hintsNode2.incrementAndGet() <= NUM_TIMEOUTS_PER_NODE) || (to == 3 && hintsNode3.incrementAndGet() <= NUM_TIMEOUTS_PER_NODE)).drop();
// set up a message filter to drop mutation requests from node1, so it creates hints for those mutations
AtomicBoolean dropWritesForNode2 = new AtomicBoolean(false);
AtomicBoolean dropWritesForNode3 = new AtomicBoolean(false);
cluster.filters().verbs(Verb.MUTATION_REQ.id).from(1).messagesMatching((from, to, message) -> (to == 2 && dropWritesForNode2.get()) || (to == 3 && dropWritesForNode3.get())).drop();
// fix under-replicated keyspaces so they don't produce hint requests while we are dropping mutations
fixDistributedSchemas(cluster);
cluster.schemaChange(withKeyspace("CREATE KEYSPACE %s WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 3}"));
cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int PRIMARY KEY, v int)"));
ICoordinator coordinator = cluster.coordinator(1);
IInvokableInstance node1 = cluster.get(1);
IInvokableInstance node2 = cluster.get(2);
IInvokableInstance node3 = cluster.get(3);
// write the first half of the rows with the second node dropping mutation requests,
// so some hints will be created for that node
dropWritesForNode2.set(true);
for (int i = 0; i < NUM_ROWS / 2; i++) coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v) VALUES (?, ?)"), QUORUM, i, i);
dropWritesForNode2.set(false);
// write the second half of the rows with the third node dropping mutation requests,
// so some hints will be created for that node
dropWritesForNode3.set(true);
for (int i = NUM_ROWS / 2; i < NUM_ROWS; i++) coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v) VALUES (?, ?)"), QUORUM, i, i);
dropWritesForNode3.set(false);
// wait until all the hints have been successfully applied to the nodes that have been dropping mutations
waitUntilAsserted(() -> assertThat(countRows(node2)).isEqualTo(countRows(node3)).isEqualTo(NUM_ROWS));
// Verify the metrics for the coordinator node, which is the only one actually sending hints.
// The hint delivery errors that we have injected should have made the service try to send them again.
// These retries are done periodically and in pages, so they may resend some hints that were already
// successfully delivered. As a result, there may be more succeeded hints than actual hints/rows.
waitUntilAsserted(() -> assertThat(countHintsSucceeded(node1)).isGreaterThanOrEqualTo(NUM_ROWS));
waitUntilAsserted(() -> assertThat(countHintsFailed(node1)).isEqualTo(NUM_FAILURES_PER_NODE * 2));
waitUntilAsserted(() -> assertThat(countHintsTimedOut(node1)).isEqualTo(NUM_TIMEOUTS_PER_NODE * 2));
// verify delay metrics
long numGlobalDelays = countGlobalDelays(node1);
assertThat(numGlobalDelays).isGreaterThanOrEqualTo(NUM_ROWS);
assertThat(countEndpointDelays(node1, node1)).isEqualTo(0);
assertThat(countEndpointDelays(node1, node2)).isGreaterThan(0).isLessThanOrEqualTo(numGlobalDelays);
assertThat(countEndpointDelays(node1, node3)).isGreaterThan(0).isLessThanOrEqualTo(numGlobalDelays);
assertThat(countEndpointDelays(node1, node2) + countEndpointDelays(node1, node3)).isGreaterThanOrEqualTo(numGlobalDelays);
// verify that the metrics for the non-coordinator nodes are zero
for (IInvokableInstance node : Arrays.asList(node2, node3)) {
assertThat(countHintsSucceeded(node)).isEqualTo(0);
assertThat(countHintsFailed(node)).isEqualTo(0);
assertThat(countHintsTimedOut(node)).isEqualTo(0);
assertThat(countGlobalDelays(node)).isEqualTo(0);
cluster.forEach(target -> assertThat(countEndpointDelays(node, target)).isEqualTo(0));
}
}
}
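The test leans on a waitUntilAsserted helper whose implementation is not shown here; a minimal sketch of what such a retry-until-timeout helper could look like, with the timeout and poll interval as assumptions:
import java.util.concurrent.TimeUnit;

// Hypothetical sketch only: retry an assertion until it passes or a timeout elapses.
// The in-jvm dtest utilities provide the real helper; this merely shows the idea.
static void waitUntilAsserted(Runnable assertion) throws InterruptedException {
    long deadline = System.nanoTime() + TimeUnit.MINUTES.toNanos(2); // assumed timeout
    while (true) {
        try {
            assertion.run();
            return;
        } catch (AssertionError retry) {
            if (System.nanoTime() > deadline)
                throw retry;
            TimeUnit.MILLISECONDS.sleep(500); // assumed poll interval
        }
    }
}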
use of org.apache.cassandra.hints.Hint in project cassandra by apache.
the class StorageProxy method mutate.
/**
* Use this method to have these Mutations applied
* across all replicas. This method will take care
* of the possibility of a replica being down, storing
* a hint locally so the write can be replayed to it later.
*
* @param mutations the mutations to be applied across the replicas
* @param consistencyLevel the consistency level for the operation
* @param queryStartNanoTime the value of nanoTime() when the query started to be processed
*/
public static void mutate(List<? extends IMutation> mutations, ConsistencyLevel consistencyLevel, long queryStartNanoTime) throws UnavailableException, OverloadedException, WriteTimeoutException, WriteFailureException {
Tracing.trace("Determining replicas for mutation");
final String localDataCenter = DatabaseDescriptor.getEndpointSnitch().getLocalDatacenter();
long startTime = nanoTime();
List<AbstractWriteResponseHandler<IMutation>> responseHandlers = new ArrayList<>(mutations.size());
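// a single mutation is a SIMPLE write; several mutations submitted together count as an UNLOGGED_BATCH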
WriteType plainWriteType = mutations.size() <= 1 ? WriteType.SIMPLE : WriteType.UNLOGGED_BATCH;
try {
for (IMutation mutation : mutations) {
if (hasLocalMutation(mutation))
writeMetrics.localRequests.mark();
else
writeMetrics.remoteRequests.mark();
if (mutation instanceof CounterMutation)
responseHandlers.add(mutateCounter((CounterMutation) mutation, localDataCenter, queryStartNanoTime));
else
responseHandlers.add(performWrite(mutation, consistencyLevel, localDataCenter, standardWritePerformer, null, plainWriteType, queryStartNanoTime));
}
// upgrade to full quorum any failed cheap quorums
for (int i = 0; i < mutations.size(); ++i) {
// at the moment, only non-counter writes support cheap quorums
if (!(mutations.get(i) instanceof CounterMutation))
responseHandlers.get(i).maybeTryAdditionalReplicas(mutations.get(i), standardWritePerformer, localDataCenter);
}
// wait for writes; throws WriteTimeoutException if necessary
for (AbstractWriteResponseHandler<IMutation> responseHandler : responseHandlers) responseHandler.get();
} catch (WriteTimeoutException | WriteFailureException ex) {
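// at ANY, locally stored hints are enough to satisfy the consistency contract, so hint instead of failing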
if (consistencyLevel == ConsistencyLevel.ANY) {
hintMutations(mutations);
} else {
if (ex instanceof WriteFailureException) {
writeMetrics.failures.mark();
writeMetricsForLevel(consistencyLevel).failures.mark();
WriteFailureException fe = (WriteFailureException) ex;
Tracing.trace("Write failure; received {} of {} required replies, failed {} requests", fe.received, fe.blockFor, fe.failureReasonByEndpoint.size());
} else {
writeMetrics.timeouts.mark();
writeMetricsForLevel(consistencyLevel).timeouts.mark();
WriteTimeoutException te = (WriteTimeoutException) ex;
Tracing.trace("Write timeout; received {} of {} required replies", te.received, te.blockFor);
}
throw ex;
}
} catch (UnavailableException e) {
writeMetrics.unavailables.mark();
writeMetricsForLevel(consistencyLevel).unavailables.mark();
Tracing.trace("Unavailable");
throw e;
} catch (OverloadedException e) {
writeMetrics.unavailables.mark();
writeMetricsForLevel(consistencyLevel).unavailables.mark();
Tracing.trace("Overloaded");
throw e;
} finally {
long latency = nanoTime() - startTime;
writeMetrics.addNano(latency);
writeMetricsForLevel(consistencyLevel).addNano(latency);
updateCoordinatorWriteLatencyTableMetric(mutations, latency);
}
}
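A hedged caller sketch for the contract above; building real Mutation objects is elided, and the helper name is illustrative:
import java.util.List;

// Illustrative only: how a coordinator-side caller might invoke mutate(), capturing the
// query start time that the latency metrics in the finally block are based on.
static void applyAtQuorum(List<? extends IMutation> mutations) {
    long queryStartNanoTime = System.nanoTime();
    // Cassandra's request exceptions are unchecked; at ConsistencyLevel.ANY a timeout or
    // failure would be absorbed by hinting instead of being rethrown to the caller.
    StorageProxy.mutate(mutations, ConsistencyLevel.QUORUM, queryStartNanoTime);
}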