use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.
the class SelectStatement method process.
/**
 * Consumes every partition of {@code partitions} into a result set builder, then builds, orders and trims
 * the resulting rows.
 *
 * @param partitions the partitions to consume; each one is closed as soon as it has been processed.
 * @param options the query options, used to create the builder and forwarded to {@code processPartition}.
 * @param nowInSec forwarded unchanged to {@code processPartition}.
 * @param userLimit the limit handed to {@code ResultSet.trim} once the rows have been ordered.
 * @return the built, ordered and trimmed result set.
 */
private ResultSet process(PartitionIterator partitions, QueryOptions options, int nowInSec, int userLimit) throws InvalidRequestException {
    final Selection.ResultSetBuilder builder = selection.resultSetBuilder(options, parameters.isJson, aggregationSpec);

    // Feed the builder one partition at a time; try-with-resources closes each partition before the next is pulled.
    for (; partitions.hasNext(); ) {
        try (RowIterator rows = partitions.next()) {
            processPartition(rows, options, builder, nowInSec);
        }
    }

    // Ordering happens before trimming, so the limit applies to the ordered rows.
    final ResultSet built = builder.build();
    orderResults(built);
    built.trim(userLimit);
    return built;
}
use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.
the class StorageProxy method doPaxos.
/**
* Performs the Paxos rounds for a given proposal, retrying when preempted until the timeout.
*
* <p>The main 'configurable' of this method is the {@code createUpdateProposal} method: it is called by the method
* once a ballot has been successfully 'prepared' to generate the update to 'propose' (and commit if the proposal is
* successful). That method also generates the result that the whole method will return. Note that due to retrying,
* this method may be called multiple times and does not have to return the same results.
*
* @param metadata the table to update with Paxos.
* @param key the partition updated.
* @param consistencyForPaxos the serial consistency of the operation (either {@link ConsistencyLevel#SERIAL} or
* {@link ConsistencyLevel#LOCAL_SERIAL}).
* @param consistencyForReplayCommits the consistency for the commit phase of "replayed" in-progress operations.
* @param consistencyForCommit the consistency for the commit phase of _this_ operation update.
* @param queryStartNanoTime the nano time for the start of the query this is part of. This is the base time for
* timeouts.
* @param casMetrics the metrics to update for this operation.
* @param createUpdateProposal method called after a successful 'prepare' phase to obtain 1) the actual update of
* this operation and 2) the result that the whole method should return. This can return {@code null} in the
* special case where, after having "prepared" (and thus potentially replayed in-progress updates), we don't want
* to propose anything (the whole method then returns {@code null}).
* @return the second element of the pair returned by {@code createUpdateProposal} (for the last call of that method
* if that method is called multiple times due to retries).
* @throws CasWriteTimeoutException if the contention timeout elapses before a proposal is accepted, or in place of
* a {@code WriteTimeoutException} raised while proposing or committing; it carries the contention count
* accumulated so far.
*/
private static RowIterator doPaxos(TableMetadata metadata, DecoratedKey key, ConsistencyLevel consistencyForPaxos, ConsistencyLevel consistencyForReplayCommits, ConsistencyLevel consistencyForCommit, long queryStartNanoTime, CASClientRequestMetrics casMetrics, Supplier<Pair<PartitionUpdate, RowIterator>> createUpdateProposal) throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException {
// Running contention count; it is reported through recordCasContention in the finally block below.
int contentions = 0;
Keyspace keyspace = Keyspace.open(metadata.keyspace);
// Refreshed from the replica plan on every attempt; also used to compute blockFor for the final timeout exception.
AbstractReplicationStrategy latestRs = keyspace.getReplicationStrategy();
try {
consistencyForPaxos.validateForCas();
consistencyForReplayCommits.validateForCasCommit(latestRs);
consistencyForCommit.validateForCasCommit(latestRs);
long timeoutNanos = DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS);
while (nanoTime() - queryStartNanoTime < timeoutNanos) {
// for simplicity, we'll do a single liveness check at the start of each attempt
ReplicaPlan.ForPaxosWrite replicaPlan = ReplicaPlans.forPaxos(keyspace, key, consistencyForPaxos);
latestRs = replicaPlan.replicationStrategy();
PaxosBallotAndContention pair = beginAndRepairPaxos(queryStartNanoTime, key, metadata, replicaPlan, consistencyForPaxos, consistencyForReplayCommits, casMetrics);
final UUID ballot = pair.ballot;
contentions += pair.contentions;
Pair<PartitionUpdate, RowIterator> proposalPair = createUpdateProposal.get();
// See method javadoc: null here is code for "stop here and return null".
if (proposalPair == null)
return null;
Commit proposal = Commit.newProposal(ballot, proposalPair.left);
Tracing.trace("CAS precondition is met; proposing client-requested updates for {}", ballot);
if (proposePaxos(proposal, replicaPlan, true, queryStartNanoTime)) {
// An accepted proposal is only committed when its update is non-empty; empty updates skip the commit round.
// NOTE(review): the original comment here was truncated ("... this is worth bothering."); presumably it
// explained why skipping the commit of empty updates is a worthwhile optimization -- confirm against upstream.
if (!proposal.update.isEmpty())
commitPaxos(proposal, consistencyForCommit, true, queryStartNanoTime);
// A non-null result is traced as "did not apply", a null one as "applied".
RowIterator result = proposalPair.right;
if (result != null)
Tracing.trace("CAS did not apply");
else
Tracing.trace("CAS applied successfully");
return result;
}
Tracing.trace("Paxos proposal not accepted (pre-empted by a higher ballot)");
contentions++;
// Back off for a random 0-99 ms before the next attempt.
Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
// continue to retry
}
} catch (CasWriteTimeoutException e) {
// Might be thrown by beginAndRepairPaxos. In that case, any contention that happened within the method and
// led up to the timeout was not accounted in our local 'contentions' variable and we add it now so that the
// contention recorded in the finally is correct.
contentions += e.contentions;
throw e;
} catch (WriteTimeoutException e) {
// Might be thrown by proposePaxos or commitPaxos
throw new CasWriteTimeoutException(e.writeType, e.consistency, e.received, e.blockFor, contentions);
} finally {
recordCasContention(metadata, key, casMetrics, contentions);
}
// Only reached when the retry loop ended because the contention timeout elapsed without an accepted proposal.
throw new CasWriteTimeoutException(WriteType.CAS, consistencyForPaxos, 0, consistencyForPaxos.blockFor(latestRs), contentions);
}
use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.
the class StorageProxy method cas.
/**
* Apply the updates of {@code request} if and only if the current values in the row for {@code key}
* match the conditions of {@code request}. The algorithm is "raw" Paxos: that is, Paxos
* minus leader election -- any node in the cluster may propose changes for any row,
* which (that is, the row) is the unit of values being proposed, not single columns.
*
* The Paxos cohort is only the replicas for the given key, not the entire cluster.
* So we expect performance to be reasonable, but CAS is still intended to be used
* "when you really need it," not for all your updates.
*
* There are three phases to Paxos:
* 1. Prepare: the coordinator generates a ballot (timeUUID in our case) and asks replicas to (a) promise
* not to accept updates from older ballots and (b) tell us about the most recent update it has already
* accepted.
* 2. Accept: if a majority of replicas respond, the coordinator asks replicas to accept the value of the
* highest proposal ballot it heard about, or a new value if no in-progress proposals were reported.
* 3. Commit (Learn): if a majority of replicas acknowledge the accept request, we can commit the new
* value.
*
* Commit procedure is not covered in "Paxos Made Simple," and only briefly mentioned in "Paxos Made Live,"
* so here is our approach:
* 3a. The coordinator sends a commit message to all replicas with the ballot and value.
* 3b. Because of 1-2, this will be the highest-seen commit ballot. The replicas will note that,
* and send it with subsequent promise replies. This allows us to discard acceptance records
* for successfully committed replicas, without allowing incomplete proposals to commit erroneously
* later on.
*
* Note that since we are performing a CAS rather than a simple update, we perform a read (of committed
* values) between the prepare and accept phases. This gives us a slightly longer window for another
* coordinator to come along and trump our own promise with a newer one but is otherwise safe.
*
* @param keyspaceName the keyspace for the CAS
* @param cfName the column family for the CAS
* @param key the row key for the row to CAS
* @param request the conditions for the CAS to apply as well as the update to perform if the conditions hold.
* @param consistencyForPaxos the consistency for the paxos prepare and propose round. This can only be either SERIAL or LOCAL_SERIAL.
* @param consistencyForCommit the consistency for write done during the commit phase. This can be anything, except SERIAL or LOCAL_SERIAL.
* @param state the client state handed to {@code request.makeUpdates} when the updates are built.
* @param nowInSeconds the time handed to {@code request.readCommand} to build the precondition read.
* @param queryStartNanoTime the nano time for the start of the query, forwarded to the precondition read and to {@code doPaxos}.
*
* @return null if the operation succeeds in updating the row, or the current values corresponding to conditions.
* (since, if the CAS doesn't succeed, it means the current values do not match the conditions).
*/
public static RowIterator cas(String keyspaceName, String cfName, DecoratedKey key, CASRequest request, ConsistencyLevel consistencyForPaxos, ConsistencyLevel consistencyForCommit, ClientState state, int nowInSeconds, long queryStartNanoTime) throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException, CasWriteUnknownResultException {
// Captured before any work so the latency recorded in the finally block covers the whole call.
final long startTimeForMetrics = nanoTime();
try {
TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName);
// Reject the write up front when the denylist is enabled for writes and does not permit this key.
if (DatabaseDescriptor.getPartitionDenylistEnabled() && DatabaseDescriptor.getDenylistWritesEnabled() && !partitionDenylist.isKeyPermitted(keyspaceName, cfName, key.getKey())) {
denylistMetrics.incrementWritesRejected();
throw new InvalidRequestException(String.format("Unable to CAS write to denylisted partition [0x%s] in %s/%s", key.toString(), keyspaceName, cfName));
}
// Supplies doPaxos with the (update to propose, result to return) pair; doPaxos calls it on each attempt,
// so the read below may run more than once.
Supplier<Pair<PartitionUpdate, RowIterator>> updateProposer = () -> {
// read the current values and check they validate the conditions
Tracing.trace("Reading existing values for CAS precondition");
SinglePartitionReadCommand readCommand = (SinglePartitionReadCommand) request.readCommand(nowInSeconds);
// LOCAL_SERIAL reads at LOCAL_QUORUM; any other paxos consistency reads at QUORUM.
ConsistencyLevel readConsistency = consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL ? ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.QUORUM;
FilteredPartition current;
// Copy the read result into a FilteredPartition so the iterator can be closed right away.
try (RowIterator rowIter = readOne(readCommand, readConsistency, queryStartNanoTime)) {
current = FilteredPartition.create(rowIter);
}
if (!request.appliesTo(current)) {
Tracing.trace("CAS precondition does not match current values {}", current);
casWriteMetrics.conditionNotMet.inc();
// Conditions not met: propose an empty update and hand back the current values as the result.
return Pair.create(PartitionUpdate.emptyUpdate(metadata, key), current.rowIterator());
}
// Create the desired updates
PartitionUpdate updates = request.makeUpdates(current, state);
long size = updates.dataSize();
casWriteMetrics.mutationSize.update(size);
writeMetricsForLevel(consistencyForPaxos).mutationSize.update(size);
// Apply triggers to cas updates. A consideration here is that
// triggers emit Mutations, and so a given trigger implementation
// may generate mutations for partitions other than the one this
// paxos round is scoped for. In this case, TriggerExecutor will
// validate that the generated mutations are targetted at the same
// partition as the initial updates and reject (via an
// InvalidRequestException) any which aren't.
updates = TriggerExecutor.instance.execute(updates);
// A null result is what this method returns when the CAS applied.
return Pair.create(updates, null);
};
// consistencyForCommit is passed twice: once for replayed in-progress commits and once for this operation's commit.
return doPaxos(metadata, key, consistencyForPaxos, consistencyForCommit, consistencyForCommit, queryStartNanoTime, casWriteMetrics, updateProposer);
} catch (CasWriteUnknownResultException e) {
casWriteMetrics.unknownResult.mark();
throw e;
} catch (CasWriteTimeoutException wte) {
casWriteMetrics.timeouts.mark();
writeMetricsForLevel(consistencyForPaxos).timeouts.mark();
// Note: a new exception carrying the same fields is thrown here, not the caught instance.
throw new CasWriteTimeoutException(wte.writeType, wte.consistency, wte.received, wte.blockFor, wte.contentions);
} catch (ReadTimeoutException e) {
casWriteMetrics.timeouts.mark();
writeMetricsForLevel(consistencyForPaxos).timeouts.mark();
throw e;
} catch (ReadAbortException e) {
casWriteMetrics.markAbort(e);
writeMetricsForLevel(consistencyForPaxos).markAbort(e);
throw e;
} catch (WriteFailureException | ReadFailureException e) {
casWriteMetrics.failures.mark();
writeMetricsForLevel(consistencyForPaxos).failures.mark();
throw e;
} catch (UnavailableException e) {
casWriteMetrics.unavailables.mark();
writeMetricsForLevel(consistencyForPaxos).unavailables.mark();
throw e;
} finally {
// Latency is recorded on both the global CAS metrics and the per-consistency-level metrics, whatever the outcome.
final long latency = nanoTime() - startTimeForMetrics;
casWriteMetrics.addNano(latency);
writeMetricsForLevel(consistencyForPaxos).addNano(latency);
}
}
use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.
the class SelectStatement method process.
/**
 * Consumes every partition of {@code partitions} into a {@code ResultSetBuilder}, then builds the rows,
 * runs {@code maybeWarn}, and finally orders and trims the result.
 *
 * @param partitions the partitions to consume; each one is closed as soon as it has been processed.
 * @param options the query options, forwarded to {@code processPartition} and {@code maybeWarn}.
 * @param selectors the selectors handed to the result set builder.
 * @param nowInSec forwarded unchanged to {@code processPartition}.
 * @param userLimit the limit handed to {@code ResultSet.trim} once the rows have been ordered.
 * @return the built, ordered and trimmed result set.
 */
private ResultSet process(PartitionIterator partitions, QueryOptions options, Selectors selectors, int nowInSec, int userLimit) throws InvalidRequestException {
    // A group maker is only created when an aggregation spec is present; otherwise the builder receives null.
    GroupMaker grouping = null;
    if (aggregationSpec != null) {
        grouping = aggregationSpec.newGroupMaker();
    }
    final ResultSetBuilder builder = new ResultSetBuilder(getResultMetadata(), selectors, grouping);

    // Feed the builder one partition at a time; try-with-resources closes each partition before the next is pulled.
    for (; partitions.hasNext(); ) {
        try (RowIterator rows = partitions.next()) {
            processPartition(rows, options, builder, nowInSec);
        }
    }

    final ResultSet built = builder.build();
    // maybeWarn is invoked after the rows are built and before they are ordered or trimmed.
    maybeWarn(builder, options);
    orderResults(built);
    built.trim(userLimit);
    return built;
}
use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.
the class BatchStatement method executeInternalWithConditions.
/**
 * Executes this batch through {@code ModificationStatement.casInternal} and wraps the CAS outcome in a
 * rows message.
 *
 * @param options the batch options; they supply the timestamp, the "now" in seconds, and the options of
 * statement 0 that are used when building the result set.
 * @param state the query state; it supplies the client state for the CAS and is forwarded to the result set builder.
 * @return a {@code ResultMessage.Rows} holding the result set built from the CAS result.
 */
private ResultMessage executeInternalWithConditions(BatchQueryOptions options, QueryState state) {
    final Pair<CQL3CasRequest, Set<ColumnMetadata>> casAndColumns = makeCasRequest(options, state);
    final CQL3CasRequest casRequest = casAndColumns.left;
    final Set<ColumnMetadata> conditionColumns = casAndColumns.right;

    final String keyspace = casRequest.metadata.keyspace;
    final String table = casRequest.metadata.name;

    final long writeTimestamp = options.getTimestamp(state);
    final int nowSeconds = options.getNowInSeconds(state);

    // The CAS result iterator is closed as soon as the result set has been built from it.
    try (RowIterator casResult = ModificationStatement.casInternal(state.getClientState(), casRequest, writeTimestamp, nowSeconds)) {
        return new ResultMessage.Rows(ModificationStatement.buildCasResultSet(keyspace, table, casResult, conditionColumns, true, state, options.forStatement(0)));
    }
}
Aggregations