Search in sources :

Example 1 with RowIterator

use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.

the class SelectStatement method process.

private ResultSet process(PartitionIterator partitions, QueryOptions options, int nowInSec, int userLimit) throws InvalidRequestException {
    Selection.ResultSetBuilder result = selection.resultSetBuilder(options, parameters.isJson, aggregationSpec);
    while (partitions.hasNext()) {
        try (RowIterator partition = {
            processPartition(partition, options, result, nowInSec);
    ResultSet cqlRows =;
    return cqlRows;
Also used : Selection(org.apache.cassandra.cql3.selection.Selection) RowIterator(org.apache.cassandra.db.rows.RowIterator)

Example 2 with RowIterator

use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.

the class StorageProxy method doPaxos.

 * Performs the Paxos rounds for a given proposal, retrying when preempted until the timeout.
 * <p>The main 'configurable' of this method is the {@code createUpdateProposal} method: it is called by the method
 * once a ballot has been successfully 'prepared' to generate the update to 'propose' (and commit if the proposal is
 * successful). That method also generates the result that the whole method will return. Note that due to retrying,
 * this method may be called multiple times and does not have to return the same results.
 * @param metadata the table to update with Paxos.
 * @param key the partition updated.
 * @param consistencyForPaxos the serial consistency of the operation (either {@link ConsistencyLevel#SERIAL} or
 *     {@link ConsistencyLevel#LOCAL_SERIAL}).
 * @param consistencyForReplayCommits the consistency for the commit phase of "replayed" in-progress operations.
 * @param consistencyForCommit the consistency for the commit phase of _this_ operation update.
 * @param queryStartNanoTime the nano time for the start of the query this is part of. This is the base time for
 *     timeouts.
 * @param casMetrics the metrics to update for this operation.
 * @param createUpdateProposal method called after a successful 'prepare' phase to obtain 1) the actual update of
 *     this operation and 2) the result that the whole method should return. This can return {@code null} in the
 *     special case where, after having "prepared" (and thus potentially replayed in-progress updates), we don't
 *     want to propose anything (the whole method then returns {@code null}).
 * @return the second element of the pair returned by {@code createUpdateProposal} (for the last call of that method
 *     if that method is called multiple times due to retries).
private static RowIterator doPaxos(TableMetadata metadata, DecoratedKey key, ConsistencyLevel consistencyForPaxos, ConsistencyLevel consistencyForReplayCommits, ConsistencyLevel consistencyForCommit, long queryStartNanoTime, CASClientRequestMetrics casMetrics, Supplier<Pair<PartitionUpdate, RowIterator>> createUpdateProposal) throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException {
    int contentions = 0;
    Keyspace keyspace =;
    AbstractReplicationStrategy latestRs = keyspace.getReplicationStrategy();
    try {
        long timeoutNanos = DatabaseDescriptor.getCasContentionTimeout(NANOSECONDS);
        while (nanoTime() - queryStartNanoTime < timeoutNanos) {
            // for simplicity, we'll do a single liveness check at the start of each attempt
            ReplicaPlan.ForPaxosWrite replicaPlan = ReplicaPlans.forPaxos(keyspace, key, consistencyForPaxos);
            latestRs = replicaPlan.replicationStrategy();
            PaxosBallotAndContention pair = beginAndRepairPaxos(queryStartNanoTime, key, metadata, replicaPlan, consistencyForPaxos, consistencyForReplayCommits, casMetrics);
            final UUID ballot = pair.ballot;
            contentions += pair.contentions;
            Pair<PartitionUpdate, RowIterator> proposalPair = createUpdateProposal.get();
            // See method javadoc: null here is code for "stop here and return null".
            if (proposalPair == null)
                return null;
            Commit proposal = Commit.newProposal(ballot, proposalPair.left);
            Tracing.trace("CAS precondition is met; proposing client-requested updates for {}", ballot);
            if (proposePaxos(proposal, replicaPlan, true, queryStartNanoTime)) {
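                // We skip committing the accepted update when it is empty. As empty updates are somewhat common
                // (for instance, a non-applying CAS proposes them), this is worth bothering.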
                if (!proposal.update.isEmpty())
                    commitPaxos(proposal, consistencyForCommit, true, queryStartNanoTime);
                RowIterator result = proposalPair.right;
                if (result != null)
                    Tracing.trace("CAS did not apply");
                    Tracing.trace("CAS applied successfully");
                return result;
            Tracing.trace("Paxos proposal not accepted (pre-empted by a higher ballot)");
            Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
        // continue to retry
    } catch (CasWriteTimeoutException e) {
        // Might be thrown by beginAndRepairPaxos. In that case, any contention that happened within the method and
        // led up to the timeout was not accounted for in our local 'contentions' variable, so we add it now so that
        // the contention recorded in the finally block is correct.
        contentions += e.contentions;
        throw e;
    } catch (WriteTimeoutException e) {
        // Might be thrown by proposePaxos or commitPaxos
        throw new CasWriteTimeoutException(e.writeType, e.consistency, e.received, e.blockFor, contentions);
    } finally {
        recordCasContention(metadata, key, casMetrics, contentions);
    throw new CasWriteTimeoutException(WriteType.CAS, consistencyForPaxos, 0, consistencyForPaxos.blockFor(latestRs), contentions);
Also used : ReplicaPlan(org.apache.cassandra.locator.ReplicaPlan) Hint(org.apache.cassandra.hints.Hint) CasWriteTimeoutException(org.apache.cassandra.exceptions.CasWriteTimeoutException) WriteTimeoutException(org.apache.cassandra.exceptions.WriteTimeoutException) Keyspace(org.apache.cassandra.db.Keyspace) RowIterator(org.apache.cassandra.db.rows.RowIterator) AbstractReplicationStrategy(org.apache.cassandra.locator.AbstractReplicationStrategy) UUID(java.util.UUID) CasWriteTimeoutException(org.apache.cassandra.exceptions.CasWriteTimeoutException) PartitionUpdate(org.apache.cassandra.db.partitions.PartitionUpdate)

Example 3 with RowIterator

use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.

the class StorageProxy method cas.

 * Apply @param updates if and only if the current values in the row for @param key
 * match the provided @param conditions.  The algorithm is "raw" Paxos: that is, Paxos
 * minus leader election -- any node in the cluster may propose changes for any row,
 * which (that is, the row) is the unit of values being proposed, not single columns.
 * The Paxos cohort is only the replicas for the given key, not the entire cluster.
 * So we expect performance to be reasonable, but CAS is still intended to be used
 * "when you really need it," not for all your updates.
 * There are three phases to Paxos:
 *  1. Prepare: the coordinator generates a ballot (timeUUID in our case) and asks replicas to (a) promise
 *     not to accept updates from older ballots and (b) tell us about the most recent update it has already
 *     accepted.
 *  2. Accept: if a majority of replicas respond, the coordinator asks replicas to accept the value of the
 *     highest proposal ballot it heard about, or a new value if no in-progress proposals were reported.
 *  3. Commit (Learn): if a majority of replicas acknowledge the accept request, we can commit the new
 *     value.
 *  Commit procedure is not covered in "Paxos Made Simple," and only briefly mentioned in "Paxos Made Live,"
 *  so here is our approach:
 *   3a. The coordinator sends a commit message to all replicas with the ballot and value.
 *   3b. Because of 1-2, this will be the highest-seen commit ballot.  The replicas will note that,
 *       and send it with subsequent promise replies.  This allows us to discard acceptance records
 *       for successfully committed replicas, without allowing incomplete proposals to commit erroneously
 *       later on.
 *  Note that since we are performing a CAS rather than a simple update, we perform a read (of committed
 *  values) between the prepare and accept phases.  This gives us a slightly longer window for another
 *  coordinator to come along and trump our own promise with a newer one but is otherwise safe.
 * @param keyspaceName the keyspace for the CAS
 * @param cfName the column family for the CAS
 * @param key the row key for the row to CAS
 * @param request the conditions for the CAS to apply as well as the update to perform if the conditions hold.
 * @param consistencyForPaxos the consistency for the paxos prepare and propose round. This can only be either SERIAL or LOCAL_SERIAL.
 * @param consistencyForCommit the consistency for write done during the commit phase. This can be anything, except SERIAL or LOCAL_SERIAL.
 * @return null if the operation succeeds in updating the row, or the current values corresponding to conditions
 * (since, if the CAS doesn't succeed, it means the current values do not match the conditions).
public static RowIterator cas(String keyspaceName, String cfName, DecoratedKey key, CASRequest request, ConsistencyLevel consistencyForPaxos, ConsistencyLevel consistencyForCommit, ClientState state, int nowInSeconds, long queryStartNanoTime) throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException, CasWriteUnknownResultException {
    final long startTimeForMetrics = nanoTime();
    try {
        TableMetadata metadata = Schema.instance.validateTable(keyspaceName, cfName);
        if (DatabaseDescriptor.getPartitionDenylistEnabled() && DatabaseDescriptor.getDenylistWritesEnabled() && !partitionDenylist.isKeyPermitted(keyspaceName, cfName, key.getKey())) {
            throw new InvalidRequestException(String.format("Unable to CAS write to denylisted partition [0x%s] in %s/%s", key.toString(), keyspaceName, cfName));
        Supplier<Pair<PartitionUpdate, RowIterator>> updateProposer = () -> {
            // read the current values and check they validate the conditions
            Tracing.trace("Reading existing values for CAS precondition");
            SinglePartitionReadCommand readCommand = (SinglePartitionReadCommand) request.readCommand(nowInSeconds);
            ConsistencyLevel readConsistency = consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL ? ConsistencyLevel.LOCAL_QUORUM : ConsistencyLevel.QUORUM;
            FilteredPartition current;
            try (RowIterator rowIter = readOne(readCommand, readConsistency, queryStartNanoTime)) {
                current = FilteredPartition.create(rowIter);
            if (!request.appliesTo(current)) {
                Tracing.trace("CAS precondition does not match current values {}", current);
                return Pair.create(PartitionUpdate.emptyUpdate(metadata, key), current.rowIterator());
            // Create the desired updates
            PartitionUpdate updates = request.makeUpdates(current, state);
            long size = updates.dataSize();
            // Apply triggers to cas updates. A consideration here is that
            // triggers emit Mutations, and so a given trigger implementation
            // may generate mutations for partitions other than the one this
            // paxos round is scoped for. In this case, TriggerExecutor will
            // validate that the generated mutations are targeted at the same
            // partition as the initial updates and reject (via an
            // InvalidRequestException) any which aren't.
            updates = TriggerExecutor.instance.execute(updates);
            return Pair.create(updates, null);
        return doPaxos(metadata, key, consistencyForPaxos, consistencyForCommit, consistencyForCommit, queryStartNanoTime, casWriteMetrics, updateProposer);
    } catch (CasWriteUnknownResultException e) {
        throw e;
    } catch (CasWriteTimeoutException wte) {
        throw new CasWriteTimeoutException(wte.writeType, wte.consistency, wte.received, wte.blockFor, wte.contentions);
    } catch (ReadTimeoutException e) {
        throw e;
    } catch (ReadAbortException e) {
        throw e;
    } catch (WriteFailureException | ReadFailureException e) {
        throw e;
    } catch (UnavailableException e) {
        throw e;
    } finally {
        final long latency = nanoTime() - startTimeForMetrics;
Also used : TableMetadata(org.apache.cassandra.schema.TableMetadata) ReadFailureException(org.apache.cassandra.exceptions.ReadFailureException) ReadTimeoutException(org.apache.cassandra.exceptions.ReadTimeoutException) SinglePartitionReadCommand(org.apache.cassandra.db.SinglePartitionReadCommand) UnavailableException(org.apache.cassandra.exceptions.UnavailableException) FilteredPartition(org.apache.cassandra.db.partitions.FilteredPartition) ReadAbortException(org.apache.cassandra.exceptions.ReadAbortException) CasWriteUnknownResultException(org.apache.cassandra.exceptions.CasWriteUnknownResultException) ConsistencyLevel(org.apache.cassandra.db.ConsistencyLevel) WriteFailureException(org.apache.cassandra.exceptions.WriteFailureException) RowIterator(org.apache.cassandra.db.rows.RowIterator) InvalidRequestException(org.apache.cassandra.exceptions.InvalidRequestException) CasWriteTimeoutException(org.apache.cassandra.exceptions.CasWriteTimeoutException) PartitionUpdate(org.apache.cassandra.db.partitions.PartitionUpdate) Pair(org.apache.cassandra.utils.Pair)

Example 4 with RowIterator

use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.

the class SelectStatement method process.

private ResultSet process(PartitionIterator partitions, QueryOptions options, Selectors selectors, int nowInSec, int userLimit) throws InvalidRequestException {
    GroupMaker groupMaker = aggregationSpec == null ? null : aggregationSpec.newGroupMaker();
    ResultSetBuilder result = new ResultSetBuilder(getResultMetadata(), selectors, groupMaker);
    while (partitions.hasNext()) {
        try (RowIterator partition = {
            processPartition(partition, options, result, nowInSec);
    ResultSet cqlRows =;
    maybeWarn(result, options);
    return cqlRows;
Also used : ResultSetBuilder(org.apache.cassandra.cql3.selection.ResultSetBuilder) RowIterator(org.apache.cassandra.db.rows.RowIterator) GroupMaker(org.apache.cassandra.db.aggregation.GroupMaker)

Example 5 with RowIterator

use of org.apache.cassandra.db.rows.RowIterator in project cassandra by apache.

the class BatchStatement method executeInternalWithConditions.

private ResultMessage executeInternalWithConditions(BatchQueryOptions options, QueryState state) {
    Pair<CQL3CasRequest, Set<ColumnMetadata>> p = makeCasRequest(options, state);
    CQL3CasRequest request = p.left;
    Set<ColumnMetadata> columnsWithConditions = p.right;
    String ksName = request.metadata.keyspace;
    String tableName =;
    long timestamp = options.getTimestamp(state);
    int nowInSeconds = options.getNowInSeconds(state);
    try (RowIterator result = ModificationStatement.casInternal(state.getClientState(), request, timestamp, nowInSeconds)) {
        ResultSet resultSet = ModificationStatement.buildCasResultSet(ksName, tableName, result, columnsWithConditions, true, state, options.forStatement(0));
        return new ResultMessage.Rows(resultSet);
Also used : ColumnMetadata(org.apache.cassandra.schema.ColumnMetadata) RowIterator(org.apache.cassandra.db.rows.RowIterator)


RowIterator (org.apache.cassandra.db.rows.RowIterator)25 PartitionIterator (org.apache.cassandra.db.partitions.PartitionIterator)16 Row (org.apache.cassandra.db.rows.Row)15 Test (org.junit.Test)12 InetAddressAndPort (org.apache.cassandra.locator.InetAddressAndPort)10 UnfilteredPartitionIterator (org.apache.cassandra.db.partitions.UnfilteredPartitionIterator)9 BTreeRow (org.apache.cassandra.db.rows.BTreeRow)9 EndpointsForRange (org.apache.cassandra.locator.EndpointsForRange)9 Mutation (org.apache.cassandra.db.Mutation)8 ColumnIdentifier (org.apache.cassandra.cql3.ColumnIdentifier)5 RowUpdateBuilder (org.apache.cassandra.db.RowUpdateBuilder)5 DeletionTime (org.apache.cassandra.db.DeletionTime)4 ColumnMetadata (org.apache.cassandra.schema.ColumnMetadata)4 TableMetadata (org.apache.cassandra.schema.TableMetadata)4 ByteBuffer (java.nio.ByteBuffer)3 ReadCommand (org.apache.cassandra.db.ReadCommand)3 FilteredPartition (org.apache.cassandra.db.partitions.FilteredPartition)3 PartitionUpdate (org.apache.cassandra.db.partitions.PartitionUpdate)3 ComplexColumnData (org.apache.cassandra.db.rows.ComplexColumnData)3 TestableReadRepair (