Use of org.apache.cassandra.schema.TableMetadata in project cassandra by apache.
The class CassandraIndex, method indexCfsMetadata.
/**
 * Construct the TableMetadata for an index table; the clustering columns in the index table
 * vary depending on the kind of the indexed value.
 * @param baseCfsMetadata the metadata of the base table being indexed
 * @param indexMetadata the metadata of the index itself
 * @return the TableMetadata for the hidden table that backs the index
 */
public static TableMetadata indexCfsMetadata(TableMetadata baseCfsMetadata, IndexMetadata indexMetadata) {
    Pair<ColumnMetadata, IndexTarget.Type> target = TargetParser.parse(baseCfsMetadata, indexMetadata);
    CassandraIndexFunctions utils = getFunctions(indexMetadata, target);
    ColumnMetadata indexedColumn = target.left;
    AbstractType<?> indexedValueType = utils.getIndexedValueType(indexedColumn);
    // The index table is partitioned by the indexed value (hence the LocalPartitioner)
    // and clustered by the base table's partition key
    TableMetadata.Builder builder =
        TableMetadata.builder(baseCfsMetadata.keyspace, baseCfsMetadata.indexTableName(indexMetadata), baseCfsMetadata.id)
                     .isDense(indexMetadata.isKeys())
                     .isCompound(!indexMetadata.isKeys())
                     .partitioner(new LocalPartitioner(indexedValueType))
                     .addPartitionKeyColumn(indexedColumn.name, indexedColumn.type)
                     .addClusteringColumn("partition_key", baseCfsMetadata.partitioner.partitionOrdering());
    if (indexMetadata.isKeys()) {
        // A dense, compact table for KEYS indexes must have a compact
        // value column defined, even though it is never used
        CompactTables.DefaultNames names =
            CompactTables.defaultNameGenerator(ImmutableSet.of(indexedColumn.name.toString(), "partition_key"));
        builder.addRegularColumn(names.defaultCompactValueName(), EmptyType.instance);
    } else {
        // The clustering columns for a table backing a COMPOSITES index are dependent
        // on the specific type of index (there are specializations for indexes on collections)
        utils.addIndexClusteringColumns(builder, baseCfsMetadata, indexedColumn);
    }
    return builder.build().updateIndexTableMetadata(baseCfsMetadata.params);
}
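As a usage sketch (not from the Cassandra source), assuming a schema that is already loaded and a hypothetical index named users_by_email_idx on a users table:

// Hypothetical sketch: look up the base table and one of its indexes, then
// derive the metadata of the hidden table that backs the index.
TableMetadata baseTable = Schema.instance.getTableMetadata("my_keyspace", "users");
IndexMetadata index = baseTable.indexes.get("users_by_email_idx")
                                       .orElseThrow(IllegalStateException::new);
TableMetadata indexTable = CassandraIndex.indexCfsMetadata(baseTable, index);
// The index table is keyed by the indexed value and clustered by the base partition key
System.out.println(indexTable);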
Use of org.apache.cassandra.schema.TableMetadata in project cassandra by apache.
The class CassandraIndexSearcher, method queryIndex.
private UnfilteredRowIterator queryIndex(DecoratedKey indexKey, ReadCommand command, ReadExecutionController executionController) {
    ClusteringIndexFilter filter = makeIndexFilter(command);
    ColumnFamilyStore indexCfs = index.getBackingTable().get();
    TableMetadata indexMetadata = indexCfs.metadata();
    return SinglePartitionReadCommand.create(indexMetadata, command.nowInSec(), indexKey, ColumnFilter.all(indexMetadata), filter)
                                     .queryMemtableAndDisk(indexCfs, executionController.indexReadController());
}
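The returned iterator is a resource and must be closed by the caller. A minimal consumption sketch, assuming it runs inside the searcher where queryIndex is visible and indexKey, command, and executionController are in scope (process is a hypothetical helper):

// Each row in the index partition names a primary key in the base table.
try (UnfilteredRowIterator indexHits = queryIndex(indexKey, command, executionController)) {
    while (indexHits.hasNext()) {
        Unfiltered unfiltered = indexHits.next();
        if (unfiltered.isRow())
            process((Row) unfiltered);   // hypothetical: resolve the hit against the base table
    }
}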
Use of org.apache.cassandra.schema.TableMetadata in project cassandra by apache.
The class StorageProxy, method cas.
/**
* Apply @param updates if and only if the current values in the row for @param key
* match the provided @param conditions. The algorithm is "raw" Paxos: that is, Paxos
* minus leader election -- any node in the cluster may propose changes for any row,
* and the row is the unit of values being proposed, not single columns.
*
* The Paxos cohort is only the replicas for the given key, not the entire cluster.
* So we expect performance to be reasonable, but CAS is still intended to be used
* "when you really need it," not for all your updates.
*
* There are three phases to Paxos:
* 1. Prepare: the coordinator generates a ballot (timeUUID in our case) and asks replicas to (a) promise
* not to accept updates from older ballots and (b) tell us about the most recent update it has already
* accepted.
* 2. Accept: if a majority of replicas reply, the coordinator asks replicas to accept the value of the
* highest proposal ballot it heard about, or a new value if no in-progress proposals were reported.
* 3. Commit (Learn): if a majority of replicas acknowledge the accept request, we can commit the new
* value.
*
* Commit procedure is not covered in "Paxos Made Simple," and only briefly mentioned in "Paxos Made Live,"
* so here is our approach:
* 3a. The coordinator sends a commit message to all replicas with the ballot and value.
* 3b. Because of 1-2, this will be the highest-seen commit ballot. The replicas will note that,
* and send it with subsequent promise replies. This allows us to discard acceptance records
* for successfully committed replicas, without allowing incomplete proposals to commit erroneously
* later on.
*
* Note that since we are performing a CAS rather than a simple update, we perform a read (of committed
* values) between the prepare and accept phases. This gives us a slightly longer window for another
* coordinator to come along and trump our own promise with a newer one but is otherwise safe.
*
* @param keyspaceName the keyspace for the CAS
* @param cfName the column family for the CAS
* @param key the row key for the row to CAS
* @param request the conditions for the CAS to apply as well as the update to perform if the conditions hold.
* @param consistencyForPaxos the consistency for the paxos prepare and propose round. This can only be either SERIAL or LOCAL_SERIAL.
* @param consistencyForCommit the consistency for write done during the commit phase. This can be anything, except SERIAL or LOCAL_SERIAL.
*
* @return null if the operation succeeds in updating the row, or the current values corresponding to the conditions
* (since, if the CAS doesn't succeed, it means the current values do not match the conditions).
*/
public static RowIterator cas(String keyspaceName, String cfName, DecoratedKey key, CASRequest request,
                              ConsistencyLevel consistencyForPaxos, ConsistencyLevel consistencyForCommit,
                              ClientState state, long queryStartNanoTime)
        throws UnavailableException, IsBootstrappingException, RequestFailureException, RequestTimeoutException, InvalidRequestException {
    final long startTimeForMetrics = System.nanoTime();
    int contentions = 0;
    try {
        consistencyForPaxos.validateForCas();
        consistencyForCommit.validateForCasCommit(keyspaceName);
        TableMetadata metadata = Schema.instance.getTableMetadata(keyspaceName, cfName);
        long timeout = TimeUnit.MILLISECONDS.toNanos(DatabaseDescriptor.getCasContentionTimeout());
        while (System.nanoTime() - queryStartNanoTime < timeout) {
            // for simplicity, we'll do a single liveness check at the start of each attempt
            Pair<List<InetAddress>, Integer> p = getPaxosParticipants(metadata, key, consistencyForPaxos);
            List<InetAddress> liveEndpoints = p.left;
            int requiredParticipants = p.right;
            final Pair<UUID, Integer> pair = beginAndRepairPaxos(queryStartNanoTime, key, metadata, liveEndpoints,
                                                                 requiredParticipants, consistencyForPaxos,
                                                                 consistencyForCommit, true, state);
            final UUID ballot = pair.left;
            contentions += pair.right;
            // read the current values and check they validate the conditions
            Tracing.trace("Reading existing values for CAS precondition");
            SinglePartitionReadCommand readCommand = request.readCommand(FBUtilities.nowInSeconds());
            ConsistencyLevel readConsistency = consistencyForPaxos == ConsistencyLevel.LOCAL_SERIAL
                                             ? ConsistencyLevel.LOCAL_QUORUM
                                             : ConsistencyLevel.QUORUM;
            FilteredPartition current;
            try (RowIterator rowIter = readOne(readCommand, readConsistency, queryStartNanoTime)) {
                current = FilteredPartition.create(rowIter);
            }
            if (!request.appliesTo(current)) {
                Tracing.trace("CAS precondition does not match current values {}", current);
                casWriteMetrics.conditionNotMet.inc();
                return current.rowIterator();
            }
            // finish the paxos round w/ the desired updates
            // TODO turn null updates into delete?
            PartitionUpdate updates = request.makeUpdates(current);
            long size = updates.dataSize();
            casWriteMetrics.mutationSize.update(size);
            writeMetricsMap.get(consistencyForPaxos).mutationSize.update(size);
            // Apply triggers to cas updates. A consideration here is that
            // triggers emit Mutations, and so a given trigger implementation
            // may generate mutations for partitions other than the one this
            // paxos round is scoped for. In this case, TriggerExecutor will
            // validate that the generated mutations are targeted at the same
            // partition as the initial updates and reject (via an
            // InvalidRequestException) any which aren't.
            updates = TriggerExecutor.instance.execute(updates);
            Commit proposal = Commit.newProposal(ballot, updates);
            Tracing.trace("CAS precondition is met; proposing client-requested updates for {}", ballot);
            if (proposePaxos(proposal, liveEndpoints, requiredParticipants, true, consistencyForPaxos, queryStartNanoTime)) {
                commitPaxos(proposal, consistencyForCommit, true, queryStartNanoTime);
                Tracing.trace("CAS successful");
                return null;
            }
            Tracing.trace("Paxos proposal not accepted (pre-empted by a higher ballot)");
            contentions++;
            Uninterruptibles.sleepUninterruptibly(ThreadLocalRandom.current().nextInt(100), TimeUnit.MILLISECONDS);
            // continue to retry
        }
        throw new WriteTimeoutException(WriteType.CAS, consistencyForPaxos, 0, consistencyForPaxos.blockFor(Keyspace.open(keyspaceName)));
    } catch (WriteTimeoutException | ReadTimeoutException e) {
        casWriteMetrics.timeouts.mark();
        writeMetricsMap.get(consistencyForPaxos).timeouts.mark();
        throw e;
    } catch (WriteFailureException | ReadFailureException e) {
        casWriteMetrics.failures.mark();
        writeMetricsMap.get(consistencyForPaxos).failures.mark();
        throw e;
    } catch (UnavailableException e) {
        casWriteMetrics.unavailables.mark();
        writeMetricsMap.get(consistencyForPaxos).unavailables.mark();
        throw e;
    } finally {
        recordCasContention(contentions);
        final long latency = System.nanoTime() - startTimeForMetrics;
        casWriteMetrics.addNano(latency);
        writeMetricsMap.get(consistencyForPaxos).addNano(latency);
    }
}
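A caller-side sketch, not taken from the Cassandra source: the keyspace/table names, casRequest, and clientState below are hypothetical stand-ins (in the real code path, CQL compiles an IF clause into a CASRequest and invokes this method):

// Hypothetical sketch of an INSERT ... IF NOT EXISTS style operation.
DecoratedKey key = metadata.partitioner.decorateKey(ByteBufferUtil.bytes("user-42"));
RowIterator current = StorageProxy.cas("my_keyspace", "users", key, casRequest,
                                       ConsistencyLevel.SERIAL,  // prepare/propose rounds
                                       ConsistencyLevel.QUORUM,  // commit phase
                                       clientState, System.nanoTime());
if (current == null)
    System.out.println("[applied]");   // proposal accepted and committed
else
    current.forEachRemaining(row -> System.out.println("condition failed: " + row));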
Use of org.apache.cassandra.schema.TableMetadata in project cassandra by apache.
The class StorageProxy, method readWithPaxos.
private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel,
                                               ClientState state, long queryStartNanoTime)
        throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException {
    assert state != null;
    if (group.commands.size() > 1)
        throw new InvalidRequestException("SERIAL/LOCAL_SERIAL consistency may only be requested for one partition at a time");
    long start = System.nanoTime();
    SinglePartitionReadCommand command = group.commands.get(0);
    TableMetadata metadata = command.metadata();
    DecoratedKey key = command.partitionKey();
    PartitionIterator result = null;
    try {
        // make sure any in-progress paxos writes are done (i.e., committed to a majority of replicas), before performing a quorum read
        Pair<List<InetAddress>, Integer> p = getPaxosParticipants(metadata, key, consistencyLevel);
        List<InetAddress> liveEndpoints = p.left;
        int requiredParticipants = p.right;
        // does the work of applying in-progress writes; throws UnavailableException or a timeout if it can't
        final ConsistencyLevel consistencyForCommitOrFetch = consistencyLevel == ConsistencyLevel.LOCAL_SERIAL
                                                           ? ConsistencyLevel.LOCAL_QUORUM
                                                           : ConsistencyLevel.QUORUM;
        try {
            final Pair<UUID, Integer> pair = beginAndRepairPaxos(start, key, metadata, liveEndpoints, requiredParticipants,
                                                                 consistencyLevel, consistencyForCommitOrFetch, false, state);
            if (pair.right > 0)
                casReadMetrics.contention.update(pair.right);
        } catch (WriteTimeoutException e) {
            throw new ReadTimeoutException(consistencyLevel, 0, consistencyLevel.blockFor(Keyspace.open(metadata.keyspace)), false);
        } catch (WriteFailureException e) {
            throw new ReadFailureException(consistencyLevel, e.received, e.blockFor, false, e.failureReasonByEndpoint);
        }
        result = fetchRows(group.commands, consistencyForCommitOrFetch, queryStartNanoTime);
    } catch (UnavailableException e) {
        readMetrics.unavailables.mark();
        casReadMetrics.unavailables.mark();
        readMetricsMap.get(consistencyLevel).unavailables.mark();
        throw e;
    } catch (ReadTimeoutException e) {
        readMetrics.timeouts.mark();
        casReadMetrics.timeouts.mark();
        readMetricsMap.get(consistencyLevel).timeouts.mark();
        throw e;
    } catch (ReadFailureException e) {
        readMetrics.failures.mark();
        casReadMetrics.failures.mark();
        readMetricsMap.get(consistencyLevel).failures.mark();
        throw e;
    } finally {
        long latency = System.nanoTime() - start;
        readMetrics.addNano(latency);
        casReadMetrics.addNano(latency);
        readMetricsMap.get(consistencyLevel).addNano(latency);
        Keyspace.open(metadata.keyspace).getColumnFamilyStore(metadata.name).metric.coordinatorReadLatency.update(latency, TimeUnit.NANOSECONDS);
    }
    return result;
}
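readWithPaxos is private and is reached via StorageProxy.read when the consistency level is SERIAL or LOCAL_SERIAL. A conceptual consumption sketch, assuming a group wrapping a single SinglePartitionReadCommand and a clientState already in scope:

// A SERIAL read targets exactly one partition; both iterators must be closed.
try (PartitionIterator partitions = readWithPaxos(group, ConsistencyLevel.SERIAL, clientState, System.nanoTime())) {
    while (partitions.hasNext()) {
        try (RowIterator partition = partitions.next()) {
            // rows reflect any in-progress Paxos writes completed before the read
            partition.forEachRemaining(System.out::println);
        }
    }
}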
Use of org.apache.cassandra.schema.TableMetadata in project cassandra by apache.
The class SSTableExport, method metadataFromSSTable.
/**
* Construct table schema from info stored in SSTable's Stats.db
*
* @param desc SSTable's descriptor
* @return Restored TableMetadata
* @throws IOException when Stats.db cannot be read
*/
public static TableMetadata metadataFromSSTable(Descriptor desc) throws IOException {
    if (!desc.version.isCompatible())
        throw new IOException("Cannot process old and unsupported SSTable version.");
    EnumSet<MetadataType> types = EnumSet.of(MetadataType.STATS, MetadataType.HEADER);
    Map<MetadataType, MetadataComponent> sstableMetadata = desc.getMetadataSerializer().deserialize(desc, types);
    SerializationHeader.Component header = (SerializationHeader.Component) sstableMetadata.get(MetadataType.HEADER);
    IPartitioner partitioner = FBUtilities.newPartitioner(desc);
    // The real keyspace/table names are not stored in the SSTable metadata, so placeholders are used
    TableMetadata.Builder builder = TableMetadata.builder("keyspace", "table").partitioner(partitioner);
    header.getStaticColumns().entrySet().stream().forEach(entry -> {
        ColumnIdentifier ident = ColumnIdentifier.getInterned(UTF8Type.instance.getString(entry.getKey()), true);
        builder.addStaticColumn(ident, entry.getValue());
    });
    header.getRegularColumns().entrySet().stream().forEach(entry -> {
        ColumnIdentifier ident = ColumnIdentifier.getInterned(UTF8Type.instance.getString(entry.getKey()), true);
        builder.addRegularColumn(ident, entry.getValue());
    });
    builder.addPartitionKeyColumn("PartitionKey", header.getKeyType());
    for (int i = 0; i < header.getClusteringTypes().size(); i++) {
        builder.addClusteringColumn("clustering" + (i > 0 ? i : ""), header.getClusteringTypes().get(i));
    }
    return builder.build();
}
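A tool-side usage sketch; the SSTable path below is made up:

// Recover enough schema from an SSTable's own metadata to interpret its contents,
// with no cluster or schema files available.
Descriptor desc = Descriptor.fromFilename("/var/lib/cassandra/data/ks/tbl-1/mc-1-big-Data.db");
TableMetadata recovered = SSTableExport.metadataFromSSTable(desc);
// Column names/types come from the serialization header; keyspace/table names are placeholders.
System.out.println(recovered);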