use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.
the class DynamoDBMetadataHandler method enhancePartitionSchema.
/**
 * Populates the partition schema with fields and metadata derived from the request's predicates.
 * This metadata will be copied to splits in the #doGetSplits call. The decision between
 * "partitioning" on a hash key (a query) and falling back to a full table scan is made here.
 *
 * @param partitionSchemaBuilder builder that receives the partition fields and metadata
 * @param request the table layout request carrying the schema and the constraint summary
 * @throws RuntimeException wrapping the TimeoutException if table metadata cannot be resolved in time
 * @see GlueMetadataHandler
 */
@Override
public void enhancePartitionSchema(SchemaBuilder partitionSchemaBuilder, GetTableLayoutRequest request) {
    // Prefer the source table name recorded in the schema; the Glue table name may differ from the actual one.
    String sourceTableName = getSourceTableName(request.getSchema());
    String tableName = (sourceTableName != null) ? sourceTableName : request.getTableName().getTableName();

    DynamoDBTable table;
    try {
        table = tableResolver.getTableMetadata(tableName);
    } catch (TimeoutException e) {
        throw new RuntimeException(e);
    }

    // Record the resolved table name so case insensitive resolution is not needed again.
    partitionSchemaBuilder.addMetadata(TABLE_METADATA, table.getName());

    Map<String, ValueSet> summary = request.getConstraints().getSummary();
    List<String> requestedCols = new ArrayList<>();
    for (Field field : request.getSchema().getFields()) {
        requestedCols.add(field.getName());
    }

    DynamoDBIndex index = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, summary);
    logger.info("using index: {}", index.getName());

    String hashKeyName = index.getHashKey();
    ValueSet hashKeyValueSet = summary.get(hashKeyName);
    List<Object> hashKeyValues;
    if (hashKeyValueSet == null) {
        hashKeyValues = Collections.emptyList();
    } else {
        hashKeyValues = DDBPredicateUtils.getHashKeyAttributeValues(hashKeyValueSet);
    }

    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(request.getSchema());
    Set<String> columnsToIgnore = new HashSet<>();
    List<AttributeValue> valueAccumulator = new ArrayList<>();
    IncrementingValueNameProducer valueNameProducer = new IncrementingValueNameProducer();

    if (hashKeyValues.isEmpty()) {
        // No usable hash key values: always fall back to a scan.
        partitionSchemaBuilder.addField(SEGMENT_COUNT_METADATA, Types.MinorType.INT.getType());
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, SCAN_PARTITION_TYPE);
    } else {
        // Hash key values are available, so we can "partition" on the hash key.
        partitionSchemaBuilder.addField(hashKeyName, hashKeyValueSet.getType());
        partitionSchemaBuilder.addMetadata(HASH_KEY_NAME_METADATA, hashKeyName);
        columnsToIgnore.add(hashKeyName);
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, QUERY_PARTITION_TYPE);

        // The base table is modelled as an index named after the table; only record real indexes.
        boolean usingTableItself = table.getName().equals(index.getName());
        if (!usingTableItself) {
            partitionSchemaBuilder.addMetadata(INDEX_METADATA, index.getName());
        }

        // Add a range key filter when the index has a range key that is constrained.
        Optional<String> rangeKey = index.getRangeKey();
        if (rangeKey.isPresent() && summary.containsKey(rangeKey.get())) {
            String rangeKeyName = rangeKey.get();
            String rangeKeyFilter = DDBPredicateUtils.generateSingleColumnFilter(
                    rangeKeyName, summary.get(rangeKeyName), valueAccumulator, valueNameProducer, recordMetadata);
            partitionSchemaBuilder.addMetadata(RANGE_KEY_NAME_METADATA, rangeKeyName);
            partitionSchemaBuilder.addMetadata(RANGE_KEY_FILTER_METADATA, rangeKeyFilter);
            columnsToIgnore.add(rangeKeyName);
        }
    }

    // Columns with custom types are excluded from the filter clause when querying/scanning DDB,
    // as those types are not natively supported by DDB or Glue; the results have to be filtered
    // after the query/scan result is returned.
    columnsToIgnore.addAll(recordMetadata.getNonComparableColumns());
    precomputeAdditionalMetadata(columnsToIgnore, summary, valueAccumulator, valueNameProducer, partitionSchemaBuilder, recordMetadata);
}
use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.
the class DDBPredicateUtils method getBestIndexForPredicates.
/**
 * Attempts to pick an optimal index (if any) from the given predicates. Returns the original table index if
 * one was not found.
 * <p>
 * The base table is treated as a special index named after the table. An index (or the table) only
 * counts as a hash key match when the predicate on its hash key yields at least one hash key value;
 * otherwise callers cannot "partition" on it and would fall back to a scan. Preference order:
 * the first hash key match that also has a predicate on its range key, then the first hash key
 * match, then the base table.
 *
 * @param table the original table
 * @param requestedCols the requested column names; a secondary index is only considered when
 *                      {@code indexContainsAllRequiredColumns} accepts it for these columns
 * @param predicates the predicates, keyed by column name
 * @return the optimal index if found, otherwise the original table index
 */
public static DynamoDBIndex getBestIndexForPredicates(DynamoDBTable table, List<String> requestedCols, Map<String, ValueSet> predicates) {
    Set<String> columnNames = predicates.keySet();
    ImmutableList.Builder<DynamoDBIndex> hashKeyMatchesBuilder = ImmutableList.builder();
    // create the original table index
    DynamoDBIndex tableIndex = new DynamoDBIndex(table.getName(), table.getHashKey(), table.getRangeKey(), ProjectionType.ALL, ImmutableList.of());
    // If the original table has a usable hash key predicate, start with that (table treated as a special index).
    // The table must pass the same check as secondary indexes: previously any predicate on the table's
    // hash key (e.g. a range) put the table first in the list, shadowing an index that could be queried.
    if (hasUsableHashKeyPredicate(tableIndex, predicates)) {
        hashKeyMatchesBuilder.add(tableIndex);
    }
    // requested columns must be projected in index
    List<DynamoDBIndex> candidateIndices = table.getIndexes().stream()
            .filter(index -> indexContainsAllRequiredColumns(requestedCols, index, table))
            .collect(Collectors.toList());
    // get indices with hash keys that match a usable predicate
    candidateIndices.stream()
            .filter(index -> hasUsableHashKeyPredicate(index, predicates))
            .forEach(hashKeyMatchesBuilder::add);
    List<DynamoDBIndex> hashKeyMatches = hashKeyMatchesBuilder.build();
    // if the original table has a range key matching a predicate, start with that
    ImmutableList.Builder<DynamoDBIndex> rangeKeyMatchesBuilder = ImmutableList.builder();
    if (tableIndex.getRangeKey().isPresent() && columnNames.contains(tableIndex.getRangeKey().get())) {
        rangeKeyMatchesBuilder.add(tableIndex);
    }
    // get indices with range keys that match a predicate
    candidateIndices.stream()
            .filter(index -> index.getRangeKey().isPresent() && columnNames.contains(index.getRangeKey().get()))
            .forEach(rangeKeyMatchesBuilder::add);
    List<DynamoDBIndex> rangeKeyMatches = rangeKeyMatchesBuilder.build();
    // return first index where both hash and range key can be specified with predicates
    for (DynamoDBIndex index : hashKeyMatches) {
        if (rangeKeyMatches.contains(index)) {
            return index;
        }
    }
    // else return the first index with a hash key predicate, or the original table if there are none
    return hashKeyMatches.isEmpty() ? tableIndex : hashKeyMatches.get(0);
}

/**
 * Tells whether the predicates contain a constraint on the index's hash key from which at least
 * one hash key value can be extracted.
 */
private static boolean hasUsableHashKeyPredicate(DynamoDBIndex index, Map<String, ValueSet> predicates) {
    ValueSet hashKeyValueSet = predicates.get(index.getHashKey());
    return hashKeyValueSet != null && !getHashKeyAttributeValues(hashKeyValueSet).isEmpty();
}
use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.
the class DDBPredicateUtilsTest method testGetBestIndexForPredicatesWithNonEqualityPredicate.
@Test
public void testGetBestIndexForPredicatesWithNonEqualityPredicate() {
    // A range (non-equality) predicate and an equality predicate, both for the GSI hash key "col0".
    ValueSet rangeValueSet = SortedRangeSet.of(Range.range(new BlockAllocatorImpl(), VARCHAR.getType(), "aaa", true, "bbb", false));
    ValueSet singleValueSet = SortedRangeSet.of(Range.equal(new BlockAllocatorImpl(), VARCHAR.getType(), "value"));

    DynamoDBTable table = new DynamoDBTable(
            "tableName",
            "hashKey",
            Optional.of("sortKey"),
            ImmutableList.of(
                    new AttributeDefinition("hashKey", "S"),
                    new AttributeDefinition("sortKey", "S"),
                    new AttributeDefinition("col0", "S")),
            ImmutableList.of(
                    new DynamoDBIndex("col0-gsi", "col0", Optional.empty(), ProjectionType.KEYS_ONLY, ImmutableList.of())),
            1000,
            10,
            5);

    // The range predicate on col0 is expected to resolve to the base table.
    DynamoDBIndex forRangePredicate = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0"), ImmutableMap.of("col0", rangeValueSet));
    assertEquals("tableName", forRangePredicate.getName());

    // The equality predicate on col0 is expected to resolve to the GSI.
    DynamoDBIndex forEqualityPredicate = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0"), ImmutableMap.of("col0", singleValueSet));
    assertEquals("col0-gsi", forEqualityPredicate.getName());
}
use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.
the class DDBPredicateUtilsTest method testGetBestIndexForPredicatesWithMultipleIndices.
@Test
public void testGetBestIndexForPredicatesWithMultipleIndices() {
    // One equality value set, reused as the predicate for whichever column each case constrains.
    ValueSet singleValueSet = SortedRangeSet.of(Range.equal(new BlockAllocatorImpl(), VARCHAR.getType(), "value"));

    // Table with two GSIs (INCLUDE projections) and one LSI (ALL projection).
    DynamoDBTable table = new DynamoDBTable(
            "tableName",
            "hashKey",
            Optional.of("sortKey"),
            ImmutableList.of(
                    new AttributeDefinition("hashKey", "S"),
                    new AttributeDefinition("sortKey", "S"),
                    new AttributeDefinition("col0", "S"),
                    new AttributeDefinition("col1", "S")),
            ImmutableList.of(
                    new DynamoDBIndex("col0-gsi", "col0", Optional.empty(), ProjectionType.INCLUDE, ImmutableList.of("col1")),
                    new DynamoDBIndex("col1-gsi", "col1", Optional.empty(), ProjectionType.INCLUDE, ImmutableList.of("col2")),
                    new DynamoDBIndex("col2-lsi", "hashKey", Optional.of("col2"), ProjectionType.ALL, ImmutableList.of())),
            1000,
            10,
            5);

    // Requesting all four columns: every single-column predicate is expected to resolve to the base table.
    String chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0", "col1", "col2"), ImmutableMap.of("hashKey", singleValueSet)).getName();
    assertEquals("tableName", chosen);

    chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0", "col1", "col2"), ImmutableMap.of("col0", singleValueSet)).getName();
    assertEquals("tableName", chosen);

    chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0", "col1", "col2"), ImmutableMap.of("col1", singleValueSet)).getName();
    assertEquals("tableName", chosen);

    chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0", "col1", "col2"), ImmutableMap.of("col2", singleValueSet)).getName();
    assertEquals("tableName", chosen);

    // Requesting narrower column sets: the matching GSI is expected.
    chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0", "col1"), ImmutableMap.of("col0", singleValueSet)).getName();
    assertEquals("col0-gsi", chosen);

    chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col1", "col2"), ImmutableMap.of("col1", singleValueSet)).getName();
    assertEquals("col1-gsi", chosen);

    // Predicates on both the LSI hash key and its range key: the LSI is expected.
    chosen = DDBPredicateUtils.getBestIndexForPredicates(
            table, ImmutableList.of("hashKey", "col0", "col1"), ImmutableMap.of("hashKey", singleValueSet, "col2", singleValueSet)).getName();
    assertEquals("col2-lsi", chosen);
}
use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.
the class DynamoDBMetadataHandler method getPartitions.
/**
 * Writes one partition row per hash key value when the chosen index has usable hash key values in the
 * constraints; otherwise writes a single partition row holding the heuristically determined scan
 * segment count.
 *
 * @param blockWriter receives the partition rows
 * @param request the table layout request carrying the schema and the constraint summary
 * @param queryStatusChecker not consulted by this implementation
 * @throws Exception if resolving the table metadata or writing the rows fails
 * @see GlueMetadataHandler
 */
@Override
public void getPartitions(BlockWriter blockWriter, GetTableLayoutRequest request, QueryStatusChecker queryStatusChecker) throws Exception {
    // TODO consider caching this repeated work in #enhancePartitionSchema
    // Prefer the source table name recorded in the schema; the Glue table name may differ from the actual one.
    String sourceTableName = getSourceTableName(request.getSchema());
    String tableName = (sourceTableName != null) ? sourceTableName : request.getTableName().getTableName();
    DynamoDBTable table = tableResolver.getTableMetadata(tableName);

    Map<String, ValueSet> summary = request.getConstraints().getSummary();
    List<String> requestedCols = new ArrayList<>();
    for (Field field : request.getSchema().getFields()) {
        requestedCols.add(field.getName());
    }

    DynamoDBIndex index = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, summary);
    logger.info("using index: {}", index.getName());

    String hashKeyName = index.getHashKey();
    ValueSet hashKeyValueSet = summary.get(hashKeyName);
    List<Object> hashKeyValues;
    if (hashKeyValueSet == null) {
        hashKeyValues = Collections.emptyList();
    } else {
        hashKeyValues = DDBPredicateUtils.getHashKeyAttributeValues(hashKeyValueSet);
    }

    if (hashKeyValues.isEmpty()) {
        // Always fall back to a scan; at least one partition must be returned, so it carries the segment count.
        int segmentCount = DDBTableUtils.getNumSegments(table.getProvisionedReadCapacity(), table.getApproxTableSizeInBytes());
        blockWriter.writeRows((Block block, int rowNum) -> {
            block.setValue(SEGMENT_COUNT_METADATA, rowNum, segmentCount);
            return 1;
        });
        return;
    }

    for (Object hashKeyValue : hashKeyValues) {
        blockWriter.writeRows((Block block, int rowNum) -> {
            block.setValue(hashKeyName, rowNum, hashKeyValue);
            // exactly one partition row is added per hash key value
            return 1;
        });
    }
}
Aggregations