Search in sources :

Example 1 with DynamoDBIndex

use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.

In the class DynamoDBMetadataHandler, the method enhancePartitionSchema:

/**
 * Builds the partition schema and attaches metadata derived from the predicates available on the request.
 * This metadata will be copied to splits in the #doGetSplits call.  This is the point where it is decided
 * whether the request can be "partitioned" by hash key (query) or has to fall back to a full table scan.
 *
 * @param partitionSchemaBuilder the builder that receives the partition fields and metadata
 * @param request the table layout request supplying the schema, table name and constraints
 * @throws RuntimeException wrapping the TimeoutException raised while resolving the table metadata
 * @see GlueMetadataHandler
 */
@Override
public void enhancePartitionSchema(SchemaBuilder partitionSchemaBuilder, GetTableLayoutRequest request) {
    // prefer the source table name carried by the schema (the Glue table name may differ from the actual table name)
    String sourceTableName = getSourceTableName(request.getSchema());
    String lookupName = (sourceTableName != null) ? sourceTableName : request.getTableName().getTableName();

    DynamoDBTable ddbTable;
    try {
        ddbTable = tableResolver.getTableMetadata(lookupName);
    } catch (TimeoutException e) {
        throw new RuntimeException(e);
    }
    // remember the resolved table name so case insensitive resolution does not have to happen again
    partitionSchemaBuilder.addMetadata(TABLE_METADATA, ddbTable.getName());

    Map<String, ValueSet> constraintSummary = request.getConstraints().getSummary();
    List<String> requestedColumns = request.getSchema().getFields().stream()
            .map(Field::getName)
            .collect(Collectors.toList());
    DynamoDBIndex chosenIndex = DDBPredicateUtils.getBestIndexForPredicates(ddbTable, requestedColumns, constraintSummary);
    logger.info("using index: {}", chosenIndex.getName());

    String hashKeyColumn = chosenIndex.getHashKey();
    ValueSet hashKeyPredicate = constraintSummary.get(hashKeyColumn);
    List<Object> hashKeyValues;
    if (hashKeyPredicate == null) {
        hashKeyValues = Collections.emptyList();
    } else {
        hashKeyValues = DDBPredicateUtils.getHashKeyAttributeValues(hashKeyPredicate);
    }

    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(request.getSchema());
    Set<String> ignoredColumns = new HashSet<>();
    List<AttributeValue> filterValues = new ArrayList<>();
    IncrementingValueNameProducer valueNames = new IncrementingValueNameProducer();

    if (hashKeyValues.isEmpty()) {
        // no hash key values to work with: always fall back to a scan
        partitionSchemaBuilder.addField(SEGMENT_COUNT_METADATA, Types.MinorType.INT.getType());
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, SCAN_PARTITION_TYPE);
    } else {
        // can "partition" on hash key
        partitionSchemaBuilder.addField(hashKeyColumn, hashKeyPredicate.getType());
        partitionSchemaBuilder.addMetadata(HASH_KEY_NAME_METADATA, hashKeyColumn);
        ignoredColumns.add(hashKeyColumn);
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, QUERY_PARTITION_TYPE);

        // the index name is only recorded when it differs from the table name
        boolean indexDiffersFromTable = !ddbTable.getName().equals(chosenIndex.getName());
        if (indexDiffersFromTable) {
            partitionSchemaBuilder.addMetadata(INDEX_METADATA, chosenIndex.getName());
        }

        // add range key filter if the index has a range key that is constrained
        Optional<String> rangeKey = chosenIndex.getRangeKey();
        if (rangeKey.isPresent() && constraintSummary.containsKey(rangeKey.get())) {
            String rangeKeyColumn = rangeKey.get();
            String rangeKeyFilter = DDBPredicateUtils.generateSingleColumnFilter(
                    rangeKeyColumn, constraintSummary.get(rangeKeyColumn), filterValues, valueNames, recordMetadata);
            partitionSchemaBuilder.addMetadata(RANGE_KEY_NAME_METADATA, rangeKeyColumn);
            partitionSchemaBuilder.addMetadata(RANGE_KEY_FILTER_METADATA, rangeKeyFilter);
            ignoredColumns.add(rangeKeyColumn);
        }
    }

    // Columns with custom types are left out of the filter clause when querying/scanning DDB,
    // as those types are not natively supported by DDB or Glue;
    // they have to be filtered after the query/scan result is returned.
    ignoredColumns.addAll(recordMetadata.getNonComparableColumns());
    precomputeAdditionalMetadata(ignoredColumns, constraintSummary, filterValues, valueNames, partitionSchemaBuilder, recordMetadata);
}
Also used : IncrementingValueNameProducer(com.amazonaws.athena.connectors.dynamodb.util.IncrementingValueNameProducer) AttributeValue(com.amazonaws.services.dynamodbv2.model.AttributeValue) ArrayList(java.util.ArrayList) DynamoDBIndex(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) DDBRecordMetadata(com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata) DynamoDBTable(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable) TimeoutException(java.util.concurrent.TimeoutException) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 2 with DynamoDBIndex

use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.

In the class DDBPredicateUtils, the method getBestIndexForPredicates:

/**
 * Attempts to pick an optimal index (if any) from the given predicates. The original table is treated as a
 * special index alongside the table's secondary indices. Returns the original table index if no better
 * candidate was found.
 *
 * <p>Selection order:
 * <ol>
 *   <li>the first candidate whose hash key has a predicate yielding hash key values <em>and</em> whose range
 *       key has a predicate;</li>
 *   <li>otherwise the first candidate whose hash key has a predicate yielding hash key values;</li>
 *   <li>otherwise the original table index.</li>
 * </ol>
 * The original table is always considered before the secondary indices.
 *
 * @param table the original table
 * @param requestedCols the requested column names; a secondary index is only considered when
 *                      {@code indexContainsAllRequiredColumns} accepts it for these columns
 * @param predicates the predicates, keyed by column name
 * @return the optimal index if found, otherwise the original table index
 */
public static DynamoDBIndex getBestIndexForPredicates(DynamoDBTable table, List<String> requestedCols, Map<String, ValueSet> predicates) {
    Set<String> columnNames = predicates.keySet();
    ImmutableList.Builder<DynamoDBIndex> hashKeyMatchesBuilder = ImmutableList.builder();
    // create the original table index
    DynamoDBIndex tableIndex = new DynamoDBIndex(table.getName(), table.getHashKey(), table.getRangeKey(), ProjectionType.ALL, ImmutableList.of());
    // If the original table has a hash key matching a predicate, start with that. The predicate must also
    // yield hash key values, which is the same test applied to the secondary indices below. Without it, a
    // table whose hash key predicate produces no values would be listed first and shadow a secondary index
    // that does have usable hash key values.
    if (columnNames.contains(tableIndex.getHashKey()) && !getHashKeyAttributeValues(predicates.get(tableIndex.getHashKey())).isEmpty()) {
        // here, treat table as a special index
        hashKeyMatchesBuilder.add(tableIndex);
    }
    // requested columns must be projected in index
    List<DynamoDBIndex> candidateIndices = table.getIndexes().stream()
            .filter(index -> indexContainsAllRequiredColumns(requestedCols, index, table))
            .collect(Collectors.toList());
    // get indices with hash keys that match a predicate
    candidateIndices.stream()
            .filter(index -> columnNames.contains(index.getHashKey()) && !getHashKeyAttributeValues(predicates.get(index.getHashKey())).isEmpty())
            .forEach(hashKeyMatchesBuilder::add);
    List<DynamoDBIndex> hashKeyMatches = hashKeyMatchesBuilder.build();
    // if the original table has a range key matching a predicate, start with that
    ImmutableList.Builder<DynamoDBIndex> rangeKeyMatchesBuilder = ImmutableList.builder();
    if (tableIndex.getRangeKey().isPresent() && columnNames.contains(tableIndex.getRangeKey().get())) {
        rangeKeyMatchesBuilder.add(tableIndex);
    }
    // get indices with range keys that match a predicate
    candidateIndices.stream()
            .filter(index -> index.getRangeKey().isPresent() && columnNames.contains(index.getRangeKey().get()))
            .forEach(rangeKeyMatchesBuilder::add);
    List<DynamoDBIndex> rangeKeyMatches = rangeKeyMatchesBuilder.build();
    // return first index where both hash and range key can be specified with predicates
    for (DynamoDBIndex index : hashKeyMatches) {
        if (rangeKeyMatches.contains(index)) {
            return index;
        }
    }
    // else return the first index with a hash key predicate, or the original table if there are none
    return hashKeyMatches.isEmpty() ? tableIndex : hashKeyMatches.get(0);
}
Also used : SortedRangeSet(com.amazonaws.athena.connector.lambda.domain.predicate.SortedRangeSet) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) EquatableValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet) Set(java.util.Set) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) Preconditions.checkState(com.google.common.base.Preconditions.checkState) HashSet(java.util.HashSet) Range(com.amazonaws.athena.connector.lambda.domain.predicate.Range) DynamoDBIndex(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex) List(java.util.List) Stream(java.util.stream.Stream) ImmutableList(com.google.common.collect.ImmutableList) AttributeValue(com.amazonaws.services.dynamodbv2.model.AttributeValue) Map(java.util.Map) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) ItemUtils(com.amazonaws.services.dynamodbv2.document.ItemUtils) DynamoDBTable(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable) ProjectionType(com.amazonaws.services.dynamodbv2.model.ProjectionType) Joiner(com.google.common.base.Joiner) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) DynamoDBIndex(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex)

Example 3 with DynamoDBIndex

use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.

In the class DDBPredicateUtilsTest, the method testGetBestIndexForPredicatesWithNonEqualityPredicate:

@Test
public void testGetBestIndexForPredicatesWithNonEqualityPredicate() {
    // one range (non-equality) predicate and one single-value predicate to put on the index hash key
    ValueSet rangePredicate = SortedRangeSet.of(Range.range(new BlockAllocatorImpl(), VARCHAR.getType(), "aaa", true, "bbb", false));
    ValueSet equalityPredicate = SortedRangeSet.of(Range.equal(new BlockAllocatorImpl(), VARCHAR.getType(), "value"));

    // table keyed on hashKey/sortKey with a single index whose hash key is col0
    ImmutableList<AttributeDefinition> attributeDefinitions = ImmutableList.of(
            new AttributeDefinition("hashKey", "S"),
            new AttributeDefinition("sortKey", "S"),
            new AttributeDefinition("col0", "S"));
    DynamoDBIndex col0Index = new DynamoDBIndex("col0-gsi", "col0", Optional.empty(), ProjectionType.KEYS_ONLY, ImmutableList.of());
    DynamoDBTable table = new DynamoDBTable("tableName", "hashKey", Optional.of("sortKey"), attributeDefinitions, ImmutableList.of(col0Index), 1000, 10, 5);

    ImmutableList<String> requestedCols = ImmutableList.of("hashKey", "col0");

    // the range predicate on col0 is expected to resolve to the table itself
    DynamoDBIndex chosenForRange = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, ImmutableMap.of("col0", rangePredicate));
    assertEquals("tableName", chosenForRange.getName());

    // the single-value predicate on col0 is expected to resolve to the col0 index
    DynamoDBIndex chosenForEquality = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, ImmutableMap.of("col0", equalityPredicate));
    assertEquals("col0-gsi", chosenForEquality.getName());
}
Also used : BlockAllocatorImpl(com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl) AttributeDefinition(com.amazonaws.services.dynamodbv2.model.AttributeDefinition) DynamoDBIndex(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) DynamoDBTable(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable) Test(org.junit.Test)

Example 4 with DynamoDBIndex

use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.

In the class DDBPredicateUtilsTest, the method testGetBestIndexForPredicatesWithMultipleIndices:

@Test
public void testGetBestIndexForPredicatesWithMultipleIndices() {
    // a single-value predicate reused for every scenario below
    ValueSet equalityPredicate = SortedRangeSet.of(Range.equal(new BlockAllocatorImpl(), VARCHAR.getType(), "value"));

    // table keyed on hashKey/sortKey carrying three indices with different projections
    ImmutableList<AttributeDefinition> attributeDefinitions = ImmutableList.of(
            new AttributeDefinition("hashKey", "S"),
            new AttributeDefinition("sortKey", "S"),
            new AttributeDefinition("col0", "S"),
            new AttributeDefinition("col1", "S"));
    DynamoDBIndex col0Index = new DynamoDBIndex("col0-gsi", "col0", Optional.empty(), ProjectionType.INCLUDE, ImmutableList.of("col1"));
    DynamoDBIndex col1Index = new DynamoDBIndex("col1-gsi", "col1", Optional.empty(), ProjectionType.INCLUDE, ImmutableList.of("col2"));
    DynamoDBIndex col2Index = new DynamoDBIndex("col2-lsi", "hashKey", Optional.of("col2"), ProjectionType.ALL, ImmutableList.of());
    DynamoDBTable table = new DynamoDBTable("tableName", "hashKey", Optional.of("sortKey"), attributeDefinitions, ImmutableList.of(col0Index, col1Index, col2Index), 1000, 10, 5);

    ImmutableList<String> allColumns = ImmutableList.of("hashKey", "col0", "col1", "col2");
    ImmutableList<String> withoutCol2 = ImmutableList.of("hashKey", "col0", "col1");
    ImmutableList<String> withoutCol0 = ImmutableList.of("hashKey", "col1", "col2");

    // with all four columns requested, a predicate on any single column is expected to resolve to the table
    assertEquals("tableName", DDBPredicateUtils.getBestIndexForPredicates(table, allColumns, ImmutableMap.of("hashKey", equalityPredicate)).getName());
    assertEquals("tableName", DDBPredicateUtils.getBestIndexForPredicates(table, allColumns, ImmutableMap.of("col0", equalityPredicate)).getName());
    assertEquals("tableName", DDBPredicateUtils.getBestIndexForPredicates(table, allColumns, ImmutableMap.of("col1", equalityPredicate)).getName());
    assertEquals("tableName", DDBPredicateUtils.getBestIndexForPredicates(table, allColumns, ImmutableMap.of("col2", equalityPredicate)).getName());

    // with narrower column lists, a predicate on an index hash key is expected to resolve to that index
    assertEquals("col0-gsi", DDBPredicateUtils.getBestIndexForPredicates(table, withoutCol2, ImmutableMap.of("col0", equalityPredicate)).getName());
    assertEquals("col1-gsi", DDBPredicateUtils.getBestIndexForPredicates(table, withoutCol0, ImmutableMap.of("col1", equalityPredicate)).getName());

    // predicates on both hashKey and col2 are expected to resolve to the index keyed on that pair
    assertEquals("col2-lsi", DDBPredicateUtils.getBestIndexForPredicates(table, withoutCol2, ImmutableMap.of("hashKey", equalityPredicate, "col2", equalityPredicate)).getName());
}
Also used : BlockAllocatorImpl(com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl) AttributeDefinition(com.amazonaws.services.dynamodbv2.model.AttributeDefinition) DynamoDBIndex(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) DynamoDBTable(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable) Test(org.junit.Test)

Example 5 with DynamoDBIndex

use of com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex in project aws-athena-query-federation by awslabs.

In the class DynamoDBMetadataHandler, the method getPartitions:

/**
 * Writes one partition per hash key value when the chosen index's hash key has usable predicate values;
 * otherwise writes a single partition that carries the heuristically determined scan segment count.
 *
 * @param blockWriter the writer that receives the partition rows
 * @param request the table layout request supplying the schema, table name and constraints
 * @param queryStatusChecker not used by this implementation
 * @throws Exception declared by the signature; the body has no exception handling of its own, so anything
 *                   thrown by the calls it makes propagates to the caller
 * @see GlueMetadataHandler
 */
@Override
public void getPartitions(BlockWriter blockWriter, GetTableLayoutRequest request, QueryStatusChecker queryStatusChecker) throws Exception {
    // TODO consider caching this repeated work in #enhancePartitionSchema
    // prefer the source table name carried by the schema (the Glue table name may differ from the actual table name)
    String sourceTableName = getSourceTableName(request.getSchema());
    String lookupName = (sourceTableName != null) ? sourceTableName : request.getTableName().getTableName();
    DynamoDBTable ddbTable = tableResolver.getTableMetadata(lookupName);

    Map<String, ValueSet> constraintSummary = request.getConstraints().getSummary();
    List<String> requestedColumns = request.getSchema().getFields().stream()
            .map(Field::getName)
            .collect(Collectors.toList());
    DynamoDBIndex chosenIndex = DDBPredicateUtils.getBestIndexForPredicates(ddbTable, requestedColumns, constraintSummary);
    logger.info("using index: {}", chosenIndex.getName());

    String hashKeyColumn = chosenIndex.getHashKey();
    ValueSet hashKeyPredicate = constraintSummary.get(hashKeyColumn);
    List<Object> hashKeyValues;
    if (hashKeyPredicate == null) {
        hashKeyValues = Collections.emptyList();
    } else {
        hashKeyValues = DDBPredicateUtils.getHashKeyAttributeValues(hashKeyPredicate);
    }

    if (hashKeyValues.isEmpty()) {
        // always fall back to a scan, need to return at least one partition so stick the segment count in it
        int segmentCount = DDBTableUtils.getNumSegments(ddbTable.getProvisionedReadCapacity(), ddbTable.getApproxTableSizeInBytes());
        blockWriter.writeRows((Block block, int rowNum) -> {
            block.setValue(SEGMENT_COUNT_METADATA, rowNum, segmentCount);
            return 1;
        });
        return;
    }

    for (Object partitionValue : hashKeyValues) {
        blockWriter.writeRows((Block block, int rowNum) -> {
            block.setValue(hashKeyColumn, rowNum, partitionValue);
            // one partition row is written per hash key value
            return 1;
        });
    }
}
Also used : DynamoDBIndex(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex) Block(com.amazonaws.athena.connector.lambda.data.Block) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) DynamoDBTable(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable)

Aggregations

DynamoDBIndex (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex) 10; DynamoDBTable (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable) 10; ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) 9; AttributeDefinition (com.amazonaws.services.dynamodbv2.model.AttributeDefinition) 7; BlockAllocatorImpl (com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl) 6; Test (org.junit.Test) 6; AttributeValue (com.amazonaws.services.dynamodbv2.model.AttributeValue) 3; HashSet (java.util.HashSet) 3; ItemUtils (com.amazonaws.services.dynamodbv2.document.ItemUtils) 2; ProjectionType (com.amazonaws.services.dynamodbv2.model.ProjectionType) 2; ImmutableList (com.google.common.collect.ImmutableList) 2; ArrayList (java.util.ArrayList) 2; List (java.util.List) 2; Map (java.util.Map) 2; Set (java.util.Set) 2; TimeoutException (java.util.concurrent.TimeoutException) 2; ThrottlingInvoker (com.amazonaws.athena.connector.lambda.ThrottlingInvoker) 1; Block (com.amazonaws.athena.connector.lambda.data.Block) 1; SchemaBuilder (com.amazonaws.athena.connector.lambda.data.SchemaBuilder) 1; EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet) 1