
Example 1 with DDBRecordMetadata

use of com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata in project aws-athena-query-federation by awslabs.

In the class DynamoDBMetadataHandler, the method enhancePartitionSchema:

/**
 * Generates a partition schema with metadata derived from available predicates.  This metadata will be
 * copied to splits in the #doGetSplits call.  At this point it is determined whether we can partition
 * by hash key or fall back to a full table scan.
 *
 * @see GlueMetadataHandler
 */
@Override
public void enhancePartitionSchema(SchemaBuilder partitionSchemaBuilder, GetTableLayoutRequest request) {
    // use the source table name from the schema if available (in case Glue table name != actual table name)
    String tableName = getSourceTableName(request.getSchema());
    if (tableName == null) {
        tableName = request.getTableName().getTableName();
    }
    DynamoDBTable table = null;
    try {
        table = tableResolver.getTableMetadata(tableName);
    } catch (TimeoutException e) {
        throw new RuntimeException(e);
    }
    // add table name so we don't have to do case insensitive resolution again
    partitionSchemaBuilder.addMetadata(TABLE_METADATA, table.getName());
    Map<String, ValueSet> summary = request.getConstraints().getSummary();
    List<String> requestedCols = request.getSchema().getFields().stream().map(Field::getName).collect(Collectors.toList());
    DynamoDBIndex index = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, summary);
    logger.info("using index: {}", index.getName());
    String hashKeyName = index.getHashKey();
    ValueSet hashKeyValueSet = summary.get(hashKeyName);
    List<Object> hashKeyValues = (hashKeyValueSet != null) ? DDBPredicateUtils.getHashKeyAttributeValues(hashKeyValueSet) : Collections.emptyList();
    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(request.getSchema());
    Set<String> columnsToIgnore = new HashSet<>();
    List<AttributeValue> valueAccumulator = new ArrayList<>();
    IncrementingValueNameProducer valueNameProducer = new IncrementingValueNameProducer();
    if (!hashKeyValues.isEmpty()) {
        // can "partition" on hash key
        partitionSchemaBuilder.addField(hashKeyName, hashKeyValueSet.getType());
        partitionSchemaBuilder.addMetadata(HASH_KEY_NAME_METADATA, hashKeyName);
        columnsToIgnore.add(hashKeyName);
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, QUERY_PARTITION_TYPE);
        if (!table.getName().equals(index.getName())) {
            partitionSchemaBuilder.addMetadata(INDEX_METADATA, index.getName());
        }
        // add range key filter if there is one
        Optional<String> rangeKey = index.getRangeKey();
        if (rangeKey.isPresent()) {
            String rangeKeyName = rangeKey.get();
            if (summary.containsKey(rangeKeyName)) {
                String rangeKeyFilter = DDBPredicateUtils.generateSingleColumnFilter(rangeKeyName, summary.get(rangeKeyName), valueAccumulator, valueNameProducer, recordMetadata);
                partitionSchemaBuilder.addMetadata(RANGE_KEY_NAME_METADATA, rangeKeyName);
                partitionSchemaBuilder.addMetadata(RANGE_KEY_FILTER_METADATA, rangeKeyFilter);
                columnsToIgnore.add(rangeKeyName);
            }
        }
    } else {
        // always fall back to a scan
        partitionSchemaBuilder.addField(SEGMENT_COUNT_METADATA, Types.MinorType.INT.getType());
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, SCAN_PARTITION_TYPE);
    }
    // Exclude columns with custom types from the filter clause when querying/scanning DDB,
    // since those types are not natively supported by DDB or Glue;
    // instead, filter the results after the query/scan result is returned.
    columnsToIgnore.addAll(recordMetadata.getNonComparableColumns());
    precomputeAdditionalMetadata(columnsToIgnore, summary, valueAccumulator, valueNameProducer, partitionSchemaBuilder, recordMetadata);
}
Also used:
IncrementingValueNameProducer (com.amazonaws.athena.connectors.dynamodb.util.IncrementingValueNameProducer)
AttributeValue (com.amazonaws.services.dynamodbv2.model.AttributeValue)
ArrayList (java.util.ArrayList)
DynamoDBIndex (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex)
ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)
DDBRecordMetadata (com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata)
DynamoDBTable (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable)
TimeoutException (java.util.concurrent.TimeoutException)
HashSet (java.util.HashSet)
LinkedHashSet (java.util.LinkedHashSet)
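
For context, the custom metadata that enhancePartitionSchema writes into the partition schema travels with the Arrow Schema and can be read back when splits are generated. The following is a minimal, hypothetical sketch of that read side, using only Arrow's Schema.getCustomMetadata() API; the key names here are illustrative stand-ins for the connector's real TABLE_METADATA / PARTITION_TYPE_METADATA / HASH_KEY_NAME_METADATA constants, and the real split-building logic lives in doGetSplits.

import java.util.Map;
import org.apache.arrow.vector.types.pojo.Schema;

public final class PartitionMetadataSketch {

    // Illustrative keys only; the connector defines its own metadata constants.
    private static final String TABLE_METADATA = "sourceTable";
    private static final String PARTITION_TYPE_METADATA = "partitionType";
    private static final String HASH_KEY_NAME_METADATA = "hashKeyName";

    public static void describePartitioning(Schema partitionSchema) {
        // Arrow schemas carry the key/value pairs added via SchemaBuilder.addMetadata(...)
        Map<String, String> metadata = partitionSchema.getCustomMetadata();
        String tableName = metadata.get(TABLE_METADATA);

        if (metadata.containsKey(HASH_KEY_NAME_METADATA)) {
            // Query-style partitioning: one partition per hash key value
            System.out.printf("Table %s partitioned by hash key %s%n",
                    tableName, metadata.get(HASH_KEY_NAME_METADATA));
        } else {
            // No usable hash key predicate: fall back to a parallel Scan
            System.out.printf("Table %s will be scanned (%s)%n",
                    tableName, metadata.get(PARTITION_TYPE_METADATA));
        }
    }
}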

Example 2 with DDBRecordMetadata

use of com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata in project aws-athena-query-federation by awslabs.

In the class DynamoDBRecordHandler, the method readWithConstraint:

/**
 * Reads data from DynamoDB by submitting either a Query or a Scan, depending
 * on the type of split, and includes any filters specified in the split.
 *
 * @see RecordHandler
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws ExecutionException {
    Split split = recordsRequest.getSplit();
    // use the property instead of the request table name because of case sensitivity
    String tableName = split.getProperty(TABLE_METADATA);
    invokerCache.get(tableName).setBlockSpiller(spiller);
    Iterator<Map<String, AttributeValue>> itemIterator = getIterator(split, tableName, recordsRequest.getSchema());
    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(recordsRequest.getSchema());
    DynamoDBFieldResolver resolver = new DynamoDBFieldResolver(recordMetadata);
    long numRows = 0;
    AtomicLong numResultRows = new AtomicLong(0);
    while (itemIterator.hasNext()) {
        if (!queryStatusChecker.isQueryRunning()) {
            // we can stop processing because the query waiting for this data has already terminated
            return;
        }
        numRows++;
        spiller.writeRows((Block block, int rowNum) -> {
            Map<String, AttributeValue> item = itemIterator.next();
            if (item == null) {
                // this can happen even after hasNext(): the iterator may not have made any DDB calls yet, and its first call may return zero items
                return 0;
            }
            boolean matched = true;
            numResultRows.getAndIncrement();
            // TODO refactor to use GeneratedRowWriter to improve performance
            for (Field nextField : recordsRequest.getSchema().getFields()) {
                Object value = ItemUtils.toSimpleValue(item.get(nextField.getName()));
                Types.MinorType fieldType = Types.getMinorTypeForArrowType(nextField.getType());
                value = DDBTypeUtils.coerceValueToExpectedType(value, nextField, fieldType, recordMetadata);
                try {
                    switch(fieldType) {
                        case LIST:
                            // DDB may return Set so coerce to List. Also coerce each List item to the correct type.
                            List valueAsList = value != null ? DDBTypeUtils.coerceListToExpectedType(value, nextField, recordMetadata) : null;
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, resolver, valueAsList);
                            break;
                        case STRUCT:
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, resolver, value);
                            break;
                        default:
                            matched &= block.offerValue(nextField.getName(), rowNum, value);
                            break;
                    }
                    if (!matched) {
                        return 0;
                    }
                } catch (Exception ex) {
                    throw new RuntimeException("Error while processing field " + nextField.getName(), ex);
                }
            }
            return 1;
        });
    }
    logger.info("readWithConstraint: numRows[{}] numResultRows[{}]", numRows, numResultRows.get());
}
Also used:
Types (org.apache.arrow.vector.types.Types)
AttributeValue (com.amazonaws.services.dynamodbv2.model.AttributeValue)
TimeoutException (java.util.concurrent.TimeoutException)
IOException (java.io.IOException)
ExecutionException (java.util.concurrent.ExecutionException)
DynamoDBFieldResolver (com.amazonaws.athena.connectors.dynamodb.resolver.DynamoDBFieldResolver)
Field (org.apache.arrow.vector.types.pojo.Field)
AtomicLong (java.util.concurrent.atomic.AtomicLong)
Block (com.amazonaws.athena.connector.lambda.data.Block)
List (java.util.List)
Split (com.amazonaws.athena.connector.lambda.domain.Split)
Map (java.util.Map)
HashMap (java.util.HashMap)
DDBRecordMetadata (com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata)
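
The per-field loop above first unwraps each DynamoDB AttributeValue with ItemUtils.toSimpleValue and then coerces the result to the declared Arrow type via DDBTypeUtils.coerceValueToExpectedType, guided by DDBRecordMetadata. The snippet below is a small standalone sketch of just the unwrapping half, using only the AWS SDK v1 document API; the connector-specific coercion step is deliberately omitted.

import java.util.Map;
import com.amazonaws.services.dynamodbv2.document.ItemUtils;
import com.amazonaws.services.dynamodbv2.model.AttributeValue;

public final class AttributeValueSketch {

    public static void main(String[] args) {
        // A DynamoDB item as returned by Query/Scan: attribute name -> typed AttributeValue
        Map<String, AttributeValue> item = Map.of(
                "pk", new AttributeValue().withS("user#123"),
                "score", new AttributeValue().withN("42"),
                "tags", new AttributeValue().withSS("alpha", "beta"));

        // ItemUtils.toSimpleValue unwraps the typed wrapper into a plain Java value
        String pk = ItemUtils.toSimpleValue(item.get("pk"));       // "user#123"
        Object score = ItemUtils.toSimpleValue(item.get("score")); // BigDecimal 42
        Object tags = ItemUtils.toSimpleValue(item.get("tags"));   // a Set of strings

        System.out.printf("pk=%s score=%s tags=%s%n", pk, score, tags);

        // In readWithConstraint, DDBTypeUtils.coerceValueToExpectedType(...) would then
        // adjust such values (e.g. Set -> List, number -> declared Arrow type) using the
        // type hints carried by DDBRecordMetadata before offering them to the Block.
    }
}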

Aggregations

DDBRecordMetadata (com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata): 2
AttributeValue (com.amazonaws.services.dynamodbv2.model.AttributeValue): 2
TimeoutException (java.util.concurrent.TimeoutException): 2
Block (com.amazonaws.athena.connector.lambda.data.Block): 1
Split (com.amazonaws.athena.connector.lambda.domain.Split): 1
ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet): 1
DynamoDBIndex (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex): 1
DynamoDBTable (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable): 1
DynamoDBFieldResolver (com.amazonaws.athena.connectors.dynamodb.resolver.DynamoDBFieldResolver): 1
IncrementingValueNameProducer (com.amazonaws.athena.connectors.dynamodb.util.IncrementingValueNameProducer): 1
IOException (java.io.IOException): 1
ArrayList (java.util.ArrayList): 1
HashMap (java.util.HashMap): 1
HashSet (java.util.HashSet): 1
LinkedHashSet (java.util.LinkedHashSet): 1
List (java.util.List): 1
Map (java.util.Map): 1
ExecutionException (java.util.concurrent.ExecutionException): 1
AtomicLong (java.util.concurrent.atomic.AtomicLong): 1
Types (org.apache.arrow.vector.types.Types): 1