Search in sources :

Example 31 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.

In the class DynamoDBMetadataHandler, the method doGetSplits:

/**
 * Copies data from partitions and creates splits, serializing as necessary for later calls to
 * RecordHandler#readWithConstraint. This API supports pagination via a continuation token that
 * encodes the index of the next partition to process.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides the catalog name, the partitions Block, and (optionally) a
 *                continuation token from a previous page.
 * @return A GetSplitsResponse holding up to MAX_SPLITS_PER_REQUEST splits and, when more
 *         partitions remain, a continuation token for the next page.
 * @see GlueMetadataHandler
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
    // Partition index to resume from — presumably 0 when no token is present; confirm in decodeContinuationToken.
    int partitionContd = decodeContinuationToken(request);
    Set<Split> splits = new HashSet<>();
    Block partitions = request.getPartitions();
    // The partition type was stashed in the schema's custom metadata by doGetTableLayout;
    // it selects between the DDB Query strategy and the DDB Scan strategy below.
    Map<String, String> partitionMetadata = partitions.getSchema().getCustomMetadata();
    String partitionType = partitionMetadata.get(PARTITION_TYPE_METADATA);
    if (partitionType == null) {
        throw new IllegalStateException(String.format("No metadata %s defined in Schema %s", PARTITION_TYPE_METADATA, partitions.getSchema()));
    }
    if (QUERY_PARTITION_TYPE.equals(partitionType)) {
        String hashKeyName = partitionMetadata.get(HASH_KEY_NAME_METADATA);
        FieldReader hashKeyValueReader = partitions.getFieldReader(hashKeyName);
        // one split per hash key value (since one DDB query can only take one hash key value)
        for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
            hashKeyValueReader.setPosition(curPartition);
            // Every split must have a unique location if we wish to spill to avoid failures
            SpillLocation spillLocation = makeSpillLocation(request);
            // copy all partition metadata to the split
            Map<String, String> splitMetadata = new HashMap<>(partitionMetadata);
            // Convert the Arrow-typed hash key value to a native type, then serialize it as a
            // DynamoDB AttributeValue JSON string so the RecordHandler can rebuild the query.
            Object hashKeyValue = DDBTypeUtils.convertArrowTypeIfNecessary(hashKeyName, hashKeyValueReader.readObject());
            String hashKeyValueJSON = Jackson.toJsonString(ItemUtils.toAttributeValue(hashKeyValue));
            splitMetadata.put(hashKeyName, hashKeyValueJSON);
            splits.add(new Split(spillLocation, makeEncryptionKey(), splitMetadata));
            if (splits.size() == MAX_SPLITS_PER_REQUEST && curPartition != partitions.getRowCount() - 1) {
                // The page is full and more partitions remain, so send this page back with a
                // token pointing at the current partition for the next request to resume from.
                return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
            }
        }
        // All partitions consumed — null token signals there are no further pages.
        return new GetSplitsResponse(request.getCatalogName(), splits, null);
    } else if (SCAN_PARTITION_TYPE.equals(partitionType)) {
        // Scan strategy: the partitions Block carries a single segment count; each split
        // scans one segment of a DynamoDB parallel Scan.
        FieldReader segmentCountReader = partitions.getFieldReader(SEGMENT_COUNT_METADATA);
        int segmentCount = segmentCountReader.readInteger();
        for (int curPartition = partitionContd; curPartition < segmentCount; curPartition++) {
            // Every split must have a unique location if we wish to spill to avoid failures
            SpillLocation spillLocation = makeSpillLocation(request);
            // copy all partition metadata to the split
            Map<String, String> splitMetadata = new HashMap<>(partitionMetadata);
            splitMetadata.put(SEGMENT_ID_PROPERTY, String.valueOf(curPartition));
            splitMetadata.put(SEGMENT_COUNT_METADATA, String.valueOf(segmentCount));
            splits.add(new Split(spillLocation, makeEncryptionKey(), splitMetadata));
            if (splits.size() == MAX_SPLITS_PER_REQUEST && curPartition != segmentCount - 1) {
                // Page full with segments remaining — return this page plus a resume token.
                return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
            }
        }
        return new GetSplitsResponse(request.getCatalogName(), splits, null);
    } else {
        throw new IllegalStateException("Unexpected partition type " + partitionType);
    }
}
Also used : SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) HashMap(java.util.HashMap) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) FieldReader(org.apache.arrow.vector.complex.reader.FieldReader) Map(java.util.Map) HashMap(java.util.HashMap) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Example 32 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.

In the class ElasticsearchMetadataHandler, the method doGetSplits:

/**
 * Splits up the reads required to scan the requested index, producing one split per shard.
 * Cluster-health information is consulted (via the client) for the shards of the specified
 * index, and a split is generated for each shard id returned.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details of the catalog, domain, and index being queried, as well as any filter predicate.
 * @return A GetSplitsResponse which primarily contains a Set&lt;Split&gt;, each split carrying the
 *         domain endpoint and the shard to be retrieved by the Record Handler. (No continuation
 *         token is used — all shards are returned in a single page.)
 * @throws RuntimeException when the domain does not exist in the map, or an error occurs while
 *         processing the cluster/shard health information.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) throws RuntimeException {
    logger.debug("doGetSplits: enter - " + request);
    // The domain (schema) and index (table) being queried arrive via the request's table name.
    final String domainName = request.getTableName().getSchemaName();
    final String indexName = request.getTableName().getTableName();
    final Set<Split> shardSplits = new HashSet<>();
    try {
        final String domainEndpoint = getDomainEndpoint(domainName);
        final AwsRestHighLevelClient esClient = clientFactory.getOrCreateClient(domainEndpoint);
        try {
            // One split per primary/active shard; each split records the domain endpoint and
            // the shard id so the Record Handler can direct its search at that shard.
            for (Integer shard : esClient.getShardIds(indexName, queryTimeout)) {
                // Every split must have a unique spill location for spilling to work safely.
                shardSplits.add(new Split(makeSpillLocation(request), makeEncryptionKey(),
                        ImmutableMap.of(domainName, domainEndpoint, SHARD_KEY, SHARD_VALUE + shard.toString())));
            }
        } catch (IOException shardLookupError) {
            throw new RuntimeException("Error retrieving shard-health information: " + shardLookupError.getMessage(), shardLookupError);
        }
    } catch (RuntimeException splitGenerationError) {
        throw new RuntimeException("Error trying to generate splits for index (" + indexName + "): " + splitGenerationError.getMessage(), splitGenerationError);
    }
    return new GetSplitsResponse(request.getCatalogName(), shardSplits);
}
Also used : SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) IOException(java.io.IOException) Split(com.amazonaws.athena.connector.lambda.domain.Split) HashSet(java.util.HashSet)

Aggregations

SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation)32 Block (com.amazonaws.athena.connector.lambda.data.Block)23 Split (com.amazonaws.athena.connector.lambda.domain.Split)21 GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse)17 HashSet (java.util.HashSet)16 S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation)15 FieldReader (org.apache.arrow.vector.complex.reader.FieldReader)13 HashMap (java.util.HashMap)11 Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints)9 ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)9 ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest)9 RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse)9 Test (org.junit.Test)9 RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse)8 TableName (com.amazonaws.athena.connector.lambda.domain.TableName)7 Matchers.anyString (org.mockito.Matchers.anyString)7 EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet)5 EncryptionKey (com.amazonaws.athena.connector.lambda.security.EncryptionKey)5 InvocationOnMock (org.mockito.invocation.InvocationOnMock)4 PutObjectResult (com.amazonaws.services.s3.model.PutObjectResult)3