Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
From the class DynamoDBMetadataHandler, method doGetSplits:
/**
 * Copies data from partitions and creates splits, serializing as necessary for later calls to RecordHandler#readWithConstraint.
 * This API supports pagination.
 *
 * @see GlueMetadataHandler
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
    int partitionContd = decodeContinuationToken(request);
    Set<Split> splits = new HashSet<>();
    Block partitions = request.getPartitions();
    Map<String, String> partitionMetadata = partitions.getSchema().getCustomMetadata();
    String partitionType = partitionMetadata.get(PARTITION_TYPE_METADATA);
    if (partitionType == null) {
        throw new IllegalStateException(String.format("No metadata %s defined in Schema %s", PARTITION_TYPE_METADATA, partitions.getSchema()));
    }
    if (QUERY_PARTITION_TYPE.equals(partitionType)) {
        String hashKeyName = partitionMetadata.get(HASH_KEY_NAME_METADATA);
        FieldReader hashKeyValueReader = partitions.getFieldReader(hashKeyName);
        // One split per hash key value, since a single DDB query can only take one hash key value
        for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
            hashKeyValueReader.setPosition(curPartition);
            // Every split must have a unique location if we wish to spill to avoid failures
            SpillLocation spillLocation = makeSpillLocation(request);
            // Copy all partition metadata to the split
            Map<String, String> splitMetadata = new HashMap<>(partitionMetadata);
            Object hashKeyValue = DDBTypeUtils.convertArrowTypeIfNecessary(hashKeyName, hashKeyValueReader.readObject());
            String hashKeyValueJSON = Jackson.toJsonString(ItemUtils.toAttributeValue(hashKeyValue));
            splitMetadata.put(hashKeyName, hashKeyValueJSON);
            splits.add(new Split(spillLocation, makeEncryptionKey(), splitMetadata));
            if (splits.size() == MAX_SPLITS_PER_REQUEST && curPartition != partitions.getRowCount() - 1) {
                // We've hit the max number of splits for this page and there are partitions left, so send the page back
                return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
            }
        }
        return new GetSplitsResponse(request.getCatalogName(), splits, null);
    } else if (SCAN_PARTITION_TYPE.equals(partitionType)) {
        FieldReader segmentCountReader = partitions.getFieldReader(SEGMENT_COUNT_METADATA);
        int segmentCount = segmentCountReader.readInteger();
        for (int curPartition = partitionContd; curPartition < segmentCount; curPartition++) {
            // Every split must have a unique location if we wish to spill to avoid failures
            SpillLocation spillLocation = makeSpillLocation(request);
            // Copy all partition metadata to the split
            Map<String, String> splitMetadata = new HashMap<>(partitionMetadata);
            splitMetadata.put(SEGMENT_ID_PROPERTY, String.valueOf(curPartition));
            splitMetadata.put(SEGMENT_COUNT_METADATA, String.valueOf(segmentCount));
            splits.add(new Split(spillLocation, makeEncryptionKey(), splitMetadata));
            if (splits.size() == MAX_SPLITS_PER_REQUEST && curPartition != segmentCount - 1) {
                // We've hit the max number of splits for this page and there are segments left, so send the page back
                return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
            }
        }
        return new GetSplitsResponse(request.getCatalogName(), splits, null);
    } else {
        throw new IllegalStateException("Unexpected partition type " + partitionType);
    }
}
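The two pagination helpers used above are not part of the excerpt. As a minimal sketch, assuming the continuation token is simply the stringified index of the last partition already turned into a split (an assumption, not something the excerpt confirms), they might look like this:

// Hypothetical sketch of the pagination helpers referenced above; the real
// connector's implementation may differ.
private int decodeContinuationToken(GetSplitsRequest request) {
    if (request.hasContinuationToken()) {
        // Resume at the partition immediately after the one encoded in the token.
        return Integer.parseInt(request.getContinuationToken()) + 1;
    }
    // No token means this is the first page; start from partition 0.
    return 0;
}

private String encodeContinuationToken(int partition) {
    return String.valueOf(partition);
}

With this scheme, a page that ends at curPartition resumes on the next call at curPartition + 1, which matches the loop bounds in the method above.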
Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
From the class ElasticsearchMetadataHandler, method doGetSplits:
/**
 * Used to split up the reads required to scan the requested index by shard. Cluster-health information is
 * retrieved for shards associated with the specified index. A split will then be generated for each shard that
 * is primary and active.
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details of the catalog, domain, and index being queried, as well as any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 * 1. A Set<Split>, each containing the domain and endpoint, and the shard to be retrieved by the RecordHandler.
 * 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 * @throws RuntimeException when the domain does not exist in the map, or an error occurs while processing the
 * cluster/shard health information.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) throws RuntimeException {
    logger.debug("doGetSplits: enter - " + request);
    // Create set of splits
    Set<Split> splits = new HashSet<>();
    // Get domain
    String domain = request.getTableName().getSchemaName();
    // Get index
    String index = request.getTableName().getTableName();
    try {
        String endpoint = getDomainEndpoint(domain);
        AwsRestHighLevelClient client = clientFactory.getOrCreateClient(endpoint);
        try {
            Set<Integer> shardIds = client.getShardIds(index, queryTimeout);
            for (Integer shardId : shardIds) {
                // Every split must have a unique location if we wish to spill to avoid failures
                SpillLocation spillLocation = makeSpillLocation(request);
                // Create a new split (added to the splits set) that includes the domain and endpoint, and
                // shard information (to be used later by the Record Handler).
                splits.add(new Split(spillLocation, makeEncryptionKey(), ImmutableMap.of(domain, endpoint, SHARD_KEY, SHARD_VALUE + shardId.toString())));
            }
        } catch (IOException error) {
            throw new RuntimeException("Error retrieving shard-health information: " + error.getMessage(), error);
        }
    } catch (RuntimeException error) {
        throw new RuntimeException("Error trying to generate splits for index (" + index + "): " + error.getMessage(), error);
    }
    return new GetSplitsResponse(request.getCatalogName(), splits);
}
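Both handlers obtain their spill targets from makeSpillLocation, a helper inherited from the SDK's MetadataHandler base class. As a rough sketch of the idea, such a helper builds an S3SpillLocation whose key is unique per split; the bucket and prefix names here are illustrative assumptions, not values from the excerpts:

// Illustrative sketch only: pairing the query ID with a fresh UUID is what
// gives each split the unique spill location the comments above call for.
SpillLocation spillLocation = S3SpillLocation.newBuilder()
    .withBucket("my-spill-bucket")   // assumption: the connector's configured spill bucket
    .withPrefix("athena-spill")      // assumption: the configured key prefix
    .withQueryId(request.getQueryId())
    .withSplitId(UUID.randomUUID().toString())
    .withIsDirectory(true)
    .build();

On the read side, a RecordHandler recovers each split's metadata with Split#getProperty; for example, the Elasticsearch handler can look up the endpoint under the domain name and the shard preference under SHARD_KEY.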