Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
In class ImpalaMetadataHandler, the method doGetSplits:
/**
 * Used to split up the reads required to scan the requested batch of partition(s).
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param getSplitsRequest Provides details of the Impala catalog, database, table, and partition(s) being queried as well as
 *                         any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 *         1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
 *         2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
    LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(),
            getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
    // Resume from where the previous page of splits left off, if any.
    int partitionContd = decodeContinuationToken(getSplitsRequest);
    Set<Split> splits = new HashSet<>();
    Block partitions = getSplitsRequest.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader locationReader = partitions.getFieldReader(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME);
        locationReader.setPosition(curPartition);
        // Each split needs its own spill location in case its results overflow to S3.
        SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
        splits.add(splitBuilder.build());
        if (splits.size() >= ImpalaConstants.MAX_SPLITS_PER_REQUEST) {
            // This page is full; return it with a token so Athena can ask for the rest.
            return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
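doGetSplits above delegates pagination to two helpers the snippet does not show. Given that the loop encodes curPartition after that partition's split has already been added to the page, decoding must resume one row past the token or the boundary partition would be emitted twice. A minimal sketch consistent with that contract (the bodies are an assumption inferred from the call sites, not the connector's verbatim source):

    private int decodeContinuationToken(GetSplitsRequest request) {
        if (request.hasContinuationToken()) {
            // The token names the last partition already turned into a Split,
            // so resume at the row after it to avoid emitting duplicates.
            return Integer.parseInt(request.getContinuationToken()) + 1;
        }
        // No token: this is the first (or only) page of splits.
        return 0;
    }

    private String encodeContinuationToken(int partition) {
        return String.valueOf(partition);
    }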
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
In class HiveMetadataHandler, the method doGetSplits:
/**
 * Used to split up the reads required to scan the requested batch of partition(s).
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param getSplitsRequest Provides details of the Hive catalog, database, table, and partition(s) being queried as well as
 *                         any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 *         1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
 *         2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
    LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(),
            getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
    int partitionContd = decodeContinuationToken(getSplitsRequest);
    Set<Split> splits = new HashSet<>();
    Block partitions = getSplitsRequest.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader locationReader = partitions.getFieldReader(HiveConstants.BLOCK_PARTITION_COLUMN_NAME);
        locationReader.setPosition(curPartition);
        SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(HiveConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
        splits.add(splitBuilder.build());
        if (splits.size() >= HiveConstants.MAX_SPLITS_PER_REQUEST) {
            return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
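Each property added via Split.Builder.add travels with the Split as a plain string, which is how the read side learns which partition a split covers. A minimal sketch of the lookup on the record-handler side (the surrounding handler code is assumed, not part of this snippet):

    // Hypothetical read-side context, e.g. inside the matching record handler:
    String partitionName = split.getProperty(HiveConstants.BLOCK_PARTITION_COLUMN_NAME);
    // partitionName now scopes the Hive-side scan for this one split.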
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
In class CloudwatchMetadataHandler, the method doGetSplits:
/**
 * Each partition is converted into a single Split, which means we will potentially read all LogStreams required for
 * the query in parallel.
 *
 * @see MetadataHandler
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
    int partitionContd = decodeContinuationToken(request);
    Set<Split> splits = new HashSet<>();
    Block partitions = request.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader logStreamReader = partitions.getFieldReader(LOG_STREAM_FIELD);
        logStreamReader.setPosition(curPartition);
        FieldReader logGroupReader = partitions.getFieldReader(LOG_GROUP_FIELD);
        logGroupReader.setPosition(curPartition);
        FieldReader sizeReader = partitions.getFieldReader(LOG_STREAM_SIZE_FIELD);
        sizeReader.setPosition(curPartition);
        // Every split must have a unique location if we wish to spill to avoid failures
        SpillLocation spillLocation = makeSpillLocation(request);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(CloudwatchMetadataHandler.LOG_GROUP_FIELD, String.valueOf(logGroupReader.readText()))
                .add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, String.valueOf(logStreamReader.readText()))
                .add(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD, String.valueOf(sizeReader.readLong()));
        splits.add(splitBuilder.build());
        if (splits.size() >= MAX_SPLITS_PER_REQUEST) {
            // This page of splits is full; return it along with a continuation token.
            return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(request.getCatalogName(), splits, null);
}
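The comment about unique spill locations points at makeSpillLocation, inherited from the base MetadataHandler. A sketch of the shape it typically returns, built with the SDK's S3SpillLocation builder; spillBucket and spillPrefix stand in for values assumed to come from the handler's configuration:

    import com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation;
    import java.util.UUID;

    protected SpillLocation makeSpillLocation(MetadataRequest request) {
        return S3SpillLocation.newBuilder()
                .withBucket(spillBucket)   // assumed: configured spill bucket
                .withPrefix(spillPrefix)   // assumed: configured key prefix
                .withIsDirectory(true)
                .withQueryId(request.getQueryId())
                // A random split id per call is what keeps concurrent spills
                // from colliding on the same S3 location.
                .withSplitId(UUID.randomUUID().toString())
                .build();
    }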
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
In class CloudwatchMetadataHandler, the method getPartitions:
/**
 * Gets the list of LogStreams that need to be scanned to satisfy the requested table. In most cases this will be just
 * one LogStream, which results in a single partition. If, however, the request is for the special ALL_LOG_STREAMS view,
 * then all LogStreams in the requested LogGroup (schema) are queried and turned into partitions 1:1.
 *
 * @note This method applies partition pruning based on the log_stream field.
 * @see MetadataHandler
 */
@Override
public void getPartitions(BlockWriter blockWriter, GetTableLayoutRequest request, QueryStatusChecker queryStatusChecker) throws Exception {
    CloudwatchTableName cwTableName = tableResolver.validateTable(request.getTableName());
    DescribeLogStreamsRequest cwRequest = new DescribeLogStreamsRequest(cwTableName.getLogGroupName());
    if (!ALL_LOG_STREAMS_TABLE.equals(cwTableName.getLogStreamName())) {
        cwRequest.setLogStreamNamePrefix(cwTableName.getLogStreamName());
    }
    DescribeLogStreamsResult result;
    do {
        result = invoker.invoke(() -> awsLogs.describeLogStreams(cwRequest));
        for (LogStream next : result.getLogStreams()) {
            // Each log stream that survives partition pruning becomes one partition row.
            blockWriter.writeRows((Block block, int rowNum) -> {
                boolean matched = block.setValue(LOG_GROUP_FIELD, rowNum, cwRequest.getLogGroupName());
                matched &= block.setValue(LOG_STREAM_FIELD, rowNum, next.getLogStreamName());
                matched &= block.setValue(LOG_STREAM_SIZE_FIELD, rowNum, next.getStoredBytes());
                // Returning 0 discards the row, pruning streams that cannot match the query.
                return matched ? 1 : 0;
            });
        }
        cwRequest.setNextToken(result.getNextToken());
    } while (result.getNextToken() != null && queryStatusChecker.isQueryRunning());
}
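The lambda handed to writeRows implements the BlockWriter.RowWriter contract: Block.setValue reports whether the written value can still satisfy the query's constraints, and returning 0 discards the row, which is exactly the pruning the @note describes. The relevant shape of the interface (paraphrased; the SDK version may carry additional members):

    public interface BlockWriter {
        // Invokes the writer, managing row counts and constraint evaluation.
        void writeRows(RowWriter rowWriter);

        interface RowWriter {
            // Return how many rows were written at rowIdx; returning 0 leaves the
            // slot unused, so a non-matching LogStream never becomes a partition.
            int writeRows(Block block, int rowIdx) throws Exception;
        }
    }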
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
In class CloudwatchMetadataHandlerTest, the test method doGetSplits:
@Test
public void doGetSplits() {
    logger.info("doGetSplits: enter");
    Schema schema = SchemaBuilder.newBuilder()
            .addField(CloudwatchMetadataHandler.LOG_STREAM_FIELD, new ArrowType.Utf8())
            .addField(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD, new ArrowType.Int(64, true))
            .addField(CloudwatchMetadataHandler.LOG_GROUP_FIELD, new ArrowType.Utf8())
            .build();
    Block partitions = allocator.createBlock(schema);
    int num_partitions = 2_000;
    for (int i = 0; i < num_partitions; i++) {
        BlockUtils.setValue(partitions.getFieldVector(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD), i, 2016L + i);
        BlockUtils.setValue(partitions.getFieldVector(CloudwatchMetadataHandler.LOG_STREAM_FIELD), i, "log_stream_" + i);
        BlockUtils.setValue(partitions.getFieldVector(CloudwatchMetadataHandler.LOG_GROUP_FIELD), i, "log_group_" + i);
    }
    partitions.setRowCount(num_partitions);
    String continuationToken = null;
    GetSplitsRequest originalReq = new GetSplitsRequest(identity, "queryId", "catalog_name",
            new TableName("schema", "all_log_streams"), partitions,
            Collections.singletonList(CloudwatchMetadataHandler.LOG_STREAM_FIELD),
            new Constraints(new HashMap<>()), continuationToken);
    int numContinuations = 0;
    do {
        GetSplitsRequest req = new GetSplitsRequest(originalReq, continuationToken);
        logger.info("doGetSplits: req[{}]", req);
        MetadataResponse rawResponse = handler.doGetSplits(allocator, req);
        assertEquals(MetadataRequestType.GET_SPLITS, rawResponse.getRequestType());
        GetSplitsResponse response = (GetSplitsResponse) rawResponse;
        continuationToken = response.getContinuationToken();
        logger.info("doGetSplits: continuationToken[{}] - numSplits[{}]", continuationToken, response.getSplits().size());
        for (Split nextSplit : response.getSplits()) {
            assertNotNull(nextSplit.getProperty(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD));
            assertNotNull(nextSplit.getProperty(CloudwatchMetadataHandler.LOG_STREAM_FIELD));
            assertNotNull(nextSplit.getProperty(CloudwatchMetadataHandler.LOG_GROUP_FIELD));
        }
        if (continuationToken != null) {
            numContinuations++;
        }
    } while (continuationToken != null);
    assertTrue(numContinuations > 0);
    logger.info("doGetSplits: exit");
}
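The assertion numContinuations > 0 proves the handler paginated, but not that pagination is lossless. A hypothetical extension in the same style would total the splits across pages and check the count against num_partitions:

    // Hypothetical addition: re-run the paging loop, totalling splits per page.
    int totalSplits = 0;
    String token = null;
    do {
        GetSplitsResponse response = (GetSplitsResponse) handler.doGetSplits(
                allocator, new GetSplitsRequest(originalReq, token));
        totalSplits += response.getSplits().size();
        token = response.getContinuationToken();
    } while (token != null);
    // No partition should be dropped or duplicated at a page boundary.
    assertEquals(num_partitions, totalSplits);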