use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class ImpalaMetadataHandler method doGetSplits.
/**
* Used to split-up the reads required to scan the requested batch of partition(s).
*
* @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
* @param getSplitsRequest Provides details of the Impala catalog, database, table, and partition(s) being queried as well as
* any filter predicate.
* @return A GetSplitsResponse which primarily contains:
* 1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
* 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
*/
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(), getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
int partitionContd = decodeContinuationToken(getSplitsRequest);
Set<Split> splits = new HashSet<>();
Block partitions = getSplitsRequest.getPartitions();
for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
FieldReader locationReader = partitions.getFieldReader(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME);
locationReader.setPosition(curPartition);
SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey()).add(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
splits.add(splitBuilder.build());
if (splits.size() >= ImpalaConstants.MAX_SPLITS_PER_REQUEST) {
return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
}
}
return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class HiveMetadataHandler method doGetSplits.
/**
* Used to split-up the reads required to scan the requested batch of partition(s).
*
* @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
* @param getSplitsRequest Provides details of the Hive catalog, database, table, and partition(s) being queried as well as
* any filter predicate.
* @return A GetSplitsResponse which primarily contains:
* 1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
* 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
*/
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(), getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
int partitionContd = decodeContinuationToken(getSplitsRequest);
Set<Split> splits = new HashSet<>();
Block partitions = getSplitsRequest.getPartitions();
for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
FieldReader locationReader = partitions.getFieldReader(HiveConstants.BLOCK_PARTITION_COLUMN_NAME);
locationReader.setPosition(curPartition);
SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey()).add(HiveConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
splits.add(splitBuilder.build());
if (splits.size() >= HiveConstants.MAX_SPLITS_PER_REQUEST) {
return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
}
}
return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class CloudwatchMetadataHandler method doGetSplits.
/**
* Each partition is converted into a single Split which means we will potentially read all LogStreams required for
* the query in parallel.
*
* @see MetadataHandler
*/
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
int partitionContd = decodeContinuationToken(request);
Set<Split> splits = new HashSet<>();
Block partitions = request.getPartitions();
for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
FieldReader logStreamReader = partitions.getFieldReader(LOG_STREAM_FIELD);
logStreamReader.setPosition(curPartition);
FieldReader logGroupReader = partitions.getFieldReader(LOG_GROUP_FIELD);
logGroupReader.setPosition(curPartition);
FieldReader sizeReader = partitions.getFieldReader(LOG_STREAM_SIZE_FIELD);
sizeReader.setPosition(curPartition);
// Every split must have a unique location if we wish to spill to avoid failures
SpillLocation spillLocation = makeSpillLocation(request);
Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey()).add(CloudwatchMetadataHandler.LOG_GROUP_FIELD, String.valueOf(logGroupReader.readText())).add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, String.valueOf(logStreamReader.readText())).add(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD, String.valueOf(sizeReader.readLong()));
splits.add(splitBuilder.build());
if (splits.size() >= MAX_SPLITS_PER_REQUEST) {
// a continuation token.
return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
}
}
return new GetSplitsResponse(request.getCatalogName(), splits, null);
}
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class DocDBRecordHandlerTest method doReadRecordsSpill.
@Test
public void doReadRecordsSpill() throws Exception {
List<Document> documents = new ArrayList<>();
for (int docNum = 0; docNum < 20_000; docNum++) {
documents.add(DocumentGenerator.makeRandomRow(schemaForRead.getFields(), docNum));
}
when(mockCollection.find(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
logger.info("doReadRecordsNoSpill: query[{}]", invocationOnMock.getArguments()[0]);
return mockIterable;
});
when(mockIterable.projection(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
logger.info("doReadRecordsNoSpill: projection[{}]", invocationOnMock.getArguments()[0]);
return mockIterable;
});
when(mockIterable.batchSize(anyInt())).thenReturn(mockIterable);
when(mockIterable.iterator()).thenReturn(new StubbingCursor(documents.iterator()));
Map<String, ValueSet> constraintsMap = new HashMap<>();
constraintsMap.put("col3", SortedRangeSet.copyOf(Types.MinorType.FLOAT8.getType(), ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.FLOAT8.getType(), -10000D)), false));
S3SpillLocation splitLoc = S3SpillLocation.newBuilder().withBucket(UUID.randomUUID().toString()).withSplitId(UUID.randomUUID().toString()).withQueryId(UUID.randomUUID().toString()).withIsDirectory(true).build();
ReadRecordsRequest request = new ReadRecordsRequest(IDENTITY, DEFAULT_CATALOG, "queryId-" + System.currentTimeMillis(), TABLE_NAME, schemaForRead, Split.newBuilder(splitLoc, keyFactory.create()).add(DOCDB_CONN_STR, CONNECTION_STRING).build(), new Constraints(constraintsMap), // ~1.5MB so we should see some spill
1_500_000L, 0L);
RecordResponse rawResponse = handler.doReadRecords(allocator, request);
assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
assertTrue(response.getNumberBlocks() > 1);
int blockNum = 0;
for (SpillLocation next : response.getRemoteBlocks()) {
S3SpillLocation spillLocation = (S3SpillLocation) next;
try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
// assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
assertNotNull(BlockUtils.rowToString(block, 0));
}
}
}
}
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class CloudwatchRecordHandlerTest method doReadRecordsSpill.
@Test
public void doReadRecordsSpill() throws Exception {
logger.info("doReadRecordsSpill: enter");
Map<String, ValueSet> constraintsMap = new HashMap<>();
constraintsMap.put("time", SortedRangeSet.of(Range.range(allocator, Types.MinorType.BIGINT.getType(), 100L, true, 100_000_000L, true)));
ReadRecordsRequest request = new ReadRecordsRequest(identity, "catalog", "queryId-" + System.currentTimeMillis(), new TableName("schema", "table"), schemaForRead, Split.newBuilder(S3SpillLocation.newBuilder().withBucket(UUID.randomUUID().toString()).withSplitId(UUID.randomUUID().toString()).withQueryId(UUID.randomUUID().toString()).withIsDirectory(true).build(), keyFactory.create()).add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, "table").build(), new Constraints(constraintsMap), // ~1.5MB so we should see some spill
1_500_000L, 0);
RecordResponse rawResponse = handler.doReadRecords(allocator, request);
assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
assertTrue(response.getNumberBlocks() > 1);
int blockNum = 0;
for (SpillLocation next : response.getRemoteBlocks()) {
S3SpillLocation spillLocation = (S3SpillLocation) next;
try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
// assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
assertNotNull(BlockUtils.rowToString(block, 0));
}
}
}
logger.info("doReadRecordsSpill: exit");
}
Aggregations