use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
the class ExampleMetadataHandlerTest method doGetSplits.
@Test
public void doGetSplits() {
if (!enableTests) {
// We do this because until you complete the tutorial these tests will fail. When you attempt to publish
// using ../tools/publish.sh ... it will set the publishing flag and force these tests. This is how we
// avoid breaking the build but still have a useful tutorial. We are also duplicating this block
// on purpose since this is a somewhat odd pattern.
logger.info("doGetSplits: Tests are disabled, to enable them set the 'publishing' environment variable " + "using maven clean install -Dpublishing=true");
return;
}
logger.info("doGetSplits: enter");
String yearCol = "year";
String monthCol = "month";
String dayCol = "day";
// This is the schema that ExampleMetadataHandler has laid out for a 'Partition' so we need to populate this
// minimal set of info here.
Schema schema = SchemaBuilder.newBuilder().addIntField(yearCol).addIntField(monthCol).addIntField(dayCol).build();
List<String> partitionCols = new ArrayList<>();
partitionCols.add(yearCol);
partitionCols.add(monthCol);
partitionCols.add(dayCol);
Map<String, ValueSet> constraintsMap = new HashMap<>();
Block partitions = allocator.createBlock(schema);
int num_partitions = 10;
for (int i = 0; i < num_partitions; i++) {
BlockUtils.setValue(partitions.getFieldVector(yearCol), i, 2016 + i);
BlockUtils.setValue(partitions.getFieldVector(monthCol), i, (i % 12) + 1);
BlockUtils.setValue(partitions.getFieldVector(dayCol), i, (i % 28) + 1);
}
partitions.setRowCount(num_partitions);
String continuationToken = null;
GetSplitsRequest originalReq = new GetSplitsRequest(fakeIdentity(), "queryId", "catalog_name", new TableName("schema", "table_name"), partitions, partitionCols, new Constraints(constraintsMap), continuationToken);
int numContinuations = 0;
do {
GetSplitsRequest req = new GetSplitsRequest(originalReq, continuationToken);
logger.info("doGetSplits: req[{}]", req);
MetadataResponse rawResponse = handler.doGetSplits(allocator, req);
assertEquals(MetadataRequestType.GET_SPLITS, rawResponse.getRequestType());
GetSplitsResponse response = (GetSplitsResponse) rawResponse;
continuationToken = response.getContinuationToken();
logger.info("doGetSplits: continuationToken[{}] - splits[{}]", continuationToken, response.getSplits());
for (Split nextSplit : response.getSplits()) {
assertNotNull(nextSplit.getProperty("year"));
assertNotNull(nextSplit.getProperty("month"));
assertNotNull(nextSplit.getProperty("day"));
}
assertTrue(!response.getSplits().isEmpty());
if (continuationToken != null) {
numContinuations++;
}
} while (continuationToken != null);
assertTrue(numContinuations == 0);
logger.info("doGetSplits: exit");
}
use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
the class ExampleMetadataHandler method doGetSplits.
/**
* Used to split-up the reads required to scan the requested batch of partition(s).
*
* @param allocator Tool for creating and managing Apache Arrow Blocks.
* @param request Provides details of the catalog, database, table, and partition(s) being queried as well as
* any filter predicate.
* @return A GetSplitsResponse which primarily contains:
* 1. A Set<Split> which represent read operations Amazon Athena must perform by calling your read function.
* 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
* @note A Split is a mostly opaque object to Amazon Athena. Amazon Athena will use the optional SpillLocation and
* optional EncryptionKey for pipelined reads but all properties you set on the Split are passed to your read
* function to help you perform the read.
*/
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
logger.info("doGetSplits: enter - " + request);
String catalogName = request.getCatalogName();
Set<Split> splits = new HashSet<>();
Block partitions = request.getPartitions();
FieldReader day = partitions.getFieldReader("day");
FieldReader month = partitions.getFieldReader("month");
FieldReader year = partitions.getFieldReader("year");
for (int i = 0; i < partitions.getRowCount(); i++) {
// Set the readers to the partition row we are on
year.setPosition(i);
month.setPosition(i);
day.setPosition(i);
/**
* TODO: For each partition in the request, create 1 or more splits. Splits
* are parallelizable units of work. Each represents a part of your table
* that needs to be read for the query. Splits are opaque to Athena aside from the
* spill location and encryption key. All properties added to a split are solely
* for your use when Athena calls your readWithConstraints(...) function to perform
* the read. In this example we just need to know the partition details (year, month, day);
* a completed sketch of this loop body follows the method below.
*
* Split split = Split.newBuilder(makeSpillLocation(request), makeEncryptionKey())
* .add("year", String.valueOf(year.readInteger()))
* .add("month", String.valueOf(month.readInteger()))
* .add("day", String.valueOf(day.readInteger()))
* .build();
*
* splits.add(split);
*/
}
logger.info("doGetSplits: exit - " + splits.size());
return new GetSplitsResponse(catalogName, splits);
}
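Filling in that TODO with exactly the builder calls it already names (Split.newBuilder, makeSpillLocation, makeEncryptionKey) yields a minimal loop body along these lines; this is a sketch of one straightforward completion, not the only valid one:
// Inside the for-loop, after positioning the year/month/day readers on row i:
Split split = Split.newBuilder(makeSpillLocation(request), makeEncryptionKey())
        .add("year", String.valueOf(year.readInteger()))
        .add("month", String.valueOf(month.readInteger()))
        .add("day", String.valueOf(day.readInteger()))
        .build();
splits.add(split);
With one split produced per partition row, the ExampleMetadataHandlerTest above finds non-null "year", "month", and "day" properties on every split, and since this sketch returns no continuation token the test's do/while loop finishes after a single pass.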
use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
the class ElasticsearchMetadataHandlerTest method doGetSplits.
/**
* Used to test the doGetSplits() functionality in the ElasticsearchMetadataHandler class.
*/
@Test
public void doGetSplits() throws Exception {
logger.info("doGetSplits: enter");
List<String> partitionCols = new ArrayList<>();
Block partitions = BlockUtils.newBlock(allocator, "partitionId", Types.MinorType.INT.getType(), 0);
String continuationToken = null;
GetSplitsRequest originalReq = new GetSplitsRequest(fakeIdentity(), "queryId", "elasticsearch", new TableName("movies", "customer"), partitions, partitionCols, new Constraints(new HashMap<>()), null);
GetSplitsRequest req = new GetSplitsRequest(originalReq, continuationToken);
logger.info("doGetSplits: req[{}]", req);
// Setup domain and endpoint
String domain = "movies";
String endpoint = "https://search-movies-ne3fcqzfipy6jcrew2wca6kyqu.us-east-1.es.amazonaws.com";
when(domainMapProvider.getDomainMap(null)).thenReturn(ImmutableMap.of(domain, endpoint));
when(mockClient.getShardIds(anyString(), anyLong())).thenReturn(ImmutableSet.of(0, 1, 2));
// Instantiate handler
handler = new ElasticsearchMetadataHandler(awsGlue, new LocalKeyFactory(), awsSecretsManager, amazonAthena, "spill-bucket", "spill-prefix", domainMapProvider, clientFactory, 10);
// Call doGetSplits()
MetadataResponse rawResponse = handler.doGetSplits(allocator, req);
assertEquals(MetadataRequestType.GET_SPLITS, rawResponse.getRequestType());
GetSplitsResponse response = (GetSplitsResponse) rawResponse;
continuationToken = response.getContinuationToken();
logger.info("doGetSplits: continuationToken[{}] - numSplits[{}]", new Object[] { continuationToken, response.getSplits().size() });
// Response should contain 3 splits.
assertEquals("Response has invalid number of splits", 3, response.getSplits().size());
Set<String> shardIds = new HashSet<>(2);
shardIds.add("_shards:0");
shardIds.add("_shards:1");
shardIds.add("_shards:2");
response.getSplits().forEach(split -> {
assertEquals(endpoint, split.getProperty(domain));
String shard = split.getProperty(ElasticsearchMetadataHandler.SHARD_KEY);
assertTrue("Split contains invalid shard: " + shard, shardIds.contains(shard));
shardIds.remove(shard);
});
assertTrue("Continuation criteria violated", response.getContinuationToken() == null);
logger.info("doGetSplits: exit");
}
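Given the assertions above (one split per shard, each carrying the domain-to-endpoint mapping and a SHARD_KEY of the form "_shards:N"), the handler's split generation can be pictured roughly as follows. The client, index, queryTimeout, domain, and endpoint variables are illustrative placeholders, not the connector's actual field names:
// Hypothetical sketch of per-shard split creation; variable names are illustrative only.
Set<Split> splits = new HashSet<>();
for (Integer shardId : client.getShardIds(index, queryTimeout)) {
    splits.add(Split.newBuilder(makeSpillLocation(request), makeEncryptionKey())
            .add(domain, endpoint) // e.g. "movies" -> the search-movies-... endpoint above
            .add(ElasticsearchMetadataHandler.SHARD_KEY, "_shards:" + shardId)
            .build());
}
return new GetSplitsResponse(request.getCatalogName(), splits);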
use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
the class ElasticsearchRecordHandlerTest method doReadRecordsSpill.
@Test
public void doReadRecordsSpill() throws Exception {
logger.info("doReadRecordsSpill: enter");
int batchSize = handler.getQueryBatchSize();
SearchHit[] searchHit1 = new SearchHit[batchSize];
for (int i = 0; i < batchSize; ++i) {
searchHit1[i] = new SearchHit(i + 1);
}
SearchHit[] searchHit2 = new SearchHit[2];
searchHit2[0] = new SearchHit(batchSize + 1);
searchHit2[1] = new SearchHit(batchSize + 2);
SearchHits searchHits1 = new SearchHits(searchHit1, new TotalHits(batchSize, TotalHits.Relation.EQUAL_TO), 4);
SearchHits searchHits2 = new SearchHits(searchHit2, new TotalHits(2, TotalHits.Relation.EQUAL_TO), 4);
when(mockResponse.getHits()).thenReturn(searchHits1, searchHits1, searchHits2, searchHits2);
Map<String, ValueSet> constraintsMap = new HashMap<>();
constraintsMap.put("myshort", SortedRangeSet.copyOf(Types.MinorType.SMALLINT.getType(), ImmutableList.of(Range.range(allocator, Types.MinorType.SMALLINT.getType(), (short) 1955, false, (short) 1972, true)), false));
ReadRecordsRequest request = new ReadRecordsRequest(fakeIdentity(), "elasticsearch", "queryId-" + System.currentTimeMillis(), new TableName("movies", "mishmash"), mapping, split, new Constraints(constraintsMap),
10_000L, // 10KB Expect this to spill
0L);
RecordResponse rawResponse = handler.doReadRecords(allocator, request);
assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
assertEquals(3, response.getNumberBlocks());
int blockNum = 0;
for (SpillLocation next : response.getRemoteBlocks()) {
S3SpillLocation spillLocation = (S3SpillLocation) next;
try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
assertNotNull(BlockUtils.rowToString(block, 0));
}
}
}
logger.info("doReadRecordsSpill: exit");
}
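For contrast, a request with generous size limits should keep the result in memory instead of spilling; a rough sketch reusing the fixtures from the test above, assuming the SDK's inline ReadRecordsResponse type (the 100MB limits are arbitrary illustrative values):
// Sketch: same request shape, but limits large enough that the handler should not spill.
ReadRecordsRequest inlineRequest = new ReadRecordsRequest(fakeIdentity(), "elasticsearch",
        "queryId-" + System.currentTimeMillis(), new TableName("movies", "mishmash"),
        mapping, split, new Constraints(constraintsMap),
        100_000_000L, // max block size
        100_000_000L); // max inline block size - large, so the response can stay inline
RecordResponse inlineRaw = handler.doReadRecords(allocator, inlineRequest);
assertTrue(inlineRaw instanceof ReadRecordsResponse);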
use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
the class S3BucketsTableProvider method toRow.
/**
* Maps an S3 Bucket into a row in our Apache Arrow response block(s).
*
* @param bucket The S3 Bucket to map.
* @param spiller The BlockSpiller to use when we want to write a matching row to the response.
* @note The current implementation is rather naive in how it maps fields. It leverages a static
* list of fields that we'd like to provide and then explicitly filters and converts each field.
*/
private void toRow(Bucket bucket, BlockSpiller spiller) {
spiller.writeRows((Block block, int row) -> {
boolean matched = true;
matched &= block.offerValue("bucket_name", row, bucket.getName());
matched &= block.offerValue("create_date", row, bucket.getCreationDate());
Owner owner = bucket.getOwner();
if (owner != null) {
matched &= block.offerValue("owner_name", row, bucket.getOwner().getDisplayName());
matched &= block.offerValue("owner_id", row, bucket.getOwner().getId());
}
return matched ? 1 : 0;
});
}
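A typical caller would feed every bucket returned by the S3 client through toRow(...) while serving a read request; a minimal sketch, assuming an AmazonS3 client field named amazonS3 (the field name is illustrative):
// Sketch: one candidate row per bucket; offerValue(...) enforces any column constraints,
// and the lambda's 1/0 return tells the spiller whether the row is kept.
for (Bucket bucket : amazonS3.listBuckets()) {
    toRow(bucket, spiller);
}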