use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class S3BlockSpiller method spillBlock.
/**
 * Spills a block, potentially asynchronously depending on the settings.
 *
 * @param block The Block to spill.
 */
private void spillBlock(Block block) {
    if (asyncSpillPool != null) {
        // We use the read lock here because we want to allow these in parallel; it's a bit counter-intuitive.
        Lock lock = spillLock.readLock();
        try {
            // We lock before going async but unlock after spilling in the async thread. This makes it easy to use
            // the ReadWrite lock to tell if all spills are completed without killing the thread pool.
            lock.lock();
            asyncSpillPool.submit(() -> {
                try {
                    SpillLocation spillLocation = write(block);
                    spillLocations.add(spillLocation);
                    // Free the memory from the previous block since it has been spilled
                    safeClose(block);
                }
                finally {
                    lock.unlock();
                }
            });
        }
        catch (Exception ex) {
            // If we hit an exception, make sure we unlock to avoid a deadlock before throwing.
            lock.unlock();
            throw ex;
        }
    }
    else {
        SpillLocation spillLocation = write(block);
        spillLocations.add(spillLocation);
        safeClose(block);
    }
}
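The read/write lock trick described in the comments works because every queued spill holds a read lock until it completes, so a caller can wait for all outstanding spills simply by acquiring the write lock. A minimal sketch of that pattern, assuming the same spillLock field used above (the method name waitForAllSpills is illustrative, not part of the SDK):
// Illustrative sketch, not the SDK's code: blocks until every in-flight async spill has finished.
// Assumes spillLock is the ReentrantReadWriteLock used by spillBlock(...) above.
private void waitForAllSpills() {
    Lock writeLock = spillLock.writeLock();
    // The write lock can only be acquired once every read lock (one per pending spill) has been released.
    writeLock.lock();
    try {
        // No spill tasks are running at this point; spillLocations is complete.
    }
    finally {
        writeLock.unlock();
    }
}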
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project foundry-athena-query-federation-connector by palantir.
the class S3Spiller method spillBlock.
public void spillBlock(Block block) {
    SpillLocation spillLocation = write(block);
    spillLocations.add(spillLocation);
    safeClose(block);
}
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class ExampleRecordHandlerTest method doReadRecordsSpill.
@Test
public void doReadRecordsSpill() throws Exception {
    logger.info("doReadRecordsSpill: enter");
    for (int i = 0; i < 2; i++) {
        EncryptionKey encryptionKey = (i % 2 == 0) ? keyFactory.create() : null;
        logger.info("doReadRecordsSpill: Using encryptionKey[" + encryptionKey + "]");
        Map<String, ValueSet> constraintsMap = new HashMap<>();
        constraintsMap.put("col3", SortedRangeSet.copyOf(Types.MinorType.FLOAT8.getType(),
                ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.FLOAT8.getType(), -10000D)), false));
        constraintsMap.put("unknown", EquatableValueSet.newBuilder(allocator, Types.MinorType.FLOAT8.getType(), false, true).add(1.1D).build());
        constraintsMap.put("unknown2", new AllOrNoneValueSet(Types.MinorType.FLOAT8.getType(), false, true));
        ReadRecordsRequest request = new ReadRecordsRequest(IdentityUtil.fakeIdentity(),
                "catalog",
                "queryId-" + System.currentTimeMillis(),
                new TableName("schema", "table"),
                schemaForRead,
                Split.newBuilder(makeSpillLocation(), encryptionKey).add("year", "10").add("month", "10").add("day", "10").build(),
                new Constraints(constraintsMap),
                // ~1.5MB so we should see some spill
                1_600_000L,
                1000L);
        ObjectMapperUtil.assertSerialization(request);
        RecordResponse rawResponse = recordService.readRecords(request);
        ObjectMapperUtil.assertSerialization(rawResponse);
        assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
        try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
            logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
            assertTrue(response.getNumberBlocks() > 1);
            int blockNum = 0;
            for (SpillLocation next : response.getRemoteBlocks()) {
                S3SpillLocation spillLocation = (S3SpillLocation) next;
                try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
                    logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
                    // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                    logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                    assertNotNull(BlockUtils.rowToString(block, 0));
                }
            }
        }
    }
    logger.info("doReadRecordsSpill: exit");
}
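The request above builds its Split around makeSpillLocation(), a test helper whose body is not shown here. A plausible minimal version, following the S3SpillLocation builder used in the HBase test below (the bucket name is an assumption, not taken from the test):
// Illustrative sketch of the makeSpillLocation() helper; the bucket name is assumed.
private SpillLocation makeSpillLocation() {
    return S3SpillLocation.newBuilder()
            .withBucket("my-test-spill-bucket")
            .withQueryId(UUID.randomUUID().toString())
            .withSplitId(UUID.randomUUID().toString())
            .withIsDirectory(true)
            .build();
}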
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class HbaseRecordHandlerTest method doReadRecordsSpill.
@Test
public void doReadRecordsSpill() throws Exception {
    List<Result> results = TestUtils.makeResults(10_000);
    ResultScanner mockScanner = mock(ResultScanner.class);
    when(mockScanner.iterator()).thenReturn(results.iterator());
    when(mockClient.scanTable(anyObject(), any(Scan.class), anyObject())).thenAnswer((InvocationOnMock invocationOnMock) -> {
        ResultProcessor processor = (ResultProcessor) invocationOnMock.getArguments()[2];
        return processor.scan(mockScanner);
    });
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("family1:col3", SortedRangeSet.copyOf(Types.MinorType.BIGINT.getType(),
            ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.BIGINT.getType(), 0L)), true));
    S3SpillLocation splitLoc = S3SpillLocation.newBuilder()
            .withBucket(UUID.randomUUID().toString())
            .withSplitId(UUID.randomUUID().toString())
            .withQueryId(UUID.randomUUID().toString())
            .withIsDirectory(true)
            .build();
    Split.Builder splitBuilder = Split.newBuilder(splitLoc, keyFactory.create())
            .add(HBASE_CONN_STR, "fake_con_str")
            .add(START_KEY_FIELD, "fake_start_key")
            .add(END_KEY_FIELD, "fake_end_key")
            .add(REGION_ID_FIELD, "fake_region_id")
            .add(REGION_NAME_FIELD, "fake_region_name");
    ReadRecordsRequest request = new ReadRecordsRequest(IDENTITY,
            DEFAULT_CATALOG,
            "queryId-" + System.currentTimeMillis(),
            new TableName(DEFAULT_SCHEMA, TEST_TABLE),
            schemaForRead,
            splitBuilder.build(),
            new Constraints(constraintsMap),
            // ~1.5MB so we should see some spill
            1_500_000L,
            0L);
    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
        assertTrue(response.getNumberBlocks() > 1);
        int blockNum = 0;
        for (SpillLocation next : response.getRemoteBlocks()) {
            S3SpillLocation spillLocation = (S3SpillLocation) next;
            try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
                logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
                // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                assertNotNull(BlockUtils.rowToString(block, 0));
            }
        }
    }
}
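Both tests force a spill with a small byte limit, so they only handle the RemoteReadRecordsResponse shape. A caller that does not control that limit usually has to handle the inline shape as well; a minimal sketch, assuming the SDK's ReadRecordsResponse#getRecords() accessor and the same handler, allocator, request, spillReader, and logger as above:
// Sketch only (not from the tests above): handle both the inline and the spilled response shapes.
RecordResponse rawResponse = handler.doReadRecords(allocator, request);
if (rawResponse instanceof ReadRecordsResponse) {
    // Inline case: the rows fit under the byte limit and come back in a single Block.
    try (ReadRecordsResponse response = (ReadRecordsResponse) rawResponse) {
        logger.info("inline recordCount[{}]", response.getRecords().getRowCount());
    }
}
else if (rawResponse instanceof RemoteReadRecordsResponse) {
    // Spilled case: each Block has to be read back from its SpillLocation.
    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        for (SpillLocation location : response.getRemoteBlocks()) {
            try (Block block = spillReader.read((S3SpillLocation) location, response.getEncryptionKey(), response.getSchema())) {
                logger.info("spilled recordCount[{}]", block.getRowCount());
            }
        }
    }
}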
use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.
the class BigQueryMetadataHandler method doGetSplits.
/**
 * Makes a minimum of 10 splits based on the constraints. Without constraints, a query over a large table may hit
 * the Lambda timeout; concurrencyLimit is configurable and can be tuned to the Google BigQuery quota limits.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details of the catalog, database, table, and partition(s) being queried as well as
 *     any filter predicate.
 * @return A GetSplitsResponse containing the splits to be read.
 * @throws IOException
 * @throws InterruptedException
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) throws IOException, InterruptedException {
    int constraintsSize = request.getConstraints().getSummary().size();
    if (constraintsSize > 0) {
        // Every split must have a unique location if we wish to spill to avoid failures
        SpillLocation spillLocation = makeSpillLocation(request);
        return new GetSplitsResponse(request.getCatalogName(), Split.newBuilder(spillLocation, makeEncryptionKey()).build());
    }
    else {
        String projectName = BigQueryUtils.getProjectName(request);
        String dataSetName = request.getTableName().getSchemaName();
        String tableName = request.getTableName().getTableName();
        QueryJobConfiguration queryConfig = QueryJobConfiguration
                .newBuilder("SELECT count(*) FROM `" + projectName + "." + dataSetName + "." + tableName + "` ")
                .setUseLegacySql(false)
                .build();
        // Create a job ID so that we can safely retry.
        JobId jobId = JobId.of(UUID.randomUUID().toString());
        Job queryJob = BigQueryUtils.getBigQueryClient().create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build()).waitFor();
        TableResult result = queryJob.getQueryResults();
        double numberOfRows = result.iterateAll().iterator().next().get(0).getLongValue();
        logger.debug("numberOfRows: " + numberOfRows);
        int concurrencyLimit = Integer.parseInt(BigQueryUtils.getEnvVar("concurrencyLimit"));
        logger.debug("concurrencyLimit: " + concurrencyLimit);
        long pageCount = (long) numberOfRows / concurrencyLimit;
        long totalPageCountLimit = (pageCount == 0) ? (long) numberOfRows : pageCount;
        double limit = (int) Math.ceil(numberOfRows / totalPageCountLimit);
        Set<Split> splits = new HashSet<>();
        long offSet = 0;
        for (int i = 1; i <= limit; i++) {
            if (i > 1) {
                offSet = offSet + totalPageCountLimit;
            }
            // Every split must have a unique location if we wish to spill to avoid failures
            SpillLocation spillLocation = makeSpillLocation(request);
            // Create a new split (added to the splits set) that stores the page size and offset
            // (to be used later by the Record Handler).
            Map<String, String> map = new HashMap<>();
            map.put(Long.toString(totalPageCountLimit), Long.toString(offSet));
            splits.add(new Split(spillLocation, makeEncryptionKey(), map));
        }
        return new GetSplitsResponse(request.getCatalogName(), splits);
    }
}
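Each split stores its page size as the property key and its offset as the value. A record handler could recover that pair and turn it into a paginated query; a minimal sketch, assuming Split#getProperties() and the same projectName/dataSetName/tableName resolution used above:
// Sketch only (not the connector's record handler): read back the page size / offset stored in the split.
for (Map.Entry<String, String> entry : split.getProperties().entrySet()) {
    long pageSize = Long.parseLong(entry.getKey());
    long offset = Long.parseLong(entry.getValue());
    String sql = "SELECT * FROM `" + projectName + "." + dataSetName + "." + tableName + "`"
            + " LIMIT " + pageSize + " OFFSET " + offset;
    // Run the page query with QueryJobConfiguration.newBuilder(sql), as doGetSplits does for the count query.
}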