Search in sources :

Example 16 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.

From the class S3BlockSpiller, the method spillBlock.

/**
 * Spills a block, potentially asynchronously depending on the settings.
 * <p>
 * When {@code asyncSpillPool} is non-null, the read lock of {@code spillLock} is acquired on the calling
 * thread and the actual write is submitted to the pool; the submitted task releases that lock when it
 * finishes. When {@code asyncSpillPool} is null, the block is written, its location recorded and the block
 * closed inline on the calling thread.
 * <p>
 * NOTE(review): if {@code write(block)} throws inside the submitted task, {@code safeClose(block)} is skipped,
 * and the result of {@code submit(...)} is discarded here, so the failure is not surfaced by this method -
 * confirm that failures are detected elsewhere.
 *
 * @param block The Block to spill.
 */
private void spillBlock(Block block) {
    if (asyncSpillPool != null) {
        // We use the read lock here because we want to allow these spills to run in parallel; it is a bit
        // counterintuitive. Presumably a waiter takes the write lock to learn that all spills are done -
        // TODO confirm against the code that uses spillLock.writeLock().
        Lock lock = spillLock.readLock();
        try {
            // We lock before going async but unlock after spilling in the async thread, this makes it easy to use
            // the ReadWrite lock to tell if all spills are completed without killing the thread pool.
            lock.lock();
            asyncSpillPool.submit(() -> {
                try {
                    SpillLocation spillLocation = write(block);
                    spillLocations.add(spillLocation);
                    // Free the memory from the previous block since it has been spilled
                    safeClose(block);
                } finally {
                    // Released by the submitted task, i.e. not necessarily on the thread that acquired it.
                    lock.unlock();
                }
            });
        } catch (Exception ex) {
            // If we hit an exception (e.g. the pool refusing the task), make sure we unlock to avoid a
            // deadlock before throwing. Exceptions raised inside the submitted task do not reach this handler.
            lock.unlock();
            throw ex;
        }
    } else {
        // No async pool configured: spill synchronously on the calling thread.
        SpillLocation spillLocation = write(block);
        spillLocations.add(spillLocation);
        safeClose(block);
    }
}
Also used : SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) IOException(java.io.IOException) ReadWriteLock(java.util.concurrent.locks.ReadWriteLock) Lock(java.util.concurrent.locks.Lock) StampedLock(java.util.concurrent.locks.StampedLock)

Example 17 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project foundry-athena-query-federation-connector by palantir.

From the class S3Spiller, the method spillBlock.

/**
 * Writes the given block out, records the location it was written to, and then closes the block.
 *
 * @param block The Block to spill.
 */
public void spillBlock(Block block) {
    // write(...) runs first; its result is appended to the list of known spill locations.
    spillLocations.add(write(block));
    // The block has been written, so it can be closed.
    safeClose(block);
}
Also used : SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation)

Example 18 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.

From the class ExampleRecordHandlerTest, the method doReadRecordsSpill.

/**
 * Issues the same read request twice - first with a freshly created encryption key, then with no key - and
 * expects a {@link RemoteReadRecordsResponse} holding more than one block, each of which can be read back
 * through {@code spillReader}.
 */
@Test
public void doReadRecordsSpill() throws Exception {
    logger.info("doReadRecordsSpill: enter");
    for (int pass = 0; pass < 2; pass++) {
        // Even passes get a key from the factory; odd passes run without one.
        EncryptionKey encryptionKey = null;
        if (pass % 2 == 0) {
            encryptionKey = keyFactory.create();
        }
        logger.info("doReadRecordsSpill: Using encryptionKey[" + encryptionKey + "]");

        Map<String, ValueSet> constraintsMap = new HashMap<>();
        ValueSet col3Constraint = SortedRangeSet.copyOf(
                Types.MinorType.FLOAT8.getType(),
                ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.FLOAT8.getType(), -10000D)),
                false);
        constraintsMap.put("col3", col3Constraint);
        ValueSet unknownConstraint = EquatableValueSet.newBuilder(allocator, Types.MinorType.FLOAT8.getType(), false, true)
                .add(1.1D)
                .build();
        constraintsMap.put("unknown", unknownConstraint);
        ValueSet unknown2Constraint = new AllOrNoneValueSet(Types.MinorType.FLOAT8.getType(), false, true);
        constraintsMap.put("unknown2", unknown2Constraint);

        String queryId = "queryId-" + System.currentTimeMillis();
        TableName tableName = new TableName("schema", "table");
        Split split = Split.newBuilder(makeSpillLocation(), encryptionKey)
                .add("year", "10")
                .add("month", "10")
                .add("day", "10")
                .build();
        Constraints constraints = new Constraints(constraintsMap);
        ReadRecordsRequest request = new ReadRecordsRequest(
                IdentityUtil.fakeIdentity(),
                "catalog",
                queryId,
                tableName,
                schemaForRead,
                split,
                constraints,
                // ~1.5MB so we should see some spill
                1_600_000L,
                1000L);
        ObjectMapperUtil.assertSerialization(request);

        RecordResponse rawResponse = recordService.readRecords(request);
        ObjectMapperUtil.assertSerialization(rawResponse);
        assertTrue(rawResponse instanceof RemoteReadRecordsResponse);

        try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
            logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
            assertTrue(response.getNumberBlocks() > 1);

            int blockNum = 0;
            for (SpillLocation remoteBlock : response.getRemoteBlocks()) {
                S3SpillLocation remoteLocation = (S3SpillLocation) remoteBlock;
                try (Block block = spillReader.read(remoteLocation, response.getEncryptionKey(), response.getSchema())) {
                    logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum, block.getRowCount());
                    blockNum++;
                    // A stricter per-block check was left disabled by the original author:
                    // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                    logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                    assertNotNull(BlockUtils.rowToString(block, 0));
                }
            }
        }
    }
    logger.info("doReadRecordsSpill: exit");
}
Also used : RemoteReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse) SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) HashMap(java.util.HashMap) AllOrNoneValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.AllOrNoneValueSet) EncryptionKey(com.amazonaws.athena.connector.lambda.security.EncryptionKey) Matchers.anyString(org.mockito.Matchers.anyString) RecordResponse(com.amazonaws.athena.connector.lambda.records.RecordResponse) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) ReadRecordsRequest(com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) Block(com.amazonaws.athena.connector.lambda.data.Block) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) EquatableValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet) AllOrNoneValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.AllOrNoneValueSet) Test(org.junit.Test)

Example 19 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.

From the class HbaseRecordHandlerTest, the method doReadRecordsSpill.

/**
 * Runs {@code handler.doReadRecords} against a mocked HBase scan and expects a
 * {@link RemoteReadRecordsResponse} holding more than one block, each of which can be read back through
 * {@code spillReader}.
 */
@Test
public void doReadRecordsSpill() throws Exception {
    // The mocked client hands a scanner over the generated results to whichever processor it is called with.
    List<Result> results = TestUtils.makeResults(10_000);
    ResultScanner mockScanner = mock(ResultScanner.class);
    when(mockScanner.iterator()).thenReturn(results.iterator());
    when(mockClient.scanTable(anyObject(), any(Scan.class), anyObject())).thenAnswer((InvocationOnMock invocation) -> {
        // The third argument of scanTable is the processor to run over the scanner.
        Object[] callArgs = invocation.getArguments();
        ResultProcessor processor = (ResultProcessor) callArgs[2];
        return processor.scan(mockScanner);
    });

    Map<String, ValueSet> constraintsMap = new HashMap<>();
    ValueSet col3Constraint = SortedRangeSet.copyOf(
            Types.MinorType.BIGINT.getType(),
            ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.BIGINT.getType(), 0L)),
            true);
    constraintsMap.put("family1:col3", col3Constraint);

    S3SpillLocation splitLocation = S3SpillLocation.newBuilder()
            .withBucket(UUID.randomUUID().toString())
            .withSplitId(UUID.randomUUID().toString())
            .withQueryId(UUID.randomUUID().toString())
            .withIsDirectory(true)
            .build();
    Split split = Split.newBuilder(splitLocation, keyFactory.create())
            .add(HBASE_CONN_STR, "fake_con_str")
            .add(START_KEY_FIELD, "fake_start_key")
            .add(END_KEY_FIELD, "fake_end_key")
            .add(REGION_ID_FIELD, "fake_region_id")
            .add(REGION_NAME_FIELD, "fake_region_name")
            .build();

    String queryId = "queryId-" + System.currentTimeMillis();
    TableName tableName = new TableName(DEFAULT_SCHEMA, TEST_TABLE);
    Constraints constraints = new Constraints(constraintsMap);
    ReadRecordsRequest request = new ReadRecordsRequest(
            IDENTITY,
            DEFAULT_CATALOG,
            queryId,
            tableName,
            schemaForRead,
            split,
            constraints,
            // ~1.5MB so we should see some spill
            1_500_000L,
            0L);

    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof RemoteReadRecordsResponse);

    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
        assertTrue(response.getNumberBlocks() > 1);

        int blockNum = 0;
        for (SpillLocation remoteBlock : response.getRemoteBlocks()) {
            S3SpillLocation remoteLocation = (S3SpillLocation) remoteBlock;
            try (Block block = spillReader.read(remoteLocation, response.getEncryptionKey(), response.getSchema())) {
                logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum, block.getRowCount());
                blockNum++;
                // A stricter per-block check was left disabled by the original author:
                // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                assertNotNull(BlockUtils.rowToString(block, 0));
            }
        }
    }
}
Also used : ResultScanner(org.apache.hadoop.hbase.client.ResultScanner) RemoteReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse) SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) HashMap(java.util.HashMap) Matchers.anyString(org.mockito.Matchers.anyString) RecordResponse(com.amazonaws.athena.connector.lambda.records.RecordResponse) Result(org.apache.hadoop.hbase.client.Result) PutObjectResult(com.amazonaws.services.s3.model.PutObjectResult) GetTableResult(com.amazonaws.services.glue.model.GetTableResult) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) ReadRecordsRequest(com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) InvocationOnMock(org.mockito.invocation.InvocationOnMock) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) Block(com.amazonaws.athena.connector.lambda.data.Block) Scan(org.apache.hadoop.hbase.client.Scan) ResultProcessor(com.amazonaws.athena.connectors.hbase.connection.ResultProcessor) Split(com.amazonaws.athena.connector.lambda.domain.Split) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) Test(org.junit.Test)

Example 20 with SpillLocation

use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in project aws-athena-query-federation by awslabs.

From the class BigQueryMetadataHandler, the method doGetSplits.

/**
 * Builds the splits for a query against a BigQuery table.
 * <p>
 * If the request carries at least one constraint, a single split is returned. Without constraints the query
 * may hit the Lambda timeout on large tables, so the table's row count is fetched with a
 * {@code SELECT count(*)} job and the rows are divided into pages: the page size is
 * {@code rowCount / concurrencyLimit} (or {@code rowCount} itself when that quotient is zero), and one split is
 * emitted per page. Each such split carries a single property whose key is the page size and whose value is the
 * page's starting offset, both rendered as decimal strings. An empty table yields no splits.
 * <p>
 * {@code concurrencyLimit} is read from the environment via {@code BigQueryUtils.getEnvVar} and can be changed
 * based on Google BigQuery quota limits.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details of the catalog, database, table, and partition(s) being queried as well as
 * any filter predicate.
 * @return A response for the request's catalog containing the generated split(s).
 * @throws IOException Declared by the signature; NOTE(review): which call raises it is not visible here.
 * @throws InterruptedException If the thread is interrupted while waiting for the row-count job to finish.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) throws IOException, InterruptedException {
    int constraintsSize = request.getConstraints().getSummary().size();
    if (constraintsSize > 0) {
        // Every split must have a unique location if we wish to spill to avoid failures
        SpillLocation spillLocation = makeSpillLocation(request);
        return new GetSplitsResponse(request.getCatalogName(), Split.newBuilder(spillLocation, makeEncryptionKey()).build());
    }

    String projectName = BigQueryUtils.getProjectName(request);
    String dataSetName = request.getTableName().getSchemaName();
    String tableName = request.getTableName().getTableName();
    QueryJobConfiguration queryConfig = QueryJobConfiguration.newBuilder("SELECT count(*) FROM `" + projectName + "." + dataSetName + "." + tableName + "` ").setUseLegacySql(false).build();
    // Create a job ID so that we can safely retry.
    JobId jobId = JobId.of(UUID.randomUUID().toString());
    Job queryJob = BigQueryUtils.getBigQueryClient().create(JobInfo.newBuilder(queryConfig).setJobId(jobId).build()).waitFor();
    TableResult result = queryJob.getQueryResults();
    // Kept as a double so that the page-count division below is a floating-point division.
    double numberOfRows = result.iterateAll().iterator().next().get(0).getLongValue();
    logger.debug("numberOfRows: " + numberOfRows);
    int concurrencyLimit = Integer.parseInt(BigQueryUtils.getEnvVar("concurrencyLimit"));
    // Fixed: this line previously logged numberOfRows under the "concurrencyLimit" label.
    logger.debug("concurrencyLimit: " + concurrencyLimit);

    // Rows per split; falls back to the whole table when there are fewer rows than concurrencyLimit.
    long pageCount = (long) numberOfRows / concurrencyLimit;
    long totalPageCountLimit = (pageCount == 0) ? (long) numberOfRows : pageCount;
    // For an empty table this is 0.0 / 0 = NaN, which casts to 0, so no splits are produced.
    int splitCount = (int) Math.ceil(numberOfRows / totalPageCountLimit);

    Set<Split> splits = new HashSet<>();
    for (int page = 0; page < splitCount; page++) {
        long offSet = page * totalPageCountLimit;
        // Every split must have a unique location if we wish to spill to avoid failures
        SpillLocation spillLocation = makeSpillLocation(request);
        // Create a new split (added to the splits set) whose single property maps the page size to this
        // page's offset (to be used later by the Record Handler).
        Map<String, String> map = new HashMap<>();
        map.put(Long.toString(totalPageCountLimit), Long.toString(offSet));
        splits.add(new Split(spillLocation, makeEncryptionKey(), map));
    }
    return new GetSplitsResponse(request.getCatalogName(), splits);
}
Also used : SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) HashMap(java.util.HashMap) TableResult(com.google.cloud.bigquery.TableResult) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) Job(com.google.cloud.bigquery.Job) Split(com.amazonaws.athena.connector.lambda.domain.Split) QueryJobConfiguration(com.google.cloud.bigquery.QueryJobConfiguration) JobId(com.google.cloud.bigquery.JobId) HashSet(java.util.HashSet)

Aggregations

SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation)32 Block (com.amazonaws.athena.connector.lambda.data.Block)23 Split (com.amazonaws.athena.connector.lambda.domain.Split)21 GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse)17 HashSet (java.util.HashSet)16 S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation)15 FieldReader (org.apache.arrow.vector.complex.reader.FieldReader)13 HashMap (java.util.HashMap)11 Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints)9 ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)9 ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest)9 RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse)9 Test (org.junit.Test)9 RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse)8 TableName (com.amazonaws.athena.connector.lambda.domain.TableName)7 Matchers.anyString (org.mockito.Matchers.anyString)7 EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet)5 EncryptionKey (com.amazonaws.athena.connector.lambda.security.EncryptionKey)5 InvocationOnMock (org.mockito.invocation.InvocationOnMock)4 PutObjectResult (com.amazonaws.services.s3.model.PutObjectResult)3