Example 11 with SpillLocation

Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in the aws-athena-query-federation project by awslabs.

From the class ImpalaMetadataHandler, method doGetSplits:

/**
 * Used to split-up the reads required to scan the requested batch of partition(s).
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param getSplitsRequest Provides details of the Impala catalog, database, table, and partition(s) being queried as well as
 * any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 * 1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
 * 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
    LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(), getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
    int partitionContd = decodeContinuationToken(getSplitsRequest);
    Set<Split> splits = new HashSet<>();
    Block partitions = getSplitsRequest.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader locationReader = partitions.getFieldReader(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME);
        locationReader.setPosition(curPartition);
        SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
        splits.add(splitBuilder.build());
        if (splits.size() >= ImpalaConstants.MAX_SPLITS_PER_REQUEST) {
            return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
Also used: SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation), GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse), Block (com.amazonaws.athena.connector.lambda.data.Block), Split (com.amazonaws.athena.connector.lambda.domain.Split), FieldReader (org.apache.arrow.vector.complex.reader.FieldReader), HashSet (java.util.HashSet)
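
Note: the helpers decodeContinuationToken and encodeContinuationToken referenced above are not shown in this snippet. A minimal sketch of the pattern these handlers typically follow (illustrative only, not quoted from the connector source):

private int decodeContinuationToken(GetSplitsRequest request) {
    if (request.hasContinuationToken()) {
        // Resume from the partition after the one encoded in the previous response.
        return Integer.parseInt(request.getContinuationToken()) + 1;
    }
    // No token means this is the first GetSplits call; start at partition 0.
    return 0;
}

private String encodeContinuationToken(int partition) {
    return String.valueOf(partition);
}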

Example 12 with SpillLocation

Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in the aws-athena-query-federation project by awslabs.

From the class HiveMetadataHandler, method doGetSplits:

/**
 * Used to split-up the reads required to scan the requested batch of partition(s).
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param getSplitsRequest Provides details of the Hive catalog, database, table, and partition(s) being queried as well as
 * any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 * 1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
 * 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
    LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(), getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
    int partitionContd = decodeContinuationToken(getSplitsRequest);
    Set<Split> splits = new HashSet<>();
    Block partitions = getSplitsRequest.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader locationReader = partitions.getFieldReader(HiveConstants.BLOCK_PARTITION_COLUMN_NAME);
        locationReader.setPosition(curPartition);
        SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(HiveConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
        splits.add(splitBuilder.build());
        if (splits.size() >= HiveConstants.MAX_SPLITS_PER_REQUEST) {
            return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
Also used: SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation), GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse), Block (com.amazonaws.athena.connector.lambda.data.Block), Split (com.amazonaws.athena.connector.lambda.domain.Split), FieldReader (org.apache.arrow.vector.complex.reader.FieldReader), HashSet (java.util.HashSet)
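
Note: makeSpillLocation, inherited from MetadataHandler, is what gives each Split a unique spill target. A sketch of what the base-class implementation amounts to, assuming a spill bucket and key prefix have been configured (illustrative, not copied from this page):

protected SpillLocation makeSpillLocation(MetadataRequest request) {
    // A unique S3 key per split so concurrent spills never collide.
    return S3SpillLocation.newBuilder()
            .withBucket(spillBucket)   // configured spill bucket
            .withPrefix(spillPrefix)   // configured key prefix
            .withQueryId(request.getQueryId())
            .withSplitId(UUID.randomUUID().toString())
            .build();
}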

Example 13 with SpillLocation

Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in the aws-athena-query-federation project by awslabs.

From the class CloudwatchMetadataHandler, method doGetSplits:

/**
 * Each partition is converted into a single Split which means we will potentially read all LogStreams required for
 * the query in parallel.
 *
 * @see MetadataHandler
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
    int partitionContd = decodeContinuationToken(request);
    Set<Split> splits = new HashSet<>();
    Block partitions = request.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader logStreamReader = partitions.getFieldReader(LOG_STREAM_FIELD);
        logStreamReader.setPosition(curPartition);
        FieldReader logGroupReader = partitions.getFieldReader(LOG_GROUP_FIELD);
        logGroupReader.setPosition(curPartition);
        FieldReader sizeReader = partitions.getFieldReader(LOG_STREAM_SIZE_FIELD);
        sizeReader.setPosition(curPartition);
        // Every split must have a unique location if we wish to spill to avoid failures
        SpillLocation spillLocation = makeSpillLocation(request);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(CloudwatchMetadataHandler.LOG_GROUP_FIELD, String.valueOf(logGroupReader.readText()))
                .add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, String.valueOf(logStreamReader.readText()))
                .add(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD, String.valueOf(sizeReader.readLong()));
        splits.add(splitBuilder.build());
        if (splits.size() >= MAX_SPLITS_PER_REQUEST) {
            // We have hit MAX_SPLITS_PER_REQUEST; return the splits gathered so far with a continuation token.
            return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(request.getCatalogName(), splits, null);
}
Also used: SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation), GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse), Block (com.amazonaws.athena.connector.lambda.data.Block), Split (com.amazonaws.athena.connector.lambda.domain.Split), FieldReader (org.apache.arrow.vector.complex.reader.FieldReader), HashSet (java.util.HashSet)
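
Note: the properties packed into each Split above are recovered on the read side. A hypothetical record-handler excerpt (not shown on this page) illustrating how they would be read back:

// Inside the record handler, given the Split for this invocation:
String logGroup = split.getProperty(CloudwatchMetadataHandler.LOG_GROUP_FIELD);
String logStream = split.getProperty(CloudwatchMetadataHandler.LOG_STREAM_FIELD);
long streamSize = Long.parseLong(split.getProperty(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD));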

Example 14 with SpillLocation

Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in the aws-athena-query-federation project by awslabs.

From the class DocDBRecordHandlerTest, method doReadRecordsSpill:

@Test
public void doReadRecordsSpill() throws Exception {
    List<Document> documents = new ArrayList<>();
    for (int docNum = 0; docNum < 20_000; docNum++) {
        documents.add(DocumentGenerator.makeRandomRow(schemaForRead.getFields(), docNum));
    }
    when(mockCollection.find(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
        logger.info("doReadRecordsNoSpill: query[{}]", invocationOnMock.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.projection(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
        logger.info("doReadRecordsNoSpill: projection[{}]", invocationOnMock.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.batchSize(anyInt())).thenReturn(mockIterable);
    when(mockIterable.iterator()).thenReturn(new StubbingCursor(documents.iterator()));
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("col3", SortedRangeSet.copyOf(Types.MinorType.FLOAT8.getType(), ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.FLOAT8.getType(), -10000D)), false));
    S3SpillLocation splitLoc = S3SpillLocation.newBuilder()
            .withBucket(UUID.randomUUID().toString())
            .withSplitId(UUID.randomUUID().toString())
            .withQueryId(UUID.randomUUID().toString())
            .withIsDirectory(true)
            .build();
    ReadRecordsRequest request = new ReadRecordsRequest(IDENTITY, DEFAULT_CATALOG,
            "queryId-" + System.currentTimeMillis(), TABLE_NAME, schemaForRead,
            Split.newBuilder(splitLoc, keyFactory.create()).add(DOCDB_CONN_STR, CONNECTION_STRING).build(),
            new Constraints(constraintsMap),
            // ~1.5MB so we should see some spill
            1_500_000L, 0L);
    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
        assertTrue(response.getNumberBlocks() > 1);
        int blockNum = 0;
        for (SpillLocation next : response.getRemoteBlocks()) {
            S3SpillLocation spillLocation = (S3SpillLocation) next;
            try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
                logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
                // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                assertNotNull(BlockUtils.rowToString(block, 0));
            }
        }
    }
}
Also used: RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse), SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation), S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), Matchers.anyString (org.mockito.Matchers.anyString), RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse), Document (org.bson.Document), ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest), Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints), InvocationOnMock (org.mockito.invocation.InvocationOnMock), Block (com.amazonaws.athena.connector.lambda.data.Block), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), Test (org.junit.Test)
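
Note: the spillReader used in this test is not shown on this page. It is typically an S3BlockSpillReader wired to an Amazon S3 client and the test's block allocator; a minimal setup sketch under that assumption:

// amazonS3 may be a mocked client in unit tests; allocator is the test's BlockAllocator.
S3BlockSpillReader spillReader = new S3BlockSpillReader(amazonS3, allocator);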

Example 15 with SpillLocation

Use of com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation in the aws-athena-query-federation project by awslabs.

From the class CloudwatchRecordHandlerTest, method doReadRecordsSpill:

@Test
public void doReadRecordsSpill() throws Exception {
    logger.info("doReadRecordsSpill: enter");
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("time", SortedRangeSet.of(Range.range(allocator, Types.MinorType.BIGINT.getType(), 100L, true, 100_000_000L, true)));
    ReadRecordsRequest request = new ReadRecordsRequest(identity, "catalog",
            "queryId-" + System.currentTimeMillis(), new TableName("schema", "table"), schemaForRead,
            Split.newBuilder(S3SpillLocation.newBuilder()
                            .withBucket(UUID.randomUUID().toString())
                            .withSplitId(UUID.randomUUID().toString())
                            .withQueryId(UUID.randomUUID().toString())
                            .withIsDirectory(true)
                            .build(),
                    keyFactory.create())
                    .add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, "table")
                    .build(),
            new Constraints(constraintsMap),
            // ~1.5MB so we should see some spill
            1_500_000L, 0);
    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
        assertTrue(response.getNumberBlocks() > 1);
        int blockNum = 0;
        for (SpillLocation next : response.getRemoteBlocks()) {
            S3SpillLocation spillLocation = (S3SpillLocation) next;
            try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
                logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
                // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                assertNotNull(BlockUtils.rowToString(block, 0));
            }
        }
    }
    logger.info("doReadRecordsSpill: exit");
}
Also used: RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse), SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation), S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation), HashMap (java.util.HashMap), Matchers.anyString (org.mockito.Matchers.anyString), RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse), TableName (com.amazonaws.athena.connector.lambda.domain.TableName), ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest), Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints), Block (com.amazonaws.athena.connector.lambda.data.Block), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), Test (org.junit.Test)
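
Note: in both spill tests it is the two trailing long arguments of ReadRecordsRequest that force the spill path. An annotated restatement of the call shape above (identifiers abbreviated for readability; the parameter roles are an interpretation, not quoted from the SDK docs):

new ReadRecordsRequest(identity, catalog, queryId, tableName, schema, split, constraints,
        1_500_000L, // max bytes per response block (~1.5MB), so the result spans many blocks
        0L);        // max bytes returned inline; 0 forces every block to spill to S3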

Aggregations

SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation): 32
Block (com.amazonaws.athena.connector.lambda.data.Block): 23
Split (com.amazonaws.athena.connector.lambda.domain.Split): 21
GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse): 17
HashSet (java.util.HashSet): 16
S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation): 15
FieldReader (org.apache.arrow.vector.complex.reader.FieldReader): 13
HashMap (java.util.HashMap): 11
Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints): 9
ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet): 9
ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest): 9
RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse): 9
Test (org.junit.Test): 9
RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse): 8
TableName (com.amazonaws.athena.connector.lambda.domain.TableName): 7
Matchers.anyString (org.mockito.Matchers.anyString): 7
EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet): 5
EncryptionKey (com.amazonaws.athena.connector.lambda.security.EncryptionKey): 5
InvocationOnMock (org.mockito.invocation.InvocationOnMock): 4
PutObjectResult (com.amazonaws.services.s3.model.PutObjectResult): 3