Example 36 with Block

Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

In the class ImpalaMetadataHandler, the method doGetSplits:

/**
 * Used to split up the reads required to scan the requested batch of partition(s).
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param getSplitsRequest Provides details of the Impala catalog, database, table, and partition(s) being queried as well as
 * any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 * 1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
 * 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
    LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(), getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
    int partitionContd = decodeContinuationToken(getSplitsRequest);
    Set<Split> splits = new HashSet<>();
    Block partitions = getSplitsRequest.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader locationReader = partitions.getFieldReader(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME);
        locationReader.setPosition(curPartition);
        SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
        splits.add(splitBuilder.build());
        if (splits.size() >= ImpalaConstants.MAX_SPLITS_PER_REQUEST) {
            return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
Also used: SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) FieldReader(org.apache.arrow.vector.complex.reader.FieldReader) HashSet(java.util.HashSet)
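These doGetSplits implementations all call two helpers that the snippets do not show. A minimal sketch of what they typically look like, assuming the continuation token is nothing more than the stringified index of the next unread partition (the real connectors keep these private, and the exact encoding may differ):

private int decodeContinuationToken(GetSplitsRequest request) {
    if (request.hasContinuationToken()) {
        // Resume the partition loop from the index carried in the token.
        return Integer.parseInt(request.getContinuationToken());
    }
    // No token means this is the first page: start from partition 0.
    return 0;
}

private String encodeContinuationToken(int partition) {
    // Point the next request at the partition after the one just emitted.
    return String.valueOf(partition + 1);
}

With this pairing, each GetSplitsResponse resumes exactly where the previous one stopped, and the null token on the final response tells Athena that split generation is complete.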

Example 37 with Block

Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

In the class HiveMetadataHandler, the method doGetSplits:

/**
 * Used to split up the reads required to scan the requested batch of partition(s).
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param getSplitsRequest Provides details of the Hive catalog, database, table, and partition(s) being queried as well as
 * any filter predicate.
 * @return A GetSplitsResponse which primarily contains:
 * 1. A Set of Splits which represent read operations Amazon Athena must perform by calling your read function.
 * 2. (Optional) A continuation token which allows you to paginate the generation of splits for large queries.
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator blockAllocator, GetSplitsRequest getSplitsRequest) {
    LOGGER.info("{}: Catalog {}, table {}", getSplitsRequest.getQueryId(), getSplitsRequest.getTableName().getSchemaName(), getSplitsRequest.getTableName().getTableName());
    int partitionContd = decodeContinuationToken(getSplitsRequest);
    Set<Split> splits = new HashSet<>();
    Block partitions = getSplitsRequest.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader locationReader = partitions.getFieldReader(HiveConstants.BLOCK_PARTITION_COLUMN_NAME);
        locationReader.setPosition(curPartition);
        SpillLocation spillLocation = makeSpillLocation(getSplitsRequest);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(HiveConstants.BLOCK_PARTITION_COLUMN_NAME, String.valueOf(locationReader.readText()));
        splits.add(splitBuilder.build());
        if (splits.size() >= HiveConstants.MAX_SPLITS_PER_REQUEST) {
            return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(getSplitsRequest.getCatalogName(), splits, null);
}
Also used: SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) FieldReader(org.apache.arrow.vector.complex.reader.FieldReader) HashSet(java.util.HashSet)
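On the read side, the record handler pulls the same property back off each Split to scope its scan to one partition. A hedged fragment of that consuming side (the variable names and the SQL shape are illustrative only; the actual Hive and Impala record handlers build their queries through the shared JDBC machinery):

// Hypothetical record-handler fragment: recover the partition value that
// doGetSplits stored above and restrict the scan to that partition.
String partition = split.getProperty(HiveConstants.BLOCK_PARTITION_COLUMN_NAME);
String sql = "SELECT * FROM " + tableName + " WHERE " + partition; // illustrative shape only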

Example 38 with Block

Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

In the class CloudwatchMetadataHandler, the method doGetSplits:

/**
 * Each partition is converted into a single Split, which means we will potentially read all LogStreams required for
 * the query in parallel.
 *
 * @see MetadataHandler
 */
@Override
public GetSplitsResponse doGetSplits(BlockAllocator allocator, GetSplitsRequest request) {
    int partitionContd = decodeContinuationToken(request);
    Set<Split> splits = new HashSet<>();
    Block partitions = request.getPartitions();
    for (int curPartition = partitionContd; curPartition < partitions.getRowCount(); curPartition++) {
        FieldReader logStreamReader = partitions.getFieldReader(LOG_STREAM_FIELD);
        logStreamReader.setPosition(curPartition);
        FieldReader logGroupReader = partitions.getFieldReader(LOG_GROUP_FIELD);
        logGroupReader.setPosition(curPartition);
        FieldReader sizeReader = partitions.getFieldReader(LOG_STREAM_SIZE_FIELD);
        sizeReader.setPosition(curPartition);
        // Every split must have a unique location if we wish to spill to avoid failures
        SpillLocation spillLocation = makeSpillLocation(request);
        Split.Builder splitBuilder = Split.newBuilder(spillLocation, makeEncryptionKey())
                .add(CloudwatchMetadataHandler.LOG_GROUP_FIELD, String.valueOf(logGroupReader.readText()))
                .add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, String.valueOf(logStreamReader.readText()))
                .add(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD, String.valueOf(sizeReader.readLong()));
        splits.add(splitBuilder.build());
        if (splits.size() >= MAX_SPLITS_PER_REQUEST) {
            // We've reached the per-request split cap; return this batch along with a continuation token.
            return new GetSplitsResponse(request.getCatalogName(), splits, encodeContinuationToken(curPartition));
        }
    }
    return new GetSplitsResponse(request.getCatalogName(), splits, null);
}
Also used: SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) FieldReader(org.apache.arrow.vector.complex.reader.FieldReader) HashSet(java.util.HashSet)
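Each Split above carries everything the record handler needs to read exactly one LogStream, which is why Athena can fan the reads out in parallel. A sketch of the consuming side, assuming the same AWS SDK v1 CloudWatch Logs client (awsLogs) used elsewhere in this connector; GetLogEventsRequest and GetLogEventsResult come from com.amazonaws.services.logs.model, and the surrounding method is omitted:

// Hypothetical record-handler fragment: read the stream this split points at.
GetLogEventsRequest logEventsRequest = new GetLogEventsRequest()
        .withLogGroupName(split.getProperty(CloudwatchMetadataHandler.LOG_GROUP_FIELD))
        .withLogStreamName(split.getProperty(CloudwatchMetadataHandler.LOG_STREAM_FIELD))
        .withStartFromHead(true);
GetLogEventsResult logEvents = awsLogs.getLogEvents(logEventsRequest);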

Example 39 with Block

Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

In the class CloudwatchMetadataHandler, the method getPartitions:

/**
 * Gets the list of LogStreams that need to be scanned to satisfy the requested table. In most cases this will be just
 * one LogStream, resulting in just one partition. If, however, the request is for the special ALL_LOG_STREAMS view,
 * then every LogStream in the requested LogGroup (schema) is queried and turned into a partition 1:1.
 *
 * @note This method applies partition pruning based on the log_stream field.
 * @see MetadataHandler
 */
@Override
public void getPartitions(BlockWriter blockWriter, GetTableLayoutRequest request, QueryStatusChecker queryStatusChecker) throws Exception {
    CloudwatchTableName cwTableName = tableResolver.validateTable(request.getTableName());
    DescribeLogStreamsRequest cwRequest = new DescribeLogStreamsRequest(cwTableName.getLogGroupName());
    if (!ALL_LOG_STREAMS_TABLE.equals(cwTableName.getLogStreamName())) {
        cwRequest.setLogStreamNamePrefix(cwTableName.getLogStreamName());
    }
    DescribeLogStreamsResult result;
    do {
        result = invoker.invoke(() -> awsLogs.describeLogStreams(cwRequest));
        for (LogStream next : result.getLogStreams()) {
            // Each log stream that survives partition pruning is added to the partition list.
            blockWriter.writeRows((Block block, int rowNum) -> {
                boolean matched = block.setValue(LOG_GROUP_FIELD, rowNum, cwRequest.getLogGroupName());
                matched &= block.setValue(LOG_STREAM_FIELD, rowNum, next.getLogStreamName());
                matched &= block.setValue(LOG_STREAM_SIZE_FIELD, rowNum, next.getStoredBytes());
                return matched ? 1 : 0;
            });
        }
        cwRequest.setNextToken(result.getNextToken());
    } while (result.getNextToken() != null && queryStatusChecker.isQueryRunning());
}
Also used: Block(com.amazonaws.athena.connector.lambda.data.Block) DescribeLogStreamsRequest(com.amazonaws.services.logs.model.DescribeLogStreamsRequest) LogStream(com.amazonaws.services.logs.model.LogStream) DescribeLogStreamsResult(com.amazonaws.services.logs.model.DescribeLogStreamsResult)
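The pruning mentioned in the @note happens inside block.setValue: it returns false when a value violates the query's constraints, so a row writer that returns 0 drops that partition. A stripped-down sketch of the same pattern for a hypothetical table with one partition column (the field name and static value list are illustrative; java.util.List and java.util.Arrays are assumed imported):

@Override
public void getPartitions(BlockWriter blockWriter, GetTableLayoutRequest request, QueryStatusChecker queryStatusChecker) throws Exception {
    // Hypothetical static partition source, standing in for a metadata lookup.
    List<String> partitionDates = Arrays.asList("2024-01-01", "2024-01-02");
    for (String date : partitionDates) {
        if (!queryStatusChecker.isQueryRunning()) {
            // Stop early if Athena has abandoned the query.
            return;
        }
        blockWriter.writeRows((Block block, int rowNum) -> {
            // setValue returns false when `date` fails the constraints on
            // partition_date; writing 0 rows prunes the partition.
            boolean matched = block.setValue("partition_date", rowNum, date);
            return matched ? 1 : 0;
        });
    }
}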

Example 40 with Block

Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

In the class CloudwatchMetadataHandlerTest, the method doGetSplits:

@Test
public void doGetSplits() {
    logger.info("doGetSplits: enter");
    Schema schema = SchemaBuilder.newBuilder()
            .addField(CloudwatchMetadataHandler.LOG_STREAM_FIELD, new ArrowType.Utf8())
            .addField(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD, new ArrowType.Int(64, true))
            .addField(CloudwatchMetadataHandler.LOG_GROUP_FIELD, new ArrowType.Utf8())
            .build();
    Block partitions = allocator.createBlock(schema);
    int num_partitions = 2_000;
    for (int i = 0; i < num_partitions; i++) {
        BlockUtils.setValue(partitions.getFieldVector(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD), i, 2016L + i);
        BlockUtils.setValue(partitions.getFieldVector(CloudwatchMetadataHandler.LOG_STREAM_FIELD), i, "log_stream_" + i);
        BlockUtils.setValue(partitions.getFieldVector(CloudwatchMetadataHandler.LOG_GROUP_FIELD), i, "log_group_" + i);
    }
    partitions.setRowCount(num_partitions);
    String continuationToken = null;
    GetSplitsRequest originalReq = new GetSplitsRequest(identity, "queryId", "catalog_name",
            new TableName("schema", "all_log_streams"), partitions,
            Collections.singletonList(CloudwatchMetadataHandler.LOG_STREAM_FIELD),
            new Constraints(new HashMap<>()), continuationToken);
    int numContinuations = 0;
    do {
        GetSplitsRequest req = new GetSplitsRequest(originalReq, continuationToken);
        logger.info("doGetSplits: req[{}]", req);
        MetadataResponse rawResponse = handler.doGetSplits(allocator, req);
        assertEquals(MetadataRequestType.GET_SPLITS, rawResponse.getRequestType());
        GetSplitsResponse response = (GetSplitsResponse) rawResponse;
        continuationToken = response.getContinuationToken();
        logger.info("doGetSplits: continuationToken[{}] - numSplits[{}]", continuationToken, response.getSplits().size());
        for (Split nextSplit : response.getSplits()) {
            assertNotNull(nextSplit.getProperty(CloudwatchMetadataHandler.LOG_STREAM_SIZE_FIELD));
            assertNotNull(nextSplit.getProperty(CloudwatchMetadataHandler.LOG_STREAM_FIELD));
            assertNotNull(nextSplit.getProperty(CloudwatchMetadataHandler.LOG_GROUP_FIELD));
        }
        if (continuationToken != null) {
            numContinuations++;
        }
    } while (continuationToken != null);
    assertTrue(numContinuations > 0);
    logger.info("doGetSplits: exit");
}
Also used: GetSplitsRequest(com.amazonaws.athena.connector.lambda.metadata.GetSplitsRequest) HashMap(java.util.HashMap) Schema(org.apache.arrow.vector.types.pojo.Schema) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) GetSplitsResponse(com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse) MetadataResponse(com.amazonaws.athena.connector.lambda.metadata.MetadataResponse) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) Test(org.junit.Test)
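The assertion numContinuations > 0 only proves that pagination happened at least once. Because the doGetSplits implementations above return a token every time the split cap is reached, even when the cap lands on the final partition, a caller sees num_partitions / MAX_SPLITS_PER_REQUEST token-bearing pages when the cap divides the partition count evenly, followed by one final empty page with a null token. A slightly stronger assertion under that assumption (the real cap is connector-specific; 1,000 is only a placeholder):

// Hedged sketch: 2,000 partitions with an assumed cap of 1,000 splits per
// response yields exactly 2 continuations before the final, empty page.
int assumedMaxSplitsPerRequest = 1_000;
assertEquals(num_partitions / assumedMaxSplitsPerRequest, numContinuations);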

Aggregations

Block (com.amazonaws.athena.connector.lambda.data.Block): 113
Test (org.junit.Test): 39
HashMap (java.util.HashMap): 35
Schema (org.apache.arrow.vector.types.pojo.Schema): 35
Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints): 32
Split (com.amazonaws.athena.connector.lambda.domain.Split): 31
GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse): 28
FieldReader (org.apache.arrow.vector.complex.reader.FieldReader): 28
TableName (com.amazonaws.athena.connector.lambda.domain.TableName): 27
SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation): 23
HashSet (java.util.HashSet): 23
ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet): 20
Field (org.apache.arrow.vector.types.pojo.Field): 17
GetSplitsRequest (com.amazonaws.athena.connector.lambda.metadata.GetSplitsRequest): 13
PreparedStatement (java.sql.PreparedStatement): 13
ResultSet (java.sql.ResultSet): 13
ArrayList (java.util.ArrayList): 13
MetadataResponse (com.amazonaws.athena.connector.lambda.metadata.MetadataResponse): 12
Connection (java.sql.Connection): 12
ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest): 11