
Example 31 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class VerticaRecordHandler, method readWithConstraint:

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller            A BlockSpiller that should be used to write the row data associated with this Split.
 *                           The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest     Details of the read request, including:
 *                           1. The Split
 *                           2. The Catalog, Database, and Table the read request is for.
 *                           3. The filtering predicate (if any)
 *                           4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws IOException       Thrown if an error occurs while reading the row data.
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {
    logger.info("readWithConstraint: schema[{}] tableName[{}]", recordsRequest.getSchema(), recordsRequest.getTableName());
    Schema schemaName = recordsRequest.getSchema();
    Split split = recordsRequest.getSplit();
    String id = split.getProperty("query_id");
    String exportBucket = split.getProperty("exportBucket");
    String s3ObjectKey = split.getProperty("s3ObjectKey");
    if (!s3ObjectKey.isEmpty()) {
        // get column name and type from the Schema
        HashMap<String, Types.MinorType> mapOfNamesAndTypes = new HashMap<>();
        HashMap<String, Object> mapOfCols = new HashMap<>();
        for (Field field : schemaName.getFields()) {
            Types.MinorType minorTypeForArrowType = Types.getMinorTypeForArrowType(field.getType());
            mapOfNamesAndTypes.put(field.getName(), minorTypeForArrowType);
            mapOfCols.put(field.getName(), null);
        }
        // creating a RowContext class to hold the column name and value.
        final RowContext rowContext = new RowContext(id);
        // Generating the RowWriter and Extractor
        GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
        for (Field next : recordsRequest.getSchema().getFields()) {
            Extractor extractor = makeExtractor(next, mapOfNamesAndTypes, mapOfCols);
            builder.withExtractor(next.getName(), extractor);
        }
        GeneratedRowWriter rowWriter = builder.build();
        /*
         Using S3 Select to read the S3 Parquet file generated in the split
         */
        // Creating the read Request
        SelectObjectContentRequest request = generateBaseParquetRequest(exportBucket, s3ObjectKey);
        try (SelectObjectContentResult result = amazonS3.selectObjectContent(request)) {
            InputStream resultInputStream = result.getPayload().getRecordsInputStream();
            BufferedReader streamReader = new BufferedReader(new InputStreamReader(resultInputStream, StandardCharsets.UTF_8));
            String inputStr;
            while ((inputStr = streamReader.readLine()) != null) {
                HashMap<String, Object> map = new HashMap<>();
                // we are reading Parquet files but serializing the output as JSON, since the SDK provides a Parquet InputSerialization but only a JSON or CSV OutputSerialization
                ObjectMapper objectMapper = new ObjectMapper();
                map = objectMapper.readValue(inputStr, HashMap.class);
                rowContext.setNameValue(map);
                // Passing the RowContext to BlockWriter;
                spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, rowContext) ? 1 : 0);
            }
        } catch (Exception e) {
            throw new RuntimeException("Error in connecting to S3 and selecting the object content for object : " + s3ObjectKey, e);
        }
    }
}
Also used : Types(org.apache.arrow.vector.types.Types) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Schema(org.apache.arrow.vector.types.pojo.Schema) IOException(java.io.IOException) Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) BufferedReader(java.io.BufferedReader) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)
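
The helper generateBaseParquetRequest referenced above is not shown in this snippet. As a rough sketch of what such a request builder could look like with the AWS SDK for Java v1 S3 Select classes (the SELECT expression and method placement are assumptions, not the connector's actual code):

// Hypothetical sketch, not the connector's real helper: builds an S3 Select request
// that reads a Parquet object and returns matching records as JSON lines, since S3
// Select only supports JSON or CSV output. Assumes com.amazonaws.services.s3.model.* imports.
private static SelectObjectContentRequest generateBaseParquetRequest(String exportBucket, String s3ObjectKey)
{
    SelectObjectContentRequest request = new SelectObjectContentRequest();
    request.setBucketName(exportBucket);
    request.setKey(s3ObjectKey);
    request.setExpression("SELECT * FROM S3Object");
    request.setExpressionType(ExpressionType.SQL);

    InputSerialization inputSerialization = new InputSerialization();
    // Parquet files carry their own columnar compression, so no CompressionType is set here.
    inputSerialization.setParquet(new ParquetInput());
    request.setInputSerialization(inputSerialization);

    OutputSerialization outputSerialization = new OutputSerialization();
    outputSerialization.setJson(new JSONOutput());
    request.setOutputSerialization(outputSerialization);
    return request;
}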

Example 32 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class TimestreamRecordHandler, method readWithConstraint:

/**
 * Scans TimeStream.
 *
 * @see RecordHandler
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) {
    TableName tableName = recordsRequest.getTableName();
    SelectQueryBuilder queryBuilder = queryFactory.createSelectQueryBuilder(GlueMetadataHandler.VIEW_METADATA_FIELD);
    String query = queryBuilder.withDatabaseName(tableName.getSchemaName()).withTableName(tableName.getTableName()).withProjection(recordsRequest.getSchema()).withConjucts(recordsRequest.getConstraints()).build();
    logger.info("readWithConstraint: query[{}]", query);
    GeneratedRowWriter rowWriter = buildRowWriter(recordsRequest);
    String nextToken = null;
    long numRows = 0;
    do {
        QueryResult queryResult = tsQuery.query(new QueryRequest().withQueryString(query).withNextToken(nextToken));
        List<Row> data = queryResult.getRows();
        if (data != null) {
            numRows += data.size();
            for (Row nextRow : data) {
                spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, nextRow) ? 1 : 0);
            }
        }
        nextToken = queryResult.getNextToken();
        logger.info("readWithConstraint: numRows[{}]", numRows);
    } while (nextToken != null && !nextToken.isEmpty());
}
Also used : TableName(com.amazonaws.athena.connector.lambda.domain.TableName) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) QueryResult(com.amazonaws.services.timestreamquery.model.QueryResult) QueryRequest(com.amazonaws.services.timestreamquery.model.QueryRequest) SelectQueryBuilder(com.amazonaws.athena.connectors.timestream.query.SelectQueryBuilder) Block(com.amazonaws.athena.connector.lambda.data.Block) Row(com.amazonaws.services.timestreamquery.model.Row)
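
The buildRowWriter(recordsRequest) call above is not expanded in this snippet. A minimal sketch of how such a writer could be assembled with GeneratedRowWriter, treating every projected column as a VARCHAR pulled positionally from the Timestream Row, follows; the real handler supports more Arrow types, so the type handling and method name here are assumptions.

// Illustrative sketch only, not the real TimestreamRecordHandler logic: reads every
// column as a VARCHAR by position from the Timestream Row passed as the row context.
// Assumes VarCharExtractor and NullableVarCharHolder from the SDK's writers package.
private GeneratedRowWriter buildRowWriterSketch(ReadRecordsRequest recordsRequest)
{
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    List<Field> fields = recordsRequest.getSchema().getFields();
    for (int i = 0; i < fields.size(); i++) {
        final int columnIndex = i;
        builder.withExtractor(fields.get(i).getName(),
                (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
                    Datum datum = ((Row) context).getData().get(columnIndex);
                    value.value = datum.getScalarValue();
                    value.isSet = (value.value != null) ? 1 : 0;
                });
    }
    return builder.build();
}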

Example 33 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class S3ObjectsTableProvider, method toRow:

/**
 * Maps an S3 ObjectSummary into a row in our Apache Arrow response block(s).
 *
 * @param objectSummary The S3 ObjectSummary to map.
 * @param spiller The BlockSpiller to use when we want to write a matching row to the response.
 * @note The current implementation is rather naive in how it maps fields. It leverages a static
 * list of fields that we'd like to provide and then explicitly filters and converts each field.
 */
private void toRow(S3ObjectSummary objectSummary, BlockSpiller spiller) {
    spiller.writeRows((Block block, int row) -> {
        boolean matched = true;
        matched &= block.offerValue("bucket_name", row, objectSummary.getBucketName());
        matched &= block.offerValue("e_tag", row, objectSummary.getETag());
        matched &= block.offerValue("key", row, objectSummary.getKey());
        matched &= block.offerValue("bytes", row, objectSummary.getSize());
        matched &= block.offerValue("storage_class", row, objectSummary.getStorageClass());
        matched &= block.offerValue("last_modified", row, objectSummary.getLastModified());
        Owner owner = objectSummary.getOwner();
        if (owner != null) {
            matched &= block.offerValue("owner_name", row, owner.getDisplayName());
            matched &= block.offerValue("owner_id", row, owner.getId());
        }
        return matched ? 1 : 0;
    });
}
Also used : Owner(com.amazonaws.services.s3.model.Owner) Block(com.amazonaws.athena.connector.lambda.data.Block)
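
For context, one plausible way a toRow method like this could be driven is by paging through a bucket listing and mapping each summary; the method name, bucket handling, and pagination below are illustrative assumptions rather than the provider's actual code.

// Hypothetical driver, not the provider's real listing code: pages through a bucket
// with ListObjectsV2 and maps each object summary into the response block.
private void readObjects(AmazonS3 amazonS3, String bucket, BlockSpiller spiller, QueryStatusChecker queryStatusChecker)
{
    ListObjectsV2Request listRequest = new ListObjectsV2Request().withBucketName(bucket);
    ListObjectsV2Result listResult;
    do {
        listResult = amazonS3.listObjectsV2(listRequest);
        for (S3ObjectSummary objectSummary : listResult.getObjectSummaries()) {
            toRow(objectSummary, spiller);
        }
        listRequest.setContinuationToken(listResult.getNextContinuationToken());
    } while (listResult.isTruncated() && queryStatusChecker.isQueryRunning());
}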

Example 34 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class AbstractTableProviderTest, method validateRead:

protected void validateRead(Schema schema, S3BlockSpillReader reader, List<SpillLocation> locations, EncryptionKey encryptionKey) {
    int blockNum = 0;
    int rowNum = 0;
    for (SpillLocation next : locations) {
        S3SpillLocation spillLocation = (S3SpillLocation) next;
        try (Block block = reader.read(spillLocation, encryptionKey, schema)) {
            logger.info("validateRead: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
            for (int i = 0; i < block.getRowCount(); i++) {
                logger.info("validateRead: {}", BlockUtils.rowToString(block, i));
                rowNum++;
                validateRow(block, i);
            }
        } catch (Exception ex) {
            throw new RuntimeException(ex);
        }
    }
    assertEquals(getExpectedRows(), rowNum);
}
Also used : SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) Block(com.amazonaws.athena.connector.lambda.data.Block)
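
The validateRow call above is implemented elsewhere in the test. A minimal sketch of what a per-row check could look like, using the block's FieldReader API, is shown below; the column name and assertions are placeholders, not the test's real expectations.

// Placeholder sketch only: inspects one column of the given row through the block's
// FieldReader. The "id" column and the assertions are assumptions for illustration.
protected void validateRow(Block block, int rowNum)
{
    FieldReader reader = block.getFieldReader("id");
    reader.setPosition(rowNum);
    assertTrue("row " + rowNum + " has no id value", reader.isSet());
    assertNotNull(reader.readObject());
}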

Example 35 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class ImpalaMetadataHandler, method getPartitions:

/**
 * Used to get the Impala partitions that must be read from the requested table in order to satisfy the requested predicate.
 *
 * @param blockWriter Used to write rows (Impala partitions) into the Apache Arrow response.
 * @param getTableLayoutRequest Provides details of the catalog, database, and table being queried as well as any filter predicate.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws SQLException A SQLException should be thrown for database connection failures, query syntax errors, and so on.
 */
@Override
public void getPartitions(BlockWriter blockWriter, GetTableLayoutRequest getTableLayoutRequest, QueryStatusChecker queryStatusChecker) throws SQLException {
    LOGGER.info("{}: Schema {}, table {}", getTableLayoutRequest.getQueryId(), getTableLayoutRequest.getTableName().getSchemaName(), getTableLayoutRequest.getTableName().getTableName());
    try (Connection connection = getJdbcConnectionFactory().getConnection(getCredentialProvider());
        Statement stmt = connection.createStatement();
        PreparedStatement psmt = connection.prepareStatement(GET_METADATA_QUERY + getTableLayoutRequest.getTableName().getTableName().toUpperCase())) {
        Map<String, String> columnHashMap = getMetadataForGivenTable(psmt);
        String tableType = columnHashMap.get("TableType");
        if (tableType == null) {
            ResultSet partitionRs = stmt.executeQuery("show files in " + getTableLayoutRequest.getTableName().getTableName().toUpperCase());
            Set<String> partition = new HashSet<>();
            while (partitionRs != null && partitionRs.next()) {
                String partitionString = partitionRs.getString("Partition");
                if (partitionString != null && !partitionString.isEmpty()) {
                    partition.add(partitionString);
                }
            }
            LOGGER.debug("isTablePartitioned:" + !partition.isEmpty());
            if (!partition.isEmpty()) {
                addPartitions(partition, columnHashMap, blockWriter);
            } else {
                blockWriter.writeRows((Block block, int rowNum) -> {
                    block.setValue(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, rowNum, ImpalaConstants.ALL_PARTITIONS);
                    return 1;
                });
            }
        } else {
            blockWriter.writeRows((Block block, int rowNum) -> {
                block.setValue(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, rowNum, ImpalaConstants.ALL_PARTITIONS);
                return 1;
            });
        }
    }
}
Also used : PreparedStatement(java.sql.PreparedStatement) Statement(java.sql.Statement) Connection(java.sql.Connection) ResultSet(java.sql.ResultSet) Block(com.amazonaws.athena.connector.lambda.data.Block) PreparedStatement(java.sql.PreparedStatement) HashSet(java.util.HashSet)
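
The addPartitions helper used above is not shown. One way such a helper could emit the discovered partition strings is sketched below; writing only the raw partition value into the partition column is an assumption about the real implementation, which may also use the column metadata map.

// Hypothetical sketch, not the real ImpalaMetadataHandler helper: writes one row per
// discovered partition string into the partitions block. Ignoring columnHashMap here
// is a simplification made for illustration.
private void addPartitions(Set<String> partitions, Map<String, String> columnHashMap, BlockWriter blockWriter)
{
    for (String partitionValue : partitions) {
        blockWriter.writeRows((Block block, int rowNum) -> {
            block.setValue(ImpalaConstants.BLOCK_PARTITION_COLUMN_NAME, rowNum, partitionValue);
            return 1;
        });
    }
}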

Aggregations

Block (com.amazonaws.athena.connector.lambda.data.Block): 113
Test (org.junit.Test): 39
HashMap (java.util.HashMap): 35
Schema (org.apache.arrow.vector.types.pojo.Schema): 35
Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints): 32
Split (com.amazonaws.athena.connector.lambda.domain.Split): 31
GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse): 28
FieldReader (org.apache.arrow.vector.complex.reader.FieldReader): 28
TableName (com.amazonaws.athena.connector.lambda.domain.TableName): 27
SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation): 23
HashSet (java.util.HashSet): 23
ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet): 20
Field (org.apache.arrow.vector.types.pojo.Field): 17
GetSplitsRequest (com.amazonaws.athena.connector.lambda.metadata.GetSplitsRequest): 13
PreparedStatement (java.sql.PreparedStatement): 13
ResultSet (java.sql.ResultSet): 13
ArrayList (java.util.ArrayList): 13
MetadataResponse (com.amazonaws.athena.connector.lambda.metadata.MetadataResponse): 12
Connection (java.sql.Connection): 12
ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest): 11