Search in sources :

Example 1 with GeneratedRowWriter

use of GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class JdbcRecordHandler method readWithConstraint.

public void readWithConstraint(BlockSpiller blockSpiller, ReadRecordsRequest readRecordsRequest, QueryStatusChecker queryStatusChecker) {"{}: Catalog: {}, table {}, splits {}", readRecordsRequest.getQueryId(), readRecordsRequest.getCatalogName(), readRecordsRequest.getTableName(), readRecordsRequest.getSplit().getProperties());
    try (Connection connection = this.jdbcConnectionFactory.getConnection(getCredentialProvider())) {
        // For consistency. This is needed to be false to enable streaming for some database types.
        try (PreparedStatement preparedStatement = buildSplitSql(connection, readRecordsRequest.getCatalogName(), readRecordsRequest.getTableName(), readRecordsRequest.getSchema(), readRecordsRequest.getConstraints(), readRecordsRequest.getSplit());
            ResultSet resultSet = preparedStatement.executeQuery()) {
            Map<String, String> partitionValues = readRecordsRequest.getSplit().getProperties();
            GeneratedRowWriter.RowWriterBuilder rowWriterBuilder = GeneratedRowWriter.newBuilder(readRecordsRequest.getConstraints());
            for (Field next : readRecordsRequest.getSchema().getFields()) {
                if (next.getType() instanceof ArrowType.List) {
                    rowWriterBuilder.withFieldWriterFactory(next.getName(), makeFactory(next));
                } else {
                    rowWriterBuilder.withExtractor(next.getName(), makeExtractor(next, resultSet, partitionValues));
            GeneratedRowWriter rowWriter =;
            int rowsReturnedFromDatabase = 0;
            while ( {
                if (!queryStatusChecker.isQueryRunning()) {
                blockSpiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, resultSet) ? 1 : 0);
  "{} rows returned by database.", rowsReturnedFromDatabase);
    } catch (SQLException sqlException) {
        throw new RuntimeException(sqlException.getErrorCode() + ": " + sqlException.getMessage(), sqlException);
Also used : SQLException(java.sql.SQLException) Connection(java.sql.Connection) PreparedStatement(java.sql.PreparedStatement) Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) ResultSet(java.sql.ResultSet) Block(com.amazonaws.athena.connector.lambda.data.Block) List(java.util.List) ArrayList(java.util.ArrayList)

Example 2 with GeneratedRowWriter

use of GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class VerticaRecordHandler method readWithConstraint.

/**
 * Used to read the row data associated with the provided Split.
 * @param spiller            A BlockSpiller that should be used to write the row data associated with this Split.
 *                           The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest     Details of the read request, including:
 *                           1. The Split
 *                           2. The Catalog, Database, and Table the read request is for.
 *                           3. The filtering predicate (if any)
 *                           4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws IOException       Throws an IOException
 */
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {"readWithConstraint: schema[{}] tableName[{}]", recordsRequest.getSchema(), recordsRequest.getTableName());
    Schema schemaName = recordsRequest.getSchema();
    Split split = recordsRequest.getSplit();
    String id = split.getProperty("query_id");
    String exportBucket = split.getProperty("exportBucket");
    String s3ObjectKey = split.getProperty("s3ObjectKey");
    if (!s3ObjectKey.isEmpty()) {
        // get column name and type from the Schema
        HashMap<String, Types.MinorType> mapOfNamesAndTypes = new HashMap<>();
        HashMap<String, Object> mapOfCols = new HashMap<>();
        for (Field field : schemaName.getFields()) {
            Types.MinorType minorTypeForArrowType = Types.getMinorTypeForArrowType(field.getType());
            mapOfNamesAndTypes.put(field.getName(), minorTypeForArrowType);
            mapOfCols.put(field.getName(), null);
        // creating a RowContext class to hold the column name and value.
        final RowContext rowContext = new RowContext(id);
        // Generating the RowWriter and Extractor
        GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
        for (Field next : recordsRequest.getSchema().getFields()) {
            Extractor extractor = makeExtractor(next, mapOfNamesAndTypes, mapOfCols);
            builder.withExtractor(next.getName(), extractor);
        GeneratedRowWriter rowWriter =;
         Using S3 Select to read the S3 Parquet file generated in the split
        // Creating the read Request
        SelectObjectContentRequest request = generateBaseParquetRequest(exportBucket, s3ObjectKey);
        try (SelectObjectContentResult result = amazonS3.selectObjectContent(request)) {
            InputStream resultInputStream = result.getPayload().getRecordsInputStream();
            BufferedReader streamReader = new BufferedReader(new InputStreamReader(resultInputStream, StandardCharsets.UTF_8));
            String inputStr;
            while ((inputStr = streamReader.readLine()) != null) {
                HashMap<String, Object> map = new HashMap<>();
                // we are reading the parquet files, but serializing the output it as JSON as SDK provides a Parquet InputSerialization, but only a JSON or CSV OutputSerializatio
                ObjectMapper objectMapper = new ObjectMapper();
                map = objectMapper.readValue(inputStr, HashMap.class);
                // Passing the RowContext to BlockWriter;
                spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, rowContext) ? 1 : 0);
        } catch (Exception e) {
            throw new RuntimeException("Error in connecting to S3 and selecting the object content for object : " + s3ObjectKey, e);
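Also used : Types(org.apache.arrow.vector.types.Types) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Schema(org.apache.arrow.vector.types.pojo.Schema) IOException(java.io.IOException) Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) BufferedReader(java.io.BufferedReader) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)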

Example 3 with GeneratedRowWriter

use of GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class TimestreamRecordHandler method buildRowWriter.

private GeneratedRowWriter buildRowWriter(ReadRecordsRequest request) {
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(request.getConstraints());
    int fieldNum = 0;
    for (Field nextField : request.getSchema().getFields()) {
        int curFieldNum = fieldNum++;
        switch(Types.getMinorTypeForArrowType(nextField.getType())) {
            case VARCHAR:
                builder.withExtractor(nextField.getName(), (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
                    value.isSet = 1;
                    value.value = ((Row) context).getData().get(curFieldNum).getScalarValue();
            case FLOAT8:
                builder.withExtractor(nextField.getName(), (Float8Extractor) (Object context, NullableFloat8Holder value) -> {
                    value.isSet = 1;
                    value.value = Double.valueOf(((Row) context).getData().get(curFieldNum).getScalarValue());
            case BIT:
                builder.withExtractor(nextField.getName(), (BitExtractor) (Object context, NullableBitHolder value) -> {
                    value.isSet = 1;
                    value.value = Boolean.valueOf(((Row) context).getData().get(curFieldNum).getScalarValue()) == false ? 0 : 1;
            case BIGINT:
                builder.withExtractor(nextField.getName(), (BigIntExtractor) (Object context, NullableBigIntHolder value) -> {
                    value.isSet = 1;
                    value.value = Long.valueOf(((Row) context).getData().get(curFieldNum).getScalarValue());
            case DATEMILLI:
                builder.withExtractor(nextField.getName(), (DateMilliExtractor) (Object context, NullableDateMilliHolder value) -> {
                    value.isSet = 1;
                    value.value = TIMESTAMP_FORMATTER.parse(((Row) context).getData().get(curFieldNum).getScalarValue()).getTime();
            case LIST:
                // TODO: This presently only supports TimeSeries results but it is possible that customers may
                // generate LIST type results for other reasons when using VIEWs. For now this seems like an OK
                // compromise since it enables an important capability of TimeStream even if it doesn't enable arbitrary
                // complex types.
                buildTimeSeriesExtractor(builder, nextField, curFieldNum);
                throw new RuntimeException("Unsupported field type[" + nextField.getType() + "] for field[" + nextField.getName() + "]");
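Also used : NullableBitHolder(org.apache.arrow.vector.holders.NullableBitHolder) Field(org.apache.arrow.vector.types.pojo.Field) NullableVarCharHolder(org.apache.arrow.vector.holders.NullableVarCharHolder) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) NullableDateMilliHolder(org.apache.arrow.vector.holders.NullableDateMilliHolder) NullableFloat8Holder(org.apache.arrow.vector.holders.NullableFloat8Holder) Row(com.amazonaws.services.timestreamquery.model.Row) TimeSeriesDataPoint(com.amazonaws.services.timestreamquery.model.TimeSeriesDataPoint) NullableBigIntHolder(org.apache.arrow.vector.holders.NullableBigIntHolder)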

Example 4 with GeneratedRowWriter

use of GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class TimestreamRecordHandler method readWithConstraint.

/**
 * Scans TimeStream.
 * @see RecordHandler
 */
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) {
    TableName tableName = recordsRequest.getTableName();
    SelectQueryBuilder queryBuilder = queryFactory.createSelectQueryBuilder(GlueMetadataHandler.VIEW_METADATA_FIELD);
    String query = queryBuilder.withDatabaseName(tableName.getSchemaName()).withTableName(tableName.getTableName()).withProjection(recordsRequest.getSchema()).withConjucts(recordsRequest.getConstraints()).build();"readWithConstraint: query[{}]", query);
    GeneratedRowWriter rowWriter = buildRowWriter(recordsRequest);
    String nextToken = null;
    long numRows = 0;
    do {
        QueryResult queryResult = tsQuery.query(new QueryRequest().withQueryString(query).withNextToken(nextToken));
        List<Row> data = queryResult.getRows();
        if (data != null) {
            numRows += data.size();
            for (Row nextRow : data) {
                spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, nextRow) ? 1 : 0);
        nextToken = queryResult.getNextToken();"readWithConstraint: numRows[{}]", numRows);
    } while (nextToken != null && !nextToken.isEmpty());
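Also used : TableName(com.amazonaws.athena.connector.lambda.domain.TableName) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) QueryResult(com.amazonaws.services.timestreamquery.model.QueryResult) QueryRequest(com.amazonaws.services.timestreamquery.model.QueryRequest) SelectQueryBuilder(com.amazonaws.athena.connectors.timestream.query.SelectQueryBuilder) Block(com.amazonaws.athena.connector.lambda.data.Block) Row(com.amazonaws.services.timestreamquery.model.Row)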

Example 5 with GeneratedRowWriter

use of GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class ElasticsearchRecordHandler method readWithConstraint.

/**
 * Used to read the row data associated with the provided Split.
 * @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
 * The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest Details of the read request, including:
 * 1. The Split
 * 2. The Catalog, Database, and Table the read request is for.
 * 3. The filtering predicate (if any)
 * 4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws RuntimeException when an error occurs while attempting to send the query, or the query timed out.
 * @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
 * ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
 */
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws RuntimeException {"readWithConstraint - enter - Domain: {}, Index: {}, Mapping: {}", recordsRequest.getTableName().getSchemaName(), recordsRequest.getTableName().getTableName(), recordsRequest.getSchema());
    String domain = recordsRequest.getTableName().getSchemaName();
    String endpoint = recordsRequest.getSplit().getProperty(domain);
    String index = recordsRequest.getTableName().getTableName();
    String shard = recordsRequest.getSplit().getProperty(ElasticsearchMetadataHandler.SHARD_KEY);
    long numRows = 0;
    if (queryStatusChecker.isQueryRunning()) {
        AwsRestHighLevelClient client = clientFactory.getOrCreateClient(endpoint);
        try {
            // Create field extractors for all data types in the schema.
            GeneratedRowWriter rowWriter = createFieldExtractors(recordsRequest);
            // Create a new search-source injected with the projection, predicate, and the pagination batch size.
            SearchSourceBuilder searchSource = new SearchSourceBuilder().size(QUERY_BATCH_SIZE).timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS)).fetchSource(ElasticsearchQueryUtils.getProjection(recordsRequest.getSchema())).query(ElasticsearchQueryUtils.getQuery(recordsRequest.getConstraints().getSummary()));
            // Create a new search-request for the specified index.
            SearchRequest searchRequest = new SearchRequest(index).preference(shard);
            int hitsNum;
            int currPosition = 0;
            do {
                // Process the search request injecting the search-source, and setting the from position
                // used for pagination of results.
                SearchResponse searchResponse = client.getDocuments(searchRequest.source(searchSource.from(currPosition)));
                // Throw on query timeout.
                if (searchResponse.isTimedOut()) {
                    throw new RuntimeException("Request for index (" + index + ") " + shard + " timed out.");
                // Increment current position to next batch of results.
                currPosition += QUERY_BATCH_SIZE;
                // Process hits.
                Iterator<SearchHit> hitIterator = searchResponse.getHits().iterator();
                hitsNum = searchResponse.getHits().getHits().length;
                while (hitIterator.hasNext() && queryStatusChecker.isQueryRunning()) {
                    spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, client.getDocument( ? 1 : 0);
            // if hitsNum < QUERY_BATCH_SIZE, then this is the last batch of documents.
            } while (hitsNum == QUERY_BATCH_SIZE && queryStatusChecker.isQueryRunning());
        } catch (IOException error) {
            throw new RuntimeException("Error sending search query: " + error.getMessage(), error);
    }"readWithConstraint: numRows[{}]", numRows);
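Also used : SearchRequest(org.elasticsearch.action.search.SearchRequest) SearchHit(org.elasticsearch.search.SearchHit) IOException(java.io.IOException) SearchSourceBuilder(org.elasticsearch.search.builder.SearchSourceBuilder) SearchResponse(org.elasticsearch.action.search.SearchResponse) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) Block(com.amazonaws.athena.connector.lambda.data.Block) TimeValue(org.elasticsearch.common.unit.TimeValue)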


GeneratedRowWriter ( Block ( Field (org.apache.arrow.vector.types.pojo.Field)6 BigIntExtractor ( BitExtractor ( DateDayExtractor ( DateMilliExtractor ( DecimalExtractor ( Extractor ( Float4Extractor ( Float8Extractor ( IntExtractor ( SmallIntExtractor ( TinyIntExtractor ( VarBinaryExtractor ( VarCharExtractor ( Split (com.amazonaws.athena.connector.lambda.domain.Split)2 Row ( BufferedReader ( IOException (