
Example 1 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class JdbcRecordHandler method readWithConstraint.

@Override
public void readWithConstraint(BlockSpiller blockSpiller, ReadRecordsRequest readRecordsRequest, QueryStatusChecker queryStatusChecker) {
    LOGGER.info("{}: Catalog: {}, table {}, splits {}", readRecordsRequest.getQueryId(), readRecordsRequest.getCatalogName(), readRecordsRequest.getTableName(), readRecordsRequest.getSplit().getProperties());
    try (Connection connection = this.jdbcConnectionFactory.getConnection(getCredentialProvider())) {
        // Disable auto-commit; some database types require this to enable result-set streaming.
        connection.setAutoCommit(false);
        try (PreparedStatement preparedStatement = buildSplitSql(connection, readRecordsRequest.getCatalogName(), readRecordsRequest.getTableName(), readRecordsRequest.getSchema(), readRecordsRequest.getConstraints(), readRecordsRequest.getSplit());
            ResultSet resultSet = preparedStatement.executeQuery()) {
            Map<String, String> partitionValues = readRecordsRequest.getSplit().getProperties();
            GeneratedRowWriter.RowWriterBuilder rowWriterBuilder = GeneratedRowWriter.newBuilder(readRecordsRequest.getConstraints());
            for (Field next : readRecordsRequest.getSchema().getFields()) {
                if (next.getType() instanceof ArrowType.List) {
                    rowWriterBuilder.withFieldWriterFactory(next.getName(), makeFactory(next));
                } else {
                    rowWriterBuilder.withExtractor(next.getName(), makeExtractor(next, resultSet, partitionValues));
                }
            }
            GeneratedRowWriter rowWriter = rowWriterBuilder.build();
            int rowsReturnedFromDatabase = 0;
            while (resultSet.next()) {
                if (!queryStatusChecker.isQueryRunning()) {
                    return;
                }
                blockSpiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, resultSet) ? 1 : 0);
                rowsReturnedFromDatabase++;
            }
            LOGGER.info("{} rows returned by database.", rowsReturnedFromDatabase);
            connection.commit();
        }
    } catch (SQLException sqlException) {
        throw new RuntimeException(sqlException.getErrorCode() + ": " + sqlException.getMessage(), sqlException);
    }
}
Also used : SQLException(java.sql.SQLException) Connection(java.sql.Connection) PreparedStatement(java.sql.PreparedStatement) Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) ResultSet(java.sql.ResultSet) Block(com.amazonaws.athena.connector.lambda.data.Block) List(java.util.List) ArrayList(java.util.ArrayList)
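
The makeExtractor and makeFactory helpers referenced above are not shown. As a rough, hypothetical sketch (not the connector's actual implementation), an extractor for a VARCHAR column might capture the ResultSet and fall back to the Split's properties for partition columns:

private Extractor makeVarCharExtractor(Field field, ResultSet resultSet, Map<String, String> partitionValues)
{
    String fieldName = field.getName();
    if (partitionValues.containsKey(fieldName)) {
        // Partition columns are not present in the ResultSet; serve them from the Split's properties.
        return (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
            dst.isSet = 1;
            dst.value = partitionValues.get(fieldName);
        };
    }
    // Non-partition columns are read from the current ResultSet row; wasNull() drives the null flag.
    return (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
        dst.value = resultSet.getString(fieldName);
        dst.isSet = resultSet.wasNull() ? 0 : 1;
    };
}

GeneratedRowWriter invokes the registered extractor once per projected column for each writeRows callback, so keeping extractors small and allocation-free is what makes this pattern efficient.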

Example 2 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class VerticaRecordHandler method readWithConstraint.

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller            A BlockSpiller that should be used to write the row data associated with this Split.
 *                           The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest     Details of the read request, including:
 *                           1. The Split
 *                           2. The Catalog, Database, and Table the read request is for.
 *                           3. The filtering predicate (if any)
 *                           4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws IOException       if an error occurs while reading the exported data from S3
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {
    logger.info("readWithConstraint: schema[{}] tableName[{}]", recordsRequest.getSchema(), recordsRequest.getTableName());
    Schema schemaName = recordsRequest.getSchema();
    Split split = recordsRequest.getSplit();
    String id = split.getProperty("query_id");
    String exportBucket = split.getProperty("exportBucket");
    String s3ObjectKey = split.getProperty("s3ObjectKey");
    if (!s3ObjectKey.isEmpty()) {
        // get column name and type from the Schema
        HashMap<String, Types.MinorType> mapOfNamesAndTypes = new HashMap<>();
        HashMap<String, Object> mapOfCols = new HashMap<>();
        for (Field field : schemaName.getFields()) {
            Types.MinorType minorTypeForArrowType = Types.getMinorTypeForArrowType(field.getType());
            mapOfNamesAndTypes.put(field.getName(), minorTypeForArrowType);
            mapOfCols.put(field.getName(), null);
        }
        // creating a RowContext class to hold the column name and value.
        final RowContext rowContext = new RowContext(id);
        // Generating the RowWriter and Extractor
        GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
        for (Field next : recordsRequest.getSchema().getFields()) {
            Extractor extractor = makeExtractor(next, mapOfNamesAndTypes, mapOfCols);
            builder.withExtractor(next.getName(), extractor);
        }
        GeneratedRowWriter rowWriter = builder.build();
        /*
         Using S3 Select to read the S3 Parquet file generated in the split
         */
        // Creating the read Request
        SelectObjectContentRequest request = generateBaseParquetRequest(exportBucket, s3ObjectKey);
        try (SelectObjectContentResult result = amazonS3.selectObjectContent(request)) {
            InputStream resultInputStream = result.getPayload().getRecordsInputStream();
            BufferedReader streamReader = new BufferedReader(new InputStreamReader(resultInputStream, StandardCharsets.UTF_8));
            String inputStr;
            while ((inputStr = streamReader.readLine()) != null) {
                HashMap<String, Object> map = new HashMap<>();
                // We read the Parquet files but serialize the output as JSON: the SDK provides a Parquet InputSerialization but only JSON or CSV OutputSerialization.
                ObjectMapper objectMapper = new ObjectMapper();
                map = objectMapper.readValue(inputStr, HashMap.class);
                rowContext.setNameValue(map);
                // Pass the RowContext to the row writer via the BlockSpiller.
                spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, rowContext) ? 1 : 0);
            }
        } catch (Exception e) {
            throw new RuntimeException("Error in connecting to S3 and selecting the object content for object : " + s3ObjectKey, e);
        }
    }
}
Also used : Types(org.apache.arrow.vector.types.Types) InputStreamReader(java.io.InputStreamReader) HashMap(java.util.HashMap) InputStream(java.io.InputStream) Schema(org.apache.arrow.vector.types.pojo.Schema) IOException(java.io.IOException) Field(org.apache.arrow.vector.types.pojo.Field) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) BufferedReader(java.io.BufferedReader) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper)
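
The generateBaseParquetRequest helper is elided above. A minimal sketch of what it might look like with the AWS SDK for Java v1 S3 Select API (Parquet in, JSON lines out, so each readLine() above yields one record). The SELECT expression and helper name here are illustrative assumptions, not necessarily what the connector uses:

private SelectObjectContentRequest generateBaseParquetRequest(String bucket, String key)
{
    SelectObjectContentRequest request = new SelectObjectContentRequest();
    request.setBucketName(bucket);
    request.setKey(key);
    request.setExpressionType(ExpressionType.SQL);
    // Illustrative expression: read every column of the exported object.
    request.setExpression("SELECT * FROM S3Object");

    // Input is the Parquet file exported for this split...
    InputSerialization inputSerialization = new InputSerialization();
    inputSerialization.setParquet(new ParquetInput());
    request.setInputSerialization(inputSerialization);

    // ...and S3 Select can only emit JSON or CSV, hence the JSON parsing in the read loop above.
    OutputSerialization outputSerialization = new OutputSerialization();
    outputSerialization.setJson(new JSONOutput());
    request.setOutputSerialization(outputSerialization);
    return request;
}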

Example 3 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class TimestreamRecordHandler method buildRowWriter.

private GeneratedRowWriter buildRowWriter(ReadRecordsRequest request) {
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(request.getConstraints());
    int fieldNum = 0;
    for (Field nextField : request.getSchema().getFields()) {
        int curFieldNum = fieldNum++;
        switch(Types.getMinorTypeForArrowType(nextField.getType())) {
            case VARCHAR:
                builder.withExtractor(nextField.getName(), (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
                    value.isSet = 1;
                    value.value = ((Row) context).getData().get(curFieldNum).getScalarValue();
                });
                break;
            case FLOAT8:
                builder.withExtractor(nextField.getName(), (Float8Extractor) (Object context, NullableFloat8Holder value) -> {
                    value.isSet = 1;
                    value.value = Double.valueOf(((Row) context).getData().get(curFieldNum).getScalarValue());
                });
                break;
            case BIT:
                builder.withExtractor(nextField.getName(), (BitExtractor) (Object context, NullableBitHolder value) -> {
                    value.isSet = 1;
                    value.value = Boolean.valueOf(((Row) context).getData().get(curFieldNum).getScalarValue()) == false ? 0 : 1;
                });
                break;
            case BIGINT:
                builder.withExtractor(nextField.getName(), (BigIntExtractor) (Object context, NullableBigIntHolder value) -> {
                    value.isSet = 1;
                    value.value = Long.valueOf(((Row) context).getData().get(curFieldNum).getScalarValue());
                });
                break;
            case DATEMILLI:
                builder.withExtractor(nextField.getName(), (DateMilliExtractor) (Object context, NullableDateMilliHolder value) -> {
                    value.isSet = 1;
                    value.value = TIMESTAMP_FORMATTER.parse(((Row) context).getData().get(curFieldNum).getScalarValue()).getTime();
                });
                break;
            case LIST:
                // TODO: This presently only supports TimeSeries results but it is possible that customers may
                // generate LIST type results for other reasons when using VIEWs. For now this seems like an OK
                // compromise since it enables an important capability of TimeStream even if it doesn't enable arbitrary
                // complex types.
                buildTimeSeriesExtractor(builder, nextField, curFieldNum);
                break;
            default:
                throw new RuntimeException("Unsupported field type[" + nextField.getType() + "] for field[" + nextField.getName() + "]");
        }
    }
    return builder.build();
}
Also used : NullableBitHolder(org.apache.arrow.vector.holders.NullableBitHolder) Field(org.apache.arrow.vector.types.pojo.Field) NullableVarCharHolder(com.amazonaws.athena.connector.lambda.data.writers.holders.NullableVarCharHolder) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) NullableDateMilliHolder(org.apache.arrow.vector.holders.NullableDateMilliHolder) NullableFloat8Holder(org.apache.arrow.vector.holders.NullableFloat8Holder) Row(com.amazonaws.services.timestreamquery.model.Row) TimeSeriesDataPoint(com.amazonaws.services.timestreamquery.model.TimeSeriesDataPoint) NullableBigIntHolder(org.apache.arrow.vector.holders.NullableBigIntHolder)
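
The extractors above set isSet = 1 unconditionally, so a null scalar value would cause the extractor to fail rather than produce a SQL NULL. A hypothetical null-aware variant (not taken from the connector) for the FLOAT8 case, with curFieldNum coming from the enclosing loop and "measure_value" as an illustrative field name:

builder.withExtractor("measure_value", (Float8Extractor) (Object context, NullableFloat8Holder value) -> {
    String raw = ((Row) context).getData().get(curFieldNum).getScalarValue();
    if (raw == null) {
        // Leaving isSet at 0 makes GeneratedRowWriter write a null cell into the Block.
        value.isSet = 0;
        return;
    }
    value.isSet = 1;
    value.value = Double.parseDouble(raw);
});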

Example 4 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class TimestreamRecordHandler method readWithConstraint.

/**
 * Scans TimeStream.
 *
 * @see RecordHandler
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) {
    TableName tableName = recordsRequest.getTableName();
    SelectQueryBuilder queryBuilder = queryFactory.createSelectQueryBuilder(GlueMetadataHandler.VIEW_METADATA_FIELD);
    String query = queryBuilder.withDatabaseName(tableName.getSchemaName()).withTableName(tableName.getTableName()).withProjection(recordsRequest.getSchema()).withConjucts(recordsRequest.getConstraints()).build();
    logger.info("readWithConstraint: query[{}]", query);
    GeneratedRowWriter rowWriter = buildRowWriter(recordsRequest);
    String nextToken = null;
    long numRows = 0;
    do {
        QueryResult queryResult = tsQuery.query(new QueryRequest().withQueryString(query).withNextToken(nextToken));
        List<Row> data = queryResult.getRows();
        if (data != null) {
            numRows += data.size();
            for (Row nextRow : data) {
                spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, nextRow) ? 1 : 0);
            }
        }
        nextToken = queryResult.getNextToken();
        logger.info("readWithConstraint: numRows[{}]", numRows);
    } while (nextToken != null && !nextToken.isEmpty());
}
Also used : TableName(com.amazonaws.athena.connector.lambda.domain.TableName) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) QueryResult(com.amazonaws.services.timestreamquery.model.QueryResult) QueryRequest(com.amazonaws.services.timestreamquery.model.QueryRequest) SelectQueryBuilder(com.amazonaws.athena.connectors.timestream.query.SelectQueryBuilder) Block(com.amazonaws.athena.connector.lambda.data.Block) Row(com.amazonaws.services.timestreamquery.model.Row)
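
Unlike the Elasticsearch handler below, this loop does not consult queryStatusChecker between pages. A hypothetical variant of the pagination loop that stops fetching once the query has terminated could look like the following (same API calls, only the loop condition changes):

String nextToken = null;
do {
    QueryResult queryResult = tsQuery.query(new QueryRequest().withQueryString(query).withNextToken(nextToken));
    List<Row> data = queryResult.getRows();
    if (data != null) {
        for (Row nextRow : data) {
            spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, nextRow) ? 1 : 0);
        }
    }
    nextToken = queryResult.getNextToken();
// Stop paging as soon as the query is reported as no longer running.
} while (nextToken != null && !nextToken.isEmpty() && queryStatusChecker.isQueryRunning());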

Example 5 with GeneratedRowWriter

use of com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter in project aws-athena-query-federation by awslabs.

the class ElasticsearchRecordHandler method readWithConstraint.

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
 * The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest Details of the read request, including:
 * 1. The Split
 * 2. The Catalog, Database, and Table the read request is for.
 * 3. The filtering predicate (if any)
 * 4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws RuntimeException when an error occurs while attempting to send the query, or the query timed out.
 * @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
 * ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws RuntimeException {
    logger.info("readWithConstraint - enter - Domain: {}, Index: {}, Mapping: {}", recordsRequest.getTableName().getSchemaName(), recordsRequest.getTableName().getTableName(), recordsRequest.getSchema());
    String domain = recordsRequest.getTableName().getSchemaName();
    String endpoint = recordsRequest.getSplit().getProperty(domain);
    String index = recordsRequest.getTableName().getTableName();
    String shard = recordsRequest.getSplit().getProperty(ElasticsearchMetadataHandler.SHARD_KEY);
    long numRows = 0;
    if (queryStatusChecker.isQueryRunning()) {
        AwsRestHighLevelClient client = clientFactory.getOrCreateClient(endpoint);
        try {
            // Create field extractors for all data types in the schema.
            GeneratedRowWriter rowWriter = createFieldExtractors(recordsRequest);
            // Create a new search-source injected with the projection, predicate, and the pagination batch size.
            SearchSourceBuilder searchSource = new SearchSourceBuilder().size(QUERY_BATCH_SIZE).timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS)).fetchSource(ElasticsearchQueryUtils.getProjection(recordsRequest.getSchema())).query(ElasticsearchQueryUtils.getQuery(recordsRequest.getConstraints().getSummary()));
            // Create a new search-request for the specified index.
            SearchRequest searchRequest = new SearchRequest(index).preference(shard);
            int hitsNum;
            int currPosition = 0;
            do {
                // Process the search request injecting the search-source, and setting the from position
                // used for pagination of results.
                SearchResponse searchResponse = client.getDocuments(searchRequest.source(searchSource.from(currPosition)));
                // Throw on query timeout.
                if (searchResponse.isTimedOut()) {
                    throw new RuntimeException("Request for index (" + index + ") " + shard + " timed out.");
                }
                // Increment current position to next batch of results.
                currPosition += QUERY_BATCH_SIZE;
                // Process hits.
                Iterator<SearchHit> hitIterator = searchResponse.getHits().iterator();
                hitsNum = searchResponse.getHits().getHits().length;
                while (hitIterator.hasNext() && queryStatusChecker.isQueryRunning()) {
                    ++numRows;
                    spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, client.getDocument(hitIterator.next())) ? 1 : 0);
                }
            // if hitsNum < QUERY_BATCH_SIZE, then this is the last batch of documents.
            } while (hitsNum == QUERY_BATCH_SIZE && queryStatusChecker.isQueryRunning());
        } catch (IOException error) {
            throw new RuntimeException("Error sending search query: " + error.getMessage(), error);
        }
    }
    logger.info("readWithConstraint: numRows[{}]", numRows);
}
Also used : SearchRequest(org.elasticsearch.action.search.SearchRequest) SearchHit(org.elasticsearch.search.SearchHit) IOException(java.io.IOException) SearchSourceBuilder(org.elasticsearch.search.builder.SearchSourceBuilder) SearchResponse(org.elasticsearch.action.search.SearchResponse) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) Block(com.amazonaws.athena.connector.lambda.data.Block) TimeValue(org.elasticsearch.common.unit.TimeValue)
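
The createFieldExtractors helper is elided above. Assuming client.getDocument(hit) returns the document's source as a Map<String, Object> (an assumption based on how the context is used in the spiller callback), one registered extractor might be sketched as:

// Hypothetical extractor for a VARCHAR field named "status"; the field name is illustrative.
builder.withExtractor("status", (VarCharExtractor) (Object context, NullableVarCharHolder dst) -> {
    Object fieldValue = ((Map<String, Object>) context).get("status");
    if (fieldValue == null) {
        // Missing or null field -> null cell in the Block.
        dst.isSet = 0;
        return;
    }
    dst.isSet = 1;
    dst.value = fieldValue.toString();
});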

Aggregations

GeneratedRowWriter (com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) 10
Block (com.amazonaws.athena.connector.lambda.data.Block) 8
Field (org.apache.arrow.vector.types.pojo.Field) 6
BigIntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.BigIntExtractor) 2
BitExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.BitExtractor) 2
DateDayExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.DateDayExtractor) 2
DateMilliExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.DateMilliExtractor) 2
DecimalExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.DecimalExtractor) 2
Extractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.Extractor) 2
Float4Extractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.Float4Extractor) 2
Float8Extractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.Float8Extractor) 2
IntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.IntExtractor) 2
SmallIntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.SmallIntExtractor) 2
TinyIntExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.TinyIntExtractor) 2
VarBinaryExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.VarBinaryExtractor) 2
VarCharExtractor (com.amazonaws.athena.connector.lambda.data.writers.extractors.VarCharExtractor) 2
Split (com.amazonaws.athena.connector.lambda.domain.Split) 2
Row (com.amazonaws.services.timestreamquery.model.Row) 2
BufferedReader (java.io.BufferedReader) 2
IOException (java.io.IOException) 2