Example 46 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class MetricsRecordHandler, the method readMetricSamplesWithConstraint:

/**
 * Handles retrieving the samples for a specific metric from Cloudwatch Metrics.
 */
private void readMetricSamplesWithConstraint(BlockSpiller blockSpiller, ReadRecordsRequest request, QueryStatusChecker queryStatusChecker) throws TimeoutException {
    GetMetricDataRequest dataRequest = MetricUtils.makeGetMetricDataRequest(request);
    Map<String, MetricDataQuery> queries = new HashMap<>();
    for (MetricDataQuery query : dataRequest.getMetricDataQueries()) {
        queries.put(query.getId(), query);
    }
    String prevToken;
    ValueSet dimensionNameConstraint = request.getConstraints().getSummary().get(DIMENSION_NAME_FIELD);
    ValueSet dimensionValueConstraint = request.getConstraints().getSummary().get(DIMENSION_VALUE_FIELD);
    do {
        prevToken = dataRequest.getNextToken();
        GetMetricDataResult result = invoker.invoke(() -> metrics.getMetricData(dataRequest));
        for (MetricDataResult nextMetric : result.getMetricDataResults()) {
            MetricStat metricStat = queries.get(nextMetric.getId()).getMetricStat();
            List<Date> timestamps = nextMetric.getTimestamps();
            List<Double> values = nextMetric.getValues();
            for (int i = 0; i < nextMetric.getValues().size(); i++) {
                int sampleNum = i;
                blockSpiller.writeRows((Block block, int row) -> {
                    /**
                     * Most constraints were already applied at split generation so we only need to apply
                     * a subset.
                     */
                    block.offerValue(METRIC_NAME_FIELD, row, metricStat.getMetric().getMetricName());
                    block.offerValue(NAMESPACE_FIELD, row, metricStat.getMetric().getNamespace());
                    block.offerValue(STATISTIC_FIELD, row, metricStat.getStat());
                    block.offerComplexValue(DIMENSIONS_FIELD, row, (Field field, Object val) -> {
                        if (field.getName().equals(DIMENSION_NAME_FIELD)) {
                            return ((Dimension) val).getName();
                        } else if (field.getName().equals(DIMENSION_VALUE_FIELD)) {
                            return ((Dimension) val).getValue();
                        }
                        throw new RuntimeException("Unexpected field " + field.getName());
                    }, metricStat.getMetric().getDimensions());
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimName = (dimensionNameConstraint == null || !dimensionNameConstraint.isSingleValue()) ? null : dimensionNameConstraint.getSingleValue().toString();
                    block.offerValue(DIMENSION_NAME_FIELD, row, dimName);
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimVal = (dimensionValueConstraint == null || !dimensionValueConstraint.isSingleValue()) ? null : dimensionValueConstraint.getSingleValue().toString();
                    block.offerValue(DIMENSION_VALUE_FIELD, row, dimVal);
                    block.offerValue(PERIOD_FIELD, row, metricStat.getPeriod());
                    boolean matches = true;
                    block.offerValue(VALUE_FIELD, row, values.get(sampleNum));
                    long timestamp = timestamps.get(sampleNum).getTime() / 1000;
                    block.offerValue(TIMESTAMP_FIELD, row, timestamp);
                    return matches ? 1 : 0;
                });
            }
        }
        dataRequest.setNextToken(result.getNextToken());
    } while (dataRequest.getNextToken() != null && !dataRequest.getNextToken().equalsIgnoreCase(prevToken) && queryStatusChecker.isQueryRunning());
}
Also used : HashMap(java.util.HashMap) MetricStat(com.amazonaws.services.cloudwatch.model.MetricStat) GetMetricDataResult(com.amazonaws.services.cloudwatch.model.GetMetricDataResult) MetricDataResult(com.amazonaws.services.cloudwatch.model.MetricDataResult) Dimension(com.amazonaws.services.cloudwatch.model.Dimension) Date(java.util.Date) Field(org.apache.arrow.vector.types.pojo.Field) GetMetricDataRequest(com.amazonaws.services.cloudwatch.model.GetMetricDataRequest) Block(com.amazonaws.athena.connector.lambda.data.Block) MetricDataQuery(com.amazonaws.services.cloudwatch.model.MetricDataQuery) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)
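
The do/while loop above stops when CloudWatch returns no nextToken, when it repeats the previous token, or when the query has been cancelled; without the prevToken comparison, a repeated token would loop forever. Below is a minimal sketch of that pagination pattern in isolation, assuming a plain AmazonCloudWatch client and java.util.function callbacks in place of the handler's invoker and QueryStatusChecker. It is an illustration of the pattern, not the connector's actual code.

import com.amazonaws.services.cloudwatch.AmazonCloudWatch;
import com.amazonaws.services.cloudwatch.model.GetMetricDataRequest;
import com.amazonaws.services.cloudwatch.model.GetMetricDataResult;
import com.amazonaws.services.cloudwatch.model.MetricDataResult;
import java.util.function.BooleanSupplier;
import java.util.function.Consumer;

public final class MetricDataPager {
    private MetricDataPager() {}

    /**
     * Pages through GetMetricData results, stopping when there is no next token,
     * when CloudWatch repeats the previous token, or when the caller signals a stop.
     */
    public static void forEachResult(AmazonCloudWatch cloudWatch,
                                     GetMetricDataRequest dataRequest,
                                     Consumer<MetricDataResult> consumer,
                                     BooleanSupplier keepRunning) {
        String prevToken;
        do {
            prevToken = dataRequest.getNextToken();
            GetMetricDataResult result = cloudWatch.getMetricData(dataRequest);
            result.getMetricDataResults().forEach(consumer);
            dataRequest.setNextToken(result.getNextToken());
        } while (dataRequest.getNextToken() != null
                && !dataRequest.getNextToken().equalsIgnoreCase(prevToken)
                && keepRunning.getAsBoolean());
    }
}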

Example 47 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class MetricsRecordHandler, the method readMetricsWithConstraint:

/**
 * Handles retrieving the list of available metrics when the METRICS_TABLE is queried by listing metrics in Cloudwatch Metrics.
 */
private void readMetricsWithConstraint(BlockSpiller blockSpiller, ReadRecordsRequest request, QueryStatusChecker queryStatusChecker) throws TimeoutException {
    ListMetricsRequest listMetricsRequest = new ListMetricsRequest();
    MetricUtils.pushDownPredicate(request.getConstraints(), listMetricsRequest);
    String prevToken;
    Set<String> requiredFields = new HashSet<>();
    request.getSchema().getFields().stream().forEach(next -> requiredFields.add(next.getName()));
    ValueSet dimensionNameConstraint = request.getConstraints().getSummary().get(DIMENSION_NAME_FIELD);
    ValueSet dimensionValueConstraint = request.getConstraints().getSummary().get(DIMENSION_VALUE_FIELD);
    do {
        prevToken = listMetricsRequest.getNextToken();
        ListMetricsResult result = invoker.invoke(() -> metrics.listMetrics(listMetricsRequest));
        for (Metric nextMetric : result.getMetrics()) {
            blockSpiller.writeRows((Block block, int row) -> {
                boolean matches = MetricUtils.applyMetricConstraints(blockSpiller.getConstraintEvaluator(), nextMetric, null);
                if (matches) {
                    matches &= block.offerValue(METRIC_NAME_FIELD, row, nextMetric.getMetricName());
                    matches &= block.offerValue(NAMESPACE_FIELD, row, nextMetric.getNamespace());
                    matches &= block.offerComplexValue(STATISTIC_FIELD, row, DEFAULT, STATISTICS);
                    matches &= block.offerComplexValue(DIMENSIONS_FIELD, row, (Field field, Object val) -> {
                        if (field.getName().equals(DIMENSION_NAME_FIELD)) {
                            return ((Dimension) val).getName();
                        } else if (field.getName().equals(DIMENSION_VALUE_FIELD)) {
                            return ((Dimension) val).getValue();
                        }
                        throw new RuntimeException("Unexpected field " + field.getName());
                    }, nextMetric.getDimensions());
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimName = (dimensionNameConstraint == null || !dimensionNameConstraint.isSingleValue()) ? null : (dimensionNameConstraint.getSingleValue().toString());
                    matches &= block.offerValue(DIMENSION_NAME_FIELD, row, dimName);
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimValue = (dimensionValueConstraint == null || !dimensionValueConstraint.isSingleValue()) ? null : dimensionValueConstraint.getSingleValue().toString();
                    matches &= block.offerValue(DIMENSION_VALUE_FIELD, row, dimValue);
                }
                return matches ? 1 : 0;
            });
        }
        listMetricsRequest.setNextToken(result.getNextToken());
    } while (listMetricsRequest.getNextToken() != null && !listMetricsRequest.getNextToken().equalsIgnoreCase(prevToken) && queryStatusChecker.isQueryRunning());
}
Also used : ListMetricsResult(com.amazonaws.services.cloudwatch.model.ListMetricsResult) Dimension(com.amazonaws.services.cloudwatch.model.Dimension) Field(org.apache.arrow.vector.types.pojo.Field) Block(com.amazonaws.athena.connector.lambda.data.Block) ListMetricsRequest(com.amazonaws.services.cloudwatch.model.ListMetricsRequest) Metric(com.amazonaws.services.cloudwatch.model.Metric) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) HashSet(java.util.HashSet)
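
MetricUtils.pushDownPredicate above copies the parts of the query's constraints that CloudWatch can evaluate onto the ListMetricsRequest, so less data comes back to the Lambda. The sketch below shows that idea under stated assumptions: the keys "namespace" and "metric_name" are hypothetical stand-ins for the handler's NAMESPACE_FIELD and METRIC_NAME_FIELD constants, and only single-value constraints are handled; the SDK's real implementation may cover more cases.

import com.amazonaws.athena.connector.lambda.domain.predicate.Constraints;
import com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet;
import com.amazonaws.services.cloudwatch.model.ListMetricsRequest;

public final class ListMetricsPushdownSketch {
    private ListMetricsPushdownSketch() {}

    /** Copies single-value constraints onto the request so CloudWatch filters server-side. */
    public static void pushDown(Constraints constraints, ListMetricsRequest request) {
        // "namespace" and "metric_name" are hypothetical keys standing in for the
        // handler's NAMESPACE_FIELD and METRIC_NAME_FIELD constants.
        ValueSet namespace = constraints.getSummary().get("namespace");
        if (namespace != null && namespace.isSingleValue()) {
            request.setNamespace(namespace.getSingleValue().toString());
        }
        ValueSet metricName = constraints.getSummary().get("metric_name");
        if (metricName != null && metricName.isSingleValue()) {
            request.setMetricName(metricName.getSingleValue().toString());
        }
    }
}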

Example 48 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class DynamoDBRecordHandler, the method readWithConstraint:

/**
 * Reads data from DynamoDB by submitting either a Query or a Scan, depending
 * on the type of split, and includes any filters specified in the split.
 *
 * @see RecordHandler
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws ExecutionException {
    Split split = recordsRequest.getSplit();
    // use the property instead of the request table name because of case sensitivity
    String tableName = split.getProperty(TABLE_METADATA);
    invokerCache.get(tableName).setBlockSpiller(spiller);
    Iterator<Map<String, AttributeValue>> itemIterator = getIterator(split, tableName, recordsRequest.getSchema());
    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(recordsRequest.getSchema());
    DynamoDBFieldResolver resolver = new DynamoDBFieldResolver(recordMetadata);
    long numRows = 0;
    AtomicLong numResultRows = new AtomicLong(0);
    while (itemIterator.hasNext()) {
        if (!queryStatusChecker.isQueryRunning()) {
            // we can stop processing because the query waiting for this data has already terminated
            return;
        }
        numRows++;
        spiller.writeRows((Block block, int rowNum) -> {
            Map<String, AttributeValue> item = itemIterator.next();
            if (item == null) {
                // the itemIterator may not have made any DDB calls yet, and it may return zero items when it does
                return 0;
            }
            boolean matched = true;
            numResultRows.getAndIncrement();
            // TODO refactor to use GeneratedRowWriter to improve performance
            for (Field nextField : recordsRequest.getSchema().getFields()) {
                Object value = ItemUtils.toSimpleValue(item.get(nextField.getName()));
                Types.MinorType fieldType = Types.getMinorTypeForArrowType(nextField.getType());
                value = DDBTypeUtils.coerceValueToExpectedType(value, nextField, fieldType, recordMetadata);
                try {
                    switch(fieldType) {
                        case LIST:
                            // DDB may return Set so coerce to List. Also coerce each List item to the correct type.
                            List valueAsList = value != null ? DDBTypeUtils.coerceListToExpectedType(value, nextField, recordMetadata) : null;
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, resolver, valueAsList);
                            break;
                        case STRUCT:
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, resolver, value);
                            break;
                        default:
                            matched &= block.offerValue(nextField.getName(), rowNum, value);
                            break;
                    }
                    if (!matched) {
                        return 0;
                    }
                } catch (Exception ex) {
                    throw new RuntimeException("Error while processing field " + nextField.getName(), ex);
                }
            }
            return 1;
        });
    }
    logger.info("readWithConstraint: numRows[{}] numResultRows[{}]", numRows, numResultRows.get());
}
Also used : Types(org.apache.arrow.vector.types.Types) AttributeValue(com.amazonaws.services.dynamodbv2.model.AttributeValue) TimeoutException(java.util.concurrent.TimeoutException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) DynamoDBFieldResolver(com.amazonaws.athena.connectors.dynamodb.resolver.DynamoDBFieldResolver) Field(org.apache.arrow.vector.types.pojo.Field) AtomicLong(java.util.concurrent.atomic.AtomicLong) Block(com.amazonaws.athena.connector.lambda.data.Block) List(java.util.List) Split(com.amazonaws.athena.connector.lambda.domain.Split) Map(java.util.Map) HashMap(java.util.HashMap) DDBRecordMetadata(com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata)
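
The LIST branch above notes that DynamoDB may hand back a Set where the Arrow schema expects a LIST, and that each element may still need coercion. The following is a self-contained sketch of that shape; the class name and the per-item Function parameter are illustrative, and the connector's DDBTypeUtils.coerceListToExpectedType additionally consults the Arrow field's child type.

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.function.Function;

public final class ListCoercionSketch {
    private ListCoercionSketch() {}

    /** Copies a (possibly unordered) collection into a List, coercing each element. */
    public static List<Object> coerceToList(Object value, Function<Object, Object> coerceItem) {
        if (value == null) {
            return null;
        }
        // DynamoDB set attributes (SS/NS/BS) deserialize to a Set; Arrow LIST vectors want an
        // ordered collection, so copy into a List and coerce each element on the way in.
        Collection<?> items = (value instanceof Collection)
                ? (Collection<?>) value
                : Collections.singletonList(value);
        List<Object> result = new ArrayList<>(items.size());
        for (Object item : items) {
            result.add(coerceItem.apply(item));
        }
        return result;
    }
}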

Example 49 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class ElasticsearchRecordHandler, the method readWithConstraint:

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
 * The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest Details of the read request, including:
 * 1. The Split
 * 2. The Catalog, Database, and Table the read request is for.
 * 3. The filtering predicate (if any)
 * 4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws RuntimeException when an error occurs while attempting to send the query, or the query timed out.
 * @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
 * ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws RuntimeException {
    logger.info("readWithConstraint - enter - Domain: {}, Index: {}, Mapping: {}", recordsRequest.getTableName().getSchemaName(), recordsRequest.getTableName().getTableName(), recordsRequest.getSchema());
    String domain = recordsRequest.getTableName().getSchemaName();
    String endpoint = recordsRequest.getSplit().getProperty(domain);
    String index = recordsRequest.getTableName().getTableName();
    String shard = recordsRequest.getSplit().getProperty(ElasticsearchMetadataHandler.SHARD_KEY);
    long numRows = 0;
    if (queryStatusChecker.isQueryRunning()) {
        AwsRestHighLevelClient client = clientFactory.getOrCreateClient(endpoint);
        try {
            // Create field extractors for all data types in the schema.
            GeneratedRowWriter rowWriter = createFieldExtractors(recordsRequest);
            // Create a new search-source injected with the projection, predicate, and the pagination batch size.
            SearchSourceBuilder searchSource = new SearchSourceBuilder().size(QUERY_BATCH_SIZE).timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS)).fetchSource(ElasticsearchQueryUtils.getProjection(recordsRequest.getSchema())).query(ElasticsearchQueryUtils.getQuery(recordsRequest.getConstraints().getSummary()));
            // Create a new search-request for the specified index.
            SearchRequest searchRequest = new SearchRequest(index).preference(shard);
            int hitsNum;
            int currPosition = 0;
            do {
                // Process the search request injecting the search-source, and setting the from position
                // used for pagination of results.
                SearchResponse searchResponse = client.getDocuments(searchRequest.source(searchSource.from(currPosition)));
                // Throw on query timeout.
                if (searchResponse.isTimedOut()) {
                    throw new RuntimeException("Request for index (" + index + ") " + shard + " timed out.");
                }
                // Increment current position to next batch of results.
                currPosition += QUERY_BATCH_SIZE;
                // Process hits.
                Iterator<SearchHit> hitIterator = searchResponse.getHits().iterator();
                hitsNum = searchResponse.getHits().getHits().length;
                while (hitIterator.hasNext() && queryStatusChecker.isQueryRunning()) {
                    ++numRows;
                    spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, client.getDocument(hitIterator.next())) ? 1 : 0);
                }
            // if hitsNum < QUERY_BATCH_SIZE, then this is the last batch of documents.
            } while (hitsNum == QUERY_BATCH_SIZE && queryStatusChecker.isQueryRunning());
        } catch (IOException error) {
            throw new RuntimeException("Error sending search query: " + error.getMessage(), error);
        }
    }
    logger.info("readWithConstraint: numRows[{}]", numRows);
}
Also used : SearchRequest(org.elasticsearch.action.search.SearchRequest) SearchHit(org.elasticsearch.search.SearchHit) IOException(java.io.IOException) SearchSourceBuilder(org.elasticsearch.search.builder.SearchSourceBuilder) SearchResponse(org.elasticsearch.action.search.SearchResponse) GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) Block(com.amazonaws.athena.connector.lambda.data.Block) TimeValue(org.elasticsearch.common.unit.TimeValue)
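
The handler above pages through results by advancing the search-source's from offset by QUERY_BATCH_SIZE and stopping once a page comes back short. The sketch below isolates that loop, assuming the search call is abstracted behind a java.util.function.Function so the example stays self-contained; the real handler goes through AwsRestHighLevelClient and spills each hit via BlockSpiller.writeRows. The caller is expected to have already set size(batchSize) on the search-source, as the handler does above.

import java.util.function.Consumer;
import java.util.function.Function;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

public final class EsPagingSketch {
    private EsPagingSketch() {}

    /** Pages with from/size until a page comes back smaller than the batch size. */
    public static long forEachHit(SearchRequest searchRequest,
                                  SearchSourceBuilder searchSource,
                                  int batchSize,
                                  Function<SearchRequest, SearchResponse> search,
                                  Consumer<SearchHit> onHit) {
        long numRows = 0;
        int hitsNum;
        int currPosition = 0;
        do {
            // Set the from offset for this page and execute the search.
            SearchResponse response = search.apply(searchRequest.source(searchSource.from(currPosition)));
            currPosition += batchSize;
            hitsNum = response.getHits().getHits().length;
            for (SearchHit hit : response.getHits()) {
                numRows++;
                onHit.accept(hit);
            }
        } while (hitsNum == batchSize); // a short page means this was the last batch
        return numRows;
    }
}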

Example 50 with Block

use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.

From the class ExampleRecordHandler, the method readWithConstraint:

/**
 * Used to read the row data associated with the provided Split.
 *
 * @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
 * The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
 * @param recordsRequest Details of the read request, including:
 * 1. The Split
 * 2. The Catalog, Database, and Table the read request is for.
 * 3. The filtering predicate (if any)
 * 4. The columns required for projection.
 * @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
 * @throws IOException
 * @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
 * ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {
    logger.info("readWithConstraint: enter - " + recordsRequest.getSplit());
    Split split = recordsRequest.getSplit();
    int splitYear = 0;
    int splitMonth = 0;
    int splitDay = 0;
    /**
     * TODO: Extract information about what we need to read from the split. If you are following the tutorial
     *  this is basically the partition column values for year, month, day.
     *
     *         splitYear = split.getPropertyAsInt("year");
     *         splitMonth = split.getPropertyAsInt("month");
     *         splitDay = split.getPropertyAsInt("day");
     */
    String dataBucket = null;
    /**
     * TODO: Get the data bucket from the env variable set by athena-example.yaml
     *
     *         dataBucket = System.getenv("data_bucket");
     */
    String dataKey = format("%s/%s/%s/sample_data.csv", splitYear, splitMonth, splitDay);
    BufferedReader s3Reader = openS3File(dataBucket, dataKey);
    if (s3Reader == null) {
        // There is no data to read for this split.
        return;
    }
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    /**
     * TODO: Add extractors for each field to our RowWriterBuilder; the RowWriterBuilder will then 'generate'
     * optimized code for converting our data to Apache Arrow, automatically minimizing memory overhead, code
     * branches, etc. Later in the code we call the generated RowWriter for each line in our S3 file.
     *
     *         builder.withExtractor("year", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *             value.isSet = 1;
     *             value.value = Integer.parseInt(((String[]) context)[0]);
     *         });
     *
     *         builder.withExtractor("month", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *             value.isSet = 1;
     *             value.value = Integer.parseInt(((String[]) context)[1]);
     *         });
     *
     *         builder.withExtractor("day", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *             value.isSet = 1;
     *             value.value = Integer.parseInt(((String[]) context)[2]);
     *         });
     *
     *         builder.withExtractor("encrypted_payload", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *             value.isSet = 1;
     *             value.value = ((String[]) context)[6];
     *         });
     */
    /**
     * TODO: The account_id field is a sensitive field, so we'd like to mask it to the last 4 before
     *  returning it to Athena. Note that this will mean you can only filter (where/having)
     *  on the masked value from Athena.
     *
     *         builder.withExtractor("account_id", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *             value.isSet = 1;
     *             String accountId = ((String[]) context)[3];
     *             value.value = accountId.length() > 4 ? accountId.substring(accountId.length() - 4) : accountId;
     *         });
     */
    /**
     * TODO: Write data for our transaction STRUCT:
     * For complex types like List and Struct, we can build a Map to conveniently set nested values
     *
     *         builder.withFieldWriterFactory("transaction",
     *                (FieldVector vector, Extractor extractor, ConstraintProjector constraint) ->
     *                    (Object context, int rowNum) -> {
     *                         Map<String, Object> eventMap = new HashMap<>();
     *                         eventMap.put("id", Integer.parseInt(((String[])context)[4]));
     *                         eventMap.put("completed", Boolean.parseBoolean(((String[])context)[5]));
     *                         BlockUtils.setComplexValue(vector, rowNum, FieldResolver.DEFAULT, eventMap);
     *                         return true;    //we don't yet support predicate pushdown on complex types
     *         });
     */
    // Used some basic code-gen to optimize how we generate response data.
    GeneratedRowWriter rowWriter = builder.build();
    // We read the transaction data line by line from our S3 object.
    String line;
    while ((line = s3Reader.readLine()) != null) {
        logger.info("readWithConstraint: processing line " + line);
        // The sample_data.csv file is structured as year,month,day,account_id,transaction.id,transaction.complete
        String[] lineParts = line.split(",");
        // We use the provided BlockSpiller to write our row data into the response. This utility is provided by
        // the Amazon Athena Query Federation SDK and automatically handles breaking the data into reasonably sized
        // chunks, encrypting it, and spilling to S3 if we've enabled these features.
        spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, lineParts) ? 1 : 0);
    }
}
Also used : GeneratedRowWriter(com.amazonaws.athena.connector.lambda.data.writers.GeneratedRowWriter) BufferedReader(java.io.BufferedReader) Block(com.amazonaws.athena.connector.lambda.data.Block) Split(com.amazonaws.athena.connector.lambda.domain.Split)
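
The account_id TODO above masks the value to its last four characters before it ever reaches Athena. Here is a standalone sketch of just that rule, handy for unit testing before wiring it into a VarCharExtractor; the class name, method name, and sample CSV line are hypothetical, while the column layout follows the sample_data.csv comment in the code above.

public final class AccountIdMasker {
    private AccountIdMasker() {}

    /** Keeps only the last four characters, mirroring the extractor sketched in the TODO above. */
    public static String maskToLastFour(String accountId) {
        return accountId.length() > 4 ? accountId.substring(accountId.length() - 4) : accountId;
    }

    public static void main(String[] args) {
        // sample_data.csv is structured as year,month,day,account_id,transaction.id,transaction.complete
        String[] lineParts = "2017,11,1,1234567890,100,true".split(",");
        System.out.println(maskToLastFour(lineParts[3])); // prints 7890
    }
}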

Aggregations

Block (com.amazonaws.athena.connector.lambda.data.Block): 113
Test (org.junit.Test): 39
HashMap (java.util.HashMap): 35
Schema (org.apache.arrow.vector.types.pojo.Schema): 35
Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints): 32
Split (com.amazonaws.athena.connector.lambda.domain.Split): 31
GetSplitsResponse (com.amazonaws.athena.connector.lambda.metadata.GetSplitsResponse): 28
FieldReader (org.apache.arrow.vector.complex.reader.FieldReader): 28
TableName (com.amazonaws.athena.connector.lambda.domain.TableName): 27
SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation): 23
HashSet (java.util.HashSet): 23
ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet): 20
Field (org.apache.arrow.vector.types.pojo.Field): 17
GetSplitsRequest (com.amazonaws.athena.connector.lambda.metadata.GetSplitsRequest): 13
PreparedStatement (java.sql.PreparedStatement): 13
ResultSet (java.sql.ResultSet): 13
ArrayList (java.util.ArrayList): 13
MetadataResponse (com.amazonaws.athena.connector.lambda.metadata.MetadataResponse): 12
Connection (java.sql.Connection): 12
ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest): 11