Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
The class MetricsRecordHandler, method readMetricSamplesWithConstraint:
/**
* Handles retrieving the samples for a specific metric from Cloudwatch Metrics.
*/
private void readMetricSamplesWithConstraint(BlockSpiller blockSpiller, ReadRecordsRequest request, QueryStatusChecker queryStatusChecker) throws TimeoutException {
    GetMetricDataRequest dataRequest = MetricUtils.makeGetMetricDataRequest(request);
    Map<String, MetricDataQuery> queries = new HashMap<>();
    for (MetricDataQuery query : dataRequest.getMetricDataQueries()) {
        queries.put(query.getId(), query);
    }
    String prevToken;
    ValueSet dimensionNameConstraint = request.getConstraints().getSummary().get(DIMENSION_NAME_FIELD);
    ValueSet dimensionValueConstraint = request.getConstraints().getSummary().get(DIMENSION_VALUE_FIELD);
    do {
        prevToken = dataRequest.getNextToken();
        GetMetricDataResult result = invoker.invoke(() -> metrics.getMetricData(dataRequest));
        for (MetricDataResult nextMetric : result.getMetricDataResults()) {
            MetricStat metricStat = queries.get(nextMetric.getId()).getMetricStat();
            List<Date> timestamps = nextMetric.getTimestamps();
            List<Double> values = nextMetric.getValues();
            for (int i = 0; i < nextMetric.getValues().size(); i++) {
                int sampleNum = i;
                blockSpiller.writeRows((Block block, int row) -> {
                    /**
                     * Most constraints were already applied at split generation so we only need to apply
                     * a subset.
                     */
                    block.offerValue(METRIC_NAME_FIELD, row, metricStat.getMetric().getMetricName());
                    block.offerValue(NAMESPACE_FIELD, row, metricStat.getMetric().getNamespace());
                    block.offerValue(STATISTIC_FIELD, row, metricStat.getStat());
                    block.offerComplexValue(DIMENSIONS_FIELD, row, (Field field, Object val) -> {
                        if (field.getName().equals(DIMENSION_NAME_FIELD)) {
                            return ((Dimension) val).getName();
                        } else if (field.getName().equals(DIMENSION_VALUE_FIELD)) {
                            return ((Dimension) val).getValue();
                        }
                        throw new RuntimeException("Unexpected field " + field.getName());
                    }, metricStat.getMetric().getDimensions());
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimName = (dimensionNameConstraint == null || !dimensionNameConstraint.isSingleValue()) ? null : dimensionNameConstraint.getSingleValue().toString();
                    block.offerValue(DIMENSION_NAME_FIELD, row, dimName);
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimVal = (dimensionValueConstraint == null || !dimensionValueConstraint.isSingleValue()) ? null : dimensionValueConstraint.getSingleValue().toString();
                    block.offerValue(DIMENSION_VALUE_FIELD, row, dimVal);
                    block.offerValue(PERIOD_FIELD, row, metricStat.getPeriod());
                    boolean matches = true;
                    block.offerValue(VALUE_FIELD, row, values.get(sampleNum));
                    long timestamp = timestamps.get(sampleNum).getTime() / 1000;
                    block.offerValue(TIMESTAMP_FIELD, row, timestamp);
                    return matches ? 1 : 0;
                });
            }
        }
        dataRequest.setNextToken(result.getNextToken());
    } while (dataRequest.getNextToken() != null && !dataRequest.getNextToken().equalsIgnoreCase(prevToken) && queryStatusChecker.isQueryRunning());
}
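The method above leans on the RowWriter contract of BlockSpiller.writeRows: the lambda offers each projected field to the Block at the given row and returns 1 to keep the row or 0 to discard it. A minimal sketch of that contract, with made-up field names and a hypothetical samples list (not part of the connector):

    // Hypothetical illustration of the BlockSpiller.writeRows contract used above.
    // "metric_name", "value", and the samples list are illustrative only.
    for (MetricSample sample : samples) {
        blockSpiller.writeRows((Block block, int row) -> {
            // offerValue returns false when a constraint on that field rejects the value
            boolean matched = block.offerValue("metric_name", row, sample.getMetricName());
            matched &= block.offerValue("value", row, sample.getValue());
            // return 1 to commit the row, 0 to drop everything written for it
            return matched ? 1 : 0;
        });
    }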
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
The class MetricsRecordHandler, method readMetricsWithConstraint:
/**
* Handles retrieving the list of available metrics when the METRICS_TABLE is queried by listing metrics in Cloudwatch Metrics.
*/
private void readMetricsWithConstraint(BlockSpiller blockSpiller, ReadRecordsRequest request, QueryStatusChecker queryStatusChecker) throws TimeoutException {
    ListMetricsRequest listMetricsRequest = new ListMetricsRequest();
    MetricUtils.pushDownPredicate(request.getConstraints(), listMetricsRequest);
    String prevToken;
    Set<String> requiredFields = new HashSet<>();
    request.getSchema().getFields().stream().forEach(next -> requiredFields.add(next.getName()));
    ValueSet dimensionNameConstraint = request.getConstraints().getSummary().get(DIMENSION_NAME_FIELD);
    ValueSet dimensionValueConstraint = request.getConstraints().getSummary().get(DIMENSION_VALUE_FIELD);
    do {
        prevToken = listMetricsRequest.getNextToken();
        ListMetricsResult result = invoker.invoke(() -> metrics.listMetrics(listMetricsRequest));
        for (Metric nextMetric : result.getMetrics()) {
            blockSpiller.writeRows((Block block, int row) -> {
                boolean matches = MetricUtils.applyMetricConstraints(blockSpiller.getConstraintEvaluator(), nextMetric, null);
                if (matches) {
                    matches &= block.offerValue(METRIC_NAME_FIELD, row, nextMetric.getMetricName());
                    matches &= block.offerValue(NAMESPACE_FIELD, row, nextMetric.getNamespace());
                    matches &= block.offerComplexValue(STATISTIC_FIELD, row, DEFAULT, STATISTICS);
                    matches &= block.offerComplexValue(DIMENSIONS_FIELD, row, (Field field, Object val) -> {
                        if (field.getName().equals(DIMENSION_NAME_FIELD)) {
                            return ((Dimension) val).getName();
                        } else if (field.getName().equals(DIMENSION_VALUE_FIELD)) {
                            return ((Dimension) val).getValue();
                        }
                        throw new RuntimeException("Unexpected field " + field.getName());
                    }, nextMetric.getDimensions());
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimName = (dimensionNameConstraint == null || !dimensionNameConstraint.isSingleValue()) ? null : (dimensionNameConstraint.getSingleValue().toString());
                    matches &= block.offerValue(DIMENSION_NAME_FIELD, row, dimName);
                    // This field is 'faked' in that we just use it as a convenient way to filter single dimensions. As such
                    // we always populate it with the value of the filter if the constraint passed and the filter was singleValue
                    String dimValue = (dimensionValueConstraint == null || !dimensionValueConstraint.isSingleValue()) ? null : dimensionValueConstraint.getSingleValue().toString();
                    matches &= block.offerValue(DIMENSION_VALUE_FIELD, row, dimValue);
                }
                return matches ? 1 : 0;
            });
        }
        listMetricsRequest.setNextToken(result.getNextToken());
    } while (listMetricsRequest.getNextToken() != null && !listMetricsRequest.getNextToken().equalsIgnoreCase(prevToken) && queryStatusChecker.isQueryRunning());
}
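MetricUtils.pushDownPredicate narrows the ListMetrics call before any rows are produced. A rough sketch of the kind of pushdown such a helper can perform, assuming single-value constraints on illustrative "namespace" and "metric_name" fields (this is a sketch under those assumptions, not the connector's actual implementation):

    // Hypothetical sketch: copy single-value constraints onto the CloudWatch request.
    // The field names "namespace" and "metric_name" are illustrative.
    ValueSet namespaceConstraint = constraints.getSummary().get("namespace");
    if (namespaceConstraint != null && namespaceConstraint.isSingleValue()) {
        listMetricsRequest.setNamespace(namespaceConstraint.getSingleValue().toString());
    }
    ValueSet metricNameConstraint = constraints.getSummary().get("metric_name");
    if (metricNameConstraint != null && metricNameConstraint.isSingleValue()) {
        listMetricsRequest.setMetricName(metricNameConstraint.getSingleValue().toString());
    }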
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
The class DynamoDBRecordHandler, method readWithConstraint:
/**
* Reads data from DynamoDB by submitting either a Query or a Scan, depending
* on the type of split, and includes any filters specified in the split.
*
* @see RecordHandler
*/
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws ExecutionException {
    Split split = recordsRequest.getSplit();
    // use the property instead of the request table name because of case sensitivity
    String tableName = split.getProperty(TABLE_METADATA);
    invokerCache.get(tableName).setBlockSpiller(spiller);
    Iterator<Map<String, AttributeValue>> itemIterator = getIterator(split, tableName, recordsRequest.getSchema());
    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(recordsRequest.getSchema());
    DynamoDBFieldResolver resolver = new DynamoDBFieldResolver(recordMetadata);
    long numRows = 0;
    AtomicLong numResultRows = new AtomicLong(0);
    while (itemIterator.hasNext()) {
        if (!queryStatusChecker.isQueryRunning()) {
            // we can stop processing because the query waiting for this data has already terminated
            return;
        }
        numRows++;
        spiller.writeRows((Block block, int rowNum) -> {
            Map<String, AttributeValue> item = itemIterator.next();
            if (item == null) {
                // the lazy iterator may not have made any DDB calls yet, and there may be zero items returned when it does
                return 0;
            }
            boolean matched = true;
            numResultRows.getAndIncrement();
            // TODO refactor to use GeneratedRowWriter to improve performance
            for (Field nextField : recordsRequest.getSchema().getFields()) {
                Object value = ItemUtils.toSimpleValue(item.get(nextField.getName()));
                Types.MinorType fieldType = Types.getMinorTypeForArrowType(nextField.getType());
                value = DDBTypeUtils.coerceValueToExpectedType(value, nextField, fieldType, recordMetadata);
                try {
                    switch (fieldType) {
                        case LIST:
                            // DDB may return Set so coerce to List. Also coerce each List item to the correct type.
                            List valueAsList = value != null ? DDBTypeUtils.coerceListToExpectedType(value, nextField, recordMetadata) : null;
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, resolver, valueAsList);
                            break;
                        case STRUCT:
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, resolver, value);
                            break;
                        default:
                            matched &= block.offerValue(nextField.getName(), rowNum, value);
                            break;
                    }
                    if (!matched) {
                        return 0;
                    }
                } catch (Exception ex) {
                    throw new RuntimeException("Error while processing field " + nextField.getName(), ex);
                }
            }
            return 1;
        });
    }
    logger.info("readWithConstraint: numRows[{}] numResultRows[{}]", numRows, numResultRows.get());
}
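The split decides whether getIterator issues a Query or a parallel Scan. A simplified, hypothetical sketch of that decision, assuming the split carries hash-key properties for query splits and segment properties for scan splits (the property names here are illustrative, not the connector's real metadata keys, and result pagination via ExclusiveStartKey is omitted):

    // Hypothetical sketch of choosing Query vs Scan from split properties.
    // "hash_key_name", "hash_key_value", "segment_id", and "segment_count" are illustrative names.
    private Iterator<Map<String, AttributeValue>> sketchGetIterator(AmazonDynamoDB ddb, Split split, String tableName) {
        String hashKeyName = split.getProperty("hash_key_name");
        if (hashKeyName != null) {
            // query split: target a single partition key value
            QueryRequest queryRequest = new QueryRequest()
                    .withTableName(tableName)
                    .withKeyConditionExpression("#k = :v")
                    .withExpressionAttributeNames(Collections.singletonMap("#k", hashKeyName))
                    .withExpressionAttributeValues(Collections.singletonMap(":v", new AttributeValue(split.getProperty("hash_key_value"))));
            return ddb.query(queryRequest).getItems().iterator();
        }
        // scan split: read one segment of a parallel scan
        ScanRequest scanRequest = new ScanRequest()
                .withTableName(tableName)
                .withSegment(Integer.parseInt(split.getProperty("segment_id")))
                .withTotalSegments(Integer.parseInt(split.getProperty("segment_count")));
        return ddb.scan(scanRequest).getItems().iterator();
    }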
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
The class ElasticsearchRecordHandler, method readWithConstraint:
/**
* Used to read the row data associated with the provided Split.
*
* @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
* The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
* @param recordsRequest Details of the read request, including:
* 1. The Split
* 2. The Catalog, Database, and Table the read request is for.
* 3. The filtering predicate (if any)
* 4. The columns required for projection.
* @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
* @throws RuntimeException when an error occurs while attempting to send the query, or the query timed out.
* @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
* ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
*/
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws RuntimeException {
    logger.info("readWithConstraint - enter - Domain: {}, Index: {}, Mapping: {}", recordsRequest.getTableName().getSchemaName(), recordsRequest.getTableName().getTableName(), recordsRequest.getSchema());
    String domain = recordsRequest.getTableName().getSchemaName();
    String endpoint = recordsRequest.getSplit().getProperty(domain);
    String index = recordsRequest.getTableName().getTableName();
    String shard = recordsRequest.getSplit().getProperty(ElasticsearchMetadataHandler.SHARD_KEY);
    long numRows = 0;
    if (queryStatusChecker.isQueryRunning()) {
        AwsRestHighLevelClient client = clientFactory.getOrCreateClient(endpoint);
        try {
            // Create field extractors for all data types in the schema.
            GeneratedRowWriter rowWriter = createFieldExtractors(recordsRequest);
            // Create a new search-source injected with the projection, predicate, and the pagination batch size.
            SearchSourceBuilder searchSource = new SearchSourceBuilder()
                    .size(QUERY_BATCH_SIZE)
                    .timeout(new TimeValue(queryTimeout, TimeUnit.SECONDS))
                    .fetchSource(ElasticsearchQueryUtils.getProjection(recordsRequest.getSchema()))
                    .query(ElasticsearchQueryUtils.getQuery(recordsRequest.getConstraints().getSummary()));
            // Create a new search-request for the specified index.
            SearchRequest searchRequest = new SearchRequest(index).preference(shard);
            int hitsNum;
            int currPosition = 0;
            do {
                // Process the search request injecting the search-source, and setting the from position
                // used for pagination of results.
                SearchResponse searchResponse = client.getDocuments(searchRequest.source(searchSource.from(currPosition)));
                // Throw on query timeout.
                if (searchResponse.isTimedOut()) {
                    throw new RuntimeException("Request for index (" + index + ") " + shard + " timed out.");
                }
                // Increment current position to next batch of results.
                currPosition += QUERY_BATCH_SIZE;
                // Process hits.
                Iterator<SearchHit> hitIterator = searchResponse.getHits().iterator();
                hitsNum = searchResponse.getHits().getHits().length;
                while (hitIterator.hasNext() && queryStatusChecker.isQueryRunning()) {
                    ++numRows;
                    spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, client.getDocument(hitIterator.next())) ? 1 : 0);
                }
                // if hitsNum < QUERY_BATCH_SIZE, then this is the last batch of documents.
            } while (hitsNum == QUERY_BATCH_SIZE && queryStatusChecker.isQueryRunning());
        } catch (IOException error) {
            throw new RuntimeException("Error sending search query: " + error.getMessage(), error);
        }
    }
    logger.info("readWithConstraint: numRows[{}]", numRows);
}
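createFieldExtractors produces a GeneratedRowWriter whose extractors pull values out of each document passed in by client.getDocument(...). A minimal sketch of one such extractor, assuming the document is exposed to the extractor as a Map<String, Object> and that the schema contains a VARCHAR field named "status" (both are illustrative assumptions, not the connector's actual code):

    // Hypothetical sketch: one VARCHAR extractor reading from a Map<String, Object> document.
    // The field name "status" is illustrative only.
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    builder.withExtractor("status", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
        Object raw = ((Map<String, Object>) context).get("status");
        value.isSet = raw != null ? 1 : 0;
        value.value = raw != null ? raw.toString() : null;
    });
    GeneratedRowWriter rowWriter = builder.build();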
Use of com.amazonaws.athena.connector.lambda.data.Block in project aws-athena-query-federation by awslabs.
The class ExampleRecordHandler, method readWithConstraint:
/**
* Used to read the row data associated with the provided Split.
*
* @param spiller A BlockSpiller that should be used to write the row data associated with this Split.
* The BlockSpiller automatically handles chunking the response, encrypting, and spilling to S3.
* @param recordsRequest Details of the read request, including:
* 1. The Split
* 2. The Catalog, Database, and Table the read request is for.
* 3. The filtering predicate (if any)
* 4. The columns required for projection.
* @param queryStatusChecker A QueryStatusChecker that you can use to stop doing work for a query that has already terminated
* @throws IOException
* @note Avoid writing >10 rows per-call to BlockSpiller.writeRow(...) because this will limit the BlockSpiller's
* ability to control Block size. The resulting increase in Block size may cause failures and reduced performance.
*/
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) throws IOException {
    logger.info("readWithConstraint: enter - " + recordsRequest.getSplit());
    Split split = recordsRequest.getSplit();
    int splitYear = 0;
    int splitMonth = 0;
    int splitDay = 0;
    /**
     * TODO: Extract information about what we need to read from the split. If you are following the tutorial
     * this is basically the partition column values for year, month, day.
     *
     * splitYear = split.getPropertyAsInt("year");
     * splitMonth = split.getPropertyAsInt("month");
     * splitDay = split.getPropertyAsInt("day");
     */
    String dataBucket = null;
    /**
     * TODO: Get the data bucket from the env variable set by athena-example.yaml
     *
     * dataBucket = System.getenv("data_bucket");
     */
    String dataKey = format("%s/%s/%s/sample_data.csv", splitYear, splitMonth, splitDay);
    BufferedReader s3Reader = openS3File(dataBucket, dataKey);
    if (s3Reader == null) {
        // There is no data to read for this split.
        return;
    }
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    /**
     * TODO: Add extractors for each field to our RowWriterBuilder. The RowWriterBuilder will then 'generate'
     * optimized code for converting our data to Apache Arrow, automatically minimizing memory overhead, code
     * branches, etc. Later in the code we call the generated RowWriter for each line in our S3 file.
     *
     * builder.withExtractor("year", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *     value.isSet = 1;
     *     value.value = Integer.parseInt(((String[]) context)[0]);
     * });
     *
     * builder.withExtractor("month", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *     value.isSet = 1;
     *     value.value = Integer.parseInt(((String[]) context)[1]);
     * });
     *
     * builder.withExtractor("day", (IntExtractor) (Object context, NullableIntHolder value) -> {
     *     value.isSet = 1;
     *     value.value = Integer.parseInt(((String[]) context)[2]);
     * });
     *
     * builder.withExtractor("encrypted_payload", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *     value.isSet = 1;
     *     value.value = ((String[]) context)[6];
     * });
     */
    /**
     * TODO: The account_id field is a sensitive field, so we'd like to mask it to the last 4 digits before
     * returning it to Athena. Note that this means you can only filter (where/having)
     * on the masked value from Athena.
     *
     * builder.withExtractor("account_id", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
     *     value.isSet = 1;
     *     String accountId = ((String[]) context)[3];
     *     value.value = accountId.length() > 4 ? accountId.substring(accountId.length() - 4) : accountId;
     * });
     */
    /**
     * TODO: Write data for our transaction STRUCT.
     * For complex types like List and Struct, we can build a Map to conveniently set nested values.
     *
     * builder.withFieldWriterFactory("transaction",
     *     (FieldVector vector, Extractor extractor, ConstraintProjector constraint) ->
     *         (Object context, int rowNum) -> {
     *             Map<String, Object> eventMap = new HashMap<>();
     *             eventMap.put("id", Integer.parseInt(((String[]) context)[4]));
     *             eventMap.put("completed", Boolean.parseBoolean(((String[]) context)[5]));
     *             BlockUtils.setComplexValue(vector, rowNum, FieldResolver.DEFAULT, eventMap);
     *             return true; // we don't yet support predicate pushdown on complex types
     *         });
     */
    // Used some basic code-gen to optimize how we generate response data.
    GeneratedRowWriter rowWriter = builder.build();
    // We read the transaction data line by line from our S3 object.
    String line;
    while ((line = s3Reader.readLine()) != null) {
        logger.info("readWithConstraint: processing line " + line);
        // The sample_data.csv file is structured as year,month,day,account_id,transaction.id,transaction.complete
        String[] lineParts = line.split(",");
        // We use the provided BlockSpiller to write our row data into the response. This utility is provided by
        // the Amazon Athena Query Federation SDK and automatically handles breaking the data into reasonably sized
        // chunks, encrypting it, and spilling to S3 if we've enabled these features.
        spiller.writeRows((Block block, int rowNum) -> rowWriter.writeRow(block, rowNum, lineParts) ? 1 : 0);
    }
}
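For reference, the TODO snippets above assemble into a builder configuration along these lines. This is a sketch of the tutorial's intended end state, assuming the same column order as the sample_data.csv comment; the month, day, and encrypted_payload extractors follow the same pattern as "year" and are omitted for brevity:

    // Sketch assembling the TODO snippets above; column indexes follow the sample_data.csv layout.
    GeneratedRowWriter.RowWriterBuilder builder = GeneratedRowWriter.newBuilder(recordsRequest.getConstraints());
    builder.withExtractor("year", (IntExtractor) (Object context, NullableIntHolder value) -> {
        value.isSet = 1;
        value.value = Integer.parseInt(((String[]) context)[0]);
    });
    builder.withExtractor("account_id", (VarCharExtractor) (Object context, NullableVarCharHolder value) -> {
        value.isSet = 1;
        String accountId = ((String[]) context)[3];
        // mask to the last 4 characters before returning the value to Athena
        value.value = accountId.length() > 4 ? accountId.substring(accountId.length() - 4) : accountId;
    });
    builder.withFieldWriterFactory("transaction",
        (FieldVector vector, Extractor extractor, ConstraintProjector constraint) ->
            (Object context, int rowNum) -> {
                Map<String, Object> eventMap = new HashMap<>();
                eventMap.put("id", Integer.parseInt(((String[]) context)[4]));
                eventMap.put("completed", Boolean.parseBoolean(((String[]) context)[5]));
                BlockUtils.setComplexValue(vector, rowNum, FieldResolver.DEFAULT, eventMap);
                return true; // predicate pushdown on complex types is not applied here
            });
    GeneratedRowWriter rowWriter = builder.build();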