Example 36 with ValueSet

Use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

From class CloudwatchRecordHandlerTest, method doReadRecordsSpill.

@Test
public void doReadRecordsSpill() throws Exception {
    logger.info("doReadRecordsSpill: enter");
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("time", SortedRangeSet.of(Range.range(allocator, Types.MinorType.BIGINT.getType(), 100L, true, 100_000_000L, true)));
    ReadRecordsRequest request = new ReadRecordsRequest(identity,
            "catalog",
            "queryId-" + System.currentTimeMillis(),
            new TableName("schema", "table"),
            schemaForRead,
            Split.newBuilder(S3SpillLocation.newBuilder()
                            .withBucket(UUID.randomUUID().toString())
                            .withSplitId(UUID.randomUUID().toString())
                            .withQueryId(UUID.randomUUID().toString())
                            .withIsDirectory(true)
                            .build(),
                    keyFactory.create())
                    .add(CloudwatchMetadataHandler.LOG_STREAM_FIELD, "table")
                    .build(),
            new Constraints(constraintsMap),
            // ~1.5MB so we should see some spill
            1_500_000L,
            0);
    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof RemoteReadRecordsResponse);
    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
        assertTrue(response.getNumberBlocks() > 1);
        int blockNum = 0;
        for (SpillLocation next : response.getRemoteBlocks()) {
            S3SpillLocation spillLocation = (S3SpillLocation) next;
            try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
                logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum++, block.getRowCount());
                // assertTrue(++blockNum < response.getRemoteBlocks().size() && block.getRowCount() > 10_000);
                logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                assertNotNull(BlockUtils.rowToString(block, 0));
            }
        }
    }
    logger.info("doReadRecordsSpill: exit");
}
Also used: RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse), SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation), S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation), HashMap (java.util.HashMap), Matchers.anyString (org.mockito.Matchers.anyString), RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse), TableName (com.amazonaws.athena.connector.lambda.domain.TableName), ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest), Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints), Block (com.amazonaws.athena.connector.lambda.data.Block), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), Test (org.junit.Test)
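
The bounded-range predicate on "time" is the heart of this test. Below is a minimal, self-contained sketch of just that constraint-building step, using only SDK calls already visible above; wrapping BlockAllocatorImpl in try-with-resources assumes the allocator is closeable, mirroring how the project's tests close it in tearDown.

import com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl;
import com.amazonaws.athena.connector.lambda.domain.predicate.Constraints;
import com.amazonaws.athena.connector.lambda.domain.predicate.Range;
import com.amazonaws.athena.connector.lambda.domain.predicate.SortedRangeSet;
import com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet;
import org.apache.arrow.vector.types.Types;

import java.util.HashMap;
import java.util.Map;

public class TimeRangeConstraintSketch {
    public static void main(String[] args) throws Exception {
        try (BlockAllocatorImpl allocator = new BlockAllocatorImpl()) {
            // Same shape as the predicate pushed down by the test:
            // 100 <= time <= 100_000_000, both bounds inclusive.
            Map<String, ValueSet> constraintsMap = new HashMap<>();
            constraintsMap.put("time", SortedRangeSet.of(
                    Range.range(allocator, Types.MinorType.BIGINT.getType(),
                            100L, true, 100_000_000L, true)));
            Constraints constraints = new Constraints(constraintsMap);
            System.out.println(constraints.getSummary().keySet());
        }
    }
}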

Example 37 with ValueSet

Use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

From class MetricsMetadataHandlerTest, method doGetTableLayout.

@Test
public void doGetTableLayout() throws Exception {
    logger.info("doGetTableLayout - enter");
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put(METRIC_NAME_FIELD, EquatableValueSet.newBuilder(allocator, Types.MinorType.VARCHAR.getType(), true, false).add("MyMetric").build());
    GetTableLayoutRequest req = new GetTableLayoutRequest(identity, "queryId", "default", new TableName(defaultSchema, "metrics"), new Constraints(constraintsMap), SchemaBuilder.newBuilder().build(), Collections.EMPTY_SET);
    GetTableLayoutResponse res = handler.doGetTableLayout(allocator, req);
    logger.info("doGetTableLayout - {}", res.getPartitions().getSchema());
    logger.info("doGetTableLayout - {}", res.getPartitions());
    assertTrue(res.getPartitions().getRowCount() == 1);
    logger.info("doGetTableLayout - exit");
}
Also used: TableName (com.amazonaws.athena.connector.lambda.domain.TableName), Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints), GetTableLayoutResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse), HashMap (java.util.HashMap), GetTableLayoutRequest (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet), Test (org.junit.Test)
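
A note on the builder call above: EquatableValueSet.newBuilder takes two boolean flags after the type. Reading them as (whitelist, nullAllowed) is our inference from how the surrounding tests use them, not something the snippet states, so treat the flag comments in this minimal sketch as assumptions.

import com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl;
import com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet;
import com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet;
import org.apache.arrow.vector.types.Types;

public class EquatableValueSetSketch {
    public static void main(String[] args) throws Exception {
        try (BlockAllocatorImpl allocator = new BlockAllocatorImpl()) {
            ValueSet metricNameFilter = EquatableValueSet.newBuilder(
                    allocator,
                    Types.MinorType.VARCHAR.getType(),
                    true,   // assumed meaning: whitelist (the listed values match)
                    false)  // assumed meaning: null does not match
                    .add("MyMetric")
                    .build();
            System.out.println(metricNameFilter);
        }
    }
}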

Example 38 with ValueSet

Use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

From class MetricsRecordHandlerTest, method readMetricSamplesWithConstraint.

@Test
public void readMetricSamplesWithConstraint() throws Exception {
    logger.info("readMetricSamplesWithConstraint: enter");
    String namespace = "namespace";
    String metricName = "metricName";
    String statistic = "p90";
    String period = "60";
    String dimName = "dimName";
    String dimValue = "dimValue";
    List<Dimension> dimensions = Collections.singletonList(new Dimension().withName(dimName).withValue(dimValue));
    int numMetrics = 10;
    int numSamples = 10;
    AtomicLong numCalls = new AtomicLong(0);
    when(mockMetrics.getMetricData(any(GetMetricDataRequest.class))).thenAnswer((InvocationOnMock invocation) -> {
        numCalls.incrementAndGet();
        return mockMetricData(invocation, numMetrics, numSamples);
    });
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put(NAMESPACE_FIELD, makeStringEquals(allocator, namespace));
    constraintsMap.put(STATISTIC_FIELD, makeStringEquals(allocator, statistic));
    constraintsMap.put(DIMENSION_NAME_FIELD, makeStringEquals(allocator, dimName));
    constraintsMap.put(DIMENSION_VALUE_FIELD, makeStringEquals(allocator, dimValue));
    S3SpillLocation spillLocation = S3SpillLocation.newBuilder()
            .withBucket(UUID.randomUUID().toString())
            .withSplitId(UUID.randomUUID().toString())
            .withQueryId(UUID.randomUUID().toString())
            .withIsDirectory(true)
            .build();
    List<MetricStat> metricStats = new ArrayList<>();
    metricStats.add(new MetricStat()
            .withMetric(new Metric()
                    .withNamespace(namespace)
                    .withMetricName(metricName)
                    .withDimensions(dimensions))
            .withPeriod(60)
            .withStat(statistic));
    Split split = Split.newBuilder(spillLocation, keyFactory.create())
            .add(MetricStatSerDe.SERIALIZED_METRIC_STATS_FIELD_NAME, MetricStatSerDe.serialize(metricStats))
            .add(METRIC_NAME_FIELD, metricName)
            .add(NAMESPACE_FIELD, namespace)
            .add(STATISTIC_FIELD, statistic)
            .add(PERIOD_FIELD, period)
            .build();
    ReadRecordsRequest request = new ReadRecordsRequest(identity,
            "catalog",
            "queryId-" + System.currentTimeMillis(),
            METRIC_SAMPLES_TABLE_NAME,
            METRIC_DATA_TABLE.getSchema(),
            split,
            new Constraints(constraintsMap),
            // 100GB, so we don't expect this to spill
            100_000_000_000L,
            100_000_000_000L);
    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof ReadRecordsResponse);
    ReadRecordsResponse response = (ReadRecordsResponse) rawResponse;
    logger.info("readMetricSamplesWithConstraint: rows[{}]", response.getRecordCount());
    assertEquals(numCalls.get() * numMetrics * numSamples, response.getRecords().getRowCount());
    logger.info("readMetricSamplesWithConstraint: {}", BlockUtils.rowToString(response.getRecords(), 0));
    logger.info("readMetricSamplesWithConstraint: exit");
}
Also used: HashMap (java.util.HashMap), ReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.ReadRecordsResponse), MetricStat (com.amazonaws.services.cloudwatch.model.MetricStat), ArrayList (java.util.ArrayList), Matchers.anyString (org.mockito.Matchers.anyString), Dimension (com.amazonaws.services.cloudwatch.model.Dimension), RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse), AtomicLong (java.util.concurrent.atomic.AtomicLong), ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest), Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints), GetMetricDataRequest (com.amazonaws.services.cloudwatch.model.GetMetricDataRequest), InvocationOnMock (org.mockito.invocation.InvocationOnMock), S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation), Metric (com.amazonaws.services.cloudwatch.model.Metric), Split (com.amazonaws.athena.connector.lambda.domain.Split), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), Test (org.junit.Test)
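
The helper makeStringEquals(...) used to populate constraintsMap is not shown in this snippet. A plausible reconstruction, hypothetical and based only on the EquatableValueSet usage in Example 37, would be:

import com.amazonaws.athena.connector.lambda.data.BlockAllocator;
import com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet;
import com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet;
import org.apache.arrow.vector.types.Types;

final class ValueSetHelpers {
    // Hypothetical reconstruction: a single-value VARCHAR whitelist, nulls excluded.
    static ValueSet makeStringEquals(BlockAllocator allocator, String value) {
        return EquatableValueSet.newBuilder(allocator, Types.MinorType.VARCHAR.getType(), true, false)
                .add(value)
                .build();
    }
}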

Example 39 with ValueSet

Use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

From class DynamoDBMetadataHandler, method enhancePartitionSchema.

/**
 * Generates a partition schema with metadata derived from available predicates.  This metadata will be
 * copied to splits in the #doGetSplits call.  At this point it is determined whether we can partition
 * by hash key or fall back to a full table scan.
 *
 * @see GlueMetadataHandler
 */
@Override
public void enhancePartitionSchema(SchemaBuilder partitionSchemaBuilder, GetTableLayoutRequest request) {
    // use the source table name from the schema if available (in case Glue table name != actual table name)
    String tableName = getSourceTableName(request.getSchema());
    if (tableName == null) {
        tableName = request.getTableName().getTableName();
    }
    DynamoDBTable table = null;
    try {
        table = tableResolver.getTableMetadata(tableName);
    } catch (TimeoutException e) {
        throw new RuntimeException(e);
    }
    // add table name so we don't have to do case insensitive resolution again
    partitionSchemaBuilder.addMetadata(TABLE_METADATA, table.getName());
    Map<String, ValueSet> summary = request.getConstraints().getSummary();
    List<String> requestedCols = request.getSchema().getFields().stream().map(Field::getName).collect(Collectors.toList());
    DynamoDBIndex index = DDBPredicateUtils.getBestIndexForPredicates(table, requestedCols, summary);
    logger.info("using index: {}", index.getName());
    String hashKeyName = index.getHashKey();
    ValueSet hashKeyValueSet = summary.get(hashKeyName);
    List<Object> hashKeyValues = (hashKeyValueSet != null) ? DDBPredicateUtils.getHashKeyAttributeValues(hashKeyValueSet) : Collections.emptyList();
    DDBRecordMetadata recordMetadata = new DDBRecordMetadata(request.getSchema());
    Set<String> columnsToIgnore = new HashSet<>();
    List<AttributeValue> valueAccumulator = new ArrayList<>();
    IncrementingValueNameProducer valueNameProducer = new IncrementingValueNameProducer();
    if (!hashKeyValues.isEmpty()) {
        // can "partition" on hash key
        partitionSchemaBuilder.addField(hashKeyName, hashKeyValueSet.getType());
        partitionSchemaBuilder.addMetadata(HASH_KEY_NAME_METADATA, hashKeyName);
        columnsToIgnore.add(hashKeyName);
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, QUERY_PARTITION_TYPE);
        if (!table.getName().equals(index.getName())) {
            partitionSchemaBuilder.addMetadata(INDEX_METADATA, index.getName());
        }
        // add range key filter if there is one
        Optional<String> rangeKey = index.getRangeKey();
        if (rangeKey.isPresent()) {
            String rangeKeyName = rangeKey.get();
            if (summary.containsKey(rangeKeyName)) {
                String rangeKeyFilter = DDBPredicateUtils.generateSingleColumnFilter(rangeKeyName, summary.get(rangeKeyName), valueAccumulator, valueNameProducer, recordMetadata);
                partitionSchemaBuilder.addMetadata(RANGE_KEY_NAME_METADATA, rangeKeyName);
                partitionSchemaBuilder.addMetadata(RANGE_KEY_FILTER_METADATA, rangeKeyFilter);
                columnsToIgnore.add(rangeKeyName);
            }
        }
    } else {
        // always fall back to a scan
        partitionSchemaBuilder.addField(SEGMENT_COUNT_METADATA, Types.MinorType.INT.getType());
        partitionSchemaBuilder.addMetadata(PARTITION_TYPE_METADATA, SCAN_PARTITION_TYPE);
    }
    // Exclude columns with custom types from the filter clause when querying/scanning DDB,
    // since those types are not natively supported by DDB or Glue;
    // instead we filter the results after the query/scan returns.
    columnsToIgnore.addAll(recordMetadata.getNonComparableColumns());
    precomputeAdditionalMetadata(columnsToIgnore, summary, valueAccumulator, valueNameProducer, partitionSchemaBuilder, recordMetadata);
}
Also used: IncrementingValueNameProducer (com.amazonaws.athena.connectors.dynamodb.util.IncrementingValueNameProducer), AttributeValue (com.amazonaws.services.dynamodbv2.model.AttributeValue), ArrayList (java.util.ArrayList), DynamoDBIndex (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBIndex), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), DDBRecordMetadata (com.amazonaws.athena.connectors.dynamodb.util.DDBRecordMetadata), DynamoDBTable (com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable), TimeoutException (java.util.concurrent.TimeoutException), HashSet (java.util.HashSet), LinkedHashSet (java.util.LinkedHashSet)
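
The Query-versus-Scan branch above takes the Query path only when the constraint summary pins the chosen index's hash key to discrete values. The sketch below shows a caller-side predicate that would enable that path; the hash key column "id" and its values are hypothetical.

import com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl;
import com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet;
import com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet;
import org.apache.arrow.vector.types.Types;

import java.util.HashMap;
import java.util.Map;

public class HashKeyPredicateSketch {
    public static void main(String[] args) throws Exception {
        try (BlockAllocatorImpl allocator = new BlockAllocatorImpl()) {
            // "id" stands in for a table's hash key column; values are illustrative only.
            Map<String, ValueSet> summary = new HashMap<>();
            summary.put("id", EquatableValueSet
                    .newBuilder(allocator, Types.MinorType.VARCHAR.getType(), true, false)
                    .add("user-123")
                    .add("user-456")
                    .build());
            // With a summary like this, getBestIndexForPredicates(...) can select an
            // index hashed on "id", hashKeyValues becomes non-empty, and the handler
            // emits QUERY_PARTITION_TYPE instead of falling back to a segmented scan.
            System.out.println(summary.keySet());
        }
    }
}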

Example 40 with ValueSet

Use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

From class DocDBRecordHandler, method readWithConstraint.

/**
 * Scans DocumentDB using the scan settings set on the requested Split by DocDBMetadataHandler.
 *
 * @see RecordHandler
 */
@Override
protected void readWithConstraint(BlockSpiller spiller, ReadRecordsRequest recordsRequest, QueryStatusChecker queryStatusChecker) {
    TableName tableName = recordsRequest.getTableName();
    Map<String, ValueSet> constraintSummary = recordsRequest.getConstraints().getSummary();
    MongoClient client = getOrCreateConn(recordsRequest.getSplit());
    MongoDatabase db = client.getDatabase(tableName.getSchemaName());
    MongoCollection<Document> table = db.getCollection(tableName.getTableName());
    Document query = QueryUtils.makeQuery(recordsRequest.getSchema(), constraintSummary);
    Document output = QueryUtils.makeProjection(recordsRequest.getSchema());
    logger.info("readWithConstraint: query[{}] projection[{}]", query, output);
    final MongoCursor<Document> iterable = table.find(query).projection(output).batchSize(MONGO_QUERY_BATCH_SIZE).iterator();
    long numRows = 0;
    AtomicLong numResultRows = new AtomicLong(0);
    while (iterable.hasNext() && queryStatusChecker.isQueryRunning()) {
        numRows++;
        spiller.writeRows((Block block, int rowNum) -> {
            Document doc = iterable.next();
            boolean matched = true;
            for (Field nextField : recordsRequest.getSchema().getFields()) {
                Object value = TypeUtils.coerce(nextField, doc.get(nextField.getName()));
                Types.MinorType fieldType = Types.getMinorTypeForArrowType(nextField.getType());
                try {
                    switch(fieldType) {
                        case LIST:
                        case STRUCT:
                            matched &= block.offerComplexValue(nextField.getName(), rowNum, DEFAULT_FIELD_RESOLVER, value);
                            break;
                        default:
                            matched &= block.offerValue(nextField.getName(), rowNum, value);
                            break;
                    }
                    if (!matched) {
                        return 0;
                    }
                } catch (Exception ex) {
                    throw new RuntimeException("Error while processing field " + nextField.getName(), ex);
                }
            }
            numResultRows.getAndIncrement();
            return 1;
        });
    }
    logger.info("readWithConstraint: numRows[{}] numResultRows[{}]", numRows, numResultRows.get());
}
Also used: Types (org.apache.arrow.vector.types.Types), Document (org.bson.Document), TableName (com.amazonaws.athena.connector.lambda.domain.TableName), MongoClient (com.mongodb.client.MongoClient), Field (org.apache.arrow.vector.types.pojo.Field), AtomicLong (java.util.concurrent.atomic.AtomicLong), Block (com.amazonaws.athena.connector.lambda.data.Block), ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet), MongoDatabase (com.mongodb.client.MongoDatabase)
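
QueryUtils.makeQuery(...) itself is not shown above. As a hypothetical sketch of the simplest case it must cover: a column pinned to a single value maps to a MongoDB equality filter, and several such fields placed in one Document form an implicit AND. The real implementation also has to handle ranges, null semantics, and multi-value sets.

import org.bson.Document;

import java.util.Map;

final class SingleValueQuerySketch {
    // Hypothetical: assumes each entry is a column already reduced to one value.
    static Document makeSingleValueQuery(Map<String, Object> singleValues) {
        Document query = new Document();
        // {"a": 1, "b": 2} inside a single Document is an implicit AND in MongoDB.
        singleValues.forEach(query::append);
        return query;
    }

    public static void main(String[] args) {
        System.out.println(makeSingleValueQuery(Map.of("name", "MyMetric", "active", true)));
    }
}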

Aggregations

ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet): 104 uses
Test (org.junit.Test): 66 uses
Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints): 63 uses
HashMap (java.util.HashMap): 48 uses
TableName (com.amazonaws.athena.connector.lambda.domain.TableName): 47 uses
Schema (org.apache.arrow.vector.types.pojo.Schema): 37 uses
Split (com.amazonaws.athena.connector.lambda.domain.Split): 31 uses
Range (com.amazonaws.athena.connector.lambda.domain.predicate.Range): 27 uses
ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest): 27 uses
EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet): 26 uses
ArrayList (java.util.ArrayList): 25 uses
Matchers.anyString (org.mockito.Matchers.anyString): 25 uses
RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse): 24 uses
Block (com.amazonaws.athena.connector.lambda.data.Block): 23 uses
S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation): 21 uses
RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse): 18 uses
SchemaBuilder (com.amazonaws.athena.connector.lambda.data.SchemaBuilder): 17 uses
ReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.ReadRecordsResponse): 17 uses
InvocationOnMock (org.mockito.invocation.InvocationOnMock): 17 uses
BlockAllocatorImpl (com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl): 13 uses