Search in sources :

Example 31 with ValueSet

use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

the class CloudwatchRecordHandler method pushDownConstraints.

/**
 * Attempts to push down predicates into Cloudwatch Logs by decorating the Cloudwatch Logs request.
 *
 * @param constraints The constraints for the read as provided by Athena based on the customer's query.
 * @param request The Cloudwatch Logs request to inject predicates to.
 * @return The decorated Cloudwatch Logs request (the same instance that was passed in).
 * @note This impl currently only pushes down SortedRangeSet filters (>=, <=, between) on the log time column.
 *       The pushed-down window is a coarse superset of the predicate; it is not a replacement for applying
 *       constraints using the ConstraintEvaluator.
 */
private GetLogEventsRequest pushDownConstraints(Constraints constraints, GetLogEventsRequest request) {
    ValueSet timeConstraint = constraints.getSummary().get(LOG_TIME_FIELD);
    // Nothing to push down when there is no range constraint on the time column, when NULLs are allowed,
    // or when the set contains no ranges at all (a span can not be computed for an empty set).
    if (!(timeConstraint instanceof SortedRangeSet) || timeConstraint.isNullAllowed() || timeConstraint.isNone()) {
        return request;
    }

    // SortedRangeSet is how >, <, between is represented which are easiest and most common when
    // searching logs so we attempt to push that down here as an optimization. SQL can represent complex
    // overlapping ranges which Cloudwatch can not support, so only the overall span is pushed down.
    Range basicPredicate = ((SortedRangeSet) timeConstraint).getSpan();

    if (!basicPredicate.getLow().isNullValue()) {
        Long lowerBound = (Long) basicPredicate.getLow().getValue();
        request.setStartTime(lowerBound);
    }

    if (!basicPredicate.getHigh().isNullValue()) {
        long upperBound = (Long) basicPredicate.getHigh().getValue();
        // GetLogEvents excludes events whose timestamp is equal to or later than endTime, so passing the
        // bound as-is would drop events sitting exactly on an inclusive upper bound (e.g. time = X or
        // time <= X). Widen the window by one millisecond; for an exclusive bound this only over-fetches
        // events at the bound itself. Guard against overflow at Long.MAX_VALUE.
        request.setEndTime(upperBound == Long.MAX_VALUE ? upperBound : upperBound + 1);
    }

    return request;
}
Also used : SortedRangeSet(com.amazonaws.athena.connector.lambda.domain.predicate.SortedRangeSet) AtomicLong(java.util.concurrent.atomic.AtomicLong) Range(com.amazonaws.athena.connector.lambda.domain.predicate.Range) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)

Example 32 with ValueSet

use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

the class CloudwatchMetadataHandlerTest method doGetTableLayout.

/**
 * Exercises doGetTableLayout against a paged describeLogStreams stub and checks that a
 * log_stream = 'table-10' constraint yields exactly one partition row after four paged calls.
 */
@Test
public void doGetTableLayout() throws Exception {
    logger.info("doGetTableLayout - enter");

    // Paged stub: a null token is treated as page 0; pages 0..2 return streams and hand back the next
    // page number as the token, page 3 returns an empty list with no token, which ends the pagination.
    when(mockAwsLogs.describeLogStreams(any(DescribeLogStreamsRequest.class))).thenAnswer((InvocationOnMock invocation) -> {
        DescribeLogStreamsRequest pageRequest = (DescribeLogStreamsRequest) invocation.getArguments()[0];
        String token = pageRequest.getNextToken();
        int page = (token == null) ? 0 : Integer.parseInt(token);
        boolean morePages = page < 3;

        List<LogStream> streams = new ArrayList<>();
        if (morePages) {
            // Each page starts at page * 100 and always runs up to 299.
            for (int i = page * 100; i < 300; i++) {
                LogStream stream = new LogStream();
                stream.setLogStreamName("table-" + i);
                stream.setStoredBytes(i * 1000L);
                streams.add(stream);
            }
        }

        DescribeLogStreamsResult pageResult = new DescribeLogStreamsResult();
        pageResult.withLogStreams(streams);
        if (morePages) {
            pageResult.setNextToken(String.valueOf(page + 1));
        }
        return pageResult;
    });

    // Constrain the layout to a single log stream.
    ValueSet onlyTable10 = EquatableValueSet.newBuilder(allocator, Types.MinorType.VARCHAR.getType(), true, false).add("table-10").build();
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("log_stream", onlyTable10);

    Schema schema = SchemaBuilder.newBuilder().addStringField("log_stream").build();
    TableName tableName = new TableName("schema-1", "all_log_streams");
    Constraints constraints = new Constraints(constraintsMap);
    GetTableLayoutRequest req = new GetTableLayoutRequest(identity, "queryId", "default", tableName, constraints, schema, Collections.singleton("log_stream"));

    GetTableLayoutResponse res = handler.doGetTableLayout(allocator, req);

    logger.info("doGetTableLayout - {}", res.getPartitions().getSchema());
    logger.info("doGetTableLayout - {}", res.getPartitions());

    assertTrue(res.getPartitions().getSchema().findField("log_stream") != null);
    assertTrue(res.getPartitions().getRowCount() == 1);
    // One call per page: null token, "1", "2" and "3".
    verify(mockAwsLogs, times(4)).describeLogStreams(any(DescribeLogStreamsRequest.class));

    logger.info("doGetTableLayout - exit");
}
Also used : HashMap(java.util.HashMap) Schema(org.apache.arrow.vector.types.pojo.Schema) ArrayList(java.util.ArrayList) LogStream(com.amazonaws.services.logs.model.LogStream) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) GetTableLayoutResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse) InvocationOnMock(org.mockito.invocation.InvocationOnMock) GetTableLayoutRequest(com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest) DescribeLogStreamsRequest(com.amazonaws.services.logs.model.DescribeLogStreamsRequest) DescribeLogStreamsResult(com.amazonaws.services.logs.model.DescribeLogStreamsResult) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) EquatableValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet) Test(org.junit.Test)

Example 33 with ValueSet

use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

the class DocDBRecordHandlerTest method nestedStructTest.

/**
 * Reads a single document made of nested structs and lists (including a struct whose only field is
 * null) through doGetTable and doReadRecords, then checks the one returned row against its expected
 * string rendering.
 */
@Test
public void nestedStructTest() throws Exception {
    List<Document> documents = new ArrayList<>();
    Document result = new Document();
    documents.add(result);

    // A list element holding a string field and a nested list with one sub-struct.
    Document listStruct1 = new Document();
    listStruct1.put("SomeSubStruct", "someSubStruct1");
    List<Document> subList = new ArrayList<>();
    Document listSubStruct1 = new Document();
    listSubStruct1.put("SomeSubSubStruct", "someSubSubStruct");
    subList.add(listSubStruct1);
    listStruct1.put("SomeSubList", subList);

    // The same element is added twice; the expected string below lists it twice as well.
    List<Document> list = new ArrayList<>();
    list.add(listStruct1);
    list.add(listStruct1);

    Document structWithList = new Document();
    structWithList.put("SomeList", list);
    Document structWithNullList = new Document();
    structWithNullList.put("SomeNullList", null);
    Document simpleSubStruct = new Document();
    simpleSubStruct.put("SomeSimpleSubStruct", "someSimpleSubStruct");
    structWithList.put("SimpleSubStruct", simpleSubStruct);
    structWithList.put("SimpleSubStructNullList", structWithNullList);
    result.put("ComplexStruct", structWithList);

    Document simpleStruct = new Document();
    simpleStruct.put("SomeSimpleStruct", "someSimpleStruct");
    result.put("SimpleStruct", simpleStruct);

    // Stubs used while doGetTable inspects the collection.
    when(mockCollection.find()).thenReturn(mockIterable);
    when(mockIterable.limit(anyInt())).thenReturn(mockIterable);
    when(mockIterable.maxScan(anyInt())).thenReturn(mockIterable);
    when(mockIterable.batchSize(anyInt())).thenReturn(mockIterable);
    when(mockIterable.iterator()).thenReturn(new StubbingCursor(documents.iterator()));

    GetTableRequest req = new GetTableRequest(IDENTITY, QUERY_ID, DEFAULT_CATALOG, TABLE_NAME);
    GetTableResponse res = mdHandler.doGetTable(allocator, req);
    logger.info("doGetTable - {}", res);

    // Stubs used by the record read; the query and projection documents are only logged.
    when(mockCollection.find(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
        logger.info("nestedStructTest: query[{}]", invocationOnMock.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.projection(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
        logger.info("nestedStructTest: projection[{}]", invocationOnMock.getArguments()[0]);
        return mockIterable;
    });
    // Re-stub with a fresh cursor for the read; presumably the one stubbed above has already been
    // consumed by doGetTable. The batchSize stub above is still in effect and is not repeated.
    when(mockIterable.iterator()).thenReturn(new StubbingCursor(documents.iterator()));

    Map<String, ValueSet> constraintsMap = new HashMap<>();
    S3SpillLocation splitLoc = S3SpillLocation.newBuilder().withBucket(UUID.randomUUID().toString()).withSplitId(UUID.randomUUID().toString()).withQueryId(UUID.randomUUID().toString()).withIsDirectory(true).build();
    // 100GB don't expect this to spill
    ReadRecordsRequest request = new ReadRecordsRequest(IDENTITY, DEFAULT_CATALOG, "queryId-" + System.currentTimeMillis(), TABLE_NAME, res.getSchema(), Split.newBuilder(splitLoc, keyFactory.create()).add(DOCDB_CONN_STR, CONNECTION_STRING).build(), new Constraints(constraintsMap),
            100_000_000_000L, 100_000_000_000L);

    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof ReadRecordsResponse);
    ReadRecordsResponse response = (ReadRecordsResponse) rawResponse;
    logger.info("nestedStructTest: rows[{}]", response.getRecordCount());
    String actualRow = BlockUtils.rowToString(response.getRecords(), 0);
    logger.info("nestedStructTest: {}", actualRow);
    assertTrue(response.getRecordCount() == 1);

    String expectedString = "[ComplexStruct : {[SomeList : {{[SomeSubStruct : someSubStruct1],"
            + "[SomeSubList : {{[SomeSubSubStruct : someSubSubStruct]}}]},"
            + "{[SomeSubStruct : someSubStruct1],[SomeSubList : {{[SomeSubSubStruct : someSubSubStruct]}}]}}],"
            + "[SimpleSubStruct : {[SomeSimpleSubStruct : someSimpleSubStruct]}],"
            + "[SimpleSubStructNullList : {[SomeNullList : null]}]}], [SimpleStruct : {[SomeSimpleStruct : someSimpleStruct]}]";
    assertEquals(expectedString, actualRow);
}
Also used : HashMap(java.util.HashMap) ReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.ReadRecordsResponse) RemoteReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse) ArrayList(java.util.ArrayList) Matchers.anyString(org.mockito.Matchers.anyString) RecordResponse(com.amazonaws.athena.connector.lambda.records.RecordResponse) Document(org.bson.Document) GetTableRequest(com.amazonaws.athena.connector.lambda.metadata.GetTableRequest) ReadRecordsRequest(com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) GetTableResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableResponse) InvocationOnMock(org.mockito.invocation.InvocationOnMock) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) Test(org.junit.Test)

Example 34 with ValueSet

use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

the class DocDBRecordHandlerTest method doReadRecordsNoSpill.

/**
 * Reads three generated documents with a col3 = 22.0 constraint and a very large block size, and
 * expects an inline (non-spilled) response containing the two matching rows.
 */
@Test
public void doReadRecordsNoSpill() throws Exception {
    // Documents 11..13: the first two carry col3 = 22.0, the third carries col3 = 21.0.
    double[] col3Values = {22.0D, 22.0D, 21.0D};
    List<Document> documents = new ArrayList<>();
    int docNum = 11;
    for (double col3 : col3Values) {
        Document doc = DocumentGenerator.makeRandomRow(schemaForRead.getFields(), docNum++);
        documents.add(doc);
        doc.put("col3", col3);
    }
    // The third document additionally carries a field of a type the handler does not support.
    documents.get(2).put("unsupported", new UnsupportedType());

    // The query and projection documents are only logged; every call hands back the same iterable.
    when(mockCollection.find(any(Document.class))).thenAnswer((InvocationOnMock invocation) -> {
        logger.info("doReadRecordsNoSpill: query[{}]", invocation.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.projection(any(Document.class))).thenAnswer((InvocationOnMock invocation) -> {
        logger.info("doReadRecordsNoSpill: projection[{}]", invocation.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.batchSize(anyInt())).thenReturn(mockIterable);
    when(mockIterable.iterator()).thenReturn(new StubbingCursor(documents.iterator()));

    // col3 = 22.0
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("col3", SortedRangeSet.copyOf(Types.MinorType.FLOAT8.getType(), ImmutableList.of(Range.equal(allocator, Types.MinorType.FLOAT8.getType(), 22.0D)), false));

    S3SpillLocation splitLoc = S3SpillLocation.newBuilder()
            .withBucket(UUID.randomUUID().toString())
            .withSplitId(UUID.randomUUID().toString())
            .withQueryId(UUID.randomUUID().toString())
            .withIsDirectory(true)
            .build();
    String queryId = "queryId-" + System.currentTimeMillis();
    Split split = Split.newBuilder(splitLoc, keyFactory.create()).add(DOCDB_CONN_STR, CONNECTION_STRING).build();
    Constraints constraints = new Constraints(constraintsMap);
    // 100GB don't expect this to spill
    ReadRecordsRequest request = new ReadRecordsRequest(IDENTITY, DEFAULT_CATALOG, queryId, TABLE_NAME, schemaForRead, split, constraints,
            100_000_000_000L, 100_000_000_000L);

    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof ReadRecordsResponse);

    ReadRecordsResponse response = (ReadRecordsResponse) rawResponse;
    logger.info("doReadRecordsNoSpill: rows[{}]", response.getRecordCount());
    assertTrue(response.getRecords().getRowCount() == 2);
    logger.info("doReadRecordsNoSpill: {}", BlockUtils.rowToString(response.getRecords(), 0));
}
Also used : HashMap(java.util.HashMap) ReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.ReadRecordsResponse) RemoteReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse) ArrayList(java.util.ArrayList) Matchers.anyString(org.mockito.Matchers.anyString) RecordResponse(com.amazonaws.athena.connector.lambda.records.RecordResponse) Document(org.bson.Document) ReadRecordsRequest(com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) InvocationOnMock(org.mockito.invocation.InvocationOnMock) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) Test(org.junit.Test)

Example 35 with ValueSet

use of com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet in project aws-athena-query-federation by awslabs.

the class DocDBRecordHandlerTest method doReadRecordsSpill.

/**
 * Reads 20,000 generated documents with a small block size and expects a spilled (remote) response
 * of more than one block, each of which can be read back and rendered.
 */
@Test
public void doReadRecordsSpill() throws Exception {
    List<Document> documents = new ArrayList<>();
    for (int docNum = 0; docNum < 20_000; docNum++) {
        documents.add(DocumentGenerator.makeRandomRow(schemaForRead.getFields(), docNum));
    }

    // The query and projection documents are only logged; every call hands back the same iterable.
    when(mockCollection.find(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
        logger.info("doReadRecordsSpill: query[{}]", invocationOnMock.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.projection(any(Document.class))).thenAnswer((InvocationOnMock invocationOnMock) -> {
        logger.info("doReadRecordsSpill: projection[{}]", invocationOnMock.getArguments()[0]);
        return mockIterable;
    });
    when(mockIterable.batchSize(anyInt())).thenReturn(mockIterable);
    when(mockIterable.iterator()).thenReturn(new StubbingCursor(documents.iterator()));

    // col3 > -10000
    Map<String, ValueSet> constraintsMap = new HashMap<>();
    constraintsMap.put("col3", SortedRangeSet.copyOf(Types.MinorType.FLOAT8.getType(), ImmutableList.of(Range.greaterThan(allocator, Types.MinorType.FLOAT8.getType(), -10000D)), false));

    S3SpillLocation splitLoc = S3SpillLocation.newBuilder().withBucket(UUID.randomUUID().toString()).withSplitId(UUID.randomUUID().toString()).withQueryId(UUID.randomUUID().toString()).withIsDirectory(true).build();
    // ~1.5MB so we should see some spill
    ReadRecordsRequest request = new ReadRecordsRequest(IDENTITY, DEFAULT_CATALOG, "queryId-" + System.currentTimeMillis(), TABLE_NAME, schemaForRead, Split.newBuilder(splitLoc, keyFactory.create()).add(DOCDB_CONN_STR, CONNECTION_STRING).build(), new Constraints(constraintsMap),
            1_500_000L, 0L);

    RecordResponse rawResponse = handler.doReadRecords(allocator, request);
    assertTrue(rawResponse instanceof RemoteReadRecordsResponse);

    try (RemoteReadRecordsResponse response = (RemoteReadRecordsResponse) rawResponse) {
        logger.info("doReadRecordsSpill: remoteBlocks[{}]", response.getRemoteBlocks().size());
        assertTrue(response.getNumberBlocks() > 1);

        // Read every spilled block back and make sure its first row can be rendered.
        int blockNum = 0;
        for (SpillLocation next : response.getRemoteBlocks()) {
            S3SpillLocation spillLocation = (S3SpillLocation) next;
            try (Block block = spillReader.read(spillLocation, response.getEncryptionKey(), response.getSchema())) {
                logger.info("doReadRecordsSpill: blockNum[{}] and recordCount[{}]", blockNum, block.getRowCount());
                blockNum++;
                logger.info("doReadRecordsSpill: {}", BlockUtils.rowToString(block, 0));
                assertNotNull(BlockUtils.rowToString(block, 0));
            }
        }
    }
}
Also used : RemoteReadRecordsResponse(com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse) SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.SpillLocation) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) Matchers.anyString(org.mockito.Matchers.anyString) RecordResponse(com.amazonaws.athena.connector.lambda.records.RecordResponse) Document(org.bson.Document) ReadRecordsRequest(com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) InvocationOnMock(org.mockito.invocation.InvocationOnMock) S3SpillLocation(com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation) Block(com.amazonaws.athena.connector.lambda.data.Block) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) Test(org.junit.Test)

Aggregations

ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)104 Test (org.junit.Test)66 Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints)63 HashMap (java.util.HashMap)48 TableName (com.amazonaws.athena.connector.lambda.domain.TableName)47 Schema (org.apache.arrow.vector.types.pojo.Schema)37 Split (com.amazonaws.athena.connector.lambda.domain.Split)31 Range (com.amazonaws.athena.connector.lambda.domain.predicate.Range)27 ReadRecordsRequest (com.amazonaws.athena.connector.lambda.records.ReadRecordsRequest)27 EquatableValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.EquatableValueSet)26 ArrayList (java.util.ArrayList)25 Matchers.anyString (org.mockito.Matchers.anyString)25 RecordResponse (com.amazonaws.athena.connector.lambda.records.RecordResponse)24 Block (com.amazonaws.athena.connector.lambda.data.Block)23 S3SpillLocation (com.amazonaws.athena.connector.lambda.domain.spill.S3SpillLocation)21 RemoteReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.RemoteReadRecordsResponse)18 SchemaBuilder (com.amazonaws.athena.connector.lambda.data.SchemaBuilder)17 ReadRecordsResponse (com.amazonaws.athena.connector.lambda.records.ReadRecordsResponse)17 InvocationOnMock (org.mockito.invocation.InvocationOnMock)17 BlockAllocatorImpl (com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl)13