Search in sources :

Example 66 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class DataLakeGen2MetadataHandlerTest method doGetTableLayoutWithNoPartitions.

/**
 * Checks the layout returned for a table that has no partitions: the partitions block must contain exactly
 * one row rendered as {@code [PARTITION_NUMBER : 0]}, its schema must consist of the single VARCHAR
 * {@code PARTITION_NUMBER} field, and the response must echo the requested table name.
 */
@Test
public void doGetTableLayoutWithNoPartitions() throws Exception {
    // Request inputs; the handler's own partition schema is passed into the request.
    BlockAllocator allocator = new BlockAllocatorImpl();
    Constraints mockConstraints = Mockito.mock(Constraints.class);
    TableName requestedTable = new TableName("testSchema", "testTable");
    Schema requestSchema = this.dataLakeGen2MetadataHandler.getPartitionSchema("testCatalogName");
    Set<String> partitionColumnNames = requestSchema.getFields()
            .stream()
            .map(Field::getName)
            .collect(Collectors.toSet());
    GetTableLayoutRequest layoutRequest = new GetTableLayoutRequest(
            this.federatedIdentity,
            "testQueryId",
            "testCatalogName",
            requestedTable,
            mockConstraints,
            requestSchema,
            partitionColumnNames);

    GetTableLayoutResponse layoutResponse = this.dataLakeGen2MetadataHandler.doGetTableLayout(allocator, layoutRequest);

    // Render every returned partition row as text so the whole block can be compared in one assertion.
    int partitionRowCount = layoutResponse.getPartitions().getRowCount();
    List<String> renderedRows = new ArrayList<>();
    int row = 0;
    while (row < partitionRowCount) {
        renderedRows.add(BlockUtils.rowToString(layoutResponse.getPartitions(), row));
        row++;
    }
    Assert.assertEquals(Collections.singletonList("[PARTITION_NUMBER : 0]"), renderedRows);

    // The partitions block is expected to expose only the PARTITION_NUMBER column, typed as VARCHAR.
    Field partitionNumberField = FieldBuilder.newBuilder(
            DataLakeGen2MetadataHandler.PARTITION_NUMBER,
            org.apache.arrow.vector.types.Types.MinorType.VARCHAR.getType()).build();
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    schemaBuilder.addField(partitionNumberField);
    Schema schemaWithPartitionNumber = schemaBuilder.build();
    Assert.assertEquals(schemaWithPartitionNumber, layoutResponse.getPartitions().getSchema());

    Assert.assertEquals(requestedTable, layoutResponse.getTableName());
}
Also used : Schema(org.apache.arrow.vector.types.pojo.Schema) ArrayList(java.util.ArrayList) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) GetTableLayoutResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse) BlockAllocatorImpl(com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl) BlockAllocator(com.amazonaws.athena.connector.lambda.data.BlockAllocator) GetTableLayoutRequest(com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) Test(org.junit.Test)

Example 67 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class ElasticsearchSchemaUtils method parseMapping.

/**
 * Main parsing method for the {@code GET <index>/_mapping} request.
 *
 * <p>Each entry found under the {@code properties} key is converted to a field by {@code inferField} and added
 * to the resulting Schema. The content of the optional {@code _meta} key is copied and handed to
 * {@code inferField} for every field. A key that is missing or explicitly mapped to {@code null} is treated
 * as absent.
 *
 * @param mappings is the structure that contains the metadata definitions for the index, as well as the _meta
 *                 property used to define list fields.
 * @return a Schema derived from the mapping.
 * @throws ClassCastException if the {@code _meta} value, the {@code properties} value, or the value of an
 *                            individual property is not a {@link Map}.
 */
protected static Schema parseMapping(Map<String, Object> mappings) {
    SchemaBuilder builder = SchemaBuilder.newBuilder();

    // Copy of the _meta structure (the mapping containing the fields that should be considered a LIST).
    // It stays empty when the mapping carries no _meta entry.
    Map<String, Object> meta = new HashMap<>();
    // The mapping is deserialized JSON, so nested objects can only be narrowed with an unchecked cast.
    @SuppressWarnings("unchecked")
    Map<String, Object> declaredMeta = (Map<String, Object>) mappings.get("_meta");
    if (declaredMeta != null) {
        meta.putAll(declaredMeta);
    }

    @SuppressWarnings("unchecked")
    Map<String, Object> fields = (Map<String, Object>) mappings.get("properties");
    if (fields != null) {
        for (Map.Entry<String, Object> entry : fields.entrySet()) {
            String fieldName = entry.getKey();
            @SuppressWarnings("unchecked")
            Map<String, Object> value = (Map<String, Object>) entry.getValue();
            // For a top-level field the same name is passed for both of inferField's name arguments.
            builder.addField(inferField(fieldName, fieldName, value, meta));
        }
    }
    return builder.build();
}
Also used : HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) Map(java.util.Map)

Example 68 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class MetadataHandler method doGetTableLayout.

/**
 * Used to get the partitions that must be read from the request table in order to satisfy the requested predicate.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details of the catalog, database, and table being queried as well as any filter predicate.
 * @return A GetTableLayoutResponse which primarily contains:
 * 1. An Apache Arrow Block with 0 or more partitions to read. 0 partitions implies there are 0 rows to read.
 * 2. {@code Set<String>} of partition column names which should correspond to columns in your Apache Arrow Block.
 * @throws Exception propagated unchanged from the calls made while building the layout.
 * @note Partitions are opaque to Amazon Athena in that it does not understand their contents, just that it must call
 * doGetSplits(...) for each partition you return in order to determine which reads to perform and if those reads
 * can be parallelized. This means the contents of this response are more for you than they are for Athena.
 * @note Partitions are partially opaque to Amazon Athena in that it only understands your partition columns and
 * how to filter out partitions that do not meet the query's constraints. Any additional columns you add to the
 * partition data are ignored by Athena but passed on to calls on GetSplits.
 */
public GetTableLayoutResponse doGetTableLayout(final BlockAllocator allocator, final GetTableLayoutRequest request) throws Exception {
    // newBuilder() is a static factory, so it is invoked on the class instead of on a throwaway instance.
    SchemaBuilder constraintSchema = SchemaBuilder.newBuilder();
    SchemaBuilder partitionSchemaBuilder = SchemaBuilder.newBuilder();

    // Add our partition columns to the response schema so the engine knows how to interpret the list of
    // partitions we are going to return. The constraint schema receives the same partition columns; only the
    // partition schema is passed to enhancePartitionSchema below.
    for (String nextPartCol : request.getPartitionCols()) {
        Field partitionCol = request.getSchema().findField(nextPartCol);
        partitionSchemaBuilder.addField(nextPartCol, partitionCol.getType());
        constraintSchema.addField(nextPartCol, partitionCol.getType());
    }
    enhancePartitionSchema(partitionSchemaBuilder, request);
    Schema partitionSchema = partitionSchemaBuilder.build();

    if (partitionSchema.getFields().isEmpty() && partitionSchema.getCustomMetadata().isEmpty()) {
        // Even though our table doesn't support complex layouts, partitioning or metadata, we need to convey that there is at least
        // 1 partition to read as part of the query or Athena will assume partition pruning found no candidate layouts to read.
        Block partitions = BlockUtils.newBlock(allocator, PARTITION_ID_COL, Types.MinorType.INT.getType(), 1);
        return new GetTableLayoutResponse(request.getCatalogName(), request.getTableName(), partitions);
    }

    // Now use the constraint that was in the request to do some partition pruning. Here we are just
    // generating some fake values for the partitions but in a real implementation you'd use your metastore
    // or knowledge of the actual table's physical layout to do this.
    try (ConstraintEvaluator constraintEvaluator = new ConstraintEvaluator(allocator, constraintSchema.build(), request.getConstraints());
        QueryStatusChecker queryStatusChecker = new QueryStatusChecker(athena, athenaInvoker, request.getQueryId())) {
        // The builder has not changed since partitionSchema was built, so reuse it instead of building again.
        Block partitions = allocator.createBlock(partitionSchema);
        partitions.constrain(constraintEvaluator);
        SimpleBlockWriter blockWriter = new SimpleBlockWriter(partitions);
        getPartitions(blockWriter, request, queryStatusChecker);
        return new GetTableLayoutResponse(request.getCatalogName(), request.getTableName(), partitions);
    }
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) GetTableLayoutResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse) QueryStatusChecker(com.amazonaws.athena.connector.lambda.QueryStatusChecker) Schema(org.apache.arrow.vector.types.pojo.Schema) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) Block(com.amazonaws.athena.connector.lambda.data.Block) ConstraintEvaluator(com.amazonaws.athena.connector.lambda.domain.predicate.ConstraintEvaluator) SimpleBlockWriter(com.amazonaws.athena.connector.lambda.data.SimpleBlockWriter)

Aggregations

SchemaBuilder (com.amazonaws.athena.connector.lambda.data.SchemaBuilder)68 Schema (org.apache.arrow.vector.types.pojo.Schema)48 TableName (com.amazonaws.athena.connector.lambda.domain.TableName)43 Test (org.junit.Test)43 PreparedStatement (java.sql.PreparedStatement)37 ResultSet (java.sql.ResultSet)35 Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints)30 BlockAllocatorImpl (com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl)23 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)23 BlockAllocator (com.amazonaws.athena.connector.lambda.data.BlockAllocator)20 Split (com.amazonaws.athena.connector.lambda.domain.Split)17 ArrowType (org.apache.arrow.vector.types.pojo.ArrowType)17 ArrayList (java.util.ArrayList)15 ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)12 GetTableLayoutResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse)12 GetTableResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableResponse)12 GetTableLayoutRequest (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest)11 Connection (java.sql.Connection)10 HashMap (java.util.HashMap)10 ImmutableMap (com.google.common.collect.ImmutableMap)8