Search in sources :

Example 31 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class DataLakeRecordHandlerTest method buildSplitSqlNew.

/**
 * Checks that {@code buildSplitSql} prepares the expected SELECT statement for a four-column table
 * constrained on a single VARCHAR value, and that the constraint value is bound as parameter 1.
 */
@Test
public void buildSplitSqlNew() throws SQLException {
    // Collaborators: a split reporting partition "0" and constraints holding one value set for testCol4.
    Split mockSplit = Mockito.mock(Split.class);
    Mockito.when(mockSplit.getProperty(DataLakeGen2MetadataHandler.PARTITION_NUMBER)).thenReturn("0");

    ValueSet varcharValues = getSingleValueSet("varcharTest");
    ImmutableMap<String, ValueSet> constraintSummary = new ImmutableMap.Builder<String, ValueSet>().put("testCol4", varcharValues).build();
    Constraints mockConstraints = Mockito.mock(Constraints.class);
    Mockito.when(mockConstraints.getSummary()).thenReturn(constraintSummary);

    // Table under test: testSchema.testTable with INT, DATEDAY, DATEMILLI and VARCHAR columns.
    TableName table = new TableName("testSchema", "testTable");
    SchemaBuilder columns = SchemaBuilder.newBuilder();
    columns.addField(FieldBuilder.newBuilder("testCol1", Types.MinorType.INT.getType()).build());
    columns.addField(FieldBuilder.newBuilder("testCol2", Types.MinorType.DATEDAY.getType()).build());
    columns.addField(FieldBuilder.newBuilder("testCol3", Types.MinorType.DATEMILLI.getType()).build());
    columns.addField(FieldBuilder.newBuilder("testCol4", Types.MinorType.VARCHAR.getType()).build());
    Schema tableSchema = columns.build();

    // The mocked connection only hands back the statement when asked for exactly this SQL text.
    String expectedSql = "SELECT `testCol1`, `testCol2`, `testCol3`, `testCol4` FROM `testSchema`.`testTable`  WHERE (`testCol4` = ?)";
    PreparedStatement stubbedStatement = Mockito.mock(PreparedStatement.class);
    Mockito.when(this.connection.prepareStatement(Mockito.eq(expectedSql))).thenReturn(stubbedStatement);

    PreparedStatement actualStatement = this.dataLakeGen2RecordHandler.buildSplitSql(this.connection, "testCatalogName", table, tableSchema, mockConstraints, mockSplit);

    Assert.assertEquals(stubbedStatement, actualStatement);
    Mockito.verify(actualStatement, Mockito.times(1)).setString(1, "varcharTest");
}
Also used : TableName(com.amazonaws.athena.connector.lambda.domain.TableName) Constraints(com.amazonaws.athena.connector.lambda.domain.predicate.Constraints) Schema(org.apache.arrow.vector.types.pojo.Schema) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) PreparedStatement(java.sql.PreparedStatement) Split(com.amazonaws.athena.connector.lambda.domain.Split) ValueSet(com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet) ImmutableMap(com.google.common.collect.ImmutableMap) Test(org.junit.Test)

Example 32 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class ExampleMetadataHandler method doGetTable.

/**
 * Returns the definition (field names, types, descriptions, etc...) of a Table.
 *
 * <p>As written, this template logs the request and answers with a schema that has no fields and an
 * empty set of partition columns; the {@code allocator} argument is not used by this body.
 *
 * @param allocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details on who made the request and which Athena catalog, database, and table they are querying.
 * @return A GetTableResponse built from, in order:
 * 1. The catalog name taken from the request.
 * 2. The TableName taken from the request.
 * 3. An Apache Arrow Schema object describing the table's columns, types, and descriptions.
 * 4. A Set<String> of partition column names (or empty if the table isn't partitioned).
 */
@Override
public GetTableResponse doGetTable(BlockAllocator allocator, GetTableRequest request) {
    logger.info("doGetTable: enter - " + request);

    // TODO: Add partitions columns, example below.
    //
    //         partitionColNames.add("year");
    //         partitionColNames.add("month");
    //         partitionColNames.add("day");
    Set<String> partitionColNames = new HashSet<>();

    // No fields are added to the builder here, so the schema handed back is whatever an untouched builder produces.
    return new GetTableResponse(
            request.getCatalogName(),
            request.getTableName(),
            SchemaBuilder.newBuilder().build(),
            partitionColNames);
}
Also used : GetTableResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableResponse) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) HashSet(java.util.HashSet)

Example 33 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class DDBTableUtils method peekTableForSchema.

/**
 * Builds an Arrow {@link Schema} for a table from a limited scan (at most
 * {@code SCHEMA_INFERENCE_NUM_RECORDS} items), mapping each attribute seen in the scanned items to an
 * Arrow field. When the scan returns no items, the schema is built from the attribute definitions
 * reported by the table's metadata instead.
 *
 * @param tableName the table to derive a schema for
 * @param invoker the ThrottlingInvoker to call DDB with
 * @param ddbClient the DDB client to use
 * @return the table's derived schema
 * @throws TimeoutException declared by this method; presumably raised by the invoker - confirm against ThrottlingInvoker
 */
public static Schema peekTableForSchema(String tableName, ThrottlingInvoker invoker, AmazonDynamoDB ddbClient) throws TimeoutException {
    ScanRequest sampleScan = new ScanRequest().withTableName(tableName).withLimit(SCHEMA_INFERENCE_NUM_RECORDS);
    ScanResult sample = invoker.invoke(() -> ddbClient.scan(sampleScan));
    List<Map<String, AttributeValue>> sampledItems = sample.getItems();
    SchemaBuilder builder = new SchemaBuilder();

    if (sampledItems.isEmpty()) {
        // there's no items, so use any attributes defined in the table metadata
        DynamoDBTable tableMetadata = getTable(tableName, invoker, ddbClient);
        for (AttributeDefinition definition : tableMetadata.getKnownAttributeDefinitions()) {
            builder.addField(DDBTypeUtils.getArrowFieldFromDDBType(definition.getAttributeName(), definition.getAttributeType()));
        }
        return builder.build();
    }

    // Names of attributes that already have a field in the schema; each attribute is added at most once.
    Set<String> mappedAttributes = new HashSet<>();
    for (Map<String, AttributeValue> sampledItem : sampledItems) {
        for (Map.Entry<String, AttributeValue> attribute : sampledItem.entrySet()) {
            String attributeName = attribute.getKey();
            if (mappedAttributes.contains(attributeName)) {
                continue;
            }
            Field inferredField = DDBTypeUtils.inferArrowField(attributeName, ItemUtils.toSimpleValue(attribute.getValue()));
            if (inferredField == null) {
                // Not marked as mapped, so the same attribute on a later item gets another inference attempt.
                continue;
            }
            builder.addField(inferredField);
            mappedAttributes.add(attributeName);
        }
    }
    return builder.build();
}
Also used : ScanResult(com.amazonaws.services.dynamodbv2.model.ScanResult) AttributeValue(com.amazonaws.services.dynamodbv2.model.AttributeValue) AttributeDefinition(com.amazonaws.services.dynamodbv2.model.AttributeDefinition) ScanRequest(com.amazonaws.services.dynamodbv2.model.ScanRequest) Field(org.apache.arrow.vector.types.pojo.Field) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) Map(java.util.Map) DynamoDBTable(com.amazonaws.athena.connectors.dynamodb.model.DynamoDBTable) HashSet(java.util.HashSet)

Example 34 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class GlueMetadataHandler method doGetTable.

/**
 * Fetches a Table from AWS Glue for the requested schema (aka database) and table name, optionally
 * checks it against a filter, and converts it into a GetTableResponse.
 *
 * <p>Schema metadata is written in this order: table parameters, per-column comments (keyed by the
 * mapped column name), the datetime format mapping, the source table name, and finally the view text.
 *
 * @param blockAllocator Tool for creating and managing Apache Arrow Blocks.
 * @param request Provides details on who made the request and which Athena catalog, database, and table they are querying.
 * @param filter The TableFilter to apply to the retrieved table before generating the result; may be null, in which case no filtering happens.
 * @return A GetTableResponse mostly containing the columns, their types, and any table properties for the requested table.
 * @note This method throws a RuntimeException if the retrieved table is rejected by the supplied filter.
 */
protected GetTableResponse doGetTable(BlockAllocator blockAllocator, GetTableRequest request, TableFilter filter) throws Exception {
    TableName requestedTable = request.getTableName();

    // Fully qualified because the Glue request type shares its simple name with the Athena GetTableRequest parameter type.
    com.amazonaws.services.glue.model.GetTableRequest glueRequest = new com.amazonaws.services.glue.model.GetTableRequest();
    glueRequest.setCatalogId(getCatalog(request));
    glueRequest.setDatabaseName(requestedTable.getSchemaName());
    glueRequest.setName(requestedTable.getTableName());
    Table table = awsGlue.getTable(glueRequest).getTable();

    boolean rejectedByFilter = filter != null && !filter.filter(table);
    if (rejectedByFilter) {
        throw new RuntimeException("No matching table found " + request.getTableName());
    }

    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();

    // Every Glue table parameter is copied into the schema metadata as-is.
    Map<String, String> tableParameters = table.getParameters();
    if (tableParameters != null) {
        for (Map.Entry<String, String> parameter : tableParameters.entrySet()) {
            schemaBuilder.addMetadata(parameter.getKey(), parameter.getValue());
        }
    }

    // A column name mapping can be provided to get around restrictive Glue naming rules
    Map<String, String> columnNameMapping = getColumnNameMapping(table);
    Map<String, String> dateTimeFormatMapping = getDateTimeFormatMapping(table);
    // Same formats as dateTimeFormatMapping, but keyed by the mapped column name rather than the raw Glue name.
    Map<String, String> formatsByMappedName = new HashMap<>();

    // Partition keys are reported under their mapped names; a table without partition keys gets an empty set.
    final Set<String> partitionCols;
    if (table.getPartitionKeys() == null) {
        partitionCols = new HashSet<>();
    } else {
        partitionCols = table.getPartitionKeys().stream()
                .map(partitionKey -> columnNameMapping.getOrDefault(partitionKey.getName(), partitionKey.getName()))
                .collect(Collectors.toSet());
    }

    for (Column glueColumn : table.getStorageDescriptor().getColumns()) {
        String rawColumnName = glueColumn.getName();
        String mappedColumnName = columnNameMapping.getOrDefault(rawColumnName, rawColumnName);

        logger.info("Column {} with registered type {}", rawColumnName, glueColumn.getType());
        schemaBuilder.addField(convertField(mappedColumnName, glueColumn.getType()));

        // Add non-null non-empty comments to metadata
        String columnComment = glueColumn.getComment();
        if (columnComment != null && !columnComment.trim().isEmpty()) {
            schemaBuilder.addMetadata(mappedColumnName, glueColumn.getComment());
        }

        if (dateTimeFormatMapping.containsKey(rawColumnName)) {
            formatsByMappedName.put(mappedColumnName, dateTimeFormatMapping.get(rawColumnName));
        }
    }

    populateDatetimeFormatMappingIfAvailable(schemaBuilder, formatsByMappedName);
    populateSourceTableNameIfAvailable(table, schemaBuilder);

    // Non-empty view text is recorded under VIEW_METADATA_FIELD.
    String viewText = table.getViewOriginalText();
    if (viewText != null && !viewText.isEmpty()) {
        schemaBuilder.addMetadata(VIEW_METADATA_FIELD, table.getViewOriginalText());
    }

    return new GetTableResponse(request.getCatalogName(), request.getTableName(), schemaBuilder.build(), partitionCols);
}
Also used : Table(com.amazonaws.services.glue.model.Table) HashMap(java.util.HashMap) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) GetTableRequest(com.amazonaws.athena.connector.lambda.metadata.GetTableRequest) Column(com.amazonaws.services.glue.model.Column) GetTableResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableResponse) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) GetTableResult(com.amazonaws.services.glue.model.GetTableResult) HashSet(java.util.HashSet)

Example 35 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

the class HbaseSchemaUtils method scanAndInferSchema.

/**
 * Infers an Apache Arrow Schema by reading every row offered by the given HBase scanner and recording,
 * per column family and qualifier, the type inferred from each value.
 *
 * @param scanner The HBase ResultScanner to read results from while inferring schema.
 * @return An Apache Arrow Schema representing the schema of the HBase table. Fields are named
 * {@code family + NAMESPACE_QUALIFIER + qualifier}.
 * @throws RuntimeException if no field could be inferred from the scanned rows (for example, the
 * scanner produced no values).
 * @note The resulting schema is a union of the schema of every row that is scanned. Any time two rows
 * have a field with the same name but different inferred type the code will default the type of
 * that field in the resulting schema to a VARCHAR. This approach is not perfect and can struggle
 * to produce a usable schema if the table has a significant mix of entities.
 * @note Family and qualifier bytes are decoded as UTF-8 explicitly, so the resulting field names do
 * not depend on the JVM's default charset.
 */
private static Schema scanAndInferSchema(ResultScanner scanner) {
    // family -> (qualifier -> type inferred so far)
    Map<String, Map<String, ArrowType>> schemaInference = new HashMap<>();
    int rowCount = 0;
    int fieldCount = 0;
    for (Result result : scanner) {
        rowCount++;
        for (KeyValue keyValue : result.list()) {
            fieldCount++;
            // Decode with a fixed charset; new String(byte[]) would use the platform default, which varies by JVM/locale.
            String family = new String(keyValue.getFamily(), java.nio.charset.StandardCharsets.UTF_8);
            String column = new String(keyValue.getQualifier(), java.nio.charset.StandardCharsets.UTF_8);
            Map<String, ArrowType> schemaForFamily = schemaInference.computeIfAbsent(family, unused -> new HashMap<>());
            // Get the previously inferred type for this column if we've seen it on a past row
            ArrowType prevInferredType = schemaForFamily.get(column);
            // Infer the type of the column from the value on the current row.
            Types.MinorType inferredType = inferType(keyValue.getValue());
            // Check if the previous and currently inferred types match
            if (prevInferredType != null && Types.getMinorTypeForArrowType(prevInferredType) != inferredType) {
                logger.info("inferSchema: Type changed detected for field, using VARCHAR - family: {} col: {} previousType: {} newType: {}", family, column, prevInferredType, inferredType);
                schemaForFamily.put(column, Types.MinorType.VARCHAR.getType());
            } else {
                schemaForFamily.put(column, inferredType.getType());
            }
            logger.info("inferSchema: family: {} col: {} inferredType: {}", family, column, inferredType);
        }
    }
    logger.info("inferSchema: Evaluated {} field values across {} rows.", fieldCount, rowCount);
    // Use the union of all rows to produce our resultant Apache Arrow Schema.
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    for (Map.Entry<String, Map<String, ArrowType>> nextFamily : schemaInference.entrySet()) {
        String family = nextFamily.getKey();
        for (Map.Entry<String, ArrowType> nextCol : nextFamily.getValue().entrySet()) {
            schemaBuilder.addField(family + NAMESPACE_QUALIFIER + nextCol.getKey(), nextCol.getValue());
        }
    }
    Schema schema = schemaBuilder.build();
    if (schema.getFields().isEmpty()) {
        throw new RuntimeException("No columns found after scanning " + fieldCount + " values across " + rowCount + " rows. Please ensure the table is not empty and contains at least 1 supported column type.");
    }
    return schema;
}
Also used : Types(org.apache.arrow.vector.types.Types) KeyValue(org.apache.hadoop.hbase.KeyValue) HashMap(java.util.HashMap) Schema(org.apache.arrow.vector.types.pojo.Schema) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) Result(org.apache.hadoop.hbase.client.Result) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

SchemaBuilder (com.amazonaws.athena.connector.lambda.data.SchemaBuilder)68 Schema (org.apache.arrow.vector.types.pojo.Schema)48 TableName (com.amazonaws.athena.connector.lambda.domain.TableName)43 Test (org.junit.Test)43 PreparedStatement (java.sql.PreparedStatement)37 ResultSet (java.sql.ResultSet)35 Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints)30 BlockAllocatorImpl (com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl)23 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)23 BlockAllocator (com.amazonaws.athena.connector.lambda.data.BlockAllocator)20 Split (com.amazonaws.athena.connector.lambda.domain.Split)17 ArrowType (org.apache.arrow.vector.types.pojo.ArrowType)17 ArrayList (java.util.ArrayList)15 ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)12 GetTableLayoutResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse)12 GetTableResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableResponse)12 GetTableLayoutRequest (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest)11 Connection (java.sql.Connection)10 HashMap (java.util.HashMap)10 ImmutableMap (com.google.common.collect.ImmutableMap)8