Search in sources :

Example 26 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

From the class ImpalaMetadataHandler, method getSchema.

/**
 * Builds the table schema for an Impala table by converting Impala data types
 * to Apache Arrow data types. Columns whose type cannot be mapped (or is not
 * supported by the framework) fall back to VARCHAR.
 *
 * @param jdbcConnection  A JDBC Impala database connection
 * @param tableName       Holds table name and schema name. See {@link TableName}
 * @param partitionSchema A partition schema for a given table. See {@link Schema}
 * @return Schema holding the table schema along with the partition schema. See {@link Schema}
 * @throws SQLException thrown for database connection failures, query syntax errors and so on.
 */
private Schema getSchema(Connection jdbcConnection, TableName tableName, Schema partitionSchema) throws SQLException {
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    try (ResultSet resultSet = getColumns(jdbcConnection.getCatalog(), tableName, jdbcConnection.getMetaData());
        Connection connection = getJdbcConnectionFactory().getConnection(getCredentialProvider())) {
        try (PreparedStatement psmt = connection.prepareStatement(GET_METADATA_QUERY + tableName.getTableName().toUpperCase())) {
            // Maps column name -> Impala-reported data type string for this table.
            Map<String, String> metadataMap = getMetadataForGivenTable(psmt);
            while (resultSet.next()) {
                ArrowType columnType = JdbcArrowTypeConverter.toArrowType(resultSet.getInt("DATA_TYPE"), resultSet.getInt("COLUMN_SIZE"), resultSet.getInt("DECIMAL_DIGITS"));
                String columnName = resultSet.getString(ImpalaConstants.COLUMN_NAME);
                String dataType = metadataMap.get(columnName);
                LOGGER.debug("columnName:" + columnName);
                LOGGER.debug("dataType:" + dataType);
                // Normalize case once instead of calling toUpperCase() in every branch.
                // The match keywords below are mutually exclusive for Impala types,
                // so an else-if chain is equivalent to the independent checks it replaces.
                String upperDataType = (dataType == null) ? null : dataType.toUpperCase();
                if (upperDataType != null) {
                    if (upperDataType.contains("DATE")) {
                        // Converting date data type into DATEDAY MinorType.
                        columnType = Types.MinorType.DATEDAY.getType();
                    }
                    else if (upperDataType.contains("BINARY")) {
                        // Converting binary data type into VARBINARY MinorType.
                        columnType = Types.MinorType.VARBINARY.getType();
                    }
                    else if (upperDataType.contains("DOUBLE")) {
                        // Converting double data type into FLOAT8 MinorType.
                        columnType = Types.MinorType.FLOAT8.getType();
                    }
                    else if (upperDataType.contains("BOOLEAN")) {
                        // Converting boolean data type into BIT MinorType.
                        columnType = Types.MinorType.BIT.getType();
                    }
                    else if (upperDataType.contains("FLOAT")) {
                        // Converting float data type into FLOAT4 MinorType.
                        columnType = Types.MinorType.FLOAT4.getType();
                    }
                    else if (upperDataType.contains("TIMESTAMP")) {
                        // Converting TIMESTAMP data type into DATEMILLI MinorType.
                        columnType = Types.MinorType.DATEMILLI.getType();
                    }
                }
                // Fall back to VARCHAR for unmapped or unsupported Arrow types.
                // (Merged the former separate null check and supported check: after the
                // null fallback, the non-null guard on the second check was redundant.)
                if (columnType == null || !SupportedTypes.isSupported(columnType)) {
                    columnType = Types.MinorType.VARCHAR.getType();
                }
                schemaBuilder.addField(FieldBuilder.newBuilder(columnName, columnType).build());
            }
        }
        // Append partition columns after the data columns.
        partitionSchema.getFields().forEach(schemaBuilder::addField);
        return schemaBuilder.build();
    }
}
Also used : SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) ResultSet(java.sql.ResultSet) Connection(java.sql.Connection) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) PreparedStatement(java.sql.PreparedStatement)

Example 27 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

From the class HiveMetadataHandler, method getSchema.

/**
 * Builds the table schema for a Hive table by converting Hive data types to
 * Apache Arrow data types. Columns whose type cannot be mapped (or is not
 * supported by the framework) fall back to VARCHAR.
 *
 * @param jdbcConnection  A JDBC Hive database connection
 * @param tableName       Holds table name and schema name. See {@link TableName}
 * @param partitionSchema A partition schema for a given table. See {@link Schema}
 * @return Schema holding the table schema along with the partition schema. See {@link Schema}
 * @throws SQLException thrown for database connection failures, query syntax errors and so on.
 */
private Schema getSchema(Connection jdbcConnection, TableName tableName, Schema partitionSchema) throws SQLException {
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    try (ResultSet resultSet = getColumns(jdbcConnection.getCatalog(), tableName, jdbcConnection.getMetaData());
        Connection connection = getJdbcConnectionFactory().getConnection(getCredentialProvider())) {
        try (PreparedStatement psmt = connection.prepareStatement(GET_METADATA_QUERY + tableName.getTableName().toUpperCase())) {
            // Maps column name -> Hive-reported data type string for this table.
            Map<String, String> meteHashMap = getMetadataForGivenTable(psmt);
            while (resultSet.next()) {
                ArrowType columnType = JdbcArrowTypeConverter.toArrowType(resultSet.getInt("DATA_TYPE"), resultSet.getInt("COLUMN_SIZE"), resultSet.getInt("DECIMAL_DIGITS"));
                String columnName = resultSet.getString(HiveConstants.COLUMN_NAME);
                String dataType = meteHashMap.get(columnName);
                LOGGER.debug("columnName:" + columnName);
                LOGGER.debug("dataType:" + dataType);
                // Normalize case once instead of calling toUpperCase() in every branch.
                // This also fixes the FLOAT branch, which previously matched
                // case-sensitively (dataType.contains("FLOAT")) and therefore missed
                // lowercase "float" column types, unlike every other branch.
                // The match keywords below are mutually exclusive for Hive types,
                // so an else-if chain is equivalent to the independent checks it replaces.
                String upperDataType = (dataType == null) ? null : dataType.toUpperCase();
                if (upperDataType != null) {
                    if (upperDataType.contains("DATE")) {
                        // Converting date data type into DATEDAY MinorType.
                        columnType = Types.MinorType.DATEDAY.getType();
                    }
                    else if (upperDataType.contains("BINARY")) {
                        // Converting binary data type into VARBINARY MinorType.
                        columnType = Types.MinorType.VARBINARY.getType();
                    }
                    else if (upperDataType.contains("DOUBLE")) {
                        // Converting double data type into FLOAT8 MinorType.
                        columnType = Types.MinorType.FLOAT8.getType();
                    }
                    else if (upperDataType.contains("BOOLEAN")) {
                        // Converting boolean data type into BIT MinorType.
                        columnType = Types.MinorType.BIT.getType();
                    }
                    else if (upperDataType.contains("FLOAT")) {
                        // Converting float data type into FLOAT4 MinorType.
                        columnType = Types.MinorType.FLOAT4.getType();
                    }
                    else if (upperDataType.contains("TIMESTAMP")) {
                        // Converting TIMESTAMP data type into DATEMILLI MinorType.
                        columnType = Types.MinorType.DATEMILLI.getType();
                    }
                }
                // Fall back to VARCHAR for unmapped or unsupported Arrow types.
                if (columnType == null || !SupportedTypes.isSupported(columnType)) {
                    columnType = Types.MinorType.VARCHAR.getType();
                }
                schemaBuilder.addField(FieldBuilder.newBuilder(columnName, columnType).build());
            }
        }
        // Append partition columns after the data columns.
        partitionSchema.getFields().forEach(schemaBuilder::addField);
        return schemaBuilder.build();
    }
}
Also used : SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) ResultSet(java.sql.ResultSet) Connection(java.sql.Connection) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) PreparedStatement(java.sql.PreparedStatement)

Example 28 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

From the class DataLakeGen2MetadataHandler, method getSchema.

/**
 * Builds the table schema by fetching the SQL Server data types of each column
 * and converting them to Apache Arrow types. Columns whose type cannot be mapped
 * (or is not supported by the framework) fall back to VARCHAR.
 *
 * @param jdbcConnection  A JDBC database connection
 * @param tableName       Holds table name and schema name. See {@link TableName}
 * @param partitionSchema A partition schema for a given table. See {@link Schema}
 * @return Schema holding the table schema along with the partition schema. See {@link Schema}
 * @throws SQLException thrown for database connection failures, query syntax errors and so on.
 */
private Schema getSchema(Connection jdbcConnection, TableName tableName, Schema partitionSchema) throws SQLException {
    LOGGER.info("Inside getSchema");
    String dataTypeQuery = "SELECT C.NAME AS COLUMN_NAME, TYPE_NAME(C.USER_TYPE_ID) AS DATA_TYPE " + "FROM SYS.COLUMNS C " + "JOIN SYS.TYPES T " + "ON C.USER_TYPE_ID=T.USER_TYPE_ID " + "WHERE C.OBJECT_ID=OBJECT_ID(?)";
    String dataType;
    String columnName;
    HashMap<String, String> hashMap = new HashMap<>();
    boolean found = false;
    SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
    try (ResultSet resultSet = getColumns(jdbcConnection.getCatalog(), tableName, jdbcConnection.getMetaData());
        Connection connection = getJdbcConnectionFactory().getConnection(getCredentialProvider());
        PreparedStatement stmt = connection.prepareStatement(dataTypeQuery)) {
        // fetch data types of columns and prepare map with column name and datatype.
        stmt.setString(1, tableName.getSchemaName().toUpperCase() + "." + tableName.getTableName().toUpperCase());
        try (ResultSet dataTypeResultSet = stmt.executeQuery()) {
            while (dataTypeResultSet.next()) {
                dataType = dataTypeResultSet.getString("DATA_TYPE");
                columnName = dataTypeResultSet.getString("COLUMN_NAME");
                hashMap.put(columnName.trim(), dataType.trim());
            }
        }
        while (resultSet.next()) {
            ArrowType columnType = JdbcArrowTypeConverter.toArrowType(resultSet.getInt("DATA_TYPE"), resultSet.getInt("COLUMN_SIZE"), resultSet.getInt("DECIMAL_DIGITS"));
            columnName = resultSet.getString("COLUMN_NAME");
            dataType = hashMap.get(columnName);
            LOGGER.debug("columnName: " + columnName);
            LOGGER.debug("dataType: " + dataType);
            /**
             * Converting date data type into DATEDAY since framework is unable to do it by default
             */
            if ("date".equalsIgnoreCase(dataType)) {
                columnType = Types.MinorType.DATEDAY.getType();
            }
            /**
             * Converting bit data type into TINYINT because BIT type is showing 0 as false and 1 as true.
             * we can avoid it by changing to TINYINT.
             */
            if ("bit".equalsIgnoreCase(dataType)) {
                columnType = Types.MinorType.TINYINT.getType();
            }
            /**
             * Converting tinyint data type into SMALLINT.
             * TINYINT range is 0 to 255 in SQL Server, usage of TINYINT(ArrowType) leads to data loss as its using 1 bit as signed flag.
             */
            if ("tinyint".equalsIgnoreCase(dataType)) {
                columnType = Types.MinorType.SMALLINT.getType();
            }
            /**
             * Converting numeric, smallmoney data types into FLOAT8 to avoid data loss
             * (ex: 123.45 is shown as 123 (loosing its scale))
             */
            if ("numeric".equalsIgnoreCase(dataType) || "smallmoney".equalsIgnoreCase(dataType)) {
                columnType = Types.MinorType.FLOAT8.getType();
            }
            /**
             * Converting time data type(s) into DATEMILLI since framework is unable to map it by default
             */
            if ("datetime".equalsIgnoreCase(dataType) || "datetime2".equalsIgnoreCase(dataType) || "smalldatetime".equalsIgnoreCase(dataType) || "datetimeoffset".equalsIgnoreCase(dataType)) {
                columnType = Types.MinorType.DATEMILLI.getType();
            }
            /**
             * Converting into VARCHAR for unmapped or non-supported data types.
             * After this fallback the column type is always a supported type,
             * so the former re-check (and its unreachable error branch) was removed.
             */
            if (columnType == null || !SupportedTypes.isSupported(columnType)) {
                columnType = Types.MinorType.VARCHAR.getType();
            }
            LOGGER.debug("columnType: " + columnType);
            schemaBuilder.addField(FieldBuilder.newBuilder(columnName, columnType).build());
            // Track that at least one column row was returned for this table.
            found = true;
        }
        if (!found) {
            throw new RuntimeException("Could not find table in " + tableName.getSchemaName());
        }
        // Append partition columns after the data columns.
        partitionSchema.getFields().forEach(schemaBuilder::addField);
        return schemaBuilder.build();
    }
}
Also used : HashMap(java.util.HashMap) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) ResultSet(java.sql.ResultSet) Connection(java.sql.Connection) ArrowType(org.apache.arrow.vector.types.pojo.ArrowType) PreparedStatement(java.sql.PreparedStatement)

Example 29 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

From the class SchemaUtils, method inferSchema.

/**
 * This method will produce an Apache Arrow Schema for the given TableName and DocumentDB connection
 * by scanning up to the requested number of rows and using basic schema inference to determine
 * data types.
 *
 * @param client The DocumentDB connection to use for the scan operation.
 * @param table The DocumentDB TableName for which to produce an Apache Arrow Schema.
 * @param numObjToSample The number of records to scan as part of producing the Schema.
 * @return An Apache Arrow Schema representing the schema of the DocumentDB collection.
 * @note The resulting schema is a union of the schema of every row that is scanned. Presently the code does not
 * attempt to resolve conflicts if unique field has different types across documents. It is recommend that you
 * use AWS Glue to define a schema for tables which may have such conflicts. In the future we may enhance this method
 * to use a reasonable default (like String) and coerce heterogeneous fields to avoid query failure but forcing
 * explicit handling by defining Schema in AWS Glue is likely a better approach.
 */
public static Schema inferSchema(MongoClient client, TableName table, int numObjToSample) {
    // The TableName schema name is the Mongo database; the table name is the collection.
    MongoDatabase db = client.getDatabase(table.getSchemaName());
    int docCount = 0;
    int fieldCount = 0;
    // batchSize and limit are both capped at numObjToSample so we never pull more
    // documents than requested for sampling.
    try (MongoCursor<Document> docs = db.getCollection(table.getTableName()).find().batchSize(numObjToSample).limit(numObjToSample).iterator()) {
        if (!docs.hasNext()) {
            // Empty collection: return an empty schema rather than failing.
            return SchemaBuilder.newBuilder().build();
        }
        SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder();
        while (docs.hasNext()) {
            docCount++;
            Document doc = docs.next();
            for (String key : doc.keySet()) {
                fieldCount++;
                // Infer an Arrow field from this document's value for the key.
                Field newField = getArrowField(key, doc.get(key));
                Types.MinorType newType = Types.getMinorTypeForArrowType(newField.getType());
                // Field previously inferred for this key from earlier documents, if any.
                Field curField = schemaBuilder.getField(key);
                Types.MinorType curType = (curField != null) ? Types.getMinorTypeForArrowType(curField.getType()) : null;
                if (curField == null) {
                    // First time we see this key: record the inferred field.
                    schemaBuilder.addField(newField);
                } else if (newType != curType) {
                    // TODO: currently we resolve fields with mixed types by defaulting to VARCHAR. This is _not_ ideal
                    logger.warn("inferSchema: Encountered a mixed-type field[{}] {} vs {}, defaulting to String.", key, curType, newType);
                    schemaBuilder.addStringField(key);
                } else if (curType == Types.MinorType.LIST) {
                    // Same type on both sides and it's a LIST: merge element types.
                    // NOTE(review): relies on addField replacing an existing field of the
                    // same name in SchemaBuilder — confirm against SchemaBuilder's contract.
                    schemaBuilder.addField(mergeListField(key, curField, newField));
                } else if (curType == Types.MinorType.STRUCT) {
                    // Same type on both sides and it's a STRUCT: merge child fields.
                    schemaBuilder.addField(mergeStructField(key, curField, newField));
                }
                // Matching scalar types need no action: the existing field already covers them.
            }
        }
        Schema schema = schemaBuilder.build();
        if (schema.getFields().isEmpty()) {
            throw new RuntimeException("No columns found after scanning " + fieldCount + " values across " + docCount + " documents. Please ensure the collection is not empty and contains at least 1 supported column type.");
        }
        return schema;
    } finally {
        logger.info("inferSchema: Evaluated {} field values across {} documents.", fieldCount, docCount);
    }
}
Also used : Field(org.apache.arrow.vector.types.pojo.Field) Types(org.apache.arrow.vector.types.Types) Schema(org.apache.arrow.vector.types.pojo.Schema) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) Document(org.bson.Document) MongoDatabase(com.mongodb.client.MongoDatabase)

Example 30 with SchemaBuilder

use of com.amazonaws.athena.connector.lambda.data.SchemaBuilder in project aws-athena-query-federation by awslabs.

From the class DataLakeGen2MetadataHandlerTest, method doGetTable.

@Test
public void doGetTable() throws Exception {
    // Allocator used to build the request/response blocks for this test.
    BlockAllocator allocator = new BlockAllocatorImpl();
    // Mocked JDBC column-metadata result set: one row per table column.
    String[] metadataColumns = { "DATA_TYPE", "COLUMN_SIZE", "COLUMN_NAME", "DECIMAL_DIGITS", "NUM_PREC_RADIX" };
    Object[][] metadataRows = { { Types.INTEGER, 12, "testCol1", 0, 0 }, { Types.VARCHAR, 25, "testCol2", 0, 0 }, { Types.TIMESTAMP, 93, "testCol3", 0, 0 }, { Types.TIMESTAMP_WITH_TIMEZONE, 93, "testCol4", 0, 0 } };
    AtomicInteger cursor = new AtomicInteger(-1);
    ResultSet columnResultSet = mockResultSet(metadataColumns, metadataRows, cursor);
    // Build the schema we expect the handler to infer from the mocked metadata,
    // including the partition columns appended at the end.
    SchemaBuilder builder = SchemaBuilder.newBuilder();
    builder.addField(FieldBuilder.newBuilder("testCol1", org.apache.arrow.vector.types.Types.MinorType.INT.getType()).build());
    builder.addField(FieldBuilder.newBuilder("testCol2", org.apache.arrow.vector.types.Types.MinorType.VARCHAR.getType()).build());
    builder.addField(FieldBuilder.newBuilder("testCol3", org.apache.arrow.vector.types.Types.MinorType.DATEMILLI.getType()).build());
    builder.addField(FieldBuilder.newBuilder("testCol4", org.apache.arrow.vector.types.Types.MinorType.VARCHAR.getType()).build());
    PARTITION_SCHEMA.getFields().forEach(builder::addField);
    Schema expectedSchema = builder.build();
    TableName requestedTable = new TableName("TESTSCHEMA", "TESTTABLE");
    // Stub the JDBC connection so the handler sees our mocked catalog and columns.
    Mockito.when(connection.getCatalog()).thenReturn("testCatalog");
    Mockito.when(connection.getMetaData().getColumns("testCatalog", requestedTable.getSchemaName(), requestedTable.getTableName(), null)).thenReturn(columnResultSet);
    GetTableResponse response = this.dataLakeGen2MetadataHandler.doGetTable(allocator, new GetTableRequest(this.federatedIdentity, "testQueryId", "testCatalog", requestedTable));
    // The response must carry the inferred schema, the requested table, and the catalog.
    Assert.assertEquals(expectedSchema, response.getSchema());
    Assert.assertEquals(requestedTable, response.getTableName());
    Assert.assertEquals("testCatalog", response.getCatalogName());
}
Also used : Schema(org.apache.arrow.vector.types.pojo.Schema) TableName(com.amazonaws.athena.connector.lambda.domain.TableName) GetTableRequest(com.amazonaws.athena.connector.lambda.metadata.GetTableRequest) BlockAllocatorImpl(com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) GetTableResponse(com.amazonaws.athena.connector.lambda.metadata.GetTableResponse) BlockAllocator(com.amazonaws.athena.connector.lambda.data.BlockAllocator) ResultSet(java.sql.ResultSet) SchemaBuilder(com.amazonaws.athena.connector.lambda.data.SchemaBuilder) Test(org.junit.Test)

Aggregations

SchemaBuilder (com.amazonaws.athena.connector.lambda.data.SchemaBuilder)68 Schema (org.apache.arrow.vector.types.pojo.Schema)48 TableName (com.amazonaws.athena.connector.lambda.domain.TableName)43 Test (org.junit.Test)43 PreparedStatement (java.sql.PreparedStatement)37 ResultSet (java.sql.ResultSet)35 Constraints (com.amazonaws.athena.connector.lambda.domain.predicate.Constraints)30 BlockAllocatorImpl (com.amazonaws.athena.connector.lambda.data.BlockAllocatorImpl)23 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)23 BlockAllocator (com.amazonaws.athena.connector.lambda.data.BlockAllocator)20 Split (com.amazonaws.athena.connector.lambda.domain.Split)17 ArrowType (org.apache.arrow.vector.types.pojo.ArrowType)17 ArrayList (java.util.ArrayList)15 ValueSet (com.amazonaws.athena.connector.lambda.domain.predicate.ValueSet)12 GetTableLayoutResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutResponse)12 GetTableResponse (com.amazonaws.athena.connector.lambda.metadata.GetTableResponse)12 GetTableLayoutRequest (com.amazonaws.athena.connector.lambda.metadata.GetTableLayoutRequest)11 Connection (java.sql.Connection)10 HashMap (java.util.HashMap)10 ImmutableMap (com.google.common.collect.ImmutableMap)8