Search in sources:

Example 1 with SchemaDifference

Use of org.apache.hudi.hive.SchemaDifference in project hudi by apache.

From the class HiveSchemaUtil, the method getSchemaDifference.

public static SchemaDifference getSchemaDifference(MessageType storageSchema, Map<String, String> tableSchema, List<String> partitionKeys, boolean supportTimestamp) {
    Map<String, String> newTableSchema;
    try {
        newTableSchema = convertParquetSchemaToHiveSchema(storageSchema, supportTimestamp);
    } catch (IOException e) {
        throw new HoodieHiveSyncException("Failed to convert parquet schema to hive schema", e);
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Getting schema difference for " + tableSchema + "\r\n\r\n" + newTableSchema);
    }
    SchemaDifference.Builder schemaDiffBuilder = SchemaDifference.newBuilder(storageSchema, tableSchema);
    Set<String> tableColumns = new HashSet<>();
    for (Map.Entry<String, String> field : tableSchema.entrySet()) {
        String fieldName = field.getKey().toLowerCase();
        String tickSurroundedFieldName = tickSurround(fieldName);
        if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName) && !partitionKeys.contains(fieldName)) {
            schemaDiffBuilder.deleteTableColumn(fieldName);
        } else {
            // check type
            String tableColumnType = field.getValue();
            if (!isFieldExistsInSchema(newTableSchema, tickSurroundedFieldName)) {
                if (partitionKeys.contains(fieldName)) {
                    // Partition key does not have to be part of the storage schema
                    continue;
                }
                // We will log this and continue. Hive schema is a superset of all parquet schemas
                LOG.warn("Ignoring table column " + fieldName + " as its not present in the parquet schema");
                continue;
            }
            tableColumnType = tableColumnType.replaceAll("\\s+", "");
            String expectedType = getExpectedType(newTableSchema, tickSurroundedFieldName);
            expectedType = expectedType.replaceAll("\\s+", "");
            expectedType = expectedType.replaceAll("`", "");
            if (!tableColumnType.equalsIgnoreCase(expectedType)) {
                // rules
                if (!isSchemaTypeUpdateAllowed(tableColumnType, expectedType)) {
                    throw new HoodieHiveSyncException("Could not convert field Type from " + tableColumnType + " to " + expectedType + " for field " + fieldName);
                }
                schemaDiffBuilder.updateTableColumn(fieldName, getExpectedType(newTableSchema, tickSurroundedFieldName));
            }
        }
        tableColumns.add(tickSurroundedFieldName);
    }
    for (Map.Entry<String, String> entry : newTableSchema.entrySet()) {
        if (!tableColumns.contains(entry.getKey().toLowerCase())) {
            schemaDiffBuilder.addTableColumn(entry.getKey(), entry.getValue());
        }
    }
    if (LOG.isDebugEnabled()) {
        LOG.debug("Difference between schemas: " + schemaDiffBuilder.build().toString());
    }
    return schemaDiffBuilder.build();
}
Also used: IOException (java.io.IOException), HoodieHiveSyncException (org.apache.hudi.hive.HoodieHiveSyncException), SchemaDifference (org.apache.hudi.hive.SchemaDifference), LinkedHashMap (java.util.LinkedHashMap), Map (java.util.Map), HashSet (java.util.HashSet)
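For context, here is a minimal sketch of how this helper could be called from a sync path. The table-column map, the partition-key list, and the import path for HiveSchemaUtil (assumed here to be org.apache.hudi.hive.util) are illustrative assumptions; only getSchemaDifference, isEmpty() and getAddColumnTypes(), which appear in the examples on this page, are taken from the real API.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.Map;

import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil; // assumed package for HiveSchemaUtil
import org.apache.parquet.schema.MessageType;

public class SchemaDiffSketch {

    // Hypothetical caller: compares the latest Parquet (storage) schema against the
    // column map reported by the metastore and prints what would change.
    static void printSchemaChanges(MessageType storageSchema) {
        // Existing table columns as the metastore might report them (illustrative values).
        Map<String, String> tableSchema = new LinkedHashMap<>();
        tableSchema.put("_hoodie_commit_time", "string");
        tableSchema.put("id", "bigint");

        SchemaDifference diff = HiveSchemaUtil.getSchemaDifference(
            storageSchema, tableSchema, Arrays.asList("partition_path"), /* supportTimestamp */ false);

        if (diff.isEmpty()) {
            System.out.println("Table is already in sync with the storage schema");
            return;
        }
        // Columns present in the Parquet schema but missing from the table.
        diff.getAddColumnTypes().forEach((name, type) ->
            System.out.println("ADD COLUMN " + name + " " + type));
    }
}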

Example 2 with SchemaDifference

Use of org.apache.hudi.hive.SchemaDifference in project hudi by apache.

From the class TestHiveSchemaUtil, the method testSchemaDiffForTimestampMicros.

@Test
public void testSchemaDiffForTimestampMicros() {
    MessageType schema = Types.buildMessage().optional(PrimitiveType.PrimitiveTypeName.INT64).as(OriginalType.TIMESTAMP_MICROS).named("my_element").named("my_timestamp");
    // verify backward compatibility - int64 converted to bigint type
    SchemaDifference schemaDifference = HiveSchemaUtil.getSchemaDifference(schema, Collections.emptyMap(), Collections.emptyList(), false);
    assertEquals("bigint", schemaDifference.getAddColumnTypes().get("`my_element`"));
    schemaDifference = HiveSchemaUtil.getSchemaDifference(schema, schemaDifference.getAddColumnTypes(), Collections.emptyList(), false);
    assertTrue(schemaDifference.isEmpty());
    // verify schema difference is calculated correctly when supportTimestamp is enabled
    schemaDifference = HiveSchemaUtil.getSchemaDifference(schema, Collections.emptyMap(), Collections.emptyList(), true);
    assertEquals("TIMESTAMP", schemaDifference.getAddColumnTypes().get("`my_element`"));
    schemaDifference = HiveSchemaUtil.getSchemaDifference(schema, schemaDifference.getAddColumnTypes(), Collections.emptyList(), true);
    assertTrue(schemaDifference.isEmpty());
}
Also used: SchemaDifference (org.apache.hudi.hive.SchemaDifference), MessageType (org.apache.parquet.schema.MessageType), Test (org.junit.jupiter.api.Test)
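Following the same round-trip pattern, here is a hedged sketch of an additional check one might write. The binary/UTF8-to-string mapping asserted below is an assumption about the Parquet-to-Hive conversion, not something taken from the test above, and the HiveSchemaUtil import path is likewise assumed.

import java.util.Collections;

import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil; // assumed package for HiveSchemaUtil
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType;
import org.apache.parquet.schema.Types;
import org.junit.jupiter.api.Test;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertTrue;

public class TestSchemaDiffSketch {

    @Test
    public void testSchemaDiffForStringField() {
        // A single optional UTF8 binary column in the Parquet (storage) schema.
        MessageType schema = Types.buildMessage()
            .optional(PrimitiveType.PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("my_name")
            .named("my_record");

        // Against an empty table schema the column should surface as an addition;
        // "string" is the Hive type the conversion is assumed to produce for UTF8 binary.
        SchemaDifference diff = HiveSchemaUtil.getSchemaDifference(
            schema, Collections.emptyMap(), Collections.emptyList(), false);
        assertEquals("string", diff.getAddColumnTypes().get("`my_name`"));

        // Feeding the added columns back in should leave nothing to change.
        diff = HiveSchemaUtil.getSchemaDifference(
            schema, diff.getAddColumnTypes(), Collections.emptyList(), false);
        assertTrue(diff.isEmpty());
    }
}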

Example 3 with SchemaDifference

Use of org.apache.hudi.hive.SchemaDifference in project hudi by apache.

From the class DLASyncTool, the method syncSchema.

/**
 * Get the latest schema from the last commit and check if it is in sync with the DLA table schema. If not, evolve
 * the table schema.
 *
 * @param tableName - name of the DLA table to sync
 * @param tableExists - whether the table already exists
 * @param useRealTimeInputFormat - whether to use the real-time (merge-on-read) input format
 * @param schema - extracted storage schema
 */
private void syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, MessageType schema) {
    // Check and sync schema
    if (!tableExists) {
        LOG.info("DLA table " + tableName + " is not found. Creating it");
        String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(HoodieFileFormat.PARQUET, useRealTimeInputFormat);
        // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
        // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
        // /ql/exec/DDLTask.java#L3488
        hoodieDLAClient.createTable(tableName, schema, inputFormatClassName, MapredParquetOutputFormat.class.getName(), ParquetHiveSerDe.class.getName(), new HashMap<>(), new HashMap<>());
    } else {
        // Check if the table schema has evolved
        Map<String, String> tableSchema = hoodieDLAClient.getTableSchema(tableName);
        SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp);
        if (!schemaDiff.isEmpty()) {
            LOG.info("Schema difference found for " + tableName);
            hoodieDLAClient.updateTableDefinition(tableName, schemaDiff);
        } else {
            LOG.info("No Schema difference for " + tableName);
        }
    }
}
Also used: MapredParquetOutputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat), ParquetHiveSerDe (org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe), SchemaDifference (org.apache.hudi.hive.SchemaDifference)
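The create-or-evolve decision above is not DLA-specific, so here is a hedged, tool-agnostic sketch of the same flow. SyncClient and its methods are hypothetical stand-ins for clients like hoodieDLAClient, not an actual hudi interface; only HiveSchemaUtil.getSchemaDifference and SchemaDifference.isEmpty() come from the code shown on this page, and the HiveSchemaUtil import path is assumed.

import java.util.List;
import java.util.Map;

import org.apache.hudi.hive.SchemaDifference;
import org.apache.hudi.hive.util.HiveSchemaUtil; // assumed package for HiveSchemaUtil
import org.apache.parquet.schema.MessageType;

// Hypothetical client abstraction; each real sync tool (Hive, DLA) has its own client class.
interface SyncClient {
    boolean tableExists(String tableName);
    void createTable(String tableName, MessageType storageSchema);
    Map<String, String> getTableSchema(String tableName);
    void updateTableDefinition(String tableName, SchemaDifference schemaDiff);
}

public class SchemaSyncSketch {

    static void syncSchema(SyncClient client, String tableName, MessageType storageSchema,
                           List<String> partitionFields, boolean supportTimestamp) {
        if (!client.tableExists(tableName)) {
            // No table yet: create it straight from the storage schema.
            client.createTable(tableName, storageSchema);
            return;
        }
        // Table exists: compute the difference and evolve only when something changed.
        Map<String, String> tableSchema = client.getTableSchema(tableName);
        SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(
            storageSchema, tableSchema, partitionFields, supportTimestamp);
        if (!schemaDiff.isEmpty()) {
            client.updateTableDefinition(tableName, schemaDiff);
        }
    }
}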

Aggregations

SchemaDifference (org.apache.hudi.hive.SchemaDifference) 3
IOException (java.io.IOException) 1
HashSet (java.util.HashSet) 1
LinkedHashMap (java.util.LinkedHashMap) 1
Map (java.util.Map) 1
MapredParquetOutputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat) 1
ParquetHiveSerDe (org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe) 1
HoodieHiveSyncException (org.apache.hudi.hive.HoodieHiveSyncException) 1
MessageType (org.apache.parquet.schema.MessageType) 1
Test (org.junit.jupiter.api.Test) 1