Search in sources :

Example 1 with StringUtils

Usage of org.apache.hudi.common.util.StringUtils in the apache/hudi project.

From the class HiveSyncTool, the method syncSchema.

/**
 * Gets the latest schema from the last commit and checks if it is in sync with the hive table
 * schema. If not, evolves the table schema.
 *
 * @param tableName              name of the hive table being synced
 * @param tableExists            whether the table already exists in the hive metastore
 * @param useRealTimeInputFormat whether to register the real-time input format class
 * @param readAsOptimized        whether spark serde properties should target the read-optimized view
 * @param schema                 parquet schema extracted from the latest commit
 * @return {@code true} if the table was created or its schema definition was updated
 */
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, boolean readAsOptimized, MessageType schema) {
    // Append spark table properties & serde properties
    Map<String, String> tableProperties = ConfigUtils.toMap(cfg.tableProperties);
    Map<String, String> serdeProperties = ConfigUtils.toMap(cfg.serdeProperties);
    if (cfg.syncAsSparkDataSourceTable) {
        // Extra properties let spark recognize the table as a data-source table.
        Map<String, String> sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema);
        Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
        tableProperties.putAll(sparkTableProperties);
        serdeProperties.putAll(sparkSerdeProperties);
    }
    boolean schemaChanged = false;
    // Check and sync schema
    if (!tableExists) {
        LOG.info("Hive table " + tableName + " is not found. Creating it");
        HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
        String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
        if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
            // Parquet input format had an InputFormat class visible under the old naming scheme.
            inputFormatClassName = useRealTimeInputFormat
                ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
                : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
        }
        String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
        String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);
        // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
        // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
        // /ql/exec/DDLTask.java#L3488
        hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
        schemaChanged = true;
    } else {
        // Check if the table schema has evolved
        Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
        SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp);
        if (!schemaDiff.isEmpty()) {
            LOG.info("Schema difference found for " + tableName);
            hoodieHiveClient.updateTableDefinition(tableName, schema);
            // Sync the table properties if the schema has changed
            if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) {
                hoodieHiveClient.updateTableProperties(tableName, tableProperties);
                LOG.info("Sync table properties for " + tableName + ", table properties is: " + tableProperties);
            }
            schemaChanged = true;
        } else {
            LOG.info("No Schema difference for " + tableName);
        }
    }
    if (cfg.syncComment) {
        Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields();
        // Only touch the metastore when at least one field actually carries a doc string.
        // (Previously an intermediate Map of field name -> doc was materialized solely to
        // run this emptiness check over its values; a direct anyMatch avoids that allocation.)
        boolean anyCommentPresent = avroSchemaWithoutMetadataFields.getFields().stream()
            .anyMatch(field -> !StringUtils.isNullOrEmpty(field.doc()));
        if (anyCommentPresent) {
            List<FieldSchema> hiveSchema = hoodieHiveClient.getTableCommentUsingMetastoreClient(tableName);
            hoodieHiveClient.updateTableComments(tableName, hiveSchema, avroSchemaWithoutMetadataFields.getFields());
        } else {
            LOG.info(String.format("No comment %s need to add", tableName));
        }
    }
    return schemaChanged;
}
Also used : HoodieInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieInputFormatUtils) PrimitiveType(org.apache.parquet.schema.PrimitiveType) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) AbstractSyncTool(org.apache.hudi.sync.common.AbstractSyncTool) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) Partition(org.apache.hadoop.hive.metastore.api.Partition) ArrayList(java.util.ArrayList) HiveSchemaUtil(org.apache.hudi.hive.util.HiveSchemaUtil) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) StringUtils(org.apache.hudi.common.util.StringUtils) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) UTF8(org.apache.parquet.schema.OriginalType.UTF8) PartitionEventType(org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent.PartitionEventType) Schema(org.apache.avro.Schema) GroupType(org.apache.parquet.schema.GroupType) BINARY(org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY) JCommander(com.beust.jcommander.JCommander) HiveConf(org.apache.hadoop.hive.conf.HiveConf) Parquet2SparkSchemaUtils(org.apache.hudi.hive.util.Parquet2SparkSchemaUtils) InvalidTableException(org.apache.hudi.exception.InvalidTableException) Collectors(java.util.stream.Collectors) ConfigUtils(org.apache.hudi.hive.util.ConfigUtils) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) MessageType(org.apache.parquet.schema.MessageType) List(java.util.List) PartitionEvent(org.apache.hudi.sync.common.AbstractSyncHoodieClient.PartitionEvent) Type(org.apache.parquet.schema.Type) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Schema(org.apache.avro.Schema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) 
HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) StringUtils(org.apache.hudi.common.util.StringUtils)

Aggregations

JCommander (com.beust.jcommander.JCommander)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 List (java.util.List)1 Map (java.util.Map)1 Collectors (java.util.stream.Collectors)1 Schema (org.apache.avro.Schema)1 Configuration (org.apache.hadoop.conf.Configuration)1 FileSystem (org.apache.hadoop.fs.FileSystem)1 HiveConf (org.apache.hadoop.hive.conf.HiveConf)1 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)1 Partition (org.apache.hadoop.hive.metastore.api.Partition)1 FSUtils (org.apache.hudi.common.fs.FSUtils)1 HoodieFileFormat (org.apache.hudi.common.model.HoodieFileFormat)1 HoodieTableType (org.apache.hudi.common.model.HoodieTableType)1 Option (org.apache.hudi.common.util.Option)1 StringUtils (org.apache.hudi.common.util.StringUtils)1 HoodieException (org.apache.hudi.exception.HoodieException)1 InvalidTableException (org.apache.hudi.exception.InvalidTableException)1 HoodieInputFormatUtils (org.apache.hudi.hadoop.utils.HoodieInputFormatUtils)1