Use of org.apache.hudi.common.model.HoodieFileFormat in project hudi by apache.
From the class HiveSyncTool, method syncSchema.
/**
 * Get the latest schema from the last commit and check if it's in sync with the Hive table schema. If not, evolves the
 * table schema.
 *
 * @param tableName - name of the Hive table being synced
 * @param tableExists - does table exist
 * @param useRealTimeInputFormat - whether to register the real-time (snapshot) input format
 * @param readAsOptimized - whether Spark should read the table as the read-optimized view
 * @param schema - extracted schema
 * @return true if the table was created or its schema was updated
 */
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, boolean readAsOptimized, MessageType schema) {
  // Append spark table properties & serde properties
  Map<String, String> tableProperties = ConfigUtils.toMap(cfg.tableProperties);
  Map<String, String> serdeProperties = ConfigUtils.toMap(cfg.serdeProperties);
  if (cfg.syncAsSparkDataSourceTable) {
    Map<String, String> sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema);
    Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
    tableProperties.putAll(sparkTableProperties);
    serdeProperties.putAll(sparkSerdeProperties);
  }
  boolean schemaChanged = false;
  // Check and sync schema
  if (!tableExists) {
    LOG.info("Hive table " + tableName + " is not found. Creating it");
    HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
    String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
    if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
      // The Parquet input format had an InputFormat class visible under the old (pre-Apache) naming scheme.
      inputFormatClassName = useRealTimeInputFormat
          ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
          : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
    }
    String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
    String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);
    // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
    // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
    // /ql/exec/DDLTask.java#L3488
    hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
    schemaChanged = true;
  } else {
    // Check if the table schema has evolved
    Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
    SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp);
    if (!schemaDiff.isEmpty()) {
      LOG.info("Schema difference found for " + tableName);
      hoodieHiveClient.updateTableDefinition(tableName, schema);
      // Sync the table properties if the schema has changed
      if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) {
        hoodieHiveClient.updateTableProperties(tableName, tableProperties);
        LOG.info("Synced table properties for " + tableName + ": " + tableProperties);
      }
      schemaChanged = true;
    } else {
      LOG.info("No schema difference for " + tableName);
    }
  }
  if (cfg.syncComment) {
    Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields();
    Map<String, String> newComments = avroSchemaWithoutMetadataFields.getFields().stream()
        .collect(Collectors.toMap(Schema.Field::name, field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
    boolean allEmpty = newComments.values().stream().allMatch(StringUtils::isNullOrEmpty);
    if (!allEmpty) {
      List<FieldSchema> hiveSchema = hoodieHiveClient.getTableCommentUsingMetastoreClient(tableName);
      hoodieHiveClient.updateTableComments(tableName, hiveSchema, avroSchemaWithoutMetadataFields.getFields());
    } else {
      LOG.info(String.format("No field comments to add for table %s", tableName));
    }
  }
  return schemaChanged;
}
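For orientation, here is a minimal standalone sketch of the format-to-class resolution that syncSchema delegates to HoodieInputFormatUtils. The calls are the ones used above; the class names in the comments are what PARQUET typically resolves to in recent Hudi releases and should be treated as an assumption, not a guarantee:

// A hedged sketch: resolving Hive I/O classes for a Hudi base file format.
// Assumes hudi-hadoop-mr is on the classpath.
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf("parquet".toUpperCase()); // PARQUET
// realtime=true -> snapshot (merged) view, e.g. HoodieParquetRealtimeInputFormat;
// realtime=false -> read-optimized view, e.g. HoodieParquetInputFormat
String inputFormat = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, true);
String outputFormat = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
String serde = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);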
Use of org.apache.hudi.common.model.HoodieFileFormat in project hudi by apache.
From the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testInlineScheduleCompaction.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testInlineScheduleCompaction(boolean scheduleInlineCompaction) throws Exception {
  HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET;
  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
  HoodieWriteConfig cfg = getConfigBuilder(false)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(1024 * 1024 * 1024)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(2)
          .withPreserveCommitMetadata(true)
          .withScheduleInlineCompaction(scheduleInlineCompaction)
          .build())
      .build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
    Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
    /*
     * Write 2 (updates)
     */
    newCommitTime = "004";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 100);
    updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true);
    // verify whether a compaction was scheduled inline after the second delta commit
    if (scheduleInlineCompaction) {
      assertEquals(1, metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants());
    } else {
      assertEquals(0, metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants());
    }
  }
}
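When scheduleInlineCompaction is false, a compaction can still be scheduled and executed explicitly through the write client, which is exactly what the next test does. A minimal sketch, assuming a SparkRDDWriteClient set up as above (the empty Option lets Hudi pick the compaction instant time):

// A hedged sketch: explicit compaction scheduling on a SparkRDDWriteClient.
Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
if (compactionInstant.isPresent()) {
  // Execute the compaction plan that was just scheduled.
  client.compact(compactionInstant.get());
}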
Use of org.apache.hudi.common.model.HoodieFileFormat in project hudi by apache.
From the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testRepeatedRollbackOfCompaction.
@Test
public void testRepeatedRollbackOfCompaction() throws Exception {
  boolean scheduleInlineCompaction = false;
  HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET;
  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
  HoodieWriteConfig cfg = getConfigBuilder(false)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(1024 * 1024 * 1024)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(2)
          .withPreserveCommitMetadata(true)
          .withScheduleInlineCompaction(scheduleInlineCompaction)
          .build())
      .build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
    Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
    /*
     * Write 2 (updates)
     */
    newCommitTime = "004";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 100);
    updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true);
    Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
    client.compact(compactionInstant.get());
    // trigger compaction again on the same instant, which rolls back the earlier attempt before retrying
    client.compact(compactionInstant.get());
    metaClient.reloadActiveTimeline();
    // record the rollback instant produced by the repeated compaction
    HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
    // delete the completed rollback file from the timeline (substring strips the URI scheme from the base path)
    FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath().substring(metaClient.getBasePath().indexOf(":") + 1), rollbackInstant.getTimestamp());
    metaClient.reloadActiveTimeline();
    SparkRDDWriteClient client1 = getHoodieWriteClient(cfg);
    // trigger compaction again with a fresh client
    client1.compact(compactionInstant.get());
    metaClient.reloadActiveTimeline();
    // verify that no new rollback instant was generated; the existing rollback was reused
    HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
    assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp());
  }
}
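Finally, a minimal sketch of the HoodieFileFormat enum usage that all of the examples above share. The extension value in the comment is an assumption based on the PARQUET constant, not verified against a specific Hudi version:

// A hedged sketch: round-tripping a config string through HoodieFileFormat,
// as HiveSyncTool does with cfg.baseFileFormat above.
HoodieFileFormat fmt = HoodieFileFormat.valueOf("parquet".toUpperCase()); // PARQUET
String extension = fmt.getFileExtension(); // expected ".parquet"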