Use of org.apache.hudi.common.model.HoodieFileFormat in project hudi by apache.
From the class HiveSyncTool, method syncSchema.
/**
 * Get the latest schema from the last commit and check if it's in sync with the Hive table schema. If not, evolves the
 * table schema.
 *
 * @param tableName - name of the Hive table being synced
 * @param tableExists - does table exist
 * @param useRealTimeInputFormat - whether to register the real-time (snapshot) input format
 * @param readAsOptimized - whether Spark should read the table as the read-optimized view
 * @param schema - extracted schema
 * @return true if the table was created or its schema was updated
 */
private boolean syncSchema(String tableName, boolean tableExists, boolean useRealTimeInputFormat, boolean readAsOptimized, MessageType schema) {
  // Append spark table properties & serde properties
  Map<String, String> tableProperties = ConfigUtils.toMap(cfg.tableProperties);
  Map<String, String> serdeProperties = ConfigUtils.toMap(cfg.serdeProperties);
  if (cfg.syncAsSparkDataSourceTable) {
    Map<String, String> sparkTableProperties = getSparkTableProperties(cfg.sparkSchemaLengthThreshold, schema);
    Map<String, String> sparkSerdeProperties = getSparkSerdeProperties(readAsOptimized);
    tableProperties.putAll(sparkTableProperties);
    serdeProperties.putAll(sparkSerdeProperties);
  }
  boolean schemaChanged = false;
  // Check and sync schema
  if (!tableExists) {
    LOG.info("Hive table " + tableName + " is not found. Creating it");
    HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf(cfg.baseFileFormat.toUpperCase());
    String inputFormatClassName = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, useRealTimeInputFormat);
    if (baseFileFormat.equals(HoodieFileFormat.PARQUET) && cfg.usePreApacheInputFormat) {
      // The Parquet input format had an InputFormat class visible under the old (pre-Apache) naming scheme.
      inputFormatClassName = useRealTimeInputFormat
          ? com.uber.hoodie.hadoop.realtime.HoodieRealtimeInputFormat.class.getName()
          : com.uber.hoodie.hadoop.HoodieInputFormat.class.getName();
    }
    String outputFormatClassName = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
    String serDeFormatClassName = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);
    // Custom serde will not work with ALTER TABLE REPLACE COLUMNS
    // https://github.com/apache/hive/blob/release-1.1.0/ql/src/java/org/apache/hadoop/hive
    // /ql/exec/DDLTask.java#L3488
    hoodieHiveClient.createTable(tableName, schema, inputFormatClassName, outputFormatClassName, serDeFormatClassName, serdeProperties, tableProperties);
    schemaChanged = true;
  } else {
    // Check if the table schema has evolved
    Map<String, String> tableSchema = hoodieHiveClient.getTableSchema(tableName);
    SchemaDifference schemaDiff = HiveSchemaUtil.getSchemaDifference(schema, tableSchema, cfg.partitionFields, cfg.supportTimestamp);
    if (!schemaDiff.isEmpty()) {
      LOG.info("Schema difference found for " + tableName);
      hoodieHiveClient.updateTableDefinition(tableName, schema);
      // Sync the table properties if the schema has changed
      if (cfg.tableProperties != null || cfg.syncAsSparkDataSourceTable) {
        hoodieHiveClient.updateTableProperties(tableName, tableProperties);
        LOG.info("Synced table properties for " + tableName + ": " + tableProperties);
      }
      schemaChanged = true;
    } else {
      LOG.info("No schema difference for " + tableName);
    }
  }
  if (cfg.syncComment) {
    Schema avroSchemaWithoutMetadataFields = hoodieHiveClient.getAvroSchemaWithoutMetadataFields();
    Map<String, String> newComments = avroSchemaWithoutMetadataFields.getFields().stream()
        .collect(Collectors.toMap(Schema.Field::name, field -> StringUtils.isNullOrEmpty(field.doc()) ? "" : field.doc()));
    boolean allEmpty = newComments.values().stream().allMatch(StringUtils::isNullOrEmpty);
    if (!allEmpty) {
      List<FieldSchema> hiveSchema = hoodieHiveClient.getTableCommentUsingMetastoreClient(tableName);
      hoodieHiveClient.updateTableComments(tableName, hiveSchema, avroSchemaWithoutMetadataFields.getFields());
    } else {
      LOG.info(String.format("No field comments to add for table %s", tableName));
    }
  }
  return schemaChanged;
}
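For orientation, here is a minimal standalone sketch of the format-to-class resolution that syncSchema delegates to HoodieInputFormatUtils. The calls are the ones used above; the class names in the comments are what PARQUET typically resolves to in recent Hudi releases and should be treated as an assumption, not a guarantee:

// A hedged sketch: resolving Hive I/O classes for a Hudi base file format.
// Assumes hudi-hadoop-mr is on the classpath.
HoodieFileFormat baseFileFormat = HoodieFileFormat.valueOf("parquet".toUpperCase()); // PARQUET
// realtime=true -> snapshot (merged) view, e.g. HoodieParquetRealtimeInputFormat;
// realtime=false -> read-optimized view, e.g. HoodieParquetInputFormat
String inputFormat = HoodieInputFormatUtils.getInputFormatClassName(baseFileFormat, true);
String outputFormat = HoodieInputFormatUtils.getOutputFormatClassName(baseFileFormat);
String serde = HoodieInputFormatUtils.getSerDeClassName(baseFileFormat);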
Use of org.apache.hudi.common.model.HoodieFileFormat in project hudi by apache.
From the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testInlineScheduleCompaction.
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testInlineScheduleCompaction(boolean scheduleInlineCompaction) throws Exception {
  HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET;
  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
  HoodieWriteConfig cfg = getConfigBuilder(false)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(1024 * 1024 * 1024)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(2)
          .withPreserveCommitMetadata(true)
          .withScheduleInlineCompaction(scheduleInlineCompaction)
          .build())
      .build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
    Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
    /*
     * Write 2 (updates)
     */
    newCommitTime = "004";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 100);
    updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true);
    // verify whether a compaction was scheduled inline after the second delta commit
    if (scheduleInlineCompaction) {
      assertEquals(1, metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants());
    } else {
      assertEquals(0, metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants());
    }
  }
}
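When scheduleInlineCompaction is false, a compaction can still be scheduled and executed explicitly through the write client, which is exactly what the next test does. A minimal sketch, assuming a SparkRDDWriteClient set up as above (the empty Option lets Hudi pick the compaction instant time):

// A hedged sketch: explicit compaction scheduling on a SparkRDDWriteClient.
Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
if (compactionInstant.isPresent()) {
  // Execute the compaction plan that was just scheduled.
  client.compact(compactionInstant.get());
}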
Use of org.apache.hudi.common.model.HoodieFileFormat in project hudi by apache.
From the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testRepeatedRollbackOfCompaction.
@Test
public void testRepeatedRollbackOfCompaction() throws Exception {
  boolean scheduleInlineCompaction = false;
  HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET;
  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
  HoodieWriteConfig cfg = getConfigBuilder(false)
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(1024 * 1024 * 1024)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(2)
          .withPreserveCommitMetadata(true)
          .withScheduleInlineCompaction(scheduleInlineCompaction)
          .build())
      .build();
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
    Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true);
    assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
    /*
     * Write 2 (updates)
     */
    newCommitTime = "004";
    client.startCommitWithTime(newCommitTime);
    records = dataGen.generateUpdates(newCommitTime, 100);
    updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true);
    Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
    client.compact(compactionInstant.get());
    // trigger compaction again on the same instant, which rolls back the earlier attempt before retrying
    client.compact(compactionInstant.get());
    metaClient.reloadActiveTimeline();
    // record the rollback instant produced by the repeated compaction
    HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
    // delete the completed rollback file from the timeline (substring strips the URI scheme from the base path)
    FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath().substring(metaClient.getBasePath().indexOf(":") + 1), rollbackInstant.getTimestamp());
    metaClient.reloadActiveTimeline();
    SparkRDDWriteClient client1 = getHoodieWriteClient(cfg);
    // trigger compaction again with a fresh client
    client1.compact(compactionInstant.get());
    metaClient.reloadActiveTimeline();
    // verify that no new rollback instant was generated; the existing rollback was reused
    HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
    assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp());
  }
}
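Finally, a minimal sketch of the HoodieFileFormat enum usage that all of the examples above share. The extension value in the comment is an assumption based on the PARQUET constant, not verified against a specific Hudi version:

// A hedged sketch: round-tripping a config string through HoodieFileFormat,
// as HiveSyncTool does with cfg.baseFileFormat above.
HoodieFileFormat fmt = HoodieFileFormat.valueOf("parquet".toUpperCase()); // PARQUET
String extension = fmt.getFileExtension(); // expected ".parquet"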