Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestHoodieDeltaStreamerWithMultiWriter, method testLatestCheckpointCarryOverWithMultipleWriters.
@Disabled
@ParameterizedTest
@EnumSource(value = HoodieTableType.class, names = { "COPY_ON_WRITE" })
void testLatestCheckpointCarryOverWithMultipleWriters(HoodieTableType tableType) throws Exception {
  // NOTE : Overriding the LockProvider to InProcessLockProvider since Zookeeper locks work in unit test but fail on Jenkins with connection timeouts
  basePath = Paths.get(URI.create(basePath().replaceAll("/$", ""))).toString();
  propsFilePath = basePath + "/" + PROPS_FILENAME_TEST_MULTI_WRITER;
  tableBasePath = basePath + "/testtable_" + tableType;
  prepareInitialConfigs(fs(), basePath, "foo");
  TypedProperties props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
  props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
  props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
  UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
  // Keep it higher than batch-size to test continuous mode
  int totalRecords = 3000;
  HoodieDeltaStreamer.Config prepJobConfig = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT,
      propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
  prepJobConfig.continuousMode = true;
  prepJobConfig.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
  prepJobConfig.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
  HoodieDeltaStreamer prepJob = new HoodieDeltaStreamer(prepJobConfig, jsc());
  // Prepare base dataset with some commits
  deltaStreamerTestRunner(prepJob, prepJobConfig, (r) -> {
    if (tableType.equals(HoodieTableType.MERGE_ON_READ)) {
      TestHoodieDeltaStreamer.TestHelpers.assertAtleastNDeltaCommits(3, tableBasePath, fs());
      TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(1, tableBasePath, fs());
    } else {
      TestHoodieDeltaStreamer.TestHelpers.assertAtleastNCompactionCommits(3, tableBasePath, fs());
    }
    TestHoodieDeltaStreamer.TestHelpers.assertRecordCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
    TestHoodieDeltaStreamer.TestHelpers.assertDistanceCount(totalRecords, tableBasePath + "/*/*.parquet", sqlContext());
    return true;
  });
  // create a backfill job with checkpoint from the first instant
  HoodieDeltaStreamer.Config cfgBackfillJob = getDeltaStreamerConfig(tableBasePath, tableType.name(), WriteOperationType.UPSERT,
      propsFilePath, Collections.singletonList(TestHoodieDeltaStreamer.TripsWithDistanceTransformer.class.getName()));
  cfgBackfillJob.continuousMode = false;
  HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(hadoopConf()).setBasePath(tableBasePath).build();
  HoodieTimeline timeline = meta.getActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  HoodieCommitMetadata commitMetadataForFirstInstant = HoodieCommitMetadata.fromBytes(
      timeline.getInstantDetails(timeline.firstInstant().get()).get(), HoodieCommitMetadata.class);
  // run the backfill job
  props = prepareMultiWriterProps(fs(), basePath, propsFilePath);
  props.setProperty("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.InProcessLockProvider");
  props.setProperty(LockConfiguration.LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY, "3000");
  UtilitiesTestBase.Helpers.savePropsToDFS(props, fs(), propsFilePath);
  // get current checkpoint after preparing base dataset with some commits
  HoodieCommitMetadata commitMetadataForLastInstant = getLatestMetadata(meta);
  // Set checkpoint to the last successful position
  cfgBackfillJob.checkpoint = commitMetadataForLastInstant.getMetadata(CHECKPOINT_KEY);
  cfgBackfillJob.configs.add(String.format("%s=%d", SourceConfigs.MAX_UNIQUE_RECORDS_PROP, totalRecords));
  cfgBackfillJob.configs.add(String.format("%s=false", HoodieCompactionConfig.AUTO_CLEAN.key()));
  HoodieDeltaStreamer backfillJob = new HoodieDeltaStreamer(cfgBackfillJob, jsc());
  backfillJob.sync();
  meta.reloadActiveTimeline();
  int totalCommits = meta.getCommitsTimeline().filterCompletedInstants().countInstants();
  // add a new commit to timeline which may not have the checkpoint in extra metadata
  addCommitToTimeline(meta);
  meta.reloadActiveTimeline();
  verifyCommitMetadataCheckpoint(meta, null);
  cfgBackfillJob.checkpoint = null;
  // if deltastreamer checkpoint fetch does not walk back to older commits, this sync will fail
  new HoodieDeltaStreamer(cfgBackfillJob, jsc()).sync();
  meta.reloadActiveTimeline();
  Assertions.assertEquals(totalCommits + 2, meta.getCommitsTimeline().filterCompletedInstants().countInstants());
  verifyCommitMetadataCheckpoint(meta, "00008");
}
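The test relies on the delta streamer checkpoint being carried in a commit's extra metadata and read back with HoodieCommitMetadata.fromBytes plus getMetadata(CHECKPOINT_KEY). Below is a minimal sketch of that lookup pattern, mirroring what getLatestMetadata(meta) does above; the helper name readLatestCheckpoint is hypothetical, while CHECKPOINT_KEY is the same statically imported constant used in the test.

// Sketch only: read the checkpoint stored in the latest completed commit's extra metadata.
// The method name is illustrative; CHECKPOINT_KEY is the key the delta streamer writes its checkpoint under.
private static Option<String> readLatestCheckpoint(HoodieTableMetaClient metaClient) throws IOException {
  HoodieTimeline completed = metaClient.reloadActiveTimeline().getCommitsTimeline().filterCompletedInstants();
  if (!completed.lastInstant().isPresent()) {
    return Option.empty();
  }
  HoodieCommitMetadata latest = HoodieCommitMetadata.fromBytes(
      completed.getInstantDetails(completed.lastInstant().get()).get(), HoodieCommitMetadata.class);
  return Option.ofNullable(latest.getMetadata(CHECKPOINT_KEY));
}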
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class HoodieDeltaStreamerTestBase, method addCommitToTimeline.
static void addCommitToTimeline(HoodieTableMetaClient metaClient, WriteOperationType writeOperationType,
                                String commitActiontype, Map<String, String> extraMetadata) throws IOException {
  HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
  commitMetadata.setOperationType(writeOperationType);
  extraMetadata.forEach((k, v) -> commitMetadata.getExtraMetadata().put(k, v));
  String commitTime = HoodieActiveTimeline.createNewInstantTime();
  metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.REQUESTED, commitActiontype, commitTime));
  metaClient.getActiveTimeline().createNewInstant(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime));
  metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(HoodieInstant.State.INFLIGHT, commitActiontype, commitTime),
      Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
}
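The one-argument addCommitToTimeline(meta) call in the first snippet presumably delegates to this overload with default arguments. A hedged example of invoking it directly with an explicit checkpoint entry; the CHECKPOINT_KEY constant and the "00008" value are illustrative here, chosen to match the checkpoint the earlier test verifies.

// Sketch only: append a completed UPSERT commit whose extra metadata carries a checkpoint value.
Map<String, String> extraMetadata = new HashMap<>();
extraMetadata.put(CHECKPOINT_KEY, "00008");
addCommitToTimeline(metaClient, WriteOperationType.UPSERT, HoodieTimeline.COMMIT_ACTION, extraMetadata);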
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class BaseCommitActionExecutor, method saveWorkloadProfileMetadataToInflight.
/**
 * Save the workload profile in an intermediate file (here re-using commit files). This is useful when performing
 * rollback for MOR tables. Only updates are recorded in the workload profile metadata, since updates to log blocks
 * are unknown across batches; inserts (which are new parquet files) are rolled back based on commit time.
 * TODO: Create a new WorkloadProfile metadata file instead of using HoodieCommitMetadata.
 */
void saveWorkloadProfileMetadataToInflight(WorkloadProfile profile, String instantTime) throws HoodieCommitException {
  try {
    HoodieCommitMetadata metadata = new HoodieCommitMetadata();
    profile.getOutputPartitionPaths().forEach(path -> {
      WorkloadStat partitionStat = profile.getOutputWorkloadStat(path);
      HoodieWriteStat insertStat = new HoodieWriteStat();
      insertStat.setNumInserts(partitionStat.getNumInserts());
      insertStat.setFileId("");
      insertStat.setPrevCommit(HoodieWriteStat.NULL_COMMIT);
      metadata.addWriteStat(path, insertStat);
      Map<String, Pair<String, Long>> updateLocationMap = partitionStat.getUpdateLocationToCount();
      Map<String, Pair<String, Long>> insertLocationMap = partitionStat.getInsertLocationToCount();
      Stream.concat(updateLocationMap.keySet().stream(), insertLocationMap.keySet().stream())
          .distinct()
          .forEach(fileId -> {
            HoodieWriteStat writeStat = new HoodieWriteStat();
            writeStat.setFileId(fileId);
            Pair<String, Long> updateLocation = updateLocationMap.get(fileId);
            Pair<String, Long> insertLocation = insertLocationMap.get(fileId);
            // TODO : Write baseCommitTime is possible here ?
            writeStat.setPrevCommit(updateLocation != null ? updateLocation.getKey() : insertLocation.getKey());
            if (updateLocation != null) {
              writeStat.setNumUpdateWrites(updateLocation.getValue());
            }
            if (insertLocation != null) {
              writeStat.setNumInserts(insertLocation.getValue());
            }
            metadata.addWriteStat(path, writeStat);
          });
    });
    metadata.setOperationType(operationType);
    HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
    String commitActionType = getCommitActionType();
    HoodieInstant requested = new HoodieInstant(State.REQUESTED, commitActionType, instantTime);
    activeTimeline.transitionRequestedToInflight(requested,
        Option.of(metadata.toJsonString().getBytes(StandardCharsets.UTF_8)),
        config.shouldAllowMultiWriteOnSameInstant());
  } catch (IOException io) {
    throw new HoodieCommitException("Failed to commit " + instantTime + " unable to save inflight metadata ", io);
  }
}
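The inflight file written here is ordinary JSON-serialized HoodieCommitMetadata, so it can be read back the same way as a completed commit. The following is a small sketch of that round trip under the same assumptions as the method above (table, instantTime and getCommitActionType() in scope); the variable names are illustrative.

// Sketch only: read the workload-profile metadata back from the inflight instant.
try {
  HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
  HoodieInstant inflight = new HoodieInstant(State.INFLIGHT, getCommitActionType(), instantTime);
  HoodieCommitMetadata profileMetadata = HoodieCommitMetadata.fromBytes(
      activeTimeline.getInstantDetails(inflight).get(), HoodieCommitMetadata.class);
  // Each partition carries one HoodieWriteStat per touched fileId, plus the insert-only stat with an empty fileId.
  long recordedUpdates = profileMetadata.getPartitionToWriteStats().values().stream()
      .flatMap(List::stream)
      .mapToLong(HoodieWriteStat::getNumUpdateWrites)
      .sum();
  // recordedUpdates now holds the total number of updates captured in the workload profile.
} catch (IOException io) {
  throw new HoodieCommitException("Failed to read inflight metadata for " + instantTime, io);
}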
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class RunCompactionActionExecutor, method execute.
@Override
public HoodieWriteMetadata<HoodieData<WriteStatus>> execute() {
  HoodieTimeline pendingCompactionTimeline = table.getActiveTimeline().filterPendingCompactionTimeline();
  compactor.preCompact(table, pendingCompactionTimeline, instantTime);
  HoodieWriteMetadata<HoodieData<WriteStatus>> compactionMetadata = new HoodieWriteMetadata<>();
  try {
    // generate compaction plan
    // should support configurable commit metadata
    HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(table.getMetaClient(), instantTime);
    HoodieData<WriteStatus> statuses = compactor.compact(context, compactionPlan, table, config, instantTime, compactionHandler);
    compactor.maybePersist(statuses, config);
    context.setJobStatus(this.getClass().getSimpleName(), "Preparing compaction metadata");
    List<HoodieWriteStat> updateStatusMap = statuses.map(WriteStatus::getStat).collectAsList();
    HoodieCommitMetadata metadata = new HoodieCommitMetadata(true);
    for (HoodieWriteStat stat : updateStatusMap) {
      metadata.addWriteStat(stat.getPartitionPath(), stat);
    }
    metadata.addMetadata(HoodieCommitMetadata.SCHEMA_KEY, config.getSchema());
    compactionMetadata.setWriteStatuses(statuses);
    compactionMetadata.setCommitted(false);
    compactionMetadata.setCommitMetadata(Option.of(metadata));
  } catch (IOException e) {
    throw new HoodieCompactionException("Could not compact " + config.getBasePath(), e);
  }
  return compactionMetadata;
}
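Because the writer schema is stashed under HoodieCommitMetadata.SCHEMA_KEY, it can be read back from the commit timeline once the compaction instant completes. A hedged sketch of that retrieval, assuming the instant has already been committed and that the caller handles IOException from fromBytes:

// Sketch only: fetch the writer schema recorded in the completed compaction commit metadata.
HoodieTimeline completedCommits = table.getMetaClient().getCommitTimeline().filterCompletedInstants();
HoodieInstant compactionCommit = new HoodieInstant(false, HoodieTimeline.COMMIT_ACTION, instantTime);
HoodieCommitMetadata committedMetadata = HoodieCommitMetadata.fromBytes(
    completedCommits.getInstantDetails(compactionCommit).get(), HoodieCommitMetadata.class);
String writerSchema = committedMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY);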
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testCommitWritesRelativePaths.
/**
* Test to ensure commit metadata points to valid files.
*/
@ParameterizedTest
@MethodSource("populateMetaFieldsParams")
public void testCommitWritesRelativePaths(boolean populateMetaFields) throws Exception {
  HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder().withAutoCommit(false);
  addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
  try (SparkRDDWriteClient client = getHoodieWriteClient(cfgBuilder.build())) {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
    HoodieSparkTable table = HoodieSparkTable.create(cfgBuilder.build(), context, metaClient);
    String instantTime = "000";
    client.startCommitWithTime(instantTime);
    List<HoodieRecord> records = dataGen.generateInserts(instantTime, 200);
    JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
    JavaRDD<WriteStatus> result = client.bulkInsert(writeRecords, instantTime);
    assertTrue(client.commit(instantTime, result), "Commit should succeed");
    assertTrue(testTable.commitExists(instantTime), "After explicit commit, commit file should be created");
    // Get base file paths from commit metadata
    String actionType = metaClient.getCommitActionType();
    HoodieInstant commitInstant = new HoodieInstant(false, actionType, instantTime);
    HoodieTimeline commitTimeline = metaClient.getCommitTimeline().filterCompletedInstants();
    HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(
        commitTimeline.getInstantDetails(commitInstant).get(), HoodieCommitMetadata.class);
    String basePath = table.getMetaClient().getBasePath();
    Collection<String> commitPathNames = commitMetadata.getFileIdAndFullPaths(basePath).values();
    // Read from commit file
    try (FSDataInputStream inputStream = fs.open(testTable.getCommitFilePath(instantTime))) {
      String everything = FileIOUtils.readAsUTFString(inputStream);
      HoodieCommitMetadata metadata = HoodieCommitMetadata.fromJsonString(everything, HoodieCommitMetadata.class);
      HashMap<String, String> paths = metadata.getFileIdAndFullPaths(basePath);
      // Compare values in both to make sure they are equal.
      for (String pathName : paths.values()) {
        assertTrue(commitPathNames.contains(pathName));
      }
    }
  }
}
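getFileIdAndFullPaths resolves the relative paths stored in the commit metadata against the table base path, so each returned value should point at a real base file. A short hedged check that could sit on top of the test above, reusing commitMetadata, basePath and fs as already defined in the method (Path here is org.apache.hadoop.fs.Path):

// Sketch only: verify every file referenced by the commit metadata exists on the file system.
for (String fullPath : commitMetadata.getFileIdAndFullPaths(basePath).values()) {
  assertTrue(fs.exists(new Path(fullPath)), "Commit metadata should only reference existing files: " + fullPath);
}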