Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.
The class SavepointActionExecutor, method execute.
@Override
public HoodieSavepointMetadata execute() {
  Option<HoodieInstant> cleanInstant = table.getCompletedCleanTimeline().lastInstant();
  if (!table.getCompletedCommitsTimeline().containsInstant(instantTime)) {
    throw new HoodieSavepointException("Could not savepoint non-existing commit " + instantTime);
  }
  try {
    // Find the last commit that was not cleaned and verify the savepoint time is >= that commit
    String lastCommitRetained;
    if (cleanInstant.isPresent()) {
      HoodieCleanMetadata cleanMetadata = TimelineMetadataUtils.deserializeHoodieCleanMetadata(
          table.getActiveTimeline().getInstantDetails(cleanInstant.get()).get());
      lastCommitRetained = cleanMetadata.getEarliestCommitToRetain();
    } else {
      lastCommitRetained = table.getCompletedCommitsTimeline().firstInstant().get().getTimestamp();
    }
    // Cannot allow savepoint time on a commit that could have been cleaned
    ValidationUtils.checkArgument(
        HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.GREATER_THAN_OR_EQUALS, lastCommitRetained),
        "Could not savepoint commit " + instantTime + " as this is beyond the lookup window " + lastCommitRetained);
    context.setJobStatus(this.getClass().getSimpleName(), "Collecting latest files for savepoint " + instantTime);
    List<String> partitions = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), table.getMetaClient().getBasePath());
    Map<String, List<String>> latestFilesMap = context.mapToPair(partitions, partitionPath -> {
      // Scan all files in this partition with this commit time
      LOG.info("Collecting latest files in partition path " + partitionPath);
      TableFileSystemView.BaseFileOnlyView view = table.getBaseFileOnlyView();
      List<String> latestFiles = view.getLatestBaseFilesBeforeOrOn(partitionPath, instantTime)
          .map(HoodieBaseFile::getFileName)
          .collect(Collectors.toList());
      return new ImmutablePair<>(partitionPath, latestFiles);
    }, null);
    HoodieSavepointMetadata metadata = TimelineMetadataUtils.convertSavepointMetadata(user, comment, latestFilesMap);
    // Nothing to save in the savepoint
    table.getActiveTimeline().createNewInstant(new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, instantTime));
    table.getActiveTimeline().saveAsComplete(
        new HoodieInstant(true, HoodieTimeline.SAVEPOINT_ACTION, instantTime),
        TimelineMetadataUtils.serializeSavepointMetadata(metadata));
    LOG.info("Savepoint " + instantTime + " created");
    return metadata;
  } catch (IOException e) {
    throw new HoodieSavepointException("Failed to savepoint " + instantTime, e);
  }
}
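This executor is not normally invoked directly. Below is a minimal sketch of how a savepoint is typically triggered through the Spark write client; the savepoint(user, comment) API is assumed from the base write client and should be checked against your Hudi version.

import org.apache.hudi.client.SparkRDDWriteClient;
import org.apache.hudi.client.common.HoodieSparkEngineContext;
import org.apache.hudi.config.HoodieWriteConfig;

class SavepointExample {
  // Hypothetical helper: the caller supplies an engine context and a write config for the table.
  static void savepointLatestCommit(HoodieSparkEngineContext context, HoodieWriteConfig config) {
    try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, config)) {
      // Savepoints the latest completed commit; internally this resolves the instant time
      // and runs the SavepointActionExecutor.execute() flow shown above.
      client.savepoint("admin", "savepoint before cleaning");
    }
  }
}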
Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.
The class IncrementalTimelineSyncFileSystemView, method addCleanInstant.
/**
 * Add newly found clean instant. Note that cleaner metadata (.clean.completed)
 * contains only relative paths, unlike clean plans (.clean.requested) which contain absolute paths.
 *
 * @param timeline Timeline
 * @param instant Clean instant
 */
private void addCleanInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
  LOG.info("Syncing cleaner instant (" + instant + ")");
  HoodieCleanMetadata cleanMetadata = CleanerUtils.getCleanerMetadata(metaClient, instant);
  cleanMetadata.getPartitionMetadata().entrySet().stream().forEach(entry -> {
    final String basePath = metaClient.getBasePath();
    final String partitionPath = entry.getValue().getPartitionPath();
    List<String> fullPathList = entry.getValue().getSuccessDeleteFiles().stream()
        .map(fileName -> new Path(FSUtils.getPartitionPath(basePath, partitionPath), fileName).toString())
        .collect(Collectors.toList());
    removeFileSlicesForPartition(timeline, instant, entry.getKey(), fullPathList);
  });
  LOG.info("Done syncing cleaner instant (" + instant + ")");
}
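Because the clean metadata stores only file names relative to each partition, the method above rebuilds full paths with FSUtils.getPartitionPath. A small illustrative sketch of that composition follows; all values are hypothetical and the FSUtils import path is assumed for recent Hudi versions.

import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.fs.FSUtils;

class CleanPathExample {
  public static void main(String[] args) {
    String basePath = "/tmp/hudi_table";      // hypothetical table base path
    String partitionPath = "2015/03/16";      // hypothetical partition
    String fileName = "f1_0-0-0_001.parquet"; // hypothetical file name from getSuccessDeleteFiles()

    // Mirrors the fullPathList construction above: base path + partition path + file name
    Path fullPath = new Path(FSUtils.getPartitionPath(basePath, partitionPath), fileName);
    System.out.println(fullPath); // /tmp/hudi_table/2015/03/16/f1_0-0-0_001.parquet
  }
}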
Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.
The class CleanMetadataV2MigrationHandler, method upgradeFrom.
@Override
public HoodieCleanMetadata upgradeFrom(HoodieCleanMetadata input) {
  ValidationUtils.checkArgument(input.getVersion() == 1, "Input version is " + input.getVersion() + ". Must be 1");
  HoodieCleanMetadata metadata = new HoodieCleanMetadata();
  metadata.setEarliestCommitToRetain(input.getEarliestCommitToRetain());
  metadata.setTimeTakenInMillis(input.getTimeTakenInMillis());
  metadata.setStartCleanTime(input.getStartCleanTime());
  metadata.setTotalFilesDeleted(input.getTotalFilesDeleted());
  metadata.setVersion(getManagedVersion());
  Map<String, HoodieCleanPartitionMetadata> partitionMetadataMap = input.getPartitionMetadata().entrySet().stream().map(entry -> {
    final String partitionPath = entry.getKey();
    final HoodieCleanPartitionMetadata partitionMetadata = entry.getValue();
    final List<String> deletePathPatterns = convertToV2Path(partitionMetadata.getDeletePathPatterns());
    final List<String> successDeleteFiles = convertToV2Path(partitionMetadata.getSuccessDeleteFiles());
    final List<String> failedDeleteFiles = convertToV2Path(partitionMetadata.getFailedDeleteFiles());
    final HoodieCleanPartitionMetadata cleanPartitionMetadata = HoodieCleanPartitionMetadata.newBuilder()
        .setPolicy(partitionMetadata.getPolicy())
        .setPartitionPath(partitionMetadata.getPartitionPath())
        .setDeletePathPatterns(deletePathPatterns)
        .setSuccessDeleteFiles(successDeleteFiles)
        .setFailedDeleteFiles(failedDeleteFiles)
        .build();
    return Pair.of(partitionPath, cleanPartitionMetadata);
  }).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  return HoodieCleanMetadata.newBuilder()
      .setEarliestCommitToRetain(input.getEarliestCommitToRetain())
      .setStartCleanTime(input.getStartCleanTime())
      .setTimeTakenInMillis(input.getTimeTakenInMillis())
      .setTotalFilesDeleted(input.getTotalFilesDeleted())
      .setPartitionMetadata(partitionMetadataMap)
      .setVersion(getManagedVersion())
      .build();
}
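The convertToV2Path helper is not part of this excerpt. The sketch below shows what it plausibly does: V1 clean metadata recorded absolute file paths, while V2 keeps only the file name relative to its partition, so the conversion drops the parent directories. This is an assumption, not a copy of the Hudi source.

import java.util.List;
import java.util.stream.Collectors;
import org.apache.hadoop.fs.Path;

class CleanPathMigration {
  // Sketch (assumed behavior): strip each absolute path down to its file name
  // so that V2 metadata stores partition-relative entries.
  static List<String> convertToV2Path(List<String> absolutePaths) {
    return absolutePaths.stream()
        .map(path -> new Path(path).getName()) // keep only the last path component
        .collect(Collectors.toList());
  }
}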
Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.
The class TestHoodieClientOnCopyOnWriteStorage, method testParallelInsertAndCleanPreviousFailedCommits.
@Test
public void testParallelInsertAndCleanPreviousFailedCommits() throws Exception {
  HoodieFailedWritesCleaningPolicy cleaningPolicy = HoodieFailedWritesCleaningPolicy.LAZY;
  ExecutorService service = Executors.newFixedThreadPool(2);
  HoodieTestUtils.init(hadoopConf, basePath);
  SparkRDDWriteClient client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
  // Perform 1 successful write
  writeBatch(client, "100", "100", Option.of(Arrays.asList("100")), "100", 100, dataGen::generateInserts,
      SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true);
  // Perform 2 failed writes to the table
  writeBatch(client, "200", "100", Option.of(Arrays.asList("200")), "200", 100, dataGen::generateInserts,
      SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false);
  client.close();
  client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
  writeBatch(client, "300", "200", Option.of(Arrays.asList("300")), "300", 100, dataGen::generateInserts,
      SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, false);
  client.close();
  // Refresh the data generator to delete records generated from the failed commits
  dataGen = new HoodieTestDataGenerator();
  // Create a successful commit
  Future<JavaRDD<WriteStatus>> commit3 = service.submit(() -> writeBatch(
      new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)),
      "400", "300", Option.of(Arrays.asList("400")), "300", 100, dataGen::generateInserts,
      SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
  commit3.get();
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(basePath).build();
  assertTrue(metaClient.getActiveTimeline().getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 0);
  assertTrue(metaClient.getActiveTimeline().filterInflights().countInstants() == 2);
  assertTrue(metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().countInstants() == 2);
  client = new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true));
  // Wait until enough time has passed that the heartbeats of the first 2 failed commits have expired
  boolean conditionMet = false;
  while (!conditionMet) {
    conditionMet = client.getHeartbeatClient().isHeartbeatExpired("300");
    Thread.sleep(2000);
  }
  Future<JavaRDD<WriteStatus>> commit4 = service.submit(() -> writeBatch(
      new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)),
      "500", "400", Option.of(Arrays.asList("500")), "500", 100, dataGen::generateInserts,
      SparkRDDWriteClient::bulkInsert, false, 100, 100, 0, true));
  Future<HoodieCleanMetadata> clean1 = service.submit(() ->
      new SparkRDDWriteClient(context, getParallelWritingWriteConfig(cleaningPolicy, true)).clean());
  commit4.get();
  clean1.get();
  HoodieActiveTimeline timeline = metaClient.getActiveTimeline().reload();
  assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(ROLLBACK_ACTION)).countInstants() == 2);
  // Since the cleaner only rolls back the failed commits and has nothing to clean,
  // there should be no clean action on the timeline
  assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(CLEAN_ACTION)).countInstants() == 0);
  assertTrue(timeline.getCommitsTimeline().filterCompletedInstants().countInstants() == 3);
}
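This test depends on the LAZY failed-writes cleaning policy so that the two failed writes stay on the timeline until their heartbeats expire and are then rolled back by the cleaner. The getParallelWritingWriteConfig helper is not shown here; below is a rough, hedged sketch of only the cleaning-policy portion of such a config, with the concurrency-control and lock-provider settings a multi-writer test would also need omitted.

import org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy;
import org.apache.hudi.config.HoodieCompactionConfig;
import org.apache.hudi.config.HoodieWriteConfig;

class LazyCleaningConfigSketch {
  // Hypothetical helper mirroring only the cleaning-policy part of the test's config.
  static HoodieWriteConfig lazyCleaningConfig(String basePath) {
    return HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY)
            .withAutoClean(false) // the test triggers clean() explicitly
            .build())
        .build();
  }
}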
Use of org.apache.hudi.avro.model.HoodieCleanMetadata in project hudi by apache.
The class TestCleaner, method testMultiClean.
/**
 * Tests that no more than 1 clean is scheduled/executed if the HoodieCompactionConfig.allowMultipleCleanSchedule
 * config is disabled.
 */
@Test
public void testMultiClean() {
  HoodieWriteConfig writeConfig = getConfigBuilder()
      .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
          .withEnableBackupForRemoteFileSystemView(false).build())
      .withCompactionConfig(HoodieCompactionConfig.newBuilder()
          .compactionSmallFileSize(1024 * 1024 * 1024)
          .withInlineCompaction(false)
          .withMaxNumDeltaCommitsBeforeCompaction(1)
          .withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.EAGER)
          .allowMultipleCleans(false)
          .withAutoClean(false)
          .retainCommits(1)
          .retainFileVersions(1)
          .build())
      .withEmbeddedTimelineServerEnabled(false)
      .build();
  int index = 0;
  String cleanInstantTime;
  final String partition = "2015/03/16";
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) {
    // Three writes so we can initiate a clean
    for (; index < 3; ++index) {
      String newCommitTime = "00" + index;
      List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition);
      client.startCommitWithTime(newCommitTime);
      client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    }
  }
  // Mimic a failed/leftover clean by scheduling a clean but not performing it
  cleanInstantTime = "00" + index++;
  HoodieTable table = HoodieSparkTable.create(writeConfig, context);
  Option<HoodieCleanerPlan> cleanPlan = table.scheduleCleaning(context, cleanInstantTime, Option.empty());
  assertEquals(cleanPlan.get().getFilePathsToBeDeletedPerPartition().get(partition).size(), 1);
  assertEquals(metaClient.reloadActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().countInstants(), 1);
  try (SparkRDDWriteClient client = new SparkRDDWriteClient(context, writeConfig)) {
    // Next commit. This is required so that there is an additional file version to clean.
    String newCommitTime = "00" + index++;
    List<HoodieRecord> records = dataGen.generateInsertsForPartition(newCommitTime, 1, partition);
    client.startCommitWithTime(newCommitTime);
    client.insert(jsc.parallelize(records, 1), newCommitTime).collect();
    // Initiate another clean. The previous leftover clean will be attempted first, followed by another clean
    // due to the commit above.
    String newCleanInstantTime = "00" + index++;
    HoodieCleanMetadata cleanMetadata = client.clean(newCleanInstantTime);
    // A second clean is not scheduled while the leftover clean is still pending, since allowMultipleCleans is false
    assertNull(cleanMetadata);
    // Let the old clean complete
    table = HoodieSparkTable.create(writeConfig, context);
    cleanMetadata = table.clean(context, cleanInstantTime, false);
    assertNotNull(cleanMetadata);
    // Once the leftover clean is complete, any new clean should go ahead
    cleanMetadata = client.clean(newCleanInstantTime);
    assertNotNull(cleanMetadata);
    // 1 file cleaned
    assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getSuccessDeleteFiles().size(), 1);
    assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getFailedDeleteFiles().size(), 0);
    assertEquals(cleanMetadata.getPartitionMetadata().get(partition).getDeletePathPatterns().size(), 1);
  }
}
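The HoodieCleanMetadata returned by clean() summarizes what was removed per partition. As a small illustrative helper (not part of the test, using only getters that appear in the excerpts above), the totals can be aggregated like this:

import java.util.Map;
import org.apache.hudi.avro.model.HoodieCleanMetadata;
import org.apache.hudi.avro.model.HoodieCleanPartitionMetadata;

class CleanMetadataSummary {
  // Totals the successfully deleted files across all partitions of a clean.
  static long totalFilesDeleted(HoodieCleanMetadata metadata) {
    long total = 0;
    for (Map.Entry<String, HoodieCleanPartitionMetadata> entry : metadata.getPartitionMetadata().entrySet()) {
      total += entry.getValue().getSuccessDeleteFiles().size();
    }
    return total;
  }
}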