Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.
The class TestHoodieSparkMergeOnReadTableRollback, method testRollbackWithDeltaAndCompactionCommit.
@ParameterizedTest
@ValueSource(booleans = { true, false })
void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) throws Exception {
  // NOTE: First writer will have Metadata table DISABLED
  HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.SIMPLE);
  addConfigsForPopulateMetaFields(cfgBuilder, true);
  HoodieWriteConfig cfg = cfgBuilder.build();

  Properties properties = new Properties();
  properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
  HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);

  try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();

    // Test delta commit rollback
    /*
     * Write 1 (only inserts)
     */
    String newCommitTime = "001";
    client.startCommitWithTime(newCommitTime);

    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);

    JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
    List<WriteStatus> statuses = writeStatusJavaRDD.collect();
    assertNoWriteErrors(statuses);
    client.commit(newCommitTime, jsc().parallelize(statuses));

    HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);

    Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
    assertTrue(deltaCommit.isPresent());
    assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");

    Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
    assertFalse(commit.isPresent());

    FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
    HoodieTableFileSystemView tableView =
        getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
    Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
    assertFalse(dataFilesToRead.findAny().isPresent());

    tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
    dataFilesToRead = tableView.getLatestBaseFiles();
    assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");

    /*
     * Write 2 (inserts + updates - testing failed delta commit)
     */
    final String commitTime1 = "002";
    // NOTE: Second writer will have Metadata table ENABLED
    try (SparkRDDWriteClient secondClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true))) {
      secondClient.startCommitWithTime(commitTime1);

      List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
      copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords);
      copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200));

      List<String> inputPaths = tableView.getLatestBaseFiles()
          .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
          .collect(Collectors.toList());
      List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
      assertEquals(200, recordsRead.size());

      statuses = secondClient.upsert(jsc().parallelize(copyOfRecords, 1), commitTime1).collect();
      // Verify there are no errors
      assertNoWriteErrors(statuses);

      // Test failed delta commit rollback
      secondClient.rollback(commitTime1);
      allFiles = listAllBaseFilesInPath(hoodieTable);
      // After rollback, there should be no base file with the failed commit time
      List<String> remainingFiles = Arrays.stream(allFiles)
          .filter(file -> file.getPath().getName().contains(commitTime1))
          .map(fileStatus -> fileStatus.getPath().toString())
          .collect(Collectors.toList());
      assertEquals(0, remainingFiles.size(), "These files should have been rolled back when rolling back commit "
          + commitTime1 + " but still remain. Files: " + remainingFiles);

      inputPaths = tableView.getLatestBaseFiles()
          .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
          .collect(Collectors.toList());
      recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
      assertEquals(200, recordsRead.size());
    }

    /*
     * Write 3 (inserts + updates - testing successful delta commit)
     */
    final String commitTime2 = "003";
    try (SparkRDDWriteClient thirdClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true))) {
      thirdClient.startCommitWithTime(commitTime2);

      List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
      copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords);
      copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200));

      List<String> inputPaths = tableView.getLatestBaseFiles()
          .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
          .collect(Collectors.toList());
      List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
      assertEquals(200, recordsRead.size());

      writeRecords = jsc().parallelize(copyOfRecords, 1);
      writeStatusJavaRDD = thirdClient.upsert(writeRecords, commitTime2);
      statuses = writeStatusJavaRDD.collect();
      // Verify there are no errors
      assertNoWriteErrors(statuses);

      // Test successful delta commit rollback
      thirdClient.rollback(commitTime2);
      allFiles = listAllBaseFilesInPath(hoodieTable);
      // After rollback, there should be no base file with the rolled-back commit time
      assertEquals(0, Arrays.stream(allFiles).filter(file -> file.getPath().getName().contains(commitTime2)).count());

      metaClient = HoodieTableMetaClient.reload(metaClient);
      hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
      tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);

      inputPaths = tableView.getLatestBaseFiles()
          .map(baseFile -> new Path(baseFile.getPath()).getParent().toString())
          .collect(Collectors.toList());
      recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
      // Check that the number of records read is still correct after the rollback operation
      assertEquals(200, recordsRead.size());

      // Test compaction commit rollback
      /*
       * Write 4 (updates)
       */
      newCommitTime = "004";
      thirdClient.startCommitWithTime(newCommitTime);

      writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime);
      statuses = writeStatusJavaRDD.collect();
      // Verify there are no errors
      assertNoWriteErrors(statuses);
      thirdClient.commit(newCommitTime, jsc().parallelize(statuses));

      metaClient = HoodieTableMetaClient.reload(metaClient);
      String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString();
      thirdClient.compact(compactionInstantTime);
      metaClient = HoodieTableMetaClient.reload(metaClient);

      final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp();
      assertTrue(Arrays.stream(listAllBaseFilesInPath(hoodieTable))
          .anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime())));

      hoodieTable.rollbackInflightCompaction(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime));

      allFiles = listAllBaseFilesInPath(hoodieTable);
      metaClient = HoodieTableMetaClient.reload(metaClient);
      tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);

      assertFalse(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime())));
      assertAll(tableView.getLatestBaseFiles().map(file -> () -> assertNotEquals(compactedCommitTime, file.getCommitTime())));
    }
  }
}
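The assertions above repeatedly rebuild the view after each write or rollback. Stripped of the test-harness helpers (getHoodieTableFileSystemView and listAllBaseFilesInPath are test utilities), the core pattern reduces to a small sketch like the following; it assumes a HoodieTableMetaClient named metaClient and a FileStatus[] listing named allFiles are already available, and the variable names are illustrative only.

// Sketch only: build a file-system view over completed commits and inspect the latest base files.
HoodieTimeline completedCommits = metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
HoodieTableFileSystemView view = new HoodieTableFileSystemView(metaClient, completedCommits, allFiles);
List<String> latestBaseFilePaths = view.getLatestBaseFiles()
    .map(HoodieBaseFile::getPath)
    .collect(Collectors.toList());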
Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.
The class BaseHoodieTableFileIndex, method doRefresh.
private void doRefresh() {
  long startTime = System.currentTimeMillis();

  Map<PartitionPath, FileStatus[]> partitionFiles = loadPartitionPathFiles();
  FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);

  metaClient.reloadActiveTimeline();
  HoodieTimeline activeTimeline = getActiveTimeline();
  Option<HoodieInstant> latestInstant = activeTimeline.lastInstant();

  // TODO we can optimize the flow by:
  //  - First fetch list of files from instants of interest
  //  - Load FileStatus's
  fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles);

  Option<String> queryInstant = specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp));
  validate(activeTimeline, queryInstant);

  if (tableType.equals(HoodieTableType.MERGE_ON_READ) && queryType.equals(HoodieTableQueryType.SNAPSHOT)) {
    cachedAllInputFileSlices = partitionFiles.keySet().stream()
        .collect(Collectors.toMap(
            Function.identity(),
            partitionPath -> queryInstant
                .map(instant -> fileSystemView.getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get())
                    .collect(Collectors.toList()))
                .orElse(Collections.emptyList())));
  } else {
    cachedAllInputFileSlices = partitionFiles.keySet().stream()
        .collect(Collectors.toMap(
            Function.identity(),
            partitionPath -> queryInstant
                .map(instant -> fileSystemView.getLatestFileSlicesBeforeOrOn(partitionPath.path, instant, true))
                .orElse(fileSystemView.getLatestFileSlices(partitionPath.path))
                .collect(Collectors.toList())));
  }

  cachedFileSize = cachedAllInputFileSlices.values().stream().flatMap(Collection::stream).mapToLong(BaseHoodieTableFileIndex::fileSliceSize).sum();

  // If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
  queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0);

  long duration = System.currentTimeMillis() - startTime;
  LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration));
}
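The two branches above differ only in which slice-level query they run against the freshly built view. As a rough side-by-side (a sketch, assuming fileSystemView, a String partitionPath and a String queryInstant are in scope; the trailing boolean on the second call is the same flag passed as true in the method above):

// MERGE_ON_READ snapshot queries: log files are merged into the returned slices.
List<FileSlice> mergedSlices = fileSystemView
    .getLatestMergedFileSlicesBeforeOrOn(partitionPath, queryInstant)
    .collect(Collectors.toList());

// All other cases: the latest (unmerged) file slices up to the query instant.
List<FileSlice> latestSlices = fileSystemView
    .getLatestFileSlicesBeforeOrOn(partitionPath, queryInstant, true)
    .collect(Collectors.toList());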
Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.
The class FileSystemViewCommand, method showAllFileSlices.
@CliCommand(value = "show fsview all", help = "Show entire file-system view")
public String showAllFileSlices(
    @CliOption(key = { "pathRegex" }, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex,
    @CliOption(key = { "baseFileOnly" }, help = "Only display base files view", unspecifiedDefaultValue = "false") boolean baseFileOnly,
    @CliOption(key = { "maxInstant" }, help = "File-Slices upto this instant are displayed", unspecifiedDefaultValue = "") String maxInstant,
    @CliOption(key = { "includeMax" }, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
    @CliOption(key = { "includeInflight" }, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") boolean includeInflight,
    @CliOption(key = { "excludeCompaction" }, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") boolean excludeCompaction,
    @CliOption(key = { "limit" }, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
    @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
    @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
    @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
  HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, baseFileOnly, includeMaxInstant, includeInflight, excludeCompaction);

  List<Comparable[]> rows = new ArrayList<>();
  fsView.getAllFileGroups().forEach(fg -> fg.getAllFileSlices().forEach(fs -> {
    int idx = 0;
    // For base file only Views, do not display any delta-file related columns
    Comparable[] row = new Comparable[baseFileOnly ? 5 : 8];
    row[idx++] = fg.getPartitionPath();
    row[idx++] = fg.getFileGroupId().getFileId();
    row[idx++] = fs.getBaseInstantTime();
    row[idx++] = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getPath() : "";
    row[idx++] = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getFileSize() : -1;
    if (!baseFileOnly) {
      row[idx++] = fs.getLogFiles().count();
      row[idx++] = fs.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
      row[idx++] = fs.getLogFiles().collect(Collectors.toList()).toString();
    }
    rows.add(row);
  }));

  Function<Object, String> converterFunction =
      entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())));
  Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
  fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_FILE_SIZE, converterFunction);
  fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE, converterFunction);

  TableHeader header = new TableHeader()
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE)
      .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE);
  if (!baseFileOnly) {
    header = header
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELTA_FILES)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_FILE_SIZE)
        .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES);
  }
  return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
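Going only by the option names and defaults declared above, a hypothetical invocation from the Hudi CLI would look like: show fsview all --pathRegex 2016/08/02 --baseFileOnly true --limit 10. With baseFileOnly set, the printed table contains just the five base-file columns; otherwise the three delta-file columns (log file count, total log size, log file list) are appended, and the two size columns are rendered in human-readable form by the converters registered above.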
Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.
The class ITTestRepairsCommand, method testDeduplicateWithInserts.
/**
 * Test case for dry run deduplicate.
 */
@Test
public void testDeduplicateWithInserts() throws IOException {
  // get fs and check number of latest files
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
      metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
      fs.listStatus(new Path(duplicatedPartitionPath)));
  List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
  assertEquals(3, filteredStatuses.size(), "There should be 3 files.");

  // Before deduplicate, all files contain 210 records
  String[] files = filteredStatuses.toArray(new String[0]);
  Dataset df = readFiles(files);
  assertEquals(210, df.count());

  String partitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH;
  String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s",
      partitionPath, repairedOutputPath, "local");
  CommandResult cr = getShell().executeCommand(cmdStr);
  assertTrue(cr.isSuccess());
  assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString());

  // After deduplicate, there are 200 records
  FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
  files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
  Dataset result = readFiles(files);
  assertEquals(200, result.count());
}
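Note that the view only knows about the FileStatus[] it is handed; listing a single partition directory, as this test does, is what limits getLatestBaseFiles() to the three files of the duplicated partition. A minimal sketch of the same scoping, assuming a FileSystem fs, a meta client, and a partition directory string partitionDir (all hypothetical names):

// Build a view over just one partition's files and count its latest base files.
FileStatus[] partitionFiles = fs.listStatus(new Path(partitionDir));
HoodieTableFileSystemView partitionView = new HoodieTableFileSystemView(metaClient,
    metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(), partitionFiles);
long latestBaseFileCount = partitionView.getLatestBaseFiles().count();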
Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.
The class HoodieTableSource, method buildFileIndex.
private List<MergeOnReadInputSplit> buildFileIndex() {
  Set<String> requiredPartitionPaths = getRequiredPartitionPaths();
  fileIndex.setPartitionPaths(requiredPartitionPaths);
  List<String> relPartitionPaths = fileIndex.getOrBuildPartitionPaths();
  if (relPartitionPaths.size() == 0) {
    return Collections.emptyList();
  }
  FileStatus[] fileStatuses = fileIndex.getFilesInPartitions();
  if (fileStatuses.length == 0) {
    throw new HoodieException("No files found for reading in user provided path.");
  }

  // file-slice after pending compaction-requested instant-time is also considered valid
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
      metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(), fileStatuses);
  String latestCommit = fsView.getLastInstant().get().getTimestamp();
  final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
  final AtomicInteger cnt = new AtomicInteger(0);
  // generates one input split for each file group
  return relPartitionPaths.stream()
      .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit)
          .map(fileSlice -> {
            String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
            Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                .sorted(HoodieLogFile.getLogFileComparator())
                .map(logFile -> logFile.getPath().toString())
                .collect(Collectors.toList()));
            return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit,
                metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, null);
          })
          .collect(Collectors.toList()))
      .flatMap(Collection::stream)
      .collect(Collectors.toList());
}
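Two details worth calling out: the view is built over getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants() rather than only completed commits, which is what keeps file slices after a pending compaction-requested instant visible to the reader, and fsView.getLastInstant() returns an Option on which the method calls get() directly, assuming at least one instant exists. A more defensive variant (a sketch, not how HoodieTableSource is actually written) would bail out when the timeline is empty:

// Sketch only: guard against a timeline with no instants before resolving the latest commit.
Option<HoodieInstant> lastInstant = fsView.getLastInstant();
if (!lastInstant.isPresent()) {
  return Collections.emptyList();
}
String latestCommit = lastInstant.get().getTimestamp();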