use of org.apache.hudi.hadoop.RealtimeFileStatus in project hudi by apache.
the class TestHoodieRealtimeRecordReader method testLogOnlyReader.
@Test
public void testLogOnlyReader() throws Exception {
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
  String baseInstant = "100";
  File partitionDir = InputFormatTestUtil.prepareNonPartitionedParquetTable(basePath, schema, 1, 100, baseInstant, HoodieTableType.MERGE_ON_READ);
  FileCreateUtils.createDeltaCommit(basePath.toString(), baseInstant);
  // Add the paths
  FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
  FileSlice fileSlice = new FileSlice("default", baseInstant, "fileid1");
  try {
    // update files or generate new log file
    int logVersion = 1;
    int baseInstantTs = Integer.parseInt(baseInstant);
    String instantTime = String.valueOf(baseInstantTs + logVersion);
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", baseInstant, instantTime, 100, 0, logVersion);
    long size = writer.getCurrentSize();
    writer.close();
    assertTrue(size > 0, "block - size should be > 0");
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createDeltaCommit(basePath.toString(), instantTime, commitMetadata);
    // create a split with new log file(s)
    fileSlice.addLogFile(new HoodieLogFile(writer.getLogFile().getPath(), size));
    RealtimeFileStatus realtimeFileStatus = new RealtimeFileStatus(new FileStatus(writer.getLogFile().getFileSize(), false, 1, 1, 0, writer.getLogFile().getPath()), basePath.toString(), fileSlice.getLogFiles().collect(Collectors.toList()), false, Option.empty());
    realtimeFileStatus.setMaxCommitTime(instantTime);
    HoodieRealtimePath realtimePath = (HoodieRealtimePath) realtimeFileStatus.getPath();
    HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, 0, new String[] { "" }), realtimePath);
    JobConf newJobConf = new JobConf(baseJobConf);
    List<Schema.Field> fields = schema.getFields();
    setHiveColumnNameProps(fields, newJobConf, false);
    // create a dummy RecordReader to be used by HoodieRealtimeRecordReader
    RecordReader<NullWritable, ArrayWritable> reader = new HoodieRealtimeRecordReader(split, newJobConf, new HoodieEmptyRecordReader(split, newJobConf));
    // use reader to read log file.
    NullWritable key = reader.createKey();
    ArrayWritable value = reader.createValue();
    while (reader.next(key, value)) {
      Writable[] values = value.get();
      assertEquals(instantTime, values[0].toString());
      key = reader.createKey();
      value = reader.createValue();
    }
    reader.close();
  } catch (Exception e) {
    throw new HoodieException(e.getMessage(), e);
  }
}
use of org.apache.hudi.hadoop.RealtimeFileStatus in project hudi by apache.
the class HoodieMergeOnReadTableInputFormat method listStatusForIncrementalMode.
/**
 * Keep the logic of mor_incr_view the same as the Spark datasource.
 * Step 1: Get the list of commits to be fetched based on the start commit and max commits (for snapshot, max commits is -1).
 * Step 2: Get the list of affected file statuses for these commits.
 * Step 3: Construct a HoodieTableFileSystemView based on those affected file statuses.
 *   a. Filter affected partitions based on inputPaths.
 *   b. Get the list of fileGroups for the affected partitions via fsView.getAllFileGroups.
 * Step 4: Set input paths based on the filtered affected partition paths. This changes the original input paths passed to
 *   this method: some partitions have no commits in the trimmed-down list of commits, hence this step is needed.
 * Step 5: Find candidate fileStatuses, since the baseFileStatus obtained from HoodieTableFileSystemView
 *   is missing file size information; the candidate fileStatuses are used to fill in the size information for the baseFileStatus.
 * Step 6: For every file group from step 3(b):
 *   Get the first available base file from all file slices, use the candidate file status to update the baseFileStatus,
 *   then construct a RealtimeFileStatus and add it to the result along with its log files.
 *   If the file group has only log files, construct a RealtimeFileStatus and add it to the result.
 * TODO: unify the incremental view code between hive/spark-sql and spark datasource
 */
@Override
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths, String incrementalTableName) throws IOException {
  List<FileStatus> result = new ArrayList<>();
  Job jobContext = Job.getInstance(job);
  // step1
  Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
  if (!timeline.isPresent()) {
    return result;
  }
  HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTableName, timeline.get());
  Option<List<HoodieInstant>> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList()));
  if (!commitsToCheck.isPresent()) {
    return result;
  }
  // step2
  commitsToCheck.get().sort(HoodieInstant::compareTo);
  List<HoodieCommitMetadata> metadataList = commitsToCheck.get().stream().map(instant -> {
    try {
      return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn);
    } catch (IOException e) {
      throw new HoodieException(String.format("cannot get metadata for instant: %s", instant));
    }
  }).collect(Collectors.toList());
  // build fileGroup from fsView
  List<FileStatus> affectedFileStatus = Arrays.asList(HoodieInputFormatUtils.listAffectedFilesForCommits(job, new Path(tableMetaClient.getBasePath()), metadataList));
  // step3
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn, affectedFileStatus.toArray(new FileStatus[0]));
  // build fileGroup from fsView
  Path basePath = new Path(tableMetaClient.getBasePath());
  // filter affectedPartition by inputPaths
  List<String> affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream().filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList());
  if (affectedPartition.isEmpty()) {
    return result;
  }
  List<HoodieFileGroup> fileGroups = affectedPartition.stream().flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath)).collect(Collectors.toList());
  // step4
  setInputPaths(job, affectedPartition.stream().map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString()).collect(Collectors.joining(",")));
  // step5
  // find all file status in partitionPaths.
  FileStatus[] fileStatuses = doListStatus(job);
  Map<String, FileStatus> candidateFileStatus = new HashMap<>();
  for (int i = 0; i < fileStatuses.length; i++) {
    String key = fileStatuses[i].getPath().toString();
    candidateFileStatus.put(key, fileStatuses[i]);
  }
  Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient);
  String maxCommitTime = fsView.getLastInstant().get().getTimestamp();
  // step6
  result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus, virtualKeyInfoOpt));
  return result;
}
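The Step 1 timeline filtering above is driven by per-table Hive session properties rather than by method arguments. Below is a minimal sketch of how a JobConf could be set up to request an incremental read; the property names follow Hudi's documented hoodie.<table>.consume.* convention for Hive incremental queries, while the table name "test_table" and the timestamps are purely illustrative assumptions.
// Minimal sketch (assumed table name "test_table"; timestamps are illustrative).
JobConf jobConf = new JobConf();
// Switch this table from snapshot to incremental query mode.
jobConf.set("hoodie.test_table.consume.mode", "INCREMENTAL");
// Only commits after this instant are considered (feeds Step 1).
jobConf.set("hoodie.test_table.consume.start.timestamp", "100");
// Cap the number of commits to fetch; -1 is treated as "all commits after the start instant".
jobConf.set("hoodie.test_table.consume.max.commits", "3");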
use of org.apache.hudi.hadoop.RealtimeFileStatus in project hudi by apache.
the class HoodieMergeOnReadTableInputFormat method createRealtimeFileStatusUnchecked.
/**
 * Creates {@link RealtimeFileStatus} for the file-slice where the base file is present.
 */
private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieBaseFile baseFile, Stream<HoodieLogFile> logFiles, String basePath, Option<HoodieInstant> latestCompletedInstantOpt, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
  FileStatus baseFileStatus = getFileStatusUnchecked(baseFile);
  List<HoodieLogFile> sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
  try {
    RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(baseFileStatus, basePath, sortedLogFiles, false, virtualKeyInfoOpt);
    if (latestCompletedInstantOpt.isPresent()) {
      HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get();
      checkState(latestCompletedInstant.isCompleted());
      rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp());
    }
    if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
      rtFileStatus.setBootStrapFileStatus(baseFileStatus);
    }
    return rtFileStatus;
  } catch (IOException e) {
    throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e);
  }
}
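A rough sketch of how such a status is typically consumed downstream, mirroring the test shown earlier in this page; the baseFile, logFiles, and option arguments are assumed to already be in scope, and the cast follows the test's use of RealtimeFileStatus.getPath().
// Hypothetical usage (arguments assumed in scope).
RealtimeFileStatus rtStatus = createRealtimeFileStatusUnchecked(baseFile, logFiles, basePath, latestCompletedInstantOpt, virtualKeyInfoOpt);
// The returned status exposes a realtime path carrying the sorted log files and the max commit time,
// which is then wrapped into a realtime split, as in the test above.
HoodieRealtimePath realtimePath = (HoodieRealtimePath) rtStatus.getPath();
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(realtimePath, 0, rtStatus.getLen(), new String[] { "" }), realtimePath);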
use of org.apache.hudi.hadoop.RealtimeFileStatus in project hudi by apache.
the class HoodieMergeOnReadTableInputFormat method collectAllIncrementalFiles.
private static List<FileStatus> collectAllIncrementalFiles(List<HoodieFileGroup> fileGroups, String maxCommitTime, String basePath, Map<String, FileStatus> candidateFileStatus, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
  List<FileStatus> result = new ArrayList<>();
  fileGroups.stream().forEach(f -> {
    try {
      List<FileSlice> baseFiles = f.getAllFileSlices().filter(slice -> slice.getBaseFile().isPresent()).collect(Collectors.toList());
      if (!baseFiles.isEmpty()) {
        FileStatus baseFileStatus = HoodieInputFormatUtils.getFileStatus(baseFiles.get(0).getBaseFile().get());
        String baseFilePath = baseFileStatus.getPath().toUri().toString();
        if (!candidateFileStatus.containsKey(baseFilePath)) {
          throw new HoodieException("Error obtaining fileStatus for file: " + baseFilePath);
        }
        List<HoodieLogFile> deltaLogFiles = f.getLatestFileSlice().get().getLogFiles().collect(Collectors.toList());
        // We cannot use baseFileStatus directly here, since the status reconstructed from the file-system view
        // is missing file size information; candidateFileStatus.get(baseFilePath) supplies the listed status
        // with the correct size.
        RealtimeFileStatus fileStatus = new RealtimeFileStatus(candidateFileStatus.get(baseFilePath), basePath, deltaLogFiles, true, virtualKeyInfoOpt);
        fileStatus.setMaxCommitTime(maxCommitTime);
        if (baseFileStatus instanceof LocatedFileStatusWithBootstrapBaseFile || baseFileStatus instanceof FileStatusWithBootstrapBaseFile) {
          fileStatus.setBootStrapFileStatus(baseFileStatus);
        }
        result.add(fileStatus);
      }
      // add file groups which have only log files.
      if (f.getLatestFileSlice().isPresent() && baseFiles.isEmpty()) {
        List<FileStatus> logFileStatus = f.getLatestFileSlice().get().getLogFiles().map(logFile -> logFile.getFileStatus()).collect(Collectors.toList());
        if (logFileStatus.size() > 0) {
          List<HoodieLogFile> deltaLogFiles = logFileStatus.stream().map(l -> new HoodieLogFile(l.getPath(), l.getLen())).collect(Collectors.toList());
          RealtimeFileStatus fileStatus = new RealtimeFileStatus(logFileStatus.get(0), basePath, deltaLogFiles, true, virtualKeyInfoOpt);
          fileStatus.setMaxCommitTime(maxCommitTime);
          result.add(fileStatus);
        }
      }
    } catch (IOException e) {
      throw new HoodieException("Error obtaining data file/log file grouping ", e);
    }
  });
  return result;
}
use of org.apache.hudi.hadoop.RealtimeFileStatus in project hudi by apache.
the class HoodieMergeOnReadTableInputFormat method createRealtimeFileStatusUnchecked.
/**
 * Creates {@link RealtimeFileStatus} for the file-slice where the base file is NOT present.
 */
private static RealtimeFileStatus createRealtimeFileStatusUnchecked(HoodieLogFile latestLogFile, Stream<HoodieLogFile> logFiles, String basePath, Option<HoodieInstant> latestCompletedInstantOpt, Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt) {
  List<HoodieLogFile> sortedLogFiles = logFiles.sorted(HoodieLogFile.getLogFileComparator()).collect(Collectors.toList());
  try {
    RealtimeFileStatus rtFileStatus = new RealtimeFileStatus(latestLogFile.getFileStatus(), basePath, sortedLogFiles, false, virtualKeyInfoOpt);
    if (latestCompletedInstantOpt.isPresent()) {
      HoodieInstant latestCompletedInstant = latestCompletedInstantOpt.get();
      checkState(latestCompletedInstant.isCompleted());
      rtFileStatus.setMaxCommitTime(latestCompletedInstant.getTimestamp());
    }
    return rtFileStatus;
  } catch (IOException e) {
    throw new HoodieIOException(String.format("Failed to init %s", RealtimeFileStatus.class.getSimpleName()), e);
  }
}
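Taken together, the two overloads cover both shapes of a file slice. The sketch below is an illustrative (not source-confirmed) view of how a caller might dispatch between them; the fileSlice, basePath, and option variables are assumed to be in scope, and the "latest log file" is derived here via the comparator used above rather than any specific FileSlice helper.
// Illustrative dispatch between the two overloads (variables assumed in scope).
Option<HoodieBaseFile> baseFileOpt = fileSlice.getBaseFile();
RealtimeFileStatus rtStatus;
if (baseFileOpt.isPresent()) {
  // File slice has a base file: attach all of its log files to the base file's status.
  rtStatus = createRealtimeFileStatusUnchecked(baseFileOpt.get(), fileSlice.getLogFiles(), basePath, latestCompletedInstantOpt, virtualKeyInfoOpt);
} else {
  // Log-only file slice: the latest log file stands in for the base file.
  HoodieLogFile latestLogFile = fileSlice.getLogFiles().max(HoodieLogFile.getLogFileComparator()).get();
  rtStatus = createRealtimeFileStatusUnchecked(latestLogFile, fileSlice.getLogFiles(), basePath, latestCompletedInstantOpt, virtualKeyInfoOpt);
}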