Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class IncrementalInputSplits, method inputSplits.
/**
 * Returns the incremental input splits.
 *
 * @param metaClient    The meta client
 * @param hadoopConf    The Hadoop configuration
 * @param issuedInstant The last issued instant, only valid in streaming read
 * @return The list of incremental input splits, or empty if there are no new instants
 */
public Result inputSplits(
    HoodieTableMetaClient metaClient,
    org.apache.hadoop.conf.Configuration hadoopConf,
    String issuedInstant) {
  metaClient.reloadActiveTimeline();
  HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants();
  if (commitTimeline.empty()) {
    LOG.warn("No splits found for the table under path " + path);
    return Result.EMPTY;
  }
  List<HoodieInstant> instants = filterInstantsWithRange(commitTimeline, issuedInstant);
  // get the latest instant that satisfies the condition
  final HoodieInstant instantToIssue = instants.size() == 0 ? null : instants.get(instants.size() - 1);
  final InstantRange instantRange;
  if (instantToIssue != null) {
    if (issuedInstant != null) {
      // the streaming reader may record the last issued instant; if the issued instant is present,
      // the instant range should be: (issued instant, the latest instant].
      instantRange = InstantRange.getInstance(issuedInstant, instantToIssue.getTimestamp(),
          InstantRange.RangeType.OPEN_CLOSE);
    } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) {
      // first-time consumption with a start commit specified
      final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT);
      instantRange = startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST)
          ? null
          : InstantRange.getInstance(startCommit, instantToIssue.getTimestamp(),
              InstantRange.RangeType.CLOSE_CLOSE);
    } else {
      // first-time consumption with no start commit: consume the latest incremental data set.
      instantRange = InstantRange.getInstance(instantToIssue.getTimestamp(), instantToIssue.getTimestamp(),
          InstantRange.RangeType.CLOSE_CLOSE);
    }
  } else {
    LOG.info("No new instant found for the table under path " + path + ", skip reading");
    return Result.EMPTY;
  }
  String tableName = conf.getString(FlinkOptions.TABLE_NAME);
  Set<String> writePartitions;
  final FileStatus[] fileStatuses;
  if (instantRange == null) {
    // reading from the earliest commit: scan the partitions and files directly.
    FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf);
    if (this.requiredPartitions != null) {
      // apply partition push down
      fileIndex.setPartitionPaths(this.requiredPartitions);
    }
    writePartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths());
    if (writePartitions.size() == 0) {
      LOG.warn("No partitions found for reading in the user-provided path.");
      return Result.EMPTY;
    }
    fileStatuses = fileIndex.getFilesInPartitions();
  } else {
    List<HoodieCommitMetadata> activeMetadataList = instants.stream()
        .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline))
        .collect(Collectors.toList());
    List<HoodieCommitMetadata> archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName);
    if (archivedMetadataList.size() > 0) {
      LOG.warn("\n"
          + "--------------------------------------------------------------------------------\n"
          + "---------- caution: the reader has fallen behind the writer too much,\n"
          + "---------- tweak the 'read.tasks' option to increase the parallelism of the read tasks.\n"
          + "--------------------------------------------------------------------------------");
    }
    // IMPORTANT: the merged metadata list must be in ascending order by instant time
    List<HoodieCommitMetadata> metadataList = archivedMetadataList.size() > 0
        ? mergeList(archivedMetadataList, activeMetadataList)
        : activeMetadataList;
    writePartitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList);
    // apply partition push down
    if (this.requiredPartitions != null) {
      writePartitions = writePartitions.stream()
          .filter(this.requiredPartitions::contains)
          .collect(Collectors.toSet());
    }
    if (writePartitions.size() == 0) {
      LOG.warn("No partitions found for reading in the user-provided path.");
      return Result.EMPTY;
    }
    fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType());
  }
  if (fileStatuses.length == 0) {
    LOG.warn("No files found for reading in the user-provided path.");
    return Result.EMPTY;
  }
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses);
  final String endInstant = instantToIssue.getTimestamp();
  final AtomicInteger cnt = new AtomicInteger(0);
  final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
  List<MergeOnReadInputSplit> inputSplits = writePartitions.stream()
      .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant)
          .map(fileSlice -> {
            Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                .sorted(HoodieLogFile.getLogFileComparator())
                .map(logFile -> logFile.getPath().toString())
                .collect(Collectors.toList()));
            String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
            return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, endInstant,
                metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange);
          })
          .collect(Collectors.toList()))
      .flatMap(Collection::stream)
      .collect(Collectors.toList());
  return Result.instance(inputSplits, endInstant);
}
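For context, a streaming source would typically poll inputSplits periodically and forward the returned splits downstream. The following is a minimal, hedged sketch of such a polling loop, not the actual Hudi implementation: the field names (incrementalInputSplits, metaClient, hadoopConf, issuedInstant) and the Result accessors isEmpty(), getInputSplits(), and getEndInstant() are assumptions, inferred only from Result.instance(inputSplits, endInstant) and Result.EMPTY above.

// Hedged sketch of a caller-side polling loop; names marked below are assumptions.
void pollAndForwardSplits(SourceFunction.SourceContext<MergeOnReadInputSplit> context) {
  IncrementalInputSplits.Result result =
      incrementalInputSplits.inputSplits(metaClient, hadoopConf, this.issuedInstant); // assumed fields
  if (result.isEmpty()) {
    return; // nothing new since the last issued instant
  }
  for (MergeOnReadInputSplit split : result.getInputSplits()) {
    context.collect(split); // hand each split to the downstream read operator
  }
  // remember the last issued instant so the next poll reads the range (issuedInstant, latest]
  this.issuedInstant = result.getEndInstant();
}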
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class StreamReadOperator, method initializeState.
@Override
public void initializeState(StateInitializationContext context) throws Exception {
  super.initializeState(context);
  // TODO Replace Java serialization with Avro approach to keep state compatibility.
  inputSplitsState = context.getOperatorStateStore()
      .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>()));
  // Initialize the current split state to IDLE.
  currentSplitState = SplitState.IDLE;
  // Recover splits state from flink state backend if possible.
  splits = new LinkedBlockingDeque<>();
  if (context.isRestored()) {
    int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask();
    LOG.info("Restoring state for operator {} (task ID: {}).", getClass().getSimpleName(), subtaskIdx);
    for (MergeOnReadInputSplit split : inputSplitsState.get()) {
      splits.add(split);
    }
  }
  this.sourceContext = StreamSourceContexts.getSourceContext(
      getOperatorConfig().getTimeCharacteristic(),
      getProcessingTimeService(),
      new Object(), // no actual locking needed
      output,
      getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(),
      -1,
      true);
  // Enqueue to process the recovered input splits.
  enqueueProcessSplits();
}
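The splits restored here would normally be written back to operator state on each checkpoint. As a rough, hedged sketch of that counterpart, assuming the pending queue is simply persisted into inputSplitsState (this is an illustration, not necessarily the exact Hudi implementation):

// Hedged sketch of the snapshot counterpart to initializeState.
@Override
public void snapshotState(StateSnapshotContext context) throws Exception {
  super.snapshotState(context);
  // Persist whatever is still queued so a restore can resume from the same splits.
  inputSplitsState.clear();
  inputSplitsState.addAll(new ArrayList<>(splits));
}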
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class StreamReadOperator, method processSplits.
private void processSplits() throws IOException {
  MergeOnReadInputSplit split = splits.peek();
  if (split == null) {
    currentSplitState = SplitState.IDLE;
    return;
  }
  // each runnable reads at most #MINI_BATCH_SIZE records
  if (format.isClosed()) {
    // This log is important to indicate the consuming progress:
    // there is only one log message for each data bucket.
    LOG.info("Processing input split : {}", split);
    format.open(split);
  }
  try {
    consumeAsMiniBatch(split);
  } finally {
    currentSplitState = SplitState.IDLE;
  }
  // Re-schedule to process the next split.
  enqueueProcessSplits();
}
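processSplits relies on two helpers that are not shown in this snippet: enqueueProcessSplits, which schedules the method, and consumeAsMiniBatch, which emits at most a mini-batch of records per run. Below is a hedged sketch of what they could look like; the executor field, the MINI_BATCH_SIZE constant, and split.consume() are assumptions for illustration only.

// Hedged sketch only; the real helpers live in StreamReadOperator.
private void enqueueProcessSplits() {
  if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) {
    currentSplitState = SplitState.RUNNING;
    executor.execute(this::processSplits, "process input split"); // assumed MailboxExecutor field
  }
}

private void consumeAsMiniBatch(MergeOnReadInputSplit split) throws IOException {
  for (int i = 0; i < MINI_BATCH_SIZE; i++) {            // assumed batch-size constant
    if (!format.reachedEnd()) {
      sourceContext.collect(format.nextRecord(null));    // emit one record downstream
      split.consume();                                   // assumed progress tracking on the split
    } else {
      // the split is exhausted: close the format and drop it from the queue
      format.close();
      splits.poll();
      break;
    }
  }
}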
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class TestStreamReadMonitoringFunction, method testConsumeFromLatestCommit.
@Test
public void testConsumeFromLatestCommit() throws Exception {
  // write 2 commits first, and all the splits should come from the second commit.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);
  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");
    String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath());
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(latestCommit)),
        "All the splits should be with latestCommit instant time");

    // Stop the stream task.
    function.close();
  }
}
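The tests rely on two small helpers that are not shown here: runAsync, which drives the monitor function on a background thread, and CollectingSourceContext, which records the emitted splits and counts down the latch. A hedged sketch of the runAsync side, assuming the monitor function exposes a SourceFunction-style run(SourceContext) entry point (signature and behavior are assumptions):

// Hedged sketch of the runAsync test helper; not the actual test utility.
private void runAsync(
    SourceFunction.SourceContext<MergeOnReadInputSplit> sourceContext,
    StreamReadMonitoringFunction function) {
  Thread task = new Thread(() -> {
    try {
      function.run(sourceContext); // assumed SourceFunction-style entry point
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  });
  task.setDaemon(true);
  task.start();
}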
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class TestStreamReadMonitoringFunction, method testConsumeFromEarliestCommit.
@Test
public void testConsumeFromEarliestCommit() throws Exception {
  // write 2 commits first, then specify the start commit as 'earliest':
  // all the splits should come from the earliest commit.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath());
  conf.setString(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);
  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().noneMatch(split -> split.getInstantRange().isPresent()),
        "No instants should have range limit");
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(specifiedCommit)),
        "All the splits should be with specified instant time");

    // Stop the stream task.
    function.close();
  }
}