Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class IncrementalInputSplits, method inputSplits.
/**
 * Returns the incremental input splits.
 *
 * @param metaClient    The meta client
 * @param hadoopConf    The Hadoop configuration
 * @param issuedInstant The last issued instant, only valid in streaming read
 * @return The list of incremental input splits, or empty if there are no new instants
 */
public Result inputSplits(
    HoodieTableMetaClient metaClient,
    org.apache.hadoop.conf.Configuration hadoopConf,
    String issuedInstant) {
  metaClient.reloadActiveTimeline();
  HoodieTimeline commitTimeline = metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants();
  if (commitTimeline.empty()) {
    LOG.warn("No splits found for the table under path " + path);
    return Result.EMPTY;
  }
  List<HoodieInstant> instants = filterInstantsWithRange(commitTimeline, issuedInstant);
  // get the latest instant that satisfies the condition
  final HoodieInstant instantToIssue = instants.size() == 0 ? null : instants.get(instants.size() - 1);
  final InstantRange instantRange;
  if (instantToIssue != null) {
    if (issuedInstant != null) {
      // the streaming reader may record the last issued instant; if the issued instant is present,
      // the instant range should be: (issued instant, the latest instant].
      instantRange = InstantRange.getInstance(issuedInstant, instantToIssue.getTimestamp(),
          InstantRange.RangeType.OPEN_CLOSE);
    } else if (this.conf.getOptional(FlinkOptions.READ_START_COMMIT).isPresent()) {
      // first-time consumption with a start commit specified
      final String startCommit = this.conf.getString(FlinkOptions.READ_START_COMMIT);
      instantRange = startCommit.equalsIgnoreCase(FlinkOptions.START_COMMIT_EARLIEST)
          ? null
          : InstantRange.getInstance(startCommit, instantToIssue.getTimestamp(),
              InstantRange.RangeType.CLOSE_CLOSE);
    } else {
      // first-time consumption with no start commit: consume the latest incremental data set.
      instantRange = InstantRange.getInstance(instantToIssue.getTimestamp(), instantToIssue.getTimestamp(),
          InstantRange.RangeType.CLOSE_CLOSE);
    }
  } else {
    LOG.info("No new instant found for the table under path " + path + ", skip reading");
    return Result.EMPTY;
  }
  String tableName = conf.getString(FlinkOptions.TABLE_NAME);
  Set<String> writePartitions;
  final FileStatus[] fileStatuses;
  if (instantRange == null) {
    // reading from the earliest commit: scan the partitions and files directly.
    FileIndex fileIndex = FileIndex.instance(new org.apache.hadoop.fs.Path(path.toUri()), conf);
    if (this.requiredPartitions != null) {
      // apply partition push down
      fileIndex.setPartitionPaths(this.requiredPartitions);
    }
    writePartitions = new HashSet<>(fileIndex.getOrBuildPartitionPaths());
    if (writePartitions.size() == 0) {
      LOG.warn("No partitions found for reading in the user-provided path.");
      return Result.EMPTY;
    }
    fileStatuses = fileIndex.getFilesInPartitions();
  } else {
    List<HoodieCommitMetadata> activeMetadataList = instants.stream()
        .map(instant -> WriteProfiles.getCommitMetadata(tableName, path, instant, commitTimeline))
        .collect(Collectors.toList());
    List<HoodieCommitMetadata> archivedMetadataList = getArchivedMetadata(metaClient, instantRange, commitTimeline, tableName);
    if (archivedMetadataList.size() > 0) {
      LOG.warn("\n"
          + "--------------------------------------------------------------------------------\n"
          + "---------- caution: the reader has fallen behind the writer too much,\n"
          + "---------- tweak the 'read.tasks' option to increase the parallelism of the read tasks.\n"
          + "--------------------------------------------------------------------------------");
    }
    // IMPORTANT: the merged metadata list must be in ascending order by instant time
    List<HoodieCommitMetadata> metadataList = archivedMetadataList.size() > 0
        ? mergeList(archivedMetadataList, activeMetadataList)
        : activeMetadataList;
    writePartitions = HoodieInputFormatUtils.getWritePartitionPaths(metadataList);
    // apply partition push down
    if (this.requiredPartitions != null) {
      writePartitions = writePartitions.stream()
          .filter(this.requiredPartitions::contains)
          .collect(Collectors.toSet());
    }
    if (writePartitions.size() == 0) {
      LOG.warn("No partitions found for reading in the user-provided path.");
      return Result.EMPTY;
    }
    fileStatuses = WriteProfiles.getWritePathsOfInstants(path, hadoopConf, metadataList, metaClient.getTableType());
  }
  if (fileStatuses.length == 0) {
    LOG.warn("No files found for reading in the user-provided path.");
    return Result.EMPTY;
  }
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient, commitTimeline, fileStatuses);
  final String endInstant = instantToIssue.getTimestamp();
  final AtomicInteger cnt = new AtomicInteger(0);
  final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
  List<MergeOnReadInputSplit> inputSplits = writePartitions.stream()
      .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, endInstant)
          .map(fileSlice -> {
            Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                .sorted(HoodieLogFile.getLogFileComparator())
                .map(logFile -> logFile.getPath().toString())
                .collect(Collectors.toList()));
            String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
            return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, endInstant,
                metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, instantRange);
          })
          .collect(Collectors.toList()))
      .flatMap(Collection::stream)
      .collect(Collectors.toList());
  return Result.instance(inputSplits, endInstant);
}
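For context, a streaming source would typically poll inputSplits periodically and forward the returned splits downstream. The following is a minimal, hedged sketch of such a polling loop, not the actual Hudi implementation: the field names (incrementalInputSplits, metaClient, hadoopConf, issuedInstant) and the Result accessors isEmpty(), getInputSplits(), and getEndInstant() are assumptions, inferred only from Result.instance(inputSplits, endInstant) and Result.EMPTY above.

// Hedged sketch of a caller-side polling loop; names marked below are assumptions.
void pollAndForwardSplits(SourceFunction.SourceContext<MergeOnReadInputSplit> context) {
  IncrementalInputSplits.Result result =
      incrementalInputSplits.inputSplits(metaClient, hadoopConf, this.issuedInstant); // assumed fields
  if (result.isEmpty()) {
    return; // nothing new since the last issued instant
  }
  for (MergeOnReadInputSplit split : result.getInputSplits()) {
    context.collect(split); // hand each split to the downstream read operator
  }
  // remember the last issued instant so the next poll reads the range (issuedInstant, latest]
  this.issuedInstant = result.getEndInstant();
}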
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class StreamReadOperator, method initializeState.
@Override
public void initializeState(StateInitializationContext context) throws Exception {
  super.initializeState(context);
  // TODO Replace Java serialization with Avro approach to keep state compatibility.
  inputSplitsState = context.getOperatorStateStore()
      .getListState(new ListStateDescriptor<>("splits", new JavaSerializer<>()));
  // Initialize the current split state to IDLE.
  currentSplitState = SplitState.IDLE;
  // Recover splits state from flink state backend if possible.
  splits = new LinkedBlockingDeque<>();
  if (context.isRestored()) {
    int subtaskIdx = getRuntimeContext().getIndexOfThisSubtask();
    LOG.info("Restoring state for operator {} (task ID: {}).", getClass().getSimpleName(), subtaskIdx);
    for (MergeOnReadInputSplit split : inputSplitsState.get()) {
      splits.add(split);
    }
  }
  this.sourceContext = StreamSourceContexts.getSourceContext(
      getOperatorConfig().getTimeCharacteristic(),
      getProcessingTimeService(),
      new Object(), // no actual locking needed
      output,
      getRuntimeContext().getExecutionConfig().getAutoWatermarkInterval(),
      -1,
      true);
  // Enqueue to process the recovered input splits.
  enqueueProcessSplits();
}
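The splits restored here would normally be written back to operator state on each checkpoint. As a rough, hedged sketch of that counterpart, assuming the pending queue is simply persisted into inputSplitsState (this is an illustration, not necessarily the exact Hudi implementation):

// Hedged sketch of the snapshot counterpart to initializeState.
@Override
public void snapshotState(StateSnapshotContext context) throws Exception {
  super.snapshotState(context);
  // Persist whatever is still queued so a restore can resume from the same splits.
  inputSplitsState.clear();
  inputSplitsState.addAll(new ArrayList<>(splits));
}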
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class StreamReadOperator, method processSplits.
private void processSplits() throws IOException {
  MergeOnReadInputSplit split = splits.peek();
  if (split == null) {
    currentSplitState = SplitState.IDLE;
    return;
  }
  // each runnable reads at most #MINI_BATCH_SIZE records
  if (format.isClosed()) {
    // This log is important to indicate the consuming progress:
    // there is only one log message for each data bucket.
    LOG.info("Processing input split : {}", split);
    format.open(split);
  }
  try {
    consumeAsMiniBatch(split);
  } finally {
    currentSplitState = SplitState.IDLE;
  }
  // Re-schedule to process the next split.
  enqueueProcessSplits();
}
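processSplits relies on two helpers that are not shown in this snippet: enqueueProcessSplits, which schedules the method, and consumeAsMiniBatch, which emits at most a mini-batch of records per run. Below is a hedged sketch of what they could look like; the executor field, the MINI_BATCH_SIZE constant, and split.consume() are assumptions for illustration only.

// Hedged sketch only; the real helpers live in StreamReadOperator.
private void enqueueProcessSplits() {
  if (currentSplitState == SplitState.IDLE && !splits.isEmpty()) {
    currentSplitState = SplitState.RUNNING;
    executor.execute(this::processSplits, "process input split"); // assumed MailboxExecutor field
  }
}

private void consumeAsMiniBatch(MergeOnReadInputSplit split) throws IOException {
  for (int i = 0; i < MINI_BATCH_SIZE; i++) {            // assumed batch-size constant
    if (!format.reachedEnd()) {
      sourceContext.collect(format.nextRecord(null));    // emit one record downstream
      split.consume();                                   // assumed progress tracking on the split
    } else {
      // the split is exhausted: close the format and drop it from the queue
      format.close();
      splits.poll();
      break;
    }
  }
}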
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class TestStreamReadMonitoringFunction, method testConsumeFromLatestCommit.
@Test
public void testConsumeFromLatestCommit() throws Exception {
  // write 2 commits first, and all the splits should come from the second commit.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);
  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");
    String latestCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath());
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(latestCommit)),
        "All the splits should be with latestCommit instant time");

    // Stop the stream task.
    function.close();
  }
}
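The tests rely on two small helpers that are not shown here: runAsync, which drives the monitor function on a background thread, and CollectingSourceContext, which records the emitted splits and counts down the latch. A hedged sketch of the runAsync side, assuming the monitor function exposes a SourceFunction-style run(SourceContext) entry point (signature and behavior are assumptions):

// Hedged sketch of the runAsync test helper; not the actual test utility.
private void runAsync(
    SourceFunction.SourceContext<MergeOnReadInputSplit> sourceContext,
    StreamReadMonitoringFunction function) {
  Thread task = new Thread(() -> {
    try {
      function.run(sourceContext); // assumed SourceFunction-style entry point
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  });
  task.setDaemon(true);
  task.start();
}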
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
The class TestStreamReadMonitoringFunction, method testConsumeFromEarliestCommit.
@Test
public void testConsumeFromEarliestCommit() throws Exception {
  // write 2 commits first, then specify the start commit as 'earliest':
  // all the splits should come from the earliest commit.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath());
  conf.setString(FlinkOptions.READ_START_COMMIT, FlinkOptions.START_COMMIT_EARLIEST);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);
  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().noneMatch(split -> split.getInstantRange().isPresent()),
        "No instants should have range limit");
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(specifiedCommit)),
        "All the splits should be with specified instant time");

    // Stop the stream task.
    function.close();
  }
}