Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadMonitoringFunction, method testConsumeFromSpecifiedCommit.
@Test
public void testConsumeFromSpecifiedCommit() throws Exception {
  // write 2 commits first, use the second commit time as the specified start instant,
  // all the splits should come from the second commit.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath());
  conf.setString(FlinkOptions.READ_START_COMMIT, specifiedCommit);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(specifiedCommit)),
        "All the splits should be with specified instant time");

    // Stop the stream task.
    function.close();
  }
}
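The CollectingSourceContext helper is not part of this listing. Below is a minimal sketch of what such a collecting SourceContext could look like, assuming only Flink's SourceFunction.SourceContext and Watermark types plus java.io.File, java.util.concurrent.CountDownLatch, and java.util.stream.Collectors; the class name matches the tests above, but the field names and the way partition paths are derived are assumptions for illustration, not the project's actual helper.

// Minimal sketch (assumption, not the project's actual helper): record every forwarded
// split, count down the latch per split, and derive partition paths from split file paths.
private static class CollectingSourceContext implements SourceFunction.SourceContext<MergeOnReadInputSplit> {

  final List<MergeOnReadInputSplit> splits = new ArrayList<>();
  private final Object checkpointLock = new Object();
  private CountDownLatch latch;

  CollectingSourceContext(CountDownLatch latch) {
    this.latch = latch;
  }

  @Override
  public void collect(MergeOnReadInputSplit split) {
    splits.add(split);
    latch.countDown();
  }

  @Override
  public void collectWithTimestamp(MergeOnReadInputSplit split, long timestamp) {
    collect(split);
  }

  @Override
  public void emitWatermark(Watermark mark) {
    // Watermarks are irrelevant for split generation.
  }

  @Override
  public void markAsTemporarilyIdle() {
    // No-op for this test helper.
  }

  @Override
  public Object getCheckpointLock() {
    return checkpointLock;
  }

  @Override
  public void close() {
    // Nothing to release.
  }

  String getPartitionPaths() {
    // Assumption for illustration: take the parent directory name of whichever file the
    // split carries (base file if present, otherwise the first log file).
    return splits.stream()
        .map(split -> split.getBasePath().isPresent()
            ? split.getBasePath().get()
            : split.getLogPaths().get().get(0))
        .map(path -> new File(path).getParentFile().getName())
        .distinct()
        .sorted()
        .collect(Collectors.joining(","));
  }

  void reset(CountDownLatch latch) {
    this.latch = latch;
    this.splits.clear();
  }
}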
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadMonitoringFunction, method testConsumeFromLastCommit.
@Test
public void testConsumeFromLastCommit() throws Exception {
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All instants should have range limit");

    Thread.sleep(1000L);

    // reset the source context
    latch = new CountDownLatch(4);
    sourceContext.reset(latch);

    // write another instant and validate
    TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");

    // Stop the stream task.
    function.close();
  }
}
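The runAsync helper is also not shown in this listing. A minimal sketch, assuming it simply runs the monitoring function's run loop on a background thread so the test thread stays free to await the latch (the thread name and error handling are assumptions for illustration):

// Minimal sketch (assumption): drive the source function on a separate daemon thread so
// the test thread can await the CountDownLatch and issue its assertions.
private void runAsync(CollectingSourceContext sourceContext, StreamReadMonitoringFunction function) {
  Thread runner = new Thread(() -> {
    try {
      function.run(sourceContext);
    } catch (Exception e) {
      throw new RuntimeException("Monitoring function failed", e);
    }
  }, "stream-read-monitoring");
  runner.setDaemon(true);
  runner.start();
}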
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadMonitoringFunction, method testCheckpointRestore.
@Test
public void testCheckpointRestore() throws Exception {
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);
  OperatorSubtaskState state;

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    Thread.sleep(1000L);

    state = harness.snapshot(1, 1);

    // Stop the stream task.
    function.close();

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All instants should have range limit");
  }

  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  StreamReadMonitoringFunction function2 = TestUtils.getMonitorFunc(conf);

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function2)) {
    harness.setup();
    // Recover to process the remaining snapshots.
    harness.initializeState(state);
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function2);

    // Stop the stream task.
    function.close();

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");
  }
}
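createHarness is likewise not part of the listing. A minimal sketch, assuming the standard Flink test pattern of wrapping the source function in a StreamSource operator inside an AbstractStreamOperatorTestHarness (the parallelism and subtask-index values are an assumption):

// Minimal sketch (assumption): wrap the source function in a StreamSource operator so the
// harness can drive setup/open, snapshot the state, and restore it in a second test phase.
private AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> createHarness(
    StreamReadMonitoringFunction function) throws Exception {
  StreamSource<MergeOnReadInputSplit, StreamReadMonitoringFunction> source = new StreamSource<>(function);
  return new AbstractStreamOperatorTestHarness<>(source, 1, 1, 0);
}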
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadOperator, method testCheckpointRestore.
@Test
public void testCheckpointRestore() throws Exception {
  TestData.writeData(TestData.DATA_SET_INSERT, conf);

  OperatorSubtaskState state;
  final List<MergeOnReadInputSplit> splits;

  try (OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = createReader()) {
    harness.setup();
    harness.open();

    StreamReadMonitoringFunction func = TestUtils.getMonitorFunc(conf);
    splits = generateSplits(func);
    assertThat("Should have 4 splits", splits.size(), is(4));

    // Enqueue all the splits.
    for (MergeOnReadInputSplit split : splits) {
      harness.processElement(split, -1);
    }

    // Read all records from the first 2 splits.
    SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
    for (int i = 0; i < 2; i++) {
      assertTrue(localMailbox.runMailboxStep(), "Should have processed the split#" + i);
    }
    assertThat(TestData.rowDataToString(harness.extractOutputValues()),
        is(getSplitExpected(splits.subList(0, 2), EXPECTED)));

    // Snapshot state now, there are 2 splits left in the state.
    state = harness.snapshot(1, 1);
  }

  try (OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = createReader()) {
    harness.setup();
    // Recover to process the remaining splits.
    harness.initializeState(state);
    harness.open();

    SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
    for (int i = 2; i < 4; i++) {
      assertTrue(localMailbox.runMailboxStep(), "Should have processed one split#" + i);
    }

    // Expect to output the remaining data.
    assertThat(TestData.rowDataToString(harness.extractOutputValues()),
        is(getSplitExpected(splits.subList(2, 4), EXPECTED)));
  }
}
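getSplitExpected is another helper not included in the listing. A hypothetical stand-in, assuming EXPECTED holds one expected row string per split indexed by the split number (both the shape of EXPECTED and the lookup are assumptions, purely for illustration):

// Hypothetical helper (illustration only): gather the expected row strings for a sub-list
// of splits, assuming one expected entry per split keyed by its split number.
private static String getSplitExpected(List<MergeOnReadInputSplit> splits, List<String> expected) {
  return splits.stream()
      .map(split -> expected.get(split.getSplitNumber()))
      .collect(Collectors.joining("\n"));
}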
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadOperator, method createReader.
private OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> createReader() throws Exception {
  final String basePath = tempFile.getAbsolutePath();
  final org.apache.hadoop.conf.Configuration hadoopConf = StreamerUtil.getHadoopConf();
  final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(hadoopConf)
      .setBasePath(basePath)
      .build();
  final List<String> partitionKeys = Collections.singletonList("partition");

  // This input format is used to open the emitted splits.
  TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
  final Schema tableAvroSchema;
  try {
    tableAvroSchema = schemaResolver.getTableAvroSchema();
  } catch (Exception e) {
    throw new HoodieException("Get table avro schema error", e);
  }
  final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
  final RowType rowType = (RowType) rowDataType.getLogicalType();

  final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState(
      rowType,
      TestConfigurations.ROW_TYPE,
      tableAvroSchema.toString(),
      AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(),
      Collections.emptyList(),
      new String[0]);
  MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder()
      .config(conf)
      .tableState(hoodieTableState)
      .fieldTypes(rowDataType.getChildren())
      .defaultPartName("default")
      .limit(1000L)
      .emitDelete(true)
      .build();

  OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory(inputFormat);
  OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness =
      new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0);
  harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime);
  return harness;
}
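The MergeOnReadInputFormat built here is the same component the StreamReadOperator opens for every incoming split. As a usage note, a minimal sketch of reading one split directly through the input format, assuming only the standard Flink InputFormat life cycle (open / reachedEnd / nextRecord / close); a real caller may additionally need to configure the format and copy reused row instances:

// Minimal sketch (assumption): read a single split through the input format, mirroring the
// life cycle the StreamReadOperator runs for every enqueued MergeOnReadInputSplit.
private static List<RowData> readSplit(MergeOnReadInputFormat format, MergeOnReadInputSplit split) throws IOException {
  List<RowData> rows = new ArrayList<>();
  format.open(split);
  try {
    while (!format.reachedEnd()) {
      // Note: if the format reuses row objects, callers should copy or stringify each row here.
      rows.add(format.nextRecord(null));
    }
  } finally {
    format.close();
  }
  return rows;
}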