Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadMonitoringFunction, method testConsumeFromSpecifiedCommit.
@Test
public void testConsumeFromSpecifiedCommit() throws Exception {
  // write 2 commits first, use the second commit time as the specified start instant,
  // all the splits should come from the second commit.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  String specifiedCommit = TestUtils.getLastCompleteInstant(tempFile.getAbsolutePath());
  conf.setString(FlinkOptions.READ_START_COMMIT, specifiedCommit);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getLatestCommit().equals(specifiedCommit)),
        "All the splits should be with specified instant time");

    // Stop the stream task.
    function.close();
  }
}
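The CollectingSourceContext helper is not part of this listing. Below is a minimal sketch of what such a collecting SourceContext could look like, assuming only Flink's SourceFunction.SourceContext and Watermark types plus java.io.File, java.util.concurrent.CountDownLatch, and java.util.stream.Collectors; the class name matches the tests above, but the field names and the way partition paths are derived are assumptions for illustration, not the project's actual helper.

// Minimal sketch (assumption, not the project's actual helper): record every forwarded
// split, count down the latch per split, and derive partition paths from split file paths.
private static class CollectingSourceContext implements SourceFunction.SourceContext<MergeOnReadInputSplit> {

  final List<MergeOnReadInputSplit> splits = new ArrayList<>();
  private final Object checkpointLock = new Object();
  private CountDownLatch latch;

  CollectingSourceContext(CountDownLatch latch) {
    this.latch = latch;
  }

  @Override
  public void collect(MergeOnReadInputSplit split) {
    splits.add(split);
    latch.countDown();
  }

  @Override
  public void collectWithTimestamp(MergeOnReadInputSplit split, long timestamp) {
    collect(split);
  }

  @Override
  public void emitWatermark(Watermark mark) {
    // Watermarks are irrelevant for split generation.
  }

  @Override
  public void markAsTemporarilyIdle() {
    // No-op for this test helper.
  }

  @Override
  public Object getCheckpointLock() {
    return checkpointLock;
  }

  @Override
  public void close() {
    // Nothing to release.
  }

  String getPartitionPaths() {
    // Assumption for illustration: take the parent directory name of whichever file the
    // split carries (base file if present, otherwise the first log file).
    return splits.stream()
        .map(split -> split.getBasePath().isPresent()
            ? split.getBasePath().get()
            : split.getLogPaths().get().get(0))
        .map(path -> new File(path).getParentFile().getName())
        .distinct()
        .sorted()
        .collect(Collectors.joining(","));
  }

  void reset(CountDownLatch latch) {
    this.latch = latch;
    this.splits.clear();
  }
}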
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadMonitoringFunction, method testConsumeFromLastCommit.
@Test
public void testConsumeFromLastCommit() throws Exception {
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All instants should have range limit");

    Thread.sleep(1000L);

    // reset the source context
    latch = new CountDownLatch(4);
    sourceContext.reset(latch);

    // write another instant and validate
    TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");

    // Stop the stream task.
    function.close();
  }
}
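The runAsync helper is also not shown in this listing. A minimal sketch, assuming it simply runs the monitoring function's run loop on a background thread so the test thread stays free to await the latch (the thread name and error handling are assumptions for illustration):

// Minimal sketch (assumption): drive the source function on a separate daemon thread so
// the test thread can await the CountDownLatch and issue its assertions.
private void runAsync(CollectingSourceContext sourceContext, StreamReadMonitoringFunction function) {
  Thread runner = new Thread(() -> {
    try {
      function.run(sourceContext);
    } catch (Exception e) {
      throw new RuntimeException("Monitoring function failed", e);
    }
  }, "stream-read-monitoring");
  runner.setDaemon(true);
  runner.start();
}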
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadMonitoringFunction, method testCheckpointRestore.
@Test
public void testCheckpointRestore() throws Exception {
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  StreamReadMonitoringFunction function = TestUtils.getMonitorFunc(conf);
  OperatorSubtaskState state;

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function)) {
    harness.setup();
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function);

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    Thread.sleep(1000L);

    state = harness.snapshot(1, 1);

    // Stop the stream task.
    function.close();

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All instants should have range limit");
  }

  TestData.writeData(TestData.DATA_SET_UPDATE_INSERT, conf);
  StreamReadMonitoringFunction function2 = TestUtils.getMonitorFunc(conf);

  try (AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> harness = createHarness(function2)) {
    harness.setup();
    // Recover to process the remaining snapshots.
    harness.initializeState(state);
    harness.open();

    CountDownLatch latch = new CountDownLatch(4);
    CollectingSourceContext sourceContext = new CollectingSourceContext(latch);
    runAsync(sourceContext, function2);

    // Stop the stream task.
    function.close();

    assertTrue(latch.await(WAIT_TIME_MILLIS, TimeUnit.MILLISECONDS), "Should finish splits generation");
    assertThat("Should produce the expected splits", sourceContext.getPartitionPaths(), is("par1,par2,par3,par4"));
    assertTrue(sourceContext.splits.stream().allMatch(split -> split.getInstantRange().isPresent()),
        "All the instants should have range limit");
  }
}
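createHarness is likewise not part of the listing. A minimal sketch, assuming the standard Flink test pattern of wrapping the source function in a StreamSource operator inside an AbstractStreamOperatorTestHarness (the parallelism and subtask-index values are an assumption):

// Minimal sketch (assumption): wrap the source function in a StreamSource operator so the
// harness can drive setup/open, snapshot the state, and restore it in a second test phase.
private AbstractStreamOperatorTestHarness<MergeOnReadInputSplit> createHarness(
    StreamReadMonitoringFunction function) throws Exception {
  StreamSource<MergeOnReadInputSplit, StreamReadMonitoringFunction> source = new StreamSource<>(function);
  return new AbstractStreamOperatorTestHarness<>(source, 1, 1, 0);
}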
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadOperator, method testCheckpointRestore.
@Test
public void testCheckpointRestore() throws Exception {
  TestData.writeData(TestData.DATA_SET_INSERT, conf);

  OperatorSubtaskState state;
  final List<MergeOnReadInputSplit> splits;

  try (OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = createReader()) {
    harness.setup();
    harness.open();

    StreamReadMonitoringFunction func = TestUtils.getMonitorFunc(conf);
    splits = generateSplits(func);
    assertThat("Should have 4 splits", splits.size(), is(4));

    // Enqueue all the splits.
    for (MergeOnReadInputSplit split : splits) {
      harness.processElement(split, -1);
    }

    // Read all records from the first 2 splits.
    SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
    for (int i = 0; i < 2; i++) {
      assertTrue(localMailbox.runMailboxStep(), "Should have processed the split#" + i);
    }
    assertThat(TestData.rowDataToString(harness.extractOutputValues()),
        is(getSplitExpected(splits.subList(0, 2), EXPECTED)));

    // Snapshot state now, there are 2 splits left in the state.
    state = harness.snapshot(1, 1);
  }

  try (OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = createReader()) {
    harness.setup();
    // Recover to process the remaining splits.
    harness.initializeState(state);
    harness.open();

    SteppingMailboxProcessor localMailbox = createLocalMailbox(harness);
    for (int i = 2; i < 4; i++) {
      assertTrue(localMailbox.runMailboxStep(), "Should have processed one split#" + i);
    }

    // Expect to output the remaining data.
    assertThat(TestData.rowDataToString(harness.extractOutputValues()),
        is(getSplitExpected(splits.subList(2, 4), EXPECTED)));
  }
}
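getSplitExpected is another helper not included in the listing. A hypothetical stand-in, assuming EXPECTED holds one expected row string per split indexed by the split number (both the shape of EXPECTED and the lookup are assumptions, purely for illustration):

// Hypothetical helper (illustration only): gather the expected row strings for a sub-list
// of splits, assuming one expected entry per split keyed by its split number.
private static String getSplitExpected(List<MergeOnReadInputSplit> splits, List<String> expected) {
  return splits.stream()
      .map(split -> expected.get(split.getSplitNumber()))
      .collect(Collectors.joining("\n"));
}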
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
Class TestStreamReadOperator, method createReader.
private OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> createReader() throws Exception {
  final String basePath = tempFile.getAbsolutePath();
  final org.apache.hadoop.conf.Configuration hadoopConf = StreamerUtil.getHadoopConf();
  final HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(hadoopConf)
      .setBasePath(basePath)
      .build();
  final List<String> partitionKeys = Collections.singletonList("partition");

  // This input format is used to open the emitted splits.
  TableSchemaResolver schemaResolver = new TableSchemaResolver(metaClient);
  final Schema tableAvroSchema;
  try {
    tableAvroSchema = schemaResolver.getTableAvroSchema();
  } catch (Exception e) {
    throw new HoodieException("Get table avro schema error", e);
  }
  final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
  final RowType rowType = (RowType) rowDataType.getLogicalType();

  final MergeOnReadTableState hoodieTableState = new MergeOnReadTableState(
      rowType,
      TestConfigurations.ROW_TYPE,
      tableAvroSchema.toString(),
      AvroSchemaConverter.convertToSchema(TestConfigurations.ROW_TYPE).toString(),
      Collections.emptyList(),
      new String[0]);
  MergeOnReadInputFormat inputFormat = MergeOnReadInputFormat.builder()
      .config(conf)
      .tableState(hoodieTableState)
      .fieldTypes(rowDataType.getChildren())
      .defaultPartName("default")
      .limit(1000L)
      .emitDelete(true)
      .build();

  OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory = StreamReadOperator.factory(inputFormat);
  OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness =
      new OneInputStreamOperatorTestHarness<>(factory, 1, 1, 0);
  harness.getStreamConfig().setTimeCharacteristic(TimeCharacteristic.ProcessingTime);
  return harness;
}
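The MergeOnReadInputFormat built here is the same component the StreamReadOperator opens for every incoming split. As a usage note, a minimal sketch of reading one split directly through the input format, assuming only the standard Flink InputFormat life cycle (open / reachedEnd / nextRecord / close); a real caller may additionally need to configure the format and copy reused row instances:

// Minimal sketch (assumption): read a single split through the input format, mirroring the
// life cycle the StreamReadOperator runs for every enqueued MergeOnReadInputSplit.
private static List<RowData> readSplit(MergeOnReadInputFormat format, MergeOnReadInputSplit split) throws IOException {
  List<RowData> rows = new ArrayList<>();
  format.open(split);
  try {
    while (!format.reachedEnd()) {
      // Note: if the format reuses row objects, callers should copy or stringify each row here.
      rows.add(format.nextRecord(null));
    }
  } finally {
    format.close();
  }
  return rows;
}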