Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
From class TestStreamReadOperator, method testCheckpoint:
@Test
public void testCheckpoint() throws Exception {
  // Received emitted splits: split0, split1, split2, split3; the checkpoint request is
  // triggered while records from split0 are being read.
  TestData.writeData(TestData.DATA_SET_INSERT, conf);
  long timestamp = 0;
  try (OneInputStreamOperatorTestHarness<MergeOnReadInputSplit, RowData> harness = createReader()) {
    harness.setup();
    harness.open();

    SteppingMailboxProcessor processor = createLocalMailbox(harness);
    StreamReadMonitoringFunction func = TestUtils.getMonitorFunc(conf);
    List<MergeOnReadInputSplit> splits = generateSplits(func);
    assertThat("Should have 4 splits", splits.size(), is(4));

    for (MergeOnReadInputSplit split : splits) {
      harness.processElement(split, ++timestamp);
    }

    // Trigger the snapshot; it runs once all records from split0 have been read.
    processor.getMainMailboxExecutor().execute(() -> harness.snapshot(1, 3), "Trigger snapshot");
    assertTrue(processor.runMailboxStep(), "Should have processed split0");
    assertTrue(processor.runMailboxStep(), "Should have processed the snapshot state action");
    assertThat(TestData.rowDataToString(harness.extractOutputValues()),
        is(getSplitExpected(Collections.singletonList(splits.get(0)), EXPECTED)));

    // Read records from split1.
    assertTrue(processor.runMailboxStep(), "Should have processed split1");
    // Read records from split2.
    assertTrue(processor.runMailboxStep(), "Should have processed split2");
    // Read records from split3.
    assertTrue(processor.runMailboxStep(), "Should have processed split3");

    // Assert the output has the expected elements.
    TestData.assertRowDataEquals(harness.extractOutputValues(), TestData.DATA_SET_INSERT);
  }
}
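For reference, a split like the ones emitted above can also be built by hand. The following is a minimal sketch only: the file paths, commit instant, memory budget, and merge-type string are hypothetical placeholders, and the eight-argument constructor mirrors the call in HoodieTableSource#buildFileIndex shown further down.

import java.util.Arrays;
import java.util.List;

import org.apache.hudi.common.util.Option;
import org.apache.hudi.table.format.mor.MergeOnReadInputSplit;

public class MergeOnReadInputSplitSketch {
  public static void main(String[] args) {
    // Hypothetical file locations; real tests derive them from a written test table.
    String baseFilePath = "/tmp/hudi_table/par1/base_file.parquet";
    List<String> logFilePaths = Arrays.asList("/tmp/hudi_table/par1/.fileid_20220101000000.log.1");

    MergeOnReadInputSplit split = new MergeOnReadInputSplit(
        0,                               // split number
        baseFilePath,                    // base file; may be null for a log-only file slice
        Option.ofNullable(logFilePaths), // log files of the slice
        "20220101000000",                // latest commit instant (hypothetical)
        "/tmp/hudi_table",               // table base path (hypothetical)
        128 * 1024 * 1024L,              // max compaction memory in bytes
        "payload_combine",               // merge type string (assumed value of FlinkOptions.MERGE_TYPE)
        null);                           // no instant range filter

    System.out.println(split);
  }
}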
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
From class HoodieTableSource, method buildFileIndex:
private List<MergeOnReadInputSplit> buildFileIndex() {
  Set<String> requiredPartitionPaths = getRequiredPartitionPaths();
  fileIndex.setPartitionPaths(requiredPartitionPaths);
  List<String> relPartitionPaths = fileIndex.getOrBuildPartitionPaths();
  if (relPartitionPaths.isEmpty()) {
    return Collections.emptyList();
  }
  FileStatus[] fileStatuses = fileIndex.getFilesInPartitions();
  if (fileStatuses.length == 0) {
    throw new HoodieException("No files found for reading in user provided path.");
  }
  // A file slice after a pending compaction-requested instant time is also considered valid.
  HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
      metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(), fileStatuses);
  String latestCommit = fsView.getLastInstant().get().getTimestamp();
  final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
  final AtomicInteger cnt = new AtomicInteger(0);
  // Generates one input split for each file group.
  return relPartitionPaths.stream()
      .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit)
          .map(fileSlice -> {
            String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
            Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                .sorted(HoodieLogFile.getLogFileComparator())
                .map(logFile -> logFile.getPath().toString())
                .collect(Collectors.toList()));
            return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit,
                metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, null);
          })
          .collect(Collectors.toList()))
      .flatMap(Collection::stream)
      .collect(Collectors.toList());
}
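The monotonically increasing split number from cnt.getAndAdd(1) gives each file slice a stable index, which a parallel reader can use to distribute splits across tasks. A plain-Java illustration of one such round-robin assignment follows; this is not Hudi or Flink API, the helper and the modulo scheme are purely for illustration.

import java.util.ArrayList;
import java.util.List;

public final class SplitAssignmentSketch {
  // Bucket splits over a fixed number of read tasks by their position (split number).
  static <T> List<List<T>> assignRoundRobin(List<T> splits, int readTasks) {
    List<List<T>> buckets = new ArrayList<>();
    for (int i = 0; i < readTasks; i++) {
      buckets.add(new ArrayList<>());
    }
    for (int i = 0; i < splits.size(); i++) {
      buckets.get(i % readTasks).add(splits.get(i));
    }
    return buckets;
  }
}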
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
From class HoodieTableSource, method getBatchInputFormat:
private InputFormat<RowData, ?> getBatchInputFormat() {
  final Schema tableAvroSchema = getTableAvroSchema();
  final DataType rowDataType = AvroSchemaConverter.convertToDataType(tableAvroSchema);
  final RowType rowType = (RowType) rowDataType.getLogicalType();
  final RowType requiredRowType = (RowType) getProducedDataType().notNull().getLogicalType();
  final String queryType = this.conf.getString(FlinkOptions.QUERY_TYPE);
  switch (queryType) {
    case FlinkOptions.QUERY_TYPE_SNAPSHOT:
      final HoodieTableType tableType = HoodieTableType.valueOf(this.conf.getString(FlinkOptions.TABLE_TYPE));
      switch (tableType) {
        case MERGE_ON_READ:
          final List<MergeOnReadInputSplit> inputSplits = buildFileIndex();
          if (inputSplits.isEmpty()) {
            // When there are no input splits, just return an empty source.
            LOG.warn("No input splits generated for MERGE_ON_READ input format, returning an empty collection instead");
            return InputFormats.EMPTY_INPUT_FORMAT;
          }
          return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, rowDataType, inputSplits, false);
        case COPY_ON_WRITE:
          return baseFileOnlyInputFormat();
        default:
          throw new HoodieException("Unexpected table type: " + this.conf.getString(FlinkOptions.TABLE_TYPE));
      }
    case FlinkOptions.QUERY_TYPE_READ_OPTIMIZED:
      return baseFileOnlyInputFormat();
    case FlinkOptions.QUERY_TYPE_INCREMENTAL:
      IncrementalInputSplits incrementalInputSplits = IncrementalInputSplits.builder()
          .conf(conf)
          .path(FilePathUtils.toFlinkPath(path))
          .maxCompactionMemoryInBytes(maxCompactionMemoryInBytes)
          .requiredPartitions(getRequiredPartitionPaths())
          .build();
      final IncrementalInputSplits.Result result = incrementalInputSplits.inputSplits(metaClient, hadoopConf);
      if (result.isEmpty()) {
        // When there are no input splits, just return an empty source.
        LOG.warn("No input splits generated for incremental read, returning an empty collection instead");
        return InputFormats.EMPTY_INPUT_FORMAT;
      }
      return mergeOnReadInputFormat(rowType, requiredRowType, tableAvroSchema, rowDataType, result.getInputSplits(), false);
    default:
      String errMsg = String.format("Invalid query type: '%s', options ['%s', '%s', '%s'] are supported now",
          queryType, FlinkOptions.QUERY_TYPE_SNAPSHOT, FlinkOptions.QUERY_TYPE_READ_OPTIMIZED,
          FlinkOptions.QUERY_TYPE_INCREMENTAL);
      throw new HoodieException(errMsg);
  }
}
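Which branch of the switch runs is controlled entirely by the Flink configuration handed to the table source. Below is a minimal sketch of setting those options programmatically; the package name for FlinkOptions is an assumption, while the option constants and values mirror the ones referenced in the method above.

import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions; // package name assumed

public class QueryTypeConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // Pick the read path: snapshot, read-optimized, or incremental.
    conf.setString(FlinkOptions.QUERY_TYPE, FlinkOptions.QUERY_TYPE_SNAPSHOT);
    // For snapshot reads, the table type decides between the MERGE_ON_READ and COPY_ON_WRITE branches.
    conf.setString(FlinkOptions.TABLE_TYPE, "MERGE_ON_READ");
    System.out.println(conf);
  }
}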
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
From class HoodieTableSource, method getScanRuntimeProvider:
@Override
public ScanRuntimeProvider getScanRuntimeProvider(ScanContext scanContext) {
  return new DataStreamScanProvider() {

    @Override
    public boolean isBounded() {
      return !conf.getBoolean(FlinkOptions.READ_AS_STREAMING);
    }

    @Override
    public DataStream<RowData> produceDataStream(StreamExecutionEnvironment execEnv) {
      @SuppressWarnings("unchecked")
      TypeInformation<RowData> typeInfo =
          (TypeInformation<RowData>) TypeInfoDataTypeConverter.fromDataTypeToTypeInfo(getProducedDataType());
      if (conf.getBoolean(FlinkOptions.READ_AS_STREAMING)) {
        StreamReadMonitoringFunction monitoringFunction = new StreamReadMonitoringFunction(
            conf, FilePathUtils.toFlinkPath(path), maxCompactionMemoryInBytes, getRequiredPartitionPaths());
        InputFormat<RowData, ?> inputFormat = getInputFormat(true);
        OneInputStreamOperatorFactory<MergeOnReadInputSplit, RowData> factory =
            StreamReadOperator.factory((MergeOnReadInputFormat) inputFormat);
        SingleOutputStreamOperator<RowData> source = execEnv
            .addSource(monitoringFunction, getSourceOperatorName("split_monitor"))
            .setParallelism(1)
            .transform("split_reader", typeInfo, factory)
            .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS));
        return new DataStreamSource<>(source);
      } else {
        InputFormatSourceFunction<RowData> func = new InputFormatSourceFunction<>(getInputFormat(), typeInfo);
        DataStreamSource<RowData> source = execEnv.addSource(func, asSummaryString(), typeInfo);
        return source.name(getSourceOperatorName("bounded_source"))
            .setParallelism(conf.getInteger(FlinkOptions.READ_TASKS));
      }
    }
  };
}
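Whether the provider wires the unbounded split_monitor/split_reader pipeline or the bounded InputFormatSourceFunction depends on two options read in the code above. A small hedged sketch of toggling them, again assuming FlinkOptions lives in org.apache.hudi.configuration:

import org.apache.flink.configuration.Configuration;
import org.apache.hudi.configuration.FlinkOptions; // package name assumed

public class StreamingReadConfigSketch {
  public static void main(String[] args) {
    Configuration conf = new Configuration();
    // true  -> unbounded source: split_monitor (parallelism 1) feeding the split_reader operator
    // false -> bounded source backed by InputFormatSourceFunction
    conf.setBoolean(FlinkOptions.READ_AS_STREAMING, true);
    // Parallelism of the split_reader / bounded source.
    conf.setInteger(FlinkOptions.READ_TASKS, 4);
    System.out.println(conf);
  }
}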
Use of org.apache.hudi.table.format.mor.MergeOnReadInputSplit in project hudi by apache.
From class StreamReadMonitoringFunction, method monitorDirAndForwardSplits:
@VisibleForTesting
public void monitorDirAndForwardSplits(SourceContext<MergeOnReadInputSplit> context) {
  HoodieTableMetaClient metaClient = getOrCreateMetaClient();
  if (metaClient == null) {
    // table does not exist yet
    return;
  }
  IncrementalInputSplits.Result result =
      incrementalInputSplits.inputSplits(metaClient, this.hadoopConf, this.issuedInstant);
  if (result.isEmpty()) {
    // no new instants, return early
    return;
  }
  for (MergeOnReadInputSplit split : result.getInputSplits()) {
    context.collect(split);
  }
  // update the issued instant time
  this.issuedInstant = result.getEndInstant();
  LOG.info("\n"
      + "------------------------------------------------------------\n"
      + "---------- consumed to instant: {}\n"
      + "------------------------------------------------------------", this.issuedInstant);
}
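Because monitorDirAndForwardSplits is public and marked @VisibleForTesting, a test can call it directly with a scratch SourceContext and assert on what was forwarded. Below is one way such a context could look; the class is our own sketch, not part of Hudi, and relies only on the standard Flink SourceFunction.SourceContext interface.

import java.util.ArrayList;
import java.util.List;

import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.hudi.table.format.mor.MergeOnReadInputSplit;

// Collects every forwarded split so a test can assert on them.
public class CollectingSourceContext implements SourceFunction.SourceContext<MergeOnReadInputSplit> {
  private final List<MergeOnReadInputSplit> splits = new ArrayList<>();
  private final Object lock = new Object();

  @Override
  public void collect(MergeOnReadInputSplit split) {
    splits.add(split);
  }

  @Override
  public void collectWithTimestamp(MergeOnReadInputSplit split, long timestamp) {
    splits.add(split);
  }

  @Override
  public void emitWatermark(Watermark watermark) {
    // watermarks are irrelevant for split forwarding
  }

  @Override
  public void markAsTemporarilyIdle() {
    // no-op
  }

  @Override
  public Object getCheckpointLock() {
    return lock;
  }

  @Override
  public void close() {
    // nothing to release
  }

  public List<MergeOnReadInputSplit> getSplits() {
    return splits;
  }
}

A test would then invoke monitoringFunction.monitorDirAndForwardSplits(context) and assert on context.getSplits().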