Use of org.apache.hudi.hadoop.hive.HoodieCombineRealtimeHiveSplit in project hudi by apache.
From the class TestHoodieCombineHiveInputFormat, method multiPartitionReadersRealtimeCombineHoodieInputFormat.
@Test
public void multiPartitionReadersRealtimeCombineHoodieInputFormat() throws Exception {
  // test for HUDI-1718
  Configuration conf = new Configuration();
  // initial commit
  Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
  HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
  String commitTime = "100";
  final int numRecords = 1000;
  // Create 3 partitions, each partition holds one parquet file and 1000 records
  List<File> partitionDirs = InputFormatTestUtil.prepareMultiPartitionedParquetTable(tempDir, schema, 3, numRecords, commitTime, HoodieTableType.MERGE_ON_READ);
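  // Record the commit on the timeline so the newly written base files are visible as committed data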
  HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
  FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
  TableDesc tblDesc = Utilities.defaultTd;
  // Set the input format
  tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class);
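  // Register the table path with Hive's MapWork (partition descriptor and alias) so the combine input format can resolve partition information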
  LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
  LinkedHashMap<Path, ArrayList<String>> talias = new LinkedHashMap<>();
  PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
  pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
  ArrayList<String> arrayList = new ArrayList<>();
  arrayList.add(tempDir.toAbsolutePath().toString());
  talias.put(new Path(tempDir.toAbsolutePath().toString()), arrayList);
  MapredWork mrwork = new MapredWork();
  mrwork.getMapWork().setPathToPartitionInfo(pt);
  mrwork.getMapWork().setPathToAliases(talias);
  Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
  Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
  jobConf = new JobConf(conf);
  // Add the three partition paths to the input paths
  Path[] partitionDirArray = new Path[partitionDirs.size()];
  partitionDirs.stream().map(p -> new Path(p.getPath())).collect(Collectors.toList()).toArray(partitionDirArray);
  FileInputFormat.setInputPaths(jobConf, partitionDirArray);
  jobConf.set(HAS_MAP_WORK, "true");
  // The following config tells Hive to choose ExecMapper to read the MAP_WORK
  jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
  // Set a large max split size (128 MB) so that the 3 file groups are combined into a single split
  jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000");
  HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
  String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
  InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
  InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
  // With the max split size set above, we should get only 1 split containing all 3 file groups
  assertEquals(1, splits.length);
  RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
  NullWritable nullWritable = recordReader.createKey();
  ArrayWritable arrayWritable = recordReader.createValue();
  int counter = 0;
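  // The single combined split is a HoodieCombineRealtimeHiveSplit that wraps one realtime file split per partition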
  HoodieCombineRealtimeHiveSplit hiveSplit = (HoodieCombineRealtimeHiveSplit) splits[0];
  HoodieCombineRealtimeFileSplit fileSplit = (HoodieCombineRealtimeFileSplit) hiveSplit.getInputSplitShim();
  List<FileSplit> realtimeFileSplits = fileSplit.getRealtimeFileSplits();
  while (recordReader.next(nullWritable, arrayWritable)) {
    // Hive uses the IOContext to extract partition info; whenever the combine reader switches to the next underlying reader, the IOContext must be updated to point at that file
    if (counter < 1000) {
      assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(0).getPath().toString());
    } else if (counter < 2000) {
      assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(1).getPath().toString());
    } else {
      assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(2).getPath().toString());
    }
    counter++;
  }
  // Should read 3000 records in total: 1000 from each of the three underlying file splits (file0, file1, file2)
  assertEquals(3000, counter);
  recordReader.close();
}
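
For reference, the following is a minimal sketch (not taken from the Hudi test suite) of how the realtime file splits wrapped by a HoodieCombineRealtimeHiveSplit can be enumerated. It uses only the accessors exercised in the test above (getInputSplitShim and getRealtimeFileSplits); the printRealtimeFileSplits helper name is an assumption, and the splits array is assumed to come from HoodieCombineHiveInputFormat#getSplits exactly as shown there.

// Hypothetical helper: lists the file-level splits contained in each combined split.
// Assumes `splits` was produced by HoodieCombineHiveInputFormat#getSplits, as in the test above.
static void printRealtimeFileSplits(InputSplit[] splits) throws IOException {
  for (InputSplit split : splits) {
    HoodieCombineRealtimeHiveSplit hiveSplit = (HoodieCombineRealtimeHiveSplit) split;
    HoodieCombineRealtimeFileSplit fileSplit = (HoodieCombineRealtimeFileSplit) hiveSplit.getInputSplitShim();
    for (FileSplit realtimeFileSplit : fileSplit.getRealtimeFileSplits()) {
      System.out.println(realtimeFileSplit.getPath());
    }
  }
}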