Search in sources:

Example 1 with HoodieCombineRealtimeFileSplit

Use of org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit in project hudi by apache.

From the class TestHoodieCombineHiveInputFormat, method multiPartitionReadersRealtimeCombineHoodieInputFormat:

@Test
public void multiPartitionReadersRealtimeCombineHoodieInputFormat() throws Exception {
    // test for HUDI-1718
    Configuration conf = new Configuration();
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    // Create 3 partitions; each partition holds one parquet file with 1000 records
    List<File> partitionDirs = InputFormatTestUtil.prepareMultiPartitionedParquetTable(tempDir, schema, 3, numRecords, commitTime, HoodieTableType.MERGE_ON_READ);
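    // Build commit metadata and mark commit "100" as completed on the timeline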
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
    TableDesc tblDesc = Utilities.defaultTd;
    // Set the input format
    tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class);
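    // Register the table path with Hive's MapWork: map the path to its PartitionDesc and alias
    // so the combine input format can resolve partition info for the splits it builds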
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    LinkedHashMap<Path, ArrayList<String>> talias = new LinkedHashMap<>();
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
    ArrayList<String> arrayList = new ArrayList<>();
    arrayList.add(tempDir.toAbsolutePath().toString());
    talias.put(new Path(tempDir.toAbsolutePath().toString()), arrayList);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    mrwork.getMapWork().setPathToAliases(talias);
    Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    jobConf = new JobConf(conf);
    // Add the three partition paths to the input paths
    Path[] partitionDirArray = new Path[partitionDirs.size()];
    partitionDirs.stream().map(p -> new Path(p.getPath())).collect(Collectors.toList()).toArray(partitionDirArray);
    FileInputFormat.setInputPaths(jobConf, partitionDirArray);
    jobConf.set(HAS_MAP_WORK, "true");
    // The following config tells Hive to choose ExecMapper to read the MAP_WORK
    jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    // Set the max split size large enough (128 MB) that all 3 file groups land in a single combined split
    jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000");
    HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
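    // Provide the Hive column names/types and schema properties the realtime record reader needs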
    String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
    InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
    InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
    // With the max split size set above, all 3 file groups should be combined into a single split
    assertEquals(1, splits.length);
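    // A single record reader serves the whole combined split, switching between the underlying file splits as it reads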
    RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
    NullWritable nullWritable = recordReader.createKey();
    ArrayWritable arrayWritable = recordReader.createValue();
    int counter = 0;
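    // Unwrap the combined split to get the underlying realtime file splits, in the order they will be read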
    HoodieCombineRealtimeHiveSplit hiveSplit = (HoodieCombineRealtimeHiveSplit) splits[0];
    HoodieCombineRealtimeFileSplit fileSplit = (HoodieCombineRealtimeFileSplit) hiveSplit.getInputSplitShim();
    List<FileSplit> realtimeFileSplits = fileSplit.getRealtimeFileSplits();
    while (recordReader.next(nullWritable, arrayWritable)) {
        // Hive uses the IOContext to extract partition info; when the reader switches files, the IOContext should be updated.
        if (counter < 1000) {
            assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(0).getPath().toString());
        } else if (counter < 2000) {
            assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(1).getPath().toString());
        } else {
            assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(2).getPath().toString());
        }
        counter++;
    }
    // Should read records from all 3 file splits (file0, file1, file2), each containing 1000 records
    assertEquals(3000, counter);
    recordReader.close();
}
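
Stripped of the assertions, the read pattern the test exercises is just getSplits followed by a single combined record reader. Below is a minimal sketch reusing jobConf and combineHiveInputFormat exactly as prepared above; it is illustrative only, not additional API.

// Minimal sketch: iterate the combined split(s) and count records.
// Assumes jobConf and combineHiveInputFormat are set up as in the test above.
InputSplit[] combinedSplits = combineHiveInputFormat.getSplits(jobConf, 1);
for (InputSplit split : combinedSplits) {
    RecordReader<NullWritable, ArrayWritable> reader =
            combineHiveInputFormat.getRecordReader(split, jobConf, null);
    NullWritable key = reader.createKey();
    ArrayWritable value = reader.createValue();
    long records = 0;
    while (reader.next(key, value)) {
        records++;
    }
    reader.close();
    // For this test's data, the one combined split yields 3 x 1000 = 3000 records.
}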
Also used:
HoodieCombineRealtimeHiveSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeHiveSplit)
Configuration (org.apache.hadoop.conf.Configuration)
Schema (org.apache.avro.Schema)
ArrayList (java.util.ArrayList)
FileSplit (org.apache.hadoop.mapred.FileSplit)
HoodieCombineRealtimeFileSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit)
LinkedHashMap (java.util.LinkedHashMap)
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata)
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork)
ArrayWritable (org.apache.hadoop.io.ArrayWritable)
JobConf (org.apache.hadoop.mapred.JobConf)
InputSplit (org.apache.hadoop.mapred.InputSplit)
Path (org.apache.hadoop.fs.Path)
NullWritable (org.apache.hadoop.io.NullWritable)
HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat)
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc)
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc)
File (java.io.File)
ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper)
Test (org.junit.jupiter.api.Test)

Aggregations

File (java.io.File): 1
ArrayList (java.util.ArrayList): 1
LinkedHashMap (java.util.LinkedHashMap): 1
Schema (org.apache.avro.Schema): 1
Configuration (org.apache.hadoop.conf.Configuration): 1
Path (org.apache.hadoop.fs.Path): 1
ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper): 1
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 1
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 1
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 1
ArrayWritable (org.apache.hadoop.io.ArrayWritable): 1
NullWritable (org.apache.hadoop.io.NullWritable): 1
FileSplit (org.apache.hadoop.mapred.FileSplit): 1
InputSplit (org.apache.hadoop.mapred.InputSplit): 1
JobConf (org.apache.hadoop.mapred.JobConf): 1
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 1
HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat): 1
HoodieCombineRealtimeFileSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit): 1
HoodieCombineRealtimeHiveSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeHiveSplit): 1
Test (org.junit.jupiter.api.Test): 1