
Example 1 with HoodieCombineHiveInputFormat

Use of org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat in the Apache Hudi project.

From class TestHoodieCombineHiveInputFormat, method testHoodieRealtimeCombineHoodieInputFormat.

@Test
@Disabled
public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception {
    Configuration conf = new Configuration();
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    // Create 3 parquet files with 1000 records each
    File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
    // insert 1000 update records to log file 0
    String newCommitTime = "101";
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime, numRecords, numRecords, 0);
    writer.close();
    // insert 1000 update records to log file 1
    writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime, numRecords, numRecords, 0);
    writer.close();
    // insert 1000 update records to log file 2
    writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, numRecords, numRecords, 0);
    writer.close();
    TableDesc tblDesc = Utilities.defaultTd;
    // Set the input format
    tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class);
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    jobConf = new JobConf(conf);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    jobConf.set(HAS_MAP_WORK, "true");
    // The following config tells Hive to choose ExecMapper to read the MAP_WORK
    jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    // set SPLIT_MAXSIZE to 3 so the 3 file groups are combined into one split
    jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3");
    HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
    String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
    InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
    InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
    // Since SPLIT_MAXSIZE is 3, only 1 split should be created, covering all 3 file groups
    assertEquals(1, splits.length);
    RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
    NullWritable nullWritable = recordReader.createKey();
    ArrayWritable arrayWritable = recordReader.createValue();
    int counter = 0;
    while (recordReader.next(nullWritable, arrayWritable)) {
        // read over all the splits
        counter++;
    }
    // should read all 3 wrapped file splits (file0, file1, file2), 1000 records each, 3000 in total
    assertEquals(3000, counter);
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), NullWritable (org.apache.hadoop.io.NullWritable), LinkedHashMap (java.util.LinkedHashMap), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), ArrayWritable (org.apache.hadoop.io.ArrayWritable), HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat), HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), File (java.io.File), JobConf (org.apache.hadoop.mapred.JobConf), ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper), InputSplit (org.apache.hadoop.mapred.InputSplit), Test (org.junit.jupiter.api.Test), Disabled (org.junit.jupiter.api.Disabled).
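
For context, and not taken from the test above, the snippet below sketches how a Hive-on-MapReduce job is usually pointed at this input format outside a unit test. The property keys are the standard Hive/Hadoop ones ("hive.input.format" and "mapreduce.input.fileinputformat.split.maxsize"); treat the values as illustrative assumptions and verify them against your Hive and Hudi versions.

// Minimal sketch: wiring HoodieCombineHiveInputFormat into a Hive-on-MR job configuration,
// using the JobConf class already imported in the test above.
JobConf queryConf = new JobConf();
// Ask Hive to build splits through the Hudi combine input format.
queryConf.set("hive.input.format",
    "org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat");
// Cap the size of a combined split in bytes; the test above uses an artificially small value of 3.
queryConf.set("mapreduce.input.fileinputformat.split.maxsize", "134217728");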

Example 2 with HoodieCombineHiveInputFormat

Use of org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat in the Apache Hudi project.

From class TestHoodieCombineHiveInputFormat, method testMultiReaderRealtimeCombineHoodieInputFormat.

@Test
public void testMultiReaderRealtimeCombineHoodieInputFormat() throws Exception {
    // test for HUDI-1722
    Configuration conf = new Configuration();
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    // Create 3 parquet files with 1000 records each
    File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
    String newCommitTime = "101";
    // to trigger the bug of HUDI-1772, only update fileid2
    // insert 1000 update records to log file 2
    // now fileid0 and fileid1 have no log files, while fileid2 has one
    HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, numRecords, numRecords, 0);
    writer.close();
    TableDesc tblDesc = Utilities.defaultTd;
    // Set the input format
    tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class);
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    LinkedHashMap<Path, ArrayList<String>> tableAlias = new LinkedHashMap<>();
    ArrayList<String> alias = new ArrayList<>();
    alias.add(tempDir.toAbsolutePath().toString());
    tableAlias.put(new Path(tempDir.toAbsolutePath().toString()), alias);
    pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    mrwork.getMapWork().setPathToAliases(tableAlias);
    Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    jobConf = new JobConf(conf);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    jobConf.set(HAS_MAP_WORK, "true");
    // The following config tells Hive to choose ExecMapper to read the MAP_WORK
    jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    // set SPLIT_MAXSIZE large enough to create one split for the 3 file groups
    jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000");
    HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
    String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
    InputFormatTestUtil.setProjectFieldsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
    InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
    // Since SPLIT_MAXSIZE is large enough, only 1 split should be created, covering all 3 file groups
    assertEquals(1, splits.length);
    RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
    NullWritable nullWritable = recordReader.createKey();
    ArrayWritable arrayWritable = recordReader.createValue();
    int counter = 0;
    while (recordReader.next(nullWritable, arrayWritable)) {
        // read over all the splits
        counter++;
    }
    // should read all 3 wrapped file splits (file0, file1, file2), 1000 records each, 3000 in total
    assertEquals(3000, counter);
    recordReader.close();
}
Also used: Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), LinkedHashMap (java.util.LinkedHashMap), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), ArrayWritable (org.apache.hadoop.io.ArrayWritable), HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), Path (org.apache.hadoop.fs.Path), NullWritable (org.apache.hadoop.io.NullWritable), HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), File (java.io.File), ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper), Test (org.junit.jupiter.api.Test).
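
One robustness note on the reader-draining loop shared by these tests: it is easiest to keep leak-free when it sits in a try/finally, so the underlying parquet and log readers are closed even if the loop throws (Example 1 above never closes its reader). A minimal sketch using only the mapred API already used by these tests:

RecordReader<NullWritable, ArrayWritable> reader =
    combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
int total = 0;
try {
    NullWritable key = reader.createKey();
    ArrayWritable value = reader.createValue();
    // Drain every file split wrapped inside the single combined split.
    while (reader.next(key, value)) {
        total++;
    }
} finally {
    // Release the wrapped readers even if next() throws mid-way.
    reader.close();
}
assertEquals(3000, total);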

Example 3 with HoodieCombineHiveInputFormat

Use of org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat in the Apache Hudi project.

From class TestHoodieCombineHiveInputFormat, method multiPartitionReadersRealtimeCombineHoodieInputFormat.

@Test
public void multiPartitionReadersRealtimeCombineHoodieInputFormat() throws Exception {
    // test for HUDI-1718
    Configuration conf = new Configuration();
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    // Create 3 partitions, each partition holds one parquet file and 1000 records
    List<File> partitionDirs = InputFormatTestUtil.prepareMultiPartitionedParquetTable(tempDir, schema, 3, numRecords, commitTime, HoodieTableType.MERGE_ON_READ);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
    TableDesc tblDesc = Utilities.defaultTd;
    // Set the input format
    tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    LinkedHashMap<Path, ArrayList<String>> talias = new LinkedHashMap<>();
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
    ArrayList<String> arrayList = new ArrayList<>();
    arrayList.add(tempDir.toAbsolutePath().toString());
    talias.put(new Path(tempDir.toAbsolutePath().toString()), arrayList);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    mrwork.getMapWork().setPathToAliases(talias);
    Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    jobConf = new JobConf(conf);
    // Add three partition path to InputPaths
    Path[] partitionDirArray = new Path[partitionDirs.size()];
    partitionDirs.stream().map(p -> new Path(p.getPath())).collect(Collectors.toList()).toArray(partitionDirArray);
    FileInputFormat.setInputPaths(jobConf, partitionDirArray);
    jobConf.set(HAS_MAP_WORK, "true");
    // The following config tells Hive to choose ExecMapper to read the MAP_WORK
    jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    // set SPLIT_MAXSIZE large enough to create one split for the 3 file groups
    jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000");
    HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
    String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
    InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
    InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
    // Since SPLIT_MAXSIZE is large enough, only 1 split should be created, covering all 3 file groups
    assertEquals(1, splits.length);
    RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
    NullWritable nullWritable = recordReader.createKey();
    ArrayWritable arrayWritable = recordReader.createValue();
    int counter = 0;
    HoodieCombineRealtimeHiveSplit hiveSplit = (HoodieCombineRealtimeHiveSplit) splits[0];
    HoodieCombineRealtimeFileSplit fileSplit = (HoodieCombineRealtimeFileSplit) hiveSplit.getInputSplitShim();
    List<FileSplit> realtimeFileSplits = fileSplit.getRealtimeFileSplits();
    while (recordReader.next(nullWritable, arrayWritable)) {
        // Hive uses the IOContext to extract partition info; when the underlying reader switches, the IOContext must be updated accordingly.
        if (counter < 1000) {
            assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(0).getPath().toString());
        } else if (counter < 2000) {
            assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(1).getPath().toString());
        } else {
            assertEquals(IOContextMap.get(jobConf).getInputPath().toString(), realtimeFileSplits.get(2).getPath().toString());
        }
        counter++;
    }
    // should read all 3 wrapped file splits (file0, file1, file2), 1000 records each, 3000 in total
    assertEquals(3000, counter);
    recordReader.close();
}
Also used: HoodieCombineRealtimeHiveSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeHiveSplit), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), FileSplit (org.apache.hadoop.mapred.FileSplit), HoodieCombineRealtimeFileSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit), LinkedHashMap (java.util.LinkedHashMap), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), ArrayWritable (org.apache.hadoop.io.ArrayWritable), JobConf (org.apache.hadoop.mapred.JobConf), InputSplit (org.apache.hadoop.mapred.InputSplit), Path (org.apache.hadoop.fs.Path), NullWritable (org.apache.hadoop.io.NullWritable), HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), File (java.io.File), ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper), Test (org.junit.jupiter.api.Test).
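
A small readability note on the input-path wiring in this example: the stream-collect-then-toArray sequence used to build partitionDirArray can be collapsed into a single expression with Stream.toArray. An equivalent sketch:

// Equivalent to the partitionDirArray construction in the test above, in one expression.
Path[] partitionDirArray = partitionDirs.stream()
    .map(dir -> new Path(dir.getPath()))
    .toArray(Path[]::new);
FileInputFormat.setInputPaths(jobConf, partitionDirArray);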

Example 4 with HoodieCombineHiveInputFormat

Use of org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat in the Apache Hudi project.

From class TestHoodieCombineHiveInputFormat, method multiLevelPartitionReadersRealtimeCombineHoodieInputFormat.

@Test
public void multiLevelPartitionReadersRealtimeCombineHoodieInputFormat() throws Exception {
    // test for HUDI-1718
    Configuration conf = new Configuration();
    // initial commit
    Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
    HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
    String commitTime = "100";
    final int numRecords = 1000;
    // Create 3 parquet files with 1000 records each
    File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
    HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
    FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
    TableDesc tblDesc = Utilities.defaultTd;
    // Set the input format
    tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    LinkedHashMap<Path, ArrayList<String>> talias = new LinkedHashMap<>();
    LinkedHashMap<String, String> partSpec = new LinkedHashMap<>();
    // add three level partitions info
    partSpec.put("year", "2016");
    partSpec.put("month", "05");
    partSpec.put("day", "01");
    PartitionDesc partDesc = new PartitionDesc(tblDesc, partSpec);
    pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
    ArrayList<String> arrayList = new ArrayList<>();
    arrayList.add(tempDir.toAbsolutePath().toString());
    talias.put(new Path(tempDir.toAbsolutePath().toString()), arrayList);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    mrwork.getMapWork().setPathToAliases(talias);
    Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    jobConf = new JobConf(conf);
    // Add the paths
    FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
    jobConf.set(HAS_MAP_WORK, "true");
    // The following config tells Hive to choose ExecMapper to read the MAP_WORK
    jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
    // set SPLIT_MAXSIZE large enough to create one split for the 3 file groups
    jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000");
    HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
    String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
    InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
    // unset META_TABLE_PARTITION_COLUMNS to trigger HUDI-1718
    jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "");
    InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
    // Since SPLIT_MAXSIZE is large enough, only 1 split should be created, covering all 3 file groups
    assertEquals(1, splits.length);
    // if HUDI-1718 is not fixed, the following code will throw an exception
    RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
    NullWritable nullWritable = recordReader.createKey();
    ArrayWritable arrayWritable = recordReader.createValue();
    int counter = 0;
    while (recordReader.next(nullWritable, arrayWritable)) {
        // read over all the splits
        counter++;
    }
    // should read all 3 wrapped file splits (file0, file1, file2), 1000 records each, 3000 in total
    assertEquals(3000, counter);
    recordReader.close();
}
Also used: Path (org.apache.hadoop.fs.Path), Configuration (org.apache.hadoop.conf.Configuration), Schema (org.apache.avro.Schema), ArrayList (java.util.ArrayList), NullWritable (org.apache.hadoop.io.NullWritable), LinkedHashMap (java.util.LinkedHashMap), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), ArrayWritable (org.apache.hadoop.io.ArrayWritable), HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), File (java.io.File), JobConf (org.apache.hadoop.mapred.JobConf), ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper), InputSplit (org.apache.hadoop.mapred.InputSplit), Test (org.junit.jupiter.api.Test).
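
For contrast with the empty value set above to reproduce HUDI-1718, a healthy job reading this year/month/day table would normally carry the partition column metadata in the same property. The slash-separated value below follows the usual Hive convention for partition_columns, but treat the exact format as an assumption to verify against the Hive version in use:

// Hypothetical "healthy" configuration for the three-level partition layout above;
// the slash-separated value format is assumed, not taken from the test.
jobConf.set(hive_metastoreConstants.META_TABLE_PARTITION_COLUMNS, "year/month/day");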

Aggregations

File (java.io.File): 4
LinkedHashMap (java.util.LinkedHashMap): 4
Schema (org.apache.avro.Schema): 4
Configuration (org.apache.hadoop.conf.Configuration): 4
Path (org.apache.hadoop.fs.Path): 4
ExecMapper (org.apache.hadoop.hive.ql.exec.mr.ExecMapper): 4
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 4
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 4
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 4
ArrayWritable (org.apache.hadoop.io.ArrayWritable): 4
NullWritable (org.apache.hadoop.io.NullWritable): 4
InputSplit (org.apache.hadoop.mapred.InputSplit): 4
JobConf (org.apache.hadoop.mapred.JobConf): 4
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 4
HoodieCombineHiveInputFormat (org.apache.hudi.hadoop.hive.HoodieCombineHiveInputFormat): 4
Test (org.junit.jupiter.api.Test): 4
ArrayList (java.util.ArrayList): 3
HoodieLogFormat (org.apache.hudi.common.table.log.HoodieLogFormat): 2
FileSplit (org.apache.hadoop.mapred.FileSplit): 1
HoodieCombineRealtimeFileSplit (org.apache.hudi.hadoop.hive.HoodieCombineRealtimeFileSplit): 1