Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestCommitUtils, method testCommitMetadataCreation.
@Test
public void testCommitMetadataCreation() {
List<HoodieWriteStat> writeStats = new ArrayList<>();
writeStats.add(createWriteStat("p1", "f1"));
writeStats.add(createWriteStat("p2", "f2"));
Map<String, List<String>> partitionToReplaceFileIds = new HashMap<>();
List<String> replacedFileIds = new ArrayList<>();
replacedFileIds.add("f0");
partitionToReplaceFileIds.put("p1", replacedFileIds);
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(writeStats, partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT, TRIP_SCHEMA, HoodieTimeline.DELTA_COMMIT_ACTION);
assertFalse(commitMetadata instanceof HoodieReplaceCommitMetadata);
assertEquals(2, commitMetadata.getPartitionToWriteStats().size());
assertEquals("f1", commitMetadata.getPartitionToWriteStats().get("p1").get(0).getFileId());
assertEquals("f2", commitMetadata.getPartitionToWriteStats().get("p2").get(0).getFileId());
assertEquals(WriteOperationType.INSERT, commitMetadata.getOperationType());
assertEquals(TRIP_SCHEMA, commitMetadata.getMetadata(HoodieCommitMetadata.SCHEMA_KEY));
}
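For contrast, a minimal companion sketch (not part of the test above) for the replace-commit path, reusing writeStats, partitionToReplaceFileIds and replacedFileIds from the test. That buildMetadata returns a HoodieReplaceCommitMetadata when given HoodieTimeline.REPLACE_COMMIT_ACTION is an assumption here, so treat it as illustrative rather than as the project's own test code.
// assumption: a replace action type yields the replace-specific metadata subclass
HoodieCommitMetadata replaceMetadata = CommitUtils.buildMetadata(writeStats, partitionToReplaceFileIds, Option.empty(), WriteOperationType.INSERT_OVERWRITE, TRIP_SCHEMA, HoodieTimeline.REPLACE_COMMIT_ACTION);
assertTrue(replaceMetadata instanceof HoodieReplaceCommitMetadata);
// the replaced file ids for partition "p1" should round-trip through the metadata object
assertEquals(replacedFileIds, ((HoodieReplaceCommitMetadata) replaceMetadata).getPartitionToReplaceFileIds().get("p1"));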
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestHoodieCombineHiveInputFormat, method testHoodieRealtimeCombineHoodieInputFormat.
@Test
@Disabled
public void testHoodieRealtimeCombineHoodieInputFormat() throws Exception {
Configuration conf = new Configuration();
// initial commit
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
String commitTime = "100";
final int numRecords = 1000;
// Create 3 parquet files with 1000 records each
File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
// insert 1000 update records to log file 0
String newCommitTime = "101";
HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid0", commitTime, newCommitTime, numRecords, numRecords, 0);
writer.close();
// insert 1000 update records to log file 1
writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid1", commitTime, newCommitTime, numRecords, numRecords, 0);
writer.close();
// insert 1000 update records to log file 2
writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, numRecords, numRecords, 0);
writer.close();
TableDesc tblDesc = Utilities.defaultTd;
// Set the input format
tblDesc.setInputFileFormatClass(HoodieCombineHiveInputFormat.class);
PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
MapredWork mrwork = new MapredWork();
mrwork.getMapWork().setPathToPartitionInfo(pt);
Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
jobConf = new JobConf(conf);
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
jobConf.set(HAS_MAP_WORK, "true");
// The following config tells Hive to choose ExecMapper to read the MAP_WORK
jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
// setting the split size to be 3 to create one split for 3 file groups
jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "3");
HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
InputFormatTestUtil.setPropsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
// Since the SPLIT_SIZE is 3, we should create only 1 split with all 3 file groups
assertEquals(1, splits.length);
RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
NullWritable nullWritable = recordReader.createKey();
ArrayWritable arrayWritable = recordReader.createValue();
int counter = 0;
while (recordReader.next(nullWritable, arrayWritable)) {
// read over all the splits
counter++;
}
// should read 3000 records in total: 1000 each from fileid0, fileid1 and fileid2, all via the single split
assertEquals(3000, counter);
}
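A possible follow-up sketch, not in the test above: reading the commit metadata back from the timeline to confirm the stored schema. It reuses hadoopConf, tempDir and schema from the test; the meta-client and timeline calls follow the usual Hudi 0.x API, but treat the exact method names (and the extra imports they need) as assumptions.
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(tempDir.toAbsolutePath().toString()).build();
// the last completed commit on the timeline should be the "100" commit created above
HoodieInstant lastCommit = metaClient.getActiveTimeline().getCommitsTimeline().filterCompletedInstants().lastInstant().get();
HoodieCommitMetadata restored = HoodieCommitMetadata.fromBytes(metaClient.getActiveTimeline().getInstantDetails(lastCommit).get(), HoodieCommitMetadata.class);
// the schema passed to buildMetadata should come back under SCHEMA_KEY
assertEquals(schema.toString(), restored.getMetadata(HoodieCommitMetadata.SCHEMA_KEY));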
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestHoodieCombineHiveInputFormat, method testMultiReaderRealtimeCombineHoodieInputFormat.
@Test
public void testMultiReaderRealtimeCombineHoodieInputFormat() throws Exception {
// test for hudi-1722
Configuration conf = new Configuration();
// initial commit
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getEvolvedSchema());
HoodieTestUtils.init(hadoopConf, tempDir.toAbsolutePath().toString(), HoodieTableType.MERGE_ON_READ);
String commitTime = "100";
final int numRecords = 1000;
// Create 3 parquet files with 1000 records each
File partitionDir = InputFormatTestUtil.prepareParquetTable(tempDir, schema, 3, numRecords, commitTime);
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
FileCreateUtils.createCommit(tempDir.toString(), commitTime, Option.of(commitMetadata));
String newCommitTime = "101";
// to trigger the bug of HUDI-1772, only update fileid2
// insert 1000 update records to log file 2
// now fileid0 and fileid1 have no log files, while fileid2 has one
HoodieLogFormat.Writer writer = InputFormatTestUtil.writeDataBlockToLogFile(partitionDir, fs, schema, "fileid2", commitTime, newCommitTime, numRecords, numRecords, 0);
writer.close();
TableDesc tblDesc = Utilities.defaultTd;
// Set the input format
tblDesc.setInputFileFormatClass(HoodieParquetRealtimeInputFormat.class);
PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
LinkedHashMap<Path, ArrayList<String>> tableAlias = new LinkedHashMap<>();
ArrayList<String> alias = new ArrayList<>();
alias.add(tempDir.toAbsolutePath().toString());
tableAlias.put(new Path(tempDir.toAbsolutePath().toString()), alias);
pt.put(new Path(tempDir.toAbsolutePath().toString()), partDesc);
MapredWork mrwork = new MapredWork();
mrwork.getMapWork().setPathToPartitionInfo(pt);
mrwork.getMapWork().setPathToAliases(tableAlias);
Path mapWorkPath = new Path(tempDir.toAbsolutePath().toString());
Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
jobConf = new JobConf(conf);
// Add the paths
FileInputFormat.setInputPaths(jobConf, partitionDir.getPath());
jobConf.set(HAS_MAP_WORK, "true");
// The following config tells Hive to choose ExecMapper to read the MAP_WORK
jobConf.set(MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
// set SPLIT_MAXSIZE large enough to create one split for all 3 file groups
jobConf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MAXSIZE, "128000000");
HoodieCombineHiveInputFormat combineHiveInputFormat = new HoodieCombineHiveInputFormat();
String tripsHiveColumnTypes = "double,string,string,string,double,double,double,double,double";
InputFormatTestUtil.setProjectFieldsForInputFormat(jobConf, schema, tripsHiveColumnTypes);
InputSplit[] splits = combineHiveInputFormat.getSplits(jobConf, 1);
// Since SPLIT_MAXSIZE is large enough, we should create only 1 split with all 3 file groups
assertEquals(1, splits.length);
RecordReader<NullWritable, ArrayWritable> recordReader = combineHiveInputFormat.getRecordReader(splits[0], jobConf, null);
NullWritable nullWritable = recordReader.createKey();
ArrayWritable arrayWritable = recordReader.createValue();
int counter = 0;
while (recordReader.next(nullWritable, arrayWritable)) {
// read over all the splits
counter++;
}
// should read 3000 records in total: 1000 each from fileid0, fileid1 and fileid2 in the single split
assertEquals(3000, counter);
recordReader.close();
}
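A side sketch on the write-stat accessors of HoodieCommitMetadata, not part of the test above: a delta commit that actually recorded the log-file write for fileid2 would carry a HoodieWriteStat from which totals can be read back. The partition path is a placeholder, and the getFileName and fetchTotalRecordsWritten accessors are assumed.
HoodieWriteStat logWriteStat = new HoodieWriteStat();
logWriteStat.setFileId("fileid2");
logWriteStat.setPath("partition/path/placeholder/" + writer.getLogFile().getFileName()); // hypothetical relative path
logWriteStat.setNumWrites(numRecords);
HoodieCommitMetadata deltaMetadata = new HoodieCommitMetadata();
deltaMetadata.addWriteStat("partition/path/placeholder", logWriteStat);
assertEquals(1, deltaMetadata.getPartitionToWriteStats().get("partition/path/placeholder").size());
// assumption: fetchTotalRecordsWritten sums numWrites across all write stats
assertEquals(numRecords, deltaMetadata.fetchTotalRecordsWritten());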
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestHoodieRealtimeRecordReader, method createDeltaCommitFile.
private void createDeltaCommitFile(java.nio.file.Path basePath, String commitNumber, String partitionPath, String filePath, String fileId, String schemaStr) throws IOException {
List<HoodieWriteStat> writeStats = new ArrayList<>();
HoodieWriteStat writeStat = createHoodieWriteStat(basePath, commitNumber, partitionPath, filePath, fileId);
writeStats.add(writeStat);
HoodieCommitMetadata commitMetadata = new HoodieCommitMetadata();
writeStats.forEach(stat -> commitMetadata.addWriteStat(partitionPath, stat));
if (schemaStr != null) {
commitMetadata.getExtraMetadata().put(HoodieCommitMetadata.SCHEMA_KEY, schemaStr);
}
File file = basePath.resolve(".hoodie").resolve(commitNumber + ".deltacommit").toFile();
file.createNewFile();
try (FileOutputStream fileOutputStream = new FileOutputStream(file)) {
fileOutputStream.write(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8));
fileOutputStream.flush();
}
}
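A companion read-side helper, not part of the original class: it parses a delta-commit file written by createDeltaCommitFile back into a HoodieCommitMetadata, assuming HoodieCommitMetadata.fromBytes is the JSON-deserializing inverse of the toJsonString call above; the helper name is made up for illustration.
private HoodieCommitMetadata readDeltaCommitFile(java.nio.file.Path basePath, String commitNumber) throws IOException {
// locate the delta-commit file written by createDeltaCommitFile above
File file = basePath.resolve(".hoodie").resolve(commitNumber + ".deltacommit").toFile();
byte[] bytes = java.nio.file.Files.readAllBytes(file.toPath());
// assumption: fromBytes deserializes the JSON payload produced by toJsonString
return HoodieCommitMetadata.fromBytes(bytes, HoodieCommitMetadata.class);
}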
Use of org.apache.hudi.common.model.HoodieCommitMetadata in project hudi by apache.
The class TestHoodieRealtimeRecordReader, method testReaderWithNestedAndComplexSchema.
@ParameterizedTest
@MethodSource("testArguments")
public void testReaderWithNestedAndComplexSchema(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
// initial commit
Schema schema = HoodieAvroUtils.addMetadataFields(SchemaTestUtil.getComplexEvolvedSchema());
HoodieTestUtils.init(hadoopConf, basePath.toString(), HoodieTableType.MERGE_ON_READ);
String instantTime = "100";
int numberOfRecords = 100;
int numberOfLogRecords = numberOfRecords / 2;
File partitionDir = InputFormatTestUtil.prepareParquetTable(basePath, schema, 1, numberOfRecords, instantTime, HoodieTableType.MERGE_ON_READ);
HoodieCommitMetadata commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.COMMIT_ACTION);
FileCreateUtils.createCommit(basePath.toString(), instantTime, Option.of(commitMetadata));
// Add the paths
FileInputFormat.setInputPaths(baseJobConf, partitionDir.getPath());
// update files or generate new log file
String newCommitTime = "101";
HoodieLogFormat.Writer writer = writeLogFile(partitionDir, schema, "fileid0", instantTime, newCommitTime, numberOfLogRecords);
long size = writer.getCurrentSize();
writer.close();
assertTrue(size > 0, "block - size should be > 0");
commitMetadata = CommitUtils.buildMetadata(Collections.emptyList(), Collections.emptyMap(), Option.empty(), WriteOperationType.UPSERT, schema.toString(), HoodieTimeline.DELTA_COMMIT_ACTION);
FileCreateUtils.createDeltaCommit(basePath.toString(), newCommitTime, commitMetadata);
// create a split with baseFile (parquet file written earlier) and new log file(s)
HoodieRealtimeFileSplit split = new HoodieRealtimeFileSplit(new FileSplit(new Path(partitionDir + "/fileid0_1-0-1_" + instantTime + ".parquet"), 0, 1, baseJobConf), basePath.toUri().toString(), Collections.singletonList(writer.getLogFile()), newCommitTime, false, Option.empty());
// create a RecordReader to be used by HoodieRealtimeRecordReader
RecordReader<NullWritable, ArrayWritable> reader = new MapredParquetInputFormat().getRecordReader(new FileSplit(split.getPath(), 0, fs.getLength(split.getPath()), (String[]) null), baseJobConf, null);
JobConf jobConf = new JobConf(baseJobConf);
List<Schema.Field> fields = schema.getFields();
setHiveColumnNameProps(fields, jobConf, true);
jobConf.setEnum(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType);
jobConf.setBoolean(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), isCompressionEnabled);
// validate record reader compaction
HoodieRealtimeRecordReader recordReader = new HoodieRealtimeRecordReader(split, jobConf, reader);
// use reader to read base Parquet File and log file, merge in flight and return latest commit
// here the first 50 records should be updated, see above
NullWritable key = recordReader.createKey();
ArrayWritable value = recordReader.createValue();
int numRecordsRead = 0;
while (recordReader.next(key, value)) {
int currentRecordNo = numRecordsRead;
++numRecordsRead;
Writable[] values = value.get();
String recordCommitTime;
// check whether the record was written with the latest commit, here "101"
if (numRecordsRead > numberOfLogRecords) {
recordCommitTime = instantTime;
} else {
recordCommitTime = newCommitTime;
}
String recordCommitTimeSuffix = "@" + recordCommitTime;
assertEquals(values[0].toString(), recordCommitTime);
key = recordReader.createKey();
value = recordReader.createValue();
// Assert type STRING
assertEquals(values[5].toString(), "field" + currentRecordNo, "test value for field: field1");
assertEquals(values[6].toString(), "field" + currentRecordNo + recordCommitTimeSuffix, "test value for field: field2");
assertEquals(values[7].toString(), "name" + currentRecordNo, "test value for field: name");
// Assert type INT
IntWritable intWritable = (IntWritable) values[8];
assertEquals(intWritable.get(), currentRecordNo + recordCommitTime.hashCode(), "test value for field: favoriteIntNumber");
// Assert type LONG
LongWritable longWritable = (LongWritable) values[9];
assertEquals(longWritable.get(), currentRecordNo + recordCommitTime.hashCode(), "test value for field: favoriteNumber");
// Assert type FLOAT
FloatWritable floatWritable = (FloatWritable) values[10];
assertEquals(floatWritable.get(), (float) ((currentRecordNo + recordCommitTime.hashCode()) / 1024.0), 0, "test value for field: favoriteFloatNumber");
// Assert type DOUBLE
DoubleWritable doubleWritable = (DoubleWritable) values[11];
assertEquals(doubleWritable.get(), (currentRecordNo + recordCommitTime.hashCode()) / 1024.0, 0, "test value for field: favoriteDoubleNumber");
// Assert type MAP
ArrayWritable mapItem = (ArrayWritable) values[12];
Writable mapItemValue1 = mapItem.get()[0];
Writable mapItemValue2 = mapItem.get()[1];
assertEquals(((ArrayWritable) mapItemValue1).get()[0].toString(), "mapItem1", "test value for field: tags");
assertEquals(((ArrayWritable) mapItemValue2).get()[0].toString(), "mapItem2", "test value for field: tags");
assertEquals(((ArrayWritable) mapItemValue1).get().length, 2, "test value for field: tags");
assertEquals(((ArrayWritable) mapItemValue2).get().length, 2, "test value for field: tags");
Writable mapItemValue1value = ((ArrayWritable) mapItemValue1).get()[1];
Writable mapItemValue2value = ((ArrayWritable) mapItemValue2).get()[1];
assertEquals(((ArrayWritable) mapItemValue1value).get()[0].toString(), "item" + currentRecordNo, "test value for field: tags[\"mapItem1\"].item1");
assertEquals(((ArrayWritable) mapItemValue2value).get()[0].toString(), "item2" + currentRecordNo, "test value for field: tags[\"mapItem2\"].item1");
assertEquals(((ArrayWritable) mapItemValue1value).get()[1].toString(), "item" + currentRecordNo + recordCommitTimeSuffix, "test value for field: tags[\"mapItem1\"].item2");
assertEquals(((ArrayWritable) mapItemValue2value).get()[1].toString(), "item2" + currentRecordNo + recordCommitTimeSuffix, "test value for field: tags[\"mapItem2\"].item2");
// Assert type RECORD
ArrayWritable recordItem = (ArrayWritable) values[13];
Writable[] nestedRecord = recordItem.get();
assertFalse(((BooleanWritable) nestedRecord[0]).get(), "test value for field: testNestedRecord.isAdmin");
assertEquals(nestedRecord[1].toString(), "UserId" + currentRecordNo + recordCommitTimeSuffix, "test value for field: testNestedRecord.userId");
// Assert type ARRAY
ArrayWritable arrayValue = (ArrayWritable) values[14];
Writable[] arrayValues = arrayValue.get();
for (int i = 0; i < arrayValues.length; i++) {
assertEquals("stringArray" + i + recordCommitTimeSuffix, arrayValues[i].toString(), "test value for field: stringArray");
}
reader.close();
}
}
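An alternative sketch for the delta-commit step in the test above, not part of the original code: the createDeltaCommitFile helper shown earlier in this class could write the same delta-commit file while embedding the schema in the extra metadata, instead of going through CommitUtils.buildMetadata and FileCreateUtils.createDeltaCommit. The partition path and base-file name below are placeholders for whatever InputFormatTestUtil.prepareParquetTable actually produced.
// hypothetical partition path and base-file name; adjust to the layout prepareParquetTable created
createDeltaCommitFile(basePath, newCommitTime, "2016/05/01", "2016/05/01/fileid0_1-0-1_" + instantTime + ".parquet", "fileid0", schema.toString());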