Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class HoodieWriteableTestTable, method appendRecordsToLogFile.
private Pair<String, HoodieLogFile> appendRecordsToLogFile(List<HoodieRecord> groupedRecords) throws Exception {
  String partitionPath = groupedRecords.get(0).getPartitionPath();
  HoodieRecordLocation location = groupedRecords.get(0).getCurrentLocation();
  try (HoodieLogFormat.Writer logWriter = HoodieLogFormat.newWriterBuilder()
      .onParentPath(new Path(basePath, partitionPath))
      .withFileExtension(HoodieLogFile.DELTA_EXTENSION)
      .withFileId(location.getFileId())
      .overBaseCommit(location.getInstantTime())
      .withFs(fs)
      .build()) {
    Map<HoodieLogBlock.HeaderMetadataType, String> header = new HashMap<>();
    header.put(HoodieLogBlock.HeaderMetadataType.INSTANT_TIME, location.getInstantTime());
    header.put(HoodieLogBlock.HeaderMetadataType.SCHEMA, schema.toString());
    logWriter.appendBlock(new HoodieAvroDataBlock(groupedRecords.stream().map(r -> {
      try {
        GenericRecord val = (GenericRecord) ((HoodieRecordPayload) r.getData()).getInsertValue(schema).get();
        HoodieAvroUtils.addHoodieKeyToRecord(val, r.getRecordKey(), r.getPartitionPath(), "");
        return (IndexedRecord) val;
      } catch (IOException e) {
        LOG.warn("Failed to convert record " + r.toString(), e);
        return null;
      }
    }).collect(Collectors.toList()), header, HoodieRecord.RECORD_KEY_METADATA_FIELD));
    return Pair.of(partitionPath, logWriter.getLogFile());
  }
}
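The returned Pair lets the caller recover both the partition that was written to and the log file that was produced. A minimal sketch of that usage, assuming a hypothetical recordsForFileGroup list prepared by the surrounding test:

  // Hypothetical caller: unpack the partition path and the written log file from the Pair.
  Pair<String, HoodieLogFile> written = appendRecordsToLogFile(recordsForFileGroup);
  String partitionPath = written.getKey();    // left element: partition the records belong to
  HoodieLogFile logFile = written.getValue(); // right element: the delta log file that was appended
  LOG.info("Appended records to " + logFile.getPath() + " under partition " + partitionPath);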
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestCompactionUtils, method buildCompactionPlan.
/**
* Generate input for compaction plan tests.
*/
private Pair<List<Pair<String, FileSlice>>, HoodieCompactionPlan> buildCompactionPlan() {
  String extension = metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
  Path fullPartitionPath = new Path(new Path(metaClient.getBasePath()), DEFAULT_PARTITION_PATHS[0]);
  FileSlice emptyFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "empty1");
  FileSlice fileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1");
  fileSlice.setBaseFile(new DummyHoodieBaseFile(fullPartitionPath.toString() + "/data1_1_000" + extension));
  fileSlice.addLogFile(new HoodieLogFile(new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))));
  fileSlice.addLogFile(new HoodieLogFile(new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))));
  FileSlice noLogFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noLog1");
  noLogFileSlice.setBaseFile(new DummyHoodieBaseFile(fullPartitionPath.toString() + "/noLog_1_000" + extension));
  FileSlice noDataFileSlice = new FileSlice(DEFAULT_PARTITION_PATHS[0], "000", "noData1");
  noDataFileSlice.addLogFile(new HoodieLogFile(new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 1, TEST_WRITE_TOKEN)))));
  noDataFileSlice.addLogFile(new HoodieLogFile(new Path(fullPartitionPath, new Path(FSUtils.makeLogFileName("noData1", ".log", "000", 2, TEST_WRITE_TOKEN)))));
  List<FileSlice> fileSliceList = Arrays.asList(emptyFileSlice, noDataFileSlice, fileSlice, noLogFileSlice);
  List<Pair<String, FileSlice>> input = fileSliceList.stream()
      .map(f -> Pair.of(DEFAULT_PARTITION_PATHS[0], f))
      .collect(Collectors.toList());
  return Pair.of(input, CompactionUtils.buildFromFileSlices(input, Option.empty(), Option.of(metricsCaptureFn)));
}
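Callers of this helper split the Pair with getLeft/getRight: the left side is the (partitionPath, FileSlice) input handed to CompactionUtils, the right side is the HoodieCompactionPlan built from it. A minimal sketch of that usage, assuming it sits in the same test class and uses JUnit's assertEquals:

  // Hypothetical test usage: split the Pair into the raw compaction input and the plan built from it.
  Pair<List<Pair<String, FileSlice>>, HoodieCompactionPlan> inputAndPlan = buildCompactionPlan();
  List<Pair<String, FileSlice>> input = inputAndPlan.getLeft();
  HoodieCompactionPlan plan = inputAndPlan.getRight();
  // Each nested Pair carries (partitionPath, fileSlice); by construction they all use DEFAULT_PARTITION_PATHS[0].
  input.forEach(pathAndSlice -> assertEquals(DEFAULT_PARTITION_PATHS[0], pathAndSlice.getKey()));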
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class TestCompactionUtils, method testGetAllPendingCompactionOperationsWithFullDupFileId.
@Test
public void testGetAllPendingCompactionOperationsWithFullDupFileId() throws IOException {
  // Case where there are duplicate fileIds in compaction requests
  HoodieCompactionPlan plan1 = createCompactionPlan(metaClient, "000", "001", 10, true, true);
  HoodieCompactionPlan plan2 = createCompactionPlan(metaClient, "002", "003", 0, false, false);
  scheduleCompaction(metaClient, "001", plan1);
  scheduleCompaction(metaClient, "003", plan2);
  // Schedule the same plan again so that there are duplicates. This should not fail, as it is a full duplicate.
  scheduleCompaction(metaClient, "005", plan1);
  metaClient = HoodieTableMetaClient.builder()
      .setConf(metaClient.getHadoopConf())
      .setBasePath(basePath)
      .setLoadActiveTimelineOnLoad(true)
      .build();
  Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> res =
      CompactionUtils.getAllPendingCompactionOperations(metaClient);
}
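Each value in the returned map is a Pair of the compaction instant time and the pending HoodieCompactionOperation for that file group. A minimal sketch of how the result could be inspected if appended inside the test body (assertTrue/assertEquals are the usual JUnit helpers; the check is illustrative only):

  // Illustrative only: every pending operation is keyed by its file group and tagged with one of the scheduled instants.
  res.forEach((fileGroupId, instantAndOperation) -> {
    String compactionInstant = instantAndOperation.getKey();
    HoodieCompactionOperation operation = instantAndOperation.getValue();
    assertTrue(Arrays.asList("001", "003", "005").contains(compactionInstant));
    assertEquals(fileGroupId.getFileId(), operation.getFileId());
  });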
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class HoodieBackedTableMetadataWriter, method initialCommit.
/**
 * This is invoked to initialize the metadata table for a dataset. The bootstrap commit gets special handling due to its scale compared to
 * other regular commits.
 */
private void initialCommit(String createInstantTime) {
  // List all partitions in the basePath of the containing dataset
  LOG.info("Initializing metadata table by using file listings in " + dataWriteConfig.getBasePath());
  engineContext.setJobStatus(this.getClass().getSimpleName(), "Initializing metadata table by listing files and partitions");
  List<DirectoryInfo> partitionInfoList = listAllPartitions(dataMetaClient);
  List<String> partitions = new ArrayList<>();
  AtomicLong totalFiles = new AtomicLong(0);
  Map<String, Map<String, Long>> partitionToFilesMap = partitionInfoList.stream().map(p -> {
    final String partitionName = HoodieTableMetadataUtil.getPartition(p.getRelativePath());
    partitions.add(partitionName);
    totalFiles.addAndGet(p.getTotalFiles());
    return Pair.of(partitionName, p.getFileNameToSizeMap());
  }).collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  final Map<MetadataPartitionType, HoodieData<HoodieRecord>> partitionToRecordsMap = new HashMap<>();
  // Record which saves the list of all partitions
  HoodieRecord allPartitionRecord = HoodieMetadataPayload.createPartitionListRecord(partitions);
  if (partitions.isEmpty()) {
    // When initializing a fresh table there are no partitions yet, but we still need to make a bootstrap commit
    final HoodieData<HoodieRecord> allPartitionRecordsRDD = engineContext.parallelize(Collections.singletonList(allPartitionRecord), 1);
    partitionToRecordsMap.put(MetadataPartitionType.FILES, allPartitionRecordsRDD);
    commit(createInstantTime, partitionToRecordsMap, false);
    return;
  }
  HoodieData<HoodieRecord> filesPartitionRecords = engineContext.parallelize(Arrays.asList(allPartitionRecord), 1);
  if (!partitionInfoList.isEmpty()) {
    HoodieData<HoodieRecord> fileListRecords = engineContext.parallelize(partitionInfoList, partitionInfoList.size()).map(partitionInfo -> {
      Map<String, Long> fileNameToSizeMap = partitionInfo.getFileNameToSizeMap();
      // Filter for files that are part of the completed commits
      Map<String, Long> validFileNameToSizeMap = fileNameToSizeMap.entrySet().stream().filter(fileSizePair -> {
        String commitTime = FSUtils.getCommitTime(fileSizePair.getKey());
        return HoodieTimeline.compareTimestamps(commitTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, createInstantTime);
      }).collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
      // Record which saves files within a partition
      return HoodieMetadataPayload.createPartitionFilesRecord(
          HoodieTableMetadataUtil.getPartition(partitionInfo.getRelativePath()), Option.of(validFileNameToSizeMap), Option.empty());
    });
    filesPartitionRecords = filesPartitionRecords.union(fileListRecords);
  }
  ValidationUtils.checkState(filesPartitionRecords.count() == (partitions.size() + 1));
  partitionToRecordsMap.put(MetadataPartitionType.FILES, filesPartitionRecords);
  if (enabledPartitionTypes.contains(MetadataPartitionType.BLOOM_FILTERS)) {
    final HoodieData<HoodieRecord> recordsRDD = HoodieTableMetadataUtil.convertFilesToBloomFilterRecords(
        engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams(), createInstantTime);
    partitionToRecordsMap.put(MetadataPartitionType.BLOOM_FILTERS, recordsRDD);
  }
  if (enabledPartitionTypes.contains(MetadataPartitionType.COLUMN_STATS)) {
    final HoodieData<HoodieRecord> recordsRDD = HoodieTableMetadataUtil.convertFilesToColumnStatsRecords(
        engineContext, Collections.emptyMap(), partitionToFilesMap, getRecordsGenerationParams());
    partitionToRecordsMap.put(MetadataPartitionType.COLUMN_STATS, recordsRDD);
  }
  LOG.info("Committing " + partitions.size() + " partitions and " + totalFiles + " files to metadata");
  commit(createInstantTime, partitionToRecordsMap, false);
}
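Within initialCommit the Pair acts as a throwaway tuple: each DirectoryInfo becomes (partitionName, fileNameToSizeMap), and the stream is collected into a Map keyed by the left element. A standalone sketch of the same pattern with made-up partition names and sizes, purely for illustration:

  // Illustrative only: the same Pair-based stream-to-map pattern with made-up data.
  List<String> relativePaths = Arrays.asList("2022/01/01", "2022/01/02");
  Map<String, Map<String, Long>> partitionToFiles = relativePaths.stream()
      .map(path -> Pair.of(path, Collections.<String, Long>singletonMap(path + "/file1.parquet", 1024L)))
      .collect(Collectors.toMap(Pair::getKey, Pair::getValue));
  // partitionToFiles now maps each partition name to its file-name-to-size map.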
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
The class HoodieWriteHelper, method deduplicateRecords.
@Override
public HoodieData<HoodieRecord<T>> deduplicateRecords(HoodieData<HoodieRecord<T>> records, HoodieIndex<?, ?> index, int parallelism) {
  boolean isIndexingGlobal = index.isGlobal();
  return records.mapToPair(record -> {
    HoodieKey hoodieKey = record.getKey();
    // If the index used is global, records are expected to differ in their partitionPath
    Object key = isIndexingGlobal ? hoodieKey.getRecordKey() : hoodieKey;
    return Pair.of(key, record);
  }).reduceByKey((rec1, rec2) -> {
    @SuppressWarnings("unchecked")
    T reducedData = (T) rec2.getData().preCombine(rec1.getData());
    HoodieKey reducedKey = rec1.getData().equals(reducedData) ? rec1.getKey() : rec2.getKey();
    return new HoodieAvroRecord<>(reducedKey, reducedData);
  }, parallelism).map(Pair::getRight);
}
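Pair shows up twice here: mapToPair keys every record by either the record key (global index) or the full HoodieKey, and the trailing map(Pair::getRight) strips the key again once reduceByKey has merged duplicates via preCombine. A minimal sketch of calling the helper, assuming hypothetical writeHelper, table, and config fields on the caller:

  // Hypothetical caller; writeHelper, table, and config are assumed fields of the surrounding class.
  private HoodieData<HoodieRecord<T>> dedupeBeforeWrite(HoodieData<HoodieRecord<T>> incoming) {
    // A global index matches duplicates on the record key alone; otherwise the full HoodieKey (key + partition) is used.
    return writeHelper.deduplicateRecords(incoming, table.getIndex(), config.getInsertShuffleParallelism());
  }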