Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class IncrementalTimelineSyncFileSystemView, method addPendingCompactionInstant:
/**
 * Add a newly found compaction instant.
 *
 * @param timeline Hoodie Timeline
 * @param instant  Compaction Instant
 */
private void addPendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
  LOG.info("Syncing pending compaction instant (" + instant + ")");
  HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
  List<Pair<String, CompactionOperation>> pendingOps =
      CompactionUtils.getPendingCompactionOperations(instant, compactionPlan)
          .map(p -> Pair.of(p.getValue().getKey(),
              CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue())))
          .collect(Collectors.toList());
  // First, update Pending compaction instants
  addPendingCompactionOperations(pendingOps.stream());
  Map<String, List<Pair<String, HoodieFileGroup>>> partitionToFileGroups = pendingOps.stream().map(opPair -> {
    String compactionInstantTime = opPair.getKey();
    HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline);
    fileGroup.addNewFileSliceAtInstant(compactionInstantTime);
    return Pair.of(compactionInstantTime, fileGroup);
  }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath()));
  partitionToFileGroups.entrySet().forEach(entry -> {
    if (isPartitionAvailableInStore(entry.getKey())) {
      applyDeltaFileSlicesToPartitionView(entry.getKey(),
          entry.getValue().stream().map(Pair::getValue).collect(Collectors.toList()), DeltaApplyMode.ADD);
    }
  });
}
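The method above pairs each pending compaction instant time with a HoodieFileGroup and then groups those pairs by the file group's partition path. Below is a minimal, self-contained sketch of that grouping pattern; the FileGroup class is a hypothetical stand-in for HoodieFileGroup, and only Hudi's Pair is assumed on the classpath.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class GroupPairsByPartitionSketch {

  // Hypothetical stand-in for HoodieFileGroup: only the partition path matters here.
  static class FileGroup {
    private final String partitionPath;
    private final String fileId;

    FileGroup(String partitionPath, String fileId) {
      this.partitionPath = partitionPath;
      this.fileId = fileId;
    }

    String getPartitionPath() {
      return partitionPath;
    }

    @Override
    public String toString() {
      return partitionPath + "/" + fileId;
    }
  }

  public static void main(String[] args) {
    List<Pair<String, FileGroup>> pendingOps = Arrays.asList(
        Pair.of("001", new FileGroup("2022/01/01", "fg-1")),
        Pair.of("001", new FileGroup("2022/01/02", "fg-2")),
        Pair.of("002", new FileGroup("2022/01/01", "fg-3")));

    // Group the (instantTime, fileGroup) pairs by the file group's partition path,
    // mirroring the Collectors.groupingBy call in addPendingCompactionInstant.
    Map<String, List<Pair<String, FileGroup>>> byPartition = pendingOps.stream()
        .collect(Collectors.groupingBy(p -> p.getValue().getPartitionPath()));

    byPartition.forEach((partition, ops) -> System.out.println(partition + " -> " + ops));
  }
}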
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class CompactionUtils, method getOldestInstantToRetainForCompaction:
/**
 * Gets the oldest instant to retain for MOR compaction. Compaction is triggered when:
 * if there is no completed compaction,
 *   num delta commits >= "hoodie.compact.inline.max.delta.commits";
 * if there is a completed compaction,
 *   num delta commits after the latest completed compaction >= "hoodie.compact.inline.max.delta.commits".
 *
 * @param activeTimeline  Active timeline of a table.
 * @param maxDeltaCommits Maximum number of delta commits that triggers the compaction plan,
 *                        i.e., "hoodie.compact.inline.max.delta.commits".
 * @return the oldest instant to keep for MOR compaction.
 */
public static Option<HoodieInstant> getOldestInstantToRetainForCompaction(HoodieActiveTimeline activeTimeline, int maxDeltaCommits) {
  Option<Pair<HoodieTimeline, HoodieInstant>> deltaCommitsInfoOption =
      CompactionUtils.getDeltaCommitsSinceLatestCompaction(activeTimeline);
  if (deltaCommitsInfoOption.isPresent()) {
    Pair<HoodieTimeline, HoodieInstant> deltaCommitsInfo = deltaCommitsInfoOption.get();
    HoodieTimeline deltaCommitTimeline = deltaCommitsInfo.getLeft();
    int numDeltaCommits = deltaCommitTimeline.countInstants();
    if (numDeltaCommits < maxDeltaCommits) {
      return Option.of(deltaCommitsInfo.getRight());
    } else {
      // Retain the last maxDeltaCommits delta commits; the returned instant is the oldest of those
      List<HoodieInstant> instants = deltaCommitTimeline.getInstants()
          .limit(numDeltaCommits - maxDeltaCommits + 1)
          .collect(Collectors.toList());
      return Option.of(instants.get(instants.size() - 1));
    }
  }
  return Option.empty();
}
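The index arithmetic in the else branch can be checked in isolation: with n delta commits in ascending order and a threshold of m, the element at index n - m is returned, so exactly m delta commits (that one and everything after it) are retained. A small sketch over plain timestamp strings, simplified to return the first timestamp when the threshold is not yet met (the real method returns the instant carried in the Pair's right element in that case):

import java.util.Arrays;
import java.util.List;

public class OldestInstantToRetainSketch {

  // Mirrors the index math in getOldestInstantToRetainForCompaction:
  // keep the last maxDeltaCommits delta commits and return the earliest of those.
  // Assumes a non-empty, ascending-ordered list for simplicity.
  static String oldestToRetain(List<String> deltaCommitTimesAsc, int maxDeltaCommits) {
    int n = deltaCommitTimesAsc.size();
    if (n < maxDeltaCommits) {
      // Threshold not reached; the real method returns the instant in the Pair's right
      // element here (latest completed compaction, or the first delta commit).
      return deltaCommitTimesAsc.get(0);
    }
    // The element at index n - maxDeltaCommits leaves exactly maxDeltaCommits commits retained.
    return deltaCommitTimesAsc.get(n - maxDeltaCommits);
  }

  public static void main(String[] args) {
    List<String> deltaCommits = Arrays.asList("001", "002", "003", "004", "005");
    // 5 delta commits, threshold 3: retain "003", "004", "005", so "003" is printed.
    System.out.println(oldestToRetain(deltaCommits, 3));
  }
}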
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class TestOrcBootstrap, method generateInputBatch:
private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
  List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream()
      .flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath()))))
      .collect(Collectors.toList());
  return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
    try {
      Configuration conf = jsc.hadoopConfiguration();
      AvroReadSupport.setAvroReadSchema(conf, writerSchema);
      Reader orcReader = OrcFile.createReader(p.getValue(), new OrcFile.ReaderOptions(jsc.hadoopConfiguration()));
      RecordReader recordReader = orcReader.rows();
      TypeDescription orcSchema = orcReader.getSchema();
      Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true);
      Iterator<GenericRecord> recIterator = new OrcReaderIterator(recordReader, avroSchema, orcSchema);
      return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> {
        try {
          String key = gr.get("_row_key").toString();
          String pPath = p.getKey();
          return new HoodieAvroRecord<>(new HoodieKey(key, pPath),
              new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
        } catch (IOException e) {
          throw new HoodieIOException(e.getMessage(), e);
        }
      });
    } catch (IOException ioe) {
      throw new HoodieIOException(ioe.getMessage(), ioe);
    }
  }).collect(Collectors.toList()));
}
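generateInputBatch turns the ORC record iterator into a java.util.stream.Stream via Spliterators.spliteratorUnknownSize. A minimal sketch of that iterator-to-stream conversion on plain strings, with no ORC or Spark dependencies assumed:

import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.Spliterators;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;

public class IteratorToStreamSketch {

  public static void main(String[] args) {
    // Stand-in for the OrcReaderIterator: any Iterator can be streamed the same way.
    Iterator<String> recIterator = Arrays.asList("r1", "r2", "r3").iterator();

    // Wrap the iterator in a Spliterator of unknown size and stream it lazily,
    // the same conversion used in generateInputBatch.
    List<String> keys = StreamSupport
        .stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false)
        .map(String::toUpperCase)
        .collect(Collectors.toList());

    System.out.println(keys); // prints [R1, R2, R3]
  }
}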
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class HoodieBloomIndex, method loadColumnRangesFromMetaIndex:
/**
 * Load the column stats index as BloomIndexFileInfo for all the involved files in the given partitions.
 *
 * @param partitions  - List of partitions for which column stats need to be loaded
 * @param context     - Engine context
 * @param hoodieTable - Hoodie table
 * @return List of partition and file column range info pairs
 */
protected List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromMetaIndex(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
  // also obtain file ranges, if range pruning is enabled
  context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices");
  final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
  return context.flatMap(partitions, partitionName -> {
    // Partition and file name pairs
    List<Pair<String, String>> partitionFileNameList =
        HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, hoodieTable).stream()
            .map(baseFile -> Pair.of(partitionName, baseFile.getFileName()))
            .sorted()
            .collect(toList());
    if (partitionFileNameList.isEmpty()) {
      return Stream.empty();
    }
    try {
      Map<Pair<String, String>, HoodieMetadataColumnStats> fileToColumnStatsMap =
          hoodieTable.getMetadataTable().getColumnStats(partitionFileNameList, keyField);
      List<Pair<String, BloomIndexFileInfo>> result = new ArrayList<>();
      for (Map.Entry<Pair<String, String>, HoodieMetadataColumnStats> entry : fileToColumnStatsMap.entrySet()) {
        result.add(Pair.of(entry.getKey().getLeft(),
            new BloomIndexFileInfo(FSUtils.getFileId(entry.getKey().getRight()),
                entry.getValue().getMinValue(), entry.getValue().getMaxValue())));
      }
      return result.stream();
    } catch (MetadataNotFoundException me) {
      throw new HoodieMetadataException("Unable to find column range metadata for partition:" + partitionName, me);
    }
  }, Math.max(partitions.size(), 1));
}
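getColumnStats is keyed by (partition, fileName) pairs, so Pair also serves as a map key here. The sketch below mirrors how the result list is flattened out of such a map; ColumnStats is a hypothetical stand-in for HoodieMetadataColumnStats, and Hudi's Pair (whose use as a map key the method above already relies on) is the only Hudi class assumed.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hudi.common.util.collection.Pair;

public class PairKeyedLookupSketch {

  // Hypothetical stand-in for HoodieMetadataColumnStats: just a min/max of the key column.
  static class ColumnStats {
    final String minValue;
    final String maxValue;

    ColumnStats(String minValue, String maxValue) {
      this.minValue = minValue;
      this.maxValue = maxValue;
    }
  }

  public static void main(String[] args) {
    // Map keyed by (partition, fileName) pairs, shaped like the metadata table's result.
    Map<Pair<String, String>, ColumnStats> fileToColumnStats = new HashMap<>();
    fileToColumnStats.put(Pair.of("2022/01/01", "f1_0-0-0.parquet"), new ColumnStats("key_000", "key_499"));
    fileToColumnStats.put(Pair.of("2022/01/02", "f2_0-0-0.parquet"), new ColumnStats("key_500", "key_999"));

    // Flatten the map into (partition, "fileName [min..max]") pairs, mirroring how
    // loadColumnRangesFromMetaIndex builds its (partition, BloomIndexFileInfo) result.
    List<Pair<String, String>> result = new ArrayList<>();
    for (Map.Entry<Pair<String, String>, ColumnStats> entry : fileToColumnStats.entrySet()) {
      result.add(Pair.of(entry.getKey().getLeft(),
          entry.getKey().getRight() + " [" + entry.getValue().minValue + ".." + entry.getValue().maxValue + "]"));
    }
    result.forEach(System.out::println);
  }
}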
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class HoodieBloomIndex, method loadColumnRangesFromFiles:
/**
 * Load all involved files as (partition path, BloomIndexFileInfo) pairs.
 */
List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromFiles(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
  // Obtain the latest data files from all the partitions.
  List<Pair<String, String>> partitionPathFileIDList =
      getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream()
          .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId()))
          .collect(toList());
  context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)");
  return context.map(partitionPathFileIDList, pf -> {
    try {
      HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf);
      String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys();
      return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1]));
    } catch (MetadataNotFoundException me) {
      LOG.warn("Unable to find range metadata in file :" + pf);
      return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()));
    }
  }, Math.max(partitionPathFileIDList.size(), 1));
}
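When range metadata cannot be read for a file, the method logs a warning and falls back to a BloomIndexFileInfo without key bounds, effectively disabling range pruning for that file instead of failing the job. A sketch of that degrade-gracefully pattern; lookupMinMax and FileInfo are hypothetical stand-ins for the range-info handle and BloomIndexFileInfo:

import org.apache.hudi.common.util.collection.Pair;

public class RangeFallbackSketch {

  // Hypothetical stand-in for BloomIndexFileInfo: key bounds are optional.
  static class FileInfo {
    final String fileId;
    final String minKey; // null when range metadata is unavailable
    final String maxKey;

    FileInfo(String fileId) {
      this(fileId, null, null);
    }

    FileInfo(String fileId, String minKey, String maxKey) {
      this.fileId = fileId;
      this.minKey = minKey;
      this.maxKey = maxKey;
    }
  }

  // Hypothetical lookup that fails when a file carries no range metadata.
  static String[] lookupMinMax(String fileId) {
    if (fileId.startsWith("missing")) {
      throw new IllegalStateException("range metadata not found for " + fileId);
    }
    return new String[] {"key_000", "key_999"};
  }

  static Pair<String, FileInfo> loadRange(String partition, String fileId) {
    try {
      String[] minMax = lookupMinMax(fileId);
      return Pair.of(partition, new FileInfo(fileId, minMax[0], minMax[1]));
    } catch (IllegalStateException e) {
      // Same fallback as loadColumnRangesFromFiles: keep the file, drop the key range.
      return Pair.of(partition, new FileInfo(fileId));
    }
  }

  public static void main(String[] args) {
    Pair<String, FileInfo> withRange = loadRange("2022/01/01", "fg-1");
    Pair<String, FileInfo> withoutRange = loadRange("2022/01/01", "missing-fg-2");
    System.out.println(withRange.getValue().minKey);    // key_000
    System.out.println(withoutRange.getValue().minKey); // null: range pruning is skipped for this file
  }
}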