Example 21 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

The class IncrementalTimelineSyncFileSystemView, method addPendingCompactionInstant.

/**
 * Add newly found compaction instant.
 *
 * @param timeline Hoodie Timeline
 * @param instant Compaction Instant
 */
private void addPendingCompactionInstant(HoodieTimeline timeline, HoodieInstant instant) throws IOException {
    LOG.info("Syncing pending compaction instant (" + instant + ")");
    HoodieCompactionPlan compactionPlan = CompactionUtils.getCompactionPlan(metaClient, instant.getTimestamp());
    List<Pair<String, CompactionOperation>> pendingOps = CompactionUtils.getPendingCompactionOperations(instant, compactionPlan)
        .map(p -> Pair.of(p.getValue().getKey(), CompactionOperation.convertFromAvroRecordInstance(p.getValue().getValue())))
        .collect(Collectors.toList());
    // First, update Pending compaction instants
    addPendingCompactionOperations(pendingOps.stream());
    Map<String, List<Pair<String, HoodieFileGroup>>> partitionToFileGroups = pendingOps.stream().map(opPair -> {
        String compactionInstantTime = opPair.getKey();
        HoodieFileGroup fileGroup = new HoodieFileGroup(opPair.getValue().getFileGroupId(), timeline);
        fileGroup.addNewFileSliceAtInstant(compactionInstantTime);
        return Pair.of(compactionInstantTime, fileGroup);
    }).collect(Collectors.groupingBy(x -> x.getValue().getPartitionPath()));
    partitionToFileGroups.entrySet().forEach(entry -> {
        if (isPartitionAvailableInStore(entry.getKey())) {
            applyDeltaFileSlicesToPartitionView(entry.getKey(), entry.getValue().stream().map(Pair::getValue).collect(Collectors.toList()), DeltaApplyMode.ADD);
        }
    });
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) TimelineDiffHelper(org.apache.hudi.common.table.timeline.TimelineDiffHelper) HoodieException(org.apache.hudi.exception.HoodieException) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Set(java.util.Set) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) TimelineDiffResult(org.apache.hudi.common.table.timeline.TimelineDiffHelper.TimelineDiffResult) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
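
The grouping step above is a common Pair idiom: build pairs in a stream, then bucket them by a property of the pair's value. A minimal, self-contained sketch of the same pattern follows; it relies only on Hudi's Pair, and the class name and sample data are illustrative, not part of Hudi.

import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class PairGroupingSketch {
    public static void main(String[] args) {
        // (compactionInstantTime, partitionPath) pairs, standing in for the
        // (instant, fileGroup) pairs built in addPendingCompactionInstant.
        List<Pair<String, String>> pairs = Arrays.asList(
            Pair.of("001", "2022/01/01"),
            Pair.of("002", "2022/01/01"),
            Pair.of("003", "2022/01/02"));
        // Group by the pair's value while keeping the whole pair in each bucket,
        // just as the method groups file groups by partition path.
        Map<String, List<Pair<String, String>>> byPartition = pairs.stream()
            .collect(Collectors.groupingBy(Pair::getValue));
        byPartition.forEach((partition, grouped) -> System.out.println(partition + " -> " + grouped));
    }
}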

Example 22 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

The class CompactionUtils, method getOldestInstantToRetainForCompaction.

/**
 * Gets the oldest instant to retain for MOR compaction, chosen so that enough delta
 * commits remain on the timeline for the next compaction to trigger:
 * if there is no completed compaction, at least "hoodie.compact.inline.max.delta.commits"
 * delta commits are retained; if there is a completed compaction, at least that many
 * delta commits after the latest completed compaction are retained.
 *
 * @param activeTimeline  Active timeline of a table.
 * @param maxDeltaCommits Maximum number of delta commits that trigger the compaction plan,
 *                        i.e., "hoodie.compact.inline.max.delta.commits".
 * @return the oldest instant to keep for MOR compaction.
 */
public static Option<HoodieInstant> getOldestInstantToRetainForCompaction(HoodieActiveTimeline activeTimeline, int maxDeltaCommits) {
    Option<Pair<HoodieTimeline, HoodieInstant>> deltaCommitsInfoOption = CompactionUtils.getDeltaCommitsSinceLatestCompaction(activeTimeline);
    if (deltaCommitsInfoOption.isPresent()) {
        Pair<HoodieTimeline, HoodieInstant> deltaCommitsInfo = deltaCommitsInfoOption.get();
        HoodieTimeline deltaCommitTimeline = deltaCommitsInfo.getLeft();
        int numDeltaCommits = deltaCommitTimeline.countInstants();
        if (numDeltaCommits < maxDeltaCommits) {
            return Option.of(deltaCommitsInfo.getRight());
        } else {
            // Retain the oldest instant that still leaves maxDeltaCommits delta commits
            // (inclusive of itself) on the timeline after older instants are archived.
            List<HoodieInstant> instants = deltaCommitTimeline.getInstants()
                .limit(numDeltaCommits - maxDeltaCommits + 1)
                .collect(Collectors.toList());
            return Option.of(instants.get(instants.size() - 1));
        }
    }
    return Option.empty();
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) Pair(org.apache.hudi.common.util.collection.Pair)
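
For context, here is a rough sketch of how a caller might use this helper to bound archival. The base path is illustrative and hadoopConf is assumed to be in scope; the builder and timeline calls mirror Hudi's API as used elsewhere in the project.

// Sketch: find the instant archival must not cross, so the next compaction can still trigger.
HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
    .setConf(hadoopConf)              // assumed to be in scope
    .setBasePath("/tmp/hudi_table")   // illustrative base path
    .build();
Option<HoodieInstant> oldestToRetain = CompactionUtils.getOldestInstantToRetainForCompaction(
    metaClient.getActiveTimeline(), 5 /* hoodie.compact.inline.max.delta.commits */);
oldestToRetain.ifPresent(instant -> System.out.println("Archive nothing at or after " + instant.getTimestamp()));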

Example 23 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

The class TestOrcBootstrap, method generateInputBatch.

private static JavaRDD<HoodieRecord> generateInputBatch(JavaSparkContext jsc, List<Pair<String, List<HoodieFileStatus>>> partitionPaths, Schema writerSchema) {
    List<Pair<String, Path>> fullFilePathsWithPartition = partitionPaths.stream()
        .flatMap(p -> p.getValue().stream().map(x -> Pair.of(p.getKey(), FileStatusUtils.toPath(x.getPath()))))
        .collect(Collectors.toList());
    return jsc.parallelize(fullFilePathsWithPartition.stream().flatMap(p -> {
        try {
            Configuration conf = jsc.hadoopConfiguration();
            AvroReadSupport.setAvroReadSchema(conf, writerSchema);
            Reader orcReader = OrcFile.createReader(p.getValue(), new OrcFile.ReaderOptions(jsc.hadoopConfiguration()));
            RecordReader recordReader = orcReader.rows();
            TypeDescription orcSchema = orcReader.getSchema();
            Schema avroSchema = AvroOrcUtils.createAvroSchemaWithDefaultValue(orcSchema, "test_orc_record", null, true);
            Iterator<GenericRecord> recIterator = new OrcReaderIterator(recordReader, avroSchema, orcSchema);
            return StreamSupport.stream(Spliterators.spliteratorUnknownSize(recIterator, 0), false).map(gr -> {
                try {
                    String key = gr.get("_row_key").toString();
                    String pPath = p.getKey();
                    return new HoodieAvroRecord<>(new HoodieKey(key, pPath), new RawTripTestPayload(gr.toString(), key, pPath, HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA));
                } catch (IOException e) {
                    throw new HoodieIOException(e.getMessage(), e);
                }
            });
        } catch (IOException ioe) {
            throw new HoodieIOException(ioe.getMessage(), ioe);
        }
    }).collect(Collectors.toList()));
}
Also used : BootstrapUtils(org.apache.hudi.table.action.bootstrap.BootstrapUtils) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) BootstrapMode(org.apache.hudi.client.bootstrap.BootstrapMode) Spliterators(java.util.Spliterators) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Random(java.util.Random) HoodieParquetRealtimeInputFormat(org.apache.hudi.hadoop.realtime.HoodieParquetRealtimeInputFormat) DataSourceWriteOptions(org.apache.hudi.DataSourceWriteOptions) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) NonpartitionedKeyGenerator(org.apache.hudi.keygen.NonpartitionedKeyGenerator) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Tag(org.junit.jupiter.api.Tag) DataTypes(org.apache.spark.sql.types.DataTypes) Schema(org.apache.avro.Schema) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) RawTripTestPayload(org.apache.hudi.common.testutils.RawTripTestPayload) MetadataOnlyBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector) SimpleKeyGenerator(org.apache.hudi.keygen.SimpleKeyGenerator) Instant(java.time.Instant) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) TempDir(org.junit.jupiter.api.io.TempDir) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) FullRecordBootstrapDataProvider(org.apache.hudi.client.bootstrap.FullRecordBootstrapDataProvider) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) IntStream(java.util.stream.IntStream) AvroReadSupport(org.apache.parquet.avro.AvroReadSupport) AvroOrcUtils(org.apache.hudi.common.util.AvroOrcUtils) Dataset(org.apache.spark.sql.Dataset) OrcReaderIterator(org.apache.hudi.common.util.OrcReaderIterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) OrcFile(org.apache.orc.OrcFile) ArrayList(java.util.ArrayList) Reader(org.apache.orc.Reader) Collectors.mapping(java.util.stream.Collectors.mapping) StreamSupport(java.util.stream.StreamSupport) HoodieParquetInputFormat(org.apache.hudi.hadoop.HoodieParquetInputFormat) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) FullRecordBootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.FullRecordBootstrapModeSelector) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) SaveMode(org.apache.spark.sql.SaveMode) BootstrapModeSelector(org.apache.hudi.client.bootstrap.selector.BootstrapModeSelector) BootstrapIndex(org.apache.hudi.common.bootstrap.index.BootstrapIndex) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) Column(org.apache.spark.sql.Column) RecordReader(org.apache.orc.RecordReader) SQLContext(org.apache.spark.sql.SQLContext) TypeDescription(org.apache.orc.TypeDescription) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) JobConf(org.apache.hadoop.mapred.JobConf) AfterEach(org.junit.jupiter.api.AfterEach) Collectors.toList(java.util.stream.Collectors.toList) UDF1(org.apache.spark.sql.api.java.UDF1) HoodieKey(org.apache.hudi.common.model.HoodieKey) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) HoodieIOException(org.apache.hudi.exception.HoodieIOException) PartitionPathEncodeUtils(org.apache.hudi.common.util.PartitionPathEncodeUtils) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) org.apache.spark.sql.functions.callUDF(org.apache.spark.sql.functions.callUDF) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBootstrapConfig(org.apache.hudi.config.HoodieBootstrapConfig)
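
The first statement of generateInputBatch is a reusable idiom on its own: flattening a List<Pair<A, List<B>>> into a flat list of (A, B) pairs with flatMap. A standalone sketch with the types simplified to strings (sample data illustrative; imports: java.util.Arrays, java.util.List, java.util.stream.Collectors, org.apache.hudi.common.util.collection.Pair):

// Each partition carries several file names; flatten to (partition, file) pairs,
// as generateInputBatch does for (partition, List<HoodieFileStatus>) entries.
List<Pair<String, List<String>>> partitionFiles = Arrays.asList(
    Pair.of("2022/01/01", Arrays.asList("a.orc", "b.orc")),
    Pair.of("2022/01/02", Arrays.asList("c.orc")));
List<Pair<String, String>> flat = partitionFiles.stream()
    .flatMap(p -> p.getValue().stream().map(f -> Pair.of(p.getKey(), f)))
    .collect(Collectors.toList());
// flat == [(2022/01/01,a.orc), (2022/01/01,b.orc), (2022/01/02,c.orc)]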

Example 24 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

The class HoodieBloomIndex, method loadColumnRangesFromMetaIndex.

/**
 * Load the column stats index as BloomIndexFileInfo for all the involved files in the partition.
 *
 * @param partitions  - List of partitions for which column stats need to be loaded
 * @param context     - Engine context
 * @param hoodieTable - Hoodie table
 * @return List of partition and file column range info pairs
 */
protected List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromMetaIndex(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
    // also obtain file ranges, if range pruning is enabled
    context.setJobStatus(this.getClass().getName(), "Load meta index key ranges for file slices");
    final String keyField = hoodieTable.getMetaClient().getTableConfig().getRecordKeyFieldProp();
    return context.flatMap(partitions, partitionName -> {
        // Partition and file name pairs
        List<Pair<String, String>> partitionFileNameList = HoodieIndexUtils.getLatestBaseFilesForPartition(partitionName, hoodieTable).stream()
            .map(baseFile -> Pair.of(partitionName, baseFile.getFileName()))
            .sorted()
            .collect(toList());
        if (partitionFileNameList.isEmpty()) {
            return Stream.empty();
        }
        try {
            Map<Pair<String, String>, HoodieMetadataColumnStats> fileToColumnStatsMap = hoodieTable.getMetadataTable().getColumnStats(partitionFileNameList, keyField);
            List<Pair<String, BloomIndexFileInfo>> result = new ArrayList<>();
            for (Map.Entry<Pair<String, String>, HoodieMetadataColumnStats> entry : fileToColumnStatsMap.entrySet()) {
                result.add(Pair.of(entry.getKey().getLeft(),
                    new BloomIndexFileInfo(FSUtils.getFileId(entry.getKey().getRight()),
                        entry.getValue().getMinValue(), entry.getValue().getMaxValue())));
            }
            return result.stream();
        } catch (MetadataNotFoundException me) {
            throw new HoodieMetadataException("Unable to find column range metadata for partition: " + partitionName, me);
        }
    }, Math.max(partitions.size(), 1));
}
Also used : HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) ArrayList(java.util.ArrayList) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) Map(java.util.Map) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair)
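
Note how Pair doubles as a composite map key here, with (partition, fileName) keying the column stats; Pair's equality covers both elements, which is what makes this safe. A minimal sketch with placeholder stats strings (data illustrative; imports: java.util.HashMap, java.util.Map, org.apache.hudi.common.util.collection.Pair):

// Pair compares by both of its elements, so it works as a composite map key.
Map<Pair<String, String>, String> statsByFile = new HashMap<>();
statsByFile.put(Pair.of("2022/01/01", "f1_1-0-1.parquet"), "min=a, max=m");
statsByFile.put(Pair.of("2022/01/02", "f2_1-0-1.parquet"), "min=n, max=z");
for (Map.Entry<Pair<String, String>, String> entry : statsByFile.entrySet()) {
    System.out.println(entry.getKey().getLeft() + "/" + entry.getKey().getRight() + " -> " + entry.getValue());
}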

Example 25 with Pair

Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.

The class HoodieBloomIndex, method loadColumnRangesFromFiles.

/**
 * Load all involved files as {@code <partition, fileId>} pairs.
 */
List<Pair<String, BloomIndexFileInfo>> loadColumnRangesFromFiles(List<String> partitions, final HoodieEngineContext context, final HoodieTable hoodieTable) {
    // Obtain the latest data files from all the partitions.
    List<Pair<String, String>> partitionPathFileIDList = getLatestBaseFilesForAllPartitions(partitions, context, hoodieTable).stream()
        .map(pair -> Pair.of(pair.getKey(), pair.getValue().getFileId()))
        .collect(toList());
    context.setJobStatus(this.getClass().getName(), "Obtain key ranges for file slices (range pruning=on)");
    return context.map(partitionPathFileIDList, pf -> {
        try {
            HoodieRangeInfoHandle rangeInfoHandle = new HoodieRangeInfoHandle(config, hoodieTable, pf);
            String[] minMaxKeys = rangeInfoHandle.getMinMaxKeys();
            return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue(), minMaxKeys[0], minMaxKeys[1]));
        } catch (MetadataNotFoundException me) {
            LOG.warn("Unable to find range metadata in file :" + pf);
            return Pair.of(pf.getKey(), new BloomIndexFileInfo(pf.getValue()));
        }
    }, Math.max(partitionPathFileIDList.size(), 1));
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieIndexUtils.getLatestBaseFilesForAllPartitions(org.apache.hudi.index.HoodieIndexUtils.getLatestBaseFilesForAllPartitions) Collectors.groupingBy(java.util.stream.Collectors.groupingBy) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieConfig(org.apache.hudi.common.config.HoodieConfig) HoodieRangeInfoHandle(org.apache.hudi.io.HoodieRangeInfoHandle) Map(java.util.Map) Collectors.mapping(java.util.stream.Collectors.mapping) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieData(org.apache.hudi.common.data.HoodieData) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataNotFoundException(org.apache.hudi.exception.MetadataNotFoundException) HoodiePairData(org.apache.hudi.common.data.HoodiePairData) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) WriteStatus(org.apache.hudi.client.WriteStatus) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation) Stream(java.util.stream.Stream) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieIndexUtils(org.apache.hudi.index.HoodieIndexUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
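
The catch branch above is a graceful-degradation pattern: return a fully populated Pair when range metadata is readable, and a Pair whose BloomIndexFileInfo carries only the file id otherwise. Sketched generically below; the in-memory RANGES map stands in for the real metadata lookup and is purely illustrative (imports: java.util.Collections, java.util.Map, org.apache.hudi.common.util.collection.Pair):

// Illustrative range store: only some files carry min/max key metadata.
static final Map<String, String[]> RANGES = Collections.singletonMap("f1", new String[] {"000", "499"});

static Pair<String, String> describeFile(String partition, String fileId) {
    String[] minMax = RANGES.get(fileId);
    if (minMax == null) {
        // No range metadata: fall back to a pair with only the file id,
        // mirroring the catch branch in loadColumnRangesFromFiles.
        return Pair.of(partition, fileId);
    }
    return Pair.of(partition, fileId + " [" + minMax[0] + ", " + minMax[1] + "]");
}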

Aggregations

Classes most often used together with Pair across these examples, with usage counts:

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46
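
Across all five examples only a small slice of the Pair API is exercised; as a quick reference (values illustrative):

Pair<String, Integer> p = Pair.of("partition", 42); // construction
String left = p.getLeft();     // "partition"; p.getKey() returns the same value
Integer right = p.getRight();  // 42; p.getValue() returns the same value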