
Example 41 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

The class SimpleWorkflowDagGenerator, method build:

@Override
public WorkflowDag build() {
    DagNode root = new InsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToInsert(100)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(2)
        .withRecordSize(1000)
        .build());
    DagNode child1 = new InsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToInsert(100)
        .withNumInsertPartitions(1)
        .withNumTimesToRepeat(2)
        .withRecordSize(1000)
        .build());
    root.addChildNode(child1);
    DagNode child1OfChild1 = new UpsertNode(DeltaConfig.Config.newBuilder()
        .withNumRecordsToUpdate(100)
        .withNumUpsertPartitions(2)
        .withNumTimesToRepeat(1)
        .withRecordSize(1000)
        .build());
    // Tests running 2 nodes in parallel
    child1.addChildNode(child1OfChild1);
    List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
    queryAndResult.add(Pair.of("select count(*) from testdb1.table1 group by rider having count(*) < 1", 0));
    DagNode child2OfChild1 = new HiveQueryNode(DeltaConfig.Config.newBuilder()
        .withHiveQueryAndResults(queryAndResult)
        .withHiveLocal(true)
        .build());
    child1.addChildNode(child2OfChild1);
    List<DagNode> rootNodes = new ArrayList<>();
    rootNodes.add(root);
    return new WorkflowDag(rootNodes);
}
Also used : DagNode(org.apache.hudi.integ.testsuite.dag.nodes.DagNode) ArrayList(java.util.ArrayList) HiveQueryNode(org.apache.hudi.integ.testsuite.dag.nodes.HiveQueryNode) InsertNode(org.apache.hudi.integ.testsuite.dag.nodes.InsertNode) UpsertNode(org.apache.hudi.integ.testsuite.dag.nodes.UpsertNode) Pair(org.apache.hudi.common.util.collection.Pair)
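
A minimal, self-contained sketch of the query/expected-row-count Pair pattern that the HiveQueryNode above consumes; the class and variable names here are illustrative, not part of the Hudi test suite.

import java.util.ArrayList;
import java.util.List;

import org.apache.hudi.common.util.collection.Pair;

public class QueryResultPairSketch { // illustrative class name

    public static void main(String[] args) {
        // Each Pair holds a validation query (left) and the row count it is expected to return (right).
        List<Pair<String, Integer>> queryAndResult = new ArrayList<>();
        queryAndResult.add(Pair.of("select count(*) from testdb1.table1 group by rider having count(*) < 1", 0));

        // A consumer reads both sides back with getLeft()/getRight().
        for (Pair<String, Integer> entry : queryAndResult) {
            System.out.println("query: " + entry.getLeft() + ", expected rows: " + entry.getRight());
        }
    }
}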

Example 42 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

The class CompactionCommand, method printAllCompactions:

/**
 * Prints all compaction details.
 */
private String printAllCompactions(HoodieDefaultTimeline timeline,
    Function<HoodieInstant, HoodieCompactionPlan> compactionPlanReader,
    boolean includeExtraMetadata, String sortByField, boolean descending,
    int limit, boolean headerOnly) {
    Stream<HoodieInstant> instantsStream = timeline.getWriteTimeline().getReverseOrderedInstants();
    List<Pair<HoodieInstant, HoodieCompactionPlan>> compactionPlans = instantsStream
        .map(instant -> Pair.of(instant, compactionPlanReader.apply(instant)))
        .filter(pair -> pair.getRight() != null)
        .collect(Collectors.toList());
    Set<String> committedInstants = timeline.getCommitTimeline().filterCompletedInstants().getInstants().map(HoodieInstant::getTimestamp).collect(Collectors.toSet());
    List<Comparable[]> rows = new ArrayList<>();
    for (Pair<HoodieInstant, HoodieCompactionPlan> compactionPlan : compactionPlans) {
        HoodieCompactionPlan plan = compactionPlan.getRight();
        HoodieInstant instant = compactionPlan.getLeft();
        final HoodieInstant.State state;
        if (committedInstants.contains(instant.getTimestamp())) {
            state = HoodieInstant.State.COMPLETED;
        } else {
            state = instant.getState();
        }
        if (includeExtraMetadata) {
            rows.add(new Comparable[] { instant.getTimestamp(), state.toString(), plan.getOperations() == null ? 0 : plan.getOperations().size(), plan.getExtraMetadata().toString() });
        } else {
            rows.add(new Comparable[] { instant.getTimestamp(), state.toString(), plan.getOperations() == null ? 0 : plan.getOperations().size() });
        }
    }
    Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
    TableHeader header = new TableHeader().addTableHeaderField(HoodieTableHeaderFields.HEADER_COMPACTION_INSTANT_TIME).addTableHeaderField(HoodieTableHeaderFields.HEADER_STATE).addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_FILES_TO_BE_COMPACTED);
    if (includeExtraMetadata) {
        header = header.addTableHeaderField(HoodieTableHeaderFields.HEADER_EXTRA_METADATA);
    }
    return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileSystem(org.apache.hadoop.fs.FileSystem) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) ObjectInputStream(java.io.ObjectInputStream) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) OperationResult(org.apache.hudi.table.action.compact.OperationResult) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Utils(org.apache.spark.util.Utils) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) RenameOpResult(org.apache.hudi.client.CompactionAdminClient.RenameOpResult) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) CommandMarker(org.springframework.shell.core.CommandMarker) SparkCommand(org.apache.hudi.cli.commands.SparkMain.SparkCommand) UtilHelpers(org.apache.hudi.utilities.UtilHelpers) TableHeader(org.apache.hudi.cli.TableHeader) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) InputStreamConsumer(org.apache.hudi.cli.utils.InputStreamConsumer) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) List(java.util.List) Stream(java.util.stream.Stream) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) CliOption(org.springframework.shell.core.annotation.CliOption) Function(java.util.function.Function) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) CommitUtil(org.apache.hudi.cli.utils.CommitUtil) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) ValidationOpResult(org.apache.hudi.client.CompactionAdminClient.ValidationOpResult) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CliCommand(org.springframework.shell.core.annotation.CliCommand) SparkLauncher(org.apache.spark.launcher.SparkLauncher) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) SparkUtil(org.apache.hudi.cli.utils.SparkUtil) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
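
Stripped of the CLI specifics, the Pair usage above pairs each instant with whatever the reader function returns and drops the nulls. A minimal sketch with plain String stand-ins (all names illustrative):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class InstantPlanPairSketch { // illustrative class name

    public static void main(String[] args) {
        // Stand-ins for instants and a compaction-plan reader; the real code uses
        // HoodieInstant and HoodieCompactionPlan.
        List<String> instants = Arrays.asList("001", "002", "003");
        Map<String, String> plans = new HashMap<>();
        plans.put("001", "plan-001");
        plans.put("003", "plan-003");
        Function<String, String> planReader = plans::get;

        // Pair each instant with its plan and keep only instants that actually have one.
        List<Pair<String, String>> instantPlanPairs = instants.stream()
            .map(instant -> Pair.of(instant, planReader.apply(instant)))
            .filter(pair -> pair.getRight() != null)
            .collect(Collectors.toList());

        instantPlanPairs.forEach(pair ->
            System.out.println(pair.getLeft() + " -> " + pair.getRight()));
    }
}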

Example 43 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

The class TestInLineFileSystem, method testReadInlineFile:

@Test
public void testReadInlineFile() throws IOException {
    Path outerPath = getRandomOuterFSPath();
    listOfGeneratedPaths.add(outerPath);
    // embed n slices so that we can test N inline seqPaths
    int totalSlices = 5;
    List<Pair<Long, Integer>> startOffsetLengthPairs = new ArrayList<>();
    List<byte[]> expectedByteArrays = new ArrayList<>();
    FSDataOutputStream wrappedOut = outerPath.getFileSystem(conf).create(outerPath, true);
    for (int i = 0; i < totalSlices; i++) {
        // append random bytes
        byte[] randomBytes = new byte[RANDOM.nextInt(1000)];
        RANDOM.nextBytes(randomBytes);
        wrappedOut.write(randomBytes);
        long startOffset = wrappedOut.getPos();
        // add inline content
        byte[] embeddedInlineBytes = new byte[RANDOM.nextInt(1000)];
        RANDOM.nextBytes(embeddedInlineBytes);
        wrappedOut.write(embeddedInlineBytes);
        expectedByteArrays.add(embeddedInlineBytes);
        startOffsetLengthPairs.add(Pair.of(startOffset, embeddedInlineBytes.length));
    }
    // suffix random bytes
    byte[] randomBytes = new byte[RANDOM.nextInt(1000)];
    RANDOM.nextBytes(randomBytes);
    wrappedOut.write(randomBytes);
    wrappedOut.flush();
    wrappedOut.close();
    FileStatus expectedFileStatus = outerPath.getFileSystem(conf).getFileStatus(outerPath);
    for (int i = 0; i < totalSlices; i++) {
        Pair<Long, Integer> startOffsetLengthPair = startOffsetLengthPairs.get(i);
        byte[] expectedBytes = expectedByteArrays.get(i);
        Path inlinePath = FileSystemTestUtils.getPhantomFile(outerPath, startOffsetLengthPair.getLeft(), startOffsetLengthPair.getRight());
        InLineFileSystem inlineFileSystem = (InLineFileSystem) inlinePath.getFileSystem(conf);
        FSDataInputStream fsDataInputStream = inlineFileSystem.open(inlinePath);
        assertTrue(inlineFileSystem.exists(inlinePath));
        verifyFileStatus(expectedFileStatus, inlinePath, startOffsetLengthPair.getRight(), inlineFileSystem.getFileStatus(inlinePath));
        FileStatus[] actualFileStatuses = inlineFileSystem.listStatus(inlinePath);
        assertEquals(1, actualFileStatuses.length);
        verifyFileStatus(expectedFileStatus, inlinePath, startOffsetLengthPair.getRight(), actualFileStatuses[0]);
        byte[] actualBytes = new byte[expectedBytes.length];
        fsDataInputStream.readFully(0, actualBytes);
        assertArrayEquals(expectedBytes, actualBytes);
        fsDataInputStream.close();
        assertEquals(InLineFileSystem.SCHEME, inlineFileSystem.getScheme());
        assertEquals(URI.create(InLineFileSystem.SCHEME), inlineFileSystem.getUri());
    }
}
Also used : FileSystemTestUtils.getRandomOuterFSPath(org.apache.hudi.common.testutils.FileSystemTestUtils.getRandomOuterFSPath) Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Pair(org.apache.hudi.common.util.collection.Pair) Test(org.junit.jupiter.api.Test)
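
The (startOffset, length) bookkeeping above can be sketched without any file system at all; only Pair.of, getLeft and getRight below come from the example, everything else is an illustrative stand-in.

import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;

import org.apache.hudi.common.util.collection.Pair;

public class OffsetLengthPairSketch { // illustrative class name

    public static void main(String[] args) throws Exception {
        Random random = new Random();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        List<Pair<Long, Integer>> startOffsetLengthPairs = new ArrayList<>();

        // Write a few random slices and remember where each one starts and how long it is.
        for (int i = 0; i < 3; i++) {
            long startOffset = out.size();
            byte[] slice = new byte[random.nextInt(1000) + 1];
            random.nextBytes(slice);
            out.write(slice);
            startOffsetLengthPairs.add(Pair.of(startOffset, slice.length));
        }

        // Read each slice back using the recorded offset (left) and length (right).
        byte[] all = out.toByteArray();
        for (Pair<Long, Integer> pair : startOffsetLengthPairs) {
            int from = pair.getLeft().intValue();
            byte[] slice = Arrays.copyOfRange(all, from, from + pair.getRight());
            System.out.println("slice of " + slice.length + " bytes at offset " + pair.getLeft());
        }
    }
}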

Example 44 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

The class HoodieTableMetadataUtil, method convertFilesToBloomFilterRecords:

/**
 * Convert added and deleted files metadata to bloom filter index records.
 */
public static HoodieData<HoodieRecord> convertFilesToBloomFilterRecords(HoodieEngineContext engineContext,
    Map<String, List<String>> partitionToDeletedFiles, Map<String, Map<String, Long>> partitionToAppendedFiles,
    MetadataRecordsGenerationParams recordsGenerationParams, String instantTime) {
    HoodieData<HoodieRecord> allRecordsRDD = engineContext.emptyHoodieData();
    List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())).collect(Collectors.toList());
    int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
    HoodieData<Pair<String, List<String>>> partitionToDeletedFilesRDD = engineContext.parallelize(partitionToDeletedFilesList, parallelism);
    HoodieData<HoodieRecord> deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> {
        final String partitionName = partitionToDeletedFilesPair.getLeft();
        final List<String> deletedFileList = partitionToDeletedFilesPair.getRight();
        return deletedFileList.stream().flatMap(deletedFile -> {
            if (!FSUtils.isBaseFile(new Path(deletedFile))) {
                return Stream.empty();
            }
            final String partition = getPartition(partitionName);
            return Stream.<HoodieRecord>of(HoodieMetadataPayload.createBloomFilterMetadataRecord(partition, deletedFile, instantTime, StringUtils.EMPTY_STRING, ByteBuffer.allocate(0), true));
        }).iterator();
    });
    allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD);
    List<Pair<String, Map<String, Long>>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet().stream().map(entry -> Pair.of(entry.getKey(), entry.getValue())).collect(Collectors.toList());
    parallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getBloomIndexParallelism()), 1);
    HoodieData<Pair<String, Map<String, Long>>> partitionToAppendedFilesRDD = engineContext.parallelize(partitionToAppendedFilesList, parallelism);
    HoodieData<HoodieRecord> appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> {
        final String partitionName = partitionToAppendedFilesPair.getLeft();
        final Map<String, Long> appendedFileMap = partitionToAppendedFilesPair.getRight();
        final String partition = getPartition(partitionName);
        return appendedFileMap.entrySet().stream().flatMap(appendedFileLengthPairEntry -> {
            final String appendedFile = appendedFileLengthPairEntry.getKey();
            if (!FSUtils.isBaseFile(new Path(appendedFile))) {
                return Stream.empty();
            }
            final String pathWithPartition = partitionName + "/" + appendedFile;
            final Path appendedFilePath = new Path(recordsGenerationParams.getDataMetaClient().getBasePath(), pathWithPartition);
            try (HoodieFileReader<IndexedRecord> fileReader = HoodieFileReaderFactory.getFileReader(recordsGenerationParams.getDataMetaClient().getHadoopConf(), appendedFilePath)) {
                final BloomFilter fileBloomFilter = fileReader.readBloomFilter();
                if (fileBloomFilter == null) {
                    LOG.error("Failed to read bloom filter for " + appendedFilePath);
                    return Stream.empty();
                }
                ByteBuffer bloomByteBuffer = ByteBuffer.wrap(fileBloomFilter.serializeToString().getBytes());
                HoodieRecord record = HoodieMetadataPayload.createBloomFilterMetadataRecord(partition, appendedFile, instantTime, recordsGenerationParams.getBloomFilterType(), bloomByteBuffer, false);
                return Stream.of(record);
            } catch (IOException e) {
                LOG.error("Failed to get bloom filter for file: " + appendedFilePath);
            }
            return Stream.empty();
        }).iterator();
    });
    allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD);
    return allRecordsRDD;
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) ByteBuffer(java.nio.ByteBuffer) MAX(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MAX) Logger(org.apache.log4j.Logger) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Collectors(java.util.stream.Collectors) TOTAL_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_SIZE) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) VALUE_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.VALUE_COUNT) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TOTAL_UNCOMPRESSED_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_UNCOMPRESSED_SIZE) EMPTY_PARTITION_NAME(org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) NULL_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.NULL_COUNT) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) LinkedList(java.util.LinkedList) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) GenericRecord(org.apache.avro.generic.GenericRecord) MIN(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MIN) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) NON_PARTITIONED_NAME(org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) 
COLUMN_RANGE_MERGE_FUNCTION(org.apache.hudi.common.model.HoodieColumnRangeMetadata.COLUMN_RANGE_MERGE_FUNCTION) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
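
The conversion of a Map into a List of Pairs, plus the parallelism clamp, is the reusable part of the method above. A minimal sketch with plain types (names and the configured limit are illustrative):

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class PartitionFilePairSketch { // illustrative class name

    public static void main(String[] args) {
        Map<String, List<String>> partitionToDeletedFiles = new HashMap<>();
        partitionToDeletedFiles.put("2022/01/01", Arrays.asList("f1.parquet", "f2.parquet"));
        partitionToDeletedFiles.put("2022/01/02", Arrays.asList("f3.parquet"));

        // Flatten the map into a list of (partition, files) pairs so it can be parallelized.
        List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream()
            .map(e -> Pair.of(e.getKey(), e.getValue()))
            .collect(Collectors.toList());

        // Clamp the parallelism to [1, configured limit], as in the method above.
        int configuredParallelism = 200; // illustrative value
        int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), configuredParallelism), 1);

        System.out.println("pairs: " + partitionToDeletedFilesList.size() + ", parallelism: " + parallelism);
    }
}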

Example 45 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

The class HoodieTableMetadataUtil, method convertFilesToColumnStatsRecords:

/**
 * Convert added and deleted action metadata to column stats index records.
 */
public static HoodieData<HoodieRecord> convertFilesToColumnStatsRecords(HoodieEngineContext engineContext,
    Map<String, List<String>> partitionToDeletedFiles, Map<String, Map<String, Long>> partitionToAppendedFiles,
    MetadataRecordsGenerationParams recordsGenerationParams) {
    HoodieData<HoodieRecord> allRecordsRDD = engineContext.emptyHoodieData();
    final List<String> columnsToIndex = getColumnsToIndex(recordsGenerationParams.getDataMetaClient(), recordsGenerationParams.isAllColumnStatsIndexEnabled());
    final List<Pair<String, List<String>>> partitionToDeletedFilesList = partitionToDeletedFiles.entrySet().stream().map(e -> Pair.of(e.getKey(), e.getValue())).collect(Collectors.toList());
    int parallelism = Math.max(Math.min(partitionToDeletedFilesList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1);
    final HoodieData<Pair<String, List<String>>> partitionToDeletedFilesRDD = engineContext.parallelize(partitionToDeletedFilesList, parallelism);
    HoodieData<HoodieRecord> deletedFilesRecordsRDD = partitionToDeletedFilesRDD.flatMap(partitionToDeletedFilesPair -> {
        final String partitionName = partitionToDeletedFilesPair.getLeft();
        final String partition = getPartition(partitionName);
        final List<String> deletedFileList = partitionToDeletedFilesPair.getRight();
        return deletedFileList.stream().flatMap(deletedFile -> {
            final String filePathWithPartition = partitionName + "/" + deletedFile;
            return getColumnStats(partition, filePathWithPartition, recordsGenerationParams.getDataMetaClient(), columnsToIndex, true);
        }).iterator();
    });
    allRecordsRDD = allRecordsRDD.union(deletedFilesRecordsRDD);
    final List<Pair<String, Map<String, Long>>> partitionToAppendedFilesList = partitionToAppendedFiles.entrySet().stream().map(entry -> Pair.of(entry.getKey(), entry.getValue())).collect(Collectors.toList());
    parallelism = Math.max(Math.min(partitionToAppendedFilesList.size(), recordsGenerationParams.getColumnStatsIndexParallelism()), 1);
    final HoodieData<Pair<String, Map<String, Long>>> partitionToAppendedFilesRDD = engineContext.parallelize(partitionToAppendedFilesList, parallelism);
    HoodieData<HoodieRecord> appendedFilesRecordsRDD = partitionToAppendedFilesRDD.flatMap(partitionToAppendedFilesPair -> {
        final String partitionName = partitionToAppendedFilesPair.getLeft();
        final String partition = getPartition(partitionName);
        final Map<String, Long> appendedFileMap = partitionToAppendedFilesPair.getRight();
        return appendedFileMap.entrySet().stream().flatMap(appendedFileNameLengthEntry -> {
            if (!FSUtils.isBaseFile(new Path(appendedFileNameLengthEntry.getKey())) || !appendedFileNameLengthEntry.getKey().endsWith(HoodieFileFormat.PARQUET.getFileExtension())) {
                return Stream.empty();
            }
            final String filePathWithPartition = partitionName + "/" + appendedFileNameLengthEntry.getKey();
            return getColumnStats(partition, filePathWithPartition, recordsGenerationParams.getDataMetaClient(), columnsToIndex, false);
        }).iterator();
    });
    allRecordsRDD = allRecordsRDD.union(appendedFilesRecordsRDD);
    return allRecordsRDD;
}
Also used : HoodieColumnRangeMetadata(org.apache.hudi.common.model.HoodieColumnRangeMetadata) Arrays(java.util.Arrays) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) BiFunction(java.util.function.BiFunction) HoodieException(org.apache.hudi.exception.HoodieException) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) ByteBuffer(java.nio.ByteBuffer) MAX(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MAX) Logger(org.apache.log4j.Logger) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) Path(org.apache.hadoop.fs.Path) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Collectors(java.util.stream.Collectors) TOTAL_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_SIZE) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) VALUE_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.VALUE_COUNT) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieMetadataColumnStats(org.apache.hudi.avro.model.HoodieMetadataColumnStats) HoodieRestoreMetadata(org.apache.hudi.avro.model.HoodieRestoreMetadata) TOTAL_UNCOMPRESSED_SIZE(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.TOTAL_UNCOMPRESSED_SIZE) EMPTY_PARTITION_NAME(org.apache.hudi.metadata.HoodieTableMetadata.EMPTY_PARTITION_NAME) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) NULL_COUNT(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.NULL_COUNT) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) LinkedList(java.util.LinkedList) Nonnull(javax.annotation.Nonnull) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) IndexedRecord(org.apache.avro.generic.IndexedRecord) BloomFilter(org.apache.hudi.common.bloom.BloomFilter) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieAvroUtils.getNestedFieldValAsString(org.apache.hudi.avro.HoodieAvroUtils.getNestedFieldValAsString) GenericRecord(org.apache.avro.generic.GenericRecord) MIN(org.apache.hudi.common.model.HoodieColumnRangeMetadata.Stats.MIN) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) ParquetUtils(org.apache.hudi.common.util.ParquetUtils) HoodieMetadataException(org.apache.hudi.exception.HoodieMetadataException) NON_PARTITIONED_NAME(org.apache.hudi.metadata.HoodieTableMetadata.NON_PARTITIONED_NAME) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) 
COLUMN_RANGE_MERGE_FUNCTION(org.apache.hudi.common.model.HoodieColumnRangeMetadata.COLUMN_RANGE_MERGE_FUNCTION) HoodieDeltaWriteStat(org.apache.hudi.common.model.HoodieDeltaWriteStat) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
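
The column-stats variant follows the same shape; the flatMap over (partition, files) Pairs can be sketched on its own as follows (all names illustrative, and the parquet-extension filter merely stands in for FSUtils.isBaseFile):

import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.util.collection.Pair;

public class PartitionFileFlatMapSketch { // illustrative class name

    public static void main(String[] args) {
        List<Pair<String, List<String>>> partitionToFiles = Arrays.asList(
            Pair.of("2022/01/01", Arrays.asList("a.parquet", "b.log")),
            Pair.of("2022/01/02", Arrays.asList("c.parquet")));

        // For every pair, expand the right side (files) and prefix each file with the
        // left side (partition), keeping only parquet base files.
        List<String> parquetPathsWithPartition = partitionToFiles.stream()
            .flatMap(pair -> pair.getRight().stream()
                .filter(file -> file.endsWith(".parquet"))
                .map(file -> pair.getLeft() + "/" + file))
            .collect(Collectors.toList());

        parquetPathsWithPartition.forEach(System.out::println);
    }
}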

Aggregations

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46