Search in sources :

Example 31 with HoodieCompactionPlan

use of org.apache.hudi.avro.model.HoodieCompactionPlan in project hudi by apache.

the class CompactionUtils method getAllPendingCompactionOperations.

/**
 * Get all PartitionPath + file-ids with pending Compaction operations and their target compaction instant time.
 *
 * @param metaClient Hoodie Table Meta Client
 */
public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> getAllPendingCompactionOperations(HoodieTableMetaClient metaClient) {
    List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactionPlanWithInstants = getAllPendingCompactionPlans(metaClient);
    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToPendingCompactionWithInstantMap = new HashMap<>();
    pendingCompactionPlanWithInstants.stream().flatMap(instantPlanPair -> getPendingCompactionOperations(instantPlanPair.getKey(), instantPlanPair.getValue())).forEach(pair -> {
        // on some DFSs.
        if (fgIdToPendingCompactionWithInstantMap.containsKey(pair.getKey())) {
            HoodieCompactionOperation operation = pair.getValue().getValue();
            HoodieCompactionOperation anotherOperation = fgIdToPendingCompactionWithInstantMap.get(pair.getKey()).getValue();
            if (!operation.equals(anotherOperation)) {
                String msg = "Hudi File Id (" + pair.getKey() + ") has more than 1 pending compactions. Instants: " + pair.getValue() + ", " + fgIdToPendingCompactionWithInstantMap.get(pair.getKey());
                throw new IllegalStateException(msg);
            }
        }
        fgIdToPendingCompactionWithInstantMap.put(pair.getKey(), pair.getValue());
    });
    return fgIdToPendingCompactionWithInstantMap;
}
Also used : CompactionPlanMigrator(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionPlanMigrator) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) CompactionV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionV1MigrationHandler) Function(java.util.function.Function) Logger(org.apache.log4j.Logger) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) CompactionV2MigrationHandler(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionV2MigrationHandler) Map(java.util.Map) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) List(java.util.List) Stream(java.util.stream.Stream) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HashMap(java.util.HashMap) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) Pair(org.apache.hudi.common.util.collection.Pair)

Example 32 with HoodieCompactionPlan

use of org.apache.hudi.avro.model.HoodieCompactionPlan in project hudi by apache.

the class TestCleaner method testInsertAndCleanByVersions.

/**
 * Test Helper for Cleaning by versions logic from HoodieWriteClient API perspective.
 *
 * @param insertFn Insert API to be tested
 * @param upsertFn Upsert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 *        record generation to also tag the regards (de-dupe is implicit as we use unique record-gen APIs)
 * @throws Exception in case of errors
 */
private void testInsertAndCleanByVersions(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> upsertFn, boolean isPreppedAPI) throws Exception {
    // keep upto 2 versions for each file
    int maxVersions = 2;
    HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS).retainFileVersions(maxVersions).build()).withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
        final Function2<List<HoodieRecord>, String, Integer> recordUpsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateUniqueUpdates);
        insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_FILE_VERSIONS);
        Map<HoodieFileGroupId, FileSlice> compactionFileIdToLatestFileSlice = new HashMap<>();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            TableFileSystemView fsView = table.getFileSystemView();
            Option<Boolean> added = Option.fromJavaOptional(fsView.getAllFileGroups(partitionPath).findFirst().map(fg -> {
                fg.getLatestFileSlice().map(fs -> compactionFileIdToLatestFileSlice.put(fg.getFileGroupId(), fs));
                return true;
            }));
            if (added.isPresent()) {
                // Select only one file-group for compaction
                break;
            }
        }
        // Create workload with selected file-slices
        List<Pair<String, FileSlice>> partitionFileSlicePairs = compactionFileIdToLatestFileSlice.entrySet().stream().map(e -> Pair.of(e.getKey().getPartitionPath(), e.getValue())).collect(Collectors.toList());
        HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicePairs, Option.empty(), Option.empty());
        List<String> instantTimes = makeIncrementalCommitTimes(9, 1, 10);
        String compactionTime = instantTimes.get(0);
        table.getActiveTimeline().saveToCompactionRequested(new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionTime), TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
        instantTimes = instantTimes.subList(1, instantTimes.size());
        // remaining.
        for (String newInstantTime : instantTimes) {
            try {
                client.startCommitWithTime(newInstantTime);
                List<HoodieRecord> records = recordUpsertGenWrappedFunction.apply(newInstantTime, 100);
                List<WriteStatus> statuses = upsertFn.apply(client, jsc.parallelize(records, 1), newInstantTime).collect();
                // Verify there are no errors
                assertNoWriteErrors(statuses);
                metaClient = HoodieTableMetaClient.reload(metaClient);
                table = HoodieSparkTable.create(getConfig(), context, metaClient);
                HoodieTimeline timeline = table.getMetaClient().getCommitsTimeline();
                TableFileSystemView fsView = table.getFileSystemView();
                // Need to ensure the following
                for (String partitionPath : dataGen.getPartitionPaths()) {
                    // compute all the versions of all files, from time 0
                    HashMap<String, TreeSet<String>> fileIdToVersions = new HashMap<>();
                    for (HoodieInstant entry : timeline.getInstants().collect(Collectors.toList())) {
                        HoodieCommitMetadata commitMetadata = HoodieCommitMetadata.fromBytes(timeline.getInstantDetails(entry).get(), HoodieCommitMetadata.class);
                        for (HoodieWriteStat wstat : commitMetadata.getWriteStats(partitionPath)) {
                            if (!fileIdToVersions.containsKey(wstat.getFileId())) {
                                fileIdToVersions.put(wstat.getFileId(), new TreeSet<>());
                            }
                            fileIdToVersions.get(wstat.getFileId()).add(FSUtils.getCommitTime(new Path(wstat.getPath()).getName()));
                        }
                    }
                    List<HoodieFileGroup> fileGroups = fsView.getAllFileGroups(partitionPath).collect(Collectors.toList());
                    for (HoodieFileGroup fileGroup : fileGroups) {
                        if (compactionFileIdToLatestFileSlice.containsKey(fileGroup.getFileGroupId())) {
                            // Ensure latest file-slice selected for compaction is retained
                            Option<HoodieBaseFile> dataFileForCompactionPresent = Option.fromJavaOptional(fileGroup.getAllBaseFiles().filter(df -> {
                                return compactionFileIdToLatestFileSlice.get(fileGroup.getFileGroupId()).getBaseInstantTime().equals(df.getCommitTime());
                            }).findAny());
                            assertTrue(dataFileForCompactionPresent.isPresent(), "Data File selected for compaction is retained");
                        } else {
                            // file has no more than max versions
                            String fileId = fileGroup.getFileGroupId().getFileId();
                            List<HoodieBaseFile> dataFiles = fileGroup.getAllBaseFiles().collect(Collectors.toList());
                            assertTrue(dataFiles.size() <= maxVersions, "fileId " + fileId + " has more than " + maxVersions + " versions");
                            // Each file, has the latest N versions (i.e cleaning gets rid of older versions)
                            List<String> commitedVersions = new ArrayList<>(fileIdToVersions.get(fileId));
                            for (int i = 0; i < dataFiles.size(); i++) {
                                assertEquals((dataFiles.get(i)).getCommitTime(), commitedVersions.get(commitedVersions.size() - 1 - i), "File " + fileId + " does not have latest versions on commits" + commitedVersions);
                            }
                        }
                    }
                }
            } catch (IOException ioe) {
                throw new RuntimeException(ioe);
            }
        }
    }
}
Also used : Arrays(java.util.Arrays) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) CleanPlanner(org.apache.hudi.table.action.clean.CleanPlanner) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Awaitility.await(org.awaitility.Awaitility.await) HoodieCleanerPlan(org.apache.hudi.avro.model.HoodieCleanerPlan) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) StandardCharsets(java.nio.charset.StandardCharsets) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Assertions.assertNotNull(org.junit.jupiter.api.Assertions.assertNotNull) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) CleanPlanMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanMigrator) Assertions.assertNull(org.junit.jupiter.api.Assertions.assertNull) Option(org.apache.hudi.common.util.Option) CleanPlanV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.clean.CleanPlanV1MigrationHandler) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) StringUtils(org.apache.hudi.common.util.StringUtils) CleanerUtils(org.apache.hudi.common.util.CleanerUtils) HoodieTestCommitGenerator.getBaseFilename(org.apache.hudi.HoodieTestCommitGenerator.getBaseFilename) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) ConsistencyGuardConfig(org.apache.hudi.common.fs.ConsistencyGuardConfig) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) HoodieClusteringGroup(org.apache.hudi.avro.model.HoodieClusteringGroup) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata) Paths(java.nio.file.Paths) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieCleanPartitionMetadata(org.apache.hudi.avro.model.HoodieCleanPartitionMetadata) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) HoodieTestTable.makeIncrementalCommitTimes(org.apache.hudi.common.testutils.HoodieTestTable.makeIncrementalCommitTimes) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) MethodSource(org.junit.jupiter.params.provider.MethodSource) IOType(org.apache.hudi.common.model.IOType) Predicate(java.util.function.Predicate) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) Tuple3(scala.Tuple3) HoodieClusteringStrategy(org.apache.hudi.avro.model.HoodieClusteringStrategy) Test(org.junit.jupiter.api.Test) List(java.util.List) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) CleanMetadataMigrator(org.apache.hudi.common.table.timeline.versioning.clean.CleanMetadataMigrator) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) DEFAULT_PARTITION_PATHS(org.apache.hudi.common.testutils.HoodieTestUtils.DEFAULT_PARTITION_PATHS) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) FileSlice(org.apache.hudi.common.model.FileSlice) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) TestBootstrapIndex(org.apache.hudi.common.bootstrap.TestBootstrapIndex) HoodieActionInstant(org.apache.hudi.avro.model.HoodieActionInstant) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) TimeUnit(java.util.concurrent.TimeUnit) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieTestTable.makeNewCommitTime(org.apache.hudi.common.testutils.HoodieTestTable.makeNewCommitTime) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkHoodieIndexFactory(org.apache.hudi.index.SparkHoodieIndexFactory) HoodieSliceInfo(org.apache.hudi.avro.model.HoodieSliceInfo) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HashMap(java.util.HashMap) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup)

Example 33 with HoodieCompactionPlan

use of org.apache.hudi.avro.model.HoodieCompactionPlan in project hudi by apache.

the class TestHoodieCompactor method testCompactionEmpty.

@Test
public void testCompactionEmpty() throws Exception {
    HoodieWriteConfig config = getConfig();
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = writeClient.startCommit();
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        String compactionInstantTime = HoodieActiveTimeline.createNewInstantTime();
        Option<HoodieCompactionPlan> plan = table.scheduleCompaction(context, compactionInstantTime, Option.empty());
        assertFalse(plan.isPresent(), "If there is nothing to compact, result will be empty");
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Test(org.junit.jupiter.api.Test)

Example 34 with HoodieCompactionPlan

use of org.apache.hudi.avro.model.HoodieCompactionPlan in project hudi by apache.

the class TestHoodieTableFileSystemView method testPendingCompactionWithDuplicateFileIdsAcrossPartitions.

@Test
public void testPendingCompactionWithDuplicateFileIdsAcrossPartitions() throws Exception {
    // Put some files in the partition
    String partitionPath1 = "2016/05/01";
    String partitionPath2 = "2016/05/02";
    String partitionPath3 = "2016/05/03";
    String fullPartitionPath1 = basePath + "/" + partitionPath1 + "/";
    new File(fullPartitionPath1).mkdirs();
    String fullPartitionPath2 = basePath + "/" + partitionPath2 + "/";
    new File(fullPartitionPath2).mkdirs();
    String fullPartitionPath3 = basePath + "/" + partitionPath3 + "/";
    new File(fullPartitionPath3).mkdirs();
    String instantTime1 = "1";
    String deltaInstantTime1 = "2";
    String deltaInstantTime2 = "3";
    String fileId = UUID.randomUUID().toString();
    String dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId);
    new File(fullPartitionPath1 + dataFileName).createNewFile();
    String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN);
    new File(fullPartitionPath1 + fileName1).createNewFile();
    new File(fullPartitionPath2 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile();
    new File(fullPartitionPath2 + fileName1).createNewFile();
    new File(fullPartitionPath3 + FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId)).createNewFile();
    new File(fullPartitionPath3 + fileName1).createNewFile();
    HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
    HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1);
    HoodieInstant deltaInstant2 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime1);
    HoodieInstant deltaInstant3 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime2);
    saveAsComplete(commitTimeline, instant1, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant2, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant3, Option.empty());
    // Now we list all partitions
    FileStatus[] statuses = metaClient.getFs().listStatus(new Path[] { new Path(fullPartitionPath1), new Path(fullPartitionPath2), new Path(fullPartitionPath3) });
    assertEquals(6, statuses.length);
    refreshFsView();
    Arrays.asList(partitionPath1, partitionPath2, partitionPath3).forEach(p -> fsView.getAllFileGroups(p).count());
    List<HoodieFileGroup> groups = Stream.of(partitionPath1, partitionPath2, partitionPath3).flatMap(p -> fsView.getAllFileGroups(p)).collect(Collectors.toList());
    assertEquals(3, groups.size(), "Expected number of file-groups");
    assertEquals(3, groups.stream().map(HoodieFileGroup::getPartitionPath).collect(Collectors.toSet()).size(), "Partitions must be different for file-groups");
    Set<String> fileIds = groups.stream().map(HoodieFileGroup::getFileGroupId).map(HoodieFileGroupId::getFileId).collect(Collectors.toSet());
    assertEquals(1, fileIds.size(), "File Id must be same");
    assertTrue(fileIds.contains(fileId), "Expected FileId");
    // Setup Pending compaction for all of these fileIds.
    List<Pair<String, FileSlice>> partitionFileSlicesPairs = new ArrayList<>();
    List<FileSlice> fileSlices = rtView.getLatestFileSlices(partitionPath1).collect(Collectors.toList());
    partitionFileSlicesPairs.add(Pair.of(partitionPath1, fileSlices.get(0)));
    fileSlices = rtView.getLatestFileSlices(partitionPath2).collect(Collectors.toList());
    partitionFileSlicesPairs.add(Pair.of(partitionPath2, fileSlices.get(0)));
    fileSlices = rtView.getLatestFileSlices(partitionPath3).collect(Collectors.toList());
    partitionFileSlicesPairs.add(Pair.of(partitionPath3, fileSlices.get(0)));
    String compactionRequestedTime = "2";
    String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId);
    HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty());
    // Create a Data-file for some of the partitions but this should be skipped by view
    new File(basePath + "/" + partitionPath1 + "/" + compactDataFileName).createNewFile();
    new File(basePath + "/" + partitionPath2 + "/" + compactDataFileName).createNewFile();
    HoodieInstant compactionInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
    HoodieInstant requested = HoodieTimeline.getCompactionRequestedInstant(compactionInstant.getTimestamp());
    metaClient.getActiveTimeline().saveToCompactionRequested(requested, TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
    metaClient.getActiveTimeline().transitionCompactionRequestedToInflight(requested);
    // Fake delta-ingestion after compaction-requested
    String deltaInstantTime4 = "4";
    String deltaInstantTime5 = "6";
    String fileName3 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 0, TEST_WRITE_TOKEN);
    String fileName4 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 1, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath1 + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath1 + "/" + fileName4).createNewFile();
    new File(basePath + "/" + partitionPath2 + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath2 + "/" + fileName4).createNewFile();
    new File(basePath + "/" + partitionPath3 + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath3 + "/" + fileName4).createNewFile();
    HoodieInstant deltaInstant4 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime4);
    HoodieInstant deltaInstant5 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime5);
    saveAsComplete(commitTimeline, deltaInstant4, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant5, Option.empty());
    refreshFsView();
    // Test Data Files
    List<HoodieBaseFile> dataFiles = roView.getAllBaseFiles(partitionPath1).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "One data-file is expected as there is only one file-group");
    assertEquals("1", dataFiles.get(0).getCommitTime(), "Expect only valid commit");
    dataFiles = roView.getAllBaseFiles(partitionPath2).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "One data-file is expected as there is only one file-group");
    assertEquals("1", dataFiles.get(0).getCommitTime(), "Expect only valid commit");
    // Merge API Tests
    Arrays.asList(partitionPath1, partitionPath2, partitionPath3).forEach(partitionPath -> {
        List<FileSlice> fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
        assertEquals(1, fileSliceList.size(), "Expect file-slice to be merged");
        FileSlice fileSlice = fileSliceList.get(0);
        assertEquals(fileId, fileSlice.getFileId());
        assertEquals(dataFileName, fileSlice.getBaseFile().get().getFileName(), "Data file must be present");
        assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant of penultimate file-slice must be base instant");
        List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
        assertEquals(3, logFiles.size(), "Log files must include those after compaction request");
        assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
        assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
        assertEquals(fileName1, logFiles.get(2).getFileName(), "Log File Order check");
        fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true).collect(Collectors.toList());
        assertEquals(1, fileSliceList.size(), "Expect only one file-id");
        fileSlice = fileSliceList.get(0);
        assertEquals(fileId, fileSlice.getFileId());
        assertFalse(fileSlice.getBaseFile().isPresent(), "No data-file expected in latest file-slice");
        assertEquals(compactionRequestedTime, fileSlice.getBaseInstantTime(), "Compaction requested instant must be base instant");
        logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
        assertEquals(2, logFiles.size(), "Log files must include only those after compaction request");
        assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
        assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
        // Check getLatestFileSlicesBeforeOrOn excluding fileIds in pending compaction
        fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, false).collect(Collectors.toList());
        assertEquals(0, fileSliceList.size(), "Expect empty list as file-id is in pending compaction");
    });
    assertEquals(3, fsView.getPendingCompactionOperations().count());
    Set<String> partitionsInCompaction = fsView.getPendingCompactionOperations().map(Pair::getValue).map(CompactionOperation::getPartitionPath).collect(Collectors.toSet());
    assertEquals(3, partitionsInCompaction.size());
    assertTrue(partitionsInCompaction.contains(partitionPath1));
    assertTrue(partitionsInCompaction.contains(partitionPath2));
    assertTrue(partitionsInCompaction.contains(partitionPath3));
    Set<String> fileIdsInCompaction = fsView.getPendingCompactionOperations().map(Pair::getValue).map(CompactionOperation::getFileId).collect(Collectors.toSet());
    assertEquals(1, fileIdsInCompaction.size());
    assertTrue(fileIdsInCompaction.contains(fileId));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) HoodiePath(org.apache.hudi.avro.model.HoodiePath) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) Date(java.util.Date) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) BaseFile(org.apache.hudi.common.model.BaseFile) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) MethodSource(org.junit.jupiter.params.provider.MethodSource) HoodiePath(org.apache.hudi.avro.model.HoodiePath) HoodieFSPermission(org.apache.hudi.avro.model.HoodieFSPermission) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSlice(org.apache.hudi.common.model.FileSlice) IndexWriter(org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) CommitUtils(org.apache.hudi.common.util.CommitUtils) FsAction(org.apache.hadoop.fs.permission.FsAction) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 35 with HoodieCompactionPlan

use of org.apache.hudi.avro.model.HoodieCompactionPlan in project hudi by apache.

the class TestHoodieTableFileSystemView method testViewForFileSlicesWithAsyncCompaction.

/**
 * Helper method to test Views in the presence of concurrent compaction.
 *
 * @param skipCreatingDataFile if set, first File Slice will not have data-file set. This would simulate inserts going
 *        directly to log files
 * @param isCompactionInFlight if set, compaction was inflight (running) when view was tested first time, otherwise
 *        compaction was in requested state
 * @param expTotalFileSlices Total number of file-slices across file-groups in the partition path
 * @param expTotalDataFiles Total number of data-files across file-groups in the partition path
 * @param includeInvalidAndInflight Whether view includes inflight and invalid file-groups.
 * @param testBootstrap enable Bootstrap and test
 * @throws Exception -
 */
protected void testViewForFileSlicesWithAsyncCompaction(boolean skipCreatingDataFile, boolean isCompactionInFlight, int expTotalFileSlices, int expTotalDataFiles, boolean includeInvalidAndInflight, boolean testBootstrap) throws Exception {
    if (testBootstrap) {
        metaClient = HoodieTestUtils.init(tempDir.toAbsolutePath().toString(), getTableType(), BOOTSTRAP_SOURCE_PATH, testBootstrap);
    }
    String partitionPath = "2016/05/01";
    new File(basePath + "/" + partitionPath).mkdirs();
    String fileId = UUID.randomUUID().toString();
    String srcName = "part_0000" + metaClient.getTableConfig().getBaseFileFormat().getFileExtension();
    HoodieFileStatus srcFileStatus = HoodieFileStatus.newBuilder().setPath(HoodiePath.newBuilder().setUri(BOOTSTRAP_SOURCE_PATH + partitionPath + "/" + srcName).build()).setLength(256 * 1024 * 1024L).setAccessTime(new Date().getTime()).setModificationTime(new Date().getTime() + 99999).setBlockReplication(2).setOwner("hudi").setGroup("hudi").setBlockSize(128 * 1024 * 1024L).setPermission(HoodieFSPermission.newBuilder().setUserAction(FsAction.ALL.name()).setGroupAction(FsAction.READ.name()).setOtherAction(FsAction.NONE.name()).setStickyBit(true).build()).build();
    // if skipCreatingDataFile, then instantTime1 below acts like delta-commit, otherwise it is base-commit
    String instantTime1 = testBootstrap && !skipCreatingDataFile ? HoodieTimeline.METADATA_BOOTSTRAP_INSTANT_TS : "1";
    String deltaInstantTime1 = "2";
    String deltaInstantTime2 = "3";
    String dataFileName = null;
    if (!skipCreatingDataFile) {
        dataFileName = FSUtils.makeDataFileName(instantTime1, TEST_WRITE_TOKEN, fileId);
        new File(basePath + "/" + partitionPath + "/" + dataFileName).createNewFile();
    }
    String fileName1 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 0, TEST_WRITE_TOKEN);
    String fileName2 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, instantTime1, 1, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + fileName1).createNewFile();
    new File(basePath + "/" + partitionPath + "/" + fileName2).createNewFile();
    HoodieActiveTimeline commitTimeline = metaClient.getActiveTimeline();
    HoodieInstant instant1 = new HoodieInstant(true, HoodieTimeline.COMMIT_ACTION, instantTime1);
    HoodieInstant deltaInstant2 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime1);
    HoodieInstant deltaInstant3 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime2);
    if (testBootstrap && !skipCreatingDataFile) {
        try (IndexWriter writer = new HFileBootstrapIndex(metaClient).createWriter(BOOTSTRAP_SOURCE_PATH)) {
            writer.begin();
            BootstrapFileMapping mapping = new BootstrapFileMapping(BOOTSTRAP_SOURCE_PATH, partitionPath, partitionPath, srcFileStatus, fileId);
            List<BootstrapFileMapping> b = new ArrayList<>();
            b.add(mapping);
            writer.appendNextPartition(partitionPath, b);
            writer.finish();
        }
    }
    saveAsComplete(commitTimeline, instant1, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant2, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant3, Option.empty());
    refreshFsView();
    List<FileSlice> fileSlices = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
    assertEquals(1, fileSlices.size());
    FileSlice fileSlice = fileSlices.get(0);
    assertEquals(instantTime1, fileSlice.getBaseInstantTime());
    if (!skipCreatingDataFile) {
        assertTrue(fileSlice.getBaseFile().isPresent());
        checkExternalFile(srcFileStatus, fileSlice.getBaseFile().get().getBootstrapBaseFile(), testBootstrap);
    }
    String compactionRequestedTime = "4";
    String compactDataFileName = FSUtils.makeDataFileName(compactionRequestedTime, TEST_WRITE_TOKEN, fileId);
    List<Pair<String, FileSlice>> partitionFileSlicesPairs = new ArrayList<>();
    partitionFileSlicesPairs.add(Pair.of(partitionPath, fileSlices.get(0)));
    HoodieCompactionPlan compactionPlan = CompactionUtils.buildFromFileSlices(partitionFileSlicesPairs, Option.empty(), Option.empty());
    HoodieInstant compactionInstant;
    if (isCompactionInFlight) {
        // Create a Data-file but this should be skipped by view
        new File(basePath + "/" + partitionPath + "/" + compactDataFileName).createNewFile();
        compactionInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
        HoodieInstant requested = HoodieTimeline.getCompactionRequestedInstant(compactionInstant.getTimestamp());
        commitTimeline.saveToCompactionRequested(requested, TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
        commitTimeline.transitionCompactionRequestedToInflight(requested);
    } else {
        compactionInstant = new HoodieInstant(State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
        commitTimeline.saveToCompactionRequested(compactionInstant, TimelineMetadataUtils.serializeCompactionPlan(compactionPlan));
    }
    // View immediately after scheduling compaction
    refreshFsView();
    List<FileSlice> slices = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
    assertEquals(1, slices.size(), "Expected latest file-slices");
    assertEquals(compactionRequestedTime, slices.get(0).getBaseInstantTime(), "Base-Instant must be compaction Instant");
    assertFalse(slices.get(0).getBaseFile().isPresent(), "Latest File Slice must not have data-file");
    assertEquals(0, slices.get(0).getLogFiles().count(), "Latest File Slice must not have any log-files");
    // Fake delta-ingestion after compaction-requested
    String deltaInstantTime4 = "5";
    String deltaInstantTime5 = "6";
    List<String> allInstantTimes = Arrays.asList(instantTime1, deltaInstantTime1, deltaInstantTime2, compactionRequestedTime, deltaInstantTime4, deltaInstantTime5);
    String fileName3 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 0, TEST_WRITE_TOKEN);
    String fileName4 = FSUtils.makeLogFileName(fileId, HoodieLogFile.DELTA_EXTENSION, compactionRequestedTime, 1, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + fileName3).createNewFile();
    new File(basePath + "/" + partitionPath + "/" + fileName4).createNewFile();
    HoodieInstant deltaInstant4 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime4);
    HoodieInstant deltaInstant5 = new HoodieInstant(true, HoodieTimeline.DELTA_COMMIT_ACTION, deltaInstantTime5);
    saveAsComplete(commitTimeline, deltaInstant4, Option.empty());
    saveAsComplete(commitTimeline, deltaInstant5, Option.empty());
    refreshFsView();
    List<HoodieBaseFile> dataFiles = roView.getAllBaseFiles(partitionPath).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertTrue(dataFiles.isEmpty(), "No data file expected");
    } else {
        assertEquals(1, dataFiles.size(), "One data-file is expected as there is only one file-group");
        assertEquals(dataFileName, dataFiles.get(0).getFileName(), "Expect only valid data-file");
    }
    // Merge API Tests
    List<FileSlice> fileSliceList = rtView.getLatestMergedFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
    assertEquals(1, fileSliceList.size(), "Expect file-slice to be merged");
    fileSlice = fileSliceList.get(0);
    assertEquals(fileId, fileSlice.getFileId());
    if (!skipCreatingDataFile) {
        assertEquals(dataFileName, fileSlice.getBaseFile().get().getFileName(), "Data file must be present");
        checkExternalFile(srcFileStatus, fileSlice.getBaseFile().get().getBootstrapBaseFile(), testBootstrap);
    } else {
        assertFalse(fileSlice.getBaseFile().isPresent(), "No data-file expected as it was not created");
    }
    assertEquals(instantTime1, fileSlice.getBaseInstantTime(), "Base Instant of penultimate file-slice must be base instant");
    List<HoodieLogFile> logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
    assertEquals(4, logFiles.size(), "Log files must include those after compaction request");
    assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
    assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
    assertEquals(fileName2, logFiles.get(2).getFileName(), "Log File Order check");
    assertEquals(fileName1, logFiles.get(3).getFileName(), "Log File Order check");
    fileSliceList = rtView.getLatestFileSlicesBeforeOrOn(partitionPath, deltaInstantTime5, true).collect(Collectors.toList());
    assertEquals(1, fileSliceList.size(), "Expect only one file-id");
    fileSlice = fileSliceList.get(0);
    assertEquals(fileId, fileSlice.getFileId());
    assertFalse(fileSlice.getBaseFile().isPresent(), "No data-file expected in latest file-slice");
    assertEquals(compactionRequestedTime, fileSlice.getBaseInstantTime(), "Compaction requested instant must be base instant");
    logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
    assertEquals(2, logFiles.size(), "Log files must include only those after compaction request");
    assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
    assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
    // Data Files API tests
    dataFiles = roView.getLatestBaseFiles().collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    dataFiles = roView.getLatestBaseFiles(partitionPath).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    dataFiles = roView.getLatestBaseFilesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    dataFiles = roView.getLatestBaseFilesInRange(allInstantTimes).collect(Collectors.toList());
    if (skipCreatingDataFile) {
        assertEquals(0, dataFiles.size(), "Expect no data file to be returned");
    } else {
        assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
        dataFiles.forEach(df -> assertEquals(df.getCommitTime(), instantTime1, "Expect data-file for instant 1 be returned"));
        checkExternalFile(srcFileStatus, dataFiles.get(0).getBootstrapBaseFile(), testBootstrap);
    }
    // Inflight/Orphan File-groups needs to be in the view
    // There is a data-file with this inflight file-id
    final String inflightFileId1 = UUID.randomUUID().toString();
    // There is a log-file with this inflight file-id
    final String inflightFileId2 = UUID.randomUUID().toString();
    // There is an orphan data file with this file-id
    final String orphanFileId1 = UUID.randomUUID().toString();
    // There is an orphan log data file with this file-id
    final String orphanFileId2 = UUID.randomUUID().toString();
    final String invalidInstantId = "INVALIDTIME";
    String inflightDeltaInstantTime = "7";
    String orphanDataFileName = FSUtils.makeDataFileName(invalidInstantId, TEST_WRITE_TOKEN, orphanFileId1);
    new File(basePath + "/" + partitionPath + "/" + orphanDataFileName).createNewFile();
    String orphanLogFileName = FSUtils.makeLogFileName(orphanFileId2, HoodieLogFile.DELTA_EXTENSION, invalidInstantId, 0, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + orphanLogFileName).createNewFile();
    String inflightDataFileName = FSUtils.makeDataFileName(inflightDeltaInstantTime, TEST_WRITE_TOKEN, inflightFileId1);
    new File(basePath + "/" + partitionPath + "/" + inflightDataFileName).createNewFile();
    String inflightLogFileName = FSUtils.makeLogFileName(inflightFileId2, HoodieLogFile.DELTA_EXTENSION, inflightDeltaInstantTime, 0, TEST_WRITE_TOKEN);
    new File(basePath + "/" + partitionPath + "/" + inflightLogFileName).createNewFile();
    // Mark instant as inflight
    commitTimeline.createNewInstant(new HoodieInstant(State.REQUESTED, HoodieTimeline.DELTA_COMMIT_ACTION, inflightDeltaInstantTime));
    commitTimeline.transitionRequestedToInflight(new HoodieInstant(State.REQUESTED, HoodieTimeline.DELTA_COMMIT_ACTION, inflightDeltaInstantTime), Option.empty());
    refreshFsView();
    List<FileSlice> allRawFileSlices = getAllRawFileSlices(partitionPath).collect(Collectors.toList());
    dataFiles = allRawFileSlices.stream().flatMap(slice -> {
        if (slice.getBaseFile().isPresent()) {
            return Stream.of(slice.getBaseFile().get());
        }
        return Stream.empty();
    }).collect(Collectors.toList());
    if (includeInvalidAndInflight) {
        assertEquals(2 + (isCompactionInFlight ? 1 : 0) + (skipCreatingDataFile ? 0 : 1), dataFiles.size(), "Inflight/Orphan data-file is also expected");
        Set<String> fileNames = dataFiles.stream().map(HoodieBaseFile::getFileName).collect(Collectors.toSet());
        assertTrue(fileNames.contains(orphanDataFileName), "Expect orphan data-file to be present");
        assertTrue(fileNames.contains(inflightDataFileName), "Expect inflight data-file to be present");
        if (!skipCreatingDataFile) {
            assertTrue(fileNames.contains(dataFileName), "Expect old committed data-file");
        }
        if (isCompactionInFlight) {
            assertTrue(fileNames.contains(compactDataFileName), "Expect inflight compacted data file to be present");
        }
        fileSliceList = getLatestRawFileSlices(partitionPath).collect(Collectors.toList());
        assertEquals(includeInvalidAndInflight ? 5 : 1, fileSliceList.size(), "Expect both inflight and orphan file-slice to be included");
        Map<String, FileSlice> fileSliceMap = fileSliceList.stream().collect(Collectors.toMap(FileSlice::getFileId, r -> r));
        FileSlice orphanFileSliceWithDataFile = fileSliceMap.get(orphanFileId1);
        FileSlice orphanFileSliceWithLogFile = fileSliceMap.get(orphanFileId2);
        FileSlice inflightFileSliceWithDataFile = fileSliceMap.get(inflightFileId1);
        FileSlice inflightFileSliceWithLogFile = fileSliceMap.get(inflightFileId2);
        assertEquals(invalidInstantId, orphanFileSliceWithDataFile.getBaseInstantTime(), "Orphan File Slice with data-file check base-commit");
        assertEquals(orphanDataFileName, orphanFileSliceWithDataFile.getBaseFile().get().getFileName(), "Orphan File Slice with data-file check data-file");
        assertEquals(0, orphanFileSliceWithDataFile.getLogFiles().count(), "Orphan File Slice with data-file check data-file");
        assertEquals(inflightDeltaInstantTime, inflightFileSliceWithDataFile.getBaseInstantTime(), "Inflight File Slice with data-file check base-commit");
        assertEquals(inflightDataFileName, inflightFileSliceWithDataFile.getBaseFile().get().getFileName(), "Inflight File Slice with data-file check data-file");
        assertEquals(0, inflightFileSliceWithDataFile.getLogFiles().count(), "Inflight File Slice with data-file check data-file");
        assertEquals(invalidInstantId, orphanFileSliceWithLogFile.getBaseInstantTime(), "Orphan File Slice with log-file check base-commit");
        assertFalse(orphanFileSliceWithLogFile.getBaseFile().isPresent(), "Orphan File Slice with log-file check data-file");
        logFiles = orphanFileSliceWithLogFile.getLogFiles().collect(Collectors.toList());
        assertEquals(1, logFiles.size(), "Orphan File Slice with log-file check data-file");
        assertEquals(orphanLogFileName, logFiles.get(0).getFileName(), "Orphan File Slice with log-file check data-file");
        assertEquals(inflightDeltaInstantTime, inflightFileSliceWithLogFile.getBaseInstantTime(), "Inflight File Slice with log-file check base-commit");
        assertFalse(inflightFileSliceWithLogFile.getBaseFile().isPresent(), "Inflight File Slice with log-file check data-file");
        logFiles = inflightFileSliceWithLogFile.getLogFiles().collect(Collectors.toList());
        assertEquals(1, logFiles.size(), "Inflight File Slice with log-file check data-file");
        assertEquals(inflightLogFileName, logFiles.get(0).getFileName(), "Inflight File Slice with log-file check data-file");
    }
    compactionInstant = new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactionRequestedTime);
    // Now simulate Compaction completing - Check the view
    if (!isCompactionInFlight) {
        // For inflight compaction, we already create a data-file to test concurrent inflight case.
        // If we skipped creating data file corresponding to compaction commit, create it now
        new File(basePath + "/" + partitionPath + "/" + compactDataFileName).createNewFile();
        commitTimeline.createNewInstant(compactionInstant);
    }
    commitTimeline.saveAsComplete(compactionInstant, Option.empty());
    refreshFsView();
    // populate the cache
    roView.getAllBaseFiles(partitionPath);
    fileSliceList = rtView.getLatestFileSlices(partitionPath).collect(Collectors.toList());
    LOG.info("FILESLICE LIST=" + fileSliceList);
    dataFiles = fileSliceList.stream().map(FileSlice::getBaseFile).filter(Option::isPresent).map(Option::get).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-files in latest view as there is only one file-group");
    assertEquals(compactDataFileName, dataFiles.get(0).getFileName(), "Data Filename must match");
    assertEquals(1, fileSliceList.size(), "Only one latest file-slice in the partition");
    assertFalse(dataFiles.get(0).getBootstrapBaseFile().isPresent(), "No external data file must be present");
    fileSlice = fileSliceList.get(0);
    assertEquals(fileId, fileSlice.getFileId(), "Check file-Id is set correctly");
    assertEquals(compactDataFileName, fileSlice.getBaseFile().get().getFileName(), "Check data-filename is set correctly");
    assertEquals(compactionRequestedTime, fileSlice.getBaseInstantTime(), "Ensure base-instant is now compaction request instant");
    logFiles = fileSlice.getLogFiles().collect(Collectors.toList());
    assertEquals(2, logFiles.size(), "Only log-files after compaction request shows up");
    assertEquals(fileName4, logFiles.get(0).getFileName(), "Log File Order check");
    assertEquals(fileName3, logFiles.get(1).getFileName(), "Log File Order check");
    // Data Files API tests
    dataFiles = roView.getLatestBaseFiles().collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    assertFalse(dataFiles.get(0).getBootstrapBaseFile().isPresent(), "No external data file must be present");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    dataFiles = roView.getLatestBaseFiles(partitionPath).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    dataFiles = roView.getLatestBaseFilesBeforeOrOn(partitionPath, deltaInstantTime5).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    dataFiles = roView.getLatestBaseFilesInRange(allInstantTimes).collect(Collectors.toList());
    assertEquals(1, dataFiles.size(), "Expect only one data-file to be sent");
    dataFiles.forEach(df -> {
        assertEquals(df.getCommitTime(), compactionRequestedTime, "Expect data-file created by compaction be returned");
        assertFalse(df.getBootstrapBaseFile().isPresent(), "No external data file must be present");
    });
    assertEquals(expTotalFileSlices, rtView.getAllFileSlices(partitionPath).count(), "Total number of file-slices in partitions matches expected");
    assertEquals(expTotalDataFiles, roView.getAllBaseFiles(partitionPath).count(), "Total number of data-files in partitions matches expected");
    // file-groups includes inflight/invalid file-ids
    assertEquals(5, fsView.getAllFileGroups(partitionPath).count(), "Total number of file-groups in partitions matches expected");
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) Date(java.util.Date) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileStatus(org.apache.hadoop.fs.FileStatus) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) BaseFile(org.apache.hudi.common.model.BaseFile) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) MethodSource(org.junit.jupiter.params.provider.MethodSource) HoodiePath(org.apache.hudi.avro.model.HoodiePath) HoodieFSPermission(org.apache.hudi.avro.model.HoodieFSPermission) HoodieClusteringPlan(org.apache.hudi.avro.model.HoodieClusteringPlan) Set(java.util.Set) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) UUID(java.util.UUID) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieCommonTestHarness(org.apache.hudi.common.testutils.HoodieCommonTestHarness) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) ClusteringUtils(org.apache.hudi.common.util.ClusteringUtils) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) Assertions.assertDoesNotThrow(org.junit.jupiter.api.Assertions.assertDoesNotThrow) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSlice(org.apache.hudi.common.model.FileSlice) IndexWriter(org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) CommitUtils(org.apache.hudi.common.util.CommitUtils) FsAction(org.apache.hadoop.fs.permission.FsAction) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Paths(java.nio.file.Paths) FileStatusUtils(org.apache.hudi.common.bootstrap.FileStatusUtils) LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair) HoodieFileStatus(org.apache.hudi.avro.model.HoodieFileStatus) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) FileSlice(org.apache.hudi.common.model.FileSlice) ArrayList(java.util.ArrayList) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HFileBootstrapIndex(org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Date(java.util.Date) BootstrapFileMapping(org.apache.hudi.common.model.BootstrapFileMapping) IndexWriter(org.apache.hudi.common.bootstrap.index.BootstrapIndex.IndexWriter) Option(org.apache.hudi.common.util.Option) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) File(java.io.File)

Aggregations

HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan)41 IOException (java.io.IOException)20 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)18 List (java.util.List)17 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)16 Pair (org.apache.hudi.common.util.collection.Pair)16 ArrayList (java.util.ArrayList)15 HoodieFileGroupId (org.apache.hudi.common.model.HoodieFileGroupId)15 Path (org.apache.hadoop.fs.Path)14 HoodieCompactionOperation (org.apache.hudi.avro.model.HoodieCompactionOperation)13 Option (org.apache.hudi.common.util.Option)13 LogManager (org.apache.log4j.LogManager)13 Logger (org.apache.log4j.Logger)13 Map (java.util.Map)12 Set (java.util.Set)12 Collectors (java.util.stream.Collectors)12 CompactionOperation (org.apache.hudi.common.model.CompactionOperation)12 CompactionUtils (org.apache.hudi.common.util.CompactionUtils)12 FileSlice (org.apache.hudi.common.model.FileSlice)11 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)11