Example 51 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class TestClientRollback method testSavepointAndRollback.

/**
 * Test case for rollback-savepoint interaction.
 */
@Test
public void testSavepointAndRollback() throws Exception {
    HoodieWriteConfig cfg = getConfigBuilder().withCompactionConfig(HoodieCompactionConfig.newBuilder().withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(1).build()).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator.writePartitionMetadataDeprecated(fs, HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS, basePath);
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        /*
         * Write 2 (updates)
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        client.savepoint("hoodie-unit-test", "test");
        /*
         * Write 3 (updates)
         */
        newCommitTime = "003";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        HoodieWriteConfig config = getConfig();
        List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), cfg.getBasePath());
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieSparkTable table = HoodieSparkTable.create(getConfig(), context, metaClient);
        final BaseFileOnlyView view1 = table.getBaseFileOnlyView();
        List<HoodieBaseFile> dataFiles = partitionPaths.stream().flatMap(s -> {
            return view1.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("003"));
        }).collect(Collectors.toList());
        assertEquals(3, dataFiles.size(), "The data files for commit 003 should be present");
        dataFiles = partitionPaths.stream().flatMap(s -> {
            return view1.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("002"));
        }).collect(Collectors.toList());
        assertEquals(3, dataFiles.size(), "The data files for commit 002 should be present");
        /*
         * Write 4 (updates)
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, records);
        statuses = client.upsert(jsc.parallelize(records, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        table = HoodieSparkTable.create(getConfig(), context, metaClient);
        final BaseFileOnlyView view2 = table.getBaseFileOnlyView();
        dataFiles = partitionPaths.stream().flatMap(s -> view2.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("004"))).collect(Collectors.toList());
        assertEquals(3, dataFiles.size(), "The data files for commit 004 should be present");
        // rolling back to a non-existent savepoint must not succeed
        assertThrows(HoodieRollbackException.class, () -> {
            client.restoreToSavepoint("001");
        }, "Rolling back to non-existent savepoint should not be allowed");
        // rollback to savepoint 002
        HoodieInstant savepoint = table.getCompletedSavepointTimeline().getInstants().findFirst().get();
        client.restoreToSavepoint(savepoint.getTimestamp());
        metaClient = HoodieTableMetaClient.reload(metaClient);
        table = HoodieSparkTable.create(getConfig(), context, metaClient);
        final BaseFileOnlyView view3 = table.getBaseFileOnlyView();
        dataFiles = partitionPaths.stream().flatMap(s -> view3.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("002"))).collect(Collectors.toList());
        assertEquals(3, dataFiles.size(), "The data files for commit 002 should be available");
        dataFiles = partitionPaths.stream().flatMap(s -> view3.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("003"))).collect(Collectors.toList());
        assertEquals(0, dataFiles.size(), "The data files for commit 003 should be rolled back");
        dataFiles = partitionPaths.stream().flatMap(s -> view3.getAllBaseFiles(s).filter(f -> f.getCommitTime().equals("004"))).collect(Collectors.toList());
        assertEquals(0, dataFiles.size(), "The data files for commit 004 should be rolled back");
    }
}
Also used : HoodieClientTestBase(org.apache.hudi.testutils.HoodieClientTestBase) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) HoodieCleaningPolicy(org.apache.hudi.common.model.HoodieCleaningPolicy) Arrays(java.util.Arrays) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HashMap(java.util.HashMap) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieRollbackException(org.apache.hudi.exception.HoodieRollbackException) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieRollbackPlan(org.apache.hudi.avro.model.HoodieRollbackPlan) Collectors(java.util.stream.Collectors) HoodieInstantInfo(org.apache.hudi.avro.model.HoodieInstantInfo) FileCreateUtils(org.apache.hudi.common.testutils.FileCreateUtils) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils) Pair(org.apache.hudi.common.util.collection.Pair)
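
A minimal sketch of the savepoint lifecycle this test exercises, assuming an already-initialized SparkRDDWriteClient and a completed commit at instant "002"; the user name, comment, and instant values here are illustrative:

// Pin the latest completed commit so the cleaner cannot remove its files.
// savepoint(user, comment) savepoints the latest commit, which is why the
// test calls it right after commit "002" completes.
client.savepoint("test-user", "pin table state before further writes");
// ... later commits ("003", "004") land here ...
// restoreToSavepoint discards every commit after the savepointed instant
// and deletes the newer base files, as the assertions above verify.
client.restoreToSavepoint("002");
// Deleting the savepoint re-enables cleaning of the pinned files.
client.deleteSavepoint("002");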

Example 52 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class UpsertPartitioner method getSmallFiles.

/**
 * Returns a list of small files in the given partition path.
 */
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // collect small files only for the given partitionPath
    List<SmallFile> smallFileLocations = new ArrayList<>();
    HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
    if (!commitTimeline.empty()) {
        // if we have some commits
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView().getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
        for (HoodieBaseFile file : allFiles) {
            if (file.getFileSize() < config.getParquetSmallFileLimit()) {
                String filename = file.getFileName();
                SmallFile sf = new SmallFile();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = file.getFileSize();
                smallFileLocations.add(sf);
            }
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)
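
A hedged sketch of how such a SmallFile list is typically consumed: the partitioner estimates how many incoming inserts still fit into each small file before it crosses the small-file limit. The avgBytesPerRecord parameter below is an assumption; the real UpsertPartitioner derives a comparable estimate from previous commit metadata.

// Sketch only: estimate how many new records can be routed into existing
// small files instead of spawning new base files. avgBytesPerRecord is an
// assumed input, not something this class exposes.
static long estimateInsertCapacity(List<SmallFile> smallFiles, long smallFileLimitBytes, long avgBytesPerRecord) {
    long capacity = 0;
    for (SmallFile sf : smallFiles) {
        long spareBytes = smallFileLimitBytes - sf.sizeBytes;
        if (spareBytes > 0 && avgBytesPerRecord > 0) {
            capacity += spareBytes / avgBytesPerRecord;
        }
    }
    return capacity;
}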

Example 53 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete method testInlineScheduleCompaction.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testInlineScheduleCompaction(boolean scheduleInlineCompaction) throws Exception {
    HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET;
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    HoodieWriteConfig cfg = getConfigBuilder(false).withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(2).withPreserveCommitMetadata(true).withScheduleInlineCompaction(scheduleInlineCompaction).build()).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (updates)
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, 100);
        updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true);
        // verify whether an inline compaction was scheduled
        if (scheduleInlineCompaction) {
            assertEquals(1, metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants());
        } else {
            assertEquals(0, metaClient.reloadActiveTimeline().getAllCommitsTimeline().filterPendingCompactionTimeline().countInstants());
        }
    }
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
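
When execution does not happen inline, the pending compaction left on the timeline by the scheduleInlineCompaction = true branch can be picked up and run later. A sketch, assuming metaClient and client are initialized as in the test:

// Find the earliest pending compaction on the active timeline and execute it.
Option<HoodieInstant> pendingCompaction = metaClient.reloadActiveTimeline().filterPendingCompactionTimeline().firstInstant();
if (pendingCompaction.isPresent()) {
    // Runs the compaction plan stored for that instant time.
    client.compact(pendingCompaction.get().getTimestamp());
}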

Example 54 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete method testSimpleInsertAndUpdate.

@ParameterizedTest
@MethodSource
public void testSimpleInsertAndUpdate(HoodieFileFormat fileFormat, boolean populateMetaFields) throws Exception {
    Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (updates)
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, 100);
        updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        String compactionCommitTime = client.scheduleCompaction(Option.empty()).get().toString();
        client.compact(compactionCommitTime);
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent());
        // verify that the compaction resulted in a completed commit on the timeline
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTimeline timeline = metaClient.getCommitTimeline().filterCompletedInstants();
        assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
        String latestCompactionCommitTime = timeline.lastInstant().get().getTimestamp();
        assertTrue(HoodieTimeline.compareTimestamps("000", HoodieTimeline.LESSER_THAN, latestCompactionCommitTime));
        if (cfg.populateMetaFields()) {
            assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.of("000")), "Must contain 200 records");
        } else {
            assertEquals(200, HoodieClientTestUtils.countRecordsOptionallySince(jsc(), basePath(), sqlContext(), timeline, Option.empty()));
        }
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
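
The table-view calls in this example generalize to any read path. A minimal sketch of listing the latest base file per file group in a single partition, assuming a synced HoodieTableFileSystemView like the one built above; the partition path string is illustrative:

// List the latest base file of every file group in one partition.
List<HoodieBaseFile> latestBaseFiles = tableView.getLatestBaseFiles("2020/01/01").collect(Collectors.toList());
for (HoodieBaseFile baseFile : latestBaseFiles) {
    // Each base file records the commit that produced it, plus its path and size.
    System.out.println(baseFile.getCommitTime() + " -> " + baseFile.getPath() + " (" + baseFile.getFileSize() + " bytes)");
}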

Example 55 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableInsertUpdateDelete method testRepeatedRollbackOfCompaction.

@Test
public void testRepeatedRollbackOfCompaction() throws Exception {
    boolean scheduleInlineCompaction = false;
    HoodieFileFormat fileFormat = HoodieFileFormat.PARQUET;
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), fileFormat.toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    HoodieWriteConfig cfg = getConfigBuilder(false).withCompactionConfig(HoodieCompactionConfig.newBuilder().compactionSmallFileSize(1024 * 1024 * 1024).withInlineCompaction(false).withMaxNumDeltaCommitsBeforeCompaction(2).withPreserveCommitMetadata(true).withScheduleInlineCompaction(scheduleInlineCompaction).build()).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records, client, cfg, newCommitTime, true);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (updates)
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, 100);
        updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, true);
        Option<String> compactionInstant = client.scheduleCompaction(Option.empty());
        client.compact(compactionInstant.get());
        // trigger compaction again.
        client.compact(compactionInstant.get());
        metaClient.reloadActiveTimeline();
        // capture the rollback instant generated by the repeated compaction
        HoodieInstant rollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
        FileCreateUtils.deleteRollbackCommit(metaClient.getBasePath().substring(metaClient.getBasePath().indexOf(":") + 1), rollbackInstant.getTimestamp());
        metaClient.reloadActiveTimeline();
        SparkRDDWriteClient client1 = getHoodieWriteClient(cfg);
        // trigger compaction again.
        client1.compact(compactionInstant.get());
        metaClient.reloadActiveTimeline();
        // verify that there is no new rollback instant generated
        HoodieInstant newRollbackInstant = metaClient.getActiveTimeline().getRollbackTimeline().lastInstant().get();
        assertEquals(rollbackInstant.getTimestamp(), newRollbackInstant.getTimestamp());
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
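
A short sketch of the timeline check this test relies on, assuming a reloaded metaClient; re-running the compaction must reuse the existing rollback plan rather than adding a new rollback instant:

// Count completed rollback instants; repeating the compaction of the same
// instant should leave this count unchanged, as the assertion above verifies.
long completedRollbacks = metaClient.getActiveTimeline().getRollbackTimeline().filterCompletedInstants().countInstants();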

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)71 Path (org.apache.hadoop.fs.Path)40 ArrayList (java.util.ArrayList)33 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)31 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)31 FileSlice (org.apache.hudi.common.model.FileSlice)29 List (java.util.List)27 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)27 IOException (java.io.IOException)26 FileStatus (org.apache.hadoop.fs.FileStatus)25 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)24 Pair (org.apache.hudi.common.util.collection.Pair)24 Option (org.apache.hudi.common.util.Option)23 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)23 Collectors (java.util.stream.Collectors)21 Test (org.junit.jupiter.api.Test)21 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)21 Map (java.util.Map)20 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)20 HoodieTable (org.apache.hudi.table.HoodieTable)20