Search in sources :

Example 56 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableRollback method testMultiRollbackWithDeltaAndCompactionCommit.

@Test
void testMultiRollbackWithDeltaAndCompactionCommit() throws Exception {
    boolean populateMetaFields = true;
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false).withMarkersType(MarkerType.DIRECT.name());
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    try (final SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        /*
       * Write 1 (only inserts)
       */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
        List<WriteStatus> statuses = writeStatusJavaRDD.collect();
        assertNoWriteErrors(statuses);
        client.commit(newCommitTime, jsc().parallelize(statuses));
        client.close();
        Option<Pair<HoodieInstant, HoodieCommitMetadata>> instantCommitMetadataPairOpt = metaClient.getActiveTimeline().getLastCommitMetadataWithValidData();
        assertTrue(instantCommitMetadataPairOpt.isPresent());
        HoodieInstant commitInstant = instantCommitMetadataPairOpt.get().getKey();
        assertEquals("001", commitInstant.getTimestamp());
        assertEquals(HoodieTimeline.DELTA_COMMIT_ACTION, commitInstant.getAction());
        assertEquals(200, getTotalRecordsWritten(instantCommitMetadataPairOpt.get().getValue()));
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
        assertFalse(dataFilesToRead.findAny().isPresent());
        tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent(), "Should list the base files we wrote in the delta commit");
        /*
       * Write 2 (inserts + updates)
       */
        newCommitTime = "002";
        // WriteClient with custom config (disable small file handling)
        HoodieWriteConfig smallFileWriteConfig = getHoodieWriteConfigWithSmallFileHandlingOffBuilder(populateMetaFields).withMarkersType(MarkerType.DIRECT.name()).build();
        try (SparkRDDWriteClient nClient = getHoodieWriteClient(smallFileWriteConfig)) {
            nClient.startCommitWithTime(newCommitTime);
            List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
            copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords);
            copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200));
            List<String> dataFiles = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), dataFiles, basePath());
            assertEquals(200, recordsRead.size());
            statuses = nClient.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            nClient.commit(newCommitTime, jsc().parallelize(statuses));
            copyOfRecords.clear();
        }
        // Schedule a compaction
        /*
       * Write 3 (inserts + updates)
       */
        newCommitTime = "003";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> newInserts = dataGen.generateInserts(newCommitTime, 100);
        records = dataGen.generateUpdates(newCommitTime, records);
        records.addAll(newInserts);
        writeRecords = jsc().parallelize(records, 1);
        writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
        statuses = writeStatusJavaRDD.collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        client.commit(newCommitTime, jsc().parallelize(statuses));
        metaClient = HoodieTableMetaClient.reload(metaClient);
        String compactionInstantTime = "004";
        client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
        // Compaction commit
        /*
       * Write 4 (updates)
       */
        newCommitTime = "005";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, records);
        writeRecords = jsc().parallelize(records, 1);
        writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
        statuses = writeStatusJavaRDD.collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        client.commit(newCommitTime, jsc().parallelize(statuses));
        metaClient = HoodieTableMetaClient.reload(metaClient);
        compactionInstantTime = "006";
        client.scheduleCompactionAtInstant(compactionInstantTime, Option.empty());
        HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(compactionInstantTime);
        client.commitCompaction(compactionInstantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
        allFiles = listAllBaseFilesInPath(hoodieTable);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);
        final String compactedCommitTime = metaClient.getActiveTimeline().reload().getCommitsTimeline().lastInstant().get().getTimestamp();
        assertTrue(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime())));
        /*
       * Write 5 (updates)
       */
        newCommitTime = "007";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
        copyOfRecords = dataGen.generateUpdates(newCommitTime, copyOfRecords);
        copyOfRecords.addAll(dataGen.generateInserts(newCommitTime, 200));
        statuses = client.upsert(jsc().parallelize(copyOfRecords, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        client.commit(newCommitTime, jsc().parallelize(statuses));
        copyOfRecords.clear();
        // Rollback latest commit first
        client.restoreToInstant("000");
        metaClient = HoodieTableMetaClient.reload(metaClient);
        allFiles = listAllBaseFilesInPath(hoodieTable);
        tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        dataFilesToRead = tableView.getLatestBaseFiles();
        assertFalse(dataFilesToRead.findAny().isPresent());
        TableFileSystemView.SliceView rtView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        List<HoodieFileGroup> fileGroups = ((HoodieTableFileSystemView) rtView).getAllFileGroups().collect(Collectors.toList());
        assertTrue(fileGroups.isEmpty());
        // make sure there are no log files remaining
        assertEquals(0L, ((HoodieTableFileSystemView) rtView).getAllFileGroups().filter(fileGroup -> fileGroup.getAllRawFileSlices().noneMatch(f -> f.getLogFiles().count() == 0)).count());
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) Assertions.assertAll(org.junit.jupiter.api.Assertions.assertAll) Tag(org.junit.jupiter.api.Tag) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) Collection(java.util.Collection) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) GenericRecord(org.apache.avro.generic.GenericRecord) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus) Pair(org.apache.hudi.common.util.collection.Pair) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 57 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class TestHoodieSparkMergeOnReadTableRollback method testRollbackWithDeltaAndCompactionCommit.

@ParameterizedTest
@ValueSource(booleans = { true, false })
void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) throws Exception {
    // NOTE: First writer will have Metadata table DISABLED
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.SIMPLE);
    addConfigsForPopulateMetaFields(cfgBuilder, true);
    HoodieWriteConfig cfg = cfgBuilder.build();
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        // Test delta commit rollback
        /*
       * Write 1 (only inserts)
       */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
        List<WriteStatus> statuses = writeStatusJavaRDD.collect();
        assertNoWriteErrors(statuses);
        client.commit(newCommitTime, jsc().parallelize(statuses));
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
        assertFalse(dataFilesToRead.findAny().isPresent());
        tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
       * Write 2 (inserts + updates - testing failed delta commit)
       */
        final String commitTime1 = "002";
        // NOTE: Second writer will have Metadata table ENABLED
        try (SparkRDDWriteClient secondClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true))) {
            secondClient.startCommitWithTime(commitTime1);
            List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
            copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords);
            copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200));
            List<String> inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            assertEquals(200, recordsRead.size());
            statuses = secondClient.upsert(jsc().parallelize(copyOfRecords, 1), commitTime1).collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            // Test failed delta commit rollback
            secondClient.rollback(commitTime1);
            allFiles = listAllBaseFilesInPath(hoodieTable);
            // After rollback, there should be no base file with the failed commit time
            List<String> remainingFiles = Arrays.stream(allFiles).filter(file -> file.getPath().getName().contains(commitTime1)).map(fileStatus -> fileStatus.getPath().toString()).collect(Collectors.toList());
            assertEquals(0, remainingFiles.size(), "There files should have been rolled-back " + "when rolling back commit " + commitTime1 + " but are still remaining. Files: " + remainingFiles);
            inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            assertEquals(200, recordsRead.size());
        }
        /*
       * Write 3 (inserts + updates - testing successful delta commit)
       */
        final String commitTime2 = "003";
        try (SparkRDDWriteClient thirdClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true))) {
            thirdClient.startCommitWithTime(commitTime2);
            List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
            copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords);
            copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200));
            List<String> inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            assertEquals(200, recordsRead.size());
            writeRecords = jsc().parallelize(copyOfRecords, 1);
            writeStatusJavaRDD = thirdClient.upsert(writeRecords, commitTime2);
            statuses = writeStatusJavaRDD.collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            // Test successful delta commit rollback
            thirdClient.rollback(commitTime2);
            allFiles = listAllBaseFilesInPath(hoodieTable);
            // After rollback, there should be no base file with the failed commit time
            assertEquals(0, Arrays.stream(allFiles).filter(file -> file.getPath().getName().contains(commitTime2)).count());
            metaClient = HoodieTableMetaClient.reload(metaClient);
            hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
            tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
            inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            // check that the number of records read is still correct after rollback operation
            assertEquals(200, recordsRead.size());
            // Test compaction commit rollback
            /*
         * Write 4 (updates)
         */
            newCommitTime = "004";
            thirdClient.startCommitWithTime(newCommitTime);
            writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime);
            statuses = writeStatusJavaRDD.collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            thirdClient.commit(newCommitTime, jsc().parallelize(statuses));
            metaClient = HoodieTableMetaClient.reload(metaClient);
            String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString();
            thirdClient.compact(compactionInstantTime);
            metaClient = HoodieTableMetaClient.reload(metaClient);
            final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp();
            assertTrue(Arrays.stream(listAllBaseFilesInPath(hoodieTable)).anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime())));
            hoodieTable.rollbackInflightCompaction(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime));
            allFiles = listAllBaseFilesInPath(hoodieTable);
            metaClient = HoodieTableMetaClient.reload(metaClient);
            tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);
            assertFalse(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime())));
            assertAll(tableView.getLatestBaseFiles().map(file -> () -> assertNotEquals(compactedCommitTime, file.getCommitTime())));
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) Assertions.assertAll(org.junit.jupiter.api.Assertions.assertAll) Tag(org.junit.jupiter.api.Tag) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) Collection(java.util.Collection) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) ArrayList(java.util.ArrayList) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) GenericRecord(org.apache.avro.generic.GenericRecord) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 58 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class SparkClientFunctionalTestHarness method insertRecordsToMORTable.

protected Stream<HoodieBaseFile> insertRecordsToMORTable(HoodieTableMetaClient metaClient, List<HoodieRecord> records, SparkRDDWriteClient client, HoodieWriteConfig cfg, String commitTime, boolean doExplicitCommit) throws IOException {
    HoodieTableMetaClient reloadedMetaClient = HoodieTableMetaClient.reload(metaClient);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    JavaRDD<WriteStatus> statusesRdd = client.insert(writeRecords, commitTime);
    List<WriteStatus> statuses = statusesRdd.collect();
    assertNoWriteErrors(statuses);
    if (doExplicitCommit) {
        client.commit(commitTime, statusesRdd);
    }
    assertFileSizesEqual(statuses, status -> FSUtils.getFileSize(reloadedMetaClient.getFs(), new Path(reloadedMetaClient.getBasePath(), status.getStat().getPath())));
    HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), reloadedMetaClient);
    Option<HoodieInstant> deltaCommit = reloadedMetaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
    assertTrue(deltaCommit.isPresent());
    assertEquals(commitTime, deltaCommit.get().getTimestamp(), "Delta commit should be specified value");
    Option<HoodieInstant> commit = reloadedMetaClient.getActiveTimeline().getCommitTimeline().lastInstant();
    assertFalse(commit.isPresent());
    FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
    TableFileSystemView.BaseFileOnlyView roView = getHoodieTableFileSystemView(reloadedMetaClient, reloadedMetaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
    Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
    assertTrue(!dataFilesToRead.findAny().isPresent());
    roView = getHoodieTableFileSystemView(reloadedMetaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
    dataFilesToRead = roView.getLatestBaseFiles();
    return dataFilesToRead;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) WriteStatus(org.apache.hudi.client.WriteStatus)

Example 59 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class JavaMergeHelper method runMerge.

@Override
public void runMerge(HoodieTable<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> table, HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> upsertHandle) throws IOException {
    final boolean externalSchemaTransformation = table.getConfig().shouldUseExternalSchemaTransformation();
    Configuration cfgForHoodieFile = new Configuration(table.getHadoopConf());
    HoodieMergeHandle<T, List<HoodieRecord<T>>, List<HoodieKey>, List<WriteStatus>> mergeHandle = upsertHandle;
    HoodieBaseFile baseFile = mergeHandle.baseFileForMerge();
    final GenericDatumWriter<GenericRecord> gWriter;
    final GenericDatumReader<GenericRecord> gReader;
    Schema readSchema;
    if (externalSchemaTransformation || baseFile.getBootstrapBaseFile().isPresent()) {
        readSchema = HoodieFileReaderFactory.getFileReader(table.getHadoopConf(), mergeHandle.getOldFilePath()).getSchema();
        gWriter = new GenericDatumWriter<>(readSchema);
        gReader = new GenericDatumReader<>(readSchema, mergeHandle.getWriterSchemaWithMetaFields());
    } else {
        gReader = null;
        gWriter = null;
        readSchema = mergeHandle.getWriterSchemaWithMetaFields();
    }
    BoundedInMemoryExecutor<GenericRecord, GenericRecord, Void> wrapper = null;
    HoodieFileReader<GenericRecord> reader = HoodieFileReaderFactory.<GenericRecord>getFileReader(cfgForHoodieFile, mergeHandle.getOldFilePath());
    try {
        final Iterator<GenericRecord> readerIterator;
        if (baseFile.getBootstrapBaseFile().isPresent()) {
            readerIterator = getMergingIterator(table, mergeHandle, baseFile, reader, readSchema, externalSchemaTransformation);
        } else {
            readerIterator = reader.getRecordIterator(readSchema);
        }
        ThreadLocal<BinaryEncoder> encoderCache = new ThreadLocal<>();
        ThreadLocal<BinaryDecoder> decoderCache = new ThreadLocal<>();
        wrapper = new BoundedInMemoryExecutor<>(table.getConfig().getWriteBufferLimitBytes(), new IteratorBasedQueueProducer<>(readerIterator), Option.of(new UpdateHandler(mergeHandle)), record -> {
            if (!externalSchemaTransformation) {
                return record;
            }
            return transformRecordBasedOnNewSchema(gReader, gWriter, encoderCache, decoderCache, (GenericRecord) record);
        });
        wrapper.execute();
    } catch (Exception e) {
        throw new HoodieException(e);
    } finally {
        if (reader != null) {
            reader.close();
        }
        mergeHandle.close();
        if (null != wrapper) {
            wrapper.shutdownNow();
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericDatumWriter(org.apache.avro.generic.GenericDatumWriter) GenericRecord(org.apache.avro.generic.GenericRecord) HoodieFileReader(org.apache.hudi.io.storage.HoodieFileReader) Schema(org.apache.avro.Schema) Iterator(java.util.Iterator) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer) HoodieException(org.apache.hudi.exception.HoodieException) BinaryDecoder(org.apache.avro.io.BinaryDecoder) Option(org.apache.hudi.common.util.Option) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) HoodieMergeHandle(org.apache.hudi.io.HoodieMergeHandle) HoodieFileReaderFactory(org.apache.hudi.io.storage.HoodieFileReaderFactory) Configuration(org.apache.hadoop.conf.Configuration) HoodieKey(org.apache.hudi.common.model.HoodieKey) BoundedInMemoryExecutor(org.apache.hudi.common.util.queue.BoundedInMemoryExecutor) GenericDatumReader(org.apache.avro.generic.GenericDatumReader) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) Configuration(org.apache.hadoop.conf.Configuration) Schema(org.apache.avro.Schema) HoodieException(org.apache.hudi.exception.HoodieException) List(java.util.List) GenericRecord(org.apache.avro.generic.GenericRecord) BinaryDecoder(org.apache.avro.io.BinaryDecoder) HoodieException(org.apache.hudi.exception.HoodieException) IOException(java.io.IOException) BinaryEncoder(org.apache.avro.io.BinaryEncoder) IteratorBasedQueueProducer(org.apache.hudi.common.util.queue.IteratorBasedQueueProducer)

Example 60 with HoodieBaseFile

use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

the class JavaUpsertPartitioner method getSmallFiles.

/**
 * Returns a list of small files in the given partition path.
 */
protected List<SmallFile> getSmallFiles(String partitionPath) {
    // smallFiles only for partitionPath
    List<SmallFile> smallFileLocations = new ArrayList<>();
    HoodieTimeline commitTimeline = table.getMetaClient().getCommitsTimeline().filterCompletedInstants();
    if (!commitTimeline.empty()) {
        // if we have some commits
        HoodieInstant latestCommitTime = commitTimeline.lastInstant().get();
        List<HoodieBaseFile> allFiles = table.getBaseFileOnlyView().getLatestBaseFilesBeforeOrOn(partitionPath, latestCommitTime.getTimestamp()).collect(Collectors.toList());
        for (HoodieBaseFile file : allFiles) {
            if (file.getFileSize() < config.getParquetSmallFileLimit()) {
                String filename = file.getFileName();
                SmallFile sf = new SmallFile();
                sf.location = new HoodieRecordLocation(FSUtils.getCommitTime(filename), FSUtils.getFileId(filename));
                sf.sizeBytes = file.getFileSize();
                smallFileLocations.add(sf);
            }
        }
    }
    return smallFileLocations;
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieRecordLocation(org.apache.hudi.common.model.HoodieRecordLocation)

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile)71 Path (org.apache.hadoop.fs.Path)40 ArrayList (java.util.ArrayList)33 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)31 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)31 FileSlice (org.apache.hudi.common.model.FileSlice)29 List (java.util.List)27 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)27 IOException (java.io.IOException)26 FileStatus (org.apache.hadoop.fs.FileStatus)25 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)24 Pair (org.apache.hudi.common.util.collection.Pair)24 Option (org.apache.hudi.common.util.Option)23 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)23 Collectors (java.util.stream.Collectors)21 Test (org.junit.jupiter.api.Test)21 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)21 Map (java.util.Map)20 HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile)20 HoodieTable (org.apache.hudi.table.HoodieTable)20