Search in sources :

Example 1 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestCleaner method testFailedInsertAndCleanByCommits.

/**
 * Test Helper for Cleaning failed commits by commits logic from HoodieWriteClient API perspective.
 *
 * @param insertFn Insert API to be tested
 * @param isPreppedAPI Flag to indicate if a prepped-version is used. If true, a wrapper function will be used during
 *        record generation to also tag the regards (de-dupe is implicit as we use uniq record-gen APIs)
 * @throws Exception in case of errors
 */
private void testFailedInsertAndCleanByCommits(Function3<JavaRDD<WriteStatus>, SparkRDDWriteClient, JavaRDD<HoodieRecord>, String> insertFn, boolean isPreppedAPI) throws Exception {
    // keep upto 3 commits from the past
    int maxCommits = 3;
    HoodieWriteConfig cfg = getConfigBuilder().withAutoCommit(false).withHeartbeatIntervalInMs(3000).withCompactionConfig(HoodieCompactionConfig.newBuilder().withFailedWritesCleaningPolicy(HoodieFailedWritesCleaningPolicy.LAZY).withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS).retainCommits(maxCommits).build()).withParallelism(1, 1).withBulkInsertParallelism(1).withFinalizeWriteParallelism(1).withDeleteParallelism(1).withConsistencyGuardConfig(ConsistencyGuardConfig.newBuilder().withConsistencyCheckEnabled(true).build()).build();
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    final Function2<List<HoodieRecord>, String, Integer> recordInsertGenWrappedFunction = generateWrapRecordsFn(isPreppedAPI, cfg, dataGen::generateInserts);
    Pair<String, JavaRDD<WriteStatus>> result = insertFirstBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    client.commit(result.getLeft(), result.getRight());
    HoodieTable table = HoodieSparkTable.create(client.getConfig(), context, metaClient);
    assertTrue(table.getCompletedCleanTimeline().empty());
    insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    Pair<String, JavaRDD<WriteStatus>> ret = insertFirstFailedBigBatchForClientCleanerTest(cfg, client, recordInsertGenWrappedFunction, insertFn, HoodieCleaningPolicy.KEEP_LATEST_COMMITS);
    // Await till enough time passes such that the last failed commits heartbeats are expired
    await().atMost(10, TimeUnit.SECONDS).until(() -> client.getHeartbeatClient().isHeartbeatExpired(ret.getLeft()));
    List<HoodieCleanStat> cleanStats = runCleaner(cfg);
    assertEquals(0, cleanStats.size(), "Must not clean any files");
    HoodieActiveTimeline timeline = metaClient.reloadActiveTimeline();
    assertTrue(timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().countInstants() == 3);
    Option<HoodieInstant> rollBackInstantForFailedCommit = timeline.getTimelineOfActions(CollectionUtils.createSet(HoodieTimeline.ROLLBACK_ACTION)).filterCompletedInstants().lastInstant();
    HoodieRollbackMetadata rollbackMetadata = TimelineMetadataUtils.deserializeAvroMetadata(timeline.getInstantDetails(rollBackInstantForFailedCommit.get()).get(), HoodieRollbackMetadata.class);
    // Rollback of one of the failed writes should have deleted 3 files
    assertEquals(3, rollbackMetadata.getTotalFilesDeleted());
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieMergeOnReadTable method testUpsertPartitioner.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testUpsertPartitioner(boolean populateMetaFields) throws Exception {
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /**
         * Write 1 (only inserts, written as base file)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
        Map<String, Long> fileIdToSize = dataFilesToRead.collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        List<HoodieBaseFile> dataFilesList = dataFilesToRead.collect(Collectors.toList());
        assertTrue(dataFilesList.size() > 0, "Should list the base files we wrote in the delta commit");
        /**
         * Write 2 (only updates + inserts, written to .log file + correction of existing base file size)
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> newRecords = dataGen.generateUpdates(newCommitTime, records);
        newRecords.addAll(dataGen.generateInserts(newCommitTime, 20));
        statuses = client.upsert(jsc().parallelize(newRecords), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("002", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 002");
        commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        allFiles = listAllBaseFilesInPath(hoodieTable);
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getActiveTimeline().reload().getCommitsTimeline().filterCompletedInstants(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        List<HoodieBaseFile> newDataFilesList = dataFilesToRead.collect(Collectors.toList());
        Map<String, Long> fileIdToNewSize = newDataFilesList.stream().collect(Collectors.toMap(HoodieBaseFile::getFileId, HoodieBaseFile::getFileSize));
        assertTrue(fileIdToNewSize.entrySet().stream().anyMatch(entry -> fileIdToSize.get(entry.getKey()) < entry.getValue()));
        List<String> inputPaths = roView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
        List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, false);
        // Wrote 20 records in 2 batches
        assertEquals(40, recordsRead.size(), "Must contain 40 records");
    }
}
Also used : HoodieClientTestHarness.buildProfile(org.apache.hudi.testutils.HoodieClientTestHarness.buildProfile) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) StorageLevel(org.apache.spark.storage.StorageLevel) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) BaseSparkDeltaCommitActionExecutor(org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) Set(java.util.Set) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SparkDeleteDeltaCommitActionExecutor(org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) Dataset(org.apache.spark.sql.Dataset) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) JobConf(org.apache.hadoop.mapred.JobConf) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Transformations(org.apache.hudi.common.testutils.Transformations) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) JobConf(org.apache.hadoop.mapred.JobConf) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 3 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieMergeOnReadTable method testMetadataStatsOnCommit.

/**
 * Test to ensure metadata stats are correctly written to metadata file.
 */
@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMetadataStatsOnCommit(Boolean rollbackUsingMarkers) throws Exception {
    HoodieWriteConfig cfg = getConfigBuilder(false, rollbackUsingMarkers, IndexType.INMEMORY).withAutoCommit(false).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTable table = HoodieSparkTable.create(cfg, context(), metaClient);
        // Create a commit without metadata stats in metadata to test backwards compatibility
        HoodieActiveTimeline activeTimeline = table.getActiveTimeline();
        String commitActionType = table.getMetaClient().getCommitActionType();
        HoodieInstant instant = new HoodieInstant(State.REQUESTED, commitActionType, "000");
        activeTimeline.createNewInstant(instant);
        activeTimeline.transitionRequestedToInflight(instant, Option.empty());
        instant = new HoodieInstant(State.INFLIGHT, commitActionType, "000");
        activeTimeline.saveAsComplete(instant, Option.empty());
        String instantTime = "001";
        client.startCommitWithTime(instantTime);
        List<HoodieRecord> records = dataGen.generateInserts(instantTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> statuses = client.insert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class);
        int inserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                inserts += stat.getNumInserts();
            }
        }
        assertEquals(200, inserts);
        instantTime = "002";
        client.startCommitWithTime(instantTime);
        records = dataGen.generateUpdates(instantTime, records);
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, instantTime);
        // assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        inserts = 0;
        int upserts = 0;
        List<WriteStatus> writeStatusList = statuses.collect();
        for (WriteStatus ws : writeStatusList) {
            inserts += ws.getStat().getNumInserts();
            upserts += ws.getStat().getNumUpdateWrites();
        }
        // Read from commit file
        assertEquals(0, inserts);
        assertEquals(200, upserts);
        client.rollback(instantTime);
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class);
        inserts = 0;
        upserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                inserts += stat.getNumInserts();
                upserts += stat.getNumUpdateWrites();
            }
        }
        assertEquals(200, inserts);
        assertEquals(0, upserts);
    }
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) List(java.util.List) Map(java.util.Map) HashMap(java.util.HashMap) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 4 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieMergeOnReadTable method testMetadataAggregateFromWriteStatus.

// Check if record level metadata is aggregated properly at the end of write.
@Test
public void testMetadataAggregateFromWriteStatus() throws Exception {
    HoodieWriteConfig cfg = getConfigBuilder(false).withWriteStatusClass(MetadataMergeWriteStatus.class).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        String newCommitTime = "001";
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        client.startCommitWithTime(newCommitTime);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        Map<String, String> allWriteStatusMergedMetadataMap = MetadataMergeWriteStatus.mergeMetadataForWriteStatuses(statuses);
        assertTrue(allWriteStatusMergedMetadataMap.containsKey("InputRecordCount_1506582000"));
        // For metadata key InputRecordCount_1506582000, value is 2 for each record. So sum of this
        // should be 2 * records.size()
        assertEquals(String.valueOf(2 * records.size()), allWriteStatusMergedMetadataMap.get("InputRecordCount_1506582000"));
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 5 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieMergeOnReadTable method testRollingStatsWithSmallFileHandling.

/**
 * Test to ensure rolling stats are correctly written to the metadata file, identifies small files and corrects them.
 */
@Test
public void testRollingStatsWithSmallFileHandling() throws Exception {
    HoodieWriteConfig cfg = getConfigBuilder(false, IndexType.INMEMORY).withAutoCommit(false).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        Map<String, Long> fileIdToInsertsMap = new HashMap<>();
        Map<String, Long> fileIdToUpsertsMap = new HashMap<>();
        String instantTime = "000";
        client.startCommitWithTime(instantTime);
        List<HoodieRecord> records = dataGen.generateInserts(instantTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> statuses = client.insert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        HoodieTable table = HoodieSparkTable.create(cfg, context());
        HoodieCommitMetadata metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class);
        int inserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                inserts += stat.getNumInserts();
                fileIdToInsertsMap.put(stat.getFileId(), stat.getNumInserts());
                fileIdToUpsertsMap.put(stat.getFileId(), stat.getNumUpdateWrites());
            }
        }
        assertEquals(200, inserts);
        instantTime = "001";
        client.startCommitWithTime(instantTime);
        // generate updates + inserts. inserts should be handled into small files
        records = dataGen.generateUpdates(instantTime, records);
        records.addAll(dataGen.generateInserts(instantTime, 200));
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class);
        inserts = 0;
        int upserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                assertTrue(fileIdToInsertsMap.containsKey(stat.getFileId()));
                assertTrue(fileIdToUpsertsMap.containsKey(stat.getFileId()));
                inserts += stat.getNumInserts();
                upserts += stat.getNumUpdateWrites();
            }
        }
        assertEquals(200, inserts);
        assertEquals(200, upserts);
        // Test small file handling after compaction
        instantTime = "002";
        client.scheduleCompactionAtInstant(instantTime, Option.of(metadata.getExtraMetadata()));
        HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionMetadata = client.compact(instantTime);
        statuses = compactionMetadata.getWriteStatuses();
        client.commitCompaction(instantTime, compactionMetadata.getCommitMetadata().get(), Option.empty());
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        HoodieCommitMetadata metadata1 = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getCommitsTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class);
        // Ensure that the metadata stats from the extra metadata of delta commits is copied over to the compaction commit
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            assertTrue(metadata1.getPartitionToWriteStats().containsKey(pstat.getKey()));
            assertEquals(metadata1.getPartitionToWriteStats().get(pstat.getKey()).size(), pstat.getValue().size());
        }
        // Write inserts + updates
        instantTime = "003";
        client.startCommitWithTime(instantTime);
        // generate updates + inserts. inserts should be handled into small files
        records = dataGen.generateUpdates(instantTime, records);
        records.addAll(dataGen.generateInserts(instantTime, 200));
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, instantTime);
        assertTrue(client.commit(instantTime, statuses), "Commit should succeed");
        // Read from commit file
        table = HoodieSparkTable.create(cfg, context());
        metadata = HoodieCommitMetadata.fromBytes(table.getActiveTimeline().getInstantDetails(table.getActiveTimeline().getDeltaCommitTimeline().lastInstant().get()).get(), HoodieCommitMetadata.class);
        inserts = 0;
        upserts = 0;
        for (Map.Entry<String, List<HoodieWriteStat>> pstat : metadata.getPartitionToWriteStats().entrySet()) {
            for (HoodieWriteStat stat : pstat.getValue()) {
                assertTrue(fileIdToInsertsMap.containsKey(stat.getFileId()));
                inserts += stat.getNumInserts();
                upserts += stat.getNumUpdateWrites();
            }
        }
        assertEquals(200, inserts);
        assertEquals(400, upserts);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) List(java.util.List) Map(java.util.Map) HashMap(java.util.HashMap) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Aggregations

SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient)143 HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig)127 HoodieRecord (org.apache.hudi.common.model.HoodieRecord)113 ParameterizedTest (org.junit.jupiter.params.ParameterizedTest)86 Test (org.junit.jupiter.api.Test)80 WriteStatus (org.apache.hudi.client.WriteStatus)76 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)74 HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator)61 List (java.util.List)59 ArrayList (java.util.ArrayList)51 HoodieTable (org.apache.hudi.table.HoodieTable)51 Path (org.apache.hadoop.fs.Path)47 HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant)47 JavaRDD (org.apache.spark.api.java.JavaRDD)47 HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline)44 Collectors (java.util.stream.Collectors)43 Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals)43 HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig)42 HashMap (java.util.HashMap)41 Properties (java.util.Properties)41