
Example 1 with SparkDeleteDeltaCommitActionExecutor

Use of org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor in project hudi by apache.

The example comes from the class TestHoodieMergeOnReadTable, method testHandleUpdateWithMultiplePartitions.

/**
 * Test to validate that invoking table.handleUpdate() with input records from multiple partitions will fail.
 */
@Test
public void testHandleUpdateWithMultiplePartitions() throws Exception {
    HoodieWriteConfig cfg = getConfig(true);
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /**
         * Write 1 (only inserts, written as base file)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        HoodieSparkMergeOnReadTable hoodieTable = (HoodieSparkMergeOnReadTable) HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
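        // Only a delta commit exists so far, so a view scoped to the (empty) commit
        // timeline must see no base files, while a view over the completed commits
        // timeline (which includes delta commits on MOR tables) must see them.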
        BaseFileOnlyView roView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = roView.getLatestBaseFiles();
        assertFalse(dataFilesToRead.findAny().isPresent());
        roView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = roView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /**
         * Write 2 (only updates, written to .log file)
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        metaClient.reloadActiveTimeline();
        records = dataGen.generateUpdates(newCommitTime, records);
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        /**
         * Write 3 (only deletes, written to .log file)
         */
        final String newDeleteTime = "004";
        final String partitionPath = records.get(0).getPartitionPath();
        final String fileId = statuses.get(0).getFileId();
        client.startCommitWithTime(newDeleteTime);
        metaClient.reloadActiveTimeline();
        List<HoodieRecord> fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records);
        JavaRDD<HoodieRecord> deleteRDD = jsc().parallelize(fewRecordsForDelete, 1);
        // sync the file system view before initializing the upsert partitioner
        hoodieTable.getHoodieView().sync();
        BaseSparkDeltaCommitActionExecutor actionExecutor = new SparkDeleteDeltaCommitActionExecutor(context(), cfg, hoodieTable, newDeleteTime, HoodieJavaRDD.of(deleteRDD));
        actionExecutor.getUpsertPartitioner(new WorkloadProfile(buildProfile(deleteRDD)));
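        // Deliberate misuse under test: handleUpdate() is invoked for a single
        // (partition, fileId) pair but is fed delete records from every partition.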
        final List<List<WriteStatus>> deleteStatus = jsc().parallelize(Arrays.asList(1))
            .map(x -> actionExecutor.handleUpdate(partitionPath, fileId, fewRecordsForDelete.iterator()))
            .map(Transformations::flatten)
            .collect();
        // Verify there are errors because the records span multiple partitions,
        // while handleUpdate() is invoked for one specific partition.
        WriteStatus status = deleteStatus.get(0).get(0);
        assertTrue(status.hasErrors());
        long numRecordsInPartition = fewRecordsForDelete.stream().filter(u -> u.getPartitionPath().equals(partitionPath)).count();
        assertEquals(fewRecordsForDelete.size() - numRecordsInPartition, status.getTotalErrorRecords());
    }
}
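
For contrast, here is a minimal sketch (not part of the Hudi test) of how the same delete records could be routed without triggering these errors: group them by partition path first, then call handleUpdate() once per partition. The resolveFileId helper below is hypothetical and stands in for whatever file id the upsert partitioner's bucket assignment would produce; fewRecordsForDelete and actionExecutor are assumed to be in scope from the test above.

// Hypothetical variation: group delete records by partition so that each
// handleUpdate() invocation only sees records belonging to its own partition.
Map<String, List<HoodieRecord>> recordsByPartition = fewRecordsForDelete.stream()
    .collect(Collectors.groupingBy(HoodieRecord::getPartitionPath));
for (Map.Entry<String, List<HoodieRecord>> entry : recordsByPartition.entrySet()) {
    // resolveFileId is a hypothetical helper, not a Hudi API; in practice the
    // file id comes from the partitioner's bucket assignment for the partition.
    String targetFileId = resolveFileId(entry.getKey());
    actionExecutor.handleUpdate(entry.getKey(), targetFileId, entry.getValue().iterator());
}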
Also used : IOException (java.io.IOException), Arrays (java.util.Arrays), HashMap (java.util.HashMap), List (java.util.List), Map (java.util.Map), Properties (java.util.Properties), Set (java.util.Set), Collectors (java.util.stream.Collectors), Stream (java.util.stream.Stream), GenericRecord (org.apache.avro.generic.GenericRecord), FileStatus (org.apache.hadoop.fs.FileStatus), Path (org.apache.hadoop.fs.Path), JobConf (org.apache.hadoop.mapred.JobConf), HoodieReadClient (org.apache.hudi.client.HoodieReadClient), SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient), WriteStatus (org.apache.hudi.client.WriteStatus), FileSlice (org.apache.hudi.common.model.FileSlice), HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile), HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata), HoodieRecord (org.apache.hudi.common.model.HoodieRecord), HoodieTableType (org.apache.hudi.common.model.HoodieTableType), HoodieWriteStat (org.apache.hudi.common.model.HoodieWriteStat), HoodieTableConfig (org.apache.hudi.common.table.HoodieTableConfig), HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient), HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline), HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant), State (org.apache.hudi.common.table.timeline.HoodieInstant.State), HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline), BaseFileOnlyView (org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView), HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator), Transformations (org.apache.hudi.common.testutils.Transformations), Option (org.apache.hudi.common.util.Option), HoodieClusteringConfig (org.apache.hudi.config.HoodieClusteringConfig), HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig), HoodieJavaRDD (org.apache.hudi.data.HoodieJavaRDD), HoodieIndex (org.apache.hudi.index.HoodieIndex), IndexType (org.apache.hudi.index.HoodieIndex.IndexType), HoodieTableMetadataWriter (org.apache.hudi.metadata.HoodieTableMetadataWriter), SparkHoodieBackedTableMetadataWriter (org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter), HoodieWriteMetadata (org.apache.hudi.table.action.HoodieWriteMetadata), BaseSparkDeltaCommitActionExecutor (org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor), SparkDeleteDeltaCommitActionExecutor (org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor), Assertions.assertNoWriteErrors (org.apache.hudi.testutils.Assertions.assertNoWriteErrors), HoodieClientTestHarness.buildProfile (org.apache.hudi.testutils.HoodieClientTestHarness.buildProfile), HoodieClientTestUtils (org.apache.hudi.testutils.HoodieClientTestUtils), HoodieMergeOnReadTestUtils (org.apache.hudi.testutils.HoodieMergeOnReadTestUtils), HoodieSparkWriteableTestTable (org.apache.hudi.testutils.HoodieSparkWriteableTestTable), MetadataMergeWriteStatus (org.apache.hudi.testutils.MetadataMergeWriteStatus), SparkClientFunctionalTestHarness (org.apache.hudi.testutils.SparkClientFunctionalTestHarness), JavaRDD (org.apache.spark.api.java.JavaRDD), Dataset (org.apache.spark.sql.Dataset), Row (org.apache.spark.sql.Row), StorageLevel (org.apache.spark.storage.StorageLevel), BeforeEach (org.junit.jupiter.api.BeforeEach), Test (org.junit.jupiter.api.Test), Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals), Assertions.assertFalse (org.junit.jupiter.api.Assertions.assertFalse), Assertions.assertTrue (org.junit.jupiter.api.Assertions.assertTrue), ParameterizedTest (org.junit.jupiter.params.ParameterizedTest), ValueSource (org.junit.jupiter.params.provider.ValueSource)

Aggregations

IOException (java.io.IOException) 1
Arrays (java.util.Arrays) 1
HashMap (java.util.HashMap) 1
List (java.util.List) 1
Map (java.util.Map) 1
Properties (java.util.Properties) 1
Set (java.util.Set) 1
Collectors (java.util.stream.Collectors) 1
Stream (java.util.stream.Stream) 1
GenericRecord (org.apache.avro.generic.GenericRecord) 1
FileStatus (org.apache.hadoop.fs.FileStatus) 1
Path (org.apache.hadoop.fs.Path) 1
JobConf (org.apache.hadoop.mapred.JobConf) 1
HoodieReadClient (org.apache.hudi.client.HoodieReadClient) 1
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient) 1
WriteStatus (org.apache.hudi.client.WriteStatus) 1
FileSlice (org.apache.hudi.common.model.FileSlice) 1
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 1
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 1
HoodieRecord (org.apache.hudi.common.model.HoodieRecord) 1