
Example 36 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestHoodieMergeHandle method testHoodieMergeHandleWriteStatMetrics.

@ParameterizedTest
@MethodSource("testArguments")
public void testHoodieMergeHandleWriteStatMetrics(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
    // Insert 100 records
    // Build a common write config, layering in the parameterized disk map settings
    Properties properties = new Properties();
    properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
    properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
    HoodieWriteConfig config = getConfigBuilder().withProperties(properties).build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc.parallelize(records, 1);
        List<WriteStatus> statuses = writeClient.insert(recordsRDD, newCommitTime).collect();
        // All records should be inserts into new parquet
        assertTrue(statuses.stream().filter(status -> status.getStat().getPrevCommit() != HoodieWriteStat.NULL_COMMIT).count() > 0);
        // Num writes should be equal to the number of records inserted
        assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get());
        // Num update writes should be equal to the number of records updated
        assertEquals(0, (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get());
        // Num inserts should be equal to the number of records written as fresh inserts
        assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get());
        // Update all the 100 records
        metaClient = HoodieTableMetaClient.reload(metaClient);
        newCommitTime = "101";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc.parallelize(updatedRecords, 1);
        statuses = writeClient.upsert(updatedRecordsRDD, newCommitTime).collect();
        // All records should be upserts into existing parquet
        assertEquals(0, statuses.stream().filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count());
        // Num writes should be equal to the number of records updated
        assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get());
        // Num update writes should be equal to the number of records updated
        assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get());
        // Num inserts should be zero, since all records in this commit were updates
        assertEquals(0, (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get());
        newCommitTime = "102";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> allRecords = dataGen.generateInserts(newCommitTime, 100);
        allRecords.addAll(updatedRecords);
        JavaRDD<HoodieRecord> allRecordsRDD = jsc.parallelize(allRecords, 1);
        statuses = writeClient.upsert(allRecordsRDD, newCommitTime).collect();
        // All records should be upserts into existing parquet files (new inserts are routed into existing small files)
        assertEquals(0, (long) statuses.stream().filter(status -> status.getStat().getPrevCommit() == HoodieWriteStat.NULL_COMMIT).count());
        // Num writes should be equal to the total number of records written
        assertEquals(200, (long) statuses.stream().map(status -> status.getStat().getNumWrites()).reduce((a, b) -> a + b).get());
        // Num update writes should be equal to the number of records updated
        assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumUpdateWrites()).reduce((a, b) -> a + b).get());
        // Num inserts should be equal to the number of new records, even when small file
        // handling routes them into an existing file
        assertEquals(100, (long) statuses.stream().map(status -> status.getStat().getNumInserts()).reduce((a, b) -> a + b).get());
        // Verify all records have location set
        statuses.forEach(writeStatus -> {
            writeStatus.getWrittenRecords().forEach(r -> {
                // Ensure New Location is set
                assertTrue(r.getNewLocation().isPresent());
            });
        });
    }
}
Also used : BeforeEach(org.junit.jupiter.api.BeforeEach) Dataset(org.apache.spark.sql.Dataset) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) ArrayList(java.util.ArrayList) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) ExternalSpillableMap(org.apache.hudi.common.util.collection.ExternalSpillableMap) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Arguments.arguments(org.junit.jupiter.params.provider.Arguments.arguments) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) MethodSource(org.junit.jupiter.params.provider.MethodSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Row(org.apache.spark.sql.Row) Arguments(org.junit.jupiter.params.provider.Arguments) HoodieIndex(org.apache.hudi.index.HoodieIndex) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) List(java.util.List) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Stream(java.util.stream.Stream) Paths(java.nio.file.Paths) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) HoodieCommonConfig(org.apache.hudi.common.config.HoodieCommonConfig) FSUtils(org.apache.hudi.common.fs.FSUtils)
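
The @MethodSource("testArguments") provider referenced above is not reproduced on this page. A minimal sketch of what it could look like, assuming it pairs each ExternalSpillableMap.DiskMapType with compression toggled on and off (the actual provider in TestHoodieMergeHandle may differ):

private static Stream<Arguments> testArguments() {
    // Hypothetical provider: every disk map type, with and without compression.
    return Stream.of(
        arguments(ExternalSpillableMap.DiskMapType.BITCASK, false),
        arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, false),
        arguments(ExternalSpillableMap.DiskMapType.BITCASK, true),
        arguments(ExternalSpillableMap.DiskMapType.ROCKS_DB, true));
}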

Example 37 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testSimpleTagLocationWithInvalidCommit.

/*
   * Test case to verify that, for tagLocation entries present in HBase, if the corresponding commit instant is
   * missing from the timeline and the commit is not archived, tagLocation resets the current record location to null.
   */
@Test
public void testSimpleTagLocationWithInvalidCommit() throws Exception {
    // Load to memory
    HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build();
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    String newCommitTime = writeClient.startCommit();
    // make a commit with 199 records
    JavaRDD<HoodieRecord> writeRecords = generateAndCommitRecords(writeClient, 199, newCommitTime);
    // make a second commit with a single record
    String invalidCommit = writeClient.startCommit();
    JavaRDD<HoodieRecord> invalidWriteRecords = generateAndCommitRecords(writeClient, 1, invalidCommit);
    // verify location is tagged.
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> javaRDD0 = tagLocation(index, invalidWriteRecords, hoodieTable);
    // one record present
    assert (javaRDD0.collect().size() == 1);
    // it is tagged
    assert (javaRDD0.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 1);
    assert (javaRDD0.collect().get(0).getCurrentLocation().getInstantTime().equals(invalidCommit));
    // Rollback the invalid commit, so that HBase is left with a stale entry.
    writeClient.rollback(invalidCommit);
    // Now tagLocation for the valid records, hbaseIndex should tag them
    metaClient = HoodieTableMetaClient.reload(metaClient);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords, hoodieTable);
    assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 199);
    // tagLocation for the invalid record - commit is not present in timeline due to rollback.
    JavaRDD<HoodieRecord> javaRDD2 = tagLocation(index, invalidWriteRecords, hoodieTable);
    // one record present
    assert (javaRDD2.collect().size() == 1);
    // it is not tagged
    assert (javaRDD2.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 0);
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
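
The generateAndCommitRecords helper these HBase index tests lean on is not shown here. A plausible sketch of its shape, assuming the harness fields dataGen, jsc() and the assertNoWriteErrors utility seen elsewhere on this page (the real helper in TestSparkHoodieHBaseIndex may differ):

private JavaRDD<HoodieRecord> generateAndCommitRecords(SparkRDDWriteClient writeClient,
        int numRecs, String commitTime) throws Exception {
    // Generate fresh inserts, write them, and complete the commit so the
    // records become visible to the index.
    List<HoodieRecord> records = dataGen.generateInserts(commitTime, numRecs);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, commitTime);
    assertNoWriteErrors(writeStatuses.collect());
    writeClient.commit(commitTime, writeStatuses);
    return writeRecords;
}

The two-argument overload used in later examples presumably starts a fresh commit itself via writeClient.startCommit() before delegating to this method.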

Example 38 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testEnsureTagLocationUsesCommitTimeline.

/*
   * Test case to verify that tagLocation() uses the commit timeline to validate the commit timestamp stored in HBase.
   * If checkIfValidCommit() in the HBase index used incorrect timeline filtering, this test would fail.
   */
@Test
public void testEnsureTagLocationUsesCommitTimeline() throws Exception {
    // Load to memory
    HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build();
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    String commitTime1 = writeClient.startCommit();
    JavaRDD<HoodieRecord> writeRecords1 = generateAndCommitRecords(writeClient, 20, commitTime1);
    // rollback the commit - leaves a clean file in timeline.
    writeClient.rollback(commitTime1);
    // create a second commit with 20 records
    metaClient = HoodieTableMetaClient.reload(metaClient);
    generateAndCommitRecords(writeClient, 20);
    // Now tagLocation for the first set of rolledback records, hbaseIndex should tag them
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords1, hoodieTable);
    assert (javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size() == 20);
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
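
The behavior under test reduces to a timeline membership check: a commit timestamp read back from HBase is only trusted if the completed-commit timeline still contains it, or if it predates the start of the active timeline (i.e. it was archived). An illustrative check using the harness's metaClient field, assuming HoodieTimeline's containsOrBeforeTimelineStarts method; this is a sketch, not the actual HBase index code:

// Hypothetical commit timestamp read back from an HBase row.
String commitTs = "001";
HoodieTimeline commitTimeline = metaClient.getCommitsTimeline().filterCompletedInstants();
// Valid if still on the active timeline, or older than its first instant (archived).
boolean isValidCommit = !commitTimeline.empty()
        && commitTimeline.containsOrBeforeTimelineStarts(commitTs);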

Example 39 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testSimpleTagLocationAndUpdate.

@ParameterizedTest
@EnumSource(HoodieTableType.class)
public void testSimpleTagLocationAndUpdate(HoodieTableType tableType) throws Exception {
    metaClient = HoodieTestUtils.init(hadoopConf, basePath, tableType);
    final String newCommitTime = "001";
    final int numRecords = 10;
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    // Load to memory
    HoodieWriteConfig config = getConfig();
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
        // Test tagLocation without any entries in index
        JavaRDD<HoodieRecord> records1 = tagLocation(index, writeRecords, hoodieTable);
        assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count());
        // Insert the numRecords records generated above
        writeClient.startCommitWithTime(newCommitTime);
        JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, newCommitTime);
        assertNoWriteErrors(writeStatuses.collect());
        // Now tagLocation for these records; the index should not tag them since the commit has not completed
        JavaRDD<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable);
        assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count());
        // Now commit this & update location of records inserted and validate no errors
        writeClient.commit(newCommitTime, writeStatuses);
        // Now tagLocation for these records, hbaseIndex should tag them correctly
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(config, context, metaClient);
        List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
        assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count());
        assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count());
        assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) BeforeEach(org.junit.jupiter.api.BeforeEach) Arrays(java.util.Arrays) Result(org.apache.hadoop.hbase.client.Result) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) AfterAll(org.junit.jupiter.api.AfterAll) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) BeforeAll(org.junit.jupiter.api.BeforeAll) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) Tag(org.junit.jupiter.api.Tag) Get(org.apache.hadoop.hbase.client.Get) UUID(java.util.UUID) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) List(java.util.List) HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration) HBaseTestingUtility(org.apache.hadoop.hbase.HBaseTestingUtility) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) Mockito.atMost(org.mockito.Mockito.atMost) Mockito.mock(org.mockito.Mockito.mock) ArgumentMatchers.any(org.mockito.ArgumentMatchers.any) Option(org.apache.hudi.common.util.Option) EnumSource(org.junit.jupiter.params.provider.EnumSource) HashMap(java.util.HashMap) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HTable(org.apache.hadoop.hbase.client.HTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) EmptyHoodieRecordPayload(org.apache.hudi.common.model.EmptyHoodieRecordPayload) LinkedList(java.util.LinkedList) JavaRDD(org.apache.spark.api.java.JavaRDD) Bytes(org.apache.hadoop.hbase.util.Bytes) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) TableName(org.apache.hadoop.hbase.TableName) TestMethodOrder(org.junit.jupiter.api.TestMethodOrder) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Put(org.apache.hadoop.hbase.client.Put) HoodieHBaseIndexConfig(org.apache.hudi.config.HoodieHBaseIndexConfig) IOException(java.io.IOException) Mockito.times(org.mockito.Mockito.times) Mockito.when(org.mockito.Mockito.when) HoodieAvroRecord(org.apache.hudi.common.model.HoodieAvroRecord) MethodOrderer(org.junit.jupiter.api.MethodOrderer) Mockito.verify(org.mockito.Mockito.verify) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) Connection(org.apache.hadoop.hbase.client.Connection) HoodieIndexConfig(org.apache.hudi.config.HoodieIndexConfig) HoodieKey(org.apache.hudi.common.model.HoodieKey) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils)
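
The tagLocation(index, records, table) calls in these tests go through a harness helper rather than the index API directly. A plausible sketch of that helper, assuming the HoodieJavaRDD wrapper and the harness's context field (the real method in the test harness may differ):

private JavaRDD<HoodieRecord> tagLocation(HoodieIndex index,
        JavaRDD<HoodieRecord> records, HoodieTable table) {
    // Wrap the Spark RDD in Hudi's engine-agnostic HoodieData, tag locations
    // against the index, then unwrap back to a JavaRDD.
    return HoodieJavaRDD.getJavaRDD(
        index.tagLocation(HoodieJavaRDD.of(records), context, table));
}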

Example 40 with SparkRDDWriteClient

use of org.apache.hudi.client.SparkRDDWriteClient in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testHbaseTagLocationForArchivedCommits.

// Verify hbase is tagging records belonging to an archived commit as valid.
@Test
public void testHbaseTagLocationForArchivedCommits() throws Exception {
    // Load to memory
    Map<String, String> params = new HashMap<>();
    params.put(HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), "1");
    params.put(HoodieCompactionConfig.MAX_COMMITS_TO_KEEP.key(), "3");
    params.put(HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), "2");
    HoodieWriteConfig config = getConfigBuilder(100, false, false).withProps(params).build();
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    // make first commit with 20 records
    JavaRDD<HoodieRecord> writeRecords1 = generateAndCommitRecords(writeClient, 20);
    // Make 3 additional commits, so that first commit is archived
    for (int nCommit = 0; nCommit < 3; nCommit++) {
        generateAndCommitRecords(writeClient, 20);
    }
    // tagLocation for the first set of records (for the archived commit), hbaseIndex should tag them as valid
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    JavaRDD<HoodieRecord> javaRDD1 = tagLocation(index, writeRecords1, hoodieTable);
    assertEquals(20, javaRDD1.filter(HoodieRecord::isCurrentLocationKnown).collect().size());
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HashMap(java.util.HashMap) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
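
The same retention settings can be expressed through the typed compaction config builder instead of raw property keys. A sketch, assuming HoodieCompactionConfig's retainCommits and archiveCommitsWith(minToKeep, maxToKeep) builder methods; verify the names against your Hudi version:

HoodieWriteConfig config = getConfigBuilder(100, false, false)
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        // Cleaner retains 1 commit; archival kicks in past 3 commits and
        // trims the active timeline down to 2.
        .retainCommits(1)
        .archiveCommitsWith(2, 3)
        .build())
    .build();

With at most 3 commits kept, the three extra commits made in the loop push the first commit onto the archived timeline, which is exactly the state the final assertion probes.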

Aggregations

SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 143
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 127
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 113
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 86
Test (org.junit.jupiter.api.Test): 80
WriteStatus (org.apache.hudi.client.WriteStatus): 76
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 74
HoodieTestDataGenerator (org.apache.hudi.common.testutils.HoodieTestDataGenerator): 61
List (java.util.List): 59
ArrayList (java.util.ArrayList): 51
HoodieTable (org.apache.hudi.table.HoodieTable): 51
Path (org.apache.hadoop.fs.Path): 47
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 47
JavaRDD (org.apache.spark.api.java.JavaRDD): 47
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 44
Collectors (java.util.stream.Collectors): 43
Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals): 43
HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig): 42
HashMap (java.util.HashMap): 41
Properties (java.util.Properties): 41