Example 36 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testSimpleTagLocationAndUpdateWithRollback.

@Test
public void testSimpleTagLocationAndUpdateWithRollback() throws Exception {
    // Build the write config, with marker-based rollback disabled
    HoodieWriteConfig config = getConfigBuilder(100, false, false).withRollbackUsingMarkers(false).build();
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    final String newCommitTime = writeClient.startCommit();
    final int numRecords = 10;
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    // Insert the 10 generated records
    JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, newCommitTime);
    assertNoWriteErrors(writeStatuses.collect());
    // commit this upsert
    writeClient.commit(newCommitTime, writeStatuses);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    // Now tagLocation for these records; hbaseIndex should tag them
    List<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable).collect();
    assertEquals(numRecords, records2.stream().filter(HoodieRecord::isCurrentLocationKnown).count());
    // check tagged records are tagged with correct fileIds
    List<String> fileIds = writeStatuses.map(WriteStatus::getFileId).collect();
    assertEquals(0, records2.stream().filter(record -> record.getCurrentLocation().getFileId() == null).count());
    List<String> taggedFileIds = records2.stream().map(record -> record.getCurrentLocation().getFileId()).distinct().collect(Collectors.toList());
    // both lists should match
    assertTrue(taggedFileIds.containsAll(fileIds) && fileIds.containsAll(taggedFileIds));
    // Rollback the last commit
    writeClient.rollback(newCommitTime);
    hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    // Now tagLocation for these records; hbaseIndex should not tag them since the
    // commit was rolled back
    List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
    assertEquals(0, records3.stream().filter(HoodieRecord::isCurrentLocationKnown).count());
    assertEquals(0, records3.stream().filter(record -> record.getCurrentLocation() != null).count());
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 37 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testSmallBatchSize.

@Test
public void testSmallBatchSize() throws Exception {
    final String newCommitTime = "001";
    final int numRecords = 10;
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, numRecords);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    // Build the write config (small batch size for this test)
    HoodieWriteConfig config = getConfig(2);
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
        // Test tagLocation without any entries in the index
        JavaRDD<HoodieRecord> records1 = tagLocation(index, writeRecords, hoodieTable);
        assertEquals(0, records1.filter(record -> record.isCurrentLocationKnown()).count());
        // Insert the 10 generated records
        writeClient.startCommitWithTime(newCommitTime);
        JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, newCommitTime);
        assertNoWriteErrors(writeStatuses.collect());
        // Now tagLocation for these records; hbaseIndex should not tag them since the
        // commit has not completed yet
        JavaRDD<HoodieRecord> records2 = tagLocation(index, writeRecords, hoodieTable);
        assertEquals(0, records2.filter(record -> record.isCurrentLocationKnown()).count());
        // Now commit this upsert, updating the location of the inserted records, and validate no errors
        writeClient.commit(newCommitTime, writeStatuses);
        // Now tagLocation for these records; hbaseIndex should tag them correctly
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(config, context, metaClient);
        List<HoodieRecord> records3 = tagLocation(index, writeRecords, hoodieTable).collect();
        assertEquals(numRecords, records3.stream().filter(record -> record.isCurrentLocationKnown()).count());
        assertEquals(numRecords, records3.stream().map(record -> record.getKey().getRecordKey()).distinct().count());
        assertEquals(numRecords, records3.stream().filter(record -> (record.getCurrentLocation() != null && record.getCurrentLocation().getInstantTime().equals(newCommitTime))).distinct().count());
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 38 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class TestSparkHoodieHBaseIndex method testTotalPutsBatching.

@Test
public void testTotalPutsBatching() throws Exception {
    HoodieWriteConfig config = getConfig();
    SparkHoodieHBaseIndex index = new SparkHoodieHBaseIndex(config);
    SparkRDDWriteClient writeClient = getHoodieWriteClient(config);
    // start a commit and generate test data
    String newCommitTime = writeClient.startCommit();
    List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 250);
    JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
    metaClient = HoodieTableMetaClient.reload(metaClient);
    HoodieTable hoodieTable = HoodieSparkTable.create(config, context, metaClient);
    // Insert the 250 generated records
    JavaRDD<WriteStatus> writeStatuses = writeClient.upsert(writeRecords, newCommitTime);
    // commit this upsert
    writeClient.commit(newCommitTime, writeStatuses);
    // Mock hbaseConnection and related entities
    Connection hbaseConnection = mock(Connection.class);
    HTable table = mock(HTable.class);
    when(hbaseConnection.getTable(TableName.valueOf(TABLE_NAME))).thenReturn(table);
    when(table.get((List<Get>) any())).thenReturn(new Result[0]);
    // Only for this test: point the index at the mocked connection
    index.setHbaseConnection(hbaseConnection);
    // Get the number of distinct fileIds written
    int numberOfDataFileIds = (int) writeStatuses.map(status -> status.getFileId()).distinct().count();
    updateLocation(index, writeStatuses, hoodieTable);
    // With batchSize = 100 and updates spread across numberOfDataFileIds fileIds,
    // the number of put calls against the table should not exceed numberOfDataFileIds
    verify(table, atMost(numberOfDataFileIds)).put((List<Put>) any());
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Connection(org.apache.hadoop.hbase.client.Connection) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HTable(org.apache.hadoop.hbase.client.HTable) Put(org.apache.hadoop.hbase.client.Put) HoodieTable(org.apache.hudi.table.HoodieTable) Get(org.apache.hadoop.hbase.client.Get) WriteStatus(org.apache.hudi.client.WriteStatus) Test(org.junit.jupiter.api.Test) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
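
The batched-put behavior this test verifies can be pictured with a small standalone sketch. The helper below is hypothetical, not Hudi's actual implementation; it only illustrates why flushing a list of puts in fixed-size chunks bounds the number of put calls against the table.

// Hypothetical helper: flush index updates to HBase in fixed-size batches.
// With 250 puts and batchSize = 100, this issues ceil(250 / 100) = 3 put calls.
static void flushInBatches(HTable table, List<Put> puts, int batchSize) throws IOException {
    for (int i = 0; i < puts.size(); i += batchSize) {
        // Each sublist becomes a single table.put(...) round trip
        table.put(puts.subList(i, Math.min(i + batchSize, puts.size())));
    }
}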

Example 39 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class BaseHoodieWriteClient method clean.

/**
 * Clean up any stale/old files/data lying around (either on file storage or index storage) based on the
 * configurations and CleaningPolicy used (typically, files that can no longer be used by a running query are
 * cleaned). This API provides the flexibility to schedule the clean instant asynchronously via
 * {@link BaseHoodieWriteClient#scheduleTableService(String, Option, TableServiceType)} and disable inline
 * scheduling of the clean.
 * @param cleanInstantTime instant time for the clean.
 * @param scheduleInline true if the clean needs to be scheduled inline, false otherwise.
 * @param skipLocking if this is triggered by another parent transaction, locking can be skipped.
 */
public HoodieCleanMetadata clean(String cleanInstantTime, boolean scheduleInline, boolean skipLocking) throws HoodieIOException {
    if (!tableServicesEnabled(config)) {
        return null;
    }
    final Timer.Context timerContext = metrics.getCleanCtx();
    CleanerUtils.rollbackFailedWrites(config.getFailedWritesCleanPolicy(), HoodieTimeline.CLEAN_ACTION, () -> rollbackFailedWrites(skipLocking));
    HoodieCleanMetadata metadata = null;
    HoodieTable table = createTable(config, hadoopConf);
    if (config.allowMultipleCleans() || !table.getActiveTimeline().getCleanerTimeline().filterInflightsAndRequested().firstInstant().isPresent()) {
        LOG.info("Cleaner started");
        // proceed only if multiple clean schedules are enabled or if there are no pending cleans.
        if (scheduleInline) {
            scheduleTableServiceInternal(cleanInstantTime, Option.empty(), TableServiceType.CLEAN);
            table.getMetaClient().reloadActiveTimeline();
        }
        metadata = table.clean(context, cleanInstantTime, skipLocking);
        if (timerContext != null && metadata != null) {
            long durationMs = metrics.getDurationInMs(timerContext.stop());
            metrics.updateCleanMetrics(durationMs, metadata.getTotalFilesDeleted());
            LOG.info("Cleaned " + metadata.getTotalFilesDeleted() + " files" + " Earliest Retained Instant :" + metadata.getEarliestCommitToRetain() + " cleanerElapsedMs" + durationMs);
        }
    }
    return metadata;
}
Also used : Timer(com.codahale.metrics.Timer) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieCleanMetadata(org.apache.hudi.avro.model.HoodieCleanMetadata)
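
A minimal usage sketch for this API, assuming an already-configured write client; the instant-time helper shown is an assumption and may differ across Hudi releases.

// Illustrative only: schedule the clean inline and run it in one call.
String cleanInstantTime = HoodieActiveTimeline.createNewInstantTime();  // assumed helper
HoodieCleanMetadata cleanMetadata = writeClient.clean(cleanInstantTime, true, false);
// A null result means table services are disabled or a clean was already pending.
if (cleanMetadata != null) {
    System.out.println("Deleted " + cleanMetadata.getTotalFilesDeleted() + " files");
}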

Example 40 with HoodieTable

use of org.apache.hudi.table.HoodieTable in project hudi by apache.

the class BaseHoodieWriteClient method archive.

/**
 * Trigger archival for the table. This ensures that the number of commits does not
 * keep growing unbounded over time.
 */
public void archive() {
    // Create a Hoodie table, which encapsulates the commits and files visible
    HoodieTable table = createTable(config, hadoopConf);
    archive(table);
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable)
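
As a usage sketch (the write client, commit time, and write statuses are assumed from a surrounding write flow, as in the tests above), archival is typically triggered once a commit lands.

// Illustrative only: archive old completed instants after committing.
writeClient.commit(newCommitTime, writeStatuses);
writeClient.archive();  // trims the active timeline per the archival configuration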

Aggregations

HoodieTable (org.apache.hudi.table.HoodieTable): 133
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 105
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 76
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 75
List (java.util.List): 64
Test (org.junit.jupiter.api.Test): 63
ArrayList (java.util.ArrayList): 58
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 57
WriteStatus (org.apache.hudi.client.WriteStatus): 49
Path (org.apache.hadoop.fs.Path): 48
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 46
Option (org.apache.hudi.common.util.Option): 46
IOException (java.io.IOException): 44
Map (java.util.Map): 44
Collectors (java.util.stream.Collectors): 44
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 43
HashMap (java.util.HashMap): 41
Pair (org.apache.hudi.common.util.collection.Pair): 39
HoodieKey (org.apache.hudi.common.model.HoodieKey): 38
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 38