
Example 11 with HoodieReadClient

Use of org.apache.hudi.client.HoodieReadClient in project hudi by apache, from the class TestHoodieMergeOnReadTable, method testLogFileCountsAfterCompaction.

// TODO: Enable metadata virtual keys in this test once the feature HUDI-2593 is completed
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws Exception {
    boolean populateMetaFields = true;
    // insert 100 records
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true, false, HoodieIndex.IndexType.BLOOM, 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig config = cfgBuilder.build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        // Update all the 100 records
        newCommitTime = "101";
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc().parallelize(updatedRecords, 1);
        HoodieReadClient readClient = new HoodieReadClient(context(), config);
        JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = readClient.tagLocation(updatedRecordsRDD);
        writeClient.startCommitWithTime(newCommitTime);
        writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
        // Write them to corresponding avro logfiles
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
        HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, metadataWriter);
        Set<String> allPartitions = updatedRecords.stream().map(record -> record.getPartitionPath()).collect(Collectors.groupingBy(partitionPath -> partitionPath)).keySet();
        assertEquals(allPartitions.size(), testTable.listAllBaseFiles().length);
        // Verify that each data file has exactly one log file
        HoodieTable table = HoodieSparkTable.create(config, context(), metaClient, true);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice fileSlice : groupedLogFiles) {
                assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for the latest data file - " + fileSlice);
            }
        }
        // Do a compaction
        String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
        HoodieWriteMetadata<JavaRDD<WriteStatus>> result = writeClient.compact(compactionInstantTime);
        // Verify that the recently compacted data files have no log files
        metaClient = HoodieTableMetaClient.reload(metaClient);
        table = HoodieSparkTable.create(config, context(), metaClient);
        HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
        assertTrue(HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), HoodieTimeline.GREATER_THAN, newCommitTime), "Compaction commit should be later than the last delta commit");
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice slice : groupedLogFiles) {
                assertEquals(0, slice.getLogFiles().count(), "After compaction there should be no log files visible on a full view");
            }
            assertTrue(result.getCommitMetadata().get().getWritePartitionPaths().stream().anyMatch(part -> part.contentEquals(partitionPath)));
        }
        // Check the entire dataset has all records still
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath(), dataGen.getPartitionPaths()[i]);
        }
        Dataset<Row> actual = HoodieClientTestUtils.read(jsc(), basePath(), sqlContext(), fs(), fullPartitionPaths);
        List<Row> rows = actual.collectAsList();
        assertEquals(updatedRecords.size(), rows.size());
        for (Row row : rows) {
            assertEquals(row.getAs(HoodieRecord.COMMIT_TIME_METADATA_FIELD), preserveCommitMeta ? newCommitTime : compactionInstantTime);
        }
    }
}
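Stripped of the assertions and compaction steps, the HoodieReadClient usage in this test comes down to: build the read client against the same write config, tag incoming records with their current file locations, and hand the pre-tagged RDD to upsertPreppedRecords. The sketch below is a minimal restatement of that flow, reusing only the harness helpers already shown above (getHoodieWriteClient(), context(), jsc(), dataGen); the config and instant times are illustrative.

// Minimal sketch of the tag-then-upsert-prepped flow; "config" is an already
// built HoodieWriteConfig as in the test above.
try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
    // First commit: plain inserts, indexed by the write client itself.
    String insertTime = "100";
    writeClient.startCommitWithTime(insertTime);
    List<HoodieRecord> inserts = dataGen.generateInserts(insertTime, 100);
    writeClient.insert(jsc().parallelize(inserts, 1), insertTime).collect();
    // Second commit: tag the updates through HoodieReadClient so the index lookup
    // happens up front, then write them as already-prepped records.
    String updateTime = "101";
    HoodieReadClient readClient = new HoodieReadClient(context(), config);
    JavaRDD<HoodieRecord> updates = jsc().parallelize(dataGen.generateUpdates(updateTime, inserts), 1);
    writeClient.startCommitWithTime(updateTime);
    writeClient.upsertPreppedRecords(readClient.tagLocation(updates), updateTime).collect();
}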

Example 12 with HoodieReadClient

Use of org.apache.hudi.client.HoodieReadClient in project hudi by apache, from the class TestAsyncCompaction, method testInterleavedCompaction.

@Test
public void testInterleavedCompaction() throws Exception {
    // Case: Two delta commits before and after compaction schedule
    HoodieWriteConfig cfg = getConfig(true);
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
        String firstInstantTime = "001";
        String secondInstantTime = "004";
        String compactionInstantTime = "005";
        String thirdInstantTime = "006";
        String fourthInstantTime = "007";
        int numRecs = 2000;
        List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
        records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
        HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
        scheduleCompaction(compactionInstantTime, client, cfg);
        runNextDeltaCommits(client, readClient, Arrays.asList(thirdInstantTime, fourthInstantTime), records, cfg, false, Arrays.asList(compactionInstantTime));
        executeCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, true);
    }
}
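scheduleCompaction and executeCompaction here are test-harness helpers; on the write client itself, the same two-phase flow is the pair of calls already shown in Example 11. A minimal sketch, assuming "client" is the open SparkRDDWriteClient from the example:

// Phase 1: only put a compaction request on the timeline; nothing is executed yet,
// so delta commits with later instant times can still interleave (as this test does).
String compactionInstant = client.scheduleCompaction(Option.empty()).get().toString();

// Phase 2: execute the pending compaction and obtain the resulting write metadata.
HoodieWriteMetadata<JavaRDD<WriteStatus>> compactionResult = client.compact(compactionInstant);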

Example 13 with HoodieReadClient

Use of org.apache.hudi.client.HoodieReadClient in project hudi by apache, from the class TestAsyncCompaction, method testCompactionAfterTwoDeltaCommits.

@Test
public void testCompactionAfterTwoDeltaCommits() throws Exception {
    // No Delta Commits after compaction request
    HoodieWriteConfig cfg = getConfig(true);
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
        String firstInstantTime = "001";
        String secondInstantTime = "004";
        String compactionInstantTime = "005";
        int numRecs = 2000;
        List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
        runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
        HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
        HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
        scheduleAndExecuteCompaction(compactionInstantTime, client, hoodieTable, cfg, numRecs, false);
    }
}
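scheduleAndExecuteCompaction wraps the same schedule/compact pair; its effect can be checked the way Example 11 does it, by reloading the meta client and confirming that the newest instant on the active timeline is the compaction commit, later than the last delta commit. A sketch using only calls that appear in Example 11:

// Sketch: after compaction, the newest instant on the timeline should be the
// compaction commit, i.e. later than the last delta commit ("004" above).
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
assertTrue(
    HoodieTimeline.compareTimestamps(
        timeline.lastInstant().get().getTimestamp(), HoodieTimeline.GREATER_THAN, secondInstantTime),
    "Compaction commit should be later than the last delta commit");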

Example 14 with HoodieReadClient

Use of org.apache.hudi.client.HoodieReadClient in project hudi by apache, from the class TestAsyncCompaction, method testScheduleIngestionBeforePendingCompaction.

@Test
public void testScheduleIngestionBeforePendingCompaction() throws Exception {
    // Case: Failure case. Latest pending compaction instant time must be earlier than this instant time
    HoodieWriteConfig cfg = getConfig(false);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
    String firstInstantTime = "001";
    String secondInstantTime = "004";
    String failedInstantTime = "005";
    String compactionInstantTime = "006";
    int numRecs = 2000;
    final List<HoodieRecord> initialRecords = dataGen.generateInserts(firstInstantTime, numRecs);
    final List<HoodieRecord> records = runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), initialRecords, cfg, true, new ArrayList<>());
    // Schedule compaction but do not run it
    scheduleCompaction(compactionInstantTime, client, cfg);
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    HoodieInstant pendingCompactionInstant = metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
    assertEquals(compactionInstantTime, pendingCompactionInstant.getTimestamp(), "Pending Compaction instant has expected instant time");
    assertThrows(IllegalArgumentException.class, () -> {
        runNextDeltaCommits(client, readClient, Arrays.asList(failedInstantTime), records, cfg, false, Arrays.asList(compactionInstantTime));
    }, "Latest pending compaction instant time must be earlier than this instant time");
}
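The failure asserted above comes from the ordering rule that an ingestion (delta commit) instant must be later than any pending compaction instant. A small sketch of that guard, using only calls already present in this example and in Example 11; the candidate instant time is illustrative:

// Sketch: "005" above fails because it sorts before the pending compaction "006";
// a valid next ingestion instant must be strictly later.
HoodieInstant pendingCompaction =
    metaClient.getActiveTimeline().filterPendingCompactionTimeline().firstInstant().get();
String nextIngestionTime = "007";  // illustrative candidate
assertTrue(
    HoodieTimeline.compareTimestamps(
        nextIngestionTime, HoodieTimeline.GREATER_THAN, pendingCompaction.getTimestamp()),
    "New ingestion instant must be later than the pending compaction instant");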

Example 15 with HoodieReadClient

Use of org.apache.hudi.client.HoodieReadClient in project hudi by apache, from the class TestAsyncCompaction, method testScheduleCompactionWithOlderOrSameTimestamp.

@Test
public void testScheduleCompactionWithOlderOrSameTimestamp() throws Exception {
    // Case: Failure case. Earliest ingestion inflight instant time must be later than compaction time
    HoodieWriteConfig cfg = getConfig(false);
    SparkRDDWriteClient client = getHoodieWriteClient(cfg);
    HoodieReadClient readClient = getHoodieReadClient(cfg.getBasePath());
    final String firstInstantTime = "001";
    final String secondInstantTime = "004";
    final String compactionInstantTime = "002";
    int numRecs = 2000;
    List<HoodieRecord> records = dataGen.generateInserts(firstInstantTime, numRecs);
    runNextDeltaCommits(client, readClient, Arrays.asList(firstInstantTime, secondInstantTime), records, cfg, true, new ArrayList<>());
    assertThrows(IllegalArgumentException.class, () -> {
        // Schedule compaction but do not run it
        scheduleCompaction(compactionInstantTime, client, cfg);
    }, "Compaction Instant to be scheduled cannot have older timestamp");
    // Schedule with the same timestamp as an already committed instant
    assertThrows(IllegalArgumentException.class, () -> {
        // Schedule compaction but do not run it
        scheduleCompaction(secondInstantTime, client, cfg);
    }, "Compaction Instant to be scheduled cannot have same timestamp as committed instant");
    final String compactionInstantTime2 = "006";
    scheduleCompaction(compactionInstantTime2, client, cfg);
    assertThrows(IllegalArgumentException.class, () -> {
        // Schedule compaction with the same times as a pending compaction
        scheduleCompaction(secondInstantTime, client, cfg);
    }, "Compaction Instant to be scheduled cannot have same timestamp as a pending compaction");
}
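Both failures here enforce the mirror-image rule to Example 14: a compaction instant to be scheduled must sort strictly after every instant already on the timeline. A sketch of that check using APIs from the earlier examples and the test harness's hadoopConf; the candidate instant time is illustrative:

// Sketch: "002" and "004" are rejected above because they do not sort after the
// latest instant already on the timeline ("004"); "006" is accepted.
HoodieTableMetaClient meta = HoodieTableMetaClient.builder()
    .setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
String latestInstant = meta.getActiveTimeline().lastInstant().get().getTimestamp();
String candidate = "006";  // illustrative candidate
assertTrue(
    HoodieTimeline.compareTimestamps(candidate, HoodieTimeline.GREATER_THAN, latestInstant),
    "Compaction instant to be scheduled must be later than every existing instant");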

Aggregations

HoodieReadClient (org.apache.hudi.client.HoodieReadClient): 18 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 18 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 18 usages
Test (org.junit.jupiter.api.Test): 18 usages
SparkRDDWriteClient (org.apache.hudi.client.SparkRDDWriteClient): 17 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 17 usages
ArrayList (java.util.ArrayList): 11 usages
Arrays (java.util.Arrays): 8 usages
List (java.util.List): 8 usages
Collectors (java.util.stream.Collectors): 8 usages
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 8 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 8 usages
Assertions.assertEquals (org.junit.jupiter.api.Assertions.assertEquals): 8 usages
Assertions.assertFalse (org.junit.jupiter.api.Assertions.assertFalse): 8 usages
IntStream (java.util.stream.IntStream): 7 usages
HoodieCompactionConfig (org.apache.hudi.config.HoodieCompactionConfig): 7 usages
HoodieSparkTable (org.apache.hudi.table.HoodieSparkTable): 7 usages
WriteMarkersFactory (org.apache.hudi.table.marker.WriteMarkersFactory): 7 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 6 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 5 usages