
Example 6 with HoodieTableMetadataWriter

Use of org.apache.hudi.metadata.HoodieTableMetadataWriter in project hudi by apache.

The class TestCleaner, method testCleanWithReplaceCommits.
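The test drives a sequence of clustering replacecommits against a metadata-backed test table and checks that the KEEP_LATEST_COMMITS cleaner policy (retaining two commits) removes a base file only once newer replaced versions of it exist.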

@Test
public void testCleanWithReplaceCommits() throws Exception {
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .withAssumeDatePartitioning(true)
            .build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .retainCommits(2)
            .build())
        .build();
    HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
    HoodieTestTable testTable = HoodieMetadataTestTable.of(metaClient, metadataWriter);
    String p0 = "2020/01/01";
    String p1 = "2020/01/02";
    // make 1 commit, with 1 file per partition
    String file1P0C0 = UUID.randomUUID().toString();
    String file1P1C0 = UUID.randomUUID().toString();
    testTable.addInflightCommit("00000000000001").withBaseFilesInPartition(p0, file1P0C0).withBaseFilesInPartition(p1, file1P1C0);
    HoodieCommitMetadata commitMetadata = generateCommitMetadata("00000000000001", Collections.unmodifiableMap(new HashMap<String, List<String>>() {

        {
            put(p0, CollectionUtils.createImmutableList(file1P0C0));
            put(p1, CollectionUtils.createImmutableList(file1P1C0));
        }
    }));
    metadataWriter.update(commitMetadata, "00000000000001", false);
    metaClient.getActiveTimeline().saveAsComplete(new HoodieInstant(State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "00000000000001"), Option.of(commitMetadata.toJsonString().getBytes(StandardCharsets.UTF_8)));
    metaClient = HoodieTableMetaClient.reload(metaClient);
    List<HoodieCleanStat> hoodieCleanStatsOne = runCleaner(config);
    assertEquals(0, hoodieCleanStatsOne.size(), "Must not scan any partitions and clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. logically delete p0. No change to p1
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId002 = testTable.forReplaceCommit("00000000000002").getFileIdsWithBaseFilesInPartitions(p0);
    String file2P0C1 = partitionAndFileId002.get(p0);
    Pair<HoodieRequestedReplaceMetadata, HoodieReplaceCommitMetadata> replaceMetadata = generateReplaceCommitMetadata("00000000000002", p0, file1P0C0, file2P0C1);
    testTable.addReplaceCommit("00000000000002", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    // run cleaner
    List<HoodieCleanStat> hoodieCleanStatsTwo = runCleaner(config);
    assertEquals(0, hoodieCleanStatsTwo.size(), "Must not scan any partitions and clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. Replace data in p1. No change to p0
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId003 = testTable.forReplaceCommit("00000000000003").getFileIdsWithBaseFilesInPartitions(p1);
    String file3P1C2 = partitionAndFileId003.get(p1);
    replaceMetadata = generateReplaceCommitMetadata("00000000000003", p1, file1P1C0, file3P1C2);
    testTable.addReplaceCommit("00000000000003", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    // run cleaner
    List<HoodieCleanStat> hoodieCleanStatsThree = runCleaner(config);
    assertEquals(0, hoodieCleanStatsThree.size(), "Must not scan any partitions and clean any files");
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2));
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. Replace data in p0 again
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId004 = testTable.forReplaceCommit("00000000000004").getFileIdsWithBaseFilesInPartitions(p0);
    String file4P0C3 = partitionAndFileId004.get(p0);
    replaceMetadata = generateReplaceCommitMetadata("00000000000004", p0, file2P0C1, file4P0C3);
    testTable.addReplaceCommit("00000000000004", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    // run cleaner
    List<HoodieCleanStat> hoodieCleanStatsFour = runCleaner(config, 5);
    assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2));
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    // file1P1C0 still stays because it's not replaced until 00000000000003 and it's the only version available
    assertTrue(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
    // make next replacecommit, with 1 clustering operation. Replace data in p1 again: file3P1C2 is replaced by the new file4P1C4
    // notice that clustering generates empty inflight commit files
    Map<String, String> partitionAndFileId005 = testTable.forReplaceCommit("00000000000006").getFileIdsWithBaseFilesInPartitions(p1);
    String file4P1C4 = partitionAndFileId005.get(p1);
    replaceMetadata = generateReplaceCommitMetadata("00000000000006", p0, file3P1C2, file4P1C4);
    testTable.addReplaceCommit("00000000000006", Option.of(replaceMetadata.getKey()), Option.empty(), replaceMetadata.getValue());
    List<HoodieCleanStat> hoodieCleanStatsFive = runCleaner(config, 7);
    assertTrue(testTable.baseFileExists(p0, "00000000000004", file4P0C3));
    assertTrue(testTable.baseFileExists(p0, "00000000000002", file2P0C1));
    assertTrue(testTable.baseFileExists(p1, "00000000000003", file3P1C2));
    assertFalse(testTable.baseFileExists(p0, "00000000000001", file1P0C0));
    assertFalse(testTable.baseFileExists(p1, "00000000000001", file1P1C0));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HashMap(java.util.HashMap) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) HoodieCleanStat(org.apache.hudi.common.HoodieCleanStat) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieRequestedReplaceMetadata(org.apache.hudi.avro.model.HoodieRequestedReplaceMetadata) HoodieReplaceCommitMetadata(org.apache.hudi.common.model.HoodieReplaceCommitMetadata) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) Test(org.junit.jupiter.api.Test)
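
The essential setup from this example can be condensed into the sketch below. It is only a sketch: the helper name metadataBackedTestTable is hypothetical, the parameters stand in for the test-harness fields (hadoopConf, context, metaClient, basePath) used above, and the referenced classes are the ones listed under "Also used".

static HoodieTestTable metadataBackedTestTable(Configuration hadoopConf, HoodieEngineContext context, HoodieTableMetaClient metaClient, String basePath) {
    // Same write config as the test: compact the metadata table after every delta commit,
    // clean with KEEP_LATEST_COMMITS and retain 2 commits.
    HoodieWriteConfig config = HoodieWriteConfig.newBuilder()
        .withPath(basePath)
        .withMetadataConfig(HoodieMetadataConfig.newBuilder()
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .withAssumeDatePartitioning(true)
            .build())
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCleanerPolicy(HoodieCleaningPolicy.KEEP_LATEST_COMMITS)
            .retainCommits(2)
            .build())
        .build();
    // The metadata writer mirrors every file the test table creates into the metadata table,
    // so the cleaner works against a consistent file listing.
    HoodieTableMetadataWriter metadataWriter =
        SparkHoodieBackedTableMetadataWriter.create(hadoopConf, config, context);
    return HoodieMetadataTestTable.of(metaClient, metadataWriter);
}

Each simulated commit is then applied twice, once to the metadata table through metadataWriter.update(...) and once to the data timeline through saveAsComplete(...), exactly as in the test above.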

Example 7 with HoodieTableMetadataWriter

Use of org.apache.hudi.metadata.HoodieTableMetadataWriter in project hudi by apache.

The class TestHoodieMergeOnReadTable, method testLogFileCountsAfterCompaction.
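The test inserts 100 records into a merge-on-read table, updates them so that log files are written, runs a compaction, and then verifies that the latest file slices carry no log files and that every record is still readable, with the commit-time metadata field (HoodieRecord.COMMIT_TIME_METADATA_FIELD) reflecting either the last delta commit or the compaction instant depending on the preserveCommitMeta flag.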

// TODO: Enable metadata virtual keys in this test once the feature HUDI-2593 is completed
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws Exception {
    boolean populateMetaFields = true;
    // insert 100 records
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true, false, HoodieIndex.IndexType.BLOOM, 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig config = cfgBuilder.build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        // Update all the 100 records
        newCommitTime = "101";
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc().parallelize(updatedRecords, 1);
        HoodieReadClient readClient = new HoodieReadClient(context(), config);
        JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = readClient.tagLocation(updatedRecordsRDD);
        writeClient.startCommitWithTime(newCommitTime);
        writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
        // Write them to corresponding avro logfiles
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
        HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, metadataWriter);
        Set<String> allPartitions = updatedRecords.stream().map(record -> record.getPartitionPath()).collect(Collectors.groupingBy(partitionPath -> partitionPath)).keySet();
        assertEquals(allPartitions.size(), testTable.listAllBaseFiles().length);
        // Verify that every data file has exactly one log file
        HoodieTable table = HoodieSparkTable.create(config, context(), metaClient, true);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice fileSlice : groupedLogFiles) {
                assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for the latest data file - " + fileSlice);
            }
        }
        // Do a compaction
        String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
        HoodieWriteMetadata<JavaRDD<WriteStatus>> result = writeClient.compact(compactionInstantTime);
        // Verify that the recently compacted data files have no log files
        metaClient = HoodieTableMetaClient.reload(metaClient);
        table = HoodieSparkTable.create(config, context(), metaClient);
        HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
        assertTrue(HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), HoodieTimeline.GREATER_THAN, newCommitTime), "Compaction commit should be > than last insert");
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice slice : groupedLogFiles) {
                assertEquals(0, slice.getLogFiles().count(), "After compaction there should be no log files visible on a full view");
            }
            assertTrue(result.getCommitMetadata().get().getWritePartitionPaths().stream().anyMatch(part -> part.contentEquals(partitionPath)));
        }
        // Check that the entire dataset still has all the records
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath(), dataGen.getPartitionPaths()[i]);
        }
        Dataset<Row> actual = HoodieClientTestUtils.read(jsc(), basePath(), sqlContext(), fs(), fullPartitionPaths);
        List<Row> rows = actual.collectAsList();
        assertEquals(updatedRecords.size(), rows.size());
        for (Row row : rows) {
            assertEquals(row.getAs(HoodieRecord.COMMIT_TIME_METADATA_FIELD), preserveCommitMeta ? newCommitTime : compactionInstantTime);
        }
    }
}
Also used : HoodieClientTestHarness.buildProfile(org.apache.hudi.testutils.HoodieClientTestHarness.buildProfile) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) StorageLevel(org.apache.spark.storage.StorageLevel) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) BaseSparkDeltaCommitActionExecutor(org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) Set(java.util.Set) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SparkDeleteDeltaCommitActionExecutor(org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) Dataset(org.apache.spark.sql.Dataset) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) JobConf(org.apache.hadoop.mapred.JobConf) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Transformations(org.apache.hudi.common.testutils.Transformations) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) 
HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig)
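
The compaction check that this example revolves around can be reduced to the following sketch, assuming writeClient, config, context(), metaClient and dataGen are the same harness members used above.

// Schedule and run a compaction, then verify that the latest file slice of every
// file group no longer carries log files (condensed from the test above).
String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
writeClient.compact(compactionInstantTime);

// Reload so the file-system view reflects the compacted slices.
metaClient = HoodieTableMetaClient.reload(metaClient);
HoodieTable table = HoodieSparkTable.create(config, context(), metaClient);
for (String partitionPath : dataGen.getPartitionPaths()) {
    for (FileSlice slice : table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList())) {
        assertEquals(0, slice.getLogFiles().count(), "No log files expected after compaction: " + slice);
    }
}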

Aggregations

HoodieTableMetadataWriter (org.apache.hudi.metadata.HoodieTableMetadataWriter): 7
HashMap (java.util.HashMap): 6
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 6
Test (org.junit.jupiter.api.Test): 6
List (java.util.List): 5
HoodieTestTable (org.apache.hudi.common.testutils.HoodieTestTable): 5
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 5
HoodieCleanStat (org.apache.hudi.common.HoodieCleanStat): 3
ArrayList (java.util.ArrayList): 2
Path (org.apache.hadoop.fs.Path): 2
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 2
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 2
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 2
IOException (java.io.IOException): 1
Arrays (java.util.Arrays): 1
Map (java.util.Map): 1
Properties (java.util.Properties): 1
Set (java.util.Set): 1
Collectors (java.util.stream.Collectors): 1
Stream (java.util.stream.Stream): 1