
Example 66 with HoodieActiveTimeline

use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.

the class TestHoodieMergeOnReadTable method testLogFileCountsAfterCompaction.

// TODO: Enable metadata virtual keys in this test once the feature HUDI-2593 is completed
@ParameterizedTest
@ValueSource(booleans = { false, true })
public void testLogFileCountsAfterCompaction(boolean preserveCommitMeta) throws Exception {
    boolean populateMetaFields = true;
    // insert 100 records
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true, false, HoodieIndex.IndexType.BLOOM, 1024 * 1024 * 1024L, HoodieClusteringConfig.newBuilder().build(), preserveCommitMeta);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig config = cfgBuilder.build();
    try (SparkRDDWriteClient writeClient = getHoodieWriteClient(config)) {
        String newCommitTime = "100";
        writeClient.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 100);
        JavaRDD<HoodieRecord> recordsRDD = jsc().parallelize(records, 1);
        writeClient.insert(recordsRDD, newCommitTime).collect();
        // Update all the 100 records
        newCommitTime = "101";
        List<HoodieRecord> updatedRecords = dataGen.generateUpdates(newCommitTime, records);
        JavaRDD<HoodieRecord> updatedRecordsRDD = jsc().parallelize(updatedRecords, 1);
        HoodieReadClient readClient = new HoodieReadClient(context(), config);
        JavaRDD<HoodieRecord> updatedTaggedRecordsRDD = readClient.tagLocation(updatedRecordsRDD);
        writeClient.startCommitWithTime(newCommitTime);
        writeClient.upsertPreppedRecords(updatedTaggedRecordsRDD, newCommitTime).collect();
        // Write them to the corresponding Avro log files
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTableMetadataWriter metadataWriter = SparkHoodieBackedTableMetadataWriter.create(writeClient.getEngineContext().getHadoopConf().get(), config, writeClient.getEngineContext());
        HoodieSparkWriteableTestTable testTable = HoodieSparkWriteableTestTable.of(metaClient, HoodieTestDataGenerator.AVRO_SCHEMA_WITH_METADATA_FIELDS, metadataWriter);
        Set<String> allPartitions = updatedRecords.stream().map(record -> record.getPartitionPath()).collect(Collectors.groupingBy(partitionPath -> partitionPath)).keySet();
        assertEquals(allPartitions.size(), testTable.listAllBaseFiles().length);
        // Verify that each data file has exactly one log file
        HoodieTable table = HoodieSparkTable.create(config, context(), metaClient, true);
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice fileSlice : groupedLogFiles) {
                assertEquals(1, fileSlice.getLogFiles().count(), "There should be 1 log file written for the latest data file - " + fileSlice);
            }
        }
        // Do a compaction
        String compactionInstantTime = writeClient.scheduleCompaction(Option.empty()).get().toString();
        HoodieWriteMetadata<JavaRDD<WriteStatus>> result = writeClient.compact(compactionInstantTime);
        // Verify that the recently written compacted data files have no log files
        metaClient = HoodieTableMetaClient.reload(metaClient);
        table = HoodieSparkTable.create(config, context(), metaClient);
        HoodieActiveTimeline timeline = metaClient.getActiveTimeline();
        assertTrue(HoodieTimeline.compareTimestamps(timeline.lastInstant().get().getTimestamp(), HoodieTimeline.GREATER_THAN, newCommitTime), "Compaction commit should be > the last delta commit");
        for (String partitionPath : dataGen.getPartitionPaths()) {
            List<FileSlice> groupedLogFiles = table.getSliceView().getLatestFileSlices(partitionPath).collect(Collectors.toList());
            for (FileSlice slice : groupedLogFiles) {
                assertEquals(0, slice.getLogFiles().count(), "After compaction there should be no log files visible on a full view");
            }
            assertTrue(result.getCommitMetadata().get().getWritePartitionPaths().stream().anyMatch(part -> part.contentEquals(partitionPath)));
        }
        // Check the entire dataset has all records still
        String[] fullPartitionPaths = new String[dataGen.getPartitionPaths().length];
        for (int i = 0; i < fullPartitionPaths.length; i++) {
            fullPartitionPaths[i] = String.format("%s/%s/*", basePath(), dataGen.getPartitionPaths()[i]);
        }
        Dataset<Row> actual = HoodieClientTestUtils.read(jsc(), basePath(), sqlContext(), fs(), fullPartitionPaths);
        List<Row> rows = actual.collectAsList();
        assertEquals(updatedRecords.size(), rows.size());
        for (Row row : rows) {
            assertEquals(row.getAs(HoodieRecord.COMMIT_TIME_METADATA_FIELD), preserveCommitMeta ? newCommitTime : compactionInstantTime);
        }
    }
}
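The HoodieActiveTimeline usage here is the post-compaction check: the meta client is reloaded, the active timeline is fetched, and the timestamp of its last instant is compared against the last delta commit. A minimal sketch of that pattern follows, using only types already shown above; metaClient is assumed to be an already-initialized HoodieTableMetaClient and "101" stands in for the last delta commit time (both are assumptions for illustration, not part of the test).

// Sketch only: metaClient is an assumed, already-initialized HoodieTableMetaClient.
HoodieTableMetaClient refreshedMetaClient = HoodieTableMetaClient.reload(metaClient);
HoodieActiveTimeline activeTimeline = refreshedMetaClient.getActiveTimeline();
// lastInstant() returns an Option, so guard before dereferencing.
if (activeTimeline.lastInstant().isPresent()) {
    String lastInstantTime = activeTimeline.lastInstant().get().getTimestamp();
    // GREATER_THAN mirrors the assertion in the test: the compaction instant
    // must sort after the last delta commit time ("101" here).
    boolean compactionIsNewer =
            HoodieTimeline.compareTimestamps(lastInstantTime, HoodieTimeline.GREATER_THAN, "101");
}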
Also used : HoodieClientTestHarness.buildProfile(org.apache.hudi.testutils.HoodieClientTestHarness.buildProfile) BeforeEach(org.junit.jupiter.api.BeforeEach) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) StorageLevel(org.apache.spark.storage.StorageLevel) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) BaseSparkDeltaCommitActionExecutor(org.apache.hudi.table.action.deltacommit.BaseSparkDeltaCommitActionExecutor) Map(java.util.Map) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) IndexType(org.apache.hudi.index.HoodieIndex.IndexType) Set(java.util.Set) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) SparkDeleteDeltaCommitActionExecutor(org.apache.hudi.table.action.deltacommit.SparkDeleteDeltaCommitActionExecutor) HoodieClientTestUtils(org.apache.hudi.testutils.HoodieClientTestUtils) MetadataMergeWriteStatus(org.apache.hudi.testutils.MetadataMergeWriteStatus) Dataset(org.apache.spark.sql.Dataset) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) Row(org.apache.spark.sql.Row) JobConf(org.apache.hadoop.mapred.JobConf) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) Transformations(org.apache.hudi.common.testutils.Transformations) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) 
HoodieClusteringConfig(org.apache.hudi.config.HoodieClusteringConfig) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieReadClient(org.apache.hudi.client.HoodieReadClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieJavaRDD(org.apache.hudi.data.HoodieJavaRDD) JavaRDD(org.apache.spark.api.java.JavaRDD) HoodieSparkWriteableTestTable(org.apache.hudi.testutils.HoodieSparkWriteableTestTable) Row(org.apache.spark.sql.Row) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 67 with HoodieActiveTimeline

use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.

the class TestHoodieMergeHandle method testUpsertsForMultipleRecordsInSameFile.

@ParameterizedTest
@MethodSource("testArguments")
public void testUpsertsForMultipleRecordsInSameFile(ExternalSpillableMap.DiskMapType diskMapType, boolean isCompressionEnabled) throws Exception {
    // Create records in a single partition
    String partitionPath = HoodieTestDataGenerator.DEFAULT_PARTITION_PATHS[0];
    dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
    // Build a common config with the differing spillable map configs
    Properties properties = new Properties();
    properties.setProperty(HoodieCommonConfig.SPILLABLE_DISK_MAP_TYPE.key(), diskMapType.name());
    properties.setProperty(HoodieCommonConfig.DISK_MAP_BITCASK_COMPRESSION_ENABLED.key(), String.valueOf(isCompressionEnabled));
    // Build a write config with bulk insert parallelism set
    HoodieWriteConfig cfg = getConfigBuilder().withProperties(properties).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        FileSystem fs = FSUtils.getFs(basePath, hadoopConf);
        /**
         * Write 1 (only inserts) This will do a bulk insert of 44 records of which there are 2 records repeated 21 times
         * each. id1 (21 records), id2 (21 records), id3, id4
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 4);
        HoodieRecord record1 = records.get(0);
        HoodieRecord record2 = records.get(1);
        for (int i = 0; i < 20; i++) {
            HoodieRecord dup = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
            records.add(dup);
        }
        for (int i = 0; i < 20; i++) {
            HoodieRecord dup = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
            records.add(dup);
        }
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        List<WriteStatus> statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        // verify that there is a commit
        metaClient = HoodieTableMetaClient.reload(metaClient);
        HoodieTimeline timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(1, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting a single commit.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 001");
        assertEquals(records.size(), HoodieClientTestUtils.readCommit(basePath, sqlContext, timeline, newCommitTime).count(), "Must contain 44 records");
        /**
         * Write 2 (insert) This will do a bulk insert of 1 record with the same row_key as record1 in the previous insert
         * - id1. At this point, we will have 2 files with the row_keys as shown here - File 1 - id1 (21 records), id2 (21
         * records), id3, id4 File 2 - id1
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        // Do 1 more bulk insert with the same dup record1
        List<HoodieRecord> newRecords = new ArrayList<>();
        HoodieRecord sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
        newRecords.add(sameAsRecord1);
        writeRecords = jsc.parallelize(newRecords, 1);
        statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        // verify that there are 2 commits
        metaClient = HoodieTableMetaClient.reload(metaClient);
        timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(2, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting two commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 002");
        Dataset<Row> dataSet = getRecords();
        assertEquals(45, dataSet.count(), "Must contain 45 records");
        /**
         * Write 3 (insert) This will bulk insert 2 new completely new records. At this point, we will have 2 files with
         * the row_keys as shown here - File 1 - id1 (21 records), id2 (21 records), id3, id4 File 2 - id1 File 3 - id5,
         * id6
         */
        newCommitTime = "003";
        client.startCommitWithTime(newCommitTime);
        newRecords = dataGen.generateInserts(newCommitTime, 2);
        writeRecords = jsc.parallelize(newRecords, 1);
        statuses = client.bulkInsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        // verify that there are now 3 commits
        metaClient = HoodieTableMetaClient.reload(metaClient);
        timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(3, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting three commits.");
        assertEquals(newCommitTime, timeline.lastInstant().get().getTimestamp(), "Latest commit should be 003");
        dataSet = getRecords();
        assertEquals(47, dataSet.count(), "Must contain 47 records");
        /**
         * Write 4 (updates) This will generate 2 upsert records with id1 and id2. The rider and driver names in the
         * update records will be rider-004 and driver-004. After the upsert is complete, all the records with id1 in File
         * 1 and File 2 must be updated, all the records with id2 in File 2 must also be updated. Also, none of the other
         * records in File 1, File 2 and File 3 must be updated.
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> updateRecords = new ArrayList<>();
        // This exists in 001 and 002 and should be updated in both
        sameAsRecord1 = dataGen.generateUpdateRecord(record1.getKey(), newCommitTime);
        updateRecords.add(sameAsRecord1);
        // This exists in 001 and should be updated
        HoodieRecord sameAsRecord2 = dataGen.generateUpdateRecord(record2.getKey(), newCommitTime);
        updateRecords.add(sameAsRecord2);
        JavaRDD<HoodieRecord> updateRecordsRDD = jsc.parallelize(updateRecords, 1);
        statuses = client.upsert(updateRecordsRDD, newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        // verify there are now 4 commits
        timeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
        assertEquals(4, timeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants(), "Expecting four commits.");
        assertEquals(timeline.lastInstant().get().getTimestamp(), newCommitTime, "Latest commit should be 004");
        // Check the entire dataset has 47 records still
        dataSet = getRecords();
        assertEquals(47, dataSet.count(), "Must contain 47 records");
        Row[] rows = (Row[]) dataSet.collect();
        int record1Count = 0;
        int record2Count = 0;
        for (Row row : rows) {
            if (row.getAs("_hoodie_record_key").equals(record1.getKey().getRecordKey())) {
                record1Count++;
                // assert each duplicate record is updated
                assertEquals(row.getAs("rider"), "rider-004");
                assertEquals(row.getAs("driver"), "driver-004");
            } else if (row.getAs("_hoodie_record_key").equals(record2.getKey().getRecordKey())) {
                record2Count++;
                // assert each duplicate record is updated
                assertEquals(row.getAs("rider"), "rider-004");
                assertEquals(row.getAs("driver"), "driver-004");
            } else {
                assertNotEquals(row.getAs("rider"), "rider-004");
                assertNotEquals(row.getAs("driver"), "rider-004");
            }
        }
        // Assert that the count of id1 records, all now updated to rider-004 and driver-004, is 22: the total
        // number of records with row_key id1
        assertEquals(22, record1Count);
        // Assert that the count of id2 records, all now updated to rider-004 and driver-004, is 21: the total
        // number of records with row_key id2
        assertEquals(21, record2Count);
    }
}
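In this example HoodieActiveTimeline is rebuilt after every write and narrowed to the commit timeline to count and inspect instants. A minimal sketch of that pattern, assuming an initialized HoodieTableMetaClient named metaClient (illustrative names, not taken from the test):

// Sketch only: metaClient is an assumed, already-initialized HoodieTableMetaClient.
HoodieTimeline commitTimeline = new HoodieActiveTimeline(metaClient).getCommitTimeline();
// Count instants after instant time "000", as the assertions above do.
int commitCount = commitTimeline.findInstantsAfter("000", Integer.MAX_VALUE).countInstants();
// Read the timestamp of the most recent instant, if one exists.
Option<HoodieInstant> lastInstant = commitTimeline.lastInstant();
String lastCommitTime = lastInstant.isPresent() ? lastInstant.get().getTimestamp() : null;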
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) FileSystem(org.apache.hadoop.fs.FileSystem) Row(org.apache.spark.sql.Row) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) WriteStatus(org.apache.hudi.client.WriteStatus) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)

Example 68 with HoodieActiveTimeline

use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.

the class TestHoodieTimelineArchiver method testMergeSmallArchiveFilesRecoverFromDeleteFailed.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMergeSmallArchiveFilesRecoverFromDeleteFailed(boolean enableArchiveMerge) throws Exception {
    HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);
    // do ingestion and trigger archive actions here.
    for (int i = 1; i < 8; i++) {
        testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
        archiveAndGetCommitsList(writeConfig);
    }
    // do a single merge of the small archive files
    HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
    FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
    List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
    archiver.reOpenWriter();
    archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1");
    archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList()));
    archiver.reOpenWriter();
    // delete only one of the small archive files to simulate a failed delete action.
    metaClient.getFs().delete(fsStatuses[0].getPath());
    // loading the archived timeline and the active timeline succeeds
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload();
    assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants());
    // do more archive actions that merge small archive files.
    for (int i = 1; i < 10; i++) {
        testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
        archiveAndGetCommitsList(writeConfig);
    }
    // check result.
    HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
    HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload();
    assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());
}
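The recovery check in this test reduces to counting instants across the raw active timeline and the reloaded archived timeline and confirming nothing was lost. A minimal sketch, assuming an initialized HoodieTableMetaClient named metaClient; the boolean constructor argument is passed as false exactly as in the test, without interpreting its semantics here.

// Sketch only: metaClient is an assumed, already-initialized HoodieTableMetaClient.
HoodieActiveTimeline rawActive = new HoodieActiveTimeline(metaClient, false);
HoodieArchivedTimeline archived = metaClient.getArchivedTimeline().reload();
// Active plus archived instants should account for every instant written so far;
// the test above expects 7 * 3 at this point in the run.
int totalInstants = rawActive.countInstants() + archived.countInstants();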
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieException(org.apache.hudi.exception.HoodieException) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieTestUtils.createCompactionCommitInMetadataTable(org.apache.hudi.common.testutils.HoodieTestUtils.createCompactionCommitInMetadataTable) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) IntStream(java.util.stream.IntStream) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) CsvSource(org.junit.jupiter.params.provider.CsvSource) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) MetadataConversionUtils(org.apache.hudi.client.utils.MetadataConversionUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Comparator(java.util.Comparator) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) 
HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 69 with HoodieActiveTimeline

use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.

the class TestHoodieTimelineArchiver method testLoadArchiveTimelineWithUncompletedMergeArchiveFile.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testLoadArchiveTimelineWithUncompletedMergeArchiveFile(boolean enableArchiveMerge) throws Exception {
    HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);
    for (int i = 1; i < 8; i++) {
        testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
        archiveAndGetCommitsList(writeConfig);
    }
    HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
    FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
    List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
    archiver.reOpenWriter();
    archiver.buildArchiveMergePlan(candidateFiles, new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME), ".commits_.archive.3_1-0-1");
    archiver.mergeArchiveFiles(Arrays.stream(fsStatuses).collect(Collectors.toList()));
    HoodieLogFormat.Writer writer = archiver.reOpenWriter();
    String s = "Dummy Content";
    // corrupt the current merged archive file with dummy content.
    FileIOUtils.createFileInPath(metaClient.getFs(), writer.getLogFile().getPath(), Option.of(s.getBytes()));
    // if there's only a damaged merged archive file, we need to ignore the exception while reading this damaged file.
    HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
    HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline();
    assertEquals(7 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());
    // if there is a damaged merged archive file together with another ordinary damaged archive file,
    // Hudi needs to throw an IOException while loading the archived timeline, because parsing the damaged archive file fails.
    Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1");
    FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes()));
    assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload());
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieException(org.apache.hudi.exception.HoodieException) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieTestUtils.createCompactionCommitInMetadataTable(org.apache.hudi.common.testutils.HoodieTestUtils.createCompactionCommitInMetadataTable) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) IntStream(java.util.stream.IntStream) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) CsvSource(org.junit.jupiter.params.provider.CsvSource) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) MetadataConversionUtils(org.apache.hudi.client.utils.MetadataConversionUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Comparator(java.util.Comparator) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) FileStatus(org.apache.hadoop.fs.FileStatus) 
HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieTable(org.apache.hudi.table.HoodieTable) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Example 70 with HoodieActiveTimeline

use of org.apache.hudi.common.table.timeline.HoodieActiveTimeline in project hudi by apache.

the class TestHoodieTimelineArchiver method testMergeSmallArchiveFilesRecoverFromBuildPlanFailed.

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testMergeSmallArchiveFilesRecoverFromBuildPlanFailed(boolean enableArchiveMerge) throws Exception {
    HoodieWriteConfig writeConfig = initTestTableAndGetWriteConfig(true, 2, 3, 2, enableArchiveMerge, 3, 209715200);
    // do ingestion and trigger archive actions here.
    for (int i = 1; i < 8; i++) {
        testTable.doWriteOperation("0000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
        archiveAndGetCommitsList(writeConfig);
    }
    // build a merge-small-archives plan with dummy content;
    // this plan cannot be deserialized.
    HoodieTable table = HoodieSparkTable.create(writeConfig, context, metaClient);
    HoodieTimelineArchiver archiver = new HoodieTimelineArchiver(writeConfig, table);
    FileStatus[] fsStatuses = metaClient.getFs().globStatus(new Path(metaClient.getArchivePath() + "/.commits_.archive*"));
    List<String> candidateFiles = Arrays.stream(fsStatuses).map(fs -> fs.getPath().toString()).collect(Collectors.toList());
    archiver.reOpenWriter();
    Path plan = new Path(metaClient.getArchivePath(), HoodieArchivedTimeline.MERGE_ARCHIVE_PLAN_NAME);
    archiver.buildArchiveMergePlan(candidateFiles, plan, ".commits_.archive.3_1-0-1");
    String s = "Dummy Content";
    // corrupt the current merge plan file with dummy content.
    FileIOUtils.createFileInPath(metaClient.getFs(), plan, Option.of(s.getBytes()));
    // check that a damaged plan file will not block archived timeline loading.
    HoodieActiveTimeline rawActiveTimeline = new HoodieActiveTimeline(metaClient, false);
    HoodieArchivedTimeline archivedTimeLine = metaClient.getArchivedTimeline().reload();
    assertEquals(7 * 3, rawActiveTimeline.countInstants() + archivedTimeLine.countInstants());
    // trigger several more archive actions after the damaged merge plan has been left behind.
    for (int i = 1; i < 10; i++) {
        testTable.doWriteOperation("1000000" + i, WriteOperationType.UPSERT, i == 1 ? Arrays.asList("p1", "p2") : Collections.emptyList(), Arrays.asList("p1", "p2"), 2);
        archiveAndGetCommitsList(writeConfig);
    }
    // loading the archived timeline and the active timeline succeeds
    HoodieActiveTimeline rawActiveTimeline1 = new HoodieActiveTimeline(metaClient, false);
    HoodieArchivedTimeline archivedTimeLine1 = metaClient.getArchivedTimeline().reload();
    // check instant number
    assertEquals(16 * 3, archivedTimeLine1.countInstants() + rawActiveTimeline1.countInstants());
    // if there are damaged archive files in addition to a damaged plan, Hudi needs to throw an IOException while loading the archived timeline.
    Path damagedFile = new Path(metaClient.getArchivePath(), ".commits_.archive.300_1-0-1");
    FileIOUtils.createFileInPath(metaClient.getFs(), damagedFile, Option.of(s.getBytes()));
    assertThrows(HoodieException.class, () -> metaClient.getArchivedTimeline().reload());
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWrapperFileSystem(org.apache.hudi.common.fs.HoodieWrapperFileSystem) Arrays(java.util.Arrays) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) HoodieException(org.apache.hudi.exception.HoodieException) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieMetadataTestTable(org.apache.hudi.common.testutils.HoodieMetadataTestTable) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) HoodieRollbackMetadata(org.apache.hudi.avro.model.HoodieRollbackMetadata) SparkHoodieBackedTableMetadataWriter(org.apache.hudi.metadata.SparkHoodieBackedTableMetadataWriter) Path(org.apache.hadoop.fs.Path) HoodieLogFormat(org.apache.hudi.common.table.log.HoodieLogFormat) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) Collectors(java.util.stream.Collectors) Test(org.junit.jupiter.api.Test) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieTestUtils.createCompactionCommitInMetadataTable(org.apache.hudi.common.testutils.HoodieTestUtils.createCompactionCommitInMetadataTable) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) WriteOperationType(org.apache.hudi.common.model.WriteOperationType) IntStream(java.util.stream.IntStream) Assertions.assertThrows(org.junit.jupiter.api.Assertions.assertThrows) CsvSource(org.junit.jupiter.params.provider.CsvSource) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) State(org.apache.hudi.common.table.timeline.HoodieInstant.State) HoodieClientTestHarness(org.apache.hudi.testutils.HoodieClientTestHarness) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) MetadataConversionUtils(org.apache.hudi.client.utils.MetadataConversionUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) AfterEach(org.junit.jupiter.api.AfterEach) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) HoodieTableMetadataWriter(org.apache.hudi.metadata.HoodieTableMetadataWriter) LogManager(org.apache.log4j.LogManager) HoodieTestUtils(org.apache.hudi.common.testutils.HoodieTestUtils) Comparator(java.util.Comparator) Collections(java.util.Collections) Pair(org.apache.hudi.common.util.collection.Pair) 
HoodieTimelineArchiver(org.apache.hudi.client.HoodieTimelineArchiver) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)

Aggregations

HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline): 95
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 70
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 47
Test (org.junit.jupiter.api.Test): 45
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata): 37
ArrayList (java.util.ArrayList): 36
IOException (java.io.IOException): 32
List (java.util.List): 30
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 30
HashMap (java.util.HashMap): 28
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 26
Map (java.util.Map): 25
Option (org.apache.hudi.common.util.Option): 22
Pair (org.apache.hudi.common.util.collection.Pair): 22
Collectors (java.util.stream.Collectors): 21
Path (org.apache.hadoop.fs.Path): 21
Logger (org.apache.log4j.Logger): 21
LogManager (org.apache.log4j.LogManager): 20
Stream (java.util.stream.Stream): 19
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 19