
Example 6 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieSparkMergeOnReadTableClustering, method testClusteringWithNoBaseFiles:

@ParameterizedTest
@ValueSource(booleans = { true, false })
void testClusteringWithNoBaseFiles(boolean doUpdates) throws Exception {
    // set a low compaction small-file size to generate more file groups.
    HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
        .forTable("test-trip-table")
        .withPath(basePath())
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .withAutoCommit(true)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .compactionSmallFileSize(10L)
            .withInlineCompaction(false)
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .build())
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .hfileMaxFileSize(1024 * 1024 * 1024)
            .parquetMaxFileSize(1024 * 1024 * 1024)
            .build())
        .withEmbeddedTimelineServerEnabled(true)
        .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
            .withEnableBackupForRemoteFileSystemView(false)
            .build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.INMEMORY)
            .build())
        .withClusteringConfig(HoodieClusteringConfig.newBuilder()
            .withClusteringMaxNumGroups(10)
            .withClusteringTargetPartitions(0)
            .withInlineClustering(true)
            .withInlineClusteringNumCommits(1)
            .build())
        .withRollbackUsingMarkers(false);
    HoodieWriteConfig cfg = cfgBuilder.build();
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        // test 2 inserts
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
        assertTrue(!dataFiles.findAny().isPresent(), "should not have any base files");
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
        assertTrue(!dataFiles.findAny().isPresent(), "should not have any base files");
        // run updates
        if (doUpdates) {
            newCommitTime = "003";
            client.startCommitWithTime(newCommitTime);
            records = dataGen.generateUpdates(newCommitTime, 100);
            updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        }
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allBaseFiles = listAllBaseFilesInPath(hoodieTable);
        // expect 0 base files for each partition
        assertEquals(0, allBaseFiles.length);
        String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        // verify log files are included in clustering plan for each partition.
        assertEquals(dataGen.getPartitionPaths().length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());
        // do the clustering and validate
        doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Pair(org.apache.hudi.common.util.collection.Pair) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
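The builder chain above is dominated by unrelated settings. As a minimal sketch, assuming the same HoodieWriteConfig and HoodieClusteringConfig classes the test already imports, the clustering-specific knobs can be pulled into a small helper; the helper name and the comments are illustrative, not part of the test.

private static HoodieWriteConfig.Builder withInlineClustering(HoodieWriteConfig.Builder cfgBuilder) {
    // same clustering values as the test above
    return cfgBuilder.withClusteringConfig(HoodieClusteringConfig.newBuilder()
        // limit a clustering plan to at most 10 file groups
        .withClusteringMaxNumGroups(10)
        // target partitions for the plan strategy (0, as in the test)
        .withClusteringTargetPartitions(0)
        // schedule and execute clustering inline with the writer
        .withInlineClustering(true)
        // trigger clustering after every commit
        .withInlineClusteringNumCommits(1)
        .build());
}

With inline clustering set up this way, the test only has to call client.scheduleClustering(Option.empty()) and then run the plan to exercise log-file-only file groups.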

Example 7 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieSparkMergeOnReadTableClustering, method testClustering:

@ParameterizedTest
@MethodSource
void testClustering(boolean doUpdates, boolean populateMetaFields, boolean preserveCommitMetadata) throws Exception {
    // set a low compaction small-file size to generate more file groups.
    HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
        .forTable("test-trip-table")
        .withPath(basePath())
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .withAutoCommit(true)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .compactionSmallFileSize(10L)
            .withInlineCompaction(false)
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .build())
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .hfileMaxFileSize(1024 * 1024 * 1024)
            .parquetMaxFileSize(1024 * 1024 * 1024)
            .build())
        .withEmbeddedTimelineServerEnabled(true)
        .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
            .withEnableBackupForRemoteFileSystemView(false)
            .build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.BLOOM)
            .build())
        .withClusteringConfig(HoodieClusteringConfig.newBuilder()
            .withClusteringMaxNumGroups(10)
            .withClusteringTargetPartitions(0)
            .withInlineClustering(true)
            .withInlineClusteringNumCommits(1)
            .withPreserveHoodieCommitMetadata(preserveCommitMetadata)
            .build())
        .withRollbackUsingMarkers(false);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (more inserts to create new files)
         */
        // the small-file size is already set to a tiny value, forcing these inserts into new files.
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        if (doUpdates) {
            /*
             * Write 3 (updates)
             */
            newCommitTime = "003";
            client.startCommitWithTime(newCommitTime);
            records = dataGen.generateUpdates(newCommitTime, 100);
            updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        }
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        // expect 2 base files for each partition
        assertEquals(dataGen.getPartitionPaths().length * 2, allFiles.length);
        String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        // verify all files are included in clustering plan.
        assertEquals(allFiles.length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());
        // Do the clustering and validate
        doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
    }
}
Also used : SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Pair(org.apache.hudi.common.util.collection.Pair) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) MethodSource(org.junit.jupiter.params.provider.MethodSource)
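The assertion on getFileGroupsInPendingClustering() above only checks a total count across all partitions. As a minimal sketch, assuming the same file-system-view API plus the HoodieFileGroupId and Pair classes seen elsewhere on this page (and java.util.Map / java.util.stream.Collectors), the pending file groups could also be tallied per partition; the helper name is illustrative.

private static Map<String, Long> pendingClusteringGroupsByPartition(HoodieTable hoodieTable) {
    return hoodieTable.getFileSystemView().getFileGroupsInPendingClustering()
        // the left element of each pair is the file group id scheduled for clustering
        .map(Pair::getLeft)
        // count the scheduled file groups per partition path
        .collect(Collectors.groupingBy(HoodieFileGroupId::getPartitionPath, Collectors.counting()));
}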

Example 8 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieSparkMergeOnReadTableIncrementalRead, method testIncrementalReadsWithCompaction:

// test incremental read does not go past compaction instant for RO views
// For RT views, incremental read can go past compaction
@Test
public void testIncrementalReadsWithCompaction() throws Exception {
    // use only one partition for this test
    final String partitionPath = "2020/02/20";
    final HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator(new String[] { partitionPath });
    Properties props = new Properties();
    props.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieFileFormat.PARQUET.toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, props);
    HoodieWriteConfig cfg = getConfigBuilder(true).build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        /*
         * Write 1 (only inserts)
         */
        String commitTime1 = "001";
        client.startCommitWithTime(commitTime1);
        List<HoodieRecord> records001 = dataGen.generateInserts(commitTime1, 200);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records001, client, cfg, commitTime1);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        // verify only one base file shows up with commit time 001
        FileStatus[] snapshotROFiles = getROSnapshotFiles(partitionPath);
        validateFiles(partitionPath, 1, snapshotROFiles, false, roSnapshotJobConf, 200, commitTime1);
        FileStatus[] incrementalROFiles = getROIncrementalFiles(partitionPath, true);
        validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);
        Path firstFilePath = incrementalROFiles[0].getPath();
        FileStatus[] incrementalRTFiles = getRTIncrementalFiles(partitionPath);
        validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1);
        assertEquals(firstFilePath, incrementalRTFiles[0].getPath());
        /*
         * Write 2 (updates)
         */
        String updateTime = "004";
        client.startCommitWithTime(updateTime);
        List<HoodieRecord> records004 = dataGen.generateUpdates(updateTime, 100);
        updateRecordsInMORTable(metaClient, records004, client, cfg, updateTime, false);
        // verify RO incremental reads - only one base file shows up because updates go into log files
        incrementalROFiles = getROIncrementalFiles(partitionPath, false);
        validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);
        assertEquals(firstFilePath, incrementalROFiles[0].getPath());
        // verify RT incremental reads includes updates also
        incrementalRTFiles = getRTIncrementalFiles(partitionPath);
        validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime);
        // request compaction, but do not perform compaction
        String compactionCommitTime = "005";
        client.scheduleCompactionAtInstant(compactionCommitTime, Option.empty());
        // verify RO incremental reads - only one base file shows up because updates go into log files
        incrementalROFiles = getROIncrementalFiles(partitionPath, true);
        validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);
        // verify RT incremental reads includes updates also
        incrementalRTFiles = getRTIncrementalFiles(partitionPath);
        validateFiles(partitionPath, 1, incrementalRTFiles, true, rtJobConf, 200, commitTime1, updateTime);
        // write 3 - more inserts
        String insertsTime = "006";
        List<HoodieRecord> records006 = dataGen.generateInserts(insertsTime, 200);
        client.startCommitWithTime(insertsTime);
        dataFiles = insertRecordsToMORTable(metaClient, records006, client, cfg, insertsTime);
        assertTrue(dataFiles.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        // verify new write shows up in snapshot mode even though there is pending compaction
        snapshotROFiles = getROSnapshotFiles(partitionPath);
        validateFiles(partitionPath, 2, snapshotROFiles, false, roSnapshotJobConf, 400, commitTime1, insertsTime);
        incrementalROFiles = getROIncrementalFiles(partitionPath, true);
        assertEquals(firstFilePath, incrementalROFiles[0].getPath());
        // verify 006 does not show up in RO mode because of pending compaction
        validateFiles(partitionPath, 1, incrementalROFiles, false, roJobConf, 200, commitTime1);
        // verify that if stopAtCompaction is disabled, inserts from "insertsTime" show up
        incrementalROFiles = getROIncrementalFiles(partitionPath, false);
        validateFiles(partitionPath, 2, incrementalROFiles, false, roJobConf, 400, commitTime1, insertsTime);
        // verify 006 shows up in RT views
        incrementalRTFiles = getRTIncrementalFiles(partitionPath);
        validateFiles(partitionPath, 2, incrementalRTFiles, true, rtJobConf, 400, commitTime1, updateTime, insertsTime);
        // perform the scheduled compaction
        client.compact(compactionCommitTime);
        // verify new write shows up in snapshot mode after compaction is complete
        snapshotROFiles = getROSnapshotFiles(partitionPath);
        validateFiles(partitionPath, 2, snapshotROFiles, false, roSnapshotJobConf, 400, commitTime1, compactionCommitTime, insertsTime);
        incrementalROFiles = getROIncrementalFiles(partitionPath, "002", -1, true);
        assertEquals(2, incrementalROFiles.length);
        // verify both the compaction commit and the 006 inserts show up now that compaction is complete
        validateFiles(partitionPath, 2, incrementalROFiles, false, roJobConf, 400, commitTime1, compactionCommitTime, insertsTime);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Test(org.junit.jupiter.api.Test)
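The test above deliberately separates scheduling a compaction (instant 005) from executing it, which is what lets it observe read-optimized incremental reads stopping at the pending instant. A minimal sketch of that two-step flow, assuming the same SparkRDDWriteClient and Option imports (the helper name is illustrative):

private static void scheduleThenRunCompaction(SparkRDDWriteClient client, String compactionInstant) {
    // step 1: only write a compaction plan to the timeline; no base files are rewritten yet
    client.scheduleCompactionAtInstant(compactionInstant, Option.empty());
    // ...read-optimized incremental reads that stop at compaction will not go past this instant...
    // step 2: execute the plan, merging log files into new base files under the same instant
    client.compact(compactionInstant);
}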

Example 9 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieSparkMergeOnReadTableInsertUpdateDelete, method testSimpleInsertUpdateAndDelete:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testSimpleInsertUpdateAndDelete(boolean populateMetaFields) throws Exception {
    Properties properties = populateMetaFields ? new Properties() : getPropertiesForKeyGen();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(true);
    addConfigsForPopulateMetaFields(cfgBuilder, populateMetaFields);
    HoodieWriteConfig cfg = cfgBuilder.build();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        /*
         * Write 1 (only inserts, written as base file)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 20);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        List<WriteStatus> statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
        assertFalse(dataFilesToRead.findAny().isPresent());
        tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (only updates, written to .log file)
         */
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        records = dataGen.generateUpdates(newCommitTime, records);
        writeRecords = jsc().parallelize(records, 1);
        statuses = client.upsert(writeRecords, newCommitTime).collect();
        assertNoWriteErrors(statuses);
        /*
         * Write 3 (only deletes, written to .log file)
         */
        newCommitTime = "004";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> fewRecordsForDelete = dataGen.generateDeletesFromExistingRecords(records);
        statuses = client.upsert(jsc().parallelize(fewRecordsForDelete, 1), newCommitTime).collect();
        // Verify there are no errors
        assertNoWriteErrors(statuses);
        metaClient = HoodieTableMetaClient.reload(metaClient);
        deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().lastInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("004", deltaCommit.get().getTimestamp(), "Latest Delta commit should be 004");
        commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        allFiles = listAllBaseFilesInPath(hoodieTable);
        tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent());
        List<String> inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
        List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath(), new JobConf(hadoopConf()), true, populateMetaFields);
        // Wrote 20 records and deleted 20 records, so remaining 20-20 = 0
        assertEquals(0, recordsRead.size(), "Must contain 0 records");
    }
}
Also used : HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) Properties(java.util.Properties) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) GenericRecord(org.apache.avro.generic.GenericRecord) JobConf(org.apache.hadoop.mapred.JobConf) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) Path(org.apache.hadoop.fs.Path) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) HoodieTable(org.apache.hudi.table.HoodieTable) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) ValueSource(org.junit.jupiter.params.provider.ValueSource) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest)
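The snippet above drives its read path through getLatestBaseFiles(), which returns a Stream<HoodieBaseFile>. As a minimal sketch (assuming the same HoodieTableFileSystemView import; the helper is illustrative), each HoodieBaseFile carries the file group id, the commit that produced it, and the path the test turns into input paths:

private static void printLatestBaseFiles(HoodieTableFileSystemView tableView) {
    tableView.getLatestBaseFiles().forEach(baseFile ->
        // fileId identifies the file group; commitTime is the instant that wrote this base file version
        System.out.println(baseFile.getFileId() + " @ " + baseFile.getCommitTime() + " -> " + baseFile.getPath()));
}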

Example 10 with HoodieBaseFile

Use of org.apache.hudi.common.model.HoodieBaseFile in project hudi by apache.

From class TestHoodieCompactionStrategy, method createCompactionOperations:

private List<HoodieCompactionOperation> createCompactionOperations(HoodieWriteConfig config, Map<Long, List<Long>> sizesMap, Map<Long, String> keyToPartitionMap) {
    List<HoodieCompactionOperation> operations = new ArrayList<>(sizesMap.size());
    sizesMap.forEach((k, v) -> {
        HoodieBaseFile df = TestHoodieBaseFile.newDataFile(k);
        String partitionPath = keyToPartitionMap.get(k);
        List<HoodieLogFile> logFiles = v.stream().map(TestHoodieLogFile::newLogFile).collect(Collectors.toList());
        FileSlice slice = new FileSlice(new HoodieFileGroupId(partitionPath, df.getFileId()), df.getCommitTime());
        slice.setBaseFile(df);
        logFiles.stream().forEach(f -> slice.addLogFile(f));
        operations.add(new HoodieCompactionOperation(df.getCommitTime(), logFiles.stream().map(s -> s.getPath().toString()).collect(Collectors.toList()), df.getPath(), df.getFileId(), partitionPath, config.getCompactionStrategy().captureMetrics(config, slice), df.getBootstrapBaseFile().map(BaseFile::getPath).orElse(null)));
    });
    return operations;
}
Also used : Arrays(java.util.Arrays) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Date(java.util.Date) FileSlice(org.apache.hudi.common.model.FileSlice) SimpleDateFormat(java.text.SimpleDateFormat) HashMap(java.util.HashMap) Random(java.util.Random) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Collections(java.util.Collections) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) Pair(org.apache.hudi.common.util.collection.Pair) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) ArrayList(java.util.ArrayList) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile)
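createCompactionOperations builds each HoodieCompactionOperation from a FileSlice that pairs a base file with its log files. A minimal sketch of just that wiring, using only the model calls already shown above (the helper name is illustrative):

private static FileSlice toFileSlice(String partitionPath, HoodieBaseFile baseFile, List<HoodieLogFile> logFiles) {
    // a file slice is keyed by (partition path, file id) plus the base instant time
    FileSlice slice = new FileSlice(new HoodieFileGroupId(partitionPath, baseFile.getFileId()), baseFile.getCommitTime());
    slice.setBaseFile(baseFile);
    // attach the delta log files that a compaction would merge back into the base file
    logFiles.forEach(slice::addLogFile);
    return slice;
}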

Aggregations

HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 71 usages
Path (org.apache.hadoop.fs.Path): 40 usages
ArrayList (java.util.ArrayList): 33 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 31 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 31 usages
FileSlice (org.apache.hudi.common.model.FileSlice): 29 usages
List (java.util.List): 27 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 27 usages
IOException (java.io.IOException): 26 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 25 usages
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 24 usages
Pair (org.apache.hudi.common.util.collection.Pair): 24 usages
Option (org.apache.hudi.common.util.Option): 23 usages
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 23 usages
Collectors (java.util.stream.Collectors): 21 usages
Test (org.junit.jupiter.api.Test): 21 usages
ParameterizedTest (org.junit.jupiter.params.ParameterizedTest): 21 usages
Map (java.util.Map): 20 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 20 usages
HoodieTable (org.apache.hudi.table.HoodieTable): 20 usages