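Code examples for org.apache.hudi.common.util.collection.Pair, collected from the Apache Hudi codebase. The examples below exercise Pair in three roles: stream element, composite map value, and lightweight two-value return type. As a primer, here is a minimal sketch of the API surface they rely on; only the calls actually observed in the snippets (Pair.of, getLeft, getRight, getKey, getValue) are assumed, and the class name PairApiSketch is hypothetical.

import org.apache.hudi.common.util.collection.Pair;

public class PairApiSketch {
    public static void main(String[] args) {
        // Static factory, as in Pair.of(partitionPathStatMap, globalStat) in Example 4.
        Pair<String, Integer> pair = Pair.of("2016/04/15", 2);
        // Left/right accessors, as in getRight() (Example 2) and Pair::getLeft (Example 5).
        String partition = pair.getLeft();
        int fileCount = pair.getRight();
        // Key/value aliases, as in pc.getKey() (Example 3) and fileInfo.getValue() (Example 1).
        System.out.println(pair.getKey() + " -> " + pair.getValue());
        System.out.println(partition + " holds " + fileCount + " files");
    }
}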

Example 1 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

From the class TestRepairUtils, the method testGetBaseAndLogFilePathsFromTimeline:

@Test
public void testGetBaseAndLogFilePathsFromTimeline() throws IOException {
    setupTimelineInFS();
    HoodieTimeline timeline = metaClient.getActiveTimeline();
    HoodieInstant commitInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001");
    HoodieInstant inflightInstant = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMMIT_ACTION, "005");
    HoodieInstant compactionInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMPACTION_ACTION, "006");
    Map<String, List<Pair<String, String>>> partitionToFileIdAndNameMap = instantInfoMap.get(commitInstant.getTimestamp());
    Set<String> expectedPaths = partitionToFileIdAndNameMap.entrySet().stream()
        .flatMap(entry -> entry.getValue().stream()
            .map(fileInfo -> new Path(entry.getKey(), fileInfo.getValue()).toString()))
        .collect(Collectors.toSet());
    assertEquals(Option.of(expectedPaths), RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, commitInstant));
    assertThrows(HoodieException.class, () -> RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, inflightInstant));
    assertEquals(Option.empty(), RepairUtils.getBaseAndLogFilePathsFromTimeline(timeline, compactionInstant));
}
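The test pins down the contract of RepairUtils.getBaseAndLogFilePathsFromTimeline: a completed commit yields Option.of with the touched file paths, an inflight instant throws HoodieException, and a compaction plan yields Option.empty(). A minimal calling sketch under those observations; the surrounding wiring (an existing metaClient, an enclosing method declaring throws IOException) is assumed, not shown in the source.

    // Only completed instants are safe to pass; the test above shows inflight ones throw.
    HoodieTimeline completedTimeline = metaClient.getActiveTimeline().filterCompletedInstants();
    for (HoodieInstant instant : completedTimeline.getInstants().collect(Collectors.toList())) {
        Option<Set<String>> paths = RepairUtils.getBaseAndLogFilePathsFromTimeline(completedTimeline, instant);
        // Actions carrying no file info (e.g. a compaction plan) return Option.empty().
        if (paths.isPresent()) {
            paths.get().forEach(System.out::println);
        }
    }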

Example 2 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

From the class TestBootstrapUtils, the method testAllLeafFoldersWithFiles:

@Test
public void testAllLeafFoldersWithFiles() throws IOException {
    // All directories including marker dirs.
    List<String> folders = Arrays.asList("2016/04/15", "2016/05/16", "2016/05/17");
    folders.forEach(f -> {
        try {
            metaClient.getFs().mkdirs(new Path(new Path(basePath), f));
        } catch (IOException e) {
            throw new HoodieException(e);
        }
    });
    // Files inside partitions. Note: 2016/04/17 is not in the folder list above;
    // FileSystem.create() creates its parent directories implicitly, while the
    // pre-created 2016/05/17 stays empty, so exactly three leaf folders hold files.
    List<String> files = Stream.of(
            "2016/04/15/1_1-0-1_20190528120000", "2016/04/15/2_1-0-1_20190528120000",
            "2016/05/16/3_1-0-1_20190528120000", "2016/05/16/4_1-0-1_20190528120000",
            "2016/04/17/5_1-0-1_20190528120000", "2016/04/17/6_1-0-1_20190528120000")
        .map(file -> file + metaClient.getTableConfig().getBaseFileFormat().getFileExtension())
        .collect(Collectors.toList());
    files.forEach(f -> {
        try {
            metaClient.getFs().create(new Path(new Path(basePath), f));
        } catch (IOException e) {
            throw new HoodieException(e);
        }
    });
    List<Pair<String, List<HoodieFileStatus>>> collected = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath, context);
    assertEquals(3, collected.size());
    collected.forEach(k -> assertEquals(2, k.getRight().size()));
    // Simulate reading from un-partitioned dataset
    collected = BootstrapUtils.getAllLeafFoldersWithFiles(metaClient, metaClient.getFs(), basePath + "/" + folders.get(0), context);
    assertEquals(1, collected.size());
    collected.forEach(k -> assertEquals(2, k.getRight().size()));
}
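Each returned Pair maps a leaf folder (left) to the file statuses found under it (right). A minimal consumption sketch reusing the collected list from the test above:

    for (Pair<String, List<HoodieFileStatus>> leaf : collected) {
        // Left: leaf folder path; right: the files discovered under it.
        System.out.println(leaf.getLeft() + ": " + leaf.getRight().size() + " files");
    }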

Example 3 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

From the class CompactionTestBase, the method runNextDeltaCommits:

protected List<HoodieRecord> runNextDeltaCommits(SparkRDDWriteClient client, final HoodieReadClient readClient, List<String> deltaInstants, List<HoodieRecord> records, HoodieWriteConfig cfg, boolean insertFirst, List<String> expPendingCompactionInstants) throws Exception {
    HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
    List<Pair<String, HoodieCompactionPlan>> pendingCompactions = readClient.getPendingCompactions();
    List<String> gotPendingCompactionInstants = pendingCompactions.stream().map(Pair::getKey).sorted().collect(Collectors.toList());
    assertEquals(expPendingCompactionInstants, gotPendingCompactionInstants);
    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToCompactionOperation = CompactionUtils.getAllPendingCompactionOperations(metaClient);
    if (insertFirst) {
        // Use first instant for inserting records
        String firstInstant = deltaInstants.get(0);
        deltaInstants = deltaInstants.subList(1, deltaInstants.size());
        JavaRDD<HoodieRecord> writeRecords = jsc.parallelize(records, 1);
        client.startCommitWithTime(firstInstant);
        JavaRDD<WriteStatus> statuses = client.upsert(writeRecords, firstInstant);
        List<WriteStatus> statusList = statuses.collect();
        if (!cfg.shouldAutoCommit()) {
            client.commit(firstInstant, statuses);
        }
        assertNoWriteErrors(statusList);
        metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
        HoodieTable hoodieTable = getHoodieTable(metaClient, cfg);
        List<HoodieBaseFile> dataFilesToRead = getCurrentLatestBaseFiles(hoodieTable);
        assertTrue(dataFilesToRead.stream().findAny().isPresent(), "should list the base files we wrote in the delta commit");
        validateDeltaCommit(firstInstant, fgIdToCompactionOperation, cfg);
    }
    int numRecords = records.size();
    for (String instantTime : deltaInstants) {
        records = dataGen.generateUpdates(instantTime, numRecords);
        metaClient = HoodieTableMetaClient.builder().setConf(hadoopConf).setBasePath(cfg.getBasePath()).build();
        createNextDeltaCommit(instantTime, records, client, metaClient, cfg, false);
        validateDeltaCommit(instantTime, fgIdToCompactionOperation, cfg);
    }
    return records;
}
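Here Pair serves as a composite map value: fgIdToCompactionOperation pairs the compaction instant time with the planned operation for each file group, matching the Map type on the CompactionUtils.getAllPendingCompactionOperations call above. A minimal lookup sketch; the fgId variable is hypothetical:

    // Hypothetical fgId of type HoodieFileGroupId.
    Pair<String, HoodieCompactionOperation> pendingOp = fgIdToCompactionOperation.get(fgId);
    if (pendingOp != null) {
        String compactionInstantTime = pendingOp.getLeft();
        HoodieCompactionOperation operation = pendingOp.getRight();
    }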

Example 4 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

From the class HoodieClientTestHarness, the method buildProfile:

public static Pair<HashMap<String, WorkloadStat>, WorkloadStat> buildProfile(JavaRDD<HoodieRecord> inputRecordsRDD) {
    HashMap<String, WorkloadStat> partitionPathStatMap = new HashMap<>();
    WorkloadStat globalStat = new WorkloadStat();
    // group the records by partitionPath + currentLocation combination, count the number of
    // records in each partition
    Map<Tuple2<String, Option<HoodieRecordLocation>>, Long> partitionLocationCounts = inputRecordsRDD
        .mapToPair(record -> new Tuple2<>(
            new Tuple2<>(record.getPartitionPath(), Option.ofNullable(record.getCurrentLocation())), record))
        .countByKey();
    // count the number of both inserts and updates in each partition, update the counts to workLoadStats
    for (Map.Entry<Tuple2<String, Option<HoodieRecordLocation>>, Long> e : partitionLocationCounts.entrySet()) {
        String partitionPath = e.getKey()._1();
        Long count = e.getValue();
        Option<HoodieRecordLocation> locOption = e.getKey()._2();
        if (!partitionPathStatMap.containsKey(partitionPath)) {
            partitionPathStatMap.put(partitionPath, new WorkloadStat());
        }
        if (locOption.isPresent()) {
            // update
            partitionPathStatMap.get(partitionPath).addUpdates(locOption.get(), count);
            globalStat.addUpdates(locOption.get(), count);
        } else {
            // insert
            partitionPathStatMap.get(partitionPath).addInserts(count);
            globalStat.addInserts(count);
        }
    }
    return Pair.of(partitionPathStatMap, globalStat);
}
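The Pair acts as a lightweight two-value return, sparing a dedicated result class. A minimal calling sketch; only getLeft/getRight and WorkloadStat's default toString() are assumed beyond the method shown above:

    Pair<HashMap<String, WorkloadStat>, WorkloadStat> profile = buildProfile(inputRecordsRDD);
    HashMap<String, WorkloadStat> perPartitionStats = profile.getLeft();
    WorkloadStat globalStat = profile.getRight();
    perPartitionStats.forEach((partition, stat) -> System.out.println(partition + " -> " + stat));
    System.out.println("global: " + globalStat);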

Example 5 with Pair

Use of org.apache.hudi.common.util.collection.Pair in the Apache Hudi project.

From the class TestHoodieSparkMergeOnReadTableClustering, the method testClusteringWithNoBaseFiles:

@ParameterizedTest
@ValueSource(booleans = { true, false })
void testClusteringWithNoBaseFiles(boolean doUpdates) throws Exception {
    // Set a low compaction small-file size to generate more file groups.
    HoodieWriteConfig.Builder cfgBuilder = HoodieWriteConfig.newBuilder()
        .forTable("test-trip-table")
        .withPath(basePath())
        .withSchema(TRIP_EXAMPLE_SCHEMA)
        .withParallelism(2, 2)
        .withDeleteParallelism(2)
        .withAutoCommit(true)
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .compactionSmallFileSize(10L)
            .withInlineCompaction(false)
            .withMaxNumDeltaCommitsBeforeCompaction(1)
            .build())
        .withStorageConfig(HoodieStorageConfig.newBuilder()
            .hfileMaxFileSize(1024 * 1024 * 1024)
            .parquetMaxFileSize(1024 * 1024 * 1024)
            .build())
        .withEmbeddedTimelineServerEnabled(true)
        .withFileSystemViewConfig(new FileSystemViewStorageConfig.Builder()
            .withEnableBackupForRemoteFileSystemView(false)
            .build())
        .withIndexConfig(HoodieIndexConfig.newBuilder()
            .withIndexType(HoodieIndex.IndexType.INMEMORY)
            .build())
        .withClusteringConfig(HoodieClusteringConfig.newBuilder()
            .withClusteringMaxNumGroups(10)
            .withClusteringTargetPartitions(0)
            .withInlineClustering(true)
            .withInlineClusteringNumCommits(1)
            .build())
        .withRollbackUsingMarkers(false);
    HoodieWriteConfig cfg = cfgBuilder.build();
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, cfg.getProps());
    HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        // test 2 inserts
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 400);
        Stream<HoodieBaseFile> dataFiles = insertRecordsToMORTable(metaClient, records.subList(0, 200), client, cfg, newCommitTime);
        assertFalse(dataFiles.findAny().isPresent(), "should not have any base files");
        newCommitTime = "002";
        client.startCommitWithTime(newCommitTime);
        dataFiles = insertRecordsToMORTable(metaClient, records.subList(200, 400), client, cfg, newCommitTime);
        assertFalse(dataFiles.findAny().isPresent(), "should not have any base files");
        // run updates
        if (doUpdates) {
            newCommitTime = "003";
            client.startCommitWithTime(newCommitTime);
            records = dataGen.generateUpdates(newCommitTime, 100);
            updateRecordsInMORTable(metaClient, records, client, cfg, newCommitTime, false);
        }
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        hoodieTable.getHoodieView().sync();
        FileStatus[] allBaseFiles = listAllBaseFilesInPath(hoodieTable);
        // expect 0 base files for each partition
        assertEquals(0, allBaseFiles.length);
        String clusteringCommitTime = client.scheduleClustering(Option.empty()).get().toString();
        metaClient = HoodieTableMetaClient.reload(metaClient);
        hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        // verify log files are included in clustering plan for each partition.
        assertEquals(dataGen.getPartitionPaths().length, hoodieTable.getFileSystemView().getFileGroupsInPendingClustering().map(Pair::getLeft).count());
        // do the clustering and validate
        doClusteringAndValidate(client, clusteringCommitTime, metaClient, cfg, dataGen);
    }
}
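The Pair::getLeft mapping above implies that getFileGroupsInPendingClustering() streams pairs of file group id (left) and pending clustering instant (right). A minimal inspection sketch under that inferred shape:

    hoodieTable.getFileSystemView().getFileGroupsInPendingClustering()
        .forEach(pair -> System.out.println(
            pair.getLeft() + " pending clustering at " + pair.getRight().getTimestamp()));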

Aggregations

Classes co-occurring with Pair across the indexed usages, with occurrence counts:

Pair (org.apache.hudi.common.util.collection.Pair): 147
List (java.util.List): 98
Map (java.util.Map): 91
IOException (java.io.IOException): 89
Collectors (java.util.stream.Collectors): 87
Option (org.apache.hudi.common.util.Option): 87
ArrayList (java.util.ArrayList): 85
Path (org.apache.hadoop.fs.Path): 81
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 76
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 66
HashMap (java.util.HashMap): 65
LogManager (org.apache.log4j.LogManager): 64
Logger (org.apache.log4j.Logger): 64
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 63
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 54
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 54
Arrays (java.util.Arrays): 48
HoodieTable (org.apache.hudi.table.HoodieTable): 46
Test (org.junit.jupiter.api.Test): 46