Example 36 with HoodieTableFileSystemView

Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

From the class TestHoodieSparkMergeOnReadTableRollback, method testRollbackWithDeltaAndCompactionCommit.

@ParameterizedTest
@ValueSource(booleans = { true, false })
void testRollbackWithDeltaAndCompactionCommit(boolean rollbackUsingMarkers) throws Exception {
    // NOTE: First writer will have Metadata table DISABLED
    HoodieWriteConfig.Builder cfgBuilder = getConfigBuilder(false, rollbackUsingMarkers, HoodieIndex.IndexType.SIMPLE);
    addConfigsForPopulateMetaFields(cfgBuilder, true);
    HoodieWriteConfig cfg = cfgBuilder.build();
    Properties properties = new Properties();
    properties.setProperty(HoodieTableConfig.BASE_FILE_FORMAT.key(), HoodieTableConfig.BASE_FILE_FORMAT.defaultValue().toString());
    HoodieTableMetaClient metaClient = getHoodieMetaClient(HoodieTableType.MERGE_ON_READ, properties);
    try (SparkRDDWriteClient client = getHoodieWriteClient(cfg)) {
        HoodieTestDataGenerator dataGen = new HoodieTestDataGenerator();
        // Test delta commit rollback
        /*
         * Write 1 (only inserts)
         */
        String newCommitTime = "001";
        client.startCommitWithTime(newCommitTime);
        List<HoodieRecord> records = dataGen.generateInserts(newCommitTime, 200);
        JavaRDD<HoodieRecord> writeRecords = jsc().parallelize(records, 1);
        JavaRDD<WriteStatus> writeStatusJavaRDD = client.upsert(writeRecords, newCommitTime);
        List<WriteStatus> statuses = writeStatusJavaRDD.collect();
        assertNoWriteErrors(statuses);
        client.commit(newCommitTime, jsc().parallelize(statuses));
        HoodieTable hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
        Option<HoodieInstant> deltaCommit = metaClient.getActiveTimeline().getDeltaCommitTimeline().firstInstant();
        assertTrue(deltaCommit.isPresent());
        assertEquals("001", deltaCommit.get().getTimestamp(), "Delta commit should be 001");
        Option<HoodieInstant> commit = metaClient.getActiveTimeline().getCommitTimeline().firstInstant();
        assertFalse(commit.isPresent());
        FileStatus[] allFiles = listAllBaseFilesInPath(hoodieTable);
        HoodieTableFileSystemView tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitTimeline().filterCompletedInstants(), allFiles);
        Stream<HoodieBaseFile> dataFilesToRead = tableView.getLatestBaseFiles();
        assertFalse(dataFilesToRead.findAny().isPresent());
        tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
        dataFilesToRead = tableView.getLatestBaseFiles();
        assertTrue(dataFilesToRead.findAny().isPresent(), "should list the base files we wrote in the delta commit");
        /*
         * Write 2 (inserts + updates - testing failed delta commit)
         */
        final String commitTime1 = "002";
        // NOTE: Second writer will have Metadata table ENABLED
        try (SparkRDDWriteClient secondClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true))) {
            secondClient.startCommitWithTime(commitTime1);
            List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
            copyOfRecords = dataGen.generateUpdates(commitTime1, copyOfRecords);
            copyOfRecords.addAll(dataGen.generateInserts(commitTime1, 200));
            List<String> inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            assertEquals(200, recordsRead.size());
            statuses = secondClient.upsert(jsc().parallelize(copyOfRecords, 1), commitTime1).collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            // Test failed delta commit rollback
            secondClient.rollback(commitTime1);
            allFiles = listAllBaseFilesInPath(hoodieTable);
            // After rollback, there should be no base file with the failed commit time
            List<String> remainingFiles = Arrays.stream(allFiles)
                    .filter(file -> file.getPath().getName().contains(commitTime1))
                    .map(fileStatus -> fileStatus.getPath().toString())
                    .collect(Collectors.toList());
            assertEquals(0, remainingFiles.size(), "Files written by commit " + commitTime1
                    + " should have been removed by the rollback, but still remain. Files: " + remainingFiles);
            inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            assertEquals(200, recordsRead.size());
        }
        /*
         * Write 3 (inserts + updates - testing successful delta commit)
         */
        final String commitTime2 = "003";
        try (SparkRDDWriteClient thirdClient = getHoodieWriteClient(getHoodieWriteConfigWithSmallFileHandlingOff(true))) {
            thirdClient.startCommitWithTime(commitTime2);
            List<HoodieRecord> copyOfRecords = new ArrayList<>(records);
            copyOfRecords = dataGen.generateUpdates(commitTime2, copyOfRecords);
            copyOfRecords.addAll(dataGen.generateInserts(commitTime2, 200));
            List<String> inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            List<GenericRecord> recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            assertEquals(200, recordsRead.size());
            writeRecords = jsc().parallelize(copyOfRecords, 1);
            writeStatusJavaRDD = thirdClient.upsert(writeRecords, commitTime2);
            statuses = writeStatusJavaRDD.collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            // Test successful delta commit rollback
            thirdClient.rollback(commitTime2);
            allFiles = listAllBaseFilesInPath(hoodieTable);
            // After rollback, there should be no base file with the rolled-back commit time
            assertEquals(0, Arrays.stream(allFiles).filter(file -> file.getPath().getName().contains(commitTime2)).count());
            metaClient = HoodieTableMetaClient.reload(metaClient);
            hoodieTable = HoodieSparkTable.create(cfg, context(), metaClient);
            tableView = getHoodieTableFileSystemView(metaClient, hoodieTable.getCompletedCommitsTimeline(), allFiles);
            inputPaths = tableView.getLatestBaseFiles().map(baseFile -> new Path(baseFile.getPath()).getParent().toString()).collect(Collectors.toList());
            recordsRead = HoodieMergeOnReadTestUtils.getRecordsUsingInputFormat(hadoopConf(), inputPaths, basePath());
            // check that the number of records read is still correct after rollback operation
            assertEquals(200, recordsRead.size());
            // Test compaction commit rollback
            /*
             * Write 4 (updates)
             */
            newCommitTime = "004";
            thirdClient.startCommitWithTime(newCommitTime);
            writeStatusJavaRDD = thirdClient.upsert(writeRecords, newCommitTime);
            statuses = writeStatusJavaRDD.collect();
            // Verify there are no errors
            assertNoWriteErrors(statuses);
            thirdClient.commit(newCommitTime, jsc().parallelize(statuses));
            metaClient = HoodieTableMetaClient.reload(metaClient);
            String compactionInstantTime = thirdClient.scheduleCompaction(Option.empty()).get().toString();
            thirdClient.compact(compactionInstantTime);
            metaClient = HoodieTableMetaClient.reload(metaClient);
            final String compactedCommitTime = metaClient.getActiveTimeline().reload().lastInstant().get().getTimestamp();
            assertTrue(Arrays.stream(listAllBaseFilesInPath(hoodieTable)).anyMatch(file -> compactedCommitTime.equals(new HoodieBaseFile(file).getCommitTime())));
            hoodieTable.rollbackInflightCompaction(new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.COMPACTION_ACTION, compactedCommitTime));
            allFiles = listAllBaseFilesInPath(hoodieTable);
            metaClient = HoodieTableMetaClient.reload(metaClient);
            tableView = getHoodieTableFileSystemView(metaClient, metaClient.getCommitsTimeline(), allFiles);
            assertFalse(tableView.getLatestBaseFiles().anyMatch(file -> compactedCommitTime.equals(file.getCommitTime())));
            assertAll(tableView.getLatestBaseFiles().map(file -> () -> assertNotEquals(compactedCommitTime, file.getCommitTime())));
        }
    }
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieMergeOnReadTestUtils(org.apache.hudi.testutils.HoodieMergeOnReadTestUtils) Arrays(java.util.Arrays) HoodieFailedWritesCleaningPolicy(org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTestDataGenerator(org.apache.hudi.common.testutils.HoodieTestDataGenerator) Assertions.assertNotEquals(org.junit.jupiter.api.Assertions.assertNotEquals) FileStatus(org.apache.hadoop.fs.FileStatus) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) Assertions.assertFalse(org.junit.jupiter.api.Assertions.assertFalse) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) HoodieStorageConfig(org.apache.hudi.config.HoodieStorageConfig) Path(org.apache.hadoop.fs.Path) Assertions.assertAll(org.junit.jupiter.api.Assertions.assertAll) Tag(org.junit.jupiter.api.Tag) HoodieWriteMetadata(org.apache.hudi.table.action.HoodieWriteMetadata) TRIP_EXAMPLE_SCHEMA(org.apache.hudi.common.testutils.HoodieTestDataGenerator.TRIP_EXAMPLE_SCHEMA) Collection(java.util.Collection) Collectors(java.util.stream.Collectors) HoodieIndex(org.apache.hudi.index.HoodieIndex) Test(org.junit.jupiter.api.Test) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieWriteStat(org.apache.hudi.common.model.HoodieWriteStat) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) TableFileSystemView(org.apache.hudi.common.table.view.TableFileSystemView) FileSlice(org.apache.hudi.common.model.FileSlice) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieSparkTable(org.apache.hudi.table.HoodieSparkTable) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) JavaRDD(org.apache.spark.api.java.JavaRDD) SyncableFileSystemView(org.apache.hudi.common.table.view.SyncableFileSystemView) ValueSource(org.junit.jupiter.params.provider.ValueSource) HoodieRecord(org.apache.hudi.common.model.HoodieRecord) GenericRecord(org.apache.avro.generic.GenericRecord) Assertions.assertNoWriteErrors(org.apache.hudi.testutils.Assertions.assertNoWriteErrors) Properties(java.util.Properties) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Files(java.nio.file.Files) HoodieTestTable(org.apache.hudi.common.testutils.HoodieTestTable) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) File(java.io.File) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) ParameterizedTest(org.junit.jupiter.params.ParameterizedTest) SparkRDDWriteClient(org.apache.hudi.client.SparkRDDWriteClient) SparkClientFunctionalTestHarness(org.apache.hudi.testutils.SparkClientFunctionalTestHarness) HoodieIOException(org.apache.hudi.exception.HoodieIOException) Pair(org.apache.hudi.common.util.collection.Pair) 
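The assertions in this test repeat one pattern: list the table's base files, build a HoodieTableFileSystemView over the completed-commit timeline, and check which base files the view still exposes. Below is a minimal sketch of that check as a standalone helper, assuming a loaded HoodieTableMetaClient and a FileStatus[] listing; the class and method names here are hypothetical, and every view call appears in the example above.

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FileStatus;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.timeline.HoodieTimeline;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class RollbackViewCheck {

    /**
     * Returns the latest base-file paths that belong to the given commit time.
     * After that commit has been rolled back, the returned list should be empty.
     */
    public static List<String> latestBaseFilePathsForCommit(HoodieTableMetaClient metaClient,
                                                            FileStatus[] allFiles,
                                                            String commitTime) {
        // The view only exposes files belonging to completed commits.
        HoodieTimeline completed = metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants();
        HoodieTableFileSystemView view = new HoodieTableFileSystemView(metaClient, completed, allFiles);
        return view.getLatestBaseFiles()
                .filter(baseFile -> commitTime.equals(baseFile.getCommitTime()))
                .map(HoodieBaseFile::getPath)
                .collect(Collectors.toList());
    }
}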

Example 37 with HoodieTableFileSystemView

Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

From the class BaseHoodieTableFileIndex, method doRefresh.

private void doRefresh() {
    long startTime = System.currentTimeMillis();
    Map<PartitionPath, FileStatus[]> partitionFiles = loadPartitionPathFiles();
    FileStatus[] allFiles = partitionFiles.values().stream().flatMap(Arrays::stream).toArray(FileStatus[]::new);
    metaClient.reloadActiveTimeline();
    HoodieTimeline activeTimeline = getActiveTimeline();
    Option<HoodieInstant> latestInstant = activeTimeline.lastInstant();
    // TODO we can optimize the flow by:
    // - First fetch list of files from instants of interest
    // - Load FileStatus's
    fileSystemView = new HoodieTableFileSystemView(metaClient, activeTimeline, allFiles);
    Option<String> queryInstant = specifiedQueryInstant.or(() -> latestInstant.map(HoodieInstant::getTimestamp));
    validate(activeTimeline, queryInstant);
    if (tableType.equals(HoodieTableType.MERGE_ON_READ) && queryType.equals(HoodieTableQueryType.SNAPSHOT)) {
        cachedAllInputFileSlices = partitionFiles.keySet().stream()
                .collect(Collectors.toMap(Function.identity(),
                        partitionPath -> queryInstant
                                .map(instant -> fileSystemView
                                        .getLatestMergedFileSlicesBeforeOrOn(partitionPath.path, queryInstant.get())
                                        .collect(Collectors.toList()))
                                .orElse(Collections.emptyList())));
    } else {
        cachedAllInputFileSlices = partitionFiles.keySet().stream()
                .collect(Collectors.toMap(Function.identity(),
                        partitionPath -> queryInstant
                                .map(instant -> fileSystemView
                                        .getLatestFileSlicesBeforeOrOn(partitionPath.path, instant, true))
                                .orElse(fileSystemView.getLatestFileSlices(partitionPath.path))
                                .collect(Collectors.toList())));
    }
    cachedFileSize = cachedAllInputFileSlices.values().stream().flatMap(Collection::stream).mapToLong(BaseHoodieTableFileIndex::fileSliceSize).sum();
    // If the partition value contains InternalRow.empty, we query it as a non-partitioned table.
    queryAsNonePartitionedTable = partitionFiles.keySet().stream().anyMatch(p -> p.values.length == 0);
    long duration = System.currentTimeMillis() - startTime;
    LOG.info(String.format("Refresh table %s, spent: %d ms", metaClient.getTableConfig().getTableName(), duration));
}
Also used : Arrays(java.util.Arrays) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) TypedProperties(org.apache.hudi.common.config.TypedProperties) Collection(java.util.Collection) HoodieTableQueryType(org.apache.hudi.common.model.HoodieTableQueryType) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Objects(java.util.Objects) List(java.util.List) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) Collections(java.util.Collections) FSUtils(org.apache.hudi.common.fs.FSUtils)
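The branch in the middle of doRefresh() is the crux of this example: for MERGE_ON_READ snapshot queries, the view must merge each base file with its log files up to the query instant, while every other query type reads plain file slices. A hedged sketch of just that selection, assuming an already-built HoodieTableFileSystemView; the class and method names are illustrative, not Hudi API.

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.FileSlice;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class FileSliceLookup {

    /**
     * Picks file slices for one partition, mirroring the MOR-snapshot branch in
     * doRefresh(): merged slices for MOR snapshot reads, latest slices otherwise.
     */
    public static List<FileSlice> slicesForQuery(HoodieTableFileSystemView view,
                                                 String partitionPath,
                                                 String queryInstant,
                                                 boolean morSnapshotQuery) {
        if (morSnapshotQuery) {
            // Base files merged with their log files, up to and including queryInstant.
            return view.getLatestMergedFileSlicesBeforeOrOn(partitionPath, queryInstant)
                    .collect(Collectors.toList());
        }
        // The boolean flag includes file slices of pending compactions, as in the original code.
        return view.getLatestFileSlicesBeforeOrOn(partitionPath, queryInstant, true)
                .collect(Collectors.toList());
    }
}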

Example 38 with HoodieTableFileSystemView

Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

From the class FileSystemViewCommand, method showAllFileSlices.

@CliCommand(value = "show fsview all", help = "Show entire file-system view")
public String showAllFileSlices(
        @CliOption(key = { "pathRegex" }, help = "regex to select files, eg: 2016/08/02", unspecifiedDefaultValue = "*/*/*") String globRegex,
        @CliOption(key = { "baseFileOnly" }, help = "Only display base files view", unspecifiedDefaultValue = "false") boolean baseFileOnly,
        @CliOption(key = { "maxInstant" }, help = "File-Slices up to this instant are displayed", unspecifiedDefaultValue = "") String maxInstant,
        @CliOption(key = { "includeMax" }, help = "Include Max Instant", unspecifiedDefaultValue = "false") boolean includeMaxInstant,
        @CliOption(key = { "includeInflight" }, help = "Include Inflight Instants", unspecifiedDefaultValue = "false") boolean includeInflight,
        @CliOption(key = { "excludeCompaction" }, help = "Exclude compaction Instants", unspecifiedDefaultValue = "false") boolean excludeCompaction,
        @CliOption(key = { "limit" }, help = "Limit rows to be displayed", unspecifiedDefaultValue = "-1") Integer limit,
        @CliOption(key = { "sortBy" }, help = "Sorting Field", unspecifiedDefaultValue = "") final String sortByField,
        @CliOption(key = { "desc" }, help = "Ordering", unspecifiedDefaultValue = "false") final boolean descending,
        @CliOption(key = { "headeronly" }, help = "Print Header Only", unspecifiedDefaultValue = "false") final boolean headerOnly) throws IOException {
    HoodieTableFileSystemView fsView = buildFileSystemView(globRegex, maxInstant, baseFileOnly, includeMaxInstant, includeInflight, excludeCompaction);
    List<Comparable[]> rows = new ArrayList<>();
    fsView.getAllFileGroups().forEach(fg -> fg.getAllFileSlices().forEach(fs -> {
        int idx = 0;
        // For base file only Views, do not display any delta-file related columns
        Comparable[] row = new Comparable[baseFileOnly ? 5 : 8];
        row[idx++] = fg.getPartitionPath();
        row[idx++] = fg.getFileGroupId().getFileId();
        row[idx++] = fs.getBaseInstantTime();
        row[idx++] = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getPath() : "";
        row[idx++] = fs.getBaseFile().isPresent() ? fs.getBaseFile().get().getFileSize() : -1;
        if (!baseFileOnly) {
            row[idx++] = fs.getLogFiles().count();
            row[idx++] = fs.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
            row[idx++] = fs.getLogFiles().collect(Collectors.toList()).toString();
        }
        rows.add(row);
    }));
    Function<Object, String> converterFunction = entry -> NumericUtils.humanReadableByteCount((Double.parseDouble(entry.toString())));
    Map<String, Function<Object, String>> fieldNameToConverterMap = new HashMap<>();
    fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_FILE_SIZE, converterFunction);
    fieldNameToConverterMap.put(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE, converterFunction);
    TableHeader header = new TableHeader()
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_PARTITION)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_FILE_ID)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_BASE_INSTANT)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE)
            .addTableHeaderField(HoodieTableHeaderFields.HEADER_DATA_FILE_SIZE);
    if (!baseFileOnly) {
        header = header.addTableHeaderField(HoodieTableHeaderFields.HEADER_NUM_DELTA_FILES)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_TOTAL_DELTA_FILE_SIZE)
                .addTableHeaderField(HoodieTableHeaderFields.HEADER_DELTA_FILES);
    }
    return HoodiePrintHelper.print(header, fieldNameToConverterMap, sortByField, descending, limit, headerOnly, rows);
}
Also used : FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) CliOption(org.springframework.shell.core.annotation.CliOption) Function(java.util.function.Function) ArrayList(java.util.ArrayList) BiPredicate(java.util.function.BiPredicate) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CommandMarker(org.springframework.shell.core.CommandMarker) CliCommand(org.springframework.shell.core.annotation.CliCommand) TableHeader(org.apache.hudi.cli.TableHeader) IOException(java.io.IOException) HoodieDefaultTimeline(org.apache.hudi.common.table.timeline.HoodieDefaultTimeline) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) Serializable(java.io.Serializable) HoodieCLI(org.apache.hudi.cli.HoodieCLI) Component(org.springframework.stereotype.Component) List(java.util.List) Stream(java.util.stream.Stream) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) FSUtils(org.apache.hudi.common.fs.FSUtils) NumericUtils(org.apache.hudi.common.util.NumericUtils)
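The command's core traversal, getAllFileGroups() followed by getAllFileSlices(), is useful outside the CLI as well. A small sketch that prints the same per-slice data the command tabulates; the class name is made up, and all view methods appear in the example above.

import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class FsViewWalker {

    /** Prints one line per file slice: partition, file id, base file, and log-file totals. */
    public static void dump(HoodieTableFileSystemView fsView) {
        fsView.getAllFileGroups().forEach(fg ->
                fg.getAllFileSlices().forEach(fs -> {
                    // getLogFiles() returns a fresh stream on every call, so it is safe
                    // to consume it twice here, as the CLI command also does.
                    long logFiles = fs.getLogFiles().count();
                    long logBytes = fs.getLogFiles().mapToLong(HoodieLogFile::getFileSize).sum();
                    System.out.printf("%s %s base=%s logFiles=%d logBytes=%d%n",
                            fg.getPartitionPath(),
                            fg.getFileGroupId().getFileId(),
                            fs.getBaseFile().map(bf -> bf.getPath()).orElse("<none>"),
                            logFiles, logBytes);
                }));
    }
}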

Example 39 with HoodieTableFileSystemView

Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

From the class ITTestRepairsCommand, method testDeduplicateWithInserts.

/**
 * Test case for dry-run deduplication.
 */
@Test
public void testDeduplicateWithInserts() throws IOException {
    // get fs and check number of latest files
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
            metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
            fs.listStatus(new Path(duplicatedPartitionPath)));
    List<String> filteredStatuses = fsView.getLatestBaseFiles().map(HoodieBaseFile::getPath).collect(Collectors.toList());
    assertEquals(3, filteredStatuses.size(), "There should be 3 files.");
    // Before deduplicate, all files contain 210 records
    String[] files = filteredStatuses.toArray(new String[0]);
    Dataset df = readFiles(files);
    assertEquals(210, df.count());
    String partitionPath = HoodieTestDataGenerator.DEFAULT_FIRST_PARTITION_PATH;
    String cmdStr = String.format("repair deduplicate --duplicatedPartitionPath %s --repairedOutputPath %s --sparkMaster %s", partitionPath, repairedOutputPath, "local");
    CommandResult cr = getShell().executeCommand(cmdStr);
    assertTrue(cr.isSuccess());
    assertEquals(RepairsCommand.DEDUPLICATE_RETURN_PREFIX + repairedOutputPath, cr.getResult().toString());
    // After deduplicate, there are 200 records
    FileStatus[] fileStatus = fs.listStatus(new Path(repairedOutputPath));
    files = Arrays.stream(fileStatus).map(status -> status.getPath().toString()).toArray(String[]::new);
    Dataset result = readFiles(files);
    assertEquals(200, result.count());
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Dataset(org.apache.spark.sql.Dataset) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) CommandResult(org.springframework.shell.core.CommandResult) AbstractShellIntegrationTest(org.apache.hudi.cli.testutils.AbstractShellIntegrationTest) Test(org.junit.jupiter.api.Test)
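The view construction at the top of this test is the shortest way to ask "what are the current base files of this partition?". Pulled out as a helper, as a sketch only, with an invented class and method name:

import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.model.HoodieBaseFile;
import org.apache.hudi.common.table.HoodieTableMetaClient;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class LatestBaseFiles {

    /** Lists the latest base file of every file group under one partition path. */
    public static List<String> inPartition(HoodieTableMetaClient metaClient, FileSystem fs,
                                           String partitionPath) throws IOException {
        HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(
                metaClient,
                metaClient.getActiveTimeline().getCommitTimeline().filterCompletedInstants(),
                fs.listStatus(new Path(partitionPath)));
        return fsView.getLatestBaseFiles()
                .map(HoodieBaseFile::getPath)
                .collect(Collectors.toList());
    }
}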

Example 40 with HoodieTableFileSystemView

Use of org.apache.hudi.common.table.view.HoodieTableFileSystemView in project hudi by apache.

From the class HoodieTableSource, method buildFileIndex.

private List<MergeOnReadInputSplit> buildFileIndex() {
    Set<String> requiredPartitionPaths = getRequiredPartitionPaths();
    fileIndex.setPartitionPaths(requiredPartitionPaths);
    List<String> relPartitionPaths = fileIndex.getOrBuildPartitionPaths();
    if (relPartitionPaths.size() == 0) {
        return Collections.emptyList();
    }
    FileStatus[] fileStatuses = fileIndex.getFilesInPartitions();
    if (fileStatuses.length == 0) {
        throw new HoodieException("No files found for reading in user provided path.");
    }
    // File slices after a pending compaction-requested instant time are also considered valid.
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(metaClient,
            metaClient.getCommitsAndCompactionTimeline().filterCompletedAndCompactionInstants(), fileStatuses);
    String latestCommit = fsView.getLastInstant().get().getTimestamp();
    final String mergeType = this.conf.getString(FlinkOptions.MERGE_TYPE);
    final AtomicInteger cnt = new AtomicInteger(0);
    // generates one input split for each file group
    return relPartitionPaths.stream()
            .map(relPartitionPath -> fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit)
                    .map(fileSlice -> {
                        String basePath = fileSlice.getBaseFile().map(BaseFile::getPath).orElse(null);
                        Option<List<String>> logPaths = Option.ofNullable(fileSlice.getLogFiles()
                                .sorted(HoodieLogFile.getLogFileComparator())
                                .map(logFile -> logFile.getPath().toString())
                                .collect(Collectors.toList()));
                        return new MergeOnReadInputSplit(cnt.getAndAdd(1), basePath, logPaths, latestCommit,
                                metaClient.getBasePath(), maxCompactionMemoryInBytes, mergeType, null);
                    })
                    .collect(Collectors.toList()))
            .flatMap(Collection::stream)
            .collect(Collectors.toList());
}
Also used : MergeOnReadInputSplit(org.apache.hudi.table.format.mor.MergeOnReadInputSplit) FileStatus(org.apache.hadoop.fs.FileStatus) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) HoodieException(org.apache.hudi.exception.HoodieException) List(java.util.List) ArrayList(java.util.ArrayList) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView)
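Stripped of the Flink-specific MergeOnReadInputSplit wrapper, the split building reduces to one merged file slice per file group at the latest completed instant. A sketch of that core step, with a hypothetical SliceFiles holder standing in for the input-split class:

import java.util.List;
import java.util.stream.Collectors;

import org.apache.hudi.common.model.BaseFile;
import org.apache.hudi.common.model.HoodieLogFile;
import org.apache.hudi.common.table.view.HoodieTableFileSystemView;

public class SplitSketch {

    /** Base-file path (null for log-only file groups) plus sorted log-file paths. */
    public static final class SliceFiles {
        public final String basePath;
        public final List<String> logPaths;

        SliceFiles(String basePath, List<String> logPaths) {
            this.basePath = basePath;
            this.logPaths = logPaths;
        }
    }

    public static List<SliceFiles> slicesFor(HoodieTableFileSystemView fsView, String relPartitionPath) {
        // Read everything committed up to the last completed instant on the timeline.
        String latestCommit = fsView.getLastInstant().get().getTimestamp();
        return fsView.getLatestMergedFileSlicesBeforeOrOn(relPartitionPath, latestCommit)
                .map(slice -> new SliceFiles(
                        slice.getBaseFile().map(BaseFile::getPath).orElse(null),
                        slice.getLogFiles()
                                .sorted(HoodieLogFile.getLogFileComparator())
                                .map(logFile -> logFile.getPath().toString())
                                .collect(Collectors.toList())))
                .collect(Collectors.toList());
    }
}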

Aggregations

HoodieTableFileSystemView (org.apache.hudi.common.table.view.HoodieTableFileSystemView): 42 usages
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 29 usages
FileStatus (org.apache.hadoop.fs.FileStatus): 25 usages
Path (org.apache.hadoop.fs.Path): 24 usages
IOException (java.io.IOException): 22 usages
ArrayList (java.util.ArrayList): 22 usages
FileSlice (org.apache.hudi.common.model.FileSlice): 22 usages
List (java.util.List): 21 usages
Collectors (java.util.stream.Collectors): 20 usages
Option (org.apache.hudi.common.util.Option): 20 usages
Map (java.util.Map): 19 usages
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 19 usages
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 18 usages
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile): 17 usages
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile): 16 usages
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 16 usages
HoodieException (org.apache.hudi.exception.HoodieException): 15 usages
Stream (java.util.stream.Stream): 14 usages
Test (org.junit.jupiter.api.Test): 13 usages
HashMap (java.util.HashMap): 12 usages