
Example 31 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class TestHoodieParquetInputFormat method testPendingCompactionWithActiveCommits.

// Verify that HoodieParquetInputFormat does not return instants after pending compaction
@Test
public void testPendingCompactionWithActiveCommits() throws IOException {
    // set up 6 sample instants in the timeline
    List<HoodieInstant> instants = new ArrayList<>();
    HoodieInstant t1 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "1");
    HoodieInstant t2 = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "2");
    HoodieInstant t3 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "3");
    HoodieInstant t4 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "4");
    HoodieInstant t5 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "5");
    HoodieInstant t6 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "6");
    instants.add(t1);
    instants.add(t2);
    instants.add(t3);
    instants.add(t4);
    instants.add(t5);
    instants.add(t6);
    HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath.toString());
    HoodieActiveTimeline timeline = new HoodieActiveTimeline(metaClient);
    timeline.setInstants(instants);
    // Verify that filterInstantsTimeline does not return the first pending compaction instant (t3) or anything after it
    HoodieTimeline filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline);
    assertTrue(filteredTimeline.containsInstant(t1));
    assertTrue(filteredTimeline.containsInstant(t2));
    assertFalse(filteredTimeline.containsInstant(t3));
    assertFalse(filteredTimeline.containsInstant(t4));
    assertFalse(filteredTimeline.containsInstant(t5));
    assertFalse(filteredTimeline.containsInstant(t6));
    // remove compaction instant and setup timeline again
    instants.remove(t3);
    timeline = new HoodieActiveTimeline(metaClient);
    timeline.setInstants(instants);
    filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline);
    // verify that instants before the next pending compaction (t5) are returned
    assertTrue(filteredTimeline.containsInstant(t1));
    assertTrue(filteredTimeline.containsInstant(t2));
    assertFalse(filteredTimeline.containsInstant(t3));
    assertTrue(filteredTimeline.containsInstant(t4));
    assertFalse(filteredTimeline.containsInstant(t5));
    assertFalse(filteredTimeline.containsInstant(t6));
    // remove remaining compaction instant and setup timeline again
    instants.remove(t5);
    timeline = new HoodieActiveTimeline(metaClient);
    timeline.setInstants(instants);
    filteredTimeline = HoodieInputFormatUtils.filterInstantsTimeline(timeline);
    // verify all remaining instants are returned.
    assertTrue(filteredTimeline.containsInstant(t1));
    assertTrue(filteredTimeline.containsInstant(t2));
    assertFalse(filteredTimeline.containsInstant(t3));
    assertTrue(filteredTimeline.containsInstant(t4));
    assertFalse(filteredTimeline.containsInstant(t5));
    assertTrue(filteredTimeline.containsInstant(t6));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) Test(org.junit.jupiter.api.Test)
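
The pattern these assertions verify is simple: the filtered timeline must stop at the first pending compaction. Below is a minimal sketch of that logic, assuming only the HoodieTimeline methods filterPendingCompactionTimeline, firstInstant, and findInstantsBefore; the actual HoodieInputFormatUtils.filterInstantsTimeline implementation may differ.

// Hedged sketch, not the actual Hudi implementation: keep only instants
// strictly before the first pending compaction, so readers never see
// commits that a pending compaction may still rewrite.
public static HoodieTimeline filterBeforePendingCompaction(HoodieTimeline timeline) {
    Option<HoodieInstant> firstPendingCompaction = timeline.filterPendingCompactionTimeline().firstInstant();
    if (firstPendingCompaction.isPresent()) {
        // Exclude the pending compaction itself and every instant after it.
        return timeline.findInstantsBefore(firstPendingCompaction.get().getTimestamp());
    }
    return timeline;
}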

Example 32 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class TestHoodieHFileInputFormat method testPendingCompactionWithActiveCommits.

// Verify that HoodieHFileInputFormat does not return instants after pending compaction
@Test
public void testPendingCompactionWithActiveCommits() throws IOException {
    // set up 6 sample instants in the timeline
    List<HoodieInstant> instants = new ArrayList<>();
    HoodieInstant t1 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "1");
    HoodieInstant t2 = new HoodieInstant(HoodieInstant.State.INFLIGHT, HoodieTimeline.DELTA_COMMIT_ACTION, "2");
    HoodieInstant t3 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "3");
    HoodieInstant t4 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "4");
    HoodieInstant t5 = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, "5");
    HoodieInstant t6 = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.DELTA_COMMIT_ACTION, "6");
    instants.add(t1);
    instants.add(t2);
    instants.add(t3);
    instants.add(t4);
    instants.add(t5);
    instants.add(t6);
    HoodieTableMetaClient metaClient = HoodieTestUtils.init(basePath.toString(), HoodieFileFormat.HFILE);
    HoodieActiveTimeline timeline = new HoodieActiveTimeline(metaClient);
    timeline.setInstants(instants);
    // Verify that inputFormat.filterInstantsTimeline does not return the first pending compaction instant (t3) or anything after it
    HoodieTimeline filteredTimeline = inputFormat.filterInstantsTimeline(timeline);
    assertTrue(filteredTimeline.containsInstant(t1));
    assertTrue(filteredTimeline.containsInstant(t2));
    assertFalse(filteredTimeline.containsInstant(t3));
    assertFalse(filteredTimeline.containsInstant(t4));
    assertFalse(filteredTimeline.containsInstant(t5));
    assertFalse(filteredTimeline.containsInstant(t6));
    // remove compaction instant and setup timeline again
    instants.remove(t3);
    timeline = new HoodieActiveTimeline(metaClient);
    timeline.setInstants(instants);
    filteredTimeline = inputFormat.filterInstantsTimeline(timeline);
    // verify that instants before the next pending compaction (t5) are returned
    assertTrue(filteredTimeline.containsInstant(t1));
    assertTrue(filteredTimeline.containsInstant(t2));
    assertFalse(filteredTimeline.containsInstant(t3));
    assertTrue(filteredTimeline.containsInstant(t4));
    assertFalse(filteredTimeline.containsInstant(t5));
    assertFalse(filteredTimeline.containsInstant(t6));
    // remove remaining compaction instant and setup timeline again
    instants.remove(t5);
    timeline = new HoodieActiveTimeline(metaClient);
    timeline.setInstants(instants);
    filteredTimeline = inputFormat.filterInstantsTimeline(timeline);
    // verify all remaining instants are returned.
    assertTrue(filteredTimeline.containsInstant(t1));
    assertTrue(filteredTimeline.containsInstant(t2));
    assertFalse(filteredTimeline.containsInstant(t3));
    assertTrue(filteredTimeline.containsInstant(t4));
    assertFalse(filteredTimeline.containsInstant(t5));
    assertTrue(filteredTimeline.containsInstant(t6));
}
Also used : HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ArrayList(java.util.ArrayList) Test(org.junit.jupiter.api.Test)
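
The two tests above repeat the same block of six containment assertions three times. A small table-driven helper (hypothetical — assertFilteredContains is not part of the Hudi test suite) would keep the expected-survivor set explicit per scenario:

// Hypothetical test helper: assert exactly which instants survive filtering.
// "all" is the full list of instants built in the test; "expected" is the
// subset that should remain visible after filtering.
private static void assertFilteredContains(HoodieTimeline filtered, List<HoodieInstant> all, Set<HoodieInstant> expected) {
    for (HoodieInstant instant : all) {
        if (expected.contains(instant)) {
            assertTrue(filtered.containsInstant(instant), "expected " + instant + " to be kept");
        } else {
            assertFalse(filtered.containsInstant(instant), "expected " + instant + " to be filtered out");
        }
    }
}

With it, the first scenario reduces to assertFilteredContains(filteredTimeline, instants, new HashSet<>(Arrays.asList(t1, t2))).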

Example 33 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class HoodieMergeOnReadTableInputFormat method listStatusForIncrementalMode.

/**
 * Keep the logic of mor_incr_view the same as the Spark datasource.
 * Step1: Get the list of commits to be fetched based on the start commit and max commits (for snapshot queries, max commits is -1).
 * Step2: Get the list of affected file statuses for these commits.
 * Step3: Construct a HoodieTableFileSystemView from those affected file statuses.
 *        a. Filter affected partitions based on inputPaths.
 *        b. Get the list of fileGroups for the affected partitions via fsView.getAllFileGroups.
 * Step4: Set input paths based on the filtered affected partition paths. This narrows the original input paths passed to
 *        this method: some partitions have no commits in the trimmed-down list of commits, so they must be excluded.
 * Step5: Find candidate fileStatus objects. The base file statuses obtained from HoodieTableFileSystemView
 *        are missing file size information, so we use the candidate fileStatus to fill in the size
 *        of each BaseFileStatus.
 * Step6: For every file group from step3(b),
 *        get the first available base file from all file slices, use the candidate file status to update its baseFileStatus,
 *        construct a RealtimeFileStatus, and add it to the result along with the log files.
 *        If a file group has only log files, construct a RealtimeFileStatus and add it to the result.
 * TODO: unify the incremental view code between hive/spark-sql and the spark datasource
 */
@Override
protected List<FileStatus> listStatusForIncrementalMode(JobConf job, HoodieTableMetaClient tableMetaClient, List<Path> inputPaths, String incrementalTableName) throws IOException {
    List<FileStatus> result = new ArrayList<>();
    Job jobContext = Job.getInstance(job);
    // step1
    Option<HoodieTimeline> timeline = HoodieInputFormatUtils.getFilteredCommitsTimeline(jobContext, tableMetaClient);
    if (!timeline.isPresent()) {
        return result;
    }
    HoodieTimeline commitsTimelineToReturn = HoodieInputFormatUtils.getHoodieTimelineForIncrementalQuery(jobContext, incrementalTableName, timeline.get());
    Option<List<HoodieInstant>> commitsToCheck = Option.of(commitsTimelineToReturn.getInstants().collect(Collectors.toList()));
    if (!commitsToCheck.isPresent()) {
        return result;
    }
    // step2
    commitsToCheck.get().sort(HoodieInstant::compareTo);
    List<HoodieCommitMetadata> metadataList = commitsToCheck.get().stream().map(instant -> {
        try {
            return HoodieInputFormatUtils.getCommitMetadata(instant, commitsTimelineToReturn);
        } catch (IOException e) {
            throw new HoodieException(String.format("cannot get metadata for instant: %s", instant));
        }
    }).collect(Collectors.toList());
    // list the file statuses touched by these commits
    List<FileStatus> affectedFileStatus = Arrays.asList(HoodieInputFormatUtils.listAffectedFilesForCommits(job, new Path(tableMetaClient.getBasePath()), metadataList));
    // step3
    HoodieTableFileSystemView fsView = new HoodieTableFileSystemView(tableMetaClient, commitsTimelineToReturn, affectedFileStatus.toArray(new FileStatus[0]));
    // build fileGroup from fsView
    Path basePath = new Path(tableMetaClient.getBasePath());
    // filter affectedPartition by inputPaths
    List<String> affectedPartition = HoodieInputFormatUtils.getWritePartitionPaths(metadataList).stream().filter(k -> k.isEmpty() ? inputPaths.contains(basePath) : inputPaths.contains(new Path(basePath, k))).collect(Collectors.toList());
    if (affectedPartition.isEmpty()) {
        return result;
    }
    List<HoodieFileGroup> fileGroups = affectedPartition.stream().flatMap(partitionPath -> fsView.getAllFileGroups(partitionPath)).collect(Collectors.toList());
    // step4
    setInputPaths(job, affectedPartition.stream().map(p -> p.isEmpty() ? basePath.toString() : new Path(basePath, p).toString()).collect(Collectors.joining(",")));
    // step5
    // find all file status in partitionPaths.
    FileStatus[] fileStatuses = doListStatus(job);
    Map<String, FileStatus> candidateFileStatus = new HashMap<>();
    for (int i = 0; i < fileStatuses.length; i++) {
        String key = fileStatuses[i].getPath().toString();
        candidateFileStatus.put(key, fileStatuses[i]);
    }
    Option<HoodieVirtualKeyInfo> virtualKeyInfoOpt = getHoodieVirtualKeyInfo(tableMetaClient);
    String maxCommitTime = fsView.getLastInstant().get().getTimestamp();
    // step6
    result.addAll(collectAllIncrementalFiles(fileGroups, maxCommitTime, basePath.toString(), candidateFileStatus, virtualKeyInfoOpt));
    return result;
}
Also used : HoodieInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieInputFormatUtils) Arrays(java.util.Arrays) FileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.FileStatusWithBootstrapBaseFile) FileSystem(org.apache.hadoop.fs.FileSystem) HiveHoodieTableFileIndex(org.apache.hudi.hadoop.HiveHoodieTableFileIndex) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) ValidationUtils.checkState(org.apache.hudi.common.util.ValidationUtils.checkState) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) SplitLocationInfo(org.apache.hadoop.mapred.SplitLocationInfo) HoodieCopyOnWriteTableInputFormat(org.apache.hudi.hadoop.HoodieCopyOnWriteTableInputFormat) FileSplit(org.apache.hadoop.mapred.FileSplit) HoodieFileGroup(org.apache.hudi.common.model.HoodieFileGroup) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Configurable(org.apache.hadoop.conf.Configurable) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) LocatedFileStatusWithBootstrapBaseFile(org.apache.hudi.hadoop.LocatedFileStatusWithBootstrapBaseFile) RealtimeFileStatus(org.apache.hudi.hadoop.RealtimeFileStatus) FileInputFormat(org.apache.hadoop.mapred.FileInputFormat) HoodieCommitMetadata(org.apache.hudi.common.model.HoodieCommitMetadata) IOException(java.io.IOException) BootstrapBaseFileSplit(org.apache.hudi.hadoop.BootstrapBaseFileSplit) Collectors(java.util.stream.Collectors) HoodieTableFileSystemView(org.apache.hudi.common.table.view.HoodieTableFileSystemView) JobConf(org.apache.hadoop.mapred.JobConf) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) Stream(java.util.stream.Stream) HoodieRealtimeInputFormatUtils(org.apache.hudi.hadoop.utils.HoodieRealtimeInputFormatUtils) Job(org.apache.hadoop.mapreduce.Job) InputSplit(org.apache.hadoop.mapred.InputSplit) HoodieIOException(org.apache.hudi.exception.HoodieIOException)
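
For reference, this incremental path is driven by per-table job properties on the Hive side. The sketch below shows how a JobConf might be configured so that listStatusForIncrementalMode is taken; the hoodie.<table>.consume.* key pattern follows Hudi's Hive integration, but verify the exact keys against HoodieHiveUtils for your Hudi version.

// Hedged sketch: build a JobConf that requests an incremental read of the
// given table. Property keys follow the hoodie.<table>.consume.* pattern
// used by Hudi's Hive integration; verify them against HoodieHiveUtils.
static JobConf incrementalJobConf(String tableName, String startCommitTime, int maxCommits) {
    JobConf job = new JobConf();
    // Read incrementally instead of taking a full snapshot (step1 above).
    job.set(String.format("hoodie.%s.consume.mode", tableName), "INCREMENTAL");
    // Only commits strictly after this instant are fetched.
    job.set(String.format("hoodie.%s.consume.start.timestamp", tableName), startCommitTime);
    // Cap the number of commits pulled per query (-1 means snapshot/unbounded).
    job.set(String.format("hoodie.%s.consume.max.commits", tableName), String.valueOf(maxCommits));
    return job;
}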

Example 34 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class RepairsCommand method removeCorruptedPendingCleanAction.

@CliCommand(value = "repair corrupted clean files", help = "repair corrupted clean files")
public void removeCorruptedPendingCleanAction() {
    HoodieTableMetaClient client = HoodieCLI.getTableMetaClient();
    HoodieTimeline cleanerTimeline = HoodieCLI.getTableMetaClient().getActiveTimeline().getCleanerTimeline();
    LOG.info("Inspecting pending clean metadata in timeline for corrupted files");
    cleanerTimeline.filterInflightsAndRequested().getInstants().forEach(instant -> {
        try {
            CleanerUtils.getCleanerPlan(client, instant);
        } catch (AvroRuntimeException e) {
            LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant);
            HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant);
        } catch (IOException ioe) {
            if (ioe.getMessage().contains("Not an Avro data file")) {
                LOG.warn("Corruption found. Trying to remove corrupted clean instant file: " + instant);
                HoodieActiveTimeline.deleteInstantFile(client.getFs(), client.getMetaPath(), instant);
            } else {
                throw new HoodieIOException(ioe.getMessage(), ioe);
            }
        }
    });
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) AvroRuntimeException(org.apache.avro.AvroRuntimeException) IOException(java.io.IOException) CliCommand(org.springframework.shell.core.annotation.CliCommand)
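
Both catch branches apply the same recovery and differ only in how corruption is detected. A hypothetical predicate (isCorruptCleanPlan is not part of RepairsCommand) would centralize that decision and also guard against the null-message NPE that the inline ioe.getMessage().contains(...) call risks:

// Hypothetical helper: decide whether a failure from CleanerUtils.getCleanerPlan
// indicates a corrupted clean instant file that is safe to delete. Guards
// against a null exception message, unlike the inline check above.
private static boolean isCorruptCleanPlan(Exception e) {
    if (e instanceof AvroRuntimeException) {
        return true;
    }
    return e instanceof IOException
        && e.getMessage() != null
        && e.getMessage().contains("Not an Avro data file");
}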

Example 35 with HoodieTimeline

use of org.apache.hudi.common.table.timeline.HoodieTimeline in project hudi by apache.

the class SavepointsCommand method rollbackToSavepoint.

@CliCommand(value = "savepoint rollback", help = "Savepoint a commit")
public String rollbackToSavepoint(@CliOption(key = { "savepoint" }, help = "Savepoint to rollback") final String instantTime, @CliOption(key = { "sparkProperties" }, help = "Spark Properties File Path") final String sparkPropertiesPath, @CliOption(key = "sparkMaster", unspecifiedDefaultValue = "", help = "Spark Master") String master, @CliOption(key = "sparkMemory", unspecifiedDefaultValue = "4G", help = "Spark executor memory") final String sparkMemory) throws Exception {
    HoodieTableMetaClient metaClient = HoodieCLI.getTableMetaClient();
    if (metaClient.getActiveTimeline().getSavePointTimeline().filterCompletedInstants().empty()) {
        throw new HoodieException("There are no completed instants to run rollback");
    }
    HoodieActiveTimeline activeTimeline = metaClient.getActiveTimeline();
    HoodieTimeline timeline = activeTimeline.getCommitsTimeline().filterCompletedInstants();
    List<HoodieInstant> instants = timeline.getInstants().filter(instant -> instant.getTimestamp().equals(instantTime)).collect(Collectors.toList());
    if (instants.isEmpty()) {
        return "Commit " + instantTime + " not found in Commits " + timeline;
    }
    SparkLauncher sparkLauncher = SparkUtil.initLauncher(sparkPropertiesPath);
    sparkLauncher.addAppArgs(SparkMain.SparkCommand.ROLLBACK_TO_SAVEPOINT.toString(), master, sparkMemory, instantTime, metaClient.getBasePath());
    Process process = sparkLauncher.launch();
    InputStreamConsumer.captureOutput(process);
    int exitCode = process.waitFor();
    // Refresh the current table metadata
    HoodieCLI.refreshTableMetadata();
    if (exitCode != 0) {
        return String.format("Savepoint \"%s\" failed to roll back", instantTime);
    }
    return String.format("Savepoint \"%s\" rolled back", instantTime);
}
Also used : HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CliCommand(org.springframework.shell.core.annotation.CliCommand) SparkLauncher(org.apache.spark.launcher.SparkLauncher) HoodieException(org.apache.hudi.exception.HoodieException) HoodieTableHeaderFields(org.apache.hudi.cli.HoodieTableHeaderFields) SparkUtil(org.apache.hudi.cli.utils.SparkUtil) CliOption(org.springframework.shell.core.annotation.CliOption) Collectors(java.util.stream.Collectors) HoodieCLI(org.apache.hudi.cli.HoodieCLI) InputStreamConsumer(org.apache.hudi.cli.utils.InputStreamConsumer) Component(org.springframework.stereotype.Component) List(java.util.List) HoodiePrintHelper(org.apache.hudi.cli.HoodiePrintHelper) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) CommandMarker(org.springframework.shell.core.CommandMarker)
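
One gap worth noting: the command verifies that instantTime exists in the completed commits timeline, but never that a savepoint was actually taken for it. A hedged pre-check sketch, using only the timeline calls already shown in this example:

// Hedged sketch: confirm instantTime is a completed savepoint before
// launching the Spark rollback job. Reuses getSavePointTimeline,
// filterCompletedInstants, and getInstants exactly as above.
boolean isSavepointed = metaClient.getActiveTimeline().getSavePointTimeline()
    .filterCompletedInstants().getInstants()
    .anyMatch(i -> i.getTimestamp().equals(instantTime));
if (!isSavepointed) {
    return "Commit " + instantTime + " has no completed savepoint; refusing to roll back";
}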

Aggregations

HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline) 118
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 74
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 39
List (java.util.List) 36
IOException (java.io.IOException) 34
HoodieCommitMetadata (org.apache.hudi.common.model.HoodieCommitMetadata) 34
ArrayList (java.util.ArrayList) 32
Option (org.apache.hudi.common.util.Option) 30
Collectors (java.util.stream.Collectors) 29
HoodieActiveTimeline (org.apache.hudi.common.table.timeline.HoodieActiveTimeline) 29
HoodieException (org.apache.hudi.exception.HoodieException) 26
Map (java.util.Map) 25
FileStatus (org.apache.hadoop.fs.FileStatus) 24
Path (org.apache.hadoop.fs.Path) 24
Set (java.util.Set) 22
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 22
FileSlice (org.apache.hudi.common.model.FileSlice) 21
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 21
Pair (org.apache.hudi.common.util.collection.Pair) 21
FSUtils (org.apache.hudi.common.fs.FSUtils) 20