
Example 51 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

From the class HoodieCompactor, the method compact:

/**
 * Execute a single compaction operation and report back status.
 */
public List<WriteStatus> compact(HoodieCompactionHandler compactionHandler, HoodieTableMetaClient metaClient,
                                 HoodieWriteConfig config, CompactionOperation operation, String instantTime,
                                 TaskContextSupplier taskContextSupplier) throws IOException {
    FileSystem fs = metaClient.getFs();
    Schema readerSchema = HoodieAvroUtils.addMetadataFields(new Schema.Parser().parse(config.getSchema()), config.allowOperationMetadataField());
    LOG.info("Compacting base " + operation.getDataFileName() + " with delta files " + operation.getDeltaFileNames() + " for commit " + instantTime);
    // TODO - FIX THIS
    // This reads the entire avro file. Only the specific blocks required should be read from
    // the avro file (for failure recovery).
    // Load all the delta commits since the last compaction commit, get all the blocks to be
    // loaded, and load them using CompositeAvroLogReader.
    // Since a DeltaCommit is not defined yet, read all the records; revisit this soon.
    String maxInstantTime = metaClient.getActiveTimeline()
        .getTimelineOfActions(CollectionUtils.createSet(
            HoodieTimeline.COMMIT_ACTION, HoodieTimeline.ROLLBACK_ACTION, HoodieTimeline.DELTA_COMMIT_ACTION))
        .filterCompletedInstants()
        .lastInstant()
        .get()
        .getTimestamp();
    long maxMemoryPerCompaction = IOUtils.getMaxMemoryPerCompaction(taskContextSupplier, config);
    LOG.info("MaxMemoryPerCompaction => " + maxMemoryPerCompaction);
    List<String> logFiles = operation.getDeltaFileNames().stream()
        .map(p -> new Path(FSUtils.getPartitionPath(metaClient.getBasePath(), operation.getPartitionPath()), p).toString())
        .collect(toList());
    HoodieMergedLogRecordScanner scanner = HoodieMergedLogRecordScanner.newBuilder()
        .withFileSystem(fs)
        .withBasePath(metaClient.getBasePath())
        .withLogFilePaths(logFiles)
        .withReaderSchema(readerSchema)
        .withLatestInstantTime(maxInstantTime)
        .withMaxMemorySizeInBytes(maxMemoryPerCompaction)
        .withReadBlocksLazily(config.getCompactionLazyBlockReadEnabled())
        .withReverseReader(config.getCompactionReverseLogReadEnabled())
        .withBufferSize(config.getMaxDFSStreamBufferSize())
        .withSpillableMapBasePath(config.getSpillableMapBasePath())
        .withDiskMapType(config.getCommonConfig().getSpillableDiskMapType())
        .withBitCaskDiskMapCompressionEnabled(config.getCommonConfig().isBitCaskDiskMapCompressionEnabled())
        .withOperationField(config.allowOperationMetadataField())
        .withPartition(operation.getPartitionPath())
        .build();
    Option<HoodieBaseFile> oldDataFileOpt = operation.getBaseFile(metaClient.getBasePath(), operation.getPartitionPath());
    // Even if there is nothing to merge from the logs, we still need to carry the base file forward;
    // otherwise the following file slice would lose its base file.
    if (!scanner.iterator().hasNext()) {
        if (!oldDataFileOpt.isPresent()) {
            scanner.close();
            return new ArrayList<>();
        } else {
            // TODO: we may directly rename the original parquet file if there is no evolution/devolution of schema
            /*
            TaskContextSupplier taskContextSupplier = hoodieCopyOnWriteTable.getTaskContextSupplier();
            String newFileName = FSUtils.makeDataFileName(instantTime,
                FSUtils.makeWriteToken(taskContextSupplier.getPartitionIdSupplier().get(), taskContextSupplier.getStageIdSupplier().get(), taskContextSupplier.getAttemptIdSupplier().get()),
                operation.getFileId(), hoodieCopyOnWriteTable.getBaseFileExtension());
            Path oldFilePath = new Path(oldDataFileOpt.get().getPath());
            Path newFilePath = new Path(oldFilePath.getParent(), newFileName);
            FileUtil.copy(fs, oldFilePath, fs, newFilePath, false, fs.getConf());
            */
        }
    }
    // Compacting is very similar to applying updates to an existing file.
    Iterator<List<WriteStatus>> result;
    // If the base file is present, perform updates; otherwise perform inserts into a new base file.
    if (oldDataFileOpt.isPresent()) {
        result = compactionHandler.handleUpdate(instantTime, operation.getPartitionPath(), operation.getFileId(), scanner.getRecords(), oldDataFileOpt.get());
    } else {
        result = compactionHandler.handleInsert(instantTime, operation.getPartitionPath(), operation.getFileId(), scanner.getRecords());
    }
    scanner.close();
    Iterable<List<WriteStatus>> resultIterable = () -> result;
    return StreamSupport.stream(resultIterable.spliterator(), false).flatMap(Collection::stream).peek(s -> {
        s.getStat().setTotalUpdatedRecordsCompacted(scanner.getNumMergedRecordsInLog());
        s.getStat().setTotalLogFilesCompacted(scanner.getTotalLogFiles());
        s.getStat().setTotalLogRecords(scanner.getTotalLogRecords());
        s.getStat().setPartitionPath(operation.getPartitionPath());
        s.getStat().setTotalLogSizeCompacted(operation.getMetrics().get(CompactionStrategy.TOTAL_LOG_FILE_SIZE).longValue());
        s.getStat().setTotalLogBlocks(scanner.getTotalLogBlocks());
        s.getStat().setTotalCorruptLogBlock(scanner.getTotalCorruptBlocks());
        s.getStat().setTotalRollbackBlocks(scanner.getTotalRollbacks());
        RuntimeStats runtimeStats = new RuntimeStats();
        runtimeStats.setTotalScanTime(scanner.getTotalTimeTakenToReadAndMergeBlocks());
        s.getStat().setRuntimeStats(runtimeStats);
    }).collect(toList());
}
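The Option that drives this method is oldDataFileOpt: when a base file is present the handler merges the log records into it via handleUpdate, otherwise it writes a brand-new base file via handleInsert. Below is a minimal, self-contained sketch of that presence-based dispatch, assuming only that hudi-common is on the classpath; the handle method and its return strings are hypothetical stand-ins for the compaction handler, and only Option.of, Option.empty, isPresent, and get come from the example above.

import org.apache.hudi.common.util.Option;

// Minimal sketch of the presence-based dispatch used in compact() above.
// handle() and its return values are illustrative, not Hudi APIs.
public class OptionDispatchSketch {

    static String handle(Option<String> oldBaseFile) {
        // Base file present: merge log records into it (the handleUpdate path).
        if (oldBaseFile.isPresent()) {
            return "update:" + oldBaseFile.get();
        }
        // No base file yet: write a new one from the log records (the handleInsert path).
        return "insert:new-base-file";
    }

    public static void main(String[] args) {
        System.out.println(handle(Option.of("fileId-0001_20240101120000.parquet"))); // update path, illustrative name
        System.out.println(handle(Option.empty()));                                  // insert path
    }
}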

Example 52 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

From the class ScheduleCompactionActionExecutor, the method execute:

@Override
public Option<HoodieCompactionPlan> execute() {
    if (!config.getWriteConcurrencyMode().supportsOptimisticConcurrencyControl() && !config.getFailedWritesCleanPolicy().isLazy()) {
        // TODO(yihua): this validation is removed for Java client used by kafka-connect.  Need to revisit this.
        if (config.getEngineType() != EngineType.JAVA) {
            // If there are inflight writes, their instant time must not be earlier than the compaction instant time.
            table.getActiveTimeline().getCommitsTimeline().filterPendingExcludingCompaction().firstInstant()
                .ifPresent(earliestInflight -> ValidationUtils.checkArgument(
                    HoodieTimeline.compareTimestamps(earliestInflight.getTimestamp(), HoodieTimeline.GREATER_THAN, instantTime),
                    "Earliest write inflight instant time must be later than compaction time. Earliest :" + earliestInflight
                        + ", Compaction scheduled at " + instantTime));
        }
        // Committed and pending compaction instants should have strictly lower timestamps
        List<HoodieInstant> conflictingInstants = table.getActiveTimeline().getWriteTimeline()
            .filterCompletedAndCompactionInstants().getInstants()
            .filter(instant -> HoodieTimeline.compareTimestamps(instant.getTimestamp(), HoodieTimeline.GREATER_THAN_OR_EQUALS, instantTime))
            .collect(Collectors.toList());
        ValidationUtils.checkArgument(conflictingInstants.isEmpty(), "Following instants have timestamps >= compactionInstant (" + instantTime + ") Instants :" + conflictingInstants);
    }
    HoodieCompactionPlan plan = scheduleCompaction();
    if (plan != null && (plan.getOperations() != null) && (!plan.getOperations().isEmpty())) {
        extraMetadata.ifPresent(plan::setExtraMetadata);
        HoodieInstant compactionInstant = new HoodieInstant(HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, instantTime);
        try {
            table.getActiveTimeline().saveToCompactionRequested(compactionInstant, TimelineMetadataUtils.serializeCompactionPlan(plan));
        } catch (IOException ioe) {
            throw new HoodieIOException("Exception scheduling compaction", ioe);
        }
        return Option.of(plan);
    }
    return Option.empty();
}
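The return value follows a common Hudi idiom: Option.of(plan) when the scheduled plan has operations, Option.empty() when there is nothing to compact, so callers never see a null plan. A minimal sketch of producing and consuming that shape follows, assuming hudi-common on the classpath; schedule() and the plan string are hypothetical stand-ins for the executor, not Hudi APIs.

import org.apache.hudi.common.util.Option;

// Sketch of the Option-returning scheduling idiom used by execute() above.
// schedule() is a stand-in; only the Option handling mirrors the real code.
public class SchedulePlanSketch {

    static Option<String> schedule(boolean hasOperations) {
        if (hasOperations) {
            // A plan was built and persisted as a requested compaction instant.
            return Option.of("compaction-plan@20240101120000");
        }
        // Nothing to compact: signal absence instead of returning null.
        return Option.empty();
    }

    public static void main(String[] args) {
        schedule(true).ifPresent(plan -> System.out.println("Requested: " + plan));
        System.out.println("Plan present when idle? " + schedule(false).isPresent()); // false
    }
}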

Example 53 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

From the class TimelineMetadataUtils, the method convertRollbackMetadata:

public static HoodieRollbackMetadata convertRollbackMetadata(String startRollbackTime, Option<Long> durationInMs, List<HoodieInstant> instants, List<HoodieRollbackStat> rollbackStats) {
    Map<String, HoodieRollbackPartitionMetadata> partitionMetadataBuilder = new HashMap<>();
    int totalDeleted = 0;
    for (HoodieRollbackStat stat : rollbackStats) {
        Map<String, Long> rollbackLogFiles = stat.getCommandBlocksCount().keySet().stream()
            .collect(Collectors.toMap(f -> f.getPath().toString(), FileStatus::getLen));
        HoodieRollbackPartitionMetadata metadata = new HoodieRollbackPartitionMetadata(stat.getPartitionPath(), stat.getSuccessDeleteFiles(), stat.getFailedDeleteFiles(), rollbackLogFiles);
        partitionMetadataBuilder.put(stat.getPartitionPath(), metadata);
        totalDeleted += stat.getSuccessDeleteFiles().size();
    }
    return new HoodieRollbackMetadata(startRollbackTime, durationInMs.orElseGet(() -> -1L), totalDeleted,
        instants.stream().map(HoodieInstant::getTimestamp).collect(Collectors.toList()),
        Collections.unmodifiableMap(partitionMetadataBuilder), DEFAULT_VERSION,
        instants.stream().map(instant -> new HoodieInstantInfo(instant.getTimestamp(), instant.getAction())).collect(Collectors.toList()));
}
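The single Option here is durationInMs: when the rollback duration was not measured, orElseGet collapses it to -1L before it is stored in the metadata. A tiny sketch of that defaulting idiom, with made-up values and assuming hudi-common on the classpath:

import org.apache.hudi.common.util.Option;

// Sketch of the Option<Long> defaulting used by convertRollbackMetadata() above.
public class RollbackDurationSketch {
    public static void main(String[] args) {
        Option<Long> measured = Option.of(1520L); // duration was captured
        Option<Long> missing = Option.empty();    // duration unknown

        // Same idiom as durationInMs.orElseGet(() -> -1L) in the example.
        System.out.println(measured.orElseGet(() -> -1L)); // 1520
        System.out.println(missing.orElseGet(() -> -1L));  // -1
    }
}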

Example 54 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

From the class HoodieTestSuiteWriter, the method commitCompaction:

public void commitCompaction(JavaRDD<WriteStatus> records, JavaRDD<DeltaWriteStats> generatedDataStats, Option<String> instantTime) throws IOException {
    if (!cfg.useDeltaStreamer) {
        Map<String, String> extraMetadata = new HashMap<>();
        /**
         * Store the checkpoint in the commit metadata just like
         * {@link HoodieDeltaStreamer#commit(SparkRDDWriteClient, JavaRDD, Option)}
         */
        extraMetadata.put(HoodieDeltaStreamerWrapper.CHECKPOINT_KEY, lastCheckpoint.get());
        if (generatedDataStats != null && generatedDataStats.count() > 1) {
            // Just store the path where this batch of data was generated.
            extraMetadata.put(GENERATED_DATA_PATH, generatedDataStats.map(s -> s.getFilePath()).collect().get(0));
        }
        HoodieSparkTable<HoodieRecordPayload> table = HoodieSparkTable.create(writeClient.getConfig(), writeClient.getEngineContext());
        HoodieCommitMetadata metadata = CompactHelpers.getInstance().createCompactionMetadata(table, instantTime.get(), HoodieJavaRDD.of(records), writeClient.getConfig().getSchema());
        writeClient.commitCompaction(instantTime.get(), metadata, Option.of(extraMetadata));
    }
}
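Note how the extra commit metadata is built as a plain Map and only lifted into an Option at the commitCompaction call, while the instant time already arrives as Option<String>. A small sketch of that wrapping pattern follows, assuming hudi-common on the classpath; the commit method, its key, and its values are hypothetical illustrations, not the SparkRDDWriteClient API.

import java.util.HashMap;
import java.util.Map;
import org.apache.hudi.common.util.Option;

// Sketch of wrapping optional extra metadata before handing it to a commit call.
// commit() is a stand-in for the write client; only the Option usage mirrors the example.
public class ExtraMetadataSketch {

    static void commit(String instant, Option<Map<String, String>> extraMetadata) {
        // A real client would persist this with the commit; here we just print it.
        System.out.println("commit " + instant + " extra=" + extraMetadata.orElseGet(() -> new HashMap<>()));
    }

    public static void main(String[] args) {
        Map<String, String> extra = new HashMap<>();
        extra.put("deltastreamer.checkpoint.key", "batch-42"); // illustrative key and value
        commit("20240101120000", Option.of(extra));
        commit("20240101130000", Option.empty()); // no extra metadata for this commit
    }
}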

Example 55 with Option

use of org.apache.hudi.common.util.Option in project hudi by apache.

From the class TestHoodieDeltaStreamer, the method testCleanerDeleteReplacedDataWithArchive:

@ParameterizedTest
@ValueSource(booleans = { true, false })
public void testCleanerDeleteReplacedDataWithArchive(Boolean asyncClean) throws Exception {
    String tableBasePath = dfsBasePath + "/cleanerDeleteReplacedDataWithArchive" + asyncClean;
    int totalRecords = 3000;
    // Step 1 : Prepare and insert data without archival and cleaner.
    // Make sure that at least 6 commits, including 2 replacecommits, are completed.
    HoodieDeltaStreamer.Config cfg = TestHelpers.makeConfig(tableBasePath, WriteOperationType.INSERT);
    cfg.continuousMode = true;
    cfg.tableType = HoodieTableType.COPY_ON_WRITE.name();
    cfg.configs.addAll(getAsyncServicesConfigs(totalRecords, "false", "true", "2", "", ""));
    cfg.configs.add(String.format("%s=%s", HoodieCompactionConfig.PARQUET_SMALL_FILE_LIMIT.key(), "0"));
    cfg.configs.add(String.format("%s=%s", HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key(), "1"));
    HoodieDeltaStreamer ds = new HoodieDeltaStreamer(cfg, jsc);
    deltaStreamerTestRunner(ds, cfg, (r) -> {
        TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
        return true;
    });
    TestHelpers.assertAtLeastNCommits(6, tableBasePath, dfs);
    TestHelpers.assertAtLeastNReplaceCommits(2, tableBasePath, dfs);
    // Step 2 : Get the first replacecommit and extract the corresponding replaced file IDs.
    HoodieTableMetaClient meta = HoodieTableMetaClient.builder().setConf(dfs.getConf()).setBasePath(tableBasePath).build();
    HoodieTimeline replacedTimeline = meta.reloadActiveTimeline().getCompletedReplaceTimeline();
    Option<HoodieInstant> firstReplaceHoodieInstant = replacedTimeline.nthFromLastInstant(1);
    assertTrue(firstReplaceHoodieInstant.isPresent());
    Option<byte[]> firstReplaceHoodieInstantDetails = replacedTimeline.getInstantDetails(firstReplaceHoodieInstant.get());
    HoodieReplaceCommitMetadata firstReplaceMetadata = HoodieReplaceCommitMetadata.fromBytes(firstReplaceHoodieInstantDetails.get(), HoodieReplaceCommitMetadata.class);
    Map<String, List<String>> partitionToReplaceFileIds = firstReplaceMetadata.getPartitionToReplaceFileIds();
    String partitionName = null;
    List replacedFileIDs = null;
    for (Map.Entry entry : partitionToReplaceFileIds.entrySet()) {
        partitionName = String.valueOf(entry.getKey());
        replacedFileIDs = (List) entry.getValue();
    }
    assertNotNull(partitionName);
    assertNotNull(replacedFileIDs);
    // Step 3 : Based on replacedFileIDs, get the corresponding complete paths.
    ArrayList<String> replacedFilePaths = new ArrayList<>();
    Path partitionPath = new Path(meta.getBasePath(), partitionName);
    RemoteIterator<LocatedFileStatus> hoodieFiles = meta.getFs().listFiles(partitionPath, true);
    while (hoodieFiles.hasNext()) {
        LocatedFileStatus f = hoodieFiles.next();
        String file = f.getPath().toUri().toString();
        for (Object replacedFileID : replacedFileIDs) {
            if (file.contains(String.valueOf(replacedFileID))) {
                replacedFilePaths.add(file);
            }
        }
    }
    assertFalse(replacedFilePaths.isEmpty());
    // Step 4 : Insert 1 record and trigger sync/async cleaner and archive.
    List<String> configs = getAsyncServicesConfigs(1, "true", "true", "2", "", "");
    configs.add(String.format("%s=%s", HoodieCompactionConfig.CLEANER_POLICY.key(), "KEEP_LATEST_COMMITS"));
    configs.add(String.format("%s=%s", HoodieCompactionConfig.CLEANER_COMMITS_RETAINED.key(), "1"));
    configs.add(String.format("%s=%s", HoodieCompactionConfig.MIN_COMMITS_TO_KEEP.key(), "2"));
    configs.add(String.format("%s=%s", HoodieCompactionConfig.MAX_COMMITS_TO_KEEP.key(), "3"));
    configs.add(String.format("%s=%s", HoodieCompactionConfig.ASYNC_CLEAN.key(), asyncClean));
    configs.add(String.format("%s=%s", HoodieMetadataConfig.COMPACT_NUM_DELTA_COMMITS.key(), "1"));
    if (asyncClean) {
        configs.add(String.format("%s=%s", HoodieWriteConfig.WRITE_CONCURRENCY_MODE.key(), WriteConcurrencyMode.OPTIMISTIC_CONCURRENCY_CONTROL.name()));
        configs.add(String.format("%s=%s", HoodieCompactionConfig.FAILED_WRITES_CLEANER_POLICY.key(), HoodieFailedWritesCleaningPolicy.LAZY.name()));
        configs.add(String.format("%s=%s", HoodieLockConfig.LOCK_PROVIDER_CLASS_NAME.key(), InProcessLockProvider.class.getName()));
    }
    cfg.configs = configs;
    cfg.continuousMode = false;
    ds = new HoodieDeltaStreamer(cfg, jsc);
    ds.sync();
    // Step 5 : Make sure that firstReplaceHoodieInstant is archived.
    long count = meta.reloadActiveTimeline().getCompletedReplaceTimeline().getInstants()
        .filter(instant -> firstReplaceHoodieInstant.get().equals(instant)).count();
    assertEquals(0, count);
    // Step 6 : All the replaced files in firstReplaceHoodieInstant should be deleted through sync/async cleaner.
    for (String replacedFilePath : replacedFilePaths) {
        assertFalse(meta.getFs().exists(new Path(replacedFilePath)));
    }
}
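On the test side, Option appears in the timeline lookups (nthFromLastInstant, getInstantDetails): the test asserts presence before unwrapping with get(). A minimal sketch of that look-up-then-assert pattern, assuming hudi-common on the classpath; lookupReplaceCommit and its hard-coded timestamps are hypothetical stand-ins for the timeline calls, not Hudi APIs.

import org.apache.hudi.common.util.Option;

// Sketch of the look-up-then-assert Option handling used in the test above.
// lookupReplaceCommit() is a stand-in for calls such as nthFromLastInstant(1).
public class TimelineLookupSketch {

    static Option<String> lookupReplaceCommit(int nthFromLast) {
        // Pretend timeline with two completed replacecommit timestamps (oldest first).
        String[] replaceCommits = {"20240101090000", "20240101100000"};
        int index = replaceCommits.length - 1 - nthFromLast; // 0 = latest, 1 = one before it, ...
        if (index >= 0) {
            return Option.of(replaceCommits[index]);
        }
        return Option.empty();
    }

    public static void main(String[] args) {
        Option<String> first = lookupReplaceCommit(1);
        if (!first.isPresent()) { // mirrors assertTrue(firstReplaceHoodieInstant.isPresent())
            throw new AssertionError("expected at least one replacecommit");
        }
        System.out.println("replacecommit one before the latest: " + first.get()); // 20240101090000
    }
}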

Aggregations

Option (org.apache.hudi.common.util.Option): 105
List (java.util.List): 84
IOException (java.io.IOException): 70
Collectors (java.util.stream.Collectors): 69
Map (java.util.Map): 67
ArrayList (java.util.ArrayList): 61
Path (org.apache.hadoop.fs.Path): 59
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant): 59
Pair (org.apache.hudi.common.util.collection.Pair): 59
HashMap (java.util.HashMap): 58
HoodieTimeline (org.apache.hudi.common.table.timeline.HoodieTimeline): 58
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient): 56
LogManager (org.apache.log4j.LogManager): 54
Logger (org.apache.log4j.Logger): 54
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig): 53
HoodieRecord (org.apache.hudi.common.model.HoodieRecord): 46
HoodieIOException (org.apache.hudi.exception.HoodieIOException): 44
Arrays (java.util.Arrays): 43
FSUtils (org.apache.hudi.common.fs.FSUtils): 43
Collections (java.util.Collections): 39