Example 11 with HoodieCompactionOperation

Use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.

From the class HoodieCompactor, method generateCompactionPlan.

/**
 * Generate a new compaction plan for scheduling.
 *
 * @param context                               HoodieEngineContext
 * @param hoodieTable                           Hoodie Table
 * @param config                                Hoodie Write Configuration
 * @param compactionCommitTime                  scheduled compaction commit time
 * @param fgIdsInPendingCompactionAndClustering partition-fileId pairs for which compaction is pending
 * @return Compaction Plan
 * @throws IOException when encountering errors
 */
HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, HoodieTable<T, I, K, O> hoodieTable, HoodieWriteConfig config, String compactionCommitTime, Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering) throws IOException {
    // Accumulator to keep track of total log files for a table
    HoodieAccumulator totalLogFiles = context.newAccumulator();
    // Accumulator to keep track of total log file slices for a table
    HoodieAccumulator totalFileSlices = context.newAccumulator();
    ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ, "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not " + hoodieTable.getMetaClient().getTableType().name());
    // TODO : check if maxMemory is not greater than JVM or executor memory
    // TODO - rollback any compactions in flight
    HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
    LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
    List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath());
    // filter the partition paths if needed to reduce list status
    partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
    if (partitionPaths.isEmpty()) {
        // In case no partitions could be picked, return no compaction plan
        return null;
    }
    SliceView fileSystemView = hoodieTable.getSliceView();
    LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
    context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact");
    List<HoodieCompactionOperation> operations = context.flatMap(partitionPaths, partitionPath ->
        fileSystemView.getLatestFileSlices(partitionPath)
            .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId()))
            .map(s -> {
                List<HoodieLogFile> logFiles = s.getLogFiles()
                    .sorted(HoodieLogFile.getLogFileComparator()).collect(toList());
                totalLogFiles.add(logFiles.size());
                totalFileSlices.add(1L);
                // Avro-generated classes do not implement Serializable, so use the
                // CompactionOperation POJO for distributed map operations and convert to the
                // Avro-generated classes only when persisting the plan into meta files.
                Option<HoodieBaseFile> dataFile = s.getBaseFile();
                return new CompactionOperation(dataFile, partitionPath, logFiles,
                    config.getCompactionStrategy().captureMetrics(config, s));
            })
            .filter(c -> !c.getDeltaFileNames().isEmpty()), partitionPaths.size())
        .stream().map(CompactionUtils::buildHoodieCompactionOperation).collect(toList());
    LOG.info("Total of " + operations.size() + " compactions are retrieved");
    LOG.info("Total number of latest files slices " + totalFileSlices.value());
    LOG.info("Total number of log files " + totalLogFiles.value());
    LOG.info("Total number of file slices " + totalFileSlices.value());
    // Filter the compactions with the passed in filter. This lets us choose most effective
    // compactions only
    HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
        CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
    ValidationUtils.checkArgument(
        compactionPlan.getOperations().stream().noneMatch(
            op -> fgIdsInPendingCompactionAndClustering.contains(
                new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
        "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
            + "Please fix your strategy implementation. FileIdsWithPendingCompactions :"
            + fgIdsInPendingCompactionAndClustering + ", Selected workload :" + compactionPlan);
    if (compactionPlan.getOperations().isEmpty()) {
        LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
    }
    return compactionPlan;
}
Also used : HoodieTable(org.apache.hudi.table.HoodieTable) HoodieAvroUtils(org.apache.hudi.avro.HoodieAvroUtils) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) CollectionUtils(org.apache.hudi.common.util.CollectionUtils) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) HoodieTableType(org.apache.hudi.common.model.HoodieTableType) HoodieAccumulator(org.apache.hudi.common.data.HoodieAccumulator) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) RuntimeStats(org.apache.hudi.common.model.HoodieWriteStat.RuntimeStats) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) StreamSupport(java.util.stream.StreamSupport) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) HoodieData(org.apache.hudi.common.data.HoodieData) TableSchemaResolver(org.apache.hudi.common.table.TableSchemaResolver) HoodieMergedLogRecordScanner(org.apache.hudi.common.table.log.HoodieMergedLogRecordScanner) Schema(org.apache.avro.Schema) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Iterator(java.util.Iterator) TaskContextSupplier(org.apache.hudi.common.engine.TaskContextSupplier) Collection(java.util.Collection) Set(java.util.Set) IOException(java.io.IOException) CompactionStrategy(org.apache.hudi.table.action.compact.strategy.CompactionStrategy) Serializable(java.io.Serializable) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) WriteStatus(org.apache.hudi.client.WriteStatus) HoodieRecordPayload(org.apache.hudi.common.model.HoodieRecordPayload) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) HoodieCompactionHandler(org.apache.hudi.table.HoodieCompactionHandler) List(java.util.List) Collectors.toList(java.util.stream.Collectors.toList) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) SliceView(org.apache.hudi.common.table.view.TableFileSystemView.SliceView) IOUtils(org.apache.hudi.io.IOUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) CompactionUtils(org.apache.hudi.common.util.CompactionUtils) Pair(org.apache.hudi.common.util.collection.Pair)
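
The plan produced above is typically persisted to the timeline as a requested compaction instant. Below is a minimal, hypothetical sketch of that hand-off, assuming a `compactor` variable holding the HoodieCompactor above and using the TimelineMetadataUtils and HoodieActiveTimeline APIs that appear in these examples; it is illustrative, not the project's actual scheduling code.

HoodieCompactionPlan plan = compactor.generateCompactionPlan(
    context, hoodieTable, config, compactionCommitTime, fgIdsInPendingCompactionAndClustering);
if (plan != null && !plan.getOperations().isEmpty()) {
    // Persist the plan as a REQUESTED compaction instant on the active timeline.
    HoodieInstant requested = new HoodieInstant(
        HoodieInstant.State.REQUESTED, HoodieTimeline.COMPACTION_ACTION, compactionCommitTime);
    hoodieTable.getActiveTimeline().saveToCompactionRequested(
        requested, TimelineMetadataUtils.serializeCompactionPlan(plan));
}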

Example 12 with HoodieCompactionOperation

Use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.

From the class CompactionV1MigrationHandler, method downgradeFrom.

@Override
public HoodieCompactionPlan downgradeFrom(HoodieCompactionPlan input) {
    ValidationUtils.checkArgument(input.getVersion() == 2, "Input version is " + input.getVersion() + ". Must be 2");
    HoodieCompactionPlan compactionPlan = new HoodieCompactionPlan();
    final Path basePath = new Path(metaClient.getBasePath());
    List<HoodieCompactionOperation> v1CompactionOperationList = new ArrayList<>();
    if (null != input.getOperations()) {
        v1CompactionOperationList = input.getOperations().stream()
            .map(inp -> HoodieCompactionOperation.newBuilder()
                .setBaseInstantTime(inp.getBaseInstantTime())
                .setFileId(inp.getFileId())
                .setPartitionPath(inp.getPartitionPath())
                .setMetrics(inp.getMetrics())
                .setDataFilePath(convertToV1Path(basePath, inp.getPartitionPath(), inp.getDataFilePath()))
                .setDeltaFilePaths(inp.getDeltaFilePaths().stream()
                    .map(s -> convertToV1Path(basePath, inp.getPartitionPath(), s))
                    .collect(Collectors.toList()))
                .build())
            .collect(Collectors.toList());
    }
    compactionPlan.setOperations(v1CompactionOperationList);
    compactionPlan.setExtraMetadata(input.getExtraMetadata());
    compactionPlan.setVersion(getManagedVersion());
    return compactionPlan;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) ArrayList(java.util.ArrayList)
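
The convertToV1Path helper is not shown in the snippet. A plausible reconstruction is sketched below, assuming it simply re-expands a bare file name into a full path under the partition directory (the inverse of the V2 handler's Path.getName() in the next example); the name-handling details are inferred, not copied from the source.

// Hypothetical reconstruction: V1 plans stored full paths, so rebuild the path
// from base path + partition path + file name. Null/empty names pass through.
private static String convertToV1Path(Path basePath, String partitionPath, String fileName) {
    if (fileName == null || fileName.isEmpty()) {
        return fileName;
    }
    return new Path(FSUtils.getPartitionPath(basePath, partitionPath), fileName).toString();
}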

Example 13 with HoodieCompactionOperation

Use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.

From the class CompactionV2MigrationHandler, method upgradeFrom.

@Override
public HoodieCompactionPlan upgradeFrom(HoodieCompactionPlan input) {
    ValidationUtils.checkArgument(input.getVersion() == 1, "Input version is " + input.getVersion() + ". Must be 1");
    HoodieCompactionPlan compactionPlan = new HoodieCompactionPlan();
    List<HoodieCompactionOperation> v2CompactionOperationList = new ArrayList<>();
    if (null != input.getOperations()) {
        v2CompactionOperationList = input.getOperations().stream()
            .map(inp -> HoodieCompactionOperation.newBuilder()
                .setBaseInstantTime(inp.getBaseInstantTime())
                .setFileId(inp.getFileId())
                .setPartitionPath(inp.getPartitionPath())
                .setMetrics(inp.getMetrics())
                .setDataFilePath(inp.getDataFilePath() == null ? null : new Path(inp.getDataFilePath()).getName())
                .setDeltaFilePaths(inp.getDeltaFilePaths().stream()
                    .map(s -> new Path(s).getName())
                    .collect(Collectors.toList()))
                .build())
            .collect(Collectors.toList());
    }
    compactionPlan.setOperations(v2CompactionOperationList);
    compactionPlan.setExtraMetadata(input.getExtraMetadata());
    compactionPlan.setVersion(getManagedVersion());
    return compactionPlan;
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) ArrayList(java.util.ArrayList)
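
Callers normally do not invoke the version handlers directly; a migrator dispatches to the appropriate upgradeFrom/downgradeFrom chain. A brief usage sketch, assuming CompactionPlanMigrator (imported in Example 14) exposes an upgradeToLatest method as Hudi's metadata migrators do:

// Read a plan of unknown version from the timeline, then normalize it to the
// latest plan version before use (illustrative; error handling elided).
CompactionPlanMigrator migrator = new CompactionPlanMigrator(metaClient);
HoodieCompactionPlan latestPlan = migrator.upgradeToLatest(plan, plan.getVersion());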

Example 14 with HoodieCompactionOperation

Use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.

From the class CompactionUtils, method getAllPendingCompactionOperations.

/**
 * Get all partition-path + file-id pairs that have pending compaction operations, together with their target
 * compaction instant times.
 *
 * @param metaClient Hoodie Table Meta Client
 * @return mapping from file group id to a pair of (target compaction instant time, pending compaction operation)
 */
public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> getAllPendingCompactionOperations(HoodieTableMetaClient metaClient) {
    List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactionPlanWithInstants = getAllPendingCompactionPlans(metaClient);
    Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToPendingCompactionWithInstantMap = new HashMap<>();
    pendingCompactionPlanWithInstants.stream()
        .flatMap(instantPlanPair ->
            getPendingCompactionOperations(instantPlanPair.getKey(), instantPlanPair.getValue()))
        .forEach(pair -> {
        // A file group can be seen twice when the same pending plan is read more than once,
        // e.g. via eventually consistent listings on some DFSs. Identical duplicates are
        // tolerated; two different pending operations for one file group are an invalid state.
        if (fgIdToPendingCompactionWithInstantMap.containsKey(pair.getKey())) {
            HoodieCompactionOperation operation = pair.getValue().getValue();
            HoodieCompactionOperation anotherOperation = fgIdToPendingCompactionWithInstantMap.get(pair.getKey()).getValue();
            if (!operation.equals(anotherOperation)) {
                String msg = "Hudi File Id (" + pair.getKey() + ") has more than 1 pending compactions. Instants: " + pair.getValue() + ", " + fgIdToPendingCompactionWithInstantMap.get(pair.getKey());
                throw new IllegalStateException(msg);
            }
        }
        fgIdToPendingCompactionWithInstantMap.put(pair.getKey(), pair.getValue());
    });
    return fgIdToPendingCompactionWithInstantMap;
}
Also used : CompactionPlanMigrator(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionPlanMigrator) HoodieInstant(org.apache.hudi.common.table.timeline.HoodieInstant) FileSlice(org.apache.hudi.common.model.FileSlice) HoodieException(org.apache.hudi.exception.HoodieException) HashMap(java.util.HashMap) CompactionV1MigrationHandler(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionV1MigrationHandler) Function(java.util.function.Function) Logger(org.apache.log4j.Logger) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) CompactionV2MigrationHandler(org.apache.hudi.common.table.timeline.versioning.compaction.CompactionV2MigrationHandler) Map(java.util.Map) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) HoodieTimeline(org.apache.hudi.common.table.timeline.HoodieTimeline) TimelineMetadataUtils(org.apache.hudi.common.table.timeline.TimelineMetadataUtils) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) CompactionOperation(org.apache.hudi.common.model.CompactionOperation) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) List(java.util.List) Stream(java.util.stream.Stream) HoodieCompactionPlan(org.apache.hudi.avro.model.HoodieCompactionPlan) LogManager(org.apache.log4j.LogManager) Pair(org.apache.hudi.common.util.collection.Pair)
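
The returned map makes it cheap to guard against scheduling a second compaction for a file group that already has one pending. A minimal sketch (the fgId, partitionPath, and fileId names here are illustrative, not from the source):

Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pending =
    CompactionUtils.getAllPendingCompactionOperations(metaClient);
HoodieFileGroupId fgId = new HoodieFileGroupId(partitionPath, fileId);
if (pending.containsKey(fgId)) {
    // The pair's key is the target compaction instant time; its value is the operation.
    throw new IllegalStateException("File group " + fgId
        + " already has a pending compaction at instant " + pending.get(fgId).getKey());
}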

Example 15 with HoodieCompactionOperation

Use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.

From the class TestHoodieCompactionStrategy, method testBoundedIOSimple.

@Test
public void testBoundedIOSimple() {
    Map<Long, List<Long>> sizesMap = new HashMap<>();
    sizesMap.put(120 * MB, Arrays.asList(60 * MB, 10 * MB, 80 * MB));
    sizesMap.put(110 * MB, new ArrayList<>());
    sizesMap.put(100 * MB, Collections.singletonList(MB));
    sizesMap.put(90 * MB, Collections.singletonList(1024 * MB));
    BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy();
    HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
        .withCompactionConfig(HoodieCompactionConfig.newBuilder()
            .withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
        .build();
    List<HoodieCompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
    List<HoodieCompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>());
    assertTrue(returned.size() < operations.size(), "BoundedIOCompaction should have resulted in fewer compactions");
    assertEquals(2, returned.size(), "BoundedIOCompaction should have resulted in 2 compactions being chosen");
    // Total IO (in MB) across the chosen compactions, summed from the strategy's metrics
    Long returnedSize = returned.stream()
        .map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
        .map(Double::longValue)
        .reduce(Long::sum)
        .orElse(0L);
    assertEquals(610, (long) returnedSize,
        "Should choose the first 2 compactions, which should result in a total IO of 610 MB");
}
Also used : Arrays(java.util.Arrays) HoodieWriteConfig(org.apache.hudi.config.HoodieWriteConfig) Date(java.util.Date) FileSlice(org.apache.hudi.common.model.FileSlice) SimpleDateFormat(java.text.SimpleDateFormat) HashMap(java.util.HashMap) Random(java.util.Random) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) ArrayList(java.util.ArrayList) HoodieCompactionConfig(org.apache.hudi.config.HoodieCompactionConfig) Test(org.junit.jupiter.api.Test) HoodieCompactionOperation(org.apache.hudi.avro.model.HoodieCompactionOperation) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) List(java.util.List) BaseFile(org.apache.hudi.common.model.BaseFile) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) Map(java.util.Map) Assertions.assertTrue(org.junit.jupiter.api.Assertions.assertTrue) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) Collections(java.util.Collections) HoodieFileGroupId(org.apache.hudi.common.model.HoodieFileGroupId) Pair(org.apache.hudi.common.util.collection.Pair)
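
The test relies on a createCompactionOperations helper that is not shown. A hypothetical reconstruction is sketched below: each map entry (base file size mapped to log file sizes) becomes one operation, and a rough read-side IO estimate stands in for the metrics that the strategy's captureMetrics would normally compute. The partition path, instant time, MB constant, and IO formula are all assumptions, not the test's actual fixture code.

private static final long MB = 1024 * 1024;

// Hypothetical fixture: builds one HoodieCompactionOperation per sizesMap entry,
// populating only the TOTAL_IO_MB metric that BoundedIOCompactionStrategy consumes.
private static List<HoodieCompactionOperation> createCompactionOperations(
        HoodieWriteConfig config, Map<Long, List<Long>> sizesMap) {
    List<HoodieCompactionOperation> operations = new ArrayList<>(sizesMap.size());
    sizesMap.forEach((baseFileSize, logFileSizes) -> {
        long totalBytes = baseFileSize + logFileSizes.stream().mapToLong(Long::longValue).sum();
        operations.add(HoodieCompactionOperation.newBuilder()
            .setBaseInstantTime("000")
            .setFileId(UUID.randomUUID().toString())
            .setPartitionPath("2017/01/01")
            .setDeltaFilePaths(new ArrayList<>())
            // Assumed estimate; the real strategy also accounts for the rewrite cost.
            .setMetrics(Collections.singletonMap(
                BoundedIOCompactionStrategy.TOTAL_IO_MB, (double) (totalBytes / MB)))
            .build());
    });
    return operations;
}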

Aggregations

HoodieCompactionOperation (org.apache.hudi.avro.model.HoodieCompactionOperation) 21
ArrayList (java.util.ArrayList) 16
List (java.util.List) 13
HashMap (java.util.HashMap) 11
Test (org.junit.jupiter.api.Test) 10
HoodieCompactionPlan (org.apache.hudi.avro.model.HoodieCompactionPlan) 9
FileSlice (org.apache.hudi.common.model.FileSlice) 9
HoodieWriteConfig (org.apache.hudi.config.HoodieWriteConfig) 9
Map (java.util.Map) 7
Collectors (java.util.stream.Collectors) 7
Path (org.apache.hadoop.fs.Path) 7
HoodieFileGroupId (org.apache.hudi.common.model.HoodieFileGroupId) 7
HoodieLogFile (org.apache.hudi.common.model.HoodieLogFile) 7
Pair (org.apache.hudi.common.util.collection.Pair) 7
HoodieBaseFile (org.apache.hudi.common.model.HoodieBaseFile) 6
IOException (java.io.IOException) 5
SimpleDateFormat (java.text.SimpleDateFormat) 5
Date (java.util.Date) 5
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 5
HoodieInstant (org.apache.hudi.common.table.timeline.HoodieInstant) 5