use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.
the class HoodieCompactor method generateCompactionPlan.
/**
* Generate a new compaction plan for scheduling.
*
* @param context HoodieEngineContext
* @param hoodieTable Hoodie Table
* @param config Hoodie Write Configuration
* @param compactionCommitTime scheduled compaction commit time
* @param fgIdsInPendingCompactionAndClustering file group ids for which a compaction or clustering is already pending
* @return Compaction Plan
* @throws IOException when encountering errors
*/
HoodieCompactionPlan generateCompactionPlan(HoodieEngineContext context, HoodieTable<T, I, K, O> hoodieTable,
    HoodieWriteConfig config, String compactionCommitTime,
    Set<HoodieFileGroupId> fgIdsInPendingCompactionAndClustering) throws IOException {
// Accumulator to keep track of total log files for a table
HoodieAccumulator totalLogFiles = context.newAccumulator();
// Accumulator to keep track of total log file slices for a table
HoodieAccumulator totalFileSlices = context.newAccumulator();
ValidationUtils.checkArgument(hoodieTable.getMetaClient().getTableType() == HoodieTableType.MERGE_ON_READ,
    "Can only compact table of type " + HoodieTableType.MERGE_ON_READ + " and not "
        + hoodieTable.getMetaClient().getTableType().name());
// TODO : check if maxMemory is not greater than JVM or executor memory
// TODO - rollback any compactions in flight
HoodieTableMetaClient metaClient = hoodieTable.getMetaClient();
LOG.info("Compacting " + metaClient.getBasePath() + " with commit " + compactionCommitTime);
List<String> partitionPaths = FSUtils.getAllPartitionPaths(context, config.getMetadataConfig(), metaClient.getBasePath());
// filter the partition paths if needed to reduce list status
partitionPaths = config.getCompactionStrategy().filterPartitionPaths(config, partitionPaths);
if (partitionPaths.isEmpty()) {
// In case no partitions could be picked, return no compaction plan
return null;
}
SliceView fileSystemView = hoodieTable.getSliceView();
LOG.info("Compaction looking for files to compact in " + partitionPaths + " partitions");
context.setJobStatus(this.getClass().getSimpleName(), "Looking for files to compact");
List<HoodieCompactionOperation> operations = context.flatMap(partitionPaths, partitionPath -> fileSystemView
    .getLatestFileSlices(partitionPath)
    .filter(slice -> !fgIdsInPendingCompactionAndClustering.contains(slice.getFileGroupId()))
    .map(s -> {
      List<HoodieLogFile> logFiles = s.getLogFiles().sorted(HoodieLogFile.getLogFileComparator()).collect(toList());
      totalLogFiles.add(logFiles.size());
      totalFileSlices.add(1L);
      // Avro generated classes do not implement Serializable. Use the CompactionOperation POJO for the map
      // operations and collect the results into Avro generated classes only at the end, for storing in meta files.
      Option<HoodieBaseFile> dataFile = s.getBaseFile();
      return new CompactionOperation(dataFile, partitionPath, logFiles,
          config.getCompactionStrategy().captureMetrics(config, s));
    })
    .filter(c -> !c.getDeltaFileNames().isEmpty()), partitionPaths.size())
    .stream()
    .map(CompactionUtils::buildHoodieCompactionOperation)
    .collect(toList());
LOG.info("Total of " + operations.size() + " compactions are retrieved");
LOG.info("Total number of latest files slices " + totalFileSlices.value());
LOG.info("Total number of log files " + totalLogFiles.value());
LOG.info("Total number of file slices " + totalFileSlices.value());
// Filter the compactions with the passed in filter. This lets us choose most effective
// compactions only
HoodieCompactionPlan compactionPlan = config.getCompactionStrategy().generateCompactionPlan(config, operations,
    CompactionUtils.getAllPendingCompactionPlans(metaClient).stream().map(Pair::getValue).collect(toList()));
ValidationUtils.checkArgument(compactionPlan.getOperations().stream().noneMatch(
    op -> fgIdsInPendingCompactionAndClustering.contains(new HoodieFileGroupId(op.getPartitionPath(), op.getFileId()))),
    "Bad Compaction Plan. FileId MUST NOT have multiple pending compactions. "
        + "Please fix your strategy implementation. FileIdsWithPendingCompactions :" + fgIdsInPendingCompactionAndClustering
        + ", Selected workload :" + compactionPlan);
if (compactionPlan.getOperations().isEmpty()) {
LOG.warn("After filtering, Nothing to compact for " + metaClient.getBasePath());
}
return compactionPlan;
}
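As the in-line comment notes, the plan is first assembled from serializable CompactionOperation POJOs and only converted to the Avro model (via CompactionUtils.buildHoodieCompactionOperation) at the very end. For orientation, a minimal hand-written fragment that builds an Avro HoodieCompactionOperation directly through its generated builder might look as follows; every literal value (instant time, partition, file names, metric) is a placeholder, not taken from the Hudi sources.
// Sketch only: all literals below are placeholders.
Map<String, Double> metrics = new HashMap<>();
metrics.put("TOTAL_IO_MB", 390.0);                          // metric captured by the compaction strategy
HoodieCompactionOperation op = HoodieCompactionOperation.newBuilder()
    .setBaseInstantTime("20220101000000")                   // instant of the base file being compacted
    .setPartitionPath("2022/01/01")                         // partition the file group lives in
    .setFileId("fileId-0001")                               // file group id within the partition
    .setDataFilePath("fileId-0001_1-0-1_20220101000000.parquet")           // base (data) file
    .setDeltaFilePaths(Arrays.asList(".fileId-0001_20220101000000.log.1")) // log files to merge
    .setMetrics(metrics)
    .build();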
use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.
the class CompactionV1MigrationHandler method downgradeFrom.
@Override
public HoodieCompactionPlan downgradeFrom(HoodieCompactionPlan input) {
ValidationUtils.checkArgument(input.getVersion() == 2, "Input version is " + input.getVersion() + ". Must be 2");
HoodieCompactionPlan compactionPlan = new HoodieCompactionPlan();
final Path basePath = new Path(metaClient.getBasePath());
List<HoodieCompactionOperation> v1CompactionOperationList = new ArrayList<>();
if (null != input.getOperations()) {
v1CompactionOperationList = input.getOperations().stream().map(inp ->
    HoodieCompactionOperation.newBuilder()
        .setBaseInstantTime(inp.getBaseInstantTime())
        .setFileId(inp.getFileId())
        .setPartitionPath(inp.getPartitionPath())
        .setMetrics(inp.getMetrics())
        .setDataFilePath(convertToV1Path(basePath, inp.getPartitionPath(), inp.getDataFilePath()))
        .setDeltaFilePaths(inp.getDeltaFilePaths().stream()
            .map(s -> convertToV1Path(basePath, inp.getPartitionPath(), s))
            .collect(Collectors.toList()))
        .build())
    .collect(Collectors.toList());
}
compactionPlan.setOperations(v1CompactionOperationList);
compactionPlan.setExtraMetadata(input.getExtraMetadata());
compactionPlan.setVersion(getManagedVersion());
return compactionPlan;
}
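convertToV1Path (defined elsewhere in the handler) re-qualifies a V2-style bare file name against the table base path and partition path, since the V1 plan format stored absolute paths while V2 stores file names only. A plausible sketch of such a helper, assuming exactly that behaviour, is shown below; the real Hudi implementation may differ in details such as null handling.
// Hypothetical sketch of convertToV1Path: prefix a bare file name with
// <basePath>/<partitionPath> to recover an absolute V1-style path.
private static String convertToV1Path(Path basePath, String partitionPath, String fileName) {
  if (fileName == null || fileName.isEmpty()) {
    return fileName;
  }
  return new Path(new Path(basePath, partitionPath), fileName).toString();
}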
use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.
the class CompactionV2MigrationHandler method upgradeFrom.
@Override
public HoodieCompactionPlan upgradeFrom(HoodieCompactionPlan input) {
ValidationUtils.checkArgument(input.getVersion() == 1, "Input version is " + input.getVersion() + ". Must be 1");
HoodieCompactionPlan compactionPlan = new HoodieCompactionPlan();
List<HoodieCompactionOperation> v2CompactionOperationList = new ArrayList<>();
if (null != input.getOperations()) {
v2CompactionOperationList = input.getOperations().stream().map(inp ->
    HoodieCompactionOperation.newBuilder()
        .setBaseInstantTime(inp.getBaseInstantTime())
        .setFileId(inp.getFileId())
        .setPartitionPath(inp.getPartitionPath())
        .setMetrics(inp.getMetrics())
        .setDataFilePath(inp.getDataFilePath() == null ? null : new Path(inp.getDataFilePath()).getName())
        .setDeltaFilePaths(inp.getDeltaFilePaths().stream()
            .map(s -> new Path(s).getName())
            .collect(Collectors.toList()))
        .build())
    .collect(Collectors.toList());
}
compactionPlan.setOperations(v2CompactionOperationList);
compactionPlan.setExtraMetadata(input.getExtraMetadata());
compactionPlan.setVersion(getManagedVersion());
return compactionPlan;
}
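The upgrade goes the other way and keeps only the file names: each absolute V1 path is reduced to its last component through Hadoop's Path.getName(). For example (the path below is a made-up placeholder):
// Illustration of the reduction applied during the V1 -> V2 upgrade.
String v1Path = "hdfs://nn/tables/trips/2022/01/01/fileId-0001_1-0-1_20220101000000.parquet";
String v2Name = new Path(v1Path).getName();
// v2Name is now "fileId-0001_1-0-1_20220101000000.parquet"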
use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.
the class CompactionUtils method getAllPendingCompactionOperations.
/**
 * Get all partition-path + file-id pairs that have pending compaction operations, together with their target
 * compaction instant time.
 *
 * @param metaClient Hoodie Table Meta Client
 * @return mapping from file group id to (compaction instant time, compaction operation)
 */
public static Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> getAllPendingCompactionOperations(HoodieTableMetaClient metaClient) {
List<Pair<HoodieInstant, HoodieCompactionPlan>> pendingCompactionPlanWithInstants = getAllPendingCompactionPlans(metaClient);
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> fgIdToPendingCompactionWithInstantMap = new HashMap<>();
pendingCompactionPlanWithInstants.stream()
    .flatMap(instantPlanPair -> getPendingCompactionOperations(instantPlanPair.getKey(), instantPlanPair.getValue()))
    .forEach(pair -> {
      // Defensive check: a single file group must not have two different pending compaction operations.
      // An identical duplicate entry is tolerated, as it can result from the eventual nature of the move
      // operation on some DFSs.
      if (fgIdToPendingCompactionWithInstantMap.containsKey(pair.getKey())) {
        HoodieCompactionOperation operation = pair.getValue().getValue();
        HoodieCompactionOperation anotherOperation = fgIdToPendingCompactionWithInstantMap.get(pair.getKey()).getValue();
        if (!operation.equals(anotherOperation)) {
          String msg = "Hudi File Id (" + pair.getKey() + ") has more than 1 pending compactions. Instants: "
              + pair.getValue() + ", " + fgIdToPendingCompactionWithInstantMap.get(pair.getKey());
          throw new IllegalStateException(msg);
        }
      }
      fgIdToPendingCompactionWithInstantMap.put(pair.getKey(), pair.getValue());
    });
return fgIdToPendingCompactionWithInstantMap;
}
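A typical caller uses the returned map to check whether a file group already has a compaction scheduled before planning new work against it. A minimal sketch, with a placeholder partition path and file id:
Map<HoodieFileGroupId, Pair<String, HoodieCompactionOperation>> pending =
    CompactionUtils.getAllPendingCompactionOperations(metaClient);
HoodieFileGroupId fgId = new HoodieFileGroupId("2022/01/01", "fileId-0001"); // placeholder partition path / file id
if (pending.containsKey(fgId)) {
  // Left of the pair: the compaction instant targeting this file group; right: the scheduled operation itself.
  String instantTime = pending.get(fgId).getKey();
  HoodieCompactionOperation scheduledOp = pending.get(fgId).getValue();
  // e.g. skip scheduling this file group again, or route incoming updates to the pending compaction.
}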
use of org.apache.hudi.avro.model.HoodieCompactionOperation in project hudi by apache.
the class TestHoodieCompactionStrategy method testBoundedIOSimple.
@Test
public void testBoundedIOSimple() {
Map<Long, List<Long>> sizesMap = new HashMap<>();
sizesMap.put(120 * MB, Arrays.asList(60 * MB, 10 * MB, 80 * MB));
sizesMap.put(110 * MB, new ArrayList<>());
sizesMap.put(100 * MB, Collections.singletonList(MB));
sizesMap.put(90 * MB, Collections.singletonList(1024 * MB));
BoundedIOCompactionStrategy strategy = new BoundedIOCompactionStrategy();
HoodieWriteConfig writeConfig = HoodieWriteConfig.newBuilder().withPath("/tmp")
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCompactionStrategy(strategy).withTargetIOPerCompactionInMB(400).build())
    .build();
List<HoodieCompactionOperation> operations = createCompactionOperations(writeConfig, sizesMap);
List<HoodieCompactionOperation> returned = strategy.orderAndFilter(writeConfig, operations, new ArrayList<>());
assertTrue(returned.size() < operations.size(), "BoundedIOCompaction should have resulted in fewer compactions");
assertEquals(2, returned.size(), "BoundedIOCompaction should have resulted in 2 compactions being chosen");
// Total size of all the log files
Long returnedSize = returned.stream().map(s -> s.getMetrics().get(BoundedIOCompactionStrategy.TOTAL_IO_MB))
    .map(Double::longValue).reduce(Long::sum).orElse(0L);
assertEquals(610, (long) returnedSize, "Should choose the first 2 compactions, which should result in a total IO of 610 MB");
}
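createCompactionOperations is a helper defined elsewhere in the test class; presumably it builds one HoodieCompactionOperation per map entry, with the key as the base file size and the list values as the log file sizes, letting the configured strategy capture the IO metrics. The same write-config builder chain can be used to plug in another strategy; for example (the 500 MB target is an arbitrary illustration value):
// Sketch: same builder chain, different compaction strategy.
HoodieWriteConfig altConfig = HoodieWriteConfig.newBuilder()
    .withPath("/tmp")
    .withCompactionConfig(HoodieCompactionConfig.newBuilder()
        .withCompactionStrategy(new LogFileSizeBasedCompactionStrategy())
        .withTargetIOPerCompactionInMB(500)
        .build())
    .build();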