Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class HoodieDataSourceHelpers, method getClusteringPlan:
@PublicAPIMethod(maturity = ApiMaturityLevel.STABLE)
public static Option<HoodieClusteringPlan> getClusteringPlan(FileSystem fs, String basePath, String instantTime) {
  HoodieTableMetaClient metaClient = HoodieTableMetaClient.builder()
      .setConf(fs.getConf())
      .setBasePath(basePath)
      .setLoadActiveTimelineOnLoad(true)
      .build();
  HoodieInstant hoodieInstant = HoodieTimeline.getReplaceCommitRequestedInstant(instantTime);
  Option<Pair<HoodieInstant, HoodieClusteringPlan>> clusteringPlan =
      ClusteringUtils.getClusteringPlan(metaClient, hoodieInstant);
  if (clusteringPlan.isPresent()) {
    return Option.of(clusteringPlan.get().getValue());
  } else {
    return Option.empty();
  }
}
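A minimal caller sketch for this helper, assuming a Hadoop FileSystem is already configured; the table path and instant time below are placeholders, not values from the source:

// Hypothetical usage: check whether a clustering plan was scheduled at the
// given instant. "/tmp/hudi_table" and "20220101000000" are placeholders.
FileSystem fs = FileSystem.get(new Configuration());
Option<HoodieClusteringPlan> plan =
    HoodieDataSourceHelpers.getClusteringPlan(fs, "/tmp/hudi_table", "20220101000000");
if (plan.isPresent()) {
  System.out.println("Clustering groups: " + plan.get().getInputGroups().size());
}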
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class SparkFullBootstrapDataProviderBase, method generateInputRecords:
@Override
public JavaRDD<HoodieRecord> generateInputRecords(String tableName, String sourceBasePath,
    List<Pair<String, List<HoodieFileStatus>>> partitionPathsWithFiles) {
  String[] filePaths = partitionPathsWithFiles.stream()
      .map(Pair::getValue)
      .flatMap(f -> f.stream().map(fs -> FileStatusUtils.toPath(fs.getPath()).toString()))
      .toArray(String[]::new);
  Dataset inputDataset = sparkSession.read().format(getFormat()).load(filePaths);
  try {
    KeyGenerator keyGenerator = HoodieSparkKeyGeneratorFactory.createKeyGenerator(props);
    String structName = tableName + "_record";
    String namespace = "hoodie." + tableName;
    RDD<GenericRecord> genericRecords =
        HoodieSparkUtils.createRdd(inputDataset, structName, namespace, false, Option.empty());
    return genericRecords.toJavaRDD().map(gr -> {
      String orderingVal = HoodieAvroUtils.getNestedFieldValAsString(
          gr,
          props.getString("hoodie.datasource.write.precombine.field"),
          false,
          props.getBoolean(
              KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.key(),
              Boolean.parseBoolean(KeyGeneratorOptions.KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED.defaultValue())));
      try {
        return DataSourceUtils.createHoodieRecord(
            gr, orderingVal, keyGenerator.getKey(gr),
            props.getString("hoodie.datasource.write.payload.class"));
      } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
      }
    });
  } catch (IOException ioe) {
    throw new HoodieIOException(ioe.getMessage(), ioe);
  }
}
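The method above reads two write configs from props. A hedged sketch of supplying them; the config keys are copied verbatim from the snippet, while the precombine field "ts" and the payload class choice are illustrative:

// Keys are taken from the code above; values are example choices only.
TypedProperties props = new TypedProperties();
props.setProperty("hoodie.datasource.write.precombine.field", "ts");
props.setProperty("hoodie.datasource.write.payload.class",
    "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload");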
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class HoodieTestCommitGenerator, method generateCommitMetadata:
public static HoodieCommitMetadata generateCommitMetadata(
    Map<String, List<Pair<String, String>>> partitionPathToFileIdAndNameMap,
    Map<String, String> extraMetadata) {
  HoodieCommitMetadata metadata = new HoodieCommitMetadata();
  for (Map.Entry<String, String> entry : extraMetadata.entrySet()) {
    metadata.addMetadata(entry.getKey(), entry.getValue());
  }
  partitionPathToFileIdAndNameMap.forEach((partitionPath, fileInfoList) -> fileInfoList.forEach(fileInfo -> {
    HoodieWriteStat writeStat = new HoodieWriteStat();
    writeStat.setPartitionPath(partitionPath);
    writeStat.setPath(new Path(partitionPath, fileInfo.getValue()).toString());
    writeStat.setFileId(fileInfo.getKey());
    // Below are dummy values
    writeStat.setTotalWriteBytes(10000);
    writeStat.setPrevCommit("000");
    writeStat.setNumWrites(10);
    writeStat.setNumUpdateWrites(15);
    writeStat.setTotalLogBlocks(2);
    writeStat.setTotalLogRecords(100);
    metadata.addWriteStat(partitionPath, writeStat);
  }));
  return metadata;
}
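A hedged usage sketch: build the input map with Pair.of and call the generator. The partition path and file names below are made up for illustration:

// Hypothetical test setup: one partition holding a single (fileId, fileName) pair.
Map<String, List<Pair<String, String>>> partitionToFiles = new HashMap<>();
partitionToFiles.put("2022/01/01",
    Collections.singletonList(Pair.of("file-id-1", "file-id-1_1-0-1_001.parquet")));
HoodieCommitMetadata metadata =
    HoodieTestCommitGenerator.generateCommitMetadata(partitionToFiles, Collections.emptyMap());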
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class TestRepairUtils, method testTagInstantsOfBaseAndLogFiles:
@Test
public void testTagInstantsOfBaseAndLogFiles() {
  Map<String, List<String>> expectedResult = new HashMap<>();
  List<Path> inputPathList = new ArrayList<>();
  for (Map.Entry<String, List<Pair<String, String>>> entry : BASE_FILE_INFO.entrySet()) {
    String instantTime = entry.getKey();
    List<String> fileNameList = entry.getValue().stream()
        .map(e -> {
          String partitionPath = e.getKey();
          String fileId = e.getValue();
          return new Path(new Path(partitionPath), getBaseFilename(instantTime, fileId)).toString();
        })
        .collect(Collectors.toList());
    List<String> expectedList = expectedResult.computeIfAbsent(instantTime, k -> new ArrayList<>());
    expectedList.addAll(fileNameList);
    inputPathList.addAll(fileNameList.stream()
        .map(path -> new Path(basePath, path))
        .collect(Collectors.toList()));
  }
  for (Map.Entry<String, List<Pair<String, String>>> entry : LOG_FILE_INFO.entrySet()) {
    String instantTime = entry.getKey();
    List<String> fileNameList = entry.getValue().stream()
        .map(e -> {
          String partitionPath = e.getKey();
          String fileId = e.getValue();
          return new Path(new Path(partitionPath), getLogFilename(instantTime, fileId)).toString();
        })
        .collect(Collectors.toList());
    List<String> expectedList = expectedResult.computeIfAbsent(instantTime, k -> new ArrayList<>());
    expectedList.addAll(fileNameList);
    inputPathList.addAll(fileNameList.stream()
        .map(path -> new Path(basePath, path))
        .collect(Collectors.toList()));
  }
  assertEquals(expectedResult, RepairUtils.tagInstantsOfBaseAndLogFiles(basePath, inputPathList));
}
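For context, a hedged sketch of the shape of the BASE_FILE_INFO fixture the loops iterate over: an instant time mapped to (partitionPath, fileId) pairs. The concrete values are illustrative; the real constants live in TestRepairUtils:

// Illustrative fixture shape only, not the test's actual data.
Map<String, List<Pair<String, String>>> baseFileInfo = new HashMap<>();
baseFileInfo.put("001", Arrays.asList(
    Pair.of("2022/01/01", UUID.randomUUID().toString()),
    Pair.of("2022/01/02", UUID.randomUUID().toString())));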
Use of org.apache.hudi.common.util.collection.Pair in project hudi by apache.
From the class TestRepairUtils, method testFindInstantFilesToRemove:
@Test
public void testFindInstantFilesToRemove() throws IOException {
  setupTimelineInFS();
  HoodieInstant existingInstant = new HoodieInstant(HoodieInstant.State.COMPLETED, HoodieTimeline.COMMIT_ACTION, "001");
  Map<String, List<Pair<String, String>>> partitionToFileIdAndNameMap =
      instantInfoMap.get(existingInstant.getTimestamp());
  List<String> fileListFromFs = partitionToFileIdAndNameMap.entrySet().stream()
      .flatMap(entry -> entry.getValue().stream()
          .map(fileInfo -> new Path(entry.getKey(), fileInfo.getValue()).toString()))
      .collect(Collectors.toList());
  String danglingFilePath = new Path("2022/01/02",
      getBaseFilename(existingInstant.getTimestamp(), UUID.randomUUID().toString())).toString();
  fileListFromFs.add(danglingFilePath);
  // Existing instant
  assertEquals(CollectionUtils.createImmutableList(danglingFilePath),
      RepairUtils.findInstantFilesToRemove(existingInstant.getTimestamp(), fileListFromFs,
          metaClient.getActiveTimeline(), metaClient.getArchivedTimeline()));
  // Non-existing instant
  assertEquals(fileListFromFs,
      RepairUtils.findInstantFilesToRemove("004", fileListFromFs,
          metaClient.getActiveTimeline(), metaClient.getArchivedTimeline()));
}
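The getBaseFilename helper used in both tests follows Hudi's base file naming convention. A hedged sketch, where the helper name and the write token "1-0-1" are placeholders, not the test's own code:

// Mirrors the <fileId>_<writeToken>_<instantTime>.parquet convention assumed above.
static String baseFilenameSketch(String instantTime, String fileId) {
  return fileId + "_1-0-1_" + instantTime + ".parquet";
}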