use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
the class HoodieRepairTool method copyFiles.
/**
* Copies the list of files from source base path to destination base path.
* The destination file path (base + relative) should not already exist.
*
* @param context {@link HoodieEngineContext} instance.
* @param relativeFilePaths A {@link List} of relative file paths for copying.
* @param sourceBasePath Source base path.
* @param destBasePath Destination base path.
* @return {@code true} if all successful; {@code false} otherwise.
*/
static boolean copyFiles(HoodieEngineContext context, List<String> relativeFilePaths,
                         String sourceBasePath, String destBasePath) {
  SerializableConfiguration conf = context.getHadoopConf();
  List<Boolean> allResults = context.parallelize(relativeFilePaths)
      .mapPartitions(iterator -> {
        List<Boolean> results = new ArrayList<>();
        FileSystem fs = FSUtils.getFs(destBasePath, conf.get());
        iterator.forEachRemaining(filePath -> {
          boolean success = false;
          Path sourcePath = new Path(sourceBasePath, filePath);
          Path destPath = new Path(destBasePath, filePath);
          try {
            if (!fs.exists(destPath)) {
              FileIOUtils.copy(fs, sourcePath, destPath);
              success = true;
            }
          } catch (IOException e) {
            // Copy failed; record the failure and continue with the rest of the batch
            LOG.error(String.format("Copying file fails: source [%s], destination [%s]", sourcePath, destPath));
          } finally {
            results.add(success);
          }
        });
        return results.iterator();
      }, true)
      .collectAsList();
  return allResults.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
}
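Note why the closure captures a SerializableConfiguration rather than a raw Hadoop Configuration: the mapPartitions lambda is serialized and shipped to Spark executors, and org.apache.hadoop.conf.Configuration does not implement java.io.Serializable. Below is a minimal, illustrative sketch of that wrapper pattern (the class name and details are ours, not Hudi's exact implementation); it relies on the fact that Configuration implements Hadoop's Writable interface:

import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;
import org.apache.hadoop.conf.Configuration;

public class SerializableConfigurationSketch implements Serializable {
  private transient Configuration configuration;

  public SerializableConfigurationSketch(Configuration configuration) {
    this.configuration = configuration;
  }

  public Configuration get() {
    return configuration;
  }

  private void writeObject(ObjectOutputStream out) throws IOException {
    out.defaultWriteObject();
    configuration.write(out); // Configuration implements Writable
  }

  private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
    in.defaultReadObject();
    configuration = new Configuration(false);
    configuration.readFields(in);
  }
}

Hudi's own SerializableConfiguration follows the same idea and additionally exposes newCopy() for handing out fresh Configuration instances, which the next example uses.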
use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
the class HoodieSnapshotExporter method exportAsHudi.
private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions,
                          String latestCommitTimestamp) throws IOException {
  final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
  final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
  final SerializableConfiguration serConf = context.getHadoopConf();
  context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");

  List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
    // Only take latest version files <= latestCommit.
    List<Tuple2<String, String>> filePaths = new ArrayList<>();
    Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
    dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));

    // Also need to copy over partition metadata.
    Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition),
        HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
    FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
    if (fs.exists(partitionMetaFile)) {
      filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
    }
    return filePaths.stream();
  }, partitions.size());

  context.foreach(files, tuple -> {
    String partition = tuple._1();
    Path sourceFilePath = new Path(tuple._2());
    Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
    FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
    if (!fs.exists(toPartitionPath)) {
      fs.mkdirs(toPartitionPath);
    }
    FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()),
        false, fs.getConf());
  }, files.size());

  // Also copy the .commit files.
  LOG.info(String.format("Copying .commit files which are no later than %s.", latestCommitTimestamp));
  final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
  FileStatus[] commitFilesToCopy = fileSystem.listStatus(
      new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
        if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
          return true;
        } else {
          String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
          return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS,
              latestCommitTimestamp);
        }
      });
  for (FileStatus commitStatus : commitFilesToCopy) {
    Path targetFilePath = new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME
        + "/" + commitStatus.getPath().getName());
    if (!fileSystem.exists(targetFilePath.getParent())) {
      fileSystem.mkdirs(targetFilePath.getParent());
    }
    if (fileSystem.exists(targetFilePath)) {
      LOG.error(String.format("The target output commit file (%s) already exists.", targetFilePath));
    }
    FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
  }
}
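Unlike copyFiles above, which shares the deserialized Configuration via conf.get(), exportAsHudi calls serConf.newCopy() so each task works against its own Configuration instance; Configuration is mutable and not safe to mutate concurrently. Here is a hedged sketch of the same per-task-copy pattern in plain Spark, reusing the wrapper sketch above (method and variable names are illustrative):

import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.spark.api.java.JavaSparkContext;

public class PerTaskConfSketch {
  // Lists the size of each file, giving every Spark task a private
  // Configuration copy derived from the serializable wrapper.
  public static List<Long> fileSizes(JavaSparkContext jsc,
                                     SerializableConfigurationSketch serConf,
                                     List<String> paths) {
    return jsc.parallelize(paths, paths.size()).map(p -> {
      // new Configuration(other) clones the entries, so concurrent tasks
      // never mutate a shared instance
      Configuration taskConf = new Configuration(serConf.get());
      Path path = new Path(p);
      FileSystem fs = path.getFileSystem(taskConf);
      return fs.getFileStatus(path).getLen();
    }).collect();
  }
}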
use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
the class DatePartitionPathSelector method pruneDatePartitionPaths.
/**
* Prunes date level partitions to last few days configured by 'NUM_PREV_DAYS_TO_LIST' from
* 'CURRENT_DATE'. Parallelizes listing by leveraging HoodieSparkEngineContext's methods.
*/
public List<String> pruneDatePartitionPaths(HoodieSparkEngineContext context, FileSystem fs,
                                            String rootPath, LocalDate currentDate) {
  List<String> partitionPaths = new ArrayList<>();
  // Get all partition paths before the date partition level.
  partitionPaths.add(rootPath);
  if (datePartitionDepth <= 0) {
    return partitionPaths;
  }
  SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
  for (int i = 0; i < datePartitionDepth; i++) {
    partitionPaths = context.flatMap(partitionPaths, path -> {
      Path subDir = new Path(path);
      FileSystem fileSystem = subDir.getFileSystem(serializedConf.get());
      // Skip files/dirs whose names start with (_, ., etc)
      FileStatus[] statuses = fileSystem.listStatus(subDir,
          file -> IGNORE_FILEPREFIX_LIST.stream().noneMatch(pfx -> file.getName().startsWith(pfx)));
      List<String> res = new ArrayList<>();
      for (FileStatus status : statuses) {
        res.add(status.getPath().toString());
      }
      return res.stream();
    }, partitionsListParallelism);
  }
  // Prune date partitions to the last few days.
  return context.getJavaSparkContext().parallelize(partitionPaths, partitionsListParallelism)
      .filter(s -> {
        LocalDate fromDate = currentDate.minusDays(numPrevDaysToList);
        String[] splits = s.split("/");
        String datePartition = splits[splits.length - 1];
        LocalDate partitionDate;
        DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern(dateFormat);
        if (datePartition.contains("=")) {
          String[] moreSplit = datePartition.split("=");
          ValidationUtils.checkArgument(moreSplit.length == 2,
              "Partition Field (" + datePartition + ") not in expected format");
          partitionDate = LocalDate.parse(moreSplit[1], dateFormatter);
        } else {
          partitionDate = LocalDate.parse(datePartition, dateFormatter);
        }
        return (partitionDate.isEqual(fromDate) || partitionDate.isAfter(fromDate))
            && (partitionDate.isEqual(currentDate) || partitionDate.isBefore(currentDate));
      }).collect();
}
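The filter above accepts both hive-style ("field=value") and plain date partition names. A small self-contained demonstration of that parsing and the inclusive window check (the date format and all values below are hypothetical):

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class PartitionDateParseDemo {
  public static void main(String[] args) {
    // Hypothetical dateFormat; the real value comes from the selector's config.
    DateTimeFormatter fmt = DateTimeFormatter.ofPattern("yyyy-MM-dd");
    // Hive-style partition name: take the value after '='.
    LocalDate hiveStyle = LocalDate.parse("date=2022-01-15".split("=")[1], fmt);
    // Plain partition name: parse the whole segment.
    LocalDate plain = LocalDate.parse("2022-01-15", fmt);
    // The filter keeps dates inside [currentDate - numPrevDaysToList, currentDate].
    LocalDate currentDate = LocalDate.parse("2022-01-17", fmt);
    LocalDate fromDate = currentDate.minusDays(3); // numPrevDaysToList = 3
    boolean kept = !hiveStyle.isBefore(fromDate) && !hiveStyle.isAfter(currentDate);
    System.out.println(hiveStyle.equals(plain)); // true: both styles parse to the same date
    System.out.println(kept);                    // true: 2022-01-15 is inside the 3-day window
  }
}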
use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
the class MarkerUtils method readTimelineServerBasedMarkersFromFileSystem.
/**
* Reads files containing the markers written by timeline-server-based marker mechanism.
*
* @param markerDir marker directory.
* @param fileSystem file system to use.
* @param context instance of {@link HoodieEngineContext} to use
* @param parallelism parallelism to use
* @return A {@code Map} of file name to the set of markers stored in the file.
*/
public static Map<String, Set<String>> readTimelineServerBasedMarkersFromFileSystem(
    String markerDir, FileSystem fileSystem, HoodieEngineContext context, int parallelism) {
  Path dirPath = new Path(markerDir);
  try {
    if (fileSystem.exists(dirPath)) {
      Predicate<FileStatus> prefixFilter =
          fileStatus -> fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX);
      Predicate<FileStatus> markerTypeFilter =
          fileStatus -> !fileStatus.getPath().getName().equals(MARKER_TYPE_FILENAME);
      return FSUtils.parallelizeSubPathProcess(context, fileSystem, dirPath, parallelism,
          prefixFilter.and(markerTypeFilter), pairOfSubPathAndConf -> {
            String markersFilePathStr = pairOfSubPathAndConf.getKey();
            SerializableConfiguration conf = pairOfSubPathAndConf.getValue();
            return readMarkersFromFile(new Path(markersFilePathStr), conf);
          });
    }
    return new HashMap<>();
  } catch (IOException ioe) {
    throw new HoodieIOException(ioe.getMessage(), ioe);
  }
}
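The body of readMarkersFromFile is not shown here. The timeline-server-based mechanism stores markers as text, one marker entry per line, so a plausible sketch of the read side looks like the following; this mirrors the pattern, not necessarily Hudi's exact code:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class MarkerReadSketch {
  // Reads a MARKERS file back into a set, one marker entry per line.
  static Set<String> readMarkerLines(FileSystem fs, Path markersFile) throws IOException {
    Set<String> markers = new HashSet<>();
    try (BufferedReader reader = new BufferedReader(
        new InputStreamReader(fs.open(markersFile), StandardCharsets.UTF_8))) {
      String line;
      while ((line = reader.readLine()) != null) {
        if (!line.isEmpty()) {
          markers.add(line);
        }
      }
    }
    return markers;
  }
}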
use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.
the class TestFileSystemBackedTableMetadata method testNonPartitionedTable.
/**
 * Tests a non-partitioned hoodie table.
 */
@Test
public void testNonPartitionedTable() throws Exception {
  // Generate 10 files under the base path
  hoodieTestTable.addCommit("100")
      .withBaseFilesInPartition(DEFAULT_PARTITION, IntStream.range(0, 10).toArray());
  HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
  FileSystemBackedTableMetadata fileSystemBackedTableMetadata =
      new FileSystemBackedTableMetadata(localEngineContext,
          new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
  Assertions.assertEquals(0, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
  Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length);
  Assertions.assertEquals(10,
      fileSystemBackedTableMetadata.getAllFilesInPartitions(Collections.singletonList(basePath)).get(basePath).length);
}
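For context, the two Configuration-consuming pieces the test wires together can be paired outside a test as well; a minimal sketch (basePath is a placeholder, and the commented-out constructor call matches the four-argument signature used above):

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;

public class LocalMetadataSetupSketch {
  public static void main(String[] args) {
    String basePath = "/tmp/hoodie_table"; // placeholder path
    Configuration hadoopConf = new Configuration();
    // The local engine context runs the parallelized listing in-process.
    HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf);
    SerializableConfiguration serConf = new SerializableConfiguration(hadoopConf);
    // new FileSystemBackedTableMetadata(engineContext, serConf, basePath, false)
    // (same four-argument constructor as in the test above)
  }
}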