Search in sources :

Example 26 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

the class FSUtils method parallelizeFilesProcess.

/**
 * Runs {@code pairFunction} over every entry of {@code subPaths} through the engine context.
 *
 * <p>Each sub-path is handed to {@code pairFunction} together with a
 * {@link SerializableConfiguration} wrapping {@code fs.getConf()}.
 *
 * @param hoodieEngineContext engine context whose {@code mapToPair} performs the processing
 * @param fs file system supplying the configuration passed to {@code pairFunction}
 * @param parallelism upper bound on the parallelism; capped at the number of sub-paths
 * @param pairFunction processing logic applied to each (sub-path, configuration) pair
 * @param subPaths sub-paths to process
 * @param <T> type of result produced for each sub-path
 * @return a map of sub-path to processing result; an empty map when {@code subPaths} is empty
 */
public static <T> Map<String, T> parallelizeFilesProcess(HoodieEngineContext hoodieEngineContext, FileSystem fs, int parallelism, SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction, List<String> subPaths) {
    if (subPaths.isEmpty()) {
        // Nothing to process: return an empty map without involving the engine context.
        return new HashMap<>();
    }
    final SerializableConfiguration serializableConf = new SerializableConfiguration(fs.getConf());
    // Never request more parallelism than there are sub-paths.
    final int effectiveParallelism = Math.min(subPaths.size(), parallelism);
    return hoodieEngineContext.mapToPair(
        subPaths,
        path -> new ImmutablePair<>(path, pairFunction.apply(new ImmutablePair<>(path, serializableConf))),
        effectiveParallelism);
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) InvalidHoodiePathException(org.apache.hudi.exception.InvalidHoodiePathException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) PathFilter(org.apache.hadoop.fs.PathFilter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Predicate(java.util.function.Predicate) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Entry(java.util.Map.Entry) HoodieIOException(org.apache.hudi.exception.HoodieIOException) 
LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair) ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HashMap(java.util.HashMap) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration)

Example 27 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

the class FSUtils method parallelizeSubPathProcess.

/**
 * Lists the entries directly under a directory, keeps those accepted by a predicate, and
 * processes the surviving sub-paths via {@code parallelizeFilesProcess}.
 *
 * @param hoodieEngineContext {@code HoodieEngineContext} instance used for the processing
 * @param fs file system used to list {@code dirPath}
 * @param dirPath directory whose entries are listed
 * @param parallelism parallelism to use for the sub-paths
 * @param subPathPredicate filter deciding which listed entries are processed
 * @param pairFunction processing logic applied to each selected sub-path
 * @param <T> type of result produced for each sub-path
 * @return a map of sub-path (as a string) to the result of its processing
 * @throws HoodieIOException wrapping any {@code IOException} raised inside the method body
 */
public static <T> Map<String, T> parallelizeSubPathProcess(HoodieEngineContext hoodieEngineContext, FileSystem fs, Path dirPath, int parallelism, Predicate<FileStatus> subPathPredicate, SerializableFunction<Pair<String, SerializableConfiguration>, T> pairFunction) {
    try {
        // Convert each accepted entry to its string form, which becomes the result map's key.
        final List<String> matchingSubPaths = Arrays.stream(fs.listStatus(dirPath))
            .filter(subPathPredicate)
            .map(FileStatus::getPath)
            .map(Path::toString)
            .collect(Collectors.toList());
        return parallelizeFilesProcess(hoodieEngineContext, fs, parallelism, pairFunction, matchingSubPaths);
    } catch (IOException ioe) {
        // Rethrow as the unchecked project exception, keeping the original as the cause.
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) Arrays(java.util.Arrays) InvalidHoodiePathException(org.apache.hudi.exception.InvalidHoodiePathException) FileSystem(org.apache.hadoop.fs.FileSystem) HoodieException(org.apache.hudi.exception.HoodieException) PathFilter(org.apache.hadoop.fs.PathFilter) Option(org.apache.hudi.common.util.Option) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) Function(java.util.function.Function) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Logger(org.apache.log4j.Logger) Matcher(java.util.regex.Matcher) HoodieTableConfig(org.apache.hudi.common.table.HoodieTableConfig) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Configuration(org.apache.hadoop.conf.Configuration) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieLogFile(org.apache.hudi.common.model.HoodieLogFile) HoodieMetadataConfig(org.apache.hudi.common.config.HoodieMetadataConfig) DistributedFileSystem(org.apache.hadoop.hdfs.DistributedFileSystem) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) Predicate(java.util.function.Predicate) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) Set(java.util.Set) IOException(java.io.IOException) UUID(java.util.UUID) Collectors(java.util.stream.Collectors) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) Serializable(java.io.Serializable) Objects(java.util.Objects) HoodieFileFormat(org.apache.hudi.common.model.HoodieFileFormat) List(java.util.List) Stream(java.util.stream.Stream) FileSystemViewStorageConfig(org.apache.hudi.common.table.view.FileSystemViewStorageConfig) HoodiePartitionMetadata(org.apache.hudi.common.model.HoodiePartitionMetadata) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) Entry(java.util.Map.Entry) HoodieIOException(org.apache.hudi.exception.HoodieIOException) 
LogManager(org.apache.log4j.LogManager) Pattern(java.util.regex.Pattern) RemoteIterator(org.apache.hadoop.fs.RemoteIterator) Pair(org.apache.hudi.common.util.collection.Pair) FileStatus(org.apache.hadoop.fs.FileStatus) LocatedFileStatus(org.apache.hadoop.fs.LocatedFileStatus) HoodieIOException(org.apache.hudi.exception.HoodieIOException) HashMap(java.util.HashMap) IOException(java.io.IOException) HoodieIOException(org.apache.hudi.exception.HoodieIOException)

Example 28 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

the class TestFSUtils method testDeleteSubDirectoryRecursively.

/**
 * Checks that {@code FSUtils.deleteSubPath} returns {@code true} when asked to delete
 * {@code subdir1} under the temp folder with the recursive flag enabled.
 */
@Test
public void testDeleteSubDirectoryRecursively() throws IOException {
    final String tempRoot = basePath + "/.hoodie/.temp";
    final String targetSubDir = tempRoot + "/subdir1";
    final FileSystem fs = metaClient.getFs();
    prepareTestDirectory(fs, tempRoot);

    final SerializableConfiguration serializableConf = new SerializableConfiguration(fs.getConf());
    // Third argument true = recursive deletion.
    final boolean deleted = FSUtils.deleteSubPath(targetSubDir, serializableConf, true);
    assertTrue(deleted);
}
Also used : SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystem(org.apache.hadoop.fs.FileSystem) Test(org.junit.jupiter.api.Test)

Example 29 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

the class TestFSUtils method testDeleteNonExistingSubDirectory.

/**
 * Checks that {@code FSUtils.deleteSubPath} returns {@code false} for {@code subdir10}
 * after the temp folder has been passed through {@code cleanUpTestDirectory}.
 */
@Test
public void testDeleteNonExistingSubDirectory() throws IOException {
    final String tempRoot = basePath + "/.hoodie/.temp";
    final String missingSubDir = tempRoot + "/subdir10";
    final FileSystem fs = metaClient.getFs();
    cleanUpTestDirectory(fs, tempRoot);

    final SerializableConfiguration serializableConf = new SerializableConfiguration(fs.getConf());
    // Third argument true = recursive deletion; the call is still expected to report false.
    final boolean deleted = FSUtils.deleteSubPath(missingSubDir, serializableConf, true);
    assertFalse(deleted);
}
Also used : SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystem(org.apache.hadoop.fs.FileSystem) Test(org.junit.jupiter.api.Test)

Example 30 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

the class TestFSUtils method testDeleteSubDirectoryNonRecursively.

/**
 * Checks that {@code FSUtils.deleteSubPath} throws {@code HoodieIOException} when asked to
 * delete {@code subdir1} under the temp folder with the recursive flag disabled.
 */
@Test
public void testDeleteSubDirectoryNonRecursively() throws IOException {
    final String tempRoot = basePath + "/.hoodie/.temp";
    final String targetSubDir = tempRoot + "/subdir1";
    final FileSystem fs = metaClient.getFs();
    prepareTestDirectory(fs, tempRoot);

    assertThrows(HoodieIOException.class, () -> {
        // The configuration is built inside the lambda so any failure stays within assertThrows.
        final SerializableConfiguration serializableConf = new SerializableConfiguration(fs.getConf());
        // Third argument false = non-recursive deletion.
        FSUtils.deleteSubPath(targetSubDir, serializableConf, false);
    });
}
Also used : SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystem(org.apache.hadoop.fs.FileSystem) Test(org.junit.jupiter.api.Test)

Aggregations

SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration)32 Path (org.apache.hadoop.fs.Path)20 FileSystem (org.apache.hadoop.fs.FileSystem)16 FileStatus (org.apache.hadoop.fs.FileStatus)15 List (java.util.List)14 IOException (java.io.IOException)13 Collectors (java.util.stream.Collectors)13 Map (java.util.Map)12 Test (org.junit.jupiter.api.Test)12 ArrayList (java.util.ArrayList)11 LogManager (org.apache.log4j.LogManager)10 Logger (org.apache.log4j.Logger)10 HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext)9 Option (org.apache.hudi.common.util.Option)9 Arrays (java.util.Arrays)8 HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext)8 HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient)8 HoodieIOException (org.apache.hudi.exception.HoodieIOException)7 Collections (java.util.Collections)6 Configuration (org.apache.hadoop.conf.Configuration)6