
Example 1 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In the class HoodieRepairTool, the method copyFiles:

/**
 * Copies the list of files from source base path to destination base path.
 * The destination file path (base + relative) should not already exist.
 *
 * @param context           {@link HoodieEngineContext} instance.
 * @param relativeFilePaths A {@link List} of relative file paths for copying.
 * @param sourceBasePath    Source base path.
 * @param destBasePath      Destination base path.
 * @return {@code true} if all successful; {@code false} otherwise.
 */
static boolean copyFiles(HoodieEngineContext context, List<String> relativeFilePaths, String sourceBasePath, String destBasePath) {
    SerializableConfiguration conf = context.getHadoopConf();
    List<Boolean> allResults = context.parallelize(relativeFilePaths).mapPartitions(iterator -> {
        List<Boolean> results = new ArrayList<>();
        FileSystem fs = FSUtils.getFs(destBasePath, conf.get());
        iterator.forEachRemaining(filePath -> {
            boolean success = false;
            Path sourcePath = new Path(sourceBasePath, filePath);
            Path destPath = new Path(destBasePath, filePath);
            try {
                if (!fs.exists(destPath)) {
                    FileIOUtils.copy(fs, sourcePath, destPath);
                    success = true;
                }
            } catch (IOException e) {
                // Copy failed; log the error and record the failure for this file.
                LOG.error(String.format("Copying file failed: source [%s], destination [%s]", sourcePath, destPath), e);
            } finally {
                results.add(success);
            }
        });
        return results.iterator();
    }, true).collectAsList();
    return allResults.stream().reduce((r1, r2) -> r1 && r2).orElse(false);
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) HoodieArchivedTimeline(org.apache.hudi.common.table.timeline.HoodieArchivedTimeline) FileIOUtils(org.apache.hudi.common.util.FileIOUtils) Parameter(com.beust.jcommander.Parameter) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) ArrayList(java.util.ArrayList) SecureRandom(java.security.SecureRandom) Logger(org.apache.log4j.Logger) StringUtils(org.apache.hudi.common.util.StringUtils) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieActiveTimeline(org.apache.hudi.common.table.timeline.HoodieActiveTimeline) TypedProperties(org.apache.hudi.common.config.TypedProperties) HoodieTableMetadata(org.apache.hudi.metadata.HoodieTableMetadata) JCommander(com.beust.jcommander.JCommander) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Serializable(java.io.Serializable) List(java.util.List) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) FileSystemBackedTableMetadata(org.apache.hudi.metadata.FileSystemBackedTableMetadata) HoodieIOException(org.apache.hudi.exception.HoodieIOException) RepairUtils(org.apache.hudi.table.repair.RepairUtils) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils)
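The pattern in copyFiles exists because org.apache.hadoop.conf.Configuration is not java.io.Serializable, so it cannot be captured directly by a Spark closure; SerializableConfiguration wraps it on the driver (context.getHadoopConf()) and get() materializes the Configuration again inside mapPartitions on the executor. Below is a minimal, self-contained sketch of that round trip; the class name and the property set in it are made up purely for illustration, and Java serialization stands in for what Spark does when it ships the closure.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;

public class SerializableConfigurationSketch {

    public static void main(String[] args) throws Exception {
        Configuration hadoopConf = new Configuration();
        // Hypothetical property, used only to show that settings survive the round trip.
        hadoopConf.set("fs.defaultFS", "file:///");

        SerializableConfiguration serConf = new SerializableConfiguration(hadoopConf);

        // Simulate what Spark does when it ships a closure to an executor.
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(serConf);
        }
        SerializableConfiguration roundTripped;
        try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            roundTripped = (SerializableConfiguration) in.readObject();
        }

        // get() exposes the underlying Hadoop Configuration again on the "executor" side.
        System.out.println(roundTripped.get().get("fs.defaultFS")); // prints file:///
    }
}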

Example 2 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In the class HoodieSnapshotExporter, the method exportAsHudi:

private void exportAsHudi(JavaSparkContext jsc, Config cfg, List<String> partitions, String latestCommitTimestamp) throws IOException {
    final BaseFileOnlyView fsView = getBaseFileOnlyView(jsc, cfg);
    final HoodieEngineContext context = new HoodieSparkEngineContext(jsc);
    final SerializableConfiguration serConf = context.getHadoopConf();
    context.setJobStatus(this.getClass().getSimpleName(), "Exporting as HUDI dataset");
    List<Tuple2<String, String>> files = context.flatMap(partitions, partition -> {
        // Only take latest version files <= latestCommit.
        List<Tuple2<String, String>> filePaths = new ArrayList<>();
        Stream<HoodieBaseFile> dataFiles = fsView.getLatestBaseFilesBeforeOrOn(partition, latestCommitTimestamp);
        dataFiles.forEach(hoodieDataFile -> filePaths.add(new Tuple2<>(partition, hoodieDataFile.getPath())));
        // also need to copy over partition metadata
        Path partitionMetaFile = new Path(FSUtils.getPartitionPath(cfg.sourceBasePath, partition), HoodiePartitionMetadata.HOODIE_PARTITION_METAFILE);
        FileSystem fs = FSUtils.getFs(cfg.sourceBasePath, serConf.newCopy());
        if (fs.exists(partitionMetaFile)) {
            filePaths.add(new Tuple2<>(partition, partitionMetaFile.toString()));
        }
        return filePaths.stream();
    }, partitions.size());
    context.foreach(files, tuple -> {
        String partition = tuple._1();
        Path sourceFilePath = new Path(tuple._2());
        Path toPartitionPath = FSUtils.getPartitionPath(cfg.targetOutputPath, partition);
        FileSystem fs = FSUtils.getFs(cfg.targetOutputPath, serConf.newCopy());
        if (!fs.exists(toPartitionPath)) {
            fs.mkdirs(toPartitionPath);
        }
        FileUtil.copy(fs, sourceFilePath, fs, new Path(toPartitionPath, sourceFilePath.getName()), false, fs.getConf());
    }, files.size());
    // Also copy the .commit files
    LOG.info(String.format("Copying .commit files which are no-late-than %s.", latestCommitTimestamp));
    final FileSystem fileSystem = FSUtils.getFs(cfg.sourceBasePath, jsc.hadoopConfiguration());
    FileStatus[] commitFilesToCopy = fileSystem.listStatus(new Path(cfg.sourceBasePath + "/" + HoodieTableMetaClient.METAFOLDER_NAME), (commitFilePath) -> {
        if (commitFilePath.getName().equals(HoodieTableConfig.HOODIE_PROPERTIES_FILE)) {
            return true;
        } else {
            String instantTime = FSUtils.getCommitFromCommitFile(commitFilePath.getName());
            return HoodieTimeline.compareTimestamps(instantTime, HoodieTimeline.LESSER_THAN_OR_EQUALS, latestCommitTimestamp);
        }
    });
    for (FileStatus commitStatus : commitFilesToCopy) {
        Path targetFilePath = new Path(cfg.targetOutputPath + "/" + HoodieTableMetaClient.METAFOLDER_NAME + "/" + commitStatus.getPath().getName());
        if (!fileSystem.exists(targetFilePath.getParent())) {
            fileSystem.mkdirs(targetFilePath.getParent());
        }
        if (fileSystem.exists(targetFilePath)) {
            LOG.error(String.format("The target output commit file (%s targetBasePath) already exists.", targetFilePath));
        }
        FileUtil.copy(fileSystem, commitStatus.getPath(), fileSystem, targetFilePath, false, fileSystem.getConf());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) HoodieBaseFile(org.apache.hudi.common.model.HoodieBaseFile) FileStatus(org.apache.hadoop.fs.FileStatus) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) ArrayList(java.util.ArrayList) BaseFileOnlyView(org.apache.hudi.common.table.view.TableFileSystemView.BaseFileOnlyView) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) Tuple2(scala.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem)
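Note that the lambdas above call serConf.newCopy() rather than get(): each task builds its FileSystem from its own Configuration copy, since a Hadoop Configuration is mutable and not safe to share across concurrent tasks. A small sketch of the per-file copy step, lifted out of the foreach lambda; the class and method names here are invented for illustration, while the Hudi and Hadoop calls are the same ones used in the example.

import java.io.IOException;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.fs.FSUtils;

public class ExportCopySketch {

    // Copies one source file into its partition directory under targetOutputPath.
    // Mirrors the per-record lambda in exportAsHudi: the FileSystem is built from
    // serConf.newCopy(), so every task works on its own Configuration instance.
    static void copyOne(SerializableConfiguration serConf, String targetOutputPath,
                        String partition, String sourceFile) throws IOException {
        Path sourceFilePath = new Path(sourceFile);
        Path toPartitionPath = FSUtils.getPartitionPath(targetOutputPath, partition);
        FileSystem fs = FSUtils.getFs(targetOutputPath, serConf.newCopy());
        if (!fs.exists(toPartitionPath)) {
            fs.mkdirs(toPartitionPath);
        }
        FileUtil.copy(fs, sourceFilePath, fs,
                new Path(toPartitionPath, sourceFilePath.getName()), false, fs.getConf());
    }
}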

Example 3 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In the class DatePartitionPathSelector, the method pruneDatePartitionPaths:

/**
 * Prunes date-level partitions to the last few days configured by 'NUM_PREV_DAYS_TO_LIST' from
 * 'CURRENT_DATE'. Parallelizes the listing by leveraging HoodieSparkEngineContext's methods.
 */
public List<String> pruneDatePartitionPaths(HoodieSparkEngineContext context, FileSystem fs, String rootPath, LocalDate currentDate) {
    List<String> partitionPaths = new ArrayList<>();
    // get all partition paths before date partition level
    partitionPaths.add(rootPath);
    if (datePartitionDepth <= 0) {
        return partitionPaths;
    }
    SerializableConfiguration serializedConf = new SerializableConfiguration(fs.getConf());
    for (int i = 0; i < datePartitionDepth; i++) {
        partitionPaths = context.flatMap(partitionPaths, path -> {
            Path subDir = new Path(path);
            FileSystem fileSystem = subDir.getFileSystem(serializedConf.get());
            // skip files/dirs whose names start with (_, ., etc)
            FileStatus[] statuses = fileSystem.listStatus(subDir, file -> IGNORE_FILEPREFIX_LIST.stream().noneMatch(pfx -> file.getName().startsWith(pfx)));
            List<String> res = new ArrayList<>();
            for (FileStatus status : statuses) {
                res.add(status.getPath().toString());
            }
            return res.stream();
        }, partitionsListParallelism);
    }
    // Prune date partitions to the last few days
    return context.getJavaSparkContext().parallelize(partitionPaths, partitionsListParallelism).filter(s -> {
        LocalDate fromDate = currentDate.minusDays(numPrevDaysToList);
        String[] splits = s.split("/");
        String datePartition = splits[splits.length - 1];
        LocalDate partitionDate;
        DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern(dateFormat);
        if (datePartition.contains("=")) {
            String[] moreSplit = datePartition.split("=");
            ValidationUtils.checkArgument(moreSplit.length == 2, "Partition Field (" + datePartition + ") not in expected format");
            partitionDate = LocalDate.parse(moreSplit[1], dateFormatter);
        } else {
            partitionDate = LocalDate.parse(datePartition, dateFormatter);
        }
        return (partitionDate.isEqual(fromDate) || partitionDate.isAfter(fromDate)) && (partitionDate.isEqual(currentDate) || partitionDate.isBefore(currentDate));
    }).collect();
}
Also used : ImmutablePair(org.apache.hudi.common.util.collection.ImmutablePair) FileSystem(org.apache.hadoop.fs.FileSystem) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(org.apache.hudi.common.util.Option) FileStatus(org.apache.hadoop.fs.FileStatus) DEFAULT_PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_PARTITIONS_LIST_PARALLELISM) ArrayList(java.util.ArrayList) Logger(org.apache.log4j.Logger) Configuration(org.apache.hadoop.conf.Configuration) Path(org.apache.hadoop.fs.Path) HoodieSparkEngineContext(org.apache.hudi.client.common.HoodieSparkEngineContext) ValidationUtils(org.apache.hudi.common.util.ValidationUtils) ROOT_INPUT_PATH_PROP(org.apache.hudi.utilities.sources.helpers.DFSPathSelector.Config.ROOT_INPUT_PATH_PROP) TypedProperties(org.apache.hudi.common.config.TypedProperties) LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.LOOKBACK_DAYS) DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_PARTITION_DEPTH) DEFAULT_DATE_PARTITION_DEPTH(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_PARTITION_DEPTH) Collectors(java.util.stream.Collectors) DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DATE_FORMAT) DEFAULT_DATE_FORMAT(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_DATE_FORMAT) PARTITIONS_LIST_PARALLELISM(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.PARTITIONS_LIST_PARALLELISM) List(java.util.List) LocalDate(java.time.LocalDate) DateTimeFormatter(java.time.format.DateTimeFormatter) DEFAULT_LOOKBACK_DAYS(org.apache.hudi.utilities.sources.helpers.DatePartitionPathSelector.Config.DEFAULT_LOOKBACK_DAYS) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) LogManager(org.apache.log4j.LogManager) Comparator(java.util.Comparator) Pair(org.apache.hudi.common.util.collection.Pair)
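The filter at the end only has to decide whether a leaf path names a date inside the lookback window, handling both Hive-style "field=value" segments and bare date segments. A small, self-contained sketch of that parsing and range check; the partition value, date format, lookback, and current date used here are assumptions chosen only for illustration (in the selector they come from DATE_FORMAT, LOOKBACK_DAYS, and the supplied currentDate).

import java.time.LocalDate;
import java.time.format.DateTimeFormatter;

public class DatePartitionFilterSketch {

    public static void main(String[] args) {
        DateTimeFormatter dateFormatter = DateTimeFormatter.ofPattern("yyyy-MM-dd"); // assumed format
        LocalDate currentDate = LocalDate.of(2022, 1, 20);                           // assumed current date
        LocalDate fromDate = currentDate.minusDays(7);                               // assumed 7-day lookback

        String datePartition = "dt=2022-01-15"; // hypothetical last path segment, Hive-style

        // Handle both "dt=2022-01-15" and a bare "2022-01-15" partition name.
        String value = datePartition.contains("=") ? datePartition.split("=")[1] : datePartition;
        LocalDate partitionDate = LocalDate.parse(value, dateFormatter);

        boolean keep = !partitionDate.isBefore(fromDate) && !partitionDate.isAfter(currentDate);
        System.out.println(keep); // true: 2022-01-15 lies within [2022-01-13, 2022-01-20]
    }
}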

Example 4 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In the class MarkerUtils, the method readTimelineServerBasedMarkersFromFileSystem:

/**
 * Reads files containing the markers written by timeline-server-based marker mechanism.
 *
 * @param markerDir   marker directory.
 * @param fileSystem  file system to use.
 * @param context     instance of {@link HoodieEngineContext} to use
 * @param parallelism parallelism to use
 * @return A {@code Map} of file name to the set of markers stored in the file.
 */
public static Map<String, Set<String>> readTimelineServerBasedMarkersFromFileSystem(String markerDir, FileSystem fileSystem, HoodieEngineContext context, int parallelism) {
    Path dirPath = new Path(markerDir);
    try {
        if (fileSystem.exists(dirPath)) {
            Predicate<FileStatus> prefixFilter = fileStatus -> fileStatus.getPath().getName().startsWith(MARKERS_FILENAME_PREFIX);
            Predicate<FileStatus> markerTypeFilter = fileStatus -> !fileStatus.getPath().getName().equals(MARKER_TYPE_FILENAME);
            return FSUtils.parallelizeSubPathProcess(context, fileSystem, dirPath, parallelism, prefixFilter.and(markerTypeFilter), pairOfSubPathAndConf -> {
                String markersFilePathStr = pairOfSubPathAndConf.getKey();
                SerializableConfiguration conf = pairOfSubPathAndConf.getValue();
                return readMarkersFromFile(new Path(markersFilePathStr), conf);
            });
        }
        return new HashMap<>();
    } catch (IOException ioe) {
        throw new HoodieIOException(ioe.getMessage(), ioe);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileSystem(org.apache.hadoop.fs.FileSystem) BufferedWriter(java.io.BufferedWriter) Predicate(java.util.function.Predicate) HoodieException(org.apache.hudi.exception.HoodieException) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) HoodieEngineContext(org.apache.hudi.common.engine.HoodieEngineContext) FileStatus(org.apache.hadoop.fs.FileStatus) StandardCharsets(java.nio.charset.StandardCharsets) HashSet(java.util.HashSet) FSDataOutputStream(org.apache.hadoop.fs.FSDataOutputStream) Logger(org.apache.log4j.Logger) MarkerType(org.apache.hudi.common.table.marker.MarkerType) HoodieTableMetaClient(org.apache.hudi.common.table.HoodieTableMetaClient) Map(java.util.Map) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) OutputStreamWriter(java.io.OutputStreamWriter) FileIOUtils.closeQuietly(org.apache.hudi.common.util.FileIOUtils.closeQuietly) HoodieIOException(org.apache.hudi.exception.HoodieIOException) LogManager(org.apache.log4j.LogManager) FSUtils(org.apache.hudi.common.fs.FSUtils) FSDataInputStream(org.apache.hadoop.fs.FSDataInputStream)
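The two Predicate<FileStatus> instances are combined with Predicate.and(...) so that only MARKERS* files are processed while the marker-type file is skipped. A stripped-down sketch of the same composition over plain file names; the prefix and type-file literals below are assumed stand-ins for the MARKERS_FILENAME_PREFIX and MARKER_TYPE_FILENAME constants in MarkerUtils.

import java.util.Arrays;
import java.util.List;
import java.util.function.Predicate;
import java.util.stream.Collectors;

public class MarkerFileFilterSketch {

    public static void main(String[] args) {
        // Assumed literal values; MarkerUtils defines the real constants.
        String markersFilenamePrefix = "MARKERS";
        String markerTypeFilename = "MARKERS.type";

        Predicate<String> prefixFilter = name -> name.startsWith(markersFilenamePrefix);
        Predicate<String> markerTypeFilter = name -> !name.equals(markerTypeFilename);

        List<String> names = Arrays.asList("MARKERS0", "MARKERS1", "MARKERS.type", "hoodie.properties");
        List<String> markerFiles = names.stream()
                .filter(prefixFilter.and(markerTypeFilter))
                .collect(Collectors.toList());

        System.out.println(markerFiles); // [MARKERS0, MARKERS1]
    }
}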

Example 5 with SerializableConfiguration

use of org.apache.hudi.common.config.SerializableConfiguration in project hudi by apache.

In the class TestFileSystemBackedTableMetadata, the method testNonPartitionedTable:

/**
 * Tests a non-partitioned Hoodie table.
 */
@Test
public void testNonPartitionedTable() throws Exception {
    // Generate 10 files under basepath
    hoodieTestTable.addCommit("100").withBaseFilesInPartition(DEFAULT_PARTITION, IntStream.range(0, 10).toArray());
    HoodieLocalEngineContext localEngineContext = new HoodieLocalEngineContext(metaClient.getHadoopConf());
    FileSystemBackedTableMetadata fileSystemBackedTableMetadata = new FileSystemBackedTableMetadata(localEngineContext, new SerializableConfiguration(metaClient.getHadoopConf()), basePath, false);
    Assertions.assertEquals(0, fileSystemBackedTableMetadata.getAllPartitionPaths().size());
    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartition(new Path(basePath)).length);
    Assertions.assertEquals(10, fileSystemBackedTableMetadata.getAllFilesInPartitions(Collections.singletonList(basePath)).get(basePath).length);
}
Also used : Path(org.apache.hadoop.fs.Path) SerializableConfiguration(org.apache.hudi.common.config.SerializableConfiguration) HoodieLocalEngineContext(org.apache.hudi.common.engine.HoodieLocalEngineContext) Test(org.junit.jupiter.api.Test)
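The same constructor works outside a test: wrap any Hadoop Configuration in a SerializableConfiguration and hand it, together with an engine context and a table base path, to FileSystemBackedTableMetadata. A minimal sketch follows; the class name and base path are hypothetical, and assumeDatePartitioning is set to false exactly as in the test above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hudi.common.config.SerializableConfiguration;
import org.apache.hudi.common.engine.HoodieLocalEngineContext;
import org.apache.hudi.metadata.FileSystemBackedTableMetadata;

public class FileSystemBackedMetadataSketch {

    public static void main(String[] args) throws Exception {
        String basePath = "/tmp/hudi-table"; // hypothetical table base path
        Configuration hadoopConf = new Configuration();

        HoodieLocalEngineContext engineContext = new HoodieLocalEngineContext(hadoopConf);
        FileSystemBackedTableMetadata metadata = new FileSystemBackedTableMetadata(
                engineContext, new SerializableConfiguration(hadoopConf), basePath, false);

        // Lists partitions by walking the file system directly, without the metadata table.
        System.out.println(metadata.getAllPartitionPaths());
    }
}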

Aggregations

SerializableConfiguration (org.apache.hudi.common.config.SerializableConfiguration) 32
Path (org.apache.hadoop.fs.Path) 20
FileSystem (org.apache.hadoop.fs.FileSystem) 16
FileStatus (org.apache.hadoop.fs.FileStatus) 15
List (java.util.List) 14
IOException (java.io.IOException) 13
Collectors (java.util.stream.Collectors) 13
Map (java.util.Map) 12
Test (org.junit.jupiter.api.Test) 12
ArrayList (java.util.ArrayList) 11
LogManager (org.apache.log4j.LogManager) 10
Logger (org.apache.log4j.Logger) 10
HoodieEngineContext (org.apache.hudi.common.engine.HoodieEngineContext) 9
Option (org.apache.hudi.common.util.Option) 9
Arrays (java.util.Arrays) 8
HoodieSparkEngineContext (org.apache.hudi.client.common.HoodieSparkEngineContext) 8
HoodieTableMetaClient (org.apache.hudi.common.table.HoodieTableMetaClient) 8
HoodieIOException (org.apache.hudi.exception.HoodieIOException) 7
Collections (java.util.Collections) 6
Configuration (org.apache.hadoop.conf.Configuration) 6