Search in sources :

Example 6 with PostPublishStep

Use of org.apache.gobblin.data.management.copy.entities.PostPublishStep in the incubator-gobblin project by Apache.

In class ConfigBasedDataset, method getCopyableFiles.

/**
 * Builds the list of copy entities needed to replicate this dataset along {@code copyRoute}.
 *
 * <p>The result is empty when either end of the route is not a {@link HadoopFsEndPoint}, or when
 * watermark checking is enabled and the destination watermark is present and is not behind the
 * source watermark (including the case where the source has no watermark at all).
 *
 * <p>Otherwise every source file is matched against the destination by its path relative to the
 * dataset root. A source file is skipped only when the destination already holds a file at that
 * path with the same length and a strictly newer modification time; any other source file yields
 * a {@link CopyableFile}. Destination files that get overwritten, plus (when
 * {@code deleteTargetIfNotExistOnSource} is set) destination files with no source counterpart, are
 * wrapped in a single {@link PrePublishStep}. If the source carries a watermark and the watermark
 * file was not among the listed source files, a {@link PostPublishStep} that generates it is added.
 *
 * @param targetFs not consulted by this implementation; the destination file system is resolved
 *                 from the destination end point's URI instead
 * @param copyConfiguration configuration handed to each {@link CopyableFile} being built
 * @return the copy entities to execute, possibly empty
 * @throws IOException if a file system lookup or a copyable-file construction fails
 */
@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException {
    List<CopyEntity> copyEntities = Lists.newArrayList();

    EndPoint sourceEndPoint = copyRoute.getCopyFrom();
    EndPoint targetEndPoint = copyRoute.getCopyTo();
    if (!(sourceEndPoint instanceof HadoopFsEndPoint) || !(targetEndPoint instanceof HadoopFsEndPoint)) {
        log.warn("Currently only handle the Hadoop Fs EndPoint replication");
        return copyEntities;
    }

    // Both ends are Hadoop FS end points: give each the same path filter settings before listing.
    HadoopFsEndPoint sourceFsEndPoint = (HadoopFsEndPoint) sourceEndPoint;
    HadoopFsEndPoint targetFsEndPoint = (HadoopFsEndPoint) targetEndPoint;
    sourceFsEndPoint.setPathFilter(pathFilter);
    sourceFsEndPoint.setApplyFilterToDirectories(applyFilterToDirectories);
    targetFsEndPoint.setPathFilter(pathFilter);
    targetFsEndPoint.setApplyFilterToDirectories(applyFilterToDirectories);

    if (this.watermarkEnabled) {
        boolean sourceHasWatermark = sourceEndPoint.getWatermark().isPresent();
        boolean targetHasWatermark = targetEndPoint.getWatermark().isPresent();
        // Nothing to do when the target has a watermark and the source either has none
        // or has one that does not exceed the target's.
        boolean targetCaughtUp = targetHasWatermark
            && (!sourceHasWatermark
                || sourceEndPoint.getWatermark().get().compareTo(targetEndPoint.getWatermark().get()) <= 0);
        if (targetCaughtUp) {
            Object sourceWatermarkJson = sourceHasWatermark ? sourceEndPoint.getWatermark().get().toJson() : "N/A";
            log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}", sourceWatermarkJson, this.rc.getMetaData());
            return copyEntities;
        }
    }

    Configuration hadoopConf = HadoopUtils.newConfiguration();
    FileSystem sourceFs = FileSystem.get(sourceFsEndPoint.getFsURI(), hadoopConf);
    FileSystem destinationFs = FileSystem.get(targetFsEndPoint.getFsURI(), hadoopConf);

    Collection<FileStatus> listedSourceFiles = sourceFsEndPoint.getFiles();
    Collection<FileStatus> listedTargetFiles = targetFsEndPoint.getFiles();
    Set<FileStatus> sourceFiles = Sets.newHashSet(listedSourceFiles);

    // Index target files by scheme-less path; entries still present after the loop below
    // are the target files that had no counterpart on the source.
    Map<Path, FileStatus> unmatchedTargetFiles = Maps.newHashMap();
    for (FileStatus targetFile : listedTargetFiles) {
        unmatchedTargetFiles.put(PathUtils.getPathWithoutSchemeAndAuthority(targetFile.getPath()), targetFile);
    }

    Collection<Path> pathsToDelete = Lists.newArrayList();
    boolean sawWatermarkFile = false;
    boolean purgeUnmatchedTargets = rc.isDeleteTargetIfNotExistOnSource();

    for (FileStatus sourceFile : sourceFiles) {
        Path relativePath = PathUtils.relativizePath(
            PathUtils.getPathWithoutSchemeAndAuthority(sourceFile.getPath()),
            PathUtils.getPathWithoutSchemeAndAuthority(sourceFsEndPoint.getDatasetPath()));
        // Location of this file underneath the target dataset root.
        Path targetPath = new Path(targetFsEndPoint.getDatasetPath(), relativePath);

        if (ReplicaHadoopFsEndPoint.WATERMARK_FILE.equals(relativePath.toString())) {
            sawWatermarkFile = true;
        }

        // Take the entry out of the index right away: this path is now accounted for.
        FileStatus existingTarget = unmatchedTargetFiles.remove(targetPath);

        boolean targetAlreadyCurrent = existingTarget != null
            && existingTarget.getLen() == sourceFile.getLen()
            && existingTarget.getModificationTime() > sourceFile.getModificationTime();
        if (targetAlreadyCurrent) {
            log.debug("Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}", sourceFile.getPath(), this.rc.getMetaData());
            continue;
        }

        // An existing target file that is about to be replaced has to be removed first.
        if (existingTarget != null) {
            pathsToDelete.add(targetPath);
        }

        Path qualifiedTargetPath = destinationFs.makeQualified(targetPath);
        CopyableFile fileToCopy = CopyableFile
            .fromOriginAndDestination(sourceFs, sourceFile, qualifiedTargetPath, copyConfiguration)
            .fileSet(PathUtils.getPathWithoutSchemeAndAuthority(targetFsEndPoint.getDatasetPath()).toString())
            .build();
        fileToCopy.setFsDatasets(sourceFs, destinationFs);
        copyEntities.add(fileToCopy);
    }

    // Optionally drop whatever is left on the target without a source counterpart.
    if (purgeUnmatchedTargets) {
        pathsToDelete.addAll(unmatchedTargetFiles.keySet());
    }

    // Stale target files are removed through a pre-publish step.
    if (!pathsToDelete.isEmpty()) {
        DeleteFileCommitStep deleteStep = DeleteFileCommitStep.fromPaths(destinationFs, pathsToDelete, this.props);
        copyEntities.add(new PrePublishStep(targetFsEndPoint.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteStep, 0));
    }

    // Generate the watermark file even when watermark checking is disabled,
    // so that the check works as soon as it is switched on.
    if (!sawWatermarkFile && sourceFsEndPoint.getWatermark().isPresent()) {
        WatermarkMetadataGenerationCommitStep watermarkStep = new WatermarkMetadataGenerationCommitStep(targetFsEndPoint.getFsURI().toString(), targetFsEndPoint.getDatasetPath(), sourceFsEndPoint.getWatermark().get());
        copyEntities.add(new PostPublishStep(targetFsEndPoint.getDatasetPath().toString(), Maps.<String, String>newHashMap(), watermarkStep, 1));
    }

    return copyEntities;
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) Configuration(org.apache.hadoop.conf.Configuration) CopyConfiguration(org.apache.gobblin.data.management.copy.CopyConfiguration) CopyEntity(org.apache.gobblin.data.management.copy.CopyEntity) PostPublishStep(org.apache.gobblin.data.management.copy.entities.PostPublishStep) DeleteFileCommitStep(org.apache.gobblin.util.commit.DeleteFileCommitStep) FileSystem(org.apache.hadoop.fs.FileSystem) CopyableFile(org.apache.gobblin.data.management.copy.CopyableFile) PrePublishStep(org.apache.gobblin.data.management.copy.entities.PrePublishStep)

Aggregations

PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep)6 DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep)5 Path (org.apache.hadoop.fs.Path)5 CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity)4 CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile)3 PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)3 ToString (lombok.ToString)2 Test (org.testng.annotations.Test)2 VisibleForTesting (com.google.common.annotations.VisibleForTesting)1 Closer (com.google.common.io.Closer)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 List (java.util.List)1 Properties (java.util.Properties)1 State (org.apache.gobblin.configuration.State)1 CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration)1 HiveRegProps (org.apache.gobblin.hive.HiveRegProps)1 HiveRegisterStep (org.apache.gobblin.hive.HiveRegisterStep)1 PartitionDeregisterStep (org.apache.gobblin.hive.PartitionDeregisterStep)1 TableDeregisterStep (org.apache.gobblin.hive.TableDeregisterStep)1