
Example 6 with DeleteFileCommitStep

Use of org.apache.gobblin.util.commit.DeleteFileCommitStep in project incubator-gobblin by apache.

From the class RecursiveCopyableDatasetTest, method testCopyWithDeleteTargetAndDeleteParentDirectories.

@Test
public void testCopyWithDeleteTargetAndDeleteParentDirectories() throws Exception {
    Path source = new Path("/source");
    Path target = new Path("/target");
    List<FileStatus> sourceFiles = Lists.newArrayList(createFileStatus(source, "file1"));
    List<FileStatus> targetFiles = Lists.newArrayList(createFileStatus(target, "file3"));
    Properties properties = new Properties();
    properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, target.toString());
    properties.setProperty(RecursiveCopyableDataset.DELETE_EMPTY_DIRECTORIES_KEY, "true");
    properties.setProperty(RecursiveCopyableDataset.DELETE_KEY, "true");
    RecursiveCopyableDataset dataset = new TestRecursiveCopyableDataset(source, target, sourceFiles, targetFiles, properties);
    Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(FileSystem.getLocal(new Configuration()), CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).build());
    Assert.assertEquals(copyableFiles.size(), 2);
    ClassifiedFiles classifiedFiles = classifyFiles(copyableFiles);
    Assert.assertTrue(classifiedFiles.getPathsToCopy().containsKey(new Path(source, "file1")));
    Assert.assertEquals(classifiedFiles.getPathsToCopy().get(new Path(source, "file1")), new Path(target, "file1"));
    Assert.assertEquals(classifiedFiles.getPathsToDelete().size(), 1);
    Assert.assertTrue(classifiedFiles.getPathsToDelete().contains(new Path(target, "file3")));
    CommitStepCopyEntity entity = (CommitStepCopyEntity) Iterables.filter(copyableFiles, new Predicate<CopyEntity>() {

        @Override
        public boolean apply(@Nullable CopyEntity copyEntity) {
            return copyEntity instanceof CommitStepCopyEntity;
        }
    }).iterator().next();
    DeleteFileCommitStep step = (DeleteFileCommitStep) entity.getStep();
    // DELETE_EMPTY_DIRECTORIES_KEY=true means the step prunes emptied parent directories, bounded by the target root.
    Assert.assertTrue(step.getParentDeletionLimit().isPresent());
    Assert.assertEquals(step.getParentDeletionLimit().get(), target);
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity), DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep), Properties (java.util.Properties), Predicate (avro.shaded.com.google.common.base.Predicate), Nullable (javax.annotation.Nullable), Test (org.testng.annotations.Test)
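
For context, the parent deletion limit asserted above is supplied through the optional fourth argument of DeleteFileCommitStep.fromPaths, the same overload Example 8 below uses. A minimal sketch, assuming a local FileSystem; the paths and class name are illustrative, not from the original test:

import java.util.Properties;

import org.apache.gobblin.util.commit.DeleteFileCommitStep;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.Lists;

public class ParentDeletionLimitSketch {

    public static DeleteFileCommitStep boundedDeletion() throws Exception {
        FileSystem fs = FileSystem.getLocal(new Configuration());
        Path target = new Path("/target");
        // Delete /target/file3 and prune any directories left empty,
        // but never walk above /target (the parent deletion limit).
        return DeleteFileCommitStep.fromPaths(fs, Lists.newArrayList(new Path(target, "file3")), new Properties(), target);
    }
}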

Example 7 with DeleteFileCommitStep

Use of org.apache.gobblin.util.commit.DeleteFileCommitStep in project incubator-gobblin by apache.

From the class RecursiveCopyableDatasetTest, method testCopyWithDeleteTarget.

@Test
public void testCopyWithDeleteTarget() throws Exception {
    Path source = new Path("/source");
    Path target = new Path("/target");
    List<FileStatus> sourceFiles = Lists.newArrayList(createFileStatus(source, "file1"));
    List<FileStatus> targetFiles = Lists.newArrayList(createFileStatus(target, "file3"));
    Properties properties = new Properties();
    properties.setProperty(ConfigurationKeys.DATA_PUBLISHER_FINAL_DIR, target.toString());
    properties.setProperty(RecursiveCopyableDataset.DELETE_KEY, "true");
    RecursiveCopyableDataset dataset = new TestRecursiveCopyableDataset(source, target, sourceFiles, targetFiles, properties);
    Collection<? extends CopyEntity> copyableFiles = dataset.getCopyableFiles(FileSystem.getLocal(new Configuration()), CopyConfiguration.builder(FileSystem.getLocal(new Configuration()), properties).build());
    Assert.assertEquals(copyableFiles.size(), 2);
    ClassifiedFiles classifiedFiles = classifyFiles(copyableFiles);
    Assert.assertTrue(classifiedFiles.getPathsToCopy().containsKey(new Path(source, "file1")));
    Assert.assertEquals(classifiedFiles.getPathsToCopy().get(new Path(source, "file1")), new Path(target, "file1"));
    Assert.assertEquals(classifiedFiles.getPathsToDelete().size(), 1);
    Assert.assertTrue(classifiedFiles.getPathsToDelete().contains(new Path(target, "file3")));
    CommitStepCopyEntity entity = (CommitStepCopyEntity) Iterables.filter(copyableFiles, new Predicate<CopyEntity>() {

        @Override
        public boolean apply(@Nullable CopyEntity copyEntity) {
            return copyEntity instanceof CommitStepCopyEntity;
        }
    }).iterator().next();
    DeleteFileCommitStep step = (DeleteFileCommitStep) entity.getStep();
    // Without DELETE_EMPTY_DIRECTORIES_KEY, no parent deletion limit is set and emptied directories are left in place.
    Assert.assertFalse(step.getParentDeletionLimit().isPresent());
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity), DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep), Properties (java.util.Properties), Predicate (avro.shaded.com.google.common.base.Predicate), Nullable (javax.annotation.Nullable), Test (org.testng.annotations.Test)
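
Whether or not a parent deletion limit is present, the step is only a description of work; it runs later through the CommitStep contract. A hedged sketch of a driver loop, assuming org.apache.gobblin.commit.CommitStep exposes isCompleted() and execute() (the class name below is hypothetical):

import java.io.IOException;

import org.apache.gobblin.commit.CommitStep;

public class CommitStepDriverSketch {

    // Run a step only if it has not already completed; this idempotence check
    // is what makes re-running a partially failed commit safe.
    public static void runOnce(CommitStep step) throws IOException {
        if (!step.isCompleted()) {
            step.execute();
        }
    }
}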

Example 8 with DeleteFileCommitStep

Use of org.apache.gobblin.util.commit.DeleteFileCommitStep in project incubator-gobblin by apache.

From the class HiveCopyEntityHelper, method addTableDeregisterSteps.

@VisibleForTesting
protected int addTableDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table) throws IOException {
    int stepPriority = initialPriority;
    Collection<Path> tablePaths = Lists.newArrayList();
    switch(this.getDeleteMethod()) {
        case RECURSIVE:
            tablePaths = Lists.newArrayList(table.getDataLocation());
            break;
        case INPUT_FORMAT:
            InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(table.getSd());
            HiveLocationDescriptor targetLocation = new HiveLocationDescriptor(table.getDataLocation(), inputFormat, this.getTargetFs(), this.getDataset().getProperties());
            tablePaths = targetLocation.getPaths().keySet();
            break;
        case NO_DELETE:
            tablePaths = Lists.newArrayList();
            break;
        default:
            tablePaths = Lists.newArrayList();
    }
    if (!tablePaths.isEmpty()) {
        DeleteFileCommitStep deletePaths = DeleteFileCommitStep.fromPaths(this.getTargetFs(), tablePaths, this.getDataset().getProperties(), table.getDataLocation());
        copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deletePaths, stepPriority++));
    }
    TableDeregisterStep deregister = new TableDeregisterStep(table.getTTable(), this.getTargetURI(), this.getHiveRegProps());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String>newHashMap(), deregister, stepPriority++));
    return stepPriority;
}
Also used: Path (org.apache.hadoop.fs.Path), PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep), DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep), ToString (lombok.ToString), TableDeregisterStep (org.apache.gobblin.hive.TableDeregisterStep), VisibleForTesting (com.google.common.annotations.VisibleForTesting)
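
The detail to note above is the priority counter: the deletion step is registered before the deregister step, and each PostPublishStep takes stepPriority++ so later steps sort after earlier ones. A minimal sketch of the same wiring, with a hypothetical fileSet name and table path:

import java.util.List;
import java.util.Properties;

import org.apache.gobblin.data.management.copy.CopyEntity;
import org.apache.gobblin.data.management.copy.entities.PostPublishStep;
import org.apache.gobblin.util.commit.DeleteFileCommitStep;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

public class PostPublishDeletionSketch {

    // Appends a post-publish deletion step and returns the next free priority,
    // so the caller can keep threading the counter through subsequent steps.
    public static int addDeletion(List<CopyEntity> copyEntities, int stepPriority) throws Exception {
        FileSystem targetFs = FileSystem.getLocal(new Configuration());
        DeleteFileCommitStep deletePaths = DeleteFileCommitStep.fromPaths(
            targetFs, Lists.newArrayList(new Path("/warehouse/old_table")), new Properties());
        copyEntities.add(new PostPublishStep("my-fileset", Maps.<String, String>newHashMap(), deletePaths, stepPriority++));
        return stepPriority;
    }
}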

Example 9 with DeleteFileCommitStep

Use of org.apache.gobblin.util.commit.DeleteFileCommitStep in project incubator-gobblin by apache.

From the class ConfigBasedDataset, method getCopyableFiles.

@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration copyConfiguration) throws IOException {
    List<CopyEntity> copyableFiles = Lists.newArrayList();
    EndPoint copyFromRaw = copyRoute.getCopyFrom();
    EndPoint copyToRaw = copyRoute.getCopyTo();
    if (!(copyFromRaw instanceof HadoopFsEndPoint && copyToRaw instanceof HadoopFsEndPoint)) {
        log.warn("Currently only handle the Hadoop Fs EndPoint replication");
        return copyableFiles;
    }
    // For {@link HadoopFsEndPoint}s, set pathfilter and applyFilterToDirectories
    HadoopFsEndPoint copyFrom = (HadoopFsEndPoint) copyFromRaw;
    HadoopFsEndPoint copyTo = (HadoopFsEndPoint) copyToRaw;
    copyFrom.setPathFilter(pathFilter);
    copyFrom.setApplyFilterToDirectories(applyFilterToDirectories);
    copyTo.setPathFilter(pathFilter);
    copyTo.setApplyFilterToDirectories(applyFilterToDirectories);
    if (this.watermarkEnabled) {
        if ((!copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent()) || (copyFromRaw.getWatermark().isPresent() && copyToRaw.getWatermark().isPresent() && copyFromRaw.getWatermark().get().compareTo(copyToRaw.getWatermark().get()) <= 0)) {
            log.info("No need to copy as destination watermark >= source watermark with source watermark {}, for dataset with metadata {}", copyFromRaw.getWatermark().isPresent() ? copyFromRaw.getWatermark().get().toJson() : "N/A", this.rc.getMetaData());
            return copyableFiles;
        }
    }
    Configuration conf = HadoopUtils.newConfiguration();
    FileSystem copyFromFs = FileSystem.get(copyFrom.getFsURI(), conf);
    FileSystem copyToFs = FileSystem.get(copyTo.getFsURI(), conf);
    Collection<FileStatus> allFilesInSource = copyFrom.getFiles();
    Collection<FileStatus> allFilesInTarget = copyTo.getFiles();
    Set<FileStatus> copyFromFileStatuses = Sets.newHashSet(allFilesInSource);
    Map<Path, FileStatus> copyToFileMap = Maps.newHashMap();
    for (FileStatus f : allFilesInTarget) {
        copyToFileMap.put(PathUtils.getPathWithoutSchemeAndAuthority(f.getPath()), f);
    }
    Collection<Path> deletedPaths = Lists.newArrayList();
    boolean watermarkMetadataCopied = false;
    boolean deleteTargetIfNotExistOnSource = rc.isDeleteTargetIfNotExistOnSource();
    for (FileStatus originFileStatus : copyFromFileStatuses) {
        Path relative = PathUtils.relativizePath(PathUtils.getPathWithoutSchemeAndAuthority(originFileStatus.getPath()), PathUtils.getPathWithoutSchemeAndAuthority(copyFrom.getDatasetPath()));
        // construct the new path in the target file system
        Path newPath = new Path(copyTo.getDatasetPath(), relative);
        if (relative.toString().equals(ReplicaHadoopFsEndPoint.WATERMARK_FILE)) {
            watermarkMetadataCopied = true;
        }
        // skip copying files that are already identical in the target
        if (copyToFileMap.containsKey(newPath) && copyToFileMap.get(newPath).getLen() == originFileStatus.getLen() && copyToFileMap.get(newPath).getModificationTime() > originFileStatus.getModificationTime()) {
            log.debug("Copy from timestamp older than copy to timestamp, skipped copy {} for dataset with metadata {}", originFileStatus.getPath(), this.rc.getMetaData());
        } else {
            // need to remove those files in the target File System
            if (copyToFileMap.containsKey(newPath)) {
                deletedPaths.add(newPath);
            }
            CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(copyFromFs, originFileStatus, copyToFs.makeQualified(newPath), copyConfiguration).fileSet(PathUtils.getPathWithoutSchemeAndAuthority(copyTo.getDatasetPath()).toString()).build();
            copyableFile.setFsDatasets(copyFromFs, copyToFs);
            copyableFiles.add(copyableFile);
        }
        // clean up already checked paths
        copyToFileMap.remove(newPath);
    }
    // delete the paths on target directory if NOT exists on source
    if (deleteTargetIfNotExistOnSource) {
        deletedPaths.addAll(copyToFileMap.keySet());
    }
    // delete old files first
    if (!deletedPaths.isEmpty()) {
        DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(copyToFs, deletedPaths, this.props);
        copyableFiles.add(new PrePublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), deleteCommitStep, 0));
    }
    // Generate the watermark file even if watermark checking is disabled, so that it can become functional once desired.
    if ((!watermarkMetadataCopied) && copyFrom.getWatermark().isPresent()) {
        copyableFiles.add(new PostPublishStep(copyTo.getDatasetPath().toString(), Maps.<String, String>newHashMap(), new WatermarkMetadataGenerationCommitStep(copyTo.getFsURI().toString(), copyTo.getDatasetPath(), copyFrom.getWatermark().get()), 1));
    }
    return copyableFiles;
}
Also used: Path (org.apache.hadoop.fs.Path), FileStatus (org.apache.hadoop.fs.FileStatus), Configuration (org.apache.hadoop.conf.Configuration), CopyConfiguration (org.apache.gobblin.data.management.copy.CopyConfiguration), CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity), PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep), DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep), FileSystem (org.apache.hadoop.fs.FileSystem), CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile), PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep)
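
Note the ordering here: stale replica files are removed by a PrePublishStep at priority 0 so the new files land cleanly, while watermark metadata is written by a PostPublishStep at priority 1, only after publication. A minimal sketch of the pre-publish half, assuming a local FileSystem; the dataset and file paths are illustrative:

import java.util.Properties;

import org.apache.gobblin.data.management.copy.entities.PrePublishStep;
import org.apache.gobblin.util.commit.DeleteFileCommitStep;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.collect.Lists;
import com.google.common.collect.Maps;

public class PrePublishCleanupSketch {

    public static PrePublishStep staleFileCleanup() throws Exception {
        FileSystem copyToFs = FileSystem.getLocal(new Configuration());
        // Delete the outdated replica file before any new files are published.
        DeleteFileCommitStep deleteCommitStep = DeleteFileCommitStep.fromPaths(
            copyToFs, Lists.newArrayList(new Path("/replica/dataset/stale-file")), new Properties());
        return new PrePublishStep("/replica/dataset", Maps.<String, String>newHashMap(), deleteCommitStep, 0);
    }
}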

Example 10 with DeleteFileCommitStep

Use of org.apache.gobblin.util.commit.DeleteFileCommitStep in project incubator-gobblin by apache.

From the class RecursiveCopyableDataset, method getCopyableFiles.

@Override
public Collection<? extends CopyEntity> getCopyableFiles(FileSystem targetFs, CopyConfiguration configuration) throws IOException {
    Path nonGlobSearchPath = PathUtils.deepestNonGlobPath(this.glob);
    Path targetPath = new Path(configuration.getPublishDir(), PathUtils.relativizePath(this.rootPath, nonGlobSearchPath));
    Map<Path, FileStatus> filesInSource = createPathMap(getFilesAtPath(this.fs, this.rootPath, this.pathFilter), this.rootPath);
    Map<Path, FileStatus> filesInTarget = createPathMap(getFilesAtPath(targetFs, targetPath, this.pathFilter), targetPath);
    List<Path> toCopy = Lists.newArrayList();
    Map<Path, FileStatus> toDelete = Maps.newHashMap();
    boolean requiresUpdate = false;
    for (Map.Entry<Path, FileStatus> entry : filesInSource.entrySet()) {
        FileStatus statusInTarget = filesInTarget.remove(entry.getKey());
        if (statusInTarget != null) {
            // in both
            if (!sameFile(filesInSource.get(entry.getKey()), statusInTarget)) {
                toCopy.add(entry.getKey());
                toDelete.put(entry.getKey(), statusInTarget);
                requiresUpdate = true;
            }
        } else {
            toCopy.add(entry.getKey());
        }
    }
    if (!this.update && requiresUpdate) {
        throw new IOException("Some files need to be copied but they already exist in the destination. " + "Aborting because not running in update mode.");
    }
    if (this.delete) {
        toDelete.putAll(filesInTarget);
    }
    List<CopyEntity> copyEntities = Lists.newArrayList();
    List<CopyableFile> copyableFiles = Lists.newArrayList();
    for (Path path : toCopy) {
        FileStatus file = filesInSource.get(path);
        Path filePathRelativeToSearchPath = PathUtils.relativizePath(file.getPath(), nonGlobSearchPath);
        Path thisTargetPath = new Path(configuration.getPublishDir(), filePathRelativeToSearchPath);
        CopyableFile copyableFile = CopyableFile.fromOriginAndDestination(this.fs, file, thisTargetPath, configuration).fileSet(datasetURN()).datasetOutputPath(thisTargetPath.toString()).ancestorsOwnerAndPermission(CopyableFile.resolveReplicatedOwnerAndPermissionsRecursively(this.fs, file.getPath().getParent(), nonGlobSearchPath, configuration)).build();
        copyableFile.setFsDatasets(this.fs, targetFs);
        copyableFiles.add(copyableFile);
    }
    copyEntities.addAll(this.copyableFileFilter.filter(this.fs, targetFs, copyableFiles));
    if (!toDelete.isEmpty()) {
        CommitStep step = new DeleteFileCommitStep(targetFs, toDelete.values(), this.properties, this.deleteEmptyDirectories ? Optional.of(targetPath) : Optional.<Path>absent());
        copyEntities.add(new PrePublishStep(datasetURN(), Maps.<String, String>newHashMap(), step, 1));
    }
    return copyEntities;
}
Also used: Path (org.apache.hadoop.fs.Path), DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep), CommitStep (org.apache.gobblin.commit.CommitStep), FileStatus (org.apache.hadoop.fs.FileStatus), IOException (java.io.IOException), PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep), Map (java.util.Map)
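
Unlike the earlier examples, this one invokes the DeleteFileCommitStep constructor directly, passing a Guava Optional to toggle empty-directory pruning. A minimal sketch mirroring that call; it assumes the stale file already exists locally so getFileStatus succeeds, and the class name is hypothetical:

import java.util.Properties;

import org.apache.gobblin.commit.CommitStep;
import org.apache.gobblin.util.commit.DeleteFileCommitStep;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

import com.google.common.base.Optional;
import com.google.common.collect.Lists;

public class DeleteStepConstructorSketch {

    public static CommitStep deletionStep(boolean deleteEmptyDirectories) throws Exception {
        FileSystem targetFs = FileSystem.getLocal(new Configuration());
        Path targetPath = new Path("/target");
        // Assumes /target/file3 exists; the constructor takes FileStatus values, not bare paths.
        FileStatus stale = targetFs.getFileStatus(new Path(targetPath, "file3"));
        // Optional.of(targetPath) bounds empty-directory pruning; absent() disables it.
        return new DeleteFileCommitStep(targetFs, Lists.newArrayList(stale), new Properties(),
            deleteEmptyDirectories ? Optional.of(targetPath) : Optional.<Path>absent());
    }
}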

Aggregations

DeleteFileCommitStep (org.apache.gobblin.util.commit.DeleteFileCommitStep): 10
Path (org.apache.hadoop.fs.Path): 9
PostPublishStep (org.apache.gobblin.data.management.copy.entities.PostPublishStep): 5
PrePublishStep (org.apache.gobblin.data.management.copy.entities.PrePublishStep): 5
FileStatus (org.apache.hadoop.fs.FileStatus): 5
CopyEntity (org.apache.gobblin.data.management.copy.CopyEntity): 4
CopyableFile (org.apache.gobblin.data.management.copy.CopyableFile): 4
CommitStepCopyEntity (org.apache.gobblin.data.management.copy.entities.CommitStepCopyEntity): 3
Configuration (org.apache.hadoop.conf.Configuration): 3
Test (org.testng.annotations.Test): 3
Predicate (avro.shaded.com.google.common.base.Predicate): 2
IOException (java.io.IOException): 2
Properties (java.util.Properties): 2
Nullable (javax.annotation.Nullable): 2
ToString (lombok.ToString): 2
CommitStep (org.apache.gobblin.commit.CommitStep): 2
MultiTimingEvent (org.apache.gobblin.metrics.event.MultiTimingEvent): 2
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 1
Closer (com.google.common.io.Closer): 1
SuppressFBWarnings (edu.umd.cs.findbugs.annotations.SuppressFBWarnings): 1