Search in sources :

Example 1 with GlobbedCopyListing

use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.

the class TestCopyCommitter method testDeleteMissingFlatInterleavedFiles.

@Test
public void testDeleteMissingFlatInterleavedFiles() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();
    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
        OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
        fs = FileSystem.get(conf);
        sourceBase = "/tmp1/" + String.valueOf(rand.nextLong());
        targetBase = "/tmp1/" + String.valueOf(rand.nextLong());
        TestDistCpUtils.createFile(fs, sourceBase + "/1");
        TestDistCpUtils.createFile(fs, sourceBase + "/3");
        TestDistCpUtils.createFile(fs, sourceBase + "/4");
        TestDistCpUtils.createFile(fs, sourceBase + "/5");
        TestDistCpUtils.createFile(fs, sourceBase + "/7");
        TestDistCpUtils.createFile(fs, sourceBase + "/8");
        TestDistCpUtils.createFile(fs, sourceBase + "/9");
        TestDistCpUtils.createFile(fs, targetBase + "/2");
        TestDistCpUtils.createFile(fs, targetBase + "/4");
        TestDistCpUtils.createFile(fs, targetBase + "/5");
        TestDistCpUtils.createFile(fs, targetBase + "/7");
        TestDistCpUtils.createFile(fs, targetBase + "/9");
        TestDistCpUtils.createFile(fs, targetBase + "/A");
        DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
        options.setSyncFolder(true);
        options.setDeleteMissing(true);
        options.appendToConf(conf);
        CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
        //Test for idempotent commit
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
    } catch (IOException e) {
        LOG.error("Exception encountered while testing for delete missing", e);
        Assert.fail("Delete missing failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
        conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) CopyListing(org.apache.hadoop.tools.CopyListing) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) FileSystem(org.apache.hadoop.fs.FileSystem) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing)

Example 2 with GlobbedCopyListing

use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.

the class CopyCommitter method deleteMissing.

// This method deletes "extra" files from the target, if they're not
// available at the source.
private void deleteMissing(Configuration conf) throws IOException {
    LOG.info("-delete option is enabled. About to remove entries from " + "target that are missing in source");
    // Sort the source-file listing alphabetically.
    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
    FileSystem clusterFS = sourceListing.getFileSystem(conf);
    Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
    // Similarly, create the listing of target-files. Sort alphabetically.
    Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
    CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
    List<Path> targets = new ArrayList<Path>(1);
    Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
    targets.add(targetFinalPath);
    Path resultNonePath = Path.getPathWithoutSchemeAndAuthority(targetFinalPath).toString().startsWith(DistCpConstants.HDFS_RESERVED_RAW_DIRECTORY_NAME) ? DistCpConstants.RAW_NONE_PATH : DistCpConstants.NONE_PATH;
    DistCpOptions options = new DistCpOptions(targets, resultNonePath);
    //
    // Set up options to be the same from the CopyListing.buildListing's perspective,
    // so to collect similar listings as when doing the copy
    //
    options.setOverwrite(overwrite);
    options.setSyncFolder(syncFolder);
    options.setTargetPathExists(targetPathExists);
    target.buildListing(targetListing, options);
    Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
    long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedSourceListing));
    SequenceFile.Reader targetReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedTargetListing));
    // Walk both source and target file listings.
    // Delete all from target that doesn't also exist on source.
    long deletedEntries = 0;
    try {
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        CopyListingFileStatus trgtFileStatus = new CopyListingFileStatus();
        Text trgtRelPath = new Text();
        FileSystem targetFS = targetFinalPath.getFileSystem(conf);
        boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
        while (targetReader.next(trgtRelPath, trgtFileStatus)) {
            // Skip sources that don't exist on target.
            while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
                srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
            }
            if (srcAvailable && trgtRelPath.equals(srcRelPath))
                continue;
            // Target doesn't exist at source. Delete.
            boolean result = targetFS.delete(trgtFileStatus.getPath(), true) || !targetFS.exists(trgtFileStatus.getPath());
            if (result) {
                LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
                deletedEntries++;
            } else {
                throw new IOException("Unable to delete " + trgtFileStatus.getPath());
            }
            taskAttemptContext.progress();
            taskAttemptContext.setStatus("Deleting missing files from target. [" + targetReader.getPosition() * 100 / totalLen + "%]");
        }
    } finally {
        IOUtils.closeStream(sourceReader);
        IOUtils.closeStream(targetReader);
    }
    LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) CopyListing(org.apache.hadoop.tools.CopyListing) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) SequenceFile(org.apache.hadoop.io.SequenceFile) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing)

Example 3 with GlobbedCopyListing

use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.

the class TestCopyCommitter method testPreserveStatus.

@Test
public void testPreserveStatus() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();
    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
        OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
        fs = FileSystem.get(conf);
        FsPermission sourcePerm = new FsPermission((short) 511);
        FsPermission initialPerm = new FsPermission((short) 448);
        sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
        targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);
        DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
        options.preserve(FileAttribute.PERMISSION);
        options.appendToConf(conf);
        options.setTargetPathExists(false);
        CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        committer.commitJob(jobContext);
        if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
            Assert.fail("Permission don't match");
        }
        //Test for idempotent commit
        committer.commitJob(jobContext);
        if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
            Assert.fail("Permission don't match");
        }
    } catch (IOException e) {
        LOG.error("Exception encountered while testing for preserve status", e);
        Assert.fail("Preserve status failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
        conf.unset(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) IOException(java.io.IOException) CopyListing(org.apache.hadoop.tools.CopyListing) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) FileSystem(org.apache.hadoop.fs.FileSystem) FsPermission(org.apache.hadoop.fs.permission.FsPermission) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing)

Example 4 with GlobbedCopyListing

use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.

the class TestCopyCommitter method testDeleteMissing.

@Test
public void testDeleteMissing() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();
    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
        OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
        fs = FileSystem.get(conf);
        sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
        targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
        String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
        fs.rename(new Path(targetBaseAdd), new Path(targetBase));
        DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
        options.setSyncFolder(true);
        options.setDeleteMissing(true);
        options.appendToConf(conf);
        CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        //Test for idempotent commit
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
    } catch (Throwable e) {
        LOG.error("Exception encountered while testing for delete missing", e);
        Assert.fail("Delete missing failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
        conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
}
Also used : Path(org.apache.hadoop.fs.Path) JobContextImpl(org.apache.hadoop.mapreduce.task.JobContextImpl) Configuration(org.apache.hadoop.conf.Configuration) CopyListing(org.apache.hadoop.tools.CopyListing) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) FileSystem(org.apache.hadoop.fs.FileSystem) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing)

Aggregations

Configuration (org.apache.hadoop.conf.Configuration)4 FileSystem (org.apache.hadoop.fs.FileSystem)4 Path (org.apache.hadoop.fs.Path)4 CopyListing (org.apache.hadoop.tools.CopyListing)4 DistCpOptions (org.apache.hadoop.tools.DistCpOptions)4 GlobbedCopyListing (org.apache.hadoop.tools.GlobbedCopyListing)4 IOException (java.io.IOException)3 JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl)3 ArrayList (java.util.ArrayList)1 FsPermission (org.apache.hadoop.fs.permission.FsPermission)1 SequenceFile (org.apache.hadoop.io.SequenceFile)1 Text (org.apache.hadoop.io.Text)1 CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus)1