use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.
the class TestCopyCommitter method testDeleteMissingFlatInterleavedFiles.
@Test
public void testDeleteMissingFlatInterleavedFiles() {
TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
Configuration conf = jobContext.getConfiguration();
String sourceBase;
String targetBase;
FileSystem fs = null;
try {
OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
fs = FileSystem.get(conf);
sourceBase = "/tmp1/" + String.valueOf(rand.nextLong());
targetBase = "/tmp1/" + String.valueOf(rand.nextLong());
TestDistCpUtils.createFile(fs, sourceBase + "/1");
TestDistCpUtils.createFile(fs, sourceBase + "/3");
TestDistCpUtils.createFile(fs, sourceBase + "/4");
TestDistCpUtils.createFile(fs, sourceBase + "/5");
TestDistCpUtils.createFile(fs, sourceBase + "/7");
TestDistCpUtils.createFile(fs, sourceBase + "/8");
TestDistCpUtils.createFile(fs, sourceBase + "/9");
TestDistCpUtils.createFile(fs, targetBase + "/2");
TestDistCpUtils.createFile(fs, targetBase + "/4");
TestDistCpUtils.createFile(fs, targetBase + "/5");
TestDistCpUtils.createFile(fs, targetBase + "/7");
TestDistCpUtils.createFile(fs, targetBase + "/9");
TestDistCpUtils.createFile(fs, targetBase + "/A");
DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
options.setSyncFolder(true);
options.setDeleteMissing(true);
options.appendToConf(conf);
CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
listing.buildListing(listingFile, options);
conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
committer.commitJob(jobContext);
if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
Assert.fail("Source and target folders are not in sync");
}
Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
//Test for idempotent commit
committer.commitJob(jobContext);
if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
Assert.fail("Source and target folders are not in sync");
}
Assert.assertEquals(fs.listStatus(new Path(targetBase)).length, 4);
} catch (IOException e) {
LOG.error("Exception encountered while testing for delete missing", e);
Assert.fail("Delete missing failure");
} finally {
TestDistCpUtils.delete(fs, "/tmp1");
conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
}
}
use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.
the class CopyCommitter method deleteMissing.
// This method deletes "extra" files from the target, if they're not
// available at the source.
private void deleteMissing(Configuration conf) throws IOException {
LOG.info("-delete option is enabled. About to remove entries from " + "target that are missing in source");
// Sort the source-file listing alphabetically.
Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
FileSystem clusterFS = sourceListing.getFileSystem(conf);
Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
// Similarly, create the listing of target-files. Sort alphabetically.
Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
List<Path> targets = new ArrayList<Path>(1);
Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
targets.add(targetFinalPath);
Path resultNonePath = Path.getPathWithoutSchemeAndAuthority(targetFinalPath).toString().startsWith(DistCpConstants.HDFS_RESERVED_RAW_DIRECTORY_NAME) ? DistCpConstants.RAW_NONE_PATH : DistCpConstants.NONE_PATH;
DistCpOptions options = new DistCpOptions(targets, resultNonePath);
//
// Set up options to be the same from the CopyListing.buildListing's perspective,
// so to collect similar listings as when doing the copy
//
options.setOverwrite(overwrite);
options.setSyncFolder(syncFolder);
options.setTargetPathExists(targetPathExists);
target.buildListing(targetListing, options);
Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedSourceListing));
SequenceFile.Reader targetReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedTargetListing));
// Walk both source and target file listings.
// Delete all from target that doesn't also exist on source.
long deletedEntries = 0;
try {
CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
Text srcRelPath = new Text();
CopyListingFileStatus trgtFileStatus = new CopyListingFileStatus();
Text trgtRelPath = new Text();
FileSystem targetFS = targetFinalPath.getFileSystem(conf);
boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
while (targetReader.next(trgtRelPath, trgtFileStatus)) {
// Skip sources that don't exist on target.
while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
}
if (srcAvailable && trgtRelPath.equals(srcRelPath))
continue;
// Target doesn't exist at source. Delete.
boolean result = targetFS.delete(trgtFileStatus.getPath(), true) || !targetFS.exists(trgtFileStatus.getPath());
if (result) {
LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
deletedEntries++;
} else {
throw new IOException("Unable to delete " + trgtFileStatus.getPath());
}
taskAttemptContext.progress();
taskAttemptContext.setStatus("Deleting missing files from target. [" + targetReader.getPosition() * 100 / totalLen + "%]");
}
} finally {
IOUtils.closeStream(sourceReader);
IOUtils.closeStream(targetReader);
}
LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
}
use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.
the class TestCopyCommitter method testPreserveStatus.
@Test
public void testPreserveStatus() {
TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
Configuration conf = jobContext.getConfiguration();
String sourceBase;
String targetBase;
FileSystem fs = null;
try {
OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
fs = FileSystem.get(conf);
FsPermission sourcePerm = new FsPermission((short) 511);
FsPermission initialPerm = new FsPermission((short) 448);
sourceBase = TestDistCpUtils.createTestSetup(fs, sourcePerm);
targetBase = TestDistCpUtils.createTestSetup(fs, initialPerm);
DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
options.preserve(FileAttribute.PERMISSION);
options.appendToConf(conf);
options.setTargetPathExists(false);
CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
listing.buildListing(listingFile, options);
conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
committer.commitJob(jobContext);
if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
Assert.fail("Permission don't match");
}
//Test for idempotent commit
committer.commitJob(jobContext);
if (!checkDirectoryPermissions(fs, targetBase, sourcePerm)) {
Assert.fail("Permission don't match");
}
} catch (IOException e) {
LOG.error("Exception encountered while testing for preserve status", e);
Assert.fail("Preserve status failure");
} finally {
TestDistCpUtils.delete(fs, "/tmp1");
conf.unset(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
}
}
use of org.apache.hadoop.tools.GlobbedCopyListing in project hadoop by apache.
the class TestCopyCommitter method testDeleteMissing.
@Test
public void testDeleteMissing() {
TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
Configuration conf = jobContext.getConfiguration();
String sourceBase;
String targetBase;
FileSystem fs = null;
try {
OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
fs = FileSystem.get(conf);
sourceBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
targetBase = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
String targetBaseAdd = TestDistCpUtils.createTestSetup(fs, FsPermission.getDefault());
fs.rename(new Path(targetBaseAdd), new Path(targetBase));
DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
options.setSyncFolder(true);
options.setDeleteMissing(true);
options.appendToConf(conf);
CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
listing.buildListing(listingFile, options);
conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
committer.commitJob(jobContext);
if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
Assert.fail("Source and target folders are not in sync");
}
if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
Assert.fail("Source and target folders are not in sync");
}
//Test for idempotent commit
committer.commitJob(jobContext);
if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
Assert.fail("Source and target folders are not in sync");
}
if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, sourceBase, targetBase)) {
Assert.fail("Source and target folders are not in sync");
}
} catch (Throwable e) {
LOG.error("Exception encountered while testing for delete missing", e);
Assert.fail("Delete missing failure");
} finally {
TestDistCpUtils.delete(fs, "/tmp1");
conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
}
}
Aggregations