Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.
The class TestOptionsParser, method testParseNumListstatusThreads.
@Test
public void testParseNumListstatusThreads() {
  DistCpOptions options = OptionsParser.parse(new String[] { "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
  // If the command-line argument isn't set, we expect getNumListstatusThreads
  // to return zero (so that we know when to override conf properties).
  Assert.assertEquals(0, options.getNumListstatusThreads());
  options = OptionsParser.parse(new String[] { "--numListstatusThreads", "12", "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
  Assert.assertEquals(12, options.getNumListstatusThreads());
  options = OptionsParser.parse(new String[] { "--numListstatusThreads", "0", "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
  Assert.assertEquals(0, options.getNumListstatusThreads());
  try {
    OptionsParser.parse(new String[] { "--numListstatusThreads", "hello", "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
    Assert.fail("Non-numeric numListstatusThreads parsed");
  } catch (IllegalArgumentException ignore) {
  }
  // A value above the maximum is clamped to maxNumListstatusThreads.
  options = OptionsParser.parse(new String[] { "--numListstatusThreads", "100", "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
  Assert.assertEquals(DistCpOptions.maxNumListstatusThreads, options.getNumListstatusThreads());
}
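The clamp exercised by the last assertion lives in the DistCpOptions setter. A minimal sketch of that logic, assuming the field mirrors the maxNumListstatusThreads constant referenced in the test (not the verbatim Hadoop source):

// Sketch: values above the maximum are capped, positive values pass
// through, and anything else maps to 0, which the test treats as "unset".
public void setNumListstatusThreads(int numThreads) {
  if (numThreads > maxNumListstatusThreads) {
    this.numListstatusThreads = maxNumListstatusThreads;
  } else if (numThreads > 0) {
    this.numListstatusThreads = numThreads;
  } else {
    this.numListstatusThreads = 0; // 0 means "defer to conf properties"
  }
}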
Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.
The class TestOptionsParser, method testOptionsAppendToConf.
@Test
public void testOptionsAppendToConf() {
  Configuration conf = new Configuration();
  Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false));
  Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false));
  Assert.assertNull(conf.getRaw(DistCpOptionSwitch.BANDWIDTH.getConfigLabel()));
  DistCpOptions options = OptionsParser.parse(new String[] { "-atomic", "-i", "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
  options.appendToConf(conf);
  Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.IGNORE_FAILURES.getConfigLabel(), false));
  Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.ATOMIC_COMMIT.getConfigLabel(), false));
  Assert.assertEquals(-1.0, conf.getFloat(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1), DELTA);
  conf = new Configuration();
  Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false));
  Assert.assertFalse(conf.getBoolean(DistCpOptionSwitch.DELETE_MISSING.getConfigLabel(), false));
  Assert.assertNull(conf.get(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()));
  options = OptionsParser.parse(new String[] { "-update", "-delete", "-pu", "-bandwidth", "11.2", "hdfs://localhost:9820/source/first", "hdfs://localhost:9820/target/" });
  options.appendToConf(conf);
  Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.SYNC_FOLDERS.getConfigLabel(), false));
  Assert.assertTrue(conf.getBoolean(DistCpOptionSwitch.DELETE_MISSING.getConfigLabel(), false));
  Assert.assertEquals("U", conf.get(DistCpOptionSwitch.PRESERVE_STATUS.getConfigLabel()));
  Assert.assertEquals(11.2, conf.getFloat(DistCpOptionSwitch.BANDWIDTH.getConfigLabel(), -1), DELTA);
}
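Conceptually, appendToConf writes each parsed switch into the Configuration under its config label, which is exactly what the assertions above read back. A hedged sketch of that mapping, where appendSwitch is a hypothetical helper name, not the actual Hadoop method:

// Hypothetical helper sketching the mapping appendToConf performs.
private static void appendSwitch(Configuration conf, DistCpOptionSwitch option, String value) {
  // e.g. SYNC_FOLDERS -> "true", PRESERVE_STATUS -> "U", BANDWIDTH -> "11.2"
  conf.set(option.getConfigLabel(), value);
}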
Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.
The class CopyCommitter, method deleteMissing.
// This method deletes "extra" files from the target, i.e. files that are
// not present at the source.
private void deleteMissing(Configuration conf) throws IOException {
  LOG.info("-delete option is enabled. About to remove entries from target that are missing in source");
  // Sort the source-file listing alphabetically.
  Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
  FileSystem clusterFS = sourceListing.getFileSystem(conf);
  Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
  // Similarly, create the listing of target files and sort it alphabetically.
  Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
  CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
  List<Path> targets = new ArrayList<Path>(1);
  Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
  targets.add(targetFinalPath);
  Path resultNonePath = Path.getPathWithoutSchemeAndAuthority(targetFinalPath).toString()
      .startsWith(DistCpConstants.HDFS_RESERVED_RAW_DIRECTORY_NAME)
      ? DistCpConstants.RAW_NONE_PATH
      : DistCpConstants.NONE_PATH;
  DistCpOptions options = new DistCpOptions(targets, resultNonePath);
  //
  // Set up options to be the same from CopyListing.buildListing's perspective,
  // so as to collect the same listings as when doing the copy.
  //
  options.setOverwrite(overwrite);
  options.setSyncFolder(syncFolder);
  options.setTargetPathExists(targetPathExists);
  target.buildListing(targetListing, options);
  Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
  long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
  SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedSourceListing));
  SequenceFile.Reader targetReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedTargetListing));
  // Walk both the source and target file listings in sorted order.
  // Delete everything from the target that doesn't also exist at the source.
  long deletedEntries = 0;
  try {
    CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
    Text srcRelPath = new Text();
    CopyListingFileStatus trgtFileStatus = new CopyListingFileStatus();
    Text trgtRelPath = new Text();
    FileSystem targetFS = targetFinalPath.getFileSystem(conf);
    boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
    while (targetReader.next(trgtRelPath, trgtFileStatus)) {
      // Advance the source listing past entries that sort before the
      // current target entry (sources with no counterpart on the target).
      while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
        srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
      }
      if (srcAvailable && trgtRelPath.equals(srcRelPath))
        continue;
      // The target entry doesn't exist at the source. Delete it.
      boolean result = targetFS.delete(trgtFileStatus.getPath(), true) || !targetFS.exists(trgtFileStatus.getPath());
      if (result) {
        LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
        deletedEntries++;
      } else {
        throw new IOException("Unable to delete " + trgtFileStatus.getPath());
      }
      taskAttemptContext.progress();
      taskAttemptContext.setStatus("Deleting missing files from target. [" + targetReader.getPosition() * 100 / totalLen + "%]");
    }
  } finally {
    IOUtils.closeStream(sourceReader);
    IOUtils.closeStream(targetReader);
  }
  LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
}
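The deletion loop above is a classic merge-walk over two sorted listings. A self-contained sketch of the same idea over in-memory sorted lists (the extrasOnTarget name and list-based types are illustrative, not Hadoop APIs):

import java.util.ArrayList;
import java.util.List;

// Given two alphabetically sorted listings, return the target entries that
// have no counterpart at the source; these are the deletion candidates.
static List<String> extrasOnTarget(List<String> sortedSource, List<String> sortedTarget) {
  List<String> extras = new ArrayList<>();
  int s = 0;
  for (String t : sortedTarget) {
    // Advance the source cursor past entries that sort before the target entry.
    while (s < sortedSource.size() && sortedSource.get(s).compareTo(t) < 0) {
      s++;
    }
    if (s >= sortedSource.size() || !sortedSource.get(s).equals(t)) {
      extras.add(t); // present on target, missing at source
    }
  }
  return extras;
}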
Use of org.apache.hadoop.tools.DistCpOptions in project hive by apache.
The class HdfsUtils, method runDistCp.
public static boolean runDistCp(List<Path> srcPaths, Path dst, Configuration conf) throws IOException {
  DistCpOptions options = new DistCpOptions.Builder(srcPaths, dst)
      .withSyncFolder(true)
      .withCRC(true)
      .preserve(FileAttribute.BLOCKSIZE)
      .build();
  // Create the command-line parameters for DistCp.
  List<String> params = constructDistCpParams(srcPaths, dst, conf);
  try {
    conf.setBoolean("mapred.mapper.new-api", true);
    DistCp distcp = new DistCp(conf, options);
    // added by HADOOP-10459
    return distcp.run(params.toArray(new String[params.size()])) == 0;
  } catch (Exception e) {
    throw new IOException("Cannot execute DistCp process: " + e, e);
  } finally {
    conf.setBoolean("mapred.mapper.new-api", false);
  }
}
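constructDistCpParams is not shown here. A plausible sketch of such a helper, under the assumption that it emits a sync-style flag plus the positional source and destination paths (the flag choice and structure are assumptions, not the actual Hive implementation):

// Hypothetical sketch of the helper referenced above; a real implementation
// might also derive extra DistCp flags from conf.
private static List<String> constructDistCpParams(List<Path> srcPaths, Path dst, Configuration conf) {
  List<String> params = new ArrayList<>();
  params.add("-update"); // assumed default, mirroring withSyncFolder(true) above
  for (Path src : srcPaths) {
    params.add(src.toString()); // one positional argument per source path
  }
  params.add(dst.toString()); // destination is the final positional argument
  return params;
}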