Search in sources:

Example 1 with DistCpOptions

Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.

From the class TestDynamicInputFormat, method getOptions:

private static DistCpOptions getOptions() throws Exception {
    // Source and target paths on the test mini-cluster's file system.
    Path sourcePath = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/source");
    Path targetPath = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/target");
    List<Path> sourceList = new ArrayList<Path>();
    sourceList.add(sourcePath);
    DistCpOptions options = new DistCpOptions(sourceList, targetPath);
    // Cap the number of map tasks DistCp may launch.
    options.setMaxMaps(NUM_SPLITS);
    return options;
}
Also used: Path (org.apache.hadoop.fs.Path), DistCpOptions (org.apache.hadoop.tools.DistCpOptions), ArrayList (java.util.ArrayList)
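
In a real job, an options object built like this is handed to the DistCp driver. The following is a minimal, hypothetical sketch (not part of the quoted tests); the name-node URI and paths are assumptions for illustration:

import java.util.Collections;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.DistCp;
import org.apache.hadoop.tools.DistCpOptions;

public class DistCpLauncher {
    public static void main(String[] args) throws Exception {
        // Hypothetical URIs; substitute your own cluster and paths.
        DistCpOptions options = new DistCpOptions(
                Collections.singletonList(new Path("hdfs://nn:8020/tmp/source")),
                new Path("hdfs://nn:8020/tmp/target"));
        options.setMaxMaps(4);
        // Builds the copy listing and submits the MapReduce job
        // (blocking until completion by default).
        new DistCp(new Configuration(), options).execute();
    }
}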

Example 2 with DistCpOptions

Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.

From the class TestUniformSizeInputFormat, method testGetSplits:

public void testGetSplits(int nMaps) throws Exception {
    DistCpOptions options = getOptions(nMaps);
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    Path listFile = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testGetSplits_1/fileList.seq");
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(listFile, options);
    JobContext jobContext = new JobContextImpl(configuration, new JobID());
    UniformSizeInputFormat uniformSizeInputFormat = new UniformSizeInputFormat();
    List<InputSplit> splits = uniformSizeInputFormat.getSplits(jobContext);
    // Ideal number of bytes per map; used as the 10% tolerance baseline below.
    int sizePerMap = totalFileSize / nMaps;
    checkSplits(listFile, splits);
    int doubleCheckedTotalSize = 0;
    int previousSplitSize = -1;
    for (int i = 0; i < splits.size(); ++i) {
        InputSplit split = splits.get(i);
        int currentSplitSize = 0;
        // The reader is created with a null context and initialized explicitly below.
        RecordReader<Text, CopyListingFileStatus> recordReader = uniformSizeInputFormat.createRecordReader(split, null);
        StubContext stubContext = new StubContext(jobContext.getConfiguration(), recordReader, 0);
        final TaskAttemptContext taskAttemptContext = stubContext.getContext();
        recordReader.initialize(split, taskAttemptContext);
        while (recordReader.nextKeyValue()) {
            Path sourcePath = recordReader.getCurrentValue().getPath();
            FileSystem fs = sourcePath.getFileSystem(configuration);
            FileStatus[] fileStatus = fs.listStatus(sourcePath);
            // listStatus on a plain file returns a single entry; more than one
            // entry means the path is a directory, which contributes no bytes
            // of its own, so skip it.
            if (fileStatus.length > 1) {
                continue;
            }
            currentSplitSize += fileStatus[0].getLen();
        }
        // Consecutive splits should differ by less than 10% of the ideal
        // per-map size; the last split may absorb the remainder.
        Assert.assertTrue(previousSplitSize == -1
                || Math.abs(currentSplitSize - previousSplitSize) < 0.1 * sizePerMap
                || i == splits.size() - 1);
        doubleCheckedTotalSize += currentSplitSize;
    }
    Assert.assertEquals(totalFileSize, doubleCheckedTotalSize);
}
Also used: Path (org.apache.hadoop.fs.Path), JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl), FileStatus (org.apache.hadoop.fs.FileStatus), CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus), Configuration (org.apache.hadoop.conf.Configuration), Text (org.apache.hadoop.io.Text), DistCpOptions (org.apache.hadoop.tools.DistCpOptions), StubContext (org.apache.hadoop.tools.StubContext), FileSystem (org.apache.hadoop.fs.FileSystem)
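
Note that testGetSplits(int) above is a parameterized helper rather than a test method itself; in the upstream suite it is driven by a JUnit entry point along the following lines (the specific map counts here are an assumption for illustration):

@Test
public void testGetSplits() throws Exception {
    // Exercise the split computation for more than one map count.
    testGetSplits(9);
    testGetSplits(3);
}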

Example 3 with DistCpOptions

Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.

From the class TestUniformSizeInputFormat, method getOptions:

private static DistCpOptions getOptions(int nMaps) throws Exception {
    Path sourcePath = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/source");
    Path targetPath = new Path(cluster.getFileSystem().getUri().toString() + "/tmp/target");
    List<Path> sourceList = new ArrayList<Path>();
    sourceList.add(sourcePath);
    final DistCpOptions distCpOptions = new DistCpOptions(sourceList, targetPath);
    distCpOptions.setMaxMaps(nMaps);
    return distCpOptions;
}
Also used: Path (org.apache.hadoop.fs.Path), DistCpOptions (org.apache.hadoop.tools.DistCpOptions), ArrayList (java.util.ArrayList)

Example 4 with DistCpOptions

Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.

From the class TestDynamicInputFormat, method testGetSplits:

@Test
public void testGetSplits() throws Exception {
    DistCpOptions options = getOptions();
    Configuration configuration = new Configuration();
    configuration.set("mapred.map.tasks", String.valueOf(options.getMaxMaps()));
    CopyListing.getCopyListing(configuration, CREDENTIALS, options).buildListing(new Path(cluster.getFileSystem().getUri().toString() + "/tmp/testDynInputFormat/fileList.seq"), options);
    JobContext jobContext = new JobContextImpl(configuration, new JobID());
    DynamicInputFormat<Text, CopyListingFileStatus> inputFormat = new DynamicInputFormat<Text, CopyListingFileStatus>();
    List<InputSplit> splits = inputFormat.getSplits(jobContext);
    int nFiles = 0;
    int taskId = 0;
    for (InputSplit split : splits) {
        StubContext stubContext = new StubContext(jobContext.getConfiguration(), null, taskId);
        final TaskAttemptContext taskAttemptContext = stubContext.getContext();
        RecordReader<Text, CopyListingFileStatus> recordReader = inputFormat.createRecordReader(split, taskAttemptContext);
        stubContext.setReader(recordReader);
        // Dynamic readers acquire work chunks from a shared pool at run time,
        // so the specific split passed to initialize() carries little weight;
        // the test initializes every reader with the first split.
        recordReader.initialize(splits.get(0), taskAttemptContext);
        float previousProgressValue = 0f;
        while (recordReader.nextKeyValue()) {
            CopyListingFileStatus fileStatus = recordReader.getCurrentValue();
            String source = fileStatus.getPath().toString();
            System.out.println(source);
            Assert.assertTrue(expectedFilePaths.contains(source));
            final float progress = recordReader.getProgress();
            Assert.assertTrue(progress >= previousProgressValue);
            Assert.assertTrue(progress >= 0.0f);
            Assert.assertTrue(progress <= 1.0f);
            previousProgressValue = progress;
            ++nFiles;
        }
        // Once the reader is exhausted, progress must be exactly 1.0.
        Assert.assertEquals(1.0f, recordReader.getProgress(), 0.0f);
        ++taskId;
    }
    Assert.assertEquals(expectedFilePaths.size(), nFiles);
}
Also used: Path (org.apache.hadoop.fs.Path), JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl), Configuration (org.apache.hadoop.conf.Configuration), Text (org.apache.hadoop.io.Text), DistCpOptions (org.apache.hadoop.tools.DistCpOptions), CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus), StubContext (org.apache.hadoop.tools.StubContext), Test (org.junit.Test)
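
Outside of tests, DynamicInputFormat is normally selected through DistCp's copy strategy rather than instantiated directly; "dynamic" is the documented -strategy value that maps to it, with "uniformsize" (UniformSizeInputFormat) as the default. A minimal sketch, with hypothetical paths:

DistCpOptions options = new DistCpOptions(
        Collections.singletonList(new Path("hdfs://nn:8020/tmp/source")),
        new Path("hdfs://nn:8020/tmp/target"));
// Equivalent to passing "-strategy dynamic" on the command line.
options.setCopyStrategy("dynamic");
new DistCp(new Configuration(), options).execute();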

Example 5 with DistCpOptions

Use of org.apache.hadoop.tools.DistCpOptions in project hadoop by apache.

From the class TestCopyCommitter, method testDeleteMissingFlatInterleavedFiles:

@Test
public void testDeleteMissingFlatInterleavedFiles() {
    TaskAttemptContext taskAttemptContext = getTaskAttemptContext(config);
    JobContext jobContext = new JobContextImpl(taskAttemptContext.getConfiguration(), taskAttemptContext.getTaskAttemptID().getJobID());
    Configuration conf = jobContext.getConfiguration();
    String sourceBase;
    String targetBase;
    FileSystem fs = null;
    try {
        OutputCommitter committer = new CopyCommitter(null, taskAttemptContext);
        fs = FileSystem.get(conf);
        sourceBase = "/tmp1/" + String.valueOf(rand.nextLong());
        targetBase = "/tmp1/" + String.valueOf(rand.nextLong());
        TestDistCpUtils.createFile(fs, sourceBase + "/1");
        TestDistCpUtils.createFile(fs, sourceBase + "/3");
        TestDistCpUtils.createFile(fs, sourceBase + "/4");
        TestDistCpUtils.createFile(fs, sourceBase + "/5");
        TestDistCpUtils.createFile(fs, sourceBase + "/7");
        TestDistCpUtils.createFile(fs, sourceBase + "/8");
        TestDistCpUtils.createFile(fs, sourceBase + "/9");
        TestDistCpUtils.createFile(fs, targetBase + "/2");
        TestDistCpUtils.createFile(fs, targetBase + "/4");
        TestDistCpUtils.createFile(fs, targetBase + "/5");
        TestDistCpUtils.createFile(fs, targetBase + "/7");
        TestDistCpUtils.createFile(fs, targetBase + "/9");
        TestDistCpUtils.createFile(fs, targetBase + "/A");
        DistCpOptions options = new DistCpOptions(Arrays.asList(new Path(sourceBase)), new Path("/out"));
        options.setSyncFolder(true);
        options.setDeleteMissing(true);
        options.appendToConf(conf);
        CopyListing listing = new GlobbedCopyListing(conf, CREDENTIALS);
        Path listingFile = new Path("/tmp1/" + String.valueOf(rand.nextLong()));
        listing.buildListing(listingFile, options);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH, targetBase);
        conf.set(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH, targetBase);
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        // Delete-missing removes the target-only entries /2 and /A,
        // leaving exactly /4, /5, /7 and /9.
        Assert.assertEquals(4, fs.listStatus(new Path(targetBase)).length);
        // Test for idempotent commit: a second commitJob() must leave the target unchanged.
        committer.commitJob(jobContext);
        if (!TestDistCpUtils.checkIfFoldersAreInSync(fs, targetBase, sourceBase)) {
            Assert.fail("Source and target folders are not in sync");
        }
        Assert.assertEquals(4, fs.listStatus(new Path(targetBase)).length);
    } catch (IOException e) {
        LOG.error("Exception encountered while testing for delete missing", e);
        Assert.fail("Delete missing failure");
    } finally {
        TestDistCpUtils.delete(fs, "/tmp1");
        conf.set(DistCpConstants.CONF_LABEL_DELETE_MISSING, "false");
    }
}
Also used: Path (org.apache.hadoop.fs.Path), JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl), Configuration (org.apache.hadoop.conf.Configuration), IOException (java.io.IOException), CopyListing (org.apache.hadoop.tools.CopyListing), GlobbedCopyListing (org.apache.hadoop.tools.GlobbedCopyListing), DistCpOptions (org.apache.hadoop.tools.DistCpOptions), FileSystem (org.apache.hadoop.fs.FileSystem)
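
The same two option flags drive delete-missing semantics in a real run; together they are equivalent to the command-line combination "-update -delete" (DistCp requires -update or -overwrite for -delete to be legal). A minimal sketch with hypothetical paths:

DistCpOptions options = new DistCpOptions(
        Collections.singletonList(new Path("hdfs://nn:8020/data/source")),
        new Path("hdfs://nn:8020/data/target"));
options.setSyncFolder(true);    // -update: copy only files that differ
options.setDeleteMissing(true); // -delete: remove target files absent from the source
new DistCp(new Configuration(), options).execute();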

Aggregations

DistCpOptions (org.apache.hadoop.tools.DistCpOptions): 32
Test (org.junit.Test): 22
Path (org.apache.hadoop.fs.Path): 13
Configuration (org.apache.hadoop.conf.Configuration): 10
FileSystem (org.apache.hadoop.fs.FileSystem): 5
JobContextImpl (org.apache.hadoop.mapreduce.task.JobContextImpl): 5
IOException (java.io.IOException): 4
CopyListing (org.apache.hadoop.tools.CopyListing): 4
GlobbedCopyListing (org.apache.hadoop.tools.GlobbedCopyListing): 4
ArrayList (java.util.ArrayList): 3
Text (org.apache.hadoop.io.Text): 3
CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus): 3
DistCp (org.apache.hadoop.tools.DistCp): 2
StubContext (org.apache.hadoop.tools.StubContext): 2
FileNotFoundException (java.io.FileNotFoundException): 1
InvocationTargetException (java.lang.reflect.InvocationTargetException): 1
MalformedURLException (java.net.MalformedURLException): 1
AccessControlException (java.security.AccessControlException): 1
NoSuchAlgorithmException (java.security.NoSuchAlgorithmException): 1
NoSuchElementException (java.util.NoSuchElementException): 1