Search in sources :

Example 1 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

the class DynamicInputFormat method splitCopyListingIntoChunksWithShuffle.

/**
 * Splits the copy listing into dynamic chunk files, dealing consecutive
 * listing records round-robin across the set of chunks currently open.
 *
 * <p>The number of entries per chunk is derived from the record count, the
 * number of map tasks and the listing split ratio, and is published to the
 * job configuration under {@code CONF_LABEL_NUM_ENTRIES_PER_CHUNK}. Once the
 * open chunk-set has received {@code nChunksOpenAtOnce * numEntriesPerChunk}
 * records it is closed and a fresh set is created.
 *
 * @param context job context supplying the configuration
 * @return every chunk that was created, in creation order
 * @throws IOException if reading the listing or writing a chunk fails
 */
private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle(JobContext context) throws IOException {
    final Configuration configuration = context.getConfiguration();
    int numRecords = getNumberOfRecords(configuration);
    int numMaps = getNumMapTasks(configuration);
    int maxChunksTolerable = getMaxChunksTolerable(configuration);
    // Number of chunks each map will process, on average.
    int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
    validateNumChunksUsing(splitRatio, numMaps, maxChunksTolerable);
    // Divide in double precision. A float holds only 24 significant bits, so
    // record counts above 2^24 were rounded before the division and the
    // resulting chunks * entries-per-chunk capacity could end up smaller than
    // the number of records in the listing.
    int numEntriesPerChunk = (int) Math.ceil((double) numRecords / (splitRatio * numMaps));
    DistCpUtils.publish(context.getConfiguration(), CONF_LABEL_NUM_ENTRIES_PER_CHUNK, numEntriesPerChunk);
    final int nChunksTotal = (int) Math.ceil((double) numRecords / numEntriesPerChunk);
    int nChunksOpenAtOnce = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
    Path listingPath = getListingFilePath(configuration);
    SequenceFile.Reader reader = new SequenceFile.Reader(configuration, SequenceFile.Reader.file(listingPath));
    List<DynamicInputChunk> openChunks = new ArrayList<DynamicInputChunk>();
    List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
    CopyListingFileStatus fileStatus = new CopyListingFileStatus();
    Text relPath = new Text();
    // Records written into the current chunk-set; reset whenever a new set is opened.
    int recordCounter = 0;
    // Chunks created so far, across all chunk-sets.
    int chunkCount = 0;
    try {
        while (reader.next(relPath, fileStatus)) {
            if (recordCounter % (nChunksOpenAtOnce * numEntriesPerChunk) == 0) {
                // All chunks full. Create new chunk-set.
                closeAll(openChunks);
                chunksFinal.addAll(openChunks);
                openChunks = createChunks(chunkCount, nChunksTotal, nChunksOpenAtOnce);
                chunkCount += openChunks.size();
                // The new set may be smaller than requested; shuffle over what was actually created.
                nChunksOpenAtOnce = openChunks.size();
                recordCounter = 0;
            }
            // Shuffle into open chunks.
            openChunks.get(recordCounter % nChunksOpenAtOnce).write(relPath, fileStatus);
            ++recordCounter;
        }
    } finally {
        // Runs on both the normal and the exceptional path: the last open
        // chunk-set is closed and recorded, then the listing reader is released.
        closeAll(openChunks);
        chunksFinal.addAll(openChunks);
        IOUtils.closeStream(reader);
    }
    LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
    return chunksFinal;
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) SequenceFile(org.apache.hadoop.io.SequenceFile) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text)

Example 2 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

the class DistCpUtils method toCopyListingFileStatus.

/**
 * Builds a CopyListingFileStatus from a FileStatus, optionally attaching the
 * file's ACL entries and extended attributes.
 *
 * <p>ACL entries are looked up only when {@code preserveAcls} is set and the
 * file's permission reports its ACL bit. XAttrs are looked up when either
 * XAttr flag is set: with both flags every XAttr is kept; with only one, the
 * XAttrs are filtered on whether their name starts with the lower-cased RAW
 * namespace name.
 *
 * @param fileSystem FileSystem containing the file
 * @param fileStatus FileStatus of file
 * @param preserveAcls boolean true if preserving ACLs
 * @param preserveXAttrs boolean true if preserving XAttrs
 * @param preserveRawXAttrs boolean true if preserving raw.* XAttrs
 * @return the populated CopyListingFileStatus
 * @throws IOException if there is an I/O error
 */
public static CopyListingFileStatus toCopyListingFileStatus(FileSystem fileSystem, FileStatus fileStatus, boolean preserveAcls, boolean preserveXAttrs, boolean preserveRawXAttrs) throws IOException {
    final CopyListingFileStatus result = new CopyListingFileStatus(fileStatus);

    // ACLs: only query the file system when the permission advertises them.
    if (preserveAcls && fileStatus.getPermission().getAclBit()) {
        result.setAclEntries(fileSystem.getAclStatus(fileStatus.getPath()).getEntries());
    }

    // Neither XAttr flag set: nothing more to collect.
    if (!preserveXAttrs && !preserveRawXAttrs) {
        return result;
    }

    final Map<String, byte[]> allXAttrs = fileSystem.getXAttrs(fileStatus.getPath());

    // Both flags set: keep everything as returned by the file system.
    if (preserveXAttrs && preserveRawXAttrs) {
        result.setXAttrs(allXAttrs);
        return result;
    }

    // Exactly one flag set: keep either the raw XAttrs or the non-raw ones.
    final Map<String, byte[]> keptXAttrs = Maps.newHashMap();
    final String rawPrefix = StringUtils.toLowerCase(XAttr.NameSpace.RAW.name());
    for (Map.Entry<String, byte[]> attr : allXAttrs.entrySet()) {
        final String name = attr.getKey();
        final boolean wanted = name.startsWith(rawPrefix) ? preserveRawXAttrs : preserveXAttrs;
        if (wanted) {
            keptXAttrs.put(name, attr.getValue());
        }
    }
    result.setXAttrs(keptXAttrs);
    return result;
}
Also used : CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) AclEntry(org.apache.hadoop.fs.permission.AclEntry) FsPermission(org.apache.hadoop.fs.permission.FsPermission) Map(java.util.Map)

Example 3 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

the class RetriableFileCopyCommand method doExecute.

/**
 * Implementation of RetriableCommand::doExecute().
 * Unpacks the positional argument list and delegates the copy to doCopy().
 *
 * @param arguments Argument-list to the command, in this order: the source
 *                  CopyListingFileStatus, the target Path, the Mapper.Context
 *                  and the EnumSet of FileAttribute values.
 * @return Whatever doCopy() returns (documented upstream as the number of
 *         bytes copied).
 * @throws Exception Anything thrown while casting the arguments or by doCopy().
 */
@SuppressWarnings("unchecked")
@Override
protected Object doExecute(Object... arguments) throws Exception {
    // Sanity checks below are plain Java asserts, so they only fire with -ea.
    assert arguments.length == 4 : "Unexpected argument list.";
    final CopyListingFileStatus sourceStatus = (CopyListingFileStatus) arguments[0];
    assert !sourceStatus.isDirectory() : "Unexpected file-status. Expected file.";
    // Remaining arguments are cast in positional order as they are handed to doCopy().
    return doCopy(
        sourceStatus,
        (Path) arguments[1],
        (Mapper.Context) arguments[2],
        (EnumSet<FileAttribute>) arguments[3]);
}
Also used : Path(org.apache.hadoop.fs.Path) Mapper(org.apache.hadoop.mapreduce.Mapper) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) EnumSet(java.util.EnumSet) FileAttribute(org.apache.hadoop.tools.DistCpOptions.FileAttribute)

Example 4 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

the class TestDistCpUtils method testPreserveDefaults.

/**
 * Verifies that DistCpUtils.preserve(), given the default preserve-status
 * attribute set, makes the destination's permission, owner, group, access
 * time, modification time and replication match the source's.
 */
@Test
public void testPreserveDefaults() throws IOException {
    FileSystem fs = FileSystem.get(config);
    // preserve replication, block size, user, group, permission,
    // checksum type and timestamps
    // substring(1) drops the first character of the default switch value
    // before the remainder is unpacked into attributes.
    EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(DistCpOptionSwitch.PRESERVE_STATUS_DEFAULT.substring(1));
    Path dst = new Path("/tmp/dest2");
    Path src = new Path("/tmp/src2");
    createFile(fs, src);
    createFile(fs, dst);
    // Give source and destination deliberately different attributes so that
    // a successful preserve() is observable.
    fs.setPermission(src, fullPerm);
    fs.setOwner(src, "somebody", "somebody-group");
    fs.setTimes(src, 0, 0);
    fs.setReplication(src, (short) 1);
    fs.setPermission(dst, noPerm);
    fs.setOwner(dst, "nobody", "nobody-group");
    fs.setTimes(dst, 100, 100);
    fs.setReplication(dst, (short) 2);
    CopyListingFileStatus srcStatus = new CopyListingFileStatus(fs.getFileStatus(src));
    DistCpUtils.preserve(fs, dst, srcStatus, attributes, false);
    CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
    // FileStatus.equals only compares path field, must explicitly compare all fields.
    // assertEquals (expected = source, actual = destination) reports both
    // values on failure, unlike assertTrue(a.equals(b)).
    Assert.assertEquals(srcStatus.getPermission(), dstStatus.getPermission());
    Assert.assertEquals(srcStatus.getOwner(), dstStatus.getOwner());
    Assert.assertEquals(srcStatus.getGroup(), dstStatus.getGroup());
    Assert.assertEquals(srcStatus.getAccessTime(), dstStatus.getAccessTime());
    Assert.assertEquals(srcStatus.getModificationTime(), dstStatus.getModificationTime());
    Assert.assertEquals(srcStatus.getReplication(), dstStatus.getReplication());
}
Also used : Path(org.apache.hadoop.fs.Path) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) FileAttribute(org.apache.hadoop.tools.DistCpOptions.FileAttribute) Test(org.junit.Test)

Example 5 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

the class TestDistCpUtils method testPreserveOnDirectoryDownwardRecursion.

/**
 * Applies the attributes of a source file to the root directory via
 * DistCpUtils.preserve() and checks that only the root picks them up:
 * the directories and files beneath it must keep their own attributes.
 */
@Test
public void testPreserveOnDirectoryDownwardRecursion() throws IOException {
    final FileSystem fs = FileSystem.get(config);
    final EnumSet<FileAttribute> attributes = EnumSet.allOf(FileAttribute.class);
    // Remove ACL because tests run with dfs.namenode.acls.enabled false
    attributes.remove(FileAttribute.ACL);

    final Path src = new Path("/tmp/src2");
    final Path f0 = new Path("/f0");
    final Path f1 = new Path("/d1/f1");
    final Path f2 = new Path("/d1/d2/f2");
    final Path d1 = new Path("/d1/");
    final Path d2 = new Path("/d1/d2/");
    final Path root = new Path("/");

    for (Path file : new Path[] {src, f0, f1, f2}) {
        createFile(fs, file);
    }

    // Source attributes that preserve() is expected to copy onto the target.
    fs.setPermission(src, almostFullPerm);
    fs.setOwner(src, "somebody", "somebody-group");
    fs.setTimes(src, 0, 0);
    fs.setReplication(src, (short) 1);

    // Every other path starts with attributes that differ from the source's;
    // only the timestamps vary between them.
    final Path[] others = {root, d1, d2, f0, f1, f2};
    final long[] otherTimes = {400, 400, 300, 200, 200, 200};
    for (int i = 0; i < others.length; i++) {
        fs.setPermission(others[i], fullPerm);
        fs.setOwner(others[i], "anybody", "anybody-group");
        fs.setTimes(others[i], otherTimes[i], otherTimes[i]);
        fs.setReplication(others[i], (short) 3);
    }

    final CopyListingFileStatus srcStatus = new CopyListingFileStatus(fs.getFileStatus(src));
    DistCpUtils.preserve(fs, root, srcStatus, attributes, false);
    cluster.triggerHeartbeats();

    // FileStatus.equals only compares path field, must explicitly compare all fields
    // attributes of src -> root ? should be yes
    // NOTE(review): replication is still expected to differ for root (and for
    // d1/d2 below) -- presumably because these are directories; confirm.
    final CopyListingFileStatus rootStatus = new CopyListingFileStatus(fs.getFileStatus(root));
    Assert.assertTrue(srcStatus.getPermission().equals(rootStatus.getPermission()));
    Assert.assertTrue(srcStatus.getOwner().equals(rootStatus.getOwner()));
    Assert.assertTrue(srcStatus.getGroup().equals(rootStatus.getGroup()));
    Assert.assertTrue(srcStatus.getAccessTime() == rootStatus.getAccessTime());
    Assert.assertTrue(srcStatus.getModificationTime() == rootStatus.getModificationTime());
    Assert.assertTrue(srcStatus.getReplication() != rootStatus.getReplication());

    // attributes of src -> d1, d2 ? should be no
    for (Path dir : new Path[] {d1, d2}) {
        final CopyListingFileStatus dirStatus = new CopyListingFileStatus(fs.getFileStatus(dir));
        Assert.assertFalse(srcStatus.getPermission().equals(dirStatus.getPermission()));
        Assert.assertFalse(srcStatus.getOwner().equals(dirStatus.getOwner()));
        Assert.assertFalse(srcStatus.getGroup().equals(dirStatus.getGroup()));
        Assert.assertFalse(srcStatus.getAccessTime() == dirStatus.getAccessTime());
        Assert.assertFalse(srcStatus.getModificationTime() == dirStatus.getModificationTime());
        Assert.assertTrue(srcStatus.getReplication() != dirStatus.getReplication());
    }

    // attributes of src -> f0, f1, f2 ? should be no
    for (Path file : new Path[] {f0, f1, f2}) {
        final CopyListingFileStatus fileStatus = new CopyListingFileStatus(fs.getFileStatus(file));
        Assert.assertFalse(srcStatus.getPermission().equals(fileStatus.getPermission()));
        Assert.assertFalse(srcStatus.getOwner().equals(fileStatus.getOwner()));
        Assert.assertFalse(srcStatus.getGroup().equals(fileStatus.getGroup()));
        Assert.assertFalse(srcStatus.getAccessTime() == fileStatus.getAccessTime());
        Assert.assertFalse(srcStatus.getModificationTime() == fileStatus.getModificationTime());
        Assert.assertFalse(srcStatus.getReplication() == fileStatus.getReplication());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) FileAttribute(org.apache.hadoop.tools.DistCpOptions.FileAttribute) Test(org.junit.Test)

Aggregations

CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus): 44
Path (org.apache.hadoop.fs.Path): 41
FileSystem (org.apache.hadoop.fs.FileSystem): 36
Test (org.junit.Test): 29
Text (org.apache.hadoop.io.Text): 23
FileAttribute (org.apache.hadoop.tools.DistCpOptions.FileAttribute): 20
StubContext (org.apache.hadoop.tools.StubContext): 17
IOException (java.io.IOException): 16
Mapper (org.apache.hadoop.mapreduce.Mapper): 16
AccessControlException (org.apache.hadoop.security.AccessControlException): 13
Configuration (org.apache.hadoop.conf.Configuration): 11
DistCpOptions (org.apache.hadoop.tools.DistCpOptions): 9
FileStatus (org.apache.hadoop.fs.FileStatus): 8
FsPermission (org.apache.hadoop.fs.permission.FsPermission): 6
SequenceFile (org.apache.hadoop.io.SequenceFile): 5
UserGroupInformation (org.apache.hadoop.security.UserGroupInformation): 5
ArrayList (java.util.ArrayList): 3
OutputStream (java.io.OutputStream): 2
AclEntry (org.apache.hadoop.fs.permission.AclEntry): 2
FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 2