Use of org.apache.hadoop.tools.CopyListingFileStatus in the Apache Hadoop project: class DynamicInputFormat, method splitCopyListingIntoChunksWithShuffle.
private List<DynamicInputChunk> splitCopyListingIntoChunksWithShuffle(JobContext context) throws IOException {
  final Configuration configuration = context.getConfiguration();
  int numRecords = getNumberOfRecords(configuration);
  int numMaps = getNumMapTasks(configuration);
  int maxChunksTolerable = getMaxChunksTolerable(configuration);
  // Number of chunks each map will process, on average.
  int splitRatio = getListingSplitRatio(configuration, numMaps, numRecords);
  validateNumChunksUsing(splitRatio, numMaps, maxChunksTolerable);
  int numEntriesPerChunk = (int) Math.ceil((float) numRecords / (splitRatio * numMaps));
  DistCpUtils.publish(context.getConfiguration(), CONF_LABEL_NUM_ENTRIES_PER_CHUNK, numEntriesPerChunk);
  final int nChunksTotal = (int) Math.ceil((float) numRecords / numEntriesPerChunk);
  int nChunksOpenAtOnce = Math.min(N_CHUNKS_OPEN_AT_ONCE_DEFAULT, nChunksTotal);
  Path listingPath = getListingFilePath(configuration);
  SequenceFile.Reader reader = new SequenceFile.Reader(configuration, SequenceFile.Reader.file(listingPath));
  List<DynamicInputChunk> openChunks = new ArrayList<DynamicInputChunk>();
  List<DynamicInputChunk> chunksFinal = new ArrayList<DynamicInputChunk>();
  CopyListingFileStatus fileStatus = new CopyListingFileStatus();
  Text relPath = new Text();
  int recordCounter = 0;
  int chunkCount = 0;
  try {
    while (reader.next(relPath, fileStatus)) {
      if (recordCounter % (nChunksOpenAtOnce * numEntriesPerChunk) == 0) {
        // All chunks full. Create new chunk-set.
        closeAll(openChunks);
        chunksFinal.addAll(openChunks);
        openChunks = createChunks(chunkCount, nChunksTotal, nChunksOpenAtOnce);
        chunkCount += openChunks.size();
        nChunksOpenAtOnce = openChunks.size();
        recordCounter = 0;
      }
      // Shuffle into open chunks.
      openChunks.get(recordCounter % nChunksOpenAtOnce).write(relPath, fileStatus);
      ++recordCounter;
    }
  } finally {
    closeAll(openChunks);
    chunksFinal.addAll(openChunks);
    IOUtils.closeStream(reader);
  }
  LOG.info("Number of dynamic-chunk-files created: " + chunksFinal.size());
  return chunksFinal;
}
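The arithmetic above works out to roughly splitRatio chunks per map task: each chunk holds ceil(numRecords / (splitRatio * numMaps)) listing entries. A minimal, standalone sketch of that calculation, using hypothetical numbers rather than values taken from any real job:

// Chunk-sizing sketch with hypothetical inputs: 10,000 listing records,
// 20 map tasks, and a split ratio of 2 chunks per map.
public class ChunkMathSketch {
  public static void main(String[] args) {
    int numRecords = 10000;
    int numMaps = 20;
    int splitRatio = 2;
    int numEntriesPerChunk = (int) Math.ceil((float) numRecords / (splitRatio * numMaps));
    int nChunksTotal = (int) Math.ceil((float) numRecords / numEntriesPerChunk);
    // Prints entriesPerChunk=250, totalChunks=40, i.e. two chunks per map on average.
    System.out.println("entriesPerChunk=" + numEntriesPerChunk + ", totalChunks=" + nChunksTotal);
  }
}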
Use of org.apache.hadoop.tools.CopyListingFileStatus in the Apache Hadoop project: class DistCpUtils, method toCopyListingFileStatus.
/**
 * Converts a FileStatus to a CopyListingFileStatus. If preserving ACLs,
 * populates the CopyListingFileStatus with the ACLs. If preserving XAttrs,
 * populates the CopyListingFileStatus with the XAttrs.
 *
 * @param fileSystem FileSystem containing the file
 * @param fileStatus FileStatus of file
 * @param preserveAcls boolean true if preserving ACLs
 * @param preserveXAttrs boolean true if preserving XAttrs
 * @param preserveRawXAttrs boolean true if preserving raw.* XAttrs
 * @return CopyListingFileStatus populated according to the preserve flags
 * @throws IOException if there is an I/O error
 */
public static CopyListingFileStatus toCopyListingFileStatus(FileSystem fileSystem, FileStatus fileStatus, boolean preserveAcls, boolean preserveXAttrs, boolean preserveRawXAttrs) throws IOException {
  CopyListingFileStatus copyListingFileStatus = new CopyListingFileStatus(fileStatus);
  if (preserveAcls) {
    FsPermission perm = fileStatus.getPermission();
    if (perm.getAclBit()) {
      List<AclEntry> aclEntries = fileSystem.getAclStatus(fileStatus.getPath()).getEntries();
      copyListingFileStatus.setAclEntries(aclEntries);
    }
  }
  if (preserveXAttrs || preserveRawXAttrs) {
    Map<String, byte[]> srcXAttrs = fileSystem.getXAttrs(fileStatus.getPath());
    if (preserveXAttrs && preserveRawXAttrs) {
      copyListingFileStatus.setXAttrs(srcXAttrs);
    } else {
      Map<String, byte[]> trgXAttrs = Maps.newHashMap();
      final String rawNS = StringUtils.toLowerCase(XAttr.NameSpace.RAW.name());
      for (Map.Entry<String, byte[]> ent : srcXAttrs.entrySet()) {
        final String xattrName = ent.getKey();
        if (xattrName.startsWith(rawNS)) {
          if (preserveRawXAttrs) {
            trgXAttrs.put(xattrName, ent.getValue());
          }
        } else if (preserveXAttrs) {
          trgXAttrs.put(xattrName, ent.getValue());
        }
      }
      copyListingFileStatus.setXAttrs(trgXAttrs);
    }
  }
  return copyListingFileStatus;
}
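A short usage sketch of the conversion above. The path is hypothetical and the flag values are arbitrary; the only assumption is that the file exists on the target FileSystem:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.CopyListingFileStatus;
import org.apache.hadoop.tools.util.DistCpUtils;

public class ToCopyListingFileStatusSketch {
  public static void main(String[] args) throws IOException {
    Configuration conf = new Configuration();
    FileSystem fs = FileSystem.get(conf);
    // Hypothetical path; any existing file would do.
    FileStatus stat = fs.getFileStatus(new Path("/tmp/source-file"));
    // Carry ACLs but skip XAttrs; the flags mirror whichever -p options were requested.
    CopyListingFileStatus cls = DistCpUtils.toCopyListingFileStatus(
        fs, stat, true /* preserveAcls */, false /* preserveXAttrs */, false /* preserveRawXAttrs */);
    System.out.println(cls);
  }
}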
Use of org.apache.hadoop.tools.CopyListingFileStatus in the Apache Hadoop project: class RetriableFileCopyCommand, method doExecute.
/**
 * Implementation of RetriableCommand::doExecute().
 * This is the actual copy-implementation.
 * @param arguments Argument-list to the command.
 * @return Number of bytes copied.
 * @throws Exception
 */
@SuppressWarnings("unchecked")
@Override
protected Object doExecute(Object... arguments) throws Exception {
  assert arguments.length == 4 : "Unexpected argument list.";
  CopyListingFileStatus source = (CopyListingFileStatus) arguments[0];
  assert !source.isDirectory() : "Unexpected file-status. Expected file.";
  Path target = (Path) arguments[1];
  Mapper.Context context = (Mapper.Context) arguments[2];
  EnumSet<FileAttribute> fileAttributes = (EnumSet<FileAttribute>) arguments[3];
  return doCopy(source, target, context, fileAttributes);
}
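For context, the four positional arguments are supplied by the caller through RetriableCommand.execute(Object...). A minimal sketch of such a call; the helper method and its parameter names are hypothetical, the command is passed in rather than constructed here because constructor signatures differ across Hadoop releases, and the cast of the return value assumes doCopy() reports the byte count as a boxed Long:

// Hypothetical helper: shows how the argument list consumed by doExecute() lines up
// with a RetriableCommand.execute(...) call. Not part of DistCp itself.
private long copyOneFile(RetriableFileCopyCommand copyCommand,
                         CopyListingFileStatus source,
                         Path target,
                         Mapper.Context context,
                         EnumSet<FileAttribute> fileAttributes) throws Exception {
  // arguments[0..3] match the casts performed inside doExecute() above.
  return (Long) copyCommand.execute(source, target, context, fileAttributes);
}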
Use of org.apache.hadoop.tools.CopyListingFileStatus in the Apache Hadoop project: class TestDistCpUtils, method testPreserveDefaults.
@Test
public void testPreserveDefaults() throws IOException {
  FileSystem fs = FileSystem.get(config);
  // preserve replication, block size, user, group, permission,
  // checksum type and timestamps
  EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(DistCpOptionSwitch.PRESERVE_STATUS_DEFAULT.substring(1));
  Path dst = new Path("/tmp/dest2");
  Path src = new Path("/tmp/src2");
  createFile(fs, src);
  createFile(fs, dst);
  fs.setPermission(src, fullPerm);
  fs.setOwner(src, "somebody", "somebody-group");
  fs.setTimes(src, 0, 0);
  fs.setReplication(src, (short) 1);
  fs.setPermission(dst, noPerm);
  fs.setOwner(dst, "nobody", "nobody-group");
  fs.setTimes(dst, 100, 100);
  fs.setReplication(dst, (short) 2);
  CopyListingFileStatus srcStatus = new CopyListingFileStatus(fs.getFileStatus(src));
  DistCpUtils.preserve(fs, dst, srcStatus, attributes, false);
  CopyListingFileStatus dstStatus = new CopyListingFileStatus(fs.getFileStatus(dst));
  // FileStatus.equals only compares path field, must explicitly compare all fields
  Assert.assertTrue(srcStatus.getPermission().equals(dstStatus.getPermission()));
  Assert.assertTrue(srcStatus.getOwner().equals(dstStatus.getOwner()));
  Assert.assertTrue(srcStatus.getGroup().equals(dstStatus.getGroup()));
  Assert.assertTrue(srcStatus.getAccessTime() == dstStatus.getAccessTime());
  Assert.assertTrue(srcStatus.getModificationTime() == dstStatus.getModificationTime());
  Assert.assertTrue(srcStatus.getReplication() == dstStatus.getReplication());
}
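The attribute set here comes from DistCpOptionSwitch.PRESERVE_STATUS_DEFAULT, which unpackAttributes parses character by character, each letter selecting the FileAttribute whose name starts with it. A small sketch of that mapping; the string "ugpt" is purely illustrative and is not the actual value of the constant:

import java.util.EnumSet;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import org.apache.hadoop.tools.util.DistCpUtils;

public class UnpackAttributesSketch {
  public static void main(String[] args) {
    // Illustrative string: u -> USER, g -> GROUP, p -> PERMISSION, t -> TIMES.
    EnumSet<FileAttribute> attrs = DistCpUtils.unpackAttributes("ugpt");
    System.out.println(attrs);  // expected: [USER, GROUP, PERMISSION, TIMES]
  }
}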
Use of org.apache.hadoop.tools.CopyListingFileStatus in the Apache Hadoop project: class TestDistCpUtils, method testPreserveOnDirectoryDownwardRecursion.
@Test
public void testPreserveOnDirectoryDownwardRecursion() throws IOException {
  FileSystem fs = FileSystem.get(config);
  EnumSet<FileAttribute> attributes = EnumSet.allOf(FileAttribute.class);
  // Remove ACL because tests run with dfs.namenode.acls.enabled false
  attributes.remove(FileAttribute.ACL);
  Path src = new Path("/tmp/src2");
  Path f0 = new Path("/f0");
  Path f1 = new Path("/d1/f1");
  Path f2 = new Path("/d1/d2/f2");
  Path d1 = new Path("/d1/");
  Path d2 = new Path("/d1/d2/");
  Path root = new Path("/");
  createFile(fs, src);
  createFile(fs, f0);
  createFile(fs, f1);
  createFile(fs, f2);
  fs.setPermission(src, almostFullPerm);
  fs.setOwner(src, "somebody", "somebody-group");
  fs.setTimes(src, 0, 0);
  fs.setReplication(src, (short) 1);
  fs.setPermission(root, fullPerm);
  fs.setOwner(root, "anybody", "anybody-group");
  fs.setTimes(root, 400, 400);
  fs.setReplication(root, (short) 3);
  fs.setPermission(d1, fullPerm);
  fs.setOwner(d1, "anybody", "anybody-group");
  fs.setTimes(d1, 400, 400);
  fs.setReplication(d1, (short) 3);
  fs.setPermission(d2, fullPerm);
  fs.setOwner(d2, "anybody", "anybody-group");
  fs.setTimes(d2, 300, 300);
  fs.setReplication(d2, (short) 3);
  fs.setPermission(f0, fullPerm);
  fs.setOwner(f0, "anybody", "anybody-group");
  fs.setTimes(f0, 200, 200);
  fs.setReplication(f0, (short) 3);
  fs.setPermission(f1, fullPerm);
  fs.setOwner(f1, "anybody", "anybody-group");
  fs.setTimes(f1, 200, 200);
  fs.setReplication(f1, (short) 3);
  fs.setPermission(f2, fullPerm);
  fs.setOwner(f2, "anybody", "anybody-group");
  fs.setTimes(f2, 200, 200);
  fs.setReplication(f2, (short) 3);
  CopyListingFileStatus srcStatus = new CopyListingFileStatus(fs.getFileStatus(src));
  DistCpUtils.preserve(fs, root, srcStatus, attributes, false);
  cluster.triggerHeartbeats();
  // FileStatus.equals only compares path field, must explicitly compare all fields
  // attributes of src -> root ? should be yes
  CopyListingFileStatus rootStatus = new CopyListingFileStatus(fs.getFileStatus(root));
  Assert.assertTrue(srcStatus.getPermission().equals(rootStatus.getPermission()));
  Assert.assertTrue(srcStatus.getOwner().equals(rootStatus.getOwner()));
  Assert.assertTrue(srcStatus.getGroup().equals(rootStatus.getGroup()));
  Assert.assertTrue(srcStatus.getAccessTime() == rootStatus.getAccessTime());
  Assert.assertTrue(srcStatus.getModificationTime() == rootStatus.getModificationTime());
  Assert.assertTrue(srcStatus.getReplication() != rootStatus.getReplication());
  // attributes of src -> d1 ? should be no
  CopyListingFileStatus d1Status = new CopyListingFileStatus(fs.getFileStatus(d1));
  Assert.assertFalse(srcStatus.getPermission().equals(d1Status.getPermission()));
  Assert.assertFalse(srcStatus.getOwner().equals(d1Status.getOwner()));
  Assert.assertFalse(srcStatus.getGroup().equals(d1Status.getGroup()));
  Assert.assertFalse(srcStatus.getAccessTime() == d1Status.getAccessTime());
  Assert.assertFalse(srcStatus.getModificationTime() == d1Status.getModificationTime());
  Assert.assertTrue(srcStatus.getReplication() != d1Status.getReplication());
  // attributes of src -> d2 ? should be no
  CopyListingFileStatus d2Status = new CopyListingFileStatus(fs.getFileStatus(d2));
  Assert.assertFalse(srcStatus.getPermission().equals(d2Status.getPermission()));
  Assert.assertFalse(srcStatus.getOwner().equals(d2Status.getOwner()));
  Assert.assertFalse(srcStatus.getGroup().equals(d2Status.getGroup()));
  Assert.assertFalse(srcStatus.getAccessTime() == d2Status.getAccessTime());
  Assert.assertFalse(srcStatus.getModificationTime() == d2Status.getModificationTime());
  Assert.assertTrue(srcStatus.getReplication() != d2Status.getReplication());
  // attributes of src -> f0 ? should be no
  CopyListingFileStatus f0Status = new CopyListingFileStatus(fs.getFileStatus(f0));
  Assert.assertFalse(srcStatus.getPermission().equals(f0Status.getPermission()));
  Assert.assertFalse(srcStatus.getOwner().equals(f0Status.getOwner()));
  Assert.assertFalse(srcStatus.getGroup().equals(f0Status.getGroup()));
  Assert.assertFalse(srcStatus.getAccessTime() == f0Status.getAccessTime());
  Assert.assertFalse(srcStatus.getModificationTime() == f0Status.getModificationTime());
  Assert.assertFalse(srcStatus.getReplication() == f0Status.getReplication());
  // attributes of src -> f1 ? should be no
  CopyListingFileStatus f1Status = new CopyListingFileStatus(fs.getFileStatus(f1));
  Assert.assertFalse(srcStatus.getPermission().equals(f1Status.getPermission()));
  Assert.assertFalse(srcStatus.getOwner().equals(f1Status.getOwner()));
  Assert.assertFalse(srcStatus.getGroup().equals(f1Status.getGroup()));
  Assert.assertFalse(srcStatus.getAccessTime() == f1Status.getAccessTime());
  Assert.assertFalse(srcStatus.getModificationTime() == f1Status.getModificationTime());
  Assert.assertFalse(srcStatus.getReplication() == f1Status.getReplication());
  // attributes of src -> f2 ? should be no
  CopyListingFileStatus f2Status = new CopyListingFileStatus(fs.getFileStatus(f2));
  Assert.assertFalse(srcStatus.getPermission().equals(f2Status.getPermission()));
  Assert.assertFalse(srcStatus.getOwner().equals(f2Status.getOwner()));
  Assert.assertFalse(srcStatus.getGroup().equals(f2Status.getGroup()));
  Assert.assertFalse(srcStatus.getAccessTime() == f2Status.getAccessTime());
  Assert.assertFalse(srcStatus.getModificationTime() == f2Status.getModificationTime());
  Assert.assertFalse(srcStatus.getReplication() == f2Status.getReplication());
}
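The assertions above confirm that DistCpUtils.preserve applies attributes only to the path it is given; it does not walk into child directories or files. If downward propagation were wanted, the caller would have to recurse explicitly. A minimal sketch of that idea, written as a hypothetical helper and assuming a plain listStatus walk is acceptable for the directory sizes involved:

import java.io.IOException;
import java.util.EnumSet;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.tools.CopyListingFileStatus;
import org.apache.hadoop.tools.DistCpOptions.FileAttribute;
import org.apache.hadoop.tools.util.DistCpUtils;

// Hypothetical helper, not part of DistCp: applies the source attributes to 'dir'
// and to every path beneath it, in contrast to the single-path behaviour above.
public class RecursivePreserveSketch {
  public static void preserveRecursively(FileSystem fs, Path dir,
      CopyListingFileStatus srcStatus, EnumSet<FileAttribute> attributes) throws IOException {
    DistCpUtils.preserve(fs, dir, srcStatus, attributes, false);
    for (FileStatus child : fs.listStatus(dir)) {
      if (child.isDirectory()) {
        preserveRecursively(fs, child.getPath(), srcStatus, attributes);
      } else {
        DistCpUtils.preserve(fs, child.getPath(), srcStatus, attributes, false);
      }
    }
  }
}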