Search in sources :

Example 21 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

The class CopyCommitter, method preserveFileAttributesForDirectories.

// This method changes the target-directories' file-attributes (owner,
// user/group permissions, etc.) based on the corresponding source directories.
// File attributes are applied to regular files at copy time in the map-task;
// only directories need fixing up here, after the copy completes.
private void preserveFileAttributesForDirectories(Configuration conf) throws IOException {
    String attrSymbols = conf.get(DistCpConstants.CONF_LABEL_PRESERVE_STATUS);
    final boolean syncOrOverwrite = syncFolder || overwrite;
    LOG.info("About to preserve attributes: " + attrSymbols);
    EnumSet<FileAttribute> attributes = DistCpUtils.unpackAttributes(attrSymbols);
    final boolean preserveRawXattrs = conf.getBoolean(DistCpConstants.CONF_LABEL_PRESERVE_RAWXATTRS, false);
    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
    FileSystem clusterFS = sourceListing.getFileSystem(conf);
    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sourceListing));
    // Listing-file length drives the progress percentage reported below.
    long totalLen = clusterFS.getFileStatus(sourceListing).getLen();
    Path targetRoot = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_WORK_PATH));
    long preservedEntries = 0;
    try {
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        // Iterate over every source path that was copied.
        while (sourceReader.next(srcRelPath, srcFileStatus)) {
            // Files already had their attributes preserved in the map-task;
            // skip everything that is not a directory.
            if (!srcFileStatus.isDirectory())
                continue;
            Path targetFile = new Path(targetRoot.toString() + "/" + srcRelPath);
            // Skip the target root itself when syncing/overwriting —
            // presumably its attributes must not be clobbered by the listing
            // entry in that mode (TODO confirm against DistCp semantics).
            if (targetRoot.equals(targetFile) && syncOrOverwrite)
                continue;
            FileSystem targetFS = targetFile.getFileSystem(conf);
            DistCpUtils.preserve(targetFS, targetFile, srcFileStatus, attributes, preserveRawXattrs);
            // BUG FIX: count each preserved directory; previously this
            // counter was never incremented, so the final log always said 0.
            preservedEntries++;
            taskAttemptContext.progress();
            taskAttemptContext.setStatus("Preserving status on directory entries. [" + sourceReader.getPosition() * 100 / totalLen + "%]");
        }
    } finally {
        IOUtils.closeStream(sourceReader);
    }
    LOG.info("Preserved status on " + preservedEntries + " dir entries on target");
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) Text(org.apache.hadoop.io.Text) FileAttribute(org.apache.hadoop.tools.DistCpOptions.FileAttribute)

Example 22 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

The class CopyCommitter, method deleteMissing.

// This method deletes "extra" files from the target, if they're not
// available at the source.
//
// Implementation: sort both the source and target listings, then walk
// them in lock-step (a sorted-merge scan). Any target entry with no
// matching source entry is deleted recursively.
private void deleteMissing(Configuration conf) throws IOException {
    LOG.info("-delete option is enabled. About to remove entries from " + "target that are missing in source");
    // Sort the source-file listing alphabetically.
    Path sourceListing = new Path(conf.get(DistCpConstants.CONF_LABEL_LISTING_FILE_PATH));
    FileSystem clusterFS = sourceListing.getFileSystem(conf);
    Path sortedSourceListing = DistCpUtils.sortListing(clusterFS, conf, sourceListing);
    // Similarly, create the listing of target-files. Sort alphabetically.
    Path targetListing = new Path(sourceListing.getParent(), "targetListing.seq");
    CopyListing target = new GlobbedCopyListing(new Configuration(conf), null);
    List<Path> targets = new ArrayList<Path>(1);
    Path targetFinalPath = new Path(conf.get(DistCpConstants.CONF_LABEL_TARGET_FINAL_PATH));
    targets.add(targetFinalPath);
    // Pick the raw-namespace "none" path when the target lives under the
    // HDFS reserved-raw directory, so the listing build treats it consistently.
    Path resultNonePath = Path.getPathWithoutSchemeAndAuthority(targetFinalPath).toString().startsWith(DistCpConstants.HDFS_RESERVED_RAW_DIRECTORY_NAME) ? DistCpConstants.RAW_NONE_PATH : DistCpConstants.NONE_PATH;
    DistCpOptions options = new DistCpOptions(targets, resultNonePath);
    //
    // Set up options to be the same from the CopyListing.buildListing's perspective,
    // so to collect similar listings as when doing the copy
    //
    options.setOverwrite(overwrite);
    options.setSyncFolder(syncFolder);
    options.setTargetPathExists(targetPathExists);
    target.buildListing(targetListing, options);
    Path sortedTargetListing = DistCpUtils.sortListing(clusterFS, conf, targetListing);
    // Target-listing length drives the progress percentage reported below.
    long totalLen = clusterFS.getFileStatus(sortedTargetListing).getLen();
    SequenceFile.Reader sourceReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedSourceListing));
    SequenceFile.Reader targetReader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(sortedTargetListing));
    // Walk both source and target file listings.
    // Delete all from target that doesn't also exist on source.
    long deletedEntries = 0;
    try {
        CopyListingFileStatus srcFileStatus = new CopyListingFileStatus();
        Text srcRelPath = new Text();
        CopyListingFileStatus trgtFileStatus = new CopyListingFileStatus();
        Text trgtRelPath = new Text();
        FileSystem targetFS = targetFinalPath.getFileSystem(conf);
        boolean srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
        while (targetReader.next(trgtRelPath, trgtFileStatus)) {
            // Advance the source cursor past entries that sort before the
            // current target entry (i.e. sources with no target counterpart).
            while (srcAvailable && trgtRelPath.compareTo(srcRelPath) > 0) {
                srcAvailable = sourceReader.next(srcRelPath, srcFileStatus);
            }
            // A matching source entry exists: keep this target entry.
            if (srcAvailable && trgtRelPath.equals(srcRelPath))
                continue;
            // Target doesn't exist at source. Delete.
            // Treat "delete returned false but the path is gone" as success —
            // e.g. when an ancestor directory was already deleted recursively.
            boolean result = targetFS.delete(trgtFileStatus.getPath(), true) || !targetFS.exists(trgtFileStatus.getPath());
            if (result) {
                LOG.info("Deleted " + trgtFileStatus.getPath() + " - Missing at source");
                deletedEntries++;
            } else {
                throw new IOException("Unable to delete " + trgtFileStatus.getPath());
            }
            taskAttemptContext.progress();
            taskAttemptContext.setStatus("Deleting missing files from target. [" + targetReader.getPosition() * 100 / totalLen + "%]");
        }
    } finally {
        IOUtils.closeStream(sourceReader);
        IOUtils.closeStream(targetReader);
    }
    LOG.info("Deleted " + deletedEntries + " from target: " + targets.get(0));
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) CopyListing(org.apache.hadoop.tools.CopyListing) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing) DistCpOptions(org.apache.hadoop.tools.DistCpOptions) SequenceFile(org.apache.hadoop.io.SequenceFile) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) GlobbedCopyListing(org.apache.hadoop.tools.GlobbedCopyListing)

Example 23 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

The class CopyMapper, method map.

/**
   * Implementation of the Mapper::map(). Does the copy.
   * @param relPath Relative path of the entry from the copy listing;
   *                appended to the qualified target work path to form the
   *                full target path.
   * @param sourceFileStatus Copy-listing status of the source file/directory
   *                         being copied.
   * @throws IOException on copy failure that cannot be handled
   * @throws InterruptedException if the task is interrupted
   */
@Override
public void map(Text relPath, CopyListingFileStatus sourceFileStatus, Context context) throws IOException, InterruptedException {
    Path sourcePath = sourceFileStatus.getPath();
    if (LOG.isDebugEnabled())
        LOG.debug("DistCpMapper::map(): Received " + sourcePath + ", " + relPath);
    // Target = qualified target work path + relative path from the listing.
    Path target = new Path(targetWorkPath.makeQualified(targetFS.getUri(), targetFS.getWorkingDirectory()) + relPath.toString());
    EnumSet<DistCpOptions.FileAttribute> fileAttributes = getFileAttributeSettings(context);
    final boolean preserveRawXattrs = context.getConfiguration().getBoolean(DistCpConstants.CONF_LABEL_PRESERVE_RAWXATTRS, false);
    final String description = "Copying " + sourcePath + " to " + target;
    context.setStatus(description);
    LOG.info(description);
    try {
        CopyListingFileStatus sourceCurrStatus;
        FileSystem sourceFS;
        try {
            // Re-fetch the source status: the listing entry may be stale by
            // the time this map task runs.
            sourceFS = sourcePath.getFileSystem(conf);
            final boolean preserveXAttrs = fileAttributes.contains(FileAttribute.XATTR);
            sourceCurrStatus = DistCpUtils.toCopyListingFileStatus(sourceFS, sourceFS.getFileStatus(sourcePath), fileAttributes.contains(FileAttribute.ACL), preserveXAttrs, preserveRawXattrs);
        } catch (FileNotFoundException e) {
            // Source vanished since listing time; surface as a retriable
            // copy-read failure.
            throw new IOException(new RetriableFileCopyCommand.CopyReadException(e));
        }
        FileStatus targetStatus = null;
        try {
            targetStatus = targetFS.getFileStatus(target);
        } catch (FileNotFoundException ignore) {
            // Target not existing yet is the normal case for a fresh copy.
            if (LOG.isDebugEnabled())
                LOG.debug("Path could not be found: " + target, ignore);
        }
        // Refuse to replace a file with a directory or vice versa.
        if (targetStatus != null && (targetStatus.isDirectory() != sourceCurrStatus.isDirectory())) {
            throw new IOException("Can't replace " + target + ". Target is " + getFileType(targetStatus) + ", Source is " + getFileType(sourceCurrStatus));
        }
        if (sourceCurrStatus.isDirectory()) {
            createTargetDirsWithRetry(description, target, context);
            return;
        }
        FileAction action = checkUpdate(sourceFS, sourceCurrStatus, target, targetStatus);
        if (action == FileAction.SKIP) {
            LOG.info("Skipping copy of " + sourceCurrStatus.getPath() + " to " + target);
            updateSkipCounters(context, sourceCurrStatus);
            // Emit a SKIP record (null key) so skips appear in the job output.
            context.write(null, new Text("SKIP: " + sourceCurrStatus.getPath()));
        } else {
            copyFileWithRetry(description, sourceCurrStatus, target, context, action, fileAttributes);
        }
        // Preserve requested attributes whether the file was copied or skipped.
        DistCpUtils.preserve(target.getFileSystem(conf), target, sourceCurrStatus, fileAttributes, preserveRawXattrs);
    } catch (IOException exception) {
        handleFailures(exception, sourceFileStatus, target, context);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) FileStatus(org.apache.hadoop.fs.FileStatus) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileNotFoundException(java.io.FileNotFoundException) Text(org.apache.hadoop.io.Text) IOException(java.io.IOException) CopyReadException(org.apache.hadoop.tools.mapred.RetriableFileCopyCommand.CopyReadException) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) FileSystem(org.apache.hadoop.fs.FileSystem) FileAttribute(org.apache.hadoop.tools.DistCpOptions.FileAttribute)

Example 24 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

The class UniformSizeInputFormat, method getSplits.

// Packs the copy-listing entries into numSplits file splits of roughly
// equal total byte size. Split boundaries fall on listing-record
// boundaries (positions reported by the SequenceFile reader).
private List<InputSplit> getSplits(Configuration configuration, int numSplits, long totalSizeBytes) throws IOException {
    List<InputSplit> splits = new ArrayList<InputSplit>(numSplits);
    // Ceiling division: per-split byte budget.
    long bytesPerSplit = (long) Math.ceil(totalSizeBytes * 1.0 / numSplits);
    CopyListingFileStatus fileStatus = new CopyListingFileStatus();
    Text relPath = new Text();
    long bytesInSplit = 0;
    long splitStart = 0;
    long prevPosition = 0;
    final Path listingFilePath = getListingFilePath(configuration);
    if (LOG.isDebugEnabled()) {
        LOG.debug("Average bytes per map: " + bytesPerSplit + ", Number of maps: " + numSplits + ", total size: " + totalSizeBytes);
    }
    SequenceFile.Reader listingReader = null;
    try {
        listingReader = getListingFileReader(configuration);
        while (listingReader.next(relPath, fileStatus)) {
            // Close out the current split before this entry would push it
            // over budget — but never emit an empty split.
            boolean budgetExceeded = bytesInSplit + fileStatus.getLen() > bytesPerSplit;
            if (budgetExceeded && prevPosition != 0) {
                FileSplit split = new FileSplit(listingFilePath, splitStart, prevPosition - splitStart, null);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Creating split : " + split + ", bytes in split: " + bytesInSplit);
                }
                splits.add(split);
                splitStart = prevPosition;
                bytesInSplit = 0;
            }
            bytesInSplit += fileStatus.getLen();
            prevPosition = listingReader.getPosition();
        }
        // Emit the trailing split covering whatever remains of the listing.
        if (prevPosition > splitStart) {
            FileSplit split = new FileSplit(listingFilePath, splitStart, prevPosition - splitStart, null);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Creating split : " + split + ", bytes in split: " + bytesInSplit);
            }
            splits.add(split);
        }
    } finally {
        IOUtils.closeStream(listingReader);
    }
    return splits;
}
Also used : Path(org.apache.hadoop.fs.Path) SequenceFile(org.apache.hadoop.io.SequenceFile) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) InputSplit(org.apache.hadoop.mapreduce.InputSplit)

Example 25 with CopyListingFileStatus

use of org.apache.hadoop.tools.CopyListingFileStatus in project hadoop by apache.

The class DistCpUtils, method preserve.

/**
   * Preserve attribute on file matching that of the file status being sent
   * as argument. Barring the block size, all the other attributes are preserved
   * by this function
   *
   * @param targetFS - File system
   * @param path - Path that needs to preserve original file status
   * @param srcFileStatus - Original file status
   * @param attributes - Attribute set that needs to be preserved
   * @param preserveRawXattrs if true, raw.* xattrs should be preserved
   * @throws IOException - Exception if any (particularly relating to group/owner
   *                       change or any transient error)
   */
public static void preserve(FileSystem targetFS, Path path, CopyListingFileStatus srcFileStatus, EnumSet<FileAttribute> attributes, boolean preserveRawXattrs) throws IOException {
    // If not preserving anything from FileStatus, don't bother fetching it.
    FileStatus targetFileStatus = attributes.isEmpty() ? null : targetFS.getFileStatus(path);
    // Current target owner/group; null when no attributes were requested.
    String group = targetFileStatus == null ? null : targetFileStatus.getGroup();
    String user = targetFileStatus == null ? null : targetFileStatus.getOwner();
    // Owner/group changes are batched into a single setOwner call below.
    boolean chown = false;
    if (attributes.contains(FileAttribute.ACL)) {
        List<AclEntry> srcAcl = srcFileStatus.getAclEntries();
        List<AclEntry> targetAcl = getAcl(targetFS, targetFileStatus);
        // Only issue the RPC when the ACLs actually differ.
        if (!srcAcl.equals(targetAcl)) {
            targetFS.setAcl(path, srcAcl);
        }
        // setAcl doesn't preserve sticky bit, so also call setPermission if needed.
        if (srcFileStatus.getPermission().getStickyBit() != targetFileStatus.getPermission().getStickyBit()) {
            targetFS.setPermission(path, srcFileStatus.getPermission());
        }
    } else if (attributes.contains(FileAttribute.PERMISSION) && !srcFileStatus.getPermission().equals(targetFileStatus.getPermission())) {
        // Plain permission preservation (mutually exclusive with ACL branch).
        targetFS.setPermission(path, srcFileStatus.getPermission());
    }
    final boolean preserveXAttrs = attributes.contains(FileAttribute.XATTR);
    if (preserveXAttrs || preserveRawXattrs) {
        // raw.* xattrs are copied even when XATTR preservation was not
        // requested, provided preserveRawXattrs is set.
        final String rawNS = StringUtils.toLowerCase(XAttr.NameSpace.RAW.name());
        Map<String, byte[]> srcXAttrs = srcFileStatus.getXAttrs();
        Map<String, byte[]> targetXAttrs = getXAttrs(targetFS, path);
        if (srcXAttrs != null && !srcXAttrs.equals(targetXAttrs)) {
            for (Entry<String, byte[]> entry : srcXAttrs.entrySet()) {
                String xattrName = entry.getKey();
                if (xattrName.startsWith(rawNS) || preserveXAttrs) {
                    targetFS.setXAttr(path, xattrName, entry.getValue());
                }
            }
        }
    }
    // Replication only applies to files, and only when it actually changed.
    if (attributes.contains(FileAttribute.REPLICATION) && !targetFileStatus.isDirectory() && (srcFileStatus.getReplication() != targetFileStatus.getReplication())) {
        targetFS.setReplication(path, srcFileStatus.getReplication());
    }
    if (attributes.contains(FileAttribute.GROUP) && !group.equals(srcFileStatus.getGroup())) {
        group = srcFileStatus.getGroup();
        chown = true;
    }
    if (attributes.contains(FileAttribute.USER) && !user.equals(srcFileStatus.getOwner())) {
        user = srcFileStatus.getOwner();
        chown = true;
    }
    // Single setOwner call covers both user and group changes.
    if (chown) {
        targetFS.setOwner(path, user, group);
    }
    if (attributes.contains(FileAttribute.TIMES)) {
        targetFS.setTimes(path, srcFileStatus.getModificationTime(), srcFileStatus.getAccessTime());
    }
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) CopyListingFileStatus(org.apache.hadoop.tools.CopyListingFileStatus) AclEntry(org.apache.hadoop.fs.permission.AclEntry)

Aggregations

CopyListingFileStatus (org.apache.hadoop.tools.CopyListingFileStatus)44 Path (org.apache.hadoop.fs.Path)41 FileSystem (org.apache.hadoop.fs.FileSystem)36 Test (org.junit.Test)29 Text (org.apache.hadoop.io.Text)23 FileAttribute (org.apache.hadoop.tools.DistCpOptions.FileAttribute)20 StubContext (org.apache.hadoop.tools.StubContext)17 IOException (java.io.IOException)16 Mapper (org.apache.hadoop.mapreduce.Mapper)16 AccessControlException (org.apache.hadoop.security.AccessControlException)13 Configuration (org.apache.hadoop.conf.Configuration)11 DistCpOptions (org.apache.hadoop.tools.DistCpOptions)9 FileStatus (org.apache.hadoop.fs.FileStatus)8 FsPermission (org.apache.hadoop.fs.permission.FsPermission)6 SequenceFile (org.apache.hadoop.io.SequenceFile)5 UserGroupInformation (org.apache.hadoop.security.UserGroupInformation)5 ArrayList (java.util.ArrayList)3 OutputStream (java.io.OutputStream)2 AclEntry (org.apache.hadoop.fs.permission.AclEntry)2 FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)2