   * Choose a random node based on given scope, excludedScope and excludedNodes
   * set. Although in general the topology has at most three layers, this class
   * does not impose such an assumption.
   * At high level, the idea is like this, say:
   * R has two children A and B, and storage type is X, say:
   * A has X = 6 (rooted at A there are 6 datanodes with X) and B has X = 8.
   * Then R will generate a random int between 1 and 14; if it's <= 6, recursively
   * call into A, otherwise B. This maintains uniform randomness when
   * choosing datanodes.
   * The tricky part is how to handle excludes.
   * For excludedNodes, this set is small: currently the main reason for a node
   * being excluded is that it already has a replica. So randomly
   * picking this node again should be rare. Thus we only check that, if the
   * chosen node is excluded, we do chooseRandom again.
   * For excludedScope, we locate the root of the excluded scope and adjust
   * all its ancestors' storage counters accordingly; this way the excluded
   * root is out of the picture.
   * TODO : this function duplicates code in NetworkTopology and needs to
   * be refactored in the future.
   * @param scope
   * @param excludedScope
   * @param excludedNodes
   * @return
// Chooses a random node under scope that has storage of the given type,
// honouring excludedScope and excludedNodes. Returns null when the scope is
// fully excluded, the scope does not resolve to a node, or no candidate remains.
Node chooseRandomWithStorageType(final String scope, String excludedScope, final Collection<Node> excludedNodes, StorageType type) {
    if (excludedScope != null) {
        // scope has excludedScope as a prefix, so the whole scope is excluded
        if (scope.startsWith(excludedScope)) {
            return null;
        // excludedScope does not have scope as a prefix, so it is dropped
        // and plays no further part in the selection
        if (!excludedScope.startsWith(scope)) {
            excludedScope = null;
    Node node = getNode(scope);
    if (node == null) {
        LOG.debug("Invalid scope {}, non-existing node", scope);
        return null;
    if (!(node instanceof DFSTopologyNodeImpl)) {
        // a node is either DFSTopologyNodeImpl, or a DatanodeDescriptor
        // here the scope is a single node: return it only if it has the
        // requested storage type
        return ((DatanodeDescriptor) node).hasStorageType(type) ? node : null;
    DFSTopologyNodeImpl root = (DFSTopologyNodeImpl) node;
    // excludeRoot stays null when excludedScope was null or dropped above
    Node excludeRoot = excludedScope == null ? null : getNode(excludedScope);
    // check to see if there are nodes satisfying the condition at all
    int availableCount = root.getSubtreeStorageCount(type);
    if (excludeRoot != null && root.isAncestor(excludeRoot)) {
        // remove the candidates that sit under the excluded root: a whole
        // subtree count for an inner node, at most one for a single datanode
        if (excludeRoot instanceof DFSTopologyNodeImpl) {
            availableCount -= ((DFSTopologyNodeImpl) excludeRoot).getSubtreeStorageCount(type);
        } else {
            availableCount -= ((DatanodeDescriptor) excludeRoot).hasStorageType(type) ? 1 : 0;
    if (excludedNodes != null) {
        // NOTE(review): every excluded node with the storage type is subtracted,
        // with no visible check that it lies under root or outside excludeRoot,
        // so availableCount may undercount candidates - confirm this is intended
        for (Node excludedNode : excludedNodes) {
            // all excluded nodes should be DatanodeDescriptor
            Preconditions.checkArgument(excludedNode instanceof DatanodeDescriptor);
            availableCount -= ((DatanodeDescriptor) excludedNode).hasStorageType(type) ? 1 : 0;
    if (availableCount <= 0) {
        // should never be <0 in general, adding <0 check for safety purpose
        return null;
    // to this point, it is guaranteed that there is at least one node
    // that satisfies the requirement, keep trying until we found one.
    Node chosen;
    // NOTE(review): the accepting branch below is empty and the loop condition
    // is the constant true, so no exit from this loop is visible here;
    // presumably a break belongs in the first branch - confirm against the
    // full source
    do {
        chosen = chooseRandomWithStorageTypeAndExcludeRoot(root, excludeRoot, type);
        if (excludedNodes == null || !excludedNodes.contains(chosen)) {
        } else {
            LOG.debug("Node {} is excluded, continuing.", chosen);
    } while (true);
    LOG.debug("chooseRandom returning {}", chosen);
    return chosen;
DatanodeDescriptor(org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor) Node(

   * Convert current INode to UnderConstruction. Recreate lease. Create new
   * block for the truncated copy. Schedule truncation of the replicas.
   * @param fsn namespace
   * @param iip inodes in the path containing the file
   * @param leaseHolder lease holder
   * @param clientMachine client machine info
   * @param lastBlockDelta last block delta size
   * @param newBlock new block
   * @return the returned block will be written to editLog and passed back
   *         into this method upon loading.
   * @throws IOException
// Puts the file under construction, adds a lease, and sets up either a
// copy-on-truncate block or an in-place truncate of the last block.
// Returns newBlock, which is created here when the caller passes null.
static Block prepareFileForTruncate(FSNamesystem fsn, INodesInPath iip, String leaseHolder, String clientMachine, long lastBlockDelta, Block newBlock) throws IOException {
    assert fsn.hasWriteLock();
    INodeFile file = iip.getLastINode().asFile();
    assert !file.isStriped();
    file.toUnderConstruction(leaseHolder, clientMachine);
    assert file.isUnderConstruction() : "inode should be under construction.";
    // the lease is registered under the client name recorded on the file's
    // under-construction feature
    fsn.getLeaseManager().addLease(file.getFileUnderConstructionFeature().getClientName(), file.getId());
    // recovery is initialized at the end only when no newBlock was supplied;
    // captured here because newBlock is reassigned below
    boolean shouldRecoverNow = (newBlock == null);
    BlockInfo oldBlock = file.getLastBlock();
    boolean shouldCopyOnTruncate = shouldCopyOnTruncate(fsn, file, oldBlock);
    if (newBlock == null) {
        // copy-on-truncate gets a brand new block; in-place truncate keeps the
        // old block id and length but takes a new generation stamp
        newBlock = (shouldCopyOnTruncate) ? fsn.createNewBlock(BlockType.CONTIGUOUS) : new Block(oldBlock.getBlockId(), oldBlock.getNumBytes(), fsn.nextGenerationStamp(fsn.getBlockManager().isLegacyBlock(oldBlock)));
    final BlockInfo truncatedBlockUC;
    BlockManager blockManager = fsn.getFSDirectory().getBlockManager();
    if (shouldCopyOnTruncate) {
        // Add new truncateBlock into blocksMap and
        // use oldBlock as a source for copy-on-truncate recovery
        truncatedBlockUC = new BlockInfoContiguous(newBlock, file.getPreferredBlockReplication());
        // the new block is put under construction using the storages of oldBlock
        truncatedBlockUC.convertToBlockUnderConstruction(BlockUCState.UNDER_CONSTRUCTION, blockManager.getStorages(oldBlock));
        // target length is the old length minus lastBlockDelta
        truncatedBlockUC.setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
        blockManager.addBlockCollection(truncatedBlockUC, file);
        NameNode.stateChangeLog.debug("BLOCK* prepareFileForTruncate: Scheduling copy-on-truncate to new" + " size {}  new block {} old block {}", truncatedBlockUC.getNumBytes(), newBlock, oldBlock);
    } else {
        // Use new generation stamp for in-place truncate recovery
        blockManager.convertLastBlockToUnderConstruction(file, lastBlockDelta);
        // re-read the last block after the conversion call above
        oldBlock = file.getLastBlock();
        assert !oldBlock.isComplete() : "oldBlock should be under construction";
        BlockUnderConstructionFeature uc = oldBlock.getUnderConstructionFeature();
        // the truncate block is a copy of oldBlock whose length is reduced by
        // lastBlockDelta; oldBlock's own length is not changed here
        uc.setTruncateBlock(new Block(oldBlock));
        uc.getTruncateBlock().setNumBytes(oldBlock.getNumBytes() - lastBlockDelta);
        truncatedBlockUC = oldBlock;
        NameNode.stateChangeLog.debug("BLOCK* prepareFileForTruncate: " + "{} Scheduling in-place block truncate to new size {}", uc, uc.getTruncateBlock().getNumBytes());
    if (shouldRecoverNow) {
        // start block recovery with the generation stamp of newBlock
        truncatedBlockUC.getUnderConstructionFeature().initializeBlockRecovery(truncatedBlockUC, newBlock.getGenerationStamp());
    return newBlock;
BlockInfoContiguous(org.apache.hadoop.hdfs.server.blockmanagement.BlockInfoContiguous) BlockUnderConstructionFeature(org.apache.hadoop.hdfs.server.blockmanagement.BlockUnderConstructionFeature) BlockInfo(org.apache.hadoop.hdfs.server.blockmanagement.BlockInfo) BlockManager(org.apache.hadoop.hdfs.server.blockmanagement.BlockManager) Block(org.apache.hadoop.hdfs.protocol.Block)

   * Resolves a given path into an INodesInPath.  All ancestor inodes that
   * exist are validated as traversable directories.  Symlinks in the ancestry
   * will generate an UnresolvedLinkException.  The returned IIP will be an
   * accessible path that also passed additional sanity checks based on how
   * the path will be used as specified by the DirOp.
   *   READ:   Expands reserved paths and performs permission checks
   *           during traversal.  Raw paths are only accessible by a superuser.
   *   WRITE:  In addition to READ checks, ensures the path is not a
   *           snapshot path.
   *   CREATE: In addition to WRITE checks, ensures path does not contain
   *           illegal character sequences.
   * @param pc  A permission checker for traversal checks.  Pass null for
   *            no permission checks.
   * @param src The path to resolve.
   * @param dirOp The {@link DirOp} that controls additional checks.
   * @param resolveLink If false, only ancestor symlinks will be checked.  If
   *         true, the last inode will also be checked.
   * @return if the path indicates an inode, return path after replacing up to
   *         <inodeid> with the corresponding path of the inode, else the path
   *         in {@code src} as is. If the path refers to a path in the "raw"
   *         directory, return the non-raw pathname.
   * @throws FileNotFoundException
   * @throws AccessControlException
   * @throws ParentNotDirectoryException
   * @throws UnresolvedLinkException
// Resolves src into an INodesInPath and runs the traversal check for dirOp.
// Names are validated only for CREATE and CREATE_LINK operations.
public INodesInPath resolvePath(FSPermissionChecker pc, String src, DirOp dirOp) throws UnresolvedLinkException, FileNotFoundException, AccessControlException, ParentNotDirectoryException {
    boolean isCreate = (dirOp == DirOp.CREATE || dirOp == DirOp.CREATE_LINK);
    // prevent creation of new invalid paths
    if (isCreate && !DFSUtil.isValidName(src)) {
        throw new InvalidPathException("Invalid file name: " + src);
    byte[][] components = INode.getPathComponents(src);
    // computed before resolveComponents reassigns components below
    boolean isRaw = isReservedRawName(components);
    // NOTE(review): no statement is visible inside this condition; presumably
    // a superuser check for raw paths belongs here - confirm against the
    // full source
    if (isPermissionEnabled && pc != null && isRaw) {
    components = resolveComponents(components, this);
    INodesInPath iip = INodesInPath.resolve(rootDir, components, isRaw);
    // PNDE
    // for non-create operations a ParentNotDirectoryException is replaced by
    // an AccessControlException carrying the same message (the original
    // exception is not attached as the cause); create operations rethrow it
    try {
        checkTraverse(pc, iip, dirOp);
    } catch (ParentNotDirectoryException pnde) {
        if (!isCreate) {
            throw new AccessControlException(pnde.getMessage());
        throw pnde;
    return iip;
ParentNotDirectoryException(org.apache.hadoop.fs.ParentNotDirectoryException) AccessControlException( SnapshotAccessControlException(org.apache.hadoop.hdfs.protocol.SnapshotAccessControlException) InvalidPathException(org.apache.hadoop.fs.InvalidPathException)

   * @return the current state of the given segment, or null if the
   * segment does not exist.
// Looks up the edit log file for segmentTxId and describes it as a
// SegmentStateProto. Returns null when there is no such file or when its
// last transaction id is INVALID_TXID.
SegmentStateProto getSegmentInfo(long segmentTxId) throws IOException {
    EditLogFile elf = fjm.getLogFile(segmentTxId);
    if (elf == null) {
        return null;
    if (elf.isInProgress()) {
        // an in-progress file is scanned first
        // NOTE(review): presumably this is what populates the last txid read
        // below - confirm against EditLogFile
        elf.scanLog(Long.MAX_VALUE, false);
    // NOTE(review): the string expression after the brace has no enclosing
    // call in this view (presumably a logger call), and no statement that
    // moves the file aside is visible - confirm against the full source
    if (elf.getLastTxId() == HdfsServerConstants.INVALID_TXID) {"Edit log file " + elf + " appears to be empty. " + "Moving it aside...");
        return null;
    // the result carries the start txid, the last txid, and the in-progress flag
    // NOTE(review): the trailing string expression on the next line likewise
    // has no visible enclosing call - confirm against the full source
    SegmentStateProto ret = SegmentStateProto.newBuilder().setStartTxId(segmentTxId).setEndTxId(elf.getLastTxId()).setIsInProgress(elf.isInProgress()).build();"getSegmentInfo(" + segmentTxId + "): " + elf + " -> " + TextFormat.shortDebugString(ret));
    return ret;
SegmentStateProto(org.apache.hadoop.hdfs.qjournal.protocol.QJournalProtocolProtos.SegmentStateProto) EditLogFile(org.apache.hadoop.hdfs.server.namenode.FileJournalManager.EditLogFile)

   * Create a new checkpoint
   * @return if the image is fetched from primary or not
// Rolls the edit log on the namenode, downloads the checkpoint files, merges
// them, and uploads the resulting image. Returns the final loadImage flag.
public boolean doCheckpoint() throws IOException {
    NNStorage dstStorage = checkpointImage.getStorage();
    // Tell the namenode to start logging transactions in a new edit file
    // Returns a token that would be used to upload the merged image.
    CheckpointSignature sig = namenode.rollEditLog();
    boolean loadImage = false;
    // a namespace id of 0 is treated as a checkpointer with no prior state
    boolean isFreshCheckpointer = (checkpointImage.getNamespaceID() == 0);
    // with federation support the signature's isSameCluster check is used;
    // without it only the namespace id is compared
    boolean isSameCluster = (dstStorage.versionSupportsFederation(NameNodeLayoutVersion.FEATURES) && sig.isSameCluster(checkpointImage)) || (!dstStorage.versionSupportsFederation(NameNodeLayoutVersion.FEATURES) && sig.namespaceIdMatches(checkpointImage));
    if (isFreshCheckpointer || (isSameCluster && !sig.storageVersionMatches(checkpointImage.getStorage()))) {
        // if we're a fresh 2NN, or if we're on the same cluster and our storage
        // needs an upgrade, just take the storage info from the server.
        // NOTE(review): only loadImage is set in this view; no statement that
        // copies storage info from the signature is visible - confirm
        loadImage = true;
    // error simulation code for junit test
    // NOTE(review): no error-simulation statement is visible after the comment
    // above - confirm against the full source
    // the manifest starts at the transaction after the most recent checkpoint
    RemoteEditLogManifest manifest = namenode.getEditLogManifest(sig.mostRecentCheckpointTxId + 1);
    // Fetch fsimage and edits. Reload the image if previous merge failed.
    // the single | is non-short-circuiting, so hasMergeError() is evaluated
    // whatever downloadCheckpointFiles returns
    loadImage |= downloadCheckpointFiles(fsName, checkpointImage, sig, manifest) | checkpointImage.hasMergeError();
    try {
        doMerge(sig, manifest, loadImage, checkpointImage, namesystem);
    } catch (IOException ioe) {
        // A merge error occurred. The in-memory file system state may be
        // inconsistent, so the image and edits need to be reloaded.
        // NOTE(review): only the rethrow is visible here; no statement that
        // records the merge error is shown - confirm against the full source
        throw ioe;
    // Clear any error since merge was successful.
    // NOTE(review): no statement that clears the error is visible after the
    // comment above - confirm against the full source
    // Upload the new image into the NameNode. Then tell the Namenode
    // to make this new uploaded image as the most current image.
    long txid = checkpointImage.getLastAppliedTxId();
    TransferFsImage.uploadImageFromStorage(fsName, conf, dstStorage, NameNodeFile.IMAGE, txid);
    // error simulation code for junit test
    // the completion message below is logged at WARN level
    LOG.warn("Checkpoint done. New Image Size: " + dstStorage.getFsImageName(txid).length());
    if (legacyOivImageDir != null && !legacyOivImageDir.isEmpty()) {
        // a failure to write the legacy OIV image is logged and does not
        // propagate out of this method
        try {
            checkpointImage.saveLegacyOIVImage(namesystem, legacyOivImageDir, new Canceler());
        } catch (IOException e) {
            LOG.warn("Failed to write legacy OIV image: ", e);
    return loadImage;
RemoteEditLogManifest(org.apache.hadoop.hdfs.server.protocol.RemoteEditLogManifest) Canceler(org.apache.hadoop.hdfs.util.Canceler) IOException(


