
Example 1 with VolumeFailureSummary

use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.

the class DatanodeProtocolServerSideTranslatorPB method sendHeartbeat.

@Override
public HeartbeatResponseProto sendHeartbeat(RpcController controller, HeartbeatRequestProto request) throws ServiceException {
    HeartbeatResponse response;
    try {
        final StorageReport[] report = PBHelperClient.convertStorageReports(request.getReportsList());
        VolumeFailureSummary volumeFailureSummary = request.hasVolumeFailureSummary()
            ? PBHelper.convertVolumeFailureSummary(request.getVolumeFailureSummary())
            : null;
        response = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()),
            report, request.getCacheCapacity(), request.getCacheUsed(),
            request.getXmitsInProgress(), request.getXceiverCount(),
            request.getFailedVolumes(), volumeFailureSummary,
            request.getRequestFullBlockReportLease(),
            PBHelper.convertSlowPeerInfo(request.getSlowPeersList()));
    } catch (IOException e) {
        throw new ServiceException(e);
    }
    HeartbeatResponseProto.Builder builder = HeartbeatResponseProto.newBuilder();
    DatanodeCommand[] cmds = response.getCommands();
    if (cmds != null) {
        for (int i = 0; i < cmds.length; i++) {
            if (cmds[i] != null) {
                builder.addCmds(PBHelper.convert(cmds[i]));
            }
        }
    }
    builder.setHaStatus(PBHelper.convert(response.getNameNodeHaState()));
    RollingUpgradeStatus rollingUpdateStatus = response.getRollingUpdateStatus();
    if (rollingUpdateStatus != null) {
        // V2 is always set for newer datanodes.
        // To be compatible with older datanodes, V1 is set to null
        //  if the RU was finalized.
        RollingUpgradeStatusProto rus = PBHelperClient.convertRollingUpgradeStatus(rollingUpdateStatus);
        builder.setRollingUpgradeStatusV2(rus);
        if (!rollingUpdateStatus.isFinalized()) {
            builder.setRollingUpgradeStatus(rus);
        }
    }
    builder.setFullBlockReportLeaseId(response.getFullBlockReportLeaseId());
    return builder.build();
}
Also used : HeartbeatResponse(org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse) RollingUpgradeStatusProto(org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.RollingUpgradeStatusProto) RollingUpgradeStatus(org.apache.hadoop.hdfs.protocol.RollingUpgradeStatus) StorageReport(org.apache.hadoop.hdfs.server.protocol.StorageReport) IOException(java.io.IOException) VolumeFailureSummary(org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary) DatanodeCommand(org.apache.hadoop.hdfs.server.protocol.DatanodeCommand) ServiceException(com.google.protobuf.ServiceException) HeartbeatResponseProto(org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatResponseProto)
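
For reference, VolumeFailureSummary itself is a small value class; a minimal sketch of constructing one and reading the three fields the translator round-trips over protobuf (the path and sizes below are made up for illustration):

import org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary;

public class VolumeFailureSummaryDemo {
    public static void main(String[] args) {
        // Hypothetical values, for illustration only.
        String[] failedLocations = { "/data/dn/volume3" };
        long lastFailureDate = System.currentTimeMillis();
        long estimatedCapacityLost = 512L * 1024 * 1024; // assume ~512 MB lost

        VolumeFailureSummary summary =
            new VolumeFailureSummary(failedLocations, lastFailureDate, estimatedCapacityLost);

        // These are the fields PBHelper converts to and from the protobuf message.
        System.out.println("failed volumes: " + summary.getFailedStorageLocations().length);
        System.out.println("last failure:   " + summary.getLastVolumeFailureDate());
        System.out.println("capacity lost:  " + summary.getEstimatedCapacityLostTotal());
    }
}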

Example 2 with VolumeFailureSummary

use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.

the class BPServiceActor method sendHeartBeat.

HeartbeatResponse sendHeartBeat(boolean requestBlockReportLease) throws IOException {
    scheduler.scheduleNextHeartbeat();
    StorageReport[] reports = dn.getFSDataset().getStorageReports(bpos.getBlockPoolId());
    if (LOG.isDebugEnabled()) {
        LOG.debug("Sending heartbeat with " + reports.length + " storage reports from service actor: " + this);
    }
    final long now = monotonicNow();
    scheduler.updateLastHeartbeatTime(now);
    VolumeFailureSummary volumeFailureSummary = dn.getFSDataset().getVolumeFailureSummary();
    int numFailedVolumes = volumeFailureSummary != null ? volumeFailureSummary.getFailedStorageLocations().length : 0;
    final boolean slowPeersReportDue = scheduler.isSlowPeersReportDue(now);
    final SlowPeerReports slowPeers =
        slowPeersReportDue && dn.getPeerMetrics() != null
            ? SlowPeerReports.create(dn.getPeerMetrics().getOutliers())
            : SlowPeerReports.EMPTY_REPORT;
    HeartbeatResponse response = bpNamenode.sendHeartbeat(bpRegistration, reports,
        dn.getFSDataset().getCacheCapacity(), dn.getFSDataset().getCacheUsed(),
        dn.getXmitsInProgress(), dn.getXceiverCount(), numFailedVolumes,
        volumeFailureSummary, requestBlockReportLease, slowPeers);
    if (slowPeersReportDue) {
        // If the report was due and successfully sent, schedule the next one.
        scheduler.scheduleNextSlowPeerReport();
    }
    return response;
}
Also used : HeartbeatResponse(org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse) StorageReport(org.apache.hadoop.hdfs.server.protocol.StorageReport) SlowPeerReports(org.apache.hadoop.hdfs.server.protocol.SlowPeerReports) VolumeFailureSummary(org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary)
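
Note the null check: the datanode reports a null summary when no volumes have failed, and sendHeartBeat maps that to a failed-volume count of zero. A tiny hypothetical helper capturing that convention:

/** Hypothetical helper mirroring the null-safe count used in sendHeartBeat. */
static int countFailedVolumes(VolumeFailureSummary summary) {
    return summary != null ? summary.getFailedStorageLocations().length : 0;
}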

Example 3 with VolumeFailureSummary

use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.

the class TestDataNodeVolumeFailure method testVolumeFailure.

/*
   * Verify the number of blocks and files are correct after volume failure,
   * and that we can replicate to both datanodes even after a single volume
   * failure if the configuration parameter allows this.
   */
@Test(timeout = 120000)
public void testVolumeFailure() throws Exception {
    System.out.println("Data dir: is " + dataDir.getPath());
    // Data dir structure is dataDir/data[1-4]/[current,tmp...]
    // data1,2 belong to datanode 1; data3,4 to datanode 2
    String filename = "/test.txt";
    Path filePath = new Path(filename);
    // we use only a small number of blocks to avoid creating subdirs in the data dir.
    int filesize = block_size * blocks_num;
    DFSTestUtil.createFile(fs, filePath, filesize, repl, 1L);
    DFSTestUtil.waitReplication(fs, filePath, repl);
    System.out.println("file " + filename + "(size " + filesize + ") is created and replicated");
    // fail the volume
    // delete/make non-writable one of the directories (failed volume)
    data_fail = new File(dataDir, "data3");
    failedDir = MiniDFSCluster.getFinalizedDir(data_fail, cluster.getNamesystem().getBlockPoolId());
    if (failedDir.exists() && //!FileUtil.fullyDelete(failedDir)
    !deteteBlocks(failedDir)) {
        throw new IOException("Could not delete hdfs directory '" + failedDir + "'");
    }
    data_fail.setReadOnly();
    failedDir.setReadOnly();
    System.out.println("Deleteing " + failedDir.getPath() + "; exist=" + failedDir.exists());
    // access all the blocks on the "failed" DataNode, 
    // we need to make sure that the "failed" volume is being accessed - 
    // and that will cause failure, blocks removal, "emergency" block report
    triggerFailure(filename, filesize);
    // the DN should eventually have the latest volume failure information for the next heartbeat
    final DataNode dn = cluster.getDataNodes().get(1);
    GenericTestUtils.waitFor(new Supplier<Boolean>() {

        @Override
        public Boolean get() {
            final VolumeFailureSummary summary = dn.getFSDataset().getVolumeFailureSummary();
            return summary != null && summary.getFailedStorageLocations() != null && summary.getFailedStorageLocations().length == 1;
        }
    }, 10, 30 * 1000);
    // trigger DN to send heartbeat
    DataNodeTestUtils.triggerHeartbeat(dn);
    final BlockManager bm = cluster.getNamesystem().getBlockManager();
    // trigger the NN to handle the heartbeat
    BlockManagerTestUtil.checkHeartbeat(bm);
    // the NN should now have the latest volume failure info
    assertEquals(1, cluster.getNamesystem().getVolumeFailuresTotal());
    // verify number of blocks and files...
    verify(filename, filesize);
    // create another file (with one volume failed).
    System.out.println("creating file test1.txt");
    Path fileName1 = new Path("/test1.txt");
    DFSTestUtil.createFile(fs, fileName1, filesize, repl, 1L);
    // should be able to replicate to both nodes (2 DN, repl=2)
    DFSTestUtil.waitReplication(fs, fileName1, repl);
    System.out.println("file " + fileName1.getName() + " is created and replicated");
}
Also used : Path(org.apache.hadoop.fs.Path) BlockManager(org.apache.hadoop.hdfs.server.blockmanagement.BlockManager) IOException(java.io.IOException) File(java.io.File) VolumeFailureSummary(org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary) Test(org.junit.Test)
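
On branches where GenericTestUtils.waitFor accepts a functional Supplier, the anonymous class above can be written as a lambda; a behavior-equivalent sketch:

GenericTestUtils.waitFor(() -> {
    VolumeFailureSummary summary = dn.getFSDataset().getVolumeFailureSummary();
    // Wait until exactly one failed storage location is reported.
    return summary != null
        && summary.getFailedStorageLocations() != null
        && summary.getFailedStorageLocations().length == 1;
}, 10, 30 * 1000);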

Example 4 with VolumeFailureSummary

use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.

the class DatanodeLifelineProtocolServerSideTranslatorPB method sendLifeline.

@Override
public LifelineResponseProto sendLifeline(RpcController controller, HeartbeatRequestProto request) throws ServiceException {
    try {
        final StorageReport[] report = PBHelperClient.convertStorageReports(request.getReportsList());
        VolumeFailureSummary volumeFailureSummary = request.hasVolumeFailureSummary()
            ? PBHelper.convertVolumeFailureSummary(request.getVolumeFailureSummary())
            : null;
        impl.sendLifeline(PBHelper.convert(request.getRegistration()), report,
            request.getCacheCapacity(), request.getCacheUsed(),
            request.getXmitsInProgress(), request.getXceiverCount(),
            request.getFailedVolumes(), volumeFailureSummary);
        return VOID_LIFELINE_RESPONSE_PROTO;
    } catch (IOException e) {
        throw new ServiceException(e);
    }
}
Also used : ServiceException(com.google.protobuf.ServiceException) StorageReport(org.apache.hadoop.hdfs.server.protocol.StorageReport) IOException(java.io.IOException) VolumeFailureSummary(org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary)
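
Examples 1 and 4 repeat the same optional-field conversion; a hypothetical helper (not part of the Hadoop codebase) that both translators could share:

/** Hypothetical helper: convert the optional summary field, or return null if absent. */
private static VolumeFailureSummary convertOptionalSummary(HeartbeatRequestProto request) {
    return request.hasVolumeFailureSummary()
        ? PBHelper.convertVolumeFailureSummary(request.getVolumeFailureSummary())
        : null;
}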

Example 5 with VolumeFailureSummary

use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.

the class FSNamesystem method getEstimatedCapacityLostTotal.

// FSNamesystemMBean
@Override
public long getEstimatedCapacityLostTotal() {
    List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
    getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
    long estimatedCapacityLostTotal = 0;
    for (DatanodeDescriptor node : live) {
        VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
        if (volumeFailureSummary != null) {
            estimatedCapacityLostTotal += volumeFailureSummary.getEstimatedCapacityLostTotal();
        }
    }
    return estimatedCapacityLostTotal;
}
Also used : DatanodeDescriptor(org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor) ArrayList(java.util.ArrayList) VolumeFailureSummary(org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary)
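
The same aggregation can also be expressed with Java 8 streams; a behavior-equivalent sketch (assuming java.util.Objects is imported):

long estimatedCapacityLostTotal = live.stream()
    .map(DatanodeDescriptor::getVolumeFailureSummary)
    .filter(Objects::nonNull)
    .mapToLong(VolumeFailureSummary::getEstimatedCapacityLostTotal)
    .sum();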

Aggregations

VolumeFailureSummary (org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary) 7
IOException (java.io.IOException) 3
DatanodeDescriptor (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor) 3
StorageReport (org.apache.hadoop.hdfs.server.protocol.StorageReport) 3
ServiceException (com.google.protobuf.ServiceException) 2
ArrayList (java.util.ArrayList) 2
HeartbeatResponse (org.apache.hadoop.hdfs.server.protocol.HeartbeatResponse) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 1
File (java.io.File) 1
HashMap (java.util.HashMap) 1
Map (java.util.Map) 1
TreeMap (java.util.TreeMap) 1
Path (org.apache.hadoop.fs.Path) 1
RollingUpgradeStatus (org.apache.hadoop.hdfs.protocol.RollingUpgradeStatus) 1
HeartbeatResponseProto (org.apache.hadoop.hdfs.protocol.proto.DatanodeProtocolProtos.HeartbeatResponseProto) 1
RollingUpgradeStatusProto (org.apache.hadoop.hdfs.protocol.proto.HdfsProtos.RollingUpgradeStatusProto) 1
BlockManager (org.apache.hadoop.hdfs.server.blockmanagement.BlockManager) 1
DatanodeCommand (org.apache.hadoop.hdfs.server.protocol.DatanodeCommand) 1
SlowPeerReports (org.apache.hadoop.hdfs.server.protocol.SlowPeerReports) 1
Test (org.junit.Test) 1