Use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.
From the class DatanodeProtocolServerSideTranslatorPB, method sendHeartbeat:
@Override
public HeartbeatResponseProto sendHeartbeat(RpcController controller,
    HeartbeatRequestProto request) throws ServiceException {
  HeartbeatResponse response;
  try {
    final StorageReport[] report =
        PBHelperClient.convertStorageReports(request.getReportsList());
    VolumeFailureSummary volumeFailureSummary =
        request.hasVolumeFailureSummary()
            ? PBHelper.convertVolumeFailureSummary(request.getVolumeFailureSummary())
            : null;
    response = impl.sendHeartbeat(PBHelper.convert(request.getRegistration()),
        report, request.getCacheCapacity(), request.getCacheUsed(),
        request.getXmitsInProgress(), request.getXceiverCount(),
        request.getFailedVolumes(), volumeFailureSummary,
        request.getRequestFullBlockReportLease(),
        PBHelper.convertSlowPeerInfo(request.getSlowPeersList()));
  } catch (IOException e) {
    throw new ServiceException(e);
  }
  HeartbeatResponseProto.Builder builder = HeartbeatResponseProto.newBuilder();
  DatanodeCommand[] cmds = response.getCommands();
  if (cmds != null) {
    for (int i = 0; i < cmds.length; i++) {
      if (cmds[i] != null) {
        builder.addCmds(PBHelper.convert(cmds[i]));
      }
    }
  }
  builder.setHaStatus(PBHelper.convert(response.getNameNodeHaState()));
  RollingUpgradeStatus rollingUpdateStatus = response.getRollingUpdateStatus();
  if (rollingUpdateStatus != null) {
    // V2 is always set for newer datanodes.
    // To be compatible with older datanodes, V1 is set to null
    // if the RU was finalized.
    RollingUpgradeStatusProto rus =
        PBHelperClient.convertRollingUpgradeStatus(rollingUpdateStatus);
    builder.setRollingUpgradeStatusV2(rus);
    if (!rollingUpdateStatus.isFinalized()) {
      builder.setRollingUpgradeStatus(rus);
    }
  }
  builder.setFullBlockReportLeaseId(response.getFullBlockReportLeaseId());
  return builder.build();
}
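For reference, here is a minimal sketch (not taken from the Hadoop sources) of the object this translator hands to impl.sendHeartbeat(). It assumes a three-argument constructor mirroring the fields carried in VolumeFailureSummaryProto; the path and sizes below are made up for illustration.

  // Sketch only: constructor signature assumed to mirror the proto fields
  // (failed storage locations, last failure time, estimated capacity lost).
  String[] failedLocations = { "/data/dfs/data3" };   // hypothetical volume path
  long lastFailureTime = System.currentTimeMillis();  // time of the last failure
  long capacityLostBytes = 512L * 1024 * 1024;        // bytes believed lost
  VolumeFailureSummary summary =
      new VolumeFailureSummary(failedLocations, lastFailureTime, capacityLostBytes);
  // The getters used elsewhere on this page read those same fields back.
  assert summary.getFailedStorageLocations().length == 1;
  assert summary.getEstimatedCapacityLostTotal() == capacityLostBytes;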
Use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.
From the class BPServiceActor, method sendHeartBeat:
HeartbeatResponse sendHeartBeat(boolean requestBlockReportLease)
    throws IOException {
  scheduler.scheduleNextHeartbeat();
  StorageReport[] reports =
      dn.getFSDataset().getStorageReports(bpos.getBlockPoolId());
  if (LOG.isDebugEnabled()) {
    LOG.debug("Sending heartbeat with " + reports.length
        + " storage reports from service actor: " + this);
  }
  final long now = monotonicNow();
  scheduler.updateLastHeartbeatTime(now);
  VolumeFailureSummary volumeFailureSummary =
      dn.getFSDataset().getVolumeFailureSummary();
  int numFailedVolumes = volumeFailureSummary != null
      ? volumeFailureSummary.getFailedStorageLocations().length : 0;
  final boolean slowPeersReportDue = scheduler.isSlowPeersReportDue(now);
  final SlowPeerReports slowPeers =
      slowPeersReportDue && dn.getPeerMetrics() != null
          ? SlowPeerReports.create(dn.getPeerMetrics().getOutliers())
          : SlowPeerReports.EMPTY_REPORT;
  HeartbeatResponse response = bpNamenode.sendHeartbeat(bpRegistration, reports,
      dn.getFSDataset().getCacheCapacity(),
      dn.getFSDataset().getCacheUsed(),
      dn.getXmitsInProgress(),
      dn.getXceiverCount(),
      numFailedVolumes,
      volumeFailureSummary,
      requestBlockReportLease,
      slowPeers);
  if (slowPeersReportDue) {
    // If the report was due and successfully sent, schedule the next one.
    scheduler.scheduleNextSlowPeerReport();
  }
  return response;
}
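The null check above is the whole contract: getVolumeFailureSummary() returns null when no volume has failed, so the failed-volume count defaults to zero. A minimal sketch of that logic pulled out into a helper (the helper name is illustrative, not part of BPServiceActor):

  // Null-safe count of failed volumes, as computed inline in sendHeartBeat().
  static int countFailedVolumes(VolumeFailureSummary summary) {
    return summary != null ? summary.getFailedStorageLocations().length : 0;
  }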
Use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.
From the class TestDataNodeVolumeFailure, method testVolumeFailure:
/*
 * Verify the number of blocks and files are correct after volume failure,
 * and that we can replicate to both datanodes even after a single volume
 * failure if the configuration parameter allows this.
 */
@Test(timeout = 120000)
public void testVolumeFailure() throws Exception {
  System.out.println("Data dir: is " + dataDir.getPath());
  // Data dir structure is dataDir/data[1-4]/[current,tmp...]
  // data1,2 is for datanode 1; data3,4 for datanode 2
  String filename = "/test.txt";
  Path filePath = new Path(filename);
  // Use only a small number of blocks to avoid creating subdirs
  // in the data dir.
  int filesize = block_size * blocks_num;
  DFSTestUtil.createFile(fs, filePath, filesize, repl, 1L);
  DFSTestUtil.waitReplication(fs, filePath, repl);
  System.out.println("file " + filename + "(size " + filesize
      + ") is created and replicated");
  // Fail the volume:
  // delete/make non-writable one of the directories (failed volume).
  data_fail = new File(dataDir, "data3");
  failedDir = MiniDFSCluster.getFinalizedDir(data_fail,
      cluster.getNamesystem().getBlockPoolId());
  if (failedDir.exists() && //!FileUtil.fullyDelete(failedDir)
      !deteteBlocks(failedDir)) {
    throw new IOException("Could not delete hdfs directory '" + failedDir + "'");
  }
  data_fail.setReadOnly();
  failedDir.setReadOnly();
  System.out.println("Deleting " + failedDir.getPath() + "; exist="
      + failedDir.exists());
  // Access all the blocks on the "failed" DataNode.
  // We need to make sure the "failed" volume is actually accessed;
  // that causes the failure, block removal, and an "emergency" block report.
  triggerFailure(filename, filesize);
  // The DN eventually has the latest volume failure information
  // for the next heartbeat.
  final DataNode dn = cluster.getDataNodes().get(1);
  GenericTestUtils.waitFor(new Supplier<Boolean>() {

    @Override
    public Boolean get() {
      final VolumeFailureSummary summary =
          dn.getFSDataset().getVolumeFailureSummary();
      return summary != null
          && summary.getFailedStorageLocations() != null
          && summary.getFailedStorageLocations().length == 1;
    }
  }, 10, 30 * 1000);
  // Trigger the DN to send a heartbeat.
  DataNodeTestUtils.triggerHeartbeat(dn);
  final BlockManager bm = cluster.getNamesystem().getBlockManager();
  // Trigger the NN to handle the heartbeat.
  BlockManagerTestUtil.checkHeartbeat(bm);
  // The NN should now have the latest volume failure.
  assertEquals(1, cluster.getNamesystem().getVolumeFailuresTotal());
  // Verify the number of blocks and files.
  verify(filename, filesize);
  // Create another file (with one volume failed).
  System.out.println("creating file test1.txt");
  Path fileName1 = new Path("/test1.txt");
  DFSTestUtil.createFile(fs, fileName1, filesize, repl, 1L);
  // Should be able to replicate to both nodes (2 DN, repl=2).
  DFSTestUtil.waitReplication(fs, fileName1, repl);
  System.out.println("file " + fileName1.getName()
      + " is created and replicated");
}
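The waitFor block above polls the DataNode until its VolumeFailureSummary reports exactly one failed storage location. The same wait written with a Java 8 lambda is shown below as a sketch; it assumes the Supplier parameter of GenericTestUtils.waitFor is a functional interface (both the Guava and java.util.function variants used across Hadoop branches are), so the lambda is interchangeable with the anonymous class.

  // Poll every 10 ms, for at most 30 s, until one failed location is reported.
  GenericTestUtils.waitFor(() -> {
    VolumeFailureSummary s = dn.getFSDataset().getVolumeFailureSummary();
    return s != null
        && s.getFailedStorageLocations() != null
        && s.getFailedStorageLocations().length == 1;
  }, 10, 30 * 1000);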
Use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.
From the class DatanodeLifelineProtocolServerSideTranslatorPB, method sendLifeline:
@Override
public LifelineResponseProto sendLifeline(RpcController controller,
    HeartbeatRequestProto request) throws ServiceException {
  try {
    final StorageReport[] report =
        PBHelperClient.convertStorageReports(request.getReportsList());
    VolumeFailureSummary volumeFailureSummary =
        request.hasVolumeFailureSummary()
            ? PBHelper.convertVolumeFailureSummary(request.getVolumeFailureSummary())
            : null;
    impl.sendLifeline(PBHelper.convert(request.getRegistration()), report,
        request.getCacheCapacity(), request.getCacheUsed(),
        request.getXmitsInProgress(), request.getXceiverCount(),
        request.getFailedVolumes(), volumeFailureSummary);
    return VOID_LIFELINE_RESPONSE_PROTO;
  } catch (IOException e) {
    throw new ServiceException(e);
  }
}
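Both this lifeline translator and the heartbeat translator above guard the optional proto field the same way: convert the summary only when it is present, otherwise pass null. A sketch of that shared pattern as a private helper (the helper itself is illustrative and does not exist in the Hadoop code):

  // Returns null when the optional volumeFailureSummary field is absent,
  // so the receiver sees "no failed volumes" rather than an empty summary.
  private static VolumeFailureSummary optionalVolumeFailureSummary(
      HeartbeatRequestProto request) {
    return request.hasVolumeFailureSummary()
        ? PBHelper.convertVolumeFailureSummary(request.getVolumeFailureSummary())
        : null;
  }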
Use of org.apache.hadoop.hdfs.server.protocol.VolumeFailureSummary in project hadoop by apache.
From the class FSNamesystem, method getEstimatedCapacityLostTotal:
// FSNamesystemMBean
@Override
public long getEstimatedCapacityLostTotal() {
  List<DatanodeDescriptor> live = new ArrayList<DatanodeDescriptor>();
  getBlockManager().getDatanodeManager().fetchDatanodes(live, null, false);
  long estimatedCapacityLostTotal = 0;
  for (DatanodeDescriptor node : live) {
    VolumeFailureSummary volumeFailureSummary = node.getVolumeFailureSummary();
    if (volumeFailureSummary != null) {
      estimatedCapacityLostTotal +=
          volumeFailureSummary.getEstimatedCapacityLostTotal();
    }
  }
  return estimatedCapacityLostTotal;
}
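The loop sums getEstimatedCapacityLostTotal() over every live DataNode that currently has a VolumeFailureSummary. The same aggregation restated with the Java 8 stream API, purely as a sketch and not the FSNamesystem implementation (requires java.util.Objects):

  // Sum the estimated lost capacity across live nodes, skipping nodes
  // with no recorded volume failures (null summaries).
  long estimatedCapacityLostTotal = live.stream()
      .map(DatanodeDescriptor::getVolumeFailureSummary)
      .filter(Objects::nonNull)
      .mapToLong(VolumeFailureSummary::getEstimatedCapacityLostTotal)
      .sum();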