Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.
The class TestDFSOutputStream, method testNoLocalWriteFlag.
@Test
public void testNoLocalWriteFlag() throws IOException {
  DistributedFileSystem fs = cluster.getFileSystem();
  EnumSet<CreateFlag> flags = EnumSet.of(CreateFlag.NO_LOCAL_WRITE,
      CreateFlag.CREATE);
  BlockManager bm = cluster.getNameNode().getNamesystem().getBlockManager();
  DatanodeManager dm = bm.getDatanodeManager();
  try (FSDataOutputStream os = fs.create(new Path("/test-no-local"),
      FsPermission.getDefault(), flags, 512, (short) 2, 512, null)) {
    // Inject a DatanodeManager that reports one DataNode as the local node
    // for the client.
    DatanodeManager spyDm = spy(dm);
    DatanodeDescriptor dn1 = dm.getDatanodeListForReport(
        HdfsConstants.DatanodeReportType.LIVE).get(0);
    doReturn(dn1).when(spyDm).getDatanodeByHost("127.0.0.1");
    Whitebox.setInternalState(bm, "datanodeManager", spyDm);
    byte[] buf = new byte[512 * 16];
    new Random().nextBytes(buf);
    os.write(buf);
  } finally {
    Whitebox.setInternalState(bm, "datanodeManager", dm);
  }
  cluster.triggerBlockReports();
  final String bpid = cluster.getNamesystem().getBlockPoolId();
  // The cluster has three DataNodes in total.
  assertEquals(3, cluster.getAllBlockReports(bpid).size());
  int numDataNodesWithData = 0;
  for (Map<DatanodeStorage, BlockListAsLongs> dnBlocks :
      cluster.getAllBlockReports(bpid)) {
    for (BlockListAsLongs blocks : dnBlocks.values()) {
      if (blocks.getNumberOfBlocks() > 0) {
        numDataNodesWithData++;
        break;
      }
    }
  }
  // With NO_LOCAL_WRITE, the writer's local DataNode should receive no
  // replica, so exactly one of the three DNs holds no data.
  assertEquals(1, 3 - numDataNodesWithData);
}
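For context, the flag exercised above is purely a client-side hint supplied at create time. A minimal sketch of that client call, assuming a running cluster and an existing DistributedFileSystem handle (the path, buffer size, block size, and payload below are illustrative, not taken from the test):

import java.io.IOException;
import java.util.EnumSet;
import org.apache.hadoop.fs.CreateFlag;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.permission.FsPermission;
import org.apache.hadoop.hdfs.DistributedFileSystem;

static void writeAvoidingLocalNode(DistributedFileSystem fs)
    throws IOException {
  // Ask the NameNode not to place the first replica on the writer's node.
  EnumSet<CreateFlag> flags =
      EnumSet.of(CreateFlag.CREATE, CreateFlag.NO_LOCAL_WRITE);
  try (FSDataOutputStream out = fs.create(new Path("/no-local"),
      FsPermission.getDefault(), flags,
      4096,        // buffer size (illustrative)
      (short) 2,   // replication factor
      512,         // tiny block size, for illustration only
      null)) {     // no Progressable callback
    out.write(new byte[1024]);
  }
}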
Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.
The class TestDataNodeVolumeFailureToleration, method testFailedVolumeOnStartupIsCounted.
/**
 * Test that a volume that is considered failed on startup is seen as
 * a failed volume by the NN.
 */
@Test
public void testFailedVolumeOnStartupIsCounted() throws Exception {
  assumeNotWindows();
  final DatanodeManager dm = cluster.getNamesystem().getBlockManager()
      .getDatanodeManager();
  long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
  File dir = new File(cluster.getInstanceStorageDir(0, 0), "current");
  try {
    prepareDirToFail(dir);
    restartDatanodes(1, false);
    // The cluster is up...
    assertTrue(cluster.getDataNodes().get(0).isBPServiceAlive(
        cluster.getNamesystem().getBlockPoolId()));
    // ...but there has been a single volume failure.
    DFSTestUtil.waitForDatanodeStatus(dm, 1, 0, 1, origCapacity / 2,
        WAIT_FOR_HEARTBEATS);
  } finally {
    FileUtil.chmod(dir.toString(), "755");
  }
}
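The helper prepareDirToFail is not shown in this snippet. Given that the finally block restores the directory to mode 755, a plausible sketch is that it simply revokes all permissions so the DataNode's disk check rejects the volume at startup. This is a hypothetical reconstruction, not the verbatim helper:

// Hypothetical sketch: make the volume directory unreadable so the DataNode
// counts it as failed during startup. The finally block above undoes this
// with chmod 755.
private void prepareDirToFail(File dir) throws Exception {
  dir.mkdirs();
  assertEquals("Couldn't chmod local vol", 0,
      FileUtil.chmod(dir.toString(), "000"));
}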
Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.
The class TestDataNodeVolumeFailureReporting, method testAutoFormatEmptyDirectory.
@Test
public void testAutoFormatEmptyDirectory() throws Exception {
  // Remove the VERSION file.
  File dn1Vol1 = cluster.getStorageDir(0, 0);
  File current = new File(dn1Vol1, "current");
  File currentVersion = new File(current, "VERSION");
  assertTrue("Failed to delete VERSION file", currentVersion.delete());
  // Restart the DataNode.
  assertTrue(cluster.restartDataNodes(true));
  // The DN should tolerate one volume failure.
  cluster.waitActive();
  ArrayList<DataNode> dns = cluster.getDataNodes();
  DataNode dn = dns.get(0);
  assertFalse("DataNode should not reformat if VERSION is missing",
      currentVersion.exists());
  // Make sure the DN's JMX view sees the failed volume.
  final String[] expectedFailedVolumes = { dn1Vol1.getAbsolutePath() };
  DataNodeTestUtils.triggerHeartbeat(dn);
  FsDatasetSpi<?> fsd = dn.getFSDataset();
  assertEquals(expectedFailedVolumes.length, fsd.getNumFailedVolumes());
  assertArrayEquals(expectedFailedVolumes,
      convertToAbsolutePaths(fsd.getFailedStorageLocations()));
  // There shouldn't be any more volume failures due to I/O failure.
  checkFailuresAtDataNode(dn, 0, false, expectedFailedVolumes);
  // The NN reports one volume failure.
  final DatanodeManager dm = cluster.getNamesystem().getBlockManager()
      .getDatanodeManager();
  long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
  DFSTestUtil.waitForDatanodeStatus(dm, 1, 0, 1, (1 * dnCapacity),
      WAIT_FOR_HEARTBEATS);
  checkAggregateFailuresAtNameNode(false, 1);
  checkFailuresAtNameNode(dm, dns.get(0), false, dn1Vol1.getAbsolutePath());
}
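convertToAbsolutePaths is another private helper that is not shown. Since FsDatasetSpi#getFailedStorageLocations returns location strings that may be file URIs, a plausible sketch (an assumption, not the verbatim helper; java.net.URI and java.net.URISyntaxException imports assumed) normalizes each entry to an absolute path before comparison:

// Hypothetical sketch: normalize failed-storage location strings (which may
// be file: URIs) to absolute paths so they compare equal to
// dn1Vol1.getAbsolutePath().
private String[] convertToAbsolutePaths(String[] locations) {
  if (locations == null) {
    return new String[0];
  }
  String[] paths = new String[locations.length];
  for (int i = 0; i < locations.length; i++) {
    try {
      paths[i] = new File(new URI(locations[i])).getAbsolutePath();
    } catch (URISyntaxException e) {
      // Not a URI; assume it is already a plain path.
      paths[i] = locations[i];
    }
  }
  return paths;
}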
Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.
The class TestDataNodeVolumeFailureReporting, method testDataNodeReconfigureWithVolumeFailures.
@Test
public void testDataNodeReconfigureWithVolumeFailures() throws Exception {
  // Bring up two more datanodes.
  cluster.startDataNodes(conf, 2, true, null, null);
  cluster.waitActive();
  final DatanodeManager dm = cluster.getNamesystem().getBlockManager()
      .getDatanodeManager();
  long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
  long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
  // Fail the first volume on both datanodes (the third DataNode is kept
  // healthy so a node in the write pipeline will not fail).
  File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
  File dn1Vol2 = new File(dataDir, "data" + (2 * 0 + 2));
  File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
  File dn2Vol2 = new File(dataDir, "data" + (2 * 1 + 2));
  DataNodeTestUtils.injectDataDirFailure(dn1Vol1);
  DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
  Path file1 = new Path("/test1");
  DFSTestUtil.createFile(fs, file1, 1024, (short) 2, 1L);
  DFSTestUtil.waitReplication(fs, file1, (short) 2);
  ArrayList<DataNode> dns = cluster.getDataNodes();
  assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
  assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
  assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());
  checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath());
  checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath());
  checkFailuresAtDataNode(dns.get(2), 0, true);
  // Ensure we wait a sufficient amount of time.
  assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
  // The NN reports two volume failures.
  DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
      origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
  checkAggregateFailuresAtNameNode(true, 2);
  checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath());
  checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath());
  // Reconfigure again to try to add back the failed volumes.
  DataNodeTestUtils.reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
  DataNodeTestUtils.reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
  DataNodeTestUtils.triggerHeartbeat(dns.get(0));
  DataNodeTestUtils.triggerHeartbeat(dns.get(1));
  checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath());
  checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath());
  // Ensure we wait a sufficient amount of time.
  assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
  // The NN reports two volume failures again.
  DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
      origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
  checkAggregateFailuresAtNameNode(true, 2);
  checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath());
  checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath());
  // Reconfigure a third time with the failed volumes. Afterwards, we expect
  // the same volume failures to be reported (no double-counting).
  DataNodeTestUtils.reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
  DataNodeTestUtils.reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
  DataNodeTestUtils.triggerHeartbeat(dns.get(0));
  DataNodeTestUtils.triggerHeartbeat(dns.get(1));
  checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath());
  checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath());
  // Ensure we wait a sufficient amount of time.
  assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
  // The NN reports two volume failures again.
  DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2,
      origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
  checkAggregateFailuresAtNameNode(true, 2);
  checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath());
  checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath());
  // Replace the failed volumes with healthy ones and reconfigure the
  // DataNodes. The failed-volume information should be cleared.
  DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1);
  DataNodeTestUtils.reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
  DataNodeTestUtils.reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
  DataNodeTestUtils.triggerHeartbeat(dns.get(0));
  DataNodeTestUtils.triggerHeartbeat(dns.get(1));
  checkFailuresAtDataNode(dns.get(0), 1, true);
  checkFailuresAtDataNode(dns.get(1), 1, true);
  DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 0, origCapacity,
      WAIT_FOR_HEARTBEATS);
  checkAggregateFailuresAtNameNode(true, 0);
  checkFailuresAtNameNode(dm, dns.get(0), true);
  checkFailuresAtNameNode(dm, dns.get(1), true);
}
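DataNodeTestUtils.reconfigureDataNode drives the DataNode's volume hot-swap path. A sketch of what such a helper has to do, assuming it works by rewriting dfs.datanode.data.dir and invoking the public reconfigureProperty API inherited from ReconfigurableBase (the helper's exact body may differ):

// Hypothetical sketch: join the volume directories into a new value for
// dfs.datanode.data.dir and ask the DataNode to reconfigure itself online.
static void reconfigureDataNode(DataNode dn, File... newVols)
    throws Exception {
  StringBuilder newDataDirs = new StringBuilder();
  for (File vol : newVols) {
    if (newDataDirs.length() > 0) {
      newDataDirs.append(',');
    }
    newDataDirs.append(vol.getAbsolutePath());
  }
  // DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY is "dfs.datanode.data.dir".
  dn.reconfigureProperty(DFSConfigKeys.DFS_DATANODE_DATA_DIR_KEY,
      newDataDirs.toString());
}

Note that the test above reconfigures with volumes that are still failed, so the real helper presumably catches and logs the resulting ReconfigurationException rather than letting it fail the test.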
Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.
The class TestDataNodeVolumeFailureReporting, method testMultipleVolFailuresOnNode.
@Test
public void testMultipleVolFailuresOnNode() throws Exception {
  // Reinitialize the cluster, configured with 4 storage locations per
  // DataNode and tolerating up to 2 failures.
  tearDown();
  initCluster(3, 4, 2);
  // Calculate the total capacity of all the datanodes. Sleep for three
  // seconds to be sure the datanodes have had a chance to heartbeat their
  // capacities.
  Thread.sleep(WAIT_FOR_HEARTBEATS);
  DatanodeManager dm = cluster.getNamesystem().getBlockManager()
      .getDatanodeManager();
  long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
  long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
  File dn1Vol1 = new File(dataDir, "data" + (4 * 0 + 1));
  File dn1Vol2 = new File(dataDir, "data" + (4 * 0 + 2));
  File dn2Vol1 = new File(dataDir, "data" + (4 * 1 + 1));
  File dn2Vol2 = new File(dataDir, "data" + (4 * 1 + 2));
  // Make the first two volume directories on the first two datanodes
  // non-accessible.
  DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
  // Create file1 and wait for 3 replicas (i.e., all DNs can still store a
  // block). Then assert that all DNs are up, despite the volume failures.
  Path file1 = new Path("/test1");
  DFSTestUtil.createFile(fs, file1, 1024, (short) 3, 1L);
  DFSTestUtil.waitReplication(fs, file1, (short) 3);
  // Create an additional file to trigger the failure-based volume check on
  // dn1Vol2 and dn2Vol2.
  Path file2 = new Path("/test2");
  DFSTestUtil.createFile(fs, file2, 1024, (short) 3, 1L);
  DFSTestUtil.waitReplication(fs, file2, (short) 3);
  ArrayList<DataNode> dns = cluster.getDataNodes();
  assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
  assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
  assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());
  checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath(),
      dn1Vol2.getAbsolutePath());
  checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath(),
      dn2Vol2.getAbsolutePath());
  checkFailuresAtDataNode(dns.get(2), 0, true);
  // Ensure we wait a sufficient amount of time.
  assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
  // Eventually the NN should report four volume failures.
  DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 4,
      origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
  checkAggregateFailuresAtNameNode(true, 4);
  checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath(),
      dn1Vol2.getAbsolutePath());
  checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath(),
      dn2Vol2.getAbsolutePath());
  checkFailuresAtNameNode(dm, dns.get(2), true);
}
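The failure-injection pair used throughout these tests, DataNodeTestUtils.injectDataDirFailure and restoreDataDirFromFailure, can be sketched as follows. This is an assumption about the mechanism, not the verbatim utility: the volume directory is moved aside and replaced by a plain file of the same name, which the DataNode's disk checker then treats as a failed volume; restoring reverses the swap.

// Hypothetical sketch of the failure injection used above. The ".origin"
// suffix is illustrative.
static void injectDataDirFailure(File... dirs) throws IOException {
  for (File dir : dirs) {
    File movedAside = new File(dir.getPath() + ".origin");
    if (!dir.renameTo(movedAside)) {
      throw new IOException("Failed to move " + dir + " aside.");
    }
    // A regular file where a directory is expected fails the disk check.
    if (!dir.createNewFile()) {
      throw new IOException("Failed to create file " + dir);
    }
  }
}

static void restoreDataDirFromFailure(File... dirs) throws IOException {
  for (File dir : dirs) {
    File movedAside = new File(dir.getPath() + ".origin");
    if (!dir.delete() || !movedAside.renameTo(dir)) {
      throw new IOException("Failed to restore " + dir);
    }
  }
}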