
Example 21 with DatanodeManager

Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.

From the class TestDFSOutputStream, method testNoLocalWriteFlag.

@Test
public void testNoLocalWriteFlag() throws IOException {
    DistributedFileSystem fs = cluster.getFileSystem();
    EnumSet<CreateFlag> flags = EnumSet.of(CreateFlag.NO_LOCAL_WRITE, CreateFlag.CREATE);
    BlockManager bm = cluster.getNameNode().getNamesystem().getBlockManager();
    DatanodeManager dm = bm.getDatanodeManager();
    try (FSDataOutputStream os = fs.create(new Path("/test-no-local"), FsPermission.getDefault(), flags, 512, (short) 2, 512, null)) {
        // Inject a DatanodeManager that reports one DataNode as the
        // client's local node.
        DatanodeManager spyDm = spy(dm);
        DatanodeDescriptor dn1 = dm.getDatanodeListForReport(HdfsConstants.DatanodeReportType.LIVE).get(0);
        doReturn(dn1).when(spyDm).getDatanodeByHost("127.0.0.1");
        Whitebox.setInternalState(bm, "datanodeManager", spyDm);
        byte[] buf = new byte[512 * 16];
        new Random().nextBytes(buf);
        os.write(buf);
    } finally {
        Whitebox.setInternalState(bm, "datanodeManager", dm);
    }
    cluster.triggerBlockReports();
    final String bpid = cluster.getNamesystem().getBlockPoolId();
    // Total number of DataNodes is 3.
    assertEquals(3, cluster.getAllBlockReports(bpid).size());
    int numDataNodesWithData = 0;
    for (Map<DatanodeStorage, BlockListAsLongs> dnBlocks : cluster.getAllBlockReports(bpid)) {
        for (BlockListAsLongs blocks : dnBlocks.values()) {
            if (blocks.getNumberOfBlocks() > 0) {
                numDataNodesWithData++;
                break;
            }
        }
    }
    // Verify that exactly one DN received no data.
    assertEquals(1, 3 - numDataNodesWithData);
}
Also used: CreateFlag (org.apache.hadoop.fs.CreateFlag), Path (org.apache.hadoop.fs.Path), DatanodeDescriptor (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor), DatanodeManager (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager), Random (java.util.Random), BlockManager (org.apache.hadoop.hdfs.server.blockmanagement.BlockManager), DatanodeStorage (org.apache.hadoop.hdfs.server.protocol.DatanodeStorage), BlockListAsLongs (org.apache.hadoop.hdfs.protocol.BlockListAsLongs), FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream), Test (org.junit.Test)
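
The core trick in this example is spy-and-swap: rather than restarting the NameNode with a different DatanodeManager, the test stubs a single method on a Mockito spy and swaps the spy into the BlockManager's private field, restoring the original in the finally block. Below is a minimal, self-contained sketch of the same pattern; Clock, Holder, and SpySwapSketch are hypothetical names invented for illustration, and the Whitebox import assumes org.apache.hadoop.test.Whitebox, as used in the test above.

import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.spy;

import org.apache.hadoop.test.Whitebox;

public class SpySwapSketch {

    public static class Clock {
        public String now() { return "real"; }
    }

    public static class Holder {
        private Clock clock = new Clock();
        public String read() { return clock.now(); }
    }

    public static void main(String[] args) {
        Holder holder = new Holder();
        Clock original = Whitebox.getInternalState(holder, "clock");
        try {
            // A spy delegates to the real object except for the stubbed
            // method, just as the DatanodeManager spy above stubs only
            // getDatanodeByHost("127.0.0.1").
            Clock spyClock = spy(original);
            doReturn("stubbed").when(spyClock).now();
            Whitebox.setInternalState(holder, "clock", spyClock);
            System.out.println(holder.read());  // prints "stubbed"
        } finally {
            // Restore the original so no later code sees the spy,
            // mirroring the finally block in the test.
            Whitebox.setInternalState(holder, "clock", original);
        }
    }
}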

Example 22 with DatanodeManager

Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.

From the class TestDataNodeVolumeFailureToleration, method testFailedVolumeOnStartupIsCounted.

/**
 * Test that a volume that is considered failed on startup is seen as
 * a failed volume by the NN.
 */
@Test
public void testFailedVolumeOnStartupIsCounted() throws Exception {
    assumeNotWindows();
    final DatanodeManager dm = cluster.getNamesystem().getBlockManager().getDatanodeManager();
    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    File dir = new File(cluster.getInstanceStorageDir(0, 0), "current");
    try {
        prepareDirToFail(dir);
        restartDatanodes(1, false);
        // The cluster is up...
        assertTrue(cluster.getDataNodes().get(0).isBPServiceAlive(cluster.getNamesystem().getBlockPoolId()));
        // ...but there has been a single volume failure.
        DFSTestUtil.waitForDatanodeStatus(dm, 1, 0, 1, origCapacity / 2, WAIT_FOR_HEARTBEATS);
    } finally {
        FileUtil.chmod(dir.toString(), "755");
    }
}
Also used: DatanodeManager (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager), File (java.io.File), Test (org.junit.Test)
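
prepareDirToFail is a private helper of the test class and is not shown on this page. A plausible sketch, inferred from the FileUtil.chmod(dir.toString(), "755") cleanup in the finally block, is below; the real Hadoop helper may differ in detail.

import static org.junit.Assert.assertEquals;

import java.io.File;
import java.io.IOException;

import org.apache.hadoop.fs.FileUtil;

class DirFailureHelper {
    // Remove all permissions (chmod 000) so the DataNode cannot read,
    // write, or traverse the directory and marks the volume failed on
    // startup; the finally block's chmod 755 undoes this.
    static void prepareDirToFail(File dir)
            throws IOException, InterruptedException {
        dir.mkdirs();
        assertEquals("Couldn't chmod local vol", 0,
                FileUtil.chmod(dir.toString(), "000"));
    }
}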

Example 23 with DatanodeManager

Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.

From the class TestDataNodeVolumeFailureReporting, method testAutoFormatEmptyDirectory.

@Test
public void testAutoFormatEmptyDirectory() throws Exception {
    // remove the version file
    File dn1Vol1 = cluster.getStorageDir(0, 0);
    File current = new File(dn1Vol1, "current");
    File currentVersion = new File(current, "VERSION");
    currentVersion.delete();
    // restart the data node
    assertTrue(cluster.restartDataNodes(true));
    // the DN should tolerate one volume failure.
    cluster.waitActive();
    ArrayList<DataNode> dns = cluster.getDataNodes();
    DataNode dn = dns.get(0);
    assertFalse("DataNode should not reformat if VERSION is missing", currentVersion.exists());
    // Make sure DN's JMX sees the failed volume
    final String[] expectedFailedVolumes = { dn1Vol1.getAbsolutePath() };
    DataNodeTestUtils.triggerHeartbeat(dn);
    FsDatasetSpi<?> fsd = dn.getFSDataset();
    assertEquals(expectedFailedVolumes.length, fsd.getNumFailedVolumes());
    assertArrayEquals(expectedFailedVolumes, convertToAbsolutePaths(fsd.getFailedStorageLocations()));
    // there shouldn't be any more volume failures due to I/O failure
    checkFailuresAtDataNode(dn, 0, false, expectedFailedVolumes);
    // The NN reports one volume failure.
    final DatanodeManager dm = cluster.getNamesystem().getBlockManager().getDatanodeManager();
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
    DFSTestUtil.waitForDatanodeStatus(dm, 1, 0, 1, (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
    checkAggregateFailuresAtNameNode(false, 1);
    checkFailuresAtNameNode(dm, dns.get(0), false, dn1Vol1.getAbsolutePath());
}
Also used: DatanodeManager (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager), File (java.io.File), Test (org.junit.Test)
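
convertToAbsolutePaths is likewise a private helper of the test class. A minimal sketch of what it must do is below, assuming it simply normalizes each storage location reported by FsDatasetSpi#getFailedStorageLocations() to an absolute path so the arrays compare cleanly with assertArrayEquals.

import java.io.File;

class PathHelper {
    static String[] convertToAbsolutePaths(String[] locations) {
        if (locations == null) {
            return new String[0];
        }
        String[] absolute = new String[locations.length];
        for (int i = 0; i < locations.length; i++) {
            // Normalize each reported location for a stable comparison
            // against File#getAbsolutePath() on the expected volumes.
            absolute[i] = new File(locations[i]).getAbsolutePath();
        }
        return absolute;
    }
}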

Example 24 with DatanodeManager

Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.

From the class TestDataNodeVolumeFailureReporting, method testDataNodeReconfigureWithVolumeFailures.

@Test
public void testDataNodeReconfigureWithVolumeFailures() throws Exception {
    // Bring up two more datanodes
    cluster.startDataNodes(conf, 2, true, null, null);
    cluster.waitActive();
    final DatanodeManager dm = cluster.getNamesystem().getBlockManager().getDatanodeManager();
    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
    // Fail the first volume on the first two datanodes (the third is
    // kept healthy so at least one node in the write pipeline survives).
    File dn1Vol1 = new File(dataDir, "data" + (2 * 0 + 1));
    File dn1Vol2 = new File(dataDir, "data" + (2 * 0 + 2));
    File dn2Vol1 = new File(dataDir, "data" + (2 * 1 + 1));
    File dn2Vol2 = new File(dataDir, "data" + (2 * 1 + 2));
    DataNodeTestUtils.injectDataDirFailure(dn1Vol1);
    DataNodeTestUtils.injectDataDirFailure(dn2Vol1);
    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 2, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 2);
    ArrayList<DataNode> dns = cluster.getDataNodes();
    assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
    assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
    assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());
    checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath());
    checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath());
    checkFailuresAtDataNode(dns.get(2), 0, true);
    // Ensure we wait a sufficient amount of time
    assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
    // The NN reports two volume failures
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
    checkAggregateFailuresAtNameNode(true, 2);
    checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath());
    checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath());
    // Reconfigure again to try to add back the failed volumes.
    DataNodeTestUtils.reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
    DataNodeTestUtils.reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
    DataNodeTestUtils.triggerHeartbeat(dns.get(0));
    DataNodeTestUtils.triggerHeartbeat(dns.get(1));
    checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath());
    checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath());
    // Ensure we wait a sufficient amount of time.
    assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
    // The NN reports two volume failures again.
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
    checkAggregateFailuresAtNameNode(true, 2);
    checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath());
    checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath());
    // Reconfigure a third time with the failed volumes.  Afterwards, we expect
    // the same volume failures to be reported.  (No double-counting.)
    DataNodeTestUtils.reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
    DataNodeTestUtils.reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
    DataNodeTestUtils.triggerHeartbeat(dns.get(0));
    DataNodeTestUtils.triggerHeartbeat(dns.get(1));
    checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath());
    checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath());
    // Ensure we wait a sufficient amount of time.
    assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
    // The NN reports two volume failures again.
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 2, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
    checkAggregateFailuresAtNameNode(true, 2);
    checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath());
    checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath());
    // Replace failed volume with healthy volume and run reconfigure DataNode.
    // The failed volume information should be cleared.
    DataNodeTestUtils.restoreDataDirFromFailure(dn1Vol1, dn2Vol1);
    DataNodeTestUtils.reconfigureDataNode(dns.get(0), dn1Vol1, dn1Vol2);
    DataNodeTestUtils.reconfigureDataNode(dns.get(1), dn2Vol1, dn2Vol2);
    DataNodeTestUtils.triggerHeartbeat(dns.get(0));
    DataNodeTestUtils.triggerHeartbeat(dns.get(1));
    checkFailuresAtDataNode(dns.get(0), 1, true);
    checkFailuresAtDataNode(dns.get(1), 1, true);
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 0, origCapacity, WAIT_FOR_HEARTBEATS);
    checkAggregateFailuresAtNameNode(true, 0);
    checkFailuresAtNameNode(dm, dns.get(0), true);
    checkFailuresAtNameNode(dm, dns.get(1), true);
}
Also used: Path (org.apache.hadoop.fs.Path), DatanodeManager (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager), File (java.io.File), Test (org.junit.Test)
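
DataNodeTestUtils.injectDataDirFailure and restoreDataDirFromFailure do the heavy lifting in this test. A sketch of the underlying idea follows, assuming a rename-and-replace scheme; the ".origin" suffix and the class name here are illustrative, not the actual Hadoop internals.

import java.io.File;
import java.io.IOException;

class DataDirFailureSketch {
    private static final String SUFFIX = ".origin";

    // Replace each data directory with a plain file of the same name:
    // the DataNode can no longer use the path as a directory, so the
    // volume is reported failed on the next disk check.
    static void inject(File... dirs) throws IOException {
        for (File dir : dirs) {
            File moved = new File(dir.getPath() + SUFFIX);
            if (!dir.renameTo(moved) || !dir.createNewFile()) {
                throw new IOException("Failed to inject failure for " + dir);
            }
        }
    }

    // Undo inject(): delete the placeholder file and move the real
    // directory back, so reconfiguration sees a healthy volume again.
    static void restore(File... dirs) throws IOException {
        for (File dir : dirs) {
            File moved = new File(dir.getPath() + SUFFIX);
            if (!dir.delete() || !moved.renameTo(dir)) {
                throw new IOException("Failed to restore " + dir);
            }
        }
    }
}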

Example 25 with DatanodeManager

Use of org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager in project hadoop by apache.

From the class TestDataNodeVolumeFailureReporting, method testMultipleVolFailuresOnNode.

@Test
public void testMultipleVolFailuresOnNode() throws Exception {
    // Reinitialize the cluster, configured with 4 storage locations per DataNode
    // and tolerating up to 2 failures.
    tearDown();
    initCluster(3, 4, 2);
    // Sleep for three seconds so the datanodes have had a chance to
    // heartbeat their capacities, then calculate the total capacity of
    // all the datanodes.
    Thread.sleep(WAIT_FOR_HEARTBEATS);
    DatanodeManager dm = cluster.getNamesystem().getBlockManager().getDatanodeManager();
    long origCapacity = DFSTestUtil.getLiveDatanodeCapacity(dm);
    long dnCapacity = DFSTestUtil.getDatanodeCapacity(dm, 0);
    File dn1Vol1 = new File(dataDir, "data" + (4 * 0 + 1));
    File dn1Vol2 = new File(dataDir, "data" + (4 * 0 + 2));
    File dn2Vol1 = new File(dataDir, "data" + (4 * 1 + 1));
    File dn2Vol2 = new File(dataDir, "data" + (4 * 1 + 2));
    // Make the first two volume directories on the first two datanodes
    // inaccessible.
    DataNodeTestUtils.injectDataDirFailure(dn1Vol1, dn1Vol2, dn2Vol1, dn2Vol2);
    // Create file1 and wait for 3 replicas (i.e., all DNs can still store
    // a block). Then assert that all DNs are up, despite the volume
    // failures.
    Path file1 = new Path("/test1");
    DFSTestUtil.createFile(fs, file1, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file1, (short) 3);
    // Create additional file to trigger failure based volume check on dn1Vol2
    // and dn2Vol2.
    Path file2 = new Path("/test2");
    DFSTestUtil.createFile(fs, file2, 1024, (short) 3, 1L);
    DFSTestUtil.waitReplication(fs, file2, (short) 3);
    ArrayList<DataNode> dns = cluster.getDataNodes();
    assertTrue("DN1 should be up", dns.get(0).isDatanodeUp());
    assertTrue("DN2 should be up", dns.get(1).isDatanodeUp());
    assertTrue("DN3 should be up", dns.get(2).isDatanodeUp());
    checkFailuresAtDataNode(dns.get(0), 1, true, dn1Vol1.getAbsolutePath(), dn1Vol2.getAbsolutePath());
    checkFailuresAtDataNode(dns.get(1), 1, true, dn2Vol1.getAbsolutePath(), dn2Vol2.getAbsolutePath());
    checkFailuresAtDataNode(dns.get(2), 0, true);
    // Ensure we wait a sufficient amount of time
    assert (WAIT_FOR_HEARTBEATS * 10) > WAIT_FOR_DEATH;
    // Eventually the NN should report four volume failures
    DFSTestUtil.waitForDatanodeStatus(dm, 3, 0, 4, origCapacity - (1 * dnCapacity), WAIT_FOR_HEARTBEATS);
    checkAggregateFailuresAtNameNode(true, 4);
    checkFailuresAtNameNode(dm, dns.get(0), true, dn1Vol1.getAbsolutePath(), dn1Vol2.getAbsolutePath());
    checkFailuresAtNameNode(dm, dns.get(1), true, dn2Vol1.getAbsolutePath(), dn2Vol2.getAbsolutePath());
    checkFailuresAtNameNode(dm, dns.get(2), true);
}
Also used: Path (org.apache.hadoop.fs.Path), DatanodeManager (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager), File (java.io.File), Test (org.junit.Test)
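
The "data" + (volsPerDn * dnIndex + volIndex + 1) arithmetic in the last two examples follows MiniDFSCluster's on-disk convention of numbering volume directories data1, data2, ... consecutively across DataNodes. A small sketch of that mapping (the /tmp path is illustrative):

import java.io.File;

class DataDirNaming {
    // The j-th volume (0-based) of the i-th DataNode (0-based), with
    // volsPerDn volumes per node, lives in dataDir/data<volsPerDn*i+j+1>.
    static File volumeDir(File dataDir, int volsPerDn, int dnIndex, int volIndex) {
        return new File(dataDir, "data" + (volsPerDn * dnIndex + volIndex + 1));
    }

    public static void main(String[] args) {
        File dataDir = new File("/tmp/minidfs");  // illustrative path
        // Matches dn2Vol2 = "data" + (4 * 1 + 2) in testMultipleVolFailuresOnNode.
        System.out.println(volumeDir(dataDir, 4, 1, 1));  // .../data6
    }
}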

Aggregations

DatanodeManager (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeManager): 39 uses
Test (org.junit.Test): 30 uses
Path (org.apache.hadoop.fs.Path): 21 uses
DatanodeDescriptor (org.apache.hadoop.hdfs.server.blockmanagement.DatanodeDescriptor): 21 uses
DatanodeInfo (org.apache.hadoop.hdfs.protocol.DatanodeInfo): 12 uses
DistributedFileSystem (org.apache.hadoop.hdfs.DistributedFileSystem): 9 uses
MiniDFSCluster (org.apache.hadoop.hdfs.MiniDFSCluster): 9 uses
File (java.io.File): 8 uses
ArrayList (java.util.ArrayList): 8 uses
DataNode (org.apache.hadoop.hdfs.server.datanode.DataNode): 8 uses
FSDataOutputStream (org.apache.hadoop.fs.FSDataOutputStream): 7 uses
BlockManager (org.apache.hadoop.hdfs.server.blockmanagement.BlockManager): 7 uses
Configuration (org.apache.hadoop.conf.Configuration): 6 uses
LocatedBlock (org.apache.hadoop.hdfs.protocol.LocatedBlock): 6 uses
IOException (java.io.IOException): 5 uses
FileNotFoundException (java.io.FileNotFoundException): 4 uses
TimeoutException (java.util.concurrent.TimeoutException): 4 uses
ChecksumException (org.apache.hadoop.fs.ChecksumException): 4 uses
FileSystem (org.apache.hadoop.fs.FileSystem): 4 uses
UnresolvedLinkException (org.apache.hadoop.fs.UnresolvedLinkException): 4 uses