use of org.apache.hadoop.test.GenericTestUtils.DelayAnswer in project hadoop by apache.
the class TestStandbyCheckpoints method testStandbyExceptionThrownDuringCheckpoint.
/**
* Make sure that clients will receive StandbyExceptions even when a
* checkpoint is in progress on the SBN, and therefore the StandbyCheckpointer
* thread will have FSNS lock. Regression test for HDFS-4591.
*/
@Test(timeout = 300000)
public void testStandbyExceptionThrownDuringCheckpoint() throws Exception {
// Set it up so that we know when the SBN checkpoint starts and ends.
FSImage spyImage1 = NameNodeAdapter.spyOnFsImage(nns[1]);
DelayAnswer answerer = new DelayAnswer(LOG);
Mockito.doAnswer(answerer).when(spyImage1).saveNamespace(Mockito.any(FSNamesystem.class), Mockito.eq(NameNodeFile.IMAGE), Mockito.any(Canceler.class));
// Perform some edits and wait for a checkpoint to start on the SBN.
doEdits(0, 1000);
nns[0].getRpcServer().rollEditLog();
answerer.waitForCall();
assertTrue("SBN is not performing checkpoint but it should be.", answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
// Make sure that the lock has actually been taken by the checkpointing
// thread.
ThreadUtil.sleepAtLeastIgnoreInterrupts(1000);
try {
// Perform an RPC to the SBN and make sure it throws a StandbyException.
nns[1].getRpcServer().getFileInfo("/");
fail("Should have thrown StandbyException, but instead succeeded.");
} catch (StandbyException se) {
GenericTestUtils.assertExceptionContains("is not supported", se);
}
// Make sure new incremental block reports are processed during
// checkpointing on the SBN.
assertEquals(0, cluster.getNamesystem(1).getPendingDataNodeMessageCount());
doCreate();
Thread.sleep(1000);
assertTrue(cluster.getNamesystem(1).getPendingDataNodeMessageCount() > 0);
// Make sure that the checkpoint is still going on, implying that the client
// RPC to the SBN happened during the checkpoint.
assertTrue("SBN should have still been checkpointing.", answerer.getFireCount() == 1 && answerer.getResultCount() == 0);
answerer.proceed();
answerer.waitForResult();
assertTrue("SBN should have finished checkpointing.", answerer.getFireCount() == 1 && answerer.getResultCount() == 1);
}
use of org.apache.hadoop.test.GenericTestUtils.DelayAnswer in project hadoop by apache.
the class TestCheckpoint method testMultipleSecondaryNNsAgainstSameNN.
/**
* Test case where two secondary namenodes are checkpointing the same
* NameNode. This differs from {@link #testMultipleSecondaryNamenodes()}
* since that test runs against two distinct NNs.
*
* This case tests the following interleaving:
* - 2NN A downloads image (up to txid 2)
* - 2NN A about to save its own checkpoint
* - 2NN B downloads image (up to txid 4)
* - 2NN B uploads checkpoint (txid 4)
* - 2NN A uploads checkpoint (txid 2)
*
* It verifies that this works even though the earlier-txid checkpoint gets
* uploaded after the later-txid checkpoint.
*/
@Test
public void testMultipleSecondaryNNsAgainstSameNN() throws Exception {
Configuration conf = new HdfsConfiguration();
MiniDFSCluster cluster = null;
SecondaryNameNode secondary1 = null, secondary2 = null;
try {
cluster = new MiniDFSCluster.Builder(conf).numDataNodes(0).format(true).build();
// Start 2NNs
secondary1 = startSecondaryNameNode(conf, 1);
secondary2 = startSecondaryNameNode(conf, 2);
// Make the first 2NN's checkpoint process delayable - we can pause it
// right before it saves its checkpoint image.
CheckpointStorage spyImage1 = spyOnSecondaryImage(secondary1);
DelayAnswer delayer = new DelayAnswer(LOG);
Mockito.doAnswer(delayer).when(spyImage1).saveFSImageInAllDirs(Mockito.<FSNamesystem>any(), Mockito.anyLong());
// Set up a thread to do a checkpoint from the first 2NN
DoCheckpointThread checkpointThread = new DoCheckpointThread(secondary1);
checkpointThread.start();
// Wait for the first checkpointer to get to where it should save its image.
delayer.waitForCall();
// Now make the second checkpointer run an entire checkpoint
secondary2.doCheckpoint();
// Let the first one finish
delayer.proceed();
// It should have succeeded even though another checkpoint raced with it.
checkpointThread.join();
checkpointThread.propagateExceptions();
// primary should record "last checkpoint" as the higher txid (even though
// a checkpoint with a lower txid finished most recently)
NNStorage storage = cluster.getNameNode().getFSImage().getStorage();
assertEquals(4, storage.getMostRecentCheckpointTxId());
// Should have accepted both checkpoints
assertNNHasCheckpoints(cluster, ImmutableList.of(2, 4));
// Now have second one checkpoint one more time just to make sure that
// the NN isn't left in a broken state
secondary2.doCheckpoint();
// NN should have received new checkpoint
assertEquals(6, storage.getMostRecentCheckpointTxId());
// Validate invariant that files named the same are the same.
assertParallelFilesInvariant(cluster, ImmutableList.of(secondary1, secondary2));
// NN should have removed the checkpoint at txid 2 at this point, but has
// one at txid 6
assertNNHasCheckpoints(cluster, ImmutableList.of(4, 6));
} finally {
cleanup(secondary1);
secondary1 = null;
cleanup(secondary2);
secondary2 = null;
if (cluster != null) {
cluster.shutdown();
cluster = null;
}
}
}
use of org.apache.hadoop.test.GenericTestUtils.DelayAnswer in project hadoop by apache.
the class TestReplication method testNoExtraReplicationWhenBlockReceivedIsLate.
/**
* This test makes sure that, when a file is closed before all
* of the datanodes in the pipeline have reported their replicas,
* the NameNode doesn't consider the block under-replicated too
* aggressively. It is a regression test for HDFS-1172.
*/
@Test(timeout = 60000)
public void testNoExtraReplicationWhenBlockReceivedIsLate() throws Exception {
LOG.info("Test block replication when blockReceived is late");
final short numDataNodes = 3;
final short replication = 3;
final Configuration conf = new Configuration();
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, 1024);
final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDataNodes).build();
final String testFile = "/replication-test-file";
final Path testPath = new Path(testFile);
final BlockManager bm = cluster.getNameNode().getNamesystem().getBlockManager();
try {
cluster.waitActive();
// Artificially delay IBR from 1 DataNode.
// this ensures that the client's completeFile() RPC will get to the
// NN before some of the replicas are reported.
NameNode nn = cluster.getNameNode();
DataNode dn = cluster.getDataNodes().get(0);
DatanodeProtocolClientSideTranslatorPB spy = InternalDataNodeTestUtils.spyOnBposToNN(dn, nn);
DelayAnswer delayer = new GenericTestUtils.DelayAnswer(LOG);
Mockito.doAnswer(delayer).when(spy).blockReceivedAndDeleted(Mockito.<DatanodeRegistration>anyObject(), Mockito.anyString(), Mockito.<StorageReceivedDeletedBlocks[]>anyObject());
FileSystem fs = cluster.getFileSystem();
// Create and close a small file with two blocks
DFSTestUtil.createFile(fs, testPath, 1500, replication, 0);
// schedule replication via BlockManager#computeReplicationWork
BlockManagerTestUtil.computeAllPendingWork(bm);
// Initially, should have some pending replication since the close()
// is earlier than at lease one of the reportReceivedDeletedBlocks calls
assertTrue(pendingReplicationCount(bm) > 0);
// release pending IBR.
delayer.waitForCall();
delayer.proceed();
delayer.waitForResult();
// make sure DataNodes do replication work if exists
for (DataNode d : cluster.getDataNodes()) {
DataNodeTestUtils.triggerHeartbeat(d);
}
// Wait until there is nothing pending
try {
GenericTestUtils.waitFor(new Supplier<Boolean>() {
@Override
public Boolean get() {
return pendingReplicationCount(bm) == 0;
}
}, 100, 3000);
} catch (TimeoutException e) {
fail("timed out while waiting for no pending replication.");
}
// Check that none of the datanodes have serviced a replication request.
// i.e. that the NameNode didn't schedule any spurious replication.
assertNoReplicationWasPerformed(cluster);
} finally {
if (cluster != null) {
cluster.shutdown();
}
}
}
use of org.apache.hadoop.test.GenericTestUtils.DelayAnswer in project hadoop by apache.
the class TestPipelinesFailover method testFailoverRightBeforeCommitSynchronization.
/**
* Test the scenario where the NN fails over after issuing a block
* synchronization request, but before it is committed. The
* DN running the recovery should then fail to commit the synchronization
* and a later retry will succeed.
*/
@Test(timeout = 30000)
public void testFailoverRightBeforeCommitSynchronization() throws Exception {
final Configuration conf = new Configuration();
// Disable permissions so that another user can recover the lease.
conf.setBoolean(DFSConfigKeys.DFS_PERMISSIONS_ENABLED_KEY, false);
conf.setInt(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, BLOCK_SIZE);
FSDataOutputStream stm = null;
final MiniDFSCluster cluster = newMiniCluster(conf, 3);
try {
cluster.waitActive();
cluster.transitionToActive(0);
Thread.sleep(500);
LOG.info("Starting with NN 0 active");
FileSystem fs = HATestUtil.configureFailoverFs(cluster, conf);
stm = fs.create(TEST_PATH);
// write a half block
AppendTestUtil.write(stm, 0, BLOCK_SIZE / 2);
stm.hflush();
// Look into the block manager on the active node for the block
// under construction.
NameNode nn0 = cluster.getNameNode(0);
ExtendedBlock blk = DFSTestUtil.getFirstBlock(fs, TEST_PATH);
DatanodeDescriptor expectedPrimary = DFSTestUtil.getExpectedPrimaryNode(nn0, blk);
LOG.info("Expecting block recovery to be triggered on DN " + expectedPrimary);
// Find the corresponding DN daemon, and spy on its connection to the
// active.
DataNode primaryDN = cluster.getDataNode(expectedPrimary.getIpcPort());
DatanodeProtocolClientSideTranslatorPB nnSpy = InternalDataNodeTestUtils.spyOnBposToNN(primaryDN, nn0);
// Delay the commitBlockSynchronization call
DelayAnswer delayer = new DelayAnswer(LOG);
Mockito.doAnswer(delayer).when(nnSpy).commitBlockSynchronization(Mockito.eq(blk), // new genstamp
Mockito.anyInt(), // new length
Mockito.anyLong(), // close file
Mockito.eq(true), // delete block
Mockito.eq(false), // new targets
(DatanodeID[]) Mockito.anyObject(), // new target storages
(String[]) Mockito.anyObject());
DistributedFileSystem fsOtherUser = createFsAsOtherUser(cluster, conf);
assertFalse(fsOtherUser.recoverLease(TEST_PATH));
LOG.info("Waiting for commitBlockSynchronization call from primary");
delayer.waitForCall();
LOG.info("Failing over to NN 1");
cluster.transitionToStandby(0);
cluster.transitionToActive(1);
// Let the commitBlockSynchronization call go through, and check that
// it failed with the correct exception.
delayer.proceed();
delayer.waitForResult();
Throwable t = delayer.getThrown();
if (t == null) {
fail("commitBlockSynchronization call did not fail on standby");
}
GenericTestUtils.assertExceptionContains("Operation category WRITE is not supported", t);
// Now, if we try again to recover the block, it should succeed on the new
// active.
loopRecoverLease(fsOtherUser, TEST_PATH);
AppendTestUtil.check(fs, TEST_PATH, BLOCK_SIZE / 2);
} finally {
IOUtils.closeStream(stm);
cluster.shutdown();
}
}
use of org.apache.hadoop.test.GenericTestUtils.DelayAnswer in project hadoop by apache.
the class TestDNFencing method testRBWReportArrivesAfterEdits.
/**
* Another regression test for HDFS-2742. This tests the following sequence:
* - DN does a block report while file is open. This BR contains
* the block in RBW state.
* - The block report is delayed in reaching the standby.
* - The file is closed.
* - The standby processes the OP_ADD and OP_CLOSE operations before
* the RBW block report arrives.
* - The standby should not mark the block as corrupt.
*/
@Test
public void testRBWReportArrivesAfterEdits() throws Exception {
final CountDownLatch brFinished = new CountDownLatch(1);
DelayAnswer delayer = new GenericTestUtils.DelayAnswer(LOG) {
@Override
protected Object passThrough(InvocationOnMock invocation) throws Throwable {
try {
return super.passThrough(invocation);
} finally {
// inform the test that our block report went through.
brFinished.countDown();
}
}
};
FSDataOutputStream out = fs.create(TEST_FILE_PATH);
try {
AppendTestUtil.write(out, 0, 10);
out.hflush();
DataNode dn = cluster.getDataNodes().get(0);
DatanodeProtocolClientSideTranslatorPB spy = InternalDataNodeTestUtils.spyOnBposToNN(dn, nn2);
Mockito.doAnswer(delayer).when(spy).blockReport(Mockito.<DatanodeRegistration>anyObject(), Mockito.anyString(), Mockito.<StorageBlockReport[]>anyObject(), Mockito.<BlockReportContext>anyObject());
dn.scheduleAllBlockReport(0);
delayer.waitForCall();
} finally {
IOUtils.closeStream(out);
}
cluster.transitionToStandby(0);
cluster.transitionToActive(1);
delayer.proceed();
brFinished.await();
// Verify that no replicas are marked corrupt, and that the
// file is readable from the failed-over standby.
BlockManagerTestUtil.updateState(nn1.getNamesystem().getBlockManager());
BlockManagerTestUtil.updateState(nn2.getNamesystem().getBlockManager());
assertEquals(0, nn1.getNamesystem().getCorruptReplicaBlocks());
assertEquals(0, nn2.getNamesystem().getCorruptReplicaBlocks());
DFSTestUtil.readFile(fs, TEST_FILE_PATH);
}
Aggregations