Use of org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector in project hadoop by apache.
Class TestClientProtocolForPipelineRecovery, method testPacketTransmissionDelay.
@Test
public void testPacketTransmissionDelay() throws Exception {
  // Make the first datanode drop heartbeat packets instead of relaying them.
  DataNodeFaultInjector dnFaultInjector = new DataNodeFaultInjector() {
    @Override
    public boolean dropHeartbeatPacket() {
      return true;
    }
  };
  DataNodeFaultInjector oldDnInjector = DataNodeFaultInjector.get();
  DataNodeFaultInjector.set(dnFaultInjector);

  // Set the client socket timeout to 3 seconds. Normally a heartbeat packet
  // would be sent every 1.5 seconds if there is no data traffic.
  Configuration conf = new HdfsConfiguration();
  conf.set(HdfsClientConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY, "3000");

  MiniDFSCluster cluster = null;
  try {
    int numDataNodes = 2;
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(numDataNodes).build();
    cluster.waitActive();
    FileSystem fs = cluster.getFileSystem();
    FSDataOutputStream out = fs.create(new Path("noheartbeat.dat"), (short) 2);
    out.write(0x31);
    out.hflush();
    DFSOutputStream dfsOut = (DFSOutputStream) out.getWrappedStream();
    // Original pipeline.
    DatanodeInfo[] orgNodes = dfsOut.getPipeline();
    // Cause the second datanode to time out while reading packets.
    Thread.sleep(3500);
    out.write(0x32);
    out.hflush();
    // New pipeline after recovery.
    DatanodeInfo[] newNodes = dfsOut.getPipeline();
    out.close();

    boolean contains = false;
    for (int i = 0; i < newNodes.length; i++) {
      if (orgNodes[0].getXferAddr().equals(newNodes[i].getXferAddr())) {
        throw new IOException("The first datanode should have been replaced.");
      }
      if (orgNodes[1].getXferAddr().equals(newNodes[i].getXferAddr())) {
        contains = true;
      }
    }
    Assert.assertTrue(contains);
  } finally {
    DataNodeFaultInjector.set(oldDnInjector);
    if (cluster != null) {
      cluster.shutdown();
    }
  }
}
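All of the examples on this page rely on the same pattern: DataNodeFaultInjector is a singleton whose no-op hook methods the DataNode calls at interesting points in its read/write paths, and a test swaps in a subclass via the static get()/set() pair (or, in one example below, the instance field). The following is a minimal sketch of that pattern, using hook names that appear in these examples; it is not the real Hadoop class, whose hook set and details vary by version.

// Minimal sketch of the fault-injector pattern; hook names are taken from the
// examples on this page, but the real DataNodeFaultInjector differs by version.
public class FaultInjectorSketch {
  private static FaultInjectorSketch instance = new FaultInjectorSketch();

  public static FaultInjectorSketch get() {
    return instance;
  }

  public static void set(FaultInjectorSketch injector) {
    instance = injector;
  }

  // Hooks are no-ops by default, so production code is unaffected.
  // A test overrides only the hook it needs, e.g. dropHeartbeatPacket().
  public boolean dropHeartbeatPacket() {
    return false;
  }

  public void failMirrorConnection() throws java.io.IOException {
    // Default: do nothing.
  }
}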
Use of org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector in project hadoop by apache.
Class TestClientProtocolForPipelineRecovery, method testZeroByteBlockRecovery.
/**
 * Test to make sure the checksum is set correctly after pipeline recovery
 * transfers a 0-byte partial block. If it fails, the test reports
 * "java.io.IOException: Failed to replace a bad datanode on the existing
 * pipeline due to no more good datanodes being available to try.", which
 * indicates there was a real failure after the staged failure.
 */
@Test
public void testZeroByteBlockRecovery() throws Exception {
  // Make the first datanode fail once. With 3 nodes and a block being
  // created with 2 replicas, anything more than this planned failure
  // will cause a test failure.
  DataNodeFaultInjector dnFaultInjector = new DataNodeFaultInjector() {
    int tries = 1;

    @Override
    public void stopSendingPacketDownstream(final String mirrAddr) throws IOException {
      if (tries > 0) {
        tries--;
        try {
          Thread.sleep(60000);
        } catch (InterruptedException ie) {
          throw new IOException("Interrupted while sleeping. Bailing out.");
        }
      }
    }
  };
  DataNodeFaultInjector oldDnInjector = DataNodeFaultInjector.get();
  DataNodeFaultInjector.set(dnFaultInjector);

  Configuration conf = new HdfsConfiguration();
  conf.set(HdfsClientConfigKeys.DFS_CLIENT_SOCKET_TIMEOUT_KEY, "1000");
  conf.set(HdfsClientConfigKeys.BlockWrite.ReplaceDatanodeOnFailure.POLICY_KEY, "ALWAYS");

  MiniDFSCluster cluster = null;
  try {
    cluster = new MiniDFSCluster.Builder(conf).numDataNodes(3).build();
    cluster.waitActive();
    FileSystem fs = cluster.getFileSystem();
    FSDataOutputStream out = fs.create(new Path("noheartbeat.dat"), (short) 2);
    out.write(0x31);
    out.hflush();
    out.close();
  } finally {
    if (cluster != null) {
      cluster.shutdown();
    }
    DataNodeFaultInjector.set(oldDnInjector);
  }
}
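Both tests above install the injector up front and restore oldDnInjector in a finally block. A small try-with-resources guard makes that idiom harder to forget; the class below is a hypothetical helper shown only for illustration, not part of Hadoop.

import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;

// Hypothetical helper: swaps in an injector and restores the previous one on
// close(), so tests can use try-with-resources instead of a manual finally.
class FaultInjectorGuard implements AutoCloseable {
  private final DataNodeFaultInjector previous;

  FaultInjectorGuard(DataNodeFaultInjector injector) {
    this.previous = DataNodeFaultInjector.get();
    DataNodeFaultInjector.set(injector);
  }

  @Override
  public void close() {
    DataNodeFaultInjector.set(previous);
  }
}

A test would then wrap its body in try (FaultInjectorGuard g = new FaultInjectorGuard(dnFaultInjector)) { ... } and still shut the cluster down in its own finally block.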
Use of org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector in project hadoop by apache.
Class TestSpaceReservation, method testReservedSpaceForPipelineRecovery.
@Test(timeout = 30000)
public void testReservedSpaceForPipelineRecovery() throws Exception {
  final short replication = 3;
  startCluster(BLOCK_SIZE, replication, -1);
  final String methodName = GenericTestUtils.getMethodName();
  final Path file = new Path("/" + methodName + ".01.dat");

  old = DataNodeFaultInjector.get();
  // Fault injector that fails the connection to the mirror the first time.
  DataNodeFaultInjector.set(new DataNodeFaultInjector() {
    private int tries = 0;

    @Override
    public void failMirrorConnection() throws IOException {
      if (tries++ == 0) {
        throw new IOException("Failing Mirror for space reservation");
      }
    }
  });

  // Write 1 byte to the file and kill the writer.
  FSDataOutputStream os = fs.create(file, replication);
  os.write(new byte[1]);
  os.close();

  // Ensure all space reserved for the replica was released on each DataNode.
  cluster.triggerBlockReports();
  for (final DataNode dn : cluster.getDataNodes()) {
    try (FsDatasetSpi.FsVolumeReferences volumes =
        dn.getFSDataset().getFsVolumeReferences()) {
      final FsVolumeImpl volume = (FsVolumeImpl) volumes.get(0);
      GenericTestUtils.waitFor(new Supplier<Boolean>() {
        @Override
        public Boolean get() {
          LOG.info("dn " + dn.getDisplayName() + " space : "
              + volume.getReservedForReplicas());
          return (volume.getReservedForReplicas() == 0);
        }
      }, 100, Integer.MAX_VALUE); // Check every 100 ms until the test times out.
    }
  }
}
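The one-shot counter idiom (tries) used here and in testZeroByteBlockRecovery can be factored into a reusable injector. The class below is only a sketch of that idea; the class name and constructor are illustrative, but failMirrorConnection() is the same hook overridden above. An AtomicInteger is used because DataNode threads may invoke the hook concurrently.

import java.io.IOException;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector;

// Hypothetical reusable injector: fail failMirrorConnection() for the first
// N invocations, then fall back to the default no-op behavior.
class FailNTimesInjector extends DataNodeFaultInjector {
  private final AtomicInteger remaining;

  FailNTimesInjector(int failures) {
    this.remaining = new AtomicInteger(failures);
  }

  @Override
  public void failMirrorConnection() throws IOException {
    if (remaining.getAndDecrement() > 0) {
      throw new IOException("Injected mirror-connection failure");
    }
  }
}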
Use of org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector in project hadoop by apache.
Class TestShortCircuitCache, method testDataXceiverHandlesRequestShortCircuitShmFailure.
// Regression test for HADOOP-11802.
@Test(timeout = 60000)
public void testDataXceiverHandlesRequestShortCircuitShmFailure() throws Exception {
  BlockReaderTestUtil.enableShortCircuitShmTracing();
  TemporarySocketDirectory sockDir = new TemporarySocketDirectory();
  Configuration conf = createShortCircuitConf(
      "testDataXceiverHandlesRequestShortCircuitShmFailure", sockDir);
  conf.setLong(HdfsClientConfigKeys.Read.ShortCircuit.STREAMS_CACHE_EXPIRY_MS_KEY,
      1000000000L);
  MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf).numDataNodes(1).build();
  cluster.waitActive();
  DistributedFileSystem fs = cluster.getFileSystem();
  final Path TEST_PATH1 = new Path("/test_file1");
  DFSTestUtil.createFile(fs, TEST_PATH1, 4096, (short) 1, 0xFADE1);

  LOG.info("Setting failure injector and performing a read which should fail...");
  DataNodeFaultInjector failureInjector = Mockito.mock(DataNodeFaultInjector.class);
  Mockito.doAnswer(new Answer<Void>() {
    @Override
    public Void answer(InvocationOnMock invocation) throws Throwable {
      throw new IOException("injected error into sendShmResponse");
    }
  }).when(failureInjector).sendShortCircuitShmResponse();
  DataNodeFaultInjector prevInjector = DataNodeFaultInjector.instance;
  DataNodeFaultInjector.instance = failureInjector;

  try {
    // The first read will try to allocate a shared memory segment and slot.
    // The shared memory segment allocation will fail because of the failure
    // injector.
    DFSTestUtil.readFileBuffer(fs, TEST_PATH1);
    Assert.fail("expected readFileBuffer to fail, but it succeeded.");
  } catch (Throwable t) {
    GenericTestUtils.assertExceptionContains("TCP reads were disabled for "
        + "testing, but we failed to do a non-TCP read.", t);
  }
  checkNumberOfSegmentsAndSlots(0, 0,
      cluster.getDataNodes().get(0).getShortCircuitRegistry());

  LOG.info("Clearing failure injector and performing another read...");
  DataNodeFaultInjector.instance = prevInjector;
  fs.getClient().getClientContext().getDomainSocketFactory().clearPathMap();
  // The second read should succeed.
  DFSTestUtil.readFileBuffer(fs, TEST_PATH1);
  // We should have added a new short-circuit shared memory segment and slot.
  checkNumberOfSegmentsAndSlots(1, 1,
      cluster.getDataNodes().get(0).getShortCircuitRegistry());

  cluster.shutdown();
  sockDir.close();
}
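As a side note, the doAnswer stub above can be written more compactly with Mockito's doThrow; the snippet below is an equivalent alternative using the same failureInjector mock from this test, not what the Hadoop test itself does.

Mockito.doThrow(new IOException("injected error into sendShmResponse"))
    .when(failureInjector).sendShortCircuitShmResponse();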
Use of org.apache.hadoop.hdfs.server.datanode.DataNodeFaultInjector in project hadoop by apache.
Class TestClientProtocolForPipelineRecovery, method testPipelineRecoveryWithTransferBlock.
// Test to verify that blocks are no longer corrupted after HDFS-4660.
// If HDFS-4660 and the related fixes (HDFS-9220, HDFS-8722) are reverted,
// this test fails.
// Scenario: prior to the fix, a block got corrupted when transferBlock
// happened during pipeline recovery with extra bytes to make up the end
// of a chunk.
// For verification, the pipeline needs to be failed for the last datanode
// while the second datanode has more bytes on disk than have been acked.
// This makes the recovery transfer extra bytes to the new node to make up
// the end of the chunk, and is achieved by the customized
// DataNodeFaultInjector in this test.
// For details, see HDFS-4660 and HDFS-10587. HDFS-9220 fixes an issue in
// the HDFS-4660 patch, and HDFS-8722 is an optimization.
@Test
public void testPipelineRecoveryWithTransferBlock() throws Exception {
  final int chunkSize = 512;
  final int oneWriteSize = 5000;
  final int totalSize = 1024 * 1024;
  final int errorInjectionPos = 512;
  Configuration conf = new HdfsConfiguration();
  // Need 4 datanodes to verify the replaceDatanode during pipeline recovery.
  final MiniDFSCluster cluster =
      new MiniDFSCluster.Builder(conf).numDataNodes(4).build();
  DataNodeFaultInjector old = DataNodeFaultInjector.get();

  try {
    DistributedFileSystem fs = cluster.getFileSystem();
    Path fileName = new Path("/f");
    FSDataOutputStream o = fs.create(fileName);
    int count = 0;
    // Flush to get the pipeline created.
    o.writeBytes("hello");
    o.hflush();
    DFSOutputStream dfsO = (DFSOutputStream) o.getWrappedStream();
    final DatanodeInfo[] pipeline = dfsO.getStreamer().getNodes();
    final String lastDn = pipeline[2].getXferAddr(false);
    final AtomicBoolean failed = new AtomicBoolean(false);

    DataNodeFaultInjector.set(new DataNodeFaultInjector() {
      @Override
      public void failPipeline(ReplicaInPipeline replicaInfo, String mirror)
          throws IOException {
        if (!lastDn.equals(mirror)) {
          // Only fail on the second datanode, whose mirror is the last
          // datanode in the pipeline.
          return;
        }
        if (!failed.get() && (replicaInfo.getBytesAcked() > errorInjectionPos)
            && (replicaInfo.getBytesAcked() % chunkSize != 0)) {
          int tries = 0;
          while (tries < 10) {
            // Fail the pipeline once bytesOnDisk is at least one full chunk
            // ahead of bytesAcked, so that pipeline recovery has to transfer
            // extra bytes to make up the end of the chunk and the corruption
            // described in HDFS-4660 would occur.
            if ((replicaInfo.getBytesOnDisk() / chunkSize)
                - (replicaInfo.getBytesAcked() / chunkSize) >= 1) {
              failed.set(true);
              throw new IOException("Failing Pipeline "
                  + replicaInfo.getBytesAcked() + " : "
                  + replicaInfo.getBytesOnDisk());
            }
            try {
              Thread.sleep(200);
            } catch (InterruptedException e) {
            }
            tries++;
          }
        }
      }
    });

    Random r = new Random();
    byte[] b = new byte[oneWriteSize];
    while (count < totalSize) {
      r.nextBytes(b);
      o.write(b);
      count += oneWriteSize;
      o.hflush();
    }
    assertTrue("Expected a failure in the pipeline", failed.get());
    DatanodeInfo[] newNodes = dfsO.getStreamer().getNodes();
    o.close();

    // Trigger block reports to the NameNode.
    for (DataNode d : cluster.getDataNodes()) {
      DataNodeTestUtils.triggerBlockReport(d);
    }

    // Read from the replaced datanode to verify there is no corruption, so
    // shut down all the other nodes in the pipeline.
    List<DatanodeInfo> pipelineList = Arrays.asList(pipeline);
    DatanodeInfo newNode = null;
    for (DatanodeInfo node : newNodes) {
      if (!pipelineList.contains(node)) {
        newNode = node;
        break;
      }
    }
    LOG.info("Number of nodes in pipeline: {} newNode {}",
        newNodes.length, newNode.getName());

    // Shut down the two old datanodes.
    for (int i = 0; i < newNodes.length; i++) {
      if (newNodes[i].getName().equals(newNode.getName())) {
        continue;
      }
      LOG.info("shutdown {}", newNodes[i].getName());
      cluster.stopDataNode(newNodes[i].getName());
    }

    // The read should be successful from the newNode alone, and no
    // corruption should be reported.
    DFSTestUtil.readFile(fs, fileName);
  } finally {
    DataNodeFaultInjector.set(old);
    cluster.shutdown();
  }
}
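The loop above that identifies the replacement datanode relies on DatanodeInfo equality, the same property pipelineList.contains() uses. For illustration only, the same lookup could be factored into a small helper based on a set difference; the class and method names here are hypothetical, not part of Hadoop.

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.hdfs.protocol.DatanodeInfo;

// Hypothetical helper: return a node that is in the new pipeline but not in
// the old one, or null if the two pipelines contain the same nodes.
final class PipelineDiff {
  static DatanodeInfo findReplacement(DatanodeInfo[] oldPipeline,
      DatanodeInfo[] newPipeline) {
    Set<DatanodeInfo> diff = new HashSet<>(Arrays.asList(newPipeline));
    diff.removeAll(Arrays.asList(oldPipeline));
    return diff.isEmpty() ? null : diff.iterator().next();
  }
}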