use of org.apache.hbase.thirdparty.io.netty.channel.ChannelInboundHandlerAdapter in project hbase by apache.
the class TestFanOutOneBlockAsyncDFSOutputHang method testFlushHangWhenOneDataNodeFailedBeforeOtherDataNodeAck.
/**
* <pre>
* This test is for HBASE-26679. Consider there are two dataNodes: dn1 and dn2,dn2 is a slow DN.
* The threads sequence before HBASE-26679 is:
* 1.We write some data to {@link FanOutOneBlockAsyncDFSOutput} and then flush it, there are one
* {@link FanOutOneBlockAsyncDFSOutput.Callback} in
* {@link FanOutOneBlockAsyncDFSOutput#waitingAckQueue}.
* 2.The ack from dn1 arrives firstly and triggers Netty to invoke
* {@link FanOutOneBlockAsyncDFSOutput#completed} with dn1's channel, then in
* {@link FanOutOneBlockAsyncDFSOutput#completed}, dn1's channel is removed from
* {@link FanOutOneBlockAsyncDFSOutput.Callback#unfinishedReplicas}.
* 3.But dn2 responds slowly, before dn2 sending ack,dn1 is shut down or have a exception,
* so {@link FanOutOneBlockAsyncDFSOutput#failed} is triggered by Netty with dn1's channel,
* and because the {@link FanOutOneBlockAsyncDFSOutput.Callback#unfinishedReplicas} does not
* contain dn1's channel,the {@link FanOutOneBlockAsyncDFSOutput.Callback} is skipped in
* {@link FanOutOneBlockAsyncDFSOutput#failed} method,and
* {@link FanOutOneBlockAsyncDFSOutput#state} is set to
* {@link FanOutOneBlockAsyncDFSOutput.State#BROKEN},and dn1,dn2 are all closed at the end of
* {@link FanOutOneBlockAsyncDFSOutput#failed}.
* 4.{@link FanOutOneBlockAsyncDFSOutput#failed} is triggered again by dn2 because it is closed,
* but because {@link FanOutOneBlockAsyncDFSOutput#state} is already
* {@link FanOutOneBlockAsyncDFSOutput.State#BROKEN},the whole
* {@link FanOutOneBlockAsyncDFSOutput#failed} is skipped. So wait on the future
* returned by {@link FanOutOneBlockAsyncDFSOutput#flush} would be stuck for ever.
* After HBASE-26679, for above step 4,even if the {@link FanOutOneBlockAsyncDFSOutput#state}
* is already {@link FanOutOneBlockAsyncDFSOutput.State#BROKEN}, we would still try to trigger
* {@link FanOutOneBlockAsyncDFSOutput.Callback#future}.
* </pre>
*/
@Test
public void testFlushHangWhenOneDataNodeFailedBeforeOtherDataNodeAck() throws Exception {
DataNodeProperties firstDataNodeProperties = null;
try {
final CyclicBarrier dn1AckReceivedCyclicBarrier = new CyclicBarrier(2);
Map<Channel, DatanodeInfo> datanodeInfoMap = OUT.getDatanodeInfoMap();
Iterator<Map.Entry<Channel, DatanodeInfo>> iterator = datanodeInfoMap.entrySet().iterator();
assertTrue(iterator.hasNext());
Map.Entry<Channel, DatanodeInfo> dn1Entry = iterator.next();
Channel dn1Channel = dn1Entry.getKey();
DatanodeInfo dn1DatanodeInfo = dn1Entry.getValue();
final List<String> protobufDecoderNames = new ArrayList<String>();
dn1Channel.pipeline().forEach((entry) -> {
if (ProtobufDecoder.class.isInstance(entry.getValue())) {
protobufDecoderNames.add(entry.getKey());
}
});
assertTrue(protobufDecoderNames.size() == 1);
dn1Channel.pipeline().addAfter(protobufDecoderNames.get(0), "dn1AckReceivedHandler", new ChannelInboundHandlerAdapter() {
@Override
public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
super.channelRead(ctx, msg);
dn1AckReceivedCyclicBarrier.await();
}
});
assertTrue(iterator.hasNext());
Map.Entry<Channel, DatanodeInfo> dn2Entry = iterator.next();
Channel dn2Channel = dn2Entry.getKey();
/**
* Here we add a {@link ChannelInboundHandlerAdapter} to eat all the responses to simulate a
* slow dn2.
*/
dn2Channel.pipeline().addFirst(new ChannelInboundHandlerAdapter() {
@Override
public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
if (!(msg instanceof ByteBuf)) {
ctx.fireChannelRead(msg);
}
}
});
byte[] b = new byte[10];
ThreadLocalRandom.current().nextBytes(b);
OUT.write(b, 0, b.length);
CompletableFuture<Long> future = OUT.flush(false);
/**
* Wait for ack from dn1.
*/
dn1AckReceivedCyclicBarrier.await();
/**
* First ack is received from dn1,we could stop dn1 now.
*/
firstDataNodeProperties = findAndKillFirstDataNode(dn1DatanodeInfo);
assertTrue(firstDataNodeProperties != null);
try {
/**
* Before HBASE-26679,here we should be stuck, after HBASE-26679,we would fail soon with
* {@link ExecutionException}.
*/
future.get();
fail();
} catch (ExecutionException e) {
assertTrue(e != null);
LOG.info("expected exception caught when get future", e);
}
/**
* Make sure all the data node channel are closed.
*/
datanodeInfoMap.keySet().forEach(ch -> {
try {
ch.closeFuture().get();
} catch (InterruptedException | ExecutionException e) {
throw new RuntimeException(e);
}
});
} finally {
if (firstDataNodeProperties != null) {
CLUSTER.restartDataNode(firstDataNodeProperties);
}
}
}
Aggregations