use of org.apache.hadoop.hbase.client.RetriesExhaustedException in project hbase by apache.
the class TestMasterShutdown method testMasterShutdownBeforeStartingAnyRegionServer.
/**
* This test intentionally races a thread that issues a shutdown RPC to the master against the
* master concurrently realizing it cannot complete initialization because no region servers are
* available to it. The expected behavior is that master initialization is interruptible via said
* shutdown RPC.
*/
@Test
public void testMasterShutdownBeforeStartingAnyRegionServer() throws Exception {
LocalHBaseCluster hbaseCluster = null;
try {
htu = new HBaseTestingUtil(createMasterShutdownBeforeStartingAnyRegionServerConfiguration());
// Configure a cluster with one datanode, one master, and no region servers.
final StartTestingClusterOption options = StartTestingClusterOption.builder()
  .numDataNodes(1)
  .numMasters(1)
  .numRegionServers(0)
  .masterClass(HMaster.class)
  .rsClass(SingleProcessHBaseCluster.MiniHBaseClusterRegionServer.class)
  .createRootDir(true)
  .build();
// Can't simply `htu.startMiniCluster(options)` because that method waits for the master to
// start completely. However, this test's premise is that a partially started master should
// still respond to a shutdown RPC. So instead, we manage each component lifecycle
// independently.
// I think it's not worth refactoring HTU's helper methods just for this class.
htu.startMiniDFSCluster(options.getNumDataNodes());
htu.startMiniZKCluster(options.getNumZkServers());
htu.createRootDir();
hbaseCluster = new LocalHBaseCluster(htu.getConfiguration(), options.getNumMasters(),
  options.getNumRegionServers(), options.getMasterClass(), options.getRsClass());
final MasterThread masterThread = hbaseCluster.getMasters().get(0);
masterThread.start();
// Switching to the master registry exacerbated a race in master bootstrap that can result
// in a lost shutdown command (HBASE-8422, HBASE-23836). The race exists essentially because
// the server manager in HMaster is not yet initialized by the time the shutdown() RPC (below)
// is made to the master. The suspected reason this was uncommon before HBASE-18095 is that
// connection creation with the ZK registry is slow enough that the server manager is usually
// initialized in time for the RPC to be made. For now, the test waits explicitly for the
// server manager to become available.
final long timeout = TimeUnit.MINUTES.toMillis(10);
assertNotEquals("timeout waiting for server manager to become available.", -1,
  htu.waitFor(timeout, () -> masterThread.getMaster().getServerManager() != null));
// Master has come up far enough that we can terminate it without creating a zombie.
try {
// HBASE-24327: resolve flaky connection issues.
// The shutdown() RPC can hit flaky ZK connection issues, e.g.:
// ERROR [RpcServer.priority.RWQ.Fifo.read.handler=1,queue=1,port=53033]
// master.HMaster(2878): ZooKeeper exception trying to set cluster as down in ZK
// org.apache.zookeeper.KeeperException$SystemErrorException:
// KeeperErrorCode = SystemError
//
// However, even when such flakes happen, the shutdown does complete on the server side
// despite the failed RPC. Subsequent retries therefore can never succeed, because the
// HMaster is already shut down, so the call may fail spuriously. To resolve this, we
// make a single shutdown() call and tolerate the resulting connection-loss exception.
htu.getConnection().getAdmin().shutdown();
} catch (RetriesExhaustedException e) {
if (e.getCause() instanceof ConnectionClosedException) {
LOG.info("Connection is Closed to the cluster. The cluster is already down.", e);
} else {
throw e;
}
}
LOG.info("Shutdown RPC sent.");
masterThread.join();
} finally {
if (hbaseCluster != null) {
hbaseCluster.shutdown();
}
if (htu != null) {
htu.shutdownMiniCluster();
htu = null;
}
}
}
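The catch block above generalizes into a reusable pattern for shutting down a cluster whose connection may drop mid-RPC. Below is a minimal sketch of that pattern, assuming only the standard HBase client API; the class and method names are hypothetical.

import java.io.IOException;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.exceptions.ConnectionClosedException;

public final class ClusterShutdownHelper {
  /** Issues one shutdown RPC and tolerates the connection dropping under us. */
  public static void shutdownIgnoringLostConnection(Connection connection) throws IOException {
    try (Admin admin = connection.getAdmin()) {
      // One shutdown() call is enough; a retry can never succeed once the
      // master has acted on the first attempt.
      admin.shutdown();
    } catch (RetriesExhaustedException e) {
      if (!(e.getCause() instanceof ConnectionClosedException)) {
        throw e;
      }
      // The connection dropped because the cluster is already going down;
      // treat this as a successful shutdown.
    }
  }
}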
use of org.apache.hadoop.hbase.client.RetriesExhaustedException in project hbase by apache.
the class TestAMServerFailedOpen method testRetriesExhaustedFailure.
private void testRetriesExhaustedFailure(final TableName tableName, final MockRSExecutor executor) throws Exception {
RegionInfo hri = createRegionInfo(tableName, 1);
// collect AM metrics before test
collectAssignmentManagerMetrics();
// Test Assign operation failure
rsDispatcher.setMockRsExecutor(executor);
try {
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
fail("unexpected assign completion");
} catch (RetriesExhaustedException e) {
// expected exception
LOG.info("expected exception from assign operation: " + e.getMessage(), e);
}
// Assign the region (without problems)
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
}
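The test relies on a waitOnFuture helper to turn the procedure's asynchronous failure into an exception that the try/catch above can observe. A plausible shape for that helper (hypothetical; the timeout value is illustrative):

import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;

private byte[] waitOnFuture(Future<byte[]> future) throws Exception {
  try {
    // Bound the wait so a stuck procedure fails the test rather than hanging it.
    return future.get(3, TimeUnit.MINUTES);
  } catch (ExecutionException e) {
    // Unwrap so callers can catch the real failure, e.g. RetriesExhaustedException.
    if (e.getCause() instanceof Exception) {
      throw (Exception) e.getCause();
    }
    throw e;
  }
}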
use of org.apache.hadoop.hbase.client.RetriesExhaustedException in project hbase by apache.
the class TestAMServerFailedOpen method testFailedOpen.
private void testFailedOpen(final TableName tableName, final MockRSExecutor executor) throws Exception {
final RegionInfo hri = createRegionInfo(tableName, 1);
// Test Assign operation failure
rsDispatcher.setMockRsExecutor(executor);
try {
waitOnFuture(submitProcedure(createAssignProcedure(hri)));
fail("unexpected assign completion");
} catch (RetriesExhaustedException e) {
// expected exception
LOG.info("REGION STATE " + am.getRegionStates().getRegionStateNode(hri));
LOG.info("expected exception from assign operation: " + e.getMessage(), e);
assertTrue(am.getRegionStates().getRegionState(hri).isFailedOpen());
}
}
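For context on why the assign procedure stops retrying at all: the assignment manager bounds its attempts via configuration, and the test setup lowers that bound so a permanently failing region server surfaces as RetriesExhaustedException quickly. A hedged sketch of that setup, assuming "hbase.assignment.maximum.attempts" is the key AssignmentManager reads (the value 3 is illustrative):

// In the test's configuration setup: bound how many times the assign
// procedure retries before giving up with RetriesExhaustedException.
conf.setInt("hbase.assignment.maximum.attempts", 3);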
use of org.apache.hadoop.hbase.client.RetriesExhaustedException in project hbase by apache.
the class TestRpcClientLeaks method testSocketClosed.
@Test
public void testSocketClosed() throws IOException, InterruptedException {
TableName tableName = TableName.valueOf(name.getMethodName());
UTIL.createTable(tableName, fam1).close();
Configuration conf = new Configuration(UTIL.getConfiguration());
conf.set(RpcClientFactory.CUSTOM_RPC_CLIENT_IMPL_CONF_KEY, MyRpcClientImpl.class.getName());
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 2);
try (Connection connection = ConnectionFactory.createConnection(conf);
Table table = connection.getTable(tableName)) {
MyRpcClientImpl.enableThrowExceptions();
table.get(new Get(Bytes.toBytes("asd")));
fail("Should fail because the injected error");
} catch (RetriesExhaustedException e) {
// expected
}
for (Socket socket : SAVED_SOCKETS) {
assertTrue("Socket " + socket + " is not closed", socket.isClosed());
}
}
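Outside of the fault-injection harness, the client-visible behavior this test pins down is simple: with HBASE_CLIENT_RETRIES_NUMBER bounded, a persistent connection failure surfaces as RetriesExhaustedException instead of retrying indefinitely. A minimal standalone sketch using only the public HBase client API (the table name and row key are placeholders):

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.util.Bytes;

public class BoundedRetryGet {
  public static void main(String[] args) throws IOException {
    Configuration conf = HBaseConfiguration.create();
    // Give up after two attempts instead of the (much larger) default.
    conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 2);
    try (Connection conn = ConnectionFactory.createConnection(conf);
        Table table = conn.getTable(TableName.valueOf("t1"))) {
      table.get(new Get(Bytes.toBytes("row")));
    } catch (RetriesExhaustedException e) {
      // All attempts failed; the cause chain carries the last underlying error.
      System.err.println("Gave up after retries: " + e.getCause());
    }
  }
}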
use of org.apache.hadoop.hbase.client.RetriesExhaustedException in project hbase by apache.
the class TestSyncReplicationMoreLogsInLocalGiveUpSplitting method testSplitLog.
@Test
public void testSplitLog() throws Exception {
UTIL1.getAdmin().disableReplicationPeer(PEER_ID);
UTIL2.getAdmin().disableReplicationPeer(PEER_ID);
UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, SyncReplicationState.STANDBY);
UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, SyncReplicationState.ACTIVE);
try (Table table = UTIL1.getConnection().getTable(TABLE_NAME)) {
table.put(new Put(Bytes.toBytes(0)).addColumn(CF, CQ, Bytes.toBytes(0)));
}
HRegionServer rs = UTIL1.getRSForFirstRegionInTable(TABLE_NAME);
DualAsyncFSWALForTest wal = (DualAsyncFSWALForTest) rs.getWAL(RegionInfoBuilder.newBuilder(TABLE_NAME).build());
wal.setRemoteBroken();
wal.suspendLogRoll();
try (AsyncConnection conn = ConnectionFactory.createAsyncConnection(UTIL1.getConfiguration()).get()) {
AsyncTable<?> table = conn.getTableBuilder(TABLE_NAME).setMaxAttempts(1).setWriteRpcTimeout(5, TimeUnit.SECONDS).build();
try {
table.put(new Put(Bytes.toBytes(1)).addColumn(CF, CQ, Bytes.toBytes(1))).get();
fail("Should fail since the rs will hang and we will get a rpc timeout");
} catch (ExecutionException e) {
// expected
LOG.info("Expected error:", e);
}
}
wal.waitUntilArrive();
UTIL2.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, SyncReplicationState.DOWNGRADE_ACTIVE);
wal.resumeLogRoll();
try (Table table = UTIL2.getConnection().getTable(TABLE_NAME)) {
assertEquals(0, Bytes.toInt(table.get(new Get(Bytes.toBytes(0))).getValue(CF, CQ)));
// we failed to write this entry to remote so it should not exist
assertFalse(table.exists(new Get(Bytes.toBytes(1))));
}
UTIL1.getAdmin().transitReplicationPeerSyncReplicationState(PEER_ID, SyncReplicationState.STANDBY);
// The STANDBY state cannot be read from the client.
try (Table table = UTIL1.getConnection().getTable(TABLE_NAME)) {
try {
table.exists(new Get(Bytes.toBytes(0)));
} catch (DoNotRetryIOException | RetriesExhaustedException e) {
// expected
assertThat(e.getMessage(), containsString("STANDBY"));
}
}
HRegion region = UTIL1.getMiniHBaseCluster().getRegions(TABLE_NAME).get(0);
// we give up splitting the whole wal file so this record will also be gone.
assertTrue(region.get(new Get(Bytes.toBytes(0))).isEmpty());
UTIL2.getAdmin().enableReplicationPeer(PEER_ID);
// finally it should be replicated back
waitUntilReplicationDone(UTIL1, 1);
}
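The final waitUntilReplicationDone(UTIL1, 1) call polls until the replicated row is visible again. A plausible shape for that helper (hypothetical; the real one lives in the sync replication test base class). Note that it reads through the HRegion directly, since UTIL1 is in STANDBY state at that point and client-side reads would be rejected:

private void waitUntilReplicationDone(HBaseTestingUtil util, int end) throws Exception {
  // Client reads are rejected while the peer is in STANDBY state, so poll the
  // region directly until the last expected row (end - 1) appears.
  HRegion region = util.getMiniHBaseCluster().getRegions(TABLE_NAME).get(0);
  util.waitFor(TimeUnit.SECONDS.toMillis(30),
    () -> !region.get(new Get(Bytes.toBytes(end - 1))).isEmpty());
}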