use of org.apache.hadoop.hbase.CallQueueTooBigException in project hbase by apache.
the class ConnectionImplementation method locateRegionInMeta.
* Search the hbase:meta table for the HRegionLocation
* info that contains the table and row we're seeking.
/**
 * Resolves the region locations for {@code row} of {@code tableName}, answering from the
 * location cache when allowed and otherwise reading hbase:meta, with a pause between attempts.
 *
 * @param tableName table whose region is being looked up
 * @param row row the region has to contain
 * @param useCache when true, a cached entry that has a location for {@code replicaId} is
 *   returned without querying meta; when false, the cached entry for the row is cleared
 * @param retry when true up to {@code numTries} attempts are made, otherwise exactly one
 * @param replicaId replica whose location must be present in the returned locations
 * @return the locations taken from the cache, or read from meta and cached before returning
 * @throws IOException a {@code TableNotFoundException} is rethrown immediately; any other
 *   IOException is rethrown once the last attempt has failed; a
 *   {@code NoServerForRegionException} is raised when the attempt counter reaches the limit;
 *   an {@code InterruptedIOException} is raised when the pause between attempts is interrupted
 */
private RegionLocations locateRegionInMeta(TableName tableName, byte[] row, boolean useCache, boolean retry, int replicaId) throws IOException {
// If we are supposed to be using the cache, look up the cache to see if
// we already have the region.
if (useCache) {
RegionLocations locations = getCachedLocation(tableName, row);
if (locations != null && locations.getRegionLocation(replicaId) != null) {
return locations;
// build the key of the meta region we should be looking for.
// the extra 9's on the end are necessary to allow "exact" matches
// without knowing the precise region names.
byte[] metaKey = HRegionInfo.createRegionName(tableName, row, HConstants.NINES, false);
// NOTE(review): the configuration of this scan and the body of the useMetaReplicas branch
// are not present in this excerpt.
Scan s = new Scan();
if (this.useMetaReplicas) {
// A single attempt is made unless the caller asked for retries.
int maxAttempts = (retry ? numTries : 1);
// The loop has no exit condition of its own: it ends by returning locations or by throwing.
for (int tries = 0; true; tries++) {
if (tries >= maxAttempts) {
throw new NoServerForRegionException("Unable to find region for " + Bytes.toStringBinary(row) + " in " + tableName + " after " + tries + " tries.");
// Re-check the cache on every attempt before going to meta.
if (useCache) {
RegionLocations locations = getCachedLocation(tableName, row);
if (locations != null && locations.getRegionLocation(replicaId) != null) {
return locations;
} else {
// If we are not supposed to be using the cache, delete any existing cached location
// so it won't interfere.
metaCache.clearCache(tableName, row);
// Query the meta region
// Default pause; replaced below by pauseForCQTBE when the failure is a
// CallQueueTooBigException.
long pauseBase = this.pause;
try {
Result regionInfoRow = null;
try (ReversedClientScanner rcs = new ReversedClientScanner(conf, s, TableName.META_TABLE_NAME, this, rpcCallerFactory, rpcControllerFactory, getMetaLookupPool(), 0)) {
// NOTE(review): the right-hand side of this assignment is missing in this excerpt;
// presumably it reads a row from the scanner - confirm against the real source.
regionInfoRow =;
// No row at all is reported as a missing table.
if (regionInfoRow == null) {
throw new TableNotFoundException(tableName);
// convert the row result into the HRegionLocation we need!
RegionLocations locations = MetaTableAccessor.getRegionLocations(regionInfoRow);
if (locations == null || locations.getRegionLocation(replicaId) == null) {
throw new IOException("HRegionInfo was null in " + tableName + ", row=" + regionInfoRow);
HRegionInfo regionInfo = locations.getRegionLocation(replicaId).getRegionInfo();
if (regionInfo == null) {
throw new IOException("HRegionInfo was null or empty in " + TableName.META_TABLE_NAME + ", row=" + regionInfoRow);
// possible we got a region of a different table...
if (!regionInfo.getTable().equals(tableName)) {
throw new TableNotFoundException("Table '" + tableName + "' was not found, got: " + regionInfo.getTable() + ".");
// Split parents and offline regions are rejected rather than returned.
if (regionInfo.isSplit()) {
throw new RegionOfflineException("the only available region for" + " the required row is a split parent," + " the daughters should be online soon: " + regionInfo.getRegionNameAsString());
if (regionInfo.isOffline()) {
throw new RegionOfflineException("the region is offline, could" + " be caused by a disable table call: " + regionInfo.getRegionNameAsString());
// The row must name a server, and that server must not be known to be dead.
ServerName serverName = locations.getRegionLocation(replicaId).getServerName();
if (serverName == null) {
throw new NoServerForRegionException("No server address listed " + "in " + TableName.META_TABLE_NAME + " for region " + regionInfo.getRegionNameAsString() + " containing row " + Bytes.toStringBinary(row));
if (isDeadServer(serverName)) {
throw new RegionServerStoppedException("hbase:meta says the region " + regionInfo.getRegionNameAsString() + " is managed by the server " + serverName + ", but it is dead.");
// Instantiate the location
// Remember the result so later lookups can be answered from the cache.
cacheLocation(tableName, locations);
return locations;
} catch (TableNotFoundException e) {
// A missing table is not retried: rethrow immediately.
// NOTE(review): the original comment here was cut down to "from the HTable constructor."
throw e;
} catch (IOException e) {
// Work with the server-side exception rather than its RemoteException wrapper.
if (e instanceof RemoteException) {
e = ((RemoteException) e).unwrapRemoteException();
if (e instanceof CallQueueTooBigException) {
// Give a special check on CallQueueTooBigException, see #HBASE-17114
pauseBase = this.pauseForCQTBE;
// Attempts remain: log at debug level and fall through to the pause. Last attempt: rethrow.
if (tries < maxAttempts - 1) {
if (LOG.isDebugEnabled()) {
// NOTE(review): nothing is appended after "metaLocation=" in this message.
LOG.debug("locateRegionInMeta parentTable=" + TableName.META_TABLE_NAME + ", metaLocation=" + ", attempt=" + tries + " of " + maxAttempts + " failed; retrying after sleep of " + ConnectionUtils.getPauseTime(pauseBase, tries) + " because: " + e.getMessage());
} else {
throw e;
// Only relocate the parent region if necessary
// (skipped for RegionOfflineException and NoServerForRegionException).
if (!(e instanceof RegionOfflineException || e instanceof NoServerForRegionException)) {
relocateRegion(TableName.META_TABLE_NAME, metaKey, replicaId);
// Pause before the next attempt; the duration depends on pauseBase and the attempt number.
try {
Thread.sleep(ConnectionUtils.getPauseTime(pauseBase, tries));
} catch (InterruptedException e) {
// NOTE(review): the visible code does not restore the thread's interrupt flag before
// converting the interruption into an InterruptedIOException.
throw new InterruptedIOException("Giving up trying to location region in " + "meta: thread is interrupted.");
use of org.apache.hadoop.hbase.CallQueueTooBigException in project hbase by apache.
the class TestAsyncProcess method testRetryPauseWithCallQueueTooBigException.
* Verifies that the special pause setting is used when retrying after a
* CallQueueTooBigException; see HBASE-17114.
* @throws Exception if an unexpected error happens during the test
// Two-phase timing test: first with a CallQueueTooBigException failure, asserting the elapsed
// time is at least the pause derived from the special setting; then with a plain IOException,
// asserting the elapsed time stays within a bound derived from the normal pause.
public void testRetryPauseWithCallQueueTooBigException() throws Exception {
// Work on a copy of the shared configuration so the overrides stay local to this test.
Configuration myConf = new Configuration(CONF);
final long specialPause = 500L;
final int retries = 1;
myConf.setLong(HConstants.HBASE_CLIENT_PAUSE_FOR_CQTBE, specialPause);
myConf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, retries);
ClusterConnection conn = new MyConnectionImpl(myConf);
// Phase 1: the AsyncProcess test double is built with a CallQueueTooBigException.
AsyncProcessWithFailure ap = new AsyncProcessWithFailure(conn, myConf, new CallQueueTooBigException());
BufferedMutatorParams bufferParam = createBufferedMutatorParams(ap, DUMMY_TABLE);
BufferedMutatorImpl mutator = new BufferedMutatorImpl(conn, bufferParam, ap);
Put p = createPut(1, true);
long startTime = System.currentTimeMillis();
// NOTE(review): the body of this try block is missing in this excerpt; presumably it submits
// the Put through the mutator - confirm against the real source.
try {
} catch (RetriesExhaustedWithDetailsException expected) {
long actualSleep = System.currentTimeMillis() - startTime;
// Sum the pause expected for each retry when the special pause is the base.
long expectedSleep = 0L;
for (int i = 0; i < retries; i++) {
expectedSleep += ConnectionUtils.getPauseTime(specialPause, i);
// Prevent jitter in ConnectionUtils#getPauseTime from affecting the result:
// 1% of the special pause is added to the measured time before comparing.
actualSleep += (long) (specialPause * 0.01f);
LOG.debug("Expected to sleep " + expectedSleep + "ms, actually slept " + actualSleep + "ms");
// Lower bound: the special pause must actually have been waited.
Assert.assertTrue("Expected to sleep " + expectedSleep + " but actually " + actualSleep + "ms", actualSleep >= expectedSleep);
// check and confirm normal IOE will use the normal pause
final long normalPause = myConf.getLong(HConstants.HBASE_CLIENT_PAUSE, HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
// Phase 2: same setup, but the failure is a plain IOException.
ap = new AsyncProcessWithFailure(conn, myConf, new IOException());
bufferParam = createBufferedMutatorParams(ap, DUMMY_TABLE);
mutator = new BufferedMutatorImpl(conn, bufferParam, ap);
startTime = System.currentTimeMillis();
// NOTE(review): the body of this try block is missing in this excerpt as well.
try {
} catch (RetriesExhaustedWithDetailsException expected) {
actualSleep = System.currentTimeMillis() - startTime;
expectedSleep = 0L;
for (int i = 0; i < retries; i++) {
expectedSleep += ConnectionUtils.getPauseTime(normalPause, i);
// plus an additional pause to balance the program execution time
expectedSleep += normalPause;
LOG.debug("Expected to sleep " + expectedSleep + "ms, actually slept " + actualSleep + "ms");
// Upper bound: with a normal IOException the wait must not exceed the normal-pause budget.
Assert.assertTrue("Slept for too long: " + actualSleep + "ms", actualSleep <= expectedSleep);
use of org.apache.hadoop.hbase.CallQueueTooBigException in project hbase by apache.
the class TestAsyncProcess method testCallQueueTooLarge.
/**
 * Builds a mutator on top of an AsyncProcess test double constructed with a
 * {@code CallQueueTooBigException} and asserts that the double's call counter equals
 * {@code NB_RETRIES + 1}.
 *
 * @throws IOException declared by the test signature
 */
public void testCallQueueTooLarge() throws IOException {
ClusterConnection conn = new MyConnectionImpl(CONF);
AsyncProcessWithFailure ap = new AsyncProcessWithFailure(conn, CONF, new CallQueueTooBigException());
BufferedMutatorParams bufferParam = createBufferedMutatorParams(ap, DUMMY_TABLE);
BufferedMutatorImpl mutator = new BufferedMutatorImpl(conn, bufferParam, ap);
Put p = createPut(1, true);
// NOTE(review): the body of this try block is missing in this excerpt; presumably it submits
// the Put through the mutator - confirm against the real source.
try {
} catch (RetriesExhaustedWithDetailsException expected) {
// Checking that the ErrorsServers came into play and didn't make us stop immediately
Assert.assertEquals(NB_RETRIES + 1, ap.callsCt.get());
use of org.apache.hadoop.hbase.CallQueueTooBigException in project hbase by apache.
the class RpcRetryingCallerImpl method callWithRetries.
/**
 * Runs {@code callable} in a loop, pausing between failed attempts, until an attempt
 * completes, the attempts are exhausted, the projected duration exceeds {@code callTimeout},
 * or the caller is cancelled.
 *
 * @param callable operation that is prepared on every attempt, told about each failure, and
 *   asked for the length of each pause
 * @param callTimeout limit compared against the projected call duration before pausing
 *   (presumably milliseconds - confirm against singleCallDuration)
 * @return {@code null} when cancellation is observed around the pause; NOTE(review): the
 *   statement that returns the call's result is not present in this excerpt
 * @throws IOException a {@code RetriesExhaustedException} once {@code tries} reaches
 *   {@code maxAttempts - 1}, a {@code SocketTimeoutException} when the projected duration
 *   exceeds {@code callTimeout}, or an {@code InterruptedIOException} when the wait is
 *   interrupted
 * @throws RuntimeException declared by the signature
 */
public T callWithRetries(RetryingCallable<T> callable, int callTimeout) throws IOException, RuntimeException {
// NOTE(review): no statement in the visible lines adds to this list before it is passed to
// RetriesExhaustedException below.
List<RetriesExhaustedException.ThrowableWithExtraContext> exceptions = new ArrayList<>();
// No loop condition: the loop ends only through a return or a throw.
for (int tries = 0; ; tries++) {
long expectedSleep;
try {
// bad cache entries are cleared in the call to RetryingCallable#throwable() in catch block
// The flag passed to prepare() is true on every attempt after the first.
callable.prepare(tries != 0);
interceptor.intercept(context.prepare(callable, tries));
} catch (PreemptiveFastFailException e) {
// Propagated unchanged; no retry handling is applied to this exception type.
throw e;
} catch (Throwable t) {
// NOTE(review): this local is not used anywhere in the visible lines.
Throwable e = t.getCause();
// translateException throws exception when should not retry: i.e. when request is bad.
interceptor.handleFailure(context, t);
t = translateException(t);
// NOTE(review): the logging call that consumed this message is missing in this excerpt.
if (tries > startLogErrorsCnt) {"Call exception, tries=" + tries + ", maxAttempts=" + maxAttempts + ", started=" + (EnvironmentEdgeManager.currentTime() - tracker.getStartTime()) + " ms ago, " + "cancelled=" + cancelled.get() + ", msg=" + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail());
// Report the failure to the callable; the flag is true whenever more than one attempt is allowed.
callable.throwable(t, maxAttempts != 1);
// Capture the failure together with the current time and this caller's description.
RetriesExhaustedException.ThrowableWithExtraContext qt = new RetriesExhaustedException.ThrowableWithExtraContext(t, EnvironmentEdgeManager.currentTime(), toString());
// This was the last permitted attempt: give up.
if (tries >= maxAttempts - 1) {
throw new RetriesExhaustedException(tries, exceptions);
// If the server is dead, we need to wait a little before retrying, to give
// a chance to the regions to be moved
// get right pause time, start by RETRY_BACKOFF[0] * pauseBase, where pauseBase might be
// special when encountering CallQueueTooBigException, see #HBASE-17114
long pauseBase = (t instanceof CallQueueTooBigException) ? pauseForCQTBE : pause;
expectedSleep = callable.sleep(pauseBase, tries);
// If, after the planned sleep, there won't be enough time left, we stop now.
long duration = singleCallDuration(expectedSleep);
if (duration > callTimeout) {
String msg = "callTimeout=" + callTimeout + ", callDuration=" + duration + ": " + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail();
// The translated failure is attached as the cause of the timeout.
throw (SocketTimeoutException) (new SocketTimeoutException(msg).initCause(t));
} finally {
// NOTE(review): the statement that performs the wait is not present in this excerpt; only
// the cancellation checks around it are visible.
try {
if (expectedSleep > 0) {
synchronized (cancelled) {
if (cancelled.get())
return null;
if (cancelled.get())
return null;
} catch (InterruptedException e) {
throw new InterruptedIOException("Interrupted after " + tries + " tries while maxAttempts=" + maxAttempts);
use of org.apache.hadoop.hbase.CallQueueTooBigException in project hbase by apache.
the class HRegionServer method reportRegionStateTransition.
/**
 * Sends a region state transition report through the region server status stub, retrying
 * while the cluster connection exists and is open.
 *
 * @param context description of the transition; converted into the request that is sent
 * @return {@code true} once a report call has returned a response; {@code false} when the
 *   loop ends because the connection is null or closed. NOTE(review): the visible code also
 *   returns the result of {@code skipReportingTransition(context)} - see the note in the body.
 */
public boolean reportRegionStateTransition(final RegionStateTransitionContext context) {
// NOTE(review): the condition guarding this early return is missing in this excerpt; as
// written, nothing after this line would execute.
return skipReportingTransition(context);
final ReportRegionStateTransitionRequest request = createReportRegionStateTransitionRequest(context);
// NOTE(review): the statement that advances this counter is not present in this excerpt.
int tries = 0;
long pauseTime = this.retryPauseTime;
// Keep trying for as long as the cluster connection exists and has not been closed.
// NOTE(review): the original comment here was cut down to "HRegionServer does down."
while (this.asyncClusterConnection != null && !this.asyncClusterConnection.isClosed()) {
// Work on a local copy so it can be compared with the field after a failure.
RegionServerStatusService.BlockingInterface rss = rssStub;
try {
// NOTE(review): the body of this null check is missing in this excerpt; presumably the
// stub is re-created and the loop restarted - confirm against the real source.
if (rss == null) {
ReportRegionStateTransitionResponse response = rss.reportRegionStateTransition(null, request);
// NOTE(review): the logging calls that consumed the two messages below are missing here.
if (response.hasErrorMessage()) {"TRANSITION FAILED " + request + ": " + response.getErrorMessage());
// Report success when a retry was needed (tries > 0) or trace logging is enabled, so that a
// success following a logged failure is visible.
if (tries > 0 || LOG.isTraceEnabled()) {"TRANSITION REPORTED " + request);
// NOTE: Return mid-method!!!
return true;
} catch (ServiceException se) {
// Extract the IOException carried by the ServiceException.
IOException ioe = ProtobufUtil.getRemoteException(se);
// These three failure types cause a pause before the next attempt.
boolean pause = ioe instanceof ServerNotRunningYetException || ioe instanceof PleaseHoldException || ioe instanceof CallQueueTooBigException;
if (pause) {
// Do backoff else we flood the Master with requests.
pauseTime = ConnectionUtils.getPauseTime(this.retryPauseTime, tries);
} else {
// Reset.
pauseTime = this.retryPauseTime;
}"Failed report transition " + TextFormat.shortDebugString(request) + "; retry (#" + tries + ")" + (pause ? " after " + pauseTime + "ms delay (Master is coming online...)." : " immediately."), ioe);
// NOTE(review): the statement that actually waits for pauseTime is missing in this excerpt.
if (pause) {
// Drop the shared stub only if it is still the one this attempt used.
if (rssStub == rss) {
rssStub = null;
// Reached when the loop ends without a successful report.
return false;