
Example 1 with PreemptiveFastFailException

Use of org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException in project hbase by apache.

From the class RpcRetryingCallerImpl, method callWithRetries:

@Override
public T callWithRetries(RetryingCallable<T> callable, int callTimeout) throws IOException, RuntimeException {
    List<RetriesExhaustedException.ThrowableWithExtraContext> exceptions = new ArrayList<>();
    tracker.start();
    context.clear();
    for (int tries = 0; ; tries++) {
        long expectedSleep;
        try {
            // bad cache entries are cleared in the call to RetryingCallable#throwable() in catch block
            callable.prepare(tries != 0);
            interceptor.intercept(context.prepare(callable, tries));
            return callable.call(getTimeout(callTimeout));
        } catch (PreemptiveFastFailException e) {
            // The fast-fail interceptor decided the target server is unreachable:
            // rethrow immediately instead of spending the retry budget on it.
            throw e;
        } catch (Throwable t) {
            Throwable e = t.getCause();
            ExceptionUtil.rethrowIfInterrupt(t);
            // translateException throws exception when should not retry: i.e. when request is bad.
            interceptor.handleFailure(context, t);
            t = translateException(t);
            if (tries > startLogErrorsCnt) {
                LOG.info("Call exception, tries=" + tries + ", maxAttempts=" + maxAttempts + ", started=" + (EnvironmentEdgeManager.currentTime() - tracker.getStartTime()) + " ms ago, " + "cancelled=" + cancelled.get() + ", msg=" + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail());
            }
            callable.throwable(t, maxAttempts != 1);
            RetriesExhaustedException.ThrowableWithExtraContext qt = new RetriesExhaustedException.ThrowableWithExtraContext(t, EnvironmentEdgeManager.currentTime(), toString());
            exceptions.add(qt);
            if (tries >= maxAttempts - 1) {
                throw new RetriesExhaustedException(tries, exceptions);
            }
            // If the server is dead, we need to wait a little before retrying, to give
            // a chance to the regions to be moved
            // get right pause time, start by RETRY_BACKOFF[0] * pauseBase, where pauseBase might be
            // special when encountering CallQueueTooBigException, see #HBASE-17114
            long pauseBase = (t instanceof CallQueueTooBigException) ? pauseForCQTBE : pause;
            expectedSleep = callable.sleep(pauseBase, tries);
            // If, after the planned sleep, there won't be enough time left, we stop now.
            long duration = singleCallDuration(expectedSleep);
            if (duration > callTimeout) {
                String msg = "callTimeout=" + callTimeout + ", callDuration=" + duration + ": " + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail();
                throw (SocketTimeoutException) (new SocketTimeoutException(msg).initCause(t));
            }
        } finally {
            interceptor.updateFailureInfo(context);
        }
        try {
            if (expectedSleep > 0) {
                synchronized (cancelled) {
                    if (cancelled.get())
                        return null;
                    cancelled.wait(expectedSleep);
                }
            }
            if (cancelled.get())
                return null;
        } catch (InterruptedException e) {
            throw new InterruptedIOException("Interrupted after " + tries + " tries while maxAttempts=" + maxAttempts);
        }
    }
}
Also used: InterruptedIOException(java.io.InterruptedIOException), CallQueueTooBigException(org.apache.hadoop.hbase.CallQueueTooBigException), ArrayList(java.util.ArrayList), PreemptiveFastFailException(org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException), SocketTimeoutException(java.net.SocketTimeoutException)
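
Because callWithRetries rethrows PreemptiveFastFailException without retrying, the exception reaches application code as soon as the fast-fail interceptor decides a server is unreachable. Below is a minimal, illustrative sketch (not part of the HBase source) of how a caller might handle that; the class and helper names are hypothetical, while Table, Get, Connection, and PreemptiveFastFailException are the real client APIs.

import java.io.IOException;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException;

public class FastFailAwareReader {

    // Hypothetical helper: returns null instead of propagating the failure when the
    // client is in fast-fail mode for the target server.
    public static Result getOrNull(Connection connection, String tableName, byte[] row) throws IOException {
        try (Table table = connection.getTable(TableName.valueOf(tableName))) {
            return table.get(new Get(row));
        } catch (PreemptiveFastFailException e) {
            // The client failed fast because the region server looks dead;
            // degrade gracefully rather than blocking on retries.
            return null;
        }
    }
}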

Example 2 with PreemptiveFastFailException

Use of org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException in project hbase by apache.

From the class TestFastFail, method testFastFail:

@Ignore("Can go zombie -- see HBASE-14421; FIX")
@Test
public void testFastFail() throws IOException, InterruptedException {
    Admin admin = TEST_UTIL.getAdmin();
    final String tableName = name.getMethodName();
    HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(Bytes.toBytes(tableName)));
    desc.addFamily(new HColumnDescriptor(FAMILY));
    admin.createTable(desc, Bytes.toBytes("aaaa"), Bytes.toBytes("zzzz"), 32);
    final long numRows = 1000;
    Configuration conf = TEST_UTIL.getConfiguration();
    conf.setLong(HConstants.HBASE_CLIENT_OPERATION_TIMEOUT, SLEEPTIME * 100);
    conf.setInt(HConstants.HBASE_CLIENT_PAUSE, SLEEPTIME / 10);
    conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
    conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_THREASHOLD_MS, 0);
    conf.setClass(HConstants.HBASE_CLIENT_FAST_FAIL_INTERCEPTOR_IMPL, MyPreemptiveFastFailInterceptor.class, PreemptiveFastFailInterceptor.class);
    final Connection connection = ConnectionFactory.createConnection(conf);
    /**
     * Write numRows worth of data, so that the workers can arbitrarily read.
     */
    List<Put> puts = new ArrayList<>();
    for (long i = 0; i < numRows; i++) {
        byte[] rowKey = longToByteArrayKey(i);
        Put put = new Put(rowKey);
        // value is the same as the row key
        byte[] value = rowKey;
        put.addColumn(FAMILY, QUALIFIER, value);
        puts.add(put);
    }
    try (Table table = connection.getTable(TableName.valueOf(tableName))) {
        table.put(puts);
        LOG.info("Written all puts.");
    }
    /**
     * The number of threads that are going to perform actions against the test
     * table.
     */
    int nThreads = 100;
    ExecutorService service = Executors.newFixedThreadPool(nThreads);
    final CountDownLatch continueOtherHalf = new CountDownLatch(1);
    final CountDownLatch doneHalfway = new CountDownLatch(nThreads);
    final AtomicInteger numSuccessfullThreads = new AtomicInteger(0);
    final AtomicInteger numFailedThreads = new AtomicInteger(0);
    // The total time taken by the threads to perform the second get.
    final AtomicLong totalTimeTaken = new AtomicLong(0);
    final AtomicInteger numBlockedWorkers = new AtomicInteger(0);
    final AtomicInteger numPreemptiveFastFailExceptions = new AtomicInteger(0);
    List<Future<Boolean>> futures = new ArrayList<>();
    for (int i = 0; i < nThreads; i++) {
        futures.add(service.submit(new Callable<Boolean>() {

            /**
             * The workers are going to perform a couple of reads. The second read
             * will follow the killing of a regionserver so that we make sure that
             * some of the threads hit a PreemptiveFastFailException.
             */
            public Boolean call() throws Exception {
                try (Table table = connection.getTable(TableName.valueOf(tableName))) {
                    // Add some jitter here
                    Thread.sleep(Math.abs(random.nextInt()) % 250);
                    byte[] row = longToByteArrayKey(Math.abs(random.nextLong()) % numRows);
                    Get g = new Get(row);
                    g.addColumn(FAMILY, QUALIFIER);
                    try {
                        table.get(g);
                    } catch (Exception e) {
                        LOG.debug("Get failed : ", e);
                        doneHalfway.countDown();
                        return false;
                    }
                    // Done with one get, proceeding to do the next one.
                    doneHalfway.countDown();
                    continueOtherHalf.await();
                    long startTime = System.currentTimeMillis();
                    g = new Get(row);
                    g.addColumn(FAMILY, QUALIFIER);
                    try {
                        table.get(g);
                        // The get was successful
                        numSuccessfullThreads.addAndGet(1);
                    } catch (Exception e) {
                        if (e instanceof PreemptiveFastFailException) {
                            // We were issued a PreemptiveFastFailException
                            numPreemptiveFastFailExceptions.addAndGet(1);
                        }
                        // Irrespective of PFFE, the request failed.
                        numFailedThreads.addAndGet(1);
                        return false;
                    } finally {
                        long enTime = System.currentTimeMillis();
                        totalTimeTaken.addAndGet(enTime - startTime);
                        if ((enTime - startTime) >= SLEEPTIME) {
                            // Consider the slow workers as blocked workers.
                            // This assumes that the threads go full throttle at performing
                            // actions. If the thread scheduling itself is as slow as
                            // SLEEPTIME, this test might fail, so we might have to set
                            // SLEEPTIME to a higher number on slower machines.
                            numBlockedWorkers.addAndGet(1);
                        }
                    }
                    return true;
                } catch (Exception e) {
                    LOG.error("Caught unknown exception", e);
                    doneHalfway.countDown();
                    return false;
                }
            }
        }));
    }
    doneHalfway.await();
    // Kill a regionserver
    TEST_UTIL.getHBaseCluster().getRegionServer(0).getRpcServer().stop();
    TEST_UTIL.getHBaseCluster().getRegionServer(0).stop("Testing");
    // Let the threads continue going
    continueOtherHalf.countDown();
    Thread.sleep(2 * SLEEPTIME);
    // Start a RS in the cluster
    TEST_UTIL.getHBaseCluster().startRegionServer();
    int numThreadsReturnedFalse = 0;
    int numThreadsReturnedTrue = 0;
    int numThreadsThrewExceptions = 0;
    for (Future<Boolean> f : futures) {
        try {
            numThreadsReturnedTrue += f.get() ? 1 : 0;
            numThreadsReturnedFalse += f.get() ? 0 : 1;
        } catch (Exception e) {
            numThreadsThrewExceptions++;
        }
    }
    LOG.debug("numThreadsReturnedFalse:" + numThreadsReturnedFalse + " numThreadsReturnedTrue:" + numThreadsReturnedTrue + " numThreadsThrewExceptions:" + numThreadsThrewExceptions + " numFailedThreads:" + numFailedThreads.get() + " numSuccessfullThreads:" + numSuccessfullThreads.get() + " numBlockedWorkers:" + numBlockedWorkers.get() + " totalTimeWaited: " + totalTimeTaken.get() / (numBlockedWorkers.get() == 0 ? Long.MAX_VALUE : numBlockedWorkers.get()) + " numPFFEs: " + numPreemptiveFastFailExceptions.get());
    assertEquals("The expected number of all the successfull and the failed " + "threads should equal the total number of threads that we spawned", nThreads, numFailedThreads.get() + numSuccessfullThreads.get());
    assertEquals("All the failures should be coming from the secondput failure", numFailedThreads.get(), numThreadsReturnedFalse);
    assertEquals("Number of threads that threw execution exceptions " + "otherwise should be 0", numThreadsThrewExceptions, 0);
    assertEquals("The regionservers that returned true should equal to the" + " number of successful threads", numThreadsReturnedTrue, numSuccessfullThreads.get());
    assertTrue("There will be atleast one thread that retried instead of failing", MyPreemptiveFastFailInterceptor.numBraveSouls.get() > 0);
    assertTrue("There will be atleast one PreemptiveFastFail exception," + " otherwise, the test makes little sense." + "numPreemptiveFastFailExceptions: " + numPreemptiveFastFailExceptions.get(), numPreemptiveFastFailExceptions.get() > 0);
    assertTrue("Only few thread should ideally be waiting for the dead " + "regionserver to be coming back. numBlockedWorkers:" + numBlockedWorkers.get() + " threads that retried : " + MyPreemptiveFastFailInterceptor.numBraveSouls.get(), numBlockedWorkers.get() <= MyPreemptiveFastFailInterceptor.numBraveSouls.get());
}
Also used: Configuration(org.apache.hadoop.conf.Configuration), HBaseConfiguration(org.apache.hadoop.hbase.HBaseConfiguration), ArrayList(java.util.ArrayList), Callable(java.util.concurrent.Callable), HColumnDescriptor(org.apache.hadoop.hbase.HColumnDescriptor), PreemptiveFastFailException(org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException), CountDownLatch(java.util.concurrent.CountDownLatch), IOException(java.io.IOException), HTableDescriptor(org.apache.hadoop.hbase.HTableDescriptor), AtomicLong(java.util.concurrent.atomic.AtomicLong), AtomicInteger(java.util.concurrent.atomic.AtomicInteger), ExecutorService(java.util.concurrent.ExecutorService), Future(java.util.concurrent.Future), Ignore(org.junit.Ignore), Test(org.junit.Test)
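
The test above enables fast-fail mode entirely through client configuration. The sketch below isolates those settings; the HConstants keys are the ones set in testFastFail, while the threshold value and the standalone class are illustrative assumptions, not recommendations.

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;

public class FastFailClientConfig {

    // Hypothetical factory method: builds a connection with preemptive fast fail enabled.
    public static Connection createFastFailConnection() throws IOException {
        Configuration conf = HBaseConfiguration.create();
        // Turn on preemptive fast fail in the client.
        conf.setBoolean(HConstants.HBASE_CLIENT_FAST_FAIL_MODE_ENABLED, true);
        // How long a server must look unreachable before the client starts failing fast
        // (illustrative value; note the constant really is spelled "THREASHOLD" in HConstants).
        conf.setLong(HConstants.HBASE_CLIENT_FAST_FAIL_THREASHOLD_MS, 60000L);
        return ConnectionFactory.createConnection(conf);
    }
}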

Aggregations

ArrayList (java.util.ArrayList) 2
PreemptiveFastFailException (org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException) 2
IOException (java.io.IOException) 1
InterruptedIOException (java.io.InterruptedIOException) 1
SocketTimeoutException (java.net.SocketTimeoutException) 1
Callable (java.util.concurrent.Callable) 1
CountDownLatch (java.util.concurrent.CountDownLatch) 1
ExecutorService (java.util.concurrent.ExecutorService) 1
Future (java.util.concurrent.Future) 1
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 1
AtomicLong (java.util.concurrent.atomic.AtomicLong) 1
Configuration (org.apache.hadoop.conf.Configuration) 1
CallQueueTooBigException (org.apache.hadoop.hbase.CallQueueTooBigException) 1
HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration) 1
HColumnDescriptor (org.apache.hadoop.hbase.HColumnDescriptor) 1
HTableDescriptor (org.apache.hadoop.hbase.HTableDescriptor) 1
Ignore (org.junit.Ignore) 1
Test (org.junit.Test) 1