Example 26 with SocketTimeoutException

   * Do the shipping logic
/**
 * Ships the entries carried by {@code replicateContext}, looping while this endpoint is running
 * and the executor has not been shut down.
 *
 * <p>The entries are partitioned into at most {@code n} batches, every non-empty batch is
 * submitted through an {@link ExecutorCompletionService}, and each batch that completes is
 * swapped for an empty list so that a later pass only submits what is still outstanding.
 *
 * <p>NOTE(review): this listing appears to have lost lines (closing braces and some statements);
 * the comments below describe only the statements that are actually visible.
 *
 * @param replicateContext supplies the entries, their total size and the WAL group id
 * @return {@code true} once the replicated count equals the number of entries; {@code false}
 *         when there are no sinks, when the counts differ, or when the loop exits first
 */
public boolean replicate(ReplicateContext replicateContext) {
    // Completion service on top of this endpoint's executor, used to collect finished batches.
    CompletionService<Integer> pool = new ExecutorCompletionService<>(this.exec);
    List<Entry> entries = replicateContext.getEntries();
    String walGroupId = replicateContext.getWalGroupId();
    int sleepMultiplier = 1;
    // Running total of entries belonging to batches that completed successfully.
    int numReplicated = 0;
    if (!peersSelected && this.isRunning()) {
        peersSelected = true;
    int numSinks = replicationSinkMgr.getNumSinks();
    if (numSinks == 0) {
        LOG.warn("No replication sinks found, returning without replicating. The source should retry" + " with the same set of edits.");
        return false;
    // minimum of: configured threads, number of 100-waledit batches,
    //  and number of current sinks
    int n = Math.min(Math.min(this.maxThreads, entries.size() / 100 + 1), numSinks);
    List<List<Entry>> entryLists = new ArrayList<>(n);
    // NOTE(review): the single-batch branch has no visible body in this listing.
    if (n == 1) {
    } else {
        // Pre-size each batch for an even split of the entries.
        for (int i = 0; i < n; i++) {
            entryLists.add(new ArrayList<>(entries.size() / n + 1));
        // now group by region
        // The modulo is taken before Math.abs, so the argument is always within (-n, n) and
        // the resulting index is within [0, n).
        for (Entry e : entries) {
            entryLists.get(Math.abs(Bytes.hashCode(e.getKey().getEncodedRegionName()) % n)).add(e);
    while (this.isRunning() && !exec.isShutdown()) {
        // While the peer is disabled, back off instead of shipping.
        if (!isPeerEnabled()) {
            if (sleepForRetries("Replication is disabled", sleepMultiplier)) {
        try {
            if (LOG.isTraceEnabled()) {
                LOG.trace("Replicating " + entries.size() + " entries of total size " + replicateContext.getSize());
            // Number of submitted batches to wait for below.
            // NOTE(review): no increment of this counter is visible in the listing.
            int futures = 0;
            for (int i = 0; i < entryLists.size(); i++) {
                // Batches emptied by an earlier successful pass are skipped.
                if (!entryLists.get(i).isEmpty()) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Submitting " + entryLists.get(i).size() + " entries of total size " + replicateContext.getSize());
                    // RuntimeExceptions encountered here bubble up and are handled in ReplicationSource
                    pool.submit(createReplicator(entryLists.get(i), i));
            // Holds the most recent failure seen while draining the completion service.
            IOException iox = null;
            for (int i = 0; i < futures; i++) {
                try {
                    // wait for all futures, remove successful parts
                    // (only the remaining parts will be retried)
                    Future<Integer> f = pool.take();
                    // The future's result is used as the index of the batch in entryLists.
                    int index = f.get().intValue();
                    int batchSize = entryLists.get(index).size();
                    entryLists.set(index, Collections.<Entry>emptyList());
                    // Now, we have marked the batch as done replicating, record its size
                    numReplicated += batchSize;
                } catch (InterruptedException ie) {
                    // The interruption is converted into an IOException and handled like any
                    // other failure further down.
                    iox = new IOException(ie);
                } catch (ExecutionException ee) {
                    // cause must be an IOException
                    // (any other cause type would make this cast throw ClassCastException)
                    iox = (IOException) ee.getCause();
            if (iox != null) {
                // if we had any exceptions, try again
                throw iox;
            if (numReplicated != entries.size()) {
                // Something went wrong here and we don't know what, let's just fail and retry.
                LOG.warn("The number of edits replicated is different from the number received," + " failing for now.");
                return false;
            // update metrics
            // The write time of the last entry is reported for this WAL group.
            this.metrics.setAgeOfLastShippedOp(entries.get(entries.size() - 1).getKey().getWriteTime(), walGroupId);
            return true;
        } catch (IOException ioe) {
            // Didn't ship anything, but must still age the last time we did
            if (ioe instanceof RemoteException) {
                // Unwrap so the instanceof checks below see the exception raised remotely.
                ioe = ((RemoteException) ioe).unwrapRemoteException();
                LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);
                if (ioe instanceof TableNotFoundException) {
                    if (sleepForRetries("A table is missing in the peer cluster. " + "Replication cannot proceed without losing data.", sleepMultiplier)) {
                } else if (ioe instanceof SaslException) {
                    LOG.warn("Peer encountered SaslException, rechecking all sinks: ", ioe);
            } else {
                if (ioe instanceof SocketTimeoutException) {
                    // This exception means we waited for more than 60s and nothing
                    // happened, the cluster is alive and calling it right away
                    // even for a test just makes things worse.
                    // Note that this sleep uses socketTimeoutMultiplier, not sleepMultiplier.
                    sleepForRetries("Encountered a SocketTimeoutException. Since the " + "call to the remote cluster timed out, which is usually " + "caused by a machine failure or a massive slowdown", this.socketTimeoutMultiplier);
                } else if (ioe instanceof ConnectException) {
                    LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);
                } else {
                    LOG.warn("Can't replicate because of a local or network error: ", ioe);
            if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {
    // in case we exited before replicating
    return false;
Also used : ArrayList(java.util.ArrayList) ExecutorCompletionService(java.util.concurrent.ExecutorCompletionService) IOException(java.io.IOException) SaslException(javax.security.sasl.SaslException) HBaseReplicationEndpoint(org.apache.hadoop.hbase.replication.HBaseReplicationEndpoint) TableNotFoundException(org.apache.hadoop.hbase.TableNotFoundException) Entry(org.apache.hadoop.hbase.wal.WAL.Entry) SocketTimeoutException(java.net.SocketTimeoutException) ArrayList(java.util.ArrayList) List(java.util.List) ExecutionException(java.util.concurrent.ExecutionException) RemoteException(org.apache.hadoop.ipc.RemoteException) ConnectException(java.net.ConnectException)

Example 27 with SocketTimeoutException

/**
 * Reads and parses the server's {@code ConnectionHeaderResponse} while one is still awaited,
 * then clears the {@code waitingConnectionHeaderResponse} flag.
 *
 * <p>NOTE(review): this listing appears to have lost lines and expressions (closing braces,
 * the right-hand sides of the two reads); comments describe only what is visible.
 *
 * @throws IOException when waiting for the response times out (the timeout is kept as the cause)
 */
private void processResponseForConnectionHeader() throws IOException {
    // if no response expected, return
    if (!waitingConnectionHeaderResponse)
    try {
        // read the ConnectionHeaderResponse from server
        // NOTE(review): the expression that produces the length is missing from the listing.
        int len =;
        byte[] buff = new byte[len];
        // NOTE(review): the expression that fills buff is missing from the listing.
        int readSize =;
        if (LOG.isDebugEnabled()) {
            LOG.debug("Length of response for connection header:" + readSize);
        RPCProtos.ConnectionHeaderResponse connectionHeaderResponse = RPCProtos.ConnectionHeaderResponse.parseFrom(buff);
        // Get the CryptoCipherMeta, update the HBaseSaslRpcClient for Crypto Cipher
        if (connectionHeaderResponse.hasCryptoCipherMeta()) {
        // The response has been consumed, so it is no longer awaited.
        waitingConnectionHeaderResponse = false;
    } catch (SocketTimeoutException ste) {
        LOG.fatal("Can't get the connection header response for rpc timeout, please check if" + " server has the correct configuration to support the additional function.", ste);
        // timeout when waiting the connection header response, ignore the additional function
        // The timeout is rethrown as a plain IOException with the original exception as cause.
        throw new IOException("Timeout while waiting connection header response", ste);
Also used : SocketTimeoutException(java.net.SocketTimeoutException) RPCProtos(org.apache.hadoop.hbase.shaded.protobuf.generated.RPCProtos) DoNotRetryIOException(org.apache.hadoop.hbase.DoNotRetryIOException) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException)

Example 28 with SocketTimeoutException

/**
 * Invokes {@code callable} repeatedly, one attempt per loop iteration, until an attempt
 * succeeds, the attempts are exhausted, the time budget would be exceeded, or the call is
 * cancelled.
 *
 * <p>NOTE(review): this listing appears to have lost lines (closing braces, the actual call,
 * the logger call prefix, the sleep itself); comments describe only what is visible.
 *
 * @param callable the operation to run; it is prepared before each attempt and informed of
 *        every failure
 * @param callTimeout upper bound compared against {@code singleCallDuration(expectedSleep)}
 *        before sleeping for another attempt
 * @return {@code null} when the call has been cancelled; the success-path return is not visible
 *         in this listing
 * @throws IOException {@link RetriesExhaustedException} once {@code tries >= maxAttempts - 1},
 *         {@link SocketTimeoutException} when the planned sleep would exceed {@code callTimeout},
 *         {@link InterruptedIOException} when interrupted
 * @throws RuntimeException declared by the signature; its source is not visible in this listing
 */
public T callWithRetries(RetryingCallable<T> callable, int callTimeout) throws IOException, RuntimeException {
    // Failure contexts handed to RetriesExhaustedException when attempts run out.
    // NOTE(review): no statement adding to this list is visible in the listing.
    List<RetriesExhaustedException.ThrowableWithExtraContext> exceptions = new ArrayList<>();
    for (int tries = 0; ; tries++) {
        long expectedSleep;
        try {
            // bad cache entries are cleared in the call to RetryingCallable#throwable() in catch block
            // The argument is false on the first attempt and true on every retry.
            callable.prepare(tries != 0);
            interceptor.intercept(context.prepare(callable, tries));
        } catch (PreemptiveFastFailException e) {
            // Fast-fail exceptions bypass the retry handling below.
            throw e;
        } catch (Throwable t) {
            // NOTE(review): this local is not used by any visible statement.
            Throwable e = t.getCause();
            // translateException throws exception when should not retry: i.e. when request is bad.
            interceptor.handleFailure(context, t);
            t = translateException(t);
            // Only failures beyond the startLogErrorsCnt-th attempt reach the message below.
            if (tries > startLogErrorsCnt) {
      "Call exception, tries=" + tries + ", maxAttempts=" + maxAttempts + ", started=" + (EnvironmentEdgeManager.currentTime() - tracker.getStartTime()) + " ms ago, " + "cancelled=" + cancelled.get() + ", msg=" + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail());
            // Report the failure to the callable; the flag is true whenever more than one
            // attempt is configured.
            callable.throwable(t, maxAttempts != 1);
            RetriesExhaustedException.ThrowableWithExtraContext qt = new RetriesExhaustedException.ThrowableWithExtraContext(t, EnvironmentEdgeManager.currentTime(), toString());
            // This was the last permitted attempt.
            if (tries >= maxAttempts - 1) {
                throw new RetriesExhaustedException(tries, exceptions);
            // If the server is dead, we need to wait a little before retrying, to give
            // a chance to the regions to be moved
            // get right pause time, start by RETRY_BACKOFF[0] * pauseBase, where pauseBase might be
            // special when encountering CallQueueTooBigException, see #HBASE-17114
            long pauseBase = (t instanceof CallQueueTooBigException) ? pauseForCQTBE : pause;
            expectedSleep = callable.sleep(pauseBase, tries);
            // If, after the planned sleep, there won't be enough time left, we stop now.
            long duration = singleCallDuration(expectedSleep);
            if (duration > callTimeout) {
                String msg = "callTimeout=" + callTimeout + ", callDuration=" + duration + ": " + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail();
                // initCause returns Throwable, hence the cast back to SocketTimeoutException.
                throw (SocketTimeoutException) (new SocketTimeoutException(msg).initCause(t));
        } finally {
        try {
            if (expectedSleep > 0) {
                // Cancellation is checked under the monitor of the cancelled flag.
                // NOTE(review): the wait/sleep statement itself is not visible in the listing.
                synchronized (cancelled) {
                    if (cancelled.get())
                        return null;
            if (cancelled.get())
                return null;
        } catch (InterruptedException e) {
            throw new InterruptedIOException("Interrupted after " + tries + " tries while maxAttempts=" + maxAttempts);
Also used : InterruptedIOException(java.io.InterruptedIOException) CallQueueTooBigException(org.apache.hadoop.hbase.CallQueueTooBigException) ArrayList(java.util.ArrayList) PreemptiveFastFailException(org.apache.hadoop.hbase.exceptions.PreemptiveFastFailException) SocketTimeoutException(java.net.SocketTimeoutException)

Example 29 with SocketTimeoutException

/**
 * Creates {@code nbThread} threads that each perform a {@code Get} on {@code row1}, loops over
 * the first half of them, waits for every thread to die and then asserts on the counters.
 *
 * <p>NOTE(review): this listing appears to have lost lines (closing braces, counter updates,
 * the statements that start and interrupt the threads, the logger call prefixes); comments
 * describe only what is visible.
 */
public void testInterrupt50Percent() throws IOException, InterruptedException {
    // Counters checked by the final assertion and the wait loop.
    // NOTE(review): the statements that update them are not visible in the listing.
    final AtomicInteger noEx = new AtomicInteger(0);
    final AtomicInteger badEx = new AtomicInteger(0);
    final AtomicInteger noInt = new AtomicInteger(0);
    final AtomicInteger done = new AtomicInteger(0);
    List<Thread> threads = new ArrayList<>();
    final int nbThread = 100;
    for (int i = 0; i < nbThread; i++) {
        Thread t = new Thread() {

            public void run() {
                try {
                    Table ht = util.getConnection().getTable(tableName);
                    Result r = ht.get(new Get(row1));
                } catch (IOException e) {
          "exception", e);
                    // First branch: the exception is not an InterruptedIOException, or it is a
                    // SocketTimeoutException (itself a subclass of InterruptedIOException).
                    if (!(e instanceof InterruptedIOException) || (e instanceof SocketTimeoutException)) {
                    } else {
                        // A genuine InterruptedIOException: check the thread's interrupt flag.
                        if (Thread.currentThread().isInterrupted()) {
                  "The thread should NOT be with the 'interrupt' status.");
                } finally {
        t.setName("TestClientOperationInterrupt #" + i);
    // Iterates over half of the thread count; the loop body is not visible in the listing.
    for (int i = 0; i < nbThread / 2; i++) {
    // Poll until no worker thread is alive any more.
    boolean stillAlive = true;
    while (stillAlive) {
        stillAlive = false;
        for (Thread t : threads) {
            if (t.isAlive()) {
                stillAlive = true;
    // Exactly half of the threads must have finished without exception, and none with a bad one.
    Assert.assertTrue(" noEx: " + noEx.get() + ", badEx=" + badEx.get() + ", noInt=" + noInt.get(), noEx.get() == nbThread / 2 && badEx.get() == 0);
    // The problem here is that we need the server to free its handlers to handle all operations
    while (done.get() != nbThread) {
    // A final Get issued from the test thread itself.
    Table ht = util.getConnection().getTable(tableName);
    Result r = ht.get(new Get(row1));
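Also used : InterruptedIOException(java.io.InterruptedIOException) SocketTimeoutException(java.net.SocketTimeoutException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ArrayList(java.util.ArrayList) InterruptedIOException(java.io.InterruptedIOException) IOException(java.io.IOException) Test(org.junit.Test)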

Example 30 with SocketTimeoutException

   * Test that an operation can fail if we reach the global operation timeout, even if the
   * individual timeout is fine. We do that with:
   * - client side: an operation timeout of 30 seconds
   * - server side: we sleep 20 seconds at each attempt. The first attempt fails, the second one
   * succeeds. But the client won't wait that long, because 20 + 20 > 30, so the client
   * has already timed out when the server answers.
// Runs the same Get twice on a freshly created table: first with a 120 s operation timeout,
// then with a 30 s one, for which a SocketTimeoutException is the expected outcome.
// NOTE(review): this listing appears to have lost text (closing braces, the prefixes of the
// fail/log calls); comments describe only what is visible.
public void testGetOperationTimeout() throws Exception {
    // The table is named after the running test method.
    HTableDescriptor hdt = TEST_UTIL.createTableDescriptor(TableName.valueOf(name.getMethodName()));
    Table table = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, TEST_UTIL.getConfiguration());
    // Check that it works if the timeout is big enough
    table.setOperationTimeout(120 * 1000);
    table.get(new Get(FAM_NAM));
    // Resetting and retrying. Will fail this time, not enough time for the second try
    try {
        table.setOperationTimeout(30 * 1000);
        table.get(new Get(FAM_NAM));"We expect an exception here");
    } catch (SocketTimeoutException e) {
        // The client has a CallTimeout class, but it's not shared. We're not very clean today,
        //  in the general case you can expect the call to stop, but the exception may vary.
        // In this test however, we're sure that it will be a socket timeout."We received an exception, as expected ", e);
    } catch (IOException e) {"Wrong exception:" + e.getMessage());
    } finally {
Also used : SocketTimeoutException(java.net.SocketTimeoutException) IOException(java.io.IOException) HTableDescriptor(org.apache.hadoop.hbase.HTableDescriptor) Test(org.junit.Test)


