use of java.net.SocketTimeoutException in project hbase by apache.
the class HBaseInterClusterReplicationEndpoint method replicate.
/**
* Do the shipping logic
*/
@Override
public boolean replicate(ReplicateContext replicateContext) {
CompletionService<Integer> pool = new ExecutorCompletionService<>(this.exec);
List<Entry> entries = replicateContext.getEntries();
String walGroupId = replicateContext.getWalGroupId();
int sleepMultiplier = 1;
int numReplicated = 0;
if (!peersSelected && this.isRunning()) {
connectToPeers();
peersSelected = true;
}
int numSinks = replicationSinkMgr.getNumSinks();
if (numSinks == 0) {
LOG.warn("No replication sinks found, returning without replicating. The source should retry" + " with the same set of edits.");
return false;
}
// minimum of: configured threads, number of 100-waledit batches,
// and number of current sinks
int n = Math.min(Math.min(this.maxThreads, entries.size() / 100 + 1), numSinks);
List<List<Entry>> entryLists = new ArrayList<>(n);
if (n == 1) {
entryLists.add(entries);
} else {
for (int i = 0; i < n; i++) {
entryLists.add(new ArrayList<>(entries.size() / n + 1));
}
// now group by region
for (Entry e : entries) {
entryLists.get(Math.abs(Bytes.hashCode(e.getKey().getEncodedRegionName()) % n)).add(e);
}
}
while (this.isRunning() && !exec.isShutdown()) {
if (!isPeerEnabled()) {
if (sleepForRetries("Replication is disabled", sleepMultiplier)) {
sleepMultiplier++;
}
continue;
}
try {
if (LOG.isTraceEnabled()) {
LOG.trace("Replicating " + entries.size() + " entries of total size " + replicateContext.getSize());
}
int futures = 0;
for (int i = 0; i < entryLists.size(); i++) {
if (!entryLists.get(i).isEmpty()) {
if (LOG.isTraceEnabled()) {
LOG.trace("Submitting " + entryLists.get(i).size() + " entries of total size " + replicateContext.getSize());
}
// RuntimeExceptions encountered here bubble up and are handled in ReplicationSource
pool.submit(createReplicator(entryLists.get(i), i));
futures++;
}
}
IOException iox = null;
for (int i = 0; i < futures; i++) {
try {
// wait for all futures, remove successful parts
// (only the remaining parts will be retried)
Future<Integer> f = pool.take();
int index = f.get().intValue();
int batchSize = entryLists.get(index).size();
entryLists.set(index, Collections.<Entry>emptyList());
// Now, we have marked the batch as done replicating, record its size
numReplicated += batchSize;
} catch (InterruptedException ie) {
iox = new IOException(ie);
} catch (ExecutionException ee) {
// cause must be an IOException
iox = (IOException) ee.getCause();
}
}
if (iox != null) {
// if we had any exceptions, try again
throw iox;
}
if (numReplicated != entries.size()) {
// Something went wrong here and we don't know what, let's just fail and retry.
LOG.warn("The number of edits replicated is different from the number received," + " failing for now.");
return false;
}
// update metrics
this.metrics.setAgeOfLastShippedOp(entries.get(entries.size() - 1).getKey().getWriteTime(), walGroupId);
return true;
} catch (IOException ioe) {
// Didn't ship anything, but must still age the last time we did
this.metrics.refreshAgeOfLastShippedOp(walGroupId);
if (ioe instanceof RemoteException) {
ioe = ((RemoteException) ioe).unwrapRemoteException();
LOG.warn("Can't replicate because of an error on the remote cluster: ", ioe);
if (ioe instanceof TableNotFoundException) {
if (sleepForRetries("A table is missing in the peer cluster. " + "Replication cannot proceed without losing data.", sleepMultiplier)) {
sleepMultiplier++;
}
} else if (ioe instanceof SaslException) {
LOG.warn("Peer encountered SaslException, rechecking all sinks: ", ioe);
replicationSinkMgr.chooseSinks();
}
} else {
if (ioe instanceof SocketTimeoutException) {
// This exception means we waited for more than 60s and nothing
// happened, the cluster is alive and calling it right away
// even for a test just makes things worse.
sleepForRetries("Encountered a SocketTimeoutException. Since the " + "call to the remote cluster timed out, which is usually " + "caused by a machine failure or a massive slowdown", this.socketTimeoutMultiplier);
} else if (ioe instanceof ConnectException) {
LOG.warn("Peer is unavailable, rechecking all sinks: ", ioe);
replicationSinkMgr.chooseSinks();
} else {
LOG.warn("Can't replicate because of a local or network error: ", ioe);
}
}
if (sleepForRetries("Since we are unable to replicate", sleepMultiplier)) {
sleepMultiplier++;
}
}
}
// in case we exited before replicating
return false;
}
use of java.net.SocketTimeoutException in project hbase by apache.
the class BlockingRpcConnection method processResponseForConnectionHeader.
private void processResponseForConnectionHeader() throws IOException {
// if no response excepted, return
if (!waitingConnectionHeaderResponse)
return;
try {
// read the ConnectionHeaderResponse from server
int len = this.in.readInt();
byte[] buff = new byte[len];
int readSize = this.in.read(buff);
if (LOG.isDebugEnabled()) {
LOG.debug("Length of response for connection header:" + readSize);
}
RPCProtos.ConnectionHeaderResponse connectionHeaderResponse = RPCProtos.ConnectionHeaderResponse.parseFrom(buff);
// Get the CryptoCipherMeta, update the HBaseSaslRpcClient for Crypto Cipher
if (connectionHeaderResponse.hasCryptoCipherMeta()) {
negotiateCryptoAes(connectionHeaderResponse.getCryptoCipherMeta());
}
waitingConnectionHeaderResponse = false;
} catch (SocketTimeoutException ste) {
LOG.fatal("Can't get the connection header response for rpc timeout, please check if" + " server has the correct configuration to support the additional function.", ste);
// timeout when waiting the connection header response, ignore the additional function
throw new IOException("Timeout while waiting connection header response", ste);
}
}
use of java.net.SocketTimeoutException in project hbase by apache.
the class RpcRetryingCallerImpl method callWithRetries.
@Override
public T callWithRetries(RetryingCallable<T> callable, int callTimeout) throws IOException, RuntimeException {
List<RetriesExhaustedException.ThrowableWithExtraContext> exceptions = new ArrayList<>();
tracker.start();
context.clear();
for (int tries = 0; ; tries++) {
long expectedSleep;
try {
// bad cache entries are cleared in the call to RetryingCallable#throwable() in catch block
callable.prepare(tries != 0);
interceptor.intercept(context.prepare(callable, tries));
return callable.call(getTimeout(callTimeout));
} catch (PreemptiveFastFailException e) {
throw e;
} catch (Throwable t) {
Throwable e = t.getCause();
ExceptionUtil.rethrowIfInterrupt(t);
// translateException throws exception when should not retry: i.e. when request is bad.
interceptor.handleFailure(context, t);
t = translateException(t);
if (tries > startLogErrorsCnt) {
LOG.info("Call exception, tries=" + tries + ", maxAttempts=" + maxAttempts + ", started=" + (EnvironmentEdgeManager.currentTime() - tracker.getStartTime()) + " ms ago, " + "cancelled=" + cancelled.get() + ", msg=" + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail());
}
callable.throwable(t, maxAttempts != 1);
RetriesExhaustedException.ThrowableWithExtraContext qt = new RetriesExhaustedException.ThrowableWithExtraContext(t, EnvironmentEdgeManager.currentTime(), toString());
exceptions.add(qt);
if (tries >= maxAttempts - 1) {
throw new RetriesExhaustedException(tries, exceptions);
}
// If the server is dead, we need to wait a little before retrying, to give
// a chance to the regions to be moved
// get right pause time, start by RETRY_BACKOFF[0] * pauseBase, where pauseBase might be
// special when encountering CallQueueTooBigException, see #HBASE-17114
long pauseBase = (t instanceof CallQueueTooBigException) ? pauseForCQTBE : pause;
expectedSleep = callable.sleep(pauseBase, tries);
// If, after the planned sleep, there won't be enough time left, we stop now.
long duration = singleCallDuration(expectedSleep);
if (duration > callTimeout) {
String msg = "callTimeout=" + callTimeout + ", callDuration=" + duration + ": " + t.getMessage() + " " + callable.getExceptionMessageAdditionalDetail();
throw (SocketTimeoutException) (new SocketTimeoutException(msg).initCause(t));
}
} finally {
interceptor.updateFailureInfo(context);
}
try {
if (expectedSleep > 0) {
synchronized (cancelled) {
if (cancelled.get())
return null;
cancelled.wait(expectedSleep);
}
}
if (cancelled.get())
return null;
} catch (InterruptedException e) {
throw new InterruptedIOException("Interrupted after " + tries + " tries while maxAttempts=" + maxAttempts);
}
}
}
use of java.net.SocketTimeoutException in project hbase by apache.
the class TestClientOperationInterrupt method testInterrupt50Percent.
@Test
public void testInterrupt50Percent() throws IOException, InterruptedException {
final AtomicInteger noEx = new AtomicInteger(0);
final AtomicInteger badEx = new AtomicInteger(0);
final AtomicInteger noInt = new AtomicInteger(0);
final AtomicInteger done = new AtomicInteger(0);
List<Thread> threads = new ArrayList<>();
final int nbThread = 100;
for (int i = 0; i < nbThread; i++) {
Thread t = new Thread() {
@Override
public void run() {
try {
Table ht = util.getConnection().getTable(tableName);
Result r = ht.get(new Get(row1));
noEx.incrementAndGet();
} catch (IOException e) {
LOG.info("exception", e);
if (!(e instanceof InterruptedIOException) || (e instanceof SocketTimeoutException)) {
badEx.incrementAndGet();
} else {
if (Thread.currentThread().isInterrupted()) {
noInt.incrementAndGet();
LOG.info("The thread should NOT be with the 'interrupt' status.");
}
}
} finally {
done.incrementAndGet();
}
}
};
t.setName("TestClientOperationInterrupt #" + i);
threads.add(t);
t.start();
}
for (int i = 0; i < nbThread / 2; i++) {
threads.get(i).interrupt();
}
boolean stillAlive = true;
while (stillAlive) {
stillAlive = false;
for (Thread t : threads) {
if (t.isAlive()) {
stillAlive = true;
}
}
Threads.sleep(10);
}
Assert.assertFalse(Thread.currentThread().isInterrupted());
Assert.assertTrue(" noEx: " + noEx.get() + ", badEx=" + badEx.get() + ", noInt=" + noInt.get(), noEx.get() == nbThread / 2 && badEx.get() == 0);
// The problem here is that we need the server to free its handlers to handle all operations
while (done.get() != nbThread) {
Thread.sleep(1);
}
Table ht = util.getConnection().getTable(tableName);
Result r = ht.get(new Get(row1));
Assert.assertFalse(r.isEmpty());
}
use of java.net.SocketTimeoutException in project hbase by apache.
the class TestHCM method testGetOperationTimeout.
/**
* Test that an operation can fail if we read the global operation timeout, even if the
* individual timeout is fine. We do that with:
* - client side: an operation timeout of 30 seconds
* - server side: we sleep 20 second at each attempt. The first work fails, the second one
* succeeds. But the client won't wait that much, because 20 + 20 > 30, so the client
* timeouted when the server answers.
*/
@Test
public void testGetOperationTimeout() throws Exception {
HTableDescriptor hdt = TEST_UTIL.createTableDescriptor(TableName.valueOf(name.getMethodName()));
hdt.addCoprocessor(SleepAndFailFirstTime.class.getName());
Table table = TEST_UTIL.createTable(hdt, new byte[][] { FAM_NAM }, TEST_UTIL.getConfiguration());
table.setRpcTimeout(Integer.MAX_VALUE);
SleepAndFailFirstTime.ct.set(0);
// Check that it works if the timeout is big enough
table.setOperationTimeout(120 * 1000);
table.get(new Get(FAM_NAM));
// Resetting and retrying. Will fail this time, not enough time for the second try
SleepAndFailFirstTime.ct.set(0);
try {
table.setOperationTimeout(30 * 1000);
table.get(new Get(FAM_NAM));
Assert.fail("We expect an exception here");
} catch (SocketTimeoutException e) {
// The client has a CallTimeout class, but it's not shared.We're not very clean today,
// in the general case you can expect the call to stop, but the exception may vary.
// In this test however, we're sure that it will be a socket timeout.
LOG.info("We received an exception, as expected ", e);
} catch (IOException e) {
Assert.fail("Wrong exception:" + e.getMessage());
} finally {
table.close();
}
}
Aggregations