use of org.apache.hadoop.hbase.util.RetryCounterFactory in project hbase by apache.
Example from the class HMaster, method isRegionOnline.
/**
 * Blocks until the given region is online and hosted on a live server, or the master stops.
 * Polls the assignment state, backing off up to once a minute between checks.
 * @param ri the region to wait on
 * @return True if region is online and scannable else false if an error or shutdown (Otherwise
 *   we just block in here holding up all forward-progress).
 */
private boolean isRegionOnline(RegionInfo ri) {
  RetryCounter rc = null;
  while (!isStopped()) {
    RegionState rs = this.assignmentManager.getRegionStates().getRegionState(ri);
    // Guard against null: the assignment manager may not know this region yet; treat that
    // the same as "not OPEN" and keep waiting rather than NPE-ing.
    if (rs != null && rs.isOpened()) {
      if (this.getServerManager().isServerOnline(rs.getServerName())) {
        return true;
      }
    }
    // Region is not OPEN. Note whether any ServerCrashProcedure is running for the log below.
    Optional<Procedure<MasterProcedureEnv>> optProc = this.procedureExecutor.getProcedures()
      .stream().filter(p -> p instanceof ServerCrashProcedure).findAny();
    // TODO: Add a page to refguide on how to do repair. Have this log message point to it.
    // Page will talk about loss of edits, how to schedule at least the meta WAL recovery, and
    // then how to assign including how to break region lock if one held.
    LOG.warn("{} is NOT online; state={}; ServerCrashProcedures={}. Master startup cannot "
        + "progress, in holding-pattern until region onlined.",
      ri.getRegionNameAsString(), rs, optProc.isPresent());
    // Check once-a-minute.
    if (rc == null) {
      // Effectively unlimited attempts; backoff starts at 1s, capped at 60s.
      rc = new RetryCounterFactory(Integer.MAX_VALUE, 1000, 60_000).create();
    }
    Threads.sleep(rc.getBackoffTimeAndIncrementAttempts());
  }
  // Master was stopped before the region came online.
  return false;
}
use of org.apache.hadoop.hbase.util.RetryCounterFactory in project hbase by apache.
Example from the class RegionReplicaFlushHandler, method triggerFlushInPrimaryRegion.
/**
 * RPCs the primary region replica asking it to flush, retrying with backoff, and enables
 * reads on this secondary replica once a flush (or an empty-flush WAL marker) has been
 * triggered on the primary, or once we give up because the server is going down.
 * NOTE(review): assumes the {@code connection} and {@code server} fields are initialized by
 * the enclosing handler — confirm against the constructor.
 * @param region the secondary region replica being opened
 * @throws IOException if the flush RPC keeps failing after exhausting the retry budget;
 *   an InterruptedIOException if interrupted while sleeping between retries
 */
void triggerFlushInPrimaryRegion(final HRegion region) throws IOException {
// Backoff pause and attempt budget come from the client retry configuration.
long pause = connection.getConfiguration().getLong(HConstants.HBASE_CLIENT_PAUSE, HConstants.DEFAULT_HBASE_CLIENT_PAUSE);
int maxAttempts = getRetriesCount(connection.getConfiguration());
RetryCounter counter = new RetryCounterFactory(maxAttempts, (int) pause).create();
if (LOG.isDebugEnabled()) {
LOG.debug("RPC'ing to primary " + ServerRegionReplicaUtil.getRegionInfoForDefaultReplica(region.getRegionInfo()).getRegionNameAsString() + " from " + region.getRegionInfo().getRegionNameAsString() + " to trigger FLUSH");
}
// Keep trying while the region is still open and the server is still running.
while (!region.isClosing() && !region.isClosed() && !server.isAborted() && !server.isStopped()) {
// TODO: flushRegion() is a blocking call waiting for the flush to complete. Ideally we
// do not have to wait for the whole flush here, just initiate it.
FlushRegionResponse response;
try {
// Flush the default (primary) replica of this region; writeFlushWalMarker=true.
response = FutureUtils.get(connection.flush(ServerRegionReplicaUtil.getRegionInfoForDefaultReplica(region.getRegionInfo()).getRegionName(), true));
} catch (IOException e) {
// Table gone or disabled: nothing to flush, give up quietly.
if (e instanceof TableNotFoundException || FutureUtils.get(connection.getAdmin().isTableDisabled(region.getRegionInfo().getTable()))) {
return;
}
if (!counter.shouldRetry()) {
throw e;
}
// The reason that why we need to retry here is that, the retry for asynchronous admin
// request is much simpler than the normal operation, if we failed to locate the region once
// then we will throw the exception out and will not try to relocate again. So here we need
// to add some retries by ourselves to prevent shutting down the region server too
// frequent...
LOG.debug("Failed to trigger a flush of primary region replica {} of region {}, retry={}", ServerRegionReplicaUtil.getRegionInfoForDefaultReplica(region.getRegionInfo()).getRegionNameAsString(), region.getRegionInfo().getRegionNameAsString(), counter.getAttemptTimes(), e);
try {
counter.sleepUntilNextRetry();
} catch (InterruptedException e1) {
throw new InterruptedIOException(e1.getMessage());
}
continue;
}
if (response.getFlushed()) {
// a complete flush cycle or replay a region open event
if (LOG.isDebugEnabled()) {
LOG.debug("Triggered flush of primary region replica " + ServerRegionReplicaUtil.getRegionInfoForDefaultReplica(region.getRegionInfo()).getRegionNameAsString() + " for " + region.getRegionInfo().getEncodedName() + "; now waiting and blocking reads until completes a full flush cycle");
}
region.setReadsEnabled(true);
break;
} else {
if (response.hasWroteFlushWalMarker()) {
if (response.getWroteFlushWalMarker()) {
// Primary's memstore was empty but it wrote a flush marker we can observe.
if (LOG.isDebugEnabled()) {
LOG.debug("Triggered empty flush marker (memstore empty) on primary region replica " + ServerRegionReplicaUtil.getRegionInfoForDefaultReplica(region.getRegionInfo()).getRegionNameAsString() + " for " + region.getRegionInfo().getEncodedName() + "; now waiting and blocking reads until observing a flush marker");
}
region.setReadsEnabled(true);
break;
} else {
// closing or already flushing. Retry flush again after some sleep.
if (!counter.shouldRetry()) {
throw new IOException("Cannot cause primary to flush or drop a wal marker after " + counter.getAttemptTimes() + " retries. Failing opening of this region replica " + region.getRegionInfo().getRegionNameAsString());
} else {
LOG.warn("Cannot cause primary replica {} to flush or drop a wal marker " + "for region replica {}, retry={}", ServerRegionReplicaUtil.getRegionInfoForDefaultReplica(region.getRegionInfo()).getRegionNameAsString(), region.getRegionInfo().getRegionNameAsString(), counter.getAttemptTimes());
}
}
} else {
// nothing to do. Are we dealing with an old server?
LOG.warn("Was not able to trigger a flush from primary region due to old server version? " + "Continuing to open the secondary region replica: " + region.getRegionInfo().getRegionNameAsString());
break;
}
}
// Only the "primary busy/closing" branch above falls through to here; sleep then retry.
try {
counter.sleepUntilNextRetry();
} catch (InterruptedException e) {
throw new InterruptedIOException(e.getMessage());
}
}
// Reached on every loop exit (break or shutdown); enabling reads again is idempotent.
region.setReadsEnabled(true);
}
use of org.apache.hadoop.hbase.util.RetryCounterFactory in project hbase by apache.
Example from the class HBaseClusterManager, method setConf.
/**
 * Picks up SSH, tunnel, and retry settings from the given configuration. A null
 * configuration is ignored because {@code Configured} invokes this with null before the
 * real one arrives.
 * @param conf cluster-manager configuration, possibly null on the first call
 */
@Override
public void setConf(Configuration conf) {
  super.setConf(conf);
  if (conf == null) {
    // Configured gets passed null before real conf. Why? I don't know.
    return;
  }
  // SSH identity and options: the HBASE_SSH_OPTS env var is the base, with any
  // configured extras appended after a space.
  String user = conf.get("hbase.it.clustermanager.ssh.user", "");
  String extraOpts = conf.get("hbase.it.clustermanager.ssh.opts", "");
  String opts = System.getenv("HBASE_SSH_OPTS");
  if (!extraOpts.isEmpty()) {
    opts = StringUtils.join(new Object[] { opts, extraOpts }, " ");
  }
  // Normalize to empty strings so callers never see null.
  sshOptions = (opts == null) ? "" : opts;
  sshUserName = (user == null) ? "" : user;
  tunnelCmd = conf.get("hbase.it.clustermanager.ssh.cmd", DEFAULT_TUNNEL_CMD);
  tunnelSudoCmd = conf.get("hbase.it.clustermanager.ssh.sudo.cmd", DEFAULT_TUNNEL_SUDO_CMD);
  // Print out ssh special config if any.
  if (!sshUserName.isEmpty() || !sshOptions.isEmpty()) {
    LOG.info("Running with SSH user [" + sshUserName + "] and options [" + sshOptions + "]");
  }
  this.retryCounterFactory = new RetryCounterFactory(new RetryConfig()
    .setMaxAttempts(conf.getInt(RETRY_ATTEMPTS_KEY, DEFAULT_RETRY_ATTEMPTS))
    .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL)));
}
use of org.apache.hadoop.hbase.util.RetryCounterFactory in project hbase by apache.
Example from the class RESTApiClusterManager, method setConf.
/**
 * Reads REST-API cluster-manager settings from the configuration: the delegate cluster
 * manager, server/cluster identity, HTTP basic-auth credentials, and the retry policy.
 * A null configuration is ignored.
 * @param conf cluster-manager configuration, possibly null on the first call
 */
@Override
public void setConf(Configuration conf) {
  super.setConf(conf);
  if (conf == null) {
    // `Configured()` constructor calls `setConf(null)` before calling again with a real value.
    return;
  }
  final Class<? extends ClusterManager> delegateClass = conf.getClass(
    REST_API_DELEGATE_CLUSTER_MANAGER, HBaseClusterManager.class, ClusterManager.class);
  hBaseClusterManager = ReflectionUtils.newInstance(delegateClass, conf);
  serverHostname = conf.get(REST_API_CLUSTER_MANAGER_HOSTNAME, DEFAULT_SERVER_HOSTNAME);
  clusterName = conf.get(REST_API_CLUSTER_MANAGER_CLUSTER_NAME, DEFAULT_CLUSTER_NAME);
  // Add filter to Client instance to enable server authentication.
  final String user = conf.get(REST_API_CLUSTER_MANAGER_USERNAME, DEFAULT_SERVER_USERNAME);
  final String password = conf.get(REST_API_CLUSTER_MANAGER_PASSWORD, DEFAULT_SERVER_PASSWORD);
  client.register(HttpAuthenticationFeature.basic(user, password));
  final RetryConfig retryConfig = new RetryConfig()
    .setMaxAttempts(conf.getInt(RETRY_ATTEMPTS_KEY, DEFAULT_RETRY_ATTEMPTS))
    .setSleepInterval(conf.getLong(RETRY_SLEEP_INTERVAL_KEY, DEFAULT_RETRY_SLEEP_INTERVAL));
  this.retryCounterFactory = new RetryCounterFactory(retryConfig);
}
use of org.apache.hadoop.hbase.util.RetryCounterFactory in project hbase by apache.
Example from the class ChaosAgent, method initChaosAgent.
/**
 * sets global params and initiates connection with ZooKeeper then does registration.
 * @param conf initial configuration to use
 * @param quorum ZK Quorum
 * @param agentName AgentName to use
 */
private void initChaosAgent(Configuration conf, String quorum, String agentName) {
  this.conf = conf;
  this.quorum = quorum;
  this.agentName = agentName;
  // Build the retry policy once from configuration; reused for subsequent ZK operations.
  this.retryCounterFactory = new RetryCounterFactory(new RetryCounter.RetryConfig()
    .setMaxAttempts(
      conf.getInt(ChaosConstants.RETRY_ATTEMPTS_KEY, ChaosConstants.DEFAULT_RETRY_ATTEMPTS))
    .setSleepInterval(conf.getLong(ChaosConstants.RETRY_SLEEP_INTERVAL_KEY,
      ChaosConstants.DEFAULT_RETRY_SLEEP_INTERVAL)));
  try {
    this.createZKConnection(null);
    this.register();
  } catch (IOException e) {
    // Pass the exception as the Throwable argument so the logger records the full stack
    // trace, instead of concatenating it into the message (which loses the trace).
    LOG.error("Error Creating Connection: ", e);
  }
}
Aggregations