Use of org.apache.hadoop.hbase.ipc.FailedServerException in project hbase by apache.
Class AssignmentManager, method retrySendRegionOpen.
/**
 * At master failover, for a pending_open region, make sure the
 * sendRegionOpen RPC call is sent to the target regionserver.
 */
private void retrySendRegionOpen(final RegionState regionState) {
  this.executorService.submit(new EventHandler(server, EventType.M_MASTER_RECOVERY) {
    @Override
    public void process() throws IOException {
      HRegionInfo hri = regionState.getRegion();
      ServerName serverName = regionState.getServerName();
      ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
      try {
        for (int i = 1; i <= maximumAttempts; i++) {
          if (!serverManager.isServerOnline(serverName) || server.isStopped() || server.isAborted()) {
            // No need to retry any more
            return;
          }
          try {
            if (!regionState.equals(regionStates.getRegionState(hri))) {
              // Region is not in the expected state any more
              return;
            }
            List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
            if (shouldAssignFavoredNodes(hri)) {
              FavoredNodesManager fnm = ((MasterServices) server).getFavoredNodesManager();
              favoredNodes = fnm.getFavoredNodesWithDNPort(hri);
            }
            serverManager.sendRegionOpen(serverName, hri, favoredNodes);
            // we're done
            return;
          } catch (Throwable t) {
            if (t instanceof RemoteException) {
              t = ((RemoteException) t).unwrapRemoteException();
            }
            if (t instanceof FailedServerException && i < maximumAttempts) {
              // In case the server is in the failed server list, no point to
              // retry too soon. Retry after the failed_server_expiry time.
              try {
                Configuration conf = this.server.getConfiguration();
                long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
                if (LOG.isDebugEnabled()) {
                  LOG.debug(serverName + " is on failed server list; waiting " + sleepTime + "ms", t);
                }
                Thread.sleep(sleepTime);
                continue;
              } catch (InterruptedException ie) {
                LOG.warn("Failed to assign " + hri.getRegionNameAsString() + " since interrupted", ie);
                regionStates.updateRegionState(hri, State.FAILED_OPEN);
                Thread.currentThread().interrupt();
                return;
              }
            }
            if (serverManager.isServerOnline(serverName) && t instanceof java.net.SocketTimeoutException) {
              // reset the try count
              i--;
            } else {
              LOG.info("Got exception in retrying sendRegionOpen for " + regionState + "; try=" + i + " of " + maximumAttempts, t);
            }
            Threads.sleep(100);
          }
        }
        // Ran out of attempts
        regionStates.updateRegionState(hri, State.FAILED_OPEN);
      } finally {
        lock.unlock();
      }
    }
  });
}
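
The same pattern, unwrap a RemoteException and, on FailedServerException, back off for the failed-server-expiry interval before retrying, recurs in every snippet on this page. The following stand-alone sketch distills that loop; the FailedServerRetry class and the Rpc callback are illustrative names, not HBase API:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ipc.FailedServerException;
import org.apache.hadoop.hbase.ipc.RpcClient;
import org.apache.hadoop.ipc.RemoteException;

final class FailedServerRetry {
  /** Hypothetical callback standing in for serverManager.sendRegionOpen/sendRegionClose. */
  interface Rpc {
    void call() throws IOException;
  }

  /** Returns true once the RPC succeeds, false if attempts run out or another error occurs. */
  static boolean callWithRetry(Rpc rpc, Configuration conf, int maxAttempts) throws InterruptedException {
    for (int i = 1; i <= maxAttempts; i++) {
      try {
        rpc.call();
        return true;
      } catch (IOException e) {
        Throwable t = (e instanceof RemoteException) ? ((RemoteException) e).unwrapRemoteException() : e;
        if (t instanceof FailedServerException && i < maxAttempts) {
          // The server is still on the client's failed-server list; retrying before the
          // expiry interval elapses would fail immediately, so sleep it out first.
          long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
          Thread.sleep(sleepTime);
          continue;
        }
        return false;
      }
    }
    return false;
  }
}

In the real methods the loop additionally checks that the target server is still online and that the region is still in the expected state before each attempt.
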
Use of org.apache.hadoop.hbase.ipc.FailedServerException in project hbase by apache.
Class AssignmentManager, method assign (bulk assign to a single destination).
/**
 * Bulk assign regions to <code>destination</code>.
 * @param destination the regionserver to assign the regions to
 * @param regions Regions to assign.
 * @return true if successful
 */
boolean assign(final ServerName destination, final List<HRegionInfo> regions) throws InterruptedException {
  long startTime = EnvironmentEdgeManager.currentTime();
  try {
    int regionCount = regions.size();
    if (regionCount == 0) {
      return true;
    }
    LOG.info("Assigning " + regionCount + " region(s) to " + destination.toString());
    Set<String> encodedNames = new HashSet<>(regionCount);
    for (HRegionInfo region : regions) {
      encodedNames.add(region.getEncodedName());
    }
    List<HRegionInfo> failedToOpenRegions = new ArrayList<>();
    Map<String, Lock> locks = locker.acquireLocks(encodedNames);
    try {
      Map<String, RegionPlan> plans = new HashMap<>(regionCount);
      List<RegionState> states = new ArrayList<>(regionCount);
      for (HRegionInfo region : regions) {
        String encodedName = region.getEncodedName();
        if (!isDisabledorDisablingRegionInRIT(region)) {
          RegionState state = forceRegionStateToOffline(region, false);
          boolean onDeadServer = false;
          if (state != null) {
            if (regionStates.wasRegionOnDeadServer(encodedName)) {
              LOG.info("Skip assigning " + region.getRegionNameAsString() + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName) + " is dead but not processed yet");
              onDeadServer = true;
            } else {
              RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
              plans.put(encodedName, plan);
              states.add(state);
              continue;
            }
          }
          // Reassign if the region wasn't on a dead server
          if (!onDeadServer) {
            LOG.info("failed to force region state to offline, " + "will reassign later: " + region);
            // assign individually later
            failedToOpenRegions.add(region);
          }
        }
        // Release the lock, this region is excluded from bulk assign because
        // we can't update its state, or set its znode to offline.
        Lock lock = locks.remove(encodedName);
        lock.unlock();
      }
      if (server.isStopped()) {
        return false;
      }
      // Add region plans, so we can updateTimers when one region is opened so
      // that unnecessary timeout on RIT is reduced.
      this.addPlans(plans);
      List<Pair<HRegionInfo, List<ServerName>>> regionOpenInfos = new ArrayList<>(states.size());
      for (RegionState state : states) {
        HRegionInfo region = state.getRegion();
        regionStates.updateRegionState(region, State.PENDING_OPEN, destination);
        List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
        if (shouldAssignFavoredNodes(region)) {
          favoredNodes = server.getFavoredNodesManager().getFavoredNodesWithDNPort(region);
        }
        regionOpenInfos.add(new Pair<>(region, favoredNodes));
      }
      // Move on to open regions.
      try {
        // Send OPEN RPC. If it fails on an IOE or RemoteException,
        // regions will be assigned individually.
        Configuration conf = server.getConfiguration();
        long maxWaitTime = System.currentTimeMillis() + conf.getLong("hbase.regionserver.rpc.startup.waittime", 60000);
        for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
          try {
            List<RegionOpeningState> regionOpeningStateList = serverManager.sendRegionOpen(destination, regionOpenInfos);
            for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
              RegionOpeningState openingState = regionOpeningStateList.get(k);
              if (openingState != RegionOpeningState.OPENED) {
                HRegionInfo region = regionOpenInfos.get(k).getFirst();
                LOG.info("Got opening state " + openingState + ", will reassign later: " + region);
                // Failed opening this region, reassign it later
                forceRegionStateToOffline(region, true);
                failedToOpenRegions.add(region);
              }
            }
            break;
          } catch (IOException e) {
            if (e instanceof RemoteException) {
              e = ((RemoteException) e).unwrapRemoteException();
            }
            if (e instanceof RegionServerStoppedException) {
              LOG.warn("The region server was shut down, ", e);
              // No need to retry, the region server is a goner.
              return false;
            } else if (e instanceof ServerNotRunningYetException) {
              long now = System.currentTimeMillis();
              if (now < maxWaitTime) {
                if (LOG.isDebugEnabled()) {
                  LOG.debug("Server is not yet up; waiting up to " + (maxWaitTime - now) + "ms", e);
                }
                Thread.sleep(100);
                // reset the try count
                i--;
                continue;
              }
            } else if (e instanceof java.net.SocketTimeoutException && this.serverManager.isServerOnline(destination)) {
              // The RPC may have been accepted even though the response timed out,
              // so retry opening the regions on the same server.
              if (LOG.isDebugEnabled()) {
                LOG.debug("Bulk assigner openRegion() to " + destination + " has timed out, but the regions might" + " already be opened on it.", e);
              }
              // wait and reset the re-try count, server might be just busy.
              Thread.sleep(100);
              i--;
              continue;
            } else if (e instanceof FailedServerException && i < maximumAttempts) {
              // In case the server is in the failed server list, no point to
              // retry too soon. Retry after the failed_server_expiry time.
              long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
              if (LOG.isDebugEnabled()) {
                LOG.debug(destination + " is on failed server list; waiting " + sleepTime + "ms", e);
              }
              Thread.sleep(sleepTime);
              continue;
            }
            throw e;
          }
        }
      } catch (IOException e) {
        // Can be a socket timeout, EOF, NoRouteToHost, etc
        LOG.info("Unable to communicate with " + destination + " in order to assign regions, ", e);
        for (RegionState state : states) {
          HRegionInfo region = state.getRegion();
          forceRegionStateToOffline(region, true);
        }
        return false;
      }
    } finally {
      for (Lock lock : locks.values()) {
        lock.unlock();
      }
    }
    if (!failedToOpenRegions.isEmpty()) {
      for (HRegionInfo region : failedToOpenRegions) {
        if (!regionStates.isRegionOnline(region)) {
          invokeAssign(region);
        }
      }
    }
    // wait for assignment completion
    ArrayList<HRegionInfo> userRegionSet = new ArrayList<>(regions.size());
    for (HRegionInfo region : regions) {
      if (!region.getTable().isSystemTable()) {
        userRegionSet.add(region);
      }
    }
    if (!waitForAssignment(userRegionSet, true, userRegionSet.size(), System.currentTimeMillis())) {
      LOG.debug("some user regions are still in transition: " + userRegionSet);
    }
    LOG.debug("Bulk assigning done for " + destination);
    return true;
  } finally {
    metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTime() - startTime);
  }
}
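
For reference, the catch block above dispatches on the unwrapped exception type. A condensed, illustrative classification of those decisions (the BulkAssignRetryPolicy and RetryDecision names are not part of HBase) could look like this:

import java.io.IOException;
import java.net.SocketTimeoutException;
import org.apache.hadoop.hbase.ipc.FailedServerException;
import org.apache.hadoop.hbase.ipc.ServerNotRunningYetException;
import org.apache.hadoop.hbase.regionserver.RegionServerStoppedException;

final class BulkAssignRetryPolicy {
  enum RetryDecision { ABORT_BULK_ASSIGN, SLEEP_100MS_AND_RETRY, SLEEP_FAILED_SERVER_EXPIRY_AND_RETRY, RETHROW }

  static RetryDecision classify(IOException e, boolean destinationOnline, boolean withinStartupWait) {
    if (e instanceof RegionServerStoppedException) {
      return RetryDecision.ABORT_BULK_ASSIGN;                 // server is a goner; bulk assign returns false
    }
    if (e instanceof ServerNotRunningYetException && withinStartupWait) {
      return RetryDecision.SLEEP_100MS_AND_RETRY;             // also resets the attempt counter
    }
    if (e instanceof SocketTimeoutException && destinationOnline) {
      return RetryDecision.SLEEP_100MS_AND_RETRY;             // regions may already be opening; retry same server
    }
    if (e instanceof FailedServerException) {
      // The real code also requires that this is not the last attempt (i < maximumAttempts).
      return RetryDecision.SLEEP_FAILED_SERVER_EXPIRY_AND_RETRY;
    }
    return RetryDecision.RETHROW;                             // handled by the outer catch; regions reassigned individually
  }
}
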
Use of org.apache.hadoop.hbase.ipc.FailedServerException in project hbase by apache.
Class ServerManager, method checkForRSznode.
/**
 * Check for an odd state, where we think an RS is up but it is not. Do it on OPEN.
 * This is the only case where the check makes sense.
 *
 * <p>We are checking for an instance of HBASE-9593, where an RS registered but died before it put
 * up its znode in zk. In this case, the RS made it into the list of online servers but it
 * is not actually UP. We do the check here where there is an evident problem rather
 * than do some crazy footwork where we'd have master check zk after a RS had reported
 * for duty with provisional state followed by a confirmed state; that'd be a mess.
 * Real fix is HBASE-17733.
 */
private void checkForRSznode(final ServerName serverName, final ServiceException se) {
  if (se.getCause() == null)
    return;
  Throwable t = se.getCause();
  if (t instanceof ConnectException) {
    // If this, proceed to do cleanup.
  } else {
    // Look for FailedServerException
    if (!(t instanceof IOException))
      return;
    if (t.getCause() == null)
      return;
    if (!(t.getCause() instanceof FailedServerException))
      return;
    // Ok, found FailedServerException -- continue.
  }
  if (!isServerOnline(serverName))
    return;
  // We think this server is online. Check it has a znode up. Currently, an RS
  // registers an ephemeral znode in zk. If not present, something is up. Maybe
  // HBASE-9593 where RS crashed AFTER reportForDuty but BEFORE it put up an ephemeral
  // znode.
  List<String> servers = null;
  try {
    servers = getRegionServersInZK(this.master.getZooKeeper());
  } catch (KeeperException ke) {
    LOG.warn("Failed to list regionservers", ke);
    // ZK is malfunctioning, don't hang here
  }
  boolean found = false;
  if (servers != null) {
    for (String serverNameAsStr : servers) {
      ServerName sn = ServerName.valueOf(serverNameAsStr);
      if (sn.equals(serverName)) {
        // Found a server up in zk.
        found = true;
        break;
      }
    }
  }
  if (!found) {
    LOG.warn("Online server " + serverName.toString() + " has no corresponding " + "ephemeral znode (Did it die before registering in zk?); " + "calling expire to clean it up!");
    expireServer(serverName);
  }
}
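
The znode lookup itself goes through the private getRegionServersInZK helper and the master's ZooKeeper watcher. Purely as an illustration, the same check could be written against the raw ZooKeeper client; the /hbase/rs path below assumes the default zookeeper.znode.parent and is not taken from the code above:

import java.util.List;
import org.apache.zookeeper.KeeperException;
import org.apache.zookeeper.ZooKeeper;

// Illustrative only: list the ephemeral znodes of live regionservers and look for serverName.
final class RsZnodeCheck {
  static boolean hasEphemeralZnode(ZooKeeper zk, String serverNameAsString)
      throws KeeperException, InterruptedException {
    // One child per live regionserver, named after its ServerName (host,port,startcode).
    List<String> servers = zk.getChildren("/hbase/rs", false);
    return servers.contains(serverNameAsString);
  }
}
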
Use of org.apache.hadoop.hbase.ipc.FailedServerException in project hbase by apache.
Class AssignmentManager, method retrySendRegionClose.
/**
 * At master failover, for a pending_close region, make sure the
 * sendRegionClose RPC call is sent to the target regionserver.
 */
private void retrySendRegionClose(final RegionState regionState) {
  this.executorService.submit(new EventHandler(server, EventType.M_MASTER_RECOVERY) {
    @Override
    public void process() throws IOException {
      HRegionInfo hri = regionState.getRegion();
      ServerName serverName = regionState.getServerName();
      ReentrantLock lock = locker.acquireLock(hri.getEncodedName());
      try {
        for (int i = 1; i <= maximumAttempts; i++) {
          if (!serverManager.isServerOnline(serverName) || server.isStopped() || server.isAborted()) {
            // No need to retry any more
            return;
          }
          try {
            if (!regionState.equals(regionStates.getRegionState(hri))) {
              // Region is not in the expected state any more
              return;
            }
            serverManager.sendRegionClose(serverName, hri, null);
            // Done.
            return;
          } catch (Throwable t) {
            if (t instanceof RemoteException) {
              t = ((RemoteException) t).unwrapRemoteException();
            }
            if (t instanceof FailedServerException && i < maximumAttempts) {
              // In case the server is in the failed server list, no point to
              // retry too soon. Retry after the failed_server_expiry time.
              try {
                Configuration conf = this.server.getConfiguration();
                long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
                if (LOG.isDebugEnabled()) {
                  LOG.debug(serverName + " is on failed server list; waiting " + sleepTime + "ms", t);
                }
                Thread.sleep(sleepTime);
                continue;
              } catch (InterruptedException ie) {
                LOG.warn("Failed to unassign " + hri.getRegionNameAsString() + " since interrupted", ie);
                regionStates.updateRegionState(hri, RegionState.State.FAILED_CLOSE);
                Thread.currentThread().interrupt();
                return;
              }
            }
            if (serverManager.isServerOnline(serverName) && t instanceof java.net.SocketTimeoutException) {
              // reset the try count
              i--;
            } else {
              LOG.info("Got exception in retrying sendRegionClose for " + regionState + "; try=" + i + " of " + maximumAttempts, t);
            }
            Threads.sleep(100);
          }
        }
        // Ran out of attempts
        regionStates.updateRegionState(hri, State.FAILED_CLOSE);
      } finally {
        lock.unlock();
      }
    }
  });
}
Use of org.apache.hadoop.hbase.ipc.FailedServerException in project hbase by apache.
Class AssignmentManager, method assign (single-region assign).
/**
 * Caller must hold lock on the passed <code>state</code> object.
 * @param state the current state of the region to assign
 * @param forceNewPlan whether to force creation of a new assignment plan
 */
private void assign(RegionState state, boolean forceNewPlan) {
  long startTime = EnvironmentEdgeManager.currentTime();
  try {
    Configuration conf = server.getConfiguration();
    RegionPlan plan = null;
    long maxWaitTime = -1;
    HRegionInfo region = state.getRegion();
    Throwable previousException = null;
    for (int i = 1; i <= maximumAttempts; i++) {
      if (server.isStopped() || server.isAborted()) {
        LOG.info("Skip assigning " + region.getRegionNameAsString() + ", the server is stopped/aborted");
        return;
      }
      if (plan == null) {
        // Get a server for the region at first
        try {
          plan = getRegionPlan(region, forceNewPlan);
        } catch (HBaseIOException e) {
          LOG.warn("Failed to get region plan", e);
        }
      }
      if (plan == null) {
        LOG.warn("Unable to determine a plan to assign " + region);
        // For meta region, we have to keep retrying until succeeding
        if (region.isMetaRegion()) {
          if (i == maximumAttempts) {
            // re-set attempt count to 0 for at least 1 retry
            i = 0;
            LOG.warn("Unable to determine a plan to assign a hbase:meta region " + region + " after maximumAttempts (" + this.maximumAttempts + "). Reset attempts count and continue retrying.");
          }
          waitForRetryingMetaAssignment();
          continue;
        }
        regionStates.updateRegionState(region, State.FAILED_OPEN);
        return;
      }
      LOG.info("Assigning " + region.getRegionNameAsString() + " to " + plan.getDestination());
      // Transition RegionState to PENDING_OPEN
      regionStates.updateRegionState(region, State.PENDING_OPEN, plan.getDestination());
      boolean needNewPlan = false;
      final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() + " to " + plan.getDestination();
      try {
        List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
        if (shouldAssignFavoredNodes(region)) {
          favoredNodes = server.getFavoredNodesManager().getFavoredNodesWithDNPort(region);
        }
        serverManager.sendRegionOpen(plan.getDestination(), region, favoredNodes);
        // we're done
        return;
      } catch (Throwable t) {
        if (t instanceof RemoteException) {
          t = ((RemoteException) t).unwrapRemoteException();
        }
        previousException = t;
        // Should we wait a little before retrying? If the server is starting, the answer is yes.
        boolean hold = (t instanceof ServerNotRunningYetException);
        // In case the socket timed out and the region server is still online,
        // the openRegion RPC could have been accepted by the server and
        // just the response didn't go through. So we will retry to
        // open the region on the same server.
        boolean retry = !hold && (t instanceof java.net.SocketTimeoutException && this.serverManager.isServerOnline(plan.getDestination()));
        if (hold) {
          LOG.warn(assignMsg + ", waiting a little before trying on the same region server " + "try=" + i + " of " + this.maximumAttempts, t);
          if (maxWaitTime < 0) {
            maxWaitTime = EnvironmentEdgeManager.currentTime() + this.server.getConfiguration().getLong("hbase.regionserver.rpc.startup.waittime", 60000);
          }
          try {
            long now = EnvironmentEdgeManager.currentTime();
            if (now < maxWaitTime) {
              if (LOG.isDebugEnabled()) {
                LOG.debug("Server is not yet up; waiting up to " + (maxWaitTime - now) + "ms", t);
              }
              Thread.sleep(100);
              // reset the try count
              i--;
            } else {
              LOG.debug("Server is not up for a while; try a new one", t);
              needNewPlan = true;
            }
          } catch (InterruptedException ie) {
            LOG.warn("Failed to assign " + region.getRegionNameAsString() + " since interrupted", ie);
            regionStates.updateRegionState(region, State.FAILED_OPEN);
            Thread.currentThread().interrupt();
            return;
          }
        } else if (retry) {
          // we want to retry as many times as needed as long as the RS is not dead.
          i--;
          if (LOG.isDebugEnabled()) {
            LOG.debug(assignMsg + ", trying to assign to the same region server due ", t);
          }
        } else {
          needNewPlan = true;
          LOG.warn(assignMsg + ", trying to assign elsewhere instead;" + " try=" + i + " of " + this.maximumAttempts, t);
        }
      }
      if (i == this.maximumAttempts) {
        // For meta region, we have to keep retrying until succeeding
        if (region.isMetaRegion()) {
          // re-set attempt count to 0 for at least 1 retry
          i = 0;
          LOG.warn(assignMsg + ", trying to assign a hbase:meta region reached to maximumAttempts (" + this.maximumAttempts + "). Reset attempt counts and continue retrying.");
          waitForRetryingMetaAssignment();
        } else {
          // This was the last try; skip re-planning and fall through to FAILED_OPEN.
          continue;
        }
      }
      // Decide whether to force a new plan or keep reassigning to the same RS.
      if (needNewPlan) {
        // Force a new plan and reassign. Will return null if no servers.
        // The new plan could be the same as the existing plan since we don't
        // exclude the server of the original plan, which should not be
        // excluded since it could be the only server up now.
        RegionPlan newPlan = null;
        try {
          newPlan = getRegionPlan(region, true);
        } catch (HBaseIOException e) {
          LOG.warn("Failed to get region plan", e);
        }
        if (newPlan == null) {
          regionStates.updateRegionState(region, State.FAILED_OPEN);
          LOG.warn("Unable to find a viable location to assign region " + region.getRegionNameAsString());
          return;
        }
        if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
          // Clean out plan we failed execute and one that doesn't look like it'll
          // succeed anyways; we need a new plan!
          // Transition back to OFFLINE
          regionStates.updateRegionState(region, State.OFFLINE);
          plan = newPlan;
        } else if (plan.getDestination().equals(newPlan.getDestination()) && previousException instanceof FailedServerException) {
          try {
            LOG.info("Trying to re-assign " + region.getRegionNameAsString() + " to the same failed server.");
            Thread.sleep(1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT));
          } catch (InterruptedException ie) {
            LOG.warn("Failed to assign " + region.getRegionNameAsString() + " since interrupted", ie);
            regionStates.updateRegionState(region, State.FAILED_OPEN);
            Thread.currentThread().interrupt();
            return;
          }
        }
      }
    }
    // Ran out of attempts
    regionStates.updateRegionState(region, State.FAILED_OPEN);
  } finally {
    metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTime() - startTime);
  }
}
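
Both assign paths read the same two timing knobs: the failed-server expiry (RpcClient.FAILED_SERVER_EXPIRY_KEY with RpcClient.FAILED_SERVER_EXPIRY_DEFAULT) and "hbase.regionserver.rpc.startup.waittime", which defaults to 60000 ms in the code above. A minimal sketch that just reads them from a client-side configuration, assuming nothing beyond the constants already shown:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.ipc.RpcClient;

final class AssignRetryTimings {
  public static void main(String[] args) {
    Configuration conf = HBaseConfiguration.create();
    // How long a server stays on the failed-server list before RPCs to it are retried.
    long failedServerExpiryMs = conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
    // How long assign() keeps waiting on a server that still answers ServerNotRunningYetException.
    long startupWaitMs = conf.getLong("hbase.regionserver.rpc.startup.waittime", 60000);
    System.out.println("failed-server expiry: " + failedServerExpiryMs + "ms, startup wait: " + startupWaitMs + "ms");
  }
}
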