use of org.apache.hadoop.hbase.HBaseIOException in project hbase by apache.
the class RSGroupBasedLoadBalancer method generateGroupMaps.
/**
 * Buckets the given regions by RSGroup name and, for every group encountered, records the
 * servers returned by {@code filterOfflineServers} for that group.
 *
 * <p>Both results are written into the caller-supplied multimaps. A table for which no group
 * is found is logged, and its regions are still bucketed (under the {@code null} key). A group
 * whose server list ends up empty receives {@link LoadBalancer#BOGUS_SERVER_NAME} as its
 * only server.
 *
 * @param regions regions to bucket by group
 * @param servers servers handed to {@code filterOfflineServers} for each group
 * @param regionMap output: group name to the regions belonging to it
 * @param serverMap output: group name to the servers recorded for it
 * @throws HBaseIOException if any lookup fails with an {@link IOException}
 */
private void generateGroupMaps(List<HRegionInfo> regions, List<ServerName> servers, ListMultimap<String, HRegionInfo> regionMap, ListMultimap<String, ServerName> serverMap) throws HBaseIOException {
  try {
    // Pass 1: group the regions by the RSGroup of their table.
    for (HRegionInfo hri : regions) {
      String group = rsGroupInfoManager.getRSGroupOfTable(hri.getTable());
      if (group == null) {
        LOG.warn("Group for table " + hri.getTable() + " is null");
      }
      regionMap.put(group, hri);
    }

    // Pass 2: resolve the servers of each group that actually has regions.
    for (String group : regionMap.keySet()) {
      RSGroupInfo groupInfo = rsGroupInfoManager.getRSGroup(group);
      List<ServerName> groupServers = filterOfflineServers(groupInfo, servers);
      serverMap.putAll(group, groupServers);
      if (serverMap.get(group).isEmpty()) {
        // Nothing to host this group's regions; fall back to the placeholder server.
        serverMap.put(group, LoadBalancer.BOGUS_SERVER_NAME);
      }
    }
  } catch (IOException ioe) {
    throw new HBaseIOException("Failed to generate group maps", ioe);
  }
}
use of org.apache.hadoop.hbase.HBaseIOException in project hbase by apache.
the class RSGroupBasedLoadBalancer method retainAssignment.
/**
 * Builds a retain-style assignment plan that respects RSGroup membership.
 *
 * <p>Regions reported by {@code getMisplacedRegions} are placed with the internal balancer's
 * {@code randomAssignment} on the servers of their group. All other regions are bucketed by
 * group and handed to the internal balancer's {@code retainAssignment} together with that
 * group's candidate servers.
 *
 * <p>Whenever no server can be found for a region, whether it is misplaced or not, the region
 * is assigned to {@link LoadBalancer#BOGUS_SERVER_NAME} so that it ends up in RIT instead of
 * being silently left out of the returned plan.
 *
 * @param regions current region to server mapping
 * @param servers servers offered for assignment; filtered per group
 * @return plan mapping each chosen server (or the bogus server) to its regions
 * @throws HBaseIOException if any lookup fails with an {@link IOException}
 */
@Override
public Map<ServerName, List<HRegionInfo>> retainAssignment(Map<HRegionInfo, ServerName> regions, List<ServerName> servers) throws HBaseIOException {
  try {
    Map<ServerName, List<HRegionInfo>> assignments = new TreeMap<>();
    ListMultimap<String, HRegionInfo> groupToRegion = ArrayListMultimap.create();
    Set<HRegionInfo> misplacedRegions = getMisplacedRegions(regions);
    for (HRegionInfo region : regions.keySet()) {
      if (!misplacedRegions.contains(region)) {
        String groupName = rsGroupInfoManager.getRSGroupOfTable(region.getTable());
        groupToRegion.put(groupName, region);
      }
    }

    // Regions that are not misplaced: let the internal balancer retain their
    // assignments within the candidate servers of their group.
    for (String key : groupToRegion.keySet()) {
      List<HRegionInfo> regionList = groupToRegion.get(key);
      RSGroupInfo info = rsGroupInfoManager.getRSGroup(key);
      List<ServerName> candidateList = filterOfflineServers(info, servers);
      if (candidateList.isEmpty()) {
        // No server can host this group right now. Hand the regions to the bogus server,
        // exactly as is done for misplaced regions below, rather than dropping them.
        LOG.warn("No available servers for group " + key + "; assigning " + regionList.size() + " region(s) to " + LoadBalancer.BOGUS_SERVER_NAME);
        assignments.computeIfAbsent(LoadBalancer.BOGUS_SERVER_NAME, s -> new ArrayList<>()).addAll(regionList);
        continue;
      }
      Map<HRegionInfo, ServerName> currentAssignmentMap = new TreeMap<>();
      for (HRegionInfo region : regionList) {
        currentAssignmentMap.put(region, regions.get(region));
      }
      assignments.putAll(this.internalBalancer.retainAssignment(currentAssignmentMap, candidateList));
    }

    // Misplaced regions: pick a random server inside the region's own group.
    for (HRegionInfo region : misplacedRegions) {
      String groupName = rsGroupInfoManager.getRSGroupOfTable(region.getTable());
      RSGroupInfo info = rsGroupInfoManager.getRSGroup(groupName);
      List<ServerName> candidateList = filterOfflineServers(info, servers);
      ServerName server = this.internalBalancer.randomAssignment(region, candidateList);
      // If no server is available, assign to bogus so the region ends up in RIT.
      ServerName target = (server != null) ? server : LoadBalancer.BOGUS_SERVER_NAME;
      assignments.computeIfAbsent(target, s -> new ArrayList<>()).add(region);
    }
    return assignments;
  } catch (IOException e) {
    throw new HBaseIOException("Failed to do online retain assignment", e);
  }
}
use of org.apache.hadoop.hbase.HBaseIOException in project hbase by apache.
the class AssignmentManager method assign.
/**
 * Tries to open the region of the given state on a region server, retrying up to
 * <code>maximumAttempts</code> times and marking the region FAILED_OPEN when it gives up.
 * A hbase:meta region is never given up on: its attempt counter is reset and it keeps retrying.
 * The elapsed time is always reported to <code>metricsAssignmentManager</code>.
 * <p>
 * Caller must hold lock on the passed <code>state</code> object.
 * @param state region state whose region (<code>state.getRegion()</code>) is to be assigned
 * @param forceNewPlan passed to <code>getRegionPlan</code> when the first plan is computed
 */
private void assign(RegionState state, boolean forceNewPlan) {
long startTime = EnvironmentEdgeManager.currentTime();
try {
Configuration conf = server.getConfiguration();
RegionPlan plan = null;
// Deadline for waiting on a starting region server; negative means "not set yet".
long maxWaitTime = -1;
HRegionInfo region = state.getRegion();
Throwable previousException = null;
// NOTE: the loop counter i is adjusted inside the body (i--, i = 0) to grant extra attempts.
for (int i = 1; i <= maximumAttempts; i++) {
if (server.isStopped() || server.isAborted()) {
LOG.info("Skip assigning " + region.getRegionNameAsString() + ", the server is stopped/aborted");
return;
}
if (plan == null) {
// Get a server for the region at first
try {
plan = getRegionPlan(region, forceNewPlan);
} catch (HBaseIOException e) {
// Logged only; plan stays null and is handled right below.
LOG.warn("Failed to get region plan", e);
}
}
if (plan == null) {
LOG.warn("Unable to determine a plan to assign " + region);
// For meta region, we have to keep retrying until succeeding
if (region.isMetaRegion()) {
if (i == maximumAttempts) {
// re-set attempt count to 0 for at least 1 retry
i = 0;
LOG.warn("Unable to determine a plan to assign a hbase:meta region " + region + " after maximumAttempts (" + this.maximumAttempts + "). Reset attempts count and continue retrying.");
}
waitForRetryingMetaAssignment();
continue;
}
// Non-meta region without a plan: give up immediately.
regionStates.updateRegionState(region, State.FAILED_OPEN);
return;
}
LOG.info("Assigning " + region.getRegionNameAsString() + " to " + plan.getDestination());
// Transition RegionState to PENDING_OPEN
regionStates.updateRegionState(region, State.PENDING_OPEN, plan.getDestination());
boolean needNewPlan = false;
final String assignMsg = "Failed assignment of " + region.getRegionNameAsString() + " to " + plan.getDestination();
try {
List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
if (shouldAssignFavoredNodes(region)) {
favoredNodes = server.getFavoredNodesManager().getFavoredNodesWithDNPort(region);
}
serverManager.sendRegionOpen(plan.getDestination(), region, favoredNodes);
// we're done
return;
} catch (Throwable t) {
// Classify the failure on the unwrapped cause rather than on the RPC wrapper.
if (t instanceof RemoteException) {
t = ((RemoteException) t).unwrapRemoteException();
}
// Remembered for the FailedServerException check made when a new plan is chosen below.
previousException = t;
// Should we wait a little before retrying? If the server is starting it's yes.
boolean hold = (t instanceof ServerNotRunningYetException);
// In case socket is timed out and the region server is still online,
// the openRegion RPC could have been accepted by the server and
// just the response didn't go through. So we will retry to
// open the region on the same server.
boolean retry = !hold && (t instanceof java.net.SocketTimeoutException && this.serverManager.isServerOnline(plan.getDestination()));
if (hold) {
LOG.warn(assignMsg + ", waiting a little before trying on the same region server " + "try=" + i + " of " + this.maximumAttempts, t);
// The deadline is computed once, on the first hold, and reused on later iterations.
if (maxWaitTime < 0) {
maxWaitTime = EnvironmentEdgeManager.currentTime() + this.server.getConfiguration().getLong("hbase.regionserver.rpc.startup.waittime", 60000);
}
try {
long now = EnvironmentEdgeManager.currentTime();
if (now < maxWaitTime) {
if (LOG.isDebugEnabled()) {
LOG.debug("Server is not yet up; waiting up to " + (maxWaitTime - now) + "ms", t);
}
Thread.sleep(100);
// Do not count this wait as an attempt: undo the upcoming i++.
i--;
} else {
LOG.debug("Server is not up for a while; try a new one", t);
needNewPlan = true;
}
} catch (InterruptedException ie) {
LOG.warn("Failed to assign " + region.getRegionNameAsString() + " since interrupted", ie);
regionStates.updateRegionState(region, State.FAILED_OPEN);
// Restore the interrupt flag for the caller before bailing out.
Thread.currentThread().interrupt();
return;
}
} else if (retry) {
// we want to retry as many times as needed as long as the RS is not dead.
i--;
if (LOG.isDebugEnabled()) {
LOG.debug(assignMsg + ", trying to assign to the same region server due ", t);
}
} else {
needNewPlan = true;
LOG.warn(assignMsg + ", trying to assign elsewhere instead;" + " try=" + i + " of " + this.maximumAttempts, t);
}
}
// Only reached after a failed open; i may already have been decremented above.
if (i == this.maximumAttempts) {
// For meta region, we have to keep retrying until succeeding
if (region.isMetaRegion()) {
// re-set attempt count to 0 for at least 1 retry
i = 0;
LOG.warn(assignMsg + ", trying to assign a hbase:meta region reached to maximumAttempts (" + this.maximumAttempts + "). Reset attempt counts and continue retrying.");
waitForRetryingMetaAssignment();
} else {
// This is the last try: the continue lets i++ end the loop, so the
// region is marked FAILED_OPEN after the loop without computing a new plan.
continue;
}
}
// The failure was not one that is retried on the same server: compute a replacement plan.
if (needNewPlan) {
// Force a new plan and reassign. Will return null if no servers.
// The new plan could be the same as the existing plan since we don't
// exclude the server of the original plan, which should not be
// excluded since it could be the only server up now.
RegionPlan newPlan = null;
try {
newPlan = getRegionPlan(region, true);
} catch (HBaseIOException e) {
LOG.warn("Failed to get region plan", e);
}
if (newPlan == null) {
regionStates.updateRegionState(region, State.FAILED_OPEN);
LOG.warn("Unable to find a viable location to assign region " + region.getRegionNameAsString());
return;
}
if (plan != newPlan && !plan.getDestination().equals(newPlan.getDestination())) {
// Clean out plan we failed execute and one that doesn't look like it'll
// succeed anyways; we need a new plan!
// Transition back to OFFLINE
regionStates.updateRegionState(region, State.OFFLINE);
plan = newPlan;
} else if (plan.getDestination().equals(newPlan.getDestination()) && previousException instanceof FailedServerException) {
// Same destination again after a FailedServerException: sleep for the configured
// failed-server expiry (plus 1) before the next attempt.
try {
LOG.info("Trying to re-assign " + region.getRegionNameAsString() + " to the same failed server.");
Thread.sleep(1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT));
} catch (InterruptedException ie) {
LOG.warn("Failed to assign " + region.getRegionNameAsString() + " since interrupted", ie);
regionStates.updateRegionState(region, State.FAILED_OPEN);
Thread.currentThread().interrupt();
return;
}
}
}
}
// Run out of attempts
regionStates.updateRegionState(region, State.FAILED_OPEN);
} finally {
// Reported on every exit path, including the early returns above.
metricsAssignmentManager.updateAssignmentTime(EnvironmentEdgeManager.currentTime() - startTime);
}
}
use of org.apache.hadoop.hbase.HBaseIOException in project hbase by apache.
the class AssignmentManager method acceptPlan.
/**
 * Applies a computed assignment plan to the given region state nodes.
 *
 * <p>Every region in the plan has its location set to the planned server. A system-table
 * region planned onto {@link LoadBalancer#BOGUS_SERVER_NAME} is put back on
 * {@code pendingAssignQueue} (under {@code assignQueueLock}); for every other region its
 * procedure event is collected, and all collected events are woken at the end.
 *
 * @param regions region state nodes keyed by region info
 * @param plan planned server to regions mapping
 * @throws HBaseIOException if the plan is empty
 */
private void acceptPlan(final HashMap<RegionInfo, RegionStateNode> regions, final Map<ServerName, List<RegionInfo>> plan) throws HBaseIOException {
  final ProcedureEvent<?>[] events = new ProcedureEvent[regions.size()];
  final long startTime = EnvironmentEdgeManager.currentTime();

  if (plan.isEmpty()) {
    throw new HBaseIOException("unable to compute plans for regions=" + regions.size());
  }

  int eventCount = 0;
  for (Map.Entry<ServerName, List<RegionInfo>> planEntry : plan.entrySet()) {
    final ServerName target = planEntry.getKey();
    for (RegionInfo regionInfo : planEntry.getValue()) {
      final RegionStateNode node = regions.get(regionInfo);
      node.setRegionLocation(target);

      final boolean requeue = target.equals(LoadBalancer.BOGUS_SERVER_NAME) && node.isSystemTable();
      if (!requeue) {
        events[eventCount] = node.getProcedureEvent();
        eventCount++;
        continue;
      }

      // System-table region with only the bogus server available: queue it again
      // instead of waking its procedure.
      assignQueueLock.lock();
      try {
        pendingAssignQueue.add(node);
      } finally {
        assignQueueLock.unlock();
      }
    }
  }

  ProcedureEvent.wakeEvents(getProcedureScheduler(), events);

  final long endTime = EnvironmentEdgeManager.currentTime();
  if (LOG.isTraceEnabled()) {
    LOG.trace("ASSIGN ACCEPT " + events.length + " -> " + StringUtils.humanTimeDiff(endTime - startTime));
  }
}
use of org.apache.hadoop.hbase.HBaseIOException in project hbase by apache.
the class TransitRegionStateProcedure method confirmOpened.
/**
 * Handles the "confirm opened" step of the region transition.
 *
 * <p>If the region is OPEN, the procedure either finishes (when confirm-opened is its last
 * state) or moves on to the close state. Otherwise the failed open is recorded: once the
 * retry count reaches the maximum the procedure is failed; below that a new plan is forced
 * and the procedure goes back to picking an assign candidate.
 *
 * @param env master procedure environment
 * @param regionNode state node of the region being transitioned
 * @return {@code Flow.NO_MORE_STATE} when the procedure is done or has failed,
 *         {@code Flow.HAS_MORE_STATE} otherwise
 * @throws IOException an {@link HBaseIOException} is thrown to back off and retry once the
 *         retry count exceeds the "retry immediately" limit
 */
private Flow confirmOpened(MasterProcedureEnv env, RegionStateNode regionNode) throws IOException {
  if (regionNode.isInState(State.OPEN)) {
    retryCounter = null;
    if (lastState != RegionStateTransitionState.REGION_STATE_TRANSITION_CONFIRM_OPENED) {
      // It is possible that we arrive here but confirm opened is not the last state, for
      // example, when merging or splitting a region, we unassign the region from a RS and
      // the RS is crashed, then there will be recovered edits for this region, we'd better
      // make the region online again and then unassign it, otherwise we have to fail the
      // merge/split procedure as we may lose data.
      setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_CLOSE);
      return Flow.HAS_MORE_STATE;
    }
    // Confirm-opened is the last state: finish.
    regionNode.unsetProcedure(this);
    ServerCrashProcedure.updateProgress(env, getParentProcId());
    return Flow.NO_MORE_STATE;
  }

  // The region is not OPEN: account for one more failed attempt.
  final int attempt = env.getAssignmentManager().getRegionStates().addToFailedOpen(regionNode).incrementAndGetRetries();
  final int attemptLimit = env.getAssignmentManager().getAssignMaxAttempts();
  LOG.info("Retry={} of max={}; {}; {}", attempt, attemptLimit, this, regionNode.toShortString());

  final boolean exhausted = attempt >= attemptLimit;
  env.getAssignmentManager().regionFailedOpen(regionNode, exhausted);
  if (exhausted) {
    setFailure(getClass().getSimpleName(), new RetriesExhaustedException("Max attempts " + env.getAssignmentManager().getAssignMaxAttempts() + " exceeded"));
    regionNode.unsetProcedure(this);
    return Flow.NO_MORE_STATE;
  }

  // We failed to assign the region, force a new plan.
  forceNewPlan = true;
  regionNode.setRegionLocation(null);
  setNextState(RegionStateTransitionState.REGION_STATE_TRANSITION_GET_ASSIGN_CANDIDATE);

  if (attempt <= env.getAssignmentManager().getAssignRetryImmediatelyMaxAttempts()) {
    // Do not throw here because we want the region to be online ASAP.
    return Flow.HAS_MORE_STATE;
  }
  // Throw to back off and retry when the open has failed too many times.
  throw new HBaseIOException("Failed confirm OPEN of " + regionNode + " (remote log may yield more detail on why).");
}
Aggregations