use of org.apache.hadoop.hbase.ServerName in project hbase by apache.
the class MasterMetaBootstrap method getPreviouselyFailedMetaServersFromZK.
/**
 * Reads the children of the hbase:meta "recovering region" znode and parses each child name
 * into a {@link ServerName}.
 *
 * @return the parsed server names; an empty (never null) set when the child listing is null
 * @throws KeeperException if listing the znode children fails
 */
private Set<ServerName> getPreviouselyFailedMetaServersFromZK() throws KeeperException {
  final ZooKeeperWatcher zkw = master.getZooKeeper();
  final Set<ServerName> failedMetaServers = new HashSet<>();
  // <recovering-regions znode>/<encoded name of the first meta region>
  final String metaRecoveringZNode = ZKUtil.joinZNode(
      zkw.znodePaths.recoveringRegionsZNode,
      HRegionInfo.FIRST_META_REGIONINFO.getEncodedName());
  final List<String> childNames = ZKUtil.listChildrenNoWatch(zkw, metaRecoveringZNode);
  if (childNames != null) {
    for (String childName : childNames) {
      failedMetaServers.add(ServerName.parseServerName(childName));
    }
  }
  return failedMetaServers;
}
use of org.apache.hadoop.hbase.ServerName in project hbase by apache.
the class HMaster method move.
/**
 * Moves a region to another server.
 * <p>
 * When <code>destServerName</code> is null or empty the balancer picks a destination from the
 * list produced by the server manager; otherwise the named server is offered to the balancer as
 * the only candidate. The method returns without moving when no destination can be determined,
 * when an explicitly requested destination is this master and the balancer says the region
 * should not be on the master, when the region already sits on the destination, or when the
 * coprocessor pre-move hook returns true.
 *
 * @param encodedRegionName encoded name of the region to move
 * @param destServerName server name of the requested destination, or null/empty for a
 *   balancer-chosen one
 * @throws UnknownRegionException if no region state is known for <code>encodedRegionName</code>
 * @throws HBaseIOException for any I/O failure while performing the move (other
 *   {@link IOException}s are wrapped)
 */
// Public so can be accessed by tests.
@VisibleForTesting
public void move(final byte[] encodedRegionName, final byte[] destServerName) throws HBaseIOException {
  final RegionState regionState =
      assignmentManager.getRegionStates().getRegionState(Bytes.toString(encodedRegionName));
  if (regionState == null) {
    throw new UnknownRegionException(Bytes.toStringBinary(encodedRegionName));
  }
  final HRegionInfo hri = regionState.getRegion();

  // Build the candidate list first; the balancer makes the final pick in both cases.
  final boolean destinationUnspecified = destServerName == null || destServerName.length == 0;
  final List<ServerName> candidates;
  if (destinationUnspecified) {
    LOG.info("Passed destination servername is null/empty so " + "choosing a server at random");
    candidates = this.serverManager.createDestinationServersList(regionState.getServerName());
  } else {
    candidates = Lists.newArrayList(ServerName.valueOf(Bytes.toString(destServerName)));
  }

  final ServerName dest = balancer.randomAssignment(hri, candidates);
  if (dest == null) {
    LOG.debug("Unable to determine a plan to assign " + hri);
    return;
  }

  // Only an explicitly requested destination is checked against the master itself.
  if (!destinationUnspecified
      && dest.equals(serverName)
      && balancer instanceof BaseLoadBalancer
      && !((BaseLoadBalancer) balancer).shouldBeOnMaster(hri)) {
    // To avoid unnecessary region moving later by balancer. Don't put user
    // regions on master. Regions on master could be put on other region
    // server intentionally by test however.
    LOG.debug("Skipping move of region " + hri.getRegionNameAsString()
        + " to avoid unnecessary region moving later by load balancer,"
        + " because it should not be on master");
    return;
  }

  if (dest.equals(regionState.getServerName())) {
    LOG.debug("Skipping move of region " + hri.getRegionNameAsString()
        + " because region already assigned to the same server " + dest + ".");
    return;
  }

  // Now we can do the move
  final RegionPlan rp = new RegionPlan(hri, regionState.getServerName(), dest);
  try {
    checkInitialized();
    // A coprocessor may veto the move by returning true from preMove.
    if (this.cpHost != null && this.cpHost.preMove(hri, rp.getSource(), rp.getDestination())) {
      return;
    }
    // warmup the region on the destination before initiating the move. this call
    // is synchronous and takes some time. doing it before the source region gets
    // closed
    serverManager.sendRegionWarmup(rp.getDestination(), hri);
    LOG.info(getClientIdAuditPrefix() + " move " + rp + ", running balancer");
    this.assignmentManager.balance(rp);
    if (this.cpHost != null) {
      this.cpHost.postMove(hri, rp.getSource(), rp.getDestination());
    }
  } catch (HBaseIOException hioe) {
    // Already the declared exception type: propagate untouched.
    throw hioe;
  } catch (IOException ioe) {
    throw new HBaseIOException(ioe);
  }
}
use of org.apache.hadoop.hbase.ServerName in project hbase by apache.
the class AssignmentManager method assign.
/**
* Bulk assign regions to <code>destination</code>.
* <p>
* Outline of what the method does:
* <ol>
* <li>Acquires a lock per region (keyed by encoded name).</li>
* <li>Forces each eligible region to offline and builds a {@link RegionPlan} for it; regions
* that cannot take part in the bulk assign have their lock released right away and are either
* skipped or queued for individual assignment.</li>
* <li>Marks the planned regions PENDING_OPEN and sends one open request for all of them to
* <code>destination</code>, retrying on selected exceptions.</li>
* <li>Releases the remaining locks, individually assigns regions that failed to open and are
* not online, then waits for the non-system-table regions to be assigned.</li>
* </ol>
* The elapsed time is reported to the assignment-manager metrics on every exit path.
* @param destination server that every region in <code>regions</code> is asked to open on
* @param regions Regions to assign.
* @return true if <code>regions</code> is empty or the open request was sent and processed
* (even if some regions were handed over to individual assignment or are still in
* transition after waiting); false if the server was stopped before the request was sent,
* the destination reported that it was shut down, or an I/O failure prevented talking to
* the destination
* @throws InterruptedException if the thread is interrupted while sleeping between retries
* (other calls made here may raise it as well)
*/
boolean assign(final ServerName destination, final List<HRegionInfo> regions) throws InterruptedException {
long startTime = EnvironmentEdgeManager.currentTime();
try {
int regionCount = regions.size();
// Nothing to do for an empty list; report success.
if (regionCount == 0) {
return true;
}
LOG.info("Assigning " + regionCount + " region(s) to " + destination.toString());
Set<String> encodedNames = new HashSet<>(regionCount);
for (HRegionInfo region : regions) {
encodedNames.add(region.getEncodedName());
}
// Regions collected here are assigned one by one after the locks are released.
List<HRegionInfo> failedToOpenRegions = new ArrayList<>();
// Locks still present in this map when the try block exits are unlocked in the finally below.
Map<String, Lock> locks = locker.acquireLocks(encodedNames);
try {
Map<String, RegionPlan> plans = new HashMap<>(regionCount);
List<RegionState> states = new ArrayList<>(regionCount);
for (HRegionInfo region : regions) {
String encodedName = region.getEncodedName();
if (!isDisabledorDisablingRegionInRIT(region)) {
RegionState state = forceRegionStateToOffline(region, false);
boolean onDeadServer = false;
if (state != null) {
if (regionStates.wasRegionOnDeadServer(encodedName)) {
LOG.info("Skip assigning " + region.getRegionNameAsString() + ", it's host " + regionStates.getLastRegionServerOfRegion(encodedName) + " is dead but not processed yet");
onDeadServer = true;
} else {
// Happy path: the region joins the bulk assign and keeps its lock
// (the continue skips the lock release at the bottom of the loop body).
RegionPlan plan = new RegionPlan(region, state.getServerName(), destination);
plans.put(encodedName, plan);
states.add(state);
continue;
}
}
// Reassign if the region wasn't on a dead server
// (reached with onDeadServer == false only when state is null).
if (!onDeadServer) {
LOG.info("failed to force region state to offline, " + "will reassign later: " + region);
// assign individually later
failedToOpenRegions.add(region);
}
}
// Release the lock, this region is excluded from bulk assign because
// we can't update its state, or set its znode to offline.
// Removing it from the map also keeps the finally block from unlocking it a second time.
Lock lock = locks.remove(encodedName);
lock.unlock();
}
// Give up before sending anything if the server has been stopped meanwhile.
if (server.isStopped()) {
return false;
}
// Add region plans, so we can updateTimers when one region is opened so
// that unnecessary timeout on RIT is reduced.
this.addPlans(plans);
// One (region, favored nodes) pair per planned region, in the same order as 'states'.
List<Pair<HRegionInfo, List<ServerName>>> regionOpenInfos = new ArrayList<>(states.size());
for (RegionState state : states) {
HRegionInfo region = state.getRegion();
regionStates.updateRegionState(region, State.PENDING_OPEN, destination);
List<ServerName> favoredNodes = ServerName.EMPTY_SERVER_LIST;
if (shouldAssignFavoredNodes(region)) {
favoredNodes = server.getFavoredNodesManager().getFavoredNodesWithDNPort(region);
}
regionOpenInfos.add(new Pair<>(region, favoredNodes));
}
// Move on to open regions.
try {
// Send OPEN RPC. If it fails on a IOE or RemoteException,
// regions will be assigned individually.
Configuration conf = server.getConfiguration();
// Deadline used only for the ServerNotRunningYetException case below.
long maxWaitTime = System.currentTimeMillis() + conf.getLong("hbase.regionserver.rpc.startup.waittime", 60000);
// At most maximumAttempts counted attempts; some failures below decrement i so that
// they do not count against that limit.
for (int i = 1; i <= maximumAttempts && !server.isStopped(); i++) {
try {
List<RegionOpeningState> regionOpeningStateList = serverManager.sendRegionOpen(destination, regionOpenInfos);
// Results are matched to requests by index.
for (int k = 0, n = regionOpeningStateList.size(); k < n; k++) {
RegionOpeningState openingState = regionOpeningStateList.get(k);
if (openingState != RegionOpeningState.OPENED) {
HRegionInfo region = regionOpenInfos.get(k).getFirst();
LOG.info("Got opening state " + openingState + ", will reassign later: " + region);
// Failed opening this region, reassign it later
forceRegionStateToOffline(region, true);
failedToOpenRegions.add(region);
}
}
// The request went through; leave the retry loop.
break;
} catch (IOException e) {
// Look at the underlying exception when it was wrapped as a RemoteException.
if (e instanceof RemoteException) {
e = ((RemoteException) e).unwrapRemoteException();
}
if (e instanceof RegionServerStoppedException) {
LOG.warn("The region server was shut down, ", e);
// No need to retry, the region server is a goner.
return false;
} else if (e instanceof ServerNotRunningYetException) {
long now = System.currentTimeMillis();
// Past the deadline this falls through to the 'throw e' at the end of the catch.
if (now < maxWaitTime) {
if (LOG.isDebugEnabled()) {
LOG.debug("Server is not yet up; waiting up to " + (maxWaitTime - now) + "ms", e);
}
Thread.sleep(100);
// reset the try count
i--;
continue;
}
} else if (e instanceof java.net.SocketTimeoutException && this.serverManager.isServerOnline(destination)) {
// open the region on the same server.
if (LOG.isDebugEnabled()) {
LOG.debug("Bulk assigner openRegion() to " + destination + " has timed out, but the regions might" + " already be opened on it.", e);
}
// wait and reset the re-try count, server might be just busy.
Thread.sleep(100);
i--;
continue;
} else if (e instanceof FailedServerException && i < maximumAttempts) {
// In case the server is in the failed server list, no point to
// retry too soon. Retry after the failed_server_expiry time
// (this retry does count against maximumAttempts: i is not decremented).
long sleepTime = 1 + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT);
if (LOG.isDebugEnabled()) {
LOG.debug(destination + " is on failed server list; waiting " + sleepTime + "ms", e);
}
Thread.sleep(sleepTime);
continue;
}
// Anything not handled above is passed to the outer catch.
throw e;
}
}
} catch (IOException e) {
// Can be a socket timeout, EOF, NoRouteToHost, etc
LOG.info("Unable to communicate with " + destination + " in order to assign regions, ", e);
// Put every planned region back to offline before reporting failure.
for (RegionState state : states) {
HRegionInfo region = state.getRegion();
forceRegionStateToOffline(region, true);
}
return false;
}
} finally {
// Unlock whatever was not already released inside the per-region loop.
for (Lock lock : locks.values()) {
lock.unlock();
}
}
// Individually assign the regions that could not be bulk-assigned, unless they are
// online by now.
if (!failedToOpenRegions.isEmpty()) {
for (HRegionInfo region : failedToOpenRegions) {
if (!regionStates.isRegionOnline(region)) {
invokeAssign(region);
}
}
}
// wait for assignment completion
// Only regions of non-system tables are waited for.
ArrayList<HRegionInfo> userRegionSet = new ArrayList<>(regions.size());
for (HRegionInfo region : regions) {
if (!region.getTable().isSystemTable()) {
userRegionSet.add(region);
}
}
// A false result is only logged; the method still returns true.
if (!waitForAssignment(userRegionSet, true, userRegionSet.size(), System.currentTimeMillis())) {
LOG.debug("some user regions are still in transition: " + userRegionSet);
}
LOG.debug("Bulk assigning done for " + destination);
return true;
} finally {
// Record the elapsed time on every exit path, including early returns and exceptions.
metricsAssignmentManager.updateBulkAssignTime(EnvironmentEdgeManager.currentTime() - startTime);
}
}
use of org.apache.hadoop.hbase.ServerName in project hbase by apache.
the class AssignmentManager method processRegionsInTransition.
/**
 * Walks the regions found in transition at startup and nudges each one forward according to its
 * current state. Regions whose recorded server is known but not online are skipped.
 *
 * @param regionsInTransition region states to process
 */
void processRegionsInTransition(Collection<RegionState> regionsInTransition) {
  for (RegionState rit : regionsInTransition) {
    LOG.info("Processing " + rit);

    // A null server name is not treated as "offline": such regions fall through to the
    // state handling below.
    final ServerName hostingServer = rit.getServerName();
    final boolean hostKnownButOffline =
        hostingServer != null && !serverManager.getOnlineServers().containsKey(hostingServer);
    if (hostKnownButOffline) {
      LOG.info("Server " + hostingServer + " isn't online. SSH will handle this");
      // SSH will handle it
      continue;
    }

    final HRegionInfo hri = rit.getRegion();
    final RegionState.State currentState = rit.getState();
    switch (currentState) {
      case PENDING_OPEN:
        // Send the open request again.
        retrySendRegionOpen(rit);
        break;
      case PENDING_CLOSE:
        // Send the close request again.
        retrySendRegionClose(rit);
        break;
      case CLOSED:
        invokeAssign(hri);
        break;
      case FAILED_OPEN:
      case FAILED_CLOSE:
        invokeUnAssign(hri);
        break;
      default:
        // No process for other states
        break;
    }
  }
}
use of org.apache.hadoop.hbase.ServerName in project hbase by apache.
the class AssignmentManager method cleanOutCrashedServerReferences.
/**
 * Removes every trace of a crashed server from the assignment bookkeeping: region plans that
 * target it are dropped, and the regions reported for it by the region states are re-examined
 * one by one under their region lock.
 *
 * @param sn Server that went down.
 * @return the regions in transition on this server that are left for the caller after
 *   filtering, each one marked OFFLINE
 */
public List<HRegionInfo> cleanOutCrashedServerReferences(final ServerName sn) {
  // Clean out any existing assignment plans for this server
  synchronized (this.regionPlans) {
    final Iterator<Map.Entry<String, RegionPlan>> planIter =
        this.regionPlans.entrySet().iterator();
    while (planIter.hasNext()) {
      final ServerName plannedDest = planIter.next().getValue().getDestination();
      // The name will be null if the region is planned for a random assign.
      if (plannedDest != null && plannedDest.equals(sn)) {
        // Use iterator's remove else we'll get CME
        planIter.remove();
      }
    }
  }

  final List<HRegionInfo> rits = regionStates.serverOffline(sn);
  final Iterator<HRegionInfo> ritIter = rits.iterator();
  while (ritIter.hasNext()) {
    final HRegionInfo hri = ritIter.next();
    final String encodedName = hri.getEncodedName();
    // We need a lock on the region as we could update it
    final Lock regionLock = locker.acquireLock(encodedName);
    try {
      final RegionState regionState = regionStates.getRegionTransitionState(encodedName);
      // Not ours to handle when there is no transition state, the region is recorded on a
      // different server, or it is in none of the states listed below.
      final boolean noLongerRelevant = regionState == null
          || (regionState.getServerName() != null && !regionState.isOnServer(sn))
          || !RegionStates.isOneOfStates(regionState, State.PENDING_OPEN, State.OPENING,
              State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE);
      if (noLongerRelevant) {
        LOG.info("Skip " + regionState + " since it is not opening/failed_close"
            + " on the dead server any more: " + sn);
        ritIter.remove();
      } else if (tableStateManager.isTableState(hri.getTable(), TableState.State.DISABLED,
          TableState.State.DISABLING)) {
        // Table is disabled or disabling: take the region offline and drop it from the result.
        regionStates.regionOffline(hri);
        ritIter.remove();
      } else {
        // Mark the region offline and assign it again by SSH
        regionStates.updateRegionState(hri, State.OFFLINE);
      }
    } finally {
      regionLock.unlock();
    }
  }
  return rits;
}
Aggregations