Use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.
The class RegionRemoteProcedureBase, method execute.
@Override
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
    throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
  RegionStateNode regionNode = getRegionNode(env);
  regionNode.lock();
  try {
    switch (state) {
      case REGION_REMOTE_PROCEDURE_DISPATCH: {
        // The code which wakes us up also needs to lock the RSN, so here we do not need to
        // synchronize on the event.
        ProcedureEvent<?> event = regionNode.getProcedureEvent();
        try {
          env.getRemoteDispatcher().addOperationToNode(targetServer, this);
        } catch (FailedRemoteDispatchException e) {
          LOG.warn("Can not add remote operation {} for region {} to server {}, this is usually " +
            "because the server is already dead, give up and mark the procedure as complete, " +
            "the parent procedure will take care of this.", this, region, targetServer, e);
          unattach(env);
          return null;
        }
        event.suspend();
        event.suspendIfNotReady(this);
        throw new ProcedureSuspendedException();
      }
      case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED:
        env.getAssignmentManager().persistToMeta(regionNode);
        unattach(env);
        return null;
      case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL:
        // The remote call failed, so we do not need to change the region state; just return.
        unattach(env);
        return null;
      case REGION_REMOTE_PROCEDURE_SERVER_CRASH:
        env.getAssignmentManager().regionClosedAbnormally(regionNode);
        unattach(env);
        return null;
      default:
        throw new IllegalStateException("Unknown state: " + state);
    }
  } catch (IOException e) {
    if (retryCounter == null) {
      retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
    }
    long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
    LOG.warn("Failed updating meta, suspend {} secs {}; {}", backoff / 1000, this, regionNode, e);
    setTimeout(Math.toIntExact(backoff));
    setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
    skipPersistence();
    throw new ProcedureSuspendedException();
  } finally {
    regionNode.unlock();
  }
}
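The catch block above only parks the procedure in WAITING_TIMEOUT; the timeout machinery is what retries it. Procedures that use this idiom pair it with a setTimeoutFailure override that treats the firing timer as a wake-up rather than a failure. A minimal sketch of that override, following the pattern used elsewhere in hbase (it is not part of the excerpt above):

@Override
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
  // The backoff timer fired: do not fail the procedure, make it runnable
  // again so execute() is retried from the same state.
  setState(ProcedureProtos.ProcedureState.RUNNABLE);
  env.getProcedureScheduler().addFront(this);
  // Returning false tells the framework the timeout was handled here.
  return false;
}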
Use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.
The class ClaimReplicationQueuesProcedure, method execute.
@Override
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env)
    throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
  ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
  try {
    List<String> queues = storage.getAllQueues(crashedServer);
    // Queues for the legacy region replica replication peer are removed directly instead of
    // being claimed, as the peer may still be used by region servers which have not been
    // upgraded yet.
    for (Iterator<String> iter = queues.iterator(); iter.hasNext(); ) {
      ReplicationQueueInfo queue = new ReplicationQueueInfo(iter.next());
      if (queue.getPeerId().equals(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER)) {
        LOG.info("Found replication queue {} for legacy region replication peer, " +
          "skip claiming it and remove it directly...", queue.getQueueId());
        iter.remove();
        storage.removeQueue(crashedServer, queue.getQueueId());
      }
    }
    if (queues.isEmpty()) {
      LOG.debug("Finish claiming replication queues for {}", crashedServer);
      storage.removeReplicatorIfQueueIsEmpty(crashedServer);
      // we are done
      return null;
    }
    LOG.debug("There are {} replication queues that need to be claimed for {}", queues.size(),
      crashedServer);
    List<ServerName> targetServers =
      env.getMasterServices().getServerManager().getOnlineServersList();
    if (targetServers.isEmpty()) {
      throw new ReplicationException("no region server available");
    }
    Collections.shuffle(targetServers);
    ClaimReplicationQueueRemoteProcedure[] procs =
      new ClaimReplicationQueueRemoteProcedure[Math.min(queues.size(), targetServers.size())];
    for (int i = 0; i < procs.length; i++) {
      procs[i] = new ClaimReplicationQueueRemoteProcedure(crashedServer, queues.get(i),
        targetServers.get(i));
    }
    return procs;
  } catch (ReplicationException e) {
    if (retryCounter == null) {
      retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
    }
    long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
    LOG.warn("Failed to claim replication queues for {}, suspend {} secs", crashedServer,
      backoff / 1000, e);
    setTimeout(Math.toIntExact(backoff));
    setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
    skipPersistence();
    throw new ProcedureSuspendedException();
  }
}
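Note the pairing logic: with Q unclaimed queues and S online servers, only min(Q, S) child procedures are created, queue i going to shuffled server i. Leftover queues are not lost: in the procedure-v2 framework a plain Procedure whose execute() returns children is re-executed once those children finish, and this method only returns null after getAllQueues() comes back empty. A hypothetical standalone helper that captures just the pairing (illustrative, not hbase code):

// Hypothetical illustration of the queue-to-server pairing above.
static <Q, S> Map<Q, S> pairFirstRound(List<Q> queues, List<S> servers) {
  Map<Q, S> assignment = new LinkedHashMap<>();
  for (int i = 0; i < Math.min(queues.size(), servers.size()); i++) {
    assignment.put(queues.get(i), servers.get(i)); // queue i -> server i
  }
  return assignment; // any queue beyond servers.size() waits for the next round
}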
Use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.
The class ServerCrashProcedure, method executeFromState.
@Override
protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state)
    throws ProcedureSuspendedException, ProcedureYieldException {
  final MasterServices services = env.getMasterServices();
  final AssignmentManager am = env.getAssignmentManager();
  updateProgress(true);
  // Server gets removed from processing list below on procedure successful finish.
  if (!notifiedDeadServer) {
    notifiedDeadServer = true;
  }
  switch (state) {
    case SERVER_CRASH_START:
    case SERVER_CRASH_SPLIT_META_LOGS:
    case SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR:
    case SERVER_CRASH_ASSIGN_META:
      break;
    default:
      // If hbase:meta is not loaded yet, suspend until it is.
      if (env.getAssignmentManager().waitMetaLoaded(this)) {
        throw new ProcedureSuspendedException();
      }
  }
  try {
    switch (state) {
      case SERVER_CRASH_START:
        LOG.info("Start " + this);
        // If carrying meta, process it first. Else, get list of regions on crashed server.
        if (this.carryingMeta) {
          setNextState(ServerCrashState.SERVER_CRASH_SPLIT_META_LOGS);
        } else {
          setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
        }
        break;
      case SERVER_CRASH_SPLIT_META_LOGS:
        if (env.getMasterConfiguration().getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK,
          DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
          zkCoordinatedSplitMetaLogs(env);
          setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
        } else {
          am.getRegionStates().metaLogSplitting(serverName);
          addChildProcedure(createSplittingWalProcedures(env, true));
          setNextState(ServerCrashState.SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR);
        }
        break;
      case SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR:
        if (isSplittingDone(env, true)) {
          setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
          am.getRegionStates().metaLogSplit(serverName);
        } else {
          setNextState(ServerCrashState.SERVER_CRASH_SPLIT_META_LOGS);
        }
        break;
      case SERVER_CRASH_ASSIGN_META:
        assignRegions(env, Arrays.asList(RegionInfoBuilder.FIRST_META_REGIONINFO));
        setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
        break;
      case SERVER_CRASH_GET_REGIONS:
        this.regionsOnCrashedServer = getRegionsOnCrashedServer(env);
        // Where to go next depends on whether we should split logs at all or
        // if we should do distributed log splitting.
        if (regionsOnCrashedServer != null) {
          LOG.info("{} had {} regions", serverName, regionsOnCrashedServer.size());
          if (LOG.isTraceEnabled()) {
            this.regionsOnCrashedServer.stream().forEach(ri -> LOG.trace(ri.getShortNameToLog()));
          }
        }
        if (!this.shouldSplitWal) {
          setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
        } else {
          setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
        }
        break;
      case SERVER_CRASH_SPLIT_LOGS:
        if (env.getMasterConfiguration().getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK,
          DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
          zkCoordinatedSplitLogs(env);
          setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
        } else {
          am.getRegionStates().logSplitting(this.serverName);
          addChildProcedure(createSplittingWalProcedures(env, false));
          setNextState(ServerCrashState.SERVER_CRASH_DELETE_SPLIT_WALS_DIR);
        }
        break;
      case SERVER_CRASH_DELETE_SPLIT_WALS_DIR:
        if (isSplittingDone(env, false)) {
          cleanupSplitDir(env);
          setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
          am.getRegionStates().logSplit(this.serverName);
        } else {
          setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
        }
        break;
      case SERVER_CRASH_ASSIGN:
        // Filter changes this.regionsOnCrashedServer.
        if (filterDefaultMetaRegions()) {
          if (LOG.isTraceEnabled()) {
            LOG.trace("Assigning regions " + RegionInfo.getShortNameToLog(regionsOnCrashedServer) +
              ", " + this + "; cycles=" + getCycles());
          }
          assignRegions(env, regionsOnCrashedServer);
        }
        setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
        break;
      case SERVER_CRASH_HANDLE_RIT2:
        // Noop. Left in place because we used to call handleRIT here for a second time,
        // but it is no longer necessary since HBASE-20634.
        setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
        break;
      case SERVER_CRASH_CLAIM_REPLICATION_QUEUES:
        addChildProcedure(new ClaimReplicationQueuesProcedure(serverName));
        setNextState(ServerCrashState.SERVER_CRASH_FINISH);
        break;
      case SERVER_CRASH_FINISH:
        LOG.info("Removed crashed server {} after splitting done", serverName);
        services.getAssignmentManager().getRegionStates().removeServer(serverName);
        updateProgress(true);
        return Flow.NO_MORE_STATE;
      default:
        throw new UnsupportedOperationException("unhandled state=" + state);
    }
  } catch (IOException e) {
    LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + getCycles(), e);
  }
  return Flow.HAS_MORE_STATE;
}
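Unlike the timed-backoff idiom in the other snippets, the IOException handler here just logs and falls through to return Flow.HAS_MORE_STATE, so the framework re-runs the same state (setNextState was never reached). A stripped-down sketch of the StateMachineProcedure shape this method implements; DemoProcedure and DemoState are hypothetical, and serializeStateData/deserializeStateData are omitted for brevity:

public class DemoProcedure extends StateMachineProcedure<MasterProcedureEnv, DemoProcedure.DemoState> {

  public enum DemoState { STEP_ONE, STEP_TWO }

  @Override
  protected Flow executeFromState(MasterProcedureEnv env, DemoState state) {
    switch (state) {
      case STEP_ONE:
        // ... do the first step's work; on success move on ...
        setNextState(DemoState.STEP_TWO);
        return Flow.HAS_MORE_STATE; // schedule another executeFromState call
      case STEP_TWO:
        // ... do the second step's work ...
        return Flow.NO_MORE_STATE; // procedure is complete
      default:
        throw new UnsupportedOperationException("unhandled state=" + state);
    }
  }

  @Override
  protected void rollbackState(MasterProcedureEnv env, DemoState state) {
    // No-op rollback for the sketch; real procedures undo the state's work here.
  }

  @Override
  protected DemoState getState(int stateId) {
    return DemoState.values()[stateId];
  }

  @Override
  protected int getStateId(DemoState state) {
    return state.ordinal();
  }

  @Override
  protected DemoState getInitialState() {
    return DemoState.STEP_ONE;
  }
}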
Use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.
The class SplitWALProcedure, method executeFromState.
@Override
protected Flow executeFromState(MasterProcedureEnv env, MasterProcedureProtos.SplitWALState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
  SplitWALManager splitWALManager = env.getMasterServices().getSplitWALManager();
  switch (state) {
    case ACQUIRE_SPLIT_WAL_WORKER:
      worker = splitWALManager.acquireSplitWALWorker(this);
      setNextState(MasterProcedureProtos.SplitWALState.DISPATCH_WAL_TO_WORKER);
      return Flow.HAS_MORE_STATE;
    case DISPATCH_WAL_TO_WORKER:
      assert worker != null;
      addChildProcedure(new SplitWALRemoteProcedure(worker, crashedServer, walPath));
      setNextState(MasterProcedureProtos.SplitWALState.RELEASE_SPLIT_WORKER);
      return Flow.HAS_MORE_STATE;
    case RELEASE_SPLIT_WORKER:
      boolean finished;
      try {
        finished = splitWALManager.isSplitWALFinished(walPath);
      } catch (IOException ioe) {
        if (retryCounter == null) {
          retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
        }
        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
        LOG.warn("Failed to check whether splitting of wal {} succeeded, wait {} seconds to retry",
          walPath, backoff / 1000, ioe);
        setTimeout(Math.toIntExact(backoff));
        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
        skipPersistence();
        throw new ProcedureSuspendedException();
      }
      splitWALManager.releaseSplitWALWorker(worker, env.getProcedureScheduler());
      if (!finished) {
        LOG.warn("Failed to split wal {} by server {}, retry...", walPath, worker);
        setNextState(MasterProcedureProtos.SplitWALState.ACQUIRE_SPLIT_WAL_WORKER);
        return Flow.HAS_MORE_STATE;
      }
      ServerCrashProcedure.updateProgress(env, getParentProcId());
      return Flow.NO_MORE_STATE;
    default:
      throw new UnsupportedOperationException("unhandled state=" + state);
  }
}
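acquireSplitWALWorker can itself throw ProcedureSuspendedException: when no split-WAL worker is free, the procedure is parked until a worker is released. That is a third suspension style in this listing, waiting on a resource rather than on a timer or a region event. A sketch of that shape, with illustrative names that are not the real SplitWALManager internals:

// Illustrative resource-wait suspension; tryPickFreeWorker and
// suspendEvent are hypothetical stand-ins for the manager's internals.
public ServerName acquireWorker(Procedure<?> proc) throws ProcedureSuspendedException {
  Optional<ServerName> free = tryPickFreeWorker();
  if (free.isPresent()) {
    return free.get(); // a worker is available, proceed immediately
  }
  // No free worker: queue this procedure on the event and give the
  // executor thread back; releasing a worker wakes the waiters.
  suspendEvent.suspendIfNotReady(proc);
  throw new ProcedureSuspendedException();
}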
Use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.
The class SwitchRpcThrottleProcedure, method executeFromState.
@Override
protected Flow executeFromState(MasterProcedureEnv env, SwitchRpcThrottleState state)
    throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
  switch (state) {
    case UPDATE_SWITCH_RPC_THROTTLE_STORAGE:
      try {
        switchThrottleState(env, rpcThrottleEnabled);
      } catch (IOException e) {
        if (retryCounter == null) {
          retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
        }
        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
        LOG.warn("Failed to store rpc throttle value {}, sleep {} secs and retry",
          rpcThrottleEnabled, backoff / 1000, e);
        setTimeout(Math.toIntExact(backoff));
        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
        skipPersistence();
        throw new ProcedureSuspendedException();
      }
      setNextState(SwitchRpcThrottleState.SWITCH_RPC_THROTTLE_ON_RS);
      return Flow.HAS_MORE_STATE;
    case SWITCH_RPC_THROTTLE_ON_RS:
      SwitchRpcThrottleRemoteProcedure[] subProcedures =
        env.getMasterServices().getServerManager().getOnlineServersList().stream()
          .map(sn -> new SwitchRpcThrottleRemoteProcedure(sn, rpcThrottleEnabled))
          .toArray(SwitchRpcThrottleRemoteProcedure[]::new);
      addChildProcedure(subProcedures);
      setNextState(SwitchRpcThrottleState.POST_SWITCH_RPC_THROTTLE);
      return Flow.HAS_MORE_STATE;
    case POST_SWITCH_RPC_THROTTLE:
      ProcedurePrepareLatch.releaseLatch(syncLatch, this);
      return Flow.NO_MORE_STATE;
    default:
      throw new UnsupportedOperationException("unhandled state=" + state);
  }
}
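Four of the five snippets above repeat the same retry tail. A hypothetical helper that captures the idiom in one place (a sketch, not part of hbase; it assumes a procedure class with retryCounter and LOG fields):

// Hypothetical helper: suspend with exponential backoff, retry later.
private void suspendForRetry(MasterProcedureEnv env, String what, Throwable cause)
    throws ProcedureSuspendedException {
  if (retryCounter == null) {
    retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
  }
  long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
  LOG.warn("{} failed, suspend {} secs and retry", what, backoff / 1000, cause);
  setTimeout(Math.toIntExact(backoff));
  setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
  // The procedure will retry from the same persisted state, so skip the
  // procedure-store write for this transient state change.
  skipPersistence();
  throw new ProcedureSuspendedException();
}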