Search in sources :

Example 6 with ProcedureSuspendedException

use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.

the class RegionRemoteProcedureBase method execute.

/**
 * Runs one step of this remote region procedure while holding the lock of the
 * region's state node.
 *
 * <p>In the dispatch state the operation is handed to the remote dispatcher for
 * {@code targetServer} and the procedure suspends on the node's procedure event.
 * Every other known state (and a failed dispatch) ends with {@code unattach(env)}
 * and a {@code null} return.
 *
 * @param env the master procedure environment
 * @return {@code null} whenever the procedure finishes a step without suspending
 * @throws ProcedureSuspendedException after a successful dispatch, or when an
 *         {@link IOException} forces a timed back-off before retrying
 * @throws IllegalStateException if {@code state} is not one of the handled values
 */
@Override
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
    RegionStateNode node = getRegionNode(env);
    node.lock();
    try {
        switch(state) {
            case REGION_REMOTE_PROCEDURE_DISPATCH:
                {
                    // Whoever wakes us up must also take the region state node lock, so
                    // there is no need to synchronize on the event here.
                    ProcedureEvent<?> wakeEvent = node.getProcedureEvent();
                    try {
                        env.getRemoteDispatcher().addOperationToNode(targetServer, this);
                    } catch (FailedRemoteDispatchException dispatchFailure) {
                        LOG.warn("Can not add remote operation {} for region {} to server {}, this usually " + "because the server is alread dead, give up and mark the procedure as complete, " + "the parent procedure will take care of this.", this, region, targetServer, dispatchFailure);
                        // Fall through to the shared unattach-and-finish tail below.
                        break;
                    }
                    wakeEvent.suspend();
                    wakeEvent.suspendIfNotReady(this);
                    throw new ProcedureSuspendedException();
                }
            case REGION_REMOTE_PROCEDURE_REPORT_SUCCEED:
                env.getAssignmentManager().persistToMeta(node);
                break;
            case REGION_REMOTE_PROCEDURE_DISPATCH_FAIL:
                // The remote call failed, so the region state is left untouched.
                break;
            case REGION_REMOTE_PROCEDURE_SERVER_CRASH:
                env.getAssignmentManager().regionClosedAbnormally(node);
                break;
            default:
                throw new IllegalStateException("Unknown state: " + state);
        }
        // Shared tail for every path that finishes the procedure step.
        unattach(env);
        return null;
    } catch (IOException e) {
        // Back off and retry later; the retry counter is created lazily on first failure.
        if (retryCounter == null) {
            retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
        }
        long waitMillis = retryCounter.getBackoffTimeAndIncrementAttempts();
        LOG.warn("Failed updating meta, suspend {}secs {}; {};", waitMillis / 1000, this, node, e);
        setTimeout(Math.toIntExact(waitMillis));
        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
        skipPersistence();
        throw new ProcedureSuspendedException();
    } finally {
        node.unlock();
    }
}
Also used : FailedRemoteDispatchException(org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException) ProcedureEvent(org.apache.hadoop.hbase.procedure2.ProcedureEvent) IOException(java.io.IOException) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)

Example 7 with ProcedureSuspendedException

use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.

the class ClaimReplicationQueuesProcedure method execute.

/**
 * Claims the replication queues of {@code crashedServer} by fanning them out to
 * online region servers.
 *
 * <p>Queues that belong to the legacy region-replica replication peer are removed
 * from storage instead of being claimed. If no queues remain, the replicator entry
 * is removed when empty and the procedure finishes. Otherwise one child procedure is
 * created per (queue, target server) pair, up to the smaller of the two list sizes.
 *
 * @param env the master procedure environment
 * @return the child procedures to run, or {@code null} when there is nothing left to claim
 * @throws ProcedureSuspendedException when a {@code ReplicationException} (including
 *         "no region server available") forces a timed back-off before retrying
 */
@Override
protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
    ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
    try {
        List<String> queues = storage.getAllQueues(crashedServer);
        // Drop queues of the legacy region replica replication peer rather than claiming
        // them. NOTE(review): the original comment here was truncated; it indicated the
        // peer itself is kept because it may still be used by region servers which have
        // not been upgraded yet -- confirm against the upstream source.
        for (Iterator<String> iter = queues.iterator(); iter.hasNext(); ) {
            ReplicationQueueInfo queue = new ReplicationQueueInfo(iter.next());
            if (queue.getPeerId().equals(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER)) {
                LOG.info("Found replication queue {} for legacy region replication peer, " + "skipping claiming and removing...", queue.getQueueId());
                iter.remove();
                storage.removeQueue(crashedServer, queue.getQueueId());
            }
        }
        if (queues.isEmpty()) {
            LOG.debug("Finish claiming replication queues for {}", crashedServer);
            storage.removeReplicatorIfQueueIsEmpty(crashedServer);
            // we are done
            return null;
        }
        LOG.debug("There are {} replication queues need to be claimed for {}", queues.size(), crashedServer);
        List<ServerName> targetServers = env.getMasterServices().getServerManager().getOnlineServersList();
        if (targetServers.isEmpty()) {
            // Handled by the catch below: back off and retry once servers are available.
            throw new ReplicationException("no region server available");
        }
        // Randomize which servers receive the queues.
        Collections.shuffle(targetServers);
        ClaimReplicationQueueRemoteProcedure[] procs = new ClaimReplicationQueueRemoteProcedure[Math.min(queues.size(), targetServers.size())];
        for (int i = 0; i < procs.length; i++) {
            procs[i] = new ClaimReplicationQueueRemoteProcedure(crashedServer, queues.get(i), targetServers.get(i));
        }
        return procs;
    } catch (ReplicationException e) {
        // The retry counter is created lazily on the first failure.
        if (retryCounter == null) {
            retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
        }
        long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
        // Exactly two placeholders for the two values; the exception is passed last so
        // the logger records it with its stack trace instead of formatting it inline.
        LOG.warn("Failed to claim replication queues for {}, suspend {}secs", crashedServer, backoff / 1000, e);
        setTimeout(Math.toIntExact(backoff));
        setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
        skipPersistence();
        throw new ProcedureSuspendedException();
    }
}
Also used : ReplicationQueueInfo(org.apache.hadoop.hbase.replication.ReplicationQueueInfo) ServerName(org.apache.hadoop.hbase.ServerName) ReplicationException(org.apache.hadoop.hbase.replication.ReplicationException) ReplicationQueueStorage(org.apache.hadoop.hbase.replication.ReplicationQueueStorage) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)

Example 8 with ProcedureSuspendedException

use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.

the class ServerCrashProcedure method executeFromState.

/**
 * Runs one step of the server-crash state machine for the given {@code state}.
 *
 * <p>For every state other than {@code SERVER_CRASH_START},
 * {@code SERVER_CRASH_SPLIT_META_LOGS}, {@code SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR}
 * and {@code SERVER_CRASH_ASSIGN_META}, the method first calls
 * {@code waitMetaLoaded(this)} on the assignment manager and suspends when that
 * returns {@code true}.
 *
 * <p>An {@link IOException} raised while handling a state is logged and the method
 * returns {@code Flow.HAS_MORE_STATE}.
 *
 * @param env the master procedure environment
 * @param state the state to execute
 * @return {@code Flow.NO_MORE_STATE} after {@code SERVER_CRASH_FINISH}; otherwise
 *         {@code Flow.HAS_MORE_STATE}
 * @throws ProcedureSuspendedException when {@code waitMetaLoaded(this)} returns {@code true}
 * @throws UnsupportedOperationException for a state not handled by the switch
 */
@Override
protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state) throws ProcedureSuspendedException, ProcedureYieldException {
    final MasterServices services = env.getMasterServices();
    final AssignmentManager am = env.getAssignmentManager();
    updateProgress(true);
    // Server gets removed from processing list below on procedure successful finish.
    // In this excerpt the flag is only flipped to true on the first pass; nothing else happens here.
    if (!notifiedDeadServer) {
        notifiedDeadServer = true;
    }
    switch(state) {
        // These states run without the waitMetaLoaded check below.
        case SERVER_CRASH_START:
        case SERVER_CRASH_SPLIT_META_LOGS:
        case SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR:
        case SERVER_CRASH_ASSIGN_META:
            break;
        default:
            // If hbase:meta is not assigned, yield.
            if (env.getAssignmentManager().waitMetaLoaded(this)) {
                throw new ProcedureSuspendedException();
            }
    }
    try {
        switch(state) {
            case SERVER_CRASH_START:
                LOG.info("Start " + this);
                // If carrying meta, process it first. Else, get list of regions on crashed server.
                if (this.carryingMeta) {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_META_LOGS);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
                }
                break;
            case SERVER_CRASH_SPLIT_META_LOGS:
                // Configuration decides between the ZK-coordinated split and child split procedures.
                if (env.getMasterConfiguration().getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
                    zkCoordinatedSplitMetaLogs(env);
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
                } else {
                    am.getRegionStates().metaLogSplitting(serverName);
                    addChildProcedure(createSplittingWalProcedures(env, true));
                    setNextState(ServerCrashState.SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR);
                }
                break;
            case SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR:
                // When splitting is not done yet, go back to the meta log split state.
                if (isSplittingDone(env, true)) {
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
                    am.getRegionStates().metaLogSplit(serverName);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_META_LOGS);
                }
                break;
            case SERVER_CRASH_ASSIGN_META:
                assignRegions(env, Arrays.asList(RegionInfoBuilder.FIRST_META_REGIONINFO));
                setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
                break;
            case SERVER_CRASH_GET_REGIONS:
                this.regionsOnCrashedServer = getRegionsOnCrashedServer(env);
                // Log the regions found (if any); the next state then depends on shouldSplitWal.
                if (regionsOnCrashedServer != null) {
                    LOG.info("{} had {} regions", serverName, regionsOnCrashedServer.size());
                    if (LOG.isTraceEnabled()) {
                        this.regionsOnCrashedServer.stream().forEach(ri -> LOG.trace(ri.getShortNameToLog()));
                    }
                }
                if (!this.shouldSplitWal) {
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
                }
                break;
            case SERVER_CRASH_SPLIT_LOGS:
                // Same choice as for meta logs, applied to the remaining logs.
                if (env.getMasterConfiguration().getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
                    zkCoordinatedSplitLogs(env);
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
                } else {
                    am.getRegionStates().logSplitting(this.serverName);
                    addChildProcedure(createSplittingWalProcedures(env, false));
                    setNextState(ServerCrashState.SERVER_CRASH_DELETE_SPLIT_WALS_DIR);
                }
                break;
            case SERVER_CRASH_DELETE_SPLIT_WALS_DIR:
                // When splitting is not done yet, go back to the log split state.
                if (isSplittingDone(env, false)) {
                    cleanupSplitDir(env);
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
                    am.getRegionStates().logSplit(this.serverName);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
                }
                break;
            case SERVER_CRASH_ASSIGN:
                // Filter changes this.regionsOnCrashedServer.
                if (filterDefaultMetaRegions()) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Assigning regions " + RegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this + "; cycles=" + getCycles());
                    }
                    assignRegions(env, regionsOnCrashedServer);
                }
                setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
                break;
            case SERVER_CRASH_HANDLE_RIT2:
                // Noop. Left in place because we used to call handleRIT here for a second time
                // but no longer necessary since HBASE-20634.
                setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
                break;
            case SERVER_CRASH_CLAIM_REPLICATION_QUEUES:
                addChildProcedure(new ClaimReplicationQueuesProcedure(serverName));
                setNextState(ServerCrashState.SERVER_CRASH_FINISH);
                break;
            case SERVER_CRASH_FINISH:
                LOG.info("removed crashed server {} after splitting done", serverName);
                services.getAssignmentManager().getRegionStates().removeServer(serverName);
                updateProgress(true);
                return Flow.NO_MORE_STATE;
            default:
                throw new UnsupportedOperationException("unhandled state=" + state);
        }
    } catch (IOException e) {
        // Log only; control falls through to the HAS_MORE_STATE return below.
        LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + getCycles(), e);
    }
    return Flow.HAS_MORE_STATE;
}
Also used : ClaimReplicationQueuesProcedure(org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure) AssignmentManager(org.apache.hadoop.hbase.master.assignment.AssignmentManager) MasterServices(org.apache.hadoop.hbase.master.MasterServices) DoNotRetryIOException(org.apache.hadoop.hbase.DoNotRetryIOException) IOException(java.io.IOException) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)

Example 9 with ProcedureSuspendedException

use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.

the class SplitWALProcedure method executeFromState.

/**
 * Runs one step of the split-WAL state machine: acquire a worker, dispatch the WAL
 * to it through a child procedure, then release the worker and check the outcome.
 *
 * @param env the master procedure environment
 * @param state the state to execute
 * @return {@code Flow.NO_MORE_STATE} once the split is reported finished; otherwise
 *         {@code Flow.HAS_MORE_STATE}
 * @throws ProcedureSuspendedException when checking the split result fails with an
 *         {@link IOException} and the procedure backs off before retrying
 * @throws UnsupportedOperationException for a state not handled by the switch
 */
@Override
protected Flow executeFromState(MasterProcedureEnv env, MasterProcedureProtos.SplitWALState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    SplitWALManager walManager = env.getMasterServices().getSplitWALManager();
    switch(state) {
        case ACQUIRE_SPLIT_WAL_WORKER: {
            worker = walManager.acquireSplitWALWorker(this);
            setNextState(MasterProcedureProtos.SplitWALState.DISPATCH_WAL_TO_WORKER);
            return Flow.HAS_MORE_STATE;
        }
        case DISPATCH_WAL_TO_WORKER: {
            assert worker != null;
            addChildProcedure(new SplitWALRemoteProcedure(worker, crashedServer, walPath));
            setNextState(MasterProcedureProtos.SplitWALState.RELEASE_SPLIT_WORKER);
            return Flow.HAS_MORE_STATE;
        }
        case RELEASE_SPLIT_WORKER: {
            boolean splitDone;
            try {
                splitDone = walManager.isSplitWALFinished(walPath);
            } catch (IOException checkFailure) {
                // The retry counter is created lazily on the first failure.
                if (retryCounter == null) {
                    retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
                }
                long waitMillis = retryCounter.getBackoffTimeAndIncrementAttempts();
                LOG.warn("Failed to check whether splitting wal {} success, wait {} seconds to retry", walPath, waitMillis / 1000, checkFailure);
                setTimeout(Math.toIntExact(waitMillis));
                setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
                skipPersistence();
                throw new ProcedureSuspendedException();
            }
            // The worker is released whether or not the split succeeded.
            walManager.releaseSplitWALWorker(worker, env.getProcedureScheduler());
            if (splitDone) {
                ServerCrashProcedure.updateProgress(env, getParentProcId());
                return Flow.NO_MORE_STATE;
            }
            // Not finished: start over by acquiring a worker again.
            LOG.warn("Failed to split wal {} by server {}, retry...", walPath, worker);
            setNextState(MasterProcedureProtos.SplitWALState.ACQUIRE_SPLIT_WAL_WORKER);
            return Flow.HAS_MORE_STATE;
        }
        default:
            throw new UnsupportedOperationException("unhandled state=" + state);
    }
}
Also used : SplitWALManager(org.apache.hadoop.hbase.master.SplitWALManager) IOException(java.io.IOException) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)

Example 10 with ProcedureSuspendedException

use of org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException in project hbase by apache.

the class SwitchRpcThrottleProcedure method executeFromState.

/**
 * Runs one step of the switch-RPC-throttle state machine: store the new throttle
 * value, schedule one remote child procedure per online server, then release the
 * sync latch.
 *
 * @param env the master procedure environment
 * @param state the state to execute
 * @return {@code Flow.NO_MORE_STATE} after the latch is released; otherwise
 *         {@code Flow.HAS_MORE_STATE}
 * @throws ProcedureSuspendedException when storing the throttle value fails with an
 *         {@link IOException} and the procedure backs off before retrying
 * @throws UnsupportedOperationException for a state not handled by the switch
 */
@Override
protected Flow executeFromState(MasterProcedureEnv env, SwitchRpcThrottleState state) throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
    switch(state) {
        case UPDATE_SWITCH_RPC_THROTTLE_STORAGE: {
            try {
                switchThrottleState(env, rpcThrottleEnabled);
            } catch (IOException storeFailure) {
                // The retry counter is created lazily on the first failure.
                if (retryCounter == null) {
                    retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
                }
                long waitMillis = retryCounter.getBackoffTimeAndIncrementAttempts();
                LOG.warn("Failed to store rpc throttle value {}, sleep {} secs and retry", rpcThrottleEnabled, waitMillis / 1000, storeFailure);
                setTimeout(Math.toIntExact(waitMillis));
                setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
                skipPersistence();
                throw new ProcedureSuspendedException();
            }
            setNextState(SwitchRpcThrottleState.SWITCH_RPC_THROTTLE_ON_RS);
            return Flow.HAS_MORE_STATE;
        }
        case SWITCH_RPC_THROTTLE_ON_RS: {
            // One remote child procedure for each server currently online.
            SwitchRpcThrottleRemoteProcedure[] perServerProcs = env.getMasterServices()
                .getServerManager()
                .getOnlineServersList()
                .stream()
                .map(server -> new SwitchRpcThrottleRemoteProcedure(server, rpcThrottleEnabled))
                .toArray(SwitchRpcThrottleRemoteProcedure[]::new);
            addChildProcedure(perServerProcs);
            setNextState(SwitchRpcThrottleState.POST_SWITCH_RPC_THROTTLE);
            return Flow.HAS_MORE_STATE;
        }
        case POST_SWITCH_RPC_THROTTLE: {
            ProcedurePrepareLatch.releaseLatch(syncLatch, this);
            return Flow.NO_MORE_STATE;
        }
        default:
            throw new UnsupportedOperationException("unhandled state=" + state);
    }
}
Also used : StateMachineProcedure(org.apache.hadoop.hbase.procedure2.StateMachineProcedure) Logger(org.slf4j.Logger) SwitchRpcThrottleState(org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.SwitchRpcThrottleState) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException) ProcedureProtos(org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos) RetryCounter(org.apache.hadoop.hbase.util.RetryCounter) LoggerFactory(org.slf4j.LoggerFactory) IOException(java.io.IOException) RpcThrottleStorage(org.apache.hadoop.hbase.quotas.RpcThrottleStorage) ProcedureUtil(org.apache.hadoop.hbase.procedure2.ProcedureUtil) InterfaceAudience(org.apache.yetus.audience.InterfaceAudience) ProcedureYieldException(org.apache.hadoop.hbase.procedure2.ProcedureYieldException) ProcedureStateSerializer(org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer) ServerName(org.apache.hadoop.hbase.ServerName) SwitchRpcThrottleStateData(org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.SwitchRpcThrottleStateData) IOException(java.io.IOException) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)

Aggregations

ProcedureSuspendedException (org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException): 11; IOException (java.io.IOException): 9; ServerName (org.apache.hadoop.hbase.ServerName): 5; ProcedureYieldException (org.apache.hadoop.hbase.procedure2.ProcedureYieldException): 4; Test (org.junit.Test): 3; ArrayList (java.util.ArrayList): 2; Configuration (org.apache.hadoop.conf.Configuration): 1; Path (org.apache.hadoop.fs.Path): 1; DoNotRetryIOException (org.apache.hadoop.hbase.DoNotRetryIOException): 1; HBaseIOException (org.apache.hadoop.hbase.HBaseIOException): 1; HRegionLocation (org.apache.hadoop.hbase.HRegionLocation): 1; RegionInfo (org.apache.hadoop.hbase.client.RegionInfo): 1; TableDescriptor (org.apache.hadoop.hbase.client.TableDescriptor): 1; MasterServices (org.apache.hadoop.hbase.master.MasterServices): 1; SplitWALManager (org.apache.hadoop.hbase.master.SplitWALManager): 1; AssignmentManager (org.apache.hadoop.hbase.master.assignment.AssignmentManager): 1; RegionStateNode (org.apache.hadoop.hbase.master.assignment.RegionStateNode): 1; TransitRegionStateProcedure (org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure): 1; ClaimReplicationQueuesProcedure (org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure): 1; FailedRemoteDispatchException (org.apache.hadoop.hbase.procedure2.FailedRemoteDispatchException): 1