Search in sources :

Example 1 with ClaimReplicationQueuesProcedure

Use of org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure in project hbase by apache.

From the class ServerCrashProcedure, the method executeFromState:

/**
 * Drives one step of the server-crash recovery state machine.
 * <p>
 * Each invocation handles exactly one {@link ServerCrashState}, schedules any child
 * procedures (WAL splitting, region assignment, replication-queue claiming), sets the
 * next state, and returns {@code Flow.HAS_MORE_STATE} until SERVER_CRASH_FINISH
 * returns {@code Flow.NO_MORE_STATE}.
 *
 * @param env   master procedure environment giving access to master services and the
 *              assignment manager
 * @param state the state to execute in this step
 * @return {@code Flow.NO_MORE_STATE} once the crashed server has been fully processed,
 *         otherwise {@code Flow.HAS_MORE_STATE}
 * @throws ProcedureSuspendedException if hbase:meta is not yet loaded and this
 *         procedure must wait (see the meta-wait switch below)
 * @throws ProcedureYieldException declared by the framework contract; not thrown
 *         directly in this body
 */
@Override
protected Flow executeFromState(MasterProcedureEnv env, ServerCrashState state) throws ProcedureSuspendedException, ProcedureYieldException {
    final MasterServices services = env.getMasterServices();
    final AssignmentManager am = env.getAssignmentManager();
    updateProgress(true);
    // Server gets removed from processing list below on procedure successful finish.
    if (!notifiedDeadServer) {
        // One-shot latch: mark that we have observed/announced this dead server.
        // NOTE(review): in the full upstream source there is additional notification
        // logic here; this snippet only flips the flag.
        notifiedDeadServer = true;
    }
    // Gate: every state EXCEPT the meta-handling states requires hbase:meta to be
    // loaded first. The meta states must be allowed through, since they are the ones
    // that bring meta back online after a meta-carrying server crashes.
    switch(state) {
        case SERVER_CRASH_START:
        case SERVER_CRASH_SPLIT_META_LOGS:
        case SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR:
        case SERVER_CRASH_ASSIGN_META:
            break;
        default:
            // If hbase:meta is not assigned, yield.
            // waitMetaLoaded presumably returns true when meta is not yet loaded and
            // this procedure has been queued for wake-up — TODO confirm against
            // AssignmentManager javadoc.
            if (env.getAssignmentManager().waitMetaLoaded(this)) {
                throw new ProcedureSuspendedException();
            }
    }
    try {
        switch(state) {
            case SERVER_CRASH_START:
                LOG.info("Start " + this);
                // If carrying meta, process it first. Else, get list of regions on crashed server.
                if (this.carryingMeta) {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_META_LOGS);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
                }
                break;
            case SERVER_CRASH_SPLIT_META_LOGS:
                // Two WAL-splitting strategies: legacy ZK-coordinated splitting (done
                // inline here) vs. procedure-based splitting via a child procedure.
                if (env.getMasterConfiguration().getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
                    zkCoordinatedSplitMetaLogs(env);
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
                } else {
                    // Mark meta-WAL splitting in progress before scheduling the child
                    // splitting procedure; cleared in the DELETE_SPLIT_META_WALS_DIR state.
                    am.getRegionStates().metaLogSplitting(serverName);
                    addChildProcedure(createSplittingWalProcedures(env, true));
                    setNextState(ServerCrashState.SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR);
                }
                break;
            case SERVER_CRASH_DELETE_SPLIT_META_WALS_DIR:
                // If the child splitting procedure finished, advance; otherwise loop
                // back to SPLIT_META_LOGS to retry the split.
                if (isSplittingDone(env, true)) {
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN_META);
                    am.getRegionStates().metaLogSplit(serverName);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_META_LOGS);
                }
                break;
            case SERVER_CRASH_ASSIGN_META:
                // Reassign the (single) meta region before anything else can proceed.
                assignRegions(env, Arrays.asList(RegionInfoBuilder.FIRST_META_REGIONINFO));
                setNextState(ServerCrashState.SERVER_CRASH_GET_REGIONS);
                break;
            case SERVER_CRASH_GET_REGIONS:
                this.regionsOnCrashedServer = getRegionsOnCrashedServer(env);
                // if we should do distributed log splitting.
                if (regionsOnCrashedServer != null) {
                    LOG.info("{} had {} regions", serverName, regionsOnCrashedServer.size());
                    if (LOG.isTraceEnabled()) {
                        this.regionsOnCrashedServer.stream().forEach(ri -> LOG.trace(ri.getShortNameToLog()));
                    }
                }
                // shouldSplitWal is false e.g. when WALs were already handled; skip
                // straight to assignment in that case.
                if (!this.shouldSplitWal) {
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
                }
                break;
            case SERVER_CRASH_SPLIT_LOGS:
                // Same ZK-vs-procedure fork as SERVER_CRASH_SPLIT_META_LOGS, but for
                // the non-meta user-region WALs.
                if (env.getMasterConfiguration().getBoolean(HBASE_SPLIT_WAL_COORDINATED_BY_ZK, DEFAULT_HBASE_SPLIT_COORDINATED_BY_ZK)) {
                    zkCoordinatedSplitLogs(env);
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
                } else {
                    am.getRegionStates().logSplitting(this.serverName);
                    addChildProcedure(createSplittingWalProcedures(env, false));
                    setNextState(ServerCrashState.SERVER_CRASH_DELETE_SPLIT_WALS_DIR);
                }
                break;
            case SERVER_CRASH_DELETE_SPLIT_WALS_DIR:
                // Retry loop mirroring DELETE_SPLIT_META_WALS_DIR; also removes the
                // now-finished split directory.
                if (isSplittingDone(env, false)) {
                    cleanupSplitDir(env);
                    setNextState(ServerCrashState.SERVER_CRASH_ASSIGN);
                    am.getRegionStates().logSplit(this.serverName);
                } else {
                    setNextState(ServerCrashState.SERVER_CRASH_SPLIT_LOGS);
                }
                break;
            case SERVER_CRASH_ASSIGN:
                // Filter changes this.regionsOnCrashedServer.
                if (filterDefaultMetaRegions()) {
                    if (LOG.isTraceEnabled()) {
                        LOG.trace("Assigning regions " + RegionInfo.getShortNameToLog(regionsOnCrashedServer) + ", " + this + "; cycles=" + getCycles());
                    }
                    assignRegions(env, regionsOnCrashedServer);
                }
                setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
                break;
            case SERVER_CRASH_HANDLE_RIT2:
                // Noop. Left in place because we used to call handleRIT here for a second time
                // but no longer necessary since HBASE-20634.
                setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
                break;
            case SERVER_CRASH_CLAIM_REPLICATION_QUEUES:
                // Hand the crashed server's replication queues to live servers via a
                // child procedure so no queued edits are lost.
                addChildProcedure(new ClaimReplicationQueuesProcedure(serverName));
                setNextState(ServerCrashState.SERVER_CRASH_FINISH);
                break;
            case SERVER_CRASH_FINISH:
                LOG.info("removed crashed server {} after splitting done", serverName);
                services.getAssignmentManager().getRegionStates().removeServer(serverName);
                updateProgress(true);
                return Flow.NO_MORE_STATE;
            default:
                throw new UnsupportedOperationException("unhandled state=" + state);
        }
    } catch (IOException e) {
        // Intentional swallow: IOExceptions are treated as retryable; the framework
        // re-executes this state on the next cycle.
        LOG.warn("Failed state=" + state + ", retry " + this + "; cycles=" + getCycles(), e);
    }
    return Flow.HAS_MORE_STATE;
}
Also used : ClaimReplicationQueuesProcedure(org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure) AssignmentManager(org.apache.hadoop.hbase.master.assignment.AssignmentManager) MasterServices(org.apache.hadoop.hbase.master.MasterServices) DoNotRetryIOException(org.apache.hadoop.hbase.DoNotRetryIOException) IOException(java.io.IOException) ProcedureSuspendedException(org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)

Example 2 with ClaimReplicationQueuesProcedure

Use of org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure in project hbase by apache.

From the class TestClaimReplicationQueue, the method testClaim:

/**
 * Verifies that a crashed region server's replication queues are claimed by a
 * surviving server: the ClaimReplicationQueuesProcedure first parks in
 * WAITING_TIMEOUT while no claimer is available, then completes once replication
 * is re-enabled, after which all pre-crash data reaches the peer cluster.
 */
@Test
public void testClaim() throws Exception {
    // Pause replication on both peers so edits pile up in the queues.
    hbaseAdmin.disableReplicationPeer(PEER_ID2);
    hbaseAdmin.disableReplicationPeer(PEER_ID3);
    // Load rows that must eventually replicate despite the crash.
    int rowsTable1 = UTIL1.loadTable(htable1, famName);
    int rowsTable3 = UTIL1.loadTable(table3, famName);
    // EMPTY presumably makes the test's server manager report no claim targets
    // — defined elsewhere in this test class; confirm there.
    EMPTY = true;
    UTIL1.getMiniHBaseCluster().stopRegionServer(0).join();
    UTIL1.getMiniHBaseCluster().startRegionServer();
    // since there is no active region server to get the replication queue, the procedure should be
    // in WAITING_TIMEOUT state for most time to retry
    HMaster master = UTIL1.getMiniHBaseCluster().getMaster();
    UTIL1.waitFor(30000, () -> master.getProcedures().stream().anyMatch(proc -> proc instanceof ClaimReplicationQueuesProcedure && proc.getState() == ProcedureState.WAITING_TIMEOUT));
    // Unblock claiming and resume replication.
    hbaseAdmin.enableReplicationPeer(PEER_ID2);
    hbaseAdmin.enableReplicationPeer(PEER_ID3);
    EMPTY = false;
    // wait until the SCP finished, ClaimReplicationQueuesProcedure is a sub procedure of SCP
    UTIL1.waitFor(30000, () -> master.getProcedures().stream().noneMatch(proc -> proc instanceof ServerCrashProcedure && !proc.isSuccess()));
    // Every row written before the crash must show up on the peer tables.
    waitForReplication(htable2, rowsTable1, NB_RETRIES);
    waitForReplication(table4, rowsTable3, NB_RETRIES);
}
Also used : TableName(org.apache.hadoop.hbase.TableName) AfterClass(org.junit.AfterClass) BeforeClass(org.junit.BeforeClass) ProcedureState(org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState) HBaseClassTestRule(org.apache.hadoop.hbase.HBaseClassTestRule) LargeTests(org.apache.hadoop.hbase.testclassification.LargeTests) IOException(java.io.IOException) Test(org.junit.Test) Category(org.junit.experimental.categories.Category) ServerManager(org.apache.hadoop.hbase.master.ServerManager) Procedure(org.apache.hadoop.hbase.procedure2.Procedure) ClaimReplicationQueuesProcedure(org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure) List(java.util.List) HConstants(org.apache.hadoop.hbase.HConstants) Configuration(org.apache.hadoop.conf.Configuration) MasterServices(org.apache.hadoop.hbase.master.MasterServices) Table(org.apache.hadoop.hbase.client.Table) ReplicationTests(org.apache.hadoop.hbase.testclassification.ReplicationTests) ServerCrashProcedure(org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure) Closeables(org.apache.hbase.thirdparty.com.google.common.io.Closeables) ClassRule(org.junit.ClassRule) Collections(java.util.Collections) HMaster(org.apache.hadoop.hbase.master.HMaster) ServerName(org.apache.hadoop.hbase.ServerName) ClaimReplicationQueuesProcedure(org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure) HMaster(org.apache.hadoop.hbase.master.HMaster) ServerCrashProcedure(org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure) Test(org.junit.Test)

Aggregations

IOException (java.io.IOException)2 MasterServices (org.apache.hadoop.hbase.master.MasterServices)2 ClaimReplicationQueuesProcedure (org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure)2 Collections (java.util.Collections)1 List (java.util.List)1 Configuration (org.apache.hadoop.conf.Configuration)1 DoNotRetryIOException (org.apache.hadoop.hbase.DoNotRetryIOException)1 HBaseClassTestRule (org.apache.hadoop.hbase.HBaseClassTestRule)1 HConstants (org.apache.hadoop.hbase.HConstants)1 ServerName (org.apache.hadoop.hbase.ServerName)1 TableName (org.apache.hadoop.hbase.TableName)1 Table (org.apache.hadoop.hbase.client.Table)1 HMaster (org.apache.hadoop.hbase.master.HMaster)1 ServerManager (org.apache.hadoop.hbase.master.ServerManager)1 AssignmentManager (org.apache.hadoop.hbase.master.assignment.AssignmentManager)1 ServerCrashProcedure (org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure)1 Procedure (org.apache.hadoop.hbase.procedure2.Procedure)1 ProcedureSuspendedException (org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException)1 ProcedureState (org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState)1 LargeTests (org.apache.hadoop.hbase.testclassification.LargeTests)1