Search in sources :

Example 1 with SCMHAManagerImpl

use of org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl in project ozone by apache.

the class StorageContainerManager method initializeSystemManagers.

/**
 * Initializes the SCM system managers and their supporting components.
 * For every component that the configurator can supply, the configurator's
 * instance is used when it is non-null; otherwise a default instance is
 * constructed here.
 *
 * Configurator-overridable components, in initialization order:
 *  Network Topology (clusterMap)
 *  SCM HA Manager
 *  SCM Context
 *  Node Manager
 *  Pipeline Manager
 *  Container Manager
 *  Writable Container Factory
 *  Block Manager
 *  Replication Manager
 *  Safe Mode Manager
 *
 * Components always constructed here (no configurator override):
 *  Sequence Id Generator
 *  Container placement metrics and placement policy
 *  Pipeline choose policy
 *  Node Decommission Manager
 *
 * The order of the statements matters: later components receive the
 * fields assigned by earlier statements as constructor arguments.
 *
 * @param conf - Ozone Configuration.
 * @param configurator - A customizer which allows different managers to be
 *                    used if needed.
 * @throws IOException - on Failure.
 */
private void initializeSystemManagers(OzoneConfiguration conf, SCMConfigurator configurator) throws IOException {
    // Network topology; handed to the node manager and the placement policy below.
    if (configurator.getNetworkTopology() != null) {
        clusterMap = configurator.getNetworkTopology();
    } else {
        clusterMap = new NetworkTopologyImpl(conf);
    }
    // This needs to be done before initializing Ratis.
    RatisDropwizardExports.registerRatisMetricReporters(ratisMetricsMap);
    // HA manager; passed to the sequence id generator, pipeline manager,
    // container manager and replication manager below.
    if (configurator.getSCMHAManager() != null) {
        scmHAManager = configurator.getSCMHAManager();
    } else {
        scmHAManager = new SCMHAManagerImpl(conf, this);
    }
    // inline upgrade for SequenceIdGenerator
    SequenceIdGenerator.upgradeToSequenceId(scmMetadataStore);
    // Distributed sequence id generator
    sequenceIdGen = new SequenceIdGenerator(conf, scmHAManager, scmMetadataStore.getSequenceIdTable());
    if (configurator.getScmContext() != null) {
        scmContext = configurator.getScmContext();
    } else {
        // When term equals SCMContext.INVALID_TERM, the isLeader() check
        // and getTermOfLeader() will always pass.
        long term = SCMHAUtils.isSCMHAEnabled(conf) ? 0 : SCMContext.INVALID_TERM;
        // non-leader of term 0, in safe mode, preCheck not completed.
        scmContext = new SCMContext.Builder().setLeader(false).setTerm(term).setIsInSafeMode(true).setIsPreCheckComplete(false).setSCM(this).build();
    }
    // Node manager; uses the cluster map and SCM context set up above.
    if (configurator.getScmNodeManager() != null) {
        scmNodeManager = configurator.getScmNodeManager();
    } else {
        scmNodeManager = new SCMNodeManager(conf, scmStorageConfig, eventQueue, clusterMap, scmContext, scmLayoutVersionManager);
    }
    // Placement metrics and policy are always created here; the policy is
    // later passed to the replication manager.
    placementMetrics = SCMContainerPlacementMetrics.create();
    containerPlacementPolicy = ContainerPlacementPolicyFactory.getPolicy(conf, scmNodeManager, clusterMap, true, placementMetrics);
    // Pipeline manager; uses the HA manager, node manager and SCM context.
    if (configurator.getPipelineManager() != null) {
        pipelineManager = configurator.getPipelineManager();
    } else {
        pipelineManager = PipelineManagerImpl.newPipelineManager(conf, scmHAManager, scmNodeManager, scmMetadataStore.getPipelineTable(), eventQueue, scmContext, serviceManager);
    }
    // Container manager; uses the sequence id generator and pipeline manager.
    if (configurator.getContainerManager() != null) {
        containerManager = configurator.getContainerManager();
    } else {
        containerManager = new ContainerManagerImpl(conf, scmHAManager, sequenceIdGen, pipelineManager, scmMetadataStore.getContainerTable());
    }
    pipelineChoosePolicy = PipelineChoosePolicyFactory.getPolicy(conf);
    // The default factory and block manager are constructed with a reference
    // to this SCM instance rather than with individual managers.
    if (configurator.getWritableContainerFactory() != null) {
        writableContainerFactory = configurator.getWritableContainerFactory();
    } else {
        writableContainerFactory = new WritableContainerFactory(this);
    }
    if (configurator.getScmBlockManager() != null) {
        scmBlockManager = configurator.getScmBlockManager();
    } else {
        scmBlockManager = new BlockManagerImpl(conf, this);
    }
    // Replication manager; built with a UTC-based MonotonicClock and the
    // move table from the metadata store.
    if (configurator.getReplicationManager() != null) {
        replicationManager = configurator.getReplicationManager();
    } else {
        replicationManager = new ReplicationManager(conf, containerManager, containerPlacementPolicy, eventQueue, scmContext, serviceManager, scmNodeManager, new MonotonicClock(ZoneOffset.UTC), scmHAManager, getScmMetadataStore().getMoveTable());
    }
    // Safe mode manager; receives the container list obtained from the
    // container manager at this point.
    if (configurator.getScmSafeModeManager() != null) {
        scmSafeModeManager = configurator.getScmSafeModeManager();
    } else {
        scmSafeModeManager = new SCMSafeModeManager(conf, containerManager.getContainers(), containerManager, pipelineManager, eventQueue, serviceManager, scmContext);
    }
    // Always constructed here; depends on the node, container and
    // replication managers assigned above.
    scmDecommissionManager = new NodeDecommissionManager(conf, scmNodeManager, containerManager, scmContext, eventQueue, replicationManager);
}
Also used : ReplicationManager(org.apache.hadoop.hdds.scm.container.ReplicationManager) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) SCMSafeModeManager(org.apache.hadoop.hdds.scm.safemode.SCMSafeModeManager) ContainerManagerImpl(org.apache.hadoop.hdds.scm.container.ContainerManagerImpl) NetworkTopologyImpl(org.apache.hadoop.hdds.scm.net.NetworkTopologyImpl) SCMContext(org.apache.hadoop.hdds.scm.ha.SCMContext) SCMNodeManager(org.apache.hadoop.hdds.scm.node.SCMNodeManager) SequenceIdGenerator(org.apache.hadoop.hdds.scm.ha.SequenceIdGenerator) WritableContainerFactory(org.apache.hadoop.hdds.scm.pipeline.WritableContainerFactory) NodeDecommissionManager(org.apache.hadoop.hdds.scm.node.NodeDecommissionManager) MonotonicClock(org.apache.hadoop.ozone.common.MonotonicClock) BlockManagerImpl(org.apache.hadoop.hdds.scm.block.BlockManagerImpl)

Example 2 with SCMHAManagerImpl

use of org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallCorruptedCheckpointFailure.

/**
 * Installs a deliberately corrupted leader checkpoint on a paused follower
 * and checks that the failure is logged and the follower state machine
 * stays paused. Afterwards the follower is reinitialized from an intact
 * backup of the same checkpoint and its last applied term index is
 * compared with the one recorded in the checkpoint.
 *
 * @throws Exception if any cluster, checkpoint or file operation fails.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
    StorageContainerManager leaderSCM = getLeader(cluster);
    // Find the inactive SCM
    String followerId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager followerSCM = cluster.getSCM(followerId);
    // Do some transactions so that the log index increases
    writeToIncreaseLogIndex(leaderSCM, 100);
    File oldDBLocation = followerSCM.getScmMetadataStore().getStore().getDbLocation();
    SCMStateMachine followerSM = followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
    TermIndex termIndex = followerSM.getLastAppliedTermIndex();
    DBCheckpoint leaderDbCheckpoint = leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
    Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
    // Validate the location before it is used to read the transaction info.
    Assert.assertNotNull(leaderCheckpointLocation);
    TransactionInfo leaderCheckpointTrxnInfo = HAUtils.getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation, new SCMDBDefinition());
    // Take a backup of the current DB
    String dbBackupName = "SCM_CHECKPOINT_BACKUP" + termIndex.getIndex() + "_" + System.currentTimeMillis();
    File dbDir = oldDBLocation.getParentFile();
    File checkpointBackup = new File(dbDir, dbBackupName);
    // Take a backup of the leader checkpoint
    FileUtils.copyDirectory(leaderCheckpointLocation.toFile(), checkpointBackup, false);
    // Corrupt the leader checkpoint by deleting every other .sst file and
    // install that on the follower. The operation should fail and the
    // follower should shut down.
    File[] checkpointFiles = leaderCheckpointLocation.toFile().listFiles();
    // listFiles() returns null on an I/O error or when the path is not a
    // directory; fail with a clear message instead of a NullPointerException.
    Assert.assertNotNull("Unable to list files in " + leaderCheckpointLocation, checkpointFiles);
    boolean delete = true;
    for (File file : checkpointFiles) {
        if (file.getName().contains(".sst")) {
            if (delete) {
                // A silently failed delete would leave the checkpoint intact.
                Assert.assertTrue("Failed to delete " + file, file.delete());
            }
            delete = !delete;
        }
    }
    SCMHAManagerImpl scmhaManager = (SCMHAManagerImpl) (followerSCM.getScmHAManager());
    GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.ERROR);
    GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
    // Replace the exit manager with a dummy one for the duration of the test.
    scmhaManager.setExitManagerForTesting(new DummyExitManager());
    followerSM.pause();
    scmhaManager.installCheckpoint(leaderCheckpointLocation, leaderCheckpointTrxnInfo);
    Assert.assertTrue(logCapture.getOutput().contains("Failed to reload SCM state and instantiate services."));
    Assert.assertTrue(followerSM.getLifeCycleState().isPausingOrPaused());
    // Verify correct reloading from the intact backup
    followerSM.setInstallingDBCheckpoint(new RocksDBCheckpoint(checkpointBackup.toPath()));
    followerSM.reinitialize();
    // Expected value first, actual value second.
    Assert.assertEquals(leaderCheckpointTrxnInfo.getTermIndex(), followerSM.getLastAppliedTermIndex());
}
Also used : Path(java.nio.file.Path) StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) TermIndex(org.apache.ratis.server.protocol.TermIndex) SCMDBDefinition(org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition) TransactionInfo(org.apache.hadoop.hdds.utils.TransactionInfo) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) File(java.io.File) Test(org.junit.jupiter.api.Test)

Example 3 with SCMHAManagerImpl

use of org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallOldCheckpointFailure.

/**
 * Moves a follower's applied index past the leader's, then tries to install
 * the leader's (now older) checkpoint on it. The install must not return a
 * new term index, the follower must log that it reloaded its old state, and
 * its last applied term index and running state must be unchanged.
 *
 * @throws Exception if any cluster or metadata-store operation fails.
 */
@Test
public void testInstallOldCheckpointFailure() throws Exception {
    // Get the leader SCM
    StorageContainerManager leader = getLeader(cluster);
    // Find the inactive SCM, start it and take it out of safe mode
    String inactiveId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager follower = cluster.getSCM(inactiveId);
    cluster.startInactiveSCM(inactiveId);
    follower.exitSafeMode();

    DBCheckpoint oldCheckpoint = leader.getScmMetadataStore().getStore().getCheckpoint(false);
    SCMStateMachine leaderStateMachine = leader.getScmHAManager().getRatisServer().getSCMStateMachine();
    TermIndex leaderApplied = leaderStateMachine.getLastAppliedTermIndex();
    SCMStateMachine followerStateMachine = follower.getScmHAManager().getRatisServer().getSCMStateMachine();

    // Record a transaction index 100 ahead of the leader in the follower's DB
    long term = leaderApplied.getTerm();
    long aheadIndex = leaderApplied.getIndex() + 100;
    TransactionInfo aheadTrxnInfo = TransactionInfo.builder()
        .setCurrentTerm(term)
        .setTransactionIndex(aheadIndex)
        .build();
    follower.getScmMetadataStore().getTransactionInfoTable().put(OzoneConsts.TRANSACTION_INFO_KEY, aheadTrxnInfo);
    // Advance the follower's state machine to the same index
    followerStateMachine.notifyTermIndexUpdated(term, aheadIndex);

    GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.INFO);
    GenericTestUtils.LogCapturer capturedLogs = GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());

    // Install the old checkpoint on the follower. This should fail as the
    // follower is already ahead of that transactionLogIndex and the
    // state should be reloaded.
    TermIndex appliedBeforeInstall = followerStateMachine.getLastAppliedTermIndex();
    SCMHAManagerImpl followerHAManager = (SCMHAManagerImpl) (follower.getScmHAManager());
    TermIndex installedTermIndex = null;
    try {
        installedTermIndex = followerHAManager.installCheckpoint(oldCheckpoint);
    } catch (IOException expected) {
        // An IOException is an accepted outcome here; the assertions below
        // verify that nothing was installed.
    }

    String expectedLog = "Reloading old state of SCM";
    Assert.assertTrue(capturedLogs.getOutput().contains(expectedLog));
    Assert.assertNull(" installed checkpoint even though checkpoint " + "logIndex is less than it's lastAppliedIndex", installedTermIndex);
    Assert.assertEquals(appliedBeforeInstall, followerStateMachine.getLastAppliedTermIndex());
    Assert.assertFalse(followerStateMachine.getLifeCycleState().isPausingOrPaused());
}
Also used : StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) IOException(java.io.IOException) TermIndex(org.apache.ratis.server.protocol.TermIndex) Test(org.junit.jupiter.api.Test)

Aggregations

SCMHAManagerImpl (org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl)3 SCMStateMachine (org.apache.hadoop.hdds.scm.ha.SCMStateMachine)2 StorageContainerManager (org.apache.hadoop.hdds.scm.server.StorageContainerManager)2 DBCheckpoint (org.apache.hadoop.hdds.utils.db.DBCheckpoint)2 RocksDBCheckpoint (org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint)2 GenericTestUtils (org.apache.ozone.test.GenericTestUtils)2 TermIndex (org.apache.ratis.server.protocol.TermIndex)2 Test (org.junit.jupiter.api.Test)2 File (java.io.File)1 IOException (java.io.IOException)1 Path (java.nio.file.Path)1 BlockManagerImpl (org.apache.hadoop.hdds.scm.block.BlockManagerImpl)1 ContainerManagerImpl (org.apache.hadoop.hdds.scm.container.ContainerManagerImpl)1 ReplicationManager (org.apache.hadoop.hdds.scm.container.ReplicationManager)1 SCMContext (org.apache.hadoop.hdds.scm.ha.SCMContext)1 SequenceIdGenerator (org.apache.hadoop.hdds.scm.ha.SequenceIdGenerator)1 SCMDBDefinition (org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition)1 NetworkTopologyImpl (org.apache.hadoop.hdds.scm.net.NetworkTopologyImpl)1 NodeDecommissionManager (org.apache.hadoop.hdds.scm.node.NodeDecommissionManager)1 SCMNodeManager (org.apache.hadoop.hdds.scm.node.SCMNodeManager)1