Use of org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl in the Apache Ozone project.
Class StorageContainerManager, method initializeSystemManagers.
/**
 * Wires up the SCM subsystems in dependency order. For every component that
 * the configurator can supply, the configurator's instance is used when it is
 * non-null; otherwise a default implementation is constructed here.
 *
 * Components that honour a configurator override, in initialization order:
 * network topology, SCM HA manager, SCM context, node manager,
 * pipeline manager, container manager, writable container factory,
 * block manager, replication manager and safe mode manager.
 *
 * Components that are always created here: the sequence id generator,
 * container placement metrics and policy, the pipeline choose policy and
 * the node decommission manager.
 *
 * @param conf - Ozone Configuration.
 * @param configurator - A customizer which allows different managers to be
 * used if needed.
 * @throws IOException - on Failure.
 */
private void initializeSystemManagers(OzoneConfiguration conf, SCMConfigurator configurator) throws IOException {
  // Cluster topology.
  if (configurator.getNetworkTopology() == null) {
    clusterMap = new NetworkTopologyImpl(conf);
  } else {
    clusterMap = configurator.getNetworkTopology();
  }

  // The Ratis metric reporters have to be registered before Ratis itself
  // is initialized.
  RatisDropwizardExports.registerRatisMetricReporters(ratisMetricsMap);

  // HA manager.
  if (configurator.getSCMHAManager() == null) {
    scmHAManager = new SCMHAManagerImpl(conf, this);
  } else {
    scmHAManager = configurator.getSCMHAManager();
  }

  // Run the inline upgrade first, then create the distributed sequence id
  // generator on top of the sequence id table.
  SequenceIdGenerator.upgradeToSequenceId(scmMetadataStore);
  sequenceIdGen = new SequenceIdGenerator(
      conf, scmHAManager, scmMetadataStore.getSequenceIdTable());

  // SCM context.
  if (configurator.getScmContext() == null) {
    // With SCMContext.INVALID_TERM the isLeader() check and
    // getTermOfLeader() always pass; with HA enabled start at term 0.
    long initialTerm;
    if (SCMHAUtils.isSCMHAEnabled(conf)) {
      initialTerm = 0;
    } else {
      initialTerm = SCMContext.INVALID_TERM;
    }
    // Initial state: not leader, in safe mode, preCheck not completed.
    scmContext = new SCMContext.Builder()
        .setLeader(false)
        .setTerm(initialTerm)
        .setIsInSafeMode(true)
        .setIsPreCheckComplete(false)
        .setSCM(this)
        .build();
  } else {
    scmContext = configurator.getScmContext();
  }

  // Node manager.
  if (configurator.getScmNodeManager() == null) {
    scmNodeManager = new SCMNodeManager(
        conf, scmStorageConfig, eventQueue, clusterMap, scmContext,
        scmLayoutVersionManager);
  } else {
    scmNodeManager = configurator.getScmNodeManager();
  }

  // Container placement metrics and policy (no configurator override).
  placementMetrics = SCMContainerPlacementMetrics.create();
  containerPlacementPolicy = ContainerPlacementPolicyFactory.getPolicy(
      conf, scmNodeManager, clusterMap, true, placementMetrics);

  // Pipeline manager.
  if (configurator.getPipelineManager() == null) {
    pipelineManager = PipelineManagerImpl.newPipelineManager(
        conf, scmHAManager, scmNodeManager,
        scmMetadataStore.getPipelineTable(), eventQueue, scmContext,
        serviceManager);
  } else {
    pipelineManager = configurator.getPipelineManager();
  }

  // Container manager.
  if (configurator.getContainerManager() == null) {
    containerManager = new ContainerManagerImpl(
        conf, scmHAManager, sequenceIdGen, pipelineManager,
        scmMetadataStore.getContainerTable());
  } else {
    containerManager = configurator.getContainerManager();
  }

  pipelineChoosePolicy = PipelineChoosePolicyFactory.getPolicy(conf);

  // Writable container factory.
  if (configurator.getWritableContainerFactory() == null) {
    writableContainerFactory = new WritableContainerFactory(this);
  } else {
    writableContainerFactory = configurator.getWritableContainerFactory();
  }

  // Block manager.
  if (configurator.getScmBlockManager() == null) {
    scmBlockManager = new BlockManagerImpl(conf, this);
  } else {
    scmBlockManager = configurator.getScmBlockManager();
  }

  // Replication manager.
  if (configurator.getReplicationManager() == null) {
    replicationManager = new ReplicationManager(
        conf, containerManager, containerPlacementPolicy, eventQueue,
        scmContext, serviceManager, scmNodeManager,
        new MonotonicClock(ZoneOffset.UTC), scmHAManager,
        getScmMetadataStore().getMoveTable());
  } else {
    replicationManager = configurator.getReplicationManager();
  }

  // Safe mode manager.
  if (configurator.getScmSafeModeManager() == null) {
    scmSafeModeManager = new SCMSafeModeManager(
        conf, containerManager.getContainers(), containerManager,
        pipelineManager, eventQueue, serviceManager, scmContext);
  } else {
    scmSafeModeManager = configurator.getScmSafeModeManager();
  }

  // Decommission manager (no configurator override).
  scmDecommissionManager = new NodeDecommissionManager(
      conf, scmNodeManager, containerManager, scmContext, eventQueue,
      replicationManager);
}
Use of org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl in the Apache Ozone project.
Class TestSCMInstallSnapshotWithHA, method testInstallCorruptedCheckpointFailure.
/**
 * Installs a deliberately corrupted leader checkpoint on a follower SCM and
 * verifies that the failure is logged and the follower state machine stays
 * paused. Afterwards an intact backup of the same checkpoint is installed
 * through the state machine and the follower's last applied term index is
 * expected to match the checkpoint's.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
  StorageContainerManager leaderSCM = getLeader(cluster);
  // Find the inactive SCM
  String followerId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager followerSCM = cluster.getSCM(followerId);
  // Do some transactions so that the log index increases
  writeToIncreaseLogIndex(leaderSCM, 100);
  File oldDBLocation = followerSCM.getScmMetadataStore().getStore().getDbLocation();
  SCMStateMachine followerSM = followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
  TermIndex termIndex = followerSM.getLastAppliedTermIndex();
  DBCheckpoint leaderDbCheckpoint = leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
  Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
  // Validate the location before it is used for the first time.
  Assert.assertNotNull(leaderCheckpointLocation);
  TransactionInfo leaderCheckpointTrxnInfo = HAUtils.getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation, new SCMDBDefinition());
  // The backup directory lives next to the follower's current DB.
  String dbBackupName = "SCM_CHECKPOINT_BACKUP" + termIndex.getIndex() + "_" + System.currentTimeMillis();
  File dbDir = oldDBLocation.getParentFile();
  File checkpointBackup = new File(dbDir, dbBackupName);
  // Take a backup of the leader checkpoint
  FileUtils.copyDirectory(leaderCheckpointLocation.toFile(), checkpointBackup, false);
  // Corrupt the leader checkpoint and install that on the follower. The
  // operation should fail and should shutdown.
  // File.listFiles() returns null on an I/O error or a non-directory; fail
  // with a clear message instead of an NPE in the loop below.
  File[] checkpointFiles = leaderCheckpointLocation.toFile().listFiles();
  Assert.assertNotNull("Unable to list files in leader checkpoint " + leaderCheckpointLocation, checkpointFiles);
  // Delete every other .sst file, starting with the first one found.
  boolean delete = true;
  for (File file : checkpointFiles) {
    if (file.getName().contains(".sst")) {
      if (delete) {
        // If the delete silently failed the checkpoint would not actually
        // be corrupted, invalidating the premise of this test.
        Assert.assertTrue("Failed to delete " + file, file.delete());
      }
      delete = !delete;
    }
  }
  SCMHAManagerImpl scmhaManager = (SCMHAManagerImpl) (followerSCM.getScmHAManager());
  GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.ERROR);
  GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
  // NOTE(review): DummyExitManager presumably keeps the expected failure
  // from terminating the test JVM - confirm against its definition.
  scmhaManager.setExitManagerForTesting(new DummyExitManager());
  followerSM.pause();
  scmhaManager.installCheckpoint(leaderCheckpointLocation, leaderCheckpointTrxnInfo);
  Assert.assertTrue(logCapture.getOutput().contains("Failed to reload SCM state and instantiate services."));
  Assert.assertTrue(followerSM.getLifeCycleState().isPausingOrPaused());
  // Verify correct reloading
  followerSM.setInstallingDBCheckpoint(new RocksDBCheckpoint(checkpointBackup.toPath()));
  followerSM.reinitialize();
  // JUnit convention: expected value first, actual value second.
  Assert.assertEquals(leaderCheckpointTrxnInfo.getTermIndex(), followerSM.getLastAppliedTermIndex());
}
Use of org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl in the Apache Ozone project.
Class TestSCMInstallSnapshotWithHA, method testInstallOldCheckpointFailure.
/**
 * Moves a follower SCM's applied index 100 entries past the leader's, then
 * tries to install the leader's (now older) checkpoint on it. The install
 * must not return a new term index, the "Reloading old state of SCM" message
 * must be logged, and the follower must keep its term index and must not be
 * left paused.
 */
@Test
public void testInstallOldCheckpointFailure() throws Exception {
  // Locate the leader SCM and the currently inactive SCM, then bring the
  // latter up as the follower under test.
  StorageContainerManager leaderSCM = getLeader(cluster);
  String followerId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager followerSCM = cluster.getSCM(followerId);
  cluster.startInactiveSCM(followerId);
  followerSCM.exitSafeMode();

  // Snapshot the leader and remember where its state machine currently is.
  DBCheckpoint leaderDbCheckpoint =
      leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
  SCMStateMachine leaderSM =
      leaderSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
  TermIndex lastTermIndex = leaderSM.getLastAppliedTermIndex();
  SCMStateMachine followerSM =
      followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();

  // Advance the follower: same term as the leader, index 100 ahead, both in
  // the persisted transaction info and in the state machine.
  long leaderTerm = lastTermIndex.getTerm();
  long aheadIndex = lastTermIndex.getIndex() + 100;
  TransactionInfo aheadTrxnInfo = TransactionInfo.builder()
      .setCurrentTerm(leaderTerm)
      .setTransactionIndex(aheadIndex)
      .build();
  followerSCM.getScmMetadataStore().getTransactionInfoTable()
      .put(OzoneConsts.TRANSACTION_INFO_KEY, aheadTrxnInfo);
  followerSM.notifyTermIndexUpdated(leaderTerm, aheadIndex);

  GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.INFO);
  GenericTestUtils.LogCapturer logCapture =
      GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());

  // Install the old checkpoint on the follower. This should fail because
  // the follower is already ahead of that transactionLogIndex, and the old
  // state should be reloaded.
  TermIndex followerTermIndex = followerSM.getLastAppliedTermIndex();
  SCMHAManagerImpl scmhaManager =
      (SCMHAManagerImpl) followerSCM.getScmHAManager();
  TermIndex newTermIndex = null;
  try {
    newTermIndex = scmhaManager.installCheckpoint(leaderDbCheckpoint);
  } catch (IOException ignored) {
    // An IOException is an acceptable outcome here; newTermIndex then stays
    // null, which is exactly what is asserted below.
  }

  String expectedLog = "Reloading old state of SCM";
  Assert.assertTrue(logCapture.getOutput().contains(expectedLog));
  Assert.assertNull(" installed checkpoint even though checkpoint " + "logIndex is less than it's lastAppliedIndex", newTermIndex);
  Assert.assertEquals(followerTermIndex, followerSM.getLastAppliedTermIndex());
  Assert.assertFalse(followerSM.getLifeCycleState().isPausingOrPaused());
}
Aggregations