Use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in the Apache Ozone project.
Class TestSCMInstallSnapshotWithHA, method testInstallSnapshot.
/**
 * Starts a previously inactive SCM after the leader has applied a batch of
 * transactions and checks that the follower catches up: its last applied
 * index reaches the target, its state machine is not paused, and every
 * container allocated while it was inactive is present in its container
 * table.
 */
@Test
public void testInstallSnapshot() throws Exception {
  // Log index the leader is driven past, and the follower must reach.
  final long targetLogIndex = 200;

  // Locate the current leader.
  StorageContainerManager leader = getLeader(cluster);
  Assert.assertNotNull(leader);

  // Locate the SCM that has not been started yet.
  String inactiveScmId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager follower = cluster.getSCM(inactiveScmId);

  // Allocate containers on the leader so that its log index grows.
  List<ContainerInfo> allocated =
      writeToIncreaseLogIndex(leader, targetLogIndex);

  // Start the inactive SCM. Install Snapshot will happen as part of the
  // setConfiguration() call to the ratis leader, after which the follower
  // catches up.
  cluster.startInactiveSCM(inactiveScmId);

  // The freshly started SCM is expected to lag behind the leader at first.
  SCMStateMachine followerStateMachine =
      follower.getScmHAManager().getRatisServer().getSCMStateMachine();

  // Poll until the follower has applied up to the target index; a timeout
  // error is raised if it does not catch up within 3s.
  GenericTestUtils.waitFor(
      () -> followerStateMachine.getLastAppliedTermIndex().getIndex()
          >= targetLogIndex,
      100, 3000);

  assertTrue(followerStateMachine.getLastAppliedTermIndex().getIndex()
      >= targetLogIndex);
  assertFalse(followerStateMachine.getLifeCycleState().isPausingOrPaused());

  // The follower's DB must contain the containers that were allocated
  // while it was inactive.
  SCMMetadataStore followerStore = follower.getScmMetadataStore();
  for (ContainerInfo container : allocated) {
    Assert.assertNotNull(
        followerStore.getContainerTable().get(container.containerID()));
  }
}
Use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in the Apache Ozone project.
Class TestSCMInstallSnapshot, method testInstallCheckPoint.
/**
 * Installs a downloaded DB checkpoint on a running SCM and verifies that the
 * SCM state is reloaded from it.
 *
 * <p>The checkpoint's transaction info is first rewritten to term 10 /
 * index 100. A pipeline and a container are then removed from the running
 * SCM, the state machine is paused, the checkpoint is installed and the
 * state machine is reinitialized. Afterwards the removed pipeline and
 * container must be visible again and the transaction index must be 100.
 */
@Test
public void testInstallCheckPoint() throws Exception {
  DBCheckpoint checkpoint = downloadSnapshot();
  StorageContainerManager scm = cluster.getStorageContainerManager();
  DBStore db = HAUtils.loadDB(conf,
      checkpoint.getCheckpointLocation().getParent().toFile(),
      checkpoint.getCheckpointLocation().getFileName().toString(),
      new SCMDBDefinition());
  Assert.assertNotNull(db);
  // Hack the transaction index in the checkpoint so as to ensure the
  // checkpointed transaction index is higher than when it was downloaded
  // from.
  try {
    HAUtils.getTransactionInfoTable(db, new SCMDBDefinition())
        .put(OzoneConsts.TRANSACTION_INFO_KEY,
            TransactionInfo.builder()
                .setCurrentTerm(10)
                .setTransactionIndex(100)
                .build());
  } finally {
    // Always release the checkpoint DB handle, even if the write fails, so
    // it is not left open for the rest of the test run.
    db.close();
  }

  // Remove one container and one pipeline from the live SCM so that their
  // reappearance proves the checkpoint was actually loaded.
  ContainerID cid =
      scm.getContainerManager().getContainers().get(0).containerID();
  PipelineID pipelineID =
      scm.getPipelineManager().getPipelines().get(0).getId();
  scm.getScmMetadataStore().getPipelineTable().delete(pipelineID);
  scm.getContainerManager().deleteContainer(cid);
  Assert.assertNull(
      scm.getScmMetadataStore().getPipelineTable().get(pipelineID));
  Assert.assertFalse(scm.getContainerManager().containerExist(cid));

  // Pause the state machine, hand it the checkpoint and reload.
  SCMStateMachine sm =
      scm.getScmHAManager().getRatisServer().getSCMStateMachine();
  sm.pause();
  sm.setInstallingDBCheckpoint(checkpoint);
  sm.reinitialize();

  // Both the DB tables and the in-memory managers must see the restored
  // pipeline and container.
  Assert.assertNotNull(
      scm.getScmMetadataStore().getPipelineTable().get(pipelineID));
  Assert.assertNotNull(
      scm.getScmMetadataStore().getContainerTable().get(cid));
  Assert.assertTrue(scm.getPipelineManager().containsPipeline(pipelineID));
  Assert.assertTrue(scm.getContainerManager().containerExist(cid));

  // The transaction index written into the checkpoint must now be current.
  Assert.assertEquals(100,
      scm.getScmMetadataStore().getTransactionInfoTable()
          .get(OzoneConsts.TRANSACTION_INFO_KEY).getTransactionIndex());
  Assert.assertEquals(100,
      scm.getScmHAManager().asSCMHADBTransactionBuffer()
          .getLatestTrxInfo().getTermIndex().getIndex());
}
Use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in the Apache Ozone project.
Class TestSCMInstallSnapshotWithHA, method writeToIncreaseLogIndex.
/**
 * Allocates containers on the given SCM until the last applied index of its
 * state machine is greater than {@code targetLogIndex}.
 *
 * @param scm the SCM on which containers are allocated
 * @param targetLogIndex the log index that must be exceeded before returning
 * @return the containers allocated by this call, in allocation order
 * @throws IOException if a container allocation fails
 * @throws InterruptedException if the thread is interrupted while sleeping
 *     between allocations
 */
private List<ContainerInfo> writeToIncreaseLogIndex(StorageContainerManager scm, long targetLogIndex) throws IOException, InterruptedException {
  SCMStateMachine sm =
      scm.getScmHAManager().getRatisServer().getSCMStateMachine();
  List<ContainerInfo> allocated = new ArrayList<>();
  // Re-read the applied index after every allocation; stop once it has
  // moved past the target.
  for (long appliedIndex = sm.getLastAppliedTermIndex().getIndex();
      appliedIndex <= targetLogIndex;
      appliedIndex = sm.getLastAppliedTermIndex().getIndex()) {
    ContainerInfo container = scm.getContainerManager().allocateContainer(
        RatisReplicationConfig.getInstance(ReplicationFactor.THREE),
        TestSCMInstallSnapshotWithHA.class.getName());
    allocated.add(container);
    // Short pause before sampling the applied index again.
    Thread.sleep(100);
  }
  return allocated;
}
Use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in the Apache Ozone project.
Class TestSCMInstallSnapshotWithHA, method testInstallCorruptedCheckpointFailure.
/**
 * Installs a deliberately corrupted leader checkpoint on the follower and
 * verifies that the installation fails, then installs an intact copy of the
 * same checkpoint and verifies that the follower reloads correctly.
 *
 * <p>The corruption is produced by deleting every other {@code .sst} file
 * from the leader checkpoint directory after a backup copy has been taken.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
  StorageContainerManager leaderSCM = getLeader(cluster);
  // Find the inactive SCM
  String followerId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager followerSCM = cluster.getSCM(followerId);
  // Do some transactions so that the log index increases
  writeToIncreaseLogIndex(leaderSCM, 100);
  File oldDBLocation =
      followerSCM.getScmMetadataStore().getStore().getDbLocation();
  SCMStateMachine followerSM =
      followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
  TermIndex termIndex = followerSM.getLastAppliedTermIndex();
  DBCheckpoint leaderDbCheckpoint =
      leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
  Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
  // Validate the location before it is used for anything else.
  Assert.assertNotNull(leaderCheckpointLocation);
  TransactionInfo leaderCheckpointTrxnInfo =
      HAUtils.getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation,
          new SCMDBDefinition());

  // The backup directory is created next to the follower's DB and is named
  // after the follower's last applied index and the current time.
  String dbBackupName = "SCM_CHECKPOINT_BACKUP" + termIndex.getIndex() + "_"
      + System.currentTimeMillis();
  File dbDir = oldDBLocation.getParentFile();
  File checkpointBackup = new File(dbDir, dbBackupName);
  // Take a backup of the (still intact) leader checkpoint
  FileUtils.copyDirectory(leaderCheckpointLocation.toFile(),
      checkpointBackup, false);

  // Corrupt the leader checkpoint and install that on the follower. The
  // operation should fail and should shutdown.
  File[] checkpointFiles = leaderCheckpointLocation.toFile().listFiles();
  // listFiles() returns null on an I/O error or if the path is not a
  // directory; fail with a clear message instead of an NPE in the loop.
  Assert.assertNotNull("Unable to list files in " + leaderCheckpointLocation,
      checkpointFiles);
  boolean delete = true;
  for (File file : checkpointFiles) {
    if (file.getName().contains(".sst")) {
      if (delete) {
        // The rest of the test relies on the checkpoint really being
        // corrupted, so a failed delete must not go unnoticed.
        Assert.assertTrue("Failed to delete " + file, file.delete());
      }
      // Alternate so that only every other .sst file is removed.
      delete = !delete;
    }
  }

  SCMHAManagerImpl scmhaManager =
      (SCMHAManagerImpl) (followerSCM.getScmHAManager());
  GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.ERROR);
  GenericTestUtils.LogCapturer logCapture =
      GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
  // Swap in a dummy exit manager before triggering the expected failure.
  scmhaManager.setExitManagerForTesting(new DummyExitManager());
  followerSM.pause();
  scmhaManager.installCheckpoint(leaderCheckpointLocation,
      leaderCheckpointTrxnInfo);
  Assert.assertTrue(logCapture.getOutput()
      .contains("Failed to reload SCM state and instantiate services."));
  Assert.assertTrue(followerSM.getLifeCycleState().isPausingOrPaused());

  // Verify correct reloading from the intact backup copy
  followerSM.setInstallingDBCheckpoint(
      new RocksDBCheckpoint(checkpointBackup.toPath()));
  followerSM.reinitialize();
  // Expected value first: the follower must now be at the term/index that
  // was recorded in the leader checkpoint.
  Assert.assertEquals(leaderCheckpointTrxnInfo.getTermIndex(),
      followerSM.getLastAppliedTermIndex());
}
Use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in the Apache Ozone project.
Class TestSCMInstallSnapshotWithHA, method testInstallOldCheckpointFailure.
/**
 * Verifies that installing a leader checkpoint on a follower whose applied
 * index is already ahead of that checkpoint does not take effect: no new
 * term index is returned, the follower's last applied term index is
 * unchanged, the "Reloading old state of SCM" message is logged and the
 * state machine is not left paused.
 */
@Test
public void testInstallOldCheckpointFailure() throws Exception {
  // Get the leader SCM
  StorageContainerManager leader = getLeader(cluster);
  // Find the inactive SCM, start it and take it out of safe mode
  String inactiveScmId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager follower = cluster.getSCM(inactiveScmId);
  cluster.startInactiveSCM(inactiveScmId);
  follower.exitSafeMode();

  DBCheckpoint leaderCheckpoint =
      leader.getScmMetadataStore().getStore().getCheckpoint(false);
  SCMStateMachine leaderStateMachine =
      leader.getScmHAManager().getRatisServer().getSCMStateMachine();
  TermIndex leaderApplied = leaderStateMachine.getLastAppliedTermIndex();
  SCMStateMachine followerStateMachine =
      follower.getScmHAManager().getRatisServer().getSCMStateMachine();

  // Advance the follower: record a transaction index 100 past the leader's
  // in the follower's DB, then notify its state machine of the same index.
  long advancedIndex = leaderApplied.getIndex() + 100;
  TransactionInfo advancedTrxInfo = TransactionInfo.builder()
      .setCurrentTerm(leaderApplied.getTerm())
      .setTransactionIndex(advancedIndex)
      .build();
  follower.getScmMetadataStore().getTransactionInfoTable()
      .put(OzoneConsts.TRANSACTION_INFO_KEY, advancedTrxInfo);
  followerStateMachine.notifyTermIndexUpdated(
      leaderApplied.getTerm(), advancedIndex);

  GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.INFO);
  GenericTestUtils.LogCapturer capturedLogs =
      GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());

  // Install the old checkpoint on the follower. This should fail as the
  // follower is already ahead of that transactionLogIndex and the
  // state should be reloaded.
  TermIndex followerAppliedBefore =
      followerStateMachine.getLastAppliedTermIndex();
  SCMHAManagerImpl followerHaManager =
      (SCMHAManagerImpl) follower.getScmHAManager();
  TermIndex installedTermIndex = null;
  try {
    installedTermIndex = followerHaManager.installCheckpoint(leaderCheckpoint);
  } catch (IOException ignored) {
    // An IOException is an accepted outcome here; the assertions below
    // check the resulting state either way.
  }

  Assert.assertTrue(
      capturedLogs.getOutput().contains("Reloading old state of SCM"));
  Assert.assertNull(
      " installed checkpoint even though checkpoint logIndex is less than it's lastAppliedIndex",
      installedTermIndex);
  Assert.assertEquals(followerAppliedBefore,
      followerStateMachine.getLastAppliedTermIndex());
  Assert.assertFalse(
      followerStateMachine.getLifeCycleState().isPausingOrPaused());
}
Aggregations