Search in sources :

Example 1 with SCMStateMachine

use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallSnapshot.

/**
 * Starts an inactive SCM after the leader has written transactions and
 * checks that the newly started follower catches up to the leader and
 * holds every container that was allocated while it was inactive.
 */
@Test
public void testInstallSnapshot() throws Exception {
    // Resolve the current leader.
    StorageContainerManager leader = getLeader(cluster);
    Assert.assertNotNull(leader);
    // Resolve the SCM that has not been started yet.
    String inactiveId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager follower = cluster.getSCM(inactiveId);
    // Generate transactions on the leader so that its log index grows.
    List<ContainerInfo> allocated = writeToIncreaseLogIndex(leader, 200);
    // Bring up the inactive SCM. Snapshot installation is expected to happen
    // as part of the setConfiguration() call to the Ratis leader, after which
    // the follower catches up.
    cluster.startInactiveSCM(inactiveId);
    // The freshly started follower should initially lag behind the leader.
    SCMStateMachine followerStateMachine = follower.getScmHAManager().getRatisServer().getSCMStateMachine();
    // Poll every 100 ms until the follower has applied transactions up to the
    // leader's snapshot index; waitFor times out after 3 s otherwise.
    GenericTestUtils.waitFor(() -> followerStateMachine.getLastAppliedTermIndex().getIndex() >= 200, 100, 3000);
    assertTrue(followerStateMachine.getLastAppliedTermIndex().getIndex() >= 200);
    assertFalse(followerStateMachine.getLifeCycleState().isPausingOrPaused());
    // The follower's DB must contain the containers that were created while
    // it was inactive.
    SCMMetadataStore followerStore = follower.getScmMetadataStore();
    for (ContainerInfo info : allocated) {
        Assert.assertNotNull(followerStore.getContainerTable().get(info.containerID()));
    }
}
Also used : StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) ContainerInfo(org.apache.hadoop.hdds.scm.container.ContainerInfo) SCMMetadataStore(org.apache.hadoop.hdds.scm.metadata.SCMMetadataStore) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) Test(org.junit.jupiter.api.Test)

Example 2 with SCMStateMachine

use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in project ozone by apache.

the class TestSCMInstallSnapshot method testInstallCheckPoint.

/**
 * Downloads a checkpoint, rewrites its transaction info, deletes a pipeline
 * and a container from the running SCM, then installs the checkpoint through
 * the state machine and verifies that the deleted entries and the rewritten
 * transaction index are visible again.
 */
@Test
public void testInstallCheckPoint() throws Exception {
    DBCheckpoint checkpoint = downloadSnapshot();
    StorageContainerManager scm = cluster.getStorageContainerManager();
    DBStore db = HAUtils.loadDB(conf, checkpoint.getCheckpointLocation().getParent().toFile(), checkpoint.getCheckpointLocation().getFileName().toString(), new SCMDBDefinition());
    Assert.assertNotNull(db);
    try {
        // Overwrite the transaction info stored in the checkpoint (term 10,
        // index 100). The intent is for the checkpointed transaction index to
        // be higher than that of the SCM the checkpoint was downloaded from.
        HAUtils.getTransactionInfoTable(db, new SCMDBDefinition()).put(OzoneConsts.TRANSACTION_INFO_KEY, TransactionInfo.builder().setCurrentTerm(10).setTransactionIndex(100).build());
    } finally {
        // Always release the checkpoint DB, even when the write above fails.
        db.close();
    }
    // Remove one container and one pipeline from the live SCM so that a
    // successful checkpoint installation can be detected by their return.
    ContainerID cid = scm.getContainerManager().getContainers().get(0).containerID();
    PipelineID pipelineID = scm.getPipelineManager().getPipelines().get(0).getId();
    scm.getScmMetadataStore().getPipelineTable().delete(pipelineID);
    scm.getContainerManager().deleteContainer(cid);
    Assert.assertNull(scm.getScmMetadataStore().getPipelineTable().get(pipelineID));
    Assert.assertFalse(scm.getContainerManager().containerExist(cid));
    // Install the checkpoint: pause the state machine, hand it the checkpoint
    // and reinitialize.
    SCMStateMachine sm = scm.getScmHAManager().getRatisServer().getSCMStateMachine();
    sm.pause();
    sm.setInstallingDBCheckpoint(checkpoint);
    sm.reinitialize();
    // The deleted pipeline and container must be back, both in the metadata
    // store and in the in-memory managers.
    Assert.assertNotNull(scm.getScmMetadataStore().getPipelineTable().get(pipelineID));
    Assert.assertNotNull(scm.getScmMetadataStore().getContainerTable().get(cid));
    Assert.assertTrue(scm.getPipelineManager().containsPipeline(pipelineID));
    Assert.assertTrue(scm.getContainerManager().containerExist(cid));
    // The transaction index written into the checkpoint must now be in effect.
    Assert.assertEquals(100, scm.getScmMetadataStore().getTransactionInfoTable().get(OzoneConsts.TRANSACTION_INFO_KEY).getTransactionIndex());
    Assert.assertEquals(100, scm.getScmHAManager().asSCMHADBTransactionBuffer().getLatestTrxInfo().getTermIndex().getIndex());
}
Also used : StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) ContainerID(org.apache.hadoop.hdds.scm.container.ContainerID) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) SCMDBDefinition(org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition) PipelineID(org.apache.hadoop.hdds.scm.pipeline.PipelineID) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) DBStore(org.apache.hadoop.hdds.utils.db.DBStore) Test(org.junit.Test)

Example 3 with SCMStateMachine

use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method writeToIncreaseLogIndex.

/**
 * Allocates containers on the given SCM until the last applied log index of
 * its state machine exceeds {@code targetLogIndex}.
 *
 * @param scm the SCM on which containers are allocated
 * @param targetLogIndex the index that the last applied index must exceed
 *     before this method returns
 * @return the containers allocated by this call, in allocation order; empty
 *     if the last applied index already exceeds {@code targetLogIndex}
 * @throws IOException declared by the signature; presumably raised by
 *     container allocation
 * @throws InterruptedException if the thread is interrupted while sleeping
 *     between allocations
 */
private List<ContainerInfo> writeToIncreaseLogIndex(StorageContainerManager scm, long targetLogIndex) throws IOException, InterruptedException {
    List<ContainerInfo> containers = new ArrayList<>();
    SCMStateMachine stateMachine = scm.getScmHAManager().getRatisServer().getSCMStateMachine();
    // Read the starting index from the state machine resolved above instead of
    // walking the accessor chain a second time.
    long logIndex = stateMachine.getLastAppliedTermIndex().getIndex();
    while (logIndex <= targetLogIndex) {
        containers.add(scm.getContainerManager().allocateContainer(RatisReplicationConfig.getInstance(ReplicationFactor.THREE), TestSCMInstallSnapshotWithHA.class.getName()));
        // Pause briefly before re-reading the last applied index.
        Thread.sleep(100);
        logIndex = stateMachine.getLastAppliedTermIndex().getIndex();
    }
    return containers;
}
Also used : ContainerInfo(org.apache.hadoop.hdds.scm.container.ContainerInfo) ArrayList(java.util.ArrayList) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine)

Example 4 with SCMStateMachine

use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallCorruptedCheckpointFailure.

/**
 * Corrupts a leader checkpoint by deleting every other {@code .sst} file,
 * installs it on the follower and verifies that the failure is logged and
 * the follower state machine stays paused. Afterwards an intact backup of
 * the checkpoint is installed and the follower is expected to report the
 * checkpoint's term index.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
    StorageContainerManager leaderSCM = getLeader(cluster);
    // Find the inactive SCM
    String followerId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager followerSCM = cluster.getSCM(followerId);
    // Do some transactions so that the log index increases
    writeToIncreaseLogIndex(leaderSCM, 100);
    File oldDBLocation = followerSCM.getScmMetadataStore().getStore().getDbLocation();
    SCMStateMachine followerSM = followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
    TermIndex termIndex = followerSM.getLastAppliedTermIndex();
    DBCheckpoint leaderDbCheckpoint = leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
    Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
    // Validate the location before it is used for the first time.
    Assert.assertNotNull(leaderCheckpointLocation);
    TransactionInfo leaderCheckpointTrxnInfo = HAUtils.getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation, new SCMDBDefinition());
    // Name of the backup directory, placed next to the follower's current DB.
    String dbBackupName = "SCM_CHECKPOINT_BACKUP" + termIndex.getIndex() + "_" + System.currentTimeMillis();
    File dbDir = oldDBLocation.getParentFile();
    File checkpointBackup = new File(dbDir, dbBackupName);
    // Take a backup of the (still intact) leader checkpoint
    FileUtils.copyDirectory(leaderCheckpointLocation.toFile(), checkpointBackup, false);
    // Corrupt the leader checkpoint by deleting every other .sst file and
    // install it on the follower. The operation should fail and the follower
    // should shut down (the exit manager is replaced below for testing).
    File[] checkpointFiles = leaderCheckpointLocation.toFile().listFiles();
    // listFiles() returns null on an I/O error or a non-directory path.
    Assert.assertNotNull("Unable to list files in " + leaderCheckpointLocation, checkpointFiles);
    boolean delete = true;
    for (File file : checkpointFiles) {
        if (file.getName().contains(".sst")) {
            if (delete) {
                // The checkpoint is only corrupted if the deletion succeeds.
                Assert.assertTrue("Failed to delete " + file, file.delete());
            }
            delete = !delete;
        }
    }
    SCMHAManagerImpl scmhaManager = (SCMHAManagerImpl) (followerSCM.getScmHAManager());
    GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.ERROR);
    GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
    scmhaManager.setExitManagerForTesting(new DummyExitManager());
    followerSM.pause();
    scmhaManager.installCheckpoint(leaderCheckpointLocation, leaderCheckpointTrxnInfo);
    Assert.assertTrue(logCapture.getOutput().contains("Failed to reload SCM state and instantiate services."));
    Assert.assertTrue(followerSM.getLifeCycleState().isPausingOrPaused());
    // Verify correct reloading from the intact backup
    followerSM.setInstallingDBCheckpoint(new RocksDBCheckpoint(checkpointBackup.toPath()));
    followerSM.reinitialize();
    // Expected value first: the term index recorded in the leader checkpoint.
    Assert.assertEquals(leaderCheckpointTrxnInfo.getTermIndex(), followerSM.getLastAppliedTermIndex());
}
Also used : Path(java.nio.file.Path) StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) TermIndex(org.apache.ratis.server.protocol.TermIndex) SCMDBDefinition(org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition) TransactionInfo(org.apache.hadoop.hdds.utils.TransactionInfo) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) File(java.io.File) Test(org.junit.jupiter.api.Test)

Example 5 with SCMStateMachine

use of org.apache.hadoop.hdds.scm.ha.SCMStateMachine in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallOldCheckpointFailure.

/**
 * Advances the follower 100 transactions past the leader and then tries to
 * install the leader's older checkpoint on it. The installation must not
 * take effect: the follower keeps its last applied term index, logs that
 * the old state is being reloaded, and is not left paused.
 */
@Test
public void testInstallOldCheckpointFailure() throws Exception {
    // Get the leader SCM
    StorageContainerManager leaderSCM = getLeader(cluster);
    // Find the inactive SCM and start it
    String followerId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager followerSCM = cluster.getSCM(followerId);
    cluster.startInactiveSCM(followerId);
    followerSCM.exitSafeMode();
    DBCheckpoint leaderDbCheckpoint = leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
    SCMStateMachine leaderSM = leaderSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
    TermIndex lastTermIndex = leaderSM.getLastAppliedTermIndex();
    SCMStateMachine followerSM = followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
    // Advance the follower: persist a transaction index 100 ahead of the
    // leader and notify the state machine of the same term/index.
    followerSCM.getScmMetadataStore().getTransactionInfoTable().put(OzoneConsts.TRANSACTION_INFO_KEY, TransactionInfo.builder().setCurrentTerm(lastTermIndex.getTerm()).setTransactionIndex(lastTermIndex.getIndex() + 100).build());
    followerSM.notifyTermIndexUpdated(lastTermIndex.getTerm(), lastTermIndex.getIndex() + 100);
    GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.INFO);
    GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
    // Install the old checkpoint on the follower. This should fail as the
    // follower is already ahead of that transactionLogIndex and the
    // state should be reloaded.
    TermIndex followerTermIndex = followerSM.getLastAppliedTermIndex();
    SCMHAManagerImpl scmhaManager = (SCMHAManagerImpl) (followerSCM.getScmHAManager());
    TermIndex newTermIndex = null;
    try {
        newTermIndex = scmhaManager.installCheckpoint(leaderDbCheckpoint);
    } catch (IOException ignored) {
        // An IOException is an accepted outcome of installing a stale
        // checkpoint; the assertions below verify that nothing was installed.
    }
    String errorMsg = "Reloading old state of SCM";
    Assert.assertTrue(logCapture.getOutput().contains(errorMsg));
    Assert.assertNull("Follower installed checkpoint even though checkpoint " + "logIndex is less than its lastAppliedIndex", newTermIndex);
    Assert.assertEquals(followerTermIndex, followerSM.getLastAppliedTermIndex());
    Assert.assertFalse(followerSM.getLifeCycleState().isPausingOrPaused());
}
Also used : StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) IOException(java.io.IOException) TermIndex(org.apache.ratis.server.protocol.TermIndex) Test(org.junit.jupiter.api.Test)

Aggregations

SCMStateMachine (org.apache.hadoop.hdds.scm.ha.SCMStateMachine): 5, StorageContainerManager (org.apache.hadoop.hdds.scm.server.StorageContainerManager): 4, DBCheckpoint (org.apache.hadoop.hdds.utils.db.DBCheckpoint): 3, Test (org.junit.jupiter.api.Test): 3, ContainerInfo (org.apache.hadoop.hdds.scm.container.ContainerInfo): 2, SCMHAManagerImpl (org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl): 2, SCMDBDefinition (org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition): 2, RocksDBCheckpoint (org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint): 2, GenericTestUtils (org.apache.ozone.test.GenericTestUtils): 2, TermIndex (org.apache.ratis.server.protocol.TermIndex): 2, File (java.io.File): 1, IOException (java.io.IOException): 1, Path (java.nio.file.Path): 1, ArrayList (java.util.ArrayList): 1, ContainerID (org.apache.hadoop.hdds.scm.container.ContainerID): 1, SCMMetadataStore (org.apache.hadoop.hdds.scm.metadata.SCMMetadataStore): 1, PipelineID (org.apache.hadoop.hdds.scm.pipeline.PipelineID): 1, TransactionInfo (org.apache.hadoop.hdds.utils.TransactionInfo): 1, DBStore (org.apache.hadoop.hdds.utils.db.DBStore): 1, Test (org.junit.Test): 1