Search in sources :

Example 11 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class TestOMRatisSnapshots method testInstallOldCheckpointFailure.

/**
 * Installs a checkpoint taken on the leader OM onto a follower OM whose
 * applied log index has since moved past the checkpoint, and verifies that
 * the install is rejected and the follower's applied term index is unchanged.
 */
@Test
public void testInstallOldCheckpointFailure() throws Exception {
    // The OM the client proxy currently talks to is treated as the leader.
    String leaderId = OmFailoverProxyUtil
        .getFailoverProxyProvider(objectStore.getClientProxy())
        .getCurrentProxyOMNodeId();
    OzoneManager leader = cluster.getOzoneManager(leaderId);

    // Choose the peer that is not active yet: the first peer unless it is
    // already running, in which case the second one.
    String firstPeerId = leader.getPeerNodes().get(0).getNodeId();
    String followerId = cluster.isOMActive(firstPeerId)
        ? leader.getPeerNodes().get(1).getNodeId()
        : firstPeerId;
    cluster.startInactiveOM(followerId);
    OzoneManager follower = cluster.getOzoneManager(followerId);
    OzoneManagerRatisServer followerRatis = follower.getOmRatisServer();

    // Write keys until the follower's log index has grown.
    writeKeysToIncreaseLogIndex(followerRatis, 100);

    // Record the leader's applied term index and take the checkpoint that
    // will later be (unsuccessfully) installed.
    TermIndex checkpointTermIndex =
        leader.getOmRatisServer().getLastAppliedTermIndex();
    DBCheckpoint staleCheckpoint =
        leader.getMetadataManager().getStore().getCheckpoint(false);

    // Keep writing so the follower ends up beyond the index captured with
    // the leader checkpoint.
    writeKeysToIncreaseLogIndex(follower.getOmRatisServer(),
        checkpointTermIndex.getIndex() + 100);

    GenericTestUtils.setLogLevel(OzoneManager.LOG, Level.INFO);
    GenericTestUtils.LogCapturer omLogs =
        GenericTestUtils.LogCapturer.captureLogs(OzoneManager.LOG);

    // Attempt the install. The follower is already ahead of the checkpoint,
    // so the install is expected to be refused.
    TermIndex termIndexBeforeInstall = followerRatis.getLastAppliedTermIndex();
    TermIndex installedTermIndex =
        follower.installCheckpoint(leaderId, staleCheckpoint);

    String expectedLog =
        "Cannot proceed with InstallSnapshot as OM is at TermIndex "
            + termIndexBeforeInstall
            + " and checkpoint has lower TermIndex";
    Assert.assertTrue(omLogs.getOutput().contains(expectedLog));
    Assert.assertNull(
        "OM installed checkpoint even though checkpoint logIndex is less than it's lastAppliedIndex",
        installedTermIndex);
    Assert.assertEquals(termIndexBeforeInstall,
        followerRatis.getLastAppliedTermIndex());
}
Also used : DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) OzoneManagerRatisServer(org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer) TermIndex(org.apache.ratis.server.protocol.TermIndex) Test(org.junit.jupiter.api.Test)

Example 12 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallCorruptedCheckpointFailure.

/**
 * Installs a deliberately corrupted leader checkpoint on a follower SCM and
 * verifies that the install logs a reload failure and leaves the follower
 * state machine paused; then reinitializes the follower from an intact copy
 * of the checkpoint and verifies the applied term index matches the
 * checkpoint's transaction info.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
    StorageContainerManager leaderSCM = getLeader(cluster);
    // Find the inactive SCM
    String followerId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager followerSCM = cluster.getSCM(followerId);
    // Do some transactions so that the log index increases
    writeToIncreaseLogIndex(leaderSCM, 100);
    File oldDBLocation = followerSCM.getScmMetadataStore().getStore().getDbLocation();
    SCMStateMachine followerSM = followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
    TermIndex termIndex = followerSM.getLastAppliedTermIndex();
    DBCheckpoint leaderDbCheckpoint = leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
    Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
    // Validate the location before it is used for the first time.
    Assert.assertNotNull(leaderCheckpointLocation);
    TransactionInfo leaderCheckpointTrxnInfo = HAUtils.getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation, new SCMDBDefinition());
    // Name of the directory (next to the follower DB) that will hold an
    // intact copy of the leader checkpoint.
    String dbBackupName = "SCM_CHECKPOINT_BACKUP" + termIndex.getIndex() + "_" + System.currentTimeMillis();
    File dbDir = oldDBLocation.getParentFile();
    File checkpointBackup = new File(dbDir, dbBackupName);
    // Take a backup of the leader checkpoint
    FileUtils.copyDirectory(leaderCheckpointLocation.toFile(), checkpointBackup, false);
    // Corrupt the leader checkpoint and install that on the follower. The
    // installation is expected to fail.
    // File.listFiles() returns null when the path is not a directory or an
    // I/O error occurs; fail with a clear message instead of an NPE.
    File[] checkpointFiles = leaderCheckpointLocation.toFile().listFiles();
    Assert.assertNotNull("Could not list files in leader checkpoint " + leaderCheckpointLocation, checkpointFiles);
    // Delete every other .sst file, starting with the first one encountered.
    boolean delete = true;
    for (File file : checkpointFiles) {
        if (file.getName().contains(".sst")) {
            if (delete) {
                // A silent delete failure would leave the checkpoint intact
                // and make the assertions below fail for the wrong reason.
                Assert.assertTrue("Failed to delete " + file, file.delete());
                delete = false;
            } else {
                delete = true;
            }
        }
    }
    SCMHAManagerImpl scmhaManager = (SCMHAManagerImpl) (followerSCM.getScmHAManager());
    GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.ERROR);
    GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
    // Swap in a test exit manager before triggering the failing install
    // (presumably so the failure does not terminate the test JVM - confirm
    // against DummyExitManager).
    scmhaManager.setExitManagerForTesting(new DummyExitManager());
    followerSM.pause();
    scmhaManager.installCheckpoint(leaderCheckpointLocation, leaderCheckpointTrxnInfo);
    Assert.assertTrue(logCapture.getOutput().contains("Failed to reload SCM state and instantiate services."));
    Assert.assertTrue(followerSM.getLifeCycleState().isPausingOrPaused());
    // Verify correct reloading from the intact backup of the checkpoint.
    followerSM.setInstallingDBCheckpoint(new RocksDBCheckpoint(checkpointBackup.toPath()));
    followerSM.reinitialize();
    // Expected value first, actual second, so a failure message reads correctly.
    Assert.assertEquals(leaderCheckpointTrxnInfo.getTermIndex(), followerSM.getLastAppliedTermIndex());
}
Also used : Path(java.nio.file.Path) StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) TermIndex(org.apache.ratis.server.protocol.TermIndex) SCMDBDefinition(org.apache.hadoop.hdds.scm.metadata.SCMDBDefinition) TransactionInfo(org.apache.hadoop.hdds.utils.TransactionInfo) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) File(java.io.File) Test(org.junit.jupiter.api.Test)

Example 13 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class TestSCMInstallSnapshotWithHA method testInstallOldCheckpointFailure.

/**
 * Takes a checkpoint on the leader SCM, artificially advances the follower
 * SCM's term index past it, then tries to install the checkpoint on the
 * follower. Verifies the old state is reloaded, no new term index is
 * returned, and the follower state machine is not left paused.
 */
@Test
public void testInstallOldCheckpointFailure() throws Exception {
    StorageContainerManager leader = getLeader(cluster);

    // Locate the inactive SCM, start it and take it out of safe mode.
    String inactiveId = getInactiveSCM(cluster).getSCMNodeId();
    StorageContainerManager follower = cluster.getSCM(inactiveId);
    cluster.startInactiveSCM(inactiveId);
    follower.exitSafeMode();

    // Checkpoint the leader and read the leader's last applied term index.
    DBCheckpoint staleCheckpoint =
        leader.getScmMetadataStore().getStore().getCheckpoint(false);
    SCMStateMachine leaderStateMachine =
        leader.getScmHAManager().getRatisServer().getSCMStateMachine();
    TermIndex leaderApplied = leaderStateMachine.getLastAppliedTermIndex();

    SCMStateMachine followerStateMachine =
        follower.getScmHAManager().getRatisServer().getSCMStateMachine();

    // Move the follower 100 entries ahead of the leader, both in its
    // transaction info table and in its state machine.
    follower.getScmMetadataStore().getTransactionInfoTable().put(
        OzoneConsts.TRANSACTION_INFO_KEY,
        TransactionInfo.builder()
            .setCurrentTerm(leaderApplied.getTerm())
            .setTransactionIndex(leaderApplied.getIndex() + 100)
            .build());
    followerStateMachine.notifyTermIndexUpdated(
        leaderApplied.getTerm(), leaderApplied.getIndex() + 100);

    GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.INFO);
    GenericTestUtils.LogCapturer haManagerLogs =
        GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());

    // Try to install the checkpoint. The follower is ahead of it, so the
    // install should not succeed and the previous state should be reloaded.
    TermIndex termIndexBeforeInstall =
        followerStateMachine.getLastAppliedTermIndex();
    SCMHAManagerImpl haManager = (SCMHAManagerImpl) follower.getScmHAManager();
    TermIndex installedTermIndex = null;
    try {
        installedTermIndex = haManager.installCheckpoint(staleCheckpoint);
    } catch (IOException ignored) {
        // An IOException is an acceptable outcome here; the assertions below
        // check the resulting state either way.
    }

    String expectedLog = "Reloading old state of SCM";
    Assert.assertTrue(haManagerLogs.getOutput().contains(expectedLog));
    Assert.assertNull(
        " installed checkpoint even though checkpoint logIndex is less than it's lastAppliedIndex",
        installedTermIndex);
    Assert.assertEquals(termIndexBeforeInstall,
        followerStateMachine.getLastAppliedTermIndex());
    Assert.assertFalse(
        followerStateMachine.getLifeCycleState().isPausingOrPaused());
}
Also used : StorageContainerManager(org.apache.hadoop.hdds.scm.server.StorageContainerManager) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) SCMHAManagerImpl(org.apache.hadoop.hdds.scm.ha.SCMHAManagerImpl) SCMStateMachine(org.apache.hadoop.hdds.scm.ha.SCMStateMachine) IOException(java.io.IOException) TermIndex(org.apache.ratis.server.protocol.TermIndex) Test(org.junit.jupiter.api.Test)

Example 14 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class TestOzoneManagerSnapshotProvider method testDownloadCheckpoint.

/**
 * Creates a volume and bucket, downloads the leader OM's DB snapshot through
 * a follower OM's snapshot provider, and checks that the downloaded snapshot
 * index equals the leader's ratis snapshot index.
 */
@Test
public void testDownloadCheckpoint() throws Exception {
    // Randomized names for the owner, admin, volume and bucket.
    String owner = "user" + RandomStringUtils.randomNumeric(5);
    String admin = "admin" + RandomStringUtils.randomNumeric(5);
    String volume = "volume" + RandomStringUtils.randomNumeric(5);
    String bucket = "bucket" + RandomStringUtils.randomNumeric(5);

    VolumeArgs volumeArgs = VolumeArgs.newBuilder()
        .setOwner(owner)
        .setAdmin(admin)
        .build();
    objectStore.createVolume(volume, volumeArgs);
    OzoneVolume createdVolume = objectStore.getVolume(volume);
    createdVolume.createBucket(bucket);

    // The OM the client proxy currently points at is treated as the leader.
    String leaderId = OmFailoverProxyUtil
        .getFailoverProxyProvider(objectStore.getClientProxy())
        .getCurrentProxyOMNodeId();
    OzoneManager leader = cluster.getOzoneManager(leaderId);

    // Use the leader's first peer as the follower.
    String followerId = leader.getPeerNodes().get(0).getNodeId();
    OzoneManager follower = cluster.getOzoneManager(followerId);

    // Have the follower download the latest checkpoint from the leader.
    DBCheckpoint downloaded =
        follower.getOmSnapshotProvider().getOzoneManagerDBSnapshot(leaderId);

    long expectedIndex = leader.getRatisSnapshotIndex();
    long actualIndex = getDownloadedSnapshotIndex(downloaded);

    // The downloaded snapshot index must equal the leader's ratis snapshot
    // index.
    Assert.assertEquals(
        "The snapshot index downloaded from leader OM does not match its ratis snapshot index",
        expectedIndex, actualIndex);
}
Also used : OzoneVolume(org.apache.hadoop.ozone.client.OzoneVolume) OzoneManager(org.apache.hadoop.ozone.om.OzoneManager) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) VolumeArgs(org.apache.hadoop.ozone.client.VolumeArgs) Test(org.junit.Test)

Example 15 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class OzoneManagerServiceProviderImpl method updateReconOmDBWithNewSnapshot.

/**
 * Fetches a DB snapshot from OM and points the local OM metadata manager
 * at it, recording the time spent on the snapshot request in the metrics.
 *
 * @return {@code true} if a snapshot with a non-null checkpoint location was
 *         obtained and the OM DB update completed; {@code false} if no usable
 *         snapshot was returned or the update threw an {@link IOException}
 *         (which is logged, not rethrown).
 * @throws IOException declared by the signature; presumably propagated from
 *         fetching the snapshot - confirm against getOzoneManagerDBSnapshot.
 */
@VisibleForTesting
boolean updateReconOmDBWithNewSnapshot() throws IOException {
    // Time only the snapshot request itself.
    long requestStart = Time.monotonicNow();
    DBCheckpoint dbSnapshot = getOzoneManagerDBSnapshot();
    metrics.updateSnapshotRequestLatency(Time.monotonicNow() - requestStart);

    // Nothing to apply when the snapshot or its location is missing.
    if (dbSnapshot == null || dbSnapshot.getCheckpointLocation() == null) {
        LOG.error("Null snapshot location got from OM.");
        return false;
    }

    LOG.info("Got new checkpoint from OM : " + dbSnapshot.getCheckpointLocation());
    try {
        omMetadataManager.updateOmDB(dbSnapshot.getCheckpointLocation().toFile());
    } catch (IOException e) {
        // A failed refresh is reported through the return value.
        LOG.error("Unable to refresh Recon OM DB Snapshot. ", e);
        return false;
    }
    return true;
}
Also used : DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) IOException(java.io.IOException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Aggregations

DBCheckpoint (org.apache.hadoop.hdds.utils.db.DBCheckpoint)23 File (java.io.File)10 IOException (java.io.IOException)10 Path (java.nio.file.Path)7 TermIndex (org.apache.ratis.server.protocol.TermIndex)7 Test (org.junit.Test)6 StorageContainerManager (org.apache.hadoop.hdds.scm.server.StorageContainerManager)5 RocksDBCheckpoint (org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint)5 Test (org.junit.jupiter.api.Test)5 InputStream (java.io.InputStream)4 ReconUtils (org.apache.hadoop.ozone.recon.ReconUtils)4 GenericTestUtils (org.apache.ozone.test.GenericTestUtils)4 HttpURLConnection (java.net.HttpURLConnection)3 OzoneConfiguration (org.apache.hadoop.hdds.conf.OzoneConfiguration)3 SCMStateMachine (org.apache.hadoop.hdds.scm.ha.SCMStateMachine)3 TransactionInfo (org.apache.hadoop.hdds.utils.TransactionInfo)3 OzoneManagerRatisServer (org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer)3 VisibleForTesting (com.google.common.annotations.VisibleForTesting)2 InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException)2 FileInputStream (java.io.FileInputStream)2