Use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
The class TestOMRatisSnapshots, method testInstallOldCheckpointFailure.
@Test
public void testInstallOldCheckpointFailure() throws Exception {
  // Get the leader OM
  String leaderOMNodeId = OmFailoverProxyUtil
      .getFailoverProxyProvider(objectStore.getClientProxy())
      .getCurrentProxyOMNodeId();
  OzoneManager leaderOM = cluster.getOzoneManager(leaderOMNodeId);

  // Find the inactive OM and start it
  String followerNodeId = leaderOM.getPeerNodes().get(0).getNodeId();
  if (cluster.isOMActive(followerNodeId)) {
    followerNodeId = leaderOM.getPeerNodes().get(1).getNodeId();
  }
  cluster.startInactiveOM(followerNodeId);
  OzoneManager followerOM = cluster.getOzoneManager(followerNodeId);
  OzoneManagerRatisServer followerRatisServer = followerOM.getOmRatisServer();

  // Do some transactions so that the log index increases on follower OM
  writeKeysToIncreaseLogIndex(followerRatisServer, 100);

  TermIndex leaderCheckpointTermIndex =
      leaderOM.getOmRatisServer().getLastAppliedTermIndex();
  DBCheckpoint leaderDbCheckpoint =
      leaderOM.getMetadataManager().getStore().getCheckpoint(false);

  // Do some more transactions to increase the log index further on
  // follower OM such that it is more than the checkpoint index taken on
  // leader OM.
  writeKeysToIncreaseLogIndex(followerOM.getOmRatisServer(),
      leaderCheckpointTermIndex.getIndex() + 100);

  GenericTestUtils.setLogLevel(OzoneManager.LOG, Level.INFO);
  GenericTestUtils.LogCapturer logCapture =
      GenericTestUtils.LogCapturer.captureLogs(OzoneManager.LOG);

  // Install the old checkpoint on the follower OM. This should fail as the
  // followerOM is already ahead of that transactionLogIndex and the OM
  // state should be reloaded.
  TermIndex followerTermIndex = followerRatisServer.getLastAppliedTermIndex();
  TermIndex newTermIndex =
      followerOM.installCheckpoint(leaderOMNodeId, leaderDbCheckpoint);

  String errorMsg = "Cannot proceed with InstallSnapshot as OM is at "
      + "TermIndex " + followerTermIndex + " and checkpoint has lower "
      + "TermIndex";
  Assert.assertTrue(logCapture.getOutput().contains(errorMsg));
  Assert.assertNull("OM installed checkpoint even though checkpoint "
      + "logIndex is less than it's lastAppliedIndex", newTermIndex);
  Assert.assertEquals(followerTermIndex,
      followerRatisServer.getLastAppliedTermIndex());
}
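The helper writeKeysToIncreaseLogIndex is not part of this excerpt. A minimal sketch of what such a helper could look like, assuming the test class has an OzoneBucket field and a createKey(bucket) helper that writes one key and returns its name (both hypothetical here):

// Hypothetical sketch: write keys until the Ratis last applied log index on
// the given server reaches targetLogIndex.
private List<String> writeKeysToIncreaseLogIndex(
    OzoneManagerRatisServer omRatisServer, long targetLogIndex)
    throws IOException, InterruptedException {
  List<String> keys = new ArrayList<>();
  long logIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
  while (logIndex < targetLogIndex) {
    // Each key write goes through Ratis and bumps the applied log index.
    keys.add(createKey(ozoneBucket));
    Thread.sleep(100);
    logIndex = omRatisServer.getLastAppliedTermIndex().getIndex();
  }
  return keys;
}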
Use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
The class TestSCMInstallSnapshotWithHA, method testInstallCorruptedCheckpointFailure.
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
  StorageContainerManager leaderSCM = getLeader(cluster);
  // Find the inactive SCM
  String followerId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager followerSCM = cluster.getSCM(followerId);

  // Do some transactions so that the log index increases
  writeToIncreaseLogIndex(leaderSCM, 100);

  File oldDBLocation =
      followerSCM.getScmMetadataStore().getStore().getDbLocation();
  SCMStateMachine followerSM =
      followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
  TermIndex termIndex = followerSM.getLastAppliedTermIndex();

  DBCheckpoint leaderDbCheckpoint =
      leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
  Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
  TransactionInfo leaderCheckpointTrxnInfo = HAUtils.getTrxnInfoFromCheckpoint(
      conf, leaderCheckpointLocation, new SCMDBDefinition());
  Assert.assertNotNull(leaderCheckpointLocation);

  // Take a backup of the current DB
  String dbBackupName = "SCM_CHECKPOINT_BACKUP" + termIndex.getIndex() + "_"
      + System.currentTimeMillis();
  File dbDir = oldDBLocation.getParentFile();
  File checkpointBackup = new File(dbDir, dbBackupName);
  // Take a backup of the leader checkpoint
  FileUtils.copyDirectory(leaderCheckpointLocation.toFile(),
      checkpointBackup, false);

  // Corrupt the leader checkpoint by deleting every other SST file, then
  // install it on the follower. The install should fail and the SCM should
  // shut down.
  boolean delete = true;
  for (File file : leaderCheckpointLocation.toFile().listFiles()) {
    if (file.getName().contains(".sst")) {
      if (delete) {
        file.delete();
        delete = false;
      } else {
        delete = true;
      }
    }
  }

  SCMHAManagerImpl scmhaManager =
      (SCMHAManagerImpl) followerSCM.getScmHAManager();
  GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.ERROR);
  GenericTestUtils.LogCapturer logCapture =
      GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());
  scmhaManager.setExitManagerForTesting(new DummyExitManager());

  followerSM.pause();
  scmhaManager.installCheckpoint(leaderCheckpointLocation,
      leaderCheckpointTrxnInfo);

  Assert.assertTrue(logCapture.getOutput().contains(
      "Failed to reload SCM state and instantiate services."));
  Assert.assertTrue(followerSM.getLifeCycleState().isPausingOrPaused());

  // Verify correct reloading from the backed-up checkpoint
  followerSM.setInstallingDBCheckpoint(
      new RocksDBCheckpoint(checkpointBackup.toPath()));
  followerSM.reinitialize();
  Assert.assertEquals(followerSM.getLastAppliedTermIndex(),
      leaderCheckpointTrxnInfo.getTermIndex());
}
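The DummyExitManager above keeps the expected failure from terminating the test JVM. A minimal sketch of such a stub, assuming Ozone's ExitManager exposes an overridable exitSystem hook (verify the exact signature against your Ozone version):

// Hypothetical stub: record the exit instead of calling System.exit(), so
// the test can assert on the captured log output afterwards.
private static class DummyExitManager extends ExitManager {
  @Override
  public void exitSystem(int status, String message, Throwable throwable,
      Logger log) {
    log.error("System Exit: " + message, throwable);
  }
}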
Use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
The class TestSCMInstallSnapshotWithHA, method testInstallOldCheckpointFailure.
@Test
public void testInstallOldCheckpointFailure() throws Exception {
  // Get the leader SCM
  StorageContainerManager leaderSCM = getLeader(cluster);
  // Find the inactive SCM and start it
  String followerId = getInactiveSCM(cluster).getSCMNodeId();
  StorageContainerManager followerSCM = cluster.getSCM(followerId);
  cluster.startInactiveSCM(followerId);
  followerSCM.exitSafeMode();

  DBCheckpoint leaderDbCheckpoint =
      leaderSCM.getScmMetadataStore().getStore().getCheckpoint(false);
  SCMStateMachine leaderSM =
      leaderSCM.getScmHAManager().getRatisServer().getSCMStateMachine();
  TermIndex lastTermIndex = leaderSM.getLastAppliedTermIndex();
  SCMStateMachine followerSM =
      followerSCM.getScmHAManager().getRatisServer().getSCMStateMachine();

  // Advance the follower's transaction info and state machine beyond the
  // leader's checkpoint index.
  followerSCM.getScmMetadataStore().getTransactionInfoTable().put(
      OzoneConsts.TRANSACTION_INFO_KEY,
      TransactionInfo.builder()
          .setCurrentTerm(lastTermIndex.getTerm())
          .setTransactionIndex(lastTermIndex.getIndex() + 100)
          .build());
  followerSM.notifyTermIndexUpdated(lastTermIndex.getTerm(),
      lastTermIndex.getIndex() + 100);

  GenericTestUtils.setLogLevel(SCMHAManagerImpl.getLogger(), Level.INFO);
  GenericTestUtils.LogCapturer logCapture =
      GenericTestUtils.LogCapturer.captureLogs(SCMHAManagerImpl.getLogger());

  // Install the old checkpoint on the follower. This should fail as the
  // follower is already ahead of that transactionLogIndex, and the
  // state should be reloaded.
  TermIndex followerTermIndex = followerSM.getLastAppliedTermIndex();
  SCMHAManagerImpl scmhaManager =
      (SCMHAManagerImpl) followerSCM.getScmHAManager();
  TermIndex newTermIndex = null;
  try {
    newTermIndex = scmhaManager.installCheckpoint(leaderDbCheckpoint);
  } catch (IOException ioe) {
    // An IOException is expected here.
  }

  String errorMsg = "Reloading old state of SCM";
  Assert.assertTrue(logCapture.getOutput().contains(errorMsg));
  Assert.assertNull("SCM installed checkpoint even though checkpoint "
      + "logIndex is less than it's lastAppliedIndex", newTermIndex);
  Assert.assertEquals(followerTermIndex, followerSM.getLastAppliedTermIndex());
  Assert.assertFalse(followerSM.getLifeCycleState().isPausingOrPaused());
}
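Conceptually, this test exercises a guard that compares the checkpoint's TermIndex with the follower's last applied TermIndex and refuses to move the state machine backwards. A simplified illustration of that guard, not the actual SCMHAManagerImpl code (the method name and parameters are placeholders):

// Simplified illustration of the guard exercised above; NOT the actual
// SCMHAManagerImpl implementation.
private static void verifyCheckpointIsNewer(TermIndex checkpointTermIndex,
    TermIndex lastApplied) throws IOException {
  if (checkpointTermIndex.getIndex() <= lastApplied.getIndex()) {
    // The follower is already ahead: reload the old state and refuse the
    // install instead of moving the state machine backwards.
    throw new IOException("Reloading old state of SCM: checkpoint index "
        + checkpointTermIndex.getIndex() + " <= last applied index "
        + lastApplied.getIndex());
  }
}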
Use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
The class TestOzoneManagerSnapshotProvider, method testDownloadCheckpoint.
@Test
public void testDownloadCheckpoint() throws Exception {
  String userName = "user" + RandomStringUtils.randomNumeric(5);
  String adminName = "admin" + RandomStringUtils.randomNumeric(5);
  String volumeName = "volume" + RandomStringUtils.randomNumeric(5);
  String bucketName = "bucket" + RandomStringUtils.randomNumeric(5);

  VolumeArgs createVolumeArgs = VolumeArgs.newBuilder()
      .setOwner(userName)
      .setAdmin(adminName)
      .build();
  objectStore.createVolume(volumeName, createVolumeArgs);
  OzoneVolume retVolumeinfo = objectStore.getVolume(volumeName);
  retVolumeinfo.createBucket(bucketName);

  String leaderOMNodeId = OmFailoverProxyUtil
      .getFailoverProxyProvider(objectStore.getClientProxy())
      .getCurrentProxyOMNodeId();
  OzoneManager leaderOM = cluster.getOzoneManager(leaderOMNodeId);

  // Get a follower OM
  String followerNodeId = leaderOM.getPeerNodes().get(0).getNodeId();
  OzoneManager followerOM = cluster.getOzoneManager(followerNodeId);

  // Download latest checkpoint from leader OM to follower OM
  DBCheckpoint omSnapshot = followerOM.getOmSnapshotProvider()
      .getOzoneManagerDBSnapshot(leaderOMNodeId);

  long leaderSnapshotIndex = leaderOM.getRatisSnapshotIndex();
  long downloadedSnapshotIndex = getDownloadedSnapshotIndex(omSnapshot);

  // The snapshot index downloaded from leader OM should match the ratis
  // snapshot index on the leader OM
  Assert.assertEquals("The snapshot index downloaded from leader OM does "
      + "not match its ratis snapshot index",
      leaderSnapshotIndex, downloadedSnapshotIndex);
}
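The helper getDownloadedSnapshotIndex is not shown here. One possible sketch, assuming the TransactionInfo persisted in the downloaded checkpoint can be read with HAUtils.getTrxnInfoFromCheckpoint and an OM DB definition, and that conf is the test's OzoneConfiguration (the exact DB definition class and accessor may differ across Ozone versions):

// Hypothetical sketch: read the persisted TransactionInfo out of the
// downloaded checkpoint and return its transaction (log) index.
private long getDownloadedSnapshotIndex(DBCheckpoint checkpoint)
    throws Exception {
  TransactionInfo trxnInfo = HAUtils.getTrxnInfoFromCheckpoint(
      conf, checkpoint.getCheckpointLocation(), new OMDBDefinition());
  return trxnInfo.getTransactionIndex();
}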
Use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
The class OzoneManagerServiceProviderImpl, method updateReconOmDBWithNewSnapshot.
/**
 * Update Local OM DB with new OM DB snapshot.
 * @throws IOException
 */
@VisibleForTesting
boolean updateReconOmDBWithNewSnapshot() throws IOException {
  // Obtain the current DB snapshot from OM and
  // update the in house OM metadata managed DB instance.
  long startTime = Time.monotonicNow();
  DBCheckpoint dbSnapshot = getOzoneManagerDBSnapshot();
  metrics.updateSnapshotRequestLatency(Time.monotonicNow() - startTime);
  if (dbSnapshot != null && dbSnapshot.getCheckpointLocation() != null) {
    LOG.info("Got new checkpoint from OM : "
        + dbSnapshot.getCheckpointLocation());
    try {
      omMetadataManager.updateOmDB(
          dbSnapshot.getCheckpointLocation().toFile());
      return true;
    } catch (IOException e) {
      LOG.error("Unable to refresh Recon OM DB Snapshot. ", e);
    }
  } else {
    LOG.error("Null snapshot location got from OM.");
  }
  return false;
}
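A caller could use this as the full-snapshot path of Recon's OM sync. A simplified, hypothetical sketch (the real sync logic also handles incremental delta updates and re-initializes Recon tasks after the DB is replaced):

// Hypothetical caller: replace the local OM DB wholesale, then log the
// outcome so the next sync run can decide whether to retry.
void syncFullSnapshotFromOM() throws IOException {
  if (updateReconOmDBWithNewSnapshot()) {
    LOG.info("Recon OM DB refreshed from a full OM snapshot.");
  } else {
    LOG.warn("Full snapshot refresh failed; will retry on the next run.");
  }
}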