use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
the class StorageContainerServiceProviderImpl method getSCMDBSnapshot.
/**
 * Downloads a snapshot of the SCM DB as a tarball and unpacks it.
 *
 * <p>When SCM HA is not enabled, the tarball is fetched over HTTP from the
 * SCM DB snapshot URL while running as the login user. When SCM HA is
 * enabled, the Ratis peer roles reported by SCM are scanned and the tarball
 * is downloaded over gRPC from the first peer whose role is LEADER. The
 * tarball is then untarred into a sibling directory named after the snapshot
 * and deleted.
 *
 * @return a checkpoint over the untarred DB directory, or {@code null} when
 *     an {@link IOException} occurred while downloading or unpacking.
 */
public DBCheckpoint getSCMDBSnapshot() {
  String snapshotFileName = RECON_SCM_SNAPSHOT_DB + "_" + System.currentTimeMillis();
  File targetFile = new File(scmSnapshotDBParentDir, snapshotFileName + ".tar.gz");
  try {
    if (!SCMHAUtils.isSCMHAEnabled(configuration)) {
      SecurityUtil.doAsLoginUser(() -> {
        try (InputStream inputStream = reconUtils.makeHttpCall(connectionFactory,
            getScmDBSnapshotUrl(), isOmSpnegoEnabled()).getInputStream()) {
          FileUtils.copyInputStreamToFile(inputStream, targetFile);
        }
        return null;
      });
      LOG.info("Downloaded SCM Snapshot from SCM");
    } else {
      List<String> ratisRoles = scmClient.getScmInfo().getRatisPeerRoles();
      for (String ratisRole : ratisRoles) {
        // Each entry is ':'-separated; field 2 is compared against the Raft
        // role and field 4 is used as the host address.
        String[] role = ratisRole.split(":");
        if (role[2].equals(RaftProtos.RaftPeerRole.LEADER.toString())) {
          String hostAddress = role[4].trim();
          int grpcPort = configuration.getInt(ScmConfigKeys.OZONE_SCM_GRPC_PORT_KEY,
              ScmConfigKeys.OZONE_SCM_GRPC_PORT_DEFAULT);
          try (SCMSnapshotDownloader downloadClient = new InterSCMGrpcClient(
              hostAddress, grpcPort, configuration,
              new ReconCertificateClient(new SecurityConfig(configuration),
                  reconStorage.getReconCertSerialId()))) {
            downloadClient.download(targetFile.toPath()).get();
          } catch (ExecutionException | InterruptedException e) {
            if (e instanceof InterruptedException) {
              // Restore the interrupt flag so callers further up the stack
              // can still observe that this thread was interrupted.
              Thread.currentThread().interrupt();
            }
            LOG.error("Rocks DB checkpoint downloading failed", e);
            throw new IOException(e);
          }
          LOG.info("Downloaded SCM Snapshot from Leader SCM");
          // Only the first leader entry is used.
          break;
        }
      }
    }
    Path untarredDbDir = Paths.get(scmSnapshotDBParentDir.getAbsolutePath(), snapshotFileName);
    reconUtils.untarCheckpointFile(targetFile, untarredDbDir);
    FileUtils.deleteQuietly(targetFile);
    return new RocksDBCheckpoint(untarredDbDir);
  } catch (IOException e) {
    LOG.error("Unable to obtain SCM DB Snapshot. ", e);
    // Do not leave a partially downloaded tarball behind; each call uses a
    // fresh timestamped name, so leftovers would otherwise accumulate.
    FileUtils.deleteQuietly(targetFile);
  }
  return null;
}
use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
the class OzoneManager method installSnapshotFromLeader.
/**
 * Download and install latest checkpoint from leader OM.
 *
 * <p>Nothing is installed when no snapshot provider is configured, when no
 * checkpoint could be obtained from the leader, or when installing the
 * checkpoint throws; each of these cases is logged and yields {@code null}.
 *
 * @param leaderId peerNodeID of the leader OM
 * @return If checkpoint is installed successfully, return the
 * corresponding termIndex. Otherwise, return null.
 */
public TermIndex installSnapshotFromLeader(String leaderId) {
  if (omSnapshotProvider == null) {
    LOG.error("OM Snapshot Provider is not configured as there are no peer nodes.");
    return null;
  }
  DBCheckpoint omDBCheckpoint = getDBCheckpointFromLeader(leaderId);
  // NOTE(review): getDBCheckpointFromLeader is not visible here; guard in case
  // it reports a failed download by returning null, which would otherwise
  // surface as a NullPointerException in the log statement below.
  if (omDBCheckpoint == null) {
    LOG.error("Failed to download checkpoint from Leader {}.", leaderId);
    return null;
  }
  LOG.info("Downloaded checkpoint from Leader {} to the location {}", leaderId,
      omDBCheckpoint.getCheckpointLocation());
  try {
    return installCheckpoint(leaderId, omDBCheckpoint);
  } catch (Exception ex) {
    // Installation failures are reported to the caller as a null term index.
    LOG.error("Failed to install snapshot from Leader OM.", ex);
    return null;
  }
}
use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
the class DBCheckpointServlet method doGet.
/**
 * Process a GET request for the DB checkpoint snapshot.
 *
 * <p>The request is rejected with 500 when no DB store is available and with
 * 403 when ACLs are enabled and the caller has no principal or lacks
 * permission. Otherwise a checkpoint is taken (optionally flushing first,
 * controlled by the flush request parameter), streamed to the response as a
 * tgz attachment, and cleaned up afterwards regardless of the outcome.
 *
 * @param request The servlet request we are processing
 * @param response The servlet response we are creating
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) {
  LOG.info("Received request to obtain DB checkpoint snapshot");
  if (dbStore == null) {
    LOG.error("Unable to process metadata snapshot request. DB Store is null");
    response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
    return;
  }
  // Check ACL for dbCheckpoint only when global Ozone ACL is enabled.
  if (aclEnabled) {
    final java.security.Principal userPrincipal = request.getUserPrincipal();
    if (userPrincipal == null) {
      final String remoteUser = request.getRemoteUser();
      LOG.error("Permission denied: Unauthorized access to /dbCheckpoint,"
          + " no user principal found. Current login user is {}.",
          remoteUser != null ? "'" + remoteUser + "'" : "UNKNOWN");
      response.setStatus(HttpServletResponse.SC_FORBIDDEN);
      return;
    } else {
      final String userPrincipalName = userPrincipal.getName();
      UserGroupInformation ugi = UserGroupInformation.createRemoteUser(userPrincipalName);
      if (!hasPermission(ugi)) {
        LOG.error("Permission denied: User principal '{}' does not have"
            + " access to /dbCheckpoint.\nThis can happen when Ozone"
            + " Manager is started with a different user.\n"
            + " Please append '{}' to OM 'ozone.administrators'"
            + " config and restart OM to grant current"
            + " user access to this endpoint.", userPrincipalName, userPrincipalName);
        response.setStatus(HttpServletResponse.SC_FORBIDDEN);
        return;
      }
      LOG.debug("Granted user principal '{}' access to /dbCheckpoint.", userPrincipalName);
    }
  }
  DBCheckpoint checkpoint = null;
  try {
    // parseBoolean yields false for a null or empty parameter, so a missing
    // flush parameter means "do not flush".
    boolean flush = Boolean.parseBoolean(
        request.getParameter(OZONE_DB_CHECKPOINT_REQUEST_FLUSH));
    checkpoint = dbStore.getCheckpoint(flush);
    if (checkpoint == null || checkpoint.getCheckpointLocation() == null) {
      LOG.error("Unable to process metadata snapshot request. "
          + "Checkpoint request returned null.");
      response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
      return;
    }
    dbMetrics.setLastCheckpointCreationTimeTaken(checkpoint.checkpointCreationTimeTaken());
    Path file = checkpoint.getCheckpointLocation().getFileName();
    if (file == null) {
      // Path.getFileName() is null for a path with no name elements; report
      // this as a server error instead of replying with an empty success.
      LOG.error("Unable to process metadata snapshot request. "
          + "Checkpoint location {} has no file name.", checkpoint.getCheckpointLocation());
      response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
      return;
    }
    response.setContentType("application/x-tgz");
    response.setHeader("Content-Disposition",
        "attachment; filename=\"" + file.toString() + ".tgz\"");
    Instant start = Instant.now();
    writeDBCheckpointToStream(checkpoint, response.getOutputStream());
    Instant end = Instant.now();
    long duration = Duration.between(start, end).toMillis();
    LOG.info("Time taken to write the checkpoint to response output "
        + "stream: {} milliseconds", duration);
    dbMetrics.setLastCheckpointStreamingTimeTaken(duration);
    dbMetrics.incNumCheckpoints();
  } catch (Exception e) {
    LOG.error("Unable to process metadata snapshot request. ", e);
    response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
    dbMetrics.incNumCheckpointFails();
  } finally {
    // Always remove the checkpoint taken for this request.
    if (checkpoint != null) {
      try {
        checkpoint.cleanupCheckpoint();
      } catch (IOException e) {
        // Pass the exception as the trailing argument so the cause and stack
        // trace are logged along with the location.
        LOG.error("Error trying to clean checkpoint at {} .",
            checkpoint.getCheckpointLocation().toString(), e);
      }
    }
  }
}
use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
the class TestOMRatisSnapshots method testInstallSnapshot.
/**
 * Verifies that a DB checkpoint taken on the leader OM can be installed on a
 * follower OM that was inactive while the leader processed writes, and that
 * the follower afterwards holds the volume, bucket and keys written meanwhile.
 */
@Test
public void testInstallSnapshot() throws Exception {
  // Resolve the current leader OM through the client's failover proxy.
  String leaderNodeId = OmFailoverProxyUtil
      .getFailoverProxyProvider(objectStore.getClientProxy())
      .getCurrentProxyOMNodeId();
  OzoneManager leader = cluster.getOzoneManager(leaderNodeId);
  OzoneManagerRatisServer leaderRatis = leader.getOmRatisServer();

  // Pick the inactive peer: use the first peer unless it is active, in which
  // case use the second one.
  String firstPeerId = leader.getPeerNodes().get(0).getNodeId();
  String inactiveNodeId = cluster.isOMActive(firstPeerId)
      ? leader.getPeerNodes().get(1).getNodeId()
      : firstPeerId;
  OzoneManager follower = cluster.getOzoneManager(inactiveNodeId);

  // Write keys on the leader so that its log index moves forward.
  List<String> writtenKeys = writeKeysToIncreaseLogIndex(leaderRatis, 200);

  // Record the leader's term/index and take a checkpoint of its DB.
  TransactionInfo leaderTxnInfo =
      TransactionInfo.readTransactionInfo(leader.getMetadataManager());
  TermIndex leaderTermIndex =
      TermIndex.valueOf(leaderTxnInfo.getTerm(), leaderTxnInfo.getTransactionIndex());
  long snapshotIndex = leaderTermIndex.getIndex();
  long snapshotTerm = leaderTermIndex.getTerm();
  DBCheckpoint leaderCheckpoint =
      leader.getMetadataManager().getStore().getCheckpoint(false);

  // Bring the lagging OM up and wait, polling every 100 ms for at most 3 s,
  // until it has applied transactions up to just before the snapshot index.
  cluster.startInactiveOM(inactiveNodeId);
  GenericTestUtils.waitFor(
      () -> follower.getOmRatisServer().getLastAppliedTermIndex().getIndex()
          >= snapshotIndex - 1,
      100, 3000);
  long appliedIndex = follower.getOmRatisServer().getLastAppliedTermIndex().getIndex();
  assertTrue(appliedIndex >= snapshotIndex - 1);

  // Install the leader's checkpoint on the follower.
  follower.installCheckpoint(leaderNodeId, leaderCheckpoint);

  // The follower's last applied index must now be at least the snapshot
  // index; it may be larger if Ratis added configuration entries.
  appliedIndex = follower.getOmRatisServer().getLastAppliedTermIndex().getIndex();
  assertTrue(appliedIndex >= snapshotIndex);
  long appliedTerm = follower.getOmRatisServer().getLastAppliedTermIndex().getTerm();
  assertTrue(appliedTerm >= snapshotTerm);

  // The follower's DB must contain everything written while it was down.
  OMMetadataManager followerMeta = follower.getMetadataManager();
  String volumeDbKey = followerMeta.getVolumeKey(volumeName);
  Assert.assertNotNull(followerMeta.getVolumeTable().get(volumeDbKey));
  String bucketDbKey = followerMeta.getBucketKey(volumeName, bucketName);
  Assert.assertNotNull(followerMeta.getBucketTable().get(bucketDbKey));
  for (String keyName : writtenKeys) {
    Assert.assertNotNull(followerMeta.getKeyTable(getDefaultBucketLayout())
        .get(followerMeta.getOzoneKey(volumeName, bucketName, keyName)));
  }
}
use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.
the class TestOMRatisSnapshots method testInstallCorruptedCheckpointFailure.
/**
 * Verifies that installing a corrupted leader checkpoint on a follower OM
 * fails and that the OM logs the expected "System Exit" message.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
  // Resolve the current leader OM through the client's failover proxy.
  String leaderNodeId = OmFailoverProxyUtil
      .getFailoverProxyProvider(objectStore.getClientProxy())
      .getCurrentProxyOMNodeId();
  OzoneManager leader = cluster.getOzoneManager(leaderNodeId);
  OzoneManagerRatisServer leaderRatis = leader.getOmRatisServer();

  // Pick the inactive peer: use the first peer unless it is active, in which
  // case use the second one.
  String firstPeerId = leader.getPeerNodes().get(0).getNodeId();
  String inactiveNodeId = cluster.isOMActive(firstPeerId)
      ? leader.getPeerNodes().get(1).getNodeId()
      : firstPeerId;
  OzoneManager follower = cluster.getOzoneManager(inactiveNodeId);

  // Write keys on the leader so that its log index moves forward.
  writeKeysToIncreaseLogIndex(leaderRatis, 100);

  DBCheckpoint leaderCheckpoint =
      leader.getMetadataManager().getStore().getCheckpoint(false);
  Path checkpointDir = leaderCheckpoint.getCheckpointLocation();
  TransactionInfo checkpointTxnInfo =
      OzoneManagerRatisUtils.getTrxnInfoFromCheckpoint(conf, checkpointDir);

  // Corrupt the checkpoint by deleting every other file whose name contains
  // ".sst", starting with the first one encountered.
  boolean deleteThisOne = true;
  for (File candidate : checkpointDir.toFile().listFiles()) {
    if (!candidate.getName().contains(".sst")) {
      continue;
    }
    if (deleteThisOne) {
      candidate.delete();
    }
    deleteThisOne = !deleteThisOne;
  }

  // Capture the OM's error log and replace the exit manager with a dummy one
  // for this test.
  GenericTestUtils.setLogLevel(OzoneManager.LOG, Level.ERROR);
  GenericTestUtils.LogCapturer omLog =
      GenericTestUtils.LogCapturer.captureLogs(OzoneManager.LOG);
  follower.setExitManagerForTesting(new DummyExitManager());

  follower.installCheckpoint(leaderNodeId, checkpointDir, checkpointTxnInfo);

  String expectedMessage =
      "System Exit: Failed to reload OM state and instantiate services.";
  Assert.assertTrue(omLog.getOutput().contains(expectedMessage));
}
Aggregations