Search in sources :

Example 6 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class StorageContainerServiceProviderImpl method getSCMDBSnapshot.

/**
 * Downloads an SCM DB snapshot tarball, untars it under
 * {@code scmSnapshotDBParentDir} and wraps the untarred directory in a
 * {@link RocksDBCheckpoint}.
 *
 * <p>When SCM HA is disabled the tarball is fetched over HTTP as the login
 * user. When SCM HA is enabled the Ratis peer roles are scanned and the
 * tarball is downloaded over gRPC from the first peer whose role field equals
 * {@code LEADER}.
 *
 * <p>The temporary tarball is removed whether or not the download and untar
 * succeed.
 *
 * @return the checkpoint for the untarred snapshot, or {@code null} if an
 *         {@link IOException} occurred while downloading or untarring
 */
public DBCheckpoint getSCMDBSnapshot() {
    String snapshotFileName = RECON_SCM_SNAPSHOT_DB + "_" + System.currentTimeMillis();
    File targetFile = new File(scmSnapshotDBParentDir, snapshotFileName + ".tar.gz");
    try {
        if (!SCMHAUtils.isSCMHAEnabled(configuration)) {
            SecurityUtil.doAsLoginUser(() -> {
                try (InputStream inputStream = reconUtils.makeHttpCall(connectionFactory, getScmDBSnapshotUrl(), isOmSpnegoEnabled()).getInputStream()) {
                    FileUtils.copyInputStreamToFile(inputStream, targetFile);
                }
                return null;
            });
            LOG.info("Downloaded SCM Snapshot from SCM");
        } else {
            List<String> ratisRoles = scmClient.getScmInfo().getRatisPeerRoles();
            for (String ratisRole : ratisRoles) {
                // Each entry is colon separated; field 2 is compared against the
                // Ratis role name and field 4 is used as the host address.
                String[] role = ratisRole.split(":");
                if (role[2].equals(RaftProtos.RaftPeerRole.LEADER.toString())) {
                    String hostAddress = role[4].trim();
                    int grpcPort = configuration.getInt(ScmConfigKeys.OZONE_SCM_GRPC_PORT_KEY, ScmConfigKeys.OZONE_SCM_GRPC_PORT_DEFAULT);
                    try (SCMSnapshotDownloader downloadClient = new InterSCMGrpcClient(hostAddress, grpcPort, configuration, new ReconCertificateClient(new SecurityConfig(configuration), reconStorage.getReconCertSerialId()))) {
                        downloadClient.download(targetFile.toPath()).get();
                    } catch (InterruptedException e) {
                        // Restore the interrupt flag so callers further up the stack
                        // can still observe the interruption.
                        Thread.currentThread().interrupt();
                        LOG.error("Rocks DB checkpoint downloading failed", e);
                        throw new IOException(e);
                    } catch (ExecutionException e) {
                        LOG.error("Rocks DB checkpoint downloading failed", e);
                        throw new IOException(e);
                    }
                    LOG.info("Downloaded SCM Snapshot from Leader SCM");
                    break;
                }
            }
        }
        // In HA mode no download happens when no peer reports the LEADER role;
        // fail with a clear message instead of trying to untar a missing file.
        if (!targetFile.exists()) {
            throw new IOException("SCM DB snapshot was not downloaded to " + targetFile.getAbsolutePath());
        }
        Path untarredDbDir = Paths.get(scmSnapshotDBParentDir.getAbsolutePath(), snapshotFileName);
        reconUtils.untarCheckpointFile(targetFile, untarredDbDir);
        return new RocksDBCheckpoint(untarredDbDir);
    } catch (IOException e) {
        LOG.error("Unable to obtain SCM DB Snapshot. ", e);
    } finally {
        // Remove the tarball on every path so failed attempts do not leave
        // partially downloaded files behind.
        FileUtils.deleteQuietly(targetFile);
    }
    return null;
}
Also used : Path(java.nio.file.Path) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) InputStream(java.io.InputStream) SCMSnapshotDownloader(org.apache.hadoop.hdds.scm.ha.SCMSnapshotDownloader) InterSCMGrpcClient(org.apache.hadoop.hdds.scm.ha.InterSCMGrpcClient) IOException(java.io.IOException) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) RocksDBCheckpoint(org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint) ReconCertificateClient(org.apache.hadoop.hdds.security.x509.certificate.client.ReconCertificateClient) SecurityConfig(org.apache.hadoop.hdds.security.x509.SecurityConfig) ExecutionException(java.util.concurrent.ExecutionException) File(java.io.File)

Example 7 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class OzoneManager method installSnapshotFromLeader.

/**
 * Download and install latest checkpoint from leader OM.
 *
 * <p>Returns early with {@code null} when no snapshot provider is configured
 * or when no checkpoint could be obtained from the leader. Any exception
 * thrown while installing the checkpoint is logged and also results in
 * {@code null}.
 *
 * @param leaderId peerNodeID of the leader OM
 * @return If checkpoint is installed successfully, return the
 *         corresponding termIndex. Otherwise, return null.
 */
public TermIndex installSnapshotFromLeader(String leaderId) {
    if (omSnapshotProvider == null) {
        LOG.error("OM Snapshot Provider is not configured as there are no peer " + "nodes.");
        return null;
    }
    DBCheckpoint omDBCheckpoint = getDBCheckpointFromLeader(leaderId);
    // NOTE(review): getDBCheckpointFromLeader is not visible here; guard in case
    // it signals a failed download by returning null, which would otherwise
    // cause a NullPointerException in the log statement below.
    if (omDBCheckpoint == null) {
        LOG.error("Failed to download checkpoint from Leader {}.", leaderId);
        return null;
    }
    LOG.info("Downloaded checkpoint from Leader {} to the location {}", leaderId, omDBCheckpoint.getCheckpointLocation());
    TermIndex termIndex = null;
    try {
        termIndex = installCheckpoint(leaderId, omDBCheckpoint);
    } catch (Exception ex) {
        LOG.error("Failed to install snapshot from Leader OM.", ex);
    }
    return termIndex;
}
Also used : DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) AuthenticationException(org.apache.hadoop.security.authentication.client.AuthenticationException) OMNotLeaderException(org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException) IOException(java.io.IOException) SequenceNumberNotFoundException(org.apache.hadoop.hdds.utils.db.SequenceNumberNotFoundException) UncheckedIOException(java.io.UncheckedIOException) OzoneSecurityException(org.apache.hadoop.hdds.security.OzoneSecurityException) OMLeaderNotReadyException(org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException) OMException(org.apache.hadoop.ozone.om.exceptions.OMException) ConfigurationException(org.apache.hadoop.hdds.conf.ConfigurationException) CertificateException(java.security.cert.CertificateException) TermIndex(org.apache.ratis.server.protocol.TermIndex)

Example 8 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class DBCheckpointServlet method doGet.

/**
 * Process a GET request for the DB checkpoint snapshot.
 *
 * <p>The request is rejected with 500 when the DB store is missing and with
 * 403 when ACLs are enabled and the caller has no principal or lacks
 * permission. Otherwise a checkpoint is taken (optionally flushing first,
 * controlled by the {@code OZONE_DB_CHECKPOINT_REQUEST_FLUSH} request
 * parameter), streamed to the response as a tgz attachment, and cleaned up
 * afterwards regardless of the outcome.
 *
 * @param request  The servlet request we are processing
 * @param response The servlet response we are creating
 */
@Override
public void doGet(HttpServletRequest request, HttpServletResponse response) {
    LOG.info("Received request to obtain DB checkpoint snapshot");
    if (dbStore == null) {
        LOG.error("Unable to process metadata snapshot request. DB Store is null");
        response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
        return;
    }
    // Check ACL for dbCheckpoint only when global Ozone ACL is enabled
    if (aclEnabled) {
        final java.security.Principal userPrincipal = request.getUserPrincipal();
        if (userPrincipal == null) {
            final String remoteUser = request.getRemoteUser();
            LOG.error("Permission denied: Unauthorized access to /dbCheckpoint," + " no user principal found. Current login user is {}.", remoteUser != null ? "'" + remoteUser + "'" : "UNKNOWN");
            response.setStatus(HttpServletResponse.SC_FORBIDDEN);
            return;
        } else {
            final String userPrincipalName = userPrincipal.getName();
            UserGroupInformation ugi = UserGroupInformation.createRemoteUser(userPrincipalName);
            if (!hasPermission(ugi)) {
                LOG.error("Permission denied: User principal '{}' does not have" + " access to /dbCheckpoint.\nThis can happen when Ozone" + " Manager is started with a different user.\n" + " Please append '{}' to OM 'ozone.administrators'" + " config and restart OM to grant current" + " user access to this endpoint.", userPrincipalName, userPrincipalName);
                response.setStatus(HttpServletResponse.SC_FORBIDDEN);
                return;
            }
            LOG.debug("Granted user principal '{}' access to /dbCheckpoint.", userPrincipalName);
        }
    }
    DBCheckpoint checkpoint = null;
    try {
        boolean flush = false;
        String flushParam = request.getParameter(OZONE_DB_CHECKPOINT_REQUEST_FLUSH);
        if (StringUtils.isNotEmpty(flushParam)) {
            // parseBoolean yields a primitive directly, avoiding a boxing round trip.
            flush = Boolean.parseBoolean(flushParam);
        }
        checkpoint = dbStore.getCheckpoint(flush);
        if (checkpoint == null || checkpoint.getCheckpointLocation() == null) {
            LOG.error("Unable to process metadata snapshot request. " + "Checkpoint request returned null.");
            response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
            return;
        }
        dbMetrics.setLastCheckpointCreationTimeTaken(checkpoint.checkpointCreationTimeTaken());
        Path file = checkpoint.getCheckpointLocation().getFileName();
        if (file == null) {
            // Without a file name there is nothing to stream; report the failure
            // instead of returning an empty successful response.
            LOG.error("Unable to process metadata snapshot request. Checkpoint location {} has no file name.", checkpoint.getCheckpointLocation());
            response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
            return;
        }
        response.setContentType("application/x-tgz");
        response.setHeader("Content-Disposition", "attachment; filename=\"" + file.toString() + ".tgz\"");
        Instant start = Instant.now();
        writeDBCheckpointToStream(checkpoint, response.getOutputStream());
        Instant end = Instant.now();
        long duration = Duration.between(start, end).toMillis();
        LOG.info("Time taken to write the checkpoint to response output " + "stream: {} milliseconds", duration);
        dbMetrics.setLastCheckpointStreamingTimeTaken(duration);
        dbMetrics.incNumCheckpoints();
    } catch (Exception e) {
        LOG.error("Unable to process metadata snapshot request. ", e);
        response.setStatus(HttpServletResponse.SC_INTERNAL_SERVER_ERROR);
        dbMetrics.incNumCheckpointFails();
    } finally {
        // Always release the checkpoint, including on the early-return paths above.
        if (checkpoint != null) {
            try {
                checkpoint.cleanupCheckpoint();
            } catch (IOException e) {
                // Pass the exception as the last argument so the stack trace is logged.
                LOG.error("Error trying to clean checkpoint at {} .", checkpoint.getCheckpointLocation().toString(), e);
            }
        }
    }
}
Also used : Path(java.nio.file.Path) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) Instant(java.time.Instant) IOException(java.io.IOException) ServletException(javax.servlet.ServletException) CompressorException(org.apache.commons.compress.compressors.CompressorException) IOException(java.io.IOException) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)

Example 9 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class TestOMRatisSnapshots method testInstallSnapshot.

/**
 * Takes a DB checkpoint on the leader OM, starts the inactive OM, installs
 * the checkpoint on it and verifies that its last applied term/index and its
 * metadata tables reflect the leader's state.
 */
@Test
public void testInstallSnapshot() throws Exception {
    // Resolve the current leader OM and its Ratis server.
    String leaderOMNodeId = OmFailoverProxyUtil.getFailoverProxyProvider(objectStore.getClientProxy()).getCurrentProxyOMNodeId();
    OzoneManager leaderOM = cluster.getOzoneManager(leaderOMNodeId);
    OzoneManagerRatisServer leaderRatisServer = leaderOM.getOmRatisServer();

    // Pick the first peer; if that one is active, fall back to the second peer.
    String inactiveNodeId = leaderOM.getPeerNodes().get(0).getNodeId();
    if (cluster.isOMActive(inactiveNodeId)) {
        inactiveNodeId = leaderOM.getPeerNodes().get(1).getNodeId();
    }
    final OzoneManager followerOM = cluster.getOzoneManager(inactiveNodeId);

    // Write keys on the leader so that its log index moves forward.
    List<String> writtenKeys = writeKeysToIncreaseLogIndex(leaderRatisServer, 200);

    // Record the leader's term/index and take a DB checkpoint (no flush).
    TransactionInfo leaderTrxnInfo = TransactionInfo.readTransactionInfo(leaderOM.getMetadataManager());
    TermIndex leaderOMTermIndex = TermIndex.valueOf(leaderTrxnInfo.getTerm(), leaderTrxnInfo.getTransactionIndex());
    final long leaderOMSnapshotIndex = leaderOMTermIndex.getIndex();
    final long leaderOMSnapshotTermIndex = leaderOMTermIndex.getTerm();
    DBCheckpoint leaderDbCheckpoint = leaderOM.getMetadataManager().getStore().getCheckpoint(false);

    // Bring up the OM that was inactive until now.
    cluster.startInactiveOM(inactiveNodeId);

    // Poll every 100 ms, for at most 3 s, until the follower's last applied
    // index has reached at least (leader snapshot index - 1).
    GenericTestUtils.waitFor(
        () -> followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex() >= leaderOMSnapshotIndex - 1,
        100,
        3000);
    long followerAppliedIndex = followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex();
    assertTrue(followerAppliedIndex >= leaderOMSnapshotIndex - 1);

    // Install the leader's checkpoint on the follower.
    followerOM.installCheckpoint(leaderOMNodeId, leaderDbCheckpoint);

    // Once installed, the follower's last applied index and term must be at
    // least those of the checkpoint; the index may be greater than the
    // snapshot index if there is any conf entry from ratis.
    followerAppliedIndex = followerOM.getOmRatisServer().getLastAppliedTermIndex().getIndex();
    assertTrue(followerAppliedIndex >= leaderOMSnapshotIndex);
    long followerAppliedTerm = followerOM.getOmRatisServer().getLastAppliedTermIndex().getTerm();
    assertTrue(followerAppliedTerm >= leaderOMSnapshotTermIndex);

    // The follower's DB must now contain the volume, the bucket and every key
    // written while it was inactive.
    OMMetadataManager followerMeta = followerOM.getMetadataManager();
    Assert.assertNotNull(
        followerMeta.getVolumeTable().get(
            followerMeta.getVolumeKey(volumeName)));
    Assert.assertNotNull(
        followerMeta.getBucketTable().get(
            followerMeta.getBucketKey(volumeName, bucketName)));
    for (String writtenKey : writtenKeys) {
        Assert.assertNotNull(
            followerMeta.getKeyTable(getDefaultBucketLayout()).get(
                followerMeta.getOzoneKey(volumeName, bucketName, writtenKey)));
    }
}
Also used : DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) TransactionInfo(org.apache.hadoop.hdds.utils.TransactionInfo) OzoneManagerRatisServer(org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer) TermIndex(org.apache.ratis.server.protocol.TermIndex) Test(org.junit.jupiter.api.Test)

Example 10 with DBCheckpoint

use of org.apache.hadoop.hdds.utils.db.DBCheckpoint in project ozone by apache.

the class TestOMRatisSnapshots method testInstallCorruptedCheckpointFailure.

/**
 * Removes every other {@code .sst} file from a leader checkpoint, installs
 * the damaged checkpoint on the inactive OM and expects the captured
 * OzoneManager log to contain the "System Exit" failure message.
 */
@Test
public void testInstallCorruptedCheckpointFailure() throws Exception {
    // Resolve the current leader OM and its Ratis server.
    String leaderOMNodeId = OmFailoverProxyUtil.getFailoverProxyProvider(objectStore.getClientProxy()).getCurrentProxyOMNodeId();
    OzoneManager leaderOM = cluster.getOzoneManager(leaderOMNodeId);
    OzoneManagerRatisServer leaderRatisServer = leaderOM.getOmRatisServer();

    // Pick the first peer; if that one is active, fall back to the second peer.
    String inactiveNodeId = leaderOM.getPeerNodes().get(0).getNodeId();
    if (cluster.isOMActive(inactiveNodeId)) {
        inactiveNodeId = leaderOM.getPeerNodes().get(1).getNodeId();
    }
    OzoneManager followerOM = cluster.getOzoneManager(inactiveNodeId);

    // Write keys on the leader so that its log index moves forward.
    writeKeysToIncreaseLogIndex(leaderRatisServer, 100);

    DBCheckpoint leaderDbCheckpoint = leaderOM.getMetadataManager().getStore().getCheckpoint(false);
    Path leaderCheckpointLocation = leaderDbCheckpoint.getCheckpointLocation();
    TransactionInfo leaderCheckpointTrxnInfo = OzoneManagerRatisUtils.getTrxnInfoFromCheckpoint(conf, leaderCheckpointLocation);

    // Corrupt the checkpoint by deleting the 1st, 3rd, 5th, ... file whose
    // name contains ".sst" (in listing order); the others are left in place.
    int sstSeen = 0;
    for (File candidate : leaderCheckpointLocation.toFile().listFiles()) {
        if (!candidate.getName().contains(".sst")) {
            continue;
        }
        if (sstSeen % 2 == 0) {
            candidate.delete();
        }
        sstSeen++;
    }

    // Capture OzoneManager logs at ERROR level and swap in a dummy exit
    // manager before installing the corrupted checkpoint.
    GenericTestUtils.setLogLevel(OzoneManager.LOG, Level.ERROR);
    GenericTestUtils.LogCapturer logCapture = GenericTestUtils.LogCapturer.captureLogs(OzoneManager.LOG);
    followerOM.setExitManagerForTesting(new DummyExitManager());

    followerOM.installCheckpoint(leaderOMNodeId, leaderCheckpointLocation, leaderCheckpointTrxnInfo);

    String capturedOutput = logCapture.getOutput();
    Assert.assertTrue(capturedOutput.contains("System Exit: " + "Failed to reload OM state and instantiate services."));
}
Also used : Path(java.nio.file.Path) DBCheckpoint(org.apache.hadoop.hdds.utils.db.DBCheckpoint) TransactionInfo(org.apache.hadoop.hdds.utils.TransactionInfo) GenericTestUtils(org.apache.ozone.test.GenericTestUtils) OzoneManagerRatisServer(org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer) File(java.io.File) Test(org.junit.jupiter.api.Test)

Aggregations

DBCheckpoint (org.apache.hadoop.hdds.utils.db.DBCheckpoint): 23; File (java.io.File): 10; IOException (java.io.IOException): 10; Path (java.nio.file.Path): 7; TermIndex (org.apache.ratis.server.protocol.TermIndex): 7; Test (org.junit.Test): 6; StorageContainerManager (org.apache.hadoop.hdds.scm.server.StorageContainerManager): 5; RocksDBCheckpoint (org.apache.hadoop.hdds.utils.db.RocksDBCheckpoint): 5; Test (org.junit.jupiter.api.Test): 5; InputStream (java.io.InputStream): 4; ReconUtils (org.apache.hadoop.ozone.recon.ReconUtils): 4; GenericTestUtils (org.apache.ozone.test.GenericTestUtils): 4; HttpURLConnection (java.net.HttpURLConnection): 3; OzoneConfiguration (org.apache.hadoop.hdds.conf.OzoneConfiguration): 3; SCMStateMachine (org.apache.hadoop.hdds.scm.ha.SCMStateMachine): 3; TransactionInfo (org.apache.hadoop.hdds.utils.TransactionInfo): 3; OzoneManagerRatisServer (org.apache.hadoop.ozone.om.ratis.OzoneManagerRatisServer): 3; VisibleForTesting (com.google.common.annotations.VisibleForTesting): 2; InvalidProtocolBufferException (com.google.protobuf.InvalidProtocolBufferException): 2; FileInputStream (java.io.FileInputStream): 2