use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.
the class ComputeSystemDiscoveryEngine method discover.
/**
 * Performs discovery of a given target. An exclusive lock is obtained for the target so that only a single node can
 * be performing discovery for any given object at a time. The lock is released in a {@code finally} block, so it is
 * given back whether discovery completes normally or throws.
 *
 * @param targetId
 *            the ID of the target to discover; it is also the name used to look up the lock.
 *
 * @throws Exception
 *             if an error occurs obtaining or releasing the lock, or if one is propagated from the discovery
 *             itself.
 */
public void discover(String targetId) throws Exception {
    InterProcessLock lock = coordinatorClient.getLock(targetId);
    // Parameterized logging performs the level check itself and the only argument is an existing String,
    // so explicit isInfoEnabled() guards add nothing here.
    LOG.info("Acquiring lock for compute system discovery: {}", targetId);
    lock.acquire();
    try {
        LOG.info("Acquired lock for compute system discovery: {}", targetId);
        discoverInLock(targetId);
    } finally {
        lock.release();
        LOG.info("Lock Released for compute system discovery: {}", targetId);
    }
}
use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.
the class DisasterRecoveryService method addStandby.
/**
 * Attach one fresh install site to this active site as standby
 * Or attach an active site for the local standby site when it's first being added.
 * <p>
 * After the prechecks pass, the standby's configuration is fetched through a ViPR client. Then a new {@link Site}
 * record in state {@code STANDBY_ADDING} is persisted inside a coordinator transaction, the vdc target version of
 * the local site and of every existing standby is bumped, and the resulting site configuration is pushed to the
 * new standby. Any failure inside the transaction discards it, writes a failure audit record and is reported as
 * an "add standby failed" internal server error.
 *
 * @param param site detail information
 * @brief Add standby site
 * @return site response information
 */
@POST
@Consumes({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
@Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
@CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true)
public SiteRestRep addStandby(SiteAddParam param) {
    log.info("Adding standby site: {}", param.getVip());
    precheckForSiteNumber();
    precheckForGeo();
    List<Site> existingSites = drUtil.listStandbySites();
    // parameter validation and precheck
    validateAddParam(param, existingSites);
    // check the version before using the ViPR client, otherwise there might be compatibility issues.
    precheckStandbyVersion(param);
    ViPRCoreClient viprCoreClient;
    SiteConfigRestRep standbyConfig;
    try {
        viprCoreClient = createViPRCoreClient(param.getVip(), param.getUsername(), param.getPassword());
        standbyConfig = viprCoreClient.site().getStandbyConfig();
    } catch (Exception e) {
        log.error("Unexpected error when retrieving standby config", e);
        throw APIException.internalServerErrors.addStandbyPrecheckFailed("Cannot retrieve config from standby site");
    }
    String siteId = standbyConfig.getUuid();
    precheckForStandbyAdd(standbyConfig, viprCoreClient);
    // NOTE(review): the lock is only ever released in this method, so getDROperationLock() presumably returns
    // it already acquired - confirm against DrUtil.
    InterProcessLock lock = drUtil.getDROperationLock();
    Site standbySite = null;
    try {
        // Build the record for the new standby from the configuration it reported.
        standbySite = new Site();
        standbySite.setCreationTime((new Date()).getTime());
        standbySite.setName(param.getName());
        standbySite.setVdcShortId(drUtil.getLocalVdcShortId());
        standbySite.setVip(standbyConfig.getVip());
        standbySite.setVip6(standbyConfig.getVip6());
        standbySite.getHostIPv4AddressMap().putAll(new StringMap(standbyConfig.getHostIPv4AddressMap()));
        standbySite.getHostIPv6AddressMap().putAll(new StringMap(standbyConfig.getHostIPv6AddressMap()));
        standbySite.setNodeCount(standbyConfig.getNodeCount());
        standbySite.setUuid(standbyConfig.getUuid());
        String shortId = generateShortId(drUtil.listSites());
        standbySite.setSiteShortId(shortId);
        standbySite.setDescription(param.getDescription());
        standbySite.setState(SiteState.STANDBY_ADDING);
        // Guard kept on purpose: toString() would otherwise be evaluated even when debug is off.
        if (log.isDebugEnabled()) {
            log.debug(standbySite.toString());
        }
        // Do this before tx get started which might write key to zk.
        SecretKey secretKey = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API);
        coordinator.startTransaction();
        coordinator.addSite(standbyConfig.getUuid());
        log.info("Persist standby site to ZK {}", shortId);
        coordinator.persistServiceConfiguration(standbySite.toConfiguration());
        drUtil.recordDrOperationStatus(standbySite.getUuid(), InterState.ADDING_STANDBY);
        // wake up syssvc to regenerate configurations
        long vdcConfigVersion = DrUtil.newVdcConfigVersion();
        drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_ADD_STANDBY, vdcConfigVersion);
        for (Site site : existingSites) {
            drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_ADD_STANDBY, vdcConfigVersion);
        }
        // sync site related info with to be added standby site
        // The data revision handed to the new standby is the same number as the new vdc config version.
        long dataRevision = vdcConfigVersion;
        List<Site> allStandbySites = new ArrayList<>();
        allStandbySites.add(standbySite);
        allStandbySites.addAll(existingSites);
        SiteConfigParam configParam = prepareSiteConfigParam(allStandbySites, ipsecConfig.getPreSharedKey(), standbyConfig.getUuid(), dataRevision, vdcConfigVersion, secretKey);
        viprCoreClient.site().syncSite(standbyConfig.getUuid(), configParam);
        drUtil.updateVdcTargetVersion(siteId, SiteInfo.DR_OP_CHANGE_DATA_REVISION, vdcConfigVersion, dataRevision);
        coordinator.commitTransaction();
        auditDisasterRecoveryOps(OperationTypeEnum.ADD_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, standbySite.toBriefString());
        return siteMapper.map(standbySite);
    } catch (Exception e) {
        log.error("Internal error for updating coordinator on standby", e);
        coordinator.discardTransaction();
        auditDisasterRecoveryOps(OperationTypeEnum.ADD_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, standbySite.toBriefString());
        throw APIException.internalServerErrors.addStandbyFailed(e.getMessage());
    } finally {
        try {
            lock.release();
        } catch (Exception releaseError) {
            // Best effort: a failed release must not replace the result or exception of the operation itself,
            // but the cause is logged so that a stuck DR lock can be diagnosed.
            log.error("Lock release failed when adding standby {}", siteId, releaseError);
        }
    }
}
use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.
the class DisasterRecoveryService method retryOperation.
/**
 * Query the latest error message & Retry the Last operation for specific standby site
 * <p>
 * The retry is only accepted when the site is in {@code STANDBY_ERROR}, its last state was one of pausing,
 * resuming or failing over, and the action currently required on the local site still matches that last state.
 * The site is then put back into its last state and every site's vdc target version is bumped so that the
 * operation runs again.
 *
 * @param uuid site UUID
 * @brief Query the latest error message & Retry the Last operation for specific standby site
 * @return updated standby site representation
 */
@POST
@Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
@CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN })
@Path("/{uuid}/retry")
public SiteRestRep retryOperation(@PathParam("uuid") String uuid) {
    log.info("Begin to get site error by uuid {}", uuid);
    Site standby;
    try {
        standby = drUtil.getSiteFromLocalVdc(uuid);
    } catch (CoordinatorException e) {
        // Keep the cause in the log; the API error returned to the caller carries no detail.
        log.error("Can't find site {} from ZK", uuid, e);
        throw APIException.badRequests.siteIdNotFound();
    }
    if (!standby.getState().equals(SiteState.STANDBY_ERROR)) {
        log.error("site {} is in state {}, should be STANDBY_ERROR", uuid, standby.getState());
        throw APIException.badRequests.operationOnlyAllowedOnErrorSite(standby.getName(), standby.getState().toString());
    }
    if (!standby.getLastState().equals(SiteState.STANDBY_PAUSING) && !standby.getLastState().equals(SiteState.STANDBY_RESUMING) && !standby.getLastState().equals(SiteState.STANDBY_FAILING_OVER)) {
        log.error("site {} lastState was {}, retry is only supported for Pause, Resume and Failover", uuid, standby.getLastState());
        throw APIException.badRequests.operationRetryOnlyAllowedOnLastState(standby.getName(), standby.getLastState().toString());
    }
    // Reuse the current action required
    Site localSite = drUtil.getLocalSite();
    SiteInfo siteInfo = coordinator.getTargetInfo(localSite.getUuid(), SiteInfo.class);
    String drOperation = siteInfo.getActionRequired();
    // Check that last action matches retry action
    if (!drOperation.equals(standby.getLastState().getDRAction())) {
        log.error("Active site last operation was {}, retry is only supported if no other operations have been performed", drOperation);
        throw APIException.internalServerErrors.retryStandbyPrecheckFailed(standby.getName(), standby.getLastState().toString(), String.format("Another DR operation %s has been run on Active site. Only the latest operation can be retried. " + "This is an unrecoverable Error, please remove site and deploy a new one.", drOperation));
    }
    // NOTE(review): the lock is only ever released in this method, so getDROperationLock() presumably returns
    // it already acquired - confirm against DrUtil.
    InterProcessLock lock = drUtil.getDROperationLock();
    try {
        coordinator.startTransaction();
        standby.setState(standby.getLastState());
        // Failover requires setting old active site to last state as well.
        if (standby.getState() == SiteState.STANDBY_FAILING_OVER) {
            for (Site site : drUtil.listSites()) {
                if (site.getLastState() == SiteState.ACTIVE_FAILING_OVER) {
                    site.setState(SiteState.ACTIVE_FAILING_OVER);
                    coordinator.persistServiceConfiguration(site.toConfiguration());
                }
            }
        }
        coordinator.persistServiceConfiguration(standby.toConfiguration());
        log.info("Notify all sites for reconfig");
        long vdcTargetVersion = DrUtil.newVdcConfigVersion();
        for (Site site : drUtil.listSites()) {
            String siteUuid = site.getUuid();
            if (site.getLastState() == SiteState.STANDBY_RESUMING) {
                SiteInfo siteTargetInfo = coordinator.getTargetInfo(siteUuid, SiteInfo.class);
                String resumeSiteOperation = siteTargetInfo.getActionRequired();
                if (resumeSiteOperation.equals(SiteInfo.DR_OP_CHANGE_DATA_REVISION)) {
                    // A site that was resuming with a pending data-revision change keeps that operation and
                    // gets a new data revision equal to the new target version, instead of drOperation.
                    long dataRevision = vdcTargetVersion;
                    drUtil.updateVdcTargetVersion(siteUuid, resumeSiteOperation, vdcTargetVersion, dataRevision);
                    continue;
                }
            }
            log.info("Set dr operation {} on site {}", drOperation, siteUuid);
            drUtil.updateVdcTargetVersion(siteUuid, drOperation, vdcTargetVersion);
        }
        coordinator.commitTransaction();
        return siteMapper.map(standby);
    } catch (Exception e) {
        log.error("Error retrying site operation for site {}", uuid, e);
        coordinator.discardTransaction();
        auditDisasterRecoveryOps(OperationTypeEnum.RETRY_STANDBY_OP, AuditLogManager.AUDITLOG_FAILURE, null, standby);
        throw APIException.internalServerErrors.retryStandbyOpFailed(standby.getName(), e.getMessage());
    } finally {
        try {
            lock.release();
        } catch (Exception releaseError) {
            // Best effort: a failed release must not replace the result or exception of the operation itself,
            // but the cause is logged so that a stuck DR lock can be diagnosed.
            log.error("Lock release failed when retrying standby site last op: {}", uuid, releaseError);
        }
    }
}
use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.
the class DisasterRecoveryService method resumeStandby.
/**
 * Resume data replication for a paused standby site
 * <p>
 * The site must be in {@code STANDBY_PAUSED} or {@code ACTIVE_DEGRADED} and its network health must not be
 * broken. After a remote precheck, the target standby is re-initialised with a fresh site configuration and
 * moved to {@code STANDBY_RESUMING} inside a coordinator transaction, while the other standbys and finally the
 * local site get a new vdc target version for the resume operation.
 *
 * @param uuid site UUID
 * @brief Resume data replication for a paused standby site
 * @return updated standby site representation
 */
@POST
@Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
@CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN, Role.SYSTEM_ADMIN, Role.RESTRICTED_SYSTEM_ADMIN }, blockProxies = true)
@Path("/{uuid}/resume")
public SiteRestRep resumeStandby(@PathParam("uuid") String uuid) {
    log.info("Begin to resume data sync to standby site identified by uuid: {}", uuid);
    Site standby = validateSiteConfig(uuid);
    SiteState state = standby.getState();
    if (!state.equals(SiteState.STANDBY_PAUSED) && !state.equals(SiteState.ACTIVE_DEGRADED)) {
        log.error("site {} is in state {}, should be STANDBY_PAUSED or ACTIVE_DEGRADED", uuid, standby.getState());
        throw APIException.badRequests.operationOnlyAllowedOnPausedSite(standby.getName(), standby.getState().toString());
    }
    SiteNetworkState networkState = drUtil.getSiteNetworkState(uuid);
    if (networkState.getNetworkHealth() == NetworkHealth.BROKEN) {
        throw APIException.internalServerErrors.siteConnectionBroken(standby.getName(), "Network health state is broken.");
    }
    try (InternalSiteServiceClient client = createInternalSiteServiceClient(standby)) {
        commonPrecheck();
        client.setCoordinatorClient(coordinator);
        client.setKeyGenerator(apiSignatureGenerator);
        client.resumePrecheck();
    } catch (APIException e) {
        // Already a well-formed API error; pass it through instead of wrapping it below.
        throw e;
    } catch (Exception e) {
        throw APIException.internalServerErrors.resumeStandbyPrecheckFailed(standby.getName(), e.getMessage());
    }
    // Do this before tx get started which might write key to zk.
    SecretKey secretKey = apiSignatureGenerator.getSignatureKey(SignatureKeyType.INTERVDC_API);
    // NOTE(review): the lock is only ever released in this method, so getDROperationLock() presumably returns
    // it already acquired - confirm against DrUtil.
    InterProcessLock lock = drUtil.getDROperationLock();
    long vdcTargetVersion = DrUtil.newVdcConfigVersion();
    try {
        coordinator.startTransaction();
        for (Site site : drUtil.listStandbySites()) {
            if (site.getUuid().equals(uuid)) {
                // Normal-path message: logged at info, with a placeholder so the uuid is actually printed.
                log.info("Re-init the target standby {}", uuid);
                // init the to-be resumed standby site
                // The data revision handed to the resumed standby is the same number as the new target version.
                long dataRevision = vdcTargetVersion;
                List<Site> standbySites = drUtil.listStandbySites();
                SiteConfigParam configParam = prepareSiteConfigParam(standbySites, ipsecConfig.getPreSharedKey(), uuid, dataRevision, vdcTargetVersion, secretKey);
                try (InternalSiteServiceClient internalSiteServiceClient = new InternalSiteServiceClient()) {
                    internalSiteServiceClient.setCoordinatorClient(coordinator);
                    internalSiteServiceClient.setServer(site.getVipEndPoint());
                    internalSiteServiceClient.initStandby(configParam);
                }
                site.setState(SiteState.STANDBY_RESUMING);
                coordinator.persistServiceConfiguration(site.toConfiguration());
                drUtil.recordDrOperationStatus(site.getUuid(), InterState.RESUMING_STANDBY);
                drUtil.updateVdcTargetVersion(uuid, SiteInfo.DR_OP_CHANGE_DATA_REVISION, vdcTargetVersion, dataRevision);
            } else {
                drUtil.updateVdcTargetVersion(site.getUuid(), SiteInfo.DR_OP_RESUME_STANDBY, vdcTargetVersion);
            }
        }
        // update the local(active) site last
        drUtil.updateVdcTargetVersion(coordinator.getSiteId(), SiteInfo.DR_OP_RESUME_STANDBY, vdcTargetVersion);
        coordinator.commitTransaction();
        auditDisasterRecoveryOps(OperationTypeEnum.RESUME_STANDBY, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, standby.toBriefString());
        return siteMapper.map(standby);
    } catch (Exception e) {
        log.error("Error resuming site {}", uuid, e);
        coordinator.discardTransaction();
        auditDisasterRecoveryOps(OperationTypeEnum.RESUME_STANDBY, AuditLogManager.AUDITLOG_FAILURE, null, standby.toBriefString());
        throw APIException.internalServerErrors.resumeStandbyFailed(standby.getName(), e.getMessage());
    } finally {
        try {
            lock.release();
        } catch (Exception releaseError) {
            // Best effort: a failed release must not replace the result or exception of the operation itself,
            // but the cause is logged so that a stuck DR lock can be diagnosed.
            log.error("Lock release failed when resuming standby site: {}", uuid, releaseError);
        }
    }
}
use of org.apache.curator.framework.recipes.locks.InterProcessLock in project coprhd-controller by CoprHD.
the class DisasterRecoveryService method doSwitchover.
/**
 * Do Site Switchover
 * This API will do switchover to target new active site according passed in site UUID. After failover, old active site will
 * work as normal standby site and target site will be promoted to active. All site will update properties to trigger reconfig.
 * <p>
 * Paused standbys other than the target are contacted directly through an internal site client, both for the
 * precheck and for the switchover itself; all other sites are notified through a new vdc target version.
 *
 * @param uuid target new active site UUID
 * @brief Do site switchover
 * @return return accepted response if operation is successful
 */
@POST
@Produces({ MediaType.APPLICATION_XML, MediaType.APPLICATION_JSON })
@Path("/{uuid}/switchover")
@CheckPermission(roles = { Role.SECURITY_ADMIN, Role.RESTRICTED_SECURITY_ADMIN }, blockProxies = true)
public Response doSwitchover(@PathParam("uuid") String uuid) {
    log.info("Begin to switchover for standby UUID {}", uuid);
    precheckForSwitchoverForActiveSite(uuid);
    List<Site> allStandbySites = drUtil.listStandbySites();
    for (Site site : allStandbySites) {
        if (!site.getUuid().equals(uuid) && site.getState() == SiteState.STANDBY_PAUSED) {
            try (InternalSiteServiceClient client = new InternalSiteServiceClient(site)) {
                client.setCoordinatorClient(coordinator);
                client.setKeyGenerator(apiSignatureGenerator);
                client.switchoverPrecheck();
            }
        }
    }
    String oldActiveUUID = drUtil.getActiveSite().getUuid();
    // NOTE(review): the lock is only ever released in this method, so getDROperationLock() presumably returns
    // it already acquired - confirm against DrUtil.
    InterProcessLock lock = drUtil.getDROperationLock();
    Site newActiveSite = null;
    Site oldActiveSite = null;
    try {
        newActiveSite = drUtil.getSiteFromLocalVdc(uuid);
        // Set old active site's state, short id and key
        oldActiveSite = drUtil.getSiteFromLocalVdc(oldActiveUUID);
        if (StringUtils.isEmpty(oldActiveSite.getSiteShortId())) {
            oldActiveSite.setSiteShortId(newActiveSite.getVdcShortId());
        }
        coordinator.startTransaction();
        oldActiveSite.setState(SiteState.ACTIVE_SWITCHING_OVER);
        coordinator.persistServiceConfiguration(oldActiveSite.toConfiguration());
        // this barrier is set when begin switchover and will be removed by new active site. Old active site will wait and reboot after
        // barrier is removed
        DistributedBarrier restartBarrier = coordinator.getDistributedBarrier(String.format("%s/%s/%s", ZkPath.SITES, oldActiveSite.getUuid(), Constants.SWITCHOVER_BARRIER_RESTART));
        restartBarrier.setBarrier();
        drUtil.recordDrOperationStatus(oldActiveSite.getUuid(), InterState.SWITCHINGOVER_ACTIVE);
        // trigger reconfig
        // a version for all sites.
        long vdcConfigVersion = DrUtil.newVdcConfigVersion();
        for (Site eachSite : drUtil.listSites()) {
            if (!eachSite.getUuid().equals(uuid) && eachSite.getState() == SiteState.STANDBY_PAUSED) {
                try (InternalSiteServiceClient client = new InternalSiteServiceClient(eachSite)) {
                    client.setCoordinatorClient(coordinator);
                    client.setKeyGenerator(apiSignatureGenerator);
                    client.switchover(newActiveSite.getUuid(), vdcConfigVersion);
                }
            } else {
                drUtil.updateVdcTargetVersion(eachSite.getUuid(), SiteInfo.DR_OP_SWITCHOVER, vdcConfigVersion, oldActiveSite.getUuid(), newActiveSite.getUuid());
            }
        }
        coordinator.commitTransaction();
        auditDisasterRecoveryOps(OperationTypeEnum.SWITCHOVER, AuditLogManager.AUDITLOG_SUCCESS, AuditLogManager.AUDITOP_BEGIN, oldActiveSite.toBriefString(), newActiveSite.toBriefString());
        return Response.status(Response.Status.ACCEPTED).build();
    } catch (Exception e) {
        log.error(String.format("Error happened when switchover from site %s to site %s", oldActiveUUID, uuid), e);
        coordinator.discardTransaction();
        // Either site lookup at the top of the try block may be what failed, in which case the corresponding
        // Site is still null. Fall back to the UUIDs so that the audit record and the API error are still
        // produced instead of a NullPointerException that would hide the original failure.
        String newActiveName = newActiveSite != null ? newActiveSite.getName() : uuid;
        String newActiveEndPoint = newActiveSite != null ? newActiveSite.getVipEndPoint() : "";
        String oldActiveName = oldActiveSite != null ? oldActiveSite.getName() : oldActiveUUID;
        auditDisasterRecoveryOps(OperationTypeEnum.SWITCHOVER, AuditLogManager.AUDITLOG_FAILURE, null, newActiveName, newActiveEndPoint);
        throw APIException.internalServerErrors.switchoverFailed(oldActiveName, newActiveName, e.getMessage());
    } finally {
        try {
            lock.release();
        } catch (Exception releaseError) {
            // Best effort: a failed release must not replace the result or exception of the operation itself,
            // but the cause is logged so that a stuck DR lock can be diagnosed.
            log.error("Lock release failed when switchover from {} to {}", oldActiveUUID, uuid, releaseError);
        }
    }
}
Aggregations