use of org.apache.geode.distributed.internal.membership.MembershipManager in project geode by apache.
the class ProductUseLog method monitorUse.
/**
* adds the log as a membership listener to the given system and logs the view when members join
*/
public void monitorUse(InternalDistributedSystem system) {
this.system = system;
DM dmgr = system.getDistributionManager();
dmgr.addMembershipListener(this);
MembershipManager mmgr = dmgr.getMembershipManager();
if (mmgr != null) {
log("Log opened with new distributed system connection. " + system.getDM().getMembershipManager().getView());
} else {
// membership manager not initialized?
log("Log opened with new distributed system connection. Membership view not yet available in this VM.");
}
}
use of org.apache.geode.distributed.internal.membership.MembershipManager in project geode by apache.
the class MembershipManagerHelper method getMembershipManager.
/** returns the JGroupMembershipManager for the given distributed system */
public static MembershipManager getMembershipManager(DistributedSystem sys) {
InternalDistributedSystem isys = (InternalDistributedSystem) sys;
DistributionManager dm = (DistributionManager) isys.getDM();
MembershipManager mgr = dm.getMembershipManager();
return mgr;
}
use of org.apache.geode.distributed.internal.membership.MembershipManager in project geode by apache.
the class DistributionManagerDUnitTest method testSurpriseMemberHandling.
/**
* Test the handling of "surprise members" in the membership manager. Create a DistributedSystem
* in this VM and then add a fake member to its surpriseMember set. Then ensure that it stays in
* the set when a new membership view arrives that doesn't contain it. Then wait until the member
* should be gone and force more view processing to have it scrubbed from the set.
**/
@Test
public void testSurpriseMemberHandling() throws Exception {
System.setProperty(DistributionConfig.GEMFIRE_PREFIX + "surprise-member-timeout", "3000");
InternalDistributedSystem sys = getSystem();
MembershipManager mgr = MembershipManagerHelper.getMembershipManager(sys);
assertTrue(((GMSMembershipManager) mgr).isCleanupTimerStarted());
try {
InternalDistributedMember mbr = new InternalDistributedMember(NetworkUtils.getIPLiteral(), 12345);
// first make sure we can't add this as a surprise member (bug #44566)
// if the view number isn't being recorded correctly the test will pass but the
// functionality is broken
Assert.assertTrue("expected view ID to be greater than zero", mgr.getView().getViewId() > 0);
int oldViewId = mbr.getVmViewId();
mbr.setVmViewId((int) mgr.getView().getViewId() - 1);
org.apache.geode.test.dunit.LogWriterUtils.getLogWriter().info("current membership view is " + mgr.getView());
org.apache.geode.test.dunit.LogWriterUtils.getLogWriter().info("created ID " + mbr + " with view ID " + mbr.getVmViewId());
IgnoredException.addIgnoredException("attempt to add old member");
IgnoredException.addIgnoredException("Removing shunned GemFire node");
boolean accepted = mgr.addSurpriseMember(mbr);
Assert.assertTrue("member with old ID was not rejected (bug #44566)", !accepted);
mbr.setVmViewId(oldViewId);
// now forcibly add it as a surprise member and show that it is reaped
long gracePeriod = 5000;
long startTime = System.currentTimeMillis();
long timeout = ((GMSMembershipManager) mgr).getSurpriseMemberTimeout();
long birthTime = startTime - timeout + gracePeriod;
MembershipManagerHelper.addSurpriseMember(sys, mbr, birthTime);
assertTrue("Member was not a surprise member", mgr.isSurpriseMember(mbr));
// if (birthTime < (System.currentTimeMillis() - timeout)) {
// return; // machine is too busy and we didn't get enough CPU to perform more assertions
// }
Awaitility.await("waiting for member to be removed").atMost((timeout / 3) + gracePeriod, TimeUnit.MILLISECONDS).until(() -> !mgr.isSurpriseMember(mbr));
} finally {
if (sys != null && sys.isConnected()) {
sys.disconnect();
}
}
}
use of org.apache.geode.distributed.internal.membership.MembershipManager in project geode by apache.
the class DistributionManagerDUnitTest method testConnectAfterBeingShunned.
/**
* Demonstrate that a new UDP port is used when an attempt is made to reconnect using a shunned
* port
*/
@Test
public void testConnectAfterBeingShunned() {
InternalDistributedSystem sys = getSystem();
MembershipManager mgr = MembershipManagerHelper.getMembershipManager(sys);
InternalDistributedMember idm = mgr.getLocalMember();
// TODO GMS needs to have a system property allowing the bind-port to be set
System.setProperty(DistributionConfig.GEMFIRE_PREFIX + "jg-bind-port", "" + idm.getPort());
sys.disconnect();
sys = getSystem();
mgr = MembershipManagerHelper.getMembershipManager(sys);
sys.disconnect();
InternalDistributedMember idm2 = mgr.getLocalMember();
org.apache.geode.test.dunit.LogWriterUtils.getLogWriter().info("original ID=" + idm + " and after connecting=" + idm2);
assertTrue("should not have used a different udp port", idm.getPort() == idm2.getPort());
}
use of org.apache.geode.distributed.internal.membership.MembershipManager in project geode by apache.
the class InternalDistributedSystem method reconnect.
/**
* A reconnect is tried when gemfire is configured to reconnect in case of a required role loss.
* The reconnect will try reconnecting to the distributed system every max-time-out millseconds
* for max-number-of-tries configured in gemfire.properties file. It uses the cache.xml file to
* intialize the cache and create regions.
*/
private void reconnect(boolean forcedDisconnect, String reason) {
// Collect all the state for cache
// Collect all the state for Regions
// Close the cache,
// loop trying to connect, waiting before each attempt
//
// If reconnecting for lost-roles the reconnected system's cache will decide
// whether the reconnected system should stay up. After max-tries we will
// give up.
//
// If reconnecting for forced-disconnect we ignore max-tries and keep attempting
// to join the distributed system until successful
this.attemptingToReconnect = true;
InternalDistributedSystem ids = InternalDistributedSystem.getAnyInstance();
if (ids == null) {
ids = this;
}
// first save the current cache description. This is created by
// the membership manager when forced-disconnect starts. If we're
// reconnecting for lost roles then this will be null
String cacheXML = null;
List<CacheServerCreation> cacheServerCreation = null;
InternalCache cache = GemFireCacheImpl.getInstance();
if (cache != null) {
cacheXML = cache.getCacheConfig().getCacheXMLDescription();
cacheServerCreation = cache.getCacheConfig().getCacheServerCreation();
}
DistributionConfig oldConfig = ids.getConfig();
Properties configProps = getProperties();
int timeOut = oldConfig.getMaxWaitTimeForReconnect();
int maxTries = oldConfig.getMaxNumReconnectTries();
final boolean isDebugEnabled = logger.isDebugEnabled();
if (Thread.currentThread().getName().equals("DisconnectThread")) {
if (isDebugEnabled) {
logger.debug("changing thread name to ReconnectThread");
}
Thread.currentThread().setName("ReconnectThread");
}
// get the membership manager for quorum checks
MembershipManager mbrMgr = this.dm.getMembershipManager();
this.quorumChecker = mbrMgr.getQuorumChecker();
if (logger.isDebugEnabled()) {
if (quorumChecker == null) {
logger.debug("No quorum checks will be performed during reconnect attempts");
} else {
logger.debug("Initialized quorum checking service: {}", quorumChecker);
}
}
// LOG:CLEANUP: deal with reconnect and INHIBIT_DM_BANNER -- this should be ok now
String appendToLogFile = System.getProperty(APPEND_TO_LOG_FILE);
if (appendToLogFile == null) {
System.setProperty(APPEND_TO_LOG_FILE, "true");
}
String inhibitBanner = System.getProperty(InternalLocator.INHIBIT_DM_BANNER);
if (inhibitBanner == null) {
System.setProperty(InternalLocator.INHIBIT_DM_BANNER, "true");
}
if (forcedDisconnect) {
systemAttemptingReconnect = this;
}
try {
while (this.reconnectDS == null || !this.reconnectDS.isConnected()) {
if (isReconnectCancelled()) {
break;
}
if (!forcedDisconnect) {
if (isDebugEnabled) {
logger.debug("Max number of tries : {} and max time out : {}", maxTries, timeOut);
}
if (reconnectAttemptCounter >= maxTries) {
if (isDebugEnabled) {
logger.debug("Stopping the checkrequiredrole thread because reconnect : {} reached the max number of reconnect tries : {}", reconnectAttemptCounter, maxTries);
}
throw new CacheClosedException(LocalizedStrings.InternalDistributedSystem_SOME_REQUIRED_ROLES_MISSING.toLocalizedString());
}
}
if (reconnectAttemptCounter == 0) {
reconnectAttemptTime = System.currentTimeMillis();
}
reconnectAttemptCounter++;
if (isReconnectCancelled()) {
return;
}
logger.info("Disconnecting old DistributedSystem to prepare for a reconnect attempt");
try {
disconnect(true, reason, false);
} catch (Exception ee) {
logger.warn("Exception disconnecting for reconnect", ee);
}
try {
reconnectLock.wait(timeOut);
} catch (InterruptedException e) {
logger.warn(LocalizedMessage.create(LocalizedStrings.InternalDistributedSystem_WAITING_THREAD_FOR_RECONNECT_GOT_INTERRUPTED));
Thread.currentThread().interrupt();
return;
}
if (isReconnectCancelled()) {
return;
}
logger.info(LocalizedMessage.create(LocalizedStrings.DISTRIBUTED_SYSTEM_RECONNECTING, new Object[] { reconnectAttemptCounter }));
int savNumOfTries = reconnectAttemptCounter;
try {
// notify listeners of each attempt and then again after successful
notifyReconnectListeners(this, this.reconnectDS, true);
if (this.locatorDMTypeForced) {
System.setProperty(InternalLocator.FORCE_LOCATOR_DM_TYPE, "true");
}
configProps.put(DistributionConfig.DS_RECONNECTING_NAME, Boolean.TRUE);
if (quorumChecker != null) {
configProps.put(DistributionConfig.DS_QUORUM_CHECKER_NAME, quorumChecker);
}
InternalDistributedSystem newDS = null;
if (isReconnectCancelled()) {
return;
}
try {
newDS = (InternalDistributedSystem) connect(configProps);
} catch (CancelException e) {
if (isReconnectCancelled()) {
return;
} else {
throw e;
}
} finally {
if (newDS == null && quorumChecker != null) {
// make sure the quorum checker is listening for messages from former members
quorumChecker.resume();
}
}
if (this.reconnectCancelled) {
newDS.disconnect();
continue;
}
this.reconnectDS = newDS;
} catch (SystemConnectException e) {
logger.debug("Attempt to reconnect failed with SystemConnectException");
if (e.getMessage().contains("Rejecting the attempt of a member using an older version")) {
logger.warn(LocalizedMessage.create(LocalizedStrings.InternalDistributedSystem_EXCEPTION_OCCURRED_WHILE_TRYING_TO_CONNECT_THE_SYSTEM_DURING_RECONNECT), e);
attemptingToReconnect = false;
return;
}
continue;
} catch (GemFireConfigException e) {
if (isDebugEnabled) {
logger.debug("Attempt to reconnect failed with GemFireConfigException");
}
continue;
} catch (Exception ee) {
logger.warn(LocalizedMessage.create(LocalizedStrings.InternalDistributedSystem_EXCEPTION_OCCURRED_WHILE_TRYING_TO_CONNECT_THE_SYSTEM_DURING_RECONNECT), ee);
attemptingToReconnect = false;
return;
} finally {
if (this.locatorDMTypeForced) {
System.getProperties().remove(InternalLocator.FORCE_LOCATOR_DM_TYPE);
}
reconnectAttemptCounter = savNumOfTries;
}
DM newDM = this.reconnectDS.getDistributionManager();
if (newDM instanceof DistributionManager) {
// a cache
if (((DistributionManager) newDM).getDMType() != DistributionManager.ADMIN_ONLY_DM_TYPE) {
try {
CacheConfig config = new CacheConfig();
if (cacheXML != null) {
config.setCacheXMLDescription(cacheXML);
}
cache = GemFireCacheImpl.create(this.reconnectDS, config);
createAndStartCacheServers(cacheServerCreation, cache);
if (cache.getCachePerfStats().getReliableRegionsMissing() == 0) {
reconnectAttemptCounter = 0;
} else {
// this try failed. The new cache will call reconnect again
}
} catch (CacheXmlException e) {
logger.warn("Exception occurred while trying to create the cache during reconnect", e);
reconnectDS.disconnect();
reconnectDS = null;
reconnectCancelled = true;
break;
} catch (CancelException ignor) {
logger.warn("Exception occurred while trying to create the cache during reconnect", ignor);
reconnectDS.disconnect();
reconnectDS = null;
} catch (Exception e) {
logger.warn(LocalizedMessage.create(LocalizedStrings.InternalDistributedSystem_EXCEPTION_OCCURRED_WHILE_TRYING_TO_CREATE_THE_CACHE_DURING_RECONNECT), e);
}
}
}
if (reconnectDS != null && reconnectDS.isConnected()) {
// make sure the new DS and cache are stable before exiting this loop
try {
Thread.sleep(config.getMemberTimeout() * 3);
} catch (InterruptedException e) {
logger.info("Reconnect thread has been interrupted - exiting");
Thread.currentThread().interrupt();
return;
}
}
}
if (isReconnectCancelled()) {
reconnectDS.disconnect();
} else {
reconnectDS.isReconnectingDS = false;
notifyReconnectListeners(this, this.reconnectDS, false);
}
} finally {
systemAttemptingReconnect = null;
attemptingToReconnect = false;
if (appendToLogFile == null) {
System.getProperties().remove(APPEND_TO_LOG_FILE);
} else {
System.setProperty(APPEND_TO_LOG_FILE, appendToLogFile);
}
if (inhibitBanner == null) {
System.getProperties().remove(InternalLocator.INHIBIT_DM_BANNER);
} else {
System.setProperty(InternalLocator.INHIBIT_DM_BANNER, inhibitBanner);
}
if (quorumChecker != null) {
mbrMgr.releaseQuorumChecker(quorumChecker);
}
}
if (isReconnectCancelled()) {
logger.debug("reconnect can no longer be done because of an explicit disconnect");
if (reconnectDS != null) {
reconnectDS.disconnect();
}
attemptingToReconnect = false;
return;
} else {
logger.info("Reconnect completed.\nNew DistributedSystem is {}\nNew Cache is {}", reconnectDS, cache);
}
}
Aggregations