use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.
the class GridDiscoveryManager method initCommunicationErrorResolveConfiguration.
/**
 * Validates the communication failure resolve settings of the given configuration and,
 * when possible, installs a default resolver.
 * <p>
 * If a {@link CommunicationFailureResolver} is explicitly configured, both the configured
 * communication SPI and the configured discovery SPI must pass
 * {@code supportsCommunicationErrorResolve(...)}; otherwise the configuration is rejected.
 * If no resolver is configured and both SPIs support communication error resolve,
 * a {@link DefaultCommunicationFailureResolver} is set on the configuration.
 * If no resolver is configured and at least one SPI lacks support, the configuration is left untouched.
 *
 * @param cfg Configuration.
 * @throws IgniteCheckedException If configuration is not valid.
 */
public static void initCommunicationErrorResolveConfiguration(IgniteConfiguration cfg) throws IgniteCheckedException {
    CommunicationFailureResolver rslvr = cfg.getCommunicationFailureResolver();
    CommunicationSpi commSpi = cfg.getCommunicationSpi();
    DiscoverySpi discoverySpi = cfg.getDiscoverySpi();

    if (rslvr != null) {
        // An explicitly configured resolver is only usable when both SPIs cooperate.
        if (!supportsCommunicationErrorResolve(commSpi)) {
            throw new IgniteCheckedException("CommunicationFailureResolver is configured, but CommunicationSpi " +
                "does not support communication problem resolve: " + commSpi.getClass().getName());
        }

        if (!supportsCommunicationErrorResolve(discoverySpi)) {
            throw new IgniteCheckedException("CommunicationFailureResolver is configured, but DiscoverySpi " +
                "does not support communication problem resolve: " + discoverySpi.getClass().getName());
        }
    } else if (supportsCommunicationErrorResolve(commSpi) && supportsCommunicationErrorResolve(discoverySpi)) {
        // Nothing configured by the user, but the SPIs are capable: fall back to the default resolver.
        cfg.setCommunicationFailureResolver(new DefaultCommunicationFailureResolver());
    }
}
use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.
the class ZookeeperDiscoveryImpl method onCommunicationErrorResolveStatusReceived.
/**
* Completes a communication error resolve round: reads the per-node results stored under the
* distributed future path, decides whether any nodes must be stopped (delegating to the configured
* {@link CommunicationFailureResolver} when the nodes are not fully connected), saves the outcome
* to the future result path and publishes a {@code ZkCommunicationErrorResolveFinishMessage}
* as a custom discovery event.
* <p>
* Side effects visible in this method: {@code rtState.commErrProcNodes} is reset to {@code null},
* the communication error resolve future id in the events data is cleared, {@code evtsData.evtIdGen}
* is incremented once, and {@code evtsData.topVer} is incremented once per node the resolver killed.
*
* @param rtState Runtime state.
* @throws Exception If failed.
*/
private void onCommunicationErrorResolveStatusReceived(final ZkRuntimeState rtState) throws Exception {
ZkDiscoveryEventsData evtsData = rtState.evtsData;
UUID futId = evtsData.communicationErrorResolveFutureId();
if (log.isInfoEnabled())
log.info("Received communication status from all nodes [reqId=" + futId + ']');
assert futId != null;
String futPath = zkPaths.distributedFutureBasePath(futId);
// Nodes recorded for this resolve round; taken from the runtime state and cleared right away.
List<ClusterNode> initialNodes = rtState.commErrProcNodes;
assert initialNodes != null;
rtState.commErrProcNodes = null;
List<ClusterNode> topSnapshot = rtState.top.topologySnapshot();
// Node id -> communication state bit set reported by that node.
Map<UUID, BitSet> nodesRes = U.newHashMap(topSnapshot.size());
Exception err = null;
// Collect the result of every node in the current topology snapshot. Node-side errors are
// aggregated as suppressed exceptions of a single umbrella exception.
for (ClusterNode node : topSnapshot) {
byte[] stateBytes = ZkDistributedCollectDataFuture.readNodeResult(futPath, rtState.zkClient, node.order());
ZkCommunicationErrorNodeState nodeState = unmarshalZip(stateBytes);
if (nodeState.err != null) {
if (err == null)
err = new Exception("Failed to resolve communication error.");
err.addSuppressed(nodeState.err);
} else {
assert nodeState.commState != null;
nodesRes.put(node.id(), nodeState.commState);
}
}
// Captured BEFORE the per-killed-node increments below: the finish message and the custom event
// are created with this value, not with the incremented evtsData.topVer.
long topVer = evtsData.topVer;
GridLongList killedNodesList = null;
if (err == null) {
// The resolver is consulted only if at least one node's state fails the connectivity check.
boolean fullyConnected = true;
for (Map.Entry<UUID, BitSet> e : nodesRes.entrySet()) {
if (!checkFullyConnected(e.getValue(), initialNodes, rtState.top)) {
fullyConnected = false;
break;
}
}
if (fullyConnected) {
if (log.isInfoEnabled()) {
log.info("Finish communication error resolve process automatically, there are no " + "communication errors [reqId=" + futId + ']');
}
} else {
CommunicationFailureResolver rslvr = spi.ignite().configuration().getCommunicationFailureResolver();
// When no resolver is configured nothing is done here: the result carries neither killed nodes nor an error.
if (rslvr != null) {
if (log.isInfoEnabled()) {
log.info("Call communication error resolver [reqId=" + futId + ", rslvr=" + rslvr.getClass().getSimpleName() + ']');
}
ZkCommunicationFailureContext ctx = new ZkCommunicationFailureContext(((IgniteKernal) spi.ignite()).context().cache().context(), topSnapshot, initialNodes, nodesRes);
try {
rslvr.resolve(ctx);
Set<ClusterNode> killedNodes = ctx.killedNodes();
if (killedNodes != null) {
if (log.isInfoEnabled()) {
log.info("Communication error resolver forced nodes stop [reqId=" + futId + ", killNodeCnt=" + killedNodes.size() + ", nodeIds=" + U.nodeIds(killedNodes) + ']');
}
killedNodesList = new GridLongList(killedNodes.size());
// Record the internal id of each killed node and advance the topology version once per node.
for (ClusterNode killedNode : killedNodes) {
killedNodesList.add(((ZookeeperClusterNode) killedNode).internalId());
evtsData.topVer++;
}
}
} catch (Exception e) {
// Any failure inside the try block is logged and reported in the result instead of being rethrown.
err = e;
U.error(log, "Failed to resolve communication error with configured resolver [reqId=" + futId + ']', e);
}
}
}
}
// The resolve round is over: clear the future id and publish the outcome.
evtsData.communicationErrorResolveFutureId(null);
ZkCommunicationErrorResolveResult res = new ZkCommunicationErrorResolveResult(killedNodesList, err);
ZkCommunicationErrorResolveFinishMessage msg = new ZkCommunicationErrorResolveFinishMessage(futId, topVer);
msg.res = res;
// The marshalled result is saved under the future result path before the finish event is added.
ZkDistributedCollectDataFuture.saveResult(zkPaths.distributedFutureResultPath(futId), rtState.zkClient, marshalZip(res));
evtsData.evtIdGen++;
ZkDiscoveryCustomEventData evtData = new ZkDiscoveryCustomEventData(evtsData.evtIdGen, 0L, topVer, locNode.id(), msg, null);
evtData.resolvedMsg = msg;
evtsData.addEvent(rtState.top.nodesByOrder.values(), evtData);
saveAndProcessNewEvents();
// Need re-check alive nodes in case join was delayed.
rtState.zkClient.getChildrenAsync(zkPaths.aliveNodesDir, rtState.watcher, rtState.watcher);
}
use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.
the class ZookeeperDiscoveryCommunicationFailureTest method testCommunicationFailureResolve_CachesInfo1.
/**
 * Checks the caches information exposed to a {@link CommunicationFailureResolver} while caches are
 * created and nodes are started and stopped, including a stop of the first started node.
 *
 * @throws Exception If failed.
 */
@Test
public void testCommunicationFailureResolve_CachesInfo1() throws Exception {
    sesTimeout = 5000;
    testCommSpi = true;

    final CacheInfoCommunicationFailureResolver cacheInfoRslvr = new CacheInfoCommunicationFailureResolver();

    // Every started node is handed the same resolver instance.
    commFailureRslvr = new IgniteOutClosure<CommunicationFailureResolver>() {
        @Override
        public CommunicationFailureResolver apply() {
            return cacheInfoRslvr;
        }
    };

    startGrids(2);
    awaitPartitionMapExchange();

    // Expected per-cache triples, verified by checkResolverCachesInfo.
    // NOTE(review): presumably (partitions, backups, owners per partition) - confirm against the helper.
    final int dfltParts = RendezvousAffinityFunction.DFLT_PARTITION_COUNT;

    Map<String, T3<Integer, Integer, Integer>> expected = new HashMap<>();
    expected.put(DEFAULT_CACHE_NAME, new T3<>(dfltParts, 0, 1));

    checkResolverCachesInfo(ignite(0), expected);

    // Three additional caches with distinct partition counts, created in a single call.
    CacheConfiguration partitionedOneBackup = new CacheConfiguration("c1");
    partitionedOneBackup.setBackups(1);
    partitionedOneBackup.setAffinity(new RendezvousAffinityFunction(false, 64));

    CacheConfiguration partitionedTwoBackups = new CacheConfiguration("c2");
    partitionedTwoBackups.setBackups(2);
    partitionedTwoBackups.setAffinity(new RendezvousAffinityFunction(false, 128));

    CacheConfiguration replicated = new CacheConfiguration("c3");
    replicated.setCacheMode(CacheMode.REPLICATED);
    replicated.setAffinity(new RendezvousAffinityFunction(false, 256));

    List<CacheConfiguration> newCaches = new ArrayList<>();
    newCaches.add(partitionedOneBackup);
    newCaches.add(partitionedTwoBackups);
    newCaches.add(replicated);

    ignite(0).createCaches(newCaches);

    expected.put("c1", new T3<>(64, 1, 2));
    expected.put("c2", new T3<>(128, 2, 2));
    expected.put("c3", new T3<>(256, 1, 2));

    checkResolverCachesInfo(ignite(0), expected);

    // Grow the cluster to four nodes.
    startGrid(2);
    startGrid(3);
    awaitPartitionMapExchange();

    expected.put("c2", new T3<>(128, 2, 3));
    expected.put("c3", new T3<>(256, 1, 4));

    checkResolverCachesInfo(ignite(0), expected);

    // Cache whose node filter is built from the instance names of nodes 0 and 1; created from node 2.
    CacheConfiguration<Object, Object> filtered = new CacheConfiguration<>("c4");
    filtered.setCacheMode(CacheMode.PARTITIONED);
    filtered.setBackups(0);
    filtered.setAffinity(new RendezvousAffinityFunction(false, 256));
    filtered.setNodeFilter(
        new TestCacheNodeExcludingFilter(getTestIgniteInstanceName(0), getTestIgniteInstanceName(1)));

    ignite(2).createCache(filtered);

    expected.put("c4", new T3<>(256, 0, 1));

    checkResolverCachesInfo(ignite(0), expected);

    // Stop current coordinator, check new coordinator will initialize required caches information.
    stopGrid(0);
    awaitPartitionMapExchange();

    expected.put("c3", new T3<>(256, 1, 3));

    checkResolverCachesInfo(ignite(1), expected);

    // Bring node 0 back and verify through node 1.
    startGrid(0);

    expected.put("c3", new T3<>(256, 1, 4));

    checkResolverCachesInfo(ignite(1), expected);

    // Stop node 1 as well and verify through node 3.
    stopGrid(1);

    expected.put("c3", new T3<>(256, 1, 3));

    checkResolverCachesInfo(ignite(3), expected);
}
use of org.apache.ignite.configuration.CommunicationFailureResolver in project ignite by apache.
the class ZookeeperDiscoveryCommunicationFailureTest method testCommunicationFailureResolve_CachesInfo2.
/**
 * Checks the partition owners reported to a {@link CommunicationFailureResolver} while supply
 * messages for cache "c1" are blocked on the first node, and then the caches information
 * once blocking is stopped.
 *
 * @throws Exception If failed.
 */
@Test
public void testCommunicationFailureResolve_CachesInfo2() throws Exception {
    sesTimeout = 5000;
    testCommSpi = true;

    final CacheInfoCommunicationFailureResolver cacheInfoRslvr = new CacheInfoCommunicationFailureResolver();

    commFailureRslvr = new IgniteOutClosure<CommunicationFailureResolver>() {
        @Override
        public CommunicationFailureResolver apply() {
            return cacheInfoRslvr;
        }
    };

    Ignite srv0 = startGrid(0);

    CacheConfiguration<Object, Object> c1Cfg = new CacheConfiguration<>("c1");
    c1Cfg.setBackups(1);

    srv0.createCache(c1Cfg);

    // Block rebalance to make sure node0 will be the only owner.
    TestRecordingCommunicationSpi.spi(srv0).blockMessages(new IgniteBiPredicate<ClusterNode, Message>() {
        @Override
        public boolean apply(ClusterNode node, Message msg) {
            if (!(msg instanceof GridDhtPartitionSupplyMessage))
                return false;

            return ((GridDhtPartitionSupplyMessage) msg).groupId() == CU.cacheId("c1");
        }
    });

    startGrid(1);

    U.sleep(1000);

    ZookeeperDiscoverySpi discoSpi = spi(srv0);

    // The assertion below requires this latch to reach zero within the timeout.
    cacheInfoRslvr.latch = new CountDownLatch(1);

    ZkTestCommunicationSpi.testSpi(srv0).initCheckResult(2, 0);

    ClusterNode rmtNode = discoSpi.getRemoteNodes().iterator().next();

    discoSpi.resolveCommunicationFailure(rmtNode, new Exception("test"));

    assertTrue(cacheInfoRslvr.latch.await(10, SECONDS));

    ClusterNode locNode0 = srv0.cluster().localNode();
    List<List<ClusterNode>> c1Owners = cacheInfoRslvr.ownersMap.get("c1");

    // Every partition must be owned by node0 only.
    int part = 0;

    while (part < RendezvousAffinityFunction.DFLT_PARTITION_COUNT) {
        List<ClusterNode> partOwners = c1Owners.get(part);

        assertEquals(1, partOwners.size());
        assertEquals(locNode0, partOwners.get(0));

        part++;
    }

    TestRecordingCommunicationSpi.spi(srv0).stopBlock();

    awaitPartitionMapExchange();

    Map<String, T3<Integer, Integer, Integer>> expected = new HashMap<>();
    expected.put(DEFAULT_CACHE_NAME, new T3<>(RendezvousAffinityFunction.DFLT_PARTITION_COUNT, 0, 1));
    expected.put("c1", new T3<>(RendezvousAffinityFunction.DFLT_PARTITION_COUNT, 1, 2));

    checkResolverCachesInfo(srv0, expected);
}
Aggregations