Use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.
The class GridJobWorker, method finishJob.
/**
* @param res Result.
* @param ex Exception.
* @param sndReply If {@code true}, reply will be sent.
* @param retry If {@code true}, retry response will be sent.
*/
void finishJob(@Nullable Object res, @Nullable IgniteException ex, boolean sndReply, boolean retry) {
// Avoid finishing a job more than once from different threads.
if (!finishing.compareAndSet(false, true))
return;
// Do not send reply if job has been cancelled from system.
if (sndReply)
sndReply = !sysCancelled;
// We should save message ID here since listener callback will reset sequence.
ClusterNode sndNode = ctx.discovery().node(taskNode.id());
finishTime = U.currentTimeMillis();
Collection<IgniteBiTuple<Integer, String>> evts = null;
try {
if (ses.isFullSupport())
evtLsnr.onBeforeJobResponseSent(this);
// Send response back only if job has not timed out.
if (!isTimedOut()) {
if (sndReply) {
if (sndNode == null) {
onMasterNodeLeft();
U.warn(log, "Failed to reply to sender node because it left grid [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ']');
status = FAILED;
// Record job reply failure.
if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
evts = addEvent(evts, EVT_JOB_FAILED, "Job reply failed (task node left grid): " + job);
} else {
try {
byte[] resBytes = null;
byte[] exBytes = null;
byte[] attrBytes = null;
boolean loc = ctx.localNodeId().equals(sndNode.id()) && !ctx.config().isMarshalLocalJobs();
Map<Object, Object> attrs = jobCtx.getAttributes();
// Try to serialize response, and if exception - return to client.
if (!loc) {
try {
resBytes = U.marshal(marsh, res);
} catch (IgniteCheckedException e) {
resBytes = U.marshal(marsh, null);
if (ex != null)
ex.addSuppressed(e);
else
ex = U.convertException(e);
logError("Failed to serialize job response [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ", resCls=" + (res == null ? null : res.getClass()) + ']', e);
}
try {
attrBytes = U.marshal(marsh, attrs);
} catch (IgniteCheckedException e) {
attrBytes = U.marshal(marsh, Collections.emptyMap());
if (ex != null)
ex.addSuppressed(e);
else
ex = U.convertException(e);
logError("Failed to serialize job attributes [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ", attrs=" + attrs + ']', e);
}
try {
exBytes = U.marshal(marsh, ex);
} catch (IgniteCheckedException e) {
String msg = "Failed to serialize job exception [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ", msg=\"" + e.getMessage() + "\"]";
ex = new IgniteException(msg);
logError(msg, e);
exBytes = U.marshal(marsh, ex);
}
}
if (ex != null) {
status = FAILED;
if (isStarted) {
// Job failed.
if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
evts = addEvent(evts, EVT_JOB_FAILED, "Job failed due to exception [ex=" + ex + ", job=" + job + ']');
} else if (!internal && ctx.event().isRecordable(EVT_JOB_REJECTED))
evts = addEvent(evts, EVT_JOB_REJECTED, "Job has not been started " + "[ex=" + ex + ", job=" + job + ']');
} else {
status = FINISHED;
if (!internal && ctx.event().isRecordable(EVT_JOB_FINISHED))
evts = addEvent(evts, EVT_JOB_FINISHED, /*no message for success.*/ null);
}
GridJobExecuteResponse jobRes = new GridJobExecuteResponse(ctx.localNodeId(), ses.getId(), ses.getJobId(), exBytes, loc ? ex : null, resBytes, loc ? res : null, attrBytes, loc ? attrs : null, isCancelled(), retry ? ctx.cache().context().exchange().readyAffinityVersion() : null);
long timeout = ses.getEndTime() - U.currentTimeMillis();
if (timeout <= 0)
// Ignore the actual timeout and send response anyway.
timeout = 1;
if (ses.isFullSupport()) {
// Send response to designated job topic.
// Always go through communication to preserve order,
// if attributes are enabled.
ctx.io().sendOrderedMessage(sndNode, taskTopic, jobRes, internal ? MANAGEMENT_POOL : SYSTEM_POOL, timeout, false);
} else if (ctx.localNodeId().equals(sndNode.id()))
ctx.task().processJobExecuteResponse(ctx.localNodeId(), jobRes);
else
// Send response to common topic as unordered message.
ctx.io().sendToGridTopic(sndNode, TOPIC_TASK, jobRes, internal ? MANAGEMENT_POOL : SYSTEM_POOL);
} catch (IgniteCheckedException e) {
// Log and invoke the master-leave callback.
if ((e instanceof ClusterTopologyCheckedException) || isDeadNode(taskNode.id())) {
onMasterNodeLeft();
// Avoid stack trace for left nodes.
U.warn(log, "Failed to reply to sender node because it left grid " + "[nodeId=" + taskNode.id() + ", jobId=" + ses.getJobId() + ", ses=" + ses + ", job=" + job + ']');
} else
logError("Error sending reply for job [nodeId=" + sndNode.id() + ", jobId=" + ses.getJobId() + ", ses=" + ses + ", job=" + job + ']', e);
if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
evts = addEvent(evts, EVT_JOB_FAILED, "Failed to send reply for job [nodeId=" + taskNode.id() + ", job=" + job + ']');
}
// Catch generic exception as well, since it gets thrown for some reason.
catch (Exception e) {
String msg = "Failed to send reply for job [nodeId=" + taskNode.id() + ", job=" + job + ']';
logError(msg, e);
if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
evts = addEvent(evts, EVT_JOB_FAILED, msg);
}
}
} else {
if (ex != null) {
status = FAILED;
if (isStarted) {
if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
evts = addEvent(evts, EVT_JOB_FAILED, "Job failed due to exception [ex=" + ex + ", job=" + job + ']');
} else if (!internal && ctx.event().isRecordable(EVT_JOB_REJECTED))
evts = addEvent(evts, EVT_JOB_REJECTED, "Job has not been started [ex=" + ex + ", job=" + job + ']');
} else {
status = FINISHED;
if (!internal && ctx.event().isRecordable(EVT_JOB_FINISHED))
evts = addEvent(evts, EVT_JOB_FINISHED, /*no message for success.*/ null);
}
}
} else {
// Job timed out.
status = FAILED;
if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
evts = addEvent(evts, EVT_JOB_FAILED, "Job failed due to timeout: " + job);
}
} finally {
if (evts != null) {
for (IgniteBiTuple<Integer, String> t : evts) recordEvent(t.get1(), t.get2());
}
// Listener callback.
evtLsnr.onJobFinished(this);
}
}
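The part of this method that involves ClusterTopologyCheckedException is the send-failure handler: when the response cannot be delivered and the exception is a ClusterTopologyCheckedException (or the task node is otherwise dead), the failure is downgraded to a warning without a stack trace and onMasterNodeLeft() is invoked. The helper below is a minimal sketch of that classification, not Ignite code; the class name and the taskNodeAlive flag (standing in for the isDeadNode(...) check) are assumptions.

import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteLogger;
import org.apache.ignite.internal.cluster.ClusterTopologyCheckedException;
import org.apache.ignite.internal.util.typedef.internal.U;

/** Minimal sketch (not Ignite code) of the reply-failure classification used in finishJob(). */
public class ReplyFailureSketch {
    /**
     * @param e Failure raised while sending the job response.
     * @param taskNodeAlive Whether discovery still sees the task node (stands in for the isDeadNode(...) check).
     * @param log Logger.
     * @return {@code true} if the failure should be treated as "task node left grid".
     */
    public static boolean taskNodeLeft(IgniteCheckedException e, boolean taskNodeAlive, IgniteLogger log) {
        if (e instanceof ClusterTopologyCheckedException || !taskNodeAlive) {
            // Avoid a stack trace for nodes that left, mirroring the snippet above.
            U.warn(log, "Failed to reply to sender node because it left grid: " + e.getMessage());

            return true;
        }

        U.error(log, "Error sending reply for job.", e);

        return false;
    }
}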
Use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.
The class GridTaskWorker, method sendRequest.
/**
* @param res Job result.
*/
private void sendRequest(ComputeJobResult res) {
assert res != null;
GridJobExecuteRequest req = null;
ClusterNode node = res.getNode();
try {
ClusterNode curNode = ctx.discovery().node(node.id());
// Check the node in topology up front; otherwise an exception would be thrown in case of send failure.
if (curNode == null) {
if (log.isDebugEnabled()) {
U.warn(log, "Failed to send job request because remote node left grid (if fail-over is enabled, " + "will attempt fail-over to another node) [node=" + node + ", taskName=" + ses.getTaskName() + ", taskSesId=" + ses.getId() + ", jobSesId=" + res.getJobContext().getJobId() + ']');
}
ctx.resource().invokeAnnotated(dep, res.getJob(), ComputeJobAfterSend.class);
GridJobExecuteResponse fakeRes = new GridJobExecuteResponse(node.id(), ses.getId(), res.getJobContext().getJobId(), null, null, null, null, null, null, false, null);
fakeRes.setFakeException(new ClusterTopologyException("Failed to send job due to node failure: " + node));
onResponse(fakeRes);
} else {
long timeout = ses.getEndTime() == Long.MAX_VALUE ? Long.MAX_VALUE : ses.getEndTime() - U.currentTimeMillis();
if (timeout > 0) {
boolean loc = node.id().equals(ctx.discovery().localNode().id()) && !ctx.config().isMarshalLocalJobs();
Map<Object, Object> sesAttrs = ses.isFullSupport() ? ses.getAttributes() : null;
Map<? extends Serializable, ? extends Serializable> jobAttrs = (Map<? extends Serializable, ? extends Serializable>) res.getJobContext().getAttributes();
boolean forceLocDep = internal || !ctx.deploy().enabled();
try {
MarshallerUtils.jobReceiverVersion(node.version());
req = new GridJobExecuteRequest(ses.getId(), res.getJobContext().getJobId(), ses.getTaskName(), ses.getUserVersion(), ses.getTaskClassName(), loc ? null : U.marshal(marsh, res.getJob()), loc ? res.getJob() : null, ses.getStartTime(), timeout, ses.getTopology(), loc ? ses.getTopologyPredicate() : null, loc ? null : U.marshal(marsh, ses.getTopologyPredicate()), loc ? null : U.marshal(marsh, ses.getJobSiblings()), loc ? ses.getJobSiblings() : null, loc ? null : U.marshal(marsh, sesAttrs), loc ? sesAttrs : null, loc ? null : U.marshal(marsh, jobAttrs), loc ? jobAttrs : null, ses.getCheckpointSpi(), dep.classLoaderId(), dep.deployMode(), continuous, dep.participants(), forceLocDep, ses.isFullSupport(), internal, affCacheIds, affPartId, mapTopVer, ses.executorName());
} finally {
MarshallerUtils.jobReceiverVersion(null);
}
if (loc)
ctx.job().processJobExecuteRequest(ctx.discovery().localNode(), req);
else {
byte plc;
if (internal)
plc = MANAGEMENT_POOL;
else {
Byte ctxPlc = getThreadContext(TC_IO_POLICY);
if (ctxPlc != null)
plc = ctxPlc;
else
plc = PUBLIC_POOL;
}
// Send job execution request.
ctx.io().sendToGridTopic(node, TOPIC_JOB, req, plc);
if (log.isDebugEnabled())
log.debug("Sent job request [req=" + req + ", node=" + node + ']');
}
if (!loc)
ctx.resource().invokeAnnotated(dep, res.getJob(), ComputeJobAfterSend.class);
} else
U.warn(log, "Job timed out prior to sending job execution request: " + res.getJob());
}
} catch (IgniteCheckedException e) {
IgniteException fakeErr = null;
try {
boolean deadNode = e instanceof ClusterTopologyCheckedException || isDeadNode(res.getNode().id());
// Avoid stack trace if node has left grid.
if (deadNode) {
if (log.isDebugEnabled()) {
U.warn(log, "Failed to send job request because remote node left grid (if failover is enabled, " + "will attempt fail-over to another node) [node=" + node + ", taskName=" + ses.getTaskName() + ", taskSesId=" + ses.getId() + ", jobSesId=" + res.getJobContext().getJobId() + ']');
}
fakeErr = new ClusterTopologyException("Failed to send job due to node failure: " + node, e);
} else if (log.isDebugEnabled())
U.error(log, "Failed to send job request: " + req, e);
} catch (IgniteClientDisconnectedCheckedException e0) {
if (log.isDebugEnabled())
log.debug("Failed to send job request, client disconnected [node=" + node + ", taskName=" + ses.getTaskName() + ", taskSesId=" + ses.getId() + ", jobSesId=" + res.getJobContext().getJobId() + ']');
fakeErr = U.convertException(e0);
}
GridJobExecuteResponse fakeRes = new GridJobExecuteResponse(node.id(), ses.getId(), res.getJobContext().getJobId(), null, null, null, null, null, null, false, null);
if (fakeErr == null)
fakeErr = U.convertException(e);
fakeRes.setFakeException(fakeErr);
onResponse(fakeRes);
}
}
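On a send failure, sendRequest does not fail the task directly; it builds a synthetic GridJobExecuteResponse and attaches a "fake" exception so the normal response path can trigger failover. The sketch below shows only the exception mapping, assuming the caller has already decided whether the target node is dead (as isDeadNode(...) does above); the class and method names are hypothetical.

import org.apache.ignite.IgniteCheckedException;
import org.apache.ignite.IgniteException;
import org.apache.ignite.cluster.ClusterTopologyException;
import org.apache.ignite.internal.cluster.ClusterTopologyCheckedException;
import org.apache.ignite.internal.util.typedef.internal.U;

/** Hypothetical helper showing how a send failure is mapped to the "fake" error of the synthetic response. */
public class SendFailureMapper {
    /**
     * @param e Failure from the send attempt.
     * @param nodeAlive Whether the target node is still in topology (the caller is assumed to check this).
     * @param node Human-readable node description for the message.
     * @return Exception to pass to {@code GridJobExecuteResponse.setFakeException(...)}.
     */
    public static IgniteException toFakeError(IgniteCheckedException e, boolean nodeAlive, String node) {
        if (e instanceof ClusterTopologyCheckedException || !nodeAlive)
            // Node left: report a public topology exception so failover can pick another node.
            return new ClusterTopologyException("Failed to send job due to node failure: " + node, e);

        // Any other checked failure is converted to its unchecked counterpart.
        return U.convertException(e);
    }
}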
Use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.
The class CommunicationWorker, method processDisconnect.
/**
* @param sesInfo Disconnected session information.
*/
private void processDisconnect(DisconnectedSessionInfo sesInfo) {
GridNioRecoveryDescriptor recoveryDesc = sesInfo.recoveryDescription();
ClusterNode node = recoveryDesc.node();
if (!recoveryDesc.nodeAlive(nodeGetter.apply(node.id())))
return;
try {
if (log.isDebugEnabled())
log.debug("Recovery reconnect [rmtNode=" + recoveryDesc.node().id() + ']');
GridCommunicationClient client = clientPool.reserveClient(node, sesInfo.connectionIndex());
client.release();
} catch (ClusterTopologyCheckedException e) {
if (log.isDebugEnabled())
log.debug("Recovery reconnect failed, node stopping [rmtNode=" + recoveryDesc.node().id() + ']');
} catch (IgniteTooManyOpenFilesException e) {
eRegistrySupplier.get().onException(e.getMessage(), e);
throw e;
} catch (IgniteCheckedException | IgniteException e) {
try {
if (recoveryDesc.nodeAlive(nodeGetter.apply(node.id())) && pingNode.apply(node.id())) {
if (log.isDebugEnabled()) {
log.debug("Recovery reconnect failed, will retry " + "[rmtNode=" + recoveryDesc.node().id() + ", err=" + e + ']');
}
addProcessDisconnectRequest(sesInfo);
} else {
if (log.isDebugEnabled()) {
log.debug("Recovery reconnect failed, " + "node left [rmtNode=" + recoveryDesc.node().id() + ", err=" + e + ']');
}
eRegistrySupplier.get().onException("Recovery reconnect failed, node left [rmtNode=" + recoveryDesc.node().id() + "]", e);
}
} catch (IgniteClientDisconnectedException ignored) {
if (log.isDebugEnabled())
log.debug("Failed to ping node, client disconnected.");
}
}
}
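Here the exception type decides the fate of a recovery reconnect attempt: ClusterTopologyCheckedException means the remote node is stopping and the attempt is dropped, other failures are retried while the node is alive and answers pings, and otherwise the failure is reported to the exception registry. A compact sketch of that decision follows; the enum and the nodeAliveAndPings flag (standing in for the nodeAlive/ping checks) are hypothetical.

import org.apache.ignite.internal.cluster.ClusterTopologyCheckedException;

/** Hypothetical enum mirroring the outcome of a failed recovery reconnect in processDisconnect(). */
public enum ReconnectOutcome {
    /** Remote node is stopping or gone; drop the recovery attempt silently. */
    ABANDON,

    /** Remote node is alive and answers pings; schedule another reconnect attempt. */
    RETRY,

    /** Remote node no longer pings; report the failure to the exception registry. */
    REPORT;

    /**
     * @param e Reconnect failure.
     * @param nodeAliveAndPings Whether the node is still alive and answers pings (assumed to be checked by the caller).
     */
    public static ReconnectOutcome of(Exception e, boolean nodeAliveAndPings) {
        if (e instanceof ClusterTopologyCheckedException)
            return ABANDON;

        return nodeAliveAndPings ? RETRY : REPORT;
    }
}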
Use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.
The class AuthenticationProcessorNodeRestartTest, method testConcurrentAuthorize.
/**
* @throws Exception If failed.
*/
@Test
public void testConcurrentAuthorize() throws Exception {
final int testUsersCnt = 10;
withSecurityContextOnAllNodes(secCtxDflt);
for (int i = 0; i < testUsersCnt; ++i) grid(CLI_NODE).context().security().createUser("test" + i, ("passwd_test" + i).toCharArray());
final IgniteInternalFuture restartFut = GridTestUtils.runAsync(() -> {
try {
for (int i = 0; i < RESTARTS; ++i) {
int nodeIdx = RND.nextInt(NODES_COUNT - 1);
stopGrid(nodeIdx);
U.sleep(500);
startGrid(nodeIdx);
U.sleep(500);
}
} catch (Exception e) {
log.error("Unexpected exception.", e);
fail("Unexpected exception on server restart: " + e.getMessage());
}
});
final AtomicInteger usrCnt = new AtomicInteger();
GridTestUtils.runMultiThreaded(() -> {
String user = "test" + usrCnt.getAndIncrement();
try {
while (!restartFut.isDone()) {
SecurityContext secCtx = authenticate(grid(CLI_NODE), user, "passwd_" + user);
assertNotNull(secCtx);
}
} catch (ClusterTopologyCheckedException ignored) {
// No-op.
} catch (Exception e) {
log.error("Unexpected exception.", e);
fail("Unexpected exception: " + e.getMessage());
}
}, testUsersCnt, "user-op");
restartFut.get();
}
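The test treats ClusterTopologyCheckedException as expected noise while server nodes restart: the worker thread that hits it stops quietly instead of failing the test, and only other exceptions are reported as failures. Below is a minimal sketch of that tolerance pattern as a reusable test helper; the class and method names are hypothetical and not part of Ignite.

import java.util.concurrent.Callable;
import java.util.function.BooleanSupplier;

import org.apache.ignite.internal.cluster.ClusterTopologyCheckedException;

/** Hypothetical test helper; not part of Ignite. */
public class TopologyTolerantLoop {
    /**
     * Runs {@code op} until {@code stop} turns true. A ClusterTopologyCheckedException ends the loop
     * quietly (expected while a node restarts); any other exception propagates and fails the caller.
     */
    public static void run(Callable<?> op, BooleanSupplier stop) throws Exception {
        try {
            while (!stop.getAsBoolean())
                op.call();
        } catch (ClusterTopologyCheckedException ignored) {
            // Benign early exit: a server node was being restarted at that moment.
        }
    }
}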
Use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.
The class GridNearTxEnlistFuture, method onNodeLeft.
/**
* {@inheritDoc}
*/
@Override
public boolean onNodeLeft(UUID nodeId) {
if (batches.keySet().contains(nodeId)) {
if (log.isDebugEnabled())
log.debug("Found unacknowledged batch for left node [nodeId=" + nodeId + ", fut=" + this + ']');
ClusterTopologyCheckedException topEx = new ClusterTopologyCheckedException("Failed to enlist keys " + "(primary node left grid, retry transaction if possible) [node=" + nodeId + ']');
topEx.retryReadyFuture(cctx.shared().nextAffinityReadyFuture(topVer));
onDone(topEx);
}
else if (log.isDebugEnabled())
log.debug("Future does not have mapping for left node (ignoring) [nodeId=" + nodeId + ", fut=" + this + ']');
return false;
}
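The detail worth noting is retryReadyFuture(...): the exception carries a future that completes when the next affinity topology is ready, so a caller that catches it can wait on that future before retrying the transaction. A minimal sketch of building such a retryable failure; the helper class is hypothetical, and the affinity-ready future is assumed to be obtained as in the snippet above (cctx.shared().nextAffinityReadyFuture(topVer)).

import org.apache.ignite.internal.IgniteInternalFuture;
import org.apache.ignite.internal.cluster.ClusterTopologyCheckedException;

/** Hypothetical helper showing how the retryable topology failure in onNodeLeft() is assembled. */
public class EnlistTopologyFailure {
    /**
     * @param nodeId Id of the node that left (used only in the message).
     * @param affReadyFut Future that completes when the next affinity topology is ready
     *        (obtained in the snippet above via cctx.shared().nextAffinityReadyFuture(topVer)).
     */
    public static ClusterTopologyCheckedException retryableEnlistFailure(Object nodeId, IgniteInternalFuture<?> affReadyFut) {
        ClusterTopologyCheckedException topEx = new ClusterTopologyCheckedException("Failed to enlist keys " +
            "(primary node left grid, retry transaction if possible) [node=" + nodeId + ']');

        // A caller that catches the exception can wait on this future before retrying the transaction.
        topEx.retryReadyFuture(affReadyFut);

        return topEx;
    }
}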