Search in sources :

Example 41 with ClusterTopologyCheckedException

use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.

the class GridJobWorker method finishJob.

/**
 * Finishes this job: unless the job has timed out or the reply is suppressed, builds a
 * {@code GridJobExecuteResponse} and sends it to the task node; updates {@code status},
 * records the job events collected along the way and finally notifies {@code evtLsnr}.
 * <p>
 * Only the first invocation does any work; subsequent calls return immediately
 * (guarded by the {@code finishing} flag).
 *
 * @param res Result.
 * @param ex Exception, if any. May be replaced or get suppressed exceptions attached
 *      when marshalling of the response parts fails.
 * @param sndReply If {@code true}, reply will be sent (unless the job was cancelled by the system).
 * @param retry If {@code true}, retry response will be sent (the response then carries the
 *      ready affinity version).
 */
void finishJob(@Nullable Object res, @Nullable IgniteException ex, boolean sndReply, boolean retry) {
    // Avoid finishing a job more than once from different threads.
    if (!finishing.compareAndSet(false, true))
        return;
    // Do not send reply if job has been cancelled from system.
    if (sndReply)
        sndReply = !sysCancelled;
    // Resolve the task (sender) node through discovery up front; null means it is no longer known.
    ClusterNode sndNode = ctx.discovery().node(taskNode.id());
    finishTime = U.currentTimeMillis();
    // Events are only collected here; they are recorded in the finally block below.
    Collection<IgniteBiTuple<Integer, String>> evts = null;
    try {
        if (ses.isFullSupport())
            evtLsnr.onBeforeJobResponseSent(this);
        // Send response back only if job has not timed out.
        if (!isTimedOut()) {
            if (sndReply) {
                if (sndNode == null) {
                    onMasterNodeLeft();
                    U.warn(log, "Failed to reply to sender node because it left grid [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ']');
                    status = FAILED;
                    // Record job reply failure.
                    if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
                        evts = addEvent(evts, EVT_JOB_FAILED, "Job reply failed (task node left grid): " + job);
                } else {
                    try {
                        byte[] resBytes = null;
                        byte[] exBytes = null;
                        byte[] attrBytes = null;
                        // Local delivery without marshalling: sender is this node and local jobs are not marshalled.
                        boolean loc = ctx.localNodeId().equals(sndNode.id()) && !ctx.config().isMarshalLocalJobs();
                        Map<Object, Object> attrs = jobCtx.getAttributes();
                        // Try to serialize response, and if exception - return to client.
                        if (!loc) {
                            try {
                                resBytes = U.marshal(marsh, res);
                            } catch (IgniteCheckedException e) {
                                // Fall back to a marshalled null result and report the failure through 'ex'.
                                resBytes = U.marshal(marsh, null);
                                if (ex != null)
                                    ex.addSuppressed(e);
                                else
                                    ex = U.convertException(e);
                                logError("Failed to serialize job response [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ", resCls=" + (res == null ? null : res.getClass()) + ']', e);
                            }
                            try {
                                attrBytes = U.marshal(marsh, attrs);
                            } catch (IgniteCheckedException e) {
                                // Fall back to an empty attribute map and report the failure through 'ex'.
                                attrBytes = U.marshal(marsh, Collections.emptyMap());
                                if (ex != null)
                                    ex.addSuppressed(e);
                                else
                                    ex = U.convertException(e);
                                logError("Failed to serialize job attributes [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ", attrs=" + attrs + ']', e);
                            }
                            try {
                                exBytes = U.marshal(marsh, ex);
                            } catch (IgniteCheckedException e) {
                                // The exception itself cannot be marshalled: replace it with a plain
                                // IgniteException that carries only the failure message.
                                String msg = "Failed to serialize job exception [nodeId=" + taskNode.id() + ", ses=" + ses + ", jobId=" + ses.getJobId() + ", job=" + job + ", msg=\"" + e.getMessage() + "\"]";
                                ex = new IgniteException(msg);
                                logError(msg, e);
                                exBytes = U.marshal(marsh, ex);
                            }
                        }
                        if (ex != null) {
                            status = FAILED;
                            if (isStarted) {
                                // Job failed.
                                if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
                                    evts = addEvent(evts, EVT_JOB_FAILED, "Job failed due to exception [ex=" + ex + ", job=" + job + ']');
                            } else if (!internal && ctx.event().isRecordable(EVT_JOB_REJECTED))
                                // Job never started, so it is reported as rejected rather than failed.
                                evts = addEvent(evts, EVT_JOB_REJECTED, "Job has not been started " + "[ex=" + ex + ", job=" + job + ']');
                        } else {
                            status = FINISHED;
                            if (!internal && ctx.event().isRecordable(EVT_JOB_FINISHED))
                                evts = addEvent(evts, EVT_JOB_FINISHED, /*no message for success. */
                                null);
                        }
                        // Raw objects are passed only for local delivery; otherwise only the marshalled bytes are set.
                        GridJobExecuteResponse jobRes = new GridJobExecuteResponse(ctx.localNodeId(), ses.getId(), ses.getJobId(), exBytes, loc ? ex : null, resBytes, loc ? res : null, attrBytes, loc ? attrs : null, isCancelled(), retry ? ctx.cache().context().exchange().readyAffinityVersion() : null);
                        long timeout = ses.getEndTime() - U.currentTimeMillis();
                        if (timeout <= 0)
                            // Ignore the actual timeout and send response anyway.
                            timeout = 1;
                        if (ses.isFullSupport()) {
                            // Send response to designated job topic.
                            // Always go through communication to preserve order,
                            // if attributes are enabled.
                            ctx.io().sendOrderedMessage(sndNode, taskTopic, jobRes, internal ? MANAGEMENT_POOL : SYSTEM_POOL, timeout, false);
                        } else if (ctx.localNodeId().equals(sndNode.id()))
                            // Sender is the local node: hand the response to the task processor directly.
                            ctx.task().processJobExecuteResponse(ctx.localNodeId(), jobRes);
                        else
                            // Send response to common topic as unordered message.
                            ctx.io().sendToGridTopic(sndNode, TOPIC_TASK, jobRes, internal ? MANAGEMENT_POOL : SYSTEM_POOL);
                    } catch (IgniteCheckedException e) {
                        // Log and invoke the master-leave callback.
                        if ((e instanceof ClusterTopologyCheckedException) || isDeadNode(taskNode.id())) {
                            onMasterNodeLeft();
                            // Avoid stack trace for left nodes.
                            U.warn(log, "Failed to reply to sender node because it left grid " + "[nodeId=" + taskNode.id() + ", jobId=" + ses.getJobId() + ", ses=" + ses + ", job=" + job + ']');
                        } else
                            logError("Error sending reply for job [nodeId=" + sndNode.id() + ", jobId=" + ses.getJobId() + ", ses=" + ses + ", job=" + job + ']', e);
                        if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
                            evts = addEvent(evts, EVT_JOB_FAILED, "Failed to send reply for job [nodeId=" + taskNode.id() + ", job=" + job + ']');
                    }// Catch-all for any other exception thrown while building or sending the reply.
                     catch (Exception e) {
                        String msg = "Failed to send reply for job [nodeId=" + taskNode.id() + ", job=" + job + ']';
                        logError(msg, e);
                        if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
                            evts = addEvent(evts, EVT_JOB_FAILED, msg);
                    }
                }
            } else {
                // No reply is sent: only update the status and collect the matching event.
                if (ex != null) {
                    status = FAILED;
                    if (isStarted) {
                        if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
                            evts = addEvent(evts, EVT_JOB_FAILED, "Job failed due to exception [ex=" + ex + ", job=" + job + ']');
                    } else if (!internal && ctx.event().isRecordable(EVT_JOB_REJECTED))
                        evts = addEvent(evts, EVT_JOB_REJECTED, "Job has not been started [ex=" + ex + ", job=" + job + ']');
                } else {
                    status = FINISHED;
                    if (!internal && ctx.event().isRecordable(EVT_JOB_FINISHED))
                        evts = addEvent(evts, EVT_JOB_FINISHED, /*no message for success. */
                        null);
                }
            }
        } else {
            // Job timed out.
            status = FAILED;
            if (!internal && ctx.event().isRecordable(EVT_JOB_FAILED))
                evts = addEvent(evts, EVT_JOB_FAILED, "Job failed due to timeout: " + job);
        }
    } finally {
        // Record all events collected above, regardless of how the try block completed.
        if (evts != null) {
            for (IgniteBiTuple<Integer, String> t : evts) recordEvent(t.get1(), t.get2());
        }
        // Listener callback.
        evtLsnr.onJobFinished(this);
    }
}
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) IgniteBiTuple(org.apache.ignite.lang.IgniteBiTuple) ComputeExecutionRejectedException(org.apache.ignite.compute.ComputeExecutionRejectedException) GridServiceNotFoundException(org.apache.ignite.internal.processors.service.GridServiceNotFoundException) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) NodeStoppingException(org.apache.ignite.internal.NodeStoppingException) IgniteInterruptedCheckedException(org.apache.ignite.internal.IgniteInterruptedCheckedException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) ComputeUserUndeclaredException(org.apache.ignite.compute.ComputeUserUndeclaredException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) GridJobExecuteResponse(org.apache.ignite.internal.GridJobExecuteResponse) GridTimeoutObject(org.apache.ignite.internal.processors.timeout.GridTimeoutObject) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Example 42 with ClusterTopologyCheckedException

use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.

the class GridTaskWorker method sendRequest.

/**
 * Sends a job execution request for the given job result to the node the job is mapped to.
 * <p>
 * If the node is no longer known to discovery, or sending fails with an
 * {@link IgniteCheckedException}, a fake {@link GridJobExecuteResponse} carrying the error
 * is passed to {@code onResponse} instead. If the session has already timed out, only a
 * warning is logged.
 *
 * @param res Job result.
 */
private void sendRequest(ComputeJobResult res) {
    assert res != null;

    GridJobExecuteRequest jobReq = null;

    ClusterNode node = res.getNode();

    try {
        // Look the node up in discovery before sending: a missing node has already left the grid.
        if (ctx.discovery().node(node.id()) == null) {
            if (log.isDebugEnabled()) {
                U.warn(log, "Failed to send job request because remote node left grid (if fail-over is enabled, "
                    + "will attempt fail-over to another node) [node=" + node
                    + ", taskName=" + ses.getTaskName()
                    + ", taskSesId=" + ses.getId()
                    + ", jobSesId=" + res.getJobContext().getJobId() + ']');
            }

            ctx.resource().invokeAnnotated(dep, res.getJob(), ComputeJobAfterSend.class);

            GridJobExecuteResponse leftNodeRes = new GridJobExecuteResponse(
                node.id(), ses.getId(), res.getJobContext().getJobId(),
                null, null, null, null, null, null, false, null);

            leftNodeRes.setFakeException(new ClusterTopologyException("Failed to send job due to node failure: " + node));

            onResponse(leftNodeRes);

            return;
        }

        // Remaining session time; an unbounded session keeps an unbounded timeout.
        long timeout;

        if (ses.getEndTime() == Long.MAX_VALUE)
            timeout = Long.MAX_VALUE;
        else
            timeout = ses.getEndTime() - U.currentTimeMillis();

        if (timeout <= 0) {
            U.warn(log, "Job timed out prior to sending job execution request: " + res.getJob());

            return;
        }

        // Local execution without marshalling: target is the local node and local jobs are not marshalled.
        boolean loc = node.id().equals(ctx.discovery().localNode().id()) && !ctx.config().isMarshalLocalJobs();

        Map<Object, Object> sesAttrs = null;

        if (ses.isFullSupport())
            sesAttrs = ses.getAttributes();

        Map<? extends Serializable, ? extends Serializable> jobAttrs =
            (Map<? extends Serializable, ? extends Serializable>) res.getJobContext().getAttributes();

        boolean forceLocDep = internal || !ctx.deploy().enabled();

        try {
            // Receiver version is set only for the duration of request creation and reset afterwards.
            MarshallerUtils.jobReceiverVersion(node.version());

            jobReq = new GridJobExecuteRequest(
                ses.getId(),
                res.getJobContext().getJobId(),
                ses.getTaskName(),
                ses.getUserVersion(),
                ses.getTaskClassName(),
                loc ? null : U.marshal(marsh, res.getJob()),
                loc ? res.getJob() : null,
                ses.getStartTime(),
                timeout,
                ses.getTopology(),
                loc ? ses.getTopologyPredicate() : null,
                loc ? null : U.marshal(marsh, ses.getTopologyPredicate()),
                loc ? null : U.marshal(marsh, ses.getJobSiblings()),
                loc ? ses.getJobSiblings() : null,
                loc ? null : U.marshal(marsh, sesAttrs),
                loc ? sesAttrs : null,
                loc ? null : U.marshal(marsh, jobAttrs),
                loc ? jobAttrs : null,
                ses.getCheckpointSpi(),
                dep.classLoaderId(),
                dep.deployMode(),
                continuous,
                dep.participants(),
                forceLocDep,
                ses.isFullSupport(),
                internal,
                affCacheIds,
                affPartId,
                mapTopVer,
                ses.executorName());
        }
        finally {
            MarshallerUtils.jobReceiverVersion(null);
        }

        if (loc) {
            // Hand the request straight to the local job processor.
            ctx.job().processJobExecuteRequest(ctx.discovery().localNode(), jobReq);

            return;
        }

        // Choose IO policy: management pool for internal tasks, otherwise the thread-context
        // policy when one is set, falling back to the public pool.
        byte plc;

        if (internal)
            plc = MANAGEMENT_POOL;
        else {
            Byte ctxPlc = getThreadContext(TC_IO_POLICY);

            plc = ctxPlc == null ? PUBLIC_POOL : ctxPlc;
        }

        // Send job execution request.
        ctx.io().sendToGridTopic(node, TOPIC_JOB, jobReq, plc);

        if (log.isDebugEnabled())
            log.debug("Sent job request [req=" + jobReq + ", node=" + node + ']');

        ctx.resource().invokeAnnotated(dep, res.getJob(), ComputeJobAfterSend.class);
    }
    catch (IgniteCheckedException e) {
        IgniteException fakeErr = null;

        try {
            // Avoid stack trace if node has left grid.
            if (e instanceof ClusterTopologyCheckedException || isDeadNode(res.getNode().id())) {
                if (log.isDebugEnabled()) {
                    U.warn(log, "Failed to send job request because remote node left grid (if failover is enabled, "
                        + "will attempt fail-over to another node) [node=" + node
                        + ", taskName=" + ses.getTaskName()
                        + ", taskSesId=" + ses.getId()
                        + ", jobSesId=" + res.getJobContext().getJobId() + ']');
                }

                fakeErr = new ClusterTopologyException("Failed to send job due to node failure: " + node, e);
            }
            else if (log.isDebugEnabled())
                U.error(log, "Failed to send job request: " + jobReq, e);
        }
        catch (IgniteClientDisconnectedCheckedException disconnected) {
            if (log.isDebugEnabled())
                log.debug("Failed to send job request, client disconnected [node=" + node
                    + ", taskName=" + ses.getTaskName()
                    + ", taskSesId=" + ses.getId()
                    + ", jobSesId=" + res.getJobContext().getJobId() + ']');

            fakeErr = U.convertException(disconnected);
        }

        GridJobExecuteResponse failedRes = new GridJobExecuteResponse(
            node.id(), ses.getId(), res.getJobContext().getJobId(),
            null, null, null, null, null, null, false, null);

        // If no specific error was chosen above, report the original send failure.
        failedRes.setFakeException(fakeErr != null ? fakeErr : U.convertException(e));

        onResponse(failedRes);
    }
}
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) Serializable(java.io.Serializable) GridJobExecuteRequest(org.apache.ignite.internal.GridJobExecuteRequest) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) GridJobExecuteResponse(org.apache.ignite.internal.GridJobExecuteResponse) ComputeJobAfterSend(org.apache.ignite.compute.ComputeJobAfterSend) IgniteException(org.apache.ignite.IgniteException) IgniteClientDisconnectedCheckedException(org.apache.ignite.internal.IgniteClientDisconnectedCheckedException) ClusterTopologyException(org.apache.ignite.cluster.ClusterTopologyException) GridTimeoutObject(org.apache.ignite.internal.processors.timeout.GridTimeoutObject) Map(java.util.Map) EnumMap(java.util.EnumMap) HashMap(java.util.HashMap) Collections.emptyMap(java.util.Collections.emptyMap) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Example 43 with ClusterTopologyCheckedException

use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.

the class CommunicationWorker method processDisconnect.

/**
 * Handles a disconnected session by trying to reserve (and immediately release) a
 * communication client to the remote node. If the attempt fails while the node is still
 * alive and responds to ping, the request is queued again for another attempt.
 *
 * @param sesInfo Disconnected session information.
 */
private void processDisconnect(DisconnectedSessionInfo sesInfo) {
    final GridNioRecoveryDescriptor desc = sesInfo.recoveryDescription();
    final ClusterNode rmtNode = desc.node();

    // Nothing to recover if the descriptor reports the node as not alive.
    boolean alive = desc.nodeAlive(nodeGetter.apply(rmtNode.id()));

    if (!alive)
        return;

    try {
        if (log.isDebugEnabled())
            log.debug("Recovery reconnect [rmtNode=" + desc.node().id() + ']');

        // The reserved client is released right away.
        GridCommunicationClient reserved = clientPool.reserveClient(rmtNode, sesInfo.connectionIndex());

        reserved.release();
    }
    catch (ClusterTopologyCheckedException e) {
        if (log.isDebugEnabled())
            log.debug("Recovery reconnect failed, node stopping [rmtNode=" + desc.node().id() + ']');
    }
    catch (IgniteTooManyOpenFilesException e) {
        // Report to the exception registry, then propagate to the caller.
        eRegistrySupplier.get().onException(e.getMessage(), e);

        throw e;
    }
    catch (IgniteCheckedException | IgniteException e) {
        try {
            // Retry only while the node is still alive and answers ping.
            boolean retry = desc.nodeAlive(nodeGetter.apply(rmtNode.id())) && pingNode.apply(rmtNode.id());

            if (retry) {
                if (log.isDebugEnabled())
                    log.debug("Recovery reconnect failed, will retry [rmtNode=" + desc.node().id() + ", err=" + e + ']');

                addProcessDisconnectRequest(sesInfo);
            }
            else {
                if (log.isDebugEnabled())
                    log.debug("Recovery reconnect failed, node left [rmtNode=" + desc.node().id() + ", err=" + e + ']');

                eRegistrySupplier.get().onException(
                    "Recovery reconnect failed, node left [rmtNode=" + desc.node().id() + "]", e);
            }
        }
        catch (IgniteClientDisconnectedException ignored) {
            if (log.isDebugEnabled())
                log.debug("Failed to ping node, client disconnected.");
        }
    }
}
Also used : ClusterNode(org.apache.ignite.cluster.ClusterNode) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) IgniteException(org.apache.ignite.IgniteException) IgniteClientDisconnectedException(org.apache.ignite.IgniteClientDisconnectedException) GridNioRecoveryDescriptor(org.apache.ignite.internal.util.nio.GridNioRecoveryDescriptor) GridCommunicationClient(org.apache.ignite.internal.util.nio.GridCommunicationClient) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) IgniteTooManyOpenFilesException(org.apache.ignite.internal.IgniteTooManyOpenFilesException)

Example 44 with ClusterTopologyCheckedException

use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.

the class AuthenticationProcessorNodeRestartTest method testConcurrentAuthorize.

/**
 * Creates test users, then authenticates them concurrently from {@code CLI_NODE} while
 * a background task repeatedly stops and starts randomly chosen nodes.
 *
 * @throws Exception If failed.
 */
@Test
public void testConcurrentAuthorize() throws Exception {
    final int testUsersCnt = 10;

    withSecurityContextOnAllNodes(secCtxDflt);

    for (int usrIdx = 0; usrIdx < testUsersCnt; usrIdx++) {
        grid(CLI_NODE).context().security().createUser("test" + usrIdx, ("passwd_test" + usrIdx).toCharArray());
    }

    // Background restarts: each iteration stops a random node and starts it again.
    final IgniteInternalFuture restartFut = GridTestUtils.runAsync(() -> {
        try {
            int restart = 0;

            while (restart < RESTARTS) {
                int nodeIdx = RND.nextInt(NODES_COUNT - 1);

                stopGrid(nodeIdx);
                U.sleep(500);

                startGrid(nodeIdx);
                U.sleep(500);

                ++restart;
            }
        }
        catch (Exception e) {
            log.error("Unexpected exception.", e);

            fail("Unexpected exception on server restart: " + e.getMessage());
        }
    });

    // Each worker thread takes the next user index.
    final AtomicInteger usrCnt = new AtomicInteger();

    GridTestUtils.runMultiThreaded(() -> {
        String user = "test" + usrCnt.getAndIncrement();

        try {
            // Keep authenticating until the restart task is done.
            while (!restartFut.isDone())
                assertNotNull(authenticate(grid(CLI_NODE), user, "passwd_" + user));
        }
        catch (ClusterTopologyCheckedException ignored) {
            // No-op.
        }
        catch (Exception e) {
            log.error("Unexpected exception.", e);

            fail("Unexpected exception: " + e.getMessage());
        }
    }, testUsersCnt, "user-op");

    restartFut.get();
}
Also used : AtomicInteger(java.util.concurrent.atomic.AtomicInteger) SecurityContext(org.apache.ignite.internal.processors.security.SecurityContext) IgniteInternalFuture(org.apache.ignite.internal.IgniteInternalFuture) IgniteCheckedException(org.apache.ignite.IgniteCheckedException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException) GridCommonAbstractTest(org.apache.ignite.testframework.junits.common.GridCommonAbstractTest) Test(org.junit.Test)

Example 45 with ClusterTopologyCheckedException

use of org.apache.ignite.internal.cluster.ClusterTopologyCheckedException in project ignite by apache.

the class GridNearTxEnlistFuture method onNodeLeft.

/**
 * {@inheritDoc}
 * <p>
 * If this future has an unacknowledged batch for the left node, it is completed with a
 * {@link ClusterTopologyCheckedException} whose retry-ready future is the next affinity
 * ready future for {@code topVer}.
 *
 * @param nodeId ID of the node that left.
 * @return {@code true} if this future had a batch for the left node and was completed with
 *      an error, {@code false} if the node is not mapped by this future.
 */
@Override
public boolean onNodeLeft(UUID nodeId) {
    // Node is not mapped by this future: nothing to do.
    if (!batches.containsKey(nodeId)) {
        if (log.isDebugEnabled())
            log.debug("Future does not have mapping for left node (ignoring) [nodeId=" + nodeId + ", fut=" + this + ']');

        return false;
    }

    if (log.isDebugEnabled())
        log.debug("Found unacknowledged batch for left node [nodeId=" + nodeId + ", fut=" + this + ']');

    ClusterTopologyCheckedException topEx = new ClusterTopologyCheckedException("Failed to enlist keys " + "(primary node left grid, retry transaction if possible) [node=" + nodeId + ']');

    topEx.retryReadyFuture(cctx.shared().nextAffinityReadyFuture(topVer));

    onDone(topEx);

    // Previously execution fell through here, logged "does not have mapping" and returned false
    // even though the node had just been handled.
    return true;
}
Also used : ClusterTopologyCheckedException(org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)

Aggregations

ClusterTopologyCheckedException (org.apache.ignite.internal.cluster.ClusterTopologyCheckedException)112 IgniteCheckedException (org.apache.ignite.IgniteCheckedException)82 ClusterNode (org.apache.ignite.cluster.ClusterNode)62 UUID (java.util.UUID)31 AffinityTopologyVersion (org.apache.ignite.internal.processors.affinity.AffinityTopologyVersion)25 Map (java.util.Map)23 IgniteInternalFuture (org.apache.ignite.internal.IgniteInternalFuture)22 HashMap (java.util.HashMap)20 ArrayList (java.util.ArrayList)18 IgniteException (org.apache.ignite.IgniteException)18 Collection (java.util.Collection)16 ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap)15 KeyCacheObject (org.apache.ignite.internal.processors.cache.KeyCacheObject)15 GridCacheEntryRemovedException (org.apache.ignite.internal.processors.cache.GridCacheEntryRemovedException)14 GridCacheContext (org.apache.ignite.internal.processors.cache.GridCacheContext)12 IgniteSpiException (org.apache.ignite.spi.IgniteSpiException)12 Nullable (org.jetbrains.annotations.Nullable)12 List (java.util.List)11 ConcurrentMap (java.util.concurrent.ConcurrentMap)11 ClusterTopologyServerNotFoundException (org.apache.ignite.internal.cluster.ClusterTopologyServerNotFoundException)11