Search in sources :

Example 1 with JobTerminateRequestedException

use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.

the class MasterJobContext method finalizeJob.

/**
 * Finalizes the current execution of the job and decides what happens next:
 * restart, suspend, or final completion (successful or failed).
 * <p>
 * The whole procedure is submitted to the coordinator thread. The state
 * transition itself is done while holding {@code mc.lock()}; the follow-up
 * action chosen by the transition ({@code nonSynchronizedAction}) is run only
 * after the lock has been released.
 *
 * @param failure the failure the execution ended with, or {@code null} if
 *                there was none
 */
void finalizeJob(@Nullable Throwable failure) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        // Action selected under the lock but executed after unlocking.
        final Runnable nonSynchronizedAction;
        mc.lock();
        try {
            JobStatus status = mc.jobStatus();
            if (status == COMPLETED || status == FAILED) {
                // The job is already in a terminal state: only log and leave the lambda.
                // Note that on this path neither executionCompletionFuture is completed
                // nor any follow-up action is run.
                logIgnoredCompletion(failure, status);
                return;
            }
            completeVertices(failure);
            mc.getJetServiceBackend().getJobClassLoaderService().tryRemoveClassloadersForJob(mc.jobId(), COORDINATOR);
            // Non-null only if the failure is a JobTerminateRequestedException, i.e. a termination was requested.
            ActionAfterTerminate terminationModeAction = failure instanceof JobTerminateRequestedException ? ((JobTerminateRequestedException) failure).mode().actionAfterTerminate() : null;
            mc.snapshotContext().onExecutionTerminated();
            // if restart was requested, restart immediately
            if (terminationModeAction == RESTART) {
                mc.setJobStatus(NOT_RUNNING);
                nonSynchronizedAction = () -> mc.coordinationService().restartJob(mc.jobId());
            } else if (!isCancelled() && isRestartableException(failure) && mc.jobConfig().isAutoScaling()) {
                // if restart is due to a failure, schedule a restart after a delay
                scheduleRestart();
                nonSynchronizedAction = NO_OP;
            // '&&' binds tighter than '||': suspend either when SUSPEND was requested, or when the
            // failure is restartable, the job is not cancelled, auto-scaling is off and the
            // processing guarantee is not NONE.
            } else if (terminationModeAction == SUSPEND || isRestartableException(failure) && !isCancelled() && !mc.jobConfig().isAutoScaling() && mc.jobConfig().getProcessingGuarantee() != NONE) {
                mc.setJobStatus(SUSPENDED);
                // No suspension cause text is recorded on this path.
                mc.jobExecutionRecord().setSuspended(null);
                nonSynchronizedAction = () -> mc.writeJobExecutionRecord(false);
            } else if (failure != null && !isCancelled() && mc.jobConfig().isSuspendOnFailure()) {
                // Suspend-on-failure is configured: suspend and record the failure's stack trace as the cause.
                mc.setJobStatus(SUSPENDED);
                mc.jobExecutionRecord().setSuspended("Execution failure:\n" + ExceptionUtil.stackTraceToString(failure));
                nonSynchronizedAction = () -> mc.writeJobExecutionRecord(false);
            } else {
                // None of the restart/suspend cases applies: the job reaches a terminal status.
                long completionTime = System.currentTimeMillis();
                boolean isSuccess = logExecutionSummary(failure, completionTime);
                mc.setJobStatus(isSuccess ? COMPLETED : FAILED);
                if (failure instanceof LocalMemberResetException) {
                    // completeJob() is skipped here; the final result is a CancellationException.
                    logger.fine("Cancelling job " + mc.jobIdString() + " locally: member (local or remote) reset. " + "We don't delete job metadata: job will restart on majority cluster");
                    setFinalResult(new CancellationException());
                } else {
                    // The final result is set only if completeJob() finished without an error;
                    // otherwise the error is just logged.
                    mc.coordinationService().completeJob(mc, failure, completionTime).whenComplete(withTryCatch(logger, (r, f) -> {
                        if (f != null) {
                            logger.warning("Completion of " + mc.jobIdString() + " failed", f);
                        } else {
                            setFinalResult(failure);
                        }
                    }));
                }
                nonSynchronizedAction = NO_OP;
            }
            // reset the state for the next execution
            requestedTerminationMode = null;
            executionFailureCallback = null;
        } finally {
            mc.unlock();
        }
        // Outside the lock: signal that the execution has completed, then run the chosen follow-up action.
        executionCompletionFuture.complete(null);
        nonSynchronizedAction.run();
    });
}
Also used : JobStatus(com.hazelcast.jet.core.JobStatus) ActionAfterTerminate(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate) CancellationException(java.util.concurrent.CancellationException) LocalMemberResetException(com.hazelcast.core.LocalMemberResetException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException)

Example 2 with JobTerminateRequestedException

use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.

the class MasterJobContext method getErrorFromResponses.

/**
 * Reduces the per-member responses of an operation to a single outcome:
 * <ul>
 * <li>{@code null} if there is no failure.
 * <li>A {@link CancellationException} if the job is cancelled forcefully.
 * <li>A {@link JobTerminateRequestedException} if the current execution is
 *     stopped due to a requested termination; for CANCEL_GRACEFUL a
 *     {@code CancellationException} is returned instead.
 * <li>If at least one member reported a user failure, such as an exception
 *     in user code (restartable or not), that failure.
 * <li>Otherwise the failure is because a job participant has left the
 *     cluster; a {@link TopologyChangedException} is returned so that the
 *     job will be restarted.
 * </ul>
 *
 * @param opName    name of the operation the responses belong to, used in log messages
 * @param responses the response (a result or a {@code Throwable}) of each member
 */
private Throwable getErrorFromResponses(String opName, Collection<Map.Entry<MemberInfo, Object>> responses) {
    if (isCancelled()) {
        logger.fine(mc.jobIdString() + " to be cancelled after " + opName);
        return new CancellationException();
    }

    // Key every response by the member's address and split them:
    // true -> the response is a Throwable, false -> anything else.
    Map<Boolean, List<Entry<Address, Object>>> byOutcome = responses.stream()
            .map(response -> entry(response.getKey().getAddress(), response.getValue()))
            .collect(partitioningBy(addressed -> addressed.getValue() instanceof Throwable));

    List<Entry<Address, Object>> succeeded = byOutcome.getOrDefault(false, emptyList());
    if (succeeded.size() == mc.executionPlanMap().size()) {
        logger.fine(opName + " of " + mc.jobIdString() + " was successful");
        return null;
    }

    List<Entry<Address, Object>> failed = byOutcome.getOrDefault(true, emptyList());
    if (!failed.isEmpty()) {
        logger.fine(opName + " of " + mc.jobIdString() + " has failures: " + failed);
    }

    // This case applies only when no reported failure is anything other than a
    // TerminatedWithSnapshotException. If there are other exceptions, ignore this
    // and handle the other exception.
    boolean onlyTerminalSnapshotFailures = failed.stream()
            .noneMatch(reported -> !(reported.getValue() instanceof TerminatedWithSnapshotException));
    if (onlyTerminalSnapshotFailures) {
        assert opName.equals("Execution") : "opName is '" + opName + "', expected 'Execution'";
        logger.fine(opName + " of " + mc.jobIdString() + " terminated after a terminal snapshot");
        TerminationMode requested = requestedTerminationMode;
        assert requested != null && requested.isWithTerminalSnapshot() : "mode=" + requested;
        if (requested == CANCEL_GRACEFUL) {
            return new CancellationException();
        }
        return new JobTerminateRequestedException(requested);
    }

    // Split the failures again: true -> cancellation, terminal-snapshot or topology
    // exceptions, false -> everything else.
    Map<Boolean, List<Entry<Address, Object>>> byCause = failed.stream().collect(Collectors.partitioningBy(reported -> {
        Object cause = reported.getValue();
        return cause instanceof CancellationException
                || cause instanceof TerminatedWithSnapshotException
                || isTopologyException((Throwable) cause);
    }));
    List<Entry<Address, Object>> topologyRelated = byCause.getOrDefault(true, emptyList());
    List<Entry<Address, Object>> unrelated = byCause.getOrDefault(false, emptyList());

    // If all exceptions are of the types above, treat it as TopologyChangedException;
    // otherwise the first remaining failure wins.
    if (unrelated.isEmpty()) {
        return new TopologyChangedException("Causes from members: " + topologyRelated);
    }
    return (Throwable) unrelated.get(0).getValue();
}
Also used : Address(com.hazelcast.cluster.Address) SUSPEND(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate.SUSPEND) NOT_RUNNING(com.hazelcast.jet.core.JobStatus.NOT_RUNNING) GetLocalJobMetricsOperation(com.hazelcast.jet.impl.operation.GetLocalJobMetricsOperation) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) NonCompletableFuture(com.hazelcast.jet.impl.util.NonCompletableFuture) ExceptionUtil.isTopologyException(com.hazelcast.jet.impl.util.ExceptionUtil.isTopologyException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException) SourceProcessors.readMapP(com.hazelcast.jet.core.processor.SourceProcessors.readMapP) RESTART(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate.RESTART) JetDelegatingClassLoader(com.hazelcast.jet.impl.deployment.JetDelegatingClassLoader) TerminatedWithSnapshotException(com.hazelcast.jet.impl.exception.TerminatedWithSnapshotException) Collectors.toMap(java.util.stream.Collectors.toMap) Functions.entryKey(com.hazelcast.function.Functions.entryKey) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) STARTING(com.hazelcast.jet.core.JobStatus.STARTING) SUSPENDED(com.hazelcast.jet.core.JobStatus.SUSPENDED) DAG(com.hazelcast.jet.core.DAG) JobStatus(com.hazelcast.jet.core.JobStatus) ExceptionUtil(com.hazelcast.jet.impl.util.ExceptionUtil) JobMetrics(com.hazelcast.jet.core.metrics.JobMetrics) CancellationException(java.util.concurrent.CancellationException) CANCEL_GRACEFUL(com.hazelcast.jet.impl.TerminationMode.CANCEL_GRACEFUL) Collections.emptyList(java.util.Collections.emptyList) Collection(java.util.Collection) Set(java.util.Set) UUID(java.util.UUID) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Collectors(java.util.stream.Collectors) CANCEL_FORCEFUL(com.hazelcast.jet.impl.TerminationMode.CANCEL_FORCEFUL) Objects(java.util.Objects) Util(com.hazelcast.jet.impl.util.Util) List(java.util.List) 
Util.idToString(com.hazelcast.jet.Util.idToString) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) MetricNames(com.hazelcast.jet.core.metrics.MetricNames) Entry(java.util.Map.Entry) TopologyChangedException(com.hazelcast.jet.core.TopologyChangedException) COMPLETED(com.hazelcast.jet.core.JobStatus.COMPLETED) JetDisabledException(com.hazelcast.jet.impl.exception.JetDisabledException) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) ExecutionPlanBuilder.createExecutionPlans(com.hazelcast.jet.impl.execution.init.ExecutionPlanBuilder.createExecutionPlans) Collectors.partitioningBy(java.util.stream.Collectors.partitioningBy) TerminateExecutionOperation(com.hazelcast.jet.impl.operation.TerminateExecutionOperation) ExceptionUtil.isRestartableException(com.hazelcast.jet.impl.util.ExceptionUtil.isRestartableException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) LoggingUtil.logFinest(com.hazelcast.jet.impl.util.LoggingUtil.logFinest) Util.doWithClassLoader(com.hazelcast.jet.impl.util.Util.doWithClassLoader) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionService(com.hazelcast.spi.impl.executionservice.ExecutionService) StartExecutionOperation(com.hazelcast.jet.impl.operation.StartExecutionOperation) Function(java.util.function.Function) Supplier(java.util.function.Supplier) Util.formatJobDuration(com.hazelcast.jet.impl.util.Util.formatJobDuration) ActionAfterTerminate(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) JetException(com.hazelcast.jet.JetException) HashSet(java.util.HashSet) InitExecutionOperation(com.hazelcast.jet.impl.operation.InitExecutionOperation) COORDINATOR(com.hazelcast.jet.impl.JobClassLoaderService.JobPhase.COORDINATOR) ILogger(com.hazelcast.logging.ILogger) 
SnapshotValidator.validateSnapshot(com.hazelcast.jet.impl.SnapshotValidator.validateSnapshot) ExceptionUtil.rethrow(com.hazelcast.jet.impl.util.ExceptionUtil.rethrow) Operation(com.hazelcast.spi.impl.operationservice.Operation) Util.entry(com.hazelcast.jet.Util.entry) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) BiConsumer(java.util.function.BiConsumer) MembersView(com.hazelcast.internal.cluster.impl.MembersView) LocalMemberResetException(com.hazelcast.core.LocalMemberResetException) RESTART_GRACEFUL(com.hazelcast.jet.impl.TerminationMode.RESTART_GRACEFUL) Edge(com.hazelcast.jet.core.Edge) Version(com.hazelcast.version.Version) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Tuple2(com.hazelcast.jet.datamodel.Tuple2) Nullable(javax.annotation.Nullable) Job(com.hazelcast.jet.Job) Measurement(com.hazelcast.jet.core.metrics.Measurement) SUSPENDED_EXPORTING_SNAPSHOT(com.hazelcast.jet.core.JobStatus.SUSPENDED_EXPORTING_SNAPSHOT) Util.toList(com.hazelcast.jet.impl.util.Util.toList) RawJobMetrics(com.hazelcast.jet.impl.metrics.RawJobMetrics) MetricTags(com.hazelcast.jet.core.metrics.MetricTags) NONE(com.hazelcast.jet.config.ProcessingGuarantee.NONE) Consumer(java.util.function.Consumer) Vertex(com.hazelcast.jet.core.Vertex) Tuple2.tuple2(com.hazelcast.jet.datamodel.Tuple2.tuple2) CustomClassLoadedObject.deserializeWithCustomClassLoader(com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject.deserializeWithCustomClassLoader) ExceptionUtil.peel(com.hazelcast.jet.impl.util.ExceptionUtil.peel) FAILED(com.hazelcast.jet.core.JobStatus.FAILED) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Collections(java.util.Collections) IMap(com.hazelcast.map.IMap) Edge.between(com.hazelcast.jet.core.Edge.between) TerminatedWithSnapshotException(com.hazelcast.jet.impl.exception.TerminatedWithSnapshotException) Address(com.hazelcast.cluster.Address) 
TopologyChangedException(com.hazelcast.jet.core.TopologyChangedException) Entry(java.util.Map.Entry) CancellationException(java.util.concurrent.CancellationException) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) ArrayList(java.util.ArrayList) Util.toList(com.hazelcast.jet.impl.util.Util.toList) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException)

Example 3 with JobTerminateRequestedException

use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.

the class ExecutionLifecycleTest method assertPmsClosedWithError.

/**
 * Asserts that {@code MockPMS} recorded both an init and a close call, and
 * that the error it received on close has one of the accepted exceptions
 * among its causes: {@code MOCK_ERROR}, a {@link CancellationException} or a
 * {@link JobTerminateRequestedException} for {@code CANCEL_FORCEFUL}.
 */
private void assertPmsClosedWithError() {
    final boolean initRecorded = MockPMS.initCalled.get();
    assertTrue("init not called", initRecorded);

    final boolean closeRecorded = MockPMS.closeCalled.get();
    assertTrue("close not called", closeRecorded);

    final CancellationException cancellation = new CancellationException();
    final JobTerminateRequestedException forcefulTermination = new JobTerminateRequestedException(CANCEL_FORCEFUL);
    assertOneOfExceptionsInCauses(MockPMS.receivedCloseError.get(), MOCK_ERROR, cancellation, forcefulTermination);
}
Also used : CancellationException(java.util.concurrent.CancellationException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException)

Example 4 with JobTerminateRequestedException

use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.

the class ExecutionLifecycleTest method assertPsClosedWithError.

/**
 * Asserts that {@code MockPS} was initialized {@code MEMBER_COUNT} times,
 * that the number of close calls and of received close errors lies within
 * the expected bounds, and that every received close error has one of the
 * accepted exceptions among its causes: {@code MOCK_ERROR}, a
 * {@link CancellationException} or a {@link JobTerminateRequestedException}
 * for {@code CANCEL_FORCEFUL}.
 */
private void assertPsClosedWithError() {
    assertEquals(MEMBER_COUNT, MockPS.initCount.get());

    // With light jobs, init may not be called on all the members: the execution on one
    // member can be cancelled due to the failure on another member before it was
    // initialized. Hence only one close is guaranteed in that case.
    final int lowerBound;
    if (useLightJob) {
        lowerBound = 1;
    } else {
        lowerBound = MEMBER_COUNT;
    }
    assertBetween("close count", MockPS.closeCount.get(), lowerBound, MEMBER_COUNT);
    assertBetween("received close errors", MockPS.receivedCloseErrors.size(), lowerBound, MEMBER_COUNT);

    // The size is re-read on every pass, exactly like an index-based for loop would do.
    int index = 0;
    while (index < MockPS.receivedCloseErrors.size()) {
        final CancellationException cancellation = new CancellationException();
        final JobTerminateRequestedException forcefulTermination = new JobTerminateRequestedException(CANCEL_FORCEFUL);
        assertOneOfExceptionsInCauses(MockPS.receivedCloseErrors.get(index), MOCK_ERROR, cancellation, forcefulTermination);
        index++;
    }
}
Also used : CancellationException(java.util.concurrent.CancellationException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException)

Example 5 with JobTerminateRequestedException

use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.

the class JobExecutionService method terminateExecution.

/**
 * Terminates the locally registered execution with the given
 * {@code executionId}, after validating that the caller is allowed to
 * request it. If no execution context is registered under that id, the call
 * returns without doing anything.
 *
 * @param jobId         the job id; used only for the error message
 * @param executionId   id used to look up the execution context
 * @param callerAddress address of the member requesting the termination
 * @param mode          the termination mode; if {@code null}, the execution is
 *                      terminated with a {@link CancellationException} as the
 *                      cause, otherwise with a {@link JobTerminateRequestedException}
 * @throws IllegalStateException if, for a non-light job, the caller is not
 *         the master, or if the execution context has a coordinator that
 *         differs from the caller
 */
public void terminateExecution(long jobId, long executionId, Address callerAddress, TerminationMode mode) {
    failIfNotRunning();

    final ExecutionContext context = executionContexts.get(executionId);
    if (context == null) {
        // No execution context is registered under this id, so there is nothing to
        // terminate here. We ignore the request and rely on the CheckLightJobsOperation.
        return;
    }

    // Only for non-light jobs the caller must be the current master.
    if (!context.isLightJob()) {
        final Address master = nodeEngine.getMasterAddress();
        if (!callerAddress.equals(master)) {
            failIfNotRunning();
            throw new IllegalStateException(String.format("Caller %s cannot do '%s' for terminateExecution: it is not the master, the master is %s", callerAddress, jobIdAndExecutionId(jobId, executionId), master));
        }
    }

    // When a coordinator is recorded in the context, it must be the caller.
    final Address recordedCoordinator = context.coordinator();
    if (recordedCoordinator != null) {
        if (!recordedCoordinator.equals(callerAddress)) {
            throw new IllegalStateException(String.format("%s, originally from coordinator %s, cannot do 'terminateExecution' by coordinator %s and execution %s", context.jobNameAndExecutionId(), recordedCoordinator, callerAddress, idToString(executionId)));
        }
    } else {
        // It can't happen for normal jobs
        assert context.isLightJob() : "null coordinator for non-light job";
    }

    final Exception cause;
    if (mode == null) {
        cause = new CancellationException();
    } else {
        cause = new JobTerminateRequestedException(mode);
    }
    terminateExecution0(context, mode, cause);
}
Also used : ExecutionContext(com.hazelcast.jet.impl.execution.ExecutionContext) Address(com.hazelcast.cluster.Address) CancellationException(java.util.concurrent.CancellationException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException) HazelcastInstanceNotActiveException(com.hazelcast.core.HazelcastInstanceNotActiveException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException) RetryableHazelcastException(com.hazelcast.spi.exception.RetryableHazelcastException) CancellationException(java.util.concurrent.CancellationException) TargetNotMemberException(com.hazelcast.spi.exception.TargetNotMemberException) TopologyChangedException(com.hazelcast.jet.core.TopologyChangedException) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) MemberLeftException(com.hazelcast.core.MemberLeftException)

Aggregations

JobTerminateRequestedException (com.hazelcast.jet.impl.exception.JobTerminateRequestedException)8 CancellationException (java.util.concurrent.CancellationException)8 LocalMemberResetException (com.hazelcast.core.LocalMemberResetException)4 TopologyChangedException (com.hazelcast.jet.core.TopologyChangedException)4 Address (com.hazelcast.cluster.Address)3 JetException (com.hazelcast.jet.JetException)3 DAG (com.hazelcast.jet.core.DAG)3 JobStatus (com.hazelcast.jet.core.JobStatus)3 Vertex (com.hazelcast.jet.core.Vertex)3 ExecutionNotFoundException (com.hazelcast.jet.impl.exception.ExecutionNotFoundException)3 Functions.entryKey (com.hazelcast.function.Functions.entryKey)2 MemberInfo (com.hazelcast.internal.cluster.MemberInfo)2 MembersView (com.hazelcast.internal.cluster.impl.MembersView)2 Job (com.hazelcast.jet.Job)2 Util.entry (com.hazelcast.jet.Util.entry)2 Util.idToString (com.hazelcast.jet.Util.idToString)2 NONE (com.hazelcast.jet.config.ProcessingGuarantee.NONE)2 Edge (com.hazelcast.jet.core.Edge)2 Edge.between (com.hazelcast.jet.core.Edge.between)2 COMPLETED (com.hazelcast.jet.core.JobStatus.COMPLETED)2