use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.
the class MasterJobContext method finalizeJob.
/**
 * Finalizes the current execution of the job and decides what happens to the job next.
 * <p>
 * The whole body is submitted to the coordinator thread. While holding the master
 * context lock it picks exactly one outcome: immediate restart, scheduled restart,
 * suspension, or a terminal status (COMPLETED / FAILED). It also records a follow-up
 * action in {@code nonSynchronizedAction}. That action is run only after the lock has
 * been released and {@code executionCompletionFuture} has been completed.
 * <p>
 * If the job is already COMPLETED or FAILED, the call is only logged and nothing else
 * happens (the future is not completed and no follow-up action runs).
 *
 * @param failure the failure the execution ended with; may be {@code null}
 */
void finalizeJob(@Nullable Throwable failure) {
mc.coordinationService().submitToCoordinatorThread(() -> {
// assigned exactly once in every branch below; executed after the lock is released
final Runnable nonSynchronizedAction;
mc.lock();
try {
JobStatus status = mc.jobStatus();
// a job already in a terminal status is not finalized again
if (status == COMPLETED || status == FAILED) {
logIgnoredCompletion(failure, status);
return;
}
completeVertices(failure);
mc.getJetServiceBackend().getJobClassLoaderService().tryRemoveClassloadersForJob(mc.jobId(), COORDINATOR);
// only a JobTerminateRequestedException carries an action-after-terminate; null for anything else
ActionAfterTerminate terminationModeAction = failure instanceof JobTerminateRequestedException ? ((JobTerminateRequestedException) failure).mode().actionAfterTerminate() : null;
mc.snapshotContext().onExecutionTerminated();
// if restart was requested, restart immediately
if (terminationModeAction == RESTART) {
mc.setJobStatus(NOT_RUNNING);
nonSynchronizedAction = () -> mc.coordinationService().restartJob(mc.jobId());
} else if (!isCancelled() && isRestartableException(failure) && mc.jobConfig().isAutoScaling()) {
// if restart is due to a failure, schedule a restart after a delay
scheduleRestart();
nonSynchronizedAction = NO_OP;
// Suspend when suspension was requested explicitly, OR (note: && binds tighter than ||)
// when a restartable failure hits a non-cancelled job that has auto-scaling disabled
// and a processing guarantee other than NONE.
} else if (terminationModeAction == SUSPEND || isRestartableException(failure) && !isCancelled() && !mc.jobConfig().isAutoScaling() && mc.jobConfig().getProcessingGuarantee() != NONE) {
mc.setJobStatus(SUSPENDED);
// no suspension cause text is recorded in this branch
mc.jobExecutionRecord().setSuspended(null);
nonSynchronizedAction = () -> mc.writeJobExecutionRecord(false);
// Any other failure of a non-cancelled job suspends the job if suspend-on-failure is
// configured; the failure's stack trace is passed as the suspension cause.
} else if (failure != null && !isCancelled() && mc.jobConfig().isSuspendOnFailure()) {
mc.setJobStatus(SUSPENDED);
mc.jobExecutionRecord().setSuspended("Execution failure:\n" + ExceptionUtil.stackTraceToString(failure));
nonSynchronizedAction = () -> mc.writeJobExecutionRecord(false);
} else {
// terminal outcome: the status becomes COMPLETED or FAILED depending on logExecutionSummary's result
long completionTime = System.currentTimeMillis();
boolean isSuccess = logExecutionSummary(failure, completionTime);
mc.setJobStatus(isSuccess ? COMPLETED : FAILED);
if (failure instanceof LocalMemberResetException) {
// completeJob is not called in this branch; the final result is a CancellationException
logger.fine("Cancelling job " + mc.jobIdString() + " locally: member (local or remote) reset. " + "We don't delete job metadata: job will restart on majority cluster");
setFinalResult(new CancellationException());
} else {
// the final result is set only when completeJob succeeds; a failure of completeJob is only logged
mc.coordinationService().completeJob(mc, failure, completionTime).whenComplete(withTryCatch(logger, (r, f) -> {
if (f != null) {
logger.warning("Completion of " + mc.jobIdString() + " failed", f);
} else {
setFinalResult(failure);
}
}));
}
nonSynchronizedAction = NO_OP;
}
// reset the state for the next execution
requestedTerminationMode = null;
executionFailureCallback = null;
} finally {
mc.unlock();
}
// outside the lock: signal that the execution completed, then run the deferred follow-up action
executionCompletionFuture.complete(null);
nonSynchronizedAction.run();
});
}
use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.
the class MasterJobContext method getErrorFromResponses.
/**
 * Derives the overall outcome of the operation {@code opName} from the responses of the
 * individual members. The checks are made in this order:
 * <ol>
 * <li>If the job is cancelled, a {@link CancellationException} is returned.
 * <li>If the number of responses that are not a {@code Throwable} equals the size of
 *     the execution plan map, there is no failure and {@code null} is returned.
 * <li>If every failure is a {@code TerminatedWithSnapshotException}, the execution
 *     stopped due to a requested termination: a {@link CancellationException} is
 *     returned for CANCEL_GRACEFUL, otherwise a
 *     {@link JobTerminateRequestedException} with the requested mode.
 * <li>If some failure is neither a {@code CancellationException}, a
 *     {@code TerminatedWithSnapshotException} nor a topology exception, the first such
 *     failure is returned unchanged.
 * <li>Otherwise a {@code TopologyChangedException} listing the member causes is
 *     returned.
 * </ol>
 */
private Throwable getErrorFromResponses(String opName, Collection<Map.Entry<MemberInfo, Object>> responses) {
    if (isCancelled()) {
        logger.fine(mc.jobIdString() + " to be cancelled after " + opName);
        return new CancellationException();
    }

    // key each response by member address and split: true = Throwable, false = anything else
    Map<Boolean, List<Entry<Address, Object>>> byIsFailure = responses.stream()
            .map(response -> entry(response.getKey().getAddress(), response.getValue()))
            .collect(partitioningBy(addressed -> addressed.getValue() instanceof Throwable));

    List<Entry<Address, Object>> successes = byIsFailure.getOrDefault(false, emptyList());
    if (successes.size() == mc.executionPlanMap().size()) {
        logger.fine(opName + " of " + mc.jobIdString() + " was successful");
        return null;
    }

    List<Entry<Address, Object>> failures = byIsFailure.getOrDefault(true, emptyList());
    if (!failures.isEmpty()) {
        logger.fine(opName + " of " + mc.jobIdString() + " has failures: " + failures);
    }

    // Only when every failure is a TerminatedWithSnapshotException is the result mapped
    // to the requested termination mode; with any other failure present, fall through.
    boolean onlySnapshotTerminations = failures.stream()
            .map(Entry::getValue)
            .allMatch(TerminatedWithSnapshotException.class::isInstance);
    if (onlySnapshotTerminations) {
        assert opName.equals("Execution") : "opName is '" + opName + "', expected 'Execution'";
        logger.fine(opName + " of " + mc.jobIdString() + " terminated after a terminal snapshot");
        TerminationMode mode = requestedTerminationMode;
        assert mode != null && mode.isWithTerminalSnapshot() : "mode=" + mode;
        if (mode == CANCEL_GRACEFUL) {
            return new CancellationException();
        }
        return new JobTerminateRequestedException(mode);
    }

    // true = cancellation / terminal-snapshot / topology failure, false = any other failure
    Map<Boolean, List<Entry<Address, Object>>> byIsTopologyLike = failures.stream()
            .collect(Collectors.partitioningBy(failure -> {
                Object cause = failure.getValue();
                return cause instanceof CancellationException
                        || cause instanceof TerminatedWithSnapshotException
                        || isTopologyException((Throwable) cause);
            }));
    List<Entry<Address, Object>> topologyLike = byIsTopologyLike.getOrDefault(true, emptyList());
    List<Entry<Address, Object>> others = byIsTopologyLike.getOrDefault(false, emptyList());

    if (others.isEmpty()) {
        // nothing but topology-like failures: report them collectively
        return new TopologyChangedException("Causes from members: " + topologyLike);
    }
    // any other failure takes precedence; the first one is reported
    return (Throwable) others.get(0).getValue();
}
use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.
the class ExecutionLifecycleTest method assertPmsClosedWithError.
/**
 * Asserts that {@code MockPMS} recorded both an init and a close call, then checks the
 * error it received on close with {@code assertOneOfExceptionsInCauses} against the
 * accepted candidates: {@code MOCK_ERROR}, a {@link CancellationException} and a
 * {@code JobTerminateRequestedException} for {@code CANCEL_FORCEFUL}.
 */
private void assertPmsClosedWithError() {
    assertTrue("init not called", MockPMS.initCalled.get());
    assertTrue("close not called", MockPMS.closeCalled.get());

    Throwable closeError = MockPMS.receivedCloseError.get();
    assertOneOfExceptionsInCauses(
            closeError,
            MOCK_ERROR,
            new CancellationException(),
            new JobTerminateRequestedException(CANCEL_FORCEFUL));
}
use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.
the class ExecutionLifecycleTest method assertPsClosedWithError.
/**
 * Asserts that {@code MockPS} was initialised {@code MEMBER_COUNT} times, that the
 * number of close calls and of received close errors lies within the expected bounds,
 * and that every received close error passes {@code assertOneOfExceptionsInCauses}
 * for {@code MOCK_ERROR}, a {@link CancellationException} or a
 * {@code JobTerminateRequestedException} for {@code CANCEL_FORCEFUL}.
 */
private void assertPsClosedWithError() {
    assertEquals(MEMBER_COUNT, MockPS.initCount.get());

    // with light jobs the init can be called on not all the members - the execution on one member
    // can be cancelled due to the failure on the other member before it was initialized.
    final int lowerBound;
    if (useLightJob) {
        lowerBound = 1;
    } else {
        lowerBound = MEMBER_COUNT;
    }
    assertBetween("close count", MockPS.closeCount.get(), lowerBound, MEMBER_COUNT);
    assertBetween("received close errors", MockPS.receivedCloseErrors.size(), lowerBound, MEMBER_COUNT);

    // the size is re-read on every pass, exactly as an indexed for-loop would do
    int index = 0;
    while (index < MockPS.receivedCloseErrors.size()) {
        Throwable closeError = MockPS.receivedCloseErrors.get(index);
        assertOneOfExceptionsInCauses(
                closeError,
                MOCK_ERROR,
                new CancellationException(),
                new JobTerminateRequestedException(CANCEL_FORCEFUL));
        index++;
    }
}
use of com.hazelcast.jet.impl.exception.JobTerminateRequestedException in project hazelcast by hazelcast.
the class JobExecutionService method terminateExecution.
/**
 * Terminates the local execution registered under {@code executionId}, after checking
 * that the caller is allowed to request it.
 * <p>
 * If no execution context is registered for {@code executionId}, the request is
 * silently ignored. For a non-light job the caller must be the current master. If the
 * execution context has a coordinator, the caller must be that coordinator.
 *
 * @param jobId         id of the job, used only in the error message
 * @param executionId   id of the execution to terminate
 * @param callerAddress address of the member that requested the termination
 * @param mode          requested termination mode; when {@code null}, the execution is
 *                      terminated with a {@link CancellationException} as the cause,
 *                      otherwise with a {@code JobTerminateRequestedException} for the mode
 * @throws IllegalStateException if the job is not a light job and the caller is not the
 *                               master, or if the execution's coordinator is known and
 *                               differs from the caller
 */
public void terminateExecution(long jobId, long executionId, Address callerAddress, TerminationMode mode) {
    failIfNotRunning();

    ExecutionContext executionContext = executionContexts.get(executionId);
    if (executionContext == null) {
        // No execution context is registered under this id, so there is nothing to
        // terminate here and the request is ignored. For the light-job case the
        // CheckLightJobsOperation is relied upon instead.
        return;
    }

    if (!executionContext.isLightJob()) {
        Address masterAddress = nodeEngine.getMasterAddress();
        if (!callerAddress.equals(masterAddress)) {
            // NOTE(review): presumably re-checked so that a stopped service is reported
            // in preference to the IllegalStateException below - confirm.
            failIfNotRunning();
            // The operation name belongs in the quoted slot and the job/execution id
            // after "for"; previously the id was substituted into the quoted slot.
            throw new IllegalStateException(String.format(
                    "Caller %s cannot do 'terminateExecution' for %s: it is not the master, the master is %s",
                    callerAddress, jobIdAndExecutionId(jobId, executionId), masterAddress));
        }
    }

    Address coordinator = executionContext.coordinator();
    if (coordinator == null) {
        // It can't happen for normal jobs
        assert executionContext.isLightJob() : "null coordinator for non-light job";
    } else if (!coordinator.equals(callerAddress)) {
        throw new IllegalStateException(String.format("%s, originally from coordinator %s, cannot do 'terminateExecution' by coordinator %s and execution %s", executionContext.jobNameAndExecutionId(), coordinator, callerAddress, idToString(executionId)));
    }

    // a null mode is treated as a plain cancellation
    Exception cause = mode == null ? new CancellationException() : new JobTerminateRequestedException(mode);
    terminateExecution0(executionContext, mode, cause);
}
Aggregations