use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
the class MasterJobContext method scheduleRestart.
/**
 * Puts the job back into {@code NOT_RUNNING} and asks the coordination
 * service to schedule a restart for it. The master context lock must be
 * held by the caller.
 *
 * @throws IllegalStateException if the current job status is anything other
 *         than {@code NOT_RUNNING}, {@code STARTING} or {@code RUNNING}
 */
private void scheduleRestart() {
    mc.assertLockHeld();

    final JobStatus current = mc.jobStatus();
    final boolean restartAllowed = current == NOT_RUNNING || current == STARTING || current == RUNNING;
    if (!restartAllowed) {
        throw new IllegalStateException("Restart scheduled in an unexpected state: " + current);
    }

    mc.setJobStatus(NOT_RUNNING);
    mc.coordinationService().scheduleRestart(mc.jobId());
}
use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
the class MasterJobContext method onStartExecutionComplete.
/**
 * Handles the completion of the start-execution step: records the metrics
 * found in the member responses and finalizes the job with the given error
 * (or with a replacement error, see below).
 *
 * @param error     the failure the execution completed with; may be replaced
 *                  inside this method before the job is finalized
 * @param responses the per-member responses; values that are
 *                  {@code RawJobMetrics} are collected as job metrics
 */
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus currentStatus = mc.jobStatus();
    boolean executionActive = currentStatus == STARTING || currentStatus == RUNNING;
    if (!executionActive) {
        // The job is in a state in which it cannot complete normally; log the
        // incoming error and substitute a generic coordination failure.
        logCannotComplete(error);
        error = new IllegalStateException("Job coordination failed");
    }

    setJobMetrics(responses.stream()
            .map(Entry::getValue)
            .filter(RawJobMetrics.class::isInstance)
            .map(RawJobMetrics.class::cast)
            .collect(Collectors.toList()));

    boolean terminatedWithSnapshot = error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot();
    if (terminatedWithSnapshot) {
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        Throwable terminationError = error;
        mc.snapshotContext()
          .terminalSnapshotFuture()
          .whenCompleteAsync(withTryCatch(logger, (r, e) -> finalizeJob(terminationError)));
        return;
    }

    // If the StartExecutionOperation didn't find the execution, it means that it was cancelled.
    // When the master itself requested the termination, convert the exception to
    // JobTerminateRequestedException. The cancellation can also happen if some participant left
    // and the target cancelled the execution locally in JobExecutionService.onMemberRemoved();
    // that (and possibly other) exceptions are kept as they are and the execution completes
    // with failure.
    if (error instanceof ExecutionNotFoundException && requestedTerminationMode != null) {
        error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
    }
    finalizeJob(error);
}
use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
the class MasterJobContext method requestTermination.
/**
* Requests termination of the job in the given mode. The call asserts that
* it runs on the coordinator thread.
* <p>
* Returns a tuple of:<ol>
* <li>a future that will be completed when the execution completes (or
* a completed future, if execution is not RUNNING or STARTING)
* <li>a string with a message why this call did nothing or null, if
* this call actually initiated the termination
* </ol>
*
* @param mode the requested termination mode; its terminal snapshot is
* dropped when the job's processing guarantee is NONE and the mode
* is not CANCEL_GRACEFUL
* @param allowWhileExportingSnapshot if false and jobStatus is
* SUSPENDED_EXPORTING_SNAPSHOT, termination will be rejected
*/
@Nonnull
Tuple2<CompletableFuture<Void>, String> requestTermination(TerminationMode mode, @SuppressWarnings("SameParameterValue") boolean allowWhileExportingSnapshot) {
mc.coordinationService().assertOnCoordinatorThread();
// Drop the terminal snapshot from the requested mode if the job has no
// processing guarantee (no snapshots are taken), except for graceful
// cancellation, which is allowed even if not snapshotting.
if (mc.jobConfig().getProcessingGuarantee() == NONE && mode != CANCEL_GRACEFUL) {
mode = mode.withoutTerminalSnapshot();
}
JobStatus localStatus;
Tuple2<CompletableFuture<Void>, String> result;
// The status checks and the update of requestedTerminationMode are done
// under the master context lock; the finally block releases it on the
// early returns as well.
mc.lock();
try {
localStatus = mc.jobStatus();
if (localStatus == SUSPENDED_EXPORTING_SNAPSHOT && !allowWhileExportingSnapshot) {
return tuple2(executionCompletionFuture, "Cannot cancel when job status is " + SUSPENDED_EXPORTING_SNAPSHOT);
}
if (localStatus == SUSPENDED && mode != CANCEL_FORCEFUL) {
// if suspended, we can only cancel the job. Other terminations have no effect.
return tuple2(executionCompletionFuture, "Job is " + SUSPENDED);
}
if (requestedTerminationMode != null) {
// don't report the cancellation of a cancelled job as an error
String message = requestedTerminationMode == CANCEL_FORCEFUL && mode == CANCEL_FORCEFUL ? null : "Job is already terminating in mode: " + requestedTerminationMode.name();
return tuple2(executionCompletionFuture, message);
}
requestedTerminationMode = mode;
// handle cancellation of a suspended job
if (localStatus == SUSPENDED || localStatus == SUSPENDED_EXPORTING_SNAPSHOT) {
mc.setJobStatus(FAILED);
setFinalResult(new CancellationException());
}
if (mode.isWithTerminalSnapshot()) {
mc.snapshotContext().enqueueSnapshot(null, true, null);
}
result = tuple2(executionCompletionFuture, null);
} finally {
mc.unlock();
}
// The remaining work is done after the lock has been released, based on
// the status observed while it was held.
if (localStatus == SUSPENDED || localStatus == SUSPENDED_EXPORTING_SNAPSHOT) {
try {
// The job was suspended: complete it with a CancellationException and
// block until the future returned by completeJob is done.
mc.coordinationService().completeJob(mc, new CancellationException(), System.currentTimeMillis()).get();
} catch (Exception e) {
throw rethrow(e);
}
} else {
// handleTermination is called only if the job was RUNNING or STARTING; for
// any other status the mode recorded above is the only effect of this call.
if (localStatus == RUNNING || localStatus == STARTING) {
handleTermination(mode);
}
}
return result;
}
use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
the class MasterJobContext method onInitStepCompleted.
// Callback invoked once all InitOperation invocations are done. The work is
// handed over to the coordinator thread: if no member reported an error and the
// job is still STARTING, the start-execution step is invoked; otherwise the
// execution invocations are cancelled and the start step is completed with a failure.
private void onInitStepCompleted(Collection<Map.Entry<MemberInfo, Object>> responses) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        Throwable initError = getErrorFromResponses("Init", responses);
        JobStatus currentStatus = mc.jobStatus();

        if (initError == null && currentStatus == STARTING) {
            invokeStartExecution();
            return;
        }

        cancelExecutionInvocations(mc.jobId(), mc.executionId(), null, () -> {
            // The fallback exception is created only when the callback runs.
            Throwable failure;
            if (initError != null) {
                failure = initError;
            } else {
                failure = new IllegalStateException("Cannot execute " + mc.jobIdString() + ": status is " + currentStatus);
            }
            onStartExecutionComplete(failure, emptyList());
        });
    });
}
use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
the class WatermarkCoalescer_TerminalSnapshotTest method test.
@Test
public void test() throws Exception {
    /*
    This test covers the issue that after a terminal barrier is processed, no other work should
    be done by the ProcessorTasklet or CIES after that (except for emitting the DONE_ITEM).
    Also, if at-least-once guarantee is used, the tasklet should not continue to drain
    the queue that had the barrier while waiting for other barriers.

    Specifically, the issue was that in at-least-once mode the DONE_ITEM was processed
    after the terminal barrier while waiting for the barrier on other queues/edges. The
    DONE_ITEM could have caused a WM being emitted after the barrier, which is ok
    for the at-least-once mode, but the terminal snapshot should behave as if exactly-once
    mode was used.

    This test ensures that we're waiting for a WM in coalescer (by having a stream skew)
    and then does a graceful restart in at-least-once mode and checks that the results are
    correct.
    */
    String key0 = generateKeyForPartition(instance, 0);
    String key1 = generateKeyForPartition(instance, 1);

    Pipeline p = Pipeline.create();
    p.readFrom(Sources.mapJournal(sourceMap, JournalInitialPosition.START_FROM_OLDEST))
     .withTimestamps(Map.Entry::getValue, 0)
     .setLocalParallelism(PARTITION_COUNT)
     .groupingKey(Map.Entry::getKey)
     .window(WindowDefinition.sliding(1, 1))
     .aggregate(AggregateOperations.counting())
     .setLocalParallelism(PARTITION_COUNT)
     .writeTo(SinkBuilder.sinkBuilder("throwing", ctx -> "")
             .<KeyedWindowResult<String, Long>>receiveFn((w, kwr) -> {
                 // The sink fails the job as soon as a window result differs from COUNT.
                 if (kwr.result() != COUNT) {
                     throw new RuntimeException("Received unexpected item " + kwr + ", expected count is " + COUNT);
                 }
             })
             .build());

    Job job = instance.getJet().newJob(p, new JobConfig().setProcessingGuarantee(ProcessingGuarantee.AT_LEAST_ONCE));

    // Wildcard-typed futures instead of the raw Future type: the results are never read.
    List<Future<?>> futures = new ArrayList<>();
    // Background task that keeps restarting the job until it is cancelled below.
    futures.add(spawn(() -> {
        for (; ; ) {
            assertJobStatusEventually(job, JobStatus.RUNNING);
            System.out.println("============RESTARTING JOB=========");
            job.restart();
            Thread.sleep(2000);
        }
    }));
    // one producer is twice as fast as the other, to cause waiting for WM while doing snapshot
    futures.add(spawn(() -> producer(key0, 1)));
    futures.add(spawn(() -> producer(key1, 2)));

    sleepSeconds(20);

    for (Future<?> f : futures) {
        f.cancel(true);
        // check that the future was cancelled and didn't fail with another error
        try {
            f.get();
            fail("Exception was expected");
        } catch (CancellationException expected) {
            // Intentionally empty: cancellation is the only acceptable outcome here.
        }
    }

    // check that the job did not fail, complete or get suspended
    JobStatus status = job.getStatus();
    assertTrue("job should not be completed, status=" + status, status != FAILED && status != COMPLETED && status != SUSPENDED);
}
Aggregations