Use of com.hazelcast.jet.core.JobStatus.FAILED in project hazelcast-jet by hazelcast.
From the class MasterContext, method invokeCompleteExecution:
private void invokeCompleteExecution(Throwable error) {
    JobStatus status = jobStatus();
    Throwable finalError;
    if (status == STARTING || status == RESTARTING || status == RUNNING) {
        logger.fine("Completing " + jobIdString());
        finalError = error;
    } else {
        if (error != null) {
            logger.severe("Cannot properly complete failed " + jobIdString() + ": status is " + status, error);
        } else {
            logger.severe("Cannot properly complete " + jobIdString() + ": status is " + status);
        }
        finalError = new IllegalStateException("Job coordination failed.");
    }
    Function<ExecutionPlan, Operation> operationCtor =
            plan -> new CompleteExecutionOperation(executionId, finalError);
    invoke(operationCtor, responses -> finalizeJob(error), null);
}
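Here invokeCompleteExecution hands invoke() a constructor that turns each member's ExecutionPlan into an operation, plus a callback to run once all members have replied. A minimal, self-contained sketch of that fan-out pattern (generic names only, not Jet's actual invoke() implementation):

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.function.Consumer;
import java.util.function.Function;
import java.util.stream.Collectors;

final class FanOutSketch {
    // Build one async operation per execution plan, start them all, and hand
    // the collected responses to the callback exactly once, after every
    // operation has finished (failures are mapped to null responses here).
    static <P, R> void invokeOnAll(List<P> plans,
                                   Function<P, CompletableFuture<R>> operationCtor,
                                   Consumer<List<R>> completionCallback) {
        List<CompletableFuture<R>> futures = plans.stream()
                .map(operationCtor)
                .map(f -> f.exceptionally(t -> null))
                .collect(Collectors.toList());
        CompletableFuture.allOf(futures.toArray(new CompletableFuture[0]))
                .thenRun(() -> completionCallback.accept(futures.stream()
                        .map(CompletableFuture::join)
                        .collect(Collectors.toList())));
    }
}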
Use of com.hazelcast.jet.core.JobStatus.FAILED in project hazelcast-jet by hazelcast.
From the class MasterContext, method tryStartJob:
/**
 * Starts execution of the job if it is not already completed, cancelled or failed.
 * If the job is already cancelled, the job completion procedure is triggered.
 * If the job quorum is not satisfied, job restart is rescheduled.
 * If there was a membership change and the partition table is not completely
 * fixed yet, job restart is rescheduled.
 */
void tryStartJob(Function<Long, Long> executionIdSupplier) {
    if (!setJobStatusToStarting()) {
        return;
    }
    if (scheduleRestartIfQuorumAbsent() || scheduleRestartIfClusterIsNotSafe()) {
        return;
    }
    DAG dag;
    try {
        dag = deserializeDAG();
    } catch (Exception e) {
        logger.warning("DAG deserialization failed", e);
        finalizeJob(e);
        return;
    }
    // save a copy of the vertex list, because it is going to change
    vertices = new HashSet<>();
    dag.iterator().forEachRemaining(vertices::add);
    executionId = executionIdSupplier.apply(jobId);
    // id of the last started snapshot, whether or not it completed; the next snapshot started must have a greater id
    long lastSnapshotId = NO_SNAPSHOT;
    if (isSnapshottingEnabled()) {
        Long snapshotIdToRestore = snapshotRepository.latestCompleteSnapshot(jobId);
        snapshotRepository.deleteAllSnapshotsExceptOne(jobId, snapshotIdToRestore);
        Long lastStartedSnapshot = snapshotRepository.latestStartedSnapshot(jobId);
        if (snapshotIdToRestore != null) {
            logger.info("State of " + jobIdString() + " will be restored from snapshot " + snapshotIdToRestore);
            rewriteDagWithSnapshotRestore(dag, snapshotIdToRestore);
        } else {
            logger.info("No previous snapshot for " + jobIdString() + " found.");
        }
        if (lastStartedSnapshot != null) {
            lastSnapshotId = lastStartedSnapshot;
        }
    }
    MembersView membersView = getMembersView();
    ClassLoader previousCL = swapContextClassLoader(coordinationService.getClassLoader(jobId));
    try {
        int defaultLocalParallelism =
                getJetInstance(nodeEngine).getConfig().getInstanceConfig().getCooperativeThreadCount();
        logger.info("Start executing " + jobIdString() + ", status " + jobStatus()
                + "\n" + dag.toString(defaultLocalParallelism));
        logger.fine("Building execution plan for " + jobIdString());
        executionPlanMap = createExecutionPlans(nodeEngine, membersView, dag, getJobConfig(), lastSnapshotId);
    } catch (Exception e) {
        logger.severe("Exception creating execution plan for " + jobIdString(), e);
        finalizeJob(e);
        return;
    } finally {
        Thread.currentThread().setContextClassLoader(previousCL);
    }
    logger.fine("Built execution plans for " + jobIdString());
    Set<MemberInfo> participants = executionPlanMap.keySet();
    Function<ExecutionPlan, Operation> operationCtor = plan -> new InitExecutionOperation(
            jobId, executionId, membersView.getVersion(), participants,
            nodeEngine.getSerializationService().toData(plan));
    invoke(operationCtor, this::onInitStepCompleted, null);
}
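This method builds the execution plans under the job's class loader, swapping it in via swapContextClassLoader and restoring it in the finally block; the newer MasterJobContext version below does the same through Util.doWithClassLoader. A minimal sketch of that swap-and-restore pattern, with assumed names rather than the actual Jet helpers:

import java.util.function.Supplier;

final class ClassLoaderSwapSketch {
    // Run the task with the given context class loader and always restore
    // the previous loader afterwards, even if the task throws.
    static <T> T doWithClassLoader(ClassLoader cl, Supplier<T> task) {
        Thread thread = Thread.currentThread();
        ClassLoader previous = thread.getContextClassLoader();
        thread.setContextClassLoader(cl);
        try {
            return task.get();
        } finally {
            thread.setContextClassLoader(previous);
        }
    }
}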
Use of com.hazelcast.jet.core.JobStatus.FAILED in project hazelcast by hazelcast.
From the class MasterJobContext, method tryStartJob:
/**
 * Starts the execution of the job if it is not already completed,
 * cancelled or failed.
 * <p>
 * If the job is already cancelled, triggers the job completion procedure.
 * <p>
 * If the job quorum is not satisfied, reschedules the job restart.
 * <p>
 * If there was a membership change and the partition table is not completely
 * fixed yet, reschedules the job restart.
 */
void tryStartJob(Supplier<Long> executionIdSupplier) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        executionStartTime = System.currentTimeMillis();
        try {
            JobExecutionRecord jobExecRec = mc.jobExecutionRecord();
            jobExecRec.markExecuted();
            Tuple2<DAG, ClassLoader> dagAndClassloader = resolveDagAndCL(executionIdSupplier);
            if (dagAndClassloader == null) {
                return;
            }
            DAG dag = dagAndClassloader.f0();
            assert dag != null;
            ClassLoader classLoader = dagAndClassloader.f1();
            // must call this before rewriteDagWithSnapshotRestore()
            String dotRepresentation = dag.toDotString(defaultParallelism, defaultQueueSize);
            long snapshotId = jobExecRec.snapshotId();
            String snapshotName = mc.jobConfig().getInitialSnapshotName();
            String mapName =
                    snapshotId >= 0 ? jobExecRec.successfulSnapshotDataMapName(mc.jobId())
                    : snapshotName != null ? EXPORTED_SNAPSHOTS_PREFIX + snapshotName
                    : null;
            if (mapName != null) {
                rewriteDagWithSnapshotRestore(dag, snapshotId, mapName, snapshotName);
            } else {
                logger.info("Didn't find any snapshot to restore for " + mc.jobIdString());
            }
            MembersView membersView = Util.getMembersView(mc.nodeEngine());
            logger.info("Start executing " + mc.jobIdString()
                    + ", execution graph in DOT format:\n" + dotRepresentation
                    + "\nHINT: You can use graphviz or http://viz-js.com to visualize the printed graph.");
            logger.fine("Building execution plan for " + mc.jobIdString());
            Util.doWithClassLoader(classLoader, () -> mc.setExecutionPlanMap(createExecutionPlans(
                    mc.nodeEngine(), membersView.getMembers(), dag, mc.jobId(), mc.executionId(),
                    mc.jobConfig(), jobExecRec.ongoingSnapshotId(), false, mc.jobRecord().getSubject())));
            logger.fine("Built execution plans for " + mc.jobIdString());
            Set<MemberInfo> participants = mc.executionPlanMap().keySet();
            Version coordinatorVersion = mc.nodeEngine().getLocalMember().getVersion().asVersion();
            Function<ExecutionPlan, Operation> operationCtor = plan -> new InitExecutionOperation(
                    mc.jobId(), mc.executionId(), membersView.getVersion(), coordinatorVersion,
                    participants, mc.nodeEngine().getSerializationService().toData(plan), false);
            mc.invokeOnParticipants(operationCtor, this::onInitStepCompleted, null, false);
        } catch (Throwable e) {
            finalizeJob(e);
        }
    });
}
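Note how this newer implementation funnels the whole start sequence through submitToCoordinatorThread, so coordination actions for a job never race each other. A minimal sketch of that serialization pattern, using a plain single-threaded executor (illustrative only, not the actual Hazelcast internals):

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

final class CoordinatorThreadSketch {
    // A single-threaded executor serializes all coordination actions, so
    // work submitted from tryStartJob(), finalizeJob() etc. cannot interleave.
    private final ExecutorService coordinatorThread = Executors.newSingleThreadExecutor();

    CompletableFuture<Void> submitToCoordinatorThread(Runnable action) {
        return CompletableFuture.runAsync(action, coordinatorThread);
    }
}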
Use of com.hazelcast.jet.core.JobStatus.FAILED in project hazelcast by hazelcast.
From the class MasterJobContext, method onStartExecutionComplete:
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus status = mc.jobStatus();
    if (status != STARTING && status != RUNNING) {
        logCannotComplete(error);
        error = new IllegalStateException("Job coordination failed");
    }
    setJobMetrics(responses.stream()
            .filter(en -> en.getValue() instanceof RawJobMetrics)
            .map(e1 -> (RawJobMetrics) e1.getValue())
            .collect(Collectors.toList()));
    if (error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot()) {
        Throwable finalError = error;
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        mc.snapshotContext().terminalSnapshotFuture()
          .whenCompleteAsync(withTryCatch(logger, (r, e) -> finalizeJob(finalError)));
    } else {
        if (error instanceof ExecutionNotFoundException) {
            // If the StartExecutionOperation didn't find the execution, it means it was cancelled.
            if (requestedTerminationMode != null) {
                // The cancellation may have been initiated by the master; if so, convert the
                // exception to JobTerminateRequestedException.
                error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
            }
            // The cancellation can also happen if some participant left and the target cancelled
            // the execution locally in JobExecutionService.onMemberRemoved(). We keep this (and
            // possibly other) exceptions as they are and let the execution complete with failure.
        }
        finalizeJob(error);
    }
}
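The whenCompleteAsync callback above is wrapped in withTryCatch so that an exception thrown inside the callback is logged instead of disappearing into the completed future, which nobody inspects here. A hedged sketch of what such a guard can look like (the real Jet utility uses its own ILogger type and may differ):

import java.util.function.BiConsumer;
import java.util.logging.Level;
import java.util.logging.Logger;

final class CallbackGuard {
    // Wrap a whenComplete-style callback so that any throwable it raises
    // is logged rather than silently captured by the returned future.
    static <T, U> BiConsumer<T, U> withTryCatch(Logger logger, BiConsumer<T, U> action) {
        return (t, u) -> {
            try {
                action.accept(t, u);
            } catch (Throwable e) {
                logger.log(Level.SEVERE, "Exception in completion callback", e);
            }
        };
    }
}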
Use of com.hazelcast.jet.core.JobStatus.FAILED in project hazelcast by hazelcast.
From the class WatermarkCoalescer_TerminalSnapshotTest, method test:
@Test
public void test() throws Exception {
    /*
       This test covers the issue that after a terminal barrier is processed, no other work
       should be done by the ProcessorTasklet or CIES (except for emitting the DONE_ITEM).
       Also, if the at-least-once guarantee is used, the tasklet should not continue to
       drain the queue that had the barrier while waiting for the barriers on the other
       queues.

       Specifically, the issue was that in at-least-once mode the DONE_ITEM was processed
       after the terminal barrier while waiting for the barrier on the other queues/edges.
       The DONE_ITEM could have caused a WM to be emitted after the barrier, which is OK
       for at-least-once mode, but a terminal snapshot should behave as if the exactly-once
       mode were used.

       The test ensures that the coalescer is waiting for a WM (by introducing a stream
       skew), then does a graceful restart in at-least-once mode and checks that the
       results are correct.
     */
    String key0 = generateKeyForPartition(instance, 0);
    String key1 = generateKeyForPartition(instance, 1);
    Pipeline p = Pipeline.create();
    p.readFrom(Sources.mapJournal(sourceMap, JournalInitialPosition.START_FROM_OLDEST))
     .withTimestamps(Map.Entry::getValue, 0)
     .setLocalParallelism(PARTITION_COUNT)
     .groupingKey(Map.Entry::getKey)
     .window(WindowDefinition.sliding(1, 1))
     .aggregate(AggregateOperations.counting())
     .setLocalParallelism(PARTITION_COUNT)
     .writeTo(SinkBuilder.sinkBuilder("throwing", ctx -> "")
             .<KeyedWindowResult<String, Long>>receiveFn((w, kwr) -> {
                 if (kwr.result() != COUNT) {
                     throw new RuntimeException("Received unexpected item " + kwr + ", expected count is " + COUNT);
                 }
             }).build());
    Job job = instance.getJet().newJob(p, new JobConfig().setProcessingGuarantee(ProcessingGuarantee.AT_LEAST_ONCE));
    List<Future> futures = new ArrayList<>();
    futures.add(spawn(() -> {
        for (;;) {
            assertJobStatusEventually(job, JobStatus.RUNNING);
            System.out.println("============RESTARTING JOB=========");
            job.restart();
            Thread.sleep(2000);
        }
    }));
    // one producer is twice as fast as the other, to cause waiting for a WM while doing the snapshot
    futures.add(spawn(() -> producer(key0, 1)));
    futures.add(spawn(() -> producer(key1, 2)));
    sleepSeconds(20);
    for (Future f : futures) {
        f.cancel(true);
        // check that the future was cancelled and didn't fail with another error
        try {
            f.get();
            fail("Exception was expected");
        } catch (CancellationException expected) {
        }
    }
    // check that the job is still running
    JobStatus status = job.getStatus();
    assertTrue("job should not be completed, status=" + status,
            status != FAILED && status != COMPLETED && status != SUSPENDED);
}
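The producer(...) helper referenced above is not shown on this page. A plausible sketch, assuming it keeps publishing map-journal events for its key while advancing the event time (the entry value) at the given speed; the name, signature, and pacing are assumptions, not the actual test code:

// Hypothetical reconstruction of the producer helper: each put of the key
// into sourceMap becomes one journal event, and the event time advances
// 'speed' units per iteration, so two producers with different speeds
// create the stream skew the test relies on.
private void producer(String key, int speed) throws InterruptedException {
    for (long eventTime = 0; ; eventTime += speed) {
        for (int i = 0; i < COUNT; i++) {
            sourceMap.put(key, eventTime); // one journal event per put
        }
        Thread.sleep(10); // pacing; the spawned future is cancelled via interrupt
    }
}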