Use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
Class JobCoordinationService, method getJobSummary.
/**
 * Builds a {@link JobSummary} for the given job record.
 * <p>
 * If a {@code MasterContext} is registered for the job, the execution ID and
 * the status are taken from it. Otherwise the execution ID is {@code 0} and
 * the status is derived from the stored {@code JobExecutionRecord}:
 * {@code SUSPENDED} if such a record exists and is marked as suspended,
 * {@code NOT_RUNNING} in all other cases.
 *
 * @param record the job record to summarize
 * @return the summary of the job
 */
private JobSummary getJobSummary(JobRecord record) {
    MasterContext masterContext = masterContexts.get(record.getJobId());

    final long executionId;
    final JobStatus jobStatus;
    if (masterContext != null) {
        executionId = masterContext.executionId();
        jobStatus = masterContext.jobStatus();
    } else {
        // no master context: fall back to what the job repository knows
        executionId = 0;
        JobExecutionRecord storedRecord = jobRepository.getJobExecutionRecord(record.getJobId());
        boolean suspended = storedRecord != null && storedRecord.isSuspended();
        jobStatus = suspended ? JobStatus.SUSPENDED : JobStatus.NOT_RUNNING;
    }

    return new JobSummary(
            false,
            record.getJobId(),
            executionId,
            record.getJobNameOrId(),
            jobStatus,
            record.getCreationTime(),
            0,
            null,
            null);
}
Use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
Class MasterJobContext, method maybeScaleUp.
/**
 * Requests a graceful restart of the job if it is running on fewer members
 * than there are data members with assigned partitions.
 * <p>
 * The return value tells the caller whether the check is finished:
 * <ul>
 *     <li>{@code true} - nothing more to do: auto-scaling is disabled, the
 *     job is not running, it already runs on all members, or a restart was
 *     successfully requested;
 *     <li>{@code false} - the job does not run on all members, but the
 *     restart could not be requested (the status was not {@code RUNNING} or
 *     the termination request was rejected); the caller should schedule
 *     this method to be called again later.
 * </ul>
 * Must be called on the coordinator thread.
 *
 * @param dataMembersWithPartitionsCount current number of data members
 *                                       with assigned partitions
 */
boolean maybeScaleUp(int dataMembersWithPartitionsCount) {
    mc.coordinationService().assertOnCoordinatorThread();

    if (!mc.jobConfig().isAutoScaling()) {
        return true;
    }

    // A missing execution plan map means the job is not running. An equal
    // size means the job already runs on every data member. In both cases
    // there is nothing to scale up.
    boolean notRunning = mc.executionPlanMap() == null;
    if (notRunning || mc.executionPlanMap().size() == dataMembersWithPartitionsCount) {
        LoggingUtil.logFine(logger, "Not scaling up %s: not running or already running on all members", mc.jobIdString());
        return true;
    }

    // Only a RUNNING job can be restarted now; otherwise retry later.
    if (mc.jobStatus() != RUNNING) {
        return false;
    }

    // A non-null f1() means the termination request was not accepted; retry later.
    boolean restartRequested = requestTermination(TerminationMode.RESTART_GRACEFUL, false).f1() == null;
    if (!restartRequested) {
        return false;
    }

    logger.info("Requested restart of " + mc.jobIdString() + " to make use of added member(s). Job was running on "
            + mc.executionPlanMap().size() + " members, cluster now has " + dataMembersWithPartitionsCount
            + " data members with assigned partitions");
    return true;
}
Use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
Class MasterJobContext, method finalizeJob.
/**
 * Finalizes the current execution of the job, given the failure it ended with.
 * <p>
 * The work is submitted to the coordinator thread and the decision is made
 * while holding the lock obtained through {@code mc.lock()}. Depending on the
 * failure, the cancellation state and the job config, the job is restarted
 * immediately, scheduled for a later restart, suspended, or moved to
 * {@code COMPLETED} / {@code FAILED}. If the job status already is
 * {@code COMPLETED} or {@code FAILED}, the call is only logged and ignored.
 * <p>
 * The action stored in {@code nonSynchronizedAction} is run only after the
 * lock has been released.
 *
 * @param failure the failure the execution ended with, may be {@code null}
 *                (presumably meaning the execution ended without an error;
 *                confirm against {@code logExecutionSummary})
 */
void finalizeJob(@Nullable Throwable failure) {
mc.coordinationService().submitToCoordinatorThread(() -> {
// follow-up work chosen under the lock, executed after the lock is released
final Runnable nonSynchronizedAction;
mc.lock();
try {
JobStatus status = mc.jobStatus();
// the job already has a final status: log the ignored completion and do nothing else
if (status == COMPLETED || status == FAILED) {
logIgnoredCompletion(failure, status);
return;
}
completeVertices(failure);
mc.getJetServiceBackend().getJobClassLoaderService().tryRemoveClassloadersForJob(mc.jobId(), COORDINATOR);
// only a JobTerminateRequestedException carries a termination mode; for any other
// failure (or no failure) there is no action after terminate
ActionAfterTerminate terminationModeAction = failure instanceof JobTerminateRequestedException ? ((JobTerminateRequestedException) failure).mode().actionAfterTerminate() : null;
mc.snapshotContext().onExecutionTerminated();
// if restart was requested, restart immediately
if (terminationModeAction == RESTART) {
mc.setJobStatus(NOT_RUNNING);
nonSynchronizedAction = () -> mc.coordinationService().restartJob(mc.jobId());
} else if (!isCancelled() && isRestartableException(failure) && mc.jobConfig().isAutoScaling()) {
// if restart is due to a failure, schedule a restart after a delay
scheduleRestart();
nonSynchronizedAction = NO_OP;
// note: && binds tighter than ||, so the condition below reads as
// "suspend was requested" OR ("restartable failure" AND "not cancelled"
// AND "auto-scaling disabled" AND "processing guarantee other than NONE")
} else if (terminationModeAction == SUSPEND || isRestartableException(failure) && !isCancelled() && !mc.jobConfig().isAutoScaling() && mc.jobConfig().getProcessingGuarantee() != NONE) {
mc.setJobStatus(SUSPENDED);
mc.jobExecutionRecord().setSuspended(null);
nonSynchronizedAction = () -> mc.writeJobExecutionRecord(false);
} else if (failure != null && !isCancelled() && mc.jobConfig().isSuspendOnFailure()) {
// suspend-on-failure is configured: suspend the job and pass the stack trace
// of the failure to setSuspended
mc.setJobStatus(SUSPENDED);
mc.jobExecutionRecord().setSuspended("Execution failure:\n" + ExceptionUtil.stackTraceToString(failure));
nonSynchronizedAction = () -> mc.writeJobExecutionRecord(false);
} else {
// the job ends here: COMPLETED or FAILED, as decided by logExecutionSummary
long completionTime = System.currentTimeMillis();
boolean isSuccess = logExecutionSummary(failure, completionTime);
mc.setJobStatus(isSuccess ? COMPLETED : FAILED);
if (failure instanceof LocalMemberResetException) {
logger.fine("Cancelling job " + mc.jobIdString() + " locally: member (local or remote) reset. " + "We don't delete job metadata: job will restart on majority cluster");
setFinalResult(new CancellationException());
} else {
// the final result is set only if completeJob succeeded; if it failed,
// only a warning is logged
mc.coordinationService().completeJob(mc, failure, completionTime).whenComplete(withTryCatch(logger, (r, f) -> {
if (f != null) {
logger.warning("Completion of " + mc.jobIdString() + " failed", f);
} else {
setFinalResult(failure);
}
}));
}
nonSynchronizedAction = NO_OP;
}
// reset the state for the next execution
requestedTerminationMode = null;
executionFailureCallback = null;
} finally {
mc.unlock();
}
// the lock is no longer held here
executionCompletionFuture.complete(null);
nonSynchronizedAction.run();
});
}
Use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
Class AsyncTransformUsingServiceBatchP_IntegrationTest, method stressTestInt.
/**
 * Stress test of the cooperative emission, built with the DAG API.
 * <p>
 * The DAG API is used on purpose: it is the only API that lets us configure
 * the edge queue sizes, and small queues cause more trouble for the
 * cooperative emission.
 *
 * @param restart if {@code true}, the job is restarted up to 5 times while
 *                it is running
 */
private void stressTestInt(boolean restart) {
    // extend the source map with the keys from NUM_ITEMS (inclusive) to totalItems (exclusive)
    int totalItems = 10_000;
    journaledMap.putAll(IntStream.range(NUM_ITEMS, totalItems).boxed().collect(toMap(i -> i, i -> i)));

    DAG dag = new DAG();

    Vertex sourceVertex = dag.newVertex("source",
            throttle(
                    streamMapP(journaledMap.getName(), alwaysTrue(), EventJournalMapEvent::getNewValue, START_FROM_OLDEST,
                            eventTimePolicy(i -> (long) ((Integer) i), WatermarkPolicy.limitingLag(10), 10, 0, 0)),
                    5000));

    // every input item is expanded to five output items: "<item>-1" ... "<item>-5"
    BiFunctionEx<ExecutorService, List<Integer>, CompletableFuture<Traverser<String>>> flatMapAsyncFn =
            transformNotPartitionedFn(item -> traverseItems(item + "-1", item + "-2", item + "-3", item + "-4", item + "-5"))
                    .andThen(future -> future.thenApply(
                            traversers -> traverseIterable(traversers).flatMap(Function.identity())));
    ProcessorSupplier mapperSupplier =
            AsyncTransformUsingServiceBatchedP.supplier(serviceFactory, DEFAULT_MAX_CONCURRENT_OPS, 128, flatMapAsyncFn);
    Vertex mapperVertex = dag.newVertex("map", mapperSupplier).localParallelism(2);

    Vertex sinkVertex = dag.newVertex("sink", SinkProcessors.writeListP(sinkList.getName()));

    // A shorter queue towards the mapper, so that the backpressure from the
    // slow mapper doesn't block the barrier from the source for too long
    EdgeConfig sourceToMapperConfig = new EdgeConfig().setQueueSize(128);
    // A shorter queue on the mapper's output, so that the mapper experiences
    // backpressure from the sink
    EdgeConfig mapperToSinkConfig = new EdgeConfig().setQueueSize(10);

    dag.edge(between(sourceVertex, mapperVertex).setConfig(sourceToMapperConfig))
       .edge(between(mapperVertex, sinkVertex).setConfig(mapperToSinkConfig));

    Job job = instance().getJet().newJob(dag, jobConfig);

    if (restart) {
        for (int attempt = 0; attempt < 5; attempt++) {
            assertNotNull(job);
            assertTrueEventually(() -> {
                JobStatus currentStatus = job.getStatus();
                boolean runningOrDone = currentStatus == RUNNING || currentStatus == COMPLETED;
                assertTrue("status=" + currentStatus, runningOrDone);
            });
            sleepMillis(100);
            try {
                job.restart();
            } catch (IllegalStateException e) {
                // the restart was refused: verify the reason and stop restarting
                assertTrue(e.toString(), e.getMessage().startsWith("Cannot RESTART_GRACEFUL"));
                break;
            }
        }
    }

    assertResult(i -> Stream.of(i + "-1", i + "-2", i + "-3", i + "-4", i + "-5"), totalItems);
}
Use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.
Class JobCoordinationService, method terminateJob.
/**
 * Requests the termination of the given job in the given mode.
 * <p>
 * The three callbacks passed to {@code runWithJob} handle these cases:
 * <ul>
 *     <li>a master context is available: every mode except
 *     {@code CANCEL_FORCEFUL} requires the job status to be {@code RUNNING};
 *     the termination is then requested from the job context;
 *     <li>the job already has a result: {@code CANCEL_FORCEFUL} is ignored
 *     (only logged), any other mode is rejected;
 *     <li>only a job record is available: a
 *     {@code RetryableHazelcastException} is thrown.
 * </ul>
 * Rejections are signalled with {@code IllegalStateException}; how the
 * exceptions thrown in the callbacks surface to the caller is determined by
 * {@code runWithJob}.
 *
 * @param jobId           ID of the job to terminate
 * @param terminationMode the requested termination mode
 * @return the future returned by {@code runWithJob}
 */
public CompletableFuture<Void> terminateJob(long jobId, TerminationMode terminationMode) {
    return runWithJob(jobId,
            ctx -> {
                // A forceful cancel is accepted in any state; all other modes only while
                // the job is running. Strictly speaking, termination could be requested in
                // any state, but this method is only called by the user, and it would be
                // weird for a client to, say, restart a job that didn't start yet. A restart
                // requested during STARTING would make the job restart as soon as it
                // starts, and restarting a suspended job makes no sense at all.
                JobStatus currentStatus = ctx.jobStatus();
                boolean forcefulCancel = terminationMode == CANCEL_FORCEFUL;
                if (!forcefulCancel && currentStatus != RUNNING) {
                    throw new IllegalStateException("Cannot " + terminationMode + ", job status is " + currentStatus
                            + ", should be " + RUNNING);
                }
                // a non-null f1() is the reason why the request was refused
                String refusalReason = ctx.jobContext().requestTermination(terminationMode, false).f1();
                if (refusalReason != null) {
                    throw new IllegalStateException("Cannot " + terminationMode + ": " + refusalReason);
                }
            },
            result -> {
                if (terminationMode == CANCEL_FORCEFUL) {
                    logger.fine("Ignoring cancellation of a completed job " + idToString(jobId));
                } else {
                    throw new IllegalStateException("Cannot " + terminationMode + " job " + idToString(jobId)
                            + " because it already has a result: " + result);
                }
            },
            record -> {
                // we'll eventually learn of the job through scanning of records or from a join operation
                throw new RetryableHazelcastException("No MasterContext found for job " + idToString(jobId) + " for "
                        + terminationMode);
            });
}
Aggregations