Search in sources:

Example 11 with JobStatus

use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.

In class MasterJobContext, method scheduleRestart.

/**
 * Resets the job status to {@code NOT_RUNNING} and asks the coordination
 * service to schedule a restart of this job.
 * <p>
 * Starts by calling {@code mc.assertLockHeld()}, so the caller is expected
 * to hold the master-context lock.
 *
 * @throws IllegalStateException if the current job status is anything other
 *      than NOT_RUNNING, STARTING or RUNNING
 */
private void scheduleRestart() {
    mc.assertLockHeld();

    JobStatus currentStatus = mc.jobStatus();
    boolean restartAllowed = currentStatus == NOT_RUNNING
            || currentStatus == STARTING
            || currentStatus == RUNNING;
    if (!restartAllowed) {
        throw new IllegalStateException("Restart scheduled in an unexpected state: " + currentStatus);
    }

    mc.setJobStatus(NOT_RUNNING);
    mc.coordinationService().scheduleRestart(mc.jobId());
}
Also used : JobStatus(com.hazelcast.jet.core.JobStatus)

Example 12 with JobStatus

use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.

In class MasterJobContext, method onStartExecutionComplete.

/**
 * Handles the outcome of the start-execution step: records the job metrics
 * found in the member responses and finalizes the job with the resulting
 * error (or {@code null}).
 *
 * @param error the failure reported for the execution, or {@code null}
 * @param responses per-member responses; values that are {@code RawJobMetrics}
 *      are collected and passed to {@code setJobMetrics}
 */
private void onStartExecutionComplete(Throwable error, Collection<Entry<MemberInfo, Object>> responses) {
    JobStatus currentStatus = mc.jobStatus();
    boolean executionExpected = currentStatus == STARTING || currentStatus == RUNNING;
    if (!executionExpected) {
        // The job is in a state where an execution should not be completing:
        // log the incoming error and replace it with a generic failure.
        logCannotComplete(error);
        error = new IllegalStateException("Job coordination failed");
    }

    // Only responses carrying RawJobMetrics contribute to the job metrics.
    setJobMetrics(responses.stream()
            .map(Entry::getValue)
            .filter(RawJobMetrics.class::isInstance)
            .map(RawJobMetrics.class::cast)
            .collect(Collectors.toList()));

    boolean awaitTerminalSnapshot = error instanceof JobTerminateRequestedException
            && ((JobTerminateRequestedException) error).mode().isWithTerminalSnapshot();
    if (awaitTerminalSnapshot) {
        // The terminal snapshot on members is always completed before replying to StartExecutionOp.
        // However, the response to snapshot operations can be processed after the response to
        // StartExecutionOp, so wait for that too.
        Throwable terminationError = error;
        mc.snapshotContext()
                .terminalSnapshotFuture()
                .whenCompleteAsync(withTryCatch(logger,
                        (ignoredResult, ignoredFailure) -> finalizeJob(terminationError)));
        return;
    }

    // If the StartExecutionOperation didn't find the execution, it means that it was cancelled.
    // When a termination was requested, the cancellation may have come from the master, so the
    // exception is converted to JobTerminateRequestedException.
    // The cancellation can also happen if some participant left and the target cancelled the
    // execution locally in JobExecutionService.onMemberRemoved(). That (and possibly other)
    // exceptions are kept as they are and the execution completes with failure.
    if (error instanceof ExecutionNotFoundException && requestedTerminationMode != null) {
        error = new JobTerminateRequestedException(requestedTerminationMode).initCause(error);
    }
    finalizeJob(error);
}
Also used : JobStatus(com.hazelcast.jet.core.JobStatus) Address(com.hazelcast.cluster.Address) SUSPEND(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate.SUSPEND) NOT_RUNNING(com.hazelcast.jet.core.JobStatus.NOT_RUNNING) GetLocalJobMetricsOperation(com.hazelcast.jet.impl.operation.GetLocalJobMetricsOperation) CompletableFuture.completedFuture(java.util.concurrent.CompletableFuture.completedFuture) NonCompletableFuture(com.hazelcast.jet.impl.util.NonCompletableFuture) ExceptionUtil.isTopologyException(com.hazelcast.jet.impl.util.ExceptionUtil.isTopologyException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException) SourceProcessors.readMapP(com.hazelcast.jet.core.processor.SourceProcessors.readMapP) RESTART(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate.RESTART) JetDelegatingClassLoader(com.hazelcast.jet.impl.deployment.JetDelegatingClassLoader) TerminatedWithSnapshotException(com.hazelcast.jet.impl.exception.TerminatedWithSnapshotException) Collectors.toMap(java.util.stream.Collectors.toMap) Functions.entryKey(com.hazelcast.function.Functions.entryKey) MemberInfo(com.hazelcast.internal.cluster.MemberInfo) Map(java.util.Map) STARTING(com.hazelcast.jet.core.JobStatus.STARTING) SUSPENDED(com.hazelcast.jet.core.JobStatus.SUSPENDED) DAG(com.hazelcast.jet.core.DAG) JobStatus(com.hazelcast.jet.core.JobStatus) ExceptionUtil(com.hazelcast.jet.impl.util.ExceptionUtil) JobMetrics(com.hazelcast.jet.core.metrics.JobMetrics) CancellationException(java.util.concurrent.CancellationException) CANCEL_GRACEFUL(com.hazelcast.jet.impl.TerminationMode.CANCEL_GRACEFUL) Collections.emptyList(java.util.Collections.emptyList) Collection(java.util.Collection) Set(java.util.Set) UUID(java.util.UUID) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) Collectors(java.util.stream.Collectors) CANCEL_FORCEFUL(com.hazelcast.jet.impl.TerminationMode.CANCEL_FORCEFUL) Objects(java.util.Objects) 
Util(com.hazelcast.jet.impl.util.Util) List(java.util.List) Util.idToString(com.hazelcast.jet.Util.idToString) ExecutionPlan(com.hazelcast.jet.impl.execution.init.ExecutionPlan) MetricNames(com.hazelcast.jet.core.metrics.MetricNames) Entry(java.util.Map.Entry) TopologyChangedException(com.hazelcast.jet.core.TopologyChangedException) COMPLETED(com.hazelcast.jet.core.JobStatus.COMPLETED) JetDisabledException(com.hazelcast.jet.impl.exception.JetDisabledException) LoggingUtil(com.hazelcast.jet.impl.util.LoggingUtil) ExecutionPlanBuilder.createExecutionPlans(com.hazelcast.jet.impl.execution.init.ExecutionPlanBuilder.createExecutionPlans) Collectors.partitioningBy(java.util.stream.Collectors.partitioningBy) TerminateExecutionOperation(com.hazelcast.jet.impl.operation.TerminateExecutionOperation) ExceptionUtil.isRestartableException(com.hazelcast.jet.impl.util.ExceptionUtil.isRestartableException) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) LoggingUtil.logFinest(com.hazelcast.jet.impl.util.LoggingUtil.logFinest) Util.doWithClassLoader(com.hazelcast.jet.impl.util.Util.doWithClassLoader) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionService(com.hazelcast.spi.impl.executionservice.ExecutionService) StartExecutionOperation(com.hazelcast.jet.impl.operation.StartExecutionOperation) Function(java.util.function.Function) Supplier(java.util.function.Supplier) Util.formatJobDuration(com.hazelcast.jet.impl.util.Util.formatJobDuration) ActionAfterTerminate(com.hazelcast.jet.impl.TerminationMode.ActionAfterTerminate) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) ArrayList(java.util.ArrayList) JetException(com.hazelcast.jet.JetException) HashSet(java.util.HashSet) InitExecutionOperation(com.hazelcast.jet.impl.operation.InitExecutionOperation) COORDINATOR(com.hazelcast.jet.impl.JobClassLoaderService.JobPhase.COORDINATOR) ILogger(com.hazelcast.logging.ILogger) 
SnapshotValidator.validateSnapshot(com.hazelcast.jet.impl.SnapshotValidator.validateSnapshot) ExceptionUtil.rethrow(com.hazelcast.jet.impl.util.ExceptionUtil.rethrow) Operation(com.hazelcast.spi.impl.operationservice.Operation) Util.entry(com.hazelcast.jet.Util.entry) ExceptionUtil.withTryCatch(com.hazelcast.jet.impl.util.ExceptionUtil.withTryCatch) BiConsumer(java.util.function.BiConsumer) MembersView(com.hazelcast.internal.cluster.impl.MembersView) LocalMemberResetException(com.hazelcast.core.LocalMemberResetException) RESTART_GRACEFUL(com.hazelcast.jet.impl.TerminationMode.RESTART_GRACEFUL) Edge(com.hazelcast.jet.core.Edge) Version(com.hazelcast.version.Version) EXPORTED_SNAPSHOTS_PREFIX(com.hazelcast.jet.impl.JobRepository.EXPORTED_SNAPSHOTS_PREFIX) Nonnull(javax.annotation.Nonnull) Tuple2(com.hazelcast.jet.datamodel.Tuple2) Nullable(javax.annotation.Nullable) Job(com.hazelcast.jet.Job) Measurement(com.hazelcast.jet.core.metrics.Measurement) SUSPENDED_EXPORTING_SNAPSHOT(com.hazelcast.jet.core.JobStatus.SUSPENDED_EXPORTING_SNAPSHOT) Util.toList(com.hazelcast.jet.impl.util.Util.toList) RawJobMetrics(com.hazelcast.jet.impl.metrics.RawJobMetrics) MetricTags(com.hazelcast.jet.core.metrics.MetricTags) NONE(com.hazelcast.jet.config.ProcessingGuarantee.NONE) Consumer(java.util.function.Consumer) Vertex(com.hazelcast.jet.core.Vertex) Tuple2.tuple2(com.hazelcast.jet.datamodel.Tuple2.tuple2) CustomClassLoadedObject.deserializeWithCustomClassLoader(com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject.deserializeWithCustomClassLoader) ExceptionUtil.peel(com.hazelcast.jet.impl.util.ExceptionUtil.peel) FAILED(com.hazelcast.jet.core.JobStatus.FAILED) RUNNING(com.hazelcast.jet.core.JobStatus.RUNNING) Collections(java.util.Collections) IMap(com.hazelcast.map.IMap) Edge.between(com.hazelcast.jet.core.Edge.between) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) RawJobMetrics(com.hazelcast.jet.impl.metrics.RawJobMetrics) 
JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException)

Example 13 with JobStatus

use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.

In class MasterJobContext, method requestTermination.

/**
 * Requests termination of the job in the given mode.
 * <p>
 * Returns a tuple of:<ol>
 *     <li>a future that will be completed when the execution completes (or
 *          a completed future, if execution is not RUNNING or STARTING)
 *     <li>a string with a message explaining why this call did nothing, or
 *          null if this call actually initiated the termination
 * </ol>
 *
 * @param mode the requested termination mode; its terminal snapshot is
 *      dropped when the job's processing guarantee is NONE and the mode
 *      is not CANCEL_GRACEFUL
 * @param allowWhileExportingSnapshot if false and jobStatus is
 *      SUSPENDED_EXPORTING_SNAPSHOT, termination will be rejected
 */
@Nonnull
Tuple2<CompletableFuture<Void>, String> requestTermination(TerminationMode mode, @SuppressWarnings("SameParameterValue") boolean allowWhileExportingSnapshot) {
    mc.coordinationService().assertOnCoordinatorThread();

    // With processing guarantee NONE, use the variant of the mode without a terminal
    // snapshot - except for graceful cancellation, which is allowed even if not snapshotting.
    boolean noProcessingGuarantee = mc.jobConfig().getProcessingGuarantee() == NONE;
    if (noProcessingGuarantee && mode != CANCEL_GRACEFUL) {
        mode = mode.withoutTerminalSnapshot();
    }

    JobStatus statusAtRequest;
    boolean cancellingSuspendedJob;
    Tuple2<CompletableFuture<Void>, String> outcome;
    mc.lock();
    try {
        statusAtRequest = mc.jobStatus();

        if (!allowWhileExportingSnapshot && statusAtRequest == SUSPENDED_EXPORTING_SNAPSHOT) {
            return tuple2(executionCompletionFuture, "Cannot cancel when job status is " + SUSPENDED_EXPORTING_SNAPSHOT);
        }

        // if suspended, we can only cancel the job. Other terminations have no effect.
        if (statusAtRequest == SUSPENDED && mode != CANCEL_FORCEFUL) {
            return tuple2(executionCompletionFuture, "Job is " + SUSPENDED);
        }

        if (requestedTerminationMode != null) {
            // don't report the cancellation of a cancelled job as an error
            boolean repeatedForcefulCancel = requestedTerminationMode == CANCEL_FORCEFUL && mode == CANCEL_FORCEFUL;
            String message = null;
            if (!repeatedForcefulCancel) {
                message = "Job is already terminating in mode: " + requestedTerminationMode.name();
            }
            return tuple2(executionCompletionFuture, message);
        }

        requestedTerminationMode = mode;

        // handle cancellation of a suspended job
        cancellingSuspendedJob = statusAtRequest == SUSPENDED || statusAtRequest == SUSPENDED_EXPORTING_SNAPSHOT;
        if (cancellingSuspendedJob) {
            mc.setJobStatus(FAILED);
            setFinalResult(new CancellationException());
        }

        if (mode.isWithTerminalSnapshot()) {
            mc.snapshotContext().enqueueSnapshot(null, true, null);
        }

        // Build the result while the lock is still held.
        outcome = tuple2(executionCompletionFuture, null);
    } finally {
        mc.unlock();
    }

    // The follow-up actions run after the lock has been released.
    if (cancellingSuspendedJob) {
        try {
            // Blocks until the coordination service has completed the job.
            mc.coordinationService().completeJob(mc, new CancellationException(), System.currentTimeMillis()).get();
        } catch (Exception e) {
            throw rethrow(e);
        }
    } else if (statusAtRequest == RUNNING || statusAtRequest == STARTING) {
        handleTermination(mode);
    }
    return outcome;
}
Also used : JobStatus(com.hazelcast.jet.core.JobStatus) NonCompletableFuture(com.hazelcast.jet.impl.util.NonCompletableFuture) CompletableFuture(java.util.concurrent.CompletableFuture) CancellationException(java.util.concurrent.CancellationException) Util.idToString(com.hazelcast.jet.Util.idToString) ExceptionUtil.isTopologyException(com.hazelcast.jet.impl.util.ExceptionUtil.isTopologyException) JobTerminateRequestedException(com.hazelcast.jet.impl.exception.JobTerminateRequestedException) TerminatedWithSnapshotException(com.hazelcast.jet.impl.exception.TerminatedWithSnapshotException) CancellationException(java.util.concurrent.CancellationException) TopologyChangedException(com.hazelcast.jet.core.TopologyChangedException) JetDisabledException(com.hazelcast.jet.impl.exception.JetDisabledException) ExceptionUtil.isRestartableException(com.hazelcast.jet.impl.util.ExceptionUtil.isRestartableException) ExecutionNotFoundException(com.hazelcast.jet.impl.exception.ExecutionNotFoundException) JetException(com.hazelcast.jet.JetException) LocalMemberResetException(com.hazelcast.core.LocalMemberResetException) Nonnull(javax.annotation.Nonnull)

Example 14 with JobStatus

use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.

In class MasterJobContext, method onInitStepCompleted.

// Called as callback when all InitOperation invocations are done.
// The work is submitted to the coordinator thread: if the init step produced no error
// and the job is still STARTING, the execution is started; otherwise the execution
// invocations are cancelled and the start step is completed with a failure.
private void onInitStepCompleted(Collection<Map.Entry<MemberInfo, Object>> responses) {
    mc.coordinationService().submitToCoordinatorThread(() -> {
        Throwable initError = getErrorFromResponses("Init", responses);
        JobStatus currentStatus = mc.jobStatus();

        if (initError != null || currentStatus != STARTING) {
            cancelExecutionInvocations(mc.jobId(), mc.executionId(), null, () -> {
                // Without an init error, the failure is the unexpected job status.
                Throwable cause = initError;
                if (cause == null) {
                    cause = new IllegalStateException("Cannot execute " + mc.jobIdString() + ": status is " + currentStatus);
                }
                onStartExecutionComplete(cause, emptyList());
            });
        } else {
            invokeStartExecution();
        }
    });
}
Also used : JobStatus(com.hazelcast.jet.core.JobStatus)

Example 15 with JobStatus

use of com.hazelcast.jet.core.JobStatus in project hazelcast by hazelcast.

In class WatermarkCoalescer_TerminalSnapshotTest, method test.

@Test
public void test() throws Exception {
    /*
        Regression test: once a terminal barrier has been processed, the ProcessorTasklet
        or CIES should do no further work (except for emitting the DONE_ITEM). Also, with
        the at-least-once guarantee, the tasklet should not continue to drain the queue
        that had the barrier while waiting for other barriers.

        Specifically, the issue was that in at-least-once mode the DONE_ITEM was processed
        after the terminal barrier while waiting for the barrier on other queues/edges. The
        DONE_ITEM could have caused a WM being emitted after the barrier, which is ok
        for the at-least-once mode, but the terminal snapshot should behave as if exactly-once
        mode was used.

        This test ensures that we're waiting for a WM in coalescer (by having a stream skew)
        and then does a graceful restart in at-least-once mode and checks that the results are
        correct.
         */
    String firstKey = generateKeyForPartition(instance, 0);
    String secondKey = generateKeyForPartition(instance, 1);

    Pipeline pipeline = Pipeline.create();
    pipeline.readFrom(Sources.mapJournal(sourceMap, JournalInitialPosition.START_FROM_OLDEST))
            .withTimestamps(Map.Entry::getValue, 0)
            .setLocalParallelism(PARTITION_COUNT)
            .groupingKey(Map.Entry::getKey)
            .window(WindowDefinition.sliding(1, 1))
            .aggregate(AggregateOperations.counting())
            .setLocalParallelism(PARTITION_COUNT)
            .writeTo(SinkBuilder.sinkBuilder("throwing", ctx -> "")
                    .<KeyedWindowResult<String, Long>>receiveFn((w, kwr) -> {
                        // The sink fails the job as soon as a window result differs from COUNT.
                        if (kwr.result() != COUNT) {
                            throw new RuntimeException("Received unexpected item " + kwr + ", expected count is " + COUNT);
                        }
                    })
                    .build());

    JobConfig jobConfig = new JobConfig().setProcessingGuarantee(ProcessingGuarantee.AT_LEAST_ONCE);
    Job job = instance.getJet().newJob(pipeline, jobConfig);

    List<Future> tasks = new ArrayList<>();
    // Restart the job over and over for as long as the test runs.
    tasks.add(spawn(() -> {
        while (true) {
            assertJobStatusEventually(job, JobStatus.RUNNING);
            System.out.println("============RESTARTING JOB=========");
            job.restart();
            Thread.sleep(2000);
        }
    }));
    // one producer is twice as fast as the other, to cause waiting for WM while doing snapshot
    tasks.add(spawn(() -> producer(firstKey, 1)));
    tasks.add(spawn(() -> producer(secondKey, 2)));

    sleepSeconds(20);

    for (Future task : tasks) {
        task.cancel(true);
        // check that the future was cancelled and didn't fail with another error
        try {
            task.get();
            fail("Exception was expected");
        } catch (CancellationException expected) {
            // the task was cancelled, which is the expected outcome
        }
    }

    // check that the job is running
    JobStatus finalStatus = job.getStatus();
    boolean notTerminated = finalStatus != FAILED && finalStatus != COMPLETED && finalStatus != SUSPENDED;
    assertTrue("job should not be completed, status=" + finalStatus, notTerminated);
}
Also used : ParallelJVMTest(com.hazelcast.test.annotation.ParallelJVMTest) KeyedWindowResult(com.hazelcast.jet.datamodel.KeyedWindowResult) QuickTest(com.hazelcast.test.annotation.QuickTest) RunWith(org.junit.runner.RunWith) EventJournalConfig(com.hazelcast.config.EventJournalConfig) ArrayList(java.util.ArrayList) Future(java.util.concurrent.Future) Map(java.util.Map) SUSPENDED(com.hazelcast.jet.core.JobStatus.SUSPENDED) Assert.fail(org.junit.Assert.fail) JobStatus(com.hazelcast.jet.core.JobStatus) Job(com.hazelcast.jet.Job) Before(org.junit.Before) Config(com.hazelcast.config.Config) HazelcastInstance(com.hazelcast.core.HazelcastInstance) WindowDefinition(com.hazelcast.jet.pipeline.WindowDefinition) Pipeline(com.hazelcast.jet.pipeline.Pipeline) CancellationException(java.util.concurrent.CancellationException) JetTestSupport(com.hazelcast.jet.core.JetTestSupport) JobConfig(com.hazelcast.jet.config.JobConfig) AggregateOperations(com.hazelcast.jet.aggregate.AggregateOperations) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) Category(org.junit.experimental.categories.Category) ClusterProperty(com.hazelcast.spi.properties.ClusterProperty) Sources(com.hazelcast.jet.pipeline.Sources) TimeUnit(java.util.concurrent.TimeUnit) LockSupport(java.util.concurrent.locks.LockSupport) List(java.util.List) JournalInitialPosition(com.hazelcast.jet.pipeline.JournalInitialPosition) HazelcastParallelClassRunner(com.hazelcast.test.HazelcastParallelClassRunner) FAILED(com.hazelcast.jet.core.JobStatus.FAILED) SinkBuilder(com.hazelcast.jet.pipeline.SinkBuilder) ProcessingGuarantee(com.hazelcast.jet.config.ProcessingGuarantee) COMPLETED(com.hazelcast.jet.core.JobStatus.COMPLETED) IMap(com.hazelcast.map.IMap) ArrayList(java.util.ArrayList) KeyedWindowResult(com.hazelcast.jet.datamodel.KeyedWindowResult) JobConfig(com.hazelcast.jet.config.JobConfig) Pipeline(com.hazelcast.jet.pipeline.Pipeline) JobStatus(com.hazelcast.jet.core.JobStatus) 
CancellationException(java.util.concurrent.CancellationException) Future(java.util.concurrent.Future) Job(com.hazelcast.jet.Job) ParallelJVMTest(com.hazelcast.test.annotation.ParallelJVMTest) QuickTest(com.hazelcast.test.annotation.QuickTest) Test(org.junit.Test)

Aggregations

JobStatus (com.hazelcast.jet.core.JobStatus)16 CancellationException (java.util.concurrent.CancellationException)7 COMPLETED (com.hazelcast.jet.core.JobStatus.COMPLETED)5 List (java.util.List)5 CompletableFuture (java.util.concurrent.CompletableFuture)5 JobConfig (com.hazelcast.jet.config.JobConfig)4 DAG (com.hazelcast.jet.core.DAG)4 Edge.between (com.hazelcast.jet.core.Edge.between)4 FAILED (com.hazelcast.jet.core.JobStatus.FAILED)4 TopologyChangedException (com.hazelcast.jet.core.TopologyChangedException)4 MemberInfo (com.hazelcast.internal.cluster.MemberInfo)3 MembersView (com.hazelcast.internal.cluster.impl.MembersView)3 Util.idToString (com.hazelcast.jet.Util.idToString)3 ProcessingGuarantee (com.hazelcast.jet.config.ProcessingGuarantee)3 Edge (com.hazelcast.jet.core.Edge)3 RUNNING (com.hazelcast.jet.core.JobStatus.RUNNING)3 Vertex (com.hazelcast.jet.core.Vertex)3 Function (java.util.function.Function)3 ExecutionCallback (com.hazelcast.core.ExecutionCallback)2 LocalMemberResetException (com.hazelcast.core.LocalMemberResetException)2