Search in sources :

Example 31 with JobStatus

use of org.apache.flink.api.common.JobStatus in project flink by apache.

the class JobStatusMetrics method registerMetrics.

/**
 * Registers one state-time metric for every {@link JobStatus} that is neither a terminal state
 * nor {@link JobStatus#RECONCILING}.
 *
 * @param metricGroup group handed to {@code StateTimeMetric.register} for each created metric
 */
@Override
public void registerMetrics(MetricGroup metricGroup) {
    for (JobStatus status : JobStatus.values()) {
        // Terminal states and RECONCILING get no time metric.
        if (status.isTerminalState() || status == JobStatus.RECONCILING) {
            continue;
        }
        final StateTimeMetric timeMetric = createTimeMetric(status);
        StateTimeMetric.register(
                jobStatusMetricsSettings,
                metricGroup,
                timeMetric,
                getBaseMetricName(status));
    }
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus)

Example 32 with JobStatus

use of org.apache.flink.api.common.JobStatus in project flink by apache.

the class DefaultSchedulerTest method cancelWhileRestartingShouldWaitForRunningTasks.

/**
 * Checks that cancelling the job after one of two tasks has failed leaves the job in
 * {@code CANCELLING} (and the other vertex in {@code CANCELING}) until that other task is
 * reported as {@code CANCELED}, after which the job status is {@code CANCELED}.
 */
@Test
public void cancelWhileRestartingShouldWaitForRunningTasks() {
    // Job with a single job vertex; the argument 2 yields the two execution vertices read below
    // (presumably the vertex parallelism -- confirm against singleJobVertexJobGraph).
    final JobGraph jobGraph = singleJobVertexJobGraph(2);
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);
    final SchedulingTopology topology = scheduler.getSchedulingTopology();
    // Grab the current attempt IDs of the first two execution vertices.
    final Iterator<ArchivedExecutionVertex> vertexIterator = scheduler.requestJob().getArchivedExecutionGraph().getAllExecutionVertices().iterator();
    final ExecutionAttemptID attemptId1 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attemptId2 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
    final ExecutionVertexID executionVertex2 = scheduler.getExecutionVertexIdOrThrow(attemptId2);
    // Report the first attempt as FAILED (per the test name this is expected to put the job
    // into restarting), then cancel the job while the second task has not finished yet.
    scheduler.updateTaskExecutionState(new TaskExecutionState(attemptId1, ExecutionState.FAILED, new RuntimeException("expected")));
    scheduler.cancel();
    // Snapshot the vertex state and job status right after cancel(), before the second task
    // is reported as canceled; they are asserted at the end of the test.
    final ExecutionState vertex2StateAfterCancel = topology.getVertex(executionVertex2).getState();
    final JobStatus statusAfterCancelWhileRestarting = scheduler.requestJobStatus();
    // Now report the second attempt as CANCELED.
    scheduler.updateTaskExecutionState(new TaskExecutionState(attemptId2, ExecutionState.CANCELED, new RuntimeException("expected")));
    assertThat(vertex2StateAfterCancel, is(equalTo(ExecutionState.CANCELING)));
    assertThat(statusAfterCancelWhileRestarting, is(equalTo(JobStatus.CANCELLING)));
    // After the last task reported CANCELED the job itself must be CANCELED.
    assertThat(scheduler.requestJobStatus(), is(equalTo(JobStatus.CANCELED)));
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) ArchivedExecutionVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex) SchedulingTopology(org.apache.flink.runtime.scheduler.strategy.SchedulingTopology) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) Test(org.junit.Test)

Example 33 with JobStatus

use of org.apache.flink.api.common.JobStatus in project flink by apache.

the class DownTimeGauge method getValue.

// ------------------------------------------------------------------------
/**
 * Reports how long the job has been failing.
 *
 * @return {@code NO_LONGER_RUNNING} if the current status is terminal; {@code NOT_FAILING} if
 *     the timestamp of {@link JobStatus#FAILING} is not later than the timestamp of
 *     {@link JobStatus#RUNNING}; otherwise the milliseconds elapsed since the FAILING
 *     timestamp, never below zero
 */
@Override
public Long getValue() {
    final JobStatus currentStatus = jobStatusProvider.getState();

    // not running any more -> finished or not on leader
    if (currentStatus.isTerminalState()) {
        return NO_LONGER_RUNNING;
    }

    final long lastRunningAt = jobStatusProvider.getStatusTimestamp(JobStatus.RUNNING);
    final long lastFailingAt = jobStatusProvider.getStatusTimestamp(JobStatus.FAILING);

    if (lastFailingAt > lastRunningAt) {
        final long elapsedSinceFailing = System.currentTimeMillis() - lastFailingAt;
        // clamp at zero so that clock changes never produce a negative duration
        return Math.max(elapsedSinceFailing, 0);
    }
    return NOT_FAILING;
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus)

Example 34 with JobStatus

use of org.apache.flink.api.common.JobStatus in project flink by apache.

the class JobMaster method jobStatusChanged.

/**
 * Reacts to a job status change. Nothing happens unless the new status is globally terminal.
 * In that case all tracked partitions are released asynchronously (released or promoted when
 * the job {@link JobStatus#FINISHED}), and the completion actions are notified with the
 * current {@link ExecutionGraphInfo} via the future executor.
 *
 * @param newJobStatus the status the job has switched to
 */
private void jobStatusChanged(final JobStatus newJobStatus) {
    validateRunsInMainThread();

    if (!newJobStatus.isGloballyTerminalState()) {
        return;
    }

    runAsync(() -> {
        final Collection<ResultPartitionID> trackedPartitionIds =
                partitionTracker.getAllTrackedPartitions().stream()
                        .map(descriptor -> descriptor.getShuffleDescriptor().getResultPartitionID())
                        .collect(Collectors.toList());

        if (newJobStatus != JobStatus.FINISHED) {
            partitionTracker.stopTrackingAndReleasePartitions(trackedPartitionIds);
        } else {
            // only the FINISHED case uses the release-or-promote variant
            partitionTracker.stopTrackingAndReleaseOrPromotePartitions(trackedPartitionIds);
        }
    });

    final ExecutionGraphInfo terminalGraphInfo = schedulerNG.requestJob();
    futureExecutor.execute(
            () -> jobCompletionActions.jobReachedGloballyTerminalState(terminalGraphInfo));
}
Also used : ShuffleMaster(org.apache.flink.runtime.shuffle.ShuffleMaster) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SlotPoolService(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolService) RpcServiceUtils(org.apache.flink.runtime.rpc.RpcServiceUtils) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) HeartbeatListener(org.apache.flink.runtime.heartbeat.HeartbeatListener) JobManagerJobMetricGroupFactory(org.apache.flink.runtime.jobmaster.factories.JobManagerJobMetricGroupFactory) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) TaskExecutorToJobManagerHeartbeatPayload(org.apache.flink.runtime.taskexecutor.TaskExecutorToJobManagerHeartbeatPayload) AggregateFunction(org.apache.flink.api.common.functions.AggregateFunction) HeartbeatManager(org.apache.flink.runtime.heartbeat.HeartbeatManager) InstantiationUtil(org.apache.flink.util.InstantiationUtil) JobMasterPartitionTracker(org.apache.flink.runtime.io.network.partition.JobMasterPartitionTracker) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) Map(java.util.Map) PermanentlyFencedRpcEndpoint(org.apache.flink.runtime.rpc.PermanentlyFencedRpcEndpoint) JobShuffleContext(org.apache.flink.runtime.shuffle.JobShuffleContext) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) BlobWriter(org.apache.flink.runtime.blob.BlobWriter) TaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) HighAvailabilityServices(org.apache.flink.runtime.highavailability.HighAvailabilityServices) UnknownKvStateLocation(org.apache.flink.runtime.query.UnknownKvStateLocation) 
JobDetails(org.apache.flink.runtime.messages.webmonitor.JobDetails) JobManagerJobMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerJobMetricGroup) HeartbeatReceiver(org.apache.flink.runtime.heartbeat.HeartbeatReceiver) Collection(java.util.Collection) ResourceManagerId(org.apache.flink.runtime.resourcemanager.ResourceManagerId) Set(java.util.Set) CompletionException(java.util.concurrent.CompletionException) RegisteredRpcConnection(org.apache.flink.runtime.registration.RegisteredRpcConnection) UUID(java.util.UUID) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) InetSocketAddress(java.net.InetSocketAddress) Collectors(java.util.stream.Collectors) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) Objects(java.util.Objects) RetryingRegistration(org.apache.flink.runtime.registration.RetryingRegistration) SerializedValue(org.apache.flink.util.SerializedValue) CoordinationRequest(org.apache.flink.runtime.operators.coordination.CoordinationRequest) FlinkJobNotFoundException(org.apache.flink.runtime.messages.FlinkJobNotFoundException) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Optional(java.util.Optional) CheckpointMetrics(org.apache.flink.runtime.checkpoint.CheckpointMetrics) Time(org.apache.flink.api.common.time.Time) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway) ResolutionMode(org.apache.flink.runtime.taskmanager.TaskManagerLocation.ResolutionMode) FlinkException(org.apache.flink.util.FlinkException) LeaderRetrievalListener(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalListener) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) 
OnCompletionActions(org.apache.flink.runtime.jobmanager.OnCompletionActions) JobShuffleContextImpl(org.apache.flink.runtime.shuffle.JobShuffleContextImpl) HeartbeatSender(org.apache.flink.runtime.heartbeat.HeartbeatSender) CoordinationResponse(org.apache.flink.runtime.operators.coordination.CoordinationResponse) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) HashSet(java.util.HashSet) RpcService(org.apache.flink.runtime.rpc.RpcService) SchedulerNG(org.apache.flink.runtime.scheduler.SchedulerNG) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) FatalErrorHandler(org.apache.flink.runtime.rpc.FatalErrorHandler) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) Logger(org.slf4j.Logger) KvStateLocation(org.apache.flink.runtime.query.KvStateLocation) Executor(java.util.concurrent.Executor) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) IOException(java.io.IOException) AccumulatorSnapshot(org.apache.flink.runtime.accumulators.AccumulatorSnapshot) JobStatusListener(org.apache.flink.runtime.executiongraph.JobStatusListener) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) NoOpHeartbeatManager(org.apache.flink.runtime.heartbeat.NoOpHeartbeatManager) JobID(org.apache.flink.api.common.JobID) 
TaskStateSnapshot.deserializeTaskStateSnapshot(org.apache.flink.runtime.checkpoint.TaskStateSnapshot.deserializeTaskStateSnapshot) PartitionTrackerFactory(org.apache.flink.runtime.io.network.partition.PartitionTrackerFactory) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) UnresolvedTaskManagerLocation(org.apache.flink.runtime.taskmanager.UnresolvedTaskManagerLocation) KvStateID(org.apache.flink.queryablestate.KvStateID) OperatorEvent(org.apache.flink.runtime.operators.coordination.OperatorEvent) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)

Example 35 with JobStatus

use of org.apache.flink.api.common.JobStatus in project flink by apache.

the class DispatcherTest method testCacheJobExecutionResult.

/**
 * Verifies that the outcome of a completed job (cf. {@link JobResult}) is cached: after a
 * {@code FAILED} execution graph is passed to {@code completeJobExecution}, the dispatcher
 * gateway returns that status and that execution graph info for the job's ID.
 */
@Test
public void testCacheJobExecutionResult() throws Exception {
    dispatcher = createAndStartDispatcher(
            heartbeatServices,
            haServices,
            new ExpectedJobIdJobManagerRunnerFactory(jobId, createdJobManagerRunnerLatch));
    final DispatcherGateway gateway = dispatcher.getSelfGateway(DispatcherGateway.class);

    // Build an archived, already-failed execution graph for a fresh job ID.
    final JobID completedJobId = new JobID();
    final JobStatus terminalStatus = JobStatus.FAILED;
    final ErrorInfo failure = new ErrorInfo(new RuntimeException("expected"), 1L);
    final ExecutionGraphInfo completedGraphInfo = new ExecutionGraphInfo(
            new ArchivedExecutionGraphBuilder()
                    .setJobID(completedJobId)
                    .setState(terminalStatus)
                    .setFailureCause(failure)
                    .build());

    dispatcher.completeJobExecution(completedGraphInfo);

    // Both the status and the full graph info must be retrievable afterwards.
    assertThat(
            gateway.requestJobStatus(completedJobId, TIMEOUT).get(),
            equalTo(terminalStatus));
    assertThat(
            gateway.requestExecutionGraphInfo(completedJobId, TIMEOUT).get(),
            equalTo(completedGraphInfo));
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) ExecutionGraphInfo(org.apache.flink.runtime.scheduler.ExecutionGraphInfo) ErrorInfo(org.apache.flink.runtime.executiongraph.ErrorInfo) ArchivedExecutionGraphBuilder(org.apache.flink.runtime.rest.handler.legacy.utils.ArchivedExecutionGraphBuilder) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Aggregations

JobStatus (org.apache.flink.api.common.JobStatus)62 Test (org.junit.Test)28 JobID (org.apache.flink.api.common.JobID)19 CompletableFuture (java.util.concurrent.CompletableFuture)15 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)14 FlinkException (org.apache.flink.util.FlinkException)8 ExecutionException (java.util.concurrent.ExecutionException)7 IOException (java.io.IOException)6 ArrayList (java.util.ArrayList)6 Time (org.apache.flink.api.common.time.Time)6 ExecutionGraphInfo (org.apache.flink.runtime.scheduler.ExecutionGraphInfo)6 TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState)6 Collections (java.util.Collections)5 HashMap (java.util.HashMap)5 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)5 FutureUtils (org.apache.flink.util.concurrent.FutureUtils)5 TimeUnit (java.util.concurrent.TimeUnit)4 Configuration (org.apache.flink.configuration.Configuration)4 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)4 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)4