Example 1 with TaskExecutionState

Use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.

From the class TaskExecutor, method unregisterTaskAndNotifyFinalState.

private void unregisterTaskAndNotifyFinalState(final UUID jobMasterLeaderId, final JobMasterGateway jobMasterGateway, final ExecutionAttemptID executionAttemptID) {
    Task task = taskSlotTable.removeTask(executionAttemptID);
    if (task != null) {
        if (!task.getExecutionState().isTerminal()) {
            try {
                task.failExternally(new IllegalStateException("Task is being removed from TaskManager."));
            } catch (Exception e) {
                log.error("Could not properly fail task.", e);
            }
        }
        log.info("Un-registering task and sending final execution state {} to JobManager for task {} {}.", task.getExecutionState(), task.getTaskInfo().getTaskName(), task.getExecutionId());
        AccumulatorSnapshot accumulatorSnapshot = task.getAccumulatorRegistry().getSnapshot();
        updateTaskExecutionState(jobMasterLeaderId, jobMasterGateway, new TaskExecutionState(task.getJobID(), task.getExecutionId(), task.getExecutionState(), task.getFailureCause(), accumulatorSnapshot, task.getMetricGroup().getIOMetricGroup().createSnapshot()));
    } else {
        log.error("Cannot find task with ID {} to unregister.", executionAttemptID);
    }
}
Also used : Task(org.apache.flink.runtime.taskmanager.Task) AccumulatorSnapshot(org.apache.flink.runtime.accumulators.AccumulatorSnapshot) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState)
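
The guard on task.getExecutionState().isTerminal() decides whether the task must first be failed externally before its final state is reported. Below is a minimal, hypothetical sketch of that decision; only ExecutionState.isTerminal() comes from the snippet above, while the class name and printout are illustrative.

import org.apache.flink.runtime.execution.ExecutionState;

public class TerminalStateCheckSketch {

    public static void main(String[] args) {
        // Same guard as unregisterTaskAndNotifyFinalState: a task that has not
        // reached a terminal state is failed externally before its final state
        // is sent to the JobManager; terminal tasks are reported as-is.
        for (ExecutionState state : ExecutionState.values()) {
            String action = state.isTerminal()
                ? "report final state as-is"
                : "failExternally(...) first, then report";
            System.out.println(state + " -> " + action);
        }
    }
}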

Example 2 with TaskExecutionState

Use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.

From the class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.

/**
 * Tests that the restarting time metric correctly reports restarting times.
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        assertTrue(0L == restartingTime.getValue());
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now let's fail the job while it is in RESTARTING and check that the restarting time then stops increasing;
        // for this to work, we have to use a SuppressRestartsException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        executor.shutdownNow();
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) Slot(org.apache.flink.runtime.instance.Slot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)
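
The assertions above pin down a simple contract for the restarting-time gauge: it reads 0 before the first restart, grows monotonically while the job is RESTARTING, and freezes at the accumulated value once the job leaves that state (whether it goes back to RUNNING or terminally FAILED). The following is a hypothetical sketch of that contract, not the actual ExecutionGraph implementation; only the org.apache.flink.metrics.Gauge interface is taken from the example.

import org.apache.flink.metrics.Gauge;

public class RestartingTimeGaugeSketch implements Gauge<Long> {

    private volatile boolean restarting;    // true while the job is in RESTARTING
    private volatile long restartingStart;  // wall-clock time of the last switch into RESTARTING
    private volatile long frozenValue;      // elapsed restarting time captured when RESTARTING ended

    // Called on job status changes (illustrative hook, not a Flink callback).
    public void onStatusChange(boolean nowRestarting) {
        if (nowRestarting && !restarting) {
            restartingStart = System.currentTimeMillis();
        } else if (!nowRestarting && restarting) {
            frozenValue = System.currentTimeMillis() - restartingStart;
        }
        restarting = nowRestarting;
    }

    @Override
    public Long getValue() {
        // 0 before the first restart, monotonically increasing while restarting,
        // constant once the job has left the RESTARTING state.
        return restarting ? System.currentTimeMillis() - restartingStart : frozenValue;
    }
}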

Example 3 with TaskExecutionState

Use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.

From the class ExecutionGraphDeploymentTest, method testAccumulatorsAndMetricsForwarding.

/**
	 * Verifies that {@link ExecutionGraph#updateState(TaskExecutionState)} updates the accumulators and metrics for an
	 * execution that failed or was canceled.
	 */
@Test
public void testAccumulatorsAndMetricsForwarding() throws Exception {
    final JobVertexID jid1 = new JobVertexID();
    final JobVertexID jid2 = new JobVertexID();
    JobVertex v1 = new JobVertex("v1", jid1);
    JobVertex v2 = new JobVertex("v2", jid2);
    Tuple2<ExecutionGraph, Map<ExecutionAttemptID, Execution>> graphAndExecutions = setupExecution(v1, 1, v2, 1);
    ExecutionGraph graph = graphAndExecutions.f0;
    // verify behavior for canceled executions
    Execution execution1 = graphAndExecutions.f1.values().iterator().next();
    IOMetrics ioMetrics = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
    Map<String, Accumulator<?, ?>> accumulators = new HashMap<>();
    accumulators.put("acc", new IntCounter(4));
    AccumulatorSnapshot accumulatorSnapshot = new AccumulatorSnapshot(graph.getJobID(), execution1.getAttemptId(), accumulators);
    TaskExecutionState state = new TaskExecutionState(graph.getJobID(), execution1.getAttemptId(), ExecutionState.CANCELED, null, accumulatorSnapshot, ioMetrics);
    graph.updateState(state);
    assertEquals(ioMetrics, execution1.getIOMetrics());
    assertNotNull(execution1.getUserAccumulators());
    assertEquals(4, execution1.getUserAccumulators().get("acc").getLocalValue());
    // verify behavior for failed executions
    Execution execution2 = graphAndExecutions.f1.values().iterator().next();
    IOMetrics ioMetrics2 = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
    Map<String, Accumulator<?, ?>> accumulators2 = new HashMap<>();
    accumulators2.put("acc", new IntCounter(8));
    AccumulatorSnapshot accumulatorSnapshot2 = new AccumulatorSnapshot(graph.getJobID(), execution2.getAttemptId(), accumulators2);
    TaskExecutionState state2 = new TaskExecutionState(graph.getJobID(), execution2.getAttemptId(), ExecutionState.FAILED, null, accumulatorSnapshot2, ioMetrics2);
    graph.updateState(state2);
    assertEquals(ioMetrics2, execution2.getIOMetrics());
    assertNotNull(execution2.getUserAccumulators());
    assertEquals(8, execution2.getUserAccumulators().get("acc").getLocalValue());
}
Also used : Accumulator(org.apache.flink.api.common.accumulators.Accumulator) HashMap(java.util.HashMap) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) AccumulatorSnapshot(org.apache.flink.runtime.accumulators.AccumulatorSnapshot) IntCounter(org.apache.flink.api.common.accumulators.IntCounter) Map(java.util.Map) HashMap(java.util.HashMap) Test(org.junit.Test)
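
A terminal TaskExecutionState can carry user accumulators and IO metrics alongside the state itself, which is how ExecutionGraph.updateState gets the values it forwards to the Execution. The sketch below packages an IntCounter into an AccumulatorSnapshot and attaches it to a CANCELED update. The IDs and accumulator name are illustrative stand-ins, the constructors match the ones used in the test above, and the IOMetrics import path is assumed (org.apache.flink.runtime.executiongraph).

import java.util.HashMap;
import java.util.Map;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.accumulators.IntCounter;
import org.apache.flink.runtime.accumulators.AccumulatorSnapshot;
import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.executiongraph.IOMetrics;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;

public class AccumulatorForwardingSketch {

    public static void main(String[] args) {
        // Stand-in IDs; a real update would use the job and attempt IDs of the
        // execution being reported.
        JobID jobId = new JobID();
        ExecutionAttemptID attemptId = new ExecutionAttemptID();

        Map<String, Accumulator<?, ?>> accumulators = new HashMap<>();
        accumulators.put("records-seen", new IntCounter(42));

        AccumulatorSnapshot snapshot = new AccumulatorSnapshot(jobId, attemptId, accumulators);

        // Empty IO metrics, built with the same 10-argument constructor as the test.
        IOMetrics ioMetrics = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);

        // Terminal update carrying the accumulators and metrics; updateState(...)
        // would forward both to the matching Execution.
        TaskExecutionState update = new TaskExecutionState(
            jobId, attemptId, ExecutionState.CANCELED, null, snapshot, ioMetrics);

        System.out.println("Prepared terminal update: " + update.getExecutionState());
    }
}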

Example 4 with TaskExecutionState

Use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.

From the class ExecutionGraphDeploymentTest, method testNoResourceAvailableFailure.

/**
 * Tests that a blocking batch job fails if there are not enough resources left to schedule the
 * succeeding tasks. This test case is related to [FLINK-4296] where finished producing tasks
 * swallow the fail exception when scheduling a consumer task.
 */
@Test
public void testNoResourceAvailableFailure() throws Exception {
    final JobID jobId = new JobID();
    JobVertex v1 = new JobVertex("source");
    JobVertex v2 = new JobVertex("sink");
    int dop1 = 1;
    int dop2 = 1;
    v1.setParallelism(dop1);
    v2.setParallelism(dop2);
    v1.setInvokableClass(BatchTask.class);
    v2.setInvokableClass(BatchTask.class);
    v2.connectNewDataSetAsInput(v1, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING);
    Scheduler scheduler = new Scheduler(TestingUtils.directExecutionContext());
    for (int i = 0; i < dop1; i++) {
        scheduler.newInstanceAvailable(ExecutionGraphTestUtils.getInstance(new ActorTaskManagerGateway(new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext()))));
    }
    // execution graph that executes actions synchronously
    ExecutionGraph eg = new ExecutionGraph(new DirectScheduledExecutorService(), TestingUtils.defaultExecutor(), jobId, "failing test job", new Configuration(), new SerializedValue<>(new ExecutionConfig()), AkkaUtils.getDefaultTimeout(), new NoRestartStrategy(), scheduler);
    eg.setQueuedSchedulingAllowed(false);
    List<JobVertex> ordered = Arrays.asList(v1, v2);
    eg.attachJobGraph(ordered);
    assertEquals(dop1, scheduler.getNumberOfAvailableSlots());
    // schedule, this triggers mock deployment
    eg.scheduleForExecution();
    ExecutionAttemptID attemptID = eg.getJobVertex(v1.getID()).getTaskVertices()[0].getCurrentExecutionAttempt().getAttemptId();
    eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.RUNNING));
    eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.FINISHED, null));
    assertEquals(JobStatus.FAILED, eg.getState());
}
Also used : Configuration(org.apache.flink.configuration.Configuration) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) DirectScheduledExecutorService(org.apache.flink.runtime.testutils.DirectScheduledExecutorService) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
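
The topology that exposes FLINK-4296 is just two vertices connected by a BLOCKING intermediate result, so the sink only becomes schedulable once the source has finished producing. A minimal sketch of that wiring follows; the class name and the print statement are illustrative, while connectNewDataSetAsInput and the JobGraph constructor are taken from the test above.

import org.apache.flink.runtime.io.network.partition.ResultPartitionType;
import org.apache.flink.runtime.jobgraph.DistributionPattern;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.jobgraph.JobVertex;

public class BlockingTopologySketch {

    public static void main(String[] args) {
        JobVertex source = new JobVertex("source");
        JobVertex sink = new JobVertex("sink");
        source.setParallelism(1);
        sink.setParallelism(1);

        // The sink reads the source's result as a BLOCKING data set, so it can
        // only be scheduled once the source has finished producing.
        sink.connectNewDataSetAsInput(source, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING);

        // Topology only; no invokable classes are set, so this is not a runnable job.
        JobGraph jobGraph = new JobGraph("blocking topology sketch", source, sink);
        System.out.println("Built job graph: " + jobGraph.getName());
    }
}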

Aggregations

TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState): 4
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 3
Test (org.junit.Test): 3
IOException (java.io.IOException): 2
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 2
Configuration (org.apache.flink.configuration.Configuration): 2
AccumulatorSnapshot (org.apache.flink.runtime.accumulators.AccumulatorSnapshot): 2
Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler): 2
ArrayList (java.util.ArrayList): 1
HashMap (java.util.HashMap): 1
Map (java.util.Map): 1
ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService): 1
TimeoutException (java.util.concurrent.TimeoutException): 1
JobID (org.apache.flink.api.common.JobID): 1
Accumulator (org.apache.flink.api.common.accumulators.Accumulator): 1
IntCounter (org.apache.flink.api.common.accumulators.IntCounter): 1
Time (org.apache.flink.api.common.time.Time): 1
Gauge (org.apache.flink.metrics.Gauge): 1
Metric (org.apache.flink.metrics.Metric): 1
MetricGroup (org.apache.flink.metrics.MetricGroup): 1