use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.
the class TaskExecutor method unregisterTaskAndNotifyFinalState.
private void unregisterTaskAndNotifyFinalState(final UUID jobMasterLeaderId, final JobMasterGateway jobMasterGateway, final ExecutionAttemptID executionAttemptID) {
Task task = taskSlotTable.removeTask(executionAttemptID);
if (task != null) {
if (!task.getExecutionState().isTerminal()) {
try {
task.failExternally(new IllegalStateException("Task is being remove from TaskManager."));
} catch (Exception e) {
log.error("Could not properly fail task.", e);
}
}
log.info("Un-registering task and sending final execution state {} to JobManager for task {} {}.", task.getExecutionState(), task.getTaskInfo().getTaskName(), task.getExecutionId());
AccumulatorSnapshot accumulatorSnapshot = task.getAccumulatorRegistry().getSnapshot();
updateTaskExecutionState(jobMasterLeaderId, jobMasterGateway, new TaskExecutionState(task.getJobID(), task.getExecutionId(), task.getExecutionState(), task.getFailureCause(), accumulatorSnapshot, task.getMetricGroup().getIOMetricGroup().createSnapshot()));
} else {
log.error("Cannot find task with ID {} to unregister.", executionAttemptID);
}
}
use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.
the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.
/**
* This test tests that the restarting time metric correctly displays restarting times.
*/
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
try {
// setup execution graph with mocked scheduling logic
int parallelism = 1;
JobVertex jobVertex = new JobVertex("TestVertex");
jobVertex.setParallelism(parallelism);
jobVertex.setInvokableClass(NoOpInvokable.class);
JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
Configuration config = new Configuration();
config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
Configuration jobConfig = new Configuration();
Time timeout = Time.seconds(10L);
MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
assertTrue(metricRegistry.getReporters().size() == 1);
MetricReporter reporter = metricRegistry.getReporters().get(0);
assertTrue(reporter instanceof TestingReporter);
TestingReporter testingReporter = (TestingReporter) reporter;
MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
Scheduler scheduler = mock(Scheduler.class);
ResourceID taskManagerId = ResourceID.generate();
TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
when(taskManagerLocation.getHostname()).thenReturn("localhost");
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
Instance instance = mock(Instance.class);
when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(instance.getTaskManagerID()).thenReturn(taskManagerId);
when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
Slot rootSlot = mock(Slot.class);
AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
SimpleSlot simpleSlot = mock(SimpleSlot.class);
when(simpleSlot.isAlive()).thenReturn(true);
when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
when(simpleSlot.getRoot()).thenReturn(rootSlot);
when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
future.complete(simpleSlot);
when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
when(rootSlot.getSlotNumber()).thenReturn(0);
when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
// get restarting time metric
Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
assertNotNull(metric);
assertTrue(metric instanceof Gauge);
@SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
// check that the restarting time is 0 since it's the initial start
assertTrue(0L == restartingTime.getValue());
executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
// start execution
executionGraph.scheduleForExecution();
assertTrue(0L == restartingTime.getValue());
List<ExecutionAttemptID> executionIDs = new ArrayList<>();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
// tell execution graph that the tasks are in state running --> job status switches to state running
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
assertTrue(0L == restartingTime.getValue());
// fail the job so that it goes into state restarting
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
// wait some time so that the restarting time gauge shows a value different from 0
Thread.sleep(50);
long previousRestartingTime = restartingTime.getValue();
// check that the restarting time is monotonically increasing
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// check that we have measured some restarting time
assertTrue(previousRestartingTime > 0);
// restart job
testingRestartStrategy.restartExecutionGraph();
executionIDs.clear();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
assertTrue(firstRestartingTimestamp != 0);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time does not increase after we've reached the running state
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// fail job again
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
Thread.sleep(50);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time is increasing again
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
assertTrue(previousRestartingTime > 0);
// now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
// for this to work, we have to use a SuppressRestartException
executionGraph.fail(new SuppressRestartsException(new Exception()));
assertEquals(JobStatus.FAILED, executionGraph.getState());
previousRestartingTime = restartingTime.getValue();
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
} finally {
executor.shutdownNow();
}
}
use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.
the class ExecutionGraphDeploymentTest method testAccumulatorsAndMetricsForwarding.
/**
* Verifies that {@link ExecutionGraph#updateState(TaskExecutionState)} updates the accumulators and metrics for an
* execution that failed or was canceled.
*/
@Test
public void testAccumulatorsAndMetricsForwarding() throws Exception {
final JobVertexID jid1 = new JobVertexID();
final JobVertexID jid2 = new JobVertexID();
JobVertex v1 = new JobVertex("v1", jid1);
JobVertex v2 = new JobVertex("v2", jid2);
Tuple2<ExecutionGraph, Map<ExecutionAttemptID, Execution>> graphAndExecutions = setupExecution(v1, 1, v2, 1);
ExecutionGraph graph = graphAndExecutions.f0;
// verify behavior for canceled executions
Execution execution1 = graphAndExecutions.f1.values().iterator().next();
IOMetrics ioMetrics = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
Map<String, Accumulator<?, ?>> accumulators = new HashMap<>();
accumulators.put("acc", new IntCounter(4));
AccumulatorSnapshot accumulatorSnapshot = new AccumulatorSnapshot(graph.getJobID(), execution1.getAttemptId(), accumulators);
TaskExecutionState state = new TaskExecutionState(graph.getJobID(), execution1.getAttemptId(), ExecutionState.CANCELED, null, accumulatorSnapshot, ioMetrics);
graph.updateState(state);
assertEquals(ioMetrics, execution1.getIOMetrics());
assertNotNull(execution1.getUserAccumulators());
assertEquals(4, execution1.getUserAccumulators().get("acc").getLocalValue());
// verify behavior for failed executions
Execution execution2 = graphAndExecutions.f1.values().iterator().next();
IOMetrics ioMetrics2 = new IOMetrics(0, 0, 0, 0, 0, 0.0, 0.0, 0.0, 0.0, 0.0);
Map<String, Accumulator<?, ?>> accumulators2 = new HashMap<>();
accumulators2.put("acc", new IntCounter(8));
AccumulatorSnapshot accumulatorSnapshot2 = new AccumulatorSnapshot(graph.getJobID(), execution2.getAttemptId(), accumulators2);
TaskExecutionState state2 = new TaskExecutionState(graph.getJobID(), execution2.getAttemptId(), ExecutionState.FAILED, null, accumulatorSnapshot2, ioMetrics2);
graph.updateState(state2);
assertEquals(ioMetrics2, execution2.getIOMetrics());
assertNotNull(execution2.getUserAccumulators());
assertEquals(8, execution2.getUserAccumulators().get("acc").getLocalValue());
}
use of org.apache.flink.runtime.taskmanager.TaskExecutionState in project flink by apache.
the class ExecutionGraphDeploymentTest method testNoResourceAvailableFailure.
@Test
public /**
* Tests that a blocking batch job fails if there are not enough resources left to schedule the
* succeeding tasks. This test case is related to [FLINK-4296] where finished producing tasks
* swallow the fail exception when scheduling a consumer task.
*/
void testNoResourceAvailableFailure() throws Exception {
final JobID jobId = new JobID();
JobVertex v1 = new JobVertex("source");
JobVertex v2 = new JobVertex("sink");
int dop1 = 1;
int dop2 = 1;
v1.setParallelism(dop1);
v2.setParallelism(dop2);
v1.setInvokableClass(BatchTask.class);
v2.setInvokableClass(BatchTask.class);
v2.connectNewDataSetAsInput(v1, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING);
Scheduler scheduler = new Scheduler(TestingUtils.directExecutionContext());
for (int i = 0; i < dop1; i++) {
scheduler.newInstanceAvailable(ExecutionGraphTestUtils.getInstance(new ActorTaskManagerGateway(new ExecutionGraphTestUtils.SimpleActorGateway(TestingUtils.directExecutionContext()))));
}
// execution graph that executes actions synchronously
ExecutionGraph eg = new ExecutionGraph(new DirectScheduledExecutorService(), TestingUtils.defaultExecutor(), jobId, "failing test job", new Configuration(), new SerializedValue<>(new ExecutionConfig()), AkkaUtils.getDefaultTimeout(), new NoRestartStrategy(), scheduler);
eg.setQueuedSchedulingAllowed(false);
List<JobVertex> ordered = Arrays.asList(v1, v2);
eg.attachJobGraph(ordered);
assertEquals(dop1, scheduler.getNumberOfAvailableSlots());
// schedule, this triggers mock deployment
eg.scheduleForExecution();
ExecutionAttemptID attemptID = eg.getJobVertex(v1.getID()).getTaskVertices()[0].getCurrentExecutionAttempt().getAttemptId();
eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.RUNNING));
eg.updateState(new TaskExecutionState(jobId, attemptID, ExecutionState.FINISHED, null));
assertEquals(JobStatus.FAILED, eg.getState());
}
Aggregations