use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.
/**
 * Tests that the restarting time metric correctly displays restarting times.
 *
 * <p>A single-vertex job is driven through RUNNING, RESTARTING, RUNNING, RESTARTING and
 * finally FAILED. The gauge registered under {@link ExecutionGraph#RESTARTING_TIME_METRIC_NAME}
 * is expected to be 0 before the first failure, to be positive and non-decreasing while the
 * job is restarting, and to stay constant once the job is running again or has failed.
 *
 * @throws JobException declared for the execution graph calls made by the test
 * @throws IOException declared for the execution graph set-up made by the test
 * @throws InterruptedException if the test thread is interrupted while sleeping
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);

        // configure a single reporter ("test") so the test can look up registered metrics
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        // NOTE(review): this registry is never shut down in this test - confirm whether it should be
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        assertEquals(1, metricRegistry.getReporters().size());
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");

        // mocked task manager, slot and scheduler: every slot request is answered with the same slot
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        // task submission is acknowledged immediately
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));

        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);

        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;

        // check that the restarting time is 0 since it's the initial start
        // (the gauge value is unboxed explicitly: assertEquals(long, Long) would be ambiguous)
        assertEquals(0L, restartingTime.getValue().longValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());

        // start execution
        executionGraph.scheduleForExecution();
        assertEquals(0L, restartingTime.getValue().longValue());
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }

        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertEquals(0L, restartingTime.getValue().longValue());

        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);

        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();

        // check that the restarting time never decreases while the job is restarting
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }

        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);

        // restart job
        testingRestartStrategy.restartExecutionGraph();

        // the restart created new execution attempts, so the attempt IDs have to be collected again
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();

        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertEquals(previousRestartingTime, currentRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }

        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();

        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);

        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertEquals(previousRestartingTime, currentRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        // always stop the executor that was handed to the execution graph
        executor.shutdownNow();
    }
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class AvailableSlotsTest method createAllocatedSlot.
/**
 * Creates an {@link AllocatedSlot} for tests that is backed by Mockito mocks.
 *
 * <p>The slot is built with a fresh {@link AllocationID}, a fresh {@link JobID}, a mocked
 * {@link TaskManagerLocation} reporting the given resource ID, the value {@code 0},
 * {@code DEFAULT_TESTING_PROFILE} and a mocked {@link TaskManagerGateway}.
 *
 * @param resourceId the resource ID returned by the mocked location's {@code getResourceID()}
 * @return the newly created allocated slot
 */
static AllocatedSlot createAllocatedSlot(final ResourceID resourceId) {
    final TaskManagerLocation location = mock(TaskManagerLocation.class);
    when(location.getResourceID()).thenReturn(resourceId);

    final TaskManagerGateway gateway = mock(TaskManagerGateway.class);

    final AllocationID allocationId = new AllocationID();
    final JobID jobId = new JobID();
    return new AllocatedSlot(allocationId, jobId, location, 0, DEFAULT_TESTING_PROFILE, gateway);
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class ExecutionGraphSchedulingTest method testScheduleSourceBeforeTarget.
// ------------------------------------------------------------------------
// Tests
// ------------------------------------------------------------------------
/**
 * Checks that, when slots are handed out through futures and the vertices are connected by a
 * pipelined edge, the target task is not submitted before the source slot is available.
 */
@Test
public void testScheduleSourceBeforeTarget() throws Exception {
    // topology under test, connected by a pipelined all-to-all edge:
    //
    //     (source) ----------------> (target)
    final int parallelism = 1;

    final JobVertex producer = new JobVertex("source");
    producer.setParallelism(parallelism);
    producer.setInvokableClass(NoOpInvokable.class);

    final JobVertex consumer = new JobVertex("target");
    consumer.setParallelism(parallelism);
    consumer.setInvokableClass(NoOpInvokable.class);
    consumer.connectNewDataSetAsInput(producer, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);

    final JobID jobId = new JobID();
    final JobGraph jobGraph = new JobGraph(jobId, "test", producer, consumer);

    // the slot provider hands out futures which the test completes by hand later on
    final FlinkCompletableFuture<SimpleSlot> producerSlotFuture = new FlinkCompletableFuture<>();
    final FlinkCompletableFuture<SimpleSlot> consumerSlotFuture = new FlinkCompletableFuture<>();
    ProgrammedSlotProvider slotProvider = new ProgrammedSlotProvider(parallelism);
    slotProvider.addSlot(producer.getID(), 0, producerSlotFuture);
    slotProvider.addSlot(consumer.getID(), 0, consumerSlotFuture);

    final ExecutionGraph eg = createExecutionGraph(jobGraph, slotProvider);

    // one TaskManager gateway and one slot per vertex
    final TaskManagerGateway producerGateway = createTaskManager();
    final TaskManagerGateway consumerGateway = createTaskManager();
    final SimpleSlot producerSlot = createSlot(producerGateway, jobId);
    final SimpleSlot consumerSlot = createSlot(consumerGateway, jobId);

    eg.setScheduleMode(ScheduleMode.EAGER);
    eg.setQueuedSchedulingAllowed(true);
    eg.scheduleForExecution();

    // scheduling alone must already put the job into RUNNING
    assertEquals(JobStatus.RUNNING, eg.getState());

    // hand out the target slot first; this must neither deploy the target
    // nor cause a deployment related failure
    consumerSlotFuture.complete(consumerSlot);
    // NOTE(review): a Timeout wrapping times(0) may return as soon as the check passes
    // instead of waiting the full 50 ms - confirm against the Mockito version in use
    verify(consumerGateway, new Timeout(50, times(0))).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
    assertEquals(JobStatus.RUNNING, eg.getState());

    // the source slot becomes available as well
    producerSlotFuture.complete(producerSlot);

    // both tasks are expected to be submitted within the timeout
    verify(producerGateway, timeout(1000)).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
    verify(consumerGateway, timeout(1000)).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
    assertEquals(JobStatus.RUNNING, eg.getState());
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class ExecutionGraphSchedulingTest method testDeployPipelinedConnectedComponentsTogether.
/**
 * Verifies that a pipelined connected component is only deployed once the slots for all of
 * its tasks are available, so that no task is deployed before the system finds out that
 * there are not enough resources.
 */
@Test
public void testDeployPipelinedConnectedComponentsTogether() throws Exception {
    // topology under test, connected by a pipelined all-to-all edge:
    //
    //     (source) ----------------> (target)
    final int parallelism = 8;

    final JobVertex producer = new JobVertex("source");
    producer.setParallelism(parallelism);
    producer.setInvokableClass(NoOpInvokable.class);

    final JobVertex consumer = new JobVertex("target");
    consumer.setParallelism(parallelism);
    consumer.setInvokableClass(NoOpInvokable.class);
    consumer.connectNewDataSetAsInput(producer, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);

    final JobID jobId = new JobID();
    final JobGraph jobGraph = new JobGraph(jobId, "test", producer, consumer);

    @SuppressWarnings({ "unchecked", "rawtypes" }) final FlinkCompletableFuture<SimpleSlot>[] producerFutures = new FlinkCompletableFuture[parallelism];
    @SuppressWarnings({ "unchecked", "rawtypes" }) final FlinkCompletableFuture<SimpleSlot>[] consumerFutures = new FlinkCompletableFuture[parallelism];

    // one gateway, one slot and one (still incomplete) slot future per subtask
    final TaskManagerGateway[] producerGateways = new TaskManagerGateway[parallelism];
    final TaskManagerGateway[] consumerGateways = new TaskManagerGateway[parallelism];
    final SimpleSlot[] producerSlots = new SimpleSlot[parallelism];
    final SimpleSlot[] consumerSlots = new SimpleSlot[parallelism];
    for (int subtask = 0; subtask < parallelism; subtask++) {
        producerGateways[subtask] = createTaskManager();
        consumerGateways[subtask] = createTaskManager();
        producerSlots[subtask] = createSlot(producerGateways[subtask], jobId);
        consumerSlots[subtask] = createSlot(consumerGateways[subtask], jobId);
        producerFutures[subtask] = new FlinkCompletableFuture<>();
        consumerFutures[subtask] = new FlinkCompletableFuture<>();
    }

    ProgrammedSlotProvider slotProvider = new ProgrammedSlotProvider(parallelism);
    slotProvider.addSlots(producer.getID(), producerFutures);
    slotProvider.addSlots(consumer.getID(), consumerFutures);

    final ExecutionGraph eg = createExecutionGraph(jobGraph, slotProvider);

    // the even-numbered source slots are available before scheduling starts
    for (int subtask = 0; subtask < parallelism; subtask++) {
        if (subtask % 2 == 0) {
            producerFutures[subtask].complete(producerSlots[subtask]);
        }
    }

    // kick off the scheduling
    eg.setScheduleMode(ScheduleMode.EAGER);
    eg.setQueuedSchedulingAllowed(true);
    eg.scheduleForExecution();
    verifyNothingDeployed(eg, producerGateways);

    // now the odd-numbered source slots follow
    for (int subtask = 0; subtask < parallelism; subtask++) {
        if (subtask % 2 == 1) {
            producerFutures[subtask].complete(producerSlots[subtask]);
        }
    }
    verifyNothingDeployed(eg, producerGateways);

    // all target slots but the first one
    for (int subtask = 1; subtask < parallelism; subtask++) {
        consumerFutures[subtask].complete(consumerSlots[subtask]);
    }
    verifyNothingDeployed(eg, consumerGateways);

    // the last missing slot: from here on every task is expected to be submitted
    consumerFutures[0].complete(consumerSlots[0]);
    for (int subtask = 0; subtask < producerGateways.length; subtask++) {
        verify(producerGateways[subtask], timeout(50)).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
    }
    for (int subtask = 0; subtask < consumerGateways.length; subtask++) {
        verify(consumerGateways[subtask], timeout(50)).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
    }
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class ExecutionGraphSchedulingTest method testExecutionJobVertexAllocateResourcesReleasesOnException.
/**
 * Tests that {@link ExecutionJobVertex#allocateResourcesForAll(SlotProvider, boolean)} gives
 * back the slots it has already acquired when the allocation fails with an exception.
 */
@Test
public void testExecutionJobVertexAllocateResourcesReleasesOnException() throws Exception {
    final int parallelism = 8;

    final JobVertex vertex = new JobVertex("vertex");
    vertex.setParallelism(parallelism);
    vertex.setInvokableClass(NoOpInvokable.class);

    final JobID jobId = new JobID();
    final JobGraph jobGraph = new JobGraph(jobId, "test", vertex);

    // slot owner that accepts every returned slot and remembers it
    final List<SimpleSlot> recycledSlots = new ArrayList<>();
    final SlotOwner recycler = new SlotOwner() {
        @Override
        public boolean returnAllocatedSlot(Slot slot) {
            recycledSlots.add((SimpleSlot) slot);
            return true;
        }
    };

    // slot provider that hands out the three prepared slots, then throws an exception
    final SlotProvider slotProvider = mock(SlotProvider.class);
    final TaskManagerGateway taskManager = mock(TaskManagerGateway.class);
    final List<SimpleSlot> preparedSlots = new ArrayList<>(Arrays.asList(createSlot(taskManager, jobId, recycler), createSlot(taskManager, jobId, recycler), createSlot(taskManager, jobId, recycler)));
    when(slotProvider.allocateSlot(any(ScheduledUnit.class), anyBoolean())).then(new Answer<Future<SimpleSlot>>() {
        @Override
        public Future<SimpleSlot> answer(InvocationOnMock invocation) {
            if (!preparedSlots.isEmpty()) {
                return FlinkCompletableFuture.completed(preparedSlots.remove(0));
            }
            throw new TestRuntimeException();
        }
    });

    final ExecutionGraph eg = createExecutionGraph(jobGraph, slotProvider);
    final ExecutionJobVertex ejv = eg.getJobVertex(vertex.getID());

    // remember how many slots exist before the allocation consumes them
    final int expectedRecycledCount = preparedSlots.size();
    try {
        ejv.allocateResourcesForAll(slotProvider, false);
        fail("should have failed with an exception");
    } catch (TestRuntimeException expected) {
        // expected: the slot provider ran out of prepared slots
    }

    // every slot that was handed out must have been returned to the slot owner
    assertEquals(expectedRecycledCount, recycledSlots.size());
}
Aggregations