use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class ExecutionGraphSchedulingTest method testExecutionGraphScheduleReleasesResourcesOnException.
/**
* Tests that the {@link ExecutionGraph#scheduleForExecution()} method
* releases partially acquired resources upon exception.
*/
@Test
public void testExecutionGraphScheduleReleasesResourcesOnException() throws Exception {
// [pipelined]
// we construct a simple graph (source) ----------------> (target)
final int parallelism = 3;
final JobVertex sourceVertex = new JobVertex("source");
sourceVertex.setParallelism(parallelism);
sourceVertex.setInvokableClass(NoOpInvokable.class);
final JobVertex targetVertex = new JobVertex("target");
targetVertex.setParallelism(parallelism);
targetVertex.setInvokableClass(NoOpInvokable.class);
targetVertex.connectNewDataSetAsInput(sourceVertex, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
final JobID jobId = new JobID();
final JobGraph jobGraph = new JobGraph(jobId, "test", sourceVertex, targetVertex);
// set up some available slots and some slot owner that accepts released slots back
final List<SimpleSlot> returnedSlots = new ArrayList<>();
final SlotOwner recycler = new SlotOwner() {
@Override
public boolean returnAllocatedSlot(Slot slot) {
returnedSlots.add((SimpleSlot) slot);
return true;
}
};
final TaskManagerGateway taskManager = mock(TaskManagerGateway.class);
final List<SimpleSlot> availableSlots = new ArrayList<>(Arrays.asList(createSlot(taskManager, jobId, recycler), createSlot(taskManager, jobId, recycler), createSlot(taskManager, jobId, recycler), createSlot(taskManager, jobId, recycler), createSlot(taskManager, jobId, recycler)));
// slot provider that hand out parallelism / 3 slots, then throws an exception
final SlotProvider slotProvider = mock(SlotProvider.class);
when(slotProvider.allocateSlot(any(ScheduledUnit.class), anyBoolean())).then(new Answer<Future<SimpleSlot>>() {
@Override
public Future<SimpleSlot> answer(InvocationOnMock invocation) {
if (availableSlots.isEmpty()) {
throw new TestRuntimeException();
} else {
return FlinkCompletableFuture.completed(availableSlots.remove(0));
}
}
});
final ExecutionGraph eg = createExecutionGraph(jobGraph, slotProvider);
// acquire resources and check that all are back after the failure
final int numSlotsToExpectBack = availableSlots.size();
try {
eg.setScheduleMode(ScheduleMode.EAGER);
eg.scheduleForExecution();
fail("should have failed with an exception");
} catch (TestRuntimeException e) {
// expected
}
assertEquals(numSlotsToExpectBack, returnedSlots.size());
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class ExecutionGraphSchedulingTest method testOneSlotFailureAbortsDeploy.
/**
* This test verifies that if one slot future fails, the deployment will be aborted.
*/
@Test
public void testOneSlotFailureAbortsDeploy() throws Exception {
// [pipelined]
// we construct a simple graph (source) ----------------> (target)
final int parallelism = 6;
final JobVertex sourceVertex = new JobVertex("source");
sourceVertex.setParallelism(parallelism);
sourceVertex.setInvokableClass(NoOpInvokable.class);
final JobVertex targetVertex = new JobVertex("target");
targetVertex.setParallelism(parallelism);
targetVertex.setInvokableClass(NoOpInvokable.class);
targetVertex.connectNewDataSetAsInput(sourceVertex, DistributionPattern.POINTWISE, ResultPartitionType.PIPELINED);
final JobID jobId = new JobID();
final JobGraph jobGraph = new JobGraph(jobId, "test", sourceVertex, targetVertex);
//
// Create the slots, futures, and the slot provider
final TaskManagerGateway taskManager = mock(TaskManagerGateway.class);
final SlotOwner slotOwner = mock(SlotOwner.class);
final SimpleSlot[] sourceSlots = new SimpleSlot[parallelism];
final SimpleSlot[] targetSlots = new SimpleSlot[parallelism];
@SuppressWarnings({ "unchecked", "rawtypes" }) final FlinkCompletableFuture<SimpleSlot>[] sourceFutures = new FlinkCompletableFuture[parallelism];
@SuppressWarnings({ "unchecked", "rawtypes" }) final FlinkCompletableFuture<SimpleSlot>[] targetFutures = new FlinkCompletableFuture[parallelism];
for (int i = 0; i < parallelism; i++) {
sourceSlots[i] = createSlot(taskManager, jobId, slotOwner);
targetSlots[i] = createSlot(taskManager, jobId, slotOwner);
sourceFutures[i] = new FlinkCompletableFuture<>();
targetFutures[i] = new FlinkCompletableFuture<>();
}
ProgrammedSlotProvider slotProvider = new ProgrammedSlotProvider(parallelism);
slotProvider.addSlots(sourceVertex.getID(), sourceFutures);
slotProvider.addSlots(targetVertex.getID(), targetFutures);
final ExecutionGraph eg = createExecutionGraph(jobGraph, slotProvider);
TerminalJobStatusListener testListener = new TerminalJobStatusListener();
eg.registerJobStatusListener(testListener);
for (int i = 0; i < parallelism; i += 2) {
sourceFutures[i].complete(sourceSlots[i]);
targetFutures[i + 1].complete(targetSlots[i + 1]);
}
//
// kick off the scheduling
eg.setScheduleMode(ScheduleMode.EAGER);
eg.setQueuedSchedulingAllowed(true);
eg.scheduleForExecution();
// fail one slot
sourceFutures[1].completeExceptionally(new TestRuntimeException());
// wait until the job failed as a whole
testListener.waitForTerminalState(2000);
// wait until all slots are back
verify(slotOwner, new Timeout(2000, times(6))).returnAllocatedSlot(any(Slot.class));
// no deployment calls must have happened
verify(taskManager, times(0)).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
// all completed futures must have been returns
for (int i = 0; i < parallelism; i += 2) {
assertTrue(sourceSlots[i].isCanceled());
assertTrue(targetSlots[i + 1].isCanceled());
}
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class TaskManagerLogHandlerTest method testLogFetchingFailure.
@Test
public void testLogFetchingFailure() throws Exception {
// ========= setup TaskManager =================================================================================
InstanceID tmID = new InstanceID();
ResourceID tmRID = new ResourceID(tmID.toString());
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
when(taskManagerGateway.getAddress()).thenReturn("/tm/address");
Instance taskManager = mock(Instance.class);
when(taskManager.getId()).thenReturn(tmID);
when(taskManager.getTaskManagerID()).thenReturn(tmRID);
when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);
CompletableFuture<BlobKey> future = new FlinkCompletableFuture<>();
future.completeExceptionally(new IOException("failure"));
when(taskManagerGateway.requestTaskManagerLog(any(Time.class))).thenReturn(future);
// ========= setup JobManager ==================================================================================
ActorGateway jobManagerGateway = mock(ActorGateway.class);
Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());
when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
when(jobManagerGateway.ask(isA(JobManagerMessages.getRequestBlobManagerPort().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) 5));
when(jobManagerGateway.ask(isA(JobManagerMessages.RequestTaskManagerInstance.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new JobManagerMessages.TaskManagerInstance(Option.apply(taskManager))));
when(jobManagerGateway.path()).thenReturn("/jm/address");
JobManagerRetriever retriever = mock(JobManagerRetriever.class);
when(retriever.getJobManagerGatewayAndWebPort()).thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));
TaskManagerLogHandler handler = new TaskManagerLogHandler(retriever, ExecutionContext$.MODULE$.fromExecutor(Executors.directExecutor()), Future$.MODULE$.successful("/jm/address"), AkkaUtils.getDefaultClientTimeout(), TaskManagerLogHandler.FileMode.LOG, new Configuration(), false);
final AtomicReference<String> exception = new AtomicReference<>();
ChannelHandlerContext ctx = mock(ChannelHandlerContext.class);
when(ctx.write(isA(ByteBuf.class))).thenAnswer(new Answer<Object>() {
@Override
public Object answer(InvocationOnMock invocationOnMock) throws Throwable {
ByteBuf data = invocationOnMock.getArgumentAt(0, ByteBuf.class);
exception.set(new String(data.array(), ConfigConstants.DEFAULT_CHARSET));
return null;
}
});
Map<String, String> pathParams = new HashMap<>();
pathParams.put(TaskManagersHandler.TASK_MANAGER_ID_KEY, tmID.toString());
Routed routed = mock(Routed.class);
when(routed.pathParams()).thenReturn(pathParams);
when(routed.request()).thenReturn(new DefaultFullHttpRequest(HttpVersion.HTTP_1_1, HttpMethod.GET, "/taskmanagers/" + tmID + "/log"));
handler.respondAsLeader(ctx, routed, jobManagerGateway);
Assert.assertEquals("Fetching TaskManager log failed.", exception.get());
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class MetricFetcherTest method testUpdate.
@Test
public void testUpdate() throws Exception {
// ========= setup TaskManager =================================================================================
JobID jobID = new JobID();
InstanceID tmID = new InstanceID();
ResourceID tmRID = new ResourceID(tmID.toString());
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
when(taskManagerGateway.getAddress()).thenReturn("/tm/address");
Instance taskManager = mock(Instance.class);
when(taskManager.getTaskManagerGateway()).thenReturn(taskManagerGateway);
when(taskManager.getId()).thenReturn(tmID);
when(taskManager.getTaskManagerID()).thenReturn(tmRID);
// ========= setup JobManager ==================================================================================
JobDetails details = mock(JobDetails.class);
when(details.getJobId()).thenReturn(jobID);
ActorGateway jobManagerGateway = mock(ActorGateway.class);
Object registeredTaskManagersAnswer = new JobManagerMessages.RegisteredTaskManagers(JavaConverters.collectionAsScalaIterableConverter(Collections.singletonList(taskManager)).asScala());
when(jobManagerGateway.ask(isA(RequestJobDetails.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MultipleJobsDetails(new JobDetails[0], new JobDetails[0])));
when(jobManagerGateway.ask(isA(JobManagerMessages.RequestRegisteredTaskManagers$.class), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(registeredTaskManagersAnswer));
when(jobManagerGateway.path()).thenReturn("/jm/address");
JobManagerRetriever retriever = mock(JobManagerRetriever.class);
when(retriever.getJobManagerGatewayAndWebPort()).thenReturn(Option.apply(new scala.Tuple2<ActorGateway, Integer>(jobManagerGateway, 0)));
// ========= setup QueryServices ================================================================================
Object requestMetricsAnswer = createRequestDumpAnswer(tmID, jobID);
final ActorRef jmQueryService = mock(ActorRef.class);
final ActorRef tmQueryService = mock(ActorRef.class);
ActorSystem actorSystem = mock(ActorSystem.class);
when(actorSystem.actorFor(eq("/jm/" + METRIC_QUERY_SERVICE_NAME))).thenReturn(jmQueryService);
when(actorSystem.actorFor(eq("/tm/" + METRIC_QUERY_SERVICE_NAME + "_" + tmRID.getResourceIdString()))).thenReturn(tmQueryService);
MetricFetcher.BasicGateway jmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
when(jmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful((Object) new MetricDumpSerialization.MetricSerializationResult(new byte[0], 0, 0, 0, 0)));
MetricFetcher.BasicGateway tmQueryServiceGateway = mock(MetricFetcher.BasicGateway.class);
when(tmQueryServiceGateway.ask(any(MetricQueryService.getCreateDump().getClass()), any(FiniteDuration.class))).thenReturn(Future$.MODULE$.successful(requestMetricsAnswer));
whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {
@Override
public boolean equals(Object o) {
return o == jmQueryService;
}
})).thenReturn(jmQueryServiceGateway);
whenNew(MetricFetcher.BasicGateway.class).withArguments(eq(new Object() {
@Override
public boolean equals(Object o) {
return o == tmQueryService;
}
})).thenReturn(tmQueryServiceGateway);
// ========= start MetricFetcher testing =======================================================================
ExecutionContextExecutor context = ExecutionContext$.MODULE$.fromExecutor(new CurrentThreadExecutor());
MetricFetcher fetcher = new MetricFetcher(actorSystem, retriever, context);
// verify that update fetches metrics and updates the store
fetcher.update();
MetricStore store = fetcher.getMetricStore();
synchronized (store) {
assertEquals("7", store.jobManager.metrics.get("abc.hist_min"));
assertEquals("6", store.jobManager.metrics.get("abc.hist_max"));
assertEquals("4.0", store.jobManager.metrics.get("abc.hist_mean"));
assertEquals("0.5", store.jobManager.metrics.get("abc.hist_median"));
assertEquals("5.0", store.jobManager.metrics.get("abc.hist_stddev"));
assertEquals("0.75", store.jobManager.metrics.get("abc.hist_p75"));
assertEquals("0.9", store.jobManager.metrics.get("abc.hist_p90"));
assertEquals("0.95", store.jobManager.metrics.get("abc.hist_p95"));
assertEquals("0.98", store.jobManager.metrics.get("abc.hist_p98"));
assertEquals("0.99", store.jobManager.metrics.get("abc.hist_p99"));
assertEquals("0.999", store.jobManager.metrics.get("abc.hist_p999"));
assertEquals("x", store.getTaskManagerMetricStore(tmID.toString()).metrics.get("abc.gauge"));
assertEquals("5.0", store.getJobMetricStore(jobID.toString()).metrics.get("abc.jc"));
assertEquals("2", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.abc.tc"));
assertEquals("1", store.getTaskMetricStore(jobID.toString(), "taskid").metrics.get("2.opname.abc.oc"));
}
}
use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.
the class Execution method triggerCheckpoint.
/**
* Trigger a new checkpoint on the task of this execution.
*
* @param checkpointId of th checkpoint to trigger
* @param timestamp of the checkpoint to trigger
* @param checkpointOptions of the checkpoint to trigger
*/
public void triggerCheckpoint(long checkpointId, long timestamp, CheckpointOptions checkpointOptions) {
final SimpleSlot slot = assignedResource;
if (slot != null) {
final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
taskManagerGateway.triggerCheckpoint(attemptId, getVertex().getJobId(), checkpointId, timestamp, checkpointOptions);
} else {
LOG.debug("The execution has no slot assigned. This indicates that the execution is " + "no longer running.");
}
}
Aggregations