use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.
the class StackTraceSampleCoordinatorITCase method testTaskClearedWhileSampling.
/**
 * Checks that a stack trace sample whose task is cleared while sampling is still
 * in progress completes with a partial success response.
 *
 * <p>Cancelling the job too early makes the sample future fail, so the
 * submit / sample / cancel cycle is retried up to ten times, doubling the pause
 * before the cancel on every attempt.
 */
@Test
public void testTaskClearedWhileSampling() throws Exception {
    new JavaTestKit(testActorSystem) {
        {
            final FiniteDuration timeLimit = new FiniteDuration(60, TimeUnit.SECONDS);

            // A job consisting of a single vertex running BlockingNoOpInvokable.
            final JobGraph jobGraph = new JobGraph();
            final int parallelism = 1;
            final JobVertex blockingTask = new JobVertex("Task");
            blockingTask.setInvokableClass(BlockingNoOpInvokable.class);
            blockingTask.setParallelism(parallelism);
            jobGraph.addVertex(blockingTask);

            ActorGateway jobManager = null;
            ActorGateway taskManager = null;
            try {
                jobManager = TestingUtils.createJobManager(
                    testActorSystem,
                    TestingUtils.defaultExecutor(),
                    TestingUtils.defaultExecutor(),
                    new Configuration());

                final Configuration config = new Configuration();
                config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, parallelism);
                taskManager = TestingUtils.createTaskManager(
                    testActorSystem, jobManager, config, true, true);

                // jobManager is reassigned above, so the anonymous class below
                // needs a final alias to capture it.
                final ActorGateway jm = jobManager;

                new Within(timeLimit) {
                    @Override
                    protected void run() {
                        try {
                            final ActorGateway testActor = new AkkaActorGateway(getTestActor(), null);
                            final int maxAttempts = 10;
                            int pauseMillis = 100;
                            int attempt = 0;
                            boolean sampleCompleted = false;

                            // NOTE(review): if none of the attempts yields a completed
                            // sample, the loop simply ends and the test does not fail.
                            while (!sampleCompleted && attempt < maxAttempts) {
                                // Submit the job and block until every vertex is running.
                                JobClient.submitJobDetached(
                                    jm, config, jobGraph, timeLimit, ClassLoader.getSystemClassLoader());
                                jm.tell(new WaitForAllVerticesToBeRunning(jobGraph.getJobID()), testActor);
                                expectMsgEquals(new AllVerticesRunning(jobGraph.getJobID()));

                                // Fetch the ExecutionGraph and look up the vertex to sample.
                                jm.tell(new RequestExecutionGraph(jobGraph.getJobID()), testActor);
                                ExecutionGraphFound graphFound = expectMsgClass(ExecutionGraphFound.class);
                                ExecutionGraph executionGraph = (ExecutionGraph) graphFound.executionGraph();
                                ExecutionJobVertex sampledVertex =
                                    executionGraph.getJobVertex(blockingTask.getID());

                                StackTraceSampleCoordinator coordinator =
                                    new StackTraceSampleCoordinator(testActorSystem.dispatcher(), 60000);

                                // The second argument is deliberately huge (presumably the
                                // number of samples - confirm against triggerStackTraceSample)
                                // so that the job is likely to be removed during sampling.
                                Future<StackTraceSample> sampleFuture = coordinator.triggerStackTraceSample(
                                    sampledVertex.getTaskVertices(),
                                    21474700 * 100,
                                    Time.milliseconds(10L),
                                    0);

                                // Give the sampler a moment so that some samples are
                                // actually taken before the job is cancelled.
                                Thread.sleep(pauseMillis);

                                // Register for the removal notification first, then cancel.
                                scala.concurrent.Future<?> removeFuture = jm.ask(
                                    new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()),
                                    remaining());
                                jm.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));

                                try {
                                    // Throws on failure; a normal return means we received
                                    // the (partial) result and can stop retrying.
                                    sampleFuture.get(remaining().toMillis(), TimeUnit.MILLISECONDS);
                                    sampleCompleted = true;
                                } catch (Throwable ignored) {
                                    // The job was cancelled too quickly; retry with a longer pause.
                                } finally {
                                    // Always wait until the job is gone before the next round.
                                    Await.ready(removeFuture, remaining());
                                }

                                attempt++;
                                pauseMillis *= 2;
                            }
                        } catch (Exception e) {
                            e.printStackTrace();
                            fail(e.getMessage());
                        }
                    }
                };
            } finally {
                TestingUtils.stopActor(jobManager);
                TestingUtils.stopActor(taskManager);
            }
        }
    };
}
use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.
the class JobCancellationWithSavepointHandlersTest method testTriggerNewRequest.
/**
 * Tests triggering a new cancel-with-savepoint request and monitoring it.
 *
 * <p>The scenario walks through: the first trigger, a repeated trigger (answered
 * with the same request ID while only one ask reaches the job manager), a
 * progress query while the request is pending, progress queries after the
 * request completed, and finally a query for an unknown request ID.
 */
@Test
public void testTriggerNewRequest() throws Exception {
    final JobID jobId = new JobID();

    // Mocked execution graph lookup that exposes a checkpoint coordinator.
    ExecutionGraphHolder graphHolder = mock(ExecutionGraphHolder.class);
    ExecutionGraph executionGraph = mock(ExecutionGraph.class);
    CheckpointCoordinator checkpointCoordinator = mock(CheckpointCoordinator.class);
    when(graphHolder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(executionGraph);
    when(executionGraph.getCheckpointCoordinator()).thenReturn(checkpointCoordinator);

    JobCancellationWithSavepointHandlers handlers = new JobCancellationWithSavepointHandlers(graphHolder, EC);
    JobCancellationWithSavepointHandlers.TriggerHandler triggerHandler = handlers.getTriggerHandler();
    JobCancellationWithSavepointHandlers.InProgressHandler progressHandler = handlers.getInProgressHandler();

    Map<String, String> pathParams = new HashMap<>();
    pathParams.put("jobid", jobId.toString());
    pathParams.put("targetDirectory", "custom-directory");

    // Values shared by all of the checks below.
    final Map<String, String> emptyParams = Collections.<String, String>emptyMap();
    final Charset utf8 = Charset.forName("UTF-8");
    final ObjectMapper mapper = new ObjectMapper();
    final String jsonContentType = "application/json";

    // The job manager answers the cancel request with a promise that this test
    // completes by hand further down.
    ActorGateway jobManager = mock(ActorGateway.class);
    Promise<Object> cancelPromise = new Promise.DefaultPromise<>();
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(cancelPromise);

    // --- First trigger ---
    FullHttpResponse response = triggerHandler.handleRequest(pathParams, emptyParams, jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));

    final String expectedLocation = String.format("/jobs/%s/cancel-with-savepoint/in-progress/1", jobId);

    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals(jsonContentType, response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(
        Integer.toString(response.content().readableBytes()),
        response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    assertEquals(expectedLocation, response.headers().get(HttpHeaders.Names.LOCATION));

    String body = response.content().toString(utf8);
    JsonNode rootNode = mapper.readTree(body);
    assertEquals("accepted", rootNode.get("status").getValueAsText());
    assertEquals("1", rootNode.get("request-id").getValueAsText());
    assertEquals(expectedLocation, rootNode.get("location").getValueAsText());

    // --- Second trigger: same answer as before ---
    response = triggerHandler.handleRequest(pathParams, emptyParams, jobManager);

    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals(jsonContentType, response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(
        Integer.toString(response.content().readableBytes()),
        response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    assertEquals(expectedLocation, response.headers().get(HttpHeaders.Names.LOCATION));

    body = response.content().toString(utf8);
    rootNode = mapper.readTree(body);
    assertEquals("accepted", rootNode.get("status").getValueAsText());
    assertEquals("1", rootNode.get("request-id").getValueAsText());
    assertEquals(expectedLocation, rootNode.get("location").getValueAsText());

    // Despite two triggers, only a single request was sent to the job manager.
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));

    // --- Progress query while the promise is still open ---
    pathParams.put("requestId", "1");
    response = progressHandler.handleRequest(pathParams, emptyParams, jobManager);

    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals(jsonContentType, response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(
        Integer.toString(response.content().readableBytes()),
        response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));

    body = response.content().toString(utf8);
    rootNode = mapper.readTree(body);
    assertEquals("in-progress", rootNode.get("status").getValueAsText());
    assertEquals("1", rootNode.get("request-id").getValueAsText());

    // --- Complete the request and query again ---
    cancelPromise.success(new CancellationSuccess(jobId, "_path-savepoint_"));
    response = progressHandler.handleRequest(pathParams, emptyParams, jobManager);

    assertEquals(HttpResponseStatus.CREATED, response.getStatus());
    assertEquals(jsonContentType, response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(
        Integer.toString(response.content().readableBytes()),
        response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));

    body = response.content().toString(utf8);
    rootNode = mapper.readTree(body);
    assertEquals("success", rootNode.get("status").getValueAsText());
    assertEquals("1", rootNode.get("request-id").getValueAsText());
    assertEquals("_path-savepoint_", rootNode.get("savepoint-path").getValueAsText());

    // --- Query once more: the completed result is still served ---
    response = progressHandler.handleRequest(pathParams, emptyParams, jobManager);

    assertEquals(HttpResponseStatus.CREATED, response.getStatus());
    assertEquals(jsonContentType, response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(
        Integer.toString(response.content().readableBytes()),
        response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));

    body = response.content().toString(utf8);
    rootNode = mapper.readTree(body);
    assertEquals("success", rootNode.get("status").getValueAsText());
    assertEquals("1", rootNode.get("request-id").getValueAsText());
    assertEquals("_path-savepoint_", rootNode.get("savepoint-path").getValueAsText());

    // --- Query for a request ID that was never issued ---
    pathParams.put("requestId", "9929");
    response = progressHandler.handleRequest(pathParams, emptyParams, jobManager);

    assertEquals(HttpResponseStatus.BAD_REQUEST, response.getStatus());
    assertEquals(jsonContentType, response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(
        Integer.toString(response.content().readableBytes()),
        response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));

    body = response.content().toString(utf8);
    rootNode = mapper.readTree(body);
    assertEquals("failed", rootNode.get("status").getValueAsText());
    assertEquals("9929", rootNode.get("request-id").getValueAsText());
    assertEquals("Unknown job/request ID", rootNode.get("cause").getValueAsText());
}
use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.
the class CoordinatorShutdownTest method testCoordinatorShutsDownOnFailure.
/**
 * Runs a job with snapshot settings on a local mini cluster using
 * {@code FailingBlockingInvokable}, waits for the execution graph to finish and
 * then checks that the graph's checkpoint coordinator is either absent or shut
 * down.
 */
@Test
public void testCoordinatorShutsDownOnFailure() {
    LocalFlinkMiniCluster cluster = null;
    try {
        // One task manager offering a single slot.
        Configuration clusterConfig = new Configuration();
        clusterConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
        clusterConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 1);
        cluster = new LocalFlinkMiniCluster(clusterConfig, true);
        cluster.start();

        // Single-vertex job graph with snapshot settings attached.
        JobVertex testVertex = new JobVertex("Test Vertex");
        testVertex.setInvokableClass(FailingBlockingInvokable.class);
        List<JobVertexID> vertexIds = Collections.singletonList(testVertex.getID());

        JobGraph testGraph = new JobGraph("test job", testVertex);
        testGraph.setSnapshotSettings(
            new JobSnapshottingSettings(
                vertexIds,
                vertexIds,
                vertexIds,
                5000,
                60000,
                0L,
                Integer.MAX_VALUE,
                ExternalizedCheckpointSettings.none(),
                null,
                true));

        ActorGateway jmGateway = cluster.getLeaderGateway(TestingUtils.TESTING_DURATION());
        FiniteDuration askTimeout = new FiniteDuration(60, TimeUnit.SECONDS);

        // The submission itself succeeds; afterwards the job stays blocked
        // inside the invokable until it is unblocked below.
        JobManagerMessages.SubmitJob submitMessage =
            new JobManagerMessages.SubmitJob(testGraph, ListeningBehaviour.EXECUTION_RESULT);
        Future<Object> submitFuture = jmGateway.ask(submitMessage, askTimeout);
        Await.result(submitFuture, askTimeout);

        // Ask the job manager for the job and keep the ExecutionGraph reference.
        Future<Object> jobRequestFuture =
            jmGateway.ask(new JobManagerMessages.RequestJob(testGraph.getJobID()), askTimeout);
        Object jobResponse = Await.result(jobRequestFuture, askTimeout);
        JobManagerMessages.JobFound jobFound = (JobManagerMessages.JobFound) jobResponse;
        ExecutionGraph graph = (ExecutionGraph) jobFound.executionGraph();
        assertNotNull(graph);

        // Release the invokable and wait for the graph to finish.
        FailingBlockingInvokable.unblock();
        graph.waitUntilFinished();

        // The coordinator must be gone or shut down by now.
        CheckpointCoordinator checkpointCoordinator = graph.getCheckpointCoordinator();
        assertTrue(checkpointCoordinator == null || checkpointCoordinator.isShutdown());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        if (cluster != null) {
            cluster.shutdown();
            cluster.awaitTermination();
        }
    }
}
use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.
the class ExecutionGraphCheckpointCoordinatorTest method testSuspendCheckpointCoordinator.
/**
 * Tests that suspending the execution graph shuts down both the checkpoint ID
 * counter and the completed checkpoint store exactly once, passing
 * {@link JobStatus#SUSPENDED} as the job status.
 */
@Test
public void testSuspendCheckpointCoordinator() throws Exception {
    CheckpointIDCounter idCounter = mock(CheckpointIDCounter.class);
    CompletedCheckpointStore checkpointStore = mock(CompletedCheckpointStore.class);
    ExecutionGraph executionGraph = createExecutionGraphAndEnableCheckpointing(idCounter, checkpointStore);

    Exception suspensionCause = new Exception("Test Exception");
    executionGraph.suspend(suspensionCause);

    // Each component must have seen a single shutdown call with status SUSPENDED.
    verify(idCounter, times(1)).shutdown(Matchers.eq(JobStatus.SUSPENDED));
    verify(checkpointStore, times(1)).shutdown(Matchers.eq(JobStatus.SUSPENDED));
}
use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.
the class LeaderChangeJobRecoveryTest method testNotRestartedWhenLosingLeadership.
/**
 * Tests that the job is not restarted, or at least terminates eventually, when
 * the JobManager loses its leadership.
 *
 * <p>After leadership is revoked, the test waits up to 30000 (presumably
 * milliseconds - confirm against {@code TerminalJobStatusListener}) for the
 * execution graph to reach a terminal state.
 *
 * @throws Exception if any cluster interaction or wait fails or times out
 */
@Test
public void testNotRestartedWhenLosingLeadership() throws Exception {
    // Make JobManager 0 the leader and announce it to the retrieval listeners.
    UUID sessionId = UUID.randomUUID();
    cluster.grantLeadership(0, sessionId);
    cluster.notifyRetrievalListeners(0, sessionId);
    cluster.waitForTaskManagersToBeRegistered(timeout);

    cluster.submitJobDetached(job);
    ActorGateway leader = cluster.getLeaderGateway(timeout);

    // Block until every vertex of the job is running or finished.
    Future<Object> verticesReady = leader.ask(
        new TestingJobManagerMessages.WaitForAllVerticesToBeRunningOrFinished(job.getJobID()),
        timeout);
    Await.ready(verticesReady, timeout);

    // Retrieve the job's ExecutionGraph from the leader.
    Future<Object> graphRequest = leader.ask(
        new TestingJobManagerMessages.RequestExecutionGraph(job.getJobID()),
        timeout);
    Object rawGraphResponse = Await.result(graphRequest, timeout);
    TestingJobManagerMessages.ResponseExecutionGraph graphResponse =
        (TestingJobManagerMessages.ResponseExecutionGraph) rawGraphResponse;
    assertTrue(graphResponse instanceof TestingJobManagerMessages.ExecutionGraphFound);

    TestingJobManagerMessages.ExecutionGraphFound graphFound =
        (TestingJobManagerMessages.ExecutionGraphFound) graphResponse;
    ExecutionGraph executionGraph = (ExecutionGraph) graphFound.executionGraph();

    // Register the listener before revoking leadership, then wait for a terminal state.
    TerminalJobStatusListener terminalListener = new TerminalJobStatusListener();
    executionGraph.registerJobStatusListener(terminalListener);

    cluster.revokeLeadership();
    terminalListener.waitForTerminalState(30000);
}
Aggregations