Search in sources :

Example 1 with CheckpointCoordinator

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinator in project flink by apache.

the class ExecutionGraph method postRunCleanup.

/**
 * Detaches the checkpoint coordinator from this graph and shuts it down.
 *
 * <p>The field is cleared before the shutdown call, so the reference is gone
 * even when shutting down fails. Any exception raised by the shutdown is
 * logged and not propagated.
 */
private void postRunCleanup() {
    // Take the coordinator out of the field first; the swap itself cannot throw.
    final CheckpointCoordinator coordinator = this.checkpointCoordinator;
    this.checkpointCoordinator = null;

    if (coordinator == null) {
        // Nothing was registered, so there is nothing to shut down.
        return;
    }

    try {
        coordinator.shutdown(state);
    } catch (Exception e) {
        LOG.error("Error while cleaning up after execution", e);
    }
}
Also used : CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) StoppingException(org.apache.flink.runtime.StoppingException) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) JobException(org.apache.flink.runtime.JobException) NoSuchElementException(java.util.NoSuchElementException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with CheckpointCoordinator

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinator in project flink by apache.

the class JobCancellationWithSavepointHandlersTest method testSavepointDirectoryConfiguration.

/**
 * Checks which savepoint target directory the trigger handler forwards to the
 * job manager: an explicit {@code targetDirectory} path parameter, the default
 * directory given to the handlers, or a failure when neither is available.
 */
@Test
public void testSavepointDirectoryConfiguration() throws Exception {
    final long timeout = 128288238L;
    final JobID jobId = new JobID();
    final FiniteDuration expectedAskTimeout = FiniteDuration.apply(timeout, "ms");
    final Map<String, String> noQueryParams = Collections.<String, String>emptyMap();

    // Mock chain: holder -> execution graph -> checkpoint coordinator (with timeout).
    CheckpointCoordinator coordinator = mock(CheckpointCoordinator.class);
    when(coordinator.getCheckpointTimeout()).thenReturn(timeout);
    ExecutionGraph executionGraph = mock(ExecutionGraph.class);
    when(executionGraph.getCheckpointCoordinator()).thenReturn(coordinator);
    ExecutionGraphHolder graphHolder = mock(ExecutionGraphHolder.class);
    when(graphHolder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(executionGraph);

    // Job manager that answers every ask with an already completed success.
    ActorGateway jobManager = mock(ActorGateway.class);
    Future<Object> reply = Futures.successful((Object) new CancellationSuccess(jobId, null));
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(reply);

    Map<String, String> pathParams = new HashMap<>();
    pathParams.put("jobid", jobId.toString());

    JobCancellationWithSavepointHandlers.TriggerHandler withDefaultDir =
        new JobCancellationWithSavepointHandlers(graphHolder, EC, "the-default-directory").getTriggerHandler();

    // 1. An explicit targetDirectory path parameter is forwarded.
    pathParams.put("targetDirectory", "custom-directory");
    withDefaultDir.handleRequest(pathParams, noQueryParams, jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), eq(expectedAskTimeout));

    // 2. Without the parameter, the configured default directory is forwarded.
    pathParams.remove("targetDirectory");
    withDefaultDir.handleRequest(pathParams, noQueryParams, jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "the-default-directory")), eq(expectedAskTimeout));

    // 3. No parameter and no default: the request must fail.
    JobCancellationWithSavepointHandlers.TriggerHandler withoutDefaultDir =
        new JobCancellationWithSavepointHandlers(graphHolder, EC, null).getTriggerHandler();
    try {
        withoutDefaultDir.handleRequest(pathParams, noQueryParams, jobManager);
        fail("Did not throw expected test Exception");
    } catch (Exception e) {
        // The cause is expected to be an IllegalStateException naming the config key.
        Throwable rootCause = e.getCause();
        String message = ((IllegalStateException) rootCause).getMessage();
        boolean mentionsConfigKey = message.contains(ConfigConstants.SAVEPOINT_DIRECTORY_KEY);
        assertEquals(true, mentionsConfigKey);
    }
}
Also used : HashMap(java.util.HashMap) FiniteDuration(scala.concurrent.duration.FiniteDuration) CancelJobWithSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint) ExecutionGraphHolder(org.apache.flink.runtime.webmonitor.ExecutionGraphHolder) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 3 with CheckpointCoordinator

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinator in project flink by apache.

the class JobCancellationWithSavepointHandlersTest method testAskTimeoutEqualsCheckpointTimeout.

/**
 * Checks that the trigger handler asks the job manager with a timeout equal to
 * the checkpoint timeout reported by the coordinator.
 * Otherwise, AskTimeoutExceptions are bound to happen for large state.
 */
@Test
public void testAskTimeoutEqualsCheckpointTimeout() throws Exception {
    final long timeout = 128288238L;
    final JobID jobId = new JobID();

    // Mock chain: holder -> execution graph -> checkpoint coordinator (with timeout).
    CheckpointCoordinator coordinator = mock(CheckpointCoordinator.class);
    when(coordinator.getCheckpointTimeout()).thenReturn(timeout);
    ExecutionGraph executionGraph = mock(ExecutionGraph.class);
    when(executionGraph.getCheckpointCoordinator()).thenReturn(coordinator);
    ExecutionGraphHolder graphHolder = mock(ExecutionGraphHolder.class);
    when(graphHolder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(executionGraph);

    // Job manager that answers every ask with an already completed success.
    ActorGateway jobManager = mock(ActorGateway.class);
    Future<Object> reply = Futures.successful((Object) new CancellationSuccess(jobId, null));
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(reply);

    Map<String, String> pathParams = new HashMap<>();
    pathParams.put("jobid", jobId.toString());
    pathParams.put("targetDirectory", "placeholder");

    JobCancellationWithSavepointHandlers.TriggerHandler triggerHandler =
        new JobCancellationWithSavepointHandlers(graphHolder, EC).getTriggerHandler();
    triggerHandler.handleRequest(pathParams, Collections.<String, String>emptyMap(), jobManager);

    // The ask must carry the coordinator's checkpoint timeout, in milliseconds.
    FiniteDuration expectedAskTimeout = FiniteDuration.apply(timeout, "ms");
    verify(jobManager).ask(any(CancelJobWithSavepoint.class), eq(expectedAskTimeout));
}
Also used : HashMap(java.util.HashMap) FiniteDuration(scala.concurrent.duration.FiniteDuration) CancelJobWithSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint) ExecutionGraphHolder(org.apache.flink.runtime.webmonitor.ExecutionGraphHolder) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 4 with CheckpointCoordinator

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinator in project flink by apache.

the class JobCancellationWithSavepointHandlersTest method testFailedCancellation.

/**
 * Checks the in-progress handler's response when the job manager's reply to
 * the cancel-with-savepoint request is a failed future.
 */
@Test
public void testFailedCancellation() throws Exception {
    final JobID jobId = new JobID();
    final Map<String, String> noQueryParams = Collections.<String, String>emptyMap();

    // Mock chain: holder -> execution graph -> checkpoint coordinator.
    CheckpointCoordinator coordinator = mock(CheckpointCoordinator.class);
    ExecutionGraph executionGraph = mock(ExecutionGraph.class);
    when(executionGraph.getCheckpointCoordinator()).thenReturn(coordinator);
    ExecutionGraphHolder graphHolder = mock(ExecutionGraphHolder.class);
    when(graphHolder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(executionGraph);

    JobCancellationWithSavepointHandlers handlers = new JobCancellationWithSavepointHandlers(graphHolder, EC);
    JobCancellationWithSavepointHandlers.TriggerHandler triggerHandler = handlers.getTriggerHandler();
    JobCancellationWithSavepointHandlers.InProgressHandler progressHandler = handlers.getInProgressHandler();

    Map<String, String> pathParams = new HashMap<>();
    pathParams.put("jobid", jobId.toString());
    pathParams.put("targetDirectory", "custom-directory");

    // Job manager whose reply is an already failed future.
    ActorGateway jobManager = mock(ActorGateway.class);
    Future<Object> failedReply = Futures.failed(new Exception("Test Exception"));
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(failedReply);

    // Trigger the cancellation.
    triggerHandler.handleRequest(pathParams, noQueryParams, jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));

    // Query progress of request 1.
    pathParams.put("requestId", "1");
    FullHttpResponse response = progressHandler.handleRequest(pathParams, noQueryParams, jobManager);

    // Status line and headers.
    assertEquals(HttpResponseStatus.INTERNAL_SERVER_ERROR, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    String expectedContentLength = Integer.toString(response.content().readableBytes());
    assertEquals(expectedContentLength, response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));

    // JSON body reports the failure and its cause.
    String body = response.content().toString(Charset.forName("UTF-8"));
    JsonNode payload = new ObjectMapper().readTree(body);
    assertEquals("failed", payload.get("status").getValueAsText());
    assertEquals("1", payload.get("request-id").getValueAsText());
    assertEquals("Test Exception", payload.get("cause").getValueAsText());
}
Also used : HashMap(java.util.HashMap) FiniteDuration(scala.concurrent.duration.FiniteDuration) CancelJobWithSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint) JsonNode(org.codehaus.jackson.JsonNode) ExecutionGraphHolder(org.apache.flink.runtime.webmonitor.ExecutionGraphHolder) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) FullHttpResponse(io.netty.handler.codec.http.FullHttpResponse) JobID(org.apache.flink.api.common.JobID) ObjectMapper(org.codehaus.jackson.map.ObjectMapper) Test(org.junit.Test)

Example 5 with CheckpointCoordinator

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinator in project flink by apache.

the class JobCancellationWithSavepointHandlersTest method testTriggerNewRequest.

/**
 * Tests triggering a new request and monitoring it.
 *
 * <p>The scenario runs in order against the same handlers instance: trigger,
 * trigger again with identical parameters, query progress while the reply is
 * pending, complete the reply, query twice more, and finally query an unknown
 * request ID. Each step asserts the HTTP status, headers and JSON body.
 */
@Test
public void testTriggerNewRequest() throws Exception {
    JobID jobId = new JobID();
    // Mock chain: holder -> execution graph -> checkpoint coordinator.
    ExecutionGraphHolder holder = mock(ExecutionGraphHolder.class);
    ExecutionGraph graph = mock(ExecutionGraph.class);
    CheckpointCoordinator coord = mock(CheckpointCoordinator.class);
    when(holder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(graph);
    when(graph.getCheckpointCoordinator()).thenReturn(coord);
    JobCancellationWithSavepointHandlers handlers = new JobCancellationWithSavepointHandlers(holder, EC);
    JobCancellationWithSavepointHandlers.TriggerHandler trigger = handlers.getTriggerHandler();
    JobCancellationWithSavepointHandlers.InProgressHandler progress = handlers.getInProgressHandler();
    Map<String, String> params = new HashMap<>();
    params.put("jobid", jobId.toString());
    params.put("targetDirectory", "custom-directory");
    ActorGateway jobManager = mock(ActorGateway.class);
    // Pending reply: the promise is only completed at the "Complete" step below
    Promise<Object> promise = new Promise.DefaultPromise<>();
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(promise);
    // Trigger
    FullHttpResponse response = trigger.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));
    // Expected location of the in-progress resource for request 1
    String location = String.format("/jobs/%s/cancel-with-savepoint/in-progress/1", jobId);
    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    assertEquals(location, response.headers().get(HttpHeaders.Names.LOCATION));
    String json = response.content().toString(Charset.forName("UTF-8"));
    JsonNode root = new ObjectMapper().readTree(json);
    assertEquals("accepted", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals(location, root.get("location").getValueAsText());
    // Trigger again: same status, request ID and location are expected as for the first trigger
    response = trigger.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    assertEquals(location, response.headers().get(HttpHeaders.Names.LOCATION));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("accepted", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals(location, root.get("location").getValueAsText());
    // Only single actual request
    // NOTE(review): relies on verify(...) without an explicit mode meaning exactly one invocation (Mockito default) - confirm
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));
    // Query progress while the promise is still incomplete
    params.put("requestId", "1");
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.ACCEPTED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("in-progress", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    // Complete the pending reply with a savepoint path, then query again
    promise.success(new CancellationSuccess(jobId, "_path-savepoint_"));
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.CREATED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("success", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals("_path-savepoint_", root.get("savepoint-path").getValueAsText());
    // Query again, keep recent history: the same completed result is expected a second time
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.CREATED, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("success", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals("_path-savepoint_", root.get("savepoint-path").getValueAsText());
    // Query for unknown request: a request ID that was never issued must yield a failure response
    params.put("requestId", "9929");
    response = progress.handleRequest(params, Collections.<String, String>emptyMap(), jobManager);
    assertEquals(HttpResponseStatus.BAD_REQUEST, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    assertEquals(Integer.toString(response.content().readableBytes()), response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));
    json = response.content().toString(Charset.forName("UTF-8"));
    root = new ObjectMapper().readTree(json);
    assertEquals("failed", root.get("status").getValueAsText());
    assertEquals("9929", root.get("request-id").getValueAsText());
    assertEquals("Unknown job/request ID", root.get("cause").getValueAsText());
}
Also used : HashMap(java.util.HashMap) FiniteDuration(scala.concurrent.duration.FiniteDuration) CancelJobWithSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint) JsonNode(org.codehaus.jackson.JsonNode) ExecutionGraphHolder(org.apache.flink.runtime.webmonitor.ExecutionGraphHolder) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) FullHttpResponse(io.netty.handler.codec.http.FullHttpResponse) JobID(org.apache.flink.api.common.JobID) ObjectMapper(org.codehaus.jackson.map.ObjectMapper) Test(org.junit.Test)

Aggregations

CheckpointCoordinator (org.apache.flink.runtime.checkpoint.CheckpointCoordinator)8 HashMap (java.util.HashMap)4 JobID (org.apache.flink.api.common.JobID)4 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)4 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)4 CancelJobWithSavepoint (org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint)4 ExecutionGraphHolder (org.apache.flink.runtime.webmonitor.ExecutionGraphHolder)4 Test (org.junit.Test)4 FiniteDuration (scala.concurrent.duration.FiniteDuration)4 CancellationSuccess (org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess)3 FullHttpResponse (io.netty.handler.codec.http.FullHttpResponse)2 IOException (java.io.IOException)2 RpcMethod (org.apache.flink.runtime.rpc.RpcMethod)2 SerializedThrowable (org.apache.flink.runtime.util.SerializedThrowable)2 JsonNode (org.codehaus.jackson.JsonNode)2 ObjectMapper (org.codehaus.jackson.map.ObjectMapper)2 NoSuchElementException (java.util.NoSuchElementException)1 ExecutionException (java.util.concurrent.ExecutionException)1 TimeoutException (java.util.concurrent.TimeoutException)1 JobException (org.apache.flink.runtime.JobException)1