use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class JobCancellationWithSavepointHandlersTest method testFailedCancellation.
/**
 * Tests the response of the in-progress handler when the cancel-with-savepoint request fails.
 */
@Test
public void testFailedCancellation() throws Exception {
    final JobID jobId = new JobID();

    // Mocked graph lookup: the job resolves to an execution graph that exposes a checkpoint coordinator.
    final ExecutionGraphHolder graphHolder = mock(ExecutionGraphHolder.class);
    final ExecutionGraph executionGraph = mock(ExecutionGraph.class);
    final CheckpointCoordinator checkpointCoordinator = mock(CheckpointCoordinator.class);
    when(graphHolder.getExecutionGraph(eq(jobId), any(ActorGateway.class))).thenReturn(executionGraph);
    when(executionGraph.getCheckpointCoordinator()).thenReturn(checkpointCoordinator);

    final JobCancellationWithSavepointHandlers handlers = new JobCancellationWithSavepointHandlers(graphHolder, EC);
    final JobCancellationWithSavepointHandlers.TriggerHandler triggerHandler = handlers.getTriggerHandler();
    final JobCancellationWithSavepointHandlers.InProgressHandler inProgressHandler = handlers.getInProgressHandler();

    final Map<String, String> pathParams = new HashMap<>();
    pathParams.put("jobid", jobId.toString());
    pathParams.put("targetDirectory", "custom-directory");
    final Map<String, String> noQueryParams = Collections.<String, String>emptyMap();

    // The mocked job manager answers every ask with an already failed future.
    final ActorGateway jobManager = mock(ActorGateway.class);
    final Future<Object> failedFuture = Futures.failed(new Exception("Test Exception"));
    when(jobManager.ask(any(Object.class), any(FiniteDuration.class))).thenReturn(failedFuture);

    // Trigger the cancellation and check that the expected message was sent to the job manager.
    triggerHandler.handleRequest(pathParams, noQueryParams, jobManager);
    verify(jobManager).ask(eq(new CancelJobWithSavepoint(jobId, "custom-directory")), any(FiniteDuration.class));

    // Query the progress of request "1".
    pathParams.put("requestId", "1");
    final FullHttpResponse response = inProgressHandler.handleRequest(pathParams, noQueryParams, jobManager);

    // Status line and headers.
    assertEquals(HttpResponseStatus.INTERNAL_SERVER_ERROR, response.getStatus());
    assertEquals("application/json", response.headers().get(HttpHeaders.Names.CONTENT_TYPE));
    final String expectedContentLength = Integer.toString(response.content().readableBytes());
    assertEquals(expectedContentLength, response.headers().get(HttpHeaders.Names.CONTENT_LENGTH));

    // JSON body: failure status, request id and the message of the exception that failed the future.
    final String body = response.content().toString(Charset.forName("UTF-8"));
    final JsonNode root = new ObjectMapper().readTree(body);
    assertEquals("failed", root.get("status").getValueAsText());
    assertEquals("1", root.get("request-id").getValueAsText());
    assertEquals("Test Exception", root.get("cause").getValueAsText());
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class RescalingITCase method testSavepointRescalingPartitionedOperatorState.
/**
 * Tests rescaling of partitioned operator state through a savepoint.
 *
 * <p>A job is started with one parallelism, a savepoint is triggered (retrying until it succeeds or the
 * deadline expires), the job is cancelled, and a second job is restored from that savepoint with the other
 * parallelism. Afterwards the summed {@code CHECK_CORRECT_SNAPSHOT} counters must equal the summed
 * {@code CHECK_CORRECT_RESTORE} counters; for the broadcast variant the expected sum is multiplied by the
 * parallelism of the restored job.
 *
 * @param scaleOut selects the two parallelisms: when {@code true} the first job runs with {@code numSlots}
 *                 and the restored job with {@code numSlots / 2}; when {@code false} it is the other way
 *                 around. NOTE(review): this mapping looks inverted relative to the parameter name - confirm.
 * @param checkpointMethod selects which source implementation's counters are reset and verified
 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut, OperatorCheckpointMethod checkpointMethod) throws Exception {
    final int initialParallelism = scaleOut ? numSlots : numSlots / 2;
    final int restoredParallelism = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;

    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();

    JobID jobID = null;
    ActorGateway jobManager = null;

    // Both CHECKPOINTED_FUNCTION variants record their counters on PartitionedStateSource.
    final boolean broadcast = checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST;
    final boolean usesCheckpointedFunction =
        checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION || broadcast;

    // Size the counters for the larger of the two parallelisms.
    final int counterSize = Math.max(initialParallelism, restoredParallelism);
    if (usesCheckpointedFunction) {
        PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];
    } else {
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE = new int[counterSize];
    }

    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());

        JobGraph jobGraph = createJobGraphWithOperatorState(initialParallelism, maxParallelism, checkpointMethod);
        jobID = jobGraph.getJobID();
        cluster.submitJobDetached(jobGraph);

        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();

        // Keep triggering savepoints until one succeeds or the deadline has passed.
        Object savepointResponse = null;
        boolean savepointTaken = false;
        while (!savepointTaken && deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture = jobManager.ask(
                new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
            savepointResponse = Await.result(savepointPathFuture, new FiniteDuration(10, TimeUnit.SECONDS));
            savepointTaken = savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess;
            if (!savepointTaken) {
                System.out.println(savepointResponse);
            }
        }
        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);
        final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();

        // Register for the removal notification before cancelling the job.
        Future<Object> jobRemovedFuture = jobManager.ask(
            new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());
        Future<Object> cancellationResponseFuture = jobManager.ask(
            new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());
        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
        Await.ready(jobRemovedFuture, deadline.timeLeft());

        // job successfully removed
        jobID = null;

        // Restore from the savepoint with the other parallelism and run the job to completion.
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(restoredParallelism, maxParallelism, checkpointMethod);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        jobID = scaledJobGraph.getJobID();
        cluster.submitJobAndWait(scaledJobGraph, false);

        // Pick the counters of the source implementation that was exercised.
        final int[] snapshotCounters;
        final int[] restoreCounters;
        if (usesCheckpointedFunction) {
            snapshotCounters = PartitionedStateSource.CHECK_CORRECT_SNAPSHOT;
            restoreCounters = PartitionedStateSource.CHECK_CORRECT_RESTORE;
        } else {
            snapshotCounters = PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT;
            restoreCounters = PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE;
        }

        int expectedSum = 0;
        for (int i = 0; i < snapshotCounters.length; i++) {
            expectedSum += snapshotCounters[i];
        }
        int actualSum = 0;
        for (int i = 0; i < restoreCounters.length; i++) {
            actualSum += restoreCounters[i];
        }
        if (broadcast) {
            // For the broadcast variant the snapshot sum is scaled by the restored job's parallelism.
            expectedSum *= restoredParallelism;
        }
        assertEquals(expectedSum, actualSum);

        jobID = null;
    } finally {
        // clear any left overs from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> removalFuture = jobManager.ask(
                new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
            try {
                Await.ready(removalFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class SavepointITCase method testCanRestoreWithModifiedStatelessOperators.
/**
 * FLINK-5985
 *
 * <p>This test ensures we can restore from a savepoint under modifications to the job graph that only
 * concern stateless operators.
 *
 * <p>Phase one starts a cluster, runs a job whose stateful counter (uid {@code "statefulCounter"}) sits
 * between two stateless map operators, triggers a savepoint and shuts the cluster down. Phase two restarts
 * the cluster and submits a modified job graph (different stateless operators around the same stateful
 * counter uid) that restores from the savepoint, then waits for the restore and progress latches.
 *
 * @throws Exception if any cluster interaction fails
 * @throws AssertionError if one of the latches is not released before the test deadline
 */
@Test
public void testCanRestoreWithModifiedStatelessOperators() throws Exception {
    // Config
    int numTaskManagers = 2;
    int numSlotsPerTaskManager = 2;
    int parallelism = 2;
    // Test deadline
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    // NOTE(review): this temporary directory is not deleted anywhere in this method.
    final File tmpDir = CommonTestUtils.createTempDirectory();
    final File savepointDir = new File(tmpDir, "savepoints");
    TestingCluster flink = null;
    String savepointPath;
    try {
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointDir.toURI().toString());
        LOG.info("Flink configuration: " + config + ".");
        // Start Flink
        flink = new TestingCluster(config);
        LOG.info("Starting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        final StatefulCounter statefulCounter = new StatefulCounter();
        StatefulCounter.resetForTest(parallelism);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        env.addSource(new InfiniteTestSource()).shuffle().map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return 4 * value;
            }
        }).shuffle().map(statefulCounter).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return 2 * value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph originalJobGraph = env.getStreamGraph().getJobGraph();
        JobSubmissionResult submissionResult = flink.submitJobDetached(originalJobGraph);
        JobID jobID = submissionResult.getJobID();
        // wait for the Tasks to be ready; a timed await reports a timeout only through its boolean
        // result (assumes the latch behaves like java.util.concurrent.CountDownLatch - TODO confirm),
        // so the result must be checked or the test would silently continue.
        if (!StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new AssertionError("Timed out waiting for the tasks to make progress before the savepoint.");
        }
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
        savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        // Request the savepoint back from the job manager; the value itself is not needed, the call
        // only has to succeed.
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job); the actual shutdown happens in
        // the finally block so that it runs exactly once on every path.
        LOG.info("Shutting down Flink cluster.");
    } finally {
        // The cluster may not have been created if the setup failed early; guarding against null
        // keeps the original failure from being replaced by a NullPointerException.
        if (flink != null) {
            flink.shutdown();
            flink.awaitTermination();
        }
    }
    try {
        LOG.info("Restarting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Gather all task deployment descriptors
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        // generate a modified job graph that adds a stateless op
        env.addSource(new InfiniteTestSource()).shuffle().map(new StatefulCounter()).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph modifiedJobGraph = env.getStreamGraph().getJobGraph();
        // Set the savepoint path
        modifiedJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        LOG.info("Resubmitting job " + modifiedJobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
        // Submit the job
        flink.submitJobDetached(modifiedJobGraph);
        // Await state is restored; fail explicitly if the deadline expires first
        if (!StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new AssertionError("Timed out waiting for the state to be restored from the savepoint.");
        }
        // Await some progress after restore; fail explicitly if the deadline expires first
        if (!StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new AssertionError("Timed out waiting for the restored job to make progress.");
        }
    } finally {
        flink.shutdown();
        flink.awaitTermination();
    }
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class CancelingTestBase method runAndCancelJob.
/**
 * Submits the given plan in detached mode, waits until the job is {@code RUNNING}, sleeps for the given
 * time, asks the JobManager to cancel the job and finally waits until the job is {@code CANCELED}.
 *
 * <p>Every exception raised along the way is logged and turned into an {@link AssertionError} that
 * carries the original exception as its cause.
 *
 * @param plan the plan from which the job graph is created
 * @param msecsTillCanceling time in milliseconds to sleep after the job is running before it is cancelled
 * @param maxTimeTillCanceled timeout in milliseconds for sending the cancel request and awaiting its answer
 * @throws Exception declared for callers; exceptions from the body surface as assertion failures
 */
public void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
    try {
        // submit job
        final JobGraph jobGraph = getJobGraph(plan);
        executor.submitJobDetached(jobGraph);
        // Wait for the job to make some progress and then cancel
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, executor.getLeaderGateway(TestingUtils.TESTING_DURATION()), TestingUtils.TESTING_DURATION());
        Thread.sleep(msecsTillCanceling);
        FiniteDuration timeout = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS);
        ActorGateway jobManager = executor.getLeaderGateway(TestingUtils.TESTING_DURATION());
        Future<Object> ask = jobManager.ask(new CancelJob(jobGraph.getJobID()), timeout);
        Object result = Await.result(ask, timeout);
        if (result instanceof CancellationFailure) {
            // Failure
            CancellationFailure failure = (CancellationFailure) result;
            throw new Exception("Failed to cancel job with ID " + failure.jobID() + ".", failure.cause());
        } else if (!(result instanceof CancellationSuccess)) {
            throw new Exception("Unexpected response to cancel request: " + result);
        }
        // Wait for the job to be cancelled
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.CANCELED, executor.getLeaderGateway(TestingUtils.TESTING_DURATION()), TestingUtils.TESTING_DURATION());
    } catch (Exception e) {
        // The logger already records the stack trace, so it is not printed a second time.
        LOG.error("Exception found in runAndCancelJob.", e);
        // Fail the test with the same message as before, but keep the original exception as the
        // cause so the test report shows where the failure came from.
        throw new AssertionError(e.getMessage(), e);
    }
}
use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.
the class YARNHighAvailabilityITCase method testMultipleAMKill.
/**
 * Tests that the application master can be killed multiple times and that a TaskManager is registered
 * at the newly elected JobManager each time.
 *
 * <p>For every kill attempt the current leader is retrieved, the test waits until the leader acknowledges
 * that at least one TaskManager is registered, and then sends the leader a {@code PoisonPill}. After the
 * last kill the same registration check is performed once more against the then-current leader.
 *
 * @throws Exception
 */
@Test
public void testMultipleAMKill() throws Exception {
    final int killCount = numberApplicationAttempts - 1;

    // Describe the YARN session: one TaskManager, memory sizes, uber jar and shipped lib folder.
    TestingYarnClusterDescriptor clusterDescriptor = new TestingYarnClusterDescriptor();
    Assert.assertNotNull("unable to get yarn client", clusterDescriptor);
    clusterDescriptor.setTaskManagerCount(1);
    clusterDescriptor.setJobManagerMemory(768);
    clusterDescriptor.setTaskManagerMemory(1024);
    clusterDescriptor.setLocalJarPath(new Path(flinkUberjar.getAbsolutePath()));
    clusterDescriptor.addShipFiles(Arrays.asList(flinkLibFolder.listFiles()));

    String confDirPath = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR);
    clusterDescriptor.setConfigurationDirectory(confDirPath);

    String fsStateHandlePath = temp.getRoot().getPath();

    // load the configuration
    File configDirectory = new File(confDirPath);
    GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());
    clusterDescriptor.setFlinkConfiguration(GlobalConfiguration.loadConfiguration());

    // Dynamic properties are passed as "@@"-separated key=value pairs.
    final String dynamicProperties =
        "recovery.mode=zookeeper@@recovery.zookeeper.quorum=" + zkServer.getConnectString()
            + "@@yarn.application-attempts=" + numberApplicationAttempts
            + "@@" + CoreOptions.STATE_BACKEND + "=FILESYSTEM"
            + "@@" + FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY + "=" + fsStateHandlePath + "/checkpoints"
            + "@@" + HighAvailabilityOptions.HA_STORAGE_PATH.key() + "=" + fsStateHandlePath + "/recovery";
    clusterDescriptor.setDynamicPropertiesEncoded(dynamicProperties);
    clusterDescriptor.setConfigurationFilePath(new Path(confDirPath + File.separator + "flink-conf.yaml"));

    ClusterClient yarnCluster = null;
    final FiniteDuration leaderTimeout = new FiniteDuration(2, TimeUnit.MINUTES);

    try {
        yarnCluster = clusterDescriptor.deploy();
        final Configuration flinkConfig = yarnCluster.getFlinkConfiguration();

        new JavaTestKit(actorSystem) {
            {
                // Kill the current leader once per attempt, after a TaskManager has registered with it.
                int remainingKills = killCount;
                while (remainingKills-- > 0) {
                    new Within(leaderTimeout) {
                        @Override
                        protected void run() {
                            try {
                                LeaderRetrievalService leaderRetrieval = LeaderRetrievalUtils.createLeaderRetrievalService(flinkConfig);
                                ActorGateway leader = LeaderRetrievalUtils.retrieveLeaderGateway(leaderRetrieval, actorSystem, leaderTimeout);
                                ActorGateway testProbe = new AkkaActorGateway(getRef(), leader.leaderSessionID());

                                leader.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), testProbe);
                                expectMsgEquals(Acknowledge.get());

                                leader.tell(PoisonPill.getInstance());
                            } catch (Exception e) {
                                throw new AssertionError("Could not complete test.", e);
                            }
                        }
                    };
                }

                // After the last kill: the then-current leader must again acknowledge a registered TaskManager.
                new Within(leaderTimeout) {
                    @Override
                    protected void run() {
                        try {
                            LeaderRetrievalService leaderRetrieval = LeaderRetrievalUtils.createLeaderRetrievalService(flinkConfig);
                            ActorGateway finalLeader = LeaderRetrievalUtils.retrieveLeaderGateway(leaderRetrieval, actorSystem, leaderTimeout);
                            ActorGateway testProbe = new AkkaActorGateway(getRef(), finalLeader.leaderSessionID());

                            finalLeader.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), testProbe);
                            expectMsgEquals(Acknowledge.get());
                        } catch (Exception e) {
                            throw new AssertionError("Could not complete test.", e);
                        }
                    }
                };
            }
        };
    } finally {
        if (yarnCluster != null) {
            yarnCluster.shutdown();
        }
    }
}
Aggregations