use of scala.concurrent.duration.Deadline in project flink by apache.
the class SavepointITCase method testCanRestoreWithModifiedStatelessOperators.
/**
 * FLINK-5985
 *
 * <p>This test ensures we can restore from a savepoint under modifications to the job graph that
 * only concern stateless operators.
 *
 * <p>Phase one starts a cluster, runs a job containing the stateful operator with uid
 * {@code "statefulCounter"}, triggers a savepoint and shuts the cluster down. Phase two restarts
 * the cluster and submits a job with a different set of stateless map operators around the same
 * stateful operator, restoring from the savepoint taken in phase one.
 *
 * @throws Exception if the cluster cannot be started, a future does not complete before the
 *         shared five minute deadline, or the job cannot be submitted
 */
@Test
public void testCanRestoreWithModifiedStatelessOperators() throws Exception {
    // Config
    int numTaskManagers = 2;
    int numSlotsPerTaskManager = 2;
    int parallelism = 2;

    // Test deadline (shared by both phases)
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();

    final File tmpDir = CommonTestUtils.createTempDirectory();
    final File savepointDir = new File(tmpDir, "savepoints");

    TestingCluster flink = null;
    String savepointPath;

    try {
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointDir.toURI().toString());
        LOG.info("Flink configuration: " + config + ".");

        // Start Flink
        flink = new TestingCluster(config);
        LOG.info("Starting Flink cluster.");
        flink.start(true);

        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");

        final StatefulCounter statefulCounter = new StatefulCounter();
        StatefulCounter.resetForTest(parallelism);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);

        // Original topology: source -> map(4x) -> statefulCounter -> map(2x) -> sink
        env.addSource(new InfiniteTestSource()).shuffle().map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return 4 * value;
            }
        }).shuffle().map(statefulCounter).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return 2 * value;
            }
        }).addSink(new DiscardingSink<Integer>());

        JobGraph originalJobGraph = env.getStreamGraph().getJobGraph();
        JobSubmissionResult submissionResult = flink.submitJobDetached(originalJobGraph);
        JobID jobID = submissionResult.getJobID();

        // wait for the Tasks to be ready
        // NOTE(review): the outcome of this timed await is not checked; if the latch reports a
        // timeout via its return value, the test simply continues - confirm this is intended.
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
        savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();

        // Request the savepoint back from the job manager; the cast fails if the response is
        // not a ResponseSavepoint
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
    } finally {
        // Shut down the Flink cluster (thereby canceling the job). The null check keeps a
        // failure that happened before the cluster was created from being masked by an NPE.
        if (flink != null) {
            LOG.info("Shutting down Flink cluster.");
            flink.shutdown();
            flink.awaitTermination();
        }
    }

    try {
        LOG.info("Restarting Flink cluster.");
        flink.start(true);

        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");

        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);

        // Modified topology: source -> statefulCounter -> map(identity) -> sink.
        // Only the stateless maps differ from the original job; the stateful operator keeps
        // its uid.
        env.addSource(new InfiniteTestSource()).shuffle().map(new StatefulCounter()).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {
            @Override
            public Integer map(Integer value) throws Exception {
                return value;
            }
        }).addSink(new DiscardingSink<Integer>());

        JobGraph modifiedJobGraph = env.getStreamGraph().getJobGraph();

        // Set the savepoint path
        modifiedJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        LOG.info("Resubmitting job " + modifiedJobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");

        // Submit the job
        flink.submitJobDetached(modifiedJobGraph);

        // Await state is restored
        // NOTE(review): as above, the outcome of the timed awaits below is not checked.
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
    } finally {
        flink.shutdown();
        flink.awaitTermination();
    }
}
use of scala.concurrent.duration.Deadline in project flink by apache.
the class ExecutionGraphRestartTest method testCancelWhileFailing.
/**
 * Checks that cancelling a job whose vertices have all failed moves the job from
 * {@code FAILING} to {@code CANCELLING}, and to {@code CANCELED} once the final-state callback
 * is allowed to run.
 */
@Test
public void testCancelWhileFailing() throws Exception {
    // The restart and its delay are controlled manually by the test
    final RestartStrategy manualRestart = new InfiniteDelayRestartStrategy();
    final Tuple2<ExecutionGraph, Instance> graphAndInstance = createSpyExecutionGraph(manualRestart);
    final ExecutionGraph graph = graphAndInstance.f0;
    final Instance taskInstance = graphAndInstance.f1;

    // Stub the final-state callback so it does nothing for now
    doNothing().when(graph).jobVertexInFinalState();

    // Kill the instance...
    taskInstance.markDead();

    // ...and poll until every vertex is FAILED or CANCELED. The job status cannot be used as
    // the wait condition because jobVertexInFinalState is stubbed out.
    final Deadline pollDeadline = TestingUtils.TESTING_DURATION().fromNow();
    boolean allVerticesTerminal = false;
    while (pollDeadline.hasTimeLeft() && !allVerticesTerminal) {
        boolean foundPendingVertex = false;
        for (ExecutionVertex vertex : graph.getAllExecutionVertices()) {
            final ExecutionState vertexState = vertex.getExecutionState();
            final boolean terminal =
                vertexState == ExecutionState.FAILED || vertexState == ExecutionState.CANCELED;
            if (!terminal) {
                foundPendingVertex = true;
                break;
            }
        }
        if (foundPendingVertex) {
            // Back off before scanning the vertices again
            Thread.sleep(100);
        }
        allVerticesTerminal = !foundPendingVertex;
    }

    // Still in failing
    assertEquals(JobStatus.FAILING, graph.getState());

    // The cancel call needs to change the state to CANCELLING
    graph.cancel();
    assertEquals(JobStatus.CANCELLING, graph.getState());

    // Restore the real callback and invoke it to finalize the job state
    doCallRealMethod().when(graph).jobVertexInFinalState();
    graph.jobVertexInFinalState();
    assertEquals(JobStatus.CANCELED, graph.getState());
}
use of scala.concurrent.duration.Deadline in project flink by apache.
the class ExecutionGraphRestartTest method testCancelWhileRestarting.
/**
 * Checks that cancelling a job in state {@code RESTARTING} moves it to {@code CANCELED} and
 * that a subsequent {@code restart()} call leaves it in {@code CANCELED}.
 */
@Test
public void testCancelWhileRestarting() throws Exception {
    // The restart and its delay are controlled manually by the test
    final RestartStrategy manualRestart = new InfiniteDelayRestartStrategy();
    final Tuple2<ExecutionGraph, Instance> graphAndInstance = createExecutionGraph(manualRestart);
    final ExecutionGraph graph = graphAndInstance.f0;
    final Instance taskInstance = graphAndInstance.f1;

    // Kill the instance and poll until the job reports RESTARTING or the deadline expires
    taskInstance.markDead();
    final Deadline pollDeadline = TestingUtils.TESTING_DURATION().fromNow();
    while (true) {
        if (!pollDeadline.hasTimeLeft() || graph.getState() == JobStatus.RESTARTING) {
            break;
        }
        Thread.sleep(100);
    }
    assertEquals(JobStatus.RESTARTING, graph.getState());

    // Canceling needs to abort the restart
    graph.cancel();
    assertEquals(JobStatus.CANCELED, graph.getState());

    // The restart has been aborted, so restarting now must not change the state
    graph.restart();
    assertEquals(JobStatus.CANCELED, graph.getState());
}
use of scala.concurrent.duration.Deadline in project flink by apache.
the class JobClientActorRecoveryITCase method testJobClientRecovery.
/**
 * Tests whether the JobClientActor can connect to a newly elected leading job manager to obtain
 * the JobExecutionResult. The submitted job blocks for the first execution attempt. The
 * leading job manager will be killed so that the second job manager will be elected as the
 * leader. The newly elected leader has to retrieve the checkpointed job from ZooKeeper
 * and continue its execution. This time, the job does not block and, thus, can be finished.
 * The execution result should be sent to the JobClientActor which originally submitted the
 * job.
 *
 * @throws Exception if the job fails, or if a step does not complete within the two minute
 *         deadline
 */
@Test
public void testJobClientRecovery() throws Exception {
    File rootFolder = tempFolder.getRoot();

    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zkServer.getConnectString(), rootFolder.getPath());
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    final TestingCluster cluster = new TestingCluster(config);
    cluster.start();

    JobVertex blockingVertex = new JobVertex("Blocking Vertex");
    blockingVertex.setInvokableClass(BlockingTask.class);
    blockingVertex.setParallelism(1);
    final JobGraph jobGraph = new JobGraph("Blocking Test Job", blockingVertex);

    // Completed by the submitter thread with either the job result or the submission failure
    final Promise<JobExecutionResult> promise = new scala.concurrent.impl.Promise.DefaultPromise<>();

    Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

    try {
        // submitJobAndWait blocks, so it runs on its own thread
        Thread submitter = new Thread(new Runnable() {
            @Override
            public void run() {
                try {
                    JobExecutionResult result = cluster.submitJobAndWait(jobGraph, false);
                    promise.success(result);
                } catch (Exception e) {
                    promise.failure(e);
                }
            }
        });
        submitter.start();

        synchronized (BlockingTask.waitLock) {
            while (BlockingTask.HasBlockedExecution < 1 && deadline.hasTimeLeft()) {
                // Object.wait(0) waits until notified regardless of elapsed time, and a
                // negative timeout throws IllegalArgumentException. The remaining time can
                // round down to 0 ms (or turn negative right after the check above), so
                // clamp it to at least one millisecond to keep the deadline effective.
                long waitMillis = Math.max(1L, deadline.timeLeft().toMillis());
                BlockingTask.waitLock.wait(waitMillis);
            }
        }

        if (deadline.isOverdue()) {
            Assert.fail("The job has not blocked within the given deadline.");
        }

        // Kill the current leader
        ActorGateway gateway = cluster.getLeaderGateway(deadline.timeLeft());
        gateway.tell(TestingJobManagerMessages.getDisablePostStop());
        gateway.tell(PoisonPill.getInstance());

        // if the job fails then an exception is thrown here
        Await.result(promise.future(), deadline.timeLeft());
    } finally {
        cluster.shutdown();
    }
}
use of scala.concurrent.duration.Deadline in project flink by apache.
the class KvStateClientTest method testServerClosesChannel.
/**
 * Verifies that when the server side closes the channel, the client treats the connection as
 * closed: the pending request fails with a {@code ClosedChannelException} and the connection
 * count in the request stats drops back to zero.
 */
@Test
public void testServerClosesChannel() throws Exception {
    final Deadline deadline = TEST_TIMEOUT.fromNow();
    final AtomicKvStateRequestStats stats = new AtomicKvStateRequestStats();

    KvStateClient client = null;
    Channel serverChannel = null;

    try {
        client = new KvStateClient(1, stats);

        // Set by the server handler below
        final AtomicBoolean requestReceived = new AtomicBoolean();
        final AtomicReference<Channel> acceptedChannel = new AtomicReference<>();

        serverChannel = createServerChannel(new ChannelInboundHandlerAdapter() {
            @Override
            public void channelActive(ChannelHandlerContext ctx) throws Exception {
                // Remember the channel so the test can close it from the server side
                acceptedChannel.set(ctx.channel());
            }

            @Override
            public void channelRead(ChannelHandlerContext ctx, Object msg) throws Exception {
                requestReceived.set(true);
            }
        });

        final KvStateServerAddress serverAddress = getKvStateServerAddress(serverChannel);

        // Send a single request; the server handler never answers it
        final Future<byte[]> pendingRequest = client.getKvState(serverAddress, new KvStateID(), new byte[0]);

        // Poll until the server has seen the request or the deadline expires
        while (!requestReceived.get() && deadline.hasTimeLeft()) {
            Thread.sleep(50);
        }
        assertTrue("Receive timed out", requestReceived.get());

        assertEquals(1, stats.getNumConnections());

        // Close the connection from the server side
        final Channel toClose = acceptedChannel.get();
        final long closeTimeoutMillis = deadline.timeLeft().toMillis();
        toClose.close().await(closeTimeoutMillis, TimeUnit.MILLISECONDS);

        try {
            Await.result(pendingRequest, deadline.timeLeft());
            fail("Did not throw expected server failure");
        } catch (ClosedChannelException ignored) {
            // Expected
        }

        assertEquals(0, stats.getNumConnections());

        // Counts can take some time to propagate
        while (deadline.hasTimeLeft() && !(stats.getNumSuccessful() == 0 && stats.getNumFailed() == 1)) {
            Thread.sleep(100);
        }

        assertEquals(1, stats.getNumRequests());
        assertEquals(0, stats.getNumSuccessful());
        assertEquals(1, stats.getNumFailed());
    } finally {
        if (client != null) {
            client.shutDown();
        }

        if (serverChannel != null) {
            serverChannel.close();
        }

        assertEquals("Channel leak", 0, stats.getNumConnections());
    }
}
Aggregations