Use of scala.concurrent.duration.Deadline in project flink by apache.
The class RescalingITCase, method testSavepointRescalingKeyedState.
/**
 * Tests that a job with purely keyed state can be restarted from a savepoint
 * with a different parallelism.
 */
public void testSavepointRescalingKeyedState(boolean scaleOut, boolean deriveMaxParallelism) throws Exception {
    final int numberKeys = 42;
    final int numberElements = 1000;
    final int numberElements2 = 500;
    final int parallelism = scaleOut ? numSlots / 2 : numSlots;
    final int parallelism2 = scaleOut ? numSlots : numSlots / 2;
    final int maxParallelism = 13;
    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();
    ActorGateway jobManager = null;
    JobID jobID = null;
    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());
        JobGraph jobGraph = createJobGraphWithKeyedState(parallelism, maxParallelism, numberKeys, numberElements, false, 100);
        jobID = jobGraph.getJobID();
        cluster.submitJobDetached(jobGraph);
        // wait until the sources have emitted numberElements for each key and completed a checkpoint
        SubtaskIndexFlatMapper.workCompletedLatch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupIndex), numberElements * key));
        }
        assertEquals(expectedResult, actualResult);
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
        Future<Object> savepointPathFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
        final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());
        Future<Object> cancellationResponseFuture = jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());
        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
        Await.ready(jobRemovedFuture, deadline.timeLeft());
        jobID = null;
        int restoreMaxParallelism = deriveMaxParallelism ? ExecutionJobVertex.VALUE_NOT_SET : maxParallelism;
        JobGraph scaledJobGraph = createJobGraphWithKeyedState(parallelism2, restoreMaxParallelism, numberKeys, numberElements2, true, 100);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        jobID = scaledJobGraph.getJobID();
        cluster.submitJobAndWait(scaledJobGraph, false);
        jobID = null;
        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism2, keyGroupIndex), key * (numberElements + numberElements2)));
        }
        assertEquals(expectedResult2, actualResult2);
    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
        // clear any leftovers from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
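Everything inside the try block above draws on a single time budget: each blocking call is handed deadline.timeLeft(), so the whole sequence of steps, not each step individually, is bounded by the original three minutes. A minimal sketch of that pattern, with hypothetical stepOne/stepTwo futures standing in for the job-manager calls:

import java.util.concurrent.TimeUnit;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlineBudgetSketch {

    // Awaits two futures under one shared budget: each call gets whatever
    // remains of the original timeout, not a fresh three minutes.
    static void awaitBoth(Future<Object> stepOne, Future<Object> stepTwo) throws Exception {
        FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
        Deadline deadline = timeout.fromNow();

        Await.result(stepOne, deadline.timeLeft());
        // timeLeft() has already shrunk by however long the first await took.
        Await.result(stepTwo, deadline.timeLeft());
    }
}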
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class UtilsTest, method testYarnFlinkResourceManagerJobManagerLostLeadership.
@Test
public void testYarnFlinkResourceManagerJobManagerLostLeadership() throws Exception {
    new JavaTestKit(system) {
        {
            final Deadline deadline = new FiniteDuration(3, TimeUnit.MINUTES).fromNow();
            Configuration flinkConfig = new Configuration();
            YarnConfiguration yarnConfig = new YarnConfiguration();
            TestingLeaderRetrievalService leaderRetrievalService = new TestingLeaderRetrievalService();
            String applicationMasterHostName = "localhost";
            String webInterfaceURL = "foobar";
            ContaineredTaskManagerParameters taskManagerParameters = new ContaineredTaskManagerParameters(1L, 1L, 1L, 1, new HashMap<String, String>());
            ContainerLaunchContext taskManagerLaunchContext = mock(ContainerLaunchContext.class);
            int yarnHeartbeatIntervalMillis = 1000;
            int maxFailedContainers = 10;
            int numInitialTaskManagers = 5;
            final YarnResourceManagerCallbackHandler callbackHandler = new YarnResourceManagerCallbackHandler();
            AMRMClientAsync<AMRMClient.ContainerRequest> resourceManagerClient = mock(AMRMClientAsync.class);
            NMClient nodeManagerClient = mock(NMClient.class);
            UUID leaderSessionID = UUID.randomUUID();
            final List<Container> containerList = new ArrayList<>();
            for (int i = 0; i < numInitialTaskManagers; i++) {
                containerList.add(new TestingContainer("container_" + i, "localhost"));
            }
            doAnswer(new Answer() {
                int counter = 0;

                @Override
                public Object answer(InvocationOnMock invocation) throws Throwable {
                    if (counter < containerList.size()) {
                        callbackHandler.onContainersAllocated(Collections.singletonList(containerList.get(counter++)));
                    }
                    return null;
                }
            }).when(resourceManagerClient).addContainerRequest(Matchers.any(AMRMClient.ContainerRequest.class));
            ActorRef resourceManager = null;
            ActorRef leader1;
            try {
                leader1 = system.actorOf(Props.create(TestingUtils.ForwardingActor.class, getRef(), Option.apply(leaderSessionID)));
                resourceManager = system.actorOf(Props.create(TestingYarnFlinkResourceManager.class, flinkConfig, yarnConfig, leaderRetrievalService, applicationMasterHostName, webInterfaceURL, taskManagerParameters, taskManagerLaunchContext, yarnHeartbeatIntervalMillis, maxFailedContainers, numInitialTaskManagers, callbackHandler, resourceManagerClient, nodeManagerClient));
                leaderRetrievalService.notifyListener(leader1.path().toString(), leaderSessionID);
                final AkkaActorGateway leader1Gateway = new AkkaActorGateway(leader1, leaderSessionID);
                final AkkaActorGateway resourceManagerGateway = new AkkaActorGateway(resourceManager, leaderSessionID);
                doAnswer(new Answer() {
                    @Override
                    public Object answer(InvocationOnMock invocation) throws Throwable {
                        Container container = (Container) invocation.getArguments()[0];
                        resourceManagerGateway.tell(new NotifyResourceStarted(YarnFlinkResourceManager.extractResourceID(container)), leader1Gateway);
                        return null;
                    }
                }).when(nodeManagerClient).startContainer(Matchers.any(Container.class), Matchers.any(ContainerLaunchContext.class));
                expectMsgClass(deadline.timeLeft(), RegisterResourceManager.class);
                resourceManagerGateway.tell(new RegisterResourceManagerSuccessful(leader1, Collections.EMPTY_LIST));
                for (int i = 0; i < containerList.size(); i++) {
                    expectMsgClass(deadline.timeLeft(), Acknowledge.class);
                }
                Future<Object> taskManagerRegisteredFuture = resourceManagerGateway.ask(new NotifyWhenResourcesRegistered(numInitialTaskManagers), deadline.timeLeft());
                Await.ready(taskManagerRegisteredFuture, deadline.timeLeft());
                // revoke and then re-grant leadership
                leaderRetrievalService.notifyListener(null, null);
                leaderRetrievalService.notifyListener(leader1.path().toString(), leaderSessionID);
                expectMsgClass(deadline.timeLeft(), RegisterResourceManager.class);
                resourceManagerGateway.tell(new RegisterResourceManagerSuccessful(leader1, Collections.EMPTY_LIST));
                for (Container container : containerList) {
                    resourceManagerGateway.tell(new NotifyResourceStarted(YarnFlinkResourceManager.extractResourceID(container)), leader1Gateway);
                }
                for (int i = 0; i < containerList.size(); i++) {
                    expectMsgClass(deadline.timeLeft(), Acknowledge.class);
                }
                Future<Object> numberOfRegisteredResourcesFuture = resourceManagerGateway.ask(RequestNumberOfRegisteredResources.Instance, deadline.timeLeft());
                int numberOfRegisteredResources = (Integer) Await.result(numberOfRegisteredResourcesFuture, deadline.timeLeft());
                assertEquals(numInitialTaskManagers, numberOfRegisteredResources);
            } finally {
                if (resourceManager != null) {
                    resourceManager.tell(PoisonPill.getInstance(), ActorRef.noSender());
                }
            }
        }
    };
}
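The same budget sharing extends to loops: every iteration passes deadline.timeLeft(), so the whole batch of acknowledgements, not each single message, must arrive within the original three minutes. A self-contained sketch of that loop, using a plain BlockingQueue as a hypothetical stand-in for JavaTestKit's mailbox:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import scala.concurrent.duration.Deadline;

public class DeadlineLoopSketch {

    // Drains n messages from a queue under one shared deadline, mirroring
    // the expectMsgClass(deadline.timeLeft(), ...) loops in the test above.
    static void expectAll(BlockingQueue<Object> inbox, int n, Deadline deadline)
            throws InterruptedException, TimeoutException {
        for (int i = 0; i < n; i++) {
            // Each poll is bounded by whatever remains of the overall budget.
            Object message = inbox.poll(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
            if (message == null) {
                throw new TimeoutException("Deadline expired after " + i + " of " + n + " messages");
            }
        }
    }
}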
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class ExecutionGraphRestartTest, method testNoRestartOnSuppressException.
@Test
public void testNoRestartOnSuppressException() throws Exception {
    Tuple2<ExecutionGraph, Instance> executionGraphInstanceTuple = createSpyExecutionGraph(new FixedDelayRestartStrategy(1, 1000));
    ExecutionGraph eg = executionGraphInstanceTuple.f0;
    // Fail with an unrecoverable exception
    eg.getAllExecutionVertices().iterator().next().fail(new SuppressRestartsException(new Exception("Test Exception")));
    assertEquals(JobStatus.FAILING, eg.getState());
    for (ExecutionVertex vertex : eg.getAllExecutionVertices()) {
        vertex.getCurrentExecutionAttempt().cancelingComplete();
    }
    FiniteDuration timeout = new FiniteDuration(2, TimeUnit.MINUTES);
    // Wait for the job to reach FAILED; no restart may happen in between
    Deadline deadline = timeout.fromNow();
    while (deadline.hasTimeLeft() && eg.getState() != JobStatus.FAILED) {
        Thread.sleep(100);
    }
    assertEquals(JobStatus.FAILED, eg.getState());
    // No restart
    verify(eg, never()).restart();
    RestartStrategy restartStrategy = eg.getRestartStrategy();
    assertTrue(restartStrategy instanceof FixedDelayRestartStrategy);
    assertEquals(0, ((FixedDelayRestartStrategy) restartStrategy).getCurrentRestartAttempt());
}
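Here the waiting idiom is different: instead of handing timeLeft() to a blocking call, the test polls a condition for as long as deadline.hasTimeLeft() holds and asserts afterwards. A small sketch of that pattern as a reusable helper; the helper name and the BooleanSupplier parameter are illustrative, not part of Flink:

import java.util.function.BooleanSupplier;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlinePollSketch {

    // Polls a condition every 100 ms until it holds or the deadline expires,
    // mirroring the hasTimeLeft() loop in the test above.
    static boolean pollUntil(BooleanSupplier condition, FiniteDuration timeout) throws InterruptedException {
        Deadline deadline = timeout.fromNow();
        while (deadline.hasTimeLeft() && !condition.getAsBoolean()) {
            Thread.sleep(100);
        }
        // True if the condition became true, false if we merely ran out of time.
        return condition.getAsBoolean();
    }
}

With such a helper, the wait above collapses to pollUntil(() -> eg.getState() == JobStatus.FAILED, timeout), followed by the assertion.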
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class ExecutionGraphRestartTest, method testFailWhileRestarting.
@Test
public void testFailWhileRestarting() throws Exception {
    Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    Instance instance = ExecutionGraphTestUtils.getInstance(new ActorTaskManagerGateway(new SimpleActorGateway(TestingUtils.directExecutionContext())), NUM_TASKS);
    scheduler.newInstanceAvailable(instance);
    // Blocking program
    ExecutionGraph executionGraph = new ExecutionGraph(
        TestingUtils.defaultExecutor(),
        TestingUtils.defaultExecutor(),
        new JobID(),
        "TestJob",
        new Configuration(),
        new SerializedValue<>(new ExecutionConfig()),
        AkkaUtils.getDefaultTimeout(),
        // We want to manually control the restart and delay
        new InfiniteDelayRestartStrategy(),
        scheduler);
    JobVertex jobVertex = new JobVertex("NoOpInvokable");
    jobVertex.setInvokableClass(NoOpInvokable.class);
    jobVertex.setParallelism(NUM_TASKS);
    JobGraph jobGraph = new JobGraph("TestJob", jobVertex);
    executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
    assertEquals(JobStatus.CREATED, executionGraph.getState());
    executionGraph.scheduleForExecution();
    assertEquals(JobStatus.RUNNING, executionGraph.getState());
    // Kill the instance and wait for the job to restart
    instance.markDead();
    Deadline deadline = TestingUtils.TESTING_DURATION().fromNow();
    while (deadline.hasTimeLeft() && executionGraph.getState() != JobStatus.RESTARTING) {
        Thread.sleep(100);
    }
    assertEquals(JobStatus.RESTARTING, executionGraph.getState());
    // Restarting should not fail with an ordinary exception
    executionGraph.fail(new Exception("Test exception"));
    assertEquals(JobStatus.RESTARTING, executionGraph.getState());
    // but it should fail on a SuppressRestartsException
    executionGraph.fail(new SuppressRestartsException(new Exception("Test exception")));
    assertEquals(JobStatus.FAILED, executionGraph.getState());
    // The restart has been aborted; a manual restart must not resurrect the job
    executionGraph.restart();
    assertEquals(JobStatus.FAILED, executionGraph.getState());
}
Use of scala.concurrent.duration.Deadline in project flink by apache.
The class ExecutionGraphRestartTest, method testSuspendWhileRestarting.
/**
 * Tests that a suspend call while restarting a job will abort the restart.
 *
 * @throws Exception
 */
@Test
public void testSuspendWhileRestarting() throws Exception {
    FiniteDuration timeout = new FiniteDuration(1, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();
    Instance instance = ExecutionGraphTestUtils.getInstance(new ActorTaskManagerGateway(new SimpleActorGateway(TestingUtils.directExecutionContext())), NUM_TASKS);
    Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    scheduler.newInstanceAvailable(instance);
    JobVertex sender = new JobVertex("Task");
    sender.setInvokableClass(NoOpInvokable.class);
    sender.setParallelism(NUM_TASKS);
    JobGraph jobGraph = new JobGraph("Pointwise job", sender);
    ControllableRestartStrategy controllableRestartStrategy = new ControllableRestartStrategy(timeout);
    ExecutionGraph eg = new ExecutionGraph(TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), new JobID(), "Test job", new Configuration(), new SerializedValue<>(new ExecutionConfig()), AkkaUtils.getDefaultTimeout(), controllableRestartStrategy, scheduler);
    eg.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
    assertEquals(JobStatus.CREATED, eg.getState());
    eg.scheduleForExecution();
    assertEquals(JobStatus.RUNNING, eg.getState());
    instance.markDead();
    Await.ready(controllableRestartStrategy.getReachedCanRestart(), deadline.timeLeft());
    assertEquals(JobStatus.RESTARTING, eg.getState());
    eg.suspend(new Exception("Test exception"));
    assertEquals(JobStatus.SUSPENDED, eg.getState());
    controllableRestartStrategy.unlockRestart();
    Await.ready(controllableRestartStrategy.getRestartDone(), deadline.timeLeft());
    // The job must stay suspended even after the restart strategy completes
    assertEquals(JobStatus.SUSPENDED, eg.getState());
}
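Across all of these tests the idiom is the same: create one Deadline up front, then spend it down with timeLeft() or check it with hasTimeLeft(). A last self-contained sketch of the basic API, assuming only the Scala standard library on the classpath (Deadline is measured against a monotonic clock, and isOverdue() is the negation of hasTimeLeft()):

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlineBasicsSketch {

    public static void main(String[] args) throws InterruptedException {
        // A deadline 200 ms from now.
        Deadline deadline = new FiniteDuration(200, TimeUnit.MILLISECONDS).fromNow();

        System.out.println(deadline.hasTimeLeft()); // true
        System.out.println(deadline.timeLeft());    // slightly under 200 milliseconds

        Thread.sleep(300);

        System.out.println(deadline.isOverdue());   // true: the deadline has passed
        System.out.println(deadline.timeLeft());    // now negative, not clamped to zero
    }
}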