Search in sources :

Example 21 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class YARNHighAvailabilityITCase method testMultipleAMKill.

/**
 * Tests that the application master can be killed multiple times and that the surviving
 * TaskManager successfully reconnects to the newly started JobManager.
 *
 * <p>The test deploys a cluster with a single TaskManager and ZooKeeper recovery mode enabled
 * through the dynamic properties. It then repeats the following one time fewer than
 * {@code numberApplicationAttempts}: look up the current leader, wait until at least one
 * TaskManager is registered with it, and send it a {@code PoisonPill}. Finally the leader is
 * looked up once more and the TaskManager registration is awaited without killing the leader.
 *
 * @throws Exception propagated from the cluster set-up, deployment or shutdown
 */
@Test
public void testMultipleAMKill() throws Exception {
    // Kill the leader one time fewer than the configured number of application attempts.
    final int numberKillingAttempts = numberApplicationAttempts - 1;
    TestingYarnClusterDescriptor flinkYarnClient = new TestingYarnClusterDescriptor();
    // NOTE(review): the object was just created with `new`, so this assertion can never fail.
    Assert.assertNotNull("unable to get yarn client", flinkYarnClient);
    flinkYarnClient.setTaskManagerCount(1);
    // presumably megabytes — TODO confirm against the cluster descriptor
    flinkYarnClient.setJobManagerMemory(768);
    flinkYarnClient.setTaskManagerMemory(1024);
    flinkYarnClient.setLocalJarPath(new Path(flinkUberjar.getAbsolutePath()));
    flinkYarnClient.addShipFiles(Arrays.asList(flinkLibFolder.listFiles()));
    String confDirPath = System.getenv(ConfigConstants.ENV_FLINK_CONF_DIR);
    flinkYarnClient.setConfigurationDirectory(confDirPath);
    // Checkpoints and recovery data are written below the temporary folder's root.
    String fsStateHandlePath = temp.getRoot().getPath();
    // Load the configuration from the configuration directory.
    File configDirectory = new File(confDirPath);
    // NOTE(review): the result of this call is not used and the configuration is loaded again on
    // the next line — confirm whether this first call is still needed.
    GlobalConfiguration.loadConfiguration(configDirectory.getAbsolutePath());
    flinkYarnClient.setFlinkConfiguration(GlobalConfiguration.loadConfiguration());
    // Dynamic properties are passed as one string of key=value pairs separated by "@@".
    flinkYarnClient.setDynamicPropertiesEncoded("recovery.mode=zookeeper@@recovery.zookeeper.quorum=" + zkServer.getConnectString() + "@@yarn.application-attempts=" + numberApplicationAttempts + "@@" + CoreOptions.STATE_BACKEND + "=FILESYSTEM" + "@@" + FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY + "=" + fsStateHandlePath + "/checkpoints" + "@@" + HighAvailabilityOptions.HA_STORAGE_PATH.key() + "=" + fsStateHandlePath + "/recovery");
    flinkYarnClient.setConfigurationFilePath(new Path(confDirPath + File.separator + "flink-conf.yaml"));
    ClusterClient yarnCluster = null;
    // The same timeout is used for every Within block and for every leader lookup.
    final FiniteDuration timeout = new FiniteDuration(2, TimeUnit.MINUTES);
    try {
        yarnCluster = flinkYarnClient.deploy();
        final Configuration config = yarnCluster.getFlinkConfiguration();
        // The test logic runs inside the instance initializer of an anonymous JavaTestKit, so
        // that getRef() and expectMsgEquals() of the test kit are available.
        new JavaTestKit(actorSystem) {

            {
                for (int attempt = 0; attempt < numberKillingAttempts; attempt++) {
                    new Within(timeout) {

                        @Override
                        protected void run() {
                            try {
                                // Look up whoever is the leader for this attempt.
                                LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                                ActorGateway gateway = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                                // Replies are directed at the test kit's own actor.
                                ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway.leaderSessionID());
                                // Wait until at least one TaskManager is registered with the leader ...
                                gateway.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                                expectMsgEquals(Acknowledge.get());
                                // ... and only then kill the leader.
                                gateway.tell(PoisonPill.getInstance());
                            } catch (Exception e) {
                                throw new AssertionError("Could not complete test.", e);
                            }
                        }
                    };
                }
                // After the last kill: look up the leader once more and wait for the TaskManager
                // registration, this time without sending a PoisonPill.
                new Within(timeout) {

                    @Override
                    protected void run() {
                        try {
                            LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(config);
                            ActorGateway gateway2 = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, actorSystem, timeout);
                            ActorGateway selfGateway = new AkkaActorGateway(getRef(), gateway2.leaderSessionID());
                            gateway2.tell(new TestingJobManagerMessages.NotifyWhenAtLeastNumTaskManagerAreRegistered(1), selfGateway);
                            expectMsgEquals(Acknowledge.get());
                        } catch (Exception e) {
                            throw new AssertionError("Could not complete test.", e);
                        }
                    }
                };
            }
        };
    } finally {
        // Shut the cluster down whether or not the test logic succeeded.
        if (yarnCluster != null) {
            yarnCluster.shutdown();
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.flink.configuration.Configuration) GlobalConfiguration(org.apache.flink.configuration.GlobalConfiguration) FiniteDuration(scala.concurrent.duration.FiniteDuration) ClusterClient(org.apache.flink.client.program.ClusterClient) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) File(java.io.File) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 22 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class YarnFlinkApplicationMasterRunner method createRpcService.

// ------------------------------------------------------------------------
//  Utilities
// ------------------------------------------------------------------------
/**
 * Creates an {@link AkkaRpcService} on top of a newly started actor system.
 *
 * @param configuration configuration used to look up the timeout and to start the actor system
 * @param bindAddress address passed on to the actor system bootstrap
 * @param portRange port range passed on to the actor system bootstrap
 * @return the RPC service wrapping the started actor system
 * @throws Exception if the timeout cannot be determined or the actor system cannot be started
 */
protected RpcService createRpcService(Configuration configuration, String bindAddress, String portRange) throws Exception {
    // Resolve the timeout before starting anything: if looking it up or converting it fails,
    // no actor system has been started yet, so nothing is left running without an owner.
    final FiniteDuration duration = AkkaUtils.getTimeout(configuration);
    final Time rpcTimeout = Time.of(duration.length(), duration.unit());
    final ActorSystem actorSystem = BootstrapTools.startActorSystem(configuration, bindAddress, portRange, LOG);
    return new AkkaRpcService(actorSystem, rpcTimeout);
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaRpcService(org.apache.flink.runtime.rpc.akka.AkkaRpcService) FiniteDuration(scala.concurrent.duration.FiniteDuration)

Example 23 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class ExecutionGraphRestartTest method testConstraintsAfterRestart.

/**
 * Schedules a job whose two vertices share a slot sharing group and are strictly co-located,
 * and runs {@code validateConstraints} both before and after a restart triggered through
 * {@code restartAfterFailure}.
 */
@Test
public void testConstraintsAfterRestart() throws Exception {
    // --- environment: one instance created with NUM_TASKS, registered at a new scheduler ---
    final SimpleActorGateway actorGateway = new SimpleActorGateway(TestingUtils.directExecutionContext());
    final ActorTaskManagerGateway taskManagerGateway = new ActorTaskManagerGateway(actorGateway);
    final Instance taskManager = ExecutionGraphTestUtils.getInstance(taskManagerGateway, NUM_TASKS);
    final Scheduler slotScheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    slotScheduler.newInstanceAvailable(taskManager);

    // --- job: two vertices in the same sharing group, the first co-located with the second ---
    final JobVertex firstVertex = newJobVertex("Task1", NUM_TASKS, NoOpInvokable.class);
    final JobVertex secondVertex = newJobVertex("Task2", NUM_TASKS, NoOpInvokable.class);
    final SlotSharingGroup sharedSlots = new SlotSharingGroup();
    firstVertex.setSlotSharingGroup(sharedSlots);
    secondVertex.setSlotSharingGroup(sharedSlots);
    firstVertex.setStrictlyCoLocatedWith(secondVertex);
    final JobGraph job = new JobGraph("Pointwise job", firstVertex, secondVertex);

    // --- build the execution graph and start it ---
    final ExecutionGraph graph = newExecutionGraph(new FixedDelayRestartStrategy(1, 0L), slotScheduler);
    graph.attachJobGraph(job.getVerticesSortedTopologicallyFromSources());
    assertEquals(JobStatus.CREATED, graph.getState());
    graph.scheduleForExecution();
    assertEquals(JobStatus.RUNNING, graph.getState());

    // constraints must hold on the initial schedule ...
    validateConstraints(graph);

    // ... and again once the graph has been restarted after a failure
    final FiniteDuration restartTimeout = new FiniteDuration(2, TimeUnit.MINUTES);
    restartAfterFailure(graph, restartTimeout, false);
    validateConstraints(graph);

    haltExecution(graph);
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) Instance(org.apache.flink.runtime.instance.Instance) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) FiniteDuration(scala.concurrent.duration.FiniteDuration) SimpleActorGateway(org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.SimpleActorGateway) SlotSharingGroup(org.apache.flink.runtime.jobmanager.scheduler.SlotSharingGroup) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) Test(org.junit.Test)

Example 24 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class ExecutionGraphRestartTest method testFailingExecutionAfterRestart.

/**
 * Verifies that failing an execution attempt from before a restart leaves the restarted job
 * untouched. This matters when a callback handler fails an execution that has already reached
 * a final state, after the job has been restarted.
 */
@Test
public void testFailingExecutionAfterRestart() throws Exception {
    // --- environment ---
    final SimpleActorGateway actorGateway = new SimpleActorGateway(TestingUtils.directExecutionContext());
    final ActorTaskManagerGateway taskManagerGateway = new ActorTaskManagerGateway(actorGateway);
    final Instance taskManager = ExecutionGraphTestUtils.getInstance(taskManagerGateway, 2);
    final Scheduler slotScheduler = new Scheduler(TestingUtils.defaultExecutionContext());
    slotScheduler.newInstanceAvailable(taskManager);

    // --- job with two vertices ---
    final JobVertex upstream = newJobVertex("Task1", 1, NoOpInvokable.class);
    final JobVertex downstream = newJobVertex("Task2", 1, NoOpInvokable.class);
    final JobGraph job = new JobGraph("Pointwise job", upstream, downstream);

    final ExecutionGraph graph = newExecutionGraph(new FixedDelayRestartStrategy(1, 1000), slotScheduler);
    graph.attachJobGraph(job.getVerticesSortedTopologicallyFromSources());
    assertEquals(JobStatus.CREATED, graph.getState());
    graph.scheduleForExecution();
    assertEquals(JobStatus.RUNNING, graph.getState());

    // --- finish the first attempt, fail the second one ---
    final Iterator<ExecutionVertex> vertexIterator = graph.getAllExecutionVertices().iterator();
    final Execution oldFinishedAttempt = vertexIterator.next().getCurrentExecutionAttempt();
    final Execution oldFailedAttempt = vertexIterator.next().getCurrentExecutionAttempt();
    oldFinishedAttempt.markFinished();
    oldFailedAttempt.fail(new Exception("Test Exception"));
    oldFailedAttempt.cancelingComplete();

    // --- wait for the asynchronous restart ---
    final FiniteDuration restartTimeout = new FiniteDuration(2, TimeUnit.MINUTES);
    waitForAsyncRestart(graph, restartTimeout);
    assertEquals(JobStatus.RUNNING, graph.getState());

    // the restart is asynchronous, so also wait until all resources are assigned
    waitForAllResourcesToBeAssignedAfterAsyncRestart(graph, restartTimeout.fromNow());

    // every vertex must now have a resource; move its current attempt to running
    for (ExecutionVertex restarted : graph.getAllExecutionVertices()) {
        assertNotNull("No assigned resource (test instability).", restarted.getCurrentAssignedResource());
        restarted.getCurrentExecutionAttempt().switchToRunning();
    }

    // failing the attempt from before the restart must not influence the new attempts
    oldFinishedAttempt.fail(new Exception("This should have no effect"));

    for (ExecutionVertex restarted : graph.getAllExecutionVertices()) {
        restarted.getCurrentExecutionAttempt().markFinished();
    }

    // FINISHED is terminal, so the old attempt must still be FINISHED, and the job finished too
    assertEquals(ExecutionState.FINISHED, oldFinishedAttempt.getState());
    assertEquals(JobStatus.FINISHED, graph.getState());
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) FiniteDuration(scala.concurrent.duration.FiniteDuration) SimpleActorGateway(org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.SimpleActorGateway) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) IOException(java.io.IOException) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) Test(org.junit.Test)

Example 25 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class CoordinatorShutdownTest method testCoordinatorShutsDownOnSuccess.

/**
 * Submits a single-vertex job with snapshot settings to a local mini cluster, lets it finish,
 * and verifies that the job's checkpoint coordinator is either absent or shut down afterwards.
 *
 * <p>Any exception raised while running the test is rethrown as an {@link AssertionError} that
 * carries the original exception as its cause, so the test report shows the real stack trace.
 */
@Test
public void testCoordinatorShutsDownOnSuccess() {
    LocalFlinkMiniCluster cluster = null;
    try {
        // local cluster with one task manager offering one slot
        Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 1);
        cluster = new LocalFlinkMiniCluster(config, true);
        cluster.start();
        // build a test graph with snapshotting enabled
        JobVertex vertex = new JobVertex("Test Vertex");
        vertex.setInvokableClass(BlockingInvokable.class);
        List<JobVertexID> vertexIdList = Collections.singletonList(vertex.getID());
        JobGraph testGraph = new JobGraph("test job", vertex);
        // the single vertex id is used for all three vertex lists of the snapshot settings
        testGraph.setSnapshotSettings(new JobSnapshottingSettings(vertexIdList, vertexIdList, vertexIdList, 5000, 60000, 0L, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true));
        ActorGateway jmGateway = cluster.getLeaderGateway(TestingUtils.TESTING_DURATION());
        FiniteDuration timeout = new FiniteDuration(60, TimeUnit.SECONDS);
        JobManagerMessages.SubmitJob submitMessage = new JobManagerMessages.SubmitJob(testGraph, ListeningBehaviour.EXECUTION_RESULT);
        // submit is successful, but then the job blocks due to the invokable
        Future<Object> submitFuture = jmGateway.ask(submitMessage, timeout);
        Await.result(submitFuture, timeout);
        // get the execution graph and store the ExecutionGraph reference
        Future<Object> jobRequestFuture = jmGateway.ask(new JobManagerMessages.RequestJob(testGraph.getJobID()), timeout);
        ExecutionGraph graph = (ExecutionGraph) ((JobManagerMessages.JobFound) Await.result(jobRequestFuture, timeout)).executionGraph();
        assertNotNull(graph);
        // release the blocking invokable and wait for the job to finish
        BlockingInvokable.unblock();
        graph.waitUntilFinished();
        // verify that the coordinator was shut down
        CheckpointCoordinator coord = graph.getCheckpointCoordinator();
        assertTrue(coord == null || coord.isShutdown());
    } catch (Exception e) {
        // Keep the original exception as the cause instead of printing it and failing with
        // only its (possibly null) message.
        throw new AssertionError("Could not complete test: " + e.getMessage(), e);
    } finally {
        if (cluster != null) {
            cluster.shutdown();
            cluster.awaitTermination();
        }
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) LocalFlinkMiniCluster(org.apache.flink.runtime.minicluster.LocalFlinkMiniCluster) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Aggregations

FiniteDuration (scala.concurrent.duration.FiniteDuration): 77, Test (org.junit.Test): 61, Configuration (org.apache.flink.configuration.Configuration): 37, ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 30, JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 27, ActorRef (akka.actor.ActorRef): 25, Deadline (scala.concurrent.duration.Deadline): 24, JobID (org.apache.flink.api.common.JobID): 19, JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 19, JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages): 17, TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages): 13, ActorSystem (akka.actor.ActorSystem): 12, JavaTestKit (akka.testkit.JavaTestKit): 11, Timeout (akka.util.Timeout): 11, File (java.io.File): 11, TimeoutException (java.util.concurrent.TimeoutException): 11, AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway): 11, JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 11, Props (akka.actor.Props): 10, IOException (java.io.IOException): 10