Search in sources :

Example 16 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class TaskManagerMetricsTest method testMetricRegistryLifeCycle.

/**
	 * Tests the metric registry life cycle on JobManager re-connects.
	 */
@Test
public void testMetricRegistryLifeCycle() throws Exception {
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
        // ================================================================
        // Start JobManager
        // ================================================================
        final ActorRef jobManager = JobManager.startJobManagerActors(new Configuration(), actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        // ================================================================
        // Start TaskManager
        // ================================================================
        final Configuration config = new Configuration();
        final ResourceID tmResourceID = ResourceID.generate();
        TaskManagerServicesConfiguration taskManagerServicesConfiguration = TaskManagerServicesConfiguration.fromConfiguration(config, InetAddress.getLocalHost(), false);
        TaskManagerConfiguration taskManagerConfiguration = TaskManagerConfiguration.fromConfiguration(config);
        TaskManagerServices taskManagerServices = TaskManagerServices.fromConfiguration(taskManagerServicesConfiguration, tmResourceID);
        final MetricRegistry tmRegistry = taskManagerServices.getMetricRegistry();
        // create the task manager
        final Props tmProps = TaskManager.getTaskManagerProps(TaskManager.class, taskManagerConfiguration, tmResourceID, taskManagerServices.getTaskManagerLocation(), taskManagerServices.getMemoryManager(), taskManagerServices.getIOManager(), taskManagerServices.getNetworkEnvironment(), leaderRetrievalService, tmRegistry);
        final ActorRef taskManager = actorSystem.actorOf(tmProps);
        new JavaTestKit(actorSystem) {

            {
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        // wait for the TM to be registered
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                        // trigger re-registration of TM; this should include a disconnect from the current JM
                        taskManager.tell(new TaskManagerMessages.JobManagerLeaderAddress(jobManager.path().toString(), null), jobManager);
                        // wait for re-registration to be completed
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };
        // verify that the registry was not shutdown due to the disconnect
        Assert.assertFalse(tmRegistry.isShutdown());
        // shut down the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) ActorRef(akka.actor.ActorRef) TaskManagerServices(org.apache.flink.runtime.taskexecutor.TaskManagerServices) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobManager(org.apache.flink.runtime.jobmanager.JobManager) Props(akka.actor.Props) TaskManagerMessages(org.apache.flink.runtime.messages.TaskManagerMessages) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 17 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class RescalingITCase method testSavepointRescalingPartitionedOperatorState.

/**
	 * Tests rescaling of partitioned operator state. More specific, we test the mechanism with {@link ListCheckpointed}
	 * as it subsumes {@link org.apache.flink.streaming.api.checkpoint.CheckpointedFunction}.
	 */
public void testSavepointRescalingPartitionedOperatorState(boolean scaleOut, OperatorCheckpointMethod checkpointMethod) throws Exception {
    final int parallelism = scaleOut ? numSlots : numSlots / 2;
    final int parallelism2 = scaleOut ? numSlots / 2 : numSlots;
    final int maxParallelism = 13;
    FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
    Deadline deadline = timeout.fromNow();
    JobID jobID = null;
    ActorGateway jobManager = null;
    int counterSize = Math.max(parallelism, parallelism2);
    if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION || checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST) {
        PartitionedStateSource.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSource.CHECK_CORRECT_RESTORE = new int[counterSize];
    } else {
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT = new int[counterSize];
        PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE = new int[counterSize];
    }
    try {
        jobManager = cluster.getLeaderGateway(deadline.timeLeft());
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, checkpointMethod);
        jobID = jobGraph.getJobID();
        cluster.submitJobDetached(jobGraph);
        Object savepointResponse = null;
        // wait until the operator is started
        StateSourceBase.workStartedLatch.await();
        while (deadline.hasTimeLeft()) {
            Future<Object> savepointPathFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
            FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
            savepointResponse = Await.result(savepointPathFuture, waitingTime);
            if (savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess) {
                break;
            }
            System.out.println(savepointResponse);
        }
        assertTrue(savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);
        final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();
        Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());
        Future<Object> cancellationResponseFuture = jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());
        Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
        assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
        Await.ready(jobRemovedFuture, deadline.timeLeft());
        // job successfully removed
        jobID = null;
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, checkpointMethod);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        jobID = scaledJobGraph.getJobID();
        cluster.submitJobAndWait(scaledJobGraph, false);
        int sumExp = 0;
        int sumAct = 0;
        if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION) {
            for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
                sumExp += c;
            }
            for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
                sumAct += c;
            }
        } else if (checkpointMethod == OperatorCheckpointMethod.CHECKPOINTED_FUNCTION_BROADCAST) {
            for (int c : PartitionedStateSource.CHECK_CORRECT_SNAPSHOT) {
                sumExp += c;
            }
            for (int c : PartitionedStateSource.CHECK_CORRECT_RESTORE) {
                sumAct += c;
            }
            sumExp *= parallelism2;
        } else {
            for (int c : PartitionedStateSourceListCheckpointed.CHECK_CORRECT_SNAPSHOT) {
                sumExp += c;
            }
            for (int c : PartitionedStateSourceListCheckpointed.CHECK_CORRECT_RESTORE) {
                sumAct += c;
            }
        }
        assertEquals(sumExp, sumAct);
        jobID = null;
    } finally {
        // clear any left overs from a possibly failed job
        if (jobID != null && jobManager != null) {
            Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
            try {
                Await.ready(jobRemovedFuture, timeout);
            } catch (TimeoutException | InterruptedException ie) {
                fail("Failed while cleaning up the cluster.");
            }
        }
    }
}
Also used : Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobID(org.apache.flink.api.common.JobID) TimeoutException(java.util.concurrent.TimeoutException)

Example 18 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class SavepointITCase method testCanRestoreWithModifiedStatelessOperators.

/**
	 * FLINK-5985
	 *
	 * This test ensures we can restore from a savepoint under modifications to the job graph that only concern
	 * stateless operators.
	 */
@Test
public void testCanRestoreWithModifiedStatelessOperators() throws Exception {
    // Config
    int numTaskManagers = 2;
    int numSlotsPerTaskManager = 2;
    int parallelism = 2;
    // Test deadline
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File tmpDir = CommonTestUtils.createTempDirectory();
    final File savepointDir = new File(tmpDir, "savepoints");
    TestingCluster flink = null;
    String savepointPath;
    try {
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointDir.toURI().toString());
        LOG.info("Flink configuration: " + config + ".");
        // Start Flink
        flink = new TestingCluster(config);
        LOG.info("Starting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        final StatefulCounter statefulCounter = new StatefulCounter();
        StatefulCounter.resetForTest(parallelism);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        env.addSource(new InfiniteTestSource()).shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return 4 * value;
            }
        }).shuffle().map(statefulCounter).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return 2 * value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph originalJobGraph = env.getStreamGraph().getJobGraph();
        JobSubmissionResult submissionResult = flink.submitJobDetached(originalJobGraph);
        JobID jobID = submissionResult.getJobID();
        // wait for the Tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
        savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();
    } finally {
        flink.shutdown();
        flink.awaitTermination();
    }
    try {
        LOG.info("Restarting Flink cluster.");
        flink.start(true);
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Gather all task deployment descriptors
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(parallelism);
        // generate a modified job graph that adds a stateless op
        env.addSource(new InfiniteTestSource()).shuffle().map(new StatefulCounter()).uid("statefulCounter").shuffle().map(new MapFunction<Integer, Integer>() {

            @Override
            public Integer map(Integer value) throws Exception {
                return value;
            }
        }).addSink(new DiscardingSink<Integer>());
        JobGraph modifiedJobGraph = env.getStreamGraph().getJobGraph();
        // Set the savepoint path
        modifiedJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        LOG.info("Resubmitting job " + modifiedJobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
        // Submit the job
        flink.submitJobDetached(modifiedJobGraph);
        // Await state is restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
    } finally {
        flink.shutdown();
        flink.awaitTermination();
    }
}
Also used : RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) Configuration(org.apache.flink.configuration.Configuration) MapFunction(org.apache.flink.api.common.functions.MapFunction) RichFlatMapFunction(org.apache.flink.api.common.functions.RichFlatMapFunction) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) ResponseSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint) RequestSavepoint(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint) DisposeSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint) FileNotFoundException(java.io.FileNotFoundException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) JobSubmissionResult(org.apache.flink.api.common.JobSubmissionResult) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) File(java.io.File) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 19 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class CancelingTestBase method runAndCancelJob.

public void runAndCancelJob(Plan plan, final int msecsTillCanceling, int maxTimeTillCanceled) throws Exception {
    try {
        // submit job
        final JobGraph jobGraph = getJobGraph(plan);
        executor.submitJobDetached(jobGraph);
        // Wait for the job to make some progress and then cancel
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, executor.getLeaderGateway(TestingUtils.TESTING_DURATION()), TestingUtils.TESTING_DURATION());
        Thread.sleep(msecsTillCanceling);
        FiniteDuration timeout = new FiniteDuration(maxTimeTillCanceled, TimeUnit.MILLISECONDS);
        ActorGateway jobManager = executor.getLeaderGateway(TestingUtils.TESTING_DURATION());
        Future<Object> ask = jobManager.ask(new CancelJob(jobGraph.getJobID()), timeout);
        Object result = Await.result(ask, timeout);
        if (result instanceof CancellationSuccess) {
        // all good
        } else if (result instanceof CancellationFailure) {
            // Failure
            CancellationFailure failure = (CancellationFailure) result;
            throw new Exception("Failed to cancel job with ID " + failure.jobID() + ".", failure.cause());
        } else {
            throw new Exception("Unexpected response to cancel request: " + result);
        }
        // Wait for the job to be cancelled
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.CANCELED, executor.getLeaderGateway(TestingUtils.TESTING_DURATION()), TestingUtils.TESTING_DURATION());
    } catch (Exception e) {
        LOG.error("Exception found in runAndCancelJob.", e);
        e.printStackTrace();
        Assert.fail(e.getMessage());
    }
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) CancellationFailure(org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure) FiniteDuration(scala.concurrent.duration.FiniteDuration) CancelJob(org.apache.flink.runtime.messages.JobManagerMessages.CancelJob)

Example 20 with FiniteDuration

use of scala.concurrent.duration.FiniteDuration in project flink by apache.

the class YarnResourceManager method createTaskExecutorLaunchContext.

private ContainerLaunchContext createTaskExecutorLaunchContext(Resource resource, String containerId, String host) throws Exception {
    // init the ContainerLaunchContext
    final String currDir = ENV.get(ApplicationConstants.Environment.PWD.key());
    final ContaineredTaskManagerParameters taskManagerParameters = ContaineredTaskManagerParameters.create(flinkConfig, resource.getMemory(), 1);
    LOG.info("TaskExecutor{} will be started with container size {} MB, JVM heap size {} MB, " + "JVM direct memory limit {} MB", containerId, taskManagerParameters.taskManagerTotalMemoryMB(), taskManagerParameters.taskManagerHeapSizeMB(), taskManagerParameters.taskManagerDirectMemoryLimitMB());
    int timeout = flinkConfig.getInteger(ConfigConstants.TASK_MANAGER_MAX_REGISTRATION_DURATION, DEFAULT_TASK_MANAGER_REGISTRATION_DURATION);
    FiniteDuration teRegistrationTimeout = new FiniteDuration(timeout, TimeUnit.SECONDS);
    final Configuration taskManagerConfig = BootstrapTools.generateTaskManagerConfiguration(flinkConfig, "", 0, 1, teRegistrationTimeout);
    LOG.debug("TaskManager configuration: {}", taskManagerConfig);
    ContainerLaunchContext taskExecutorLaunchContext = Utils.createTaskExecutorContext(flinkConfig, yarnConfig, ENV, taskManagerParameters, taskManagerConfig, currDir, YarnTaskExecutorRunner.class, LOG);
    // set a special environment variable to uniquely identify this container
    taskExecutorLaunchContext.getEnvironment().put(ENV_FLINK_CONTAINER_ID, containerId);
    taskExecutorLaunchContext.getEnvironment().put(ENV_FLINK_NODE_ID, host);
    return taskExecutorLaunchContext;
}
Also used : YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ResourceManagerConfiguration(org.apache.flink.runtime.resourcemanager.ResourceManagerConfiguration) Configuration(org.apache.flink.configuration.Configuration) ContaineredTaskManagerParameters(org.apache.flink.runtime.clusterframework.ContaineredTaskManagerParameters) FiniteDuration(scala.concurrent.duration.FiniteDuration) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext)

Aggregations

FiniteDuration (scala.concurrent.duration.FiniteDuration)77 Test (org.junit.Test)61 Configuration (org.apache.flink.configuration.Configuration)37 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)30 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)27 ActorRef (akka.actor.ActorRef)25 Deadline (scala.concurrent.duration.Deadline)24 JobID (org.apache.flink.api.common.JobID)19 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)19 JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages)17 TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages)13 ActorSystem (akka.actor.ActorSystem)12 JavaTestKit (akka.testkit.JavaTestKit)11 Timeout (akka.util.Timeout)11 File (java.io.File)11 TimeoutException (java.util.concurrent.TimeoutException)11 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)11 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)11 Props (akka.actor.Props)10 IOException (java.io.IOException)10