Example 86 with ActorGateway

Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

Class TaskManagerRegistrationTest, method testTaskManagerResumesConnectAfterRefusedRegistration.

/**
 * Make sure that the TaskManager keeps trying to register, even after
 * registration attempts have been refused.
 */
@Test
public void testTaskManagerResumesConnectAfterRefusedRegistration() {
    new JavaTestKit(actorSystem) {

        {
            ActorGateway jm = null;
            ActorGateway taskManager = null;
            try {
                jm = TestingUtils.createForwardingActor(actorSystem, getTestActor(), Option.<String>empty());
                final ActorGateway jmGateway = jm;
                FiniteDuration refusedRegistrationPause = new FiniteDuration(500, TimeUnit.MILLISECONDS);
                Configuration tmConfig = new Configuration(config);
                tmConfig.setString(ConfigConstants.TASK_MANAGER_REFUSED_REGISTRATION_PAUSE, refusedRegistrationPause.toString());
                // we make the test actor (the test kit) the JobManager to intercept
                // the messages
                taskManager = createTaskManager(actorSystem, jmGateway, tmConfig, true, false);
                final ActorGateway taskManagerGateway = taskManager;
                // check and decline initial registration
                new Within(timeout) {

                    @Override
                    protected void run() {
                        // the TaskManager should try to register
                        expectMsgClass(RegisterTaskManager.class);
                        // we decline the registration
                        taskManagerGateway.tell(new RefuseRegistration(new Exception("test reason")), jmGateway);
                    }
                };
                // the TaskManager should wait a bit and retry...
                FiniteDuration maxDelay = (FiniteDuration) refusedRegistrationPause.$times(3.0);
                new Within(maxDelay) {

                    @Override
                    protected void run() {
                        expectMsgClass(RegisterTaskManager.class);
                    }
                };
            } catch (Throwable e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                stopActor(taskManager);
                stopActor(jm);
            }
        }
    };
}
Also used: Configuration (org.apache.flink.configuration.Configuration), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), FiniteDuration (scala.concurrent.duration.FiniteDuration), RefuseRegistration (org.apache.flink.runtime.messages.RegistrationMessages.RefuseRegistration), JavaTestKit (akka.testkit.JavaTestKit), InvalidActorNameException (akka.actor.InvalidActorNameException), Test (org.junit.Test)
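
The cast in front of $times in the example above is easy to miss: Scala's * operator on FiniteDuration is exposed to Java under the JVM method name $times, and the Double overload returns the wider Duration type, so Java callers must cast back. A minimal standalone sketch of that interop (the class name is ours, for illustration only):

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.FiniteDuration;

public class DurationInteropSketch {

    public static void main(String[] args) {
        FiniteDuration pause = new FiniteDuration(500, TimeUnit.MILLISECONDS);
        // Scala's `*` operator compiles to the method name $times; the Double
        // overload returns the wider Duration type, hence the cast back.
        FiniteDuration maxDelay = (FiniteDuration) pause.$times(3.0);
        System.out.println(maxDelay); // prints "1500 milliseconds"
    }
}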

Example 87 with ActorGateway

Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

Class TaskManagerTest, method testLocalPartitionNotFound.

/**
 * Tests that repeated local {@link PartitionNotFoundException}s ultimately fail the receiver.
 */
@Test
public void testLocalPartitionNotFound() throws Exception {
    new JavaTestKit(system) {

        {
            ActorGateway jobManager = null;
            ActorGateway taskManager = null;
            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
            try {
                final IntermediateDataSetID resultId = new IntermediateDataSetID();
                // Create the JM
                ActorRef jm = system.actorOf(Props.create(new SimplePartitionStateLookupJobManagerCreator(leaderSessionID, getTestActor())));
                jobManager = new AkkaActorGateway(jm, leaderSessionID);
                final Configuration config = new Configuration();
                config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL, 100);
                config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX, 200);
                taskManager = TestingUtils.createTaskManager(system, jobManager, config, true, true);
                // ---------------------------------------------------------------------------------
                final ActorGateway tm = taskManager;
                final JobID jid = new JobID();
                final JobVertexID vid = new JobVertexID();
                final ExecutionAttemptID eid = new ExecutionAttemptID();
                final ResultPartitionID partitionId = new ResultPartitionID();
                // Local location (on the same TM though) for the partition
                final ResultPartitionLocation loc = ResultPartitionLocation.createLocal();
                final InputChannelDeploymentDescriptor[] icdd = new InputChannelDeploymentDescriptor[] { new InputChannelDeploymentDescriptor(partitionId, loc) };
                final InputGateDeploymentDescriptor igdd = new InputGateDeploymentDescriptor(resultId, ResultPartitionType.PIPELINED, 0, icdd);
                final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(jid, "TestJob", vid, eid, new SerializedValue<>(new ExecutionConfig()), "Receiver", 1, 0, 1, 0, new Configuration(), new Configuration(), Tasks.AgnosticReceiver.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.singletonList(igdd), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), 0);
                new Within(new FiniteDuration(120, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        // Submit the task
                        tm.tell(new SubmitTask(tdd), testActorGateway);
                        expectMsgClass(Acknowledge.get().getClass());
                        // Wait to be notified about the final execution state by the mock JM
                        TaskExecutionState msg = expectMsgClass(TaskExecutionState.class);
                        // The task should fail after repeated requests
                        assertEquals(ExecutionState.FAILED, msg.getExecutionState());
                        Throwable error = msg.getError(getClass().getClassLoader());
                        if (error.getClass() != PartitionNotFoundException.class) {
                            error.printStackTrace();
                            fail("Wrong exception: " + error.getMessage());
                        }
                    }
                };
            } catch (Exception e) {
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                TestingUtils.stopActor(taskManager);
                TestingUtils.stopActor(jobManager);
            }
        }
    };
}
Also used: AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway), TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration), Configuration (org.apache.flink.configuration.Configuration), ActorRef (akka.actor.ActorRef), JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID), ExecutionConfig (org.apache.flink.api.common.ExecutionConfig), ResultPartitionLocation (org.apache.flink.runtime.deployment.ResultPartitionLocation), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID), IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID), TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor), SubmitTask (org.apache.flink.runtime.messages.TaskMessages.SubmitTask), ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID), FiniteDuration (scala.concurrent.duration.FiniteDuration), InputGateDeploymentDescriptor (org.apache.flink.runtime.deployment.InputGateDeploymentDescriptor), PartitionNotFoundException (org.apache.flink.runtime.io.network.partition.PartitionNotFoundException), IOException (java.io.IOException), InputChannelDeploymentDescriptor (org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor), IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID), JavaTestKit (akka.testkit.JavaTestKit), JobID (org.apache.flink.api.common.JobID), Test (org.junit.Test)
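
For context on the gateway objects these tests pass around: an AkkaActorGateway pairs a raw ActorRef with the current leader session ID, so messages sent through it can be tagged and messages from a stale leader filtered out. A minimal sketch, assuming the Flink 1.2-era API; the EchoActor and the class name are ours, for illustration only:

import java.util.UUID;

import akka.actor.ActorRef;
import akka.actor.ActorSystem;
import akka.actor.Props;
import akka.actor.UntypedActor;

import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.instance.AkkaActorGateway;

public class GatewaySketch {

    // Hypothetical actor, present only so there is something to wrap.
    public static class EchoActor extends UntypedActor {
        @Override
        public void onReceive(Object message) {
            getSender().tell(message, getSelf());
        }
    }

    public static void main(String[] args) {
        ActorSystem system = ActorSystem.create("sketch");
        ActorRef ref = system.actorOf(Props.create(EchoActor.class));
        // The gateway tags outgoing messages with this session ID so a
        // receiver can drop messages that belong to an old leader epoch.
        UUID leaderSessionID = UUID.randomUUID();
        ActorGateway gateway = new AkkaActorGateway(ref, leaderSessionID);
        gateway.tell("hello"); // fire-and-forget through the gateway
        system.shutdown();
    }
}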

Example 88 with ActorGateway

Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

Class AccumulatorLiveITCase, method verifyResults.

private static void verifyResults() {
    new JavaTestKit(system) {

        {
            ActorGateway selfGateway = new AkkaActorGateway(getRef(), jobManagerGateway.leaderSessionID());
            // register for accumulator changes
            jobManagerGateway.tell(new TestingJobManagerMessages.NotifyWhenAccumulatorChange(jobID), selfGateway);
            expectMsgEquals(TIMEOUT, true);
            // submit job
            jobManagerGateway.tell(new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.EXECUTION_RESULT), selfGateway);
            expectMsgClass(TIMEOUT, JobManagerMessages.JobSubmitSuccess.class);
            TestingJobManagerMessages.UpdatedAccumulators msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            Map<String, Accumulator<?, ?>> userAccumulators = msg.userAccumulators();
            ExecutionAttemptID mapperTaskID = null;
            ExecutionAttemptID sinkTaskID = null;
            /* Check for accumulator values */
            if (checkUserAccumulators(0, userAccumulators)) {
                LOG.info("Passed initial check for map task.");
            } else {
                fail("Wrong accumulator results when map task begins execution.");
            }
            int expectedAccVal = 0;
            /* for mapper task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                expectedAccVal += i;
                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);
                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    // we determined the wrong task id and need to switch the two here
                    ExecutionAttemptID temp = mapperTaskID;
                    mapperTaskID = sinkTaskID;
                    sinkTaskID = temp;
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }
            msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
            userAccumulators = msg.userAccumulators();
            if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                LOG.info("Passed initial check for sink task.");
            } else {
                fail("Wrong accumulator results when sink task begins execution.");
            }
            /* for sink task */
            for (int i = 1; i <= NUM_ITERATIONS; i++) {
                // receive message
                msg = (TestingJobManagerMessages.UpdatedAccumulators) receiveOne(TIMEOUT);
                userAccumulators = msg.userAccumulators();
                LOG.info("{}", userAccumulators);
                if (checkUserAccumulators(expectedAccVal, userAccumulators)) {
                    LOG.info("Passed round #" + i);
                } else {
                    fail("Failed in round #" + i);
                }
            }
            expectMsgClass(TIMEOUT, JobManagerMessages.JobResultSuccess.class);
        }
    };
}
Also used: AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway), Accumulator (org.apache.flink.api.common.accumulators.Accumulator), ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID), JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages), TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), JavaTestKit (akka.testkit.JavaTestKit)
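
The checkUserAccumulators helper is defined elsewhere in AccumulatorLiveITCase and not shown here. A hedged sketch of its likely shape; the accumulator name "test" and the IntCounter type are assumptions for illustration, not the actual implementation:

import java.util.Map;

import org.apache.flink.api.common.accumulators.Accumulator;
import org.apache.flink.api.common.accumulators.IntCounter;

public class AccumulatorCheckSketch {

    // Hypothetical re-creation: the real helper lives in AccumulatorLiveITCase.
    // The accumulator name "test" is an assumption made for illustration.
    static boolean checkUserAccumulators(int expected, Map<String, Accumulator<?, ?>> accumulators) {
        Accumulator<?, ?> acc = accumulators.get("test");
        return acc instanceof IntCounter && ((IntCounter) acc).getLocalValue() == expected;
    }
}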

Example 89 with ActorGateway

Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

Class SavepointITCase, method testTriggerSavepointAndResumeWithFileBasedCheckpoints.

/**
 * Triggers a savepoint for a job that uses the FsStateBackend. We expect
 * that all checkpoint files are written to a new savepoint directory.
 *
 * <ol>
 * <li>Submit job, wait for some progress</li>
 * <li>Trigger savepoint and verify that savepoint has been created</li>
 * <li>Shut down the cluster, re-submit the job from the savepoint,
 * verify that the initial state has been reset, and
 * all tasks are running again</li>
 * <li>Cancel job, dispose the savepoint, and verify that everything
 * has been cleaned up</li>
 * </ol>
 */
@Test
public void testTriggerSavepointAndResumeWithFileBasedCheckpoints() throws Exception {
    // Config
    final int numTaskManagers = 2;
    final int numSlotsPerTaskManager = 2;
    final int parallelism = numTaskManagers * numSlotsPerTaskManager;
    final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    final File testRoot = folder.newFolder();
    TestingCluster flink = null;
    try {
        // Create a test actor system
        ActorSystem testActorSystem = AkkaUtils.createDefaultActorSystem();
        // Flink configuration
        final Configuration config = new Configuration();
        config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
        config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
        final File checkpointDir = new File(testRoot, "checkpoints");
        final File savepointRootDir = new File(testRoot, "savepoints");
        if (!checkpointDir.mkdir() || !savepointRootDir.mkdirs()) {
            fail("Test setup failed: failed to create temporary directories.");
        }
        // Use file based checkpoints
        config.setString(CoreOptions.STATE_BACKEND, "filesystem");
        config.setString(FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY, checkpointDir.toURI().toString());
        config.setString(FsStateBackendFactory.MEMORY_THRESHOLD_CONF_KEY, "0");
        config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointRootDir.toURI().toString());
        // Start Flink
        flink = new TestingCluster(config);
        flink.start(true);
        // Submit the job
        final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
        final JobID jobId = jobGraph.getJobID();
        // Reset the static test job helpers
        StatefulCounter.resetForTest(parallelism);
        // Retrieve the job manager
        ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("Submitting job " + jobGraph.getJobID() + " in detached mode.");
        flink.submitJobDetached(jobGraph);
        LOG.info("Waiting for some progress.");
        // wait for the JobManager to be ready
        Future<Object> allRunning = jobManager.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
        Await.ready(allRunning, deadline.timeLeft());
        // wait for the Tasks to be ready
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        LOG.info("Triggering a savepoint.");
        Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
        final String savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
        LOG.info("Retrieved savepoint path: " + savepointPath + ".");
        // Retrieve the savepoint from the testing job manager
        LOG.info("Requesting the savepoint.");
        Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
        SavepointV1 savepoint = (SavepointV1) ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();
        LOG.info("Retrieved savepoint: " + savepointPath + ".");
        // Shut down the Flink cluster (thereby canceling the job)
        LOG.info("Shutting down Flink cluster.");
        flink.shutdown();
        flink.awaitTermination();
        // - Verification START -------------------------------------------
        // Only one savepoint should exist
        File[] files = savepointRootDir.listFiles();
        if (files != null) {
            assertEquals("Savepoint not created in expected directory", 1, files.length);
            assertTrue("Savepoint did not create self-contained directory", files[0].isDirectory());
            File savepointDir = files[0];
            File[] savepointFiles = savepointDir.listFiles();
            assertNotNull(savepointFiles);
            // Expect one metadata file and one checkpoint file per stateful
            // parallel subtask
            String errMsg = "Did not write expected number of savepoint/checkpoint files to directory: " + Arrays.toString(savepointFiles);
            assertEquals(errMsg, 1 + parallelism, savepointFiles.length);
        } else {
            fail("Savepoint not created in expected directory");
        }
        // We currently have the following directory layout: checkpointDir/jobId/chk-ID
        File jobCheckpoints = new File(checkpointDir, jobId.toString());
        if (jobCheckpoints.exists()) {
            files = jobCheckpoints.listFiles();
            assertNotNull("Checkpoint directory empty", files);
            assertEquals("Checkpoints directory not clean: " + Arrays.toString(files), 0, files.length);
        }
        // - Verification END ---------------------------------------------
        // Restart the cluster
        LOG.info("Restarting Flink cluster.");
        flink.start();
        // Retrieve the job manager
        LOG.info("Retrieving JobManager.");
        jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());
        LOG.info("JobManager: " + jobManager + ".");
        // Reset static test helpers
        StatefulCounter.resetForTest(parallelism);
        // Gather all task deployment descriptors
        final Throwable[] error = new Throwable[1];
        final TestingCluster finalFlink = flink;
        final Multimap<JobVertexID, TaskDeploymentDescriptor> tdds = HashMultimap.create();
        new JavaTestKit(testActorSystem) {

            {
                new Within(deadline.timeLeft()) {

                    @Override
                    protected void run() {
                        try {
                            // Register to all submit task messages for job
                            for (ActorRef taskManager : finalFlink.getTaskManagersAsJava()) {
                                taskManager.tell(new TestingTaskManagerMessages.RegisterSubmitTaskListener(jobId), getTestActor());
                            }
                            // Set the savepoint path
                            jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
                            LOG.info("Resubmitting job " + jobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
                            // Submit the job
                            finalFlink.submitJobDetached(jobGraph);
                            int numTasks = 0;
                            for (JobVertex jobVertex : jobGraph.getVertices()) {
                                numTasks += jobVertex.getParallelism();
                            }
                            // Gather the task deployment descriptors
                            LOG.info("Gathering " + numTasks + " submitted " + "TaskDeploymentDescriptor instances.");
                            for (int i = 0; i < numTasks; i++) {
                                ResponseSubmitTaskListener resp = (ResponseSubmitTaskListener) expectMsgAnyClassOf(getRemainingTime(), ResponseSubmitTaskListener.class);
                                TaskDeploymentDescriptor tdd = resp.tdd();
                                LOG.info("Received: " + tdd.toString() + ".");
                                TaskInformation taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
                                tdds.put(taskInformation.getJobVertexId(), tdd);
                            }
                        } catch (Throwable t) {
                            error[0] = t;
                        }
                    }
                };
            }
        };
        // - Verification START -------------------------------------------
        String errMsg = "Error during gathering of TaskDeploymentDescriptors";
        assertNull(errMsg, error[0]);
        // Verify that every task state in the savepoint has a matching task deployment descriptor.
        for (TaskState taskState : savepoint.getTaskStates()) {
            Collection<TaskDeploymentDescriptor> taskTdds = tdds.get(taskState.getJobVertexID());
            errMsg = "Missing task for savepoint state for operator " + taskState.getJobVertexID() + ".";
            assertTrue(errMsg, taskTdds.size() > 0);
            assertEquals(taskState.getNumberCollectedStates(), taskTdds.size());
            for (TaskDeploymentDescriptor tdd : taskTdds) {
                SubtaskState subtaskState = taskState.getState(tdd.getSubtaskIndex());
                assertNotNull(subtaskState);
                errMsg = "Initial operator state mismatch.";
                assertEquals(errMsg, subtaskState.getLegacyOperatorState(), tdd.getTaskStateHandles().getLegacyOperatorState());
            }
        }
        // Await state is restored
        StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // Await some progress after restore
        StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // - Verification END ---------------------------------------------
        LOG.info("Cancelling job " + jobId + ".");
        jobManager.tell(new CancelJob(jobId));
        LOG.info("Disposing savepoint " + savepointPath + ".");
        Future<Object> disposeFuture = jobManager.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());
        errMsg = "Failed to dispose savepoint " + savepointPath + ".";
        Object resp = Await.result(disposeFuture, deadline.timeLeft());
        assertTrue(errMsg, resp.getClass() == getDisposeSavepointSuccess().getClass());
        // - Verification START -------------------------------------------
        // The checkpoint files
        List<File> checkpointFiles = new ArrayList<>();
        for (TaskState stateForTaskGroup : savepoint.getTaskStates()) {
            for (SubtaskState subtaskState : stateForTaskGroup.getStates()) {
                ChainedStateHandle<StreamStateHandle> streamTaskState = subtaskState.getLegacyOperatorState();
                for (int i = 0; i < streamTaskState.getLength(); i++) {
                    if (streamTaskState.get(i) != null) {
                        FileStateHandle fileStateHandle = (FileStateHandle) streamTaskState.get(i);
                        checkpointFiles.add(new File(fileStateHandle.getFilePath().toUri()));
                    }
                }
            }
        }
        // The checkpoint files of the savepoint should have been discarded
        for (File f : checkpointFiles) {
            errMsg = "Checkpoint file " + f + " not cleaned up properly.";
            assertFalse(errMsg, f.exists());
        }
        if (checkpointFiles.size() > 0) {
            File parent = checkpointFiles.get(0).getParentFile();
            errMsg = "Checkpoint parent directory " + parent + " not cleaned up properly.";
            assertFalse(errMsg, parent.exists());
        }
        // All savepoints should have been cleaned up
        errMsg = "Savepoints directory not cleaned up properly: " + Arrays.toString(savepointRootDir.listFiles()) + ".";
        assertEquals(errMsg, 0, savepointRootDir.listFiles().length);
    // - Verification END ---------------------------------------------
    } finally {
        if (flink != null) {
            flink.shutdown();
        }
    }
}
Also used: ActorSystem (akka.actor.ActorSystem), RequestSavepoint (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.RequestSavepoint), Configuration (org.apache.flink.configuration.Configuration), ActorRef (akka.actor.ActorRef), JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID), ArrayList (java.util.ArrayList), ResponseSubmitTaskListener (org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages.ResponseSubmitTaskListener), TestingCluster (org.apache.flink.runtime.testingUtils.TestingCluster), StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle), SavepointV1 (org.apache.flink.runtime.checkpoint.savepoint.SavepointV1), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor), CancelJob (org.apache.flink.runtime.messages.JobManagerMessages.CancelJob), TestingTaskManagerMessages (org.apache.flink.runtime.testingUtils.TestingTaskManagerMessages), TaskInformation (org.apache.flink.runtime.executiongraph.TaskInformation), WaitForAllVerticesToBeRunning (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning), Deadline (scala.concurrent.duration.Deadline), FiniteDuration (scala.concurrent.duration.FiniteDuration), FileStateHandle (org.apache.flink.runtime.state.filesystem.FileStateHandle), TriggerSavepoint (org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint), ResponseSavepoint (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.ResponseSavepoint), DisposeSavepoint (org.apache.flink.runtime.messages.JobManagerMessages.DisposeSavepoint), TriggerSavepointSuccess (org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointSuccess), JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), JobVertex (org.apache.flink.runtime.jobgraph.JobVertex), SubtaskState (org.apache.flink.runtime.checkpoint.SubtaskState), File (java.io.File), TaskState (org.apache.flink.runtime.checkpoint.TaskState), JobID (org.apache.flink.api.common.JobID), JavaTestKit (akka.testkit.JavaTestKit), Test (org.junit.Test)
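
The heart of this test is the ask round trip against the JobManager gateway. Isolated into a helper, the trigger-savepoint exchange looks like this; a minimal sketch assuming jobManager points at a running JobManager and jobId names a running job (the helper method and its error handling are ours):

import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.instance.ActorGateway;
import org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint;
import org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepointSuccess;

import scala.Option;
import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.duration.FiniteDuration;

public class SavepointTriggerSketch {

    static String triggerSavepoint(ActorGateway jobManager, JobID jobId) throws Exception {
        FiniteDuration timeout = new FiniteDuration(1, TimeUnit.MINUTES);
        // ask() returns a Scala future; Await.result blocks the caller until
        // the JobManager answers or the timeout expires.
        Future<Object> future = jobManager.ask(
                new TriggerSavepoint(jobId, Option.<String>empty()), timeout);
        Object result = Await.result(future, timeout);
        if (result instanceof TriggerSavepointSuccess) {
            return ((TriggerSavepointSuccess) result).savepointPath();
        }
        throw new IllegalStateException("Savepoint was not triggered: " + result);
    }
}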

Example 90 with ActorGateway

Use of org.apache.flink.runtime.instance.ActorGateway in project flink by apache.

Class SavepointMigrationTestBase, method executeAndSavepoint.

@SafeVarargs
protected final void executeAndSavepoint(StreamExecutionEnvironment env, String savepointPath, Tuple2<String, Integer>... expectedAccumulators) throws Exception {
    // Retrieve the job manager
    ActorGateway jobManager = Await.result(cluster.leaderGateway().future(), DEADLINE.timeLeft());
    // Submit the job
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    JobSubmissionResult jobSubmissionResult = cluster.submitJobDetached(jobGraph);
    LOG.info("Submitted job {} and waiting...", jobSubmissionResult.getJobID());
    StandaloneClusterClient clusterClient = new StandaloneClusterClient(cluster.configuration());
    boolean done = false;
    while (DEADLINE.hasTimeLeft()) {
        Thread.sleep(100);
        Map<String, Object> accumulators = clusterClient.getAccumulators(jobSubmissionResult.getJobID());
        boolean allDone = true;
        for (Tuple2<String, Integer> acc : expectedAccumulators) {
            Integer numFinished = (Integer) accumulators.get(acc.f0);
            if (numFinished == null) {
                allDone = false;
                break;
            }
            if (!numFinished.equals(acc.f1)) {
                allDone = false;
                break;
            }
        }
        if (allDone) {
            done = true;
            break;
        }
    }
    if (!done) {
        fail("Did not see the expected accumulator results within time limit.");
    }
    LOG.info("Triggering savepoint.");
    // Flink 1.2
    final Future<Object> savepointResultFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobSubmissionResult.getJobID(), Option.<String>empty()), DEADLINE.timeLeft());
    // Flink 1.1
    //		final Future<Object> savepointResultFuture =
    //				jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobSubmissionResult.getJobID()), DEADLINE.timeLeft());
    Object savepointResult = Await.result(savepointResultFuture, DEADLINE.timeLeft());
    if (savepointResult instanceof JobManagerMessages.TriggerSavepointFailure) {
        fail("Error drawing savepoint: " + ((JobManagerMessages.TriggerSavepointFailure) savepointResult).cause());
    }
    // the JobManager stores the savepoint on its heap; we have to retrieve it
    final String jobmanagerSavepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResult).savepointPath();
    LOG.info("Saved savepoint: " + jobmanagerSavepointPath);
    // Flink 1.2
    FileUtils.moveFile(new File(new URI(jobmanagerSavepointPath).getPath()), new File(savepointPath));
// Flink 1.1
// Retrieve the savepoint from the testing job manager
//		LOG.info("Requesting the savepoint.");
//		Future<Object> savepointFuture = jobManager.ask(new TestingJobManagerMessages.RequestSavepoint(jobmanagerSavepointPath), DEADLINE.timeLeft());
//
//		Savepoint savepoint = ((TestingJobManagerMessages.ResponseSavepoint) Await.result(savepointFuture, DEADLINE.timeLeft())).savepoint();
//		LOG.info("Retrieved savepoint: " + jobmanagerSavepointPath + ".");
//
//		LOG.info("Storing savepoint to file.");
//		Configuration config = new Configuration();
//		config.setString(org.apache.flink.runtime.checkpoint.savepoint.SavepointStoreFactory.SAVEPOINT_BACKEND_KEY, "filesystem");
//		config.setString(org.apache.flink.runtime.checkpoint.savepoint.SavepointStoreFactory.SAVEPOINT_DIRECTORY_KEY, "file:///Users/aljoscha/Downloads");
//		String path = org.apache.flink.runtime.checkpoint.savepoint.SavepointStoreFactory.createFromConfig(config).storeSavepoint(savepoint);
//
//		FileUtils.moveFile(new File(new URI(path).getPath()), new File(savepointPath));
}
Also used: JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages), StandaloneClusterClient (org.apache.flink.client.program.StandaloneClusterClient), URI (java.net.URI), JobSubmissionResult (org.apache.flink.api.common.JobSubmissionResult), JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), ActorGateway (org.apache.flink.runtime.instance.ActorGateway), File (java.io.File)
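
The polling loop above leans on Scala's Deadline: fromNow() pins a wall-clock deadline once, and hasTimeLeft()/timeLeft() are measured against that fixed point on every call. A minimal standalone sketch of the idiom, with a hypothetical predicate standing in for the accumulator check:

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlinePollingSketch {

    // Hypothetical predicate standing in for the accumulator check above.
    static boolean conditionMet() {
        return System.nanoTime() % 1000 == 0;
    }

    public static void main(String[] args) throws InterruptedException {
        // fromNow() captures "now + 5 minutes" once; hasTimeLeft() is
        // re-evaluated against that fixed point on every iteration.
        Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
        boolean done = false;
        while (deadline.hasTimeLeft()) {
            if (conditionMet()) {
                done = true;
                break;
            }
            Thread.sleep(100);
        }
        if (!done) {
            throw new IllegalStateException("Timed out waiting for the condition.");
        }
    }
}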

Aggregations

ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 102 uses
Test (org.junit.Test): 81 uses
Configuration (org.apache.flink.configuration.Configuration): 44 uses
AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway): 41 uses
FiniteDuration (scala.concurrent.duration.FiniteDuration): 37 uses
JobID (org.apache.flink.api.common.JobID): 36 uses
JavaTestKit (akka.testkit.JavaTestKit): 34 uses
ActorRef (akka.actor.ActorRef): 30 uses
IOException (java.io.IOException): 26 uses
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 25 uses
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 22 uses
JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages): 22 uses
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 20 uses
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 17 uses
TaskManagerServicesConfiguration (org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration): 16 uses
SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob): 15 uses
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 14 uses
TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor): 14 uses
TriggerSavepoint (org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint): 13 uses
SubmitTask (org.apache.flink.runtime.messages.TaskMessages.SubmitTask): 13 uses