Search in sources :

Example 1 with CancellationFailure

use of org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure in project flink by apache.

the class CliFrontend method cancel.

/**
	 * Executes the CANCEL action.
	 * 
	 * @param args Command line arguments for the cancel action.
	 */
protected int cancel(String[] args) {
    LOG.info("Running 'cancel' command.");
    CancelOptions options;
    try {
        options = CliFrontendParser.parseCancelCommand(args);
    } catch (CliArgsException e) {
        return handleArgException(e);
    } catch (Throwable t) {
        return handleError(t);
    }
    // evaluate help flag
    if (options.isPrintHelp()) {
        CliFrontendParser.printHelpForCancel();
        return 0;
    }
    String[] cleanedArgs = options.getArgs();
    boolean withSavepoint = options.isWithSavepoint();
    String targetDirectory = options.getSavepointTargetDirectory();
    JobID jobId;
    // - cancel -s <targetDir> <jobID> => custom target dir (parsed correctly)
    if (cleanedArgs.length > 0) {
        String jobIdString = cleanedArgs[0];
        try {
            jobId = new JobID(StringUtils.hexStringToByte(jobIdString));
        } catch (Exception e) {
            LOG.error("Error: The value for the Job ID is not a valid ID.");
            System.out.println("Error: The value for the Job ID is not a valid ID.");
            return 1;
        }
    } else if (targetDirectory != null) {
        // Try this for case: cancel -s <jobID> (default savepoint target dir)
        String jobIdString = targetDirectory;
        try {
            jobId = new JobID(StringUtils.hexStringToByte(jobIdString));
            targetDirectory = null;
        } catch (Exception e) {
            LOG.error("Missing JobID in the command line arguments.");
            System.out.println("Error: Specify a Job ID to cancel a job.");
            return 1;
        }
    } else {
        LOG.error("Missing JobID in the command line arguments.");
        System.out.println("Error: Specify a Job ID to cancel a job.");
        return 1;
    }
    try {
        ActorGateway jobManager = getJobManagerGateway(options);
        Object cancelMsg;
        if (withSavepoint) {
            if (targetDirectory == null) {
                logAndSysout("Cancelling job " + jobId + " with savepoint to default savepoint directory.");
            } else {
                logAndSysout("Cancelling job " + jobId + " with savepoint to " + targetDirectory + ".");
            }
            cancelMsg = new CancelJobWithSavepoint(jobId, targetDirectory);
        } else {
            logAndSysout("Cancelling job " + jobId + ".");
            cancelMsg = new CancelJob(jobId);
        }
        Future<Object> response = jobManager.ask(cancelMsg, clientTimeout);
        final Object rc = Await.result(response, clientTimeout);
        if (rc instanceof CancellationSuccess) {
            if (withSavepoint) {
                CancellationSuccess success = (CancellationSuccess) rc;
                String savepointPath = success.savepointPath();
                logAndSysout("Cancelled job " + jobId + ". Savepoint stored in " + savepointPath + ".");
            } else {
                logAndSysout("Cancelled job " + jobId + ".");
            }
        } else if (rc instanceof CancellationFailure) {
            throw new Exception("Canceling the job with ID " + jobId + " failed.", ((CancellationFailure) rc).cause());
        } else {
            throw new IllegalStateException("Unexpected response: " + rc);
        }
        return 0;
    } catch (Throwable t) {
        return handleError(t);
    }
}
Also used : CancelOptions(org.apache.flink.client.cli.CancelOptions) CancelJobWithSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.CancelJobWithSavepoint) CliArgsException(org.apache.flink.client.cli.CliArgsException) ProgramInvocationException(org.apache.flink.client.program.ProgramInvocationException) ProgramMissingJobException(org.apache.flink.client.program.ProgramMissingJobException) InvalidProgramException(org.apache.flink.api.common.InvalidProgramException) ProgramParametrizationException(org.apache.flink.client.program.ProgramParametrizationException) FileNotFoundException(java.io.FileNotFoundException) InvocationTargetException(java.lang.reflect.InvocationTargetException) IllegalConfigurationException(org.apache.flink.configuration.IllegalConfigurationException) CliArgsException(org.apache.flink.client.cli.CliArgsException) IOException(java.io.IOException) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) CancellationSuccess(org.apache.flink.runtime.messages.JobManagerMessages.CancellationSuccess) CancellationFailure(org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure) CancelJob(org.apache.flink.runtime.messages.JobManagerMessages.CancelJob) JobID(org.apache.flink.api.common.JobID)

Example 2 with CancellationFailure

use of org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure in project flink by apache.

the class JobManagerTest method testCancelWithSavepoint.

@Test
public void testCancelWithSavepoint() throws Exception {
    File defaultSavepointDir = tmpFolder.newFolder();
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    Configuration config = new Configuration();
    config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, defaultSavepointDir.getAbsolutePath());
    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);
        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);
        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);
        taskManager = new AkkaActorGateway(taskManagerRef, null);
        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);
        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);
        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(Collections.singletonList(sourceVertex.getID()), Collections.singletonList(sourceVertex.getID()), Collections.singletonList(sourceVertex.getID()), 3600000, 3600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true);
        jobGraph.setSnapshotSettings(snapshottingSettings);
        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);
        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);
        // Notify when canelled
        msg = new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.CANCELED);
        Future<Object> cancelled = jobManager.ask(msg, timeout);
        // Cancel with savepoint
        String savepointPath = null;
        for (int i = 0; i < 10; i++) {
            msg = new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null);
            CancellationResponse cancelResp = (CancellationResponse) Await.result(jobManager.ask(msg, timeout), timeout);
            if (cancelResp instanceof CancellationFailure) {
                CancellationFailure failure = (CancellationFailure) cancelResp;
                if (failure.cause().getMessage().contains(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING.message())) {
                    // wait and retry
                    Thread.sleep(200);
                } else {
                    failure.cause().printStackTrace();
                    fail("Failed to cancel job: " + failure.cause().getMessage());
                }
            } else {
                savepointPath = ((CancellationSuccess) cancelResp).savepointPath();
                break;
            }
        }
        // Verify savepoint path
        assertNotEquals("Savepoint not triggered", null, savepointPath);
        // Wait for job status change
        Await.ready(cancelled, timeout);
        File savepointFile = new File(savepointPath);
        assertEquals(true, savepointFile.exists());
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) TriggerSavepoint(org.apache.flink.runtime.messages.JobManagerMessages.TriggerSavepoint) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) CancellationFailure(org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure) File(java.io.File) NotifyWhenJobStatus(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.NotifyWhenJobStatus) CancellationResponse(org.apache.flink.runtime.messages.JobManagerMessages.CancellationResponse) Test(org.junit.Test)

Example 3 with CancellationFailure

use of org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure in project flink by apache.

the class JobManagerTest method testCancelWithSavepointNoDirectoriesConfigured.

/**
	 * Tests that a meaningful exception is returned if no savepoint directory is
	 * configured.
	 */
@Test
public void testCancelWithSavepointNoDirectoriesConfigured() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);
    Configuration config = new Configuration();
    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());
        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);
        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);
        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);
        taskManager = new AkkaActorGateway(taskManagerRef, null);
        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);
        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);
        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);
        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(Collections.singletonList(sourceVertex.getID()), Collections.singletonList(sourceVertex.getID()), Collections.singletonList(sourceVertex.getID()), 3600000, 3600000, 0, Integer.MAX_VALUE, ExternalizedCheckpointSettings.none(), null, true);
        jobGraph.setSnapshotSettings(snapshottingSettings);
        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);
        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);
        // Cancel with savepoint
        msg = new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null);
        CancellationResponse cancelResp = (CancellationResponse) Await.result(jobManager.ask(msg, timeout), timeout);
        if (cancelResp instanceof CancellationFailure) {
            CancellationFailure failure = (CancellationFailure) cancelResp;
            assertTrue(failure.cause() instanceof IllegalStateException);
            assertTrue(failure.cause().getMessage().contains("savepoint directory"));
        } else {
            fail("Unexpected cancellation response from JobManager: " + cancelResp);
        }
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) JobSnapshottingSettings(org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) FiniteDuration(scala.concurrent.duration.FiniteDuration) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) CancellationFailure(org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure) WaitForAllVerticesToBeRunning(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning) CancellationResponse(org.apache.flink.runtime.messages.JobManagerMessages.CancellationResponse) Test(org.junit.Test)

Aggregations

ActorGateway (org.apache.flink.runtime.instance.ActorGateway)3 CancellationFailure (org.apache.flink.runtime.messages.JobManagerMessages.CancellationFailure)3 ActorRef (akka.actor.ActorRef)2 ActorSystem (akka.actor.ActorSystem)2 Configuration (org.apache.flink.configuration.Configuration)2 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)2 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)2 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)2 JobSnapshottingSettings (org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings)2 StandaloneLeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService)2 JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages)2 CancellationResponse (org.apache.flink.runtime.messages.JobManagerMessages.CancellationResponse)2 SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob)2 TestingJobManagerMessages (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages)2 WaitForAllVerticesToBeRunning (org.apache.flink.runtime.testingUtils.TestingJobManagerMessages.WaitForAllVerticesToBeRunning)2 Test (org.junit.Test)2 FiniteDuration (scala.concurrent.duration.FiniteDuration)2 File (java.io.File)1 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1