
Example 56 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From the class JobMasterStopWithSavepointITCase, method testRestartCheckpointCoordinatorIfStopWithSavepointFails.

@Test
public void testRestartCheckpointCoordinatorIfStopWithSavepointFails() throws Exception {
    setUpJobGraph(CheckpointCountingTask.class, RestartStrategies.noRestart());
    try {
        Files.setPosixFilePermissions(savepointDirectory, Collections.emptySet());
    } catch (IOException e) {
        Assume.assumeNoException(e);
    }
    try {
        stopWithSavepoint(true).get();
        fail();
    } catch (Exception e) {
        Optional<CheckpointException> checkpointExceptionOptional = ExceptionUtils.findThrowable(e, CheckpointException.class);
        if (!checkpointExceptionOptional.isPresent()) {
            throw e;
        }
        String exceptionMessage = checkpointExceptionOptional.get().getMessage();
        assertTrue("Stop with savepoint failed because of another cause " + exceptionMessage, exceptionMessage.contains(CheckpointFailureReason.IO_EXCEPTION.message()));
    }
    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get(60, TimeUnit.SECONDS);
    assertThat(jobStatus, equalTo(JobStatus.RUNNING));
    // assert that checkpoints continue to be triggered
    checkpointsToWaitFor = new CountDownLatch(1);
    assertTrue(checkpointsToWaitFor.await(60L, TimeUnit.SECONDS));
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), Optional (java.util.Optional), CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException), IOException (java.io.IOException), CountDownLatch (java.util.concurrent.CountDownLatch), ExecutionException (java.util.concurrent.ExecutionException), Test (org.junit.Test)
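
The recurring idiom in this and the following examples is a bounded wait on ClusterClient#getJobStatus followed by an assertion. As a minimal standalone sketch of that idiom (the class and method names below are ours, not Flink's), assuming a flink-clients dependency on the classpath:

import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.client.program.ClusterClient;

final class JobStatusAssertions {

    /** Fails if the job does not report the expected status within the timeout. */
    static void assertJobStatus(ClusterClient<?> client, JobID jobId, JobStatus expected)
            throws Exception {
        JobStatus actual = client.getJobStatus(jobId).get(60, TimeUnit.SECONDS);
        if (actual != expected) {
            throw new AssertionError("Expected job status " + expected + " but was " + actual);
        }
    }
}

With such a helper, the RUNNING check above reduces to assertJobStatus(clusterClient, jobGraph.getJobID(), JobStatus.RUNNING).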

Example 57 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From the class TestJobExecutor, method assertFinishedSuccessfully.

public TestJobExecutor assertFinishedSuccessfully() throws Exception {
    LOG.debug("assertFinishedSuccessfully");
    JobStatus jobStatus = miniClusterResource.getClusterClient().getJobStatus(jobID).get();
    if (!jobStatus.equals(FINISHED)) {
        String message = String.format("Job didn't finish successfully, status: %s", jobStatus);
        Optional<SerializedThrowable> throwable = miniClusterResource.getClusterClient().requestJobResult(jobID).get().getSerializedThrowable();
        if (throwable.isPresent()) {
            throw new AssertionError(message, throwable.get());
        } else {
            fail(message);
        }
    }
    return this;
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), SerializedThrowable (org.apache.flink.util.SerializedThrowable)
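
The FINISHED check and its error reporting can be packaged into a reusable assertion. A hedged sketch (the helper class and method names are ours; it only relies on the ClusterClient calls already used above):

import java.util.Optional;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.client.program.ClusterClient;
import org.apache.flink.util.SerializedThrowable;

final class FinishedJobAssertions {

    /** Throws an AssertionError carrying the job's failure cause if one was reported. */
    static void assertJobFinished(ClusterClient<?> client, JobID jobId) throws Exception {
        JobStatus status = client.getJobStatus(jobId).get();
        if (status == JobStatus.FINISHED) {
            return;
        }
        String message = String.format("Job didn't finish successfully, status: %s", status);
        Optional<SerializedThrowable> cause =
                client.requestJobResult(jobId).get().getSerializedThrowable();
        throw cause.map(t -> new AssertionError(message, t))
                .orElseGet(() -> new AssertionError(message));
    }
}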

Example 58 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From the class JobMasterTriggerSavepointITCase, method testStopJobAfterSavepoint.

@Test
public void testStopJobAfterSavepoint() throws Exception {
    setUpWithCheckpointInterval(10L);
    final String savepointLocation = cancelWithSavepoint();
    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get();
    assertThat(jobStatus, isOneOf(JobStatus.CANCELED, JobStatus.CANCELLING));
    final List<Path> savepoints;
    try (Stream<Path> savepointFiles = Files.list(savepointDirectory)) {
        savepoints = savepointFiles.map(Path::getFileName).collect(Collectors.toList());
    }
    assertThat(savepoints, hasItem(Paths.get(savepointLocation).getFileName()));
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), Path (java.nio.file.Path), Test (org.junit.Test)
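
The test accepts both CANCELED and CANCELLING because cancellation may still be in flight when the status is read; only CANCELED is a globally terminal state. A small standalone snippet illustrating the distinction the JobStatus enum makes:

import org.apache.flink.api.common.JobStatus;

public class JobStatusTerminalityDemo {

    public static void main(String[] args) {
        // CANCELLING is transitional (prints false); CANCELED is globally terminal (prints true).
        for (JobStatus status : new JobStatus[] {JobStatus.CANCELLING, JobStatus.CANCELED}) {
            System.out.println(status + " globally terminal: " + status.isGloballyTerminalState());
        }
    }
}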

Example 59 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From the class JobMasterTriggerSavepointITCase, method waitForJob.

private void waitForJob() throws Exception {
    for (int i = 0; i < 60; i++) {
        try {
            final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get(60, TimeUnit.SECONDS);
            assertThat(jobStatus.isGloballyTerminalState(), equalTo(false));
            if (jobStatus == JobStatus.RUNNING) {
                return;
            }
        } catch (ExecutionException ignored) {
        // JobManagerRunner is not yet registered in Dispatcher
        }
        Thread.sleep(1000);
    }
    throw new AssertionError("Job did not become running within timeout.");
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), ExecutionException (java.util.concurrent.ExecutionException)
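
waitForJob is a blocking poll with a fixed iteration count and sleep. A slightly more general sketch of the same idea, driven by a Deadline instead of a loop counter (the class and method names are ours, not Flink's):

import java.time.Duration;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.api.common.time.Deadline;
import org.apache.flink.client.program.ClusterClient;

final class JobStatusPolling {

    /** Blocks until the job reports the target status or the timeout elapses. */
    static void waitForStatus(
            ClusterClient<?> client, JobID jobId, JobStatus target, Duration timeout)
            throws Exception {
        Deadline deadline = Deadline.fromNow(timeout);
        while (deadline.hasTimeLeft()) {
            try {
                if (client.getJobStatus(jobId).get(10, TimeUnit.SECONDS) == target) {
                    return;
                }
            } catch (ExecutionException ignored) {
                // the job may not be registered with the Dispatcher yet
            }
            Thread.sleep(100);
        }
        throw new AssertionError("Job did not reach status " + target + " within " + timeout);
    }
}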

Example 60 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From the class AbstractOperatorRestoreTestBase, method migrateJob.

private String migrateJob(ClusterClient<?> clusterClient, Deadline deadline) throws Throwable {
    URL savepointResource = AbstractOperatorRestoreTestBase.class.getClassLoader().getResource("operatorstate/" + getMigrationSavepointName());
    if (savepointResource == null) {
        throw new IllegalArgumentException("Savepoint file does not exist.");
    }
    JobGraph jobToMigrate = createJobGraph(ExecutionMode.MIGRATE);
    jobToMigrate.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointResource.getFile()));
    assertNotNull(jobToMigrate.getJobID());
    clusterClient.submitJob(jobToMigrate).get();
    CompletableFuture<JobStatus> jobRunningFuture = FutureUtils.retrySuccessfulWithDelay(() -> clusterClient.getJobStatus(jobToMigrate.getJobID()), Time.milliseconds(50), deadline, (jobStatus) -> jobStatus == JobStatus.RUNNING, TestingUtils.defaultScheduledExecutor());
    assertEquals(JobStatus.RUNNING, jobRunningFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
    // Trigger savepoint
    File targetDirectory = tmpFolder.newFolder();
    String savepointPath = null;
    // retry cancel-with-savepoint while tolerated exceptions occur (FLINK-4714)
    while (deadline.hasTimeLeft() && savepointPath == null) {
        try {
            savepointPath = clusterClient.cancelWithSavepoint(jobToMigrate.getJobID(), targetDirectory.getAbsolutePath(), SavepointFormatType.CANONICAL).get();
        } catch (Exception e) {
            String exceptionString = ExceptionUtils.stringifyException(e);
            if (!PATTERN_CANCEL_WITH_SAVEPOINT_TOLERATED_EXCEPTIONS.matcher(exceptionString).find()) {
                throw e;
            }
        }
    }
    assertNotNull("Could not take savepoint.", savepointPath);
    CompletableFuture<JobStatus> jobCanceledFuture = FutureUtils.retrySuccessfulWithDelay(() -> clusterClient.getJobStatus(jobToMigrate.getJobID()), Time.milliseconds(50), deadline, (jobStatus) -> jobStatus == JobStatus.CANCELED, TestingUtils.defaultScheduledExecutor());
    assertEquals(JobStatus.CANCELED, jobCanceledFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
    return savepointPath;
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), File (java.io.File), URL (java.net.URL)
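
For a non-blocking variant of the same wait, migrateJob relies on FutureUtils.retrySuccessfulWithDelay, which re-requests the status until the predicate accepts it or the deadline expires. Extracted into a helper, as a sketch that simply reuses the calls and utilities shown above (FutureUtils, Time, TestingUtils); the method name is ours:

// Wait for a target JobStatus without blocking the calling thread.
private static CompletableFuture<JobStatus> waitForStatusAsync(
        ClusterClient<?> clusterClient, JobID jobId, JobStatus expected, Deadline deadline) {
    return FutureUtils.retrySuccessfulWithDelay(
            // re-query the current status on every retry
            () -> clusterClient.getJobStatus(jobId),
            Time.milliseconds(50),
            deadline,
            status -> status == expected,
            TestingUtils.defaultScheduledExecutor());
}

Compared with the sleep-based loop in the previous example, the future-based retry composes with other asynchronous assertions and respects a shared deadline.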

Aggregations

JobStatus (org.apache.flink.api.common.JobStatus): 62
Test (org.junit.Test): 28
JobID (org.apache.flink.api.common.JobID): 19
CompletableFuture (java.util.concurrent.CompletableFuture): 15
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 14
FlinkException (org.apache.flink.util.FlinkException): 8
ExecutionException (java.util.concurrent.ExecutionException): 7
IOException (java.io.IOException): 6
ArrayList (java.util.ArrayList): 6
Time (org.apache.flink.api.common.time.Time): 6
ExecutionGraphInfo (org.apache.flink.runtime.scheduler.ExecutionGraphInfo): 6
TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState): 6
Collections (java.util.Collections): 5
HashMap (java.util.HashMap): 5
ExecutionState (org.apache.flink.runtime.execution.ExecutionState): 5
FutureUtils (org.apache.flink.util.concurrent.FutureUtils): 5
TimeUnit (java.util.concurrent.TimeUnit): 4
Configuration (org.apache.flink.configuration.Configuration): 4
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 4
Acknowledge (org.apache.flink.runtime.messages.Acknowledge): 4