
Example 16 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From class JobMasterTriggerSavepointITCase, method testDoNotCancelJobIfSavepointFails:

@Test
public void testDoNotCancelJobIfSavepointFails() throws Exception {
    setUpWithCheckpointInterval(10L);
    try {
        Files.setPosixFilePermissions(savepointDirectory, Collections.emptySet());
    } catch (IOException e) {
        Assume.assumeNoException(e);
    }
    try {
        cancelWithSavepoint();
    } catch (Exception e) {
        assertThat(ExceptionUtils.findThrowable(e, CheckpointException.class).isPresent(), equalTo(true));
    }
    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get(60, TimeUnit.SECONDS);
    assertThat(jobStatus, equalTo(JobStatus.RUNNING));
    // assert that checkpoints continue to be triggered
    triggerCheckpointLatch = new CountDownLatch(1);
    assertThat(triggerCheckpointLatch.await(60L, TimeUnit.SECONDS), equalTo(true));
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) IOException(java.io.IOException) CountDownLatch(java.util.concurrent.CountDownLatch) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)
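
The assertion above fetches the status once with a bounded get and checks it against JobStatus.RUNNING. When a test instead needs to wait until a job reaches a particular state, the same ClusterClient#getJobStatus call can be wrapped in a small polling helper. The sketch below is illustrative only: JobStatusPolling and awaitStatus are made-up names, and a ClusterClient and JobID are assumed to be available just as in the test above.

import java.time.Duration;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;

import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.JobStatus;
import org.apache.flink.client.program.ClusterClient;

/** Hypothetical test helper: block until the job reports the expected JobStatus or give up. */
public final class JobStatusPolling {

    private JobStatusPolling() {}

    public static void awaitStatus(
            ClusterClient<?> client, JobID jobId, JobStatus expected, Duration timeout)
            throws Exception {
        final long deadline = System.nanoTime() + timeout.toNanos();
        JobStatus current;
        do {
            // getJobStatus returns a CompletableFuture<JobStatus>; bound each poll individually.
            current = client.getJobStatus(jobId).get(10, TimeUnit.SECONDS);
            if (current == expected) {
                return;
            }
            Thread.sleep(50L);
        } while (System.nanoTime() < deadline);
        throw new TimeoutException(
                "Job " + jobId + " is in state " + current + ", expected " + expected);
    }
}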

Example 17 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From class ResourceManagerTest, method testDisconnectJobManager:

private void testDisconnectJobManager(JobStatus jobStatus) throws Exception {
    final TestingJobMasterGateway jobMasterGateway = new TestingJobMasterGatewayBuilder().setAddress(UUID.randomUUID().toString()).build();
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);
    final OneShotLatch jobAdded = new OneShotLatch();
    final OneShotLatch jobRemoved = new OneShotLatch();
    final JobLeaderIdService jobLeaderIdService = TestingJobLeaderIdService.newBuilder().setAddJobConsumer(ignored -> jobAdded.trigger()).setRemoveJobConsumer(ignored -> jobRemoved.trigger()).build();
    resourceManager = new ResourceManagerBuilder().withJobLeaderIdService(jobLeaderIdService).buildAndStart();
    highAvailabilityServices.setJobMasterLeaderRetrieverFunction(requestedJobId -> new SettableLeaderRetrievalService(jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID()));
    final JobID jobId = JobID.generate();
    final ResourceManagerGateway resourceManagerGateway = resourceManager.getSelfGateway(ResourceManagerGateway.class);
    resourceManagerGateway.registerJobMaster(jobMasterGateway.getFencingToken(), ResourceID.generate(), jobMasterGateway.getAddress(), jobId, TIMEOUT);
    jobAdded.await();
    resourceManagerGateway.disconnectJobManager(jobId, jobStatus, new FlinkException("Test exception"));
    if (jobStatus.isGloballyTerminalState()) {
        jobRemoved.await();
    } else {
        // job should not get removed
        try {
            jobRemoved.await(10L, TimeUnit.MILLISECONDS);
            fail("We should not have removed the job.");
        } catch (TimeoutException expected) {
        }
    }
}
Also used : RegistrationResponse(org.apache.flink.runtime.registration.RegistrationResponse) TestingRpcService(org.apache.flink.runtime.rpc.TestingRpcService) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TimeoutException(java.util.concurrent.TimeoutException) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway) SettableLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.SettableLeaderRetrievalService) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) After(org.junit.After) Matchers.nullValue(org.hamcrest.Matchers.nullValue) TestLogger(org.apache.flink.util.TestLogger) TestingJobMasterGatewayBuilder(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGatewayBuilder) Assert.fail(org.junit.Assert.fail) AfterClass(org.junit.AfterClass) UUID(java.util.UUID) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) Matchers.instanceOf(org.hamcrest.Matchers.instanceOf) TestingUtils(org.apache.flink.testutils.TestingUtils) Matchers.equalTo(org.hamcrest.Matchers.equalTo) Matchers.is(org.hamcrest.Matchers.is) Matchers.anyOf(org.hamcrest.Matchers.anyOf) Time(org.apache.flink.api.common.time.Time) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) FlinkException(org.apache.flink.util.FlinkException) BeforeClass(org.junit.BeforeClass) TaskExecutorMemoryConfiguration(org.apache.flink.runtime.taskexecutor.TaskExecutorMemoryConfiguration) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) Function(java.util.function.Function) TestingJobMasterGateway(org.apache.flink.runtime.jobmaster.utils.TestingJobMasterGateway) DeclarativeSlotManagerBuilder(org.apache.flink.runtime.resourcemanager.slotmanager.DeclarativeSlotManagerBuilder) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) ResourceManagerException(org.apache.flink.runtime.resourcemanager.exceptions.ResourceManagerException) NoOpResourceManagerPartitionTracker(org.apache.flink.runtime.io.network.partition.NoOpResourceManagerPartitionTracker) SlotManager(org.apache.flink.runtime.resourcemanager.slotmanager.SlotManager) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResourceRequirements(org.apache.flink.runtime.slots.ResourceRequirements) ThrowingConsumer(org.apache.flink.util.function.ThrowingConsumer) Before(org.junit.Before) Matchers.empty(org.hamcrest.Matchers.empty) TestingLeaderElectionService(org.apache.flink.runtime.leaderelection.TestingLeaderElectionService) HardwareDescription(org.apache.flink.runtime.instance.HardwareDescription) TaskManagerInfo(org.apache.flink.runtime.rest.messages.taskmanager.TaskManagerInfo) Test(org.junit.Test) TaskExecutorThreadInfoGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorThreadInfoGateway) RpcUtils(org.apache.flink.runtime.rpc.RpcUtils) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) JobID(org.apache.flink.api.common.JobID) UnregisteredMetricGroups(org.apache.flink.runtime.metrics.groups.UnregisteredMetricGroups) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) TestingSlotManagerBuilder(org.apache.flink.runtime.resourcemanager.slotmanager.TestingSlotManagerBuilder) TestingHighAvailabilityServices(org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) RecipientUnreachableException(org.apache.flink.runtime.rpc.exceptions.RecipientUnreachableException)
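
The branch on jobStatus.isGloballyTerminalState() is the heart of this test: the ResourceManager only removes the job when the reported status means no JobMaster can recover it anymore. To see which enum constants fall into that category, the flags can simply be printed, as in the standalone sketch below (not part of the test; the class name is made up).

import org.apache.flink.api.common.JobStatus;

/** Standalone illustration of the terminality flags used by the test above. */
public class JobStatusTerminality {

    public static void main(String[] args) {
        for (JobStatus status : JobStatus.values()) {
            // isGloballyTerminalState() is true for FINISHED, CANCELED and FAILED.
            // SUSPENDED is terminal only locally, so the job may still be recovered elsewhere,
            // which is exactly why the ResourceManager keeps it registered in that case.
            System.out.printf(
                    "%-12s terminal=%-5s globallyTerminal=%s%n",
                    status, status.isTerminalState(), status.isGloballyTerminalState());
        }
    }
}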

Example 18 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From class DefaultSchedulerTest, method failJobIfNotEnoughResources:

@Test
public void failJobIfNotEnoughResources() throws Exception {
    final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
    testRestartBackoffTimeStrategy.setCanRestart(false);
    testExecutionSlotAllocator.disableAutoCompletePendingRequests();
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);
    testExecutionSlotAllocator.timeoutPendingRequests();
    waitForTermination(scheduler);
    final JobStatus jobStatus = scheduler.requestJobStatus();
    assertThat(jobStatus, is(equalTo(JobStatus.FAILED)));
    Throwable failureCause = scheduler.requestJob().getArchivedExecutionGraph().getFailureInfo().getException().deserializeError(DefaultSchedulerTest.class.getClassLoader());
    assertTrue(findThrowable(failureCause, NoResourceAvailableException.class).isPresent());
    assertTrue(findThrowableWithMessage(failureCause, "Could not allocate the required slot within slot request timeout.").isPresent());
    assertThat(jobStatus, is(equalTo(JobStatus.FAILED)));
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ExceptionUtils.findThrowable(org.apache.flink.util.ExceptionUtils.findThrowable) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) Test(org.junit.Test)
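
The failure checks rely on two ExceptionUtils lookups that walk the cause chain of the deserialized error: findThrowable matches by type and findThrowableWithMessage by message text. The following self-contained sketch reproduces that inspection on a simulated failure chain; the chain itself is made up for illustration and does not come from a real scheduler run.

import java.util.Optional;

import org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException;
import org.apache.flink.util.ExceptionUtils;
import org.apache.flink.util.FlinkException;

/** Illustration of how the test inspects a (possibly deeply nested) failure cause. */
public class FailureCauseInspection {

    public static void main(String[] args) {
        // Simulated failure chain, standing in for the error deserialized from the
        // archived execution graph in the test above.
        Throwable failureCause =
                new FlinkException(
                        "Job failed",
                        new NoResourceAvailableException(
                                "Could not allocate the required slot within slot request timeout."));

        // findThrowable walks the cause chain and returns the first throwable of the given type.
        Optional<NoResourceAvailableException> byType =
                ExceptionUtils.findThrowable(failureCause, NoResourceAvailableException.class);

        // findThrowableWithMessage matches on the message text instead of the type.
        Optional<Throwable> byMessage =
                ExceptionUtils.findThrowableWithMessage(failureCause, "required slot");

        System.out.println("found by type: " + byType.isPresent());
        System.out.println("found by message: " + byMessage.isPresent());
    }
}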

Example 19 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From class DefaultSchedulerTest, method jobStatusIsRestartingIfOneVertexIsWaitingForRestart:

@Test
public void jobStatusIsRestartingIfOneVertexIsWaitingForRestart() {
    final JobGraph jobGraph = singleJobVertexJobGraph(2);
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);
    final Iterator<ArchivedExecutionVertex> vertexIterator = scheduler.requestJob().getArchivedExecutionGraph().getAllExecutionVertices().iterator();
    final ExecutionAttemptID attemptId1 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attemptId2 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
    scheduler.updateTaskExecutionState(new TaskExecutionState(attemptId1, ExecutionState.FAILED, new RuntimeException("expected")));
    final JobStatus jobStatusAfterFirstFailure = scheduler.requestJobStatus();
    scheduler.updateTaskExecutionState(new TaskExecutionState(attemptId2, ExecutionState.FAILED, new RuntimeException("expected")));
    taskRestartExecutor.triggerNonPeriodicScheduledTask();
    final JobStatus jobStatusWithPendingRestarts = scheduler.requestJobStatus();
    taskRestartExecutor.triggerNonPeriodicScheduledTask();
    final JobStatus jobStatusAfterRestarts = scheduler.requestJobStatus();
    assertThat(jobStatusAfterFirstFailure, equalTo(JobStatus.RESTARTING));
    assertThat(jobStatusWithPendingRestarts, equalTo(JobStatus.RESTARTING));
    assertThat(jobStatusAfterRestarts, equalTo(JobStatus.RUNNING));
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ArchivedExecutionVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) Test(org.junit.Test)
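
The three assertions trace the scheduler's transitions: the status stays RESTARTING while any failed vertex is still waiting for its restart to be executed, and only flips back to RUNNING once all pending restarts have been triggered. A hedged sketch of how calling code might map those states to human-readable descriptions is shown below; JobStatusClassifier and describe are hypothetical names, not Flink API.

import org.apache.flink.api.common.JobStatus;

/** Hypothetical classifier mirroring the transitions asserted in the test above. */
public class JobStatusClassifier {

    /** Returns a coarse human-readable description, e.g. for monitoring or log output. */
    public static String describe(JobStatus status) {
        switch (status) {
            case RESTARTING:
                // At least one vertex failed and its restart has not been executed yet,
                // which is exactly the window the test probes between the two trigger calls.
                return "recovering from a task failure";
            case RUNNING:
                return "all vertices deployed or deploying";
            case FINISHED:
            case CANCELED:
            case FAILED:
                return "globally terminal: " + status;
            default:
                return "transitional state: " + status;
        }
    }

    public static void main(String[] args) {
        for (JobStatus status : JobStatus.values()) {
            System.out.println(status + " -> " + describe(status));
        }
    }
}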

Example 20 with JobStatus

Use of org.apache.flink.api.common.JobStatus in project flink by apache.

From class DefaultSchedulerTest, method doTestCheckpointCleanerIsClosedAfterCheckpointServices:

/**
 * Visible for re-use in {@link
 * org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest}.
 */
public static void doTestCheckpointCleanerIsClosedAfterCheckpointServices(BiFunction<CheckpointRecoveryFactory, CheckpointsCleaner, SchedulerNG> schedulerFactory, ScheduledExecutorService executorService) throws Exception {
    final CountDownLatch checkpointServicesShutdownBlocked = new CountDownLatch(1);
    final CountDownLatch cleanerClosed = new CountDownLatch(1);
    final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1) {

        @Override
        public void shutdown(JobStatus jobStatus, CheckpointsCleaner checkpointsCleaner) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus, checkpointsCleaner);
        }
    };
    final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter() {

        @Override
        public void shutdown(JobStatus jobStatus) throws Exception {
            checkpointServicesShutdownBlocked.await();
            super.shutdown(jobStatus);
        }
    };
    final CheckpointsCleaner checkpointsCleaner = new CheckpointsCleaner() {

        @Override
        public synchronized CompletableFuture<Void> closeAsync() {
            cleanerClosed.countDown();
            return super.closeAsync();
        }
    };
    final SchedulerNG scheduler = schedulerFactory.apply(new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter), checkpointsCleaner);
    final CompletableFuture<Void> schedulerClosed = new CompletableFuture<>();
    final CountDownLatch schedulerClosing = new CountDownLatch(1);
    executorService.submit(() -> {
        scheduler.closeAsync().thenRun(() -> schedulerClosed.complete(null));
        schedulerClosing.countDown();
    });
    // Wait for scheduler to start closing.
    schedulerClosing.await();
    assertFalse("CheckpointCleaner should not close before checkpoint services.", cleanerClosed.await(10, TimeUnit.MILLISECONDS));
    checkpointServicesShutdownBlocked.countDown();
    cleanerClosed.await();
    schedulerClosed.get();
}
Also used : JobStatus(org.apache.flink.api.common.JobStatus) CompletableFuture(java.util.concurrent.CompletableFuture) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) CountDownLatch(java.util.concurrent.CountDownLatch) TestingCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore)
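
The JobStatus passed to shutdown(...) is what lets the checkpoint services decide whether their state may be discarded. As a rough convention (hedged here, since the real CompletedCheckpointStore implementations differ in detail), checkpoint references are dropped only for globally terminal statuses and retained otherwise so a recovered job can resume from them. The sketch below is a hypothetical in-memory registry illustrating that branch; it is not Flink's CompletedCheckpointStore API.

import java.util.ArrayDeque;
import java.util.Deque;

import org.apache.flink.api.common.JobStatus;

/** Hypothetical in-memory registry sketching the shutdown convention described above. */
public class InMemoryCheckpointRegistry {

    private final Deque<Long> retainedCheckpointIds = new ArrayDeque<>();

    public void add(long checkpointId) {
        retainedCheckpointIds.addLast(checkpointId);
    }

    public void shutdown(JobStatus jobStatus) {
        if (jobStatus.isGloballyTerminalState()) {
            // FINISHED / CANCELED / FAILED: no instance will ever recover this job again,
            // so the retained checkpoint references can be dropped.
            retainedCheckpointIds.clear();
        }
        // For a non-globally-terminal status such as SUSPENDED the entries stay,
        // so that a new leader can resume the job from them.
    }

    public int size() {
        return retainedCheckpointIds.size();
    }
}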

Aggregations

JobStatus (org.apache.flink.api.common.JobStatus): 62 usages
Test (org.junit.Test): 28 usages
JobID (org.apache.flink.api.common.JobID): 19 usages
CompletableFuture (java.util.concurrent.CompletableFuture): 15 usages
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 14 usages
FlinkException (org.apache.flink.util.FlinkException): 8 usages
ExecutionException (java.util.concurrent.ExecutionException): 7 usages
IOException (java.io.IOException): 6 usages
ArrayList (java.util.ArrayList): 6 usages
Time (org.apache.flink.api.common.time.Time): 6 usages
ExecutionGraphInfo (org.apache.flink.runtime.scheduler.ExecutionGraphInfo): 6 usages
TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState): 6 usages
Collections (java.util.Collections): 5 usages
HashMap (java.util.HashMap): 5 usages
ExecutionState (org.apache.flink.runtime.execution.ExecutionState): 5 usages
FutureUtils (org.apache.flink.util.concurrent.FutureUtils): 5 usages
TimeUnit (java.util.concurrent.TimeUnit): 4 usages
Configuration (org.apache.flink.configuration.Configuration): 4 usages
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 4 usages
Acknowledge (org.apache.flink.runtime.messages.Acknowledge): 4 usages