Use of org.apache.flink.api.common.JobStatus in project flink by apache.
The class JobMasterTriggerSavepointITCase, method testDoNotCancelJobIfSavepointFails.
@Test
public void testDoNotCancelJobIfSavepointFails() throws Exception {
    setUpWithCheckpointInterval(10L);

    try {
        Files.setPosixFilePermissions(savepointDirectory, Collections.emptySet());
    } catch (IOException e) {
        Assume.assumeNoException(e);
    }

    try {
        cancelWithSavepoint();
    } catch (Exception e) {
        assertThat(ExceptionUtils.findThrowable(e, CheckpointException.class).isPresent(), equalTo(true));
    }

    final JobStatus jobStatus = clusterClient.getJobStatus(jobGraph.getJobID()).get(60, TimeUnit.SECONDS);
    assertThat(jobStatus, equalTo(JobStatus.RUNNING));

    // assert that checkpoints continue to be triggered
    triggerCheckpointLatch = new CountDownLatch(1);
    assertThat(triggerCheckpointLatch.await(60L, TimeUnit.SECONDS), equalTo(true));
}
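The status check relies on ClusterClient#getJobStatus, which returns a CompletableFuture<JobStatus>. A minimal sketch of the same pattern outside the test harness (assuming client is an existing ClusterClient<?> and jobId a JobID; the polling loop itself is illustrative, not Flink API):

    // Poll the job's status until it reaches a globally terminal state.
    JobStatus status = client.getJobStatus(jobId).get(60, TimeUnit.SECONDS);
    while (!status.isGloballyTerminalState()) {
        // still RUNNING (or RESTARTING, etc.); poll again after a short pause
        Thread.sleep(100L);
        status = client.getJobStatus(jobId).get(60, TimeUnit.SECONDS);
    }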
Use of org.apache.flink.api.common.JobStatus in project flink by apache.
The class ResourceManagerTest, method testDisconnectJobManager.
private void testDisconnectJobManager(JobStatus jobStatus) throws Exception {
    final TestingJobMasterGateway jobMasterGateway =
            new TestingJobMasterGatewayBuilder().setAddress(UUID.randomUUID().toString()).build();
    rpcService.registerGateway(jobMasterGateway.getAddress(), jobMasterGateway);

    final OneShotLatch jobAdded = new OneShotLatch();
    final OneShotLatch jobRemoved = new OneShotLatch();
    final JobLeaderIdService jobLeaderIdService =
            TestingJobLeaderIdService.newBuilder()
                    .setAddJobConsumer(ignored -> jobAdded.trigger())
                    .setRemoveJobConsumer(ignored -> jobRemoved.trigger())
                    .build();
    resourceManager = new ResourceManagerBuilder().withJobLeaderIdService(jobLeaderIdService).buildAndStart();

    highAvailabilityServices.setJobMasterLeaderRetrieverFunction(
            requestedJobId ->
                    new SettableLeaderRetrievalService(
                            jobMasterGateway.getAddress(), jobMasterGateway.getFencingToken().toUUID()));

    final JobID jobId = JobID.generate();
    final ResourceManagerGateway resourceManagerGateway =
            resourceManager.getSelfGateway(ResourceManagerGateway.class);
    resourceManagerGateway.registerJobMaster(
            jobMasterGateway.getFencingToken(), ResourceID.generate(), jobMasterGateway.getAddress(), jobId, TIMEOUT);
    jobAdded.await();

    resourceManagerGateway.disconnectJobManager(jobId, jobStatus, new FlinkException("Test exception"));

    if (jobStatus.isGloballyTerminalState()) {
        jobRemoved.await();
    } else {
        // job should not get removed
        try {
            jobRemoved.await(10L, TimeUnit.MILLISECONDS);
            fail("We should not have removed the job.");
        } catch (TimeoutException expected) {
        }
    }
}
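The branch at the end hinges on JobStatus#isGloballyTerminalState: FINISHED, CANCELED and FAILED are globally terminal, while SUSPENDED is only locally terminal, which is why a SUSPENDED disconnect must leave the job registered. A quick sketch for reference (printing the values is illustrative; the enum and method are Flink's):

    // Print which JobStatus values count as globally terminal.
    for (JobStatus status : JobStatus.values()) {
        System.out.printf("%-12s globallyTerminal=%b%n", status, status.isGloballyTerminalState());
    }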
Use of org.apache.flink.api.common.JobStatus in project flink by apache.
The class DefaultSchedulerTest, method failJobIfNotEnoughResources.
@Test
public void failJobIfNotEnoughResources() throws Exception {
    final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
    testRestartBackoffTimeStrategy.setCanRestart(false);
    testExecutionSlotAllocator.disableAutoCompletePendingRequests();
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

    testExecutionSlotAllocator.timeoutPendingRequests();
    waitForTermination(scheduler);

    final JobStatus jobStatus = scheduler.requestJobStatus();
    assertThat(jobStatus, is(equalTo(JobStatus.FAILED)));

    Throwable failureCause =
            scheduler.requestJob()
                    .getArchivedExecutionGraph()
                    .getFailureInfo()
                    .getException()
                    .deserializeError(DefaultSchedulerTest.class.getClassLoader());
    assertTrue(findThrowable(failureCause, NoResourceAvailableException.class).isPresent());
    assertTrue(findThrowableWithMessage(failureCause, "Could not allocate the required slot within slot request timeout.").isPresent());
}
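The cause checks use the helpers from org.apache.flink.util.ExceptionUtils, which walk the cause chain and return an Optional. A small sketch of those helpers in isolation (assuming failureCause is any Throwable; the message fragment here is illustrative):

    Optional<NoResourceAvailableException> noResource =
            ExceptionUtils.findThrowable(failureCause, NoResourceAvailableException.class);
    Optional<Throwable> timeoutMessage =
            ExceptionUtils.findThrowableWithMessage(failureCause, "slot request timeout");
    // Both return Optional.empty() if nothing in the cause chain matches.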
Use of org.apache.flink.api.common.JobStatus in project flink by apache.
The class DefaultSchedulerTest, method jobStatusIsRestartingIfOneVertexIsWaitingForRestart.
@Test
public void jobStatusIsRestartingIfOneVertexIsWaitingForRestart() {
    final JobGraph jobGraph = singleJobVertexJobGraph(2);
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

    final Iterator<ArchivedExecutionVertex> vertexIterator =
            scheduler.requestJob().getArchivedExecutionGraph().getAllExecutionVertices().iterator();
    final ExecutionAttemptID attemptId1 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attemptId2 = vertexIterator.next().getCurrentExecutionAttempt().getAttemptId();

    scheduler.updateTaskExecutionState(
            new TaskExecutionState(attemptId1, ExecutionState.FAILED, new RuntimeException("expected")));
    final JobStatus jobStatusAfterFirstFailure = scheduler.requestJobStatus();
    scheduler.updateTaskExecutionState(
            new TaskExecutionState(attemptId2, ExecutionState.FAILED, new RuntimeException("expected")));

    taskRestartExecutor.triggerNonPeriodicScheduledTask();
    final JobStatus jobStatusWithPendingRestarts = scheduler.requestJobStatus();
    taskRestartExecutor.triggerNonPeriodicScheduledTask();
    final JobStatus jobStatusAfterRestarts = scheduler.requestJobStatus();

    assertThat(jobStatusAfterFirstFailure, equalTo(JobStatus.RESTARTING));
    assertThat(jobStatusWithPendingRestarts, equalTo(JobStatus.RESTARTING));
    assertThat(jobStatusAfterRestarts, equalTo(JobStatus.RUNNING));
}
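This test samples the status at fixed points because it drives the restart executor manually; tests without that control often need to wait for a status instead. A hypothetical polling helper (not part of Flink; SchedulerNG#requestJobStatus is the only Flink call used):

    // Hypothetical helper: block until the scheduler reports the expected JobStatus.
    static void waitForJobStatus(SchedulerNG scheduler, JobStatus expected, Duration timeout)
            throws InterruptedException, TimeoutException {
        final long deadlineNanos = System.nanoTime() + timeout.toNanos();
        while (scheduler.requestJobStatus() != expected) {
            if (System.nanoTime() >= deadlineNanos) {
                throw new TimeoutException("Job never reached " + expected);
            }
            Thread.sleep(10L);
        }
    }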
Use of org.apache.flink.api.common.JobStatus in project flink by apache.
The class DefaultSchedulerTest, method doTestCheckpointCleanerIsClosedAfterCheckpointServices.
/**
 * Visible for re-use in {@link
 * org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest}.
 */
public static void doTestCheckpointCleanerIsClosedAfterCheckpointServices(
        BiFunction<CheckpointRecoveryFactory, CheckpointsCleaner, SchedulerNG> schedulerFactory,
        ScheduledExecutorService executorService) throws Exception {
    final CountDownLatch checkpointServicesShutdownBlocked = new CountDownLatch(1);
    final CountDownLatch cleanerClosed = new CountDownLatch(1);

    final CompletedCheckpointStore completedCheckpointStore =
            new StandaloneCompletedCheckpointStore(1) {
                @Override
                public void shutdown(JobStatus jobStatus, CheckpointsCleaner checkpointsCleaner) throws Exception {
                    checkpointServicesShutdownBlocked.await();
                    super.shutdown(jobStatus, checkpointsCleaner);
                }
            };
    final CheckpointIDCounter checkpointIDCounter =
            new StandaloneCheckpointIDCounter() {
                @Override
                public void shutdown(JobStatus jobStatus) throws Exception {
                    checkpointServicesShutdownBlocked.await();
                    super.shutdown(jobStatus);
                }
            };
    final CheckpointsCleaner checkpointsCleaner =
            new CheckpointsCleaner() {
                @Override
                public synchronized CompletableFuture<Void> closeAsync() {
                    cleanerClosed.countDown();
                    return super.closeAsync();
                }
            };

    final SchedulerNG scheduler =
            schedulerFactory.apply(
                    new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter),
                    checkpointsCleaner);

    final CompletableFuture<Void> schedulerClosed = new CompletableFuture<>();
    final CountDownLatch schedulerClosing = new CountDownLatch(1);
    executorService.submit(() -> {
        scheduler.closeAsync().thenRun(() -> schedulerClosed.complete(null));
        schedulerClosing.countDown();
    });

    // Wait for scheduler to start closing.
    schedulerClosing.await();
    assertFalse(
            "CheckpointCleaner should not close before checkpoint services.",
            cleanerClosed.await(10, TimeUnit.MILLISECONDS));

    checkpointServicesShutdownBlocked.countDown();
    cleanerClosed.await();
    schedulerClosed.get();
}
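The blocking latches enforce the ordering under test: the checkpoint services must be shut down with the job's terminal JobStatus before the CheckpointsCleaner is allowed to close. A simplified sketch of that ordering, using the variables from the test above (an assumption about the shutdown path, not a verbatim excerpt from Flink's shutdown code):

    // Assumed shutdown ordering, simplified for illustration.
    JobStatus terminalStatus = JobStatus.FINISHED; // or FAILED / CANCELED
    completedCheckpointStore.shutdown(terminalStatus, checkpointsCleaner); // decides retain vs. discard
    checkpointIDCounter.shutdown(terminalStatus);
    checkpointsCleaner.closeAsync().join(); // cleaner closes only after the services above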