Use of org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex in the Apache Flink project.
From the class DefaultSchedulerTest, method vertexIsNotAffectedByOutdatedDeployment.
/**
 * Checks that a vertex which has already been failed and rescheduled stays in
 * {@link ExecutionState#SCHEDULED} when its original, now outdated, deployment is processed.
 */
@Test
public void vertexIsNotAffectedByOutdatedDeployment() {
    final JobGraph jobGraph = singleJobVertexJobGraph(2);
    // slot requests must stay pending so that the initial deployment does not finish by itself
    testExecutionSlotAllocator.disableAutoCompletePendingRequests();
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);

    final Iterator<ArchivedExecutionVertex> archivedVertices =
            scheduler
                    .requestJob()
                    .getArchivedExecutionGraph()
                    .getAllExecutionVertices()
                    .iterator();
    final ArchivedExecutionVertex firstVertex = archivedVertices.next();
    final ArchivedExecutionVertex secondVertex = archivedVertices.next();
    final SchedulingExecutionVertex firstSchedulingVertex =
            scheduler.getSchedulingTopology().getVertices().iterator().next();

    // Step 1: fail the first vertex and run its restart so that it is SCHEDULED again;
    // from here on its initial deployment is outdated.
    scheduler.updateTaskExecutionState(
            createFailedTaskExecutionState(
                    firstVertex.getCurrentExecutionAttempt().getAttemptId()));
    taskRestartExecutor.triggerScheduledTasks();

    // Step 2: fail the second vertex; this completes all pending slot requests of the
    // initial deployments and thereby triggers the outdated deployment of the first vertex.
    scheduler.updateTaskExecutionState(
            createFailedTaskExecutionState(
                    secondVertex.getCurrentExecutionAttempt().getAttemptId()));

    // The first vertex must be untouched by the outdated deployment.
    assertThat(firstSchedulingVertex.getState(), is(equalTo(ExecutionState.SCHEDULED)));
}
Use of org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex in the Apache Flink project.
From the class DefaultSchedulerTest, method failGlobalWhenRestoringStateFails.
/**
 * Checks the scheduler's reaction to a failing state restore: the first task failover is
 * expected to be skipped, and a subsequent global failure is expected to redeploy the vertex
 * once restoring no longer fails.
 *
 * @throws Exception if waiting for the checkpoint trigger or any scheduler interaction fails
 */
@Test
public void failGlobalWhenRestoringStateFails() throws Exception {
    // --- job setup -----------------------------------------------------------------------
    final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
    final JobVertex soleJobVertex = getOnlyJobVertex(jobGraph);
    enableCheckpointing(jobGraph);
    final CountDownLatch triggerLatch = getCheckpointTriggeredLatch();

    // --- start the job and bring its single task to RUNNING ------------------------------
    final DefaultScheduler scheduler = createSchedulerAndStartScheduling(jobGraph);
    final ArchivedExecutionVertex soleExecutionVertex =
            Iterables.getOnlyElement(
                    scheduler
                            .requestJob()
                            .getArchivedExecutionGraph()
                            .getAllExecutionVertices());
    final ExecutionAttemptID initialAttemptId =
            soleExecutionVertex.getCurrentExecutionAttempt().getAttemptId();
    transitionToRunning(scheduler, initialAttemptId);

    final CheckpointCoordinator coordinator = getCheckpointCoordinator(scheduler);

    // --- install a master hook that makes state restore fail ------------------------------
    final TestMasterHook failingHook = TestMasterHook.fromId("testHook");
    failingHook.enableFailOnRestore();
    coordinator.addMasterHook(failingHook);

    // --- complete one checkpoint so that there is state to restore ------------------------
    coordinator.triggerCheckpoint(false);
    triggerLatch.await();
    final long pendingCheckpointId =
            coordinator.getPendingCheckpoints().keySet().iterator().next();
    acknowledgePendingCheckpoint(scheduler, pendingCheckpointId);

    // --- fail the task and run the scheduled restart --------------------------------------
    scheduler.updateTaskExecutionState(createFailedTaskExecutionState(initialAttemptId));
    taskRestartExecutor.triggerScheduledTasks();

    final ExecutionVertexID expectedVertexId =
            new ExecutionVertexID(soleJobVertex.getID(), 0);
    // fetched once; the very same list instance is asserted again further down
    final List<ExecutionVertexID> deployedVertices =
            testExecutionVertexOperations.getDeployedVertices();

    // the first task failover should be skipped because restoring state failed,
    // so only the initial deployment is recorded
    assertThat(deployedVertices, contains(expectedVertexId));

    // the restore failure should have triggered a global failure; with the hook no longer
    // failing, running the scheduled restart must deploy the vertex a second time
    failingHook.disableFailOnRestore();
    taskRestartExecutor.triggerScheduledTasks();
    assertThat(deployedVertices, contains(expectedVertexId, expectedVertexId));
}
Use of org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex in the Apache Flink project.
From the class DefaultSchedulerTest, method testExceptionHistoryWithPreDeployFailure.
/**
 * Checks that a failure occurring before deployment (a timed-out slot request) is recorded in
 * the scheduler's exception history, with the failed task's name and a {@code null} assigned
 * resource location.
 */
@Test
public void testExceptionHistoryWithPreDeployFailure() {
    // slot requests are not auto-completed so that they can be timed out below
    executionSlotAllocatorFactory
            .getTestExecutionSlotAllocator()
            .disableAutoCompletePendingRequests();
    final DefaultScheduler scheduler =
            createSchedulerAndStartScheduling(singleNonParallelJobVertexJobGraph());

    executionSlotAllocatorFactory.getTestExecutionSlotAllocator().timeoutPendingRequests();

    // the vertex is captured before the restart executor processes the failure
    final ArchivedExecutionVertex failedVertex =
            Iterables.getOnlyElement(
                    scheduler
                            .requestJob()
                            .getArchivedExecutionGraph()
                            .getAllExecutionVertices());

    // the timed-out slot request yields a task failure that still has to be processed
    taskRestartExecutor.triggerNonPeriodicScheduledTask();

    // sanity check: the failed task never got a TaskManagerLocation assigned
    assertThat(failedVertex.getCurrentAssignedResourceLocation(), is(nullValue()));

    final ErrorInfo recordedFailure =
            failedVertex
                    .getFailureInfo()
                    .orElseThrow(() -> new AssertionError("A failureInfo should be set."));
    final String failedTaskName = failedVertex.getTaskNameWithSubtaskIndex();

    final Iterable<RootExceptionHistoryEntry> exceptionHistory =
            scheduler.getExceptionHistory();
    assertThat(
            exceptionHistory,
            IsIterableContainingInOrder.contains(
                    ExceptionHistoryEntryMatcher.matchesFailure(
                            recordedFailure.getException(),
                            recordedFailure.getTimestamp(),
                            failedTaskName,
                            failedVertex.getCurrentAssignedResourceLocation())));
}
Use of org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex in the Apache Flink project.
From the class AdaptiveSchedulerTest, method runExceptionHistoryTests.
/**
 * Shared driver for exception-history tests: builds an {@link AdaptiveScheduler} for a freshly
 * created job graph, starts it and offers {@code numAvailableSlots} slots, runs the supplied
 * test logic on {@code singleThreadMainThreadExecutor}, reports {@code CANCELED} for every
 * attempt whose cancellation was requested through the task manager gateway, waits for the
 * listener's terminal signal and finally returns the scheduler's exception history.
 *
 * @param testLogic invoked on {@code singleThreadMainThreadExecutor} with the built scheduler
 *     and the current attempt IDs of all execution vertices
 * @param setupScheduler receives the pre-configured builder right before {@code build()} is
 *     called
 * @param setupJobGraph receives the job graph before it is handed to the scheduler builder
 * @return the exception history obtained from {@code scheduler.requestJob()}
 * @throws Exception declared for the blocking waits and future lookups performed here
 */
private Iterable<RootExceptionHistoryEntry> runExceptionHistoryTests(BiConsumer<AdaptiveScheduler, List<ExecutionAttemptID>> testLogic, Consumer<AdaptiveSchedulerBuilder> setupScheduler, Consumer<JobGraph> setupJobGraph) throws Exception {
final int numAvailableSlots = 4;
final JobGraph jobGraph = createJobGraph();
setupJobGraph.accept(jobGraph);
RunFailedJobListener listener = new RunFailedJobListener();
// filled by the gateway's cancel consumer registered below; drained after the test logic ran
List<ExecutionAttemptID> cancelledTasks = new ArrayList<>();
final CompletedCheckpointStore completedCheckpointStore = new StandaloneCompletedCheckpointStore(1);
final CheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
final CheckpointsCleaner checkpointCleaner = new CheckpointsCleaner();
TestingCheckpointRecoveryFactory checkpointRecoveryFactory = new TestingCheckpointRecoveryFactory(completedCheckpointStore, checkpointIDCounter);
final DefaultDeclarativeSlotPool declarativeSlotPool = createDeclarativeSlotPool(jobGraph.getJobID());
final Configuration configuration = new Configuration();
// NOTE(review): the 1 ms resource wait timeout presumably keeps the scheduler from waiting long for more slots - confirm
configuration.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofMillis(1L));
AdaptiveSchedulerBuilder builder = new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor).setJobMasterConfiguration(configuration).setDeclarativeSlotPool(declarativeSlotPool).setCheckpointRecoveryFactory(checkpointRecoveryFactory).setCheckpointCleaner(checkpointCleaner).setJobStatusListener(listener);
// let the caller adjust the builder after the defaults above have been applied
setupScheduler.accept(builder);
final AdaptiveScheduler scheduler = builder.build();
final SubmissionBufferingTaskManagerGateway taskManagerGateway = new SubmissionBufferingTaskManagerGateway(numAvailableSlots);
// record every attempt the gateway is asked to cancel
taskManagerGateway.setCancelConsumer(cancelledTasks::add);
// start scheduling and offer the slots from the scheduler's main-thread executor
singleThreadMainThreadExecutor.execute(() -> {
scheduler.startScheduling();
offerSlots(declarativeSlotPool, createSlotOffersForResourceRequirements(ResourceCounter.withResource(ResourceProfile.UNKNOWN, numAvailableSlots)), taskManagerGateway);
});
// presumably blocks until the listener has observed the job running - confirm against RunFailedJobListener
listener.waitForRunning();
// fetch the execution vertices on the main-thread executor and hand them back through a future
CompletableFuture<Iterable<ArchivedExecutionVertex>> vertexFuture = new CompletableFuture<>();
singleThreadMainThreadExecutor.execute(() -> vertexFuture.complete(scheduler.requestJob().getArchivedExecutionGraph().getAllExecutionVertices()));
final Iterable<ArchivedExecutionVertex> executionVertices = vertexFuture.get();
// current attempt ID of every execution vertex, in iteration order
final List<ExecutionAttemptID> attemptIds = IterableUtils.toStream(executionVertices).map(ArchivedExecutionVertex::getCurrentExecutionAttempt).map(ArchivedExecution::getAttemptId).collect(Collectors.toList());
// run the caller's test logic on the main-thread executor and wait for it to finish
CompletableFuture<Void> runTestLogicFuture = CompletableFuture.runAsync(() -> testLogic.accept(scheduler, attemptIds), singleThreadMainThreadExecutor);
runTestLogicFuture.get();
// report CANCELED (without an error) for each attempt the gateway was asked to cancel
Consumer<ExecutionAttemptID> canceller = attemptId -> scheduler.updateTaskExecutionState(new TaskExecutionStateTransition(new TaskExecutionState(attemptId, ExecutionState.CANCELED, null)));
CompletableFuture<Void> cancelFuture = CompletableFuture.runAsync(() -> cancelledTasks.forEach(canceller), singleThreadMainThreadExecutor);
cancelFuture.get();
// presumably blocks until the listener has observed a terminal job status - confirm against RunFailedJobListener
listener.waitForTerminal();
// NOTE(review): unlike the calls above, this requestJob() runs on the calling thread, not on the main-thread executor
return scheduler.requestJob().getExceptionHistory();
}
Aggregations