Search in sources :

Example 1 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

the class AdaptiveBatchSchedulerFactory method createInstance.

/**
 * Creates an {@link AdaptiveBatchScheduler} for the given job graph.
 *
 * <p>The job must be of type {@code JobType.BATCH} and must pass {@code
 * checkAllExchangesBlocking}. Slot allocation is built on top of the {@link SlotPool} obtained
 * from {@code slotPoolService}; the bulk checker created from that pool is handed to the
 * scheduler through its {@code start} method reference.
 *
 * <p>NOTE(review): {@code fatalErrorHandler} is not used by this implementation.
 *
 * @return the newly constructed adaptive batch scheduler
 * @throws IllegalStateException if {@code slotPoolService} cannot be cast into a {@link
 *     SlotPool}
 * @throws Exception propagated from the construction steps below (the method declares it; the
 *     exact sources are not visible here)
 */
@Override
public SchedulerNG createInstance(
        Logger log,
        JobGraph jobGraph,
        Executor ioExecutor,
        Configuration jobMasterConfiguration,
        SlotPoolService slotPoolService,
        ScheduledExecutorService futureExecutor,
        ClassLoader userCodeLoader,
        CheckpointRecoveryFactory checkpointRecoveryFactory,
        Time rpcTimeout,
        BlobWriter blobWriter,
        JobManagerJobMetricGroup jobManagerJobMetricGroup,
        Time slotRequestTimeout,
        ShuffleMaster<?> shuffleMaster,
        JobMasterPartitionTracker partitionTracker,
        ExecutionDeploymentTracker executionDeploymentTracker,
        long initializationTimestamp,
        ComponentMainThreadExecutor mainThreadExecutor,
        FatalErrorHandler fatalErrorHandler,
        JobStatusListener jobStatusListener)
        throws Exception {
    // Preconditions: only batch jobs that pass the blocking-exchange check are accepted.
    checkState(
            jobGraph.getJobType() == JobType.BATCH,
            "Adaptive batch scheduler only supports batch jobs");
    checkAllExchangesBlocking(jobGraph);

    // The message names this scheduler (it previously referred to the DefaultScheduler,
    // which was misleading when diagnosing a misconfigured slot pool service).
    final SlotPool slotPool =
            slotPoolService
                    .castInto(SlotPool.class)
                    .orElseThrow(
                            () ->
                                    new IllegalStateException(
                                            "The AdaptiveBatchScheduler requires a SlotPool."));

    // Slot allocation stack: selection strategy -> physical slot provider -> allocator factory.
    final SlotSelectionStrategy slotSelectionStrategy =
            SlotSelectionStrategyUtils.selectSlotSelectionStrategy(
                    JobType.BATCH, jobMasterConfiguration);
    final PhysicalSlotRequestBulkChecker bulkChecker =
            PhysicalSlotRequestBulkCheckerImpl.createFromSlotPool(
                    slotPool, SystemClock.getInstance());
    final PhysicalSlotProvider physicalSlotProvider =
            new PhysicalSlotProviderImpl(slotSelectionStrategy, slotPool);
    final ExecutionSlotAllocatorFactory allocatorFactory =
            new SlotSharingExecutionSlotAllocatorFactory(
                    physicalSlotProvider, false, bulkChecker, slotRequestTimeout);

    // The restart strategy comes from the job's serialized execution config, which has to be
    // deserialized with the user code class loader.
    final RestartBackoffTimeStrategy restartBackoffTimeStrategy =
            RestartBackoffTimeStrategyFactoryLoader.createRestartBackoffTimeStrategyFactory(
                            jobGraph.getSerializedExecutionConfig()
                                    .deserializeValue(userCodeLoader)
                                    .getRestartStrategy(),
                            jobMasterConfiguration,
                            jobGraph.isCheckpointingEnabled())
                    .create();
    log.info(
            "Using restart back off time strategy {} for {} ({}).",
            restartBackoffTimeStrategy,
            jobGraph.getName(),
            jobGraph.getJobID());

    final ExecutionGraphFactory executionGraphFactory =
            new DefaultExecutionGraphFactory(
                    jobMasterConfiguration,
                    userCodeLoader,
                    executionDeploymentTracker,
                    futureExecutor,
                    ioExecutor,
                    rpcTimeout,
                    jobManagerJobMetricGroup,
                    blobWriter,
                    shuffleMaster,
                    partitionTracker,
                    true);

    return new AdaptiveBatchScheduler(
            log,
            jobGraph,
            ioExecutor,
            jobMasterConfiguration,
            bulkChecker::start,
            new ScheduledExecutorServiceAdapter(futureExecutor),
            userCodeLoader,
            new CheckpointsCleaner(),
            checkpointRecoveryFactory,
            jobManagerJobMetricGroup,
            new VertexwiseSchedulingStrategy.Factory(),
            FailoverStrategyFactoryLoader.loadFailoverStrategyFactory(jobMasterConfiguration),
            restartBackoffTimeStrategy,
            new DefaultExecutionVertexOperations(),
            new ExecutionVertexVersioner(),
            allocatorFactory,
            initializationTimestamp,
            mainThreadExecutor,
            jobStatusListener,
            executionGraphFactory,
            shuffleMaster,
            rpcTimeout,
            DefaultVertexParallelismDecider.from(jobMasterConfiguration),
            jobMasterConfiguration.getInteger(
                    JobManagerOptions.ADAPTIVE_BATCH_SCHEDULER_MAX_PARALLELISM));
}
Also used : DefaultExecutionVertexOperations(org.apache.flink.runtime.scheduler.DefaultExecutionVertexOperations) SlotSharingExecutionSlotAllocatorFactory(org.apache.flink.runtime.scheduler.SlotSharingExecutionSlotAllocatorFactory) SlotSelectionStrategy(org.apache.flink.runtime.jobmaster.slotpool.SlotSelectionStrategy) VertexwiseSchedulingStrategy(org.apache.flink.runtime.scheduler.strategy.VertexwiseSchedulingStrategy) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) PhysicalSlotRequestBulkChecker(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotRequestBulkChecker) PhysicalSlotProviderImpl(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl) SlotSharingExecutionSlotAllocatorFactory(org.apache.flink.runtime.scheduler.SlotSharingExecutionSlotAllocatorFactory) ExecutionSlotAllocatorFactory(org.apache.flink.runtime.scheduler.ExecutionSlotAllocatorFactory) ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) RestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.RestartBackoffTimeStrategy) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) DefaultExecutionGraphFactory(org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory) ExecutionVertexVersioner(org.apache.flink.runtime.scheduler.ExecutionVertexVersioner) ExecutionGraphFactory(org.apache.flink.runtime.scheduler.ExecutionGraphFactory) DefaultExecutionGraphFactory(org.apache.flink.runtime.scheduler.DefaultExecutionGraphFactory) PhysicalSlotProvider(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider)

Example 2 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

the class DefaultSchedulerTest method testStatusMetrics.

/**
 * Checks that the {@code runningTimeTotal} gauge registered for the job reports a value greater
 * than zero once the first task has been submitted.
 */
@Test
public void testStatusMetrics() throws Exception {
    // the running-time gauge is used as the representative for the generic status time metrics
    final CompletableFuture<Gauge<Long>> runningTimeGaugeFuture = new CompletableFuture<>();
    final MetricRegistry registry =
            TestingMetricRegistry.builder()
                    .setRegisterConsumer(
                            (metric, name, group) -> {
                                switch (name) {
                                    case "runningTimeTotal":
                                        runningTimeGaugeFuture.complete((Gauge<Long>) metric);
                                        break;
                                }
                            })
                    .build();

    final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
    final JobVertex onlyJobVertex = getOnlyJobVertex(jobGraph);

    final Configuration configuration = new Configuration();
    configuration.set(
            MetricOptions.JOB_STATUS_METRICS,
            Arrays.asList(MetricOptions.JobStatusMetrics.TOTAL_TIME));

    final ComponentMainThreadExecutor schedulerMainThread =
            ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(
                    scheduledExecutorService);

    final Time batchSlotTimeout = Time.milliseconds(5L);
    final SlotPool pool =
            new DeclarativeSlotPoolBridgeBuilder()
                    .setBatchSlotTimeout(batchSlotTimeout)
                    .buildAndStart(schedulerMainThread);
    final PhysicalSlotProvider physicalSlotProvider =
            new PhysicalSlotProviderImpl(
                    LocationPreferenceSlotSelectionStrategy.createDefault(), pool);

    final DefaultScheduler scheduler =
            createSchedulerBuilder(jobGraph, schedulerMainThread)
                    .setJobMasterConfiguration(configuration)
                    .setJobManagerJobMetricGroup(
                            JobManagerMetricGroup.createJobManagerMetricGroup(
                                            registry, "localhost")
                                    .addJob(new JobID(), "jobName"))
                    .setExecutionSlotAllocatorFactory(
                            SchedulerTestingUtils.newSlotSharingExecutionSlotAllocatorFactory(
                                    physicalSlotProvider, batchSlotTimeout))
                    .build();

    final AdaptiveSchedulerTest.SubmissionBufferingTaskManagerGateway gateway =
            new AdaptiveSchedulerTest.SubmissionBufferingTaskManagerGateway(1);
    // cancellations are acknowledged by reporting CANCELED back from the scheduler's main thread
    gateway.setCancelConsumer(
            executionAttemptId ->
                    schedulerMainThread.execute(
                            () ->
                                    scheduler.updateTaskExecutionState(
                                            new TaskExecutionState(
                                                    executionAttemptId,
                                                    ExecutionState.CANCELED))));

    schedulerMainThread.execute(
            () -> {
                scheduler.startScheduling();
                offerSlots(
                        pool,
                        createSlotOffersForResourceRequirements(
                                ResourceCounter.withResource(ResourceProfile.UNKNOWN, 1)),
                        gateway);
            });

    // block until the first task has been submitted
    gateway.waitForSubmissions(1, Duration.ofSeconds(5));

    // give the clock a moment so that the reported running time is strictly positive
    Thread.sleep(10L);

    final Gauge<Long> runningTimeGauge = runningTimeGaugeFuture.get();
    Assert.assertThat(runningTimeGauge.getValue(), greaterThan(0L));
}
Also used : ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Arrays(java.util.Arrays) ArchivedExecution(org.apache.flink.runtime.executiongraph.ArchivedExecution) TestMasterHook(org.apache.flink.runtime.checkpoint.hooks.TestMasterHook) Is(org.hamcrest.core.Is) ExceptionUtils.findThrowable(org.apache.flink.util.ExceptionUtils.findThrowable) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) Duration(java.time.Duration) Matchers.nullValue(org.hamcrest.Matchers.nullValue) ClassRule(org.junit.ClassRule) TestingCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory) SchedulingStrategyFactory(org.apache.flink.runtime.scheduler.strategy.SchedulingStrategyFactory) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Matchers.notNullValue(org.hamcrest.Matchers.notNullValue) PhysicalSlotProvider(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider) Set(java.util.Set) MetricOptions(org.apache.flink.configuration.MetricOptions) Executors(java.util.concurrent.Executors) CountDownLatch(java.util.concurrent.CountDownLatch) Matchers.contains(org.hamcrest.Matchers.contains) Assert.assertFalse(org.junit.Assert.assertFalse) Matchers.greaterThan(org.hamcrest.Matchers.greaterThan) Matchers.is(org.hamcrest.Matchers.is) Matchers.containsString(org.hamcrest.Matchers.containsString) Time(org.apache.flink.api.common.time.Time) RootExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.RootExceptionHistoryEntry) PipelinedRegionSchedulingStrategy(org.apache.flink.runtime.scheduler.strategy.PipelinedRegionSchedulingStrategy) FlinkException(org.apache.flink.util.FlinkException) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) LocalTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) 
ResourceCounter(org.apache.flink.runtime.util.ResourceCounter) Callable(java.util.concurrent.Callable) JobStatus(org.apache.flink.api.common.JobStatus) TestFailoverStrategyFactory(org.apache.flink.runtime.executiongraph.utils.TestFailoverStrategyFactory) ArrayList(java.util.ArrayList) DirectScheduledExecutorService(org.apache.flink.runtime.testutils.DirectScheduledExecutorService) DeclarativeSlotPoolBridgeBuilder(org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridgeBuilder) Gauge(org.apache.flink.metrics.Gauge) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Matchers.hasSize(org.hamcrest.Matchers.hasSize) StreamSupport(java.util.stream.StreamSupport) Iterables(org.apache.flink.shaded.guava30.com.google.common.collect.Iterables) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) DistributionPattern(org.apache.flink.runtime.jobgraph.DistributionPattern) ArchivedExecutionVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex) Before(org.junit.Before) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ErrorInfo(org.apache.flink.runtime.executiongraph.ErrorInfo) Assert.assertTrue(org.junit.Assert.assertTrue) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) Test(org.junit.Test) LocationPreferenceSlotSelectionStrategy(org.apache.flink.runtime.jobmaster.slotpool.LocationPreferenceSlotSelectionStrategy) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) JobID(org.apache.flink.api.common.JobID) IsIterableWithSize(org.hamcrest.collection.IsIterableWithSize) SlotPoolTestUtils.offerSlots(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolTestUtils.offerSlots) Assert(org.junit.Assert) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) 
JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) IsEmptyIterable(org.hamcrest.collection.IsEmptyIterable) Assert.assertEquals(org.junit.Assert.assertEquals) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) IsIterableContainingInOrder(org.hamcrest.collection.IsIterableContainingInOrder) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) ScheduledFuture(java.util.concurrent.ScheduledFuture) ExceptionUtils.findThrowableWithMessage(org.apache.flink.util.ExceptionUtils.findThrowableWithMessage) ScheduledTask(org.apache.flink.core.testutils.ScheduledTask) CheckpointCoordinator(org.apache.flink.runtime.checkpoint.CheckpointCoordinator) BiFunction(java.util.function.BiFunction) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TestingLogicalSlotBuilder(org.apache.flink.runtime.jobmaster.TestingLogicalSlotBuilder) TestSchedulingStrategy(org.apache.flink.runtime.scheduler.strategy.TestSchedulingStrategy) After(org.junit.After) SchedulingTopology(org.apache.flink.runtime.scheduler.strategy.SchedulingTopology) TestLogger(org.apache.flink.util.TestLogger) Assert.fail(org.junit.Assert.fail) TestingShuffleMaster(org.apache.flink.runtime.shuffle.TestingShuffleMaster) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) ExecutionVertexID(org.apache.flink.runtime.scheduler.strategy.ExecutionVertexID) Preconditions(org.apache.flink.util.Preconditions) Collectors(java.util.stream.Collectors) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) ExecutorUtils(org.apache.flink.util.ExecutorUtils) PhysicalSlotProviderImpl(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl) List(java.util.List) 
SchedulerTestingUtils.acknowledgePendingCheckpoint(org.apache.flink.runtime.scheduler.SchedulerTestingUtils.acknowledgePendingCheckpoint) Matchers.containsInAnyOrder(org.hamcrest.Matchers.containsInAnyOrder) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) FailoverStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.FailoverStrategy) Matchers.equalTo(org.hamcrest.Matchers.equalTo) ExceptionHistoryEntryMatcher(org.apache.flink.runtime.scheduler.exceptionhistory.ExceptionHistoryEntryMatcher) RestartAllFailoverStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.RestartAllFailoverStrategy) RestartPipelinedRegionFailoverStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.RestartPipelinedRegionFailoverStrategy) NoResourceAvailableException(org.apache.flink.runtime.jobmanager.scheduler.NoResourceAvailableException) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ResultPartitionType(org.apache.flink.runtime.io.network.partition.ResultPartitionType) CompletableFuture(java.util.concurrent.CompletableFuture) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TestingJobMasterPartitionTracker(org.apache.flink.runtime.io.network.partition.TestingJobMasterPartitionTracker) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) ExecutorService(java.util.concurrent.ExecutorService) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Iterator(java.util.Iterator) Configuration(org.apache.flink.configuration.Configuration) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) 
DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements(org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements) CheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.CheckpointRecoveryFactory) SchedulerTestingUtils.getCheckpointCoordinator(org.apache.flink.runtime.scheduler.SchedulerTestingUtils.getCheckpointCoordinator) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) WebOptions(org.apache.flink.configuration.WebOptions) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) SchedulingExecutionVertex(org.apache.flink.runtime.scheduler.strategy.SchedulingExecutionVertex) SchedulerTestingUtils.enableCheckpointing(org.apache.flink.runtime.scheduler.SchedulerTestingUtils.enableCheckpointing) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) DeclarativeSlotPoolBridgeBuilder(org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridgeBuilder) Configuration(org.apache.flink.configuration.Configuration) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) Time(org.apache.flink.api.common.time.Time) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) Gauge(org.apache.flink.metrics.Gauge) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) CompletableFuture(java.util.concurrent.CompletableFuture) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) PhysicalSlotProviderImpl(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) 
ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) PhysicalSlotProvider(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider) JobID(org.apache.flink.api.common.JobID) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) Test(org.junit.Test)

Example 3 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

the class ExecutionGraphRestartTest method testCancelWhileFailing.

/**
 * Cancels a job while its execution graph is in {@code FAILING} (after a global failure) and
 * expects the graph to move to {@code CANCELLING} and, once cancellation of all tasks has been
 * completed, to {@code CANCELED}.
 */
@Test
public void testCancelWhileFailing() throws Exception {
    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase schedulerUnderTest =
                SchedulerTestingUtils.newSchedulerBuilder(createJobGraph(), mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(false, Long.MAX_VALUE))
                        .build();
        final ExecutionGraph executionGraph = schedulerUnderTest.getExecutionGraph();

        // bring the job up and get every task running
        startScheduling(schedulerUnderTest);
        offerSlots(pool, NUM_TASKS);
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        switchAllTasksToRunning(executionGraph);

        // a global failure puts the job into FAILING
        schedulerUnderTest.handleGlobalFailure(new Exception("test"));
        assertEquals(JobStatus.FAILING, executionGraph.getState());

        // cancelling at this point takes over from the failure handling
        schedulerUnderTest.cancel();
        assertEquals(JobStatus.CANCELLING, executionGraph.getState());

        // once every task has finished cancelling, the job is CANCELED
        completeCanceling(executionGraph);
        assertEquals(JobStatus.CANCELED, executionGraph.getState());
    }
}
Also used : TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) IOException(java.io.IOException) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Test(org.junit.Test)

Example 4 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

the class ExecutionGraphRestartTest method testCancelWhileRestarting.

/**
 * Cancels a job while it is in {@code RESTARTING} and expects it to end up {@code CANCELED},
 * to stay {@code CANCELED} after the scheduled restart tasks are triggered, and to leave every
 * execution vertex in {@code FAILED}.
 */
@Test
public void testCancelWhileRestarting() throws Exception {
    // restarts are driven manually through taskRestartExecutor
    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase schedulerUnderTest =
                SchedulerTestingUtils.newSchedulerBuilder(createJobGraph(), mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(true, Long.MAX_VALUE))
                        .setDelayExecutor(taskRestartExecutor)
                        .build();
        final ExecutionGraph graph = schedulerUnderTest.getExecutionGraph();

        startScheduling(schedulerUnderTest);
        final ResourceID releasedTaskManager = offerSlots(pool, NUM_TASKS);

        // losing the TaskManager sends the job into RESTARTING
        pool.releaseTaskManager(releasedTaskManager, new Exception("Test Exception"));
        assertEquals(JobStatus.RESTARTING, graph.getState());

        // cancelling has to abort the pending restart
        schedulerUnderTest.cancel();
        assertEquals(JobStatus.CANCELED, graph.getState());

        // running the scheduled restart afterwards must not change the outcome
        taskRestartExecutor.triggerScheduledTasks();
        assertEquals(JobStatus.CANCELED, graph.getState());

        graph.getAllExecutionVertices()
                .forEach(
                        vertex ->
                                assertEquals(
                                        ExecutionState.FAILED, vertex.getExecutionState()));
    }
}
Also used : TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) IOException(java.io.IOException) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Test(org.junit.Test)

Example 5 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

the class ExecutionGraphRestartTest method testFailingExecutionAfterRestart.

/**
 * Verifies that failing an execution which has already reached a terminal state has no effect
 * on a job that was restarted in the meantime. This matters when a callback handler fails an
 * old execution after the restart has already happened.
 */
@Test
public void testFailingExecutionAfterRestart() throws Exception {
    final JobVertex firstVertex =
            ExecutionGraphTestUtils.createJobVertex("Task1", 1, NoOpInvokable.class);
    final JobVertex secondVertex =
            ExecutionGraphTestUtils.createJobVertex("Task2", 1, NoOpInvokable.class);
    final JobGraph streamingGraph = JobGraphTestUtils.streamingJobGraph(firstVertex, secondVertex);

    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase schedulerUnderTest =
                SchedulerTestingUtils.newSchedulerBuilder(streamingGraph, mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(true, Long.MAX_VALUE))
                        .setDelayExecutor(taskRestartExecutor)
                        .build();
        final ExecutionGraph executionGraph = schedulerUnderTest.getExecutionGraph();

        startScheduling(schedulerUnderTest);
        offerSlots(pool, 2);

        // take the current attempts of the two vertices: one finishes, the other one fails
        final Iterator<ExecutionVertex> vertexIterator =
                executionGraph.getAllExecutionVertices().iterator();
        final Execution oldFinishedAttempt = vertexIterator.next().getCurrentExecutionAttempt();
        final Execution oldFailedAttempt = vertexIterator.next().getCurrentExecutionAttempt();
        oldFinishedAttempt.markFinished();
        oldFailedAttempt.fail(new Exception("Test Exception"));
        oldFailedAttempt.completeCancelling();

        // run the scheduled restart; afterwards the job is RUNNING again
        taskRestartExecutor.triggerScheduledTasks();
        assertEquals(JobStatus.RUNNING, executionGraph.getState());

        // every vertex is expected to have a resource assigned by now
        for (ExecutionVertex restartedVertex : executionGraph.getAllExecutionVertices()) {
            assertNotNull(
                    "No assigned resource (test instability).",
                    restartedVertex.getCurrentAssignedResource());
            restartedVertex.getCurrentExecutionAttempt().switchToRecovering();
            restartedVertex.getCurrentExecutionAttempt().switchToRunning();
        }

        // failing the old, already finished attempt must not influence the restarted job
        oldFinishedAttempt.fail(new Exception("This should have no effect"));

        for (ExecutionVertex restartedVertex : executionGraph.getAllExecutionVertices()) {
            restartedVertex.getCurrentExecutionAttempt().markFinished();
        }

        // FINISHED is terminal, so the old attempt keeps its state and the job finishes normally
        assertEquals(ExecutionState.FINISHED, oldFinishedAttempt.getState());
        assertEquals(JobStatus.FINISHED, executionGraph.getState());
    }
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) IOException(java.io.IOException) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Test(org.junit.Test)

Aggregations

SlotPool (org.apache.flink.runtime.jobmaster.slotpool.SlotPool)9 Test (org.junit.Test)7 TestRestartBackoffTimeStrategy (org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy)6 IOException (java.io.IOException)5 SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase)5 CheckpointsCleaner (org.apache.flink.runtime.checkpoint.CheckpointsCleaner)3 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)3 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)3 PhysicalSlotProvider (org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider)3 PhysicalSlotProviderImpl (org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl)3 Collections (java.util.Collections)2 CompletableFuture (java.util.concurrent.CompletableFuture)2 Executors (java.util.concurrent.Executors)2 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)2 JobStatus (org.apache.flink.api.common.JobStatus)2 Time (org.apache.flink.api.common.time.Time)2 Duration (java.time.Duration)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Iterator (java.util.Iterator)1