Search in sources:

Example 6 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

In the class ExecutionGraphRestartTest, method testFailWhileCanceling.

/**
 * Checks that a global failure reported while the job is being cancelled moves the job from
 * {@code CANCELLING} to {@code FAILING}, and that the job ends up {@code FAILED} once
 * cancellation of the tasks has been completed.
 */
@Test
public void testFailWhileCanceling() throws Exception {
    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase scheduler =
                SchedulerTestingUtils.newSchedulerBuilder(createJobGraph(), mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(false, Long.MAX_VALUE))
                        .build();
        final ExecutionGraph executionGraph = scheduler.getExecutionGraph();

        // bring the job into RUNNING with every task running
        startScheduling(scheduler);
        offerSlots(pool, NUM_TASKS);
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        switchAllTasksToRunning(executionGraph);

        // cancel first ...
        scheduler.cancel();
        assertEquals(JobStatus.CANCELLING, executionGraph.getState());

        // ... then report a global failure while cancellation is still in progress
        scheduler.handleGlobalFailure(new Exception("test"));
        assertEquals(JobStatus.FAILING, executionGraph.getState());

        // once every task has finished cancelling the job must be FAILED
        completeCanceling(executionGraph);
        assertEquals(JobStatus.FAILED, executionGraph.getState());
    }
}
Also used : TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) IOException(java.io.IOException) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Test(org.junit.Test)

Example 7 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

In the class ExecutionGraphRestartTest, method testFailExecutionAfterCancel.

/**
 * Verifies that failing executions through {@link Execution#fail(Throwable)} after the job has
 * been cancelled does not restart the graph; the job must still end up {@code CANCELED}. Such a
 * failure can occur when a slot is released concurrently with the cancellation.
 */
@Test
public void testFailExecutionAfterCancel() throws Exception {
    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase scheduler =
                SchedulerTestingUtils.newSchedulerBuilder(
                                createJobGraphToCancel(), mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(false, Long.MAX_VALUE))
                        .setDelayExecutor(taskRestartExecutor)
                        .build();
        final ExecutionGraph executionGraph = scheduler.getExecutionGraph();

        startScheduling(scheduler);
        offerSlots(pool, 1);

        // Cancel, then immediately fail every current attempt
        // (as would happen with a concurrent slot release).
        scheduler.cancel();
        for (ExecutionVertex vertex : executionGraph.getAllExecutionVertices()) {
            final Execution currentAttempt = vertex.getCurrentExecutionAttempt();
            currentAttempt.fail(new Exception("Test Exception"));
        }

        assertEquals(JobStatus.CANCELED, executionGraph.getTerminationFuture().get());

        // Completing the cancellation of the first vertex's attempt afterwards
        // must leave the job in CANCELED.
        final ExecutionVertex firstVertex =
                executionGraph.getAllExecutionVertices().iterator().next();
        final Execution firstAttempt = firstVertex.getCurrentExecutionAttempt();
        firstAttempt.completeCancelling();

        assertEquals(JobStatus.CANCELED, executionGraph.getState());
    }
}
Also used : TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) IOException(java.io.IOException) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) Test(org.junit.Test)

Example 8 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

In the class DefaultSchedulerFactory, method createInstance.

/**
 * Builds a {@link DefaultScheduler} for the given job.
 *
 * <p>The supplied {@code slotPoolService} must be castable to a {@link SlotPool}. The given
 * {@code jobStatusListener} is wrapped so that, on every job status change, the slot pool is
 * first told whether the job is currently restarting before the change is forwarded.
 *
 * @throws IllegalStateException if {@code slotPoolService} cannot be cast into a {@link SlotPool}
 */
@Override
public SchedulerNG createInstance(
        final Logger log,
        final JobGraph jobGraph,
        final Executor ioExecutor,
        final Configuration jobMasterConfiguration,
        final SlotPoolService slotPoolService,
        final ScheduledExecutorService futureExecutor,
        final ClassLoader userCodeLoader,
        final CheckpointRecoveryFactory checkpointRecoveryFactory,
        final Time rpcTimeout,
        final BlobWriter blobWriter,
        final JobManagerJobMetricGroup jobManagerJobMetricGroup,
        final Time slotRequestTimeout,
        final ShuffleMaster<?> shuffleMaster,
        final JobMasterPartitionTracker partitionTracker,
        final ExecutionDeploymentTracker executionDeploymentTracker,
        long initializationTimestamp,
        final ComponentMainThreadExecutor mainThreadExecutor,
        final FatalErrorHandler fatalErrorHandler,
        final JobStatusListener jobStatusListener)
        throws Exception {

    // The default scheduler only works on top of a SlotPool.
    final SlotPool slotPool =
            slotPoolService
                    .castInto(SlotPool.class)
                    .orElseThrow(
                            () ->
                                    new IllegalStateException(
                                            "The DefaultScheduler requires a SlotPool."));

    final DefaultSchedulerComponents schedulerComponents =
            createSchedulerComponents(
                    jobGraph.getJobType(),
                    jobGraph.isApproximateLocalRecoveryEnabled(),
                    jobMasterConfiguration,
                    slotPool,
                    slotRequestTimeout);

    // The restart strategy comes from the job's serialized execution config.
    final RestartBackoffTimeStrategy restartBackoffTimeStrategy =
            RestartBackoffTimeStrategyFactoryLoader.createRestartBackoffTimeStrategyFactory(
                            jobGraph.getSerializedExecutionConfig()
                                    .deserializeValue(userCodeLoader)
                                    .getRestartStrategy(),
                            jobMasterConfiguration,
                            jobGraph.isCheckpointingEnabled())
                    .create();
    log.info(
            "Using restart back off time strategy {} for {} ({}).",
            restartBackoffTimeStrategy,
            jobGraph.getName(),
            jobGraph.getJobID());

    final ExecutionGraphFactory executionGraphFactory =
            new DefaultExecutionGraphFactory(
                    jobMasterConfiguration,
                    userCodeLoader,
                    executionDeploymentTracker,
                    futureExecutor,
                    ioExecutor,
                    rpcTimeout,
                    jobManagerJobMetricGroup,
                    blobWriter,
                    shuffleMaster,
                    partitionTracker);

    return new DefaultScheduler(
            log,
            jobGraph,
            ioExecutor,
            jobMasterConfiguration,
            schedulerComponents.getStartUpAction(),
            new ScheduledExecutorServiceAdapter(futureExecutor),
            userCodeLoader,
            new CheckpointsCleaner(),
            checkpointRecoveryFactory,
            jobManagerJobMetricGroup,
            schedulerComponents.getSchedulingStrategyFactory(),
            FailoverStrategyFactoryLoader.loadFailoverStrategyFactory(jobMasterConfiguration),
            restartBackoffTimeStrategy,
            new DefaultExecutionVertexOperations(),
            new ExecutionVertexVersioner(),
            schedulerComponents.getAllocatorFactory(),
            initializationTimestamp,
            mainThreadExecutor,
            (jobId, jobStatus, timestamp) -> {
                // Keep the slot pool's restarting flag in sync, then notify the real listener.
                slotPool.setIsJobRestarting(jobStatus == JobStatus.RESTARTING);
                jobStatusListener.jobStatusChanges(jobId, jobStatus, timestamp);
            },
            executionGraphFactory,
            shuffleMaster,
            rpcTimeout);
}
Also used : ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) RestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.RestartBackoffTimeStrategy) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool)

Example 9 with SlotPool

use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in project flink by apache.

In the class DefaultSchedulerBatchSchedulingTest, method testSchedulingOfJobWithFewerSlotsThanParallelism.

/**
 * Verifies that a batch job whose parallelism exceeds the number of offered slots can still run
 * to completion: only a single slot is offered for a job with parallelism 5, and the job is
 * expected to reach {@code FINISHED}. See FLINK-13187 for more information.
 */
@Test
public void testSchedulingOfJobWithFewerSlotsThanParallelism() throws Exception {
    final int parallelism = 5;
    final Time batchSlotTimeout = Time.milliseconds(5L);
    final JobGraph jobGraph = createBatchJobGraph(parallelism);

    try (final SlotPool slotPool = createSlotPool(mainThreadExecutor, batchSlotTimeout)) {
        // every task submission records its attempt id here
        final ArrayBlockingQueue<ExecutionAttemptID> submittedAttempts =
                new ArrayBlockingQueue<>(parallelism);

        final TestingTaskExecutorGateway taskExecutorGateway =
                new TestingTaskExecutorGatewayBuilder()
                        .setSubmitTaskConsumer(
                                (deployment, unused) -> {
                                    submittedAttempts.offer(deployment.getExecutionAttemptId());
                                    return CompletableFuture.completedFuture(Acknowledge.get());
                                })
                        .createTestingTaskExecutorGateway();

        final PhysicalSlotProvider slotProvider =
                new PhysicalSlotProviderImpl(
                        LocationPreferenceSlotSelectionStrategy.createDefault(), slotPool);

        final GloballyTerminalJobStatusListener statusListener =
                new GloballyTerminalJobStatusListener();

        final SchedulerNG scheduler =
                createScheduler(
                        jobGraph,
                        mainThreadExecutor,
                        slotProvider,
                        batchSlotTimeout,
                        statusListener);

        CompletableFuture.runAsync(scheduler::startScheduling, mainThreadExecutor).join();

        // register a single slot at the slot pool
        final RpcTaskManagerGateway taskManagerGateway =
                new RpcTaskManagerGateway(taskExecutorGateway, JobMasterId.generate());
        SlotPoolUtils.offerSlots(
                slotPool,
                mainThreadExecutor,
                Collections.singletonList(ResourceProfile.ANY),
                taskManagerGateway);

        // wait until the batch slot timeout has been reached
        Thread.sleep(batchSlotTimeout.toMilliseconds());

        final CompletableFuture<JobStatus> terminationFuture =
                statusListener.getTerminationFuture();

        // one round per subtask: wait for its submission, then finish it
        for (int pending = parallelism; pending > 0; pending--) {
            final CompletableFuture<ExecutionAttemptID> nextSubmission =
                    CompletableFuture.supplyAsync(
                            CheckedSupplier.unchecked(submittedAttempts::take));

            // wait until one of them is completed
            CompletableFuture.anyOf(nextSubmission, terminationFuture).join();

            if (!nextSubmission.isDone()) {
                fail(
                        String.format(
                                "Job reached a globally terminal state %s before all executions were finished.",
                                terminationFuture.get()));
            }
            finishExecution(nextSubmission.get(), scheduler, mainThreadExecutor);
        }

        assertThat(terminationFuture.get(), is(JobStatus.FINISHED));
    }
}
Also used : ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) BeforeClass(org.junit.BeforeClass) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) DeclarativeSlotPoolBridgeBuilder(org.apache.flink.runtime.jobmaster.slotpool.DeclarativeSlotPoolBridgeBuilder) SlotPoolUtils(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolUtils) RpcTaskManagerGateway(org.apache.flink.runtime.jobmaster.RpcTaskManagerGateway) JobGraphTestUtils(org.apache.flink.runtime.jobgraph.JobGraphTestUtils) TestLogger(org.apache.flink.util.TestLogger) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Assert.fail(org.junit.Assert.fail) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) CheckedSupplier(org.apache.flink.util.function.CheckedSupplier) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) AfterClass(org.junit.AfterClass) Logger(org.slf4j.Logger) PhysicalSlotProvider(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) JobMasterId(org.apache.flink.runtime.jobmaster.JobMasterId) Test(org.junit.Test) LocationPreferenceSlotSelectionStrategy(org.apache.flink.runtime.jobmaster.slotpool.LocationPreferenceSlotSelectionStrategy) JobStatusListener(org.apache.flink.runtime.executiongraph.JobStatusListener) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) Executors(java.util.concurrent.Executors) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) 
ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) PhysicalSlotProviderImpl(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) Matchers.is(org.hamcrest.Matchers.is) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) Collections(java.util.Collections) Time(org.apache.flink.api.common.time.Time) NoOpInvokable(org.apache.flink.runtime.testtasks.NoOpInvokable) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) RpcTaskManagerGateway(org.apache.flink.runtime.jobmaster.RpcTaskManagerGateway) Time(org.apache.flink.api.common.time.Time) TestingTaskExecutorGatewayBuilder(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGatewayBuilder) SlotPool(org.apache.flink.runtime.jobmaster.slotpool.SlotPool) JobStatus(org.apache.flink.api.common.JobStatus) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) PhysicalSlotProviderImpl(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) TestingTaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TestingTaskExecutorGateway) PhysicalSlotProvider(org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider) Test(org.junit.Test)

Aggregations

SlotPool (org.apache.flink.runtime.jobmaster.slotpool.SlotPool)9 Test (org.junit.Test)7 TestRestartBackoffTimeStrategy (org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy)6 IOException (java.io.IOException)5 SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase)5 CheckpointsCleaner (org.apache.flink.runtime.checkpoint.CheckpointsCleaner)3 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)3 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)3 PhysicalSlotProvider (org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProvider)3 PhysicalSlotProviderImpl (org.apache.flink.runtime.jobmaster.slotpool.PhysicalSlotProviderImpl)3 Collections (java.util.Collections)2 CompletableFuture (java.util.concurrent.CompletableFuture)2 Executors (java.util.concurrent.Executors)2 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)2 JobStatus (org.apache.flink.api.common.JobStatus)2 Time (org.apache.flink.api.common.time.Time)2 Duration (java.time.Duration)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 Iterator (java.util.Iterator)1