Use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in the Apache Flink project.
Class AdaptiveBatchSchedulerFactory, method createInstance.
/**
 * Assembles an {@link AdaptiveBatchScheduler} for the given job graph.
 *
 * <p>The job must be of type {@code BATCH} and is passed through {@code
 * checkAllExchangesBlocking} before any scheduler component is created. The slot allocation
 * components are built on top of the {@link SlotPool} obtained from {@code slotPoolService}.
 *
 * @return the newly created adaptive batch scheduler
 * @throws IllegalStateException if {@code slotPoolService} cannot be cast into a {@link
 *     SlotPool}
 * @throws Exception if any of the scheduler components cannot be created
 */
@Override
public SchedulerNG createInstance(
        Logger log,
        JobGraph jobGraph,
        Executor ioExecutor,
        Configuration jobMasterConfiguration,
        SlotPoolService slotPoolService,
        ScheduledExecutorService futureExecutor,
        ClassLoader userCodeLoader,
        CheckpointRecoveryFactory checkpointRecoveryFactory,
        Time rpcTimeout,
        BlobWriter blobWriter,
        JobManagerJobMetricGroup jobManagerJobMetricGroup,
        Time slotRequestTimeout,
        ShuffleMaster<?> shuffleMaster,
        JobMasterPartitionTracker partitionTracker,
        ExecutionDeploymentTracker executionDeploymentTracker,
        long initializationTimestamp,
        ComponentMainThreadExecutor mainThreadExecutor,
        FatalErrorHandler fatalErrorHandler,
        JobStatusListener jobStatusListener)
        throws Exception {

    // Preconditions on the submitted job graph.
    checkState(
            jobGraph.getJobType() == JobType.BATCH,
            "Adaptive batch scheduler only supports batch jobs");
    checkAllExchangesBlocking(jobGraph);

    // Slot allocation: slot pool -> physical slot provider -> slot-sharing allocator factory.
    // NOTE(review): the message below names the DefaultScheduler although this factory builds
    // an AdaptiveBatchScheduler - confirm whether the wording is intentional.
    final SlotPool pool =
            slotPoolService
                    .castInto(SlotPool.class)
                    .orElseThrow(
                            () ->
                                    new IllegalStateException(
                                            "The DefaultScheduler requires a SlotPool."));
    final SlotSelectionStrategy selectionStrategy =
            SlotSelectionStrategyUtils.selectSlotSelectionStrategy(
                    JobType.BATCH, jobMasterConfiguration);
    final PhysicalSlotRequestBulkChecker slotRequestBulkChecker =
            PhysicalSlotRequestBulkCheckerImpl.createFromSlotPool(
                    pool, SystemClock.getInstance());
    final PhysicalSlotProvider slotProvider =
            new PhysicalSlotProviderImpl(selectionStrategy, pool);
    final ExecutionSlotAllocatorFactory slotAllocatorFactory =
            new SlotSharingExecutionSlotAllocatorFactory(
                    slotProvider, false, slotRequestBulkChecker, slotRequestTimeout);

    // Restart behaviour is derived from the job's serialized execution config.
    final RestartBackoffTimeStrategy backoffStrategy =
            RestartBackoffTimeStrategyFactoryLoader.createRestartBackoffTimeStrategyFactory(
                            jobGraph.getSerializedExecutionConfig()
                                    .deserializeValue(userCodeLoader)
                                    .getRestartStrategy(),
                            jobMasterConfiguration,
                            jobGraph.isCheckpointingEnabled())
                    .create();
    log.info(
            "Using restart back off time strategy {} for {} ({}).",
            backoffStrategy,
            jobGraph.getName(),
            jobGraph.getJobID());

    final ExecutionGraphFactory graphFactory =
            new DefaultExecutionGraphFactory(
                    jobMasterConfiguration,
                    userCodeLoader,
                    executionDeploymentTracker,
                    futureExecutor,
                    ioExecutor,
                    rpcTimeout,
                    jobManagerJobMetricGroup,
                    blobWriter,
                    shuffleMaster,
                    partitionTracker,
                    true);

    return new AdaptiveBatchScheduler(
            log,
            jobGraph,
            ioExecutor,
            jobMasterConfiguration,
            slotRequestBulkChecker::start,
            new ScheduledExecutorServiceAdapter(futureExecutor),
            userCodeLoader,
            new CheckpointsCleaner(),
            checkpointRecoveryFactory,
            jobManagerJobMetricGroup,
            new VertexwiseSchedulingStrategy.Factory(),
            FailoverStrategyFactoryLoader.loadFailoverStrategyFactory(jobMasterConfiguration),
            backoffStrategy,
            new DefaultExecutionVertexOperations(),
            new ExecutionVertexVersioner(),
            slotAllocatorFactory,
            initializationTimestamp,
            mainThreadExecutor,
            jobStatusListener,
            graphFactory,
            shuffleMaster,
            rpcTimeout,
            DefaultVertexParallelismDecider.from(jobMasterConfiguration),
            jobMasterConfiguration.getInteger(
                    JobManagerOptions.ADAPTIVE_BATCH_SCHEDULER_MAX_PARALLELISM));
}
Use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in the Apache Flink project.
Class DefaultSchedulerTest, method testStatusMetrics.
/**
 * Checks that the {@code runningTimeTotal} gauge is registered and reports a value greater than
 * zero once the job's first task has been submitted.
 */
@Test
public void testStatusMetrics() throws Exception {
    // running time acts as a stand-in for generic status time metrics
    final CompletableFuture<Gauge<Long>> runningTimeGaugeFuture = new CompletableFuture<>();
    final MetricRegistry registry =
            TestingMetricRegistry.builder()
                    .setRegisterConsumer(
                            (metric, name, group) -> {
                                // capture only the gauge this test is interested in
                                if (name.equals("runningTimeTotal")) {
                                    runningTimeGaugeFuture.complete((Gauge<Long>) metric);
                                }
                            })
                    .build();

    final JobGraph jobGraph = singleNonParallelJobVertexJobGraph();
    // NOTE(review): the returned vertex is not used further in this test.
    final JobVertex soleVertex = getOnlyJobVertex(jobGraph);

    final Configuration jobMasterConfig = new Configuration();
    jobMasterConfig.set(
            MetricOptions.JOB_STATUS_METRICS,
            Arrays.asList(MetricOptions.JobStatusMetrics.TOTAL_TIME));

    final ComponentMainThreadExecutor mainThread =
            ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(
                    scheduledExecutorService);

    final Time slotTimeout = Time.milliseconds(5L);
    final SlotPool pool =
            new DeclarativeSlotPoolBridgeBuilder()
                    .setBatchSlotTimeout(slotTimeout)
                    .buildAndStart(mainThread);
    final PhysicalSlotProvider physicalSlotProvider =
            new PhysicalSlotProviderImpl(
                    LocationPreferenceSlotSelectionStrategy.createDefault(), pool);

    final DefaultScheduler scheduler =
            createSchedulerBuilder(jobGraph, mainThread)
                    .setJobMasterConfiguration(jobMasterConfig)
                    .setJobManagerJobMetricGroup(
                            JobManagerMetricGroup.createJobManagerMetricGroup(
                                            registry, "localhost")
                                    .addJob(new JobID(), "jobName"))
                    .setExecutionSlotAllocatorFactory(
                            SchedulerTestingUtils.newSlotSharingExecutionSlotAllocatorFactory(
                                    physicalSlotProvider, slotTimeout))
                    .build();

    final AdaptiveSchedulerTest.SubmissionBufferingTaskManagerGateway gateway =
            new AdaptiveSchedulerTest.SubmissionBufferingTaskManagerGateway(1);
    // report cancelled attempts back to the scheduler via the main thread executor
    gateway.setCancelConsumer(
            executionAttemptId ->
                    mainThread.execute(
                            () ->
                                    scheduler.updateTaskExecutionState(
                                            new TaskExecutionState(
                                                    executionAttemptId,
                                                    ExecutionState.CANCELED))));

    mainThread.execute(
            () -> {
                scheduler.startScheduling();
                offerSlots(
                        pool,
                        createSlotOffersForResourceRequirements(
                                ResourceCounter.withResource(ResourceProfile.UNKNOWN, 1)),
                        gateway);
            });

    // wait for the first task submission
    gateway.waitForSubmissions(1, Duration.ofSeconds(5));

    // sleep a bit to ensure uptime is > 0
    Thread.sleep(10L);

    final Gauge<Long> runningTimeGauge = runningTimeGaugeFuture.get();
    Assert.assertThat(runningTimeGauge.getValue(), greaterThan(0L));
}
Use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in the Apache Flink project.
Class ExecutionGraphRestartTest, method testCancelWhileFailing.
/**
 * Cancels a job while it is in state {@code FAILING} and expects it to go through {@code
 * CANCELLING} and end up in {@code CANCELED} once all tasks finished cancelling.
 */
@Test
public void testCancelWhileFailing() throws Exception {
    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase schedulerUnderTest =
                SchedulerTestingUtils.newSchedulerBuilder(createJobGraph(), mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(false, Long.MAX_VALUE))
                        .build();
        final ExecutionGraph executionGraph = schedulerUnderTest.getExecutionGraph();

        // bring the job into RUNNING with all tasks running
        startScheduling(schedulerUnderTest);
        offerSlots(pool, NUM_TASKS);
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        switchAllTasksToRunning(executionGraph);

        // a global failure puts the job into FAILING
        schedulerUnderTest.handleGlobalFailure(new Exception("test"));
        assertEquals(JobStatus.FAILING, executionGraph.getState());

        // cancelling while failing switches to CANCELLING
        schedulerUnderTest.cancel();
        assertEquals(JobStatus.CANCELLING, executionGraph.getState());

        // let all tasks finish cancelling
        completeCanceling(executionGraph);
        assertEquals(JobStatus.CANCELED, executionGraph.getState());
    }
}
Use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in the Apache Flink project.
Class ExecutionGraphRestartTest, method testCancelWhileRestarting.
/**
 * Cancels a job while it is in state {@code RESTARTING} and expects the job to be {@code
 * CANCELED}, also after the scheduled restart tasks have been triggered.
 */
@Test
public void testCancelWhileRestarting() throws Exception {
    // We want to manually control the restart and delay
    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase schedulerUnderTest =
                SchedulerTestingUtils.newSchedulerBuilder(createJobGraph(), mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(true, Long.MAX_VALUE))
                        .setDelayExecutor(taskRestartExecutor)
                        .build();
        final ExecutionGraph graph = schedulerUnderTest.getExecutionGraph();

        startScheduling(schedulerUnderTest);
        final ResourceID taskManagerId = offerSlots(pool, NUM_TASKS);

        // Release the TaskManager and wait for the job to restart
        pool.releaseTaskManager(taskManagerId, new Exception("Test Exception"));
        assertEquals(JobStatus.RESTARTING, graph.getState());

        // Canceling needs to abort the restart
        schedulerUnderTest.cancel();
        assertEquals(JobStatus.CANCELED, graph.getState());

        // triggering the pending restart tasks must not change the job status
        taskRestartExecutor.triggerScheduledTasks();
        assertEquals(JobStatus.CANCELED, graph.getState());

        graph.getAllExecutionVertices()
                .forEach(
                        vertex ->
                                assertEquals(
                                        ExecutionState.FAILED, vertex.getExecutionState()));
    }
}
Use of org.apache.flink.runtime.jobmaster.slotpool.SlotPool in the Apache Flink project.
Class ExecutionGraphRestartTest, method testFailingExecutionAfterRestart.
/**
 * Verifies that failing an execution from before a restart leaves the restarted job untouched.
 * This matters when a callback handler fails an execution that already reached a final state
 * after the job has been restarted.
 */
@Test
public void testFailingExecutionAfterRestart() throws Exception {
    final JobVertex sender =
            ExecutionGraphTestUtils.createJobVertex("Task1", 1, NoOpInvokable.class);
    final JobVertex receiver =
            ExecutionGraphTestUtils.createJobVertex("Task2", 1, NoOpInvokable.class);
    final JobGraph jobGraph = JobGraphTestUtils.streamingJobGraph(sender, receiver);

    try (SlotPool pool = SlotPoolUtils.createDeclarativeSlotPoolBridge()) {
        final SchedulerBase schedulerUnderTest =
                SchedulerTestingUtils.newSchedulerBuilder(jobGraph, mainThreadExecutor)
                        .setExecutionSlotAllocatorFactory(
                                createExecutionSlotAllocatorFactory(pool))
                        .setRestartBackoffTimeStrategy(
                                new TestRestartBackoffTimeStrategy(true, Long.MAX_VALUE))
                        .setDelayExecutor(taskRestartExecutor)
                        .build();
        final ExecutionGraph graph = schedulerUnderTest.getExecutionGraph();

        startScheduling(schedulerUnderTest);
        offerSlots(pool, 2);

        // pick the current attempts of the two vertices: one finishes, the other fails
        final Iterator<ExecutionVertex> vertexIterator =
                graph.getAllExecutionVertices().iterator();
        final Execution finishedExecution = vertexIterator.next().getCurrentExecutionAttempt();
        final Execution failedExecution = vertexIterator.next().getCurrentExecutionAttempt();

        finishedExecution.markFinished();
        failedExecution.fail(new Exception("Test Exception"));
        failedExecution.completeCancelling();

        // run the scheduled restart
        taskRestartExecutor.triggerScheduledTasks();
        assertEquals(JobStatus.RUNNING, graph.getState());

        // At this point all resources have been assigned
        for (ExecutionVertex restartedVertex : graph.getAllExecutionVertices()) {
            assertNotNull(
                    "No assigned resource (test instability).",
                    restartedVertex.getCurrentAssignedResource());
            restartedVertex.getCurrentExecutionAttempt().switchToRecovering();
            restartedVertex.getCurrentExecutionAttempt().switchToRunning();
        }

        // fail old finished execution, this should not affect the execution
        finishedExecution.fail(new Exception("This should have no effect"));

        for (ExecutionVertex restartedVertex : graph.getAllExecutionVertices()) {
            restartedVertex.getCurrentExecutionAttempt().markFinished();
        }

        // the state of the finished execution should have not changed since it is terminal
        assertEquals(ExecutionState.FINISHED, finishedExecution.getState());
        assertEquals(JobStatus.FINISHED, graph.getState());
    }
}
Aggregations