Search in sources :

Example 1 with UpTimeGauge

use of org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge in project flink by apache.

the class SchedulerBase method registerJobMetrics.

public static void registerJobMetrics(MetricGroup metrics, JobStatusProvider jobStatusProvider, Gauge<Long> numberOfRestarts, DeploymentStateTimeMetrics deploymentTimeMetrics, Consumer<JobStatusListener> jobStatusListenerRegistrar, long initializationTimestamp, MetricOptions.JobStatusMetricsSettings jobStatusMetricsSettings) {
    metrics.gauge(DownTimeGauge.METRIC_NAME, new DownTimeGauge(jobStatusProvider));
    metrics.gauge(UpTimeGauge.METRIC_NAME, new UpTimeGauge(jobStatusProvider));
    metrics.gauge(MetricNames.NUM_RESTARTS, numberOfRestarts);
    metrics.gauge(MetricNames.FULL_RESTARTS, numberOfRestarts);
    final JobStatusMetrics jobStatusMetrics = new JobStatusMetrics(initializationTimestamp, jobStatusMetricsSettings);
    jobStatusMetrics.registerMetrics(metrics);
    jobStatusListenerRegistrar.accept(jobStatusMetrics);
    deploymentTimeMetrics.registerMetrics(metrics);
}
Also used : DownTimeGauge(org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge) UpTimeGauge(org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge) JobStatusMetrics(org.apache.flink.runtime.scheduler.metrics.JobStatusMetrics)

Example 2 with UpTimeGauge

use of org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge in project flink by apache.

the class AdaptiveSchedulerTest method testStatusMetrics.

@Test
public void testStatusMetrics() throws Exception {
    final CompletableFuture<UpTimeGauge> upTimeMetricFuture = new CompletableFuture<>();
    final CompletableFuture<DownTimeGauge> downTimeMetricFuture = new CompletableFuture<>();
    // restartingTime acts as a stand-in for generic status time metrics
    final CompletableFuture<Gauge<Long>> restartTimeMetricFuture = new CompletableFuture<>();
    final MetricRegistry metricRegistry = TestingMetricRegistry.builder().setRegisterConsumer((metric, name, group) -> {
        switch(name) {
            case UpTimeGauge.METRIC_NAME:
                upTimeMetricFuture.complete((UpTimeGauge) metric);
                break;
            case DownTimeGauge.METRIC_NAME:
                downTimeMetricFuture.complete((DownTimeGauge) metric);
                break;
            case "restartingTimeTotal":
                restartTimeMetricFuture.complete((Gauge<Long>) metric);
                break;
        }
    }).build();
    final JobGraph jobGraph = createJobGraph();
    final DefaultDeclarativeSlotPool declarativeSlotPool = createDeclarativeSlotPool(jobGraph.getJobID());
    final Configuration configuration = new Configuration();
    configuration.set(JobManagerOptions.MIN_PARALLELISM_INCREASE, 1);
    configuration.set(JobManagerOptions.RESOURCE_WAIT_TIMEOUT, Duration.ofMillis(10L));
    configuration.set(MetricOptions.JOB_STATUS_METRICS, Arrays.asList(MetricOptions.JobStatusMetrics.TOTAL_TIME));
    final AdaptiveScheduler scheduler = new AdaptiveSchedulerBuilder(jobGraph, singleThreadMainThreadExecutor).setJobMasterConfiguration(configuration).setJobManagerJobMetricGroup(JobManagerMetricGroup.createJobManagerMetricGroup(metricRegistry, "localhost").addJob(new JobID(), "jobName")).setDeclarativeSlotPool(declarativeSlotPool).build();
    final UpTimeGauge upTimeGauge = upTimeMetricFuture.get();
    final DownTimeGauge downTimeGauge = downTimeMetricFuture.get();
    final Gauge<Long> restartTimeGauge = restartTimeMetricFuture.get();
    final SubmissionBufferingTaskManagerGateway taskManagerGateway = new SubmissionBufferingTaskManagerGateway(1 + PARALLELISM);
    taskManagerGateway.setCancelConsumer(createCancelConsumer(scheduler));
    singleThreadMainThreadExecutor.execute(() -> {
        scheduler.startScheduling();
        offerSlots(declarativeSlotPool, createSlotOffersForResourceRequirements(ResourceCounter.withResource(ResourceProfile.UNKNOWN, 1)), taskManagerGateway);
    });
    // wait for the first task submission
    taskManagerGateway.waitForSubmissions(1, Duration.ofSeconds(5));
    // sleep a bit to ensure uptime is > 0
    Thread.sleep(10L);
    assertThat(upTimeGauge.getValue()).isGreaterThan(0L);
    assertThat(downTimeGauge.getValue()).isEqualTo(0L);
    assertThat(restartTimeGauge.getValue()).isEqualTo(0L);
    singleThreadMainThreadExecutor.execute(() -> {
        // offer more slots, which will cause a restart in order to scale up
        offerSlots(declarativeSlotPool, createSlotOffersForResourceRequirements(ResourceCounter.withResource(ResourceProfile.UNKNOWN, 1)), taskManagerGateway);
    });
    // wait for the second task submissions
    taskManagerGateway.waitForSubmissions(2, Duration.ofSeconds(5));
    // sleep a bit to ensure uptime is > 0
    Thread.sleep(10L);
    assertThat(upTimeGauge.getValue()).isGreaterThan(0L);
    assertThat(downTimeGauge.getValue()).isEqualTo(0L);
    // can be zero if the restart is very quick
    assertThat(restartTimeGauge.getValue()).isGreaterThanOrEqualTo(0L);
}
Also used : Arrays(java.util.Arrays) ResourceRequirement(org.apache.flink.runtime.slots.ResourceRequirement) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) ArchivedExecution(org.apache.flink.runtime.executiongraph.ArchivedExecution) TestingSlotAllocator(org.apache.flink.runtime.scheduler.adaptive.allocator.TestingSlotAllocator) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) TestingFatalErrorHandler(org.apache.flink.runtime.util.TestingFatalErrorHandler) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) Duration(java.time.Duration) ClassRule(org.junit.ClassRule) TestingCheckpointRecoveryFactory(org.apache.flink.runtime.checkpoint.TestingCheckpointRecoveryFactory) ManuallyTriggeredComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ManuallyTriggeredComponentMainThreadExecutor) BlockingQueue(java.util.concurrent.BlockingQueue) MetricOptions(org.apache.flink.configuration.MetricOptions) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) Executors(java.util.concurrent.Executors) MetricNames(org.apache.flink.runtime.metrics.MetricNames) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) VertexParallelismStore(org.apache.flink.runtime.scheduler.VertexParallelismStore) Time(org.apache.flink.api.common.time.Time) RootExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.RootExceptionHistoryEntry) FlinkException(org.apache.flink.util.FlinkException) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) UpTimeGauge(org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge) ExceptionHistoryEntry(org.apache.flink.runtime.scheduler.exceptionhistory.ExceptionHistoryEntry) LocalTaskManagerLocation(org.apache.flink.runtime.taskmanager.LocalTaskManagerLocation) ResourceCounter(org.apache.flink.runtime.util.ResourceCounter) JobStatus(org.apache.flink.api.common.JobStatus) DefaultAllocatedSlotPool(org.apache.flink.runtime.jobmaster.slotpool.DefaultAllocatedSlotPool) ArrayList(java.util.ArrayList) SchedulerNG(org.apache.flink.runtime.scheduler.SchedulerNG) DownTimeGauge(org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) Gauge(org.apache.flink.metrics.Gauge) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) BiConsumer(java.util.function.BiConsumer) FixedDelayRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.FixedDelayRestartBackoffTimeStrategy) Nullable(javax.annotation.Nullable) ArchivedExecutionVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionVertex) TestExecutorResource(org.apache.flink.testutils.executor.TestExecutorResource) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointsCleaner(org.apache.flink.runtime.checkpoint.CheckpointsCleaner) TestOperatorEvent(org.apache.flink.runtime.operators.coordination.TestOperatorEvent) Test(org.junit.Test) IOException(java.io.IOException) IterableUtils(org.apache.flink.util.IterableUtils) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) ExecutionException(java.util.concurrent.ExecutionException) TaskExecutionStateTransition(org.apache.flink.runtime.executiongraph.TaskExecutionStateTransition) JobID(org.apache.flink.api.common.JobID) NoRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.NoRestartBackoffTimeStrategy) SlotPoolTestUtils.offerSlots(org.apache.flink.runtime.jobmaster.slotpool.SlotPoolTestUtils.offerSlots) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ArchivedExecutionGraphTest(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraphTest) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TestLogger(org.apache.flink.util.TestLogger) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointIDCounter(org.apache.flink.runtime.checkpoint.CheckpointIDCounter) ArchivedExecutionJobVertex(org.apache.flink.runtime.executiongraph.ArchivedExecutionJobVertex) TestingCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.TestingCompletedCheckpointStore) TestRestartBackoffTimeStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) Collectors(java.util.stream.Collectors) ResourceProfile(org.apache.flink.runtime.clusterframework.types.ResourceProfile) DefaultDeclarativeSlotPool(org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool) List(java.util.List) StandaloneCheckpointIDCounter(org.apache.flink.runtime.checkpoint.StandaloneCheckpointIDCounter) CoordinationRequest(org.apache.flink.runtime.operators.coordination.CoordinationRequest) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Optional(java.util.Optional) SchedulerExecutionMode(org.apache.flink.configuration.SchedulerExecutionMode) KeyGroupRangeAssignment(org.apache.flink.runtime.state.KeyGroupRangeAssignment) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) SchedulerBase(org.apache.flink.runtime.scheduler.SchedulerBase) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) CompletableFuture(java.util.concurrent.CompletableFuture) VertexParallelismInformation(org.apache.flink.runtime.scheduler.VertexParallelismInformation) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) TestingCheckpointIDCounter(org.apache.flink.runtime.checkpoint.TestingCheckpointIDCounter) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) Nonnull(javax.annotation.Nonnull) StandaloneCompletedCheckpointStore(org.apache.flink.runtime.checkpoint.StandaloneCompletedCheckpointStore) DefaultSchedulerTest(org.apache.flink.runtime.scheduler.DefaultSchedulerTest) JobGraphTestUtils.streamingJobGraph(org.apache.flink.runtime.jobgraph.JobGraphTestUtils.streamingJobGraph) ArchivedExecutionGraph(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraph) ArchivedExecutionGraphBuilder(org.apache.flink.runtime.rest.handler.legacy.utils.ArchivedExecutionGraphBuilder) Logger(org.slf4j.Logger) Configuration(org.apache.flink.configuration.Configuration) CompletedCheckpointStore(org.apache.flink.runtime.checkpoint.CompletedCheckpointStore) JobStatusListener(org.apache.flink.runtime.executiongraph.JobStatusListener) DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements(org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPoolTest.createSlotOffersForResourceRequirements) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) ExecutionGraphTestUtils.createNoOpVertex(org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.createNoOpVertex) SchedulerTestingUtils.enableCheckpointing(org.apache.flink.runtime.scheduler.SchedulerTestingUtils.enableCheckpointing) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) TemporaryFolder(org.junit.rules.TemporaryFolder) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) Configuration(org.apache.flink.configuration.Configuration) UpTimeGauge(org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TestingMetricRegistry(org.apache.flink.runtime.metrics.util.TestingMetricRegistry) UpTimeGauge(org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge) DownTimeGauge(org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge) Gauge(org.apache.flink.metrics.Gauge) CompletableFuture(java.util.concurrent.CompletableFuture) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobGraphTestUtils.streamingJobGraph(org.apache.flink.runtime.jobgraph.JobGraphTestUtils.streamingJobGraph) DownTimeGauge(org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge) DefaultDeclarativeSlotPool(org.apache.flink.runtime.jobmaster.slotpool.DefaultDeclarativeSlotPool) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test) ArchivedExecutionGraphTest(org.apache.flink.runtime.executiongraph.ArchivedExecutionGraphTest) DefaultSchedulerTest(org.apache.flink.runtime.scheduler.DefaultSchedulerTest)

Aggregations

DownTimeGauge (org.apache.flink.runtime.executiongraph.metrics.DownTimeGauge)2 UpTimeGauge (org.apache.flink.runtime.executiongraph.metrics.UpTimeGauge)2 IOException (java.io.IOException)1 Duration (java.time.Duration)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 List (java.util.List)1 Optional (java.util.Optional)1 ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue)1 BlockingQueue (java.util.concurrent.BlockingQueue)1 CompletableFuture (java.util.concurrent.CompletableFuture)1 ExecutionException (java.util.concurrent.ExecutionException)1 Executors (java.util.concurrent.Executors)1 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 BiConsumer (java.util.function.BiConsumer)1 Consumer (java.util.function.Consumer)1 Collectors (java.util.stream.Collectors)1