Search in sources :

Example 1 with Metric

use of org.apache.flink.metrics.Metric in project flink by apache.

the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.

/**
 * Verifies the behaviour of the gauge registered under
 * {@code ExecutionGraph.RESTARTING_TIME_METRIC_NAME} across the job states of an
 * {@code ExecutionGraph}.
 *
 * <p>The test walks the graph through the following phases and checks the gauge in each:
 * <ol>
 *   <li>initial / scheduled / RUNNING: the gauge reports {@code 0};</li>
 *   <li>RESTARTING (after failing all tasks): the gauge is non-decreasing and greater than
 *       {@code 0} after a short sleep;</li>
 *   <li>RUNNING again (after the restart): the gauge no longer changes;</li>
 *   <li>RESTARTING a second time: the gauge is non-decreasing and greater than {@code 0};</li>
 *   <li>FAILED (via a {@code SuppressRestartsException}): the gauge no longer changes.</li>
 * </ol>
 *
 * <p>Scheduling and slot allocation are replaced by Mockito mocks; the executor created at the
 * start of the test is shut down in the {@code finally} block.
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        // configure a single metric reporter named "test" that is backed by TestingReporter
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        // sanity check: exactly the configured TestingReporter must have been instantiated
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        // mock the task manager side: location, gateway and the instance exposing them
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        // mock a live slot that accepts any execution and points at the mocked task manager
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        // the scheduler hands out the mocked slot through an already completed future
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        // every task submission is acknowledged immediately
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        // NOTE(review): TestingRestartStrategy presumably defers the restart until
        // restartExecutionGraph() is invoked below - confirm against its definition
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        // scheduling alone must not produce any restarting time
        assertTrue(0L == restartingTime.getValue());
        // collect the attempt ids of the current executions so their state can be updated
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        // still no restart has happened, so the gauge must remain 0
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        // remember when the first restart began; compared with the second restart further down
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        // the attempt ids collected earlier are discarded and re-read from the graph after the restart
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // report the current attempts as running again --> job status switches back to running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        // the RESTARTING timestamp recorded before the restart must have been set
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        // the second restart must carry a different RESTARTING timestamp than the first one
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        // wait again so that the gauge can show a value different from 0
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        // in the terminal FAILED state the gauge must report a constant value
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        // always stop the executor created for this test, even when an assertion fails
        executor.shutdownNow();
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) 
JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) Slot(org.apache.flink.runtime.instance.Slot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)

Example 2 with Metric

use of org.apache.flink.metrics.Metric in project flink by apache.

the class AbstractMetricGroup method addMetric.

/**
 * Adds the given metric to the group and registers it at the registry, if the group is not yet
 * closed, and if no metric with the same name has been registered before.
 *
 * <p>Misuse never fails the caller: a {@code null} metric, a name that is already taken by
 * another metric, or a name that collides with a subgroup only results in a warning being
 * logged. When the name is already taken by another metric, the previously stored metric is
 * kept and the new one is not registered. When the group is already closed, the call is a
 * silent no-op.
 *
 * @param name the name to register the metric under
 * @param metric the metric to register
 */
protected void addMetric(String name, Metric metric) {
    if (metric == null) {
        LOG.warn("Ignoring attempted registration of a metric due to being null for name {}.", name);
        return;
    }
    synchronized (this) {
        // add the metric only if the group is still open
        if (closed) {
            return;
        }
        // immediately put without a 'contains' check to optimize the common case (no
        // collision); a collision is detected through the returned prior value
        Metric prior = metrics.put(name, metric);
        if (prior != null) {
            // we had a collision. put back the original value
            metrics.put(name, prior);
            // we warn here, rather than failing, because metrics are tools that should not
            // fail the program when used incorrectly
            LOG.warn("Name collision: Group already contains a Metric with the name '{}'. Metric will not be reported. {}", name, Arrays.toString(scopeComponents));
            return;
        }
        // check for collisions with subgroup names; the metric is still registered in that case
        if (groups.containsKey(name)) {
            // we warn here, rather than failing, because metrics are tools that should not
            // fail the program when used incorrectly
            LOG.warn("Name collision: Adding a metric with the same name as a metric subgroup: '{}'. Metric might not get properly reported. {}", name, Arrays.toString(scopeComponents));
        }
        registry.register(metric, name, this);
    }
}
Also used : Metric(org.apache.flink.metrics.Metric)

Example 3 with Metric

use of org.apache.flink.metrics.Metric in project flink by apache.

the class SourceMetricsITCase method assertSourceMetrics.

/**
 * Asserts the metrics reported by every subtask of the operator named "MetricTestingSource".
 *
 * <p>Subtasks whose records-in counter is still zero are only checked for the absence of the
 * optional lag metrics. All other subtasks are checked for their I/O counters, error counter,
 * lag gauges (depending on {@code hasTimestamps}), pending records/bytes and idle time.
 *
 * @param jobId the job whose operator metric groups are looked up
 * @param reporter the reporter that collected the metrics
 * @param processedRecordsPerSubtask expected number of records processed by an active subtask
 * @param numTotalPerSubtask total number of records available to an active subtask
 * @param parallelism expected number of operator metric groups
 * @param numSplits expected number of subtasks that actually processed records
 * @param hasTimestamps whether event-time lag and watermark lag are expected to be reported
 */
private void assertSourceMetrics(JobID jobId, InMemoryReporter reporter, long processedRecordsPerSubtask, long numTotalPerSubtask, int parallelism, int numSplits, boolean hasTimestamps) {
    List<OperatorMetricGroup> operatorGroups = reporter.findOperatorMetricGroups(jobId, "MetricTestingSource");
    assertThat(operatorGroups, hasSize(parallelism));

    // expected values that are identical for every active subtask
    final long expectedBytesIn = processedRecordsPerSubtask * MockRecordEmitter.RECORD_SIZE_IN_BYTES;
    // MockRecordEmitter is just incrementing errors every even record
    final long expectedErrors = processedRecordsPerSubtask / 2;
    final long expectedPendingRecords = numTotalPerSubtask - processedRecordsPerSubtask;
    final long expectedPendingBytes = expectedPendingRecords * MockRecordEmitter.RECORD_SIZE_IN_BYTES;

    int activeSubtasks = 0;
    for (OperatorMetricGroup operatorGroup : operatorGroups) {
        Map<String, Metric> reported = reporter.getMetricsByGroup(operatorGroup);

        // subtasks without an assigned split never receive records and do not update metrics
        if (group(operatorGroup).getNumRecordsInCounter().getCount() == 0) {
            // assert that optional metrics are not initialized when no split assigned
            assertThat(reported.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG), isGauge(equalTo(InternalSourceReaderMetricGroup.UNDEFINED)));
            assertThat(reported.get(MetricNames.WATERMARK_LAG), nullValue());
            continue;
        }
        activeSubtasks++;

        // I/O metrics
        assertThat(operatorGroup.getIOMetricGroup().getNumRecordsInCounter(), isCounter(equalTo(processedRecordsPerSubtask)));
        assertThat(operatorGroup.getIOMetricGroup().getNumBytesInCounter(), isCounter(equalTo(expectedBytesIn)));
        assertThat(reported.get(MetricNames.NUM_RECORDS_IN_ERRORS), isCounter(equalTo(expectedErrors)));

        if (!hasTimestamps) {
            // assert that optional metrics are not initialized when no timestamp assigned
            assertThat(reported.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG), isGauge(equalTo(InternalSourceReaderMetricGroup.UNDEFINED)));
            assertThat(reported.get(MetricNames.WATERMARK_LAG), nullValue());
        } else {
            // Timestamp assigner subtracting EVENTTIME_LAG from wall clock
            assertThat(reported.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG), isGauge(isCloseTo(EVENTTIME_LAG, EVENTTIME_EPSILON)));
            // Watermark is derived from timestamp, so it has to be in the same order of
            // magnitude
            assertThat(reported.get(MetricNames.WATERMARK_LAG), isGauge(isCloseTo(EVENTTIME_LAG, EVENTTIME_EPSILON)));
            // Calculate the additional watermark lag (on top of event time lag)
            long reportedWatermarkLag = ((Gauge<Long>) reported.get(MetricNames.WATERMARK_LAG)).getValue();
            long reportedEventTimeLag = ((Gauge<Long>) reported.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG)).getValue();
            Long additionalWatermarkLag = reportedWatermarkLag - reportedEventTimeLag;
            // That should correspond to the out-of-order boundedness
            assertThat(additionalWatermarkLag, isCloseTo(WATERMARK_LAG, WATERMARK_EPSILON));
        }

        assertThat(reported.get(MetricNames.PENDING_RECORDS), isGauge(equalTo(expectedPendingRecords)));
        assertThat(reported.get(MetricNames.PENDING_BYTES), isGauge(equalTo(expectedPendingBytes)));
        // test is keeping source idle time metric busy with the barrier
        assertThat(reported.get(MetricNames.SOURCE_IDLE_TIME), isGauge(equalTo(0L)));
    }
    assertThat(activeSubtasks, equalTo(numSplits));
}
Also used : Metric(org.apache.flink.metrics.Metric) OperatorMetricGroup(org.apache.flink.metrics.groups.OperatorMetricGroup)

Example 4 with Metric

use of org.apache.flink.metrics.Metric in project flink by apache.

the class AlignedWatermarksITCase method testAlignment.

/**
 * Submits the job and polls the watermark alignment drift gauge of the fast source until the
 * reported drift drops below {@code MAX_DRIFT}, asserting on every poll that the drift never
 * grows compared to the previously observed value.
 */
@Test
public void testAlignment(@InjectMiniCluster MiniCluster miniCluster) throws Exception {
    final JobGraph jobGraph = getJobGraph();
    final CompletableFuture<JobSubmissionResult> submission = miniCluster.submitJob(jobGraph);
    final JobID jobID = submission.get().getJobID();
    CommonTestUtils.waitForAllTaskRunning(miniCluster, jobID, false);

    long lastObservedDrift = Long.MAX_VALUE;
    boolean driftTooLarge = true;
    while (driftTooLarge) {
        final Optional<Metric> driftMetric = reporter.findMetric(jobID, FAST_SOURCE_NAME + ".*" + MetricNames.WATERMARK_ALIGNMENT_DRIFT);
        // give the job some time to make progress before reading the gauge
        Thread.sleep(200);
        // a missing metric or a gauge without a value is skipped for this round
        final Long currentDrift = driftMetric.map(m -> ((Gauge<Long>) m).getValue()).orElse(null);
        if (currentDrift != null) {
            assertThat(currentDrift).isLessThanOrEqualTo(lastObservedDrift);
            lastObservedDrift = currentDrift;
        }
        driftTooLarge = lastObservedDrift >= MAX_DRIFT;
    }
}
Also used : JobSubmissionResult(org.apache.flink.api.common.JobSubmissionResult) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Metric(org.apache.flink.metrics.Metric) JobID(org.apache.flink.api.common.JobID) Test(org.junit.jupiter.api.Test)

Example 5 with Metric

use of org.apache.flink.metrics.Metric in project flink by apache.

the class NettyShuffleEnvironmentTest method testRegisteringDebloatingMetrics.

/**
 * Checks that creating input gates with buffer debloating enabled registers, for each of the
 * two gates, a debloated-buffer-size gauge reporting the default memory segment size and an
 * estimated-time-to-consume-buffers gauge reporting {@code 0}.
 */
@Test
@SuppressWarnings("unchecked")
public void testRegisteringDebloatingMetrics() throws IOException {
    // metrics registered on the task metric group end up in this map
    Map<String, Metric> metrics = new ConcurrentHashMap<>();
    final TaskMetricGroup taskMetricGroup = createTaskMetricGroup(metrics);

    final Configuration config = new Configuration();
    config.set(TaskManagerOptions.BUFFER_DEBLOAT_ENABLED, true);
    final NettyShuffleEnvironment shuffleEnvironment =
            new NettyShuffleEnvironmentBuilder()
                    .setDebloatConfig(BufferDebloatConfiguration.fromConfiguration(config))
                    .build();

    // two pipelined gates (consumed subpartition index 0 and 1), each with one remote channel
    final InputGateDeploymentDescriptor firstGate =
            new InputGateDeploymentDescriptor(
                    new IntermediateDataSetID(),
                    ResultPartitionType.PIPELINED,
                    0,
                    new ShuffleDescriptor[] { new NettyShuffleDescriptorBuilder().buildRemote() });
    final InputGateDeploymentDescriptor secondGate =
            new InputGateDeploymentDescriptor(
                    new IntermediateDataSetID(),
                    ResultPartitionType.PIPELINED,
                    1,
                    new ShuffleDescriptor[] { new NettyShuffleDescriptorBuilder().buildRemote() });

    shuffleEnvironment.createInputGates(
            shuffleEnvironment.createShuffleIOOwnerContext("test", new ExecutionAttemptID(), taskMetricGroup),
            (dsid, id, consumer) -> {
            },
            Arrays.asList(firstGate, secondGate));

    final long expectedBufferSize = TaskManagerOptions.MEMORY_SEGMENT_SIZE.defaultValue().getBytes();
    for (int gateIndex = 0; gateIndex < 2; gateIndex++) {
        long debloatedBufferSize = (long) ((Gauge<Integer>) getDebloatingMetric(metrics, gateIndex, MetricNames.DEBLOATED_BUFFER_SIZE)).getValue();
        assertEquals(expectedBufferSize, debloatedBufferSize);

        long estimatedTimeToConsume = (long) ((Gauge<Long>) getDebloatingMetric(metrics, gateIndex, MetricNames.ESTIMATED_TIME_TO_CONSUME_BUFFERS)).getValue();
        assertEquals(0L, estimatedTimeToConsume);
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) NettyShuffleDescriptorBuilder(org.apache.flink.runtime.util.NettyShuffleDescriptorBuilder) BufferDebloatConfiguration(org.apache.flink.runtime.throughput.BufferDebloatConfiguration) Configuration(org.apache.flink.configuration.Configuration) TaskMetricGroup(org.apache.flink.runtime.metrics.groups.TaskMetricGroup) InputGateDeploymentDescriptor(org.apache.flink.runtime.deployment.InputGateDeploymentDescriptor) Gauge(org.apache.flink.metrics.Gauge) Metric(org.apache.flink.metrics.Metric) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) Test(org.junit.Test)

Aggregations

Metric (org.apache.flink.metrics.Metric): 14, ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 7, TaskMetricGroup (org.apache.flink.runtime.metrics.groups.TaskMetricGroup): 6, Test (org.junit.Test): 6, Gauge (org.apache.flink.metrics.Gauge): 4, OperatorMetricGroup (org.apache.flink.metrics.groups.OperatorMetricGroup): 4, InterceptingTaskMetricGroup (org.apache.flink.runtime.metrics.util.InterceptingTaskMetricGroup): 4, Configuration (org.apache.flink.configuration.Configuration): 3, MetricGroup (org.apache.flink.metrics.MetricGroup): 3, IOException (java.io.IOException): 2, JobID (org.apache.flink.api.common.JobID): 2, MetricReporter (org.apache.flink.metrics.reporter.MetricReporter): 2, JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 2, StreamConfig (org.apache.flink.streaming.api.graph.StreamConfig): 2, Status (akka.actor.Status): 1, SimpleEntry (java.util.AbstractMap.SimpleEntry): 1, ArrayDeque (java.util.ArrayDeque): 1, ArrayList (java.util.ArrayList): 1, Arrays (java.util.Arrays): 1, Collections (java.util.Collections): 1