Example use of org.apache.flink.metrics.Metric in the Apache Flink project.
Class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.
/**
* Tests that the restarting time metric correctly reports restarting times.
*
* <p>The test drives an {@code ExecutionGraph} through the job states
* RUNNING, RESTARTING, RUNNING, RESTARTING and finally FAILED, and reads the gauge registered
* under {@code ExecutionGraph.RESTARTING_TIME_METRIC_NAME} after each transition. It asserts that
* the gauge reports 0 before the first restart, does not decrease (and is greater than 0 after a
* short sleep) while the job is RESTARTING, and no longer changes once the job is RUNNING again
* or has FAILED.
*
* @throws JobException declared by the test; not thrown explicitly in this method
* @throws IOException declared by the test; not thrown explicitly in this method
* @throws InterruptedException if the thread is interrupted during one of the {@code Thread.sleep} calls
*/
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
// single-threaded executor handed to the ExecutionGraph; shut down in the finally block
final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
try {
// setup execution graph with mocked scheduling logic
int parallelism = 1;
JobVertex jobVertex = new JobVertex("TestVertex");
jobVertex.setParallelism(parallelism);
jobVertex.setInvokableClass(NoOpInvokable.class);
JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
// configure a single metric reporter named "test" backed by TestingReporter
Configuration config = new Configuration();
config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
Configuration jobConfig = new Configuration();
Time timeout = Time.seconds(10L);
MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
// exactly one reporter must have been instantiated, and it must be the TestingReporter
assertTrue(metricRegistry.getReporters().size() == 1);
MetricReporter reporter = metricRegistry.getReporters().get(0);
assertTrue(reporter instanceof TestingReporter);
TestingReporter testingReporter = (TestingReporter) reporter;
MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
// mock the scheduler and the task manager side so that no real resources are needed
Scheduler scheduler = mock(Scheduler.class);
ResourceID taskManagerId = ResourceID.generate();
TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
when(taskManagerLocation.getHostname()).thenReturn("localhost");
TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
// NOTE(review): this Instance mock is stubbed but not referenced again in this method
Instance instance = mock(Instance.class);
when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(instance.getTaskManagerID()).thenReturn(taskManagerId);
when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
Slot rootSlot = mock(Slot.class);
AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
// the slot handed out by the mocked scheduler: alive and accepting any execution
SimpleSlot simpleSlot = mock(SimpleSlot.class);
when(simpleSlot.isAlive()).thenReturn(true);
when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
when(simpleSlot.getRoot()).thenReturn(rootSlot);
when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
// every slot allocation request is answered with the same, already completed future
FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
future.complete(simpleSlot);
when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
when(rootSlot.getSlotNumber()).thenReturn(0);
// task submission is acknowledged immediately
when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
// restart strategy whose restart is triggered explicitly by the test (see restartExecutionGraph() below)
TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
// get restarting time metric
Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
assertNotNull(metric);
assertTrue(metric instanceof Gauge);
@SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
// check that the restarting time is 0 since it's the initial start
assertTrue(0L == restartingTime.getValue());
executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
// start execution
executionGraph.scheduleForExecution();
// scheduling alone must not produce any restarting time
assertTrue(0L == restartingTime.getValue());
// collect the attempt ids of the current executions so that their state can be updated
List<ExecutionAttemptID> executionIDs = new ArrayList<>();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
// tell execution graph that the tasks are in state running --> job status switches to state running
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
assertTrue(0L == restartingTime.getValue());
// fail the job so that it goes into state restarting
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
// remembered to verify later that a second restart records a different timestamp
long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
// wait some time so that the restarting time gauge shows a value different from 0
Thread.sleep(50);
long previousRestartingTime = restartingTime.getValue();
// check that the restarting time is monotonically increasing
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// check that we have measured some restarting time
assertTrue(previousRestartingTime > 0);
// restart job
testingRestartStrategy.restartExecutionGraph();
// the attempt ids are re-read from the graph after the restart
executionIDs.clear();
for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
}
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
}
assertEquals(JobStatus.RUNNING, executionGraph.getState());
// the timestamp captured for the first RESTARTING transition must have been set
assertTrue(firstRestartingTimestamp != 0);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time does not increase after we've reached the running state
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
// fail job again
for (ExecutionAttemptID executionID : executionIDs) {
executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
}
assertEquals(JobStatus.RESTARTING, executionGraph.getState());
// the second RESTARTING transition must record a timestamp different from the first one
long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
// wait again so that the gauge can show a non-zero value for the second restart
Thread.sleep(50);
previousRestartingTime = restartingTime.getValue();
// check that the restarting time is increasing again
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime >= previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
assertTrue(previousRestartingTime > 0);
// now let's fail the job while it is in restarting and see whether the restarting time then stops to increase
// for this to work, we have to use a SuppressRestartsException
executionGraph.fail(new SuppressRestartsException(new Exception()));
assertEquals(JobStatus.FAILED, executionGraph.getState());
previousRestartingTime = restartingTime.getValue();
// check that the restarting time stays constant once the job has failed
for (int i = 0; i < 10; i++) {
long currentRestartingTime = restartingTime.getValue();
assertTrue(currentRestartingTime == previousRestartingTime);
previousRestartingTime = currentRestartingTime;
}
} finally {
// always stop the executor, even if an assertion above failed
executor.shutdownNow();
}
}
Example use of org.apache.flink.metrics.Metric in the Apache Flink project.
Class AbstractMetricGroup, method addMetric.
/**
 * Adds the given metric to the group and registers it at the registry, if the group is not yet
 * closed, and if no metric with the same name has been registered before.
 *
 * <p>A {@code null} metric is ignored with a warning. Name collisions never throw: a collision
 * with an existing metric keeps the existing metric and drops the new one, while a collision
 * with a subgroup name only logs a warning and still registers the metric.
 *
 * @param name the name to register the metric under
 * @param metric the metric to register
 */
protected void addMetric(String name, Metric metric) {
    if (metric == null) {
        LOG.warn("Ignoring attempted registration of a metric due to being null for name {}.", name);
        return;
    }
    // add the metric only if the group is still open
    synchronized (this) {
        if (closed) {
            return;
        }
        // immediately put without a 'contains' check to optimize the common case (no
        // collision); collisions are resolved afterwards
        Metric prior = metrics.put(name, metric);
        if (prior != null) {
            // we had a collision. put back the original value
            metrics.put(name, prior);
            // we warn here, rather than failing, because metrics are tools that should not
            // fail the program when used incorrectly
            LOG.warn(
                    "Name collision: Group already contains a Metric with the name '{}'. Metric will not be reported. {}",
                    name,
                    Arrays.toString(scopeComponents));
            return;
        }
        // check for collisions with subgroup names
        if (groups.containsKey(name)) {
            // we warn here, rather than failing, because metrics are tools that should
            // not fail the program when used incorrectly
            LOG.warn(
                    "Name collision: Adding a metric with the same name as a metric subgroup: '{}'. Metric might not get properly reported. {}",
                    name,
                    Arrays.toString(scopeComponents));
        }
        registry.register(metric, name, this);
    }
}
Example use of org.apache.flink.metrics.Metric in the Apache Flink project.
Class SourceMetricsITCase, method assertSourceMetrics.
/**
 * Verifies the metrics reported by every subtask of the operator named "MetricTestingSource".
 *
 * <p>Subtasks whose records-in counter is 0 are only checked for uninitialized optional metrics.
 * All other subtasks are checked for their I/O counters, error counter, event-time and watermark
 * lag gauges (depending on {@code hasTimestamps}), pending records/bytes and idle time. Finally
 * the number of subtasks that did report records must equal {@code numSplits}.
 *
 * @param jobId id of the job whose operator metric groups are inspected
 * @param reporter reporter used to look up the metric groups and their metrics
 * @param processedRecordsPerSubtask expected records-in count of each subtask that read records
 * @param numTotalPerSubtask total records per subtask, used to derive the pending records
 * @param parallelism expected number of operator metric groups
 * @param numSplits expected number of subtasks that read at least one record
 * @param hasTimestamps whether the event-time lag and watermark lag gauges are expected to be set
 */
private void assertSourceMetrics(JobID jobId, InMemoryReporter reporter, long processedRecordsPerSubtask, long numTotalPerSubtask, int parallelism, int numSplits, boolean hasTimestamps) {
    List<OperatorMetricGroup> groups = reporter.findOperatorMetricGroups(jobId, "MetricTestingSource");
    assertThat(groups, hasSize(parallelism));

    int activeSubtasks = 0;
    for (OperatorMetricGroup group : groups) {
        Map<String, Metric> metrics = reporter.getMetricsByGroup(group);

        // a subtask that has read no records (no split was assigned to it) must not have
        // initialized the optional metrics
        if (group.getIOMetricGroup().getNumRecordsInCounter().getCount() == 0) {
            assertOptionalMetricsUndefined(metrics);
            continue;
        }
        activeSubtasks++;

        // I/O metrics
        assertThat(group.getIOMetricGroup().getNumRecordsInCounter(), isCounter(equalTo(processedRecordsPerSubtask)));
        assertThat(group.getIOMetricGroup().getNumBytesInCounter(), isCounter(equalTo(processedRecordsPerSubtask * MockRecordEmitter.RECORD_SIZE_IN_BYTES)));
        // MockRecordEmitter counts an error for every second record
        assertThat(metrics.get(MetricNames.NUM_RECORDS_IN_ERRORS), isCounter(equalTo(processedRecordsPerSubtask / 2)));

        if (!hasTimestamps) {
            // without assigned timestamps the optional metrics stay uninitialized
            assertOptionalMetricsUndefined(metrics);
        } else {
            // the timestamp assigner subtracts EVENTTIME_LAG from the wall clock
            assertThat(metrics.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG), isGauge(isCloseTo(EVENTTIME_LAG, EVENTTIME_EPSILON)));
            // the watermark is derived from the timestamp, so it is in the same order of
            // magnitude
            assertThat(metrics.get(MetricNames.WATERMARK_LAG), isGauge(isCloseTo(EVENTTIME_LAG, EVENTTIME_EPSILON)));
            // watermark lag on top of the event time lag ...
            Long additionalWatermarkLag =
                    ((Gauge<Long>) metrics.get(MetricNames.WATERMARK_LAG)).getValue()
                            - ((Gauge<Long>) metrics.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG)).getValue();
            // ... which should correspond to the out-of-order boundedness
            assertThat(additionalWatermarkLag, isCloseTo(WATERMARK_LAG, WATERMARK_EPSILON));
        }

        long remainingRecords = numTotalPerSubtask - processedRecordsPerSubtask;
        assertThat(metrics.get(MetricNames.PENDING_RECORDS), isGauge(equalTo(remainingRecords)));
        assertThat(metrics.get(MetricNames.PENDING_BYTES), isGauge(equalTo(remainingRecords * MockRecordEmitter.RECORD_SIZE_IN_BYTES)));
        // the test keeps the source busy with the barrier, so no idle time is expected
        assertThat(metrics.get(MetricNames.SOURCE_IDLE_TIME), isGauge(equalTo(0L)));
    }
    assertThat(activeSubtasks, equalTo(numSplits));
}

/** Asserts that the event-time lag gauge is still UNDEFINED and that no watermark lag metric exists. */
private static void assertOptionalMetricsUndefined(Map<String, Metric> metrics) {
    assertThat(metrics.get(MetricNames.CURRENT_EMIT_EVENT_TIME_LAG), isGauge(equalTo(InternalSourceReaderMetricGroup.UNDEFINED)));
    assertThat(metrics.get(MetricNames.WATERMARK_LAG), nullValue());
}
Example use of org.apache.flink.metrics.Metric in the Apache Flink project.
Class AlignedWatermarksITCase, method testAlignment.
/**
 * Submits the job and polls the watermark alignment drift gauge of the fast source until it
 * drops below {@code MAX_DRIFT}, asserting that each observed value is not larger than the
 * previously observed one.
 *
 * @param miniCluster the injected mini cluster the job is submitted to
 * @throws Exception if job submission, waiting for the tasks, or sleeping fails
 */
@Test
public void testAlignment(@InjectMiniCluster MiniCluster miniCluster) throws Exception {
    final JobGraph jobGraph = getJobGraph();
    final JobID jobID = miniCluster.submitJob(jobGraph).get().getJobID();
    CommonTestUtils.waitForAllTaskRunning(miniCluster, jobID, false);

    final String driftMetricPattern = FAST_SOURCE_NAME + ".*" + MetricNames.WATERMARK_ALIGNMENT_DRIFT;
    long lastDrift = Long.MAX_VALUE;
    do {
        final Optional<Metric> driftMetric = reporter.findMetric(jobID, driftMetricPattern);
        Thread.sleep(200);
        // the gauge is read after the sleep; a missing metric or value is skipped for this round
        final Long currentDrift = driftMetric.map(metric -> ((Gauge<Long>) metric).getValue()).orElse(null);
        if (currentDrift != null) {
            assertThat(currentDrift).isLessThanOrEqualTo(lastDrift);
            lastDrift = currentDrift;
        }
    } while (lastDrift >= MAX_DRIFT);
}
Example use of org.apache.flink.metrics.Metric in the Apache Flink project.
Class NettyShuffleEnvironmentTest, method testRegisteringDebloatingMetrics.
/**
 * Creates two input gates with buffer debloating enabled and checks that, for each gate, the
 * debloated buffer size gauge reports the default memory segment size and the estimated time to
 * consume buffers gauge reports 0.
 *
 * @throws IOException declared by the test; propagated from the calls made in this method
 */
@Test
@SuppressWarnings("unchecked")
public void testRegisteringDebloatingMetrics() throws IOException {
    // metrics registered through the task metric group are collected in this map
    Map<String, Metric> registeredMetrics = new ConcurrentHashMap<>();
    final TaskMetricGroup taskMetricGroup = createTaskMetricGroup(registeredMetrics);

    final Configuration config = new Configuration();
    config.set(TaskManagerOptions.BUFFER_DEBLOAT_ENABLED, true);
    final BufferDebloatConfiguration debloatConfig = BufferDebloatConfiguration.fromConfiguration(config);
    final NettyShuffleEnvironment shuffleEnvironment =
            new NettyShuffleEnvironmentBuilder().setDebloatConfig(debloatConfig).build();

    // two pipelined input gates (consumed subpartition index 0 and 1), each with one remote channel
    shuffleEnvironment.createInputGates(
            shuffleEnvironment.createShuffleIOOwnerContext("test", new ExecutionAttemptID(), taskMetricGroup),
            (dsid, id, consumer) -> {
            },
            Arrays.asList(
                    new InputGateDeploymentDescriptor(
                            new IntermediateDataSetID(),
                            ResultPartitionType.PIPELINED,
                            0,
                            new ShuffleDescriptor[] { new NettyShuffleDescriptorBuilder().buildRemote() }),
                    new InputGateDeploymentDescriptor(
                            new IntermediateDataSetID(),
                            ResultPartitionType.PIPELINED,
                            1,
                            new ShuffleDescriptor[] { new NettyShuffleDescriptorBuilder().buildRemote() })));

    final long expectedBufferSize = TaskManagerOptions.MEMORY_SEGMENT_SIZE.defaultValue().getBytes();
    for (int gateIndex = 0; gateIndex < 2; gateIndex++) {
        Gauge<Integer> bufferSizeGauge =
                (Gauge<Integer>) getDebloatingMetric(registeredMetrics, gateIndex, MetricNames.DEBLOATED_BUFFER_SIZE);
        assertEquals(expectedBufferSize, (long) bufferSizeGauge.getValue());

        Gauge<Long> consumeTimeGauge =
                (Gauge<Long>) getDebloatingMetric(registeredMetrics, gateIndex, MetricNames.ESTIMATED_TIME_TO_CONSUME_BUFFERS);
        assertEquals(0L, (long) consumeTimeGauge.getValue());
    }
}
Aggregations