Search in sources :

Example 21 with MetricRegistry

use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.

Class TaskManagerJobGroupTest, method testGenerateScopeCustom.

/**
 * Checks that a custom scope format configured under
 * {@code METRICS_SCOPE_NAMING_TM_JOB} is applied to a {@link TaskManagerJobMetricGroup}:
 * the {@code <job_name>} placeholder must be replaced by the job name, both in the
 * scope components and in the generated metric identifier.
 */
@Test
public void testGenerateScopeCustom() {
    Configuration cfg = new Configuration();
    cfg.setString(ConfigConstants.METRICS_SCOPE_NAMING_TM, "abc");
    cfg.setString(ConfigConstants.METRICS_SCOPE_NAMING_TM_JOB, "some-constant.<job_name>");
    MetricRegistry registry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(cfg));
    try {
        JobID jid = new JobID();
        TaskManagerMetricGroup tmGroup = new TaskManagerMetricGroup(registry, "theHostName", "test-tm-id");
        JobMetricGroup jmGroup = new TaskManagerJobMetricGroup(registry, tmGroup, jid, "myJobName");
        assertArrayEquals(new String[] { "some-constant", "myJobName" }, jmGroup.getScopeComponents());
        assertEquals("some-constant.myJobName.name", jmGroup.getMetricIdentifier("name"));
    } finally {
        // shut the registry down even when an assertion above fails
        registry.shutdown();
    }
}
Also used : MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 22 with MetricRegistry

use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.

Class TaskManagerJobGroupTest, method testCreateQueryServiceMetricInfo.

/**
 * Checks that the query-service scope info created by a {@link TaskManagerJobMetricGroup}
 * has an empty scope and carries the string form of the job id.
 */
@Test
public void testCreateQueryServiceMetricInfo() {
    JobID jid = new JobID();
    MetricRegistry registry = new MetricRegistry(MetricRegistryConfiguration.defaultMetricRegistryConfiguration());
    try {
        TaskManagerMetricGroup tm = new TaskManagerMetricGroup(registry, "host", "id");
        TaskManagerJobMetricGroup job = new TaskManagerJobMetricGroup(registry, tm, jid, "jobname");
        QueryScopeInfo.JobQueryScopeInfo info = job.createQueryServiceMetricInfo(new DummyCharacterFilter());
        assertEquals("", info.scope);
        assertEquals(jid.toString(), info.jobID);
    } finally {
        // the registry was previously never shut down; release it like the sibling tests do
        registry.shutdown();
    }
}
Also used : QueryScopeInfo(org.apache.flink.runtime.metrics.dump.QueryScopeInfo) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) DummyCharacterFilter(org.apache.flink.runtime.metrics.util.DummyCharacterFilter) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 23 with MetricRegistry

use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.

Class YarnTaskExecutorRunner, method runTaskExecutor.

// ------------------------------------------------------------------------
//  Core work method
// ------------------------------------------------------------------------
/**
 * The main work method, must run as a privileged action.
 *
 * <p>Reads the container id (and, if present, the node hostname) from the process
 * environment, creates the high-availability services, heartbeat services and metric
 * registry, then creates and starts a {@link TaskManagerRunner} and blocks until its
 * termination future completes. {@code shutdown()} is called on both the normal and
 * the failure path.
 *
 * @param config the configuration used to create all services; the task manager
 *               hostname key is overwritten when the node id environment variable is set
 * @return The return code for the Java process: {@code 0} after the runner terminated
 *         without an exception, {@code INIT_ERROR_EXIT_CODE} if any {@link Throwable}
 *         was caught.
 */
protected int runTaskExecutor(Configuration config) {
    try {
        // ---- (1) create common services
        // first get the ResourceID; on YARN the resource id is the container id.
        final String containerId = ENV.get(YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID);
        // fail fast (IllegalArgumentException, handled by the catch below) if the variable is missing
        Preconditions.checkArgument(containerId != null, "ContainerId variable %s not set", YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID);
        // use the hostname passed by the job manager, if one was provided
        final String taskExecutorHostname = ENV.get(YarnResourceManager.ENV_FLINK_NODE_ID);
        if (taskExecutorHostname != null) {
            config.setString(ConfigConstants.TASK_MANAGER_HOSTNAME_KEY, taskExecutorHostname);
        }
        ResourceID resourceID = new ResourceID(containerId);
        LOG.info("YARN assigned resource id {} for the task executor.", resourceID.toString());
        // the services below are stored in fields (declared outside this method)
        haServices = HighAvailabilityServicesUtils.createAvailableOrEmbeddedServices(config);
        HeartbeatServices heartbeatServices = HeartbeatServices.fromConfiguration(config);
        metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        // ---- (2) init task manager runner -------
        taskExecutorRpcService = TaskManagerRunner.createRpcService(config, haServices);
        taskManagerRunner = new TaskManagerRunner(config, resourceID, taskExecutorRpcService, haServices, heartbeatServices, metricRegistry);
        // ---- (3) start the task manager runner
        taskManagerRunner.start();
        LOG.debug("YARN task executor started");
        // everything started, we can wait until all is done or the process is killed
        taskManagerRunner.getTerminationFuture().get();
        // the termination future completed normally: log and clean up
        LOG.info("YARN task manager runner finished");
        shutdown();
    } catch (Throwable t) {
        // make sure that everything, whatever it is, ends up in the log
        // NOTE(review): this handler also catches failures raised after start() (e.g. from
        // the termination future or the shutdown() call above), yet the message always
        // says "initialization failed" - confirm whether that is intended.
        LOG.error("YARN task executor initialization failed", t);
        shutdown();
        return INIT_ERROR_EXIT_CODE;
    }
    return 0;
}
Also used : HeartbeatServices(org.apache.flink.runtime.heartbeat.HeartbeatServices) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerRunner(org.apache.flink.runtime.taskexecutor.TaskManagerRunner) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry)

Example 24 with MetricRegistry

use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.

Class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.

/**
 * This test tests that the restarting time metric correctly displays restarting times.
 *
 * <p>The execution graph is driven through RUNNING -> RESTARTING -> RUNNING -> RESTARTING
 * -> FAILED by feeding it task state updates, and at each stage the restarting-time gauge
 * (looked up via a {@link TestingReporter}) is sampled: it must be 0 before the first
 * failure, be non-decreasing and positive while restarting, and stay constant while
 * running or after the job has failed terminally.
 *
 * <p>Both the executor and the metric registry created here are released in the
 * {@code finally} block.
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    // declared outside the try block so that it can be shut down in finally
    MetricRegistry metricRegistry = null;
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        // exactly the one configured reporter must have been instantiated
        assertEquals(1, metricRegistry.getReporters().size());
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        // the mocked scheduler always hands out the same, already completed slot future
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        assertTrue(0L == restartingTime.getValue());
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        // the restart created new execution attempts, so collect their ids again
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        try {
            // release the registry, which was previously leaked by this test
            if (metricRegistry != null) {
                metricRegistry.shutdown();
            }
        } finally {
            // always stop the executor, even if the registry shutdown throws
            executor.shutdownNow();
        }
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) 
JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) Slot(org.apache.flink.runtime.instance.Slot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)

Example 25 with MetricRegistry

use of org.apache.flink.runtime.metrics.MetricRegistry in project flink by apache.

Class StatsDReporterTest, method testStatsDMetersReporting.

/**
 * Tests that meters are properly reported via the StatsD reporter.
 *
 * <p>A local datagram receiver is started, a registry with a {@link StatsDReporter}
 * pointing at the receiver's port is created, and a {@link TestMeter} is registered on a
 * task manager metric group. The lines collected by the receiver must equal the expected
 * rate and count lines for that meter.
 */
@Test
public void testStatsDMetersReporting() throws Exception {
    final String meterName = "meter";
    final long lineWaitTimeout = 5000;
    final long threadJoinTimeout = 30000;

    DatagramSocketReceiver socketReceiver = null;
    Thread socketReceiverThread = null;
    MetricRegistry metricRegistry = null;
    try {
        socketReceiver = new DatagramSocketReceiver();
        socketReceiverThread = new Thread(socketReceiver);
        socketReceiverThread.start();
        final int receiverPort = socketReceiver.getPort();

        // all reporter options share the "<prefix>test." key prefix
        final String reporterKeyPrefix = ConfigConstants.METRICS_REPORTER_PREFIX + "test.";
        final Configuration reporterConfig = new Configuration();
        reporterConfig.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        reporterConfig.setString(reporterKeyPrefix + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, StatsDReporter.class.getName());
        reporterConfig.setString(reporterKeyPrefix + ConfigConstants.METRICS_REPORTER_INTERVAL_SUFFIX, "1 SECONDS");
        reporterConfig.setString(reporterKeyPrefix + "host", "localhost");
        reporterConfig.setString(reporterKeyPrefix + "port", Integer.toString(receiverPort));

        metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(reporterConfig));
        final TaskManagerMetricGroup tmGroup = new TaskManagerMetricGroup(metricRegistry, "localhost", "tmId");
        tmGroup.meter(meterName, new TestMeter());

        // the reporter is expected to emit one rate line and one count line for the meter
        final String identifier = tmGroup.getMetricIdentifier(meterName);
        final Set<String> expected = new HashSet<>();
        expected.add(identifier + ".rate:5.0|g");
        expected.add(identifier + ".count:100|g");

        socketReceiver.waitUntilNumLines(expected.size(), lineWaitTimeout);
        final Set<String> received = socketReceiver.getLines();
        assertEquals(expected, received);
    } finally {
        // tear down in the same order as before: registry, receiver, receiver thread
        if (metricRegistry != null) {
            metricRegistry.shutdown();
        }
        if (socketReceiver != null) {
            socketReceiver.stop();
        }
        if (socketReceiverThread != null) {
            socketReceiverThread.join(threadJoinTimeout);
        }
    }
}
Also used : TestMeter(org.apache.flink.metrics.util.TestMeter) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) TaskManagerMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup) HashSet(java.util.HashSet) Test(org.junit.Test)

Aggregations

MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)53 Test (org.junit.Test)47 Configuration (org.apache.flink.configuration.Configuration)27 JobID (org.apache.flink.api.common.JobID)26 MetricRegistryConfiguration (org.apache.flink.runtime.metrics.MetricRegistryConfiguration)25 AbstractID (org.apache.flink.util.AbstractID)13 TaskManagerMetricGroup (org.apache.flink.runtime.metrics.groups.TaskManagerMetricGroup)12 MetricReporter (org.apache.flink.metrics.reporter.MetricReporter)7 QueryScopeInfo (org.apache.flink.runtime.metrics.dump.QueryScopeInfo)7 DummyCharacterFilter (org.apache.flink.runtime.metrics.util.DummyCharacterFilter)7 Counter (org.apache.flink.metrics.Counter)4 MetricGroup (org.apache.flink.metrics.MetricGroup)4 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)4 ActorRef (akka.actor.ActorRef)3 UUID (java.util.UUID)3 ObjectName (javax.management.ObjectName)3 Time (org.apache.flink.api.common.time.Time)3 Gauge (org.apache.flink.metrics.Gauge)3 SimpleCounter (org.apache.flink.metrics.SimpleCounter)3