Search in sources :

Example 16 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class ExecutionGraphMetricsTest method testExecutionGraphRestartTimeMetric.

/**
	 * This test tests that the restarting time metric correctly displays restarting times.
	 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        assertTrue(0L == restartingTime.getValue());
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now lets fail the job while it is in restarting and see whether the restarting time then stops to increase
        // for this to work, we have to use a SuppressRestartException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        executor.shutdownNow();
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) Slot(org.apache.flink.runtime.instance.Slot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)

Example 17 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class AvailableSlotsTest method testPollFreeSlot.

@Test
public void testPollFreeSlot() {
    SlotPool.AvailableSlots availableSlots = new SlotPool.AvailableSlots();
    final ResourceID resource1 = new ResourceID("resource1");
    final AllocatedSlot slot1 = createAllocatedSlot(resource1);
    availableSlots.add(slot1, 1L);
    assertEquals(1, availableSlots.size());
    assertTrue(availableSlots.contains(slot1.getSlotAllocationId()));
    assertTrue(availableSlots.containsTaskManager(resource1));
    assertNull(availableSlots.poll(DEFAULT_TESTING_BIG_PROFILE, null));
    SlotAndLocality slotAndLocality = availableSlots.poll(DEFAULT_TESTING_PROFILE, null);
    assertEquals(slot1, slotAndLocality.slot());
    assertEquals(0, availableSlots.size());
    assertFalse(availableSlots.contains(slot1.getSlotAllocationId()));
    assertFalse(availableSlots.containsTaskManager(resource1));
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) SlotAndLocality(org.apache.flink.runtime.jobmanager.slots.SlotAndLocality) Test(org.junit.Test)

Example 18 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class AvailableSlotsTest method testAddAndRemove.

@Test
public void testAddAndRemove() throws Exception {
    SlotPool.AvailableSlots availableSlots = new SlotPool.AvailableSlots();
    final ResourceID resource1 = new ResourceID("resource1");
    final ResourceID resource2 = new ResourceID("resource2");
    final AllocatedSlot slot1 = createAllocatedSlot(resource1);
    final AllocatedSlot slot2 = createAllocatedSlot(resource1);
    final AllocatedSlot slot3 = createAllocatedSlot(resource2);
    availableSlots.add(slot1, 1L);
    availableSlots.add(slot2, 2L);
    availableSlots.add(slot3, 3L);
    assertEquals(3, availableSlots.size());
    assertTrue(availableSlots.contains(slot1.getSlotAllocationId()));
    assertTrue(availableSlots.contains(slot2.getSlotAllocationId()));
    assertTrue(availableSlots.contains(slot3.getSlotAllocationId()));
    assertTrue(availableSlots.containsTaskManager(resource1));
    assertTrue(availableSlots.containsTaskManager(resource2));
    availableSlots.removeAllForTaskManager(resource1);
    assertEquals(1, availableSlots.size());
    assertFalse(availableSlots.contains(slot1.getSlotAllocationId()));
    assertFalse(availableSlots.contains(slot2.getSlotAllocationId()));
    assertTrue(availableSlots.contains(slot3.getSlotAllocationId()));
    assertFalse(availableSlots.containsTaskManager(resource1));
    assertTrue(availableSlots.containsTaskManager(resource2));
    availableSlots.removeAllForTaskManager(resource2);
    assertEquals(0, availableSlots.size());
    assertFalse(availableSlots.contains(slot1.getSlotAllocationId()));
    assertFalse(availableSlots.contains(slot2.getSlotAllocationId()));
    assertFalse(availableSlots.contains(slot3.getSlotAllocationId()));
    assertFalse(availableSlots.containsTaskManager(resource1));
    assertFalse(availableSlots.containsTaskManager(resource2));
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) Test(org.junit.Test)

Example 19 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class InstanceManagerTest method testInstanceRegistering.

@Test
public void testInstanceRegistering() {
    try {
        InstanceManager cm = new InstanceManager();
        final int dataPort = 20000;
        HardwareDescription hardwareDescription = HardwareDescription.extractFromSystem(4096);
        InetAddress address = InetAddress.getByName("127.0.0.1");
        // register three instances
        ResourceID resID1 = ResourceID.generate();
        ResourceID resID2 = ResourceID.generate();
        ResourceID resID3 = ResourceID.generate();
        TaskManagerLocation ici1 = new TaskManagerLocation(resID1, address, dataPort);
        TaskManagerLocation ici2 = new TaskManagerLocation(resID2, address, dataPort + 15);
        TaskManagerLocation ici3 = new TaskManagerLocation(resID3, address, dataPort + 30);
        final JavaTestKit probe1 = new JavaTestKit(system);
        final JavaTestKit probe2 = new JavaTestKit(system);
        final JavaTestKit probe3 = new JavaTestKit(system);
        cm.registerTaskManager(new ActorTaskManagerGateway(new AkkaActorGateway(probe1.getRef(), leaderSessionID)), ici1, hardwareDescription, 1);
        cm.registerTaskManager(new ActorTaskManagerGateway(new AkkaActorGateway(probe2.getRef(), leaderSessionID)), ici2, hardwareDescription, 2);
        cm.registerTaskManager(new ActorTaskManagerGateway(new AkkaActorGateway(probe3.getRef(), leaderSessionID)), ici3, hardwareDescription, 5);
        assertEquals(3, cm.getNumberOfRegisteredTaskManagers());
        assertEquals(8, cm.getTotalNumberOfSlots());
        Collection<Instance> instances = cm.getAllRegisteredInstances();
        Set<TaskManagerLocation> taskManagerLocations = new HashSet<TaskManagerLocation>();
        for (Instance instance : instances) {
            taskManagerLocations.add(instance.getTaskManagerLocation());
        }
        assertTrue(taskManagerLocations.contains(ici1));
        assertTrue(taskManagerLocations.contains(ici2));
        assertTrue(taskManagerLocations.contains(ici3));
        cm.shutdown();
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        Assert.fail("Test erroneous: " + e.getMessage());
    }
}
Also used : TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) InetAddress(java.net.InetAddress) JavaTestKit(akka.testkit.JavaTestKit) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 20 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class InstanceManagerTest method testShutdown.

@Test
public void testShutdown() {
    try {
        InstanceManager cm = new InstanceManager();
        cm.shutdown();
        try {
            ResourceID resID = ResourceID.generate();
            HardwareDescription resources = HardwareDescription.extractFromSystem(4096);
            InetAddress address = InetAddress.getByName("127.0.0.1");
            TaskManagerLocation ici = new TaskManagerLocation(resID, address, 20000);
            JavaTestKit probe = new JavaTestKit(system);
            cm.registerTaskManager(new ActorTaskManagerGateway(new AkkaActorGateway(probe.getRef(), leaderSessionID)), ici, resources, 1);
            fail("Should raise exception in shutdown state");
        } catch (IllegalStateException e) {
        // expected
        }
        assertFalse(cm.reportHeartBeat(new InstanceID()));
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        Assert.fail("Test erroneous: " + e.getMessage());
    }
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) InetAddress(java.net.InetAddress) JavaTestKit(akka.testkit.JavaTestKit) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) Test(org.junit.Test)

Aggregations

ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)74 Test (org.junit.Test)48 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)25 Time (org.apache.flink.api.common.time.Time)18 UUID (java.util.UUID)16 JobID (org.apache.flink.api.common.JobID)16 Configuration (org.apache.flink.configuration.Configuration)14 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)13 JavaTestKit (akka.testkit.JavaTestKit)12 MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry)12 InetAddress (java.net.InetAddress)11 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)10 HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices)10 TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices)10 SlotRequest (org.apache.flink.runtime.resourcemanager.SlotRequest)10 IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager)9 NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment)9 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)9 MemoryManager (org.apache.flink.runtime.memory.MemoryManager)9 TestingSerialRpcService (org.apache.flink.runtime.rpc.TestingSerialRpcService)9