Search in sources :

Example 6 with Instance

Use of org.apache.flink.runtime.instance.Instance in the Apache Flink project.

In class SchedulerIsolatedTasksTest, method testScheduleWithDyingInstances.

/**
 * Checks that slots owned by an instance that is marked dead get canceled, that the slots of
 * the remaining instances are returned to the scheduler, and that no slot is served once all
 * instances are dead.
 */
@Test
public void testScheduleWithDyingInstances() {
    try {
        final Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());

        // three instances created with arguments 2, 2 and 1 (presumably the slot counts)
        final Instance survivorA = getRandomInstance(2);
        final Instance doomed = getRandomInstance(2);
        final Instance survivorB = getRandomInstance(1);
        scheduler.newInstanceAvailable(survivorA);
        scheduler.newInstanceAvailable(doomed);
        scheduler.newInstanceAvailable(survivorB);

        // request five slots, one dummy task each
        final int requestedSlots = 5;
        final List<SimpleSlot> allocated = new ArrayList<SimpleSlot>();
        for (int n = 0; n < requestedSlots; n++) {
            allocated.add(scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get());
        }

        doomed.markDead();

        // only slots owned by the dead instance may be canceled; release all of them afterwards
        for (SimpleSlot slot : allocated) {
            if (slot.getOwner() != doomed) {
                assertFalse(slot.isCanceled());
            } else {
                assertTrue(slot.isCanceled());
            }
            slot.releaseSlot();
        }
        assertEquals(3, scheduler.getNumberOfAvailableSlots());

        survivorA.markDead();
        survivorB.markDead();

        // every instance is dead now, so another allocation must fail
        try {
            scheduler.allocateSlot(new ScheduledUnit(getDummyTask()), false).get();
            fail("Scheduler served a slot from a dead instance");
        } catch (ExecutionException allocationFailure) {
            assertTrue(allocationFailure.getCause() instanceof NoResourceAvailableException);
        } catch (Exception unexpected) {
            fail("Wrong exception type.");
        }

        // by now at the latest, the scheduler should have noticed (through its lazy
        // mechanisms) that all instances have vanished
        assertEquals(0, scheduler.getNumberOfInstancesWithAvailableSlots());
        assertEquals(0, scheduler.getNumberOfAvailableSlots());
    } catch (Exception failure) {
        failure.printStackTrace();
        fail(failure.getMessage());
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) SchedulerTestUtils.getRandomInstance(org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance) ArrayList(java.util.ArrayList) ExecutionException(java.util.concurrent.ExecutionException) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Example 7 with Instance

Use of org.apache.flink.runtime.instance.Instance in the Apache Flink project.

In class SchedulerIsolatedTasksTest, method testAddAndRemoveInstance.

/**
 * Walks a scheduler through registering three instances and removing them again, checking the
 * reported number of available instances and slots after every step, and checking that
 * registering an already-known or dead instance is rejected with an
 * {@link IllegalArgumentException}.
 */
@Test
public void testAddAndRemoveInstance() {
    try {
        final Scheduler scheduler = new Scheduler(TestingUtils.defaultExecutionContext());
        final Instance first = getRandomInstance(2);
        final Instance second = getRandomInstance(2);
        final Instance third = getRandomInstance(2);

        // nothing registered yet
        assertAvailability(scheduler, 0, 0);

        scheduler.newInstanceAvailable(first);
        assertAvailability(scheduler, 1, 2);

        scheduler.newInstanceAvailable(second);
        assertAvailability(scheduler, 2, 4);

        scheduler.newInstanceAvailable(third);
        assertAvailability(scheduler, 3, 6);

        // registering an instance a second time must be refused
        try {
            scheduler.newInstanceAvailable(second);
            fail("Scheduler accepted instance twice");
        } catch (IllegalArgumentException expected) {
            // good, this is the expected outcome
        }

        // the rejected registration must not have changed the counts
        assertAvailability(scheduler, 3, 6);

        // instances start to die
        scheduler.instanceDied(second);
        assertAvailability(scheduler, 2, 4);

        // a dead instance must not be accepted again
        try {
            scheduler.newInstanceAvailable(second);
            fail("Scheduler accepted dead instance");
        } catch (IllegalArgumentException expected) {
            // correct, this is the expected outcome
        }

        scheduler.instanceDied(first);
        assertAvailability(scheduler, 1, 2);

        scheduler.instanceDied(third);
        assertAvailability(scheduler, 0, 0);

        assertFalse(first.isAlive());
        assertFalse(second.isAlive());
        assertFalse(third.isAlive());
    } catch (Exception failure) {
        failure.printStackTrace();
        fail(failure.getMessage());
    }
}

/** Asserts the scheduler's reported number of available instances, then of available slots. */
private static void assertAvailability(Scheduler scheduler, int expectedInstances, int expectedSlots) {
    assertEquals(expectedInstances, scheduler.getNumberOfAvailableInstances());
    assertEquals(expectedSlots, scheduler.getNumberOfAvailableSlots());
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) SchedulerTestUtils.getRandomInstance(org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Example 8 with Instance

Use of org.apache.flink.runtime.instance.Instance in the Apache Flink project.

In class Scheduler, method handleNewSlot.

/**
 * Takes the next entry from {@code newlyAvailableInstances} and, while holding
 * {@code globalLock}, either hands a slot from it to the task at the head of the task queue or,
 * if no task is queued, records the instance in {@code instancesWithAvailableResources}.
 *
 * <p>If the instance turns out to have died, it is removed via {@code removeInstance}. If
 * completing the queued task's future throws, the error is logged and the task's execution is
 * failed with that throwable.
 */
private void handleNewSlot() {
    synchronized (globalLock) {
        final Instance candidate = this.newlyAvailableInstances.poll();
        final boolean usable = candidate != null && candidate.hasResourcesAvailable();
        if (!usable) {
            // nothing to do: queue was empty, or the resources are already taken
            return;
        }

        final QueuedTask waiting = taskQueue.peek();
        if (waiting == null) {
            // no task is waiting, remember the instance for later requests
            this.instancesWithAvailableResources.put(candidate.getTaskManagerID(), candidate);
            return;
        }

        final ScheduledUnit unit = waiting.getTask();
        final ExecutionVertex targetVertex = unit.getTaskToExecute().getVertex();
        try {
            final SimpleSlot granted = candidate.allocateSimpleSlot(targetVertex.getJobId());
            if (granted == null) {
                // no slot obtained; the task stays at the head of the queue
                return;
            }

            // allocation succeeded: dequeue the task and notify its future, if any
            taskQueue.poll();
            if (waiting.getFuture() == null) {
                return;
            }
            try {
                waiting.getFuture().complete(granted);
            } catch (Throwable t) {
                LOG.error("Error calling allocation future for task " + targetVertex.getSimpleName(), t);
                unit.getTaskToExecute().fail(t);
            }
        } catch (InstanceDiedException e) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Instance " + candidate + " was marked dead asynchronously.");
            }
            removeInstance(candidate);
        }
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) InstanceDiedException(org.apache.flink.runtime.instance.InstanceDiedException)

Example 9 with Instance

Use of org.apache.flink.runtime.instance.Instance in the Apache Flink project.

In class ExecutionGraphMetricsTest, method testExecutionGraphRestartTimeMetric.

/**
 * Verifies the gauge looked up under {@code ExecutionGraph.RESTARTING_TIME_METRIC_NAME}: it
 * must read 0 before the first restart, must not decrease (and must become positive) while the
 * job is in state RESTARTING, and must stay constant once the job is RUNNING again or has
 * reached state FAILED.
 *
 * <p>Scheduling is mocked: the {@link Scheduler} mock always returns the same pre-completed
 * {@link SimpleSlot} future, and task state transitions are driven manually through
 * {@code executionGraph.updateState(...)}.
 */
@Test
public void testExecutionGraphRestartTimeMetric() throws JobException, IOException, InterruptedException {
    final ScheduledExecutorService executor = Executors.newSingleThreadScheduledExecutor();
    try {
        // setup execution graph with mocked scheduling logic
        int parallelism = 1;
        JobVertex jobVertex = new JobVertex("TestVertex");
        jobVertex.setParallelism(parallelism);
        jobVertex.setInvokableClass(NoOpInvokable.class);
        JobGraph jobGraph = new JobGraph("Test Job", jobVertex);
        // configure TestingReporter as the single metrics reporter so the gauge can be fetched from it
        Configuration config = new Configuration();
        config.setString(ConfigConstants.METRICS_REPORTERS_LIST, "test");
        config.setString(ConfigConstants.METRICS_REPORTER_PREFIX + "test." + ConfigConstants.METRICS_REPORTER_CLASS_SUFFIX, TestingReporter.class.getName());
        Configuration jobConfig = new Configuration();
        Time timeout = Time.seconds(10L);
        MetricRegistry metricRegistry = new MetricRegistry(MetricRegistryConfiguration.fromConfiguration(config));
        assertTrue(metricRegistry.getReporters().size() == 1);
        MetricReporter reporter = metricRegistry.getReporters().get(0);
        assertTrue(reporter instanceof TestingReporter);
        TestingReporter testingReporter = (TestingReporter) reporter;
        MetricGroup metricGroup = new JobManagerMetricGroup(metricRegistry, "localhost");
        Scheduler scheduler = mock(Scheduler.class);
        ResourceID taskManagerId = ResourceID.generate();
        TaskManagerLocation taskManagerLocation = mock(TaskManagerLocation.class);
        when(taskManagerLocation.getResourceID()).thenReturn(taskManagerId);
        when(taskManagerLocation.getHostname()).thenReturn("localhost");
        TaskManagerGateway taskManagerGateway = mock(TaskManagerGateway.class);
        // NOTE(review): this Instance mock is stubbed here but not referenced again in this
        // method — confirm whether it is still needed
        Instance instance = mock(Instance.class);
        when(instance.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(instance.getTaskManagerID()).thenReturn(taskManagerId);
        when(instance.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        Slot rootSlot = mock(Slot.class);
        AllocatedSlot mockAllocatedSlot = mock(AllocatedSlot.class);
        when(mockAllocatedSlot.getSlotAllocationId()).thenReturn(new AllocationID());
        // the one slot that the mocked scheduler hands out for every allocation request
        SimpleSlot simpleSlot = mock(SimpleSlot.class);
        when(simpleSlot.isAlive()).thenReturn(true);
        when(simpleSlot.getTaskManagerLocation()).thenReturn(taskManagerLocation);
        when(simpleSlot.getTaskManagerID()).thenReturn(taskManagerId);
        when(simpleSlot.getTaskManagerGateway()).thenReturn(taskManagerGateway);
        when(simpleSlot.setExecutedVertex(Matchers.any(Execution.class))).thenReturn(true);
        when(simpleSlot.getRoot()).thenReturn(rootSlot);
        when(simpleSlot.getAllocatedSlot()).thenReturn(mockAllocatedSlot);
        FlinkCompletableFuture<SimpleSlot> future = new FlinkCompletableFuture<>();
        future.complete(simpleSlot);
        when(scheduler.allocateSlot(any(ScheduledUnit.class), anyBoolean())).thenReturn(future);
        when(rootSlot.getSlotNumber()).thenReturn(0);
        // task submission through the mocked gateway is acknowledged immediately
        when(taskManagerGateway.submitTask(any(TaskDeploymentDescriptor.class), any(Time.class))).thenReturn(FlinkCompletableFuture.completed(Acknowledge.get()));
        TestingRestartStrategy testingRestartStrategy = new TestingRestartStrategy();
        ExecutionGraph executionGraph = new ExecutionGraph(executor, executor, jobGraph.getJobID(), jobGraph.getName(), jobConfig, new SerializedValue<ExecutionConfig>(null), timeout, testingRestartStrategy, Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), scheduler, getClass().getClassLoader(), metricGroup);
        // get restarting time metric
        Metric metric = testingReporter.getMetric(ExecutionGraph.RESTARTING_TIME_METRIC_NAME);
        assertNotNull(metric);
        assertTrue(metric instanceof Gauge);
        @SuppressWarnings("unchecked") Gauge<Long> restartingTime = (Gauge<Long>) metric;
        // check that the restarting time is 0 since it's the initial start
        assertTrue(0L == restartingTime.getValue());
        executionGraph.attachJobGraph(jobGraph.getVerticesSortedTopologicallyFromSources());
        // start execution
        executionGraph.scheduleForExecution();
        assertTrue(0L == restartingTime.getValue());
        // remember the current attempt IDs so that state updates can be addressed to them
        List<ExecutionAttemptID> executionIDs = new ArrayList<>();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        // tell execution graph that the tasks are in state running --> job status switches to state running
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(0L == restartingTime.getValue());
        // fail the job so that it goes into state restarting
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long firstRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // wait some time so that the restarting time gauge shows a value different from 0
        Thread.sleep(50);
        long previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is monotonically increasing
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // check that we have measured some restarting time
        assertTrue(previousRestartingTime > 0);
        // restart job
        testingRestartStrategy.restartExecutionGraph();
        // re-read the attempt IDs from the graph after the restart and switch them to running
        executionIDs.clear();
        for (ExecutionVertex executionVertex : executionGraph.getAllExecutionVertices()) {
            executionIDs.add(executionVertex.getCurrentExecutionAttempt().getAttemptId());
        }
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.RUNNING));
        }
        assertEquals(JobStatus.RUNNING, executionGraph.getState());
        assertTrue(firstRestartingTimestamp != 0);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time does not increase after we've reached the running state
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        // fail job again
        for (ExecutionAttemptID executionID : executionIDs) {
            executionGraph.updateState(new TaskExecutionState(jobGraph.getJobID(), executionID, ExecutionState.FAILED, new Exception()));
        }
        assertEquals(JobStatus.RESTARTING, executionGraph.getState());
        long secondRestartingTimestamp = executionGraph.getStatusTimestamp(JobStatus.RESTARTING);
        // the second restart must have recorded a different RESTARTING timestamp
        assertTrue(firstRestartingTimestamp != secondRestartingTimestamp);
        Thread.sleep(50);
        previousRestartingTime = restartingTime.getValue();
        // check that the restarting time is increasing again
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime >= previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
        assertTrue(previousRestartingTime > 0);
        // now let's fail the job while it is restarting and check that the restarting time stops increasing;
        // for this to work, we have to use a SuppressRestartsException
        executionGraph.fail(new SuppressRestartsException(new Exception()));
        assertEquals(JobStatus.FAILED, executionGraph.getState());
        previousRestartingTime = restartingTime.getValue();
        for (int i = 0; i < 10; i++) {
            long currentRestartingTime = restartingTime.getValue();
            assertTrue(currentRestartingTime == previousRestartingTime);
            previousRestartingTime = currentRestartingTime;
        }
    } finally {
        // always stop the executor that was passed to the execution graph
        executor.shutdownNow();
    }
}
Also used : JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) MetricGroup(org.apache.flink.metrics.MetricGroup) JobManagerMetricGroup(org.apache.flink.runtime.metrics.groups.JobManagerMetricGroup) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ArrayList(java.util.ArrayList) Time(org.apache.flink.api.common.time.Time) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) Gauge(org.apache.flink.metrics.Gauge) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) ScheduledUnit(org.apache.flink.runtime.jobmanager.scheduler.ScheduledUnit) MetricReporter(org.apache.flink.metrics.reporter.MetricReporter) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) SuppressRestartsException(org.apache.flink.runtime.execution.SuppressRestartsException) JobException(org.apache.flink.runtime.JobException) IOException(java.io.IOException) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) 
JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) Slot(org.apache.flink.runtime.instance.Slot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) Metric(org.apache.flink.metrics.Metric) Test(org.junit.Test)

Example 10 with Instance

Use of org.apache.flink.runtime.instance.Instance in the Apache Flink project.

In class ExecutionGraphRestartTest, method testCancelWhileFailing.

/**
 * Checks that cancelling a job that is in state FAILING moves it to CANCELLING and, once the
 * final-state callback is allowed to run, to CANCELED.
 *
 * <p>The execution graph is a spy whose {@code jobVertexInFinalState()} is stubbed out for most
 * of the test, so the job status does not advance beyond FAILING on its own.
 */
@Test
public void testCancelWhileFailing() throws Exception {
    // Restart and delay are under manual control with this strategy
    final RestartStrategy strategy = new InfiniteDelayRestartStrategy();
    final Tuple2<ExecutionGraph, Instance> graphAndInstance = createSpyExecutionGraph(strategy);
    final ExecutionGraph graph = graphAndInstance.f0;
    final Instance taskManager = graphAndInstance.f1;

    doNothing().when(graph).jobVertexInFinalState();

    // Kill the instance...
    taskManager.markDead();

    // ...then poll until every vertex is FAILED or CANCELED, or the deadline has passed.
    // We poll the vertices rather than the job status because jobVertexInFinalState
    // is stubbed to do nothing.
    final Deadline pollDeadline = TestingUtils.TESTING_DURATION().fromNow();
    boolean allVerticesDone = false;
    while (pollDeadline.hasTimeLeft() && !allVerticesDone) {
        boolean foundPending = false;
        for (ExecutionVertex candidate : graph.getAllExecutionVertices()) {
            final ExecutionState current = candidate.getExecutionState();
            final boolean done = current == ExecutionState.FAILED || current == ExecutionState.CANCELED;
            if (!done) {
                foundPending = true;
                break;
            }
        }
        if (foundPending) {
            // back off briefly before scanning the vertices again
            Thread.sleep(100);
        }
        allVerticesDone = !foundPending;
    }

    // The job must still be failing at this point
    assertEquals(JobStatus.FAILING, graph.getState());

    // Cancelling has to switch the state to CANCELLING
    graph.cancel();
    assertEquals(JobStatus.CANCELLING, graph.getState());

    // Restore the real method on the spy and finalize the job state
    doCallRealMethod().when(graph).jobVertexInFinalState();
    graph.jobVertexInFinalState();
    assertEquals(JobStatus.CANCELED, graph.getState());
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) Instance(org.apache.flink.runtime.instance.Instance) Deadline(scala.concurrent.duration.Deadline) FailureRateRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FailureRateRestartStrategy) InfiniteDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.InfiniteDelayRestartStrategy) NoRestartStrategy(org.apache.flink.runtime.executiongraph.restart.NoRestartStrategy) RestartStrategy(org.apache.flink.runtime.executiongraph.restart.RestartStrategy) FixedDelayRestartStrategy(org.apache.flink.runtime.executiongraph.restart.FixedDelayRestartStrategy) Test(org.junit.Test)

Aggregations

Instance (org.apache.flink.runtime.instance.Instance)63 Test (org.junit.Test)52 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)38 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)33 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)29 IOException (java.io.IOException)19 JobID (org.apache.flink.api.common.JobID)15 ExecutionException (java.util.concurrent.ExecutionException)14 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)14 SchedulerTestUtils.getRandomInstance (org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance)14 ExecutionGraphTestUtils.getInstance (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getInstance)12 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)12 SimpleActorGateway (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.SimpleActorGateway)11 ExecutionGraphTestUtils.getExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getExecutionVertex)11 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)11 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)10 FiniteDuration (scala.concurrent.duration.FiniteDuration)9 SuppressRestartsException (org.apache.flink.runtime.execution.SuppressRestartsException)8 BaseTestingActorGateway (org.apache.flink.runtime.instance.BaseTestingActorGateway)8 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)8