Search in sources :

Example 51 with SimpleSlot

use of org.apache.flink.runtime.instance.SimpleSlot in project flink by apache.

the class Scheduler method getFreeSlotForTask.

/**
	 * Finds an instance via {@code findInstance} and allocates a simple slot on it for the
	 * job of the given vertex.
	 * <p>
	 * NOTE: This method is not thread-safe; it needs to be synchronized by the caller.
	 *
	 * @param vertex The task to run; its job ID is used for the slot allocation.
	 * @param requestedLocations The locations handed to {@code findInstance} when looking for an instance.
	 * @param localOnly Flag handed to {@code findInstance} together with the requested locations.
	 * @return The allocated slot, tagged with the locality reported by {@code findInstance},
	 *         or {@code null} if {@code findInstance} yields no instance.
	 */
protected SimpleSlot getFreeSlotForTask(ExecutionVertex vertex, Iterable<TaskManagerLocation> requestedLocations, boolean localOnly) {
    // A candidate may turn out to be unusable (no slot handed out, or the instance died),
    // so keep asking for candidates until one yields a slot or none is left.
    for (; ; ) {
        final Pair<Instance, Locality> candidate = findInstance(requestedLocations, localOnly);
        if (candidate == null) {
            // nothing left to try
            return null;
        }
        final Instance target = candidate.getLeft();
        final Locality slotLocality = candidate.getRight();
        try {
            final SimpleSlot allocated = target.allocateSimpleSlot(vertex.getJobId());
            // an instance that still has capacity goes back into the set of available resources
            if (target.hasResourcesAvailable()) {
                this.instancesWithAvailableResources.put(target.getTaskManagerID(), target);
            }
            if (allocated == null) {
                // no slot from this candidate; try the next one
                continue;
            }
            allocated.setLocality(slotLocality);
            return allocated;
        } catch (InstanceDiedException e) {
            // the instance died, but this scheduler had not been told yet:
            // drop it from the available instances and try the next candidate
            removeInstance(target);
        }
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) InstanceDiedException(org.apache.flink.runtime.instance.InstanceDiedException)

Example 52 with SimpleSlot

use of org.apache.flink.runtime.instance.SimpleSlot in project flink by apache.

the class ExecutionGraphSchedulingTest method createSlot.

/**
 * Builds a {@link SimpleSlot} for tests, backed by a freshly created {@link AllocatedSlot}
 * on a loopback {@link TaskManagerLocation} (port 12345) with an unknown resource profile.
 *
 * @param taskManager The gateway stored in the allocated slot.
 * @param jobId The job the allocated slot is created for.
 * @param slotOwner The owner passed to the simple slot.
 * @return A new simple slot with slot number 0.
 */
private SimpleSlot createSlot(TaskManagerGateway taskManager, JobID jobId, SlotOwner slotOwner) {
    final TaskManagerLocation loopbackLocation =
            new TaskManagerLocation(ResourceID.generate(), InetAddress.getLoopbackAddress(), 12345);
    final AllocationID allocationId = new AllocationID();
    final AllocatedSlot backingSlot =
            new AllocatedSlot(allocationId, jobId, loopbackLocation, 0, ResourceProfile.UNKNOWN, taskManager);
    return new SimpleSlot(backingSlot, slotOwner, 0);
}
Also used : AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot)

Example 53 with SimpleSlot

use of org.apache.flink.runtime.instance.SimpleSlot in project flink by apache.

the class ExecutionGraphSchedulingTest method testTimeoutForSlotAllocation.

/**
	 * This test verifies that the slot allocation times out after a certain time, and that
	 * all slots that were already handed over are released in that case.
	 * <p>
	 * Three slot futures are programmed for a vertex with parallelism 3, but only two of
	 * them are ever completed. The test then expects the job to reach a terminal state,
	 * the two completed slots to be returned to the slot owner, and no task to be submitted.
	 */
@Test
public void testTimeoutForSlotAllocation() throws Exception {
    // we construct a simple graph: a single vertex "task" with parallelism 3
    final int parallelism = 3;
    final JobVertex vertex = new JobVertex("task");
    vertex.setParallelism(parallelism);
    vertex.setInvokableClass(NoOpInvokable.class);
    final JobID jobId = new JobID();
    final JobGraph jobGraph = new JobGraph(jobId, "test", vertex);
    // mocks, so that slot returns and task submissions can be verified below
    final SlotOwner slotOwner = mock(SlotOwner.class);
    final TaskManagerGateway taskManager = mock(TaskManagerGateway.class);
    // one slot and one (initially incomplete) future per parallel subtask
    final SimpleSlot[] slots = new SimpleSlot[parallelism];
    @SuppressWarnings({ "unchecked", "rawtypes" }) final FlinkCompletableFuture<SimpleSlot>[] slotFutures = new FlinkCompletableFuture[parallelism];
    for (int i = 0; i < parallelism; i++) {
        slots[i] = createSlot(taskManager, jobId, slotOwner);
        slotFutures[i] = new FlinkCompletableFuture<>();
    }
    // register the futures with the slot provider under the vertex id
    ProgrammedSlotProvider slotProvider = new ProgrammedSlotProvider(parallelism);
    slotProvider.addSlots(vertex.getID(), slotFutures);
    // NOTE(review): the 20 ms value is presumably the slot allocation timeout - confirm in createExecutionGraph
    final ExecutionGraph eg = createExecutionGraph(jobGraph, slotProvider, Time.milliseconds(20));
    final TerminalJobStatusListener statusListener = new TerminalJobStatusListener();
    eg.registerJobStatusListener(statusListener);
    // we complete one future before scheduling starts
    slotFutures[1].complete(slots[1]);
    // kick off the scheduling (eager mode, queued scheduling allowed)
    eg.setScheduleMode(ScheduleMode.EAGER);
    eg.setQueuedSchedulingAllowed(true);
    eg.scheduleForExecution();
    // we complete another future after scheduling has started
    slotFutures[2].complete(slots[2]);
    // since future[0] is never completed, the whole operation must time out
    // we have no restarts allowed, so the job will go terminal
    statusListener.waitForTerminalState(2000);
    // wait (up to 2000 ms) until the two completed slots have been returned to the owner
    verify(slotOwner, new Timeout(2000, times(2))).returnAllocatedSlot(any(Slot.class));
    // verify that no deployments have happened
    verify(taskManager, times(0)).submitTask(any(TaskDeploymentDescriptor.class), any(Time.class));
    // every slot that was actually handed over must have been canceled
    for (Future<SimpleSlot> future : slotFutures) {
        if (future.isDone()) {
            assertTrue(future.get().isCanceled());
        }
    }
}
Also used : Timeout(org.mockito.verification.Timeout) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) Time(org.apache.flink.api.common.time.Time) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) FlinkCompletableFuture(org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture) SlotOwner(org.apache.flink.runtime.jobmanager.slots.SlotOwner) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) Slot(org.apache.flink.runtime.instance.Slot) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) AllocatedSlot(org.apache.flink.runtime.jobmanager.slots.AllocatedSlot) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 54 with SimpleSlot

use of org.apache.flink.runtime.instance.SimpleSlot in project flink by apache.

the class ExecutionVertexCancelTest method testCancelConcurrentlyToDeploying_CallsOvertaking.

/**
 * Cancels a vertex while it is DEPLOYING and replays the queued actions so that the cancel
 * action runs before the deploy action. The vertex is expected to stay in CANCELING until
 * canceling is explicitly completed, and then to end up CANCELED with its slot released
 * and no failure cause.
 */
@Test
public void testCancelConcurrentlyToDeploying_CallsOvertaking() {
    try {
        final JobVertexID jid = new JobVertexID();
        // execution context that queues actions, so the test controls the order in which they run
        final TestingUtils.QueuedActionExecutionContext executionContext = TestingUtils.queuedActionExecutionContext();
        final TestingUtils.ActionQueue actions = executionContext.actionQueue();
        final ExecutionJobVertex ejv = getExecutionVertex(jid, executionContext);
        final ExecutionVertex vertex = new ExecutionVertex(ejv, 0, new IntermediateResult[0], AkkaUtils.getDefaultTimeout());
        setVertexState(vertex, ExecutionState.SCHEDULED);
        assertEquals(ExecutionState.SCHEDULED, vertex.getExecutionState());
        // task manager cancel sequence mock actor
        // first return NOT SUCCESS (task not found, cancel call overtook deploy call), then success (cancel call after deploy call)
        ActorGateway actorGateway = new CancelSequenceActorGateway(executionContext, 2);
        Instance instance = getInstance(new ActorTaskManagerGateway(actorGateway));
        SimpleSlot slot = instance.allocateSimpleSlot(new JobID());
        // deploy, then cancel right away; the resulting actions are only queued at this point
        vertex.deployToSlot(slot);
        assertEquals(ExecutionState.DEPLOYING, vertex.getExecutionState());
        vertex.cancel();
        assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
        // take both queued actions off the queue: the deploy action was enqueued first, the cancel action second
        Runnable deployAction = actions.popNextAction();
        Runnable cancelAction = actions.popNextAction();
        // run the cancel call first, i.e. let it overtake the deploy call
        cancelAction.run();
        // process onComplete callback
        actions.triggerNextAction();
        // did not find the task, not properly cancelled, stay in canceling
        assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
        // deploy action next
        deployAction.run();
        // the deploy call found itself in canceling after it returned and needs to send a cancel call
        // the call did not yet execute, so it is still in canceling
        assertEquals(ExecutionState.CANCELING, vertex.getExecutionState());
        // explicitly complete the canceling on the current execution attempt
        vertex.getCurrentExecutionAttempt().cancelingComplete();
        assertEquals(ExecutionState.CANCELED, vertex.getExecutionState());
        assertTrue(slot.isReleased());
        assertNull(vertex.getFailureCause());
        // timestamps must have been recorded for these states
        assertTrue(vertex.getStateTimestamp(ExecutionState.CREATED) > 0);
        assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELING) > 0);
        assertTrue(vertex.getStateTimestamp(ExecutionState.CANCELED) > 0);
    } catch (Exception e) {
        // any unexpected exception fails the test with its message
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) IOException(java.io.IOException) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) TestingUtils(org.apache.flink.runtime.testingUtils.TestingUtils) BaseTestingActorGateway(org.apache.flink.runtime.instance.BaseTestingActorGateway) DummyActorGateway(org.apache.flink.runtime.instance.DummyActorGateway) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 55 with SimpleSlot

use of org.apache.flink.runtime.instance.SimpleSlot in project flink by apache.

the class ExecutionVertexCancelTest method testActionsWhileCancelling.

/**
 * Exercises three actions on vertices that are in the CANCELING state: scheduling (must not
 * throw), deploying to a slot (must throw an {@link IllegalStateException}), and failing
 * (must move the vertex to CANCELED and release its slot).
 */
@Test
public void testActionsWhileCancelling() {
    try {
        final JobVertexID vertexId = new JobVertexID();
        final ExecutionJobVertex jobVertex = getExecutionVertex(vertexId);

        // scenario 1: scheduling a canceling vertex must not raise an exception
        try {
            final ExecutionVertex toSchedule =
                    new ExecutionVertex(jobVertex, 0, new IntermediateResult[0], AkkaUtils.getDefaultTimeout());
            setVertexState(toSchedule, ExecutionState.CANCELING);
            toSchedule.scheduleForExecution(mock(Scheduler.class), false);
        } catch (Exception unexpected) {
            fail("should not throw an exception");
        }

        // scenario 2: deploying a canceling vertex is illegal and must be rejected
        try {
            final ExecutionVertex toDeploy =
                    new ExecutionVertex(jobVertex, 0, new IntermediateResult[0], AkkaUtils.getDefaultTimeout());
            setVertexState(toDeploy, ExecutionState.CANCELING);
            final Instance deployTarget = getInstance(new ActorTaskManagerGateway(DummyActorGateway.INSTANCE));
            toDeploy.deployToSlot(deployTarget.allocateSimpleSlot(new JobID()));
            fail("Method should throw an exception");
        } catch (IllegalStateException expected) {
            // rejection is the expected outcome
        }

        // scenario 3: failing a canceling vertex ends in CANCELED and frees the slot
        final ExecutionVertex toFail =
                new ExecutionVertex(jobVertex, 0, new IntermediateResult[0], AkkaUtils.getDefaultTimeout());
        final Instance failTarget = getInstance(new ActorTaskManagerGateway(DummyActorGateway.INSTANCE));
        final SimpleSlot assignedSlot = failTarget.allocateSimpleSlot(new JobID());
        setVertexResource(toFail, assignedSlot);
        setVertexState(toFail, ExecutionState.CANCELING);
        toFail.fail(new Exception("test exception"));
        assertEquals(ExecutionState.CANCELED, toFail.getExecutionState());
        assertTrue(assignedSlot.isReleased());
    } catch (Exception e) {
        // any unexpected exception fails the test with its message
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : Instance(org.apache.flink.runtime.instance.Instance) Scheduler(org.apache.flink.runtime.jobmanager.scheduler.Scheduler) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) IOException(java.io.IOException) JobID(org.apache.flink.api.common.JobID) ActorTaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway) Test(org.junit.Test)

Aggregations

SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)78 Test (org.junit.Test)59 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)46 Instance (org.apache.flink.runtime.instance.Instance)38 ExecutionException (java.util.concurrent.ExecutionException)25 ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway)22 JobID (org.apache.flink.api.common.JobID)20 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)16 TaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway)14 FlinkCompletableFuture (org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture)13 SchedulerTestUtils.getRandomInstance (org.apache.flink.runtime.jobmanager.scheduler.SchedulerTestUtils.getRandomInstance)13 ExecutionGraphTestUtils.getExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getExecutionVertex)12 ExecutionGraphTestUtils.getInstance (org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils.getInstance)12 IOException (java.io.IOException)10 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)8 BaseTestingActorGateway (org.apache.flink.runtime.instance.BaseTestingActorGateway)8 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)8 Scheduler (org.apache.flink.runtime.jobmanager.scheduler.Scheduler)8 AllocatedSlot (org.apache.flink.runtime.jobmanager.slots.AllocatedSlot)8 Future (org.apache.flink.runtime.concurrent.Future)7