
Example 16 with ExecutionState

Use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

From the class InputChannelDeploymentDescriptor, method fromEdges.

// ------------------------------------------------------------------------
/**
 * Creates an input channel deployment descriptor for each partition.
 */
public static InputChannelDeploymentDescriptor[] fromEdges(ExecutionEdge[] edges, SimpleSlot consumerSlot, boolean allowLazyDeployment) throws ExecutionGraphException {
    final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
    final InputChannelDeploymentDescriptor[] icdd = new InputChannelDeploymentDescriptor[edges.length];
    // Each edge is connected to a different result partition
    for (int i = 0; i < edges.length; i++) {
        final IntermediateResultPartition consumedPartition = edges[i].getSource();
        final Execution producer = consumedPartition.getProducer().getCurrentExecutionAttempt();
        final ExecutionState producerState = producer.getState();
        final SimpleSlot producerSlot = producer.getAssignedResource();
        final ResultPartitionLocation partitionLocation;
        // The producer must have a slot assigned and be in SCHEDULED, DEPLOYING,
        // RUNNING, or FINISHED state before the partition location can be resolved
        if (consumedPartition.isConsumable() && producerSlot != null &&
                (producerState == ExecutionState.RUNNING
                        || producerState == ExecutionState.FINISHED
                        || producerState == ExecutionState.SCHEDULED
                        || producerState == ExecutionState.DEPLOYING)) {
            final TaskManagerLocation partitionTaskManagerLocation = producerSlot.getTaskManagerLocation();
            final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();
            if (partitionTaskManager.equals(consumerTaskManager)) {
                // Consuming task is deployed to the same TaskManager as the partition => local
                partitionLocation = ResultPartitionLocation.createLocal();
            } else {
                // Different instances => remote
                final ConnectionID connectionId = new ConnectionID(partitionTaskManagerLocation, consumedPartition.getIntermediateResult().getConnectionIndex());
                partitionLocation = ResultPartitionLocation.createRemote(connectionId);
            }
        } else if (allowLazyDeployment) {
            // The producing task might not have registered the partition yet
            partitionLocation = ResultPartitionLocation.createUnknown();
        } else if (producerState == ExecutionState.CANCELING || producerState == ExecutionState.CANCELED || producerState == ExecutionState.FAILED) {
            String msg = "Trying to schedule a task whose inputs were canceled or failed. " + "The producer is in state " + producerState + ".";
            throw new ExecutionGraphException(msg);
        } else {
            String msg = String.format(
                    "Trying to eagerly schedule a task whose inputs are not ready "
                            + "(partition consumable? %s, producer state: %s, producer slot: %s).",
                    consumedPartition.isConsumable(), producerState, producerSlot);
            throw new ExecutionGraphException(msg);
        }
        final ResultPartitionID consumedPartitionId = new ResultPartitionID(consumedPartition.getPartitionId(), producer.getAttemptId());
        icdd[i] = new InputChannelDeploymentDescriptor(consumedPartitionId, partitionLocation);
    }
    return icdd;
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionGraphException(org.apache.flink.runtime.executiongraph.ExecutionGraphException) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) IntermediateResultPartition(org.apache.flink.runtime.executiongraph.IntermediateResultPartition) Execution(org.apache.flink.runtime.executiongraph.Execution) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)
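
The heart of this example is the question whether the producer's ExecutionState already pins the partition to a known TaskManager. Below is a minimal sketch of that predicate; the class and method names (ProducerReadiness, isProducerReady) are made up for illustration and are not part of Flink.

import java.util.EnumSet;

import org.apache.flink.runtime.execution.ExecutionState;

public final class ProducerReadiness {

    // States in which the producer's slot assignment is stable enough to resolve
    // the partition location eagerly (mirrors the four-way check in fromEdges).
    private static final EnumSet<ExecutionState> LOCATABLE_STATES = EnumSet.of(
            ExecutionState.SCHEDULED,
            ExecutionState.DEPLOYING,
            ExecutionState.RUNNING,
            ExecutionState.FINISHED);

    /** Returns true if the partition location can be resolved right now. */
    static boolean isProducerReady(ExecutionState producerState, boolean partitionConsumable, boolean slotAssigned) {
        return partitionConsumable && slotAssigned && LOCATABLE_STATES.contains(producerState);
    }

    private ProducerReadiness() {}
}

Keeping the accepted states in an EnumSet makes the intent of the long OR condition in fromEdges explicit: anything outside this set either falls back to an unknown location (lazy deployment) or fails the scheduling attempt.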

Example 17 with ExecutionState

Use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

From the class Execution, method scheduleOrUpdateConsumers.

void scheduleOrUpdateConsumers(List<List<ExecutionEdge>> allConsumers) {
    final int numConsumers = allConsumers.size();
    if (numConsumers > 1) {
        fail(new IllegalStateException("Currently, only a single consumer group per partition is supported."));
    } else if (numConsumers == 0) {
        return;
    }
    for (ExecutionEdge edge : allConsumers.get(0)) {
        final ExecutionVertex consumerVertex = edge.getTarget();
        final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
        final ExecutionState consumerState = consumer.getState();
        final IntermediateResultPartition partition = edge.getSource();
        // ----------------------------------------------------------------
        if (consumerState == CREATED) {
            final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
            consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
            // When deploying a consuming task, its task deployment descriptor will contain all
            // deployment information available at the respective time. It is possible that some
            // of the partitions to be consumed have not been created yet. These are
            // updated at runtime via the update messages.
            //
            // TODO The current approach may send many update messages even though the consuming
            // task has already been deployed with all necessary information. We have to check
            // whether this is a problem and fix it, if it is.
            FlinkFuture.supplyAsync(new Callable<Void>() {

                @Override
                public Void call() throws Exception {
                    try {
                        consumerVertex.scheduleForExecution(consumerVertex.getExecutionGraph().getSlotProvider(), consumerVertex.getExecutionGraph().isQueuedSchedulingAllowed());
                    } catch (Throwable t) {
                        consumerVertex.fail(new IllegalStateException("Could not schedule consumer " + "vertex " + consumerVertex, t));
                    }
                    return null;
                }
            }, executor);
            // double check to resolve race conditions
            if (consumerVertex.getExecutionState() == RUNNING) {
                consumerVertex.sendPartitionInfos();
            }
        } else {
            // ----------------------------------------------------------------
            // Consumer is running => send update message now
            // ----------------------------------------------------------------
            if (consumerState == RUNNING) {
                final SimpleSlot consumerSlot = consumer.getAssignedResource();
                if (consumerSlot == null) {
                    // The consumer has been reset concurrently
                    continue;
                }
                final TaskManagerLocation partitionTaskManagerLocation = partition.getProducer().getCurrentAssignedResource().getTaskManagerLocation();
                final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();
                final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
                final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), attemptId);
                final ResultPartitionLocation partitionLocation;
                if (consumerTaskManager.equals(partitionTaskManager)) {
                    // Consuming task is deployed to the same instance as the partition => local
                    partitionLocation = ResultPartitionLocation.createLocal();
                } else {
                    // Different instances => remote
                    final ConnectionID connectionId = new ConnectionID(partitionTaskManagerLocation, partition.getIntermediateResult().getConnectionIndex());
                    partitionLocation = ResultPartitionLocation.createRemote(connectionId);
                }
                final InputChannelDeploymentDescriptor descriptor = new InputChannelDeploymentDescriptor(partitionId, partitionLocation);
                consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(new PartitionInfo(partition.getIntermediateResult().getId(), descriptor)));
            }
            // ----------------------------------------------------------------
            else if (consumerState == SCHEDULED || consumerState == DEPLOYING) {
                final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
                consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
                // double check to resolve race conditions
                if (consumerVertex.getExecutionState() == RUNNING) {
                    consumerVertex.sendPartitionInfos();
                }
            }
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) CoLocationConstraint(org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint) TimeoutException(java.util.concurrent.TimeoutException) JobException(org.apache.flink.runtime.JobException) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) ResultPartitionLocation(org.apache.flink.runtime.deployment.ResultPartitionLocation) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) PartialInputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.PartialInputChannelDeploymentDescriptor) InputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)
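
scheduleOrUpdateConsumers reacts to the consumer's ExecutionState in three ways: CREATED consumers are scheduled and the partition info is cached, RUNNING consumers get an update RPC immediately, and SCHEDULED or DEPLOYING consumers only get the cached info plus a re-check for races. A small sketch that names these reactions explicitly; the Reaction enum and the reactionFor helper are illustrative only, not Flink API.

import org.apache.flink.runtime.execution.ExecutionState;

public final class ConsumerUpdateDispatch {

    /** The reactions taken in scheduleOrUpdateConsumers, reduced to an enum for illustration. */
    enum Reaction { SCHEDULE_AND_CACHE, SEND_UPDATE_RPC, CACHE_ONLY, IGNORE }

    /** Maps a consumer's ExecutionState to the reaction taken in the method above. */
    static Reaction reactionFor(ExecutionState consumerState) {
        switch (consumerState) {
            case CREATED:
                return Reaction.SCHEDULE_AND_CACHE;  // deploy the consumer, cache partition info for later
            case RUNNING:
                return Reaction.SEND_UPDATE_RPC;     // consumer is live, push the partition info directly
            case SCHEDULED:
            case DEPLOYING:
                return Reaction.CACHE_ONLY;          // deployment in flight, cache and re-check for races
            default:
                return Reaction.IGNORE;              // canceled/failed consumers get nothing
        }
    }

    private ConsumerUpdateDispatch() {}
}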

Example 18 with ExecutionState

Use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

From the class Execution, method markFinished.

void markFinished(Map<String, Accumulator<?, ?>> userAccumulators, IOMetrics metrics) {
    // this call usually comes during RUNNING, but may also come while still in deploying (very fast tasks!)
    while (true) {
        ExecutionState current = this.state;
        if (current == RUNNING || current == DEPLOYING) {
            if (transitionState(current, FINISHED)) {
                try {
                    for (IntermediateResultPartition finishedPartition : getVertex().finishAllBlockingPartitions()) {
                        IntermediateResultPartition[] allPartitions = finishedPartition.getIntermediateResult().getPartitions();
                        for (IntermediateResultPartition partition : allPartitions) {
                            scheduleOrUpdateConsumers(partition.getConsumers());
                        }
                    }
                    updateAccumulatorsAndMetrics(userAccumulators, metrics);
                    assignedResource.releaseSlot();
                    vertex.getExecutionGraph().deregisterExecution(this);
                } finally {
                    vertex.executionFinished();
                }
                return;
            }
        } else if (current == CANCELING) {
            // we sent a cancel call, and the task manager finished before it arrived. We
            // will never get a CANCELED call back from the job manager
            cancelingComplete(userAccumulators, metrics);
            return;
        } else if (current == CANCELED || current == FAILED) {
            if (LOG.isDebugEnabled()) {
                LOG.debug("Task FINISHED, but concurrently went to state " + state);
            }
            return;
        } else {
            // this should not happen, we need to fail this
            markFailed(new Exception("Vertex received FINISHED message while being in state " + state));
            return;
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TimeoutException(java.util.concurrent.TimeoutException) JobException(org.apache.flink.runtime.JobException)
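
markFinished relies on transitionState, which behaves like a compare-and-swap on the execution's state field, so the surrounding while (true) loop simply retries when a concurrent transition wins the race. A stripped-down sketch of the same retry pattern, assuming a plain AtomicReference in place of Flink's internals (the class and method names are hypothetical):

import java.util.concurrent.atomic.AtomicReference;

import org.apache.flink.runtime.execution.ExecutionState;

public class StateTransitionSketch {

    private final AtomicReference<ExecutionState> state =
            new AtomicReference<>(ExecutionState.DEPLOYING);

    /**
     * Read the current state, attempt an atomic transition to FINISHED, and loop
     * if another thread changed the state in between.
     */
    boolean tryMarkFinished() {
        while (true) {
            ExecutionState current = state.get();
            if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) {
                if (state.compareAndSet(current, ExecutionState.FINISHED)) {
                    return true;   // we own the FINISHED transition; release resources here
                }
                // CAS failed: re-read the state and retry
            } else {
                return false;      // CANCELING/CANCELED/FAILED are handled by other code paths
            }
        }
    }
}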

Example 19 with ExecutionState

Use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

From the class CheckpointCoordinatorTest, method testPeriodicSchedulingWithInactiveTasks.

@Test
public void testPeriodicSchedulingWithInactiveTasks() {
    try {
        final JobID jid = new JobID();
        // create some mock execution vertices and trigger some checkpoint
        final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();
        ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
        ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);
        final AtomicReference<ExecutionState> currentState = new AtomicReference<>(ExecutionState.CREATED);
        when(triggerVertex.getCurrentExecutionAttempt().getState()).thenAnswer(new Answer<ExecutionState>() {

            @Override
            public ExecutionState answer(InvocationOnMock invocation) {
                return currentState.get();
            }
        });
        CheckpointCoordinator coord = new CheckpointCoordinator(
            jid,
            10,        // periodic interval is 10 ms
            200000,    // timeout is very long (200 s)
            0L,        // no extra delay
            2,         // max two concurrent checkpoints
            ExternalizedCheckpointSettings.none(),
            new ExecutionVertex[] { triggerVertex },
            new ExecutionVertex[] { ackVertex },
            new ExecutionVertex[] { commitVertex },
            new StandaloneCheckpointIDCounter(),
            new StandaloneCompletedCheckpointStore(2),
            null,
            Executors.directExecutor());
        coord.startCheckpointScheduler();
        // no checkpoint should have started so far
        Thread.sleep(200);
        assertEquals(0, coord.getNumberOfPendingCheckpoints());
        // now move the state to RUNNING
        currentState.set(ExecutionState.RUNNING);
        // the coordinator should start checkpointing now
        final long timeout = System.currentTimeMillis() + 10000;
        do {
            Thread.sleep(20);
        } while (System.currentTimeMillis() < timeout && coord.getNumberOfPendingCheckpoints() == 0);
        assertTrue(coord.getNumberOfPendingCheckpoints() > 0);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) AtomicReference(java.util.concurrent.atomic.AtomicReference) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) IOException(java.io.IOException) InvocationOnMock(org.mockito.invocation.InvocationOnMock) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)
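
The busy-wait at the end of the test (sleep in 20 ms steps until a deadline passes or a pending checkpoint shows up) is a recurring pattern in these tests. A reusable sketch, assuming a hypothetical helper named waitUntil that is not part of Flink's test utilities:

import java.util.function.BooleanSupplier;

public final class TestWaitUtil {

    /** Polls the condition in small steps until it holds or the timeout expires. */
    static void waitUntil(BooleanSupplier condition, long timeoutMillis) throws InterruptedException {
        final long deadline = System.currentTimeMillis() + timeoutMillis;
        while (System.currentTimeMillis() < deadline && !condition.getAsBoolean()) {
            Thread.sleep(20);   // poll in small steps instead of one long sleep
        }
    }

    private TestWaitUtil() {}
}

With such a helper the do/while loop in the test would shrink to waitUntil(() -> coord.getNumberOfPendingCheckpoints() > 0, 10000); followed by the existing assertTrue.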

Example 20 with ExecutionState

Use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

From the class TaskAsyncCallTest, method testCheckpointCallsInOrder.

// ------------------------------------------------------------------------
//  Tests 
// ------------------------------------------------------------------------
@Test
public void testCheckpointCallsInOrder() {
    try {
        Task task = createTask();
        task.startTaskThread();
        awaitLatch.await();
        for (int i = 1; i <= NUM_CALLS; i++) {
            task.triggerCheckpointBarrier(i, 156865867234L, CheckpointOptions.forFullCheckpoint());
        }
        triggerLatch.await();
        assertFalse(task.isCanceledOrFailed());
        ExecutionState currentState = task.getExecutionState();
        if (currentState != ExecutionState.RUNNING && currentState != ExecutionState.FINISHED) {
            fail("Task should be RUNNING or FINISHED, but is " + currentState);
        }
        task.cancelExecution();
        task.getExecutingThread().join();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) StatefulTask(org.apache.flink.runtime.jobgraph.tasks.StatefulTask) Test(org.junit.Test)
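
The manual RUNNING-or-FINISHED check followed by fail(...) can be expressed as a small varargs assertion. The following is a sketch under the assumption that such a helper (assertStateIn, not part of Flink or JUnit) lives next to the test:

import java.util.Arrays;

import org.apache.flink.runtime.execution.ExecutionState;

import static org.junit.Assert.fail;

public final class StateAssertions {

    /** Fails the calling test unless the actual state is one of the expected states. */
    static void assertStateIn(ExecutionState actual, ExecutionState... expected) {
        for (ExecutionState candidate : expected) {
            if (actual == candidate) {
                return;
            }
        }
        fail("Expected state in " + Arrays.toString(expected) + ", but was " + actual);
    }

    private StateAssertions() {}
}

The check in the test above would then read assertStateIn(task.getExecutionState(), ExecutionState.RUNNING, ExecutionState.FINISHED);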

Aggregations

ExecutionState (org.apache.flink.runtime.execution.ExecutionState) 26
Test (org.junit.Test) 11
ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID) 6
TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation) 6
JsonGenerator (com.fasterxml.jackson.core.JsonGenerator) 5
IOException (java.io.IOException) 5
StringWriter (java.io.StringWriter) 5
TimeoutException (java.util.concurrent.TimeoutException) 5
JobID (org.apache.flink.api.common.JobID) 5
AccessExecutionVertex (org.apache.flink.runtime.executiongraph.AccessExecutionVertex) 5
ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID) 4
HashMap (java.util.HashMap) 3
JobException (org.apache.flink.runtime.JobException) 3
ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID) 3
ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex) 3
IntermediateResultPartition (org.apache.flink.runtime.executiongraph.IntermediateResultPartition) 3
SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot) 3
ConnectionID (org.apache.flink.runtime.io.network.ConnectionID) 3
IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID) 3
MutableIOMetrics (org.apache.flink.runtime.webmonitor.utils.MutableIOMetrics) 3