use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.
the class InputChannelDeploymentDescriptor method fromEdges.
// ------------------------------------------------------------------------
/**
 * Builds one input channel deployment descriptor per execution edge, i.e. per consumed
 * result partition.
 *
 * <p>The location of each consumed partition is resolved as follows:
 * <ul>
 *   <li>If the partition is consumable, its producer has a slot assigned and the producer is
 *       in state SCHEDULED, DEPLOYING, RUNNING or FINISHED, the location is <i>local</i> when
 *       producer and consumer share a TaskManager and <i>remote</i> otherwise.</li>
 *   <li>Otherwise, if lazy deployment is allowed, the location is <i>unknown</i>.</li>
 *   <li>Otherwise an {@link ExecutionGraphException} is thrown.</li>
 * </ul>
 *
 * @param edges the edges whose source partitions are consumed
 * @param consumerSlot the slot of the consuming task; only its TaskManager ID is read
 * @param allowLazyDeployment whether an unresolvable partition location may be reported as unknown
 * @return the descriptors, index-aligned with {@code edges}
 * @throws ExecutionGraphException if a partition location cannot be resolved and lazy
 *         deployment is not allowed
 */
public static InputChannelDeploymentDescriptor[] fromEdges(ExecutionEdge[] edges, SimpleSlot consumerSlot, boolean allowLazyDeployment) throws ExecutionGraphException {
    final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
    final InputChannelDeploymentDescriptor[] descriptors = new InputChannelDeploymentDescriptor[edges.length];

    // Every edge points at a different result partition.
    for (int idx = 0; idx < edges.length; idx++) {
        final IntermediateResultPartition consumedPartition = edges[idx].getSource();
        final Execution producer = consumedPartition.getProducer().getCurrentExecutionAttempt();
        final ExecutionState producerState = producer.getState();
        final SimpleSlot producerSlot = producer.getAssignedResource();

        // Producer states in which the partition location can be derived from the producer slot.
        final boolean producerStateAccepted = producerState == ExecutionState.RUNNING
                || producerState == ExecutionState.FINISHED
                || producerState == ExecutionState.SCHEDULED
                || producerState == ExecutionState.DEPLOYING;

        final ResultPartitionLocation location;
        if (consumedPartition.isConsumable() && producerSlot != null && producerStateAccepted) {
            final TaskManagerLocation producerTaskManagerLocation = producerSlot.getTaskManagerLocation();
            final ResourceID producerTaskManager = producerTaskManagerLocation.getResourceID();

            if (producerTaskManager.equals(consumerTaskManager)) {
                // Producer and consumer live on the same TaskManager => local channel
                location = ResultPartitionLocation.createLocal();
            } else {
                // Different TaskManagers => remote channel
                final int connectionIndex = consumedPartition.getIntermediateResult().getConnectionIndex();
                location = ResultPartitionLocation.createRemote(new ConnectionID(producerTaskManagerLocation, connectionIndex));
            }
        } else if (allowLazyDeployment) {
            // The producing task might not have registered the partition yet
            location = ResultPartitionLocation.createUnknown();
        } else {
            final boolean producerCanceledOrFailed = producerState == ExecutionState.CANCELING
                    || producerState == ExecutionState.CANCELED
                    || producerState == ExecutionState.FAILED;

            final String message;
            if (producerCanceledOrFailed) {
                message = "Trying to schedule a task whose inputs were canceled or failed. The producer is in state " + producerState + ".";
            } else {
                message = String.format("Trying to eagerly schedule a task whose inputs are not ready (partition consumable? %s, producer state: %s, producer slot: %s).", consumedPartition.isConsumable(), producerState, producerSlot);
            }
            throw new ExecutionGraphException(message);
        }

        descriptors[idx] = new InputChannelDeploymentDescriptor(new ResultPartitionID(consumedPartition.getPartitionId(), producer.getAttemptId()), location);
    }

    return descriptors;
}
use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.
the class Execution method scheduleOrUpdateConsumers.
/**
 * Schedules or updates the consumers of a result partition, depending on the state of each
 * consumer's current execution attempt:
 * <ul>
 *   <li>CREATED: the partition info is cached on the consumer vertex and the vertex is
 *       scheduled asynchronously on {@code executor}.</li>
 *   <li>RUNNING: the partition location is computed and sent to the consumer right away.</li>
 *   <li>SCHEDULED / DEPLOYING: the partition info is cached on the consumer vertex.</li>
 *   <li>Any other state: nothing is done for that consumer.</li>
 * </ul>
 *
 * <p>Only the first consumer group ({@code allConsumers.get(0)}) is processed.
 *
 * @param allConsumers the consumer groups of the partition; an empty list is a no-op
 */
void scheduleOrUpdateConsumers(List<List<ExecutionEdge>> allConsumers) {
final int numConsumers = allConsumers.size();
if (numConsumers > 1) {
// NOTE(review): there is no return after fail(), so the loop below still processes the
// first consumer group -- confirm that this is intended.
fail(new IllegalStateException("Currently, only a single consumer group per partition is supported."));
} else if (numConsumers == 0) {
// Nothing consumes this partition.
return;
}
for (ExecutionEdge edge : allConsumers.get(0)) {
final ExecutionVertex consumerVertex = edge.getTarget();
final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
// Snapshot of the consumer state; it may change concurrently, hence the double checks below.
final ExecutionState consumerState = consumer.getState();
final IntermediateResultPartition partition = edge.getSource();
// ----------------------------------------------------------------
// Consumer is created => cache the partition info and schedule the consumer
// ----------------------------------------------------------------
if (consumerState == CREATED) {
final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
// When deploying a consuming task, its task deployment descriptor will contain all
// deployment information available at the respective time. It is possible that some
// of the partitions to be consumed have not been created yet. These are updated at
// runtime via the update messages.
//
// TODO The current approach may send many update messages even though the consuming
// task has already been deployed with all necessary information. We have to check
// whether this is a problem and fix it, if it is.
FlinkFuture.supplyAsync(new Callable<Void>() {
@Override
public Void call() throws Exception {
try {
consumerVertex.scheduleForExecution(consumerVertex.getExecutionGraph().getSlotProvider(), consumerVertex.getExecutionGraph().isQueuedSchedulingAllowed());
} catch (Throwable t) {
// A scheduling failure is reported by failing the consumer vertex, not by rethrowing.
consumerVertex.fail(new IllegalStateException("Could not schedule consumer " + "vertex " + consumerVertex, t));
}
return null;
}
}, executor);
// double check to resolve race conditions: if the consumer is RUNNING by now,
// send the cached partition infos
if (consumerVertex.getExecutionState() == RUNNING) {
consumerVertex.sendPartitionInfos();
}
} else // Consumer is not in CREATED: update it now (RUNNING) or cache the info (SCHEDULED / DEPLOYING)
{
if (consumerState == RUNNING) {
// ----------------------------------------------------------------
// Consumer is running => send update message now
// ----------------------------------------------------------------
final SimpleSlot consumerSlot = consumer.getAssignedResource();
if (consumerSlot == null) {
// The consumer has been reset concurrently
continue;
}
// NOTE(review): the producer's assigned resource is dereferenced without a null check,
// unlike the consumer slot above -- confirm it cannot be null here.
final TaskManagerLocation partitionTaskManagerLocation = partition.getProducer().getCurrentAssignedResource().getTaskManagerLocation();
final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();
final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
// attemptId is not a local variable; presumably the attempt ID of this (producing)
// execution -- confirm against the enclosing class.
final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), attemptId);
final ResultPartitionLocation partitionLocation;
if (consumerTaskManager.equals(partitionTaskManager)) {
// Consuming task is deployed to the same instance as the partition => local
partitionLocation = ResultPartitionLocation.createLocal();
} else {
// Different instances => remote
final ConnectionID connectionId = new ConnectionID(partitionTaskManagerLocation, partition.getIntermediateResult().getConnectionIndex());
partitionLocation = ResultPartitionLocation.createRemote(connectionId);
}
final InputChannelDeploymentDescriptor descriptor = new InputChannelDeploymentDescriptor(partitionId, partitionLocation);
consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(new PartitionInfo(partition.getIntermediateResult().getId(), descriptor)));
} else // Consumer is scheduled or deploying => cache the partition info
if (consumerState == SCHEDULED || consumerState == DEPLOYING) {
final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
// double check to resolve race conditions: if the consumer is RUNNING by now,
// send the cached partition infos
if (consumerVertex.getExecutionState() == RUNNING) {
consumerVertex.sendPartitionInfos();
}
}
}
}
}
use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.
the class Execution method markFinished.
/**
 * Handles the notification that the task of this execution has finished.
 *
 * <p>Depending on the state observed at the time of the call:
 * <ul>
 *   <li>RUNNING / DEPLOYING: transitions to FINISHED, finishes all blocking partitions and
 *       schedules or updates their consumers, updates accumulators and metrics, releases the
 *       slot and deregisters the execution. The vertex is notified via
 *       {@code executionFinished()} even if one of these steps throws.</li>
 *   <li>CANCELING: completes the cancellation instead.</li>
 *   <li>CANCELED / FAILED: the notification is ignored (logged on debug level).</li>
 *   <li>Any other state: the execution is marked as failed.</li>
 * </ul>
 *
 * <p>If the transition to FINISHED does not succeed, the state is read again and the
 * decision is repeated.
 *
 * @param userAccumulators the final user accumulators reported for the task
 * @param metrics the final I/O metrics reported for the task
 */
void markFinished(Map<String, Accumulator<?, ?>> userAccumulators, IOMetrics metrics) {
    // this call usually comes during RUNNING, but may also come while still in deploying (very fast tasks!)
    while (true) {
        // Snapshot of the state; every decision and message below refers to this snapshot.
        final ExecutionState current = this.state;

        if (current == RUNNING || current == DEPLOYING) {
            if (transitionState(current, FINISHED)) {
                try {
                    for (IntermediateResultPartition finishedPartition : getVertex().finishAllBlockingPartitions()) {
                        IntermediateResultPartition[] allPartitions = finishedPartition.getIntermediateResult().getPartitions();
                        for (IntermediateResultPartition partition : allPartitions) {
                            scheduleOrUpdateConsumers(partition.getConsumers());
                        }
                    }
                    updateAccumulatorsAndMetrics(userAccumulators, metrics);
                    assignedResource.releaseSlot();
                    vertex.getExecutionGraph().deregisterExecution(this);
                } finally {
                    vertex.executionFinished();
                }
                return;
            }
            // The transition did not succeed: loop and decide again on the fresh state.
        } else if (current == CANCELING) {
            // we sent a cancel call, and the task manager finished before it arrived. We
            // will never get a CANCELED call back from the job manager
            cancelingComplete(userAccumulators, metrics);
            return;
        } else if (current == CANCELED || current == FAILED) {
            if (LOG.isDebugEnabled()) {
                // Report the snapshot the branch was chosen on, not a second read of the field.
                LOG.debug("Task FINISHED, but concurrently went to state " + current);
            }
            return;
        } else {
            // this should not happen, we need to fail this.
            // Use the snapshot: re-reading the field could report a state that has changed in
            // the meantime and that did not lead into this branch.
            markFailed(new Exception("Vertex received FINISHED message while being in state " + current));
            return;
        }
    }
}
use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.
the class CheckpointCoordinatorTest method testPeriodicSchedulingWithInactiveTasks.
/**
 * Checks that the periodic checkpoint scheduler does not start checkpoints while the task to
 * trigger is not RUNNING, and that it starts checkpointing once the task switches to RUNNING.
 */
@Test
public void testPeriodicSchedulingWithInactiveTasks() {
    try {
        final JobID jid = new JobID();

        // create some mock execution vertices and trigger some checkpoint
        final ExecutionAttemptID triggerAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID ackAttemptID = new ExecutionAttemptID();
        final ExecutionAttemptID commitAttemptID = new ExecutionAttemptID();

        ExecutionVertex triggerVertex = mockExecutionVertex(triggerAttemptID);
        ExecutionVertex ackVertex = mockExecutionVertex(ackAttemptID);
        ExecutionVertex commitVertex = mockExecutionVertex(commitAttemptID);

        // The state reported by the trigger vertex is driven by this reference.
        final AtomicReference<ExecutionState> currentState = new AtomicReference<>(ExecutionState.CREATED);
        when(triggerVertex.getCurrentExecutionAttempt().getState()).thenAnswer(new Answer<ExecutionState>() {
            @Override
            public ExecutionState answer(InvocationOnMock invocation) {
                return currentState.get();
            }
        });

        CheckpointCoordinator coord = new CheckpointCoordinator(
                jid,
                // periodic interval is 10 ms
                10,
                // timeout is very long (200 s)
                200000,
                // no extra delay
                0L,
                // max two concurrent checkpoints
                2,
                ExternalizedCheckpointSettings.none(),
                new ExecutionVertex[] { triggerVertex },
                new ExecutionVertex[] { ackVertex },
                new ExecutionVertex[] { commitVertex },
                new StandaloneCheckpointIDCounter(),
                new StandaloneCompletedCheckpointStore(2),
                null,
                Executors.directExecutor());

        // NOTE(review): the scheduler started here is never stopped by this test -- confirm
        // whether the coordinator should be shut down at the end.
        coord.startCheckpointScheduler();

        // no checkpoint should have started so far
        Thread.sleep(200);
        assertEquals(0, coord.getNumberOfPendingCheckpoints());

        // now move the state to RUNNING
        currentState.set(ExecutionState.RUNNING);

        // the coordinator should start checkpointing now; poll for at most 10 seconds
        final long timeout = System.currentTimeMillis() + 10000;
        do {
            Thread.sleep(20);
        } while (System.currentTimeMillis() < timeout && coord.getNumberOfPendingCheckpoints() == 0);

        assertTrue(coord.getNumberOfPendingCheckpoints() > 0);
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the test runner can observe the interruption.
        Thread.currentThread().interrupt();
        throw new AssertionError(e.getMessage(), e);
    } catch (Exception e) {
        // Fail the test and keep the original exception as the cause, so that its stack trace
        // shows up in the test report.
        throw new AssertionError(e.getMessage(), e);
    }
}
use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.
the class TaskAsyncCallTest method testCheckpointCallsInOrder.
// ------------------------------------------------------------------------
// Tests
// ------------------------------------------------------------------------
/**
 * Triggers {@code NUM_CALLS} checkpoint barriers on a started task and checks that the task
 * is neither canceled nor failed afterwards, but RUNNING or FINISHED.
 */
@Test
public void testCheckpointCallsInOrder() {
    try {
        Task task = createTask();
        task.startTaskThread();

        awaitLatch.await();

        for (int i = 1; i <= NUM_CALLS; i++) {
            task.triggerCheckpointBarrier(i, 156865867234L, CheckpointOptions.forFullCheckpoint());
        }

        triggerLatch.await();

        assertFalse(task.isCanceledOrFailed());

        ExecutionState currentState = task.getExecutionState();
        if (currentState != ExecutionState.RUNNING && currentState != ExecutionState.FINISHED) {
            fail("Task should be RUNNING or FINISHED, but is " + currentState);
        }

        // Clean up: cancel the task and wait for its thread to terminate.
        task.cancelExecution();
        task.getExecutingThread().join();
    } catch (InterruptedException e) {
        // Restore the interrupt flag so the test runner can observe the interruption.
        Thread.currentThread().interrupt();
        throw new AssertionError(e.getMessage(), e);
    } catch (Exception e) {
        // Fail the test and keep the original exception as the cause, so that its stack trace
        // shows up in the test report.
        throw new AssertionError(e.getMessage(), e);
    }
}
Aggregations