Search in sources:

Example 21 with ExecutionState

use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

the class TaskExecutionStateTest method testSerialization.

/**
 * Verifies that a {@link TaskExecutionState} survives a Java serialization round trip:
 * the copy must be equal to the original (checked in both directions) and must have
 * the same hash code. Both constructor variants are covered, with and without an
 * attached error.
 *
 * @throws Exception if creating the serialized copy fails; the exception is propagated
 *                   so the test framework reports it with its complete stack trace
 */
@Test
public void testSerialization() throws Exception {
    final JobID jid = new JobID();
    final ExecutionAttemptID executionId = new ExecutionAttemptID();
    final ExecutionState state = ExecutionState.DEPLOYING;
    final Throwable error = new IOException("fubar");
    // one instance carrying an error, one without
    final TaskExecutionState original1 = new TaskExecutionState(jid, executionId, state, error);
    final TaskExecutionState original2 = new TaskExecutionState(jid, executionId, state);
    final TaskExecutionState javaSerCopy1 = CommonTestUtils.createCopySerializable(original1);
    final TaskExecutionState javaSerCopy2 = CommonTestUtils.createCopySerializable(original2);
    // equalities (checked in both directions)
    assertEquals(original1, javaSerCopy1);
    assertEquals(javaSerCopy1, original1);
    assertEquals(original2, javaSerCopy2);
    assertEquals(javaSerCopy2, original2);
    // hash codes
    assertEquals(original1.hashCode(), javaSerCopy1.hashCode());
    assertEquals(original2.hashCode(), javaSerCopy2.hashCode());
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) IOException(java.io.IOException) JobID(org.apache.flink.api.common.JobID) IOException(java.io.IOException) Test(org.junit.Test)

Example 22 with ExecutionState

use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

the class TaskTest method validateCancelingAndCanceledListenerMessage.

/**
 * Takes the next two messages from the listener queue and checks that both refer to
 * the given task and that, together, they report the states CANCELING and CANCELED
 * (in either order).
 *
 * @param task the task whose job ID and execution ID the two messages must carry
 */
private void validateCancelingAndCanceledListenerMessage(Task task) {
    try {
        // take() waits until a message is available, which gives the actors time to
        // receive the message and put it into the queue
        TaskMessages.UpdateTaskExecutionState firstUpdate = (TaskMessages.UpdateTaskExecutionState) listenerMessages.take();
        TaskMessages.UpdateTaskExecutionState secondUpdate = (TaskMessages.UpdateTaskExecutionState) listenerMessages.take();
        assertNotNull("There is no additional listener message", firstUpdate);
        assertNotNull("There is no additional listener message", secondUpdate);

        TaskExecutionState firstTaskState = firstUpdate.taskExecutionState();
        TaskExecutionState secondTaskState = secondUpdate.taskExecutionState();

        // both messages must belong to the given task
        assertEquals(task.getJobID(), firstTaskState.getJobID());
        assertEquals(task.getJobID(), secondTaskState.getJobID());
        assertEquals(task.getExecutionId(), firstTaskState.getID());
        assertEquals(task.getExecutionId(), secondTaskState.getID());

        ExecutionState firstState = firstTaskState.getExecutionState();
        ExecutionState secondState = secondTaskState.getExecutionState();

        // it may be (very rarely) that the following race happens:
        //  - OUTSIDE THREAD: call to cancel()
        //  - OUTSIDE THREAD: atomic state change from running to canceling
        //  - TASK THREAD: finishes, atomic change from canceling to canceled
        //  - TASK THREAD: send notification that state is canceled
        //  - OUTSIDE THREAD: send notification that state is canceling
        // for that reason, we allow the notification messages in any order.
        boolean cancelingThenCanceled = firstState == ExecutionState.CANCELING && secondState == ExecutionState.CANCELED;
        boolean canceledThenCanceling = secondState == ExecutionState.CANCELING && firstState == ExecutionState.CANCELED;
        assertTrue(cancelingThenCanceled || canceledThenCanceling);
    } catch (InterruptedException e) {
        fail("interrupted");
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TaskMessages(org.apache.flink.runtime.messages.TaskMessages)

Example 23 with ExecutionState

use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

the class StreamTaskTest method testEarlyCanceling.

/**
 * Checks that a cancel call which is issued before the operator has been
 * instantiated still leads to proper canceling of the task.
 */
@Test
public void testEarlyCanceling() throws Exception {
    final Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

    // configure a source task around an operator that is slow to deserialize
    final StreamConfig streamConfig = new StreamConfig(new Configuration());
    streamConfig.setStreamOperator(new SlowlyDeserializingOperator());
    streamConfig.setTimeCharacteristic(TimeCharacteristic.ProcessingTime);

    final Task task = createTask(SourceStreamTask.class, streamConfig, new Configuration());
    final TestingExecutionStateListener stateListener = new TestingExecutionStateListener();
    task.registerExecutionListener(stateListener);
    task.startTaskThread();

    // wait until the task thread reached state RUNNING
    final Future<ExecutionState> runningFuture = stateListener.notifyWhenExecutionState(ExecutionState.RUNNING);
    final ExecutionState stateAfterStart = Await.result(runningFuture, deadline.timeLeft());

    // make sure the task is really running
    if (stateAfterStart != ExecutionState.RUNNING) {
        fail("Task entered state " + task.getExecutionState() + " with error " + ExceptionUtils.stringifyException(task.getFailureCause()));
    }

    // send a cancel. because the operator takes a long time to deserialize, this should
    // hit the task before the operator is deserialized
    task.cancelExecution();

    final Future<ExecutionState> cancelingFuture = stateListener.notifyWhenExecutionState(ExecutionState.CANCELING);
    final ExecutionState stateAfterCancel = Await.result(cancelingFuture, deadline.timeLeft());

    // the reported state must already be CANCELING or CANCELED at this point
    final boolean cancelObserved = stateAfterCancel == ExecutionState.CANCELING || stateAfterCancel == ExecutionState.CANCELED;
    assertTrue(cancelObserved);

    // the task thread has to terminate within the remaining time, ending in CANCELED
    final Thread taskThread = task.getExecutingThread();
    taskThread.join(deadline.timeLeft().toMillis());
    assertFalse("Task did not cancel", taskThread.isAlive());
    assertEquals(ExecutionState.CANCELED, task.getExecutionState());
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TaskExecutionState(org.apache.flink.runtime.taskmanager.TaskExecutionState) Task(org.apache.flink.runtime.taskmanager.Task) Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) StreamConfig(org.apache.flink.streaming.api.graph.StreamConfig) Test(org.junit.Test) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest)

Example 24 with ExecutionState

use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

the class Task method cancelOrFailAndCancelInvokable.

/**
 * Moves the task into the given target state and, if the task is already RUNNING,
 * triggers cancellation of the invokable from a separate daemon thread.
 *
 * <p>The method loops: whenever a {@code transitionState(...)} call returns false,
 * the current state is re-read and the decision is made again.
 *
 * @param targetState the state to transition to
 * @param cause the cause handed to {@code transitionState}, stored as the failure cause
 *              and wrapped into the exception passed to the observers (the original
 *              passes it through unchecked)
 * @throws IllegalStateException if the current state is neither terminal, CANCELING,
 *                               DEPLOYING, CREATED nor RUNNING
 */
private void cancelOrFailAndCancelInvokable(ExecutionState targetState, Throwable cause) {
    while (true) {
        // read the state once per iteration; all decisions below use this snapshot
        ExecutionState current = executionState;
        // if the task already reached a terminal state or is already being canceled,
        // then we need not do anything
        if (current.isTerminal() || current == ExecutionState.CANCELING) {
            LOG.info("Task {} is already in state {}", taskNameWithSubtask, current);
            return;
        }
        if (current == ExecutionState.DEPLOYING || current == ExecutionState.CREATED) {
            if (transitionState(current, targetState, cause)) {
                // if we manage this state transition, then the invokable gets never called
                // we need not call cancel on it
                this.failureCause = cause;
                notifyObservers(targetState, new Exception(String.format("Cancel or fail execution of %s (%s).", taskNameWithSubtask, executionId), cause));
                return;
            }
            // transition failed: fall through and retry with the re-read state
        } else if (current == ExecutionState.RUNNING) {
            if (transitionState(ExecutionState.RUNNING, targetState, cause)) {
                // we need to cancel the invokable
                // the compareAndSet ensures that only one caller starts the canceler
                if (invokable != null && invokableHasBeenCanceled.compareAndSet(false, true)) {
                    this.failureCause = cause;
                    notifyObservers(targetState, new Exception(String.format("Cancel or fail execution of %s (%s).", taskNameWithSubtask, executionId), cause));
                    LOG.info("Triggering cancellation of task code {} ({}).", taskNameWithSubtask, executionId);
                    // because the canceling may block on user code, we cancel from a separate thread
                    // we do not reuse the async call handler, because that one may be blocked, in which
                    // case the canceling could not continue
                    // The canceller calls cancel and interrupts the executing thread once
                    Runnable canceler = new TaskCanceler(LOG, invokable, executingThread, taskNameWithSubtask, taskCancellationInterval, taskCancellationTimeout, taskManagerActions, producedPartitions, inputGates);
                    // the canceler runs as a daemon thread in the thread group of the executing thread
                    Thread cancelThread = new Thread(executingThread.getThreadGroup(), canceler, String.format("Canceler for %s (%s).", taskNameWithSubtask, executionId));
                    cancelThread.setDaemon(true);
                    cancelThread.start();
                }
                // the state transition succeeded, so we are done even if no canceler was started
                return;
            }
            // transition failed: fall through and retry with the re-read state
        } else {
            throw new IllegalStateException(String.format("Unexpected state: %s of task %s (%s).", current, taskNameWithSubtask, executionId));
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) CheckpointDeclineTaskNotCheckpointingException(org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineTaskNotCheckpointingException) TimeoutException(java.util.concurrent.TimeoutException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) CheckpointDeclineTaskNotReadyException(org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineTaskNotReadyException) IOException(java.io.IOException)

Example 25 with ExecutionState

use of org.apache.flink.runtime.execution.ExecutionState in project flink by apache.

the class Task method run.

/**
 * The core work method that bootstraps the task and executes its code.
 *
 * <p>The method proceeds in these phases:
 * <ol>
 *   <li>initial state transition (CREATED to DEPLOYING), returning early if the task
 *       was already failed or canceled,</li>
 *   <li>bootstrap: user-code classloader, execution config, invokable, network
 *       registration, distributed cache files, environment and initial state,</li>
 *   <li>switch to RUNNING and call {@code invoke()} on the invokable,</li>
 *   <li>finish the produced partitions and switch to FINISHED,</li>
 *   <li>on any {@code Throwable}: transition into CANCELED or FAILED,</li>
 *   <li>in all cases: release the acquired resources and notify about the final state.</li>
 * </ol>
 */
@Override
public void run() {
    // ----------------------------
    //  Initial state transition
    // ----------------------------
    // loop until the transition succeeds or the task turns out to be failed/canceled
    while (true) {
        ExecutionState current = this.executionState;
        if (current == ExecutionState.CREATED) {
            if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
                // success, we can start our work
                break;
            }
        } else if (current == ExecutionState.FAILED) {
            // we were immediately failed. tell the TaskManager that we reached our final state
            notifyFinalState();
            return;
        } else if (current == ExecutionState.CANCELING) {
            if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
                // we were immediately canceled. tell the TaskManager that we reached our final state
                notifyFinalState();
                return;
            }
        } else {
            throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
        }
    }
    // all resource acquisitions and registrations from here on
    // need to be undone in the end
    Map<String, Future<Path>> distributedCacheEntries = new HashMap<String, Future<Path>>();
    // local reference; it is published to the field this.invokable only right before
    // the switch to RUNNING, and the cleanup in the finally block uses this local
    AbstractInvokable invokable = null;
    ClassLoader userCodeClassLoader;
    try {
        // ----------------------------
        //  Task Bootstrap - We periodically
        //  check for canceling as a shortcut
        // ----------------------------
        // activate safety net for task thread
        LOG.info("Creating FileSystem stream leak safety net for task {}", this);
        FileSystemSafetyNet.initializeSafetyNetForThread();
        // first of all, get a user-code classloader
        // this may involve downloading the job's JAR files and/or classes
        LOG.info("Loading JAR files for task {}.", this);
        userCodeClassLoader = createUserCodeClassloader(libraryCache);
        final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader);
        if (executionConfig.getTaskCancellationInterval() >= 0) {
            // override task cancellation interval from Flink config if set in ExecutionConfig
            taskCancellationInterval = executionConfig.getTaskCancellationInterval();
        }
        if (executionConfig.getTaskCancellationTimeout() >= 0) {
            // override task cancellation timeout from Flink config if set in ExecutionConfig
            taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
        }
        // now load the task's invokable code
        invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass);
        // shortcut: stop bootstrapping if the task was canceled or failed in the meantime
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // ----------------------------------------------------------------
        // register the task with the network stack
        // this operation may fail if the system does not have enough
        // memory to run the necessary data exchanges
        // the registration must also strictly be undone
        // ----------------------------------------------------------------
        LOG.info("Registering task at network: {}.", this);
        network.registerTask(this);
        // next, kick off the background copying of files for the distributed cache
        try {
            for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
                LOG.info("Obtaining local cache file for '{}'.", entry.getKey());
                Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId);
                distributedCacheEntries.put(entry.getKey(), cp);
            }
        } catch (Exception e) {
            // wrap the exception to add the task name and execution ID as context
            throw new Exception(String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
        }
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // ----------------------------------------------------------------
        //  call the user code initialization methods
        // ----------------------------------------------------------------
        TaskKvStateRegistry kvStateRegistry = network.createKvStateTaskRegistry(jobId, getJobVertexId());
        Environment env = new RuntimeEnvironment(jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration, userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, accumulatorRegistry, kvStateRegistry, inputSplitProvider, distributedCacheEntries, writers, inputGates, checkpointResponder, taskManagerConfig, metrics, this);
        // let the task code create its readers and writers
        invokable.setEnvironment(env);
        // hand the initial state to the invokable; only stateful tasks can accept it
        if (null != taskStateHandles) {
            if (invokable instanceof StatefulTask) {
                StatefulTask op = (StatefulTask) invokable;
                op.setInitialState(taskStateHandles);
            } else {
                throw new IllegalStateException("Found operator state for a non-stateful task invokable");
            }
            // be memory and GC friendly - since the code stays in invoke() for a potentially long time,
            // we clear the reference to the state handle
            //noinspection UnusedAssignment
            taskStateHandles = null;
        }
        // ----------------------------------------------------------------
        //  actual task core work
        // ----------------------------------------------------------------
        // we must make strictly sure that the invokable is accessible to the cancel() call
        // by the time we switched to running.
        this.invokable = invokable;
        // switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime
        if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) {
            throw new CancelTaskException();
        }
        // notify everyone that we switched to running
        notifyObservers(ExecutionState.RUNNING, null);
        taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING));
        // make sure the user code classloader is accessible thread-locally
        executingThread.setContextClassLoader(userCodeClassLoader);
        // run the invokable
        invokable.invoke();
        // make sure we enter the catch block if the task leaves the invoke() method due
        // to the fact that it has been canceled
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // finish the produced partitions. if this fails, we consider the execution failed.
        for (ResultPartition partition : producedPartitions) {
            if (partition != null) {
                partition.finish();
            }
        }
        // try to mark the task as finished
        // if that fails, the task was canceled/failed in the meantime
        if (transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
            notifyObservers(ExecutionState.FINISHED, null);
        } else {
            throw new CancelTaskException();
        }
    } catch (Throwable t) {
        try {
            // check if the exception is unrecoverable
            if (ExceptionUtils.isJvmFatalError(t) || (t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError())) {
                // don't attempt a clean shutdown, because we cannot expect the clean shutdown to complete
                try {
                    LOG.error("Encountered fatal error {} - terminating the JVM", t.getClass().getName(), t);
                } finally {
                    // halt even if the logging call above throws
                    Runtime.getRuntime().halt(-1);
                }
            }
            // transition into the final state. a failed transitionState(...) call re-reads
            // the state and tries again; presumably needed because the state can be changed
            // concurrently, e.g. via calls to cancel() or to failExternally()
            while (true) {
                ExecutionState current = this.executionState;
                if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) {
                    if (t instanceof CancelTaskException) {
                        // the task was canceled from within: go straight to CANCELED
                        if (transitionState(current, ExecutionState.CANCELED)) {
                            cancelInvokable();
                            notifyObservers(ExecutionState.CANCELED, null);
                            break;
                        }
                    } else {
                        if (transitionState(current, ExecutionState.FAILED, t)) {
                            // proper failure of the task. record the exception as the root cause
                            String errorMessage = String.format("Execution of %s (%s) failed.", taskNameWithSubtask, executionId);
                            failureCause = t;
                            cancelInvokable();
                            notifyObservers(ExecutionState.FAILED, new Exception(errorMessage, t));
                            break;
                        }
                    }
                } else if (current == ExecutionState.CANCELING) {
                    // canceling was already triggered elsewhere; only complete the transition
                    if (transitionState(current, ExecutionState.CANCELED)) {
                        notifyObservers(ExecutionState.CANCELED, null);
                        break;
                    }
                } else if (current == ExecutionState.FAILED) {
                    // in state failed already, no transition necessary any more
                    break;
                } else // unexpected state, go to failed
                if (transitionState(current, ExecutionState.FAILED, t)) {
                    LOG.error("Unexpected state in task {} ({}) during an exception: {}.", taskNameWithSubtask, executionId, current);
                    break;
                }
            // else fall through the loop and retry with the re-read state
            }
        } catch (Throwable tt) {
            // a failure inside the exception handling itself is reported as a fatal error
            String message = String.format("FATAL - exception in exception handler of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, tt);
            notifyFatalError(message, tt);
        }
    } finally {
        try {
            LOG.info("Freeing task resources for {} ({}).", taskNameWithSubtask, executionId);
            // stop the async dispatcher.
            // copy dispatcher reference to stack, against concurrent release
            ExecutorService dispatcher = this.asyncCallDispatcher;
            if (dispatcher != null && !dispatcher.isShutdown()) {
                dispatcher.shutdownNow();
            }
            // free the network resources
            network.unregisterTask(this);
            // free memory resources
            // (the local invokable is still null if loading the invokable never succeeded)
            if (invokable != null) {
                memoryManager.releaseAll(invokable);
            }
            // remove all of the tasks library resources
            libraryCache.unregisterTask(jobId, executionId);
            // remove all files in the distributed cache
            removeCachedFiles(distributedCacheEntries, fileCache);
            // close and de-activate safety net for task thread
            LOG.info("Ensuring all FileSystem streams are closed for task {}", this);
            FileSystemSafetyNet.closeSafetyNetAndGuardedResourcesForThread();
            // report the final state only after all resources above have been released
            notifyFinalState();
        } catch (Throwable t) {
            // an error in the resource cleanup is fatal
            String message = String.format("FATAL - exception in resource cleanup of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, t);
            notifyFatalError(message, t);
        }
        // un-register the metrics at the end so that the task may already be
        // counted as finished when this happens
        // errors here will only be logged
        try {
            metrics.close();
        } catch (Throwable t) {
            LOG.error("Error during metrics de-registration of task {} ({}).", taskNameWithSubtask, executionId, t);
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) HashMap(java.util.HashMap) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) Path(org.apache.flink.core.fs.Path) CheckpointDeclineTaskNotCheckpointingException(org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineTaskNotCheckpointingException) TimeoutException(java.util.concurrent.TimeoutException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) CheckpointDeclineTaskNotReadyException(org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineTaskNotReadyException) IOException(java.io.IOException) ResultPartition(org.apache.flink.runtime.io.network.partition.ResultPartition) StatefulTask(org.apache.flink.runtime.jobgraph.tasks.StatefulTask) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) Environment(org.apache.flink.runtime.execution.Environment) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

ExecutionState (org.apache.flink.runtime.execution.ExecutionState)26 Test (org.junit.Test)11 ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID)6 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)6 JsonGenerator (com.fasterxml.jackson.core.JsonGenerator)5 IOException (java.io.IOException)5 StringWriter (java.io.StringWriter)5 TimeoutException (java.util.concurrent.TimeoutException)5 JobID (org.apache.flink.api.common.JobID)5 AccessExecutionVertex (org.apache.flink.runtime.executiongraph.AccessExecutionVertex)5 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)4 HashMap (java.util.HashMap)3 JobException (org.apache.flink.runtime.JobException)3 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)3 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)3 IntermediateResultPartition (org.apache.flink.runtime.executiongraph.IntermediateResultPartition)3 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)3 ConnectionID (org.apache.flink.runtime.io.network.ConnectionID)3 IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID)3 MutableIOMetrics (org.apache.flink.runtime.webmonitor.utils.MutableIOMetrics)3