Search in sources :

Example 36 with CancelTaskException

use of org.apache.flink.runtime.execution.CancelTaskException in project flink by splunk.

the class RocksDBAsyncSnapshotTest method testCancelFullyAsyncCheckpoints.

/**
 * Checks that a task can be canceled while an asynchronous snapshot is still in flight: the
 * cancellation must not block, every checkpoint stream handed out by the blocking factory must
 * end up closed, and the task must terminate with a {@link CancelTaskException}.
 */
@Test
public void testCancelFullyAsyncCheckpoints() throws Exception {
    final OneInputStreamTaskTestHarness<String, String> harness =
            new OneInputStreamTaskTestHarness<>(
                    OneInputStreamTask::new,
                    BasicTypeInfo.STRING_TYPE_INFO,
                    BasicTypeInfo.STRING_TYPE_INFO);
    harness.setupOutputForSingletonOperatorChain();
    harness.configureForKeyedStream(value -> value, BasicTypeInfo.STRING_TYPE_INFO);

    final StreamConfig operatorConfig = harness.getStreamConfig();
    final File rocksDbDirectory = temporaryFolder.newFolder();

    // Decide how many of the first created streams must NOT block, based on the default
    // priority queue implementation used for timers.
    final EmbeddedRocksDBStateBackend.PriorityQueueStateType queueType =
            RocksDBOptions.TIMER_SERVICE_FACTORY.defaultValue();
    final int streamsToSkip;
    if (queueType == EmbeddedRocksDBStateBackend.PriorityQueueStateType.ROCKSDB) {
        streamsToSkip = 0;
    } else if (queueType == EmbeddedRocksDBStateBackend.PriorityQueueStateType.HEAP) {
        // the first stream checkpoints the timer service, which is currently not
        // asynchronous, so it has to be skipped
        streamsToSkip = 1;
    } else {
        throw new AssertionError(
                String.format("Unknown timer service priority queue type %s.", queueType));
    }

    // Factory whose streams block on a latch, except for the first 'streamsToSkip' streams.
    final BlockerCheckpointStreamFactory streamFactory =
            new BlockerCheckpointStreamFactory(4 * 1024 * 1024) {

                int remainingSkips = streamsToSkip;

                @Override
                public CheckpointStateOutputStream createCheckpointStateOutputStream(
                        CheckpointedStateScope scope) throws IOException {
                    if (remainingSkips <= 0) {
                        return super.createCheckpointStateOutputStream(scope);
                    }
                    --remainingSkips;
                    // no latches and an unreachable block position: this stream never blocks
                    return new BlockingCheckpointOutputStream(
                            new MemCheckpointStreamFactory.MemoryCheckpointOutputStream(maxSize),
                            null,
                            null,
                            Integer.MAX_VALUE);
                }
            };

    // The factory instance is handed over through a static variable so that it does not
    // have to be serialized.
    final StateBackend checkpointStreamBackend =
            new BackendForTestStream(new StaticForwardFactory(streamFactory));
    final RocksDBStateBackend rocksDbBackend = new RocksDBStateBackend(checkpointStreamBackend);
    rocksDbBackend.setDbStoragePath(rocksDbDirectory.getAbsolutePath());

    operatorConfig.setStateBackend(rocksDbBackend);
    operatorConfig.setStreamOperator(new AsyncCheckpointOperator());
    operatorConfig.setOperatorID(new OperatorID());

    final TestTaskStateManager stateManager = new TestTaskStateManager();
    final StreamMockEnvironment environment =
            new StreamMockEnvironment(
                    harness.jobConfig,
                    harness.taskConfig,
                    harness.memorySize,
                    new MockInputSplitProvider(),
                    harness.bufferSize,
                    stateManager);

    streamFactory.setBlockerLatch(new OneShotLatch());
    streamFactory.setWaiterLatch(new OneShotLatch());

    harness.invoke(environment);
    harness.waitForTaskRunning();

    final OneInputStreamTask<String, String> runningTask = harness.getTask();
    final CheckpointMetaData checkpointMetaData = new CheckpointMetaData(42, 17);
    runningTask
            .triggerCheckpointAsync(
                    checkpointMetaData, CheckpointOptions.forCheckpointWithDefaultLocation())
            .get();
    harness.processElement(new StreamRecord<>("Wohoo", 0));

    // wait for the waiter latch before canceling, then trigger the blocker latch
    streamFactory.getWaiterLatch().await();
    runningTask.cancel();
    streamFactory.getBlockerLatch().trigger();
    harness.endInput();

    final ExecutorService asyncOperationsPool = runningTask.getAsyncOperationsThreadPool();
    asyncOperationsPool.shutdown();
    final boolean poolTerminated =
            asyncOperationsPool.awaitTermination(60_000, TimeUnit.MILLISECONDS);
    Assert.assertTrue(poolTerminated);

    final Set<BlockingCheckpointOutputStream> allStreams = streamFactory.getAllCreatedStreams();
    for (BlockingCheckpointOutputStream createdStream : allStreams) {
        final String notClosedMessage =
                "Not all of the " + allStreams.size() + " created streams have been closed.";
        Assert.assertTrue(notClosedMessage, createdStream.isClosed());
    }

    try {
        harness.waitForTaskCompletion();
        fail("Operation completed. Cancel failed.");
    } catch (Exception expected) {
        // only a failure caused by the cancellation itself is acceptable
        final boolean causedByCancel = expected.getCause() instanceof CancelTaskException;
        if (!causedByCancel) {
            fail("Unexpected exception: " + expected);
        }
    }
}
Also used : OneInputStreamTask(org.apache.flink.streaming.runtime.tasks.OneInputStreamTask) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) StateBackend(org.apache.flink.runtime.state.StateBackend) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) MemCheckpointStreamFactory(org.apache.flink.runtime.state.memory.MemCheckpointStreamFactory) BlockerCheckpointStreamFactory(org.apache.flink.runtime.util.BlockerCheckpointStreamFactory) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) StreamMockEnvironment(org.apache.flink.streaming.runtime.tasks.StreamMockEnvironment) MockInputSplitProvider(org.apache.flink.runtime.operators.testutils.MockInputSplitProvider) StreamConfig(org.apache.flink.streaming.api.graph.StreamConfig) BlockingCheckpointOutputStream(org.apache.flink.runtime.util.BlockingCheckpointOutputStream) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) BackendForTestStream(org.apache.flink.runtime.state.testutils.BackendForTestStream) TestTaskStateManager(org.apache.flink.runtime.state.TestTaskStateManager) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) OneInputStreamTaskTestHarness(org.apache.flink.streaming.runtime.tasks.OneInputStreamTaskTestHarness) ExecutorService(java.util.concurrent.ExecutorService) CheckpointedStateScope(org.apache.flink.runtime.state.CheckpointedStateScope) File(java.io.File) Test(org.junit.Test)

Example 37 with CancelTaskException

use of org.apache.flink.runtime.execution.CancelTaskException in project flink-mirror by flink-ci.

the class RocksDBAsyncSnapshotTest method testCancelFullyAsyncCheckpoints.

/**
 * Checks that a task can be canceled while an asynchronous snapshot is still in flight: the
 * cancellation must not block, every checkpoint stream handed out by the blocking factory must
 * end up closed, and the task must terminate with a {@link CancelTaskException}.
 */
@Test
public void testCancelFullyAsyncCheckpoints() throws Exception {
    final OneInputStreamTaskTestHarness<String, String> harness =
            new OneInputStreamTaskTestHarness<>(
                    OneInputStreamTask::new,
                    BasicTypeInfo.STRING_TYPE_INFO,
                    BasicTypeInfo.STRING_TYPE_INFO);
    harness.setupOutputForSingletonOperatorChain();
    harness.configureForKeyedStream(value -> value, BasicTypeInfo.STRING_TYPE_INFO);

    final StreamConfig operatorConfig = harness.getStreamConfig();
    final File rocksDbDirectory = temporaryFolder.newFolder();

    // Decide how many of the first created streams must NOT block, based on the default
    // priority queue implementation used for timers.
    final EmbeddedRocksDBStateBackend.PriorityQueueStateType queueType =
            RocksDBOptions.TIMER_SERVICE_FACTORY.defaultValue();
    final int streamsToSkip;
    if (queueType == EmbeddedRocksDBStateBackend.PriorityQueueStateType.ROCKSDB) {
        streamsToSkip = 0;
    } else if (queueType == EmbeddedRocksDBStateBackend.PriorityQueueStateType.HEAP) {
        // the first stream checkpoints the timer service, which is currently not
        // asynchronous, so it has to be skipped
        streamsToSkip = 1;
    } else {
        throw new AssertionError(
                String.format("Unknown timer service priority queue type %s.", queueType));
    }

    // Factory whose streams block on a latch, except for the first 'streamsToSkip' streams.
    final BlockerCheckpointStreamFactory streamFactory =
            new BlockerCheckpointStreamFactory(4 * 1024 * 1024) {

                int remainingSkips = streamsToSkip;

                @Override
                public CheckpointStateOutputStream createCheckpointStateOutputStream(
                        CheckpointedStateScope scope) throws IOException {
                    if (remainingSkips <= 0) {
                        return super.createCheckpointStateOutputStream(scope);
                    }
                    --remainingSkips;
                    // no latches and an unreachable block position: this stream never blocks
                    return new BlockingCheckpointOutputStream(
                            new MemCheckpointStreamFactory.MemoryCheckpointOutputStream(maxSize),
                            null,
                            null,
                            Integer.MAX_VALUE);
                }
            };

    // The factory instance is handed over through a static variable so that it does not
    // have to be serialized.
    final StateBackend checkpointStreamBackend =
            new BackendForTestStream(new StaticForwardFactory(streamFactory));
    final RocksDBStateBackend rocksDbBackend = new RocksDBStateBackend(checkpointStreamBackend);
    rocksDbBackend.setDbStoragePath(rocksDbDirectory.getAbsolutePath());

    operatorConfig.setStateBackend(rocksDbBackend);
    operatorConfig.setStreamOperator(new AsyncCheckpointOperator());
    operatorConfig.setOperatorID(new OperatorID());

    final TestTaskStateManager stateManager = new TestTaskStateManager();
    final StreamMockEnvironment environment =
            new StreamMockEnvironment(
                    harness.jobConfig,
                    harness.taskConfig,
                    harness.memorySize,
                    new MockInputSplitProvider(),
                    harness.bufferSize,
                    stateManager);

    streamFactory.setBlockerLatch(new OneShotLatch());
    streamFactory.setWaiterLatch(new OneShotLatch());

    harness.invoke(environment);
    harness.waitForTaskRunning();

    final OneInputStreamTask<String, String> runningTask = harness.getTask();
    final CheckpointMetaData checkpointMetaData = new CheckpointMetaData(42, 17);
    runningTask
            .triggerCheckpointAsync(
                    checkpointMetaData, CheckpointOptions.forCheckpointWithDefaultLocation())
            .get();
    harness.processElement(new StreamRecord<>("Wohoo", 0));

    // wait for the waiter latch before canceling, then trigger the blocker latch
    streamFactory.getWaiterLatch().await();
    runningTask.cancel();
    streamFactory.getBlockerLatch().trigger();
    harness.endInput();

    final ExecutorService asyncOperationsPool = runningTask.getAsyncOperationsThreadPool();
    asyncOperationsPool.shutdown();
    final boolean poolTerminated =
            asyncOperationsPool.awaitTermination(60_000, TimeUnit.MILLISECONDS);
    Assert.assertTrue(poolTerminated);

    final Set<BlockingCheckpointOutputStream> allStreams = streamFactory.getAllCreatedStreams();
    for (BlockingCheckpointOutputStream createdStream : allStreams) {
        final String notClosedMessage =
                "Not all of the " + allStreams.size() + " created streams have been closed.";
        Assert.assertTrue(notClosedMessage, createdStream.isClosed());
    }

    try {
        harness.waitForTaskCompletion();
        fail("Operation completed. Cancel failed.");
    } catch (Exception expected) {
        // only a failure caused by the cancellation itself is acceptable
        final boolean causedByCancel = expected.getCause() instanceof CancelTaskException;
        if (!causedByCancel) {
            fail("Unexpected exception: " + expected);
        }
    }
}
Also used : OneInputStreamTask(org.apache.flink.streaming.runtime.tasks.OneInputStreamTask) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) StateBackend(org.apache.flink.runtime.state.StateBackend) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) MemCheckpointStreamFactory(org.apache.flink.runtime.state.memory.MemCheckpointStreamFactory) BlockerCheckpointStreamFactory(org.apache.flink.runtime.util.BlockerCheckpointStreamFactory) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) StreamMockEnvironment(org.apache.flink.streaming.runtime.tasks.StreamMockEnvironment) MockInputSplitProvider(org.apache.flink.runtime.operators.testutils.MockInputSplitProvider) StreamConfig(org.apache.flink.streaming.api.graph.StreamConfig) BlockingCheckpointOutputStream(org.apache.flink.runtime.util.BlockingCheckpointOutputStream) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) BackendForTestStream(org.apache.flink.runtime.state.testutils.BackendForTestStream) TestTaskStateManager(org.apache.flink.runtime.state.TestTaskStateManager) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) OneInputStreamTaskTestHarness(org.apache.flink.streaming.runtime.tasks.OneInputStreamTaskTestHarness) ExecutorService(java.util.concurrent.ExecutorService) CheckpointedStateScope(org.apache.flink.runtime.state.CheckpointedStateScope) File(java.io.File) Test(org.junit.Test)

Example 38 with CancelTaskException

use of org.apache.flink.runtime.execution.CancelTaskException in project flink-mirror by flink-ci.

the class BatchTask method run.

/**
 * Runs one pass of the task: prepares the driver, opens the chained tasks and the user
 * function, runs the driver, and closes everything again.
 *
 * <p>If anything fails, the user function is closed on a best-effort basis, a resettable driver
 * is torn down, and the chained tasks are canceled. A {@link CancelTaskException} is always
 * rethrown; any other exception is only reported while the task is still marked as running.
 * The driver's {@code cleanup()} is invoked in every case.
 *
 * @throws Exception if preparation, opening, running, or closing fails, or if the teardown of a
 *     resettable driver fails
 */
protected void run() throws Exception {
    // nothing to do if the task was canceled before it got here
    if (!this.running) {
        return;
    }

    boolean userFunctionOpened = false;
    try {
        // let the driver prepare its data
        try {
            this.driver.prepare();
        } catch (Throwable prepareFailure) {
            final String taskName = this.getEnvironment().getTaskInfo().getTaskName();
            final String message =
                    "The data preparation for task '"
                            + taskName
                            + "' , caused an error: "
                            + prepareFailure.getMessage();
            throw new Exception(message, prepareFailure);
        }

        // the task may have been canceled while preparing
        if (!this.running) {
            return;
        }

        // bring up the chained tasks first
        BatchTask.openChainedTasks(this.chainedTasks, this);

        // then open the user function, if there is one
        if (this.stub != null) {
            try {
                final Configuration stubParameters = this.config.getStubParameters();
                FunctionUtils.openFunction(this.stub, stubParameters);
                userFunctionOpened = true;
            } catch (Throwable openFailure) {
                final String message =
                        "The user defined 'open()' method caused an exception: "
                                + openFailure.getMessage();
                throw new Exception(message, openFailure);
            }
        }

        // execute the driver's main work
        this.driver.run();

        // Regular close of the user function: an exception thrown here is not swallowed, it
        // goes through the failure handling below like any other error.
        if (this.running && this.stub != null) {
            FunctionUtils.closeFunction(this.stub);
            userFunctionOpened = false;
        }

        // close the chained tasks, giving them the chance to report a failure
        BatchTask.closeChainedTasks(this.chainedTasks, this);

        // finally close the output collector
        this.output.close();
    } catch (Exception ex) {
        // Best-effort close of the user function; failures are ignored on purpose because
        // 'ex' is already the error being handled.
        if (userFunctionOpened) {
            try {
                FunctionUtils.closeFunction(this.stub);
            } catch (Throwable ignored) {
                // intentionally ignored
            }
        }

        // a resettable driver additionally needs its teardown
        if (this.driver instanceof ResettableDriver) {
            final ResettableDriver<?, ?> resettable = (ResettableDriver<?, ?>) this.driver;
            try {
                resettable.teardown();
            } catch (Throwable teardownFailure) {
                final String message =
                        "Error while shutting down an iterative operator: "
                                + teardownFailure.getMessage();
                throw new Exception(message, teardownFailure);
            }
        }

        BatchTask.cancelChainedTasks(this.chainedTasks);

        final Exception unwrapped = ExceptionInChainedStubException.exceptionUnwrap(ex);
        if (unwrapped instanceof CancelTaskException) {
            // cancellation is always forwarded
            throw unwrapped;
        }
        if (this.running) {
            // Only report the error if the task was not canceled; exceptions are expected
            // while canceling.
            BatchTask.logAndThrowException(unwrapped, this);
        }
    } finally {
        this.driver.cleanup();
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) ExceptionInChainedStubException(org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) IOException(java.io.IOException)

Example 39 with CancelTaskException

use of org.apache.flink.runtime.execution.CancelTaskException in project flink-mirror by flink-ci.

the class DataSourceTask method invoke.

/**
 * Executes the data source: initializes the input format and the outputs, sets up the record
 * counters, and then reads every assigned input split, forwarding each non-null record to the
 * output collector.
 *
 * <p>On failure the input format is closed on a best-effort basis and the chained tasks are
 * canceled. A {@link CancelTaskException} is always rethrown; other exceptions are reported
 * only if the task has not been canceled.
 *
 * @throws Exception if initialization, reading, or closing fails
 */
@Override
public void invoke() throws Exception {
    // --------------------------------------------------------------------
    // Initialize
    // --------------------------------------------------------------------
    initInputFormat();

    LOG.debug(getLogString("Start registering input and output"));
    try {
        initOutputs(getEnvironment().getUserCodeClassLoader());
    } catch (Exception initFailure) {
        final String message =
                "The initialization of the DataSource's outputs caused an error: "
                        + initFailure.getMessage();
        throw new RuntimeException(message, initFailure);
    }
    LOG.debug(getLogString("Finished registering input and output"));

    // --------------------------------------------------------------------
    // Invoke
    // --------------------------------------------------------------------
    LOG.debug(getLogString("Starting data source operator"));

    final RuntimeContext runtimeContext = createRuntimeContext();

    // Resolve the counter for emitted records; fall back to a plain counter if the metric
    // setup fails.
    Counter resolvedRecordsOut;
    try {
        final InternalOperatorIOMetricGroup ioMetrics =
                ((InternalOperatorMetricGroup) runtimeContext.getMetricGroup())
                        .getIOMetricGroup();
        ioMetrics.reuseInputMetricsForTask();
        if (this.config.getNumberOfChainedStubs() == 0) {
            ioMetrics.reuseOutputMetricsForTask();
        }
        resolvedRecordsOut = ioMetrics.getNumRecordsOutCounter();
    } catch (Exception metricsFailure) {
        LOG.warn("An exception occurred during the metrics setup.", metricsFailure);
        resolvedRecordsOut = new SimpleCounter();
    }
    final Counter numRecordsOut = resolvedRecordsOut;

    final Counter splitsProcessed = runtimeContext.getMetricGroup().counter("numSplitsProcessed");

    // rich input formats get the runtime context and are opened once per task
    if (RichInputFormat.class.isAssignableFrom(this.format.getClass())) {
        ((RichInputFormat) this.format).setRuntimeContext(runtimeContext);
        LOG.debug(getLogString("Rich Source detected. Initializing runtime context."));
        ((RichInputFormat) this.format).openInputFormat();
        LOG.debug(getLogString("Rich Source detected. Opening the InputFormat."));
    }

    final ExecutionConfig execConfig = getExecutionConfig();
    final boolean objectReuseEnabled = execConfig.isObjectReuseEnabled();
    final String reuseState = objectReuseEnabled ? "ENABLED" : "DISABLED";
    LOG.debug("DataSourceTask object reuse: " + reuseState + ".");

    final TypeSerializer<OT> serializer = this.serializerFactory.getSerializer();

    try {
        // bring up the chained tasks
        BatchTask.openChainedTasks(this.chainedTasks, this);

        // fetch the splits assigned to this task
        final Iterator<InputSplit> splits = getInputSplits();

        // process split after split until done or canceled
        while (!this.taskCanceled && splits.hasNext()) {
            final InputSplit split = splits.next();
            LOG.debug(getLogString("Opening input split " + split.toString()));

            final InputFormat<OT, InputSplit> splitFormat = this.format;
            splitFormat.open(split);
            LOG.debug(getLogString("Starting to read input from split " + split.toString()));

            try {
                final Collector<OT> countingOutput =
                        new CountingCollector<>(this.output, numRecordsOut);

                if (objectReuseEnabled) {
                    // a single instance is offered to the format for every record
                    final OT reuse = serializer.createInstance();
                    while (!this.taskCanceled && !splitFormat.reachedEnd()) {
                        final OT record = splitFormat.nextRecord(reuse);
                        if (record != null) {
                            countingOutput.collect(record);
                        }
                    }
                } else {
                    // a fresh instance is offered to the format for every record
                    while (!this.taskCanceled && !splitFormat.reachedEnd()) {
                        final OT record = splitFormat.nextRecord(serializer.createInstance());
                        if (record != null) {
                            countingOutput.collect(record);
                        }
                    }
                }

                if (LOG.isDebugEnabled() && !this.taskCanceled) {
                    LOG.debug(getLogString("Closing input split " + split.toString()));
                }
            } finally {
                // Regular close of the split: an exception thrown here propagates and is
                // handled like any other failure of the task.
                splitFormat.close();
            }
            splitsProcessed.inc();
        }

        // all splits done: close the chained tasks, giving them the chance to report failure
        BatchTask.closeChainedTasks(this.chainedTasks, this);

        // close the output collector
        this.output.close();
    } catch (Exception ex) {
        // Best-effort close of the input format; failures are ignored on purpose because
        // 'ex' is already the error being handled.
        try {
            this.format.close();
        } catch (Throwable ignored) {
            // intentionally ignored
        }

        BatchTask.cancelChainedTasks(this.chainedTasks);

        final Exception unwrapped = ExceptionInChainedStubException.exceptionUnwrap(ex);
        if (unwrapped instanceof CancelTaskException) {
            // cancellation is always forwarded
            throw unwrapped;
        }
        if (!this.taskCanceled) {
            // the exception is dropped if the task was canceled
            BatchTask.logAndThrowException(unwrapped, this);
        }
    } finally {
        BatchTask.clearWriters(eventualOutputs);

        // counterpart of openInputFormat() for rich input formats
        if (this.format != null
                && RichInputFormat.class.isAssignableFrom(this.format.getClass())) {
            ((RichInputFormat) this.format).closeInputFormat();
            LOG.debug(getLogString("Rich Source detected. Closing the InputFormat."));
        }
    }

    if (this.taskCanceled) {
        LOG.debug(getLogString("Data source operator cancelled"));
    } else {
        LOG.debug(getLogString("Finished data source operator"));
    }
}
Also used : RichInputFormat(org.apache.flink.api.common.io.RichInputFormat) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) ExceptionInChainedStubException(org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException) NoSuchElementException(java.util.NoSuchElementException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) InputSplitProviderException(org.apache.flink.runtime.jobgraph.tasks.InputSplitProviderException) CountingCollector(org.apache.flink.runtime.operators.util.metrics.CountingCollector) InternalOperatorIOMetricGroup(org.apache.flink.runtime.metrics.groups.InternalOperatorIOMetricGroup) SimpleCounter(org.apache.flink.metrics.SimpleCounter) Counter(org.apache.flink.metrics.Counter) SimpleCounter(org.apache.flink.metrics.SimpleCounter) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext) InputSplit(org.apache.flink.core.io.InputSplit)

Example 40 with CancelTaskException

use of org.apache.flink.runtime.execution.CancelTaskException in project flink-mirror by flink-ci.

the class Task method doRun.

private void doRun() {
    // ----------------------------
    while (true) {
        ExecutionState current = this.executionState;
        if (current == ExecutionState.CREATED) {
            if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
                // success, we can start our work
                break;
            }
        } else if (current == ExecutionState.FAILED) {
            // we were immediately failed. tell the TaskManager that we reached our final state
            notifyFinalState();
            if (metrics != null) {
                metrics.close();
            }
            return;
        } else if (current == ExecutionState.CANCELING) {
            if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
                // we were immediately canceled. tell the TaskManager that we reached our final
                // state
                notifyFinalState();
                if (metrics != null) {
                    metrics.close();
                }
                return;
            }
        } else {
            if (metrics != null) {
                metrics.close();
            }
            throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
        }
    }
    // all resource acquisitions and registrations from here on
    // need to be undone in the end
    Map<String, Future<Path>> distributedCacheEntries = new HashMap<>();
    TaskInvokable invokable = null;
    try {
        // ----------------------------
        // Task Bootstrap - We periodically
        // check for canceling as a shortcut
        // ----------------------------
        // activate safety net for task thread
        LOG.debug("Creating FileSystem stream leak safety net for task {}", this);
        FileSystemSafetyNet.initializeSafetyNetForThread();
        // first of all, get a user-code classloader
        // this may involve downloading the job's JAR files and/or classes
        LOG.info("Loading JAR files for task {}.", this);
        userCodeClassLoader = createUserCodeClassloader();
        final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader.asClassLoader());
        if (executionConfig.getTaskCancellationInterval() >= 0) {
            // override task cancellation interval from Flink config if set in ExecutionConfig
            taskCancellationInterval = executionConfig.getTaskCancellationInterval();
        }
        if (executionConfig.getTaskCancellationTimeout() >= 0) {
            // override task cancellation timeout from Flink config if set in ExecutionConfig
            taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
        }
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // ----------------------------------------------------------------
        // register the task with the network stack
        // this operation may fail if the system does not have enough
        // memory to run the necessary data exchanges
        // the registration must also strictly be undone
        // ----------------------------------------------------------------
        LOG.debug("Registering task at network: {}.", this);
        setupPartitionsAndGates(consumableNotifyingPartitionWriters, inputGates);
        for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
            taskEventDispatcher.registerPartition(partitionWriter.getPartitionId());
        }
        // next, kick off the background copying of files for the distributed cache
        try {
            for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
                LOG.info("Obtaining local cache file for '{}'.", entry.getKey());
                Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId, executionId);
                distributedCacheEntries.put(entry.getKey(), cp);
            }
        } catch (Exception e) {
            throw new Exception(String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
        }
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // ----------------------------------------------------------------
        // call the user code initialization methods
        // ----------------------------------------------------------------
        TaskKvStateRegistry kvStateRegistry = kvStateService.createKvStateTaskRegistry(jobId, getJobVertexId());
        Environment env = new RuntimeEnvironment(jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration, userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, taskStateManager, aggregateManager, accumulatorRegistry, kvStateRegistry, inputSplitProvider, distributedCacheEntries, consumableNotifyingPartitionWriters, inputGates, taskEventDispatcher, checkpointResponder, operatorCoordinatorEventGateway, taskManagerConfig, metrics, this, externalResourceInfoProvider);
        // Make sure the user code classloader is accessible thread-locally.
        // We are setting the correct context class loader before instantiating the invokable
        // so that it is available to the invokable during its entire lifetime.
        executingThread.setContextClassLoader(userCodeClassLoader.asClassLoader());
        // When constructing invokable, separate threads can be constructed and thus should be
        // monitored for system exit (in addition to invoking thread itself monitored below).
        FlinkSecurityManager.monitorUserSystemExitForCurrentThread();
        try {
            // now load and instantiate the task's invokable code
            invokable = loadAndInstantiateInvokable(userCodeClassLoader.asClassLoader(), nameOfInvokableClass, env);
        } finally {
            FlinkSecurityManager.unmonitorUserSystemExitForCurrentThread();
        }
        // ----------------------------------------------------------------
        // actual task core work
        // ----------------------------------------------------------------
        // we must make strictly sure that the invokable is accessible to the cancel() call
        // by the time we switched to running.
        this.invokable = invokable;
        restoreAndInvoke(invokable);
        // to the fact that it has been canceled
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // finish the produced partitions. if this fails, we consider the execution failed.
        for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
            if (partitionWriter != null) {
                partitionWriter.finish();
            }
        }
        // if that fails, the task was canceled/failed in the meantime
        if (!transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
            throw new CancelTaskException();
        }
    } catch (Throwable t) {
        // ----------------------------------------------------------------
        // the execution failed. either the invokable code properly failed, or
        // an exception was thrown as a side effect of cancelling
        // ----------------------------------------------------------------
        t = preProcessException(t);
        try {
            // or to failExternally()
            while (true) {
                ExecutionState current = this.executionState;
                if (current == ExecutionState.RUNNING || current == ExecutionState.INITIALIZING || current == ExecutionState.DEPLOYING) {
                    if (ExceptionUtils.findThrowable(t, CancelTaskException.class).isPresent()) {
                        if (transitionState(current, ExecutionState.CANCELED, t)) {
                            cancelInvokable(invokable);
                            break;
                        }
                    } else {
                        if (transitionState(current, ExecutionState.FAILED, t)) {
                            cancelInvokable(invokable);
                            break;
                        }
                    }
                } else if (current == ExecutionState.CANCELING) {
                    if (transitionState(current, ExecutionState.CANCELED)) {
                        break;
                    }
                } else if (current == ExecutionState.FAILED) {
                    // in state failed already, no transition necessary any more
                    break;
                } else // unexpected state, go to failed
                if (transitionState(current, ExecutionState.FAILED, t)) {
                    LOG.error("Unexpected state in task {} ({}) during an exception: {}.", taskNameWithSubtask, executionId, current);
                    break;
                }
            // else fall through the loop and retry with the freshly read state
            }
        } catch (Throwable tt) {
            String message = String.format("FATAL - exception in exception handler of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, tt);
            notifyFatalError(message, tt);
        }
    } finally {
        try {
            LOG.info("Freeing task resources for {} ({}).", taskNameWithSubtask, executionId);
            // clear the reference to the invokable. this helps guard against holding references
            // to the invokable and its structures in cases where this Task object is still
            // referenced
            this.invokable = null;
            // free the network resources
            releaseResources();
            // free memory resources
            if (invokable != null) {
                memoryManager.releaseAll(invokable);
            }
            // remove all of the tasks resources
            fileCache.releaseJob(jobId, executionId);
            // close and de-activate safety net for task thread
            LOG.debug("Ensuring all FileSystem streams are closed for task {}", this);
            FileSystemSafetyNet.closeSafetyNetAndGuardedResourcesForThread();
            notifyFinalState();
        } catch (Throwable t) {
            // an error in the resource cleanup is fatal
            String message = String.format("FATAL - exception in resource cleanup of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, t);
            notifyFatalError(message, t);
        }
        // errors here will only be logged
        try {
            metrics.close();
        } catch (Throwable t) {
            LOG.error("Error during metrics de-registration of task {} ({}).", taskNameWithSubtask, executionId, t);
        }
    }
}
Also used : Path(org.apache.flink.core.fs.Path) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) HashMap(java.util.HashMap) ResultPartitionWriter(org.apache.flink.runtime.io.network.api.writer.ResultPartitionWriter) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) WrappingRuntimeException(org.apache.flink.util.WrappingRuntimeException) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) InvocationTargetException(java.lang.reflect.InvocationTargetException) FlinkException(org.apache.flink.util.FlinkException) RunnableWithException(org.apache.flink.util.function.RunnableWithException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) IOException(java.io.IOException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) Future(java.util.concurrent.Future) CompletableFuture(java.util.concurrent.CompletableFuture) TaskInvokable(org.apache.flink.runtime.jobgraph.tasks.TaskInvokable) ShuffleEnvironment(org.apache.flink.runtime.shuffle.ShuffleEnvironment) NettyShuffleEnvironment(org.apache.flink.runtime.io.network.NettyShuffleEnvironment) Environment(org.apache.flink.runtime.execution.Environment) Map(java.util.Map) HashMap(java.util.HashMap)

Aggregations

CancelTaskException (org.apache.flink.runtime.execution.CancelTaskException)43 IOException (java.io.IOException)16 Test (org.junit.Test)16 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)10 ResultSubpartitionView (org.apache.flink.runtime.io.network.partition.ResultSubpartitionView)10 ExceptionInChainedStubException (org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException)9 RejectedExecutionException (java.util.concurrent.RejectedExecutionException)7 Buffer (org.apache.flink.runtime.io.network.buffer.Buffer)6 CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException)5 ExecutorService (java.util.concurrent.ExecutorService)4 RuntimeContext (org.apache.flink.api.common.functions.RuntimeContext)4 Counter (org.apache.flink.metrics.Counter)4 SimpleCounter (org.apache.flink.metrics.SimpleCounter)4 InputChannelTestUtils.createLocalInputChannel (org.apache.flink.runtime.io.network.partition.InputChannelTestUtils.createLocalInputChannel)4 TestingResultPartitionManager (org.apache.flink.runtime.io.network.partition.consumer.SingleInputGateTest.TestingResultPartitionManager)4 InternalOperatorIOMetricGroup (org.apache.flink.runtime.metrics.groups.InternalOperatorIOMetricGroup)4 StreamConfig (org.apache.flink.streaming.api.graph.StreamConfig)4 FlinkException (org.apache.flink.util.FlinkException)4 WrappingRuntimeException (org.apache.flink.util.WrappingRuntimeException)4 File (java.io.File)3