Use of org.apache.flink.runtime.execution.CancelTaskException in project flink by apache.
The class PartitionRequestQueueTest, method testProducerFailedException.
@Test
public void testProducerFailedException() throws Exception {
    PartitionRequestQueue queue = new PartitionRequestQueue();
    ResultPartitionProvider partitionProvider = mock(ResultPartitionProvider.class);
    ResultPartitionID rpid = new ResultPartitionID();
    BufferProvider bufferProvider = mock(BufferProvider.class);
    ResultSubpartitionView view = mock(ResultSubpartitionView.class);
    when(view.isReleased()).thenReturn(true);
    when(view.getFailureCause()).thenReturn(new RuntimeException("Expected test exception"));
    when(partitionProvider.createSubpartitionView(
                    eq(rpid), eq(0), eq(bufferProvider), any(BufferAvailabilityListener.class)))
            .thenReturn(view);
    EmbeddedChannel ch = new EmbeddedChannel(queue);
    SequenceNumberingViewReader seqView = new SequenceNumberingViewReader(new InputChannelID(), queue);
    seqView.requestSubpartitionView(partitionProvider, rpid, 0, bufferProvider);
    // Enqueue the erroneous view
    queue.notifyReaderNonEmpty(seqView);
    ch.runPendingTasks();
    // Read the enqueued message
    Object msg = ch.readOutbound();
    // JUnit's assertEquals takes (expected, actual), in that order
    assertEquals(NettyMessage.ErrorResponse.class, msg.getClass());
    NettyMessage.ErrorResponse err = (NettyMessage.ErrorResponse) msg;
    assertTrue(err.cause instanceof CancelTaskException);
}
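For context, the behavior under test: when the subpartition view is already released, the request queue reports the producer's failure cause to the downstream consumer instead of data. A minimal sketch of that server-side step, assuming the queue wraps the cause in ProducerFailedException (a CancelTaskException subclass) before sending the ErrorResponse; ctx and receiverId are illustrative names from a Netty handler context:

if (view.isReleased()) {
    Throwable cause = view.getFailureCause();
    if (cause != null) {
        // ProducerFailedException extends CancelTaskException, so the consumer
        // cancels itself instead of failing with the producer's error
        ctx.writeAndFlush(new NettyMessage.ErrorResponse(new ProducerFailedException(cause), receiverId));
    }
}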
Use of org.apache.flink.runtime.execution.CancelTaskException in project flink by apache.
The class Task, method doRun.
private void doRun() {
    // ----------------------------
    // Initial state transition
    // ----------------------------
    while (true) {
        ExecutionState current = this.executionState;
        if (current == ExecutionState.CREATED) {
            if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
                // success, we can start our work
                break;
            }
        } else if (current == ExecutionState.FAILED) {
            // we were immediately failed. tell the TaskManager that we reached our final state
            notifyFinalState();
            if (metrics != null) {
                metrics.close();
            }
            return;
        } else if (current == ExecutionState.CANCELING) {
            if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
                // we were immediately canceled. tell the TaskManager that we reached our
                // final state
                notifyFinalState();
                if (metrics != null) {
                    metrics.close();
                }
                return;
            }
        } else {
            if (metrics != null) {
                metrics.close();
            }
            throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
        }
    }
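    // each transitionState(...) call above is a single atomic compare-and-set on the
    // executionState field, so a concurrent cancel() or failExternally() cannot be
    // lost between reading 'current' and performing the transition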
    // all resource acquisitions and registrations from here on
    // need to be undone in the end
    Map<String, Future<Path>> distributedCacheEntries = new HashMap<>();
    TaskInvokable invokable = null;
    try {
        // ----------------------------
        // Task Bootstrap - We periodically
        // check for canceling as a shortcut
        // ----------------------------
        // activate safety net for task thread
        LOG.debug("Creating FileSystem stream leak safety net for task {}", this);
        FileSystemSafetyNet.initializeSafetyNetForThread();
        // first of all, get a user-code classloader
        // this may involve downloading the job's JAR files and/or classes
        LOG.info("Loading JAR files for task {}.", this);
        userCodeClassLoader = createUserCodeClassloader();
        final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader.asClassLoader());
        if (executionConfig.getTaskCancellationInterval() >= 0) {
            // override task cancellation interval from Flink config if set in ExecutionConfig
            taskCancellationInterval = executionConfig.getTaskCancellationInterval();
        }
        if (executionConfig.getTaskCancellationTimeout() >= 0) {
            // override task cancellation timeout from Flink config if set in ExecutionConfig
            taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
        }
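        // the bootstrap phase re-checks the cancellation flag between expensive steps;
        // throwing CancelTaskException here unwinds into the catch block below, which
        // maps it to ExecutionState.CANCELED rather than FAILED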
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // ----------------------------------------------------------------
        // register the task with the network stack
        // this operation may fail if the system does not have enough
        // memory to run the necessary data exchanges
        // the registration must also strictly be undone
        // ----------------------------------------------------------------
        LOG.debug("Registering task at network: {}.", this);
        setupPartitionsAndGates(consumableNotifyingPartitionWriters, inputGates);
        for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
            taskEventDispatcher.registerPartition(partitionWriter.getPartitionId());
        }
        // next, kick off the background copying of files for the distributed cache
        try {
            for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
                LOG.info("Obtaining local cache file for '{}'.", entry.getKey());
                Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId, executionId);
                distributedCacheEntries.put(entry.getKey(), cp);
            }
        } catch (Exception e) {
            throw new Exception(String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
        }
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // ----------------------------------------------------------------
        // call the user code initialization methods
        // ----------------------------------------------------------------
        TaskKvStateRegistry kvStateRegistry = kvStateService.createKvStateTaskRegistry(jobId, getJobVertexId());
        Environment env = new RuntimeEnvironment(
                jobId, vertexId, executionId, executionConfig, taskInfo,
                jobConfiguration, taskConfiguration, userCodeClassLoader,
                memoryManager, ioManager, broadcastVariableManager,
                taskStateManager, aggregateManager, accumulatorRegistry,
                kvStateRegistry, inputSplitProvider, distributedCacheEntries,
                consumableNotifyingPartitionWriters, inputGates,
                taskEventDispatcher, checkpointResponder,
                operatorCoordinatorEventGateway, taskManagerConfig, metrics,
                this, externalResourceInfoProvider);
        // Make sure the user code classloader is accessible thread-locally.
        // We are setting the correct context class loader before instantiating the invokable
        // so that it is available to the invokable during its entire lifetime.
        executingThread.setContextClassLoader(userCodeClassLoader.asClassLoader());
        // When constructing invokable, separate threads can be constructed and thus should be
        // monitored for system exit (in addition to invoking thread itself monitored below).
        FlinkSecurityManager.monitorUserSystemExitForCurrentThread();
        try {
            // now load and instantiate the task's invokable code
            invokable = loadAndInstantiateInvokable(userCodeClassLoader.asClassLoader(), nameOfInvokableClass, env);
        } finally {
            FlinkSecurityManager.unmonitorUserSystemExitForCurrentThread();
        }
        // ----------------------------------------------------------------
        // actual task core work
        // ----------------------------------------------------------------
        // we must make strictly sure that the invokable is accessible to the cancel() call
        // by the time we switched to running.
        this.invokable = invokable;
        restoreAndInvoke(invokable);
        // make sure we enter the catch block if the task leaves the invoke() method due
        // to the fact that it has been canceled
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        }
        // finish the produced partitions. if this fails, we consider the execution failed.
        for (ResultPartitionWriter partitionWriter : consumableNotifyingPartitionWriters) {
            if (partitionWriter != null) {
                partitionWriter.finish();
            }
        }
        // try to mark the task as finished
        // if that fails, the task was canceled/failed in the meantime
        if (!transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
            throw new CancelTaskException();
        }
    } catch (Throwable t) {
        // ----------------------------------------------------------------
        // the execution failed. either the invokable code properly failed, or
        // an exception was thrown as a side effect of cancelling
        // ----------------------------------------------------------------
        t = preProcessException(t);
        try {
            // transition into our final state. loop for multiple retries during concurrent
            // state changes via calls to cancel() or to failExternally()
            while (true) {
                ExecutionState current = this.executionState;
                if (current == ExecutionState.RUNNING || current == ExecutionState.INITIALIZING || current == ExecutionState.DEPLOYING) {
                    if (ExceptionUtils.findThrowable(t, CancelTaskException.class).isPresent()) {
                        if (transitionState(current, ExecutionState.CANCELED, t)) {
                            cancelInvokable(invokable);
                            break;
                        }
                    } else {
                        if (transitionState(current, ExecutionState.FAILED, t)) {
                            cancelInvokable(invokable);
                            break;
                        }
                    }
                } else if (current == ExecutionState.CANCELING) {
                    if (transitionState(current, ExecutionState.CANCELED)) {
                        break;
                    }
                } else if (current == ExecutionState.FAILED) {
                    // in state failed already, no transition necessary any more
                    break;
                } else {
                    // unexpected state, go to failed
                    if (transitionState(current, ExecutionState.FAILED, t)) {
                        LOG.error("Unexpected state in task {} ({}) during an exception: {}.", taskNameWithSubtask, executionId, current);
                        break;
                    }
                }
                // else fall through the loop and try again
            }
        } catch (Throwable tt) {
            String message = String.format("FATAL - exception in exception handler of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, tt);
            notifyFatalError(message, tt);
        }
    } finally {
        try {
            LOG.info("Freeing task resources for {} ({}).", taskNameWithSubtask, executionId);
            // clear the reference to the invokable. this helps guard against holding references
            // to the invokable and its structures in cases where this Task object is still
            // referenced
            this.invokable = null;
            // free the network resources
            releaseResources();
            // free memory resources (the local variable 'invokable' still holds the reference)
            if (invokable != null) {
                memoryManager.releaseAll(invokable);
            }
            // remove all of the tasks resources
            fileCache.releaseJob(jobId, executionId);
            // close and de-activate safety net for task thread
            LOG.debug("Ensuring all FileSystem streams are closed for task {}", this);
            FileSystemSafetyNet.closeSafetyNetAndGuardedResourcesForThread();
            notifyFinalState();
        } catch (Throwable t) {
            // an error in the resource cleanup is fatal
            String message = String.format("FATAL - exception in resource cleanup of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, t);
            notifyFatalError(message, t);
        }
        // errors here will only be logged
        try {
            metrics.close();
        } catch (Throwable t) {
            LOG.error("Error during metrics de-registration of task {} ({}).", taskNameWithSubtask, executionId, t);
        }
    }
}
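Both state-machine loops in doRun() retry until one transition wins, which only works because transitionState performs a single atomic compare-and-set per attempt. A minimal sketch of that pattern, assuming executionState is a volatile field maintained through an AtomicReferenceFieldUpdater (the updater name is illustrative):

import java.util.concurrent.atomic.AtomicReferenceFieldUpdater;

private static final AtomicReferenceFieldUpdater<Task, ExecutionState> STATE_UPDATER =
        AtomicReferenceFieldUpdater.newUpdater(Task.class, ExecutionState.class, "executionState");

private boolean transitionState(ExecutionState currentState, ExecutionState newState) {
    // succeeds only if no concurrent caller changed the state in the meantime
    return STATE_UPDATER.compareAndSet(this, currentState, newState);
}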
Use of org.apache.flink.runtime.execution.CancelTaskException in project flink by apache.
The class LocalInputChannel, method getNextBuffer.
@Override
Optional<BufferAndAvailability> getNextBuffer() throws IOException {
    checkError();
    ResultSubpartitionView subpartitionView = this.subpartitionView;
    if (subpartitionView == null) {
        // a race between the sender's flush notification and reading the
        // EndOfPartitionEvent can re-enqueue this channel after (or while)
        // it was released during reading the EndOfPartitionEvent
        if (isReleased) {
            return Optional.empty();
        }
        // this can happen if the request for the partition was triggered asynchronously
        // by the time trigger
        // would be good to avoid that, by guaranteeing that the requestPartition() and
        // getNextBuffer() always come from the same thread
        // we could do that by letting the timer insert a special "requesting channel" into the
        // input gate's queue
        subpartitionView = checkAndWaitForSubpartitionView();
    }
    BufferAndBacklog next = subpartitionView.getNextBuffer();
    // ignore the empty buffer directly
    while (next != null && next.buffer().readableBytes() == 0) {
        next.buffer().recycleBuffer();
        next = subpartitionView.getNextBuffer();
        numBuffersIn.inc();
    }
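    // drained empty buffers are counted in numBuffersIn as well, mirroring the
    // accounting done for the non-empty buffer returned further below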
    if (next == null) {
        if (subpartitionView.isReleased()) {
            throw new CancelTaskException("Consumed partition " + subpartitionView + " has been released.");
        } else {
            return Optional.empty();
        }
    }
    Buffer buffer = next.buffer();
    if (buffer instanceof FileRegionBuffer) {
        buffer = ((FileRegionBuffer) buffer).readInto(inputGate.getUnpooledSegment());
    }
    numBytesIn.inc(buffer.getSize());
    numBuffersIn.inc();
    channelStatePersister.checkForBarrier(buffer);
    channelStatePersister.maybePersist(buffer);
    NetworkActionsLogger.traceInput("LocalInputChannel#getNextBuffer", buffer, inputGate.getOwningTaskName(), channelInfo, channelStatePersister, next.getSequenceNumber());
    return Optional.of(new BufferAndAvailability(buffer, next.getNextDataType(), next.buffersInBacklog(), next.getSequenceNumber()));
}
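The CancelTaskException thrown above signals that the consumed partition disappeared and the reading task should cancel itself rather than report a failure. A hedged sketch of how calling code can keep the two outcomes apart; channel and the two handler methods are illustrative names, not Flink API:

try {
    Optional<BufferAndAvailability> next = channel.getNextBuffer();
    next.ifPresent(this::processBuffer);
} catch (CancelTaskException e) {
    // the producer side is gone; cancel this task instead of failing it
    cancelExecution(e);
} catch (IOException e) {
    // a genuine read error fails the task
    failExecution(e);
}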
Use of org.apache.flink.runtime.execution.CancelTaskException in project flink by apache.
The class DataSourceTask, method invoke.
@Override
public void invoke() throws Exception {
    // --------------------------------------------------------------------
    // Initialize
    // --------------------------------------------------------------------
    initInputFormat();
    LOG.debug(getLogString("Start registering input and output"));
    try {
        initOutputs(getEnvironment().getUserCodeClassLoader());
    } catch (Exception ex) {
        throw new RuntimeException("The initialization of the DataSource's outputs caused an error: " + ex.getMessage(), ex);
    }
    LOG.debug(getLogString("Finished registering input and output"));
    // --------------------------------------------------------------------
    // Invoke
    // --------------------------------------------------------------------
    LOG.debug(getLogString("Starting data source operator"));
    RuntimeContext ctx = createRuntimeContext();
    final Counter numRecordsOut;
    {
        Counter tmpNumRecordsOut;
        try {
            InternalOperatorIOMetricGroup ioMetricGroup = ((InternalOperatorMetricGroup) ctx.getMetricGroup()).getIOMetricGroup();
            ioMetricGroup.reuseInputMetricsForTask();
            if (this.config.getNumberOfChainedStubs() == 0) {
                ioMetricGroup.reuseOutputMetricsForTask();
            }
            tmpNumRecordsOut = ioMetricGroup.getNumRecordsOutCounter();
        } catch (Exception e) {
            LOG.warn("An exception occurred during the metrics setup.", e);
            tmpNumRecordsOut = new SimpleCounter();
        }
        numRecordsOut = tmpNumRecordsOut;
    }
    Counter completedSplitsCounter = ctx.getMetricGroup().counter("numSplitsProcessed");
    if (RichInputFormat.class.isAssignableFrom(this.format.getClass())) {
        ((RichInputFormat) this.format).setRuntimeContext(ctx);
        LOG.debug(getLogString("Rich Source detected. Initializing runtime context."));
        ((RichInputFormat) this.format).openInputFormat();
        LOG.debug(getLogString("Rich Source detected. Opening the InputFormat."));
    }
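    // openInputFormat() above is paired with closeInputFormat() in the finally block at
    // the end of this method, so rich formats are closed even on failure or cancellation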
    ExecutionConfig executionConfig = getExecutionConfig();
    boolean objectReuseEnabled = executionConfig.isObjectReuseEnabled();
    LOG.debug("DataSourceTask object reuse: " + (objectReuseEnabled ? "ENABLED" : "DISABLED") + ".");
    final TypeSerializer<OT> serializer = this.serializerFactory.getSerializer();
    try {
        // start all chained tasks
        BatchTask.openChainedTasks(this.chainedTasks, this);
        // get input splits to read
        final Iterator<InputSplit> splitIterator = getInputSplits();
        // for each assigned input split
        while (!this.taskCanceled && splitIterator.hasNext()) {
            // get start and end
            final InputSplit split = splitIterator.next();
            LOG.debug(getLogString("Opening input split " + split.toString()));
            final InputFormat<OT, InputSplit> format = this.format;
            // open input format
            format.open(split);
            LOG.debug(getLogString("Starting to read input from split " + split.toString()));
            try {
                final Collector<OT> output = new CountingCollector<>(this.output, numRecordsOut);
                if (objectReuseEnabled) {
                    OT reuse = serializer.createInstance();
                    // as long as there is data to read
                    while (!this.taskCanceled && !format.reachedEnd()) {
                        OT returned;
                        if ((returned = format.nextRecord(reuse)) != null) {
                            output.collect(returned);
                        }
                    }
                } else {
                    // as long as there is data to read
                    while (!this.taskCanceled && !format.reachedEnd()) {
                        OT returned;
                        if ((returned = format.nextRecord(serializer.createInstance())) != null) {
                            output.collect(returned);
                        }
                    }
                }
                if (LOG.isDebugEnabled() && !this.taskCanceled) {
                    LOG.debug(getLogString("Closing input split " + split.toString()));
                }
            } finally {
                // close. We close here such that a regular close throwing an exception marks a
                // task as failed.
                format.close();
            }
            completedSplitsCounter.inc();
        }
        // end for all input splits
        // close all chained tasks letting them report failure
        BatchTask.closeChainedTasks(this.chainedTasks, this);
        // close the output collector
        this.output.close();
    } catch (Exception ex) {
        // close the input, but do not report any exceptions, since we already have another
        // root cause
        try {
            this.format.close();
        } catch (Throwable ignored) {
        }
        BatchTask.cancelChainedTasks(this.chainedTasks);
        ex = ExceptionInChainedStubException.exceptionUnwrap(ex);
        if (ex instanceof CancelTaskException) {
            // forward canceling exception
            throw ex;
        } else if (!this.taskCanceled) {
            // log and rethrow only if the task was not canceled; otherwise drop the exception
            BatchTask.logAndThrowException(ex, this);
        }
    } finally {
        BatchTask.clearWriters(eventualOutputs);
        // --------------------------------------------------------------------
        if (this.format != null && RichInputFormat.class.isAssignableFrom(this.format.getClass())) {
            ((RichInputFormat) this.format).closeInputFormat();
            LOG.debug(getLogString("Rich Source detected. Closing the InputFormat."));
        }
    }
    if (!this.taskCanceled) {
        LOG.debug(getLogString("Finished data source operator"));
    } else {
        LOG.debug(getLogString("Data source operator cancelled"));
    }
}
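The !this.taskCanceled checks in the read loops cooperate with the task's cancellation path; a minimal sketch of that counterpart, assuming taskCanceled is a volatile flag as the loop structure suggests:

private volatile boolean taskCanceled;

@Override
public void cancel() throws Exception {
    // flipping the flag makes the split and record loops above exit cooperatively
    this.taskCanceled = true;
    LOG.debug(getLogString("Cancelling data source operator"));
}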
Use of org.apache.flink.runtime.execution.CancelTaskException in project flink by apache.
The class RocksDBAsyncSnapshotTest, method testCancelFullyAsyncCheckpoints.
/**
 * This test ensures that canceling of asynchronous snapshots works as expected and does not
 * block.
 */
@Test
public void testCancelFullyAsyncCheckpoints() throws Exception {
    final OneInputStreamTaskTestHarness<String, String> testHarness = new OneInputStreamTaskTestHarness<>(OneInputStreamTask::new, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
    testHarness.setupOutputForSingletonOperatorChain();
    testHarness.configureForKeyedStream(value -> value, BasicTypeInfo.STRING_TYPE_INFO);
    StreamConfig streamConfig = testHarness.getStreamConfig();
    File dbDir = temporaryFolder.newFolder();
    final EmbeddedRocksDBStateBackend.PriorityQueueStateType timerServicePriorityQueueType = RocksDBOptions.TIMER_SERVICE_FACTORY.defaultValue();
    final int skipStreams;
    if (timerServicePriorityQueueType == EmbeddedRocksDBStateBackend.PriorityQueueStateType.HEAP) {
        // we skip the first created stream, because it is used to checkpoint the timer service,
        // which is currently not asynchronous.
        skipStreams = 1;
    } else if (timerServicePriorityQueueType == EmbeddedRocksDBStateBackend.PriorityQueueStateType.ROCKSDB) {
        skipStreams = 0;
    } else {
        throw new AssertionError(String.format("Unknown timer service priority queue type %s.", timerServicePriorityQueueType));
    }
    // this is the proper instance that we need to call.
    BlockerCheckpointStreamFactory blockerCheckpointStreamFactory = new BlockerCheckpointStreamFactory(4 * 1024 * 1024) {
        int count = skipStreams;

        @Override
        public CheckpointStateOutputStream createCheckpointStateOutputStream(CheckpointedStateScope scope) throws IOException {
            if (count > 0) {
                --count;
                return new BlockingCheckpointOutputStream(new MemCheckpointStreamFactory.MemoryCheckpointOutputStream(maxSize), null, null, Integer.MAX_VALUE);
            } else {
                return super.createCheckpointStateOutputStream(scope);
            }
        }
    };
    // to avoid serialization of the above factory instance, we need to pass it in
    // through a static variable
    StateBackend stateBackend = new BackendForTestStream(new StaticForwardFactory(blockerCheckpointStreamFactory));
    RocksDBStateBackend backend = new RocksDBStateBackend(stateBackend);
    backend.setDbStoragePath(dbDir.getAbsolutePath());
    streamConfig.setStateBackend(backend);
    streamConfig.setStreamOperator(new AsyncCheckpointOperator());
    streamConfig.setOperatorID(new OperatorID());
    TestTaskStateManager taskStateManagerTestMock = new TestTaskStateManager();
    StreamMockEnvironment mockEnv = new StreamMockEnvironment(testHarness.jobConfig, testHarness.taskConfig, testHarness.memorySize, new MockInputSplitProvider(), testHarness.bufferSize, taskStateManagerTestMock);
    blockerCheckpointStreamFactory.setBlockerLatch(new OneShotLatch());
    blockerCheckpointStreamFactory.setWaiterLatch(new OneShotLatch());
    testHarness.invoke(mockEnv);
    testHarness.waitForTaskRunning();
    final OneInputStreamTask<String, String> task = testHarness.getTask();
    task.triggerCheckpointAsync(new CheckpointMetaData(42, 17), CheckpointOptions.forCheckpointWithDefaultLocation()).get();
    testHarness.processElement(new StreamRecord<>("Wohoo", 0));
    blockerCheckpointStreamFactory.getWaiterLatch().await();
    task.cancel();
    blockerCheckpointStreamFactory.getBlockerLatch().trigger();
    testHarness.endInput();
    ExecutorService threadPool = task.getAsyncOperationsThreadPool();
    threadPool.shutdown();
    Assert.assertTrue(threadPool.awaitTermination(60_000, TimeUnit.MILLISECONDS));
    Set<BlockingCheckpointOutputStream> createdStreams = blockerCheckpointStreamFactory.getAllCreatedStreams();
    for (BlockingCheckpointOutputStream stream : createdStreams) {
        Assert.assertTrue("Not all of the " + createdStreams.size() + " created streams have been closed.", stream.isClosed());
    }
    try {
        testHarness.waitForTaskCompletion();
        fail("Operation completed. Cancel failed.");
    } catch (Exception expected) {
        Throwable cause = expected.getCause();
        if (!(cause instanceof CancelTaskException)) {
            fail("Unexpected exception: " + expected);
        }
    }
}
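Asserting on the immediate getCause() is brittle if the harness ever wraps the failure more than once. Since the Task snippet above already uses ExceptionUtils.findThrowable for exactly this purpose, the same helper could make the final assertion depth-independent; a hedged alternative for the last block:

try {
    testHarness.waitForTaskCompletion();
    fail("Operation completed. Cancel failed.");
} catch (Exception expected) {
    // search the whole cause chain instead of only the first level
    if (!ExceptionUtils.findThrowable(expected, CancelTaskException.class).isPresent()) {
        fail("Unexpected exception: " + expected);
    }
}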