use of org.apache.flink.runtime.execution.Environment in project flink by apache.
the class AsyncWaitOperatorTest method testClosingWithBlockedEmitter.
/**
* Test case for FLINK-5638: Tests that the async wait operator can be closed even if the
* emitter is currently waiting on the checkpoint lock (e.g. in the case of two chained async
* wait operators where the latter operator's queue is currently full).
*
* Note that this test does not enforce the exact strict ordering because with the fix it is no
* longer possible. However, it provokes the described situation without the fix.
*/
@Test(timeout = 10000L)
public void testClosingWithBlockedEmitter() throws Exception {
final Object lock = new Object();
ArgumentCaptor<Throwable> failureReason = ArgumentCaptor.forClass(Throwable.class);
Environment environment = mock(Environment.class);
when(environment.getMetricGroup()).thenReturn(new UnregisteredTaskMetricsGroup());
when(environment.getTaskManagerInfo()).thenReturn(new TestingTaskManagerRuntimeInfo());
when(environment.getUserClassLoader()).thenReturn(getClass().getClassLoader());
when(environment.getTaskInfo()).thenReturn(new TaskInfo("testTask", 1, 0, 1, 0));
doNothing().when(environment).failExternally(failureReason.capture());
StreamTask<?, ?> containingTask = mock(StreamTask.class);
when(containingTask.getEnvironment()).thenReturn(environment);
when(containingTask.getCheckpointLock()).thenReturn(lock);
when(containingTask.getProcessingTimeService()).thenReturn(new TestProcessingTimeService());
StreamConfig streamConfig = mock(StreamConfig.class);
doReturn(IntSerializer.INSTANCE).when(streamConfig).getTypeSerializerIn1(any(ClassLoader.class));
final OneShotLatch closingLatch = new OneShotLatch();
final OneShotLatch outputLatch = new OneShotLatch();
Output<StreamRecord<Integer>> output = mock(Output.class);
doAnswer(new Answer() {
@Override
public Object answer(InvocationOnMock invocation) throws Throwable {
assertTrue("Output should happen under the checkpoint lock.", Thread.currentThread().holdsLock(lock));
outputLatch.trigger();
// wait until we're in the closing method of the operator
while (!closingLatch.isTriggered()) {
lock.wait();
}
return null;
}
}).when(output).collect(any(StreamRecord.class));
AsyncWaitOperator<Integer, Integer> operator = new TestAsyncWaitOperator<>(new MyAsyncFunction(), 1000L, 1, AsyncDataStream.OutputMode.ORDERED, closingLatch);
operator.setup(containingTask, streamConfig, output);
operator.open();
synchronized (lock) {
operator.processElement(new StreamRecord<>(42));
}
outputLatch.await();
synchronized (lock) {
operator.close();
}
// check that no concurrent exception has occurred
try {
verify(environment, never()).failExternally(any(Throwable.class));
} catch (Error e) {
// add the exception occurring in the emitter thread (root cause) as a suppressed
// exception
e.addSuppressed(failureReason.getValue());
throw e;
}
}
use of org.apache.flink.runtime.execution.Environment in project flink by apache.
the class StreamTaskTest method testFailingCheckpointStreamOperator.
@Test
public void testFailingCheckpointStreamOperator() throws Exception {
final long checkpointId = 42L;
final long timestamp = 1L;
TaskInfo mockTaskInfo = mock(TaskInfo.class);
when(mockTaskInfo.getTaskNameWithSubtasks()).thenReturn("foobar");
when(mockTaskInfo.getIndexOfThisSubtask()).thenReturn(0);
Environment mockEnvironment = mock(Environment.class);
when(mockEnvironment.getTaskInfo()).thenReturn(mockTaskInfo);
StreamTask<?, AbstractStreamOperator<?>> streamTask = mock(StreamTask.class, Mockito.CALLS_REAL_METHODS);
CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, timestamp);
streamTask.setEnvironment(mockEnvironment);
// mock the operators
StreamOperator<?> streamOperator1 = mock(StreamOperator.class, withSettings().extraInterfaces(StreamCheckpointedOperator.class));
StreamOperator<?> streamOperator2 = mock(StreamOperator.class, withSettings().extraInterfaces(StreamCheckpointedOperator.class));
StreamOperator<?> streamOperator3 = mock(StreamOperator.class, withSettings().extraInterfaces(StreamCheckpointedOperator.class));
// mock the returned snapshots
OperatorSnapshotResult operatorSnapshotResult1 = mock(OperatorSnapshotResult.class);
OperatorSnapshotResult operatorSnapshotResult2 = mock(OperatorSnapshotResult.class);
final Exception testException = new Exception("Test exception");
when(streamOperator1.snapshotState(anyLong(), anyLong(), any(CheckpointOptions.class))).thenReturn(operatorSnapshotResult1);
when(streamOperator2.snapshotState(anyLong(), anyLong(), any(CheckpointOptions.class))).thenReturn(operatorSnapshotResult2);
when(streamOperator3.snapshotState(anyLong(), anyLong(), any(CheckpointOptions.class))).thenThrow(testException);
// mock the returned legacy snapshots
StreamStateHandle streamStateHandle1 = mock(StreamStateHandle.class);
StreamStateHandle streamStateHandle2 = mock(StreamStateHandle.class);
StreamStateHandle streamStateHandle3 = mock(StreamStateHandle.class);
when(streamOperator1.snapshotLegacyOperatorState(anyLong(), anyLong(), any(CheckpointOptions.class))).thenReturn(streamStateHandle1);
when(streamOperator2.snapshotLegacyOperatorState(anyLong(), anyLong(), any(CheckpointOptions.class))).thenReturn(streamStateHandle2);
when(streamOperator3.snapshotLegacyOperatorState(anyLong(), anyLong(), any(CheckpointOptions.class))).thenReturn(streamStateHandle3);
// set up the task
StreamOperator<?>[] streamOperators = { streamOperator1, streamOperator2, streamOperator3 };
OperatorChain<Void, AbstractStreamOperator<Void>> operatorChain = mock(OperatorChain.class);
when(operatorChain.getAllOperators()).thenReturn(streamOperators);
Whitebox.setInternalState(streamTask, "isRunning", true);
Whitebox.setInternalState(streamTask, "lock", new Object());
Whitebox.setInternalState(streamTask, "operatorChain", operatorChain);
Whitebox.setInternalState(streamTask, "cancelables", new CloseableRegistry());
Whitebox.setInternalState(streamTask, "configuration", new StreamConfig(new Configuration()));
try {
streamTask.triggerCheckpoint(checkpointMetaData, CheckpointOptions.forFullCheckpoint());
fail("Expected test exception here.");
} catch (Exception e) {
assertEquals(testException, e.getCause());
}
verify(operatorSnapshotResult1).cancel();
verify(operatorSnapshotResult2).cancel();
verify(streamStateHandle1).discardState();
verify(streamStateHandle2).discardState();
verify(streamStateHandle3).discardState();
}
use of org.apache.flink.runtime.execution.Environment in project flink by apache.
the class StreamOperatorChainingTest method createMockTask.
private <IN, OT extends StreamOperator<IN>> StreamTask<IN, OT> createMockTask(StreamConfig streamConfig, String taskName) {
final Object checkpointLock = new Object();
final Environment env = new MockEnvironment(taskName, 3 * 1024 * 1024, new MockInputSplitProvider(), 1024);
@SuppressWarnings("unchecked") StreamTask<IN, OT> mockTask = mock(StreamTask.class);
when(mockTask.getName()).thenReturn("Mock Task");
when(mockTask.getCheckpointLock()).thenReturn(checkpointLock);
when(mockTask.getConfiguration()).thenReturn(streamConfig);
when(mockTask.getEnvironment()).thenReturn(env);
when(mockTask.getExecutionConfig()).thenReturn(new ExecutionConfig().enableObjectReuse());
return mockTask;
}
use of org.apache.flink.runtime.execution.Environment in project flink by apache.
the class Task method run.
/**
* The core work method that bootstraps the task and executes it code
*/
@Override
public void run() {
// ----------------------------
while (true) {
ExecutionState current = this.executionState;
if (current == ExecutionState.CREATED) {
if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
// success, we can start our work
break;
}
} else if (current == ExecutionState.FAILED) {
// we were immediately failed. tell the TaskManager that we reached our final state
notifyFinalState();
return;
} else if (current == ExecutionState.CANCELING) {
if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
// we were immediately canceled. tell the TaskManager that we reached our final state
notifyFinalState();
return;
}
} else {
throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
}
}
// all resource acquisitions and registrations from here on
// need to be undone in the end
Map<String, Future<Path>> distributedCacheEntries = new HashMap<String, Future<Path>>();
AbstractInvokable invokable = null;
ClassLoader userCodeClassLoader;
try {
// ----------------------------
// Task Bootstrap - We periodically
// check for canceling as a shortcut
// ----------------------------
// activate safety net for task thread
LOG.info("Creating FileSystem stream leak safety net for task {}", this);
FileSystemSafetyNet.initializeSafetyNetForThread();
// first of all, get a user-code classloader
// this may involve downloading the job's JAR files and/or classes
LOG.info("Loading JAR files for task {}.", this);
userCodeClassLoader = createUserCodeClassloader(libraryCache);
final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader);
if (executionConfig.getTaskCancellationInterval() >= 0) {
// override task cancellation interval from Flink config if set in ExecutionConfig
taskCancellationInterval = executionConfig.getTaskCancellationInterval();
}
if (executionConfig.getTaskCancellationTimeout() >= 0) {
// override task cancellation timeout from Flink config if set in ExecutionConfig
taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
}
// now load the task's invokable code
invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass);
if (isCanceledOrFailed()) {
throw new CancelTaskException();
}
// ----------------------------------------------------------------
// register the task with the network stack
// this operation may fail if the system does not have enough
// memory to run the necessary data exchanges
// the registration must also strictly be undone
// ----------------------------------------------------------------
LOG.info("Registering task at network: {}.", this);
network.registerTask(this);
// next, kick off the background copying of files for the distributed cache
try {
for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
LOG.info("Obtaining local cache file for '{}'.", entry.getKey());
Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId);
distributedCacheEntries.put(entry.getKey(), cp);
}
} catch (Exception e) {
throw new Exception(String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
}
if (isCanceledOrFailed()) {
throw new CancelTaskException();
}
// ----------------------------------------------------------------
// call the user code initialization methods
// ----------------------------------------------------------------
TaskKvStateRegistry kvStateRegistry = network.createKvStateTaskRegistry(jobId, getJobVertexId());
Environment env = new RuntimeEnvironment(jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration, userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, accumulatorRegistry, kvStateRegistry, inputSplitProvider, distributedCacheEntries, writers, inputGates, checkpointResponder, taskManagerConfig, metrics, this);
// let the task code create its readers and writers
invokable.setEnvironment(env);
if (null != taskStateHandles) {
if (invokable instanceof StatefulTask) {
StatefulTask op = (StatefulTask) invokable;
op.setInitialState(taskStateHandles);
} else {
throw new IllegalStateException("Found operator state for a non-stateful task invokable");
}
// be memory and GC friendly - since the code stays in invoke() for a potentially long time,
// we clear the reference to the state handle
//noinspection UnusedAssignment
taskStateHandles = null;
}
// ----------------------------------------------------------------
// actual task core work
// ----------------------------------------------------------------
// we must make strictly sure that the invokable is accessible to the cancel() call
// by the time we switched to running.
this.invokable = invokable;
// switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime
if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) {
throw new CancelTaskException();
}
// notify everyone that we switched to running
notifyObservers(ExecutionState.RUNNING, null);
taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING));
// make sure the user code classloader is accessible thread-locally
executingThread.setContextClassLoader(userCodeClassLoader);
// run the invokable
invokable.invoke();
// to the fact that it has been canceled
if (isCanceledOrFailed()) {
throw new CancelTaskException();
}
// finish the produced partitions. if this fails, we consider the execution failed.
for (ResultPartition partition : producedPartitions) {
if (partition != null) {
partition.finish();
}
}
// if that fails, the task was canceled/failed in the meantime
if (transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
notifyObservers(ExecutionState.FINISHED, null);
} else {
throw new CancelTaskException();
}
} catch (Throwable t) {
try {
// check if the exception is unrecoverable
if (ExceptionUtils.isJvmFatalError(t) || (t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError())) {
// don't attempt a clean shutdown, because we cannot expect the clean shutdown to complete
try {
LOG.error("Encountered fatal error {} - terminating the JVM", t.getClass().getName(), t);
} finally {
Runtime.getRuntime().halt(-1);
}
}
// to failExternally()
while (true) {
ExecutionState current = this.executionState;
if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) {
if (t instanceof CancelTaskException) {
if (transitionState(current, ExecutionState.CANCELED)) {
cancelInvokable();
notifyObservers(ExecutionState.CANCELED, null);
break;
}
} else {
if (transitionState(current, ExecutionState.FAILED, t)) {
// proper failure of the task. record the exception as the root cause
String errorMessage = String.format("Execution of %s (%s) failed.", taskNameWithSubtask, executionId);
failureCause = t;
cancelInvokable();
notifyObservers(ExecutionState.FAILED, new Exception(errorMessage, t));
break;
}
}
} else if (current == ExecutionState.CANCELING) {
if (transitionState(current, ExecutionState.CANCELED)) {
notifyObservers(ExecutionState.CANCELED, null);
break;
}
} else if (current == ExecutionState.FAILED) {
// in state failed already, no transition necessary any more
break;
} else // unexpected state, go to failed
if (transitionState(current, ExecutionState.FAILED, t)) {
LOG.error("Unexpected state in task {} ({}) during an exception: {}.", taskNameWithSubtask, executionId, current);
break;
}
// else fall through the loop and
}
} catch (Throwable tt) {
String message = String.format("FATAL - exception in exception handler of task %s (%s).", taskNameWithSubtask, executionId);
LOG.error(message, tt);
notifyFatalError(message, tt);
}
} finally {
try {
LOG.info("Freeing task resources for {} ({}).", taskNameWithSubtask, executionId);
// stop the async dispatcher.
// copy dispatcher reference to stack, against concurrent release
ExecutorService dispatcher = this.asyncCallDispatcher;
if (dispatcher != null && !dispatcher.isShutdown()) {
dispatcher.shutdownNow();
}
// free the network resources
network.unregisterTask(this);
// free memory resources
if (invokable != null) {
memoryManager.releaseAll(invokable);
}
// remove all of the tasks library resources
libraryCache.unregisterTask(jobId, executionId);
// remove all files in the distributed cache
removeCachedFiles(distributedCacheEntries, fileCache);
// close and de-activate safety net for task thread
LOG.info("Ensuring all FileSystem streams are closed for task {}", this);
FileSystemSafetyNet.closeSafetyNetAndGuardedResourcesForThread();
notifyFinalState();
} catch (Throwable t) {
// an error in the resource cleanup is fatal
String message = String.format("FATAL - exception in resource cleanup of task %s (%s).", taskNameWithSubtask, executionId);
LOG.error(message, t);
notifyFatalError(message, t);
}
// errors here will only be logged
try {
metrics.close();
} catch (Throwable t) {
LOG.error("Error during metrics de-registration of task {} ({}).", taskNameWithSubtask, executionId, t);
}
}
}
use of org.apache.flink.runtime.execution.Environment in project flink by apache.
the class ChainedDriver method setup.
public void setup(TaskConfig config, String taskName, Collector<OT> outputCollector, AbstractInvokable parent, UserCodeClassLoader userCodeClassLoader, ExecutionConfig executionConfig, Map<String, Accumulator<?, ?>> accumulatorMap) {
this.config = config;
this.taskName = taskName;
this.userCodeClassLoader = userCodeClassLoader.asClassLoader();
this.metrics = parent.getEnvironment().getMetricGroup().getOrAddOperator(taskName);
this.numRecordsIn = this.metrics.getIOMetricGroup().getNumRecordsInCounter();
this.numRecordsOut = this.metrics.getIOMetricGroup().getNumRecordsOutCounter();
this.outputCollector = new CountingCollector<>(outputCollector, numRecordsOut);
Environment env = parent.getEnvironment();
if (parent instanceof BatchTask) {
this.udfContext = ((BatchTask<?, ?>) parent).createRuntimeContext(metrics);
} else {
this.udfContext = new DistributedRuntimeUDFContext(env.getTaskInfo(), userCodeClassLoader, parent.getExecutionConfig(), env.getDistributedCacheEntries(), accumulatorMap, metrics, env.getExternalResourceInfoProvider(), env.getJobID());
}
this.executionConfig = executionConfig;
this.objectReuseEnabled = executionConfig.isObjectReuseEnabled();
setup(parent);
}
Aggregations