Search in sources :

Example 21 with ResultPartition

Use of ResultPartition in project flink by apache.

the class StreamTaskFinalCheckpointsTest method testTriggeringCheckpointWithFinishedChannels.

/**
 * Triggers checkpoints 2, 4 and 6 against a test harness while its input channels
 * progressively receive EndOfData / EndOfPartitionEvent, and asserts after each stage that
 * the reported checkpoint id matches and that no channel's "resumed" action has fired.
 *
 * NOTE(review): this excerpt looks truncated - several closing braces, the body of the
 * thenAccept lambda and the body of the finally cleanup are not visible here.
 *
 * @param checkpointOptions options forwarded to triggerCheckpoint; the boolean passed to
 *     createTestHarness is true when they are unaligned or timeoutable
 * @throws Exception declared by the method; the visible code does not catch anything itself
 */
private void testTriggeringCheckpointWithFinishedChannels(CheckpointOptions checkpointOptions) throws Exception {
    ResultPartition[] partitionWriters = new ResultPartition[2];
    try {
        // Create the two bounded pipelined partitions used as the task's outputs.
        for (int i = 0; i < partitionWriters.length; ++i) {
            partitionWriters[i] = PartitionTestUtils.createPartition(ResultPartitionType.PIPELINED_BOUNDED);
        try (StreamTaskMailboxTestHarness<String> testHarness = createTestHarness(partitionWriters, new CompletingCheckpointResponder(), checkpointOptions.isUnalignedCheckpoint() || checkpointOptions.isTimeoutable())) {
            int numChannels = testHarness.inputGates[0].getInputGate().getNumberOfInputChannels();
            // One counter per input channel of gate 0, bumped by that channel's on-resumed action.
            int[] resumedCount = new int[numChannels];
            for (int i = 0; i < numChannels; ++i) {
                TestInputChannel inputChannel = (TestInputChannel) testHarness.inputGates[0].getInputGate().getChannel(i);
                inputChannel.setActionOnResumed(() -> resumedCount[inputChannel.getChannelIndex()]++);
            // Tests triggering checkpoint when all the inputs are alive.
            CompletableFuture<Boolean> checkpointFuture = triggerCheckpoint(testHarness, 2, checkpointOptions);
            processMailTillCheckpointSucceeds(testHarness, checkpointFuture);
            assertEquals(2, testHarness.getTaskStateManager().getReportedCheckpointId());
            // The expected array has three entries, so the harness is assumed to expose three channels.
            assertArrayEquals(new int[] { 0, 0, 0 }, resumedCount);
            // Tests triggering checkpoint after some inputs have received EndOfPartition.
            // presumably the trailing arguments are (gate index, channel index) - TODO confirm
            testHarness.processEvent(new EndOfData(StopMode.DRAIN), 0, 0);
            testHarness.processEvent(EndOfPartitionEvent.INSTANCE, 0, 0);
            checkpointFuture = triggerCheckpoint(testHarness, 4, checkpointOptions);
            processMailTillCheckpointSucceeds(testHarness, checkpointFuture);
            assertEquals(4, testHarness.getTaskStateManager().getReportedCheckpointId());
            assertArrayEquals(new int[] { 0, 0, 0 }, resumedCount);
            // Tests triggering checkpoint after received all the inputs have received
            // EndOfPartition.
            testHarness.processEvent(new EndOfData(StopMode.DRAIN), 0, 1);
            testHarness.processEvent(new EndOfData(StopMode.DRAIN), 0, 2);
            testHarness.processEvent(EndOfPartitionEvent.INSTANCE, 0, 1);
            testHarness.processEvent(EndOfPartitionEvent.INSTANCE, 0, 2);
            checkpointFuture = triggerCheckpoint(testHarness, 6, checkpointOptions);
            // Notifies the result partition that all records are processed after the
            // last checkpoint is triggered.
            checkpointFuture.thenAccept((ignored) -> {
                for (ResultPartition resultPartition : partitionWriters) {
            // NOTE(review): the loop body and the end of the lambda are missing from this excerpt.
            // The checkpoint 6 would be triggered successfully.
            assertEquals(6, testHarness.getTaskStateManager().getReportedCheckpointId());
            assertArrayEquals(new int[] { 0, 0, 0 }, resumedCount);
            // Each result partition should have emitted 3 barriers and 1 EndOfUserRecordsEvent.
            for (ResultPartition resultPartition : partitionWriters) {
                assertEquals(4, resultPartition.getNumberOfQueuedBuffers());
    } finally {
        // Cleanup of every partition that was actually created; the cleanup call itself is not visible here.
        for (ResultPartitionWriter writer : partitionWriters) {
            if (writer != null) {
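Also used : EndOfData(org.apache.flink.runtime.io.network.api.EndOfData) CompletingCheckpointResponder(org.apache.flink.streaming.util.CompletingCheckpointResponder) TestInputChannel(org.apache.flink.runtime.io.network.partition.consumer.TestInputChannel) ResultPartitionWriter(org.apache.flink.runtime.io.network.api.writer.ResultPartitionWriter) ResultPartition(org.apache.flink.runtime.io.network.partition.ResultPartition) PipelinedResultPartition(org.apache.flink.runtime.io.network.partition.PipelinedResultPartition)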

Example 22 with ResultPartition

Use of ResultPartition in project flink by apache.

the class NetworkEnvironment method registerTask.

// --------------------------------------------------------------------------------------------
//  Task operations
// --------------------------------------------------------------------------------------------
/**
 * Registers a task's produced partitions and input gates with this network environment.
 *
 * For every produced partition and every input gate a buffer pool is requested from
 * networkBufferPool; each partition's writer is additionally registered with the
 * taskEventDispatcher. All of this happens while holding {@code lock}.
 *
 * NOTE(review): this excerpt looks truncated - closing braces, the calls that hand the
 * created pool to the partition / gate, and the bodies of the {@code bufferPool != null}
 * checks in the catch blocks are not visible here.
 *
 * @param task the task whose partitions, writers and input gates are registered
 * @throws IOException rethrown as-is, or wrapping any other Throwable raised while
 *     setting up a buffer pool
 * @throws IllegalStateException if the number of writers and partitions differ, or if the
 *     environment is already shut down
 */
public void registerTask(Task task) throws IOException {
    final ResultPartition[] producedPartitions = task.getProducedPartitions();
    final ResultPartitionWriter[] writers = task.getAllWriters();
    // Partitions and writers are paired by index below, so the arrays must have equal length.
    if (writers.length != producedPartitions.length) {
        throw new IllegalStateException("Unequal number of writers and partitions.");
    synchronized (lock) {
        if (isShutdown) {
            throw new IllegalStateException("NetworkEnvironment is shut down");
        for (int i = 0; i < producedPartitions.length; i++) {
            final ResultPartition partition = producedPartitions[i];
            final ResultPartitionWriter writer = writers[i];
            // Buffer pool for the partition
            BufferPool bufferPool = null;
            try {
                // Bounded partition types get a capped pool sized from the subpartition count; others are uncapped.
                int maxNumberOfMemorySegments = partition.getPartitionType().isBounded() ? partition.getNumberOfSubpartitions() * networkBuffersPerChannel + extraNetworkBuffersPerGate : Integer.MAX_VALUE;
                bufferPool = networkBufferPool.createBufferPool(partition.getNumberOfSubpartitions(), maxNumberOfMemorySegments);
            } catch (Throwable t) {
                // NOTE(review): the body of this null check is not visible in the excerpt.
                if (bufferPool != null) {
                // Keep IOExceptions as they are; wrap everything else so the declared type holds.
                if (t instanceof IOException) {
                    throw (IOException) t;
                } else {
                    throw new IOException(t.getMessage(), t);
            // Register writer with task event dispatcher
            taskEventDispatcher.registerWriterForIncomingTaskEvents(writer.getPartitionId(), writer);
        // Setup the buffer pool for each buffer reader
        final SingleInputGate[] inputGates = task.getAllInputGates();
        for (SingleInputGate gate : inputGates) {
            BufferPool bufferPool = null;
            try {
                // Same sizing rule as for partitions, keyed on the gate's input channel count.
                int maxNumberOfMemorySegments = gate.getConsumedPartitionType().isBounded() ? gate.getNumberOfInputChannels() * networkBuffersPerChannel + extraNetworkBuffersPerGate : Integer.MAX_VALUE;
                bufferPool = networkBufferPool.createBufferPool(gate.getNumberOfInputChannels(), maxNumberOfMemorySegments);
            } catch (Throwable t) {
                // NOTE(review): the body of this null check is not visible in the excerpt.
                if (bufferPool != null) {
                if (t instanceof IOException) {
                    throw (IOException) t;
                } else {
                    throw new IOException(t.getMessage(), t);
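Also used : BufferPool(org.apache.flink.runtime.io.network.buffer.BufferPool) NetworkBufferPool(org.apache.flink.runtime.io.network.buffer.NetworkBufferPool) ResultPartitionWriter(org.apache.flink.runtime.io.network.api.writer.ResultPartitionWriter) IOException(java.io.IOException) SingleInputGate(org.apache.flink.runtime.io.network.partition.consumer.SingleInputGate) ResultPartition(org.apache.flink.runtime.io.network.partition.ResultPartition)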

Example 23 with ResultPartition

Use of ResultPartition in project flink by apache.

the class ResultPartitionWriterTest method testWriteBufferToAllChannelsReferenceCounting.

// ---------------------------------------------------------------------------------------------
// Resource release tests
// ---------------------------------------------------------------------------------------------
/**
 * Tests that event buffers are properly recycled when broadcasting events
 * to multiple channels.
 *
 * @throws Exception if the test fails unexpectedly
 */
public void testWriteBufferToAllChannelsReferenceCounting() throws Exception {
    // Serialize an EndOfPartitionEvent into the buffer under test.
    Buffer buffer = EventSerializer.toBuffer(EndOfPartitionEvent.INSTANCE);
    // Pipelined partition built on mocks; the assertion below expects two queues,
    // so the "2, 2" arguments presumably include the subpartition count - TODO confirm.
    ResultPartition partition = new ResultPartition("TestTask", mock(TaskActions.class), new JobID(), new ResultPartitionID(), ResultPartitionType.PIPELINED, 2, 2, mock(ResultPartitionManager.class), mock(ResultPartitionConsumableNotifier.class), mock(IOManager.class), false);
    ResultPartitionWriter partitionWriter = new ResultPartitionWriter(partition);
    // NOTE(review): the call that writes the buffer to all channels is not visible in this excerpt.
    // Verify added to all queues, i.e. two buffers in total
    assertEquals(2, partition.getTotalNumberOfBuffers());
    // release the buffers in the partition
    // NOTE(review): the release call and any recycling assertion are missing from this excerpt.
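Also used : Buffer(org.apache.flink.runtime.io.network.buffer.Buffer) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) TaskActions(org.apache.flink.runtime.taskmanager.TaskActions) ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) JobID(org.apache.flink.api.common.JobID) ResultPartition(org.apache.flink.runtime.io.network.partition.ResultPartition) Test(org.junit.Test) PrepareForTest(org.powermock.core.classloader.annotations.PrepareForTest)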

Example 24 with ResultPartition

Use of ResultPartition in project flink by apache.

the class Task method run.

/** The core work method that bootstraps the task and executes its code. */
public void run() {
    // Overview of what this excerpt shows:
    //   1. Initial state handling: CREATED is moved to DEPLOYING; FAILED and CANCELING are
    //      treated as already-final; any other state raises IllegalStateException.
    //   2. Bootstrap: user-code classloader, ExecutionConfig overrides, invokable loading,
    //      distributed-cache files, and construction of the RuntimeEnvironment.
    //   3. Execution: DEPLOYING -> RUNNING, then RUNNING -> FINISHED.
    //   4. Error path: any Throwable leads to CANCELED or FAILED depending on the current
    //      state and on whether the Throwable is a CancelTaskException.
    //   5. Cleanup in finally: library cache and distributed cache entries are released.
    // NOTE(review): the excerpt is truncated - many closing braces, return/break statements,
    // logging call prefixes and several statement bodies are not visible here.
    // ----------------------------
    // Loop until the initial state has been handled (presumably retries when transitionState
    // fails under contention - the loop exits are not visible in this excerpt).
    while (true) {
        ExecutionState current = this.executionState;
        if (current == ExecutionState.CREATED) {
            if (transitionState(ExecutionState.CREATED, ExecutionState.DEPLOYING)) {
                // success, we can start our work
        } else if (current == ExecutionState.FAILED) {
            // we were immediately failed. tell the TaskManager that we reached our final state
        } else if (current == ExecutionState.CANCELING) {
            if (transitionState(ExecutionState.CANCELING, ExecutionState.CANCELED)) {
                // we were immediately canceled. tell the TaskManager that we reached our final state
        } else {
            throw new IllegalStateException("Invalid state for beginning of operation of task " + this + '.');
    // all resource acquisitions and registrations from here on
    // need to be undone in the end
    Map<String, Future<Path>> distributedCacheEntries = new HashMap<String, Future<Path>>();
    // Local reference; published to this.invokable only right before switching to RUNNING.
    AbstractInvokable invokable = null;
    ClassLoader userCodeClassLoader;
    try {
        // ----------------------------
        //  Task Bootstrap - We periodically
        //  check for canceling as a shortcut
        // ----------------------------
        // activate safety net for task thread"Creating FileSystem stream leak safety net for task {}", this);
        // first of all, get a user-code classloader
        // this may involve downloading the job's JAR files and/or classes"Loading JAR files for task {}.", this);
        userCodeClassLoader = createUserCodeClassloader(libraryCache);
        final ExecutionConfig executionConfig = serializedExecutionConfig.deserializeValue(userCodeClassLoader);
        // Non-negative values in the ExecutionConfig take precedence; negative values leave the fields untouched.
        if (executionConfig.getTaskCancellationInterval() >= 0) {
            // override task cancellation interval from Flink config if set in ExecutionConfig
            taskCancellationInterval = executionConfig.getTaskCancellationInterval();
        if (executionConfig.getTaskCancellationTimeout() >= 0) {
            // override task cancellation timeout from Flink config if set in ExecutionConfig
            taskCancellationTimeout = executionConfig.getTaskCancellationTimeout();
        // now load the task's invokable code
        invokable = loadAndInstantiateInvokable(userCodeClassLoader, nameOfInvokableClass);
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        // ----------------------------------------------------------------
        // register the task with the network stack
        // this operation may fail if the system does not have enough
        // memory to run the necessary data exchanges
        // the registration must also strictly be undone
        // ----------------------------------------------------------------"Registering task at network: {}.", this);
        // next, kick off the background copying of files for the distributed cache
        try {
            for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : DistributedCache.readFileInfoFromConfig(jobConfiguration)) {
      "Obtaining local cache file for '{}'.", entry.getKey());
                // Remember the future so the entry can be removed again in the finally block.
                Future<Path> cp = fileCache.createTmpFile(entry.getKey(), entry.getValue(), jobId);
                distributedCacheEntries.put(entry.getKey(), cp);
        } catch (Exception e) {
            throw new Exception(String.format("Exception while adding files to distributed cache of task %s (%s).", taskNameWithSubtask, executionId), e);
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        // ----------------------------------------------------------------
        //  call the user code initialization methods
        // ----------------------------------------------------------------
        TaskKvStateRegistry kvStateRegistry = network.createKvStateTaskRegistry(jobId, getJobVertexId());
        Environment env = new RuntimeEnvironment(jobId, vertexId, executionId, executionConfig, taskInfo, jobConfiguration, taskConfiguration, userCodeClassLoader, memoryManager, ioManager, broadcastVariableManager, accumulatorRegistry, kvStateRegistry, inputSplitProvider, distributedCacheEntries, writers, inputGates, checkpointResponder, taskManagerConfig, metrics, this);
        // let the task code create its readers and writers
        // State handles are only accepted for StatefulTask invokables; anything else is an error.
        if (null != taskStateHandles) {
            if (invokable instanceof StatefulTask) {
                StatefulTask op = (StatefulTask) invokable;
            } else {
                throw new IllegalStateException("Found operator state for a non-stateful task invokable");
            // be memory and GC friendly - since the code stays in invoke() for a potentially long time,
            // we clear the reference to the state handle
            //noinspection UnusedAssignment
            taskStateHandles = null;
        // ----------------------------------------------------------------
        //  actual task core work
        // ----------------------------------------------------------------
        // we must make strictly sure that the invokable is accessible to the cancel() call
        // by the time we switched to running.
        this.invokable = invokable;
        // switch to the RUNNING state, if that fails, we have been canceled/failed in the meantime
        if (!transitionState(ExecutionState.DEPLOYING, ExecutionState.RUNNING)) {
            throw new CancelTaskException();
        // notify everyone that we switched to running
        notifyObservers(ExecutionState.RUNNING, null);
        taskManagerActions.updateTaskExecutionState(new TaskExecutionState(jobId, executionId, ExecutionState.RUNNING));
        // make sure the user code classloader is accessible thread-locally
        // run the invokable
        // to the fact that it has been canceled
        if (isCanceledOrFailed()) {
            throw new CancelTaskException();
        // finish the produced partitions. if this fails, we consider the execution failed.
        for (ResultPartition partition : producedPartitions) {
            if (partition != null) {
        // if that fails, the task was canceled/failed in the meantime
        if (transitionState(ExecutionState.RUNNING, ExecutionState.FINISHED)) {
            notifyObservers(ExecutionState.FINISHED, null);
        } else {
            throw new CancelTaskException();
    } catch (Throwable t) {
        try {
            // check if the exception is unrecoverable
            if (ExceptionUtils.isJvmFatalError(t) || (t instanceof OutOfMemoryError && taskManagerConfig.shouldExitJvmOnOutOfMemoryError())) {
                // don't attempt a clean shutdown, because we cannot expect the clean shutdown to complete
                try {
                    LOG.error("Encountered fatal error {} - terminating the JVM", t.getClass().getName(), t);
                } finally {
            // to failExternally()
            // Pick the terminal state from the current state and the exception type.
            while (true) {
                ExecutionState current = this.executionState;
                if (current == ExecutionState.RUNNING || current == ExecutionState.DEPLOYING) {
                    // A CancelTaskException means cancellation; every other Throwable is a failure.
                    if (t instanceof CancelTaskException) {
                        if (transitionState(current, ExecutionState.CANCELED)) {
                            notifyObservers(ExecutionState.CANCELED, null);
                    } else {
                        if (transitionState(current, ExecutionState.FAILED, t)) {
                            // proper failure of the task. record the exception as the root cause
                            String errorMessage = String.format("Execution of %s (%s) failed.", taskNameWithSubtask, executionId);
                            failureCause = t;
                            notifyObservers(ExecutionState.FAILED, new Exception(errorMessage, t));
                } else if (current == ExecutionState.CANCELING) {
                    if (transitionState(current, ExecutionState.CANCELED)) {
                        notifyObservers(ExecutionState.CANCELED, null);
                } else if (current == ExecutionState.FAILED) {
                    // in state failed already, no transition necessary any more
                } else // unexpected state, go to failed
                if (transitionState(current, ExecutionState.FAILED, t)) {
                    LOG.error("Unexpected state in task {} ({}) during an exception: {}.", taskNameWithSubtask, executionId, current);
            // else fall through the loop and
        } catch (Throwable tt) {
            // A failure inside the failure handling itself is logged and reported via notifyFatalError.
            String message = String.format("FATAL - exception in exception handler of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, tt);
            notifyFatalError(message, tt);
    } finally {
        try {
  "Freeing task resources for {} ({}).", taskNameWithSubtask, executionId);
            // stop the async dispatcher.
            // copy dispatcher reference to stack, against concurrent release
            ExecutorService dispatcher = this.asyncCallDispatcher;
            if (dispatcher != null && !dispatcher.isShutdown()) {
            // free the network resources
            // free memory resources
            if (invokable != null) {
            // remove all of the tasks library resources
            libraryCache.unregisterTask(jobId, executionId);
            // remove all files in the distributed cache
            removeCachedFiles(distributedCacheEntries, fileCache);
            // close and de-activate safety net for task thread
  "Ensuring all FileSystem streams are closed for task {}", this);
        } catch (Throwable t) {
            // an error in the resource cleanup is fatal
            String message = String.format("FATAL - exception in resource cleanup of task %s (%s).", taskNameWithSubtask, executionId);
            LOG.error(message, t);
            notifyFatalError(message, t);
        // errors here will only be logged
        try {
        } catch (Throwable t) {
            LOG.error("Error during metrics de-registration of task {} ({}).", taskNameWithSubtask, executionId, t);
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) HashMap(java.util.HashMap) TaskKvStateRegistry(org.apache.flink.runtime.query.TaskKvStateRegistry) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) AbstractInvokable(org.apache.flink.runtime.jobgraph.tasks.AbstractInvokable) Path(org.apache.flink.core.fs.Path) CheckpointDeclineTaskNotCheckpointingException(org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineTaskNotCheckpointingException) TimeoutException(java.util.concurrent.TimeoutException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) RejectedExecutionException(java.util.concurrent.RejectedExecutionException) PartitionProducerDisposedException(org.apache.flink.runtime.jobmanager.PartitionProducerDisposedException) CheckpointDeclineTaskNotReadyException(org.apache.flink.runtime.checkpoint.decline.CheckpointDeclineTaskNotReadyException) IOException(java.io.IOException) ResultPartition(org.apache.flink.runtime.io.network.partition.ResultPartition) StatefulTask(org.apache.flink.runtime.jobgraph.tasks.StatefulTask) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future) Environment(org.apache.flink.runtime.execution.Environment) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) Map(java.util.Map) HashMap(java.util.HashMap)

Example 25 with ResultPartition

Use of ResultPartition in project flink by apache.

the class StreamTaskFinalCheckpointsTest method testWaitingForFinalCheckpoint.

/**
 * Triggers checkpoints 2, 4 and 6 against a test harness while its input channels
 * progressively receive EndOfData / EndOfPartitionEvent, then asserts that checkpoint 6 was
 * both reported and notified as completed, and that each result partition holds four
 * queued buffers.
 *
 * NOTE(review): this excerpt looks truncated - several closing braces, the body of the
 * thenAccept lambda and the body of the finally cleanup are not visible here.
 *
 * @throws Exception declared by the method; the visible code does not catch anything itself
 */
public void testWaitingForFinalCheckpoint() throws Exception {
    ResultPartition[] partitionWriters = new ResultPartition[2];
    try {
        // Create the two bounded pipelined partitions used as the task's outputs.
        for (int i = 0; i < partitionWriters.length; ++i) {
            partitionWriters[i] = PartitionTestUtils.createPartition(ResultPartitionType.PIPELINED_BOUNDED);
        // Id of the final checkpoint; the assertions further down use the literal 6 for the same value.
        int lastCheckpointId = 6;
        CompletingCheckpointResponder checkpointResponder = new CompletingCheckpointResponder();
        try (StreamTaskMailboxTestHarness<String> testHarness = createTestHarness(partitionWriters, checkpointResponder, false)) {
            // Tests triggering checkpoint when all the inputs are alive.
            CompletableFuture<Boolean> checkpointFuture = triggerCheckpoint(testHarness, 2);
            processMailTillCheckpointSucceeds(testHarness, checkpointFuture);
            assertEquals(2, testHarness.getTaskStateManager().getReportedCheckpointId());
            // Tests triggering checkpoint after some inputs have received EndOfPartition.
            // presumably the trailing arguments are (gate index, channel index) - TODO confirm
            testHarness.processEvent(new EndOfData(StopMode.DRAIN), 0, 0);
            testHarness.processEvent(EndOfPartitionEvent.INSTANCE, 0, 0);
            checkpointFuture = triggerCheckpoint(testHarness, 4);
            processMailTillCheckpointSucceeds(testHarness, checkpointFuture);
            assertEquals(4, testHarness.getTaskStateManager().getReportedCheckpointId());
            // Tests triggering checkpoint after received all the inputs have received
            // EndOfPartition.
            testHarness.processEvent(new EndOfData(StopMode.DRAIN), 0, 1);
            testHarness.processEvent(new EndOfData(StopMode.DRAIN), 0, 2);
            testHarness.processEvent(EndOfPartitionEvent.INSTANCE, 0, 1);
            testHarness.processEvent(EndOfPartitionEvent.INSTANCE, 0, 2);
            checkpointFuture = triggerCheckpoint(testHarness, lastCheckpointId);
            // Notifies the result partition that all records are processed after the
            // last checkpoint is triggered.
            checkpointFuture.thenAccept((ignored) -> {
                for (ResultPartition resultPartition : partitionWriters) {
            // NOTE(review): the loop body and the end of the lambda are missing from this excerpt.
            // The checkpoint 6 would be triggered successfully.
            assertEquals(6, testHarness.getTaskStateManager().getReportedCheckpointId());
            assertEquals(6, testHarness.getTaskStateManager().getNotifiedCompletedCheckpointId());
            // Each result partition should have emitted 3 barriers and 1 EndOfUserRecordsEvent.
            for (ResultPartition resultPartition : partitionWriters) {
                assertEquals(4, resultPartition.getNumberOfQueuedBuffers());
    } finally {
        // Cleanup of every partition that was actually created; the cleanup call itself is not visible here.
        for (ResultPartitionWriter writer : partitionWriters) {
            if (writer != null) {
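Also used : EndOfData(org.apache.flink.runtime.io.network.api.EndOfData) CompletingCheckpointResponder(org.apache.flink.streaming.util.CompletingCheckpointResponder) ResultPartitionWriter(org.apache.flink.runtime.io.network.api.writer.ResultPartitionWriter) ResultPartition(org.apache.flink.runtime.io.network.partition.ResultPartition) PipelinedResultPartition(org.apache.flink.runtime.io.network.partition.PipelinedResultPartition) Test(org.junit.Test)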


ResultPartition ( Test (org.junit.Test)23 ResultPartitionWriter ( IOException ( NoOpBufferAvailablityListener ( ResultSubpartitionView ( SingleInputGate ( NettyShuffleEnvironment ( CompletingCheckpointResponder (org.apache.flink.streaming.util.CompletingCheckpointResponder)7 Assert.assertEquals (org.junit.Assert.assertEquals)7 Assert.assertTrue (org.junit.Assert.assertTrue)7 TaskStateSnapshot (org.apache.flink.runtime.checkpoint.TaskStateSnapshot)6 NettyShuffleEnvironmentBuilder ( Buffer ( ResultPartitionManager ( Collections (java.util.Collections)5 EndOfData ( ResultPartitionType ( ArrayList (java.util.ArrayList)4 CompletableFuture (java.util.concurrent.CompletableFuture)4