Search in sources :

Example 6 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class PartialInputChannelDeploymentDescriptor method createInputChannelDeploymentDescriptor.

/**
 * Completes this partial descriptor into a full {@link InputChannelDeploymentDescriptor} by
 * resolving the partition location relative to the given consumer.
 *
 * <p>The location is local when the consumer's assigned resource location equals the partition's
 * task manager location; otherwise it is remote and addressed through a {@link ConnectionID}
 * built from the partition's task manager location and connection index.
 *
 * @param consumerExecution execution of the consuming task; must not be null and must have an
 *     assigned resource location
 * @return descriptor pairing the partition ID with the resolved partition location
 * @see InputChannelDeploymentDescriptor
 */
public InputChannelDeploymentDescriptor createInputChannelDeploymentDescriptor(Execution consumerExecution) {
    checkNotNull(consumerExecution, "consumerExecution");
    final TaskManagerLocation locationOfConsumer = consumerExecution.getAssignedResourceLocation();
    checkNotNull(locationOfConsumer, "Consumer connection info null");

    // Same location => the consumer can read the partition locally, no connection needed.
    final boolean sameLocation = locationOfConsumer.equals(partitionTaskManagerLocation);
    final ResultPartitionLocation resolvedLocation = sameLocation
            ? ResultPartitionLocation.createLocal()
            : ResultPartitionLocation.createRemote(new ConnectionID(partitionTaskManagerLocation, partitionConnectionIndex));
    return new InputChannelDeploymentDescriptor(partitionID, resolvedLocation);
}
Also used : ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation)

Example 7 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class Execution method scheduleOrUpdateConsumers.

/**
 * Makes a produced partition known to its consumers, scheduling or updating them depending on
 * the state each consumer is currently in.
 *
 * <p>For every edge of the first (and only supported) consumer group:
 * <ul>
 *   <li>{@code CREATED}: the partition info is cached on the consumer vertex and the vertex is
 *       scheduled asynchronously on {@code executor};</li>
 *   <li>{@code RUNNING}: an update-partition-info RPC carrying the resolved (local or remote)
 *       partition location is sent to the consumer right away;</li>
 *   <li>{@code SCHEDULED} or {@code DEPLOYING}: the partition info is cached on the consumer
 *       vertex;</li>
 *   <li>any other state: nothing is done for that consumer.</li>
 * </ul>
 * After caching, the vertex state is re-read and the cached infos are sent if the consumer has
 * become {@code RUNNING} in the meantime.
 *
 * @param allConsumers consumer groups of the partition, one list of edges per group; an empty
 *     list makes this method a no-op, more than one group is reported via {@code fail(...)}
 */
void scheduleOrUpdateConsumers(List<List<ExecutionEdge>> allConsumers) {
    final int numConsumers = allConsumers.size();
    if (numConsumers > 1) {
        // NOTE(review): there is no return after fail(...), so the first consumer group is
        // still processed below - confirm that this is intended.
        fail(new IllegalStateException("Currently, only a single consumer group per partition is supported."));
    } else if (numConsumers == 0) {
        return;
    }
    for (ExecutionEdge edge : allConsumers.get(0)) {
        final ExecutionVertex consumerVertex = edge.getTarget();
        final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
        // Snapshot of the consumer state; it may change concurrently, hence the re-checks below.
        final ExecutionState consumerState = consumer.getState();
        final IntermediateResultPartition partition = edge.getSource();
        // ----------------------------------------------------------------
        // Consumer is created => cache the partition info and schedule the consumer
        // ----------------------------------------------------------------
        if (consumerState == CREATED) {
            final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
            consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
            // When deploying a consuming task, its task deployment descriptor will contain all
            // deployment information available at the respective time. It is possible that some
            // of the partitions to be consumed have not been created yet. These are updated at
            // runtime via the update messages.
            //
            // TODO The current approach may send many update messages even though the consuming
            // task has already been deployed with all necessary information. We have to check
            // whether this is a problem and fix it, if it is.
            FlinkFuture.supplyAsync(new Callable<Void>() {

                @Override
                public Void call() throws Exception {
                    try {
                        consumerVertex.scheduleForExecution(consumerVertex.getExecutionGraph().getSlotProvider(), consumerVertex.getExecutionGraph().isQueuedSchedulingAllowed());
                    } catch (Throwable t) {
                        // Any scheduling failure is turned into a failure of the consumer vertex.
                        consumerVertex.fail(new IllegalStateException("Could not schedule consumer " + "vertex " + consumerVertex, t));
                    }
                    return null;
                }
            }, executor);
            // Double check to resolve race conditions: the consumer may have started running
            // after its state was read above, in which case the cached infos are sent now.
            if (consumerVertex.getExecutionState() == RUNNING) {
                consumerVertex.sendPartitionInfos();
            }
        } else // ----------------------------------------------------------------
        // Consumer is running => send update message now
        // ----------------------------------------------------------------
        {
            if (consumerState == RUNNING) {
                final SimpleSlot consumerSlot = consumer.getAssignedResource();
                if (consumerSlot == null) {
                    // The consumer has been reset concurrently
                    continue;
                }
                // NOTE(review): unlike consumerSlot, the producer's assigned resource is not
                // null-checked before being dereferenced - verify it cannot be null here.
                final TaskManagerLocation partitionTaskManagerLocation = partition.getProducer().getCurrentAssignedResource().getTaskManagerLocation();
                final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();
                final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
                // attemptId is declared outside this method; presumably the attempt ID of the
                // producing execution - confirm against the enclosing class.
                final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), attemptId);
                final ResultPartitionLocation partitionLocation;
                if (consumerTaskManager.equals(partitionTaskManager)) {
                    // Consuming task is deployed to the same instance as the partition => local
                    partitionLocation = ResultPartitionLocation.createLocal();
                } else {
                    // Different instances => remote
                    final ConnectionID connectionId = new ConnectionID(partitionTaskManagerLocation, partition.getIntermediateResult().getConnectionIndex());
                    partitionLocation = ResultPartitionLocation.createRemote(connectionId);
                }
                final InputChannelDeploymentDescriptor descriptor = new InputChannelDeploymentDescriptor(partitionId, partitionLocation);
                consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(new PartitionInfo(partition.getIntermediateResult().getId(), descriptor)));
            } else // ----------------------------------------------------------------
            // Consumer is scheduled or deploying => cache the partition info; it is sent
            // by the double check below if the consumer is already running by then
            // ----------------------------------------------------------------
            if (consumerState == SCHEDULED || consumerState == DEPLOYING) {
                final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
                consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
                // double check to resolve race conditions
                if (consumerVertex.getExecutionState() == RUNNING) {
                    consumerVertex.sendPartitionInfos();
                }
            }
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) CoLocationConstraint(org.apache.flink.runtime.jobmanager.scheduler.CoLocationConstraint) TimeoutException(java.util.concurrent.TimeoutException) JobException(org.apache.flink.runtime.JobException) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) ResultPartitionLocation(org.apache.flink.runtime.deployment.ResultPartitionLocation) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) PartialInputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.PartialInputChannelDeploymentDescriptor) InputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)

Example 8 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class RemoteInputChannelTest method testOnFailedPartitionRequestDoesNotBlockNetworkThreads.

/**
 * Regression test for FLINK-13249: a simulated network thread calling
 * {@code onFailedPartitionRequest()} must complete while another thread is stuck inside
 * {@code createPartitionRequestClient(...)}.
 */
@Test
public void testOnFailedPartitionRequestDoesNotBlockNetworkThreads() throws Exception {
    final long maxBlockedWaitMillis = 30_000L;

    // Coordination between the requesting thread and the simulated network thread.
    final OneShotLatch clientCreationEntered = new OneShotLatch();
    final OneShotLatch releaseClientCreation = new OneShotLatch();
    final AtomicBoolean waitAborted = new AtomicBoolean(false);

    final PartitionProducerStateChecker alwaysRunningChecker = (job, dataSet, partition) -> CompletableFuture.completedFuture(ExecutionState.RUNNING);
    final NettyShuffleEnvironment environment = new NettyShuffleEnvironmentBuilder().build();
    final Task consumerTask = new TestTaskBuilder(environment).setPartitionProducerStateChecker(alwaysRunningChecker).build();
    final SingleInputGate gate = new SingleInputGateBuilder().setPartitionProducerStateProvider(consumerTask).build();
    TestTaskBuilder.setTaskState(consumerTask, ExecutionState.RUNNING);

    final ConnectionManager stallingConnectionManager = new TestingConnectionManager() {

        @Override
        public PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) {
            clientCreationEntered.trigger();
            try {
                // Stall here, inside a section that holds the SingleInputGate#requestLock,
                // until the network thread releases us or the timeout expires.
                releaseClientCreation.await(maxBlockedWaitMillis, TimeUnit.MILLISECONDS);
            } catch (InterruptedException | TimeoutException e) {
                waitAborted.set(true);
            }
            return new TestingPartitionRequestClient();
        }
    };

    final RemoteInputChannel channel = InputChannelBuilder.newBuilder().setConnectionManager(stallingConnectionManager).buildRemoteChannel(gate);
    gate.setInputChannels(channel);

    final Runnable networkThreadBody = () -> {
        try {
            clientCreationEntered.await();
            // This call must not block on SingleInputGate#requestLock as well.
            channel.onFailedPartitionRequest();
            // Only reached, and the stalled thread only released, if we did not block ourselves.
            releaseClientCreation.trigger();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    };
    final Thread networkThread = new Thread(networkThreadBody);
    networkThread.start();

    // Entry point that leads into stallingConnectionManager#createPartitionRequestClient(...).
    gate.requestPartitions();
    networkThread.join();

    Assert.assertFalse("Test ended by timeout or interruption - this indicates that the network thread was blocked.", waitAborted.get());
}
Also used : TestTaskBuilder(org.apache.flink.runtime.taskmanager.TestTaskBuilder) Arrays(java.util.Arrays) Matchers.isA(org.hamcrest.Matchers.isA) AvailabilityUtil.assertAvailability(org.apache.flink.runtime.io.network.partition.AvailabilityUtil.assertAvailability) ProducerFailedException(org.apache.flink.runtime.io.network.partition.ProducerFailedException) TimeoutException(java.util.concurrent.TimeoutException) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Random(java.util.Random) PartitionRequestClient(org.apache.flink.runtime.io.network.PartitionRequestClient) NetworkBuffer(org.apache.flink.runtime.io.network.buffer.NetworkBuffer) AvailabilityUtil.assertPriorityAvailability(org.apache.flink.runtime.io.network.partition.AvailabilityUtil.assertPriorityAvailability) Lists(org.apache.flink.shaded.guava30.com.google.common.collect.Lists) Future(java.util.concurrent.Future) CheckpointException(org.apache.flink.runtime.checkpoint.CheckpointException) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) CheckpointStorageLocationReference.getDefault(org.apache.flink.runtime.state.CheckpointStorageLocationReference.getDefault) Assert.fail(org.junit.Assert.fail) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) CheckpointType(org.apache.flink.runtime.checkpoint.CheckpointType) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) EventSerializer(org.apache.flink.runtime.io.network.api.serialization.EventSerializer) TestBufferFactory.createBuffer(org.apache.flink.runtime.io.network.util.TestBufferFactory.createBuffer) ExpectedTestException(org.apache.flink.runtime.operators.testutils.ExpectedTestException) InputChannelTestUtils(org.apache.flink.runtime.io.network.partition.InputChannelTestUtils) DataType(org.apache.flink.runtime.io.network.buffer.Buffer.DataType) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) 
FreeingBufferRecycler(org.apache.flink.runtime.io.network.buffer.FreeingBufferRecycler) CheckpointOptions(org.apache.flink.runtime.checkpoint.CheckpointOptions) NettyShuffleEnvironment(org.apache.flink.runtime.io.network.NettyShuffleEnvironment) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) PartitionNotFoundException(org.apache.flink.runtime.io.network.partition.PartitionNotFoundException) Collectors(java.util.stream.Collectors) Buffer(org.apache.flink.runtime.io.network.buffer.Buffer) Executors(java.util.concurrent.Executors) Matchers.any(org.mockito.Matchers.any) CloseableIterator(org.apache.flink.util.CloseableIterator) List(java.util.List) CheckpointBarrier(org.apache.flink.runtime.io.network.api.CheckpointBarrier) Matchers.contains(org.hamcrest.Matchers.contains) Assert.assertFalse(org.junit.Assert.assertFalse) Optional(java.util.Optional) TestingPartitionRequestClient(org.apache.flink.runtime.io.network.TestingPartitionRequestClient) Matchers.is(org.hamcrest.Matchers.is) Queue(java.util.Queue) Mockito.mock(org.mockito.Mockito.mock) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) TestingConnectionManager(org.apache.flink.runtime.io.network.TestingConnectionManager) TestBufferFactory(org.apache.flink.runtime.io.network.util.TestBufferFactory) CHECKPOINT(org.apache.flink.runtime.checkpoint.CheckpointType.CHECKPOINT) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) InputChannelTestUtils.createSingleInputGate(org.apache.flink.runtime.io.network.partition.InputChannelTestUtils.createSingleInputGate) Callable(java.util.concurrent.Callable) CompletableFuture(java.util.concurrent.CompletableFuture) Mockito.spy(org.mockito.Mockito.spy) NetworkBufferPool(org.apache.flink.runtime.io.network.buffer.NetworkBufferPool) ArrayList(java.util.ArrayList) Matchers.hasProperty(org.hamcrest.Matchers.hasProperty) BufferBuilder(org.apache.flink.runtime.io.network.buffer.BufferBuilder) 
EventSerializer.toBuffer(org.apache.flink.runtime.io.network.api.serialization.EventSerializer.toBuffer) ChannelStateWriter(org.apache.flink.runtime.checkpoint.channel.ChannelStateWriter) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) CheckpointOptions.alignedWithTimeout(org.apache.flink.runtime.checkpoint.CheckpointOptions.alignedWithTimeout) InputChannelInfo(org.apache.flink.runtime.checkpoint.channel.InputChannelInfo) Nullable(javax.annotation.Nullable) ExecutorService(java.util.concurrent.ExecutorService) MemorySegment(org.apache.flink.core.memory.MemorySegment) MemorySegmentFactory(org.apache.flink.core.memory.MemorySegmentFactory) NettyShuffleEnvironmentBuilder(org.apache.flink.runtime.io.network.NettyShuffleEnvironmentBuilder) ConnectionManager(org.apache.flink.runtime.io.network.ConnectionManager) Assert.assertNotNull(org.junit.Assert.assertNotNull) BufferPool(org.apache.flink.runtime.io.network.buffer.BufferPool) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) IOException(java.io.IOException) Mockito.times(org.mockito.Mockito.times) Mockito.when(org.mockito.Mockito.when) PartitionProducerStateChecker(org.apache.flink.runtime.taskexecutor.PartitionProducerStateChecker) Mockito.verify(org.mockito.Mockito.verify) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) BufferBuilderTestUtils.buildSingleBuffer(org.apache.flink.runtime.io.network.buffer.BufferBuilderTestUtils.buildSingleBuffer) BufferAndAvailability(org.apache.flink.runtime.io.network.partition.consumer.InputChannel.BufferAndAvailability) Task(org.apache.flink.runtime.taskmanager.Task) Assert.assertNull(org.junit.Assert.assertNull) NoOpBufferPool(org.apache.flink.runtime.io.network.buffer.NoOpBufferPool) Assert(org.junit.Assert) ArrayDeque(java.util.ArrayDeque) PartitionProducerStateProvider(org.apache.flink.runtime.io.network.partition.PartitionProducerStateProvider) 
Assert.assertEquals(org.junit.Assert.assertEquals) Task(org.apache.flink.runtime.taskmanager.Task) TestingConnectionManager(org.apache.flink.runtime.io.network.TestingConnectionManager) NettyShuffleEnvironmentBuilder(org.apache.flink.runtime.io.network.NettyShuffleEnvironmentBuilder) NettyShuffleEnvironment(org.apache.flink.runtime.io.network.NettyShuffleEnvironment) InputChannelTestUtils.createSingleInputGate(org.apache.flink.runtime.io.network.partition.InputChannelTestUtils.createSingleInputGate) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) TestingConnectionManager(org.apache.flink.runtime.io.network.TestingConnectionManager) ConnectionManager(org.apache.flink.runtime.io.network.ConnectionManager) TestTaskBuilder(org.apache.flink.runtime.taskmanager.TestTaskBuilder) OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) PartitionProducerStateChecker(org.apache.flink.runtime.taskexecutor.PartitionProducerStateChecker) TestingPartitionRequestClient(org.apache.flink.runtime.io.network.TestingPartitionRequestClient) TimeoutException(java.util.concurrent.TimeoutException) Test(org.junit.Test)

Example 9 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class PartitionRequestClientFactory method createPartitionRequestClient.

/**
 * Atomically establishes a TCP connection to the given remote address and creates a {@link
 * NettyPartitionRequestClient} instance for this connection.
 *
 * <p>The first caller for a (re-mapped) connection ID registers a future in {@code clients} and
 * connects; concurrent callers wait on that future. A client that fails reference-count
 * validation is closed or destroyed and the whole procedure is retried.
 *
 * @param connectionId address and connection index of the remote side
 * @return a client whose reference counter has been incremented for the caller
 * @throws IOException if the connection could not be established, either by this call or by the
 *     call whose registered future this one waited on
 * @throws InterruptedException declared for the blocking connect and future wait
 */
NettyPartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) throws IOException, InterruptedException {
    // Re-map the connection index modulo maxNumberOfConnections to restrict the number of TCP
    // connections.
    final ConnectionID pooledId = new ConnectionID(connectionId.getAddress(), connectionId.getConnectionIndex() % maxNumberOfConnections);
    for (;;) {
        final CompletableFuture<NettyPartitionRequestClient> ownFuture = new CompletableFuture<>();
        final CompletableFuture<NettyPartitionRequestClient> registeredFuture = clients.putIfAbsent(pooledId, ownFuture);
        final NettyPartitionRequestClient candidate;
        if (registeredFuture != null) {
            // Somebody else registered first: wait for the outcome of their connection attempt.
            try {
                candidate = registeredFuture.get();
            } catch (ExecutionException failure) {
                ExceptionUtils.rethrowIOException(ExceptionUtils.stripExecutionException(failure));
                // Presumably never reached because rethrowIOException always throws - confirm.
                return null;
            }
        } else {
            // Our future won the registration: we are responsible for connecting.
            try {
                candidate = connectWithRetries(pooledId);
            } catch (Throwable failure) {
                // Publish the failure to waiting callers, then unregister our own future.
                ownFuture.completeExceptionally(new IOException("Could not create Netty client.", failure));
                clients.remove(pooledId, ownFuture);
                throw failure;
            }
            ownFuture.complete(candidate);
        }
        // Increment the reference count before handing a client out to ensure correct
        // bookkeeping for channel closing.
        if (candidate.validateClientAndIncrementReferenceCounter()) {
            return candidate;
        }
        // Validation failed: clean up this client and retry with the next loop iteration.
        if (candidate.canBeDisposed()) {
            candidate.closeConnection();
        } else {
            destroyPartitionRequestClient(pooledId, candidate);
        }
    }
}
Also used : ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) CompletableFuture(java.util.concurrent.CompletableFuture) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException)

Example 10 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class PartitionRequestClientFactoryTest method testFailureReportedToSubsequentRequests.

// Regression test for https://issues.apache.org/jira/browse/FLINK-18821:
// after a first failed request, a second request for the same connection must throw an
// IOException as well.
@Test(expected = IOException.class)
public void testFailureReportedToSubsequentRequests() throws Exception {
    final PartitionRequestClientFactory clientFactory = new PartitionRequestClientFactory(new FailingNettyClient(), 2, 1, connectionReuseEnabled);
    try {
        final InetSocketAddress firstTarget = new InetSocketAddress(InetAddress.getLocalHost(), 8080);
        clientFactory.createPartitionRequestClient(new ConnectionID(firstTarget, 0));
    } catch (Exception ignored) {
        // The first request is expected to fail; only the second request is checked.
    }
    final InetSocketAddress secondTarget = new InetSocketAddress(InetAddress.getLocalHost(), 8080);
    clientFactory.createPartitionRequestClient(new ConnectionID(secondTarget, 0));
}
Also used : ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) InetSocketAddress(java.net.InetSocketAddress) RemoteTransportException(org.apache.flink.runtime.io.network.netty.exception.RemoteTransportException) ChannelException(org.apache.flink.shaded.netty4.io.netty.channel.ChannelException) IOException(java.io.IOException) Test(org.junit.Test)

Aggregations

ConnectionID (org.apache.flink.runtime.io.network.ConnectionID)12 Test (org.junit.Test)6 ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID)5 IOException (java.io.IOException)4 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)4 InetSocketAddress (java.net.InetSocketAddress)3 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)3 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)3 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)3 CompletableFuture (java.util.concurrent.CompletableFuture)2 TimeoutException (java.util.concurrent.TimeoutException)2 Configuration (org.apache.flink.configuration.Configuration)2 IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID)2 ActorRef (akka.actor.ActorRef)1 JavaTestKit (akka.testkit.JavaTestKit)1 ArrayDeque (java.util.ArrayDeque)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 HashSet (java.util.HashSet)1 List (java.util.List)1