use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.
the class PartialInputChannelDeploymentDescriptor method createInputChannelDeploymentDescriptor.
/**
 * Completes this partial descriptor into a full {@link InputChannelDeploymentDescriptor} by
 * deciding where the partition lives relative to the consumer.
 *
 * <p>If the consumer's assigned location equals the partition's task manager location, the
 * partition is addressed as local; otherwise it is addressed as remote through a
 * {@link ConnectionID} built from the partition's task manager location and connection index.
 *
 * @param consumerExecution the consuming execution; must not be null and must have an assigned
 *     resource location
 * @return the completed input channel deployment descriptor
 * @see InputChannelDeploymentDescriptor
 */
public InputChannelDeploymentDescriptor createInputChannelDeploymentDescriptor(Execution consumerExecution) {
    checkNotNull(consumerExecution, "consumerExecution");

    TaskManagerLocation locationOfConsumer = consumerExecution.getAssignedResourceLocation();
    checkNotNull(locationOfConsumer, "Consumer connection info null");

    // Producer and consumer share a task manager => the partition can be read locally.
    if (locationOfConsumer.equals(partitionTaskManagerLocation)) {
        return new InputChannelDeploymentDescriptor(partitionID, ResultPartitionLocation.createLocal());
    }

    // Otherwise the consumer has to connect to the producer's task manager.
    final ConnectionID producerConnection = new ConnectionID(partitionTaskManagerLocation, partitionConnectionIndex);
    return new InputChannelDeploymentDescriptor(partitionID, ResultPartitionLocation.createRemote(producerConnection));
}
use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.
the class Execution method scheduleOrUpdateConsumers.
/**
 * Schedules the consumers of a produced partition, or sends them the partition's location,
 * depending on the state of each consumer's current execution attempt.
 *
 * <ul>
 *   <li>{@code CREATED}: the partition info is cached on the consumer vertex and the vertex is
 *       scheduled asynchronously on {@code executor}.
 *   <li>{@code RUNNING}: an update message with the (local or remote) partition location is sent
 *       to the consumer right away.
 *   <li>{@code SCHEDULED} / {@code DEPLOYING}: the partition info is cached on the consumer
 *       vertex so that it can be sent later.
 * </ul>
 *
 * <p>Only a single consumer group is supported. If more than one group is passed, this execution
 * is failed with an {@link IllegalStateException} and no consumer is scheduled or updated.
 *
 * @param allConsumers the consumer groups of the partition; an empty list is a no-op
 */
void scheduleOrUpdateConsumers(List<List<ExecutionEdge>> allConsumers) {
    final int numConsumers = allConsumers.size();
    if (numConsumers == 0) {
        return;
    }
    if (numConsumers > 1) {
        fail(new IllegalStateException("Currently, only a single consumer group per partition is supported."));
        // This execution has just been failed; scheduling or updating the first consumer
        // group anyway would act on behalf of a failed producer, so stop here.
        return;
    }
    for (ExecutionEdge edge : allConsumers.get(0)) {
        final ExecutionVertex consumerVertex = edge.getTarget();
        final Execution consumer = consumerVertex.getCurrentExecutionAttempt();
        final ExecutionState consumerState = consumer.getState();
        final IntermediateResultPartition partition = edge.getSource();
        // ----------------------------------------------------------------
        // Consumer is created => try to schedule it; the partition info
        // is cached and sent once the consumer is deployed
        // ----------------------------------------------------------------
        if (consumerState == CREATED) {
            final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
            consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
            // When deploying a consuming task, its task deployment descriptor will contain all
            // deployment information available at the respective time. It is possible that some
            // of the partitions to be consumed have not been created yet. These are updated at
            // runtime via the update messages.
            //
            // TODO The current approach may send many update messages even though the consuming
            // task has already been deployed with all necessary information. We have to check
            // whether this is a problem and fix it, if it is.
            FlinkFuture.supplyAsync(new Callable<Void>() {
                @Override
                public Void call() throws Exception {
                    try {
                        consumerVertex.scheduleForExecution(consumerVertex.getExecutionGraph().getSlotProvider(), consumerVertex.getExecutionGraph().isQueuedSchedulingAllowed());
                    } catch (Throwable t) {
                        // Scheduling problems fail the consumer vertex instead of escaping
                        // from the asynchronous call.
                        consumerVertex.fail(new IllegalStateException("Could not schedule consumer " + "vertex " + consumerVertex, t));
                    }
                    return null;
                }
            }, executor);
            // double check to resolve race conditions
            if (consumerVertex.getExecutionState() == RUNNING) {
                consumerVertex.sendPartitionInfos();
            }
        }
        // ----------------------------------------------------------------
        // Consumer is running => send update message now
        // ----------------------------------------------------------------
        else if (consumerState == RUNNING) {
            final SimpleSlot consumerSlot = consumer.getAssignedResource();
            if (consumerSlot == null) {
                // The consumer has been reset concurrently
                continue;
            }
            final TaskManagerLocation partitionTaskManagerLocation = partition.getProducer().getCurrentAssignedResource().getTaskManagerLocation();
            final ResourceID partitionTaskManager = partitionTaskManagerLocation.getResourceID();
            final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
            final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), attemptId);
            final ResultPartitionLocation partitionLocation;
            if (consumerTaskManager.equals(partitionTaskManager)) {
                // Consuming task is deployed to the same instance as the partition => local
                partitionLocation = ResultPartitionLocation.createLocal();
            } else {
                // Different instances => remote
                final ConnectionID connectionId = new ConnectionID(partitionTaskManagerLocation, partition.getIntermediateResult().getConnectionIndex());
                partitionLocation = ResultPartitionLocation.createRemote(connectionId);
            }
            final InputChannelDeploymentDescriptor descriptor = new InputChannelDeploymentDescriptor(partitionId, partitionLocation);
            consumer.sendUpdatePartitionInfoRpcCall(Collections.singleton(new PartitionInfo(partition.getIntermediateResult().getId(), descriptor)));
        }
        // ----------------------------------------------------------------
        // Consumer is scheduled or deploying => cache the partition info
        // ----------------------------------------------------------------
        else if (consumerState == SCHEDULED || consumerState == DEPLOYING) {
            final Execution partitionExecution = partition.getProducer().getCurrentExecutionAttempt();
            consumerVertex.cachePartitionInfo(PartialInputChannelDeploymentDescriptor.fromEdge(partition, partitionExecution));
            // double check to resolve race conditions
            if (consumerVertex.getExecutionState() == RUNNING) {
                consumerVertex.sendPartitionInfos();
            }
        }
    }
}
use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.
the class RemoteInputChannelTest method testOnFailedPartitionRequestDoesNotBlockNetworkThreads.
/**
 * Regression test for FLINK-13249: a failed partition request reported from a (simulated)
 * network thread must not block while another thread is stuck inside the partition request.
 */
@Test
public void testOnFailedPartitionRequestDoesNotBlockNetworkThreads() throws Exception {
    final long maxBlockedWaitMillis = 30_000L;

    final PartitionProducerStateChecker alwaysRunningChecker = (jobId, intermediateDataSetId, resultPartitionId) -> CompletableFuture.completedFuture(ExecutionState.RUNNING);
    final NettyShuffleEnvironment environment = new NettyShuffleEnvironmentBuilder().build();
    final Task producerStateProvider = new TestTaskBuilder(environment).setPartitionProducerStateChecker(alwaysRunningChecker).build();
    final SingleInputGate gate = new SingleInputGateBuilder().setPartitionProducerStateProvider(producerStateProvider).build();
    TestTaskBuilder.setTaskState(producerStateProvider, ExecutionState.RUNNING);

    // Signals that client creation has been entered.
    final OneShotLatch clientCreationEntered = new OneShotLatch();
    // Released by the simulated network thread once it got past the failure callback.
    final OneShotLatch releaseClientCreation = new OneShotLatch();
    final AtomicBoolean waitAborted = new AtomicBoolean(false);

    final ConnectionManager stallingConnectionManager = new TestingConnectionManager() {
        @Override
        public PartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) {
            clientCreationEntered.trigger();
            try {
                // Stall here, in a section that holds the SingleInputGate#requestLock,
                // until the network thread releases us or the timeout expires.
                releaseClientCreation.await(maxBlockedWaitMillis, TimeUnit.MILLISECONDS);
            } catch (InterruptedException | TimeoutException e) {
                waitAborted.set(true);
            }
            return new TestingPartitionRequestClient();
        }
    };

    final RemoteInputChannel channel = InputChannelBuilder.newBuilder().setConnectionManager(stallingConnectionManager).buildRemoteChannel(gate);
    gate.setInputChannels(channel);

    final Runnable networkThreadBody = () -> {
        try {
            clientCreationEntered.await();
            // This call must not block on SingleInputGate#requestLock, which the
            // requesting thread is holding at this point.
            channel.onFailedPartitionRequest();
            // Only reached if the call above returned, i.e. we were not blocked ourselves.
            releaseClientCreation.trigger();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
        }
    };
    final Thread networkThread = new Thread(networkThreadBody);
    networkThread.start();

    // Entry point that leads into stallingConnectionManager#createPartitionRequestClient(...).
    gate.requestPartitions();
    networkThread.join();

    Assert.assertFalse("Test ended by timeout or interruption - this indicates that the network thread was blocked.", waitAborted.get());
}
use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.
the class PartitionRequestClientFactory method createPartitionRequestClient.
/**
 * Atomically establishes a TCP connection to the given remote address and creates a {@link
 * NettyPartitionRequestClient} instance for this connection.
 *
 * <p>The connection index of the given id is first reduced modulo {@code
 * maxNumberOfConnections}. The caller that manages to register its future in {@code clients}
 * connects; every other caller waits on the already registered future. The loop repeats until a
 * client accepts the reference-count increment.
 *
 * @param connectionId the address and connection index of the remote side
 * @return a client whose reference counter has been incremented for the caller
 * @throws IOException if connecting failed, either in this call or in the call whose future
 *     this call waited on
 * @throws InterruptedException if interrupted while connecting or waiting
 */
NettyPartitionRequestClient createPartitionRequestClient(ConnectionID connectionId) throws IOException, InterruptedException {
    // Fold the requested connection index into [0, maxNumberOfConnections) so that the number
    // of tcp connections per address stays bounded.
    final int boundedIndex = connectionId.getConnectionIndex() % maxNumberOfConnections;
    connectionId = new ConnectionID(connectionId.getAddress(), boundedIndex);

    for (;;) {
        final CompletableFuture<NettyPartitionRequestClient> ownFuture = new CompletableFuture<>();
        final CompletableFuture<NettyPartitionRequestClient> registeredFuture = clients.putIfAbsent(connectionId, ownFuture);

        final NettyPartitionRequestClient candidate;
        if (registeredFuture != null) {
            // Somebody else is (or was) connecting for this id: wait for their result.
            try {
                candidate = registeredFuture.get();
            } catch (ExecutionException e) {
                ExceptionUtils.rethrowIOException(ExceptionUtils.stripExecutionException(e));
                // Only here to satisfy the compiler's definite-assignment analysis.
                return null;
            }
        } else {
            // Our future got registered: we are responsible for connecting.
            try {
                candidate = connectWithRetries(connectionId);
            } catch (Throwable e) {
                // Report the failure to waiters first, then unregister the failed future.
                ownFuture.completeExceptionally(new IOException("Could not create Netty client.", e));
                clients.remove(connectionId, ownFuture);
                throw e;
            }
            ownFuture.complete(candidate);
        }

        // Make sure to increment the reference count before handing a client
        // out to ensure correct bookkeeping for channel closing.
        if (candidate.validateClientAndIncrementReferenceCounter()) {
            return candidate;
        }

        // The client could not be handed out: clean it up and try again.
        if (candidate.canBeDisposed()) {
            candidate.closeConnection();
        } else {
            destroyPartitionRequestClient(connectionId, candidate);
        }
    }
}
use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.
the class PartitionRequestClientFactoryTest method testFailureReportedToSubsequentRequests.
/**
 * Verifies that a second request for the same connection still reports an {@link IOException}
 * when the factory is backed by a {@code FailingNettyClient}.
 *
 * <p>See https://issues.apache.org/jira/browse/FLINK-18821
 */
@Test(expected = IOException.class)
public void testFailureReportedToSubsequentRequests() throws Exception {
    PartitionRequestClientFactory factory = new PartitionRequestClientFactory(new FailingNettyClient(), 2, 1, connectionReuseEnabled);
    final ConnectionID target = new ConnectionID(new InetSocketAddress(InetAddress.getLocalHost(), 8080), 0);

    try {
        factory.createPartitionRequestClient(target);
    } catch (Exception ignored) {
        // expected: the first request fails
    }

    // The subsequent request for the same connection must surface the failure as well.
    factory.createPartitionRequestClient(target);
}
Aggregations