Search in sources :

Example 1 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class InputChannelDeploymentDescriptorTest method testMixedLocalRemoteUnknownDeployment.

/**
 * Checks the input channel descriptors that are created for a producer on the
 * consumer's own resource, a producer on a different resource, and a producer
 * without any assigned resource. The check is repeated for every execution
 * state of the producers, with lazy deployment allowed.
 */
@Test
public void testMixedLocalRemoteUnknownDeployment() throws Exception {
    final boolean lazyDeploymentAllowed = true;
    final ResourceID consumerId = ResourceID.generate();
    final ExecutionVertex consumerVertex = mock(ExecutionVertex.class);
    final SimpleSlot slotOfConsumer = mockSlot(consumerId);

    // Local and remote channels are only expected for some producer states,
    // so every state is exercised.
    for (ExecutionState producerState : ExecutionState.values()) {
        // Producer that shares the consumer's resource ID (local candidate)
        final ExecutionVertex sameResourceProducer = mockExecutionVertex(producerState, consumerId);
        final IntermediateResultPartition sameResourcePartition = mockPartition(sameResourceProducer);
        final ResultPartitionID expectedLocalId = new ResultPartitionID(
                sameResourcePartition.getPartitionId(),
                sameResourceProducer.getCurrentExecutionAttempt().getAttemptId());
        final ExecutionEdge edgeToLocal = new ExecutionEdge(sameResourcePartition, consumerVertex, 0);

        // Producer with a freshly generated resource ID (remote candidate)
        final ExecutionVertex otherResourceProducer = mockExecutionVertex(producerState, ResourceID.generate());
        final IntermediateResultPartition otherResourcePartition = mockPartition(otherResourceProducer);
        final ResultPartitionID expectedRemoteId = new ResultPartitionID(
                otherResourcePartition.getPartitionId(),
                otherResourceProducer.getCurrentExecutionAttempt().getAttemptId());
        final ConnectionID expectedConnection = new ConnectionID(
                otherResourceProducer.getCurrentAssignedResource().getTaskManagerLocation(), 0);
        final ExecutionEdge edgeToRemote = new ExecutionEdge(otherResourcePartition, consumerVertex, 1);

        // Producer without an assigned resource (always unknown)
        final ExecutionVertex unassignedProducer = mockExecutionVertex(producerState, null);
        final IntermediateResultPartition unassignedPartition = mockPartition(unassignedProducer);
        final ResultPartitionID expectedUnknownId = new ResultPartitionID(
                unassignedPartition.getPartitionId(),
                unassignedProducer.getCurrentExecutionAttempt().getAttemptId());
        final ExecutionEdge edgeToUnknown = new ExecutionEdge(unassignedPartition, consumerVertex, 2);

        final ExecutionEdge[] allEdges = new ExecutionEdge[] { edgeToLocal, edgeToRemote, edgeToUnknown };
        final InputChannelDeploymentDescriptor[] descriptors =
                InputChannelDeploymentDescriptor.fromEdges(allEdges, slotOfConsumer, lazyDeploymentAllowed);
        assertEquals(3, descriptors.length);

        // Producer states for which a concrete (local/remote) location is expected
        final boolean concreteLocationExpected = producerState == ExecutionState.RUNNING
                || producerState == ExecutionState.FINISHED
                || producerState == ExecutionState.SCHEDULED
                || producerState == ExecutionState.DEPLOYING;

        if (!concreteLocationExpected) {
            // Any other state: location stays unknown (lazy deployment allowed)
            assertEquals(expectedLocalId, descriptors[0].getConsumedPartitionId());
            assertTrue(descriptors[0].getConsumedPartitionLocation().isUnknown());
            assertNull(descriptors[0].getConsumedPartitionLocation().getConnectionId());

            assertEquals(expectedRemoteId, descriptors[1].getConsumedPartitionId());
            assertTrue(descriptors[1].getConsumedPartitionLocation().isUnknown());
            assertNull(descriptors[1].getConsumedPartitionLocation().getConnectionId());
        } else {
            // Local channel carries no connection ID
            assertEquals(expectedLocalId, descriptors[0].getConsumedPartitionId());
            assertTrue(descriptors[0].getConsumedPartitionLocation().isLocal());
            assertNull(descriptors[0].getConsumedPartitionLocation().getConnectionId());

            // Remote channel carries the producer's connection ID
            assertEquals(expectedRemoteId, descriptors[1].getConsumedPartitionId());
            assertTrue(descriptors[1].getConsumedPartitionLocation().isRemote());
            assertEquals(expectedConnection, descriptors[1].getConsumedPartitionLocation().getConnectionId());
        }

        // The producer without a resource is unknown regardless of its state
        assertEquals(expectedUnknownId, descriptors[2].getConsumedPartitionId());
        assertTrue(descriptors[2].getConsumedPartitionLocation().isUnknown());
        assertNull(descriptors[2].getConsumedPartitionLocation().getConnectionId());
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) IntermediateResultPartition(org.apache.flink.runtime.executiongraph.IntermediateResultPartition) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ExecutionEdge(org.apache.flink.runtime.executiongraph.ExecutionEdge) IntermediateResultPartitionID(org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) Test(org.junit.Test)

Example 2 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class PartitionRequestClientFactoryTest method testResourceReleaseAfterInterruptedConnect.

/**
 * Interrupts a thread while it is creating a partition request client and
 * checks that the factory holds no active clients afterwards and that the
 * failure reached the uncaught exception handler installed on that thread.
 */
@Test
public void testResourceReleaseAfterInterruptedConnect() throws Exception {
    // Latch to synchronize on the connect call.
    final CountDownLatch syncOnConnect = new CountDownLatch(1);
    final Tuple2<NettyServer, NettyClient> netty = createNettyServerAndClient(new NettyProtocol() {

        @Override
        public ChannelHandler[] getServerChannelHandlers() {
            return new ChannelHandler[0];
        }

        @Override
        public ChannelHandler[] getClientChannelHandlers() {
            return new ChannelHandler[] { new CountDownLatchOnConnectHandler(syncOnConnect) };
        }
    });
    final NettyServer server = netty.f0;
    final NettyClient client = netty.f1;
    final UncaughtTestExceptionHandler exceptionHandler = new UncaughtTestExceptionHandler();
    try {
        final PartitionRequestClientFactory factory = new PartitionRequestClientFactory(client);
        final Thread connect = new Thread(new Runnable() {

            @Override
            public void run() {
                ConnectionID serverAddress = null;
                try {
                    serverAddress = createServerConnectionID(0);
                    // This triggers a connect
                    factory.createPartitionRequestClient(serverAddress);
                } catch (Throwable t) {
                    if (serverAddress != null) {
                        factory.closeOpenChannelConnections(serverAddress);
                        // Report to the handler installed on THIS thread. The JVM-wide default
                        // handler may be null (which would replace t with an accidental
                        // NullPointerException) and is never the handler the test inspects.
                        final Thread current = Thread.currentThread();
                        current.getUncaughtExceptionHandler().uncaughtException(current, t);
                    } else {
                        t.printStackTrace();
                        fail("Could not create RemoteAddress for server.");
                    }
                }
            }
        });
        connect.setUncaughtExceptionHandler(exceptionHandler);
        connect.start();
        // Wait on the connect
        syncOnConnect.await();
        connect.interrupt();
        connect.join();
        // Make sure that after a failed connect all resources are cleared.
        assertEquals(0, factory.getNumberOfActiveClients());
        // Make sure that the interrupt exception is not swallowed
        assertTrue(exceptionHandler.getErrors().size() > 0);
    } finally {
        if (server != null) {
            server.shutdown();
        }
        if (client != null) {
            client.shutdown();
        }
    }
}
Also used : CountDownLatch(java.util.concurrent.CountDownLatch) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) Test(org.junit.Test)

Example 3 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class TaskManagerTest method testRemotePartitionNotFound.

/**
	 * Tests that repeated remote {@link PartitionNotFoundException}s ultimately fail the receiver.
	 *
	 * <p>A receiver task is submitted whose only input channel points at a remote
	 * location for a freshly created partition ID; the test then expects the task
	 * to be reported as FAILED with a {@link PartitionNotFoundException}.
	 */
@Test
public void testRemotePartitionNotFound() throws Exception {
    new JavaTestKit(system) {

        {
            // Both gateways start as null so the finally block can reference them
            // even when setup fails part-way through.
            ActorGateway jobManager = null;
            ActorGateway taskManager = null;
            // Gateway wrapping the test actor; used as the sender when submitting the task.
            final ActorGateway testActorGateway = new AkkaActorGateway(getTestActor(), leaderSessionID);
            try {
                final IntermediateDataSetID resultId = new IntermediateDataSetID();
                // Create the JM
                ActorRef jm = system.actorOf(Props.create(new SimplePartitionStateLookupJobManagerCreator(leaderSessionID, getTestActor())));
                jobManager = new AkkaActorGateway(jm, leaderSessionID);
                // Configure the TM's data port and the network request backoff bounds
                // (100 initial, 200 max — presumably milliseconds; TODO confirm units).
                final int dataPort = NetUtils.getAvailablePort();
                Configuration config = new Configuration();
                config.setInteger(ConfigConstants.TASK_MANAGER_DATA_PORT_KEY, dataPort);
                config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_INITIAL, 100);
                config.setInteger(TaskManagerOptions.NETWORK_REQUEST_BACKOFF_MAX, 200);
                // NOTE(review): the meaning of the two boolean flags is defined by
                // TestingUtils.createTaskManager and is not visible here.
                taskManager = TestingUtils.createTaskManager(system, jobManager, config, false, true);
                // ---------------------------------------------------------------------------------
                // Final alias so the anonymous Within block below can capture the gateway.
                final ActorGateway tm = taskManager;
                final JobID jid = new JobID();
                final JobVertexID vid = new JobVertexID();
                final ExecutionAttemptID eid = new ExecutionAttemptID();
                // Fresh partition ID; nothing in this test registers a partition under it.
                final ResultPartitionID partitionId = new ResultPartitionID();
                // Remote location (on the same TM though) for the partition
                final ResultPartitionLocation loc = ResultPartitionLocation.createRemote(new ConnectionID(new InetSocketAddress("localhost", dataPort), 0));
                // One input gate with a single channel that consumes the partition above.
                final InputChannelDeploymentDescriptor[] icdd = new InputChannelDeploymentDescriptor[] { new InputChannelDeploymentDescriptor(partitionId, loc) };
                final InputGateDeploymentDescriptor igdd = new InputGateDeploymentDescriptor(resultId, ResultPartitionType.PIPELINED, 0, icdd);
                // Receiver task: no produced partitions, the single input gate from above.
                final TaskDeploymentDescriptor tdd = createTaskDeploymentDescriptor(jid, "TestJob", vid, eid, new SerializedValue<>(new ExecutionConfig()), "Receiver", 1, 0, 1, 0, new Configuration(), new Configuration(), Tasks.AgnosticReceiver.class.getName(), Collections.<ResultPartitionDeploymentDescriptor>emptyList(), Collections.singletonList(igdd), Collections.<BlobKey>emptyList(), Collections.<URL>emptyList(), 0);
                // `d` is declared outside this method (not visible here); presumably the
                // time limit for the expectations below — verify against the enclosing class.
                new Within(d) {

                    @Override
                    protected void run() {
                        // Submit the task
                        tm.tell(new SubmitTask(tdd), testActorGateway);
                        expectMsgClass(Acknowledge.get().getClass());
                        // Wait to be notified about the final execution state by the mock JM
                        TaskExecutionState msg = expectMsgClass(TaskExecutionState.class);
                        // The task should fail after repeated requests
                        assertEquals(ExecutionState.FAILED, msg.getExecutionState());
                        Throwable t = msg.getError(ClassLoader.getSystemClassLoader());
                        // NOTE(review): if getError(...) ever returned null, t.getMessage() would
                        // throw a NullPointerException before the assertion message is built.
                        assertEquals("Thrown exception was not a PartitionNotFoundException: " + t.getMessage(), PartitionNotFoundException.class, t.getClass());
                    }
                };
            } catch (Exception e) {
                // Any setup or messaging exception is printed and turned into a test failure.
                e.printStackTrace();
                fail(e.getMessage());
            } finally {
                // Either gateway may still be null here if creation failed;
                // presumably stopActor tolerates null — TODO confirm.
                TestingUtils.stopActor(taskManager);
                TestingUtils.stopActor(jobManager);
            }
        }
    };
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) TaskManagerServicesConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerServicesConfiguration) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) InetSocketAddress(java.net.InetSocketAddress) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) ResultPartitionLocation(org.apache.flink.runtime.deployment.ResultPartitionLocation) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) IntermediateResultPartitionID(org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) SubmitTask(org.apache.flink.runtime.messages.TaskMessages.SubmitTask) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) InputGateDeploymentDescriptor(org.apache.flink.runtime.deployment.InputGateDeploymentDescriptor) PartitionNotFoundException(org.apache.flink.runtime.io.network.partition.PartitionNotFoundException) IOException(java.io.IOException) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) InputChannelDeploymentDescriptor(org.apache.flink.runtime.deployment.InputChannelDeploymentDescriptor) IntermediateDataSetID(org.apache.flink.runtime.jobgraph.IntermediateDataSetID) JavaTestKit(akka.testkit.JavaTestKit) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 4 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class PartitionRequestClientFactoryTest method testExceptionsAreNotCached.

/**
 * Requests a client twice for the same connection ID: the first request is
 * expected to fail with a {@link RemoteTransportException}, and the second
 * request must then complete without throwing.
 */
@Test
public void testExceptionsAreNotCached() throws Exception {
    NettyTestUtil.NettyServerAndClient serverAndClient = createNettyServerAndClient();
    try {
        final PartitionRequestClientFactory clientFactory = new PartitionRequestClientFactory(
                new UnstableNettyClient(serverAndClient.client(), 1),
                connectionReuseEnabled);
        final ConnectionID targetConnection = serverAndClient.getConnectionID(0);

        // First attempt: must end in a RemoteTransportException.
        boolean firstAttemptFailed = false;
        try {
            clientFactory.createPartitionRequestClient(targetConnection);
        } catch (RemoteTransportException expected) {
            firstAttemptFailed = true;
        }
        if (!firstAttemptFailed) {
            fail("Expected the first request to fail.");
        }

        // Second attempt: an exception here fails the test.
        clientFactory.createPartitionRequestClient(targetConnection);
    } finally {
        serverAndClient.client().shutdown();
        serverAndClient.server().shutdown();
    }
}
Also used : RemoteTransportException(org.apache.flink.runtime.io.network.netty.exception.RemoteTransportException) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) Test(org.junit.Test)

Example 5 with ConnectionID

use of org.apache.flink.runtime.io.network.ConnectionID in project flink by apache.

the class InputChannelDeploymentDescriptor method fromEdges.

// ------------------------------------------------------------------------
/**
 * Builds one input channel deployment descriptor per edge, in edge order.
 *
 * <p>For each consumed partition the location is resolved as follows:
 * <ul>
 *   <li>local or remote, if the partition is consumable, the producer has an
 *       assigned slot, and the producer is RUNNING, FINISHED, SCHEDULED or
 *       DEPLOYING;</li>
 *   <li>unknown, if that does not hold but lazy deployment is allowed;</li>
 *   <li>otherwise an {@link ExecutionGraphException} is thrown.</li>
 * </ul>
 *
 * @param edges the edges whose source partitions are consumed
 * @param consumerSlot the slot of the consuming task
 * @param allowLazyDeployment whether unresolved locations may be deployed as unknown
 * @return one descriptor per edge
 * @throws ExecutionGraphException if a location cannot be resolved and lazy
 *         deployment is not allowed
 */
public static InputChannelDeploymentDescriptor[] fromEdges(ExecutionEdge[] edges, SimpleSlot consumerSlot, boolean allowLazyDeployment) throws ExecutionGraphException {
    final ResourceID consumerTaskManager = consumerSlot.getTaskManagerID();
    final InputChannelDeploymentDescriptor[] descriptors = new InputChannelDeploymentDescriptor[edges.length];

    // Each edge is connected to a different result partition
    int slotIndex = 0;
    for (ExecutionEdge edge : edges) {
        final IntermediateResultPartition partition = edge.getSource();
        final Execution producerAttempt = partition.getProducer().getCurrentExecutionAttempt();
        final ExecutionState state = producerAttempt.getState();
        final SimpleSlot slotOfProducer = producerAttempt.getAssignedResource();

        // Producer states for which a concrete location may be handed out
        final boolean stateAllowsLocation = state == ExecutionState.RUNNING
                || state == ExecutionState.FINISHED
                || state == ExecutionState.SCHEDULED
                || state == ExecutionState.DEPLOYING;

        final ResultPartitionLocation location;
        if (partition.isConsumable() && slotOfProducer != null && stateAllowsLocation) {
            final TaskManagerLocation producerLocation = slotOfProducer.getTaskManagerLocation();
            final ResourceID producerTaskManager = producerLocation.getResourceID();
            if (!producerTaskManager.equals(consumerTaskManager)) {
                // Producer and consumer are on different TaskManagers => remote
                final int connectionIndex = partition.getIntermediateResult().getConnectionIndex();
                location = ResultPartitionLocation.createRemote(new ConnectionID(producerLocation, connectionIndex));
            } else {
                // Producer and consumer share a TaskManager => local
                location = ResultPartitionLocation.createLocal();
            }
        } else if (allowLazyDeployment) {
            // The producing task might not have registered the partition yet
            location = ResultPartitionLocation.createUnknown();
        } else {
            final boolean producerGone = state == ExecutionState.CANCELING
                    || state == ExecutionState.CANCELED
                    || state == ExecutionState.FAILED;
            final String msg;
            if (producerGone) {
                msg = "Trying to schedule a task whose inputs were canceled or failed. " + "The producer is in state " + state + ".";
            } else {
                msg = String.format("Trying to eagerly schedule a task whose inputs " + "are not ready (partition consumable? %s, producer state: %s, producer slot: %s).", partition.isConsumable(), state, slotOfProducer);
            }
            throw new ExecutionGraphException(msg);
        }

        final ResultPartitionID partitionId = new ResultPartitionID(partition.getPartitionId(), producerAttempt.getAttemptId());
        descriptors[slotIndex] = new InputChannelDeploymentDescriptor(partitionId, location);
        slotIndex++;
    }
    return descriptors;
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ExecutionGraphException(org.apache.flink.runtime.executiongraph.ExecutionGraphException) TaskManagerLocation(org.apache.flink.runtime.taskmanager.TaskManagerLocation) SimpleSlot(org.apache.flink.runtime.instance.SimpleSlot) ConnectionID(org.apache.flink.runtime.io.network.ConnectionID) IntermediateResultPartition(org.apache.flink.runtime.executiongraph.IntermediateResultPartition) Execution(org.apache.flink.runtime.executiongraph.Execution) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID)

Aggregations

ConnectionID (org.apache.flink.runtime.io.network.ConnectionID)12 Test (org.junit.Test)6 ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID)5 IOException (java.io.IOException)4 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)4 InetSocketAddress (java.net.InetSocketAddress)3 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)3 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)3 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)3 CompletableFuture (java.util.concurrent.CompletableFuture)2 TimeoutException (java.util.concurrent.TimeoutException)2 Configuration (org.apache.flink.configuration.Configuration)2 IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID)2 ActorRef (akka.actor.ActorRef)1 JavaTestKit (akka.testkit.JavaTestKit)1 ArrayDeque (java.util.ArrayDeque)1 ArrayList (java.util.ArrayList)1 Arrays (java.util.Arrays)1 HashSet (java.util.HashSet)1 List (java.util.List)1