Search in sources :

Example 61 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class TaskManagerComponentsStartupShutdownTest method testComponentsStartupShutdown.

/**
 * Makes sure that all components are shut down when the TaskManager
 * actor is shut down.
 *
 * <p>The test starts JobManager and ResourceManager actors in a local actor
 * system, builds the TaskManager's components (memory manager, I/O manager,
 * network environment) by hand, and passes them to a TaskManager actor. Once
 * the TaskManager has reported its registration at the JobManager, the actors
 * and the actor system are terminated and each hand-built component is
 * checked for having been shut down.
 */
@Test
public void testComponentsStartupShutdown() {
    final String[] TMP_DIR = new String[] { ConfigConstants.DEFAULT_TASK_MANAGER_TMP_PATH };
    final Time timeout = Time.seconds(100);
    final int BUFFER_SIZE = 32 * 1024;
    // short death-watch heartbeat settings for the local actor system
    Configuration config = new Configuration();
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_INTERVAL, "200 ms");
    config.setString(ConfigConstants.AKKA_WATCH_HEARTBEAT_PAUSE, "1 s");
    config.setInteger(ConfigConstants.AKKA_WATCH_THRESHOLD, 1);
    // kept outside the try block so that the finally clause can shut it down on failure
    ActorSystem actorSystem = null;
    try {
        actorSystem = AkkaUtils.createLocalActorSystem(config);
        // only the first element of the returned pair (the JobManager actor) is needed
        final ActorRef jobManager = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
        // the ResourceManager is started for completeness; its return value is not used by the test
        FlinkResourceManager.startResourceManagerActors(config, actorSystem, LeaderRetrievalUtils.createLeaderRetrievalService(config, jobManager), StandaloneResourceManager.class);
        final int numberOfSlots = 1;
        // create the components for the TaskManager manually
        // NOTE(review): the two trailing comments in the call below look displaced relative to the
        // arguments they describe — confirm against the TaskManagerConfiguration constructor
        final TaskManagerConfiguration tmConfig = new TaskManagerConfiguration(numberOfSlots, TMP_DIR, timeout, null, Time.milliseconds(500), Time.seconds(30), Time.seconds(10), // cleanup interval
        1000000, config, // exit-jvm-on-fatal-error
        false);
        final NetworkEnvironmentConfiguration netConf = new NetworkEnvironmentConfiguration(32, BUFFER_SIZE, MemoryType.HEAP, IOManager.IOMode.SYNC, 0, 0, 2, 8, null);
        ResourceID taskManagerId = ResourceID.generate();
        final TaskManagerLocation connectionInfo = new TaskManagerLocation(taskManagerId, InetAddress.getLocalHost(), 10000);
        final MemoryManager memManager = new MemoryManager(32 * BUFFER_SIZE, 1, BUFFER_SIZE, MemoryType.HEAP, false);
        final IOManager ioManager = new IOManagerAsync(TMP_DIR);
        // the network environment is assembled from the values held by netConf
        final NetworkEnvironment network = new NetworkEnvironment(new NetworkBufferPool(netConf.numNetworkBuffers(), netConf.networkBufferSize(), netConf.memoryType()), new LocalConnectionManager(), new ResultPartitionManager(), new TaskEventDispatcher(), new KvStateRegistry(), null, netConf.ioMode(), netConf.partitionRequestInitialBackoff(), netConf.partitionRequestMaxBackoff(), netConf.networkBuffersPerChannel(), netConf.extraNetworkBuffersPerGate());
        network.start();
        // the leader retrieval service is handed the path of the JobManager actor started above
        LeaderRetrievalService leaderRetrievalService = new StandaloneLeaderRetrievalService(jobManager.path().toString());
        MetricRegistryConfiguration metricRegistryConfiguration = MetricRegistryConfiguration.fromConfiguration(config);
        // create the task manager
        final Props tmProps = Props.create(TaskManager.class, tmConfig, taskManagerId, connectionInfo, memManager, ioManager, network, numberOfSlots, leaderRetrievalService, new MetricRegistry(metricRegistryConfiguration));
        final ActorRef taskManager = actorSystem.actorOf(tmProps);
        new JavaTestKit(actorSystem) {

            {
                // wait for the TaskManager to be registered
                // NOTE(review): the bound is 5000 *seconds*; possibly milliseconds were intended — confirm
                new Within(new FiniteDuration(5000, TimeUnit.SECONDS)) {

                    @Override
                    protected void run() {
                        // ask to be notified on registration, then expect the registration message
                        taskManager.tell(TaskManagerMessages.getNotifyWhenRegisteredAtJobManagerMessage(), getTestActor());
                        expectMsgEquals(TaskManagerMessages.getRegisteredAtJobManagerMessage());
                    }
                };
            }
        };
        // shut down all actors and the actor system
        // kill the TaskManager and the JobManager
        taskManager.tell(Kill.getInstance(), ActorRef.noSender());
        jobManager.tell(Kill.getInstance(), ActorRef.noSender());
        // shut down the actors and the actor system
        actorSystem.shutdown();
        actorSystem.awaitTermination();
        // cleared so that the finally clause does not shut the system down a second time
        actorSystem = null;
        // now that the TaskManager is shut down, the components should be shut down as well
        assertTrue(network.isShutdown());
        assertTrue(ioManager.isProperlyShutDown());
        assertTrue(memManager.isShutdown());
    } catch (Exception e) {
        // any exception is turned into a test failure carrying the exception's message
        e.printStackTrace();
        fail(e.getMessage());
    } finally {
        // only reached with a live actor system when the test aborted before the regular shutdown
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) KvStateRegistry(org.apache.flink.runtime.query.KvStateRegistry) MemoryArchivist(org.apache.flink.runtime.jobmanager.MemoryArchivist) MetricRegistryConfiguration(org.apache.flink.runtime.metrics.MetricRegistryConfiguration) Configuration(org.apache.flink.configuration.Configuration) TaskManagerConfiguration(org.apache.flink.runtime.taskexecutor.TaskManagerConfiguration) ActorRef(akka.actor.ActorRef) Time(org.apache.flink.api.common.time.Time) JobManager(org.apache.flink.runtime.jobmanager.JobManager) Props(akka.actor.Props) IOManagerAsync(org.apache.flink.runtime.io.disk.iomanager.IOManagerAsync) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) IOManager(org.apache.flink.runtime.io.disk.iomanager.IOManager) MetricRegistry(org.apache.flink.runtime.metrics.MetricRegistry) FiniteDuration(scala.concurrent.duration.FiniteDuration) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) ResultPartitionManager(org.apache.flink.runtime.io.network.partition.ResultPartitionManager) NetworkBufferPool(org.apache.flink.runtime.io.network.buffer.NetworkBufferPool) LocalConnectionManager(org.apache.flink.runtime.io.network.LocalConnectionManager) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) StandaloneLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.StandaloneLeaderRetrievalService) NetworkEnvironment(org.apache.flink.runtime.io.network.NetworkEnvironment) TaskEventDispatcher(org.apache.flink.runtime.io.network.TaskEventDispatcher) JavaTestKit(akka.testkit.JavaTestKit) Test(org.junit.Test)

Example 62 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class TaskManagerLocationTest method testEqualsHashAndCompareTo.

/**
 * Checks that {@code TaskManagerLocation} implements {@code equals},
 * {@code hashCode} and {@code compareTo} consistently with one another.
 *
 * <p>Four locations are built: {@code one} and {@code four} share the same
 * resource ID, address and port and must therefore be equal, hash alike and
 * compare as {@code 0}; {@code two} and {@code three} differ from them and
 * from each other.
 */
@Test
public void testEqualsHashAndCompareTo() {
    try {
        ResourceID resourceID1 = new ResourceID("a");
        ResourceID resourceID2 = new ResourceID("b");
        ResourceID resourceID3 = new ResourceID("c");
        // we mock the addresses to save the times of the reverse name lookups
        InetAddress address1 = mock(InetAddress.class);
        when(address1.getCanonicalHostName()).thenReturn("localhost");
        when(address1.getHostName()).thenReturn("localhost");
        when(address1.getHostAddress()).thenReturn("127.0.0.1");
        when(address1.getAddress()).thenReturn(new byte[] { 127, 0, 0, 1 });
        InetAddress address2 = mock(InetAddress.class);
        when(address2.getCanonicalHostName()).thenReturn("testhost1");
        when(address2.getHostName()).thenReturn("testhost1");
        when(address2.getHostAddress()).thenReturn("0.0.0.0");
        when(address2.getAddress()).thenReturn(new byte[] { 0, 0, 0, 0 });
        InetAddress address3 = mock(InetAddress.class);
        when(address3.getCanonicalHostName()).thenReturn("testhost2");
        when(address3.getHostName()).thenReturn("testhost2");
        when(address3.getHostAddress()).thenReturn("192.168.0.1");
        when(address3.getAddress()).thenReturn(new byte[] { (byte) 192, (byte) 168, 0, 1 });
        // one == four != two != three
        TaskManagerLocation one = new TaskManagerLocation(resourceID1, address1, 19871);
        TaskManagerLocation two = new TaskManagerLocation(resourceID2, address2, 19871);
        TaskManagerLocation three = new TaskManagerLocation(resourceID3, address3, 10871);
        TaskManagerLocation four = new TaskManagerLocation(resourceID1, address1, 19871);

        // equals
        assertTrue(one.equals(four));
        assertTrue(!one.equals(two));
        assertTrue(!one.equals(three));
        assertTrue(!two.equals(three));
        assertTrue(!three.equals(four));

        // hashCode: objects that are equal must produce the same hash code
        assertTrue(one.hashCode() == four.hashCode());

        // compareTo: zero exactly for the equal pair
        assertTrue(one.compareTo(four) == 0);
        assertTrue(four.compareTo(one) == 0);
        assertTrue(one.compareTo(two) != 0);
        assertTrue(one.compareTo(three) != 0);
        assertTrue(two.compareTo(three) != 0);
        assertTrue(three.compareTo(four) != 0);

        // compareTo antisymmetry: Comparable only guarantees that the *signs* are opposite,
        // not the magnitudes, so the signs are compared for every distinct pair
        assertTrue(Integer.signum(one.compareTo(two)) == -Integer.signum(two.compareTo(one)));
        assertTrue(Integer.signum(one.compareTo(three)) == -Integer.signum(three.compareTo(one)));
        assertTrue(Integer.signum(two.compareTo(three)) == -Integer.signum(three.compareTo(two)));
    } catch (Exception e) {
        // keep the original exception as the cause so the failure report shows its stack trace
        throw new AssertionError("Unexpected exception: " + e, e);
    }
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) InetAddress(java.net.InetAddress) Test(org.junit.Test)

Example 63 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class YarnFlinkResourceManager method reacceptRegisteredWorkers.

/**
 * Determines which of the given, previously registered workers this
 * resource manager can take over again.
 *
 * <p>A worker is accepted when an entry for its resource ID is found in
 * {@code containersInLaunch}; that entry is removed in the process. Workers
 * without such an entry are only logged, with the message depending on
 * whether {@code isStarted} reports them as already started.
 *
 * @param toConsolidate resource IDs of the workers to check
 * @return the worker nodes created for the recognized containers
 */
@Override
protected Collection<RegisteredYarnWorkerNode> reacceptRegisteredWorkers(Collection<ResourceID> toConsolidate) {
    final List<RegisteredYarnWorkerNode> recognizedWorkers = new ArrayList<>();
    for (ResourceID workerId : toConsolidate) {
        // look the container up and take it out of the in-launch set in a single step
        final YarnContainerInLaunch launching = containersInLaunch.remove(workerId);
        if (launching != null) {
            LOG.info("YARN container consolidation recognizes Resource {} ", workerId);
            recognizedWorkers.add(new RegisteredYarnWorkerNode(launching.container()));
            continue;
        }
        // not in launch: either it is already registered, or it is unknown to us
        if (isStarted(workerId)) {
            LOG.info("TaskManager {} has already been registered at the resource manager.", workerId);
        } else {
            LOG.info("YARN container consolidation does not recognize TaskManager {}", workerId);
        }
    }
    return recognizedWorkers;
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) ArrayList(java.util.ArrayList)

Example 64 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class YarnTaskManagerRunner method runYarnTaskManager.

/**
 * Prepares the environment of a YARN container and runs a TaskManager in it.
 *
 * <p>The method loads the Flink configuration from the command line arguments,
 * reads the YARN-related environment variables, fills in the temporary
 * directories when the configuration does not set them, derives the
 * {@code ResourceID} from the container ID environment variable, installs the
 * security configuration (with Kerberos settings when a readable krb5 file is
 * found in the working directory), and finally starts the TaskManager inside
 * the installed security context.
 *
 * <p>A failure to load the configuration or to start the TaskManager terminates
 * the JVM with {@code TaskManager.STARTUP_FAILURE_RETURN_CODE()}.
 *
 * @param args command line arguments passed on to the configuration loader
 * @param taskManager the TaskManager class to run
 * @throws IOException declared by the signature; exceptions raised while installing the
 *         security context or launching are rethrown wrapped in a {@code RuntimeException}
 * @throws NullPointerException if the container ID environment variable is not set
 */
public static void runYarnTaskManager(String[] args, final Class<? extends YarnTaskManager> taskManager) throws IOException {
    EnvironmentInformation.logEnvironmentInfo(LOG, "YARN TaskManager", args);
    SignalHandler.register(LOG);
    JvmShutdownSafeguard.installAsShutdownHook(LOG);
    // try to parse the command line arguments
    final Configuration configuration;
    try {
        configuration = TaskManager.parseArgsAndLoadConfig(args);
    } catch (Throwable t) {
        LOG.error(t.getMessage(), t);
        System.exit(TaskManager.STARTUP_FAILURE_RETURN_CODE());
        // unreachable after exit, but required so that 'configuration' counts as definitely assigned
        return;
    }
    // read the environment variables for YARN
    final Map<String, String> envs = System.getenv();
    final String yarnClientUsername = envs.get(YarnConfigKeys.ENV_HADOOP_USER_NAME);
    final String localDirs = envs.get(Environment.LOCAL_DIRS.key());
    LOG.info("Current working/local Directory: {}", localDirs);
    final String currDir = envs.get(Environment.PWD.key());
    LOG.info("Current working Directory: {}", currDir);
    final String remoteKeytabPath = envs.get(YarnConfigKeys.KEYTAB_PATH);
    LOG.info("TM: remoteKeytabPath obtained {}", remoteKeytabPath);
    final String remoteKeytabPrincipal = envs.get(YarnConfigKeys.KEYTAB_PRINCIPAL);
    LOG.info("TM: remoteKeytabPrincipal obtained {}", remoteKeytabPrincipal);
    // configure local directory: an explicit Flink setting wins over YARN's local dirs
    String flinkTempDirs = configuration.getString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, null);
    if (flinkTempDirs == null) {
        LOG.info("Setting directories for temporary file {}", localDirs);
        configuration.setString(ConfigConstants.TASK_MANAGER_TMP_DIR_KEY, localDirs);
    } else {
        LOG.info("Overriding YARN's temporary file directories with those specified in the Flink config: {}", flinkTempDirs);
    }
    // tell akka to die in case of an error
    configuration.setBoolean(ConfigConstants.AKKA_JVM_EXIT_ON_FATAL_ERROR, true);
    // a remote keytab path means a keytab file is expected in the working directory
    String localKeytabPath = null;
    if (remoteKeytabPath != null) {
        File f = new File(currDir, Utils.KEYTAB_FILE_NAME);
        localKeytabPath = f.getAbsolutePath();
        LOG.info("localKeytabPath: {}", localKeytabPath);
    }
    UserGroupInformation currentUser = UserGroupInformation.getCurrentUser();
    LOG.info("YARN daemon is running as: {} Yarn client user obtainer: {}", currentUser.getShortUserName(), yarnClientUsername);
    // Infer the resource identifier from the environment variable; fail with a message that
    // names the variable instead of an anonymous NullPointerException
    String containerID = Preconditions.checkNotNull(
        envs.get(YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID),
        "Environment variable " + YarnFlinkResourceManager.ENV_FLINK_CONTAINER_ID + " is not set; cannot determine the container ID");
    final ResourceID resourceId = new ResourceID(containerID);
    LOG.info("ResourceID assigned for this container: {}", resourceId);
    try {
        org.apache.hadoop.conf.Configuration hadoopConfiguration = null;
        //To support Yarn Secure Integration Test Scenario
        File krb5Conf = new File(currDir, Utils.KRB5_FILE_NAME);
        if (krb5Conf.exists() && krb5Conf.canRead()) {
            String krb5Path = krb5Conf.getAbsolutePath();
            LOG.info("KRB5 Conf: {}", krb5Path);
            hadoopConfiguration = new org.apache.hadoop.conf.Configuration();
            hadoopConfiguration.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHENTICATION, "kerberos");
            hadoopConfiguration.set(CommonConfigurationKeysPublic.HADOOP_SECURITY_AUTHORIZATION, "true");
        }
        // set keytab principal and replace path with the local path of the shipped keytab file in NodeManager
        if (localKeytabPath != null && remoteKeytabPrincipal != null) {
            configuration.setString(SecurityOptions.KERBEROS_LOGIN_KEYTAB, localKeytabPath);
            configuration.setString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL, remoteKeytabPrincipal);
        }
        // include the Hadoop configuration only when the krb5 file was found above
        final SecurityUtils.SecurityConfiguration sc = hadoopConfiguration != null
            ? new SecurityUtils.SecurityConfiguration(configuration, hadoopConfiguration)
            : new SecurityUtils.SecurityConfiguration(configuration);
        SecurityUtils.install(sc);
        SecurityUtils.getInstalledContext().runSecured(new Callable<Object>() {

            @Override
            public Integer call() {
                try {
                    TaskManager.selectNetworkInterfaceAndRunTaskManager(configuration, resourceId, taskManager);
                } catch (Throwable t) {
                    LOG.error("Error while starting the TaskManager", t);
                    System.exit(TaskManager.STARTUP_FAILURE_RETURN_CODE());
                }
                return null;
            }
        });
    } catch (Exception e) {
        LOG.error("Exception occurred while launching Task Manager", e);
        throw new RuntimeException(e);
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) SecurityUtils(org.apache.flink.runtime.security.SecurityUtils) IOException(java.io.IOException) ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) File(java.io.File) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation)

Example 65 with ResourceID

use of org.apache.flink.runtime.clusterframework.types.ResourceID in project flink by apache.

the class ResourceManagerTaskExecutorTest method mockTaskExecutor.

/**
 * Registers a mocked {@code TaskExecutorGateway} with {@code rpcService}
 * under the given address and hands back a newly generated resource ID.
 *
 * <p>The returned ID is not tied to the registered gateway by this method.
 *
 * @param taskExecutorAddress address under which the mocked gateway is registered
 * @return a freshly generated {@code ResourceID}
 */
private ResourceID mockTaskExecutor(String taskExecutorAddress) {
    rpcService.registerGateway(taskExecutorAddress, mock(TaskExecutorGateway.class));
    return ResourceID.generate();
}
Also used : ResourceID(org.apache.flink.runtime.clusterframework.types.ResourceID) TaskExecutorGateway(org.apache.flink.runtime.taskexecutor.TaskExecutorGateway)

Aggregations

ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID): 74, Test (org.junit.Test): 48, TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation): 25, Time (org.apache.flink.api.common.time.Time): 18, UUID (java.util.UUID): 16, JobID (org.apache.flink.api.common.JobID): 16, Configuration (org.apache.flink.configuration.Configuration): 14, AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID): 13, JavaTestKit (akka.testkit.JavaTestKit): 12, MetricRegistry (org.apache.flink.runtime.metrics.MetricRegistry): 12, InetAddress (java.net.InetAddress): 11, SlotID (org.apache.flink.runtime.clusterframework.types.SlotID): 10, HeartbeatServices (org.apache.flink.runtime.heartbeat.HeartbeatServices): 10, TestingHighAvailabilityServices (org.apache.flink.runtime.highavailability.TestingHighAvailabilityServices): 10, SlotRequest (org.apache.flink.runtime.resourcemanager.SlotRequest): 10, IOManager (org.apache.flink.runtime.io.disk.iomanager.IOManager): 9, NetworkEnvironment (org.apache.flink.runtime.io.network.NetworkEnvironment): 9, ActorTaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.ActorTaskManagerGateway): 9, MemoryManager (org.apache.flink.runtime.memory.MemoryManager): 9, TestingSerialRpcService (org.apache.flink.runtime.rpc.TestingSerialRpcService): 9