Search in sources :

Example 16 with ClusterSpecification

use of org.apache.flink.client.deployment.ClusterSpecification in project flink by apache.

the class KubernetesClusterDescriptor method deployClusterInternal.

private ClusterClientProvider<String> deployClusterInternal(String entryPoint, ClusterSpecification clusterSpecification, boolean detached) throws ClusterDeploymentException {
    final ClusterEntrypoint.ExecutionMode executionMode = detached ? ClusterEntrypoint.ExecutionMode.DETACHED : ClusterEntrypoint.ExecutionMode.NORMAL;
    flinkConfig.setString(ClusterEntrypoint.INTERNAL_CLUSTER_EXECUTION_MODE, executionMode.toString());
    flinkConfig.setString(KubernetesConfigOptionsInternal.ENTRY_POINT_CLASS, entryPoint);
    // Rpc, blob, rest, taskManagerRpc ports need to be exposed, so update them to fixed values.
    KubernetesUtils.checkAndUpdatePortConfigOption(flinkConfig, BlobServerOptions.PORT, Constants.BLOB_SERVER_PORT);
    KubernetesUtils.checkAndUpdatePortConfigOption(flinkConfig, TaskManagerOptions.RPC_PORT, Constants.TASK_MANAGER_RPC_PORT);
    KubernetesUtils.checkAndUpdatePortConfigOption(flinkConfig, RestOptions.BIND_PORT, Constants.REST_PORT);
    if (HighAvailabilityMode.isHighAvailabilityModeActivated(flinkConfig)) {
        flinkConfig.setString(HighAvailabilityOptions.HA_CLUSTER_ID, clusterId);
        KubernetesUtils.checkAndUpdatePortConfigOption(flinkConfig, HighAvailabilityOptions.HA_JOB_MANAGER_PORT_RANGE, flinkConfig.get(JobManagerOptions.PORT));
    }
    try {
        final KubernetesJobManagerParameters kubernetesJobManagerParameters = new KubernetesJobManagerParameters(flinkConfig, clusterSpecification);
        final FlinkPod podTemplate = kubernetesJobManagerParameters.getPodTemplateFilePath().map(file -> KubernetesUtils.loadPodFromTemplateFile(client, file, Constants.MAIN_CONTAINER_NAME)).orElse(new FlinkPod.Builder().build());
        final KubernetesJobManagerSpecification kubernetesJobManagerSpec = KubernetesJobManagerFactory.buildKubernetesJobManagerSpecification(podTemplate, kubernetesJobManagerParameters);
        client.createJobManagerComponent(kubernetesJobManagerSpec);
        return createClusterClientProvider(clusterId);
    } catch (Exception e) {
        try {
            LOG.warn("Failed to create the Kubernetes cluster \"{}\", try to clean up the residual resources.", clusterId);
            client.stopAndCleanupCluster(clusterId);
        } catch (Exception e1) {
            LOG.info("Failed to stop and clean up the Kubernetes cluster \"{}\".", clusterId, e1);
        }
        throw new ClusterDeploymentException("Could not create Kubernetes cluster \"" + clusterId + "\".", e);
    }
}
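Before it reaches deployClusterInternal, the ClusterSpecification is assembled through its builder (the same API appears verbatim in Example 19 below). A minimal sketch, assuming illustrative memory and slot values:

ClusterSpecification clusterSpecification =
    new ClusterSpecification.ClusterSpecificationBuilder()
        // JobManager total process memory in MB (illustrative value)
        .setMasterMemoryMB(1024)
        // TaskManager total process memory in MB (illustrative value)
        .setTaskManagerMemoryMB(1728)
        // number of task slots per TaskManager (illustrative value)
        .setSlotsPerTaskManager(2)
        .createClusterSpecification();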
Also used : FlinkException(org.apache.flink.util.FlinkException) ClusterSpecification(org.apache.flink.client.deployment.ClusterSpecification) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) KubernetesConfigOptionsInternal(org.apache.flink.kubernetes.configuration.KubernetesConfigOptionsInternal) KubernetesService(org.apache.flink.kubernetes.kubeclient.resources.KubernetesService) RestClusterClient(org.apache.flink.client.program.rest.RestClusterClient) ApplicationConfiguration(org.apache.flink.client.deployment.application.ApplicationConfiguration) TaskManagerOptions(org.apache.flink.configuration.TaskManagerOptions) RestOptions(org.apache.flink.configuration.RestOptions) Endpoint(org.apache.flink.kubernetes.kubeclient.Endpoint) StandaloneClientHAServices(org.apache.flink.runtime.highavailability.nonha.standalone.StandaloneClientHAServices) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) ClusterRetrieveException(org.apache.flink.client.deployment.ClusterRetrieveException) KubernetesJobManagerParameters(org.apache.flink.kubernetes.kubeclient.parameters.KubernetesJobManagerParameters) HighAvailabilityMode(org.apache.flink.runtime.jobmanager.HighAvailabilityMode) KubernetesDeploymentTarget(org.apache.flink.kubernetes.configuration.KubernetesDeploymentTarget) PackagedProgramUtils(org.apache.flink.client.program.PackagedProgramUtils) BlobServerOptions(org.apache.flink.configuration.BlobServerOptions) KubernetesUtils(org.apache.flink.kubernetes.utils.KubernetesUtils) KubernetesConfigOptions(org.apache.flink.kubernetes.configuration.KubernetesConfigOptions) Logger(org.slf4j.Logger) Configuration(org.apache.flink.configuration.Configuration) KubernetesJobManagerSpecification(org.apache.flink.kubernetes.kubeclient.KubernetesJobManagerSpecification) KubernetesApplicationClusterEntrypoint(org.apache.flink.kubernetes.entrypoint.KubernetesApplicationClusterEntrypoint) HighAvailabilityServicesUtils(org.apache.flink.runtime.highavailability.HighAvailabilityServicesUtils) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) FlinkPod(org.apache.flink.kubernetes.kubeclient.FlinkPod) Preconditions(org.apache.flink.util.Preconditions) KubernetesSessionClusterEntrypoint(org.apache.flink.kubernetes.entrypoint.KubernetesSessionClusterEntrypoint) File(java.io.File) ClusterDeploymentException(org.apache.flink.client.deployment.ClusterDeploymentException) List(java.util.List) ClusterDescriptor(org.apache.flink.client.deployment.ClusterDescriptor) ClusterClient(org.apache.flink.client.program.ClusterClient) KubernetesJobManagerFactory(org.apache.flink.kubernetes.kubeclient.factory.KubernetesJobManagerFactory) Optional(java.util.Optional) AddressResolution(org.apache.flink.runtime.rpc.AddressResolution) ClusterClientProvider(org.apache.flink.client.program.ClusterClientProvider) ClusterEntrypoint(org.apache.flink.runtime.entrypoint.ClusterEntrypoint) Constants(org.apache.flink.kubernetes.utils.Constants) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions) FlinkKubeClient(org.apache.flink.kubernetes.kubeclient.FlinkKubeClient)

Example 17 with ClusterSpecification

use of org.apache.flink.client.deployment.ClusterSpecification in project flink by apache.

the class ApplicationClusterDeployer method run.

public <ClusterID> void run(final Configuration configuration, final ApplicationConfiguration applicationConfiguration) throws Exception {
    checkNotNull(configuration);
    checkNotNull(applicationConfiguration);
    LOG.info("Submitting application in 'Application Mode'.");
    final ClusterClientFactory<ClusterID> clientFactory = clientServiceLoader.getClusterClientFactory(configuration);
    try (final ClusterDescriptor<ClusterID> clusterDescriptor = clientFactory.createClusterDescriptor(configuration)) {
        final ClusterSpecification clusterSpecification = clientFactory.getClusterSpecification(configuration);
        clusterDescriptor.deployApplicationCluster(clusterSpecification, applicationConfiguration);
    }
}
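A hedged sketch of how a caller might drive this method. The deployment target string, program arguments, and entry-point class are illustrative assumptions; DefaultClusterClientServiceLoader is the stock service loader used to resolve a ClusterClientFactory:

Configuration configuration = new Configuration();
// The target value decides which ClusterClientFactory the loader resolves
// (illustrative choice; a YARN target string would select YARN deployment).
configuration.set(DeploymentOptions.TARGET, "kubernetes-application");
// Program arguments and an optional entry-point class name (illustrative values).
ApplicationConfiguration applicationConfiguration =
    new ApplicationConfiguration(new String[] {"--input", "/tmp/in"}, "com.example.MyJob");
new ApplicationClusterDeployer(new DefaultClusterClientServiceLoader())
    .run(configuration, applicationConfiguration);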
Also used : ClusterSpecification(org.apache.flink.client.deployment.ClusterSpecification)

Example 18 with ClusterSpecification

use of org.apache.flink.client.deployment.ClusterSpecification in project flink by apache.

the class AbstractJobClusterExecutor method execute.

@Override
public CompletableFuture<JobClient> execute(@Nonnull final Pipeline pipeline, @Nonnull final Configuration configuration, @Nonnull final ClassLoader userCodeClassloader) throws Exception {
    final JobGraph jobGraph = PipelineExecutorUtils.getJobGraph(pipeline, configuration);
    try (final ClusterDescriptor<ClusterID> clusterDescriptor = clusterClientFactory.createClusterDescriptor(configuration)) {
        final ExecutionConfigAccessor configAccessor = ExecutionConfigAccessor.fromConfiguration(configuration);
        final ClusterSpecification clusterSpecification = clusterClientFactory.getClusterSpecification(configuration);
        final ClusterClientProvider<ClusterID> clusterClientProvider = clusterDescriptor.deployJobCluster(clusterSpecification, jobGraph, configAccessor.getDetachedMode());
        LOG.info("Job has been submitted with JobID " + jobGraph.getJobID());
        return CompletableFuture.completedFuture(new ClusterClientJobClientAdapter<>(clusterClientProvider, jobGraph.getJobID(), userCodeClassloader));
    }
}
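The detached flag passed to deployJobCluster comes from the configuration via ExecutionConfigAccessor. A small sketch of that knob, with an assumed value:

Configuration configuration = new Configuration();
// execution.attached = false means a detached submission (illustrative value).
configuration.set(DeploymentOptions.ATTACHED, false);
ExecutionConfigAccessor configAccessor = ExecutionConfigAccessor.fromConfiguration(configuration);
boolean detached = configAccessor.getDetachedMode(); // true for the value above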
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ClusterSpecification(org.apache.flink.client.deployment.ClusterSpecification) ExecutionConfigAccessor(org.apache.flink.client.cli.ExecutionConfigAccessor)

Example 19 with ClusterSpecification

use of org.apache.flink.client.deployment.ClusterSpecification in project flink by apache.

the class YarnConfigurationITCase method testFlinkContainerMemory.

/**
 * Tests that the Flink components are started with the correct memory settings.
 */
@Test(timeout = 60000)
public void testFlinkContainerMemory() throws Exception {
    runTest(() -> {
        final YarnClient yarnClient = getYarnClient();
        final Configuration configuration = new Configuration(flinkConfiguration);
        final int slotsPerTaskManager = 3;
        configuration.set(TaskManagerOptions.NUM_TASK_SLOTS, slotsPerTaskManager);
        final int masterMemory = 768;
        configuration.set(JobManagerOptions.TOTAL_PROCESS_MEMORY, MemorySize.ofMebiBytes(masterMemory));
        final TaskExecutorProcessSpec tmResourceSpec = TaskExecutorProcessUtils.processSpecFromConfig(configuration);
        final int taskManagerMemory = tmResourceSpec.getTotalProcessMemorySize().getMebiBytes();
        final YarnConfiguration yarnConfiguration = getYarnConfiguration();
        final YarnClusterDescriptor clusterDescriptor = YarnTestUtils.createClusterDescriptorWithLogging(CliFrontend.getConfigurationDirectoryFromEnv(), configuration, yarnConfiguration, yarnClient, true);
        clusterDescriptor.setLocalJarPath(new Path(flinkUberjar.getAbsolutePath()));
        clusterDescriptor.addShipFiles(Arrays.asList(flinkLibFolder.listFiles()));
        final File streamingWordCountFile = getTestJarPath("WindowJoin.jar");
        final PackagedProgram packagedProgram = PackagedProgram.newBuilder().setJarFile(streamingWordCountFile).build();
        final JobGraph jobGraph = PackagedProgramUtils.createJobGraph(packagedProgram, configuration, 1, false);
        try {
            final ClusterSpecification clusterSpecification = new ClusterSpecification.ClusterSpecificationBuilder().setMasterMemoryMB(masterMemory).setTaskManagerMemoryMB(taskManagerMemory).setSlotsPerTaskManager(slotsPerTaskManager).createClusterSpecification();
            final ClusterClient<ApplicationId> clusterClient = clusterDescriptor.deployJobCluster(clusterSpecification, jobGraph, true).getClusterClient();
            final ApplicationId clusterId = clusterClient.getClusterId();
            final RestClient restClient = new RestClient(configuration, TestingUtils.defaultExecutor());
            try {
                final ApplicationReport applicationReport = yarnClient.getApplicationReport(clusterId);
                final ApplicationAttemptId currentApplicationAttemptId = applicationReport.getCurrentApplicationAttemptId();
                // wait until the second container has been allocated
                List<ContainerReport> containers = yarnClient.getContainers(currentApplicationAttemptId);
                while (containers.size() < 2) {
                    // this is nasty but Yarn does not offer a better way to wait
                    Thread.sleep(50L);
                    containers = yarnClient.getContainers(currentApplicationAttemptId);
                }
                for (ContainerReport container : containers) {
                    if (container.getContainerId().getId() == 1) {
                        // this should be the application master
                        assertThat(container.getAllocatedResource().getMemory(), is(masterMemory));
                    } else {
                        assertThat(container.getAllocatedResource().getMemory(), is(taskManagerMemory));
                    }
                }
                final URI webURI = new URI(clusterClient.getWebInterfaceURL());
                CompletableFuture<TaskManagersInfo> taskManagersInfoCompletableFuture;
                Collection<TaskManagerInfo> taskManagerInfos;
                while (true) {
                    taskManagersInfoCompletableFuture = restClient.sendRequest(webURI.getHost(), webURI.getPort(), TaskManagersHeaders.getInstance(), EmptyMessageParameters.getInstance(), EmptyRequestBody.getInstance());
                    final TaskManagersInfo taskManagersInfo = taskManagersInfoCompletableFuture.get();
                    taskManagerInfos = taskManagersInfo.getTaskManagerInfos();
                    // wait until the task manager has registered and reported its slots
                    if (hasTaskManagerConnectedAndReportedSlots(taskManagerInfos)) {
                        break;
                    } else {
                        Thread.sleep(100L);
                    }
                }
                // there should be at least one TaskManagerInfo
                final TaskManagerInfo taskManagerInfo = taskManagerInfos.iterator().next();
                assertThat(taskManagerInfo.getNumberSlots(), is(slotsPerTaskManager));
                final long expectedHeapSizeBytes = tmResourceSpec.getJvmHeapMemorySize().getBytes();
                // We compare the physical memory assigned to a container with the heap
                // memory that we pass to the JVM as the -Xmx parameter. Those values might
                // differ significantly due to the system page size or the JVM
                // implementation, therefore we use a 15% threshold here.
                assertThat((double) taskManagerInfo.getHardwareDescription().getSizeOfJvmHeap() / (double) expectedHeapSizeBytes, is(closeTo(1.0, 0.15)));
                final int expectedManagedMemoryMB = tmResourceSpec.getManagedMemorySize().getMebiBytes();
                assertThat((int) (taskManagerInfo.getHardwareDescription().getSizeOfManagedMemory() >> 20), is(expectedManagedMemoryMB));
            } finally {
                restClient.shutdown(TIMEOUT);
                clusterClient.close();
            }
            clusterDescriptor.killCluster(clusterId);
        } finally {
            clusterDescriptor.close();
        }
    });
}
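The taskManagerMemory asserted on above is not set directly; it is derived from the configuration through TaskExecutorProcessUtils. A minimal sketch of that derivation, assuming a TaskManager total-process-memory setting:

Configuration configuration = new Configuration();
// An illustrative total process memory budget for each TaskManager.
configuration.set(TaskManagerOptions.TOTAL_PROCESS_MEMORY, MemorySize.ofMebiBytes(1728));
TaskExecutorProcessSpec tmResourceSpec = TaskExecutorProcessUtils.processSpecFromConfig(configuration);
int taskManagerMemoryMB = tmResourceSpec.getTotalProcessMemorySize().getMebiBytes(); // 1728
long expectedHeapSizeBytes = tmResourceSpec.getJvmHeapMemorySize().getBytes(); // heap share of the budget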
Also used : YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) Configuration(org.apache.flink.configuration.Configuration) URI(java.net.URI) PackagedProgram(org.apache.flink.client.program.PackagedProgram) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) ContainerReport(org.apache.hadoop.yarn.api.records.ContainerReport) TaskManagersInfo(org.apache.flink.runtime.rest.messages.taskmanager.TaskManagersInfo) TestUtils.getTestJarPath(org.apache.flink.yarn.util.TestUtils.getTestJarPath) Path(org.apache.hadoop.fs.Path) TaskExecutorProcessSpec(org.apache.flink.runtime.clusterframework.TaskExecutorProcessSpec) ClusterSpecification(org.apache.flink.client.deployment.ClusterSpecification) RestClient(org.apache.flink.runtime.rest.RestClient) ApplicationAttemptId(org.apache.hadoop.yarn.api.records.ApplicationAttemptId) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) TaskManagerInfo(org.apache.flink.runtime.rest.messages.taskmanager.TaskManagerInfo) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) File(java.io.File) Test(org.junit.Test)

Example 20 with ClusterSpecification

use of org.apache.flink.client.deployment.ClusterSpecification in project flink by apache.

the class YarnClusterDescriptor method startAppMaster.

private ApplicationReport startAppMaster(Configuration configuration, String applicationName, String yarnClusterEntrypoint, JobGraph jobGraph, YarnClient yarnClient, YarnClientApplication yarnApplication, ClusterSpecification clusterSpecification) throws Exception {
    // ------------------ Initialize the file systems -------------------------
    org.apache.flink.core.fs.FileSystem.initialize(configuration, PluginUtils.createPluginManagerFromRootFolder(configuration));
    final FileSystem fs = FileSystem.get(yarnConfiguration);
    // Hard-coded check for the GoogleHadoopFileSystem client, because it does not override the
    // getScheme() method.
    if (!fs.getClass().getSimpleName().equals("GoogleHadoopFileSystem") && fs.getScheme().startsWith("file")) {
        LOG.warn("The file system scheme is '" + fs.getScheme() + "'. This indicates that the " + "specified Hadoop configuration path is wrong and the system is using the default Hadoop configuration values. " + "The Flink YARN client needs to store its files in a distributed file system.");
    }
    ApplicationSubmissionContext appContext = yarnApplication.getApplicationSubmissionContext();
    final List<Path> providedLibDirs = Utils.getQualifiedRemoteSharedPaths(configuration, yarnConfiguration);
    Path stagingDirPath = getStagingDir(fs);
    FileSystem stagingDirFs = stagingDirPath.getFileSystem(yarnConfiguration);
    final YarnApplicationFileUploader fileUploader = YarnApplicationFileUploader.from(stagingDirFs, stagingDirPath, providedLibDirs, appContext.getApplicationId(), getFileReplication());
    // The files need to be shipped and added to classpath.
    Set<File> systemShipFiles = new HashSet<>(shipFiles.size());
    for (File file : shipFiles) {
        systemShipFiles.add(file.getAbsoluteFile());
    }
    final String logConfigFilePath = configuration.getString(YarnConfigOptionsInternal.APPLICATION_LOG_CONFIG_FILE);
    if (logConfigFilePath != null) {
        systemShipFiles.add(new File(logConfigFilePath));
    }
    // Set-up ApplicationSubmissionContext for the application
    final ApplicationId appId = appContext.getApplicationId();
    // ------------------ Add Zookeeper namespace to local flinkConfiguration ------
    setHAClusterIdIfNotSet(configuration, appId);
    if (HighAvailabilityMode.isHighAvailabilityModeActivated(configuration)) {
        // activate re-execution of failed applications
        appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), YarnConfiguration.DEFAULT_RM_AM_MAX_ATTEMPTS));
        activateHighAvailabilitySupport(appContext);
    } else {
        // set number of application retries to 1 in the default case
        appContext.setMaxAppAttempts(configuration.getInteger(YarnConfigOptions.APPLICATION_ATTEMPTS.key(), 1));
    }
    final Set<Path> userJarFiles = new HashSet<>();
    if (jobGraph != null) {
        userJarFiles.addAll(jobGraph.getUserJars().stream().map(f -> f.toUri()).map(Path::new).collect(Collectors.toSet()));
    }
    final List<URI> jarUrls = ConfigUtils.decodeListFromConfig(configuration, PipelineOptions.JARS, URI::create);
    if (jarUrls != null && YarnApplicationClusterEntryPoint.class.getName().equals(yarnClusterEntrypoint)) {
        userJarFiles.addAll(jarUrls.stream().map(Path::new).collect(Collectors.toSet()));
    }
    // only for per-job mode
    if (jobGraph != null) {
        for (Map.Entry<String, DistributedCache.DistributedCacheEntry> entry : jobGraph.getUserArtifacts().entrySet()) {
            // only upload local files
            if (!Utils.isRemotePath(entry.getValue().filePath)) {
                Path localPath = new Path(entry.getValue().filePath);
                Tuple2<Path, Long> remoteFileInfo = fileUploader.uploadLocalFileToRemote(localPath, entry.getKey());
                jobGraph.setUserArtifactRemotePath(entry.getKey(), remoteFileInfo.f0.toString());
            }
        }
        jobGraph.writeUserArtifactEntriesToConfiguration();
    }
    if (providedLibDirs == null || providedLibDirs.isEmpty()) {
        addLibFoldersToShipFiles(systemShipFiles);
    }
    // Register all files in provided lib dirs as local resources with public visibility
    // and upload the remaining dependencies as local resources with APPLICATION visibility.
    final List<String> systemClassPaths = fileUploader.registerProvidedLocalResources();
    final List<String> uploadedDependencies = fileUploader.registerMultipleLocalResources(systemShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), Path.CUR_DIR, LocalResourceType.FILE);
    systemClassPaths.addAll(uploadedDependencies);
    // Plugin files only need to be shipped and should not be added to classpath.
    if (providedLibDirs == null || providedLibDirs.isEmpty()) {
        Set<File> shipOnlyFiles = new HashSet<>();
        addPluginsFoldersToShipFiles(shipOnlyFiles);
        fileUploader.registerMultipleLocalResources(shipOnlyFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), Path.CUR_DIR, LocalResourceType.FILE);
    }
    if (!shipArchives.isEmpty()) {
        fileUploader.registerMultipleLocalResources(shipArchives.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), Path.CUR_DIR, LocalResourceType.ARCHIVE);
    }
    // Upload and register user jars
    final List<String> userClassPaths = fileUploader.registerMultipleLocalResources(userJarFiles, userJarInclusion == YarnConfigOptions.UserJarInclusion.DISABLED ? ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR : Path.CUR_DIR, LocalResourceType.FILE);
    // usrlib will be automatically shipped if it exists.
    if (ClusterEntrypointUtils.tryFindUserLibDirectory().isPresent()) {
        final Set<File> usrLibShipFiles = new HashSet<>();
        addUsrLibFolderToShipFiles(usrLibShipFiles);
        final List<String> usrLibClassPaths = fileUploader.registerMultipleLocalResources(usrLibShipFiles.stream().map(e -> new Path(e.toURI())).collect(Collectors.toSet()), Path.CUR_DIR, LocalResourceType.FILE);
        userClassPaths.addAll(usrLibClassPaths);
    }
    if (userJarInclusion == YarnConfigOptions.UserJarInclusion.ORDER) {
        systemClassPaths.addAll(userClassPaths);
    }
    // normalize classpath by sorting
    Collections.sort(systemClassPaths);
    Collections.sort(userClassPaths);
    // classpath assembler
    StringBuilder classPathBuilder = new StringBuilder();
    if (userJarInclusion == YarnConfigOptions.UserJarInclusion.FIRST) {
        for (String userClassPath : userClassPaths) {
            classPathBuilder.append(userClassPath).append(File.pathSeparator);
        }
    }
    for (String classPath : systemClassPaths) {
        classPathBuilder.append(classPath).append(File.pathSeparator);
    }
    // Setup jar for ApplicationMaster
    final YarnLocalResourceDescriptor localResourceDescFlinkJar = fileUploader.uploadFlinkDist(flinkJarPath);
    classPathBuilder.append(localResourceDescFlinkJar.getResourceKey()).append(File.pathSeparator);
    // TODO: the server should use the user's main method to generate the job graph
    if (jobGraph != null) {
        File tmpJobGraphFile = null;
        try {
            tmpJobGraphFile = File.createTempFile(appId.toString(), null);
            try (FileOutputStream output = new FileOutputStream(tmpJobGraphFile);
                ObjectOutputStream obOutput = new ObjectOutputStream(output)) {
                obOutput.writeObject(jobGraph);
            }
            final String jobGraphFilename = "job.graph";
            configuration.setString(JOB_GRAPH_FILE_PATH, jobGraphFilename);
            fileUploader.registerSingleLocalResource(jobGraphFilename, new Path(tmpJobGraphFile.toURI()), "", LocalResourceType.FILE, true, false);
            classPathBuilder.append(jobGraphFilename).append(File.pathSeparator);
        } catch (Exception e) {
            LOG.warn("Add job graph to local resource fail.");
            throw e;
        } finally {
            if (tmpJobGraphFile != null && !tmpJobGraphFile.delete()) {
                LOG.warn("Fail to delete temporary file {}.", tmpJobGraphFile.toPath());
            }
        }
    }
    // Upload the Flink configuration: write it out to a temporary file and register it.
    File tmpConfigurationFile = null;
    try {
        tmpConfigurationFile = File.createTempFile(appId + "-flink-conf.yaml", null);
        BootstrapTools.writeConfiguration(configuration, tmpConfigurationFile);
        String flinkConfigKey = "flink-conf.yaml";
        fileUploader.registerSingleLocalResource(flinkConfigKey, new Path(tmpConfigurationFile.getAbsolutePath()), "", LocalResourceType.FILE, true, true);
        classPathBuilder.append("flink-conf.yaml").append(File.pathSeparator);
    } finally {
        if (tmpConfigurationFile != null && !tmpConfigurationFile.delete()) {
            LOG.warn("Fail to delete temporary file {}.", tmpConfigurationFile.toPath());
        }
    }
    if (userJarInclusion == YarnConfigOptions.UserJarInclusion.LAST) {
        for (String userClassPath : userClassPaths) {
            classPathBuilder.append(userClassPath).append(File.pathSeparator);
        }
    }
    // To support the YARN secure integration test scenario: in the integration test setup, the
    // YARN containers created by the YarnMiniCluster do not have the yarn-site.xml and KRB5
    // configuration files. We add these files as container-local resources so that the container
    // applications (JM/TMs) get a proper secure cluster setup.
    Path remoteYarnSiteXmlPath = null;
    if (System.getenv("IN_TESTS") != null) {
        File f = new File(System.getenv("YARN_CONF_DIR"), Utils.YARN_SITE_FILE_NAME);
        LOG.info("Adding Yarn configuration {} to the AM container local resource bucket", f.getAbsolutePath());
        Path yarnSitePath = new Path(f.getAbsolutePath());
        remoteYarnSiteXmlPath = fileUploader.registerSingleLocalResource(Utils.YARN_SITE_FILE_NAME, yarnSitePath, "", LocalResourceType.FILE, false, false).getPath();
        if (System.getProperty("java.security.krb5.conf") != null) {
            configuration.set(SecurityOptions.KERBEROS_KRB5_PATH, System.getProperty("java.security.krb5.conf"));
        }
    }
    Path remoteKrb5Path = null;
    boolean hasKrb5 = false;
    String krb5Config = configuration.get(SecurityOptions.KERBEROS_KRB5_PATH);
    if (!StringUtils.isNullOrWhitespaceOnly(krb5Config)) {
        final File krb5 = new File(krb5Config);
        LOG.info("Adding KRB5 configuration {} to the AM container local resource bucket", krb5.getAbsolutePath());
        final Path krb5ConfPath = new Path(krb5.getAbsolutePath());
        remoteKrb5Path = fileUploader.registerSingleLocalResource(Utils.KRB5_FILE_NAME, krb5ConfPath, "", LocalResourceType.FILE, false, false).getPath();
        hasKrb5 = true;
    }
    Path remotePathKeytab = null;
    String localizedKeytabPath = null;
    String keytab = configuration.getString(SecurityOptions.KERBEROS_LOGIN_KEYTAB);
    if (keytab != null) {
        boolean localizeKeytab = flinkConfiguration.getBoolean(YarnConfigOptions.SHIP_LOCAL_KEYTAB);
        localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH);
        if (localizeKeytab) {
            // Localize the keytab to YARN containers via local resource.
            LOG.info("Adding keytab {} to the AM container local resource bucket", keytab);
            remotePathKeytab = fileUploader.registerSingleLocalResource(localizedKeytabPath, new Path(keytab), "", LocalResourceType.FILE, false, false).getPath();
        } else {
            // Assume the keytab is pre-installed in the container.
            localizedKeytabPath = flinkConfiguration.getString(YarnConfigOptions.LOCALIZED_KEYTAB_PATH);
        }
    }
    final JobManagerProcessSpec processSpec = JobManagerProcessUtils.processSpecFromConfigWithNewOptionToInterpretLegacyHeap(flinkConfiguration, JobManagerOptions.TOTAL_PROCESS_MEMORY);
    final ContainerLaunchContext amContainer = setupApplicationMasterContainer(yarnClusterEntrypoint, hasKrb5, processSpec);
    // setup security tokens
    if (UserGroupInformation.isSecurityEnabled()) {
        // set HDFS delegation tokens when security is enabled
        LOG.info("Adding delegation token to the AM container.");
        final List<Path> pathsToObtainToken = new ArrayList<>();
        boolean fetchToken = configuration.getBoolean(SecurityOptions.KERBEROS_FETCH_DELEGATION_TOKEN);
        if (fetchToken) {
            List<Path> yarnAccessList = ConfigUtils.decodeListFromConfig(configuration, YarnConfigOptions.YARN_ACCESS, Path::new);
            pathsToObtainToken.addAll(yarnAccessList);
            pathsToObtainToken.addAll(fileUploader.getRemotePaths());
        }
        Utils.setTokensFor(amContainer, pathsToObtainToken, yarnConfiguration, fetchToken);
    }
    amContainer.setLocalResources(fileUploader.getRegisteredLocalResources());
    fileUploader.close();
    // Setup CLASSPATH and environment variables for ApplicationMaster
    final Map<String, String> appMasterEnv = new HashMap<>();
    // set user specified app master environment variables
    appMasterEnv.putAll(ConfigurationUtils.getPrefixedKeyValuePairs(ResourceManagerOptions.CONTAINERIZED_MASTER_ENV_PREFIX, configuration));
    // set Flink app class path
    appMasterEnv.put(YarnConfigKeys.ENV_FLINK_CLASSPATH, classPathBuilder.toString());
    // set Flink on YARN internal configuration values
    appMasterEnv.put(YarnConfigKeys.FLINK_DIST_JAR, localResourceDescFlinkJar.toString());
    appMasterEnv.put(YarnConfigKeys.ENV_APP_ID, appId.toString());
    appMasterEnv.put(YarnConfigKeys.ENV_CLIENT_HOME_DIR, fileUploader.getHomeDir().toString());
    appMasterEnv.put(YarnConfigKeys.ENV_CLIENT_SHIP_FILES, encodeYarnLocalResourceDescriptorListToString(fileUploader.getEnvShipResourceList()));
    appMasterEnv.put(YarnConfigKeys.FLINK_YARN_FILES, fileUploader.getApplicationDir().toUri().toString());
    // https://github.com/apache/hadoop/blob/trunk/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/YarnApplicationSecurity.md#identity-on-an-insecure-cluster-hadoop_user_name
    appMasterEnv.put(YarnConfigKeys.ENV_HADOOP_USER_NAME, UserGroupInformation.getCurrentUser().getUserName());
    if (localizedKeytabPath != null) {
        appMasterEnv.put(YarnConfigKeys.LOCAL_KEYTAB_PATH, localizedKeytabPath);
        String principal = configuration.getString(SecurityOptions.KERBEROS_LOGIN_PRINCIPAL);
        appMasterEnv.put(YarnConfigKeys.KEYTAB_PRINCIPAL, principal);
        if (remotePathKeytab != null) {
            appMasterEnv.put(YarnConfigKeys.REMOTE_KEYTAB_PATH, remotePathKeytab.toString());
        }
    }
    // To support Yarn Secure Integration Test Scenario
    if (remoteYarnSiteXmlPath != null) {
        appMasterEnv.put(YarnConfigKeys.ENV_YARN_SITE_XML_PATH, remoteYarnSiteXmlPath.toString());
    }
    if (remoteKrb5Path != null) {
        appMasterEnv.put(YarnConfigKeys.ENV_KRB5_PATH, remoteKrb5Path.toString());
    }
    // set classpath from YARN configuration
    Utils.setupYarnClassPath(yarnConfiguration, appMasterEnv);
    amContainer.setEnvironment(appMasterEnv);
    // Set up resource type requirements for ApplicationMaster
    Resource capability = Records.newRecord(Resource.class);
    capability.setMemory(clusterSpecification.getMasterMemoryMB());
    capability.setVirtualCores(flinkConfiguration.getInteger(YarnConfigOptions.APP_MASTER_VCORES));
    final String customApplicationName = customName != null ? customName : applicationName;
    appContext.setApplicationName(customApplicationName);
    appContext.setApplicationType(applicationType != null ? applicationType : "Apache Flink");
    appContext.setAMContainerSpec(amContainer);
    appContext.setResource(capability);
    // Set priority for application
    int priorityNum = flinkConfiguration.getInteger(YarnConfigOptions.APPLICATION_PRIORITY);
    if (priorityNum >= 0) {
        Priority priority = Priority.newInstance(priorityNum);
        appContext.setPriority(priority);
    }
    if (yarnQueue != null) {
        appContext.setQueue(yarnQueue);
    }
    setApplicationNodeLabel(appContext);
    setApplicationTags(appContext);
    // add a hook to clean up in case deployment fails
    Thread deploymentFailureHook = new DeploymentFailureHook(yarnApplication, fileUploader.getApplicationDir());
    Runtime.getRuntime().addShutdownHook(deploymentFailureHook);
    LOG.info("Submitting application master " + appId);
    yarnClient.submitApplication(appContext);
    LOG.info("Waiting for the cluster to be allocated");
    final long startTime = System.currentTimeMillis();
    ApplicationReport report;
    YarnApplicationState lastAppState = YarnApplicationState.NEW;
    loop: while (true) {
        try {
            report = yarnClient.getApplicationReport(appId);
        } catch (IOException e) {
            throw new YarnDeploymentException("Failed to deploy the cluster.", e);
        }
        YarnApplicationState appState = report.getYarnApplicationState();
        LOG.debug("Application State: {}", appState);
        switch(appState) {
            case FAILED:
            case KILLED:
                throw new YarnDeploymentException("The YARN application unexpectedly switched to state " + appState + " during deployment. \n" + "Diagnostics from YARN: " + report.getDiagnostics() + "\n" + "If log aggregation is enabled on your cluster, use this command to further investigate the issue:\n" + "yarn logs -applicationId " + appId);
            case RUNNING:
                LOG.info("YARN application has been deployed successfully.");
                break loop;
            case FINISHED:
                LOG.info("YARN application has been finished successfully.");
                break loop;
            default:
                if (appState != lastAppState) {
                    LOG.info("Deploying cluster, current state " + appState);
                }
                if (System.currentTimeMillis() - startTime > 60000) {
                    LOG.info("Deployment took more than 60 seconds. Please check if the requested resources are available in the YARN cluster");
                }
        }
        lastAppState = appState;
        Thread.sleep(250);
    }
    // since deployment was successful, remove the hook
    ShutdownHookUtil.removeShutdownHook(deploymentFailureHook, getClass().getSimpleName(), LOG);
    return report;
}
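startAppMaster is private and reached through the descriptor's deploy methods. A hedged sketch of one such entry point, deploySessionCluster, with illustrative ClusterSpecification values and an already configured yarnClusterDescriptor assumed:

ClusterSpecification clusterSpecification =
    new ClusterSpecification.ClusterSpecificationBuilder()
        .setMasterMemoryMB(1024) // illustrative value
        .setTaskManagerMemoryMB(1728) // illustrative value
        .setSlotsPerTaskManager(2) // illustrative value
        .createClusterSpecification();
// yarnClusterDescriptor is assumed to be a fully configured YarnClusterDescriptor.
ClusterClientProvider<ApplicationId> clusterClientProvider =
    yarnClusterDescriptor.deploySessionCluster(clusterSpecification);
ClusterClient<ApplicationId> clusterClient = clusterClientProvider.getClusterClient();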
Also used : SecurityOptions(org.apache.flink.configuration.SecurityOptions) Tuple2(org.apache.flink.api.java.tuple.Tuple2) FileSystem(org.apache.hadoop.fs.FileSystem) ResourceManagerOptions(org.apache.flink.configuration.ResourceManagerOptions) YarnClientApplication(org.apache.hadoop.yarn.client.api.YarnClientApplication) YarnJobClusterEntrypoint(org.apache.flink.yarn.entrypoint.YarnJobClusterEntrypoint) ConfigUtils(org.apache.flink.configuration.ConfigUtils) YarnClusterMetrics(org.apache.hadoop.yarn.api.records.YarnClusterMetrics) ConverterUtils(org.apache.hadoop.yarn.util.ConverterUtils) Map(java.util.Map) ApplicationReport(org.apache.hadoop.yarn.api.records.ApplicationReport) Resource(org.apache.hadoop.yarn.api.records.Resource) YarnLogConfigUtil(org.apache.flink.yarn.configuration.YarnLogConfigUtil) YarnConfigOptionsInternal(org.apache.flink.yarn.configuration.YarnConfigOptionsInternal) Set(java.util.Set) JobManagerOptions(org.apache.flink.configuration.JobManagerOptions) YarnConfigOptions(org.apache.flink.yarn.configuration.YarnConfigOptions) InvocationTargetException(java.lang.reflect.InvocationTargetException) ContainerLaunchContext(org.apache.hadoop.yarn.api.records.ContainerLaunchContext) ApplicationId(org.apache.hadoop.yarn.api.records.ApplicationId) ClusterClientProvider(org.apache.flink.client.program.ClusterClientProvider) HadoopUtils(org.apache.flink.runtime.util.HadoopUtils) ClusterEntrypointUtils(org.apache.flink.runtime.entrypoint.ClusterEntrypointUtils) FlinkException(org.apache.flink.util.FlinkException) ByteArrayOutputStream(java.io.ByteArrayOutputStream) DFSConfigKeys(org.apache.hadoop.hdfs.DFSConfigKeys) ShutdownHookUtil(org.apache.flink.util.ShutdownHookUtil) ArrayList(java.util.ArrayList) YarnConfiguration(org.apache.hadoop.yarn.conf.YarnConfiguration) JobManagerProcessUtils(org.apache.flink.runtime.jobmanager.JobManagerProcessUtils) UserGroupInformation(org.apache.hadoop.security.UserGroupInformation) RestOptions(org.apache.flink.configuration.RestOptions) ObjectOutputStream(java.io.ObjectOutputStream) ConfigOption(org.apache.flink.configuration.ConfigOption) Nullable(javax.annotation.Nullable) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) GetNewApplicationResponse(org.apache.hadoop.yarn.api.protocolrecords.GetNewApplicationResponse) BootstrapTools(org.apache.flink.runtime.clusterframework.BootstrapTools) File(java.io.File) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) ConfigurationUtils(org.apache.flink.configuration.ConfigurationUtils) ClusterDescriptor(org.apache.flink.client.deployment.ClusterDescriptor) CoreOptions(org.apache.flink.configuration.CoreOptions) PluginUtils(org.apache.flink.core.plugin.PluginUtils) Records(org.apache.hadoop.yarn.util.Records) LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR(org.apache.flink.yarn.YarnConfigKeys.LOCAL_RESOURCE_DESCRIPTOR_SEPARATOR) PluginConfig(org.apache.flink.core.plugin.PluginConfig) ApplicationConstants(org.apache.hadoop.yarn.api.ApplicationConstants) URLDecoder(java.net.URLDecoder) PipelineOptions(org.apache.flink.configuration.PipelineOptions) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) LoggerFactory(org.slf4j.LoggerFactory) ENV_FLINK_LIB_DIR(org.apache.flink.configuration.ConfigConstants.ENV_FLINK_LIB_DIR) ApplicationConfiguration(org.apache.flink.client.deployment.application.ApplicationConfiguration) YarnDeploymentTarget(org.apache.flink.yarn.configuration.YarnDeploymentTarget) Path(org.apache.hadoop.fs.Path) NodeReport(org.apache.hadoop.yarn.api.records.NodeReport) URI(java.net.URI) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) Method(java.lang.reflect.Method) Priority(org.apache.hadoop.yarn.api.records.Priority) Collection(java.util.Collection) YarnSessionClusterEntrypoint(org.apache.flink.yarn.entrypoint.YarnSessionClusterEntrypoint) DistributedCache(org.apache.flink.api.common.cache.DistributedCache) Preconditions(org.apache.flink.util.Preconditions) StringUtils(org.apache.flink.util.StringUtils) Collectors(java.util.stream.Collectors) ClusterDeploymentException(org.apache.flink.client.deployment.ClusterDeploymentException) List(java.util.List) Preconditions.checkArgument(org.apache.flink.util.Preconditions.checkArgument) FinalApplicationStatus(org.apache.hadoop.yarn.api.records.FinalApplicationStatus) YarnApplicationState(org.apache.hadoop.yarn.api.records.YarnApplicationState) Optional(java.util.Optional) UnsupportedEncodingException(java.io.UnsupportedEncodingException) IllegalConfigurationException(org.apache.flink.configuration.IllegalConfigurationException) LocalResourceType(org.apache.hadoop.yarn.api.records.LocalResourceType) YarnClient(org.apache.hadoop.yarn.client.api.YarnClient) NodeState(org.apache.hadoop.yarn.api.records.NodeState) ClusterSpecification(org.apache.flink.client.deployment.ClusterSpecification) HashMap(java.util.HashMap) RestClusterClient(org.apache.flink.client.program.rest.RestClusterClient) HashSet(java.util.HashSet) Charset(java.nio.charset.Charset) YarnException(org.apache.hadoop.yarn.exceptions.YarnException) ConfigConstants(org.apache.flink.configuration.ConfigConstants) QueueInfo(org.apache.hadoop.yarn.api.records.QueueInfo) ClusterRetrieveException(org.apache.flink.client.deployment.ClusterRetrieveException) HighAvailabilityMode(org.apache.flink.runtime.jobmanager.HighAvailabilityMode) LinkedList(java.util.LinkedList) PrintStream(java.io.PrintStream) JOB_GRAPH_FILE_PATH(org.apache.flink.runtime.entrypoint.component.FileJobGraphRetriever.JOB_GRAPH_FILE_PATH) Logger(org.slf4j.Logger) JobManagerProcessSpec(org.apache.flink.runtime.jobmanager.JobManagerProcessSpec) YarnApplicationClusterEntryPoint(org.apache.flink.yarn.entrypoint.YarnApplicationClusterEntryPoint) Configuration(org.apache.flink.configuration.Configuration) CollectionUtil(org.apache.flink.util.CollectionUtil) DEFAULT_FLINK_USR_LIB_DIR(org.apache.flink.configuration.ConfigConstants.DEFAULT_FLINK_USR_LIB_DIR) ApplicationSubmissionContext(org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext) ClusterEntrypoint(org.apache.flink.runtime.entrypoint.ClusterEntrypoint) Collections(java.util.Collections) HighAvailabilityOptions(org.apache.flink.configuration.HighAvailabilityOptions)

Aggregations

ClusterSpecification (org.apache.flink.client.deployment.ClusterSpecification): 28
Configuration (org.apache.flink.configuration.Configuration): 20
Test (org.junit.Test): 17
ApplicationId (org.apache.hadoop.yarn.api.records.ApplicationId): 15
CommandLine (org.apache.commons.cli.CommandLine): 9
CustomCommandLine (org.apache.flink.client.cli.CustomCommandLine): 8
FlinkYarnSessionCli (org.apache.flink.yarn.cli.FlinkYarnSessionCli): 8
File (java.io.File): 5
ClusterDeploymentException (org.apache.flink.client.deployment.ClusterDeploymentException): 4
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 4
FlinkException (org.apache.flink.util.FlinkException): 4
Path (org.apache.hadoop.fs.Path): 4
IOException (java.io.IOException): 3
ClusterRetrieveException (org.apache.flink.client.deployment.ClusterRetrieveException): 3
ApplicationConfiguration (org.apache.flink.client.deployment.application.ApplicationConfiguration): 3
IllegalConfigurationException (org.apache.flink.configuration.IllegalConfigurationException): 3
KubernetesJobManagerParameters (org.apache.flink.kubernetes.kubeclient.parameters.KubernetesJobManagerParameters): 3
ClusterEntrypoint (org.apache.flink.runtime.entrypoint.ClusterEntrypoint): 3
ApplicationReport (org.apache.hadoop.yarn.api.records.ApplicationReport): 3
YarnConfiguration (org.apache.hadoop.yarn.conf.YarnConfiguration): 3