
Example 1 with ConfigProperty

use of io.hops.hopsworks.common.util.templates.ConfigProperty in project hopsworks by logicalclocks.

In class HopsUtils, the method mergeHopsworksAndUserParams:

/**
 * Merge system and user defined configuration properties based on the replacement policy of each property
 * @param hopsworksParams System/default properties
 * @param userParameters User defined properties parsed by parseUserProperties(String sparkProps)
 * @return A map with the replacement pattern and value for each property
 */
public static Map<String, String> mergeHopsworksAndUserParams(Map<String, ConfigProperty> hopsworksParams, Map<String, String> userParameters) {
    Map<String, String> finalParams = new HashMap<>();
    Set<String> notReplacedUserParams = new HashSet<>();
    for (Map.Entry<String, String> userParam : userParameters.entrySet()) {
        if (hopsworksParams.containsKey(userParam.getKey())) {
            ConfigProperty prop = hopsworksParams.get(userParam.getKey());
            prop.replaceValue(userParam.getValue());
            finalParams.put(prop.getReplacementPattern(), prop.getValue());
        } else {
            finalParams.put(userParam.getKey(), userParam.getValue());
        }
    }
    String userParamsStr = "";
    if (!notReplacedUserParams.isEmpty()) {
        StringBuilder userParamsSb = new StringBuilder();
        userParamsSb.append(",\n");
        notReplacedUserParams.stream().forEach(p -> userParamsSb.append("\"").append(p).append("\": ").append("\"").append(userParameters.get(p)).append("\"," + "\n"));
        userParamsStr = userParamsSb.toString();
        // Remove last comma and add a new line char
        userParamsStr = userParamsStr.trim().substring(0, userParamsStr.length() - 2) + "\n";
    }
    for (ConfigProperty configProperty : hopsworksParams.values()) {
        finalParams.putIfAbsent(configProperty.getReplacementPattern(), configProperty.getValue());
    }
    return finalParams;
}
Also used : HashMap(java.util.HashMap) ConfigProperty(io.hops.hopsworks.common.util.templates.ConfigProperty) Map(java.util.Map) HashSet(java.util.HashSet)
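
A minimal usage sketch of the merge above. The class name, property keys and values are made up for illustration, and it assumes HopsUtils lives in io.hops.hopsworks.common.util and that the OVERWRITE/IGNORE replacement policies behave as their names suggest (OVERWRITE lets the user value win, IGNORE keeps the system default):

import io.hops.hopsworks.common.util.HopsUtils;
import io.hops.hopsworks.common.util.templates.ConfigProperty;
import java.util.HashMap;
import java.util.Map;

public class MergeParamsSketch {
    public static void main(String[] args) {
        // System/default properties; the first constructor argument doubles as the
        // replacement pattern, mirroring how the callers in this project build the map.
        Map<String, ConfigProperty> hopsworksParams = new HashMap<>();
        hopsworksParams.put("spark.executor.memory",
            new ConfigProperty("spark.executor.memory", HopsUtils.OVERWRITE, "2048m"));
        hopsworksParams.put("spark.pyspark.python",
            new ConfigProperty("spark.pyspark.python", HopsUtils.IGNORE, "/srv/hops/anaconda/bin/python"));

        // User-defined properties (hypothetical values).
        Map<String, String> userParams = new HashMap<>();
        userParams.put("spark.executor.memory", "4096m"); // known key: the replacement policy decides
        userParams.put("spark.custom.flag", "true");      // unknown key: copied through unchanged

        Map<String, String> merged = HopsUtils.mergeHopsworksAndUserParams(hopsworksParams, userParams);
        merged.forEach((pattern, value) -> System.out.println(pattern + " = " + value));
    }
}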

Example 2 with ConfigProperty

use of io.hops.hopsworks.common.util.templates.ConfigProperty in project hopsworks by logicalclocks.

In class SparkYarnRunnerBuilder, the method getYarnRunner:

/**
 * Get a YarnRunner instance that will launch a Spark job.
 *
 * @param project the project that owns the job
 * @param jobUser the HDFS user the job runs as
 * @param hopsworksUser the Hopsworks user submitting the job
 * @param services asynchronous job executor services
 * @param dfsClient distributed file system client
 * @param yarnClient YARN client used to submit the application
 * @param settings Hopsworks settings
 * @param kafkaBrokersString comma-separated list of Kafka brokers
 * @param hopsworksRestEndpoint REST endpoint of the Hopsworks server
 * @param servingConfig serving configuration used to populate environment variables
 * @param serviceDiscoveryController service discovery controller
 * @return The YarnRunner instance to launch the Spark job on Yarn.
 * @throws IOException If creation failed.
 */
public YarnRunner getYarnRunner(Project project, String jobUser, Users hopsworksUser, AsynchronousJobExecutor services, final DistributedFileSystemOps dfsClient, final YarnClient yarnClient, Settings settings, String kafkaBrokersString, String hopsworksRestEndpoint, ServingConfig servingConfig, ServiceDiscoveryController serviceDiscoveryController) throws IOException, ServiceDiscoveryException, JobException, ApiKeyException {
    Map<String, ConfigProperty> jobHopsworksProps = new HashMap<>();
    JobType jobType = job.getJobConfig().getJobType();
    String appPath = ((SparkJobConfiguration) job.getJobConfig()).getAppPath();
    // Create a builder
    YarnRunner.Builder builder = new YarnRunner.Builder(Settings.SPARK_AM_MAIN);
    builder.setJobType(jobType);
    builder.setYarnClient(yarnClient);
    builder.setDfsClient(dfsClient);
    /**
     * * 1. Set stagingPath **
     */
    String stagingPath = "/Projects/" + project.getName() + "/" + Settings.PROJECT_STAGING_DIR + "/.sparkjobstaging-" + YarnRunner.APPID_PLACEHOLDER;
    builder.localResourcesBasePath(stagingPath);
    // //////////////////////////////////////////////////////////////////////////////////////////////////////////////////
    /**
     * * 2. Set job local resources, i.e. project certificates, job jar etc. **
     */
    // Add hdfs prefix so the monitor knows it should find it there
    builder.addFileToRemove("hdfs://" + stagingPath);
    // Add app file
    String appExecName = null;
    if (jobType == JobType.SPARK) {
        appExecName = Settings.SPARK_LOCRSC_APP_JAR;
    } else if (jobType == JobType.PYSPARK) {
        appExecName = appPath.substring(appPath.lastIndexOf(File.separator) + 1);
    }
    builder.addLocalResource(new LocalResourceDTO(appExecName, appPath, LocalResourceVisibility.APPLICATION.toString(), LocalResourceType.FILE.toString(), null), dfsClient);
    builder.addToAppMasterEnvironment(YarnRunner.KEY_CLASSPATH, Settings.SPARK_LOCRSC_APP_JAR);
    // Set executor extraJavaOptions to make parameters available to executors
    Map<String, String> extraJavaOptions = new HashMap<>();
    // These properties are set so that spark history server picks them up
    jobHopsworksProps.put(Settings.SPARK_DRIVER_STAGINGDIR_ENV, new ConfigProperty(Settings.SPARK_DRIVER_STAGINGDIR_ENV, HopsUtils.IGNORE, stagingPath));
    jobHopsworksProps.put(Settings.HOPSWORKS_APPID_PROPERTY, new ConfigProperty(Settings.HOPSWORKS_APPID_PROPERTY, HopsUtils.IGNORE, YarnRunner.APPID_PLACEHOLDER));
    extraJavaOptions.put(Settings.HOPSWORKS_APPID_PROPERTY, YarnRunner.APPID_PLACEHOLDER);
    extraJavaOptions.put(Settings.LOGSTASH_JOB_INFO, project.getName().toLowerCase() + "," + jobName + "," + job.getId() + "," + YarnRunner.APPID_PLACEHOLDER);
    // Set up command
    StringBuilder amargs = new StringBuilder("--class ");
    amargs.append(((SparkJobConfiguration) job.getJobConfig()).getMainClass());
    if (jobType == JobType.PYSPARK) {
        amargs.append(" --primary-py-file ").append(appExecName);
    }
    Map<String, String> finalJobProps = new HashMap<>();
    finalJobProps.putAll(sparkConfigurationUtil.setFrameworkProperties(project, job.getJobConfig(), settings, jobUser, hopsworksUser, extraJavaOptions, kafkaBrokersString, hopsworksRestEndpoint, servingConfig, serviceDiscoveryController));
    finalJobProps.put(Settings.SPARK_YARN_APPMASTER_SPARK_USER, jobUser);
    finalJobProps.put(Settings.SPARK_EXECUTOR_SPARK_USER, jobUser);
    finalJobProps.put(Settings.SPARK_YARN_APPMASTER_YARN_MODE, "true");
    finalJobProps.put(Settings.SPARK_YARN_APPMASTER_YARN_STAGING_DIR, stagingPath);
    // Parse properties from Spark config file
    Properties sparkProperties = new Properties();
    try (InputStream is = new FileInputStream(settings.getSparkDir() + "/" + Settings.SPARK_CONFIG_FILE)) {
        sparkProperties.load(is);
        // For every property that is in the spark configuration file but is not already set, create a system property.
        for (String property : sparkProperties.stringPropertyNames()) {
            if (!finalJobProps.containsKey(property)) {
                finalJobProps.put(property, sparkProperties.getProperty(property).trim());
            }
        }
    }
    for (String jvmOption : finalJobProps.get(Settings.SPARK_DRIVER_EXTRA_JAVA_OPTIONS).split(" +")) {
        builder.addJavaOption(jvmOption);
    }
    for (String key : finalJobProps.keySet()) {
        if (key.startsWith("spark.yarn.appMasterEnv.")) {
            builder.addToAppMasterEnvironment(key.replace("spark.yarn.appMasterEnv.", ""), finalJobProps.get(key));
        }
        addSystemProperty(key, finalJobProps.get(key));
    }
    builder.addToAppMasterEnvironment("CLASSPATH", finalJobProps.get(Settings.SPARK_DRIVER_EXTRACLASSPATH));
    for (String s : sysProps.keySet()) {
        String option = YarnRunner.escapeForShell("-D" + s + "=" + sysProps.get(s));
        builder.addJavaOption(option);
    }
    for (String s : jobArgs) {
        amargs.append(" --arg '").append(s).append("'");
    }
    amargs.append(" --dist-cache-conf 'distcache.conf'");
    builder.amArgs(amargs.toString());
    // Set up Yarn properties
    builder.amMemory(sparkJobConfiguration.getAmMemory());
    builder.amVCores(sparkJobConfiguration.getAmVCores());
    builder.amQueue(sparkJobConfiguration.getAmQueue());
    // pyfiles, jars and files are distributed as spark.yarn.dist.files
    String hopsFiles = finalJobProps.get("spark.yarn.dist.files");
    if (!Strings.isNullOrEmpty(hopsFiles)) {
        for (String filePath : hopsFiles.split(",")) {
            String fileName = filePath.substring(filePath.lastIndexOf("/") + 1);
            if (filePath.contains("#")) {
                fileName = filePath.split("#")[1];
                filePath = filePath.substring(0, filePath.indexOf("#"));
            }
            builder.addLocalResource(new LocalResourceDTO(fileName, filePath, LocalResourceVisibility.APPLICATION.toString(), LocalResourceType.FILE.toString(), null), dfsClient);
        }
    }
    String archives = finalJobProps.get("spark.yarn.dist.archives");
    if (!Strings.isNullOrEmpty(archives)) {
        for (String archivePath : archives.split(",")) {
            String fileName = archivePath.substring(archivePath.lastIndexOf("/") + 1);
            if (archivePath.contains("#")) {
                fileName = archivePath.split("#")[1];
                archivePath = archivePath.substring(0, archivePath.indexOf("#"));
            }
            builder.addLocalResource(new LocalResourceDTO(fileName, archivePath, LocalResourceVisibility.APPLICATION.toString(), LocalResourceType.ARCHIVE.toString(), null), dfsClient);
        }
    }
    // Set app name
    builder.appName(jobName);
    return builder.build(settings.getSparkDir(), JobType.SPARK, services);
}
Also used : HashMap(java.util.HashMap) FileInputStream(java.io.FileInputStream) InputStream(java.io.InputStream) SparkJobConfiguration(io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration) Properties(java.util.Properties) LocalResourceDTO(io.hops.hopsworks.persistence.entity.jobs.configuration.yarn.LocalResourceDTO) YarnRunner(io.hops.hopsworks.common.jobs.yarn.YarnRunner) JobType(io.hops.hopsworks.persistence.entity.jobs.configuration.JobType) ConfigProperty(io.hops.hopsworks.common.util.templates.ConfigProperty)
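
The loop over spark.yarn.dist.files above follows the usual YARN fragment convention: an entry such as hdfs:///Projects/demo/Resources/conf.json#settings.json is localized under the alias after the '#', otherwise under its file name. A standalone sketch of that parsing (the paths are hypothetical):

public class DistFileNameSketch {
    // Mirrors the fileName/filePath handling in getYarnRunner above.
    static String[] parseEntry(String entry) {
        String path = entry;
        String name = entry.substring(entry.lastIndexOf('/') + 1);
        if (entry.contains("#")) {
            name = entry.split("#")[1];                     // alias after '#' becomes the local name
            path = entry.substring(0, entry.indexOf('#'));  // the real path precedes the '#'
        }
        return new String[] { name, path };
    }

    public static void main(String[] args) {
        String[] plain = parseEntry("hdfs:///Projects/demo/Resources/log4j.properties");
        String[] aliased = parseEntry("hdfs:///Projects/demo/Resources/conf.json#settings.json");
        System.out.println(plain[0] + " <- " + plain[1]);     // log4j.properties <- .../log4j.properties
        System.out.println(aliased[0] + " <- " + aliased[1]); // settings.json <- .../conf.json
    }
}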

Example 3 with ConfigProperty

use of io.hops.hopsworks.common.util.templates.ConfigProperty in project hopsworks by logicalclocks.

In class SparkConfigurationUtil, the method setFrameworkProperties:

public Map<String, String> setFrameworkProperties(Project project, JobConfiguration jobConfiguration, Settings settings, String hdfsUser, Users hopsworksUser, Map<String, String> extraJavaOptions, String kafkaBrokersString, String hopsworksRestEndpoint, ServingConfig servingConfig, ServiceDiscoveryController serviceDiscoveryController) throws IOException, ServiceDiscoveryException, JobException, ApiKeyException {
    SparkJobConfiguration sparkJobConfiguration = (SparkJobConfiguration) jobConfiguration;
    validateExecutorMemory(sparkJobConfiguration.getExecutorMemory(), settings);
    ExperimentType experimentType = sparkJobConfiguration.getExperimentType();
    DistributionStrategy distributionStrategy = sparkJobConfiguration.getDistributionStrategy();
    String userSparkProperties = sparkJobConfiguration.getProperties();
    Map<String, ConfigProperty> sparkProps = new HashMap<>();
    if (jobConfiguration.getAppName() != null) {
        sparkProps.put(Settings.SPARK_APP_NAME_ENV, new ConfigProperty(Settings.SPARK_APP_NAME_ENV, HopsUtils.OVERWRITE, sparkJobConfiguration.getAppName()));
    }
    if (sparkJobConfiguration.getJobType() != null && sparkJobConfiguration.getJobType() == JobType.PYSPARK) {
        sparkProps.put(Settings.SPARK_YARN_IS_PYTHON_ENV, new ConfigProperty(Settings.SPARK_YARN_IS_PYTHON_ENV, HopsUtils.OVERWRITE, "true"));
    }
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_CONTAINER_RUNTIME, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_CONTAINER_RUNTIME, HopsUtils.OVERWRITE, settings.getYarnRuntime()));
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_DOCKER_IMAGE, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_DOCKER_IMAGE, HopsUtils.OVERWRITE, ProjectUtils.getFullDockerImageName(project, settings, serviceDiscoveryController, false)));
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_DOCKER_MOUNTS, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_DOCKER_MOUNTS, HopsUtils.OVERWRITE, settings.getDockerMounts()));
    sparkProps.put(Settings.SPARK_EXECUTOR_CONTAINER_RUNTIME, new ConfigProperty(Settings.SPARK_EXECUTOR_CONTAINER_RUNTIME, HopsUtils.OVERWRITE, settings.getYarnRuntime()));
    sparkProps.put(Settings.SPARK_EXECUTOR_DOCKER_IMAGE, new ConfigProperty(Settings.SPARK_EXECUTOR_DOCKER_IMAGE, HopsUtils.OVERWRITE, ProjectUtils.getFullDockerImageName(project, settings, serviceDiscoveryController, false)));
    sparkProps.put(Settings.SPARK_EXECUTOR_DOCKER_MOUNTS, new ConfigProperty(Settings.SPARK_EXECUTOR_DOCKER_MOUNTS, HopsUtils.OVERWRITE, settings.getDockerMounts()));
    sparkProps.put(Settings.SPARK_HADOOP_FS_PERMISSIONS_UMASK, new ConfigProperty(Settings.SPARK_HADOOP_FS_PERMISSIONS_UMASK, HopsUtils.OVERWRITE, Settings.SPARK_HADOOP_FS_PERMISSIONS_UMASK_DEFAULT));
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_IS_DRIVER, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_IS_DRIVER, HopsUtils.IGNORE, "true"));
    sparkProps.put(Settings.SPARK_PYSPARK_PYTHON_OPTION, new ConfigProperty(Settings.SPARK_PYSPARK_PYTHON_OPTION, HopsUtils.IGNORE, settings.getAnacondaProjectDir() + "/bin/python"));
    // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
    // Needs to be set for CUDA libraries to not initialize GPU context
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_CUDA_DEVICES, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_CUDA_DEVICES, HopsUtils.IGNORE, ""));
    // https://rocm-documentation.readthedocs.io/en/latest/Other_Solutions/Other-Solutions.html
    // Needs to be set for ROCm libraries to not initialize GPU context
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_HIP_DEVICES, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_HIP_DEVICES, HopsUtils.IGNORE, "-1"));
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_ENV_EXECUTOR_GPUS, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_ENV_EXECUTOR_GPUS, HopsUtils.IGNORE, "0"));
    sparkProps.put(Settings.SPARK_EXECUTOR_ENV_EXECUTOR_GPUS, new ConfigProperty(Settings.SPARK_EXECUTOR_ENV_EXECUTOR_GPUS, HopsUtils.IGNORE, Integer.toString(sparkJobConfiguration.getExecutorGpus())));
    sparkProps.put(Settings.SPARK_SUBMIT_DEPLOYMODE, new ConfigProperty(Settings.SPARK_SUBMIT_DEPLOYMODE, HopsUtils.OVERWRITE, "cluster"));
    if (sparkJobConfiguration.getExecutorGpus() == 0) {
        addToSparkEnvironment(sparkProps, "HIP_VISIBLE_DEVICES", "-1", HopsUtils.IGNORE);
        addToSparkEnvironment(sparkProps, "CUDA_VISIBLE_DEVICES", "", HopsUtils.IGNORE);
        sparkProps.put(Settings.SPARK_EXECUTOR_GPU_AMOUNT, new ConfigProperty(Settings.SPARK_EXECUTOR_GPU_AMOUNT, HopsUtils.IGNORE, Integer.toString(0)));
    } else if (experimentType != null && sparkJobConfiguration.getExecutorGpus() > 0) {
        // Number of GPU allocated for each executor
        sparkProps.put(Settings.SPARK_EXECUTOR_GPU_AMOUNT, new ConfigProperty(Settings.SPARK_EXECUTOR_GPU_AMOUNT, HopsUtils.IGNORE, Integer.toString(sparkJobConfiguration.getExecutorGpus())));
        // Spark tasks should not share GPUs so we set it to the number of GPUs allocated for each executor
        sparkProps.put(Settings.SPARK_TASK_RESOURCE_GPU_AMOUNT, new ConfigProperty(Settings.SPARK_TASK_RESOURCE_GPU_AMOUNT, HopsUtils.OVERWRITE, Integer.toString(sparkJobConfiguration.getExecutorGpus())));
        // Script needed to find all the GPUs that the Executor has access to
        sparkProps.put(Settings.SPARK_EXECUTOR_RESOURCE_GPU_DISCOVERY_SCRIPT, new ConfigProperty(Settings.SPARK_EXECUTOR_RESOURCE_GPU_DISCOVERY_SCRIPT, HopsUtils.IGNORE, settings.getSparkDir() + "/bin/getGpusResources.sh"));
    }
    addToSparkEnvironment(sparkProps, "SPARK_HOME", settings.getSparkDir(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "SPARK_CONF_DIR", settings.getSparkConfDir(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "ELASTIC_ENDPOINT", settings.getElasticRESTEndpoint(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "HADOOP_VERSION", settings.getHadoopVersion(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "HOPSWORKS_VERSION", settings.getHopsworksVersion(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "TENSORFLOW_VERSION", settings.getTensorflowVersion(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "KAFKA_VERSION", settings.getKafkaVersion(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "SPARK_VERSION", settings.getSparkVersion(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "LIVY_VERSION", settings.getLivyVersion(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "HADOOP_HOME", settings.getHadoopSymbolicLinkDir(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "HADOOP_HDFS_HOME", settings.getHadoopSymbolicLinkDir(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "HADOOP_USER_NAME", hdfsUser, HopsUtils.IGNORE);
    if (!Strings.isNullOrEmpty(sparkJobConfiguration.getAppName())) {
        addToSparkEnvironment(sparkProps, "HOPSWORKS_JOB_NAME", sparkJobConfiguration.getAppName(), HopsUtils.IGNORE);
    }
    if (!Strings.isNullOrEmpty(kafkaBrokersString)) {
        addToSparkEnvironment(sparkProps, "KAFKA_BROKERS", kafkaBrokersString, HopsUtils.IGNORE);
    }
    addToSparkEnvironment(sparkProps, "REST_ENDPOINT", hopsworksRestEndpoint, HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, Settings.SPARK_PYSPARK_PYTHON, settings.getAnacondaProjectDir() + "/bin/python", HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "HOPSWORKS_PROJECT_ID", Integer.toString(project.getId()), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "FLINK_CONF_DIR", settings.getFlinkConfDir(), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "REQUESTS_VERIFY", String.valueOf(settings.getRequestsVerify()), HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "DOMAIN_CA_TRUSTSTORE", Settings.DOMAIN_CA_TRUSTSTORE, HopsUtils.IGNORE);
    addToSparkEnvironment(sparkProps, "SERVICE_DISCOVERY_DOMAIN", settings.getServiceDiscoveryDomain(), HopsUtils.IGNORE);
    // add extra env vars
    if (servingConfig != null) {
        Map<String, String> servingEnvVars = servingConfig.getEnvVars(hopsworksUser, true);
        if (servingEnvVars != null) {
            servingEnvVars.forEach((key, value) -> addToSparkEnvironment(sparkProps, key, value, HopsUtils.IGNORE));
        }
    }
    addLibHdfsOpts(userSparkProperties, settings, sparkProps, sparkJobConfiguration);
    // If DynamicExecutors are not enabled, set the user defined number
    // of executors
    // Force dynamic allocation if we are running a DL experiment (we never want users to lock up GPUs)
    sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.isDynamicAllocationEnabled() || experimentType != null)));
    if (experimentType != null) {
        // Dynamic executors requires the shuffle service to be enabled
        sparkProps.put(Settings.SPARK_SHUFFLE_SERVICE, new ConfigProperty(Settings.SPARK_SHUFFLE_SERVICE, HopsUtils.OVERWRITE, "true"));
        // To avoid deadlock in resource allocation this configuration is needed
        if (experimentType == ExperimentType.DISTRIBUTED_TRAINING) {
            if (distributionStrategy == DistributionStrategy.MULTI_WORKER_MIRRORED) {
                sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, HopsUtils.OVERWRITE, "0"));
                sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors())));
                sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors())));
            } else if (distributionStrategy == DistributionStrategy.PARAMETER_SERVER) {
                // Parameter-server strategy: note that the max/initial values below are built by
                // string concatenation of getDynamicAllocationMaxExecutors() and getNumPs()
                sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, HopsUtils.OVERWRITE, "0"));
                sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors()) + sparkJobConfiguration.getNumPs()));
                sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors()) + sparkJobConfiguration.getNumPs()));
                addToSparkEnvironment(sparkProps, "NUM_TF_PS", Integer.toString(sparkJobConfiguration.getNumPs()), HopsUtils.IGNORE);
            }
            // These values were set based on:
            // https://docs.nvidia.com/deeplearning/nccl/archives/nccl_256/nccl-developer-guide/docs/env.html
            addToSparkEnvironment(sparkProps, Settings.NCCL_SOCKET_NTHREADS, "2", HopsUtils.OVERWRITE);
            addToSparkEnvironment(sparkProps, Settings.NCCL_NSOCKS_PERTHREAD, "8", HopsUtils.OVERWRITE);
        } else if (experimentType == ExperimentType.PARALLEL_EXPERIMENTS) {
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, HopsUtils.OVERWRITE, "0"));
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors())));
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, "0"));
        } else {
            // EXPERIMENT
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, HopsUtils.OVERWRITE, "0"));
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, HopsUtils.OVERWRITE, "1"));
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, "0"));
        }
    } else if (sparkJobConfiguration.isDynamicAllocationEnabled()) {
        // Spark dynamic
        sparkProps.put(Settings.SPARK_SHUFFLE_SERVICE, new ConfigProperty(Settings.SPARK_SHUFFLE_SERVICE, HopsUtils.OVERWRITE, "true"));
        // Initial executors should not be greater than MaxExecutors
        if (sparkJobConfiguration.getDynamicAllocationInitialExecutors() > sparkJobConfiguration.getDynamicAllocationMaxExecutors()) {
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors())));
        // Initial executors should not be less than MinExecutors
        } else if (sparkJobConfiguration.getDynamicAllocationInitialExecutors() < sparkJobConfiguration.getDynamicAllocationMinExecutors()) {
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMinExecutors())));
        } else {
            // User set it to a valid value
            sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_INIT_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationInitialExecutors())));
        }
        sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MIN_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMinExecutors())));
        sparkProps.put(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, new ConfigProperty(Settings.SPARK_DYNAMIC_ALLOC_MAX_EXECS_ENV, HopsUtils.OVERWRITE, String.valueOf(sparkJobConfiguration.getDynamicAllocationMaxExecutors())));
        sparkProps.put(Settings.SPARK_NUMBER_EXECUTORS_ENV, new ConfigProperty(Settings.SPARK_NUMBER_EXECUTORS_ENV, HopsUtils.OVERWRITE, Integer.toString(sparkJobConfiguration.getDynamicAllocationMinExecutors())));
    } else {
        // Spark Static
        sparkProps.put(Settings.SPARK_NUMBER_EXECUTORS_ENV, new ConfigProperty(Settings.SPARK_NUMBER_EXECUTORS_ENV, HopsUtils.OVERWRITE, Integer.toString(sparkJobConfiguration.getExecutorInstances())));
    }
    sparkProps.put(Settings.SPARK_DRIVER_MEMORY_ENV, new ConfigProperty(Settings.SPARK_DRIVER_MEMORY_ENV, HopsUtils.OVERWRITE, sparkJobConfiguration.getAmMemory() + "m"));
    sparkProps.put(Settings.SPARK_DRIVER_CORES_ENV, new ConfigProperty(Settings.SPARK_DRIVER_CORES_ENV, HopsUtils.OVERWRITE, Integer.toString(experimentType != null ? 1 : sparkJobConfiguration.getAmVCores())));
    sparkProps.put(Settings.SPARK_EXECUTOR_MEMORY_ENV, new ConfigProperty(Settings.SPARK_EXECUTOR_MEMORY_ENV, HopsUtils.OVERWRITE, sparkJobConfiguration.getExecutorMemory() + "m"));
    sparkProps.put(Settings.SPARK_EXECUTOR_CORES_ENV, new ConfigProperty(Settings.SPARK_EXECUTOR_CORES_ENV, HopsUtils.OVERWRITE, Integer.toString(experimentType != null ? 1 : sparkJobConfiguration.getExecutorCores())));
    StringBuilder extraClassPath = new StringBuilder();
    extraClassPath.append("{{PWD}}").append(File.pathSeparator).append(settings.getSparkDir()).append("/jars/*").append(File.pathSeparator).append(settings.getSparkDir()).append("/hopsworks-jars/*");
    StringBuilder sparkFiles = new StringBuilder(settings.getSparkLog4JPath());
    String applicationsJars = sparkJobConfiguration.getJars();
    if (!Strings.isNullOrEmpty(applicationsJars)) {
        applicationsJars = formatResources(applicationsJars);
        for (String jar : applicationsJars.split(",")) {
            String name = jar.substring(jar.lastIndexOf("/") + 1);
            extraClassPath.append(File.pathSeparator).append(name);
        }
        applicationsJars = formatResources(applicationsJars);
        sparkFiles.append(",").append(applicationsJars);
    }
    String applicationArchives = sparkJobConfiguration.getArchives();
    if (!Strings.isNullOrEmpty(applicationArchives)) {
        applicationArchives = formatResources(applicationArchives);
        sparkProps.put(Settings.SPARK_YARN_DIST_ARCHIVES, new ConfigProperty(Settings.SPARK_YARN_DIST_ARCHIVES, HopsUtils.APPEND_COMMA, applicationArchives));
    }
    // If Hops RPC TLS is not enabled, ship the user's certificates (keystore, truststore and
    // material password) via spark.yarn.dist.files so the NodeManagers localize them;
    // they do not need to be added as LocalResources here
    if (!settings.getHopsRpcTls()) {
        sparkFiles.append(",hdfs://").append(settings.getHdfsTmpCertDir()).append(File.separator).append(hdfsUser).append(File.separator).append(hdfsUser).append("__kstore.jks#").append(Settings.K_CERTIFICATE).append(",").append("hdfs://").append(settings.getHdfsTmpCertDir()).append(File.separator).append(hdfsUser).append(File.separator).append(hdfsUser).append("__tstore.jks#").append(Settings.T_CERTIFICATE).append(",").append("hdfs://").append(settings.getHdfsTmpCertDir()).append(File.separator).append(hdfsUser).append(File.separator).append(hdfsUser).append("__cert.key#").append(Settings.CRYPTO_MATERIAL_PASSWORD);
    }
    String applicationFiles = sparkJobConfiguration.getFiles();
    if (!Strings.isNullOrEmpty(applicationFiles)) {
        applicationFiles = formatResources(applicationFiles);
        sparkFiles.append(",").append(applicationFiles);
    }
    String applicationPyFiles = sparkJobConfiguration.getPyFiles();
    if (!Strings.isNullOrEmpty(applicationPyFiles)) {
        StringBuilder pythonPath = new StringBuilder();
        applicationPyFiles = formatResources(applicationPyFiles);
        for (String pythonDep : applicationPyFiles.split(",")) {
            String name = pythonDep.substring(pythonDep.lastIndexOf("/") + 1);
            pythonPath.append("{{PWD}}/" + name + File.pathSeparator);
        }
        addToSparkEnvironment(sparkProps, "PYTHONPATH", pythonPath.toString(), HopsUtils.APPEND_PATH);
        sparkFiles.append(",").append(applicationPyFiles);
    }
    applicationFiles = formatResources(sparkFiles.toString());
    sparkProps.put(Settings.SPARK_YARN_DIST_FILES, new ConfigProperty(Settings.SPARK_YARN_DIST_FILES, HopsUtils.APPEND_COMMA, applicationFiles));
    sparkProps.put(Settings.SPARK_DRIVER_EXTRACLASSPATH, new ConfigProperty(Settings.SPARK_DRIVER_EXTRACLASSPATH, HopsUtils.APPEND_PATH, extraClassPath.toString()));
    sparkProps.put(Settings.SPARK_EXECUTOR_EXTRACLASSPATH, new ConfigProperty(Settings.SPARK_EXECUTOR_EXTRACLASSPATH, HopsUtils.APPEND_PATH, extraClassPath.toString()));
    // We do not support fault-tolerance for distributed training
    if (experimentType == ExperimentType.DISTRIBUTED_TRAINING) {
        sparkProps.put(Settings.SPARK_BLACKLIST_ENABLED, new ConfigProperty(Settings.SPARK_BLACKLIST_ENABLED, HopsUtils.OVERWRITE, "false"));
    } else if (sparkJobConfiguration.isBlacklistingEnabled()) {
        sparkProps.put(Settings.SPARK_BLACKLIST_ENABLED, new ConfigProperty(Settings.SPARK_BLACKLIST_ENABLED, HopsUtils.OVERWRITE, Boolean.toString(sparkJobConfiguration.isBlacklistingEnabled())));
        // If any task fails on an executor - kill it instantly (need fresh working directory for each task)
        sparkProps.put(Settings.SPARK_BLACKLIST_MAX_TASK_ATTEMPTS_PER_EXECUTOR, new ConfigProperty(Settings.SPARK_BLACKLIST_MAX_TASK_ATTEMPTS_PER_EXECUTOR, HopsUtils.OVERWRITE, "1"));
        // Blacklist node after 2 tasks fails on it
        sparkProps.put(Settings.SPARK_BLACKLIST_MAX_TASK_ATTEMPTS_PER_NODE, new ConfigProperty(Settings.SPARK_BLACKLIST_MAX_TASK_ATTEMPTS_PER_NODE, HopsUtils.OVERWRITE, "2"));
        // If any task fails on an executor within a stage - blacklist it
        sparkProps.put(Settings.SPARK_BLACKLIST_STAGE_MAX_FAILED_TASKS_PER_EXECUTOR, new ConfigProperty(Settings.SPARK_BLACKLIST_STAGE_MAX_FAILED_TASKS_PER_EXECUTOR, HopsUtils.OVERWRITE, "1"));
        // Blacklist node after 2 tasks within a stage fails on it
        sparkProps.put(Settings.SPARK_BLACKLIST_STAGE_MAX_FAILED_TASKS_PER_NODE, new ConfigProperty(Settings.SPARK_BLACKLIST_STAGE_MAX_FAILED_TASKS_PER_NODE, HopsUtils.OVERWRITE, "2"));
        // If any task fails on an executor within an application - blacklist it
        sparkProps.put(Settings.SPARK_BLACKLIST_APPLICATION_MAX_FAILED_TASKS_PER_EXECUTOR, new ConfigProperty(Settings.SPARK_BLACKLIST_APPLICATION_MAX_FAILED_TASKS_PER_EXECUTOR, HopsUtils.OVERWRITE, "1"));
        // If 2 task fails on a node within an application - blacklist it
        sparkProps.put(Settings.SPARK_BLACKLIST_APPLICATION_MAX_FAILED_TASKS_PER_NODE, new ConfigProperty(Settings.SPARK_BLACKLIST_APPLICATION_MAX_FAILED_TASKS_PER_NODE, HopsUtils.OVERWRITE, "2"));
        // Always kill the blacklisted executors (further failures could be results of local files from the failed task)
        sparkProps.put(Settings.SPARK_BLACKLIST_KILL_BLACKLISTED_EXECUTORS, new ConfigProperty(Settings.SPARK_BLACKLIST_KILL_BLACKLISTED_EXECUTORS, HopsUtils.OVERWRITE, "true"));
    }
    // Task retry behaviour for experiments, complementing the blacklisting settings above
    if (experimentType != null) {
        // Blacklisting is enabled and we are dealing with an Experiment/Parallel Experiment
        if (sparkJobConfiguration.isBlacklistingEnabled() && (experimentType == ExperimentType.EXPERIMENT || experimentType == ExperimentType.PARALLEL_EXPERIMENTS)) {
            sparkProps.put(Settings.SPARK_TASK_MAX_FAILURES, new ConfigProperty(Settings.SPARK_TASK_MAX_FAILURES, HopsUtils.OVERWRITE, "3"));
        // All other configurations should not retry to avoid wasting time during development (syntax errors etc)
        } else {
            sparkProps.put(Settings.SPARK_TASK_MAX_FAILURES, new ConfigProperty(Settings.SPARK_TASK_MAX_FAILURES, HopsUtils.OVERWRITE, "1"));
        }
    }
    extraJavaOptions.put(Settings.JOB_LOG4J_CONFIG, Settings.JOB_LOG4J_PROPERTIES);
    extraJavaOptions.put(Settings.HOPSWORKS_REST_ENDPOINT_PROPERTY, hopsworksRestEndpoint);
    extraJavaOptions.put(Settings.HOPSUTIL_INSECURE_PROPERTY, String.valueOf(settings.isHopsUtilInsecure()));
    extraJavaOptions.put(Settings.SERVER_TRUSTSTORE_PROPERTY, Settings.SERVER_TRUSTSTORE_PROPERTY);
    extraJavaOptions.put(Settings.HOPSWORKS_ELASTIC_ENDPOINT_PROPERTY, settings.getElasticRESTEndpoint());
    extraJavaOptions.put(Settings.HOPSWORKS_PROJECTID_PROPERTY, Integer.toString(project.getId()));
    extraJavaOptions.put(Settings.HOPSWORKS_PROJECTNAME_PROPERTY, project.getName());
    extraJavaOptions.put(Settings.SPARK_JAVA_LIBRARY_PROP, settings.getHadoopSymbolicLinkDir() + "/lib/native/");
    extraJavaOptions.put(Settings.HOPSWORKS_PROJECTUSER_PROPERTY, hdfsUser);
    extraJavaOptions.put(Settings.KAFKA_BROKERADDR_PROPERTY, kafkaBrokersString);
    extraJavaOptions.put(Settings.HOPSWORKS_JOBTYPE_PROPERTY, JobType.SPARK.name());
    extraJavaOptions.put(Settings.HOPSWORKS_DOMAIN_CA_TRUSTSTORE_PROPERTY, Settings.DOMAIN_CA_TRUSTSTORE);
    if (jobConfiguration.getAppName() != null) {
        extraJavaOptions.put(Settings.HOPSWORKS_JOBNAME_PROPERTY, jobConfiguration.getAppName());
    }
    StringBuilder extraJavaOptionsSb = new StringBuilder();
    for (String key : extraJavaOptions.keySet()) {
        extraJavaOptionsSb.append(" -D").append(key).append("=").append(extraJavaOptions.get(key));
    }
    sparkProps.put(Settings.SPARK_EXECUTOR_EXTRA_JAVA_OPTS, new ConfigProperty(Settings.SPARK_EXECUTOR_EXTRA_JAVA_OPTS, HopsUtils.APPEND_SPACE, extraJavaOptionsSb.toString()));
    sparkProps.put(Settings.SPARK_DRIVER_EXTRA_JAVA_OPTIONS, new ConfigProperty(Settings.SPARK_DRIVER_EXTRA_JAVA_OPTIONS, HopsUtils.APPEND_SPACE, extraJavaOptionsSb.toString()));
    Map<String, String> validatedSparkProperties = HopsUtils.validateUserProperties(userSparkProperties, settings.getSparkDir());
    // Merge system and user defined properties
    return HopsUtils.mergeHopsworksAndUserParams(sparkProps, validatedSparkProperties);
}
Also used : DistributionStrategy(io.hops.hopsworks.persistence.entity.jobs.configuration.DistributionStrategy) HashMap(java.util.HashMap) SparkJobConfiguration(io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration) ExperimentType(io.hops.hopsworks.persistence.entity.jobs.configuration.ExperimentType) ConfigProperty(io.hops.hopsworks.common.util.templates.ConfigProperty)
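
When dynamic allocation is enabled and no experiment type is set, the method above clamps the user's initial executor count to the [min, max] range before handing it to Spark. The same check as a standalone helper (the class and method names are ours):

public class InitialExecutorsSketch {
    // Standalone version of the initial-executor clamping performed in setFrameworkProperties.
    static int clampInitialExecutors(int initial, int min, int max) {
        if (initial > max) {
            return max;     // initial executors must not exceed spark.dynamicAllocation.maxExecutors
        } else if (initial < min) {
            return min;     // ... and must not fall below spark.dynamicAllocation.minExecutors
        }
        return initial;     // user value is already within [min, max]
    }

    public static void main(String[] args) {
        System.out.println(clampInitialExecutors(10, 1, 4)); // 4
        System.out.println(clampInitialExecutors(0, 1, 4));  // 1
        System.out.println(clampInitialExecutors(2, 1, 4));  // 2
    }
}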

Example 4 with ConfigProperty

use of io.hops.hopsworks.common.util.templates.ConfigProperty in project hopsworks by logicalclocks.

In class SparkConfigurationUtil, the method addLibHdfsOpts:

private void addLibHdfsOpts(String userSparkProperties, Settings settings, Map<String, ConfigProperty> sparkProps, SparkJobConfiguration sparkJobConfiguration) {
    String defaultLibHdfsOpts = "-Dlog4j.configuration=" + settings.getHadoopSymbolicLinkDir() + "/etc/hadoop/log4j.properties -Dhadoop.root.logger=ERROR,RFA";
    Map<String, String> userProperties = HopsUtils.parseUserProperties(userSparkProperties);
    if (userProperties.containsKey(Settings.SPARK_YARN_APPMASTER_LIBHDFS_OPTS)) {
        // if user supplied xmx then append what they provided
        sparkProps.put(Settings.SPARK_YARN_APPMASTER_LIBHDFS_OPTS, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_LIBHDFS_OPTS, HopsUtils.APPEND_SPACE, defaultLibHdfsOpts));
    } else {
        addDefaultXmx(sparkProps, Settings.SPARK_YARN_APPMASTER_ENV, (int) (sparkJobConfiguration.getAmMemory() * 0.2), defaultLibHdfsOpts);
    }
    if (userProperties.containsKey(Settings.SPARK_EXECUTOR_ENV + "LIBHDFS_OPTS")) {
        // if user supplied xmx then append what they provided
        sparkProps.put(Settings.SPARK_EXECUTOR_ENV + "LIBHDFS_OPTS", new ConfigProperty(Settings.SPARK_EXECUTOR_ENV + "LIBHDFS_OPTS", HopsUtils.APPEND_SPACE, defaultLibHdfsOpts));
    } else {
        addDefaultXmx(sparkProps, Settings.SPARK_EXECUTOR_ENV, (int) (sparkJobConfiguration.getExecutorMemory() * 0.2), defaultLibHdfsOpts);
    }
}
Also used : ConfigProperty(io.hops.hopsworks.common.util.templates.ConfigProperty)
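
addDefaultXmx is not shown on this page, but from its call sites the intent of addLibHdfsOpts is: if the user already supplied LIBHDFS_OPTS, only the default log4j/logger options are appended (APPEND_SPACE) so the user's own -Xmx survives; otherwise a default heap of roughly 20% of the container memory is added alongside the defaults. A hedged sketch of that decision, where both the concatenation order and the -Xmx formatting are assumptions:

public class LibHdfsOptsSketch {
    // Hypothetical illustration; addDefaultXmx itself is not shown in this example,
    // so the -Xmx formatting below is an assumption.
    static String libHdfsOpts(String userValue, int containerMemoryMb, String defaultOpts) {
        if (userValue != null && !userValue.isEmpty()) {
            // user supplied their own options (possibly with -Xmx): append the defaults after them
            return userValue + " " + defaultOpts;
        }
        // no user value: add a default heap of ~20% of the container memory
        int xmxMb = (int) (containerMemoryMb * 0.2);
        return defaultOpts + " -Xmx" + xmxMb + "m";
    }

    public static void main(String[] args) {
        String defaults = "-Dhadoop.root.logger=ERROR,RFA";
        System.out.println(libHdfsOpts(null, 2048, defaults));       // defaults plus -Xmx409m
        System.out.println(libHdfsOpts("-Xmx512m", 2048, defaults)); // user -Xmx kept, defaults appended
    }
}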

Example 5 with ConfigProperty

use of io.hops.hopsworks.common.util.templates.ConfigProperty in project hopsworks by logicalclocks.

In class SparkConfigurationUtil, the method addToSparkEnvironment:

private void addToSparkEnvironment(Map<String, ConfigProperty> sparkProps, String envName, String value, ConfigReplacementPolicy replacementPolicy) {
    sparkProps.put(Settings.SPARK_EXECUTOR_ENV + envName, new ConfigProperty(Settings.SPARK_EXECUTOR_ENV + envName, replacementPolicy, value));
    sparkProps.put(Settings.SPARK_YARN_APPMASTER_ENV + envName, new ConfigProperty(Settings.SPARK_YARN_APPMASTER_ENV + envName, replacementPolicy, value));
}
Also used : ConfigProperty(io.hops.hopsworks.common.util.templates.ConfigProperty)
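
Each call therefore registers the variable twice, once for the executors and once for the application master. A standalone sketch of the double prefixing; the literal prefix values are assumptions based on the standard Spark configuration keys (Example 2 above strips "spark.yarn.appMasterEnv." when building the AM environment):

import java.util.HashMap;
import java.util.Map;

public class SparkEnvSketch {
    static final String EXECUTOR_ENV = "spark.executorEnv.";        // assumed value of Settings.SPARK_EXECUTOR_ENV
    static final String APPMASTER_ENV = "spark.yarn.appMasterEnv."; // assumed value of Settings.SPARK_YARN_APPMASTER_ENV

    static void addToSparkEnvironment(Map<String, String> props, String envName, String value) {
        props.put(EXECUTOR_ENV + envName, value);   // visible inside every executor container
        props.put(APPMASTER_ENV + envName, value);  // visible inside the application master / driver
    }

    public static void main(String[] args) {
        Map<String, String> props = new HashMap<>();
        addToSparkEnvironment(props, "SPARK_HOME", "/srv/hops/spark");
        // prints, in unspecified order:
        //   spark.executorEnv.SPARK_HOME = /srv/hops/spark
        //   spark.yarn.appMasterEnv.SPARK_HOME = /srv/hops/spark
        props.forEach((k, v) -> System.out.println(k + " = " + v));
    }
}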

Aggregations

ConfigProperty (io.hops.hopsworks.common.util.templates.ConfigProperty): 6 usages
HashMap (java.util.HashMap): 4 usages
SparkJobConfiguration (io.hops.hopsworks.persistence.entity.jobs.configuration.spark.SparkJobConfiguration): 2 usages
YarnRunner (io.hops.hopsworks.common.jobs.yarn.YarnRunner): 1 usage
DistributionStrategy (io.hops.hopsworks.persistence.entity.jobs.configuration.DistributionStrategy): 1 usage
ExperimentType (io.hops.hopsworks.persistence.entity.jobs.configuration.ExperimentType): 1 usage
JobType (io.hops.hopsworks.persistence.entity.jobs.configuration.JobType): 1 usage
FlinkJobConfiguration (io.hops.hopsworks.persistence.entity.jobs.configuration.flink.FlinkJobConfiguration): 1 usage
LocalResourceDTO (io.hops.hopsworks.persistence.entity.jobs.configuration.yarn.LocalResourceDTO): 1 usage
FileInputStream (java.io.FileInputStream): 1 usage
InputStream (java.io.InputStream): 1 usage
HashSet (java.util.HashSet): 1 usage
Map (java.util.Map): 1 usage
Properties (java.util.Properties): 1 usage