Search in sources :

Example 1 with ClusterConfig

Use of com.google.cloud.dataproc.v1.ClusterConfig in project cdap by caskdata.

The class DataprocClient, method createCluster.

/**
 * Create a cluster. This will return after the initial request to create the cluster is completed.
 * At this point, the cluster is likely not yet running, but in a provisioning state.
 *
 * <p>When a custom image URI is configured, it is applied to the master, primary worker and secondary
 * worker instance groups and {@code imageVersion} is not set on the software config. Otherwise only
 * {@code imageVersion} is set.
 *
 * @param name         the name of the cluster to create
 * @param imageVersion the image version for the cluster; only used when no custom image URI is configured
 * @param labels       labels to set on the cluster
 * @param privateInstance {@code true} to indicate using private instance
 * @return create operation metadata
 * @throws IllegalArgumentException    if the network information is missing
 * @throws InterruptedException        if the thread was interrupted while waiting for the initial request to complete
 * @throws AlreadyExistsException      if the cluster already exists
 * @throws IOException                 if there was an I/O error talking to Google Compute APIs
 * @throws RetryableProvisionException if there was a non 4xx error code returned
 * @throws DataprocRuntimeException    if the create request failed with a cause that is not an {@code ApiException}
 */
ClusterOperationMetadata createCluster(String name, String imageVersion, Map<String, String> labels, boolean privateInstance) throws RetryableProvisionException, InterruptedException, IOException {
    if (network == null) {
        // Defensive check: the network has not been resolved, yet it is being used to create a cluster.
        throw new IllegalArgumentException("Missing network information");
    }
    try {
        // ---- Instance metadata ----
        Map<String, String> metadata = new HashMap<>();
        SSHPublicKey publicKey = conf.getPublicKey();
        if (publicKey != null) {
            // Don't fail if there is no public key. It is for tooling case that the key might be generated differently.
            metadata.put("ssh-keys", publicKey.getUser() + ":" + publicKey.getKey());
            // override any os-login that may be set on the project-level metadata
            // this metadata is only needed if ssh is being used to launch the jobs - CDAP-15369
            metadata.put("enable-oslogin", "false");
        }
        // Add any user-provided cluster metadata; these entries override the ones set above on key collision.
        metadata.putAll(conf.getClusterMetaData());

        // ---- GCE level configuration (scopes, shielded VM, network, tags) ----
        ShieldedInstanceConfig shieldedInstanceConfig = ShieldedInstanceConfig.newBuilder()
            .setEnableSecureBoot(conf.isSecureBootEnabled())
            .setEnableVtpm(conf.isvTpmEnabled())
            .setEnableIntegrityMonitoring(conf.isIntegrityMonitoringEnabled())
            .build();
        GceClusterConfig.Builder clusterConfig = GceClusterConfig.newBuilder()
            .addServiceAccountScopes(DataprocConf.CLOUD_PLATFORM_SCOPE)
            .setShieldedInstanceConfig(shieldedInstanceConfig)
            .putAllMetadata(metadata);
        if (conf.getServiceAccount() != null) {
            clusterConfig.setServiceAccount(conf.getServiceAccount());
        }
        if (conf.getZone() != null) {
            clusterConfig.setZoneUri(conf.getZone());
        }
        // subnets are unique within a location, not within a network, which is why these configs are mutually exclusive.
        if (conf.getSubnet() != null) {
            clusterConfig.setSubnetworkUri(conf.getSubnet());
        } else {
            clusterConfig.setNetworkUri(network.getSelfLink());
        }
        // Add any defined Network Tags
        clusterConfig.addAllTags(conf.getNetworkTags());
        boolean internalIPOnly = isInternalIPOnly(network, privateInstance, publicKey != null);
        // if public key is not null that means ssh is used to launch / monitor job on dataproc
        if (publicKey != null) {
            // Only as many firewall tags as still fit under the tag limit are added; the rest are logged and dropped.
            int maxTags = Math.max(0, DataprocConf.MAX_NETWORK_TAGS - clusterConfig.getTagsCount());
            List<String> tags = getFirewallTargetTags(network, internalIPOnly);
            if (tags.size() > maxTags) {
                LOG.warn("No more than 64 tags can be added. Firewall tags ignored: {}", tags.subList(maxTags, tags.size()));
            }
            tags.stream().limit(maxTags).forEach(clusterConfig::addTags);
        }
        // if internal ip is preferred then create dataproc cluster without external ip for better security
        clusterConfig.setInternalIpOnly(internalIPOnly);

        // ---- Cluster (software) properties ----
        Map<String, String> clusterProperties = new HashMap<>(conf.getClusterProperties());
        // Enable/Disable stackdriver
        clusterProperties.put("dataproc:dataproc.logging.stackdriver.enable", Boolean.toString(conf.isStackdriverLoggingEnabled()));
        clusterProperties.put("dataproc:dataproc.monitoring.stackdriver.enable", Boolean.toString(conf.isStackdriverMonitoringEnabled()));

        // ---- Instance groups ----
        DiskConfig masterDiskConfig = DiskConfig.newBuilder()
            .setBootDiskType(conf.getMasterDiskType())
            .setBootDiskSizeGb(conf.getMasterDiskGB())
            .setNumLocalSsds(0)
            .build();
        InstanceGroupConfig.Builder masterConfig = InstanceGroupConfig.newBuilder()
            .setNumInstances(conf.getMasterNumNodes())
            .setMachineTypeUri(conf.getMasterMachineType())
            .setDiskConfig(masterDiskConfig);
        DiskConfig workerDiskConfig = DiskConfig.newBuilder()
            .setBootDiskSizeGb(conf.getWorkerDiskGB())
            .setBootDiskType(conf.getWorkerDiskType())
            .setNumLocalSsds(0)
            .build();
        InstanceGroupConfig.Builder primaryWorkerConfig = InstanceGroupConfig.newBuilder()
            .setNumInstances(conf.getWorkerNumNodes())
            .setMachineTypeUri(conf.getWorkerMachineType())
            .setDiskConfig(workerDiskConfig);
        InstanceGroupConfig.Builder secondaryWorkerConfig = InstanceGroupConfig.newBuilder()
            .setNumInstances(conf.getSecondaryWorkerNumNodes())
            .setMachineTypeUri(conf.getWorkerMachineType())
            .setPreemptibility(InstanceGroupConfig.Preemptibility.NON_PREEMPTIBLE)
            .setDiskConfig(workerDiskConfig);

        // Set default concurrency settings for fixed cluster
        if (Strings.isNullOrEmpty(conf.getAutoScalingPolicy()) && !conf.isPredefinedAutoScaleEnabled()) {
            // Set spark.default.parallelism according to cluster size.
            // Spark defaults it to number of current executors, but when we configure the job
            // executors may not have started yet, so this value gets artificially low.
            int defaultConcurrency = Math.max(conf.getTotalWorkerCPUs(), MIN_DEFAULT_CONCURRENCY);
            // Set spark.sql.adaptive.coalescePartitions.initialPartitionNum as 32x of default parallelism,
            // but no more than 8192. This value is used only in spark 3 with adaptive execution and
            // according to our tests spark can handle really large numbers and 32x is a reasonable default.
            int initialPartitionNum = Math.min(Math.max(conf.getTotalWorkerCPUs() * PARTITION_NUM_FACTOR, MIN_INITIAL_PARTITIONS_DEFAULT), MAX_INITIAL_PARTITIONS_DEFAULT);
            // putIfAbsent: explicitly configured cluster properties take precedence over these defaults.
            clusterProperties.putIfAbsent("spark:spark.default.parallelism", Integer.toString(defaultConcurrency));
            clusterProperties.putIfAbsent("spark:spark.sql.adaptive.coalescePartitions.initialPartitionNum", Integer.toString(initialPartitionNum));
        }
        SoftwareConfig.Builder softwareConfigBuilder = SoftwareConfig.newBuilder().putAllProperties(clusterProperties);

        // Use image version only if custom Image URI is not specified, otherwise may cause image version conflicts
        String customImageUri = conf.getCustomImageUri();
        if (Strings.isNullOrEmpty(customImageUri)) {
            softwareConfigBuilder.setImageVersion(imageVersion);
        } else {
            // If custom Image URI is specified, use that for cluster creation. It has to be set on every
            // instance group, including the master: no image version is set in this branch, so a group
            // without an image URI would not be created from the custom image.
            masterConfig.setImageUri(customImageUri);
            primaryWorkerConfig.setImageUri(customImageUri);
            secondaryWorkerConfig.setImageUri(customImageUri);
        }

        // ---- Assemble the cluster config ----
        EndpointConfig endpointConfig = EndpointConfig.newBuilder()
            .setEnableHttpPortAccess(conf.isComponentGatewayEnabled())
            .build();
        ClusterConfig.Builder builder = ClusterConfig.newBuilder()
            .setEndpointConfig(endpointConfig)
            .setMasterConfig(masterConfig.build())
            .setWorkerConfig(primaryWorkerConfig.build())
            .setSecondaryWorkerConfig(secondaryWorkerConfig.build())
            .setGceClusterConfig(clusterConfig.build())
            .setSoftwareConfig(softwareConfigBuilder);
        // Cluster TTL if one should be set
        if (conf.getIdleTTLMinutes() > 0) {
            long seconds = TimeUnit.MINUTES.toSeconds(conf.getIdleTTLMinutes());
            builder.setLifecycleConfig(LifecycleConfig.newBuilder().setIdleDeleteTtl(Duration.newBuilder().setSeconds(seconds).build()).build());
        }
        // Add any Node Initialization action scripts
        for (String action : conf.getInitActions()) {
            builder.addInitializationActions(NodeInitializationAction.newBuilder().setExecutableFile(action).build());
        }
        // Set Auto Scaling Policy
        String autoScalingPolicy = conf.getAutoScalingPolicy();
        if (conf.isPredefinedAutoScaleEnabled()) {
            // The predefined policy, when enabled, replaces any explicitly configured policy.
            PredefinedAutoScaling predefinedAutoScaling = new PredefinedAutoScaling(conf);
            autoScalingPolicy = predefinedAutoScaling.createPredefinedAutoScalingPolicy();
        }
        if (!Strings.isNullOrEmpty(autoScalingPolicy)) {
            // Check if policy is URI or ID. If ID Convert to URI
            if (!autoScalingPolicy.contains("/")) {
                autoScalingPolicy = "projects/" + conf.getProjectId() + "/regions/" + conf.getRegion() + "/autoscalingPolicies/" + autoScalingPolicy;
            }
            builder.setAutoscalingConfig(AutoscalingConfig.newBuilder().setPolicyUri(autoScalingPolicy).build());
        }
        if (conf.getEncryptionKeyName() != null) {
            builder.setEncryptionConfig(EncryptionConfig.newBuilder().setGcePdKmsKeyName(conf.getEncryptionKeyName()).build());
        }
        if (conf.getGcsBucket() != null) {
            builder.setConfigBucket(conf.getGcsBucket());
        }

        // ---- Submit the create request ----
        Cluster cluster = com.google.cloud.dataproc.v1.Cluster.newBuilder().setClusterName(name).putAllLabels(labels).setConfig(builder.build()).build();
        OperationFuture<Cluster, ClusterOperationMetadata> operationFuture = client.createClusterAsync(conf.getProjectId(), conf.getRegion(), cluster);
        // Waits only for the operation metadata, not for the operation itself to finish.
        return operationFuture.getMetadata().get();
    } catch (ExecutionException e) {
        // Clean up first, then translate the failure for the caller.
        cleanUpClusterAfterCreationFailure(name);
        Throwable cause = e.getCause();
        if (cause instanceof ApiException) {
            throw handleApiException((ApiException) cause);
        }
        throw new DataprocRuntimeException(cause);
    }
}
Also used : ClusterOperationMetadata(com.google.cloud.dataproc.v1.ClusterOperationMetadata) HashMap(java.util.HashMap) DiskConfig(com.google.cloud.dataproc.v1.DiskConfig) SoftwareConfig(com.google.cloud.dataproc.v1.SoftwareConfig) ExecutionException(java.util.concurrent.ExecutionException) InstanceGroupConfig(com.google.cloud.dataproc.v1.InstanceGroupConfig) GceClusterConfig(com.google.cloud.dataproc.v1.GceClusterConfig) Cluster(com.google.cloud.dataproc.v1.Cluster) SSHPublicKey(io.cdap.cdap.runtime.spi.ssh.SSHPublicKey) ClusterConfig(com.google.cloud.dataproc.v1.ClusterConfig) GceClusterConfig(com.google.cloud.dataproc.v1.GceClusterConfig) ApiException(com.google.api.gax.rpc.ApiException)

Aggregations

ApiException (com.google.api.gax.rpc.ApiException)1 Cluster (com.google.cloud.dataproc.v1.Cluster)1 ClusterConfig (com.google.cloud.dataproc.v1.ClusterConfig)1 ClusterOperationMetadata (com.google.cloud.dataproc.v1.ClusterOperationMetadata)1 DiskConfig (com.google.cloud.dataproc.v1.DiskConfig)1 GceClusterConfig (com.google.cloud.dataproc.v1.GceClusterConfig)1 InstanceGroupConfig (com.google.cloud.dataproc.v1.InstanceGroupConfig)1 SoftwareConfig (com.google.cloud.dataproc.v1.SoftwareConfig)1 SSHPublicKey (io.cdap.cdap.runtime.spi.ssh.SSHPublicKey)1 HashMap (java.util.HashMap)1 ExecutionException (java.util.concurrent.ExecutionException)1