Search in sources :

Example 1 with JobControllerSettings

Use of com.google.cloud.dataproc.v1.JobControllerSettings in project java-dataproc by googleapis.

From the class SubmitJob, method submitJob.

/**
 * Submits the SparkPi example as a Spark job to the named Dataproc cluster, waits for the submit
 * operation to complete, and prints the driver output read from Google Cloud Storage.
 *
 * <p>If the job does not complete successfully, the resulting {@link ExecutionException} is
 * caught and its message is printed to standard error.
 *
 * @param projectId the project ID passed to the job submission request
 * @param region the region used both for the regional endpoint and the job submission request
 * @param clusterName the name of the cluster the job is placed on
 * @throws IOException declared by the settings/client creation calls
 * @throws InterruptedException if the thread is interrupted while waiting for the job
 * @throws IllegalStateException if the driver output URI is not a {@code gs://bucket/object}
 *     URI, or the driver output object cannot be found
 */
public static void submitJob(String projectId, String region, String clusterName) throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);
    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings = JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    // Create a job controller client with the configured settings. The try-with-resources
    // statement closes the client, but this can also be done manually with the .close() method.
    try (JobControllerClient jobControllerClient = JobControllerClient.create(jobControllerSettings)) {
        // Configure cluster placement for the job.
        JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();
        // Configure Spark job settings.
        SparkJob sparkJob = SparkJob.newBuilder().setMainClass("org.apache.spark.examples.SparkPi").addJarFileUris("file:///usr/lib/spark/examples/jars/spark-examples.jar").addArgs("1000").build();
        Job job = Job.newBuilder().setPlacement(jobPlacement).setSparkJob(sparkJob).build();
        // Submit an asynchronous request to execute the job.
        OperationFuture<Job, JobMetadata> submitJobAsOperationAsyncRequest = jobControllerClient.submitJobAsOperationAsync(projectId, region, job);
        Job response = submitJobAsOperationAsyncRequest.get();
        // Print output from Google Cloud Storage.
        String driverOutputUri = response.getDriverOutputResourceUri();
        Matcher matches = Pattern.compile("gs://(.*?)/(.*)").matcher(driverOutputUri);
        // group() is only valid after a successful match, so fail with a descriptive message
        // instead of the generic "No match found".
        if (!matches.matches()) {
            throw new IllegalStateException(String.format("Unexpected driver output URI: %s", driverOutputUri));
        }
        String bucketName = matches.group(1);
        String objectName = String.format("%s.000000000", matches.group(2));
        Storage storage = StorageOptions.getDefaultInstance().getService();
        Blob blob = storage.get(bucketName, objectName);
        // Storage.get returns null when the object does not exist.
        if (blob == null) {
            throw new IllegalStateException(String.format("Driver output not found: gs://%s/%s", bucketName, objectName));
        }
        // Decode explicitly rather than relying on the platform default charset.
        // NOTE(review): assumes the driver output is UTF-8 text - confirm.
        String driverOutput = new String(blob.getContent(), java.nio.charset.StandardCharsets.UTF_8);
        System.out.println(String.format("Job finished successfully: %s", driverOutput));
    } catch (ExecutionException e) {
        // If the job does not complete successfully, print the error message.
        System.err.println(String.format("submitJob: %s ", e.getMessage()));
    }
}
Also used : JobControllerSettings(com.google.cloud.dataproc.v1.JobControllerSettings) JobMetadata(com.google.cloud.dataproc.v1.JobMetadata) Blob(com.google.cloud.storage.Blob) Storage(com.google.cloud.storage.Storage) Matcher(java.util.regex.Matcher) JobPlacement(com.google.cloud.dataproc.v1.JobPlacement) SparkJob(com.google.cloud.dataproc.v1.SparkJob) JobControllerClient(com.google.cloud.dataproc.v1.JobControllerClient) SparkJob(com.google.cloud.dataproc.v1.SparkJob) Job(com.google.cloud.dataproc.v1.Job) ExecutionException(java.util.concurrent.ExecutionException)

Example 2 with JobControllerSettings

Use of com.google.cloud.dataproc.v1.JobControllerSettings in project java-dataproc by googleapis.

From the class Quickstart, method quickstart.

/**
 * Creates a Dataproc cluster, runs a PySpark job on it, prints the job's driver output read from
 * Google Cloud Storage, and deletes the cluster.
 *
 * <p>Once the cluster has been created it is deleted even when the job submission or the output
 * retrieval fails, so a failed run does not leave the cluster behind. An
 * {@link ExecutionException} from any of the awaited operations is caught and its message is
 * printed to standard error.
 *
 * @param projectId the project ID passed to the cluster and job requests
 * @param region the region used both for the regional endpoint and the requests
 * @param clusterName the name of the cluster to create, run the job on, and delete
 * @param jobFilePath the URI set as the PySpark job's main Python file
 * @throws IOException declared by the settings/client creation calls
 * @throws InterruptedException if the thread is interrupted while waiting for an operation
 * @throws IllegalStateException if the driver output URI is not a {@code gs://bucket/object}
 *     URI, or the driver output object cannot be found
 */
public static void quickstart(String projectId, String region, String clusterName, String jobFilePath) throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);
    // Configure the settings for the cluster controller client.
    ClusterControllerSettings clusterControllerSettings = ClusterControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings = JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    // Create both clients with the configured settings. The try-with-resources statement closes
    // the clients, but this can also be done manually with the .close() method.
    try (ClusterControllerClient clusterControllerClient = ClusterControllerClient.create(clusterControllerSettings);
        JobControllerClient jobControllerClient = JobControllerClient.create(jobControllerSettings)) {
        // Configure the settings for our cluster.
        InstanceGroupConfig masterConfig = InstanceGroupConfig.newBuilder().setMachineTypeUri("n1-standard-2").setNumInstances(1).build();
        InstanceGroupConfig workerConfig = InstanceGroupConfig.newBuilder().setMachineTypeUri("n1-standard-2").setNumInstances(2).build();
        ClusterConfig clusterConfig = ClusterConfig.newBuilder().setMasterConfig(masterConfig).setWorkerConfig(workerConfig).build();
        // Create the cluster object with the desired cluster config.
        Cluster cluster = Cluster.newBuilder().setClusterName(clusterName).setConfig(clusterConfig).build();
        // Create the Cloud Dataproc cluster.
        OperationFuture<Cluster, ClusterOperationMetadata> createClusterAsyncRequest = clusterControllerClient.createClusterAsync(projectId, region, cluster);
        Cluster clusterResponse = createClusterAsyncRequest.get();
        System.out.println(String.format("Cluster created successfully: %s", clusterResponse.getClusterName()));
        // From here on the cluster exists, so make sure it is deleted on every path.
        try {
            // Configure the settings for our job.
            JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();
            PySparkJob pySparkJob = PySparkJob.newBuilder().setMainPythonFileUri(jobFilePath).build();
            Job job = Job.newBuilder().setPlacement(jobPlacement).setPysparkJob(pySparkJob).build();
            // Submit an asynchronous request to execute the job.
            OperationFuture<Job, JobMetadata> submitJobAsOperationAsyncRequest = jobControllerClient.submitJobAsOperationAsync(projectId, region, job);
            Job jobResponse = submitJobAsOperationAsyncRequest.get();
            // Print output from Google Cloud Storage.
            String driverOutputUri = jobResponse.getDriverOutputResourceUri();
            Matcher matches = Pattern.compile("gs://(.*?)/(.*)").matcher(driverOutputUri);
            // group() is only valid after a successful match, so fail with a descriptive
            // message instead of the generic "No match found".
            if (!matches.matches()) {
                throw new IllegalStateException(String.format("Unexpected driver output URI: %s", driverOutputUri));
            }
            String bucketName = matches.group(1);
            String objectName = String.format("%s.000000000", matches.group(2));
            Storage storage = StorageOptions.getDefaultInstance().getService();
            Blob blob = storage.get(bucketName, objectName);
            // Storage.get returns null when the object does not exist.
            if (blob == null) {
                throw new IllegalStateException(String.format("Driver output not found: gs://%s/%s", bucketName, objectName));
            }
            // Decode explicitly rather than relying on the platform default charset.
            // NOTE(review): assumes the driver output is UTF-8 text - confirm.
            String driverOutput = new String(blob.getContent(), java.nio.charset.StandardCharsets.UTF_8);
            System.out.println(String.format("Job finished successfully: %s", driverOutput));
        } finally {
            // Delete the cluster.
            OperationFuture<Empty, ClusterOperationMetadata> deleteClusterAsyncRequest = clusterControllerClient.deleteClusterAsync(projectId, region, clusterName);
            deleteClusterAsyncRequest.get();
            System.out.println(String.format("Cluster \"%s\" successfully deleted.", clusterName));
        }
    } catch (ExecutionException e) {
        // If any awaited operation does not complete successfully, print the error message.
        System.err.println(String.format("quickstart: %s ", e.getMessage()));
    }
}
Also used : JobControllerSettings(com.google.cloud.dataproc.v1.JobControllerSettings) JobMetadata(com.google.cloud.dataproc.v1.JobMetadata) Blob(com.google.cloud.storage.Blob) ClusterOperationMetadata(com.google.cloud.dataproc.v1.ClusterOperationMetadata) Matcher(java.util.regex.Matcher) Cluster(com.google.cloud.dataproc.v1.Cluster) ClusterControllerSettings(com.google.cloud.dataproc.v1.ClusterControllerSettings) PySparkJob(com.google.cloud.dataproc.v1.PySparkJob) Empty(com.google.protobuf.Empty) Storage(com.google.cloud.storage.Storage) ClusterControllerClient(com.google.cloud.dataproc.v1.ClusterControllerClient) JobPlacement(com.google.cloud.dataproc.v1.JobPlacement) JobControllerClient(com.google.cloud.dataproc.v1.JobControllerClient) PySparkJob(com.google.cloud.dataproc.v1.PySparkJob) Job(com.google.cloud.dataproc.v1.Job) ExecutionException(java.util.concurrent.ExecutionException) InstanceGroupConfig(com.google.cloud.dataproc.v1.InstanceGroupConfig) ClusterConfig(com.google.cloud.dataproc.v1.ClusterConfig)

Example 3 with JobControllerSettings

Use of com.google.cloud.dataproc.v1.JobControllerSettings in project java-dataproc by googleapis.

From the class SubmitHadoopFsJob, method submitHadoopFsJob.

/**
 * Submits a Hadoop job that runs {@code org.apache.hadoop.fs.FsShell} with the given query as
 * its arguments on the named Dataproc cluster, waits for the submit operation to complete, and
 * prints the driver output read from Google Cloud Storage.
 *
 * <p>If the job does not complete successfully, the resulting {@link ExecutionException} is
 * caught and its message is printed to standard error.
 *
 * @param projectId the project ID passed to the job submission request
 * @param region the region used both for the regional endpoint and the job submission request
 * @param clusterName the name of the cluster the job is placed on
 * @param hadoopFsQuery the query converted to the job's argument list via {@code stringToList}
 * @throws IOException declared by the settings/client creation calls
 * @throws InterruptedException if the thread is interrupted while waiting for the job
 * @throws IllegalStateException if the driver output URI is not a {@code gs://bucket/object}
 *     URI, or the driver output object cannot be found
 */
public static void submitHadoopFsJob(String projectId, String region, String clusterName, String hadoopFsQuery) throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);
    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings = JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    // Create a job controller client with the configured settings. The try-with-resources
    // statement closes the client, but this can also be done manually with the .close() method.
    try (JobControllerClient jobControllerClient = JobControllerClient.create(jobControllerSettings)) {
        // Configure cluster placement for the job.
        JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();
        // Configure Hadoop job settings. The HadoopFS query is set here.
        HadoopJob hadoopJob = HadoopJob.newBuilder().setMainClass("org.apache.hadoop.fs.FsShell").addAllArgs(stringToList(hadoopFsQuery)).build();
        Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();
        // Submit an asynchronous request to execute the job.
        OperationFuture<Job, JobMetadata> submitJobAsOperationAsyncRequest = jobControllerClient.submitJobAsOperationAsync(projectId, region, job);
        Job response = submitJobAsOperationAsyncRequest.get();
        // Print output from Google Cloud Storage.
        String driverOutputUri = response.getDriverOutputResourceUri();
        Matcher matches = Pattern.compile("gs://(.*?)/(.*)").matcher(driverOutputUri);
        // group() is only valid after a successful match, so fail with a descriptive message
        // instead of the generic "No match found".
        if (!matches.matches()) {
            throw new IllegalStateException(String.format("Unexpected driver output URI: %s", driverOutputUri));
        }
        String bucketName = matches.group(1);
        String objectName = String.format("%s.000000000", matches.group(2));
        Storage storage = StorageOptions.getDefaultInstance().getService();
        Blob blob = storage.get(bucketName, objectName);
        // Storage.get returns null when the object does not exist.
        if (blob == null) {
            throw new IllegalStateException(String.format("Driver output not found: gs://%s/%s", bucketName, objectName));
        }
        // Decode explicitly rather than relying on the platform default charset.
        // NOTE(review): assumes the driver output is UTF-8 text - confirm.
        String driverOutput = new String(blob.getContent(), java.nio.charset.StandardCharsets.UTF_8);
        System.out.println(String.format("Job finished successfully: %s", driverOutput));
    } catch (ExecutionException e) {
        // If the job does not complete successfully, print the error message.
        System.err.println(String.format("submitHadoopFSJob: %s ", e.getMessage()));
    }
}
Also used : JobControllerSettings(com.google.cloud.dataproc.v1.JobControllerSettings) HadoopJob(com.google.cloud.dataproc.v1.HadoopJob) JobMetadata(com.google.cloud.dataproc.v1.JobMetadata) Blob(com.google.cloud.storage.Blob) Storage(com.google.cloud.storage.Storage) Matcher(java.util.regex.Matcher) JobPlacement(com.google.cloud.dataproc.v1.JobPlacement) JobControllerClient(com.google.cloud.dataproc.v1.JobControllerClient) HadoopJob(com.google.cloud.dataproc.v1.HadoopJob) Job(com.google.cloud.dataproc.v1.Job) ExecutionException(java.util.concurrent.ExecutionException)

Example 4 with JobControllerSettings

Use of com.google.cloud.dataproc.v1.JobControllerSettings in project java-pubsublite-spark by googleapis.

From the class SampleTestBase, method runDataprocJob.

/**
 * Submits the given Spark job to the cluster named by {@code clusterName} through the regional
 * Dataproc endpoint for {@code cloudRegion}, and blocks until the submit operation completes.
 *
 * <p>Two jar URIs under {@code gs://bucketName/} (the sample jar and the connector jar) are
 * appended to the supplied builder before the job is built, so the builder is mutated.
 *
 * @param sparkJobBuilder the partially configured Spark job; receives the two jar URIs
 * @return the {@link Job} produced by the completed submit operation
 * @throws Exception if the client cannot be created or the operation fails or is interrupted
 */
protected Job runDataprocJob(SparkJob.Builder sparkJobBuilder) throws Exception {
    JobControllerSettings settings =
        JobControllerSettings.newBuilder()
            .setEndpoint(String.format("%s-dataproc.googleapis.com:443", cloudRegion.value()))
            .build();
    // The client is closed automatically when the try block exits.
    try (JobControllerClient client = JobControllerClient.create(settings)) {
        JobPlacement placement = JobPlacement.newBuilder().setClusterName(clusterName).build();

        // Attach the sample jar first, then the connector jar, both from the same bucket.
        sparkJobBuilder.addJarFileUris(String.format("gs://%s/%s", bucketName, sampleJarNameInGCS));
        sparkJobBuilder.addJarFileUris(String.format("gs://%s/%s", bucketName, connectorJarNameInGCS));

        SparkJob sparkJob = sparkJobBuilder.build();
        Job jobToSubmit = Job.newBuilder().setPlacement(placement).setSparkJob(sparkJob).build();

        OperationFuture<Job, JobMetadata> pendingJob =
            client.submitJobAsOperationAsync(projectId.value(), cloudRegion.value(), jobToSubmit);
        return pendingJob.get();
    }
}
Also used : JobControllerSettings(com.google.cloud.dataproc.v1.JobControllerSettings) JobMetadata(com.google.cloud.dataproc.v1.JobMetadata) JobPlacement(com.google.cloud.dataproc.v1.JobPlacement) JobControllerClient(com.google.cloud.dataproc.v1.JobControllerClient) SparkJob(com.google.cloud.dataproc.v1.SparkJob) Job(com.google.cloud.dataproc.v1.Job)

Aggregations

Job (com.google.cloud.dataproc.v1.Job)4 JobControllerClient (com.google.cloud.dataproc.v1.JobControllerClient)4 JobControllerSettings (com.google.cloud.dataproc.v1.JobControllerSettings)4 JobMetadata (com.google.cloud.dataproc.v1.JobMetadata)4 JobPlacement (com.google.cloud.dataproc.v1.JobPlacement)4 Blob (com.google.cloud.storage.Blob)3 Storage (com.google.cloud.storage.Storage)3 ExecutionException (java.util.concurrent.ExecutionException)3 Matcher (java.util.regex.Matcher)3 SparkJob (com.google.cloud.dataproc.v1.SparkJob)2 Cluster (com.google.cloud.dataproc.v1.Cluster)1 ClusterConfig (com.google.cloud.dataproc.v1.ClusterConfig)1 ClusterControllerClient (com.google.cloud.dataproc.v1.ClusterControllerClient)1 ClusterControllerSettings (com.google.cloud.dataproc.v1.ClusterControllerSettings)1 ClusterOperationMetadata (com.google.cloud.dataproc.v1.ClusterOperationMetadata)1 HadoopJob (com.google.cloud.dataproc.v1.HadoopJob)1 InstanceGroupConfig (com.google.cloud.dataproc.v1.InstanceGroupConfig)1 PySparkJob (com.google.cloud.dataproc.v1.PySparkJob)1 Empty (com.google.protobuf.Empty)1