Search in sources :

Example 1 with HadoopJob

Use of com.google.cloud.dataproc.v1.HadoopJob in the project java-dataproc by googleapis.

From the class InstantiateInlineWorkflowTemplate, the method instantiateInlineWorkflowTemplate:

/**
 * Instantiates an inline workflow template that runs a teragen job followed by a dependent
 * terasort job on an auto-created managed cluster, then blocks until the workflow finishes.
 *
 * @param projectId the Google Cloud project to run the workflow in
 * @param region the Dataproc region, also used to build the regional service endpoint
 * @throws IOException if the workflow template service client cannot be created
 * @throws InterruptedException if the wait for workflow completion is interrupted
 */
public static void instantiateInlineWorkflowTemplate(String projectId, String region) throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);
    // Configure the settings for the workflow template service client.
    WorkflowTemplateServiceSettings workflowTemplateServiceSettings = WorkflowTemplateServiceSettings.newBuilder().setEndpoint(myEndpoint).build();
    // The try-with-resources closes the client automatically, but this can also be done
    // manually with the .close() method.
    try (WorkflowTemplateServiceClient workflowTemplateServiceClient = WorkflowTemplateServiceClient.create(workflowTemplateServiceSettings)) {
        // Configure the jobs within the workflow: teragen generates sample data under
        // hdfs:///gen/, and terasort (below) sorts it into hdfs:///sort/.
        HadoopJob teragenHadoopJob = HadoopJob.newBuilder().setMainJarFileUri("file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar").addArgs("teragen").addArgs("1000").addArgs("hdfs:///gen/").build();
        OrderedJob teragen = OrderedJob.newBuilder().setHadoopJob(teragenHadoopJob).setStepId("teragen").build();
        HadoopJob terasortHadoopJob = HadoopJob.newBuilder().setMainJarFileUri("file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar").addArgs("terasort").addArgs("hdfs:///gen/").addArgs("hdfs:///sort/").build();
        // The prerequisite step id makes terasort run only after teragen succeeds.
        OrderedJob terasort = OrderedJob.newBuilder().setHadoopJob(terasortHadoopJob).addPrerequisiteStepIds("teragen").setStepId("terasort").build();
        // Configure the cluster placement for the workflow.
        // Leave "ZoneUri" empty for "Auto Zone Placement".
        // GceClusterConfig gceClusterConfig =
        // GceClusterConfig.newBuilder().setZoneUri("").build();
        GceClusterConfig gceClusterConfig = GceClusterConfig.newBuilder().setZoneUri("us-central1-a").build();
        ClusterConfig clusterConfig = ClusterConfig.newBuilder().setGceClusterConfig(gceClusterConfig).build();
        ManagedCluster managedCluster = ManagedCluster.newBuilder().setClusterName("my-managed-cluster").setConfig(clusterConfig).build();
        WorkflowTemplatePlacement workflowTemplatePlacement = WorkflowTemplatePlacement.newBuilder().setManagedCluster(managedCluster).build();
        // Create the inline workflow template.
        WorkflowTemplate workflowTemplate = WorkflowTemplate.newBuilder().addJobs(teragen).addJobs(terasort).setPlacement(workflowTemplatePlacement).build();
        // Submit the instantiated inline workflow template request and wait for completion.
        String parent = RegionName.format(projectId, region);
        OperationFuture<Empty, WorkflowMetadata> instantiateInlineWorkflowTemplateAsync = workflowTemplateServiceClient.instantiateInlineWorkflowTemplateAsync(parent, workflowTemplate);
        instantiateInlineWorkflowTemplateAsync.get();
        // Print out a success message. println, not printf: the message contains no format
        // specifiers, and printf would misbehave if a '%' were ever added to the text.
        System.out.println("Workflow ran successfully.");
    } catch (ExecutionException e) {
        // If the workflow does not complete successfully, print the error message.
        System.err.println(String.format("Error running workflow: %s ", e.getMessage()));
    }
}
Also used : GceClusterConfig(com.google.cloud.dataproc.v1.GceClusterConfig) WorkflowMetadata(com.google.cloud.dataproc.v1.WorkflowMetadata) HadoopJob(com.google.cloud.dataproc.v1.HadoopJob) WorkflowTemplateServiceClient(com.google.cloud.dataproc.v1.WorkflowTemplateServiceClient) OrderedJob(com.google.cloud.dataproc.v1.OrderedJob) WorkflowTemplate(com.google.cloud.dataproc.v1.WorkflowTemplate) Empty(com.google.protobuf.Empty) ManagedCluster(com.google.cloud.dataproc.v1.ManagedCluster) WorkflowTemplatePlacement(com.google.cloud.dataproc.v1.WorkflowTemplatePlacement) WorkflowTemplateServiceSettings(com.google.cloud.dataproc.v1.WorkflowTemplateServiceSettings) ExecutionException(java.util.concurrent.ExecutionException) ClusterConfig(com.google.cloud.dataproc.v1.ClusterConfig) GceClusterConfig(com.google.cloud.dataproc.v1.GceClusterConfig)

Example 2 with HadoopJob

Use of com.google.cloud.dataproc.v1.HadoopJob in the project java-dataproc by googleapis.

From the class SubmitHadoopFsJob, the method submitHadoopFsJob:

/**
 * Submits a Hadoop FS job (org.apache.hadoop.fs.FsShell) to an existing Dataproc cluster,
 * waits for it to finish, and prints the driver output fetched from Google Cloud Storage.
 *
 * @param projectId the Google Cloud project the cluster belongs to
 * @param region the Dataproc region, also used to build the regional service endpoint
 * @param clusterName the existing cluster to run the job on
 * @param hadoopFsQuery the FsShell arguments, split into a list by stringToList
 * @throws IOException if the job controller client cannot be created
 * @throws InterruptedException if the wait for job completion is interrupted
 */
public static void submitHadoopFsJob(String projectId, String region, String clusterName, String hadoopFsQuery) throws IOException, InterruptedException {
    String myEndpoint = String.format("%s-dataproc.googleapis.com:443", region);
    // Configure the settings for the job controller client.
    JobControllerSettings jobControllerSettings = JobControllerSettings.newBuilder().setEndpoint(myEndpoint).build();
    // The try-with-resources closes the client automatically,
    // but this can also be done manually with the .close() method.
    try (JobControllerClient jobControllerClient = JobControllerClient.create(jobControllerSettings)) {
        // Configure cluster placement for the job.
        JobPlacement jobPlacement = JobPlacement.newBuilder().setClusterName(clusterName).build();
        // Configure Hadoop job settings. The HadoopFS query is set here.
        HadoopJob hadoopJob = HadoopJob.newBuilder().setMainClass("org.apache.hadoop.fs.FsShell").addAllArgs(stringToList(hadoopFsQuery)).build();
        Job job = Job.newBuilder().setPlacement(jobPlacement).setHadoopJob(hadoopJob).build();
        // Submit an asynchronous request to execute the job and block until it completes.
        OperationFuture<Job, JobMetadata> submitJobAsOperationAsyncRequest = jobControllerClient.submitJobAsOperationAsync(projectId, region, job);
        Job response = submitJobAsOperationAsyncRequest.get();
        // Extract the bucket and object prefix of the driver output from its gs:// URI.
        Matcher matches = Pattern.compile("gs://(.*?)/(.*)").matcher(response.getDriverOutputResourceUri());
        // Bug fix: the boolean result of matches() was previously ignored, so a
        // non-matching URI made group() throw IllegalStateException. Fail explicitly instead.
        if (!matches.matches()) {
            System.err.println(String.format("Unexpected driver output URI: %s", response.getDriverOutputResourceUri()));
            return;
        }
        Storage storage = StorageOptions.getDefaultInstance().getService();
        Blob blob = storage.get(matches.group(1), String.format("%s.000000000", matches.group(2)));
        // Decode the output explicitly as UTF-8 instead of the platform default charset,
        // which pre-Java-18 varies by OS/locale.
        System.out.println(String.format("Job finished successfully: %s", new String(blob.getContent(), java.nio.charset.StandardCharsets.UTF_8)));
    } catch (ExecutionException e) {
        // If the job does not complete successfully, print the error message.
        System.err.println(String.format("submitHadoopFSJob: %s ", e.getMessage()));
    }
}
Also used : JobControllerSettings(com.google.cloud.dataproc.v1.JobControllerSettings) HadoopJob(com.google.cloud.dataproc.v1.HadoopJob) JobMetadata(com.google.cloud.dataproc.v1.JobMetadata) Blob(com.google.cloud.storage.Blob) Storage(com.google.cloud.storage.Storage) Matcher(java.util.regex.Matcher) JobPlacement(com.google.cloud.dataproc.v1.JobPlacement) JobControllerClient(com.google.cloud.dataproc.v1.JobControllerClient) HadoopJob(com.google.cloud.dataproc.v1.HadoopJob) Job(com.google.cloud.dataproc.v1.Job) ExecutionException(java.util.concurrent.ExecutionException)

Aggregations

HadoopJob (com.google.cloud.dataproc.v1.HadoopJob)2 ExecutionException (java.util.concurrent.ExecutionException)2 ClusterConfig (com.google.cloud.dataproc.v1.ClusterConfig)1 GceClusterConfig (com.google.cloud.dataproc.v1.GceClusterConfig)1 Job (com.google.cloud.dataproc.v1.Job)1 JobControllerClient (com.google.cloud.dataproc.v1.JobControllerClient)1 JobControllerSettings (com.google.cloud.dataproc.v1.JobControllerSettings)1 JobMetadata (com.google.cloud.dataproc.v1.JobMetadata)1 JobPlacement (com.google.cloud.dataproc.v1.JobPlacement)1 ManagedCluster (com.google.cloud.dataproc.v1.ManagedCluster)1 OrderedJob (com.google.cloud.dataproc.v1.OrderedJob)1 WorkflowMetadata (com.google.cloud.dataproc.v1.WorkflowMetadata)1 WorkflowTemplate (com.google.cloud.dataproc.v1.WorkflowTemplate)1 WorkflowTemplatePlacement (com.google.cloud.dataproc.v1.WorkflowTemplatePlacement)1 WorkflowTemplateServiceClient (com.google.cloud.dataproc.v1.WorkflowTemplateServiceClient)1 WorkflowTemplateServiceSettings (com.google.cloud.dataproc.v1.WorkflowTemplateServiceSettings)1 Blob (com.google.cloud.storage.Blob)1 Storage (com.google.cloud.storage.Storage)1 Empty (com.google.protobuf.Empty)1 Matcher (java.util.regex.Matcher)1