Use of com.google.cloud.dataproc.v1beta2.SubmitJobRequest in project cdap by caskdata.
From the class DataprocRuntimeJobManager, method launch.
@Override
public void launch(RuntimeJobInfo runtimeJobInfo) throws Exception {
  String bucket = DataprocUtils.getBucketName(this.bucket);
  ProgramRunInfo runInfo = runtimeJobInfo.getProgramRunInfo();
  LOG.debug("Launching run {} with following configurations: cluster {}, project {}, region {}, bucket {}.",
            runInfo.getRun(), clusterName, projectId, region, bucket);
  // TODO: CDAP-16408 use fixed directory for caching twill, application, artifact jars
  File tempDir = Files.createTempDirectory("dataproc.launcher").toFile();
  // On the Dataproc bucket, the run root will be <bucket>/cdap-job/<runid>/.
  // All the files for this run will be copied under that base dir.
  String runRootPath = getPath(DataprocUtils.CDAP_GCS_ROOT, runInfo.getRun());
  try {
    // step 1: build twill.jar and launcher.jar and add them to the files to be copied to GCS
    List<LocalFile> localFiles = getRuntimeLocalFiles(runtimeJobInfo.getLocalizeFiles(), tempDir);
    // step 2: upload all the necessary files to GCS so that they are available to the Dataproc job
    List<Future<LocalFile>> uploadFutures = new ArrayList<>();
    for (LocalFile fileToUpload : localFiles) {
      String targetFilePath = getPath(runRootPath, fileToUpload.getName());
      uploadFutures.add(
        provisionerContext.execute(() -> uploadFile(bucket, targetFilePath, fileToUpload)).toCompletableFuture());
    }
    List<LocalFile> uploadedFiles = new ArrayList<>();
    for (Future<LocalFile> uploadFuture : uploadFutures) {
      uploadedFiles.add(uploadFuture.get());
    }
    // step 3: build the Hadoop job request to be submitted to Dataproc
    SubmitJobRequest request = getSubmitJobRequest(runtimeJobInfo, uploadedFiles);
    // step 4: submit the Hadoop job to Dataproc
    try {
      Job job = getJobControllerClient().submitJob(request);
      LOG.debug("Successfully submitted hadoop job {} to cluster {}.", job.getReference().getJobId(), clusterName);
    } catch (AlreadyExistsException ex) {
      // the job id already exists, ignore the job
      LOG.warn("The dataproc job {} already exists. Ignoring resubmission of the job.",
               request.getJob().getReference().getJobId());
    }
    DataprocUtils.emitMetric(provisionerContext, region, "provisioner.submitJob.response.count");
  } catch (Exception e) {
    // delete all uploaded GCS files in case of exception
    DataprocUtils.deleteGCSPath(getStorageClient(), bucket, runRootPath);
    DataprocUtils.emitMetric(provisionerContext, region, "provisioner.submitJob.response.count", e);
    throw new Exception(String.format("Error while launching job %s on cluster %s", getJobId(runInfo), clusterName), e);
  } finally {
    // delete the local temp directory
    deleteDirectoryContents(tempDir);
  }
}
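The uploadFile helper invoked in step 2 is not part of this snippet; its signature in launch() shows it returns a Twill LocalFile for the uploaded object. The sketch below is only an assumption of what such an upload could look like with the google-cloud-storage client: the method name uploadToGcs, the use of default credentials, and returning a plain gs:// URI instead of a LocalFile are illustrative choices, not the project's actual implementation.

  import java.io.File;
  import java.io.IOException;
  import java.nio.file.Paths;

  import com.google.cloud.storage.BlobId;
  import com.google.cloud.storage.BlobInfo;
  import com.google.cloud.storage.Storage;
  import com.google.cloud.storage.StorageOptions;

  // Hypothetical sketch only: the real uploadFile(...) wraps the result back into a
  // Twill LocalFile; here the resulting gs:// URI is returned directly.
  static String uploadToGcs(String bucket, String targetFilePath, File file) throws IOException {
    Storage storage = StorageOptions.getDefaultInstance().getService();
    BlobInfo blobInfo = BlobInfo.newBuilder(BlobId.of(bucket, targetFilePath)).build();
    // createFrom streams the file content instead of loading it fully into memory
    storage.createFrom(blobInfo, Paths.get(file.getAbsolutePath()));
    return String.format("gs://%s/%s", bucket, targetFilePath);
  }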
Use of com.google.cloud.dataproc.v1beta2.SubmitJobRequest in project cdap by caskdata.
From the class DataprocRuntimeJobManager, method getSubmitJobRequest.
/**
 * Creates and returns a Dataproc job submit request.
 */
private SubmitJobRequest getSubmitJobRequest(RuntimeJobInfo runtimeJobInfo, List<LocalFile> localFiles) {
  ProgramRunInfo runInfo = runtimeJobInfo.getProgramRunInfo();
  String runId = runInfo.getRun();
  // DataprocJobMain arguments are passed as --<key>=<value> flags: the runtime job class,
  // the Spark compat version, the archive files, and any JVM properties.
  List<String> arguments = new ArrayList<>();
  arguments.add("--" + DataprocJobMain.RUNTIME_JOB_CLASS + "=" + runtimeJobInfo.getRuntimeJobClassname());
  arguments.add("--" + DataprocJobMain.SPARK_COMPAT + "=" + provisionerContext.getSparkCompat().getCompat());
  localFiles.stream()
    .filter(LocalFile::isArchive)
    .map(f -> "--" + DataprocJobMain.ARCHIVE + "=" + f.getName())
    .forEach(arguments::add);
  for (Map.Entry<String, String> entry : runtimeJobInfo.getJvmProperties().entrySet()) {
    arguments.add("--" + DataprocJobMain.PROPERTY_PREFIX + entry.getKey() + "=\"" + entry.getValue() + "\"");
  }
  Map<String, String> properties = new LinkedHashMap<>();
  properties.put(CDAP_RUNTIME_NAMESPACE, runInfo.getNamespace());
  properties.put(CDAP_RUNTIME_APPLICATION, runInfo.getApplication());
  properties.put(CDAP_RUNTIME_VERSION, runInfo.getVersion());
  properties.put(CDAP_RUNTIME_PROGRAM, runInfo.getProgram());
  properties.put(CDAP_RUNTIME_PROGRAM_TYPE, runInfo.getProgramType());
  properties.put(CDAP_RUNTIME_RUNID, runId);
  HadoopJob.Builder hadoopJobBuilder = HadoopJob.newBuilder()
    .setMainClass(DataprocJobMain.class.getName())
    .addAllArgs(arguments)
    .putAllProperties(properties);
  for (LocalFile localFile : localFiles) {
    // jars go to jarFileUris, everything else to fileUris
    URI uri = localFile.getURI();
    if (localFile.getName().endsWith("jar")) {
      hadoopJobBuilder.addJarFileUris(uri.toString());
    } else {
      hadoopJobBuilder.addFileUris(uri.toString());
    }
  }
  return SubmitJobRequest.newBuilder()
    .setRegion(region)
    .setProjectId(projectId)
    .setJob(Job.newBuilder()
              .setReference(JobReference.newBuilder().setJobId(getJobId(runInfo)))
              .setPlacement(JobPlacement.newBuilder().setClusterName(clusterName).build())
              .putAllLabels(labels)
              .putLabels(LABEL_CDAP_PROGRAM, runInfo.getProgram().toLowerCase())
              .putLabels(LABEL_CDAP_PROGRAM_TYPE, runInfo.getProgramType().toLowerCase())
              .setHadoopJob(hadoopJobBuilder.build())
              .build())
    .build();
}
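The program name and type are lower-cased before being attached as labels because Dataproc label values may contain only lowercase letters, digits, hyphens, and underscores. The sketch below is an assumption about what getJobControllerClient() plus submitJob(request) in launch() roughly amount to: building a v1beta2 JobControllerClient against the regional endpoint and submitting the request. Credentials handling and client caching in the real manager are omitted.

  import java.io.IOException;

  import com.google.cloud.dataproc.v1beta2.Job;
  import com.google.cloud.dataproc.v1beta2.JobControllerClient;
  import com.google.cloud.dataproc.v1beta2.JobControllerSettings;
  import com.google.cloud.dataproc.v1beta2.SubmitJobRequest;

  // Hypothetical sketch: submit an already-built SubmitJobRequest to the regional endpoint.
  static Job submit(String region, SubmitJobRequest request) throws IOException {
    JobControllerSettings settings = JobControllerSettings.newBuilder()
      // Dataproc jobs must go to the region-specific endpoint, e.g. us-central1-dataproc.googleapis.com:443
      .setEndpoint(region + "-dataproc.googleapis.com:443")
      .build();
    try (JobControllerClient client = JobControllerClient.create(settings)) {
      return client.submitJob(request);
    }
  }

Because the job id is derived from the run via getJobId(runInfo), resubmitting the same run raises AlreadyExistsException, which launch() above deliberately catches and ignores.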