Use of com.google.api.services.dataflow.model.Job in the Apache Beam project — class DataflowPipelineTranslatorTest, method testZoneConfig.
@Test
public void testZoneConfig() throws IOException {
  // A zone configured on the pipeline options must propagate to every
  // worker pool of the translated Dataflow job.
  final String expectedZone = "test-zone-1";
  DataflowPipelineOptions pipelineOptions = buildPipelineOptions();
  pipelineOptions.setZone(expectedZone);

  Pipeline pipeline = buildPipeline(pipelineOptions);
  pipeline.traverseTopologically(new RecordingPipelineVisitor());

  DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(pipelineOptions);
  Job translatedJob =
      translator
          .translate(
              pipeline,
              DataflowRunner.fromOptions(pipelineOptions),
              Collections.<DataflowPackage>emptyList())
          .getJob();

  // Exactly one worker pool is expected, carrying the configured zone.
  assertEquals(1, translatedJob.getEnvironment().getWorkerPools().size());
  assertEquals(expectedZone, translatedJob.getEnvironment().getWorkerPools().get(0).getZone());
}
Use of com.google.api.services.dataflow.model.Job in the Apache Beam project — class DataflowPipelineTranslatorTest, method buildMockDataflow.
/**
 * Builds a mocked {@link Dataflow} client whose job-creation chain succeeds only for
 * project "someProject" with a {@link Job} accepted by {@code jobMatcher}; executing the
 * request yields a job whose id is "newid".
 */
private static Dataflow buildMockDataflow(ArgumentMatcher<Job> jobMatcher) throws IOException {
  Dataflow dataflow = mock(Dataflow.class);
  Dataflow.Projects projects = mock(Dataflow.Projects.class);
  Dataflow.Projects.Jobs jobs = mock(Dataflow.Projects.Jobs.class);
  Dataflow.Projects.Jobs.Create createRequest = mock(Dataflow.Projects.Jobs.Create.class);

  // Wire up the fluent call chain: dataflow.projects().jobs().create(...).execute().
  when(dataflow.projects()).thenReturn(projects);
  when(projects.jobs()).thenReturn(jobs);
  when(jobs.create(eq("someProject"), argThat(jobMatcher))).thenReturn(createRequest);

  Job createdJob = new Job();
  createdJob.setId("newid");
  when(createRequest.execute()).thenReturn(createdJob);

  return dataflow;
}
Use of com.google.api.services.dataflow.model.Job in the Apache Beam project — class DataflowRunner, method run.
@Override
public DataflowPipelineJob run(Pipeline pipeline) {
  // Translates the pipeline, submits it to the Dataflow service, and returns a
  // DataflowPipelineJob handle for monitoring. When a job file or template
  // location is configured, the job spec is written out instead; a template
  // returns a DataflowTemplateJob without submitting anything.
  logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
  // Any unbounded PCollection in the graph forces streaming execution.
  if (containsUnboundedPCollection(pipeline)) {
    options.setStreaming(true);
  }
  replaceTransforms(pipeline);
  LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications " + "related to Google Compute Engine usage and other Google Cloud Services.");
  List<DataflowPackage> packages = options.getStager().stageFiles();
  // Set a unique client_request_id in the CreateJob request.
  // This is used to ensure idempotence of job creation across retried
  // attempts to create a job. Specifically, if the service returns a job with
  // a different client_request_id, it means the returned one is a different
  // job previously created with the same job name, and that the job creation
  // has been effectively rejected. The SDK should return
  // Error::Already_Exists to user in that case.
  int randomNum = new Random().nextInt(9000) + 1000;
  // NOTE(review): the trailing "mmm" in the pattern repeats minute-of-hour
  // rather than milliseconds ("SSS"). Harmless for a uniqueness suffix since a
  // random number is appended, but confirm intent before changing.
  String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC).print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
  // Try to create a debuggee ID. This must happen before the job is translated since it may
  // update the options.
  DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
  maybeRegisterDebuggee(dataflowOptions, requestId);
  JobSpecification jobSpecification = translator.translate(pipeline, this, packages);
  Job newJob = jobSpecification.getJob();
  newJob.setClientRequestId(requestId);
  ReleaseInfo releaseInfo = ReleaseInfo.getReleaseInfo();
  String version = releaseInfo.getVersion();
  // A literal "${pom.version}" means the build never substituted the property.
  checkState(!version.equals("${pom.version}"), "Unable to submit a job to the Dataflow service with unset version ${pom.version}");
  System.out.println("Dataflow SDK version: " + version);
  newJob.getEnvironment().setUserAgent((Map) releaseInfo.getProperties());
  // The GCP temp location, when set, must pass path validation before being
  // used as the job's temp storage prefix.
  if (!isNullOrEmpty(options.getGcpTempLocation())) {
    newJob.getEnvironment().setTempStoragePrefix(dataflowOptions.getPathValidator().verifyPath(options.getGcpTempLocation()));
  }
  newJob.getEnvironment().setDataset(options.getTempDatasetId());
  newJob.getEnvironment().setExperiments(options.getExperiments());
  // Set the Docker container image that executes Dataflow worker harness, residing in Google
  // Container Registry. Translator is guaranteed to create a worker pool prior to this point.
  String workerHarnessContainerImage = getContainerImageForJob(options);
  for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
    workerPool.setWorkerHarnessContainerImage(workerHarnessContainerImage);
  }
  newJob.getEnvironment().setVersion(getEnvironmentVersion(options));
  if (hooks != null) {
    hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
  }
  // Write the job spec to a file when --dataflowJobFile or --templateLocation
  // is set; the two flags are mutually exclusive.
  if (!isNullOrEmpty(options.getDataflowJobFile()) || !isNullOrEmpty(options.getTemplateLocation())) {
    boolean isTemplate = !isNullOrEmpty(options.getTemplateLocation());
    if (isTemplate) {
      checkArgument(isNullOrEmpty(options.getDataflowJobFile()), "--dataflowJobFile and --templateLocation are mutually exclusive.");
    }
    String fileLocation = firstNonNull(options.getTemplateLocation(), options.getDataflowJobFile());
    checkArgument(fileLocation.startsWith("/") || fileLocation.startsWith("gs://"), "Location must be local or on Cloud Storage, got %s.", fileLocation);
    ResourceId fileResource = FileSystems.matchNewResource(fileLocation, false);
    String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
    try (PrintWriter printWriter = new PrintWriter(Channels.newOutputStream(FileSystems.create(fileResource, MimeTypes.TEXT)))) {
      printWriter.print(workSpecJson);
      LOG.info("Printed job specification to {}", fileLocation);
    } catch (IOException ex) {
      // A failed write is fatal for a template but only a warning for a job file.
      String error = String.format("Cannot create output file at %s", fileLocation);
      if (isTemplate) {
        throw new RuntimeException(error, ex);
      } else {
        LOG.warn(error, ex);
      }
    }
    if (isTemplate) {
      // Template creation stops here — the job is never submitted.
      LOG.info("Template successfully created.");
      return new DataflowTemplateJob();
    }
  }
  String jobIdToUpdate = null;
  if (options.isUpdate()) {
    // For --update, resolve the currently running job's id so the service
    // replaces it instead of creating a duplicate.
    jobIdToUpdate = getJobIdFromName(options.getJobName());
    newJob.setTransformNameMapping(options.getTransformNameMapping());
    newJob.setReplaceJobId(jobIdToUpdate);
  }
  Job jobResult;
  try {
    jobResult = dataflowClient.createJob(newJob);
  } catch (GoogleJsonResponseException e) {
    String errorMessages = "Unexpected errors";
    if (e.getDetails() != null) {
      // An oversized serialized job is a common failure; surface a targeted
      // message instead of the raw service error.
      if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
        errorMessages = "The size of the serialized JSON representation of the pipeline " + "exceeds the allowable limit. " + "For more information, please check the FAQ link below:\n" + "https://cloud.google.com/dataflow/faq";
      } else {
        errorMessages = e.getDetails().getMessage();
      }
    }
    throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
  } catch (IOException e) {
    throw new RuntimeException("Failed to create a workflow job", e);
  }
  // Use a raw client for post-launch monitoring, as status calls may fail
  // regularly and need not be retried automatically.
  DataflowPipelineJob dataflowPipelineJob = new DataflowPipelineJob(DataflowClient.create(options), jobResult.getId(), options, jobSpecification.getStepNames());
  // A mismatched client_request_id means the service returned an existing job
  // with the same name rather than creating ours; raise "already updated" or
  // "already exists" depending on whether this is a reload or not.
  if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty() && !jobResult.getClientRequestId().equals(requestId)) {
    // If updating a job.
    if (options.isUpdate()) {
      throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob, String.format("The job named %s with id: %s has already been updated into job id: %s " + "and cannot be updated again.", newJob.getName(), jobIdToUpdate, jobResult.getId()));
    } else {
      throw new DataflowJobAlreadyExistsException(dataflowPipelineJob, String.format("There is already an active job named %s with id: %s. If you want " + "to submit a second job, try again by setting a different name using --jobName.", newJob.getName(), jobResult.getId()));
    }
  }
  LOG.info("To access the Dataflow monitoring console, please navigate to {}", MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
  System.out.println("Submitted job: " + jobResult.getId());
  LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}", MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
  return dataflowPipelineJob;
}
Use of com.google.api.services.dataflow.model.Job in the Apache Beam project — class DataflowRunnerTest, method testUpdateAlreadyUpdatedPipeline.
@Test
public void testUpdateAlreadyUpdatedPipeline() throws IOException {
  // When the service returns a job whose client_request_id differs from ours,
  // an --update submission must fail with DataflowJobAlreadyUpdatedException.
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setUpdate(true);
  options.setJobName("oldJobName");

  Dataflow dataflow = options.getDataflowClient();
  Dataflow.Projects.Locations.Jobs.Create createRequest =
      mock(Dataflow.Projects.Locations.Jobs.Create.class);
  when(dataflow.projects().locations().jobs().create(eq(PROJECT_ID), eq(REGION_ID), any(Job.class)))
      .thenReturn(createRequest);

  final Job returnedJob = new Job();
  returnedJob.setId("newid");
  // Return a different request id.
  returnedJob.setClientRequestId("different_request_id");
  when(createRequest.execute()).thenReturn(returnedJob);

  Pipeline pipeline = buildDataflowPipeline(options);

  thrown.expect(DataflowJobAlreadyUpdatedException.class);
  // The exception must carry the job id the service reported back.
  thrown.expect(
      new TypeSafeMatcher<DataflowJobAlreadyUpdatedException>() {
        @Override
        public void describeTo(Description description) {
          description.appendText("Expected job ID: " + returnedJob.getId());
        }

        @Override
        protected boolean matchesSafely(DataflowJobAlreadyUpdatedException item) {
          return returnedJob.getId().equals(item.getJob().getJobId());
        }
      });
  thrown.expectMessage(
      "The job named oldjobname with id: oldJobId has already been updated "
          + "into job id: newid and cannot be updated again.");

  pipeline.run();
}
Use of com.google.api.services.dataflow.model.Job in the Apache Beam project — class DataflowRunnerTest, method testNonExistentStagingLocation.
@Test
public void testNonExistentStagingLocation() throws IOException {
  // fromOptions must reject a staging location that does not exist or is not writeable.
  DataflowPipelineOptions options = buildPipelineOptions();
  options.setStagingLocation(NON_EXISTENT_BUCKET);
  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage(containsString("Output path does not exist or is not writeable: " + NON_EXISTENT_BUCKET));
  DataflowRunner.fromOptions(options);
  // NOTE(review): the expected IllegalArgumentException is thrown by the call
  // above, so the verification below never executes — it looks like leftover
  // boilerplate copied from another test; confirm before removing.
  ArgumentCaptor<Job> jobCaptor = ArgumentCaptor.forClass(Job.class);
  Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture());
  assertValidJob(jobCaptor.getValue());
}
Aggregations