Example 26 with DataflowPackage

Use of com.google.api.services.dataflow.model.DataflowPackage in project beam by apache.

From the class DataflowPipelineTranslatorTest, method testScalingAlgorithmNone.

@Test
public void testScalingAlgorithmNone() throws IOException {
    final DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType noScaling = DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType.NONE;
    DataflowPipelineOptions options = buildPipelineOptions();
    options.setAutoscalingAlgorithm(noScaling);
    Pipeline p = buildPipeline(options);
    p.traverseTopologically(new RecordingPipelineVisitor());
    Job job = DataflowPipelineTranslator.fromOptions(options).translate(p, DataflowRunner.fromOptions(options), Collections.<DataflowPackage>emptyList()).getJob();
    assertEquals(1, job.getEnvironment().getWorkerPools().size());
    assertEquals("AUTOSCALING_ALGORITHM_NONE", job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getAlgorithm());
    assertEquals(0, job.getEnvironment().getWorkerPools().get(0).getAutoscalingSettings().getMaxNumWorkers().intValue());
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) DataflowPipelineWorkerPoolOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineWorkerPoolOptions) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
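
Outside a test harness, the same setting is applied by building the options directly. The sketch below is illustrative rather than taken from the Beam sources; the setters are real DataflowPipelineOptions / DataflowPipelineWorkerPoolOptions APIs, while the worker count is an arbitrary placeholder.

// A minimal sketch of disabling autoscaling on user-built options.
DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
options.setRunner(DataflowRunner.class);
options.setAutoscalingAlgorithm(
    DataflowPipelineWorkerPoolOptions.AutoscalingAlgorithmType.NONE);
// With NONE, the service keeps the pool at the fixed numWorkers size.
options.setNumWorkers(3); // placeholder worker count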

Example 27 with DataflowPackage

Use of com.google.api.services.dataflow.model.DataflowPackage in project beam by apache.

From the class DataflowPipelineTranslatorTest, method testZoneConfig.

@Test
public void testZoneConfig() throws IOException {
    final String testZone = "test-zone-1";
    DataflowPipelineOptions options = buildPipelineOptions();
    options.setZone(testZone);
    Pipeline p = buildPipeline(options);
    p.traverseTopologically(new RecordingPipelineVisitor());
    Job job = DataflowPipelineTranslator.fromOptions(options).translate(p, DataflowRunner.fromOptions(options), Collections.<DataflowPackage>emptyList()).getJob();
    assertEquals(1, job.getEnvironment().getWorkerPools().size());
    assertEquals(testZone, job.getEnvironment().getWorkerPools().get(0).getZone());
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
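
The same options can also arrive as command-line flags parsed by PipelineOptionsFactory; a hedged sketch, where the zone value is a placeholder rather than anything used by the tests:

// Illustrative flags; they map onto the setters exercised by the tests above.
String[] args = {
    "--runner=DataflowRunner",
    "--zone=us-central1-f", // placeholder zone
    "--autoscalingAlgorithm=NONE"
};
DataflowPipelineOptions options =
    PipelineOptionsFactory.fromArgs(args).as(DataflowPipelineOptions.class);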

Example 28 with DataflowPackage

Use of com.google.api.services.dataflow.model.DataflowPackage in project beam by apache.

From the class DataflowRunner, method run.

@Override
public DataflowPipelineJob run(Pipeline pipeline) {
    logWarningIfPCollectionViewHasNonDeterministicKeyCoder(pipeline);
    if (containsUnboundedPCollection(pipeline)) {
        options.setStreaming(true);
    }
    replaceTransforms(pipeline);
    LOG.info("Executing pipeline on the Dataflow Service, which will have billing implications " + "related to Google Compute Engine usage and other Google Cloud Services.");
    List<DataflowPackage> packages = options.getStager().stageFiles();
    // Set a unique client_request_id in the CreateJob request.
    // This is used to ensure idempotence of job creation across retried
    // attempts to create a job. Specifically, if the service returns a job with
    // a different client_request_id, it means the returned one is a different
    // job previously created with the same job name, and that the job creation
    // has been effectively rejected. The SDK should return
    // Error::Already_Exists to user in that case.
    int randomNum = new Random().nextInt(9000) + 1000;
    String requestId = DateTimeFormat.forPattern("YYYYMMddHHmmssmmm").withZone(DateTimeZone.UTC).print(DateTimeUtils.currentTimeMillis()) + "_" + randomNum;
    // Try to create a debuggee ID. This must happen before the job is translated since it may
    // update the options.
    DataflowPipelineOptions dataflowOptions = options.as(DataflowPipelineOptions.class);
    maybeRegisterDebuggee(dataflowOptions, requestId);
    JobSpecification jobSpecification = translator.translate(pipeline, this, packages);
    Job newJob = jobSpecification.getJob();
    newJob.setClientRequestId(requestId);
    ReleaseInfo releaseInfo = ReleaseInfo.getReleaseInfo();
    String version = releaseInfo.getVersion();
    checkState(!version.equals("${pom.version}"), "Unable to submit a job to the Dataflow service with unset version ${pom.version}");
    System.out.println("Dataflow SDK version: " + version);
    newJob.getEnvironment().setUserAgent((Map) releaseInfo.getProperties());
    // The GCP temp location, if set, must be verified before it is used as the
    // temp storage prefix.
    if (!isNullOrEmpty(options.getGcpTempLocation())) {
        newJob.getEnvironment().setTempStoragePrefix(dataflowOptions.getPathValidator().verifyPath(options.getGcpTempLocation()));
    }
    newJob.getEnvironment().setDataset(options.getTempDatasetId());
    newJob.getEnvironment().setExperiments(options.getExperiments());
    // Set the Docker container image that executes the Dataflow worker harness, residing in Google
    // Container Registry. The translator is guaranteed to create a worker pool prior to this point.
    String workerHarnessContainerImage = getContainerImageForJob(options);
    for (WorkerPool workerPool : newJob.getEnvironment().getWorkerPools()) {
        workerPool.setWorkerHarnessContainerImage(workerHarnessContainerImage);
    }
    newJob.getEnvironment().setVersion(getEnvironmentVersion(options));
    if (hooks != null) {
        hooks.modifyEnvironmentBeforeSubmission(newJob.getEnvironment());
    }
    if (!isNullOrEmpty(options.getDataflowJobFile()) || !isNullOrEmpty(options.getTemplateLocation())) {
        boolean isTemplate = !isNullOrEmpty(options.getTemplateLocation());
        if (isTemplate) {
            checkArgument(isNullOrEmpty(options.getDataflowJobFile()), "--dataflowJobFile and --templateLocation are mutually exclusive.");
        }
        String fileLocation = firstNonNull(options.getTemplateLocation(), options.getDataflowJobFile());
        checkArgument(fileLocation.startsWith("/") || fileLocation.startsWith("gs://"), "Location must be local or on Cloud Storage, got %s.", fileLocation);
        ResourceId fileResource = FileSystems.matchNewResource(fileLocation, false);
        String workSpecJson = DataflowPipelineTranslator.jobToString(newJob);
        try (PrintWriter printWriter = new PrintWriter(Channels.newOutputStream(FileSystems.create(fileResource, MimeTypes.TEXT)))) {
            printWriter.print(workSpecJson);
            LOG.info("Printed job specification to {}", fileLocation);
        } catch (IOException ex) {
            String error = String.format("Cannot create output file at %s", fileLocation);
            if (isTemplate) {
                throw new RuntimeException(error, ex);
            } else {
                LOG.warn(error, ex);
            }
        }
        if (isTemplate) {
            LOG.info("Template successfully created.");
            return new DataflowTemplateJob();
        }
    }
    String jobIdToUpdate = null;
    if (options.isUpdate()) {
        jobIdToUpdate = getJobIdFromName(options.getJobName());
        newJob.setTransformNameMapping(options.getTransformNameMapping());
        newJob.setReplaceJobId(jobIdToUpdate);
    }
    Job jobResult;
    try {
        jobResult = dataflowClient.createJob(newJob);
    } catch (GoogleJsonResponseException e) {
        String errorMessages = "Unexpected errors";
        if (e.getDetails() != null) {
            if (Utf8.encodedLength(newJob.toString()) >= CREATE_JOB_REQUEST_LIMIT_BYTES) {
                errorMessages = "The size of the serialized JSON representation of the pipeline " + "exceeds the allowable limit. " + "For more information, please check the FAQ link below:\n" + "https://cloud.google.com/dataflow/faq";
            } else {
                errorMessages = e.getDetails().getMessage();
            }
        }
        throw new RuntimeException("Failed to create a workflow job: " + errorMessages, e);
    } catch (IOException e) {
        throw new RuntimeException("Failed to create a workflow job", e);
    }
    // Use a raw client for post-launch monitoring, as status calls may fail
    // regularly and need not be retried automatically.
    DataflowPipelineJob dataflowPipelineJob = new DataflowPipelineJob(DataflowClient.create(options), jobResult.getId(), options, jobSpecification.getStepNames());
    // If the service returned a client_request_id different from the one we sent, the
    // returned job was created by an earlier request with the same job name; fail with
    // the appropriate exception depending on whether this is an update or a fresh submission.
    if (jobResult.getClientRequestId() != null && !jobResult.getClientRequestId().isEmpty() && !jobResult.getClientRequestId().equals(requestId)) {
        // If updating a job.
        if (options.isUpdate()) {
            throw new DataflowJobAlreadyUpdatedException(dataflowPipelineJob, String.format("The job named %s with id: %s has already been updated into job id: %s " + "and cannot be updated again.", newJob.getName(), jobIdToUpdate, jobResult.getId()));
        } else {
            throw new DataflowJobAlreadyExistsException(dataflowPipelineJob, String.format("There is already an active job named %s with id: %s. If you want " + "to submit a second job, try again by setting a different name using --jobName.", newJob.getName(), jobResult.getId()));
        }
    }
    LOG.info("To access the Dataflow monitoring console, please navigate to {}", MonitoringUtil.getJobMonitoringPageURL(options.getProject(), jobResult.getId()));
    System.out.println("Submitted job: " + jobResult.getId());
    LOG.info("To cancel the job using the 'gcloud' tool, run:\n> {}", MonitoringUtil.getGcloudCancelCommand(options, jobResult.getId()));
    return dataflowPipelineJob;
}
Also used : DataflowTemplateJob(org.apache.beam.runners.dataflow.util.DataflowTemplateJob) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) ReleaseInfo(org.apache.beam.sdk.util.ReleaseInfo) StringUtils.byteArrayToJsonString(org.apache.beam.sdk.util.StringUtils.byteArrayToJsonString) IOException(java.io.IOException) GoogleJsonResponseException(com.google.api.client.googleapis.json.GoogleJsonResponseException) WorkerPool(com.google.api.services.dataflow.model.WorkerPool) Random(java.util.Random) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) DataflowTemplateJob(org.apache.beam.runners.dataflow.util.DataflowTemplateJob) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) PrintWriter(java.io.PrintWriter) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride)
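
From the caller's side, the client_request_id comparison above surfaces as one of two unchecked exceptions; a minimal sketch of handling them, assuming hypothetical buildOptions() and buildPipeline() helpers that stand in for real setup:

Pipeline p = buildPipeline(buildOptions()); // hypothetical helpers
try {
    DataflowPipelineJob job = (DataflowPipelineJob) p.run();
    System.out.println("Submitted job: " + job.getJobId());
} catch (DataflowJobAlreadyExistsException e) {
    // An active job with the same --jobName exists; resubmit under a new name.
    System.err.println(e.getMessage());
} catch (DataflowJobAlreadyUpdatedException e) {
    // --update was set, but the job has already been updated once.
    System.err.println(e.getMessage());
}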

Example 29 with DataflowPackage

Use of com.google.api.services.dataflow.model.DataflowPackage in project beam by apache.

From the class PackageUtilTest, method testPackageNamingWithDirectory.

@Test
public void testPackageNamingWithDirectory() throws Exception {
    File tmpDirectory = tmpFolder.newFolder("folder");
    DataflowPackage target = makePackageAttributes(tmpDirectory, null).getDataflowPackage();
    assertThat(target.getName(), RegexMatcher.matches("folder-" + HASH_PATTERN + ".jar"));
    assertThat(target.getLocation(), equalTo(STAGING_PATH + target.getName()));
}
Also used : File(java.io.File) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Test(org.junit.Test)
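
DataflowPackage itself is just a generated API model with name and location fields; a sketch of populating one by hand, with placeholder values (in real use the stager computes the hashed name, as the test above verifies):

DataflowPackage pkg = new DataflowPackage();
pkg.setName("folder-0123456789abcdef.jar"); // placeholder hash
pkg.setLocation("gs://my-bucket/staging/" + pkg.getName()); // placeholder bucket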

Example 30 with DataflowPackage

Use of com.google.api.services.dataflow.model.DataflowPackage in project beam by apache.

From the class DataflowRunnerTest, method testRunWithFiles.

@Test
public void testRunWithFiles() throws IOException {
    // Test that the function DataflowRunner.stageFiles works as expected.
    final String cloudDataflowDataset = "somedataset";
    // Create some temporary files.
    File temp1 = File.createTempFile("DataflowRunnerTest", "txt");
    temp1.deleteOnExit();
    File temp2 = File.createTempFile("DataflowRunnerTest2", "txt");
    temp2.deleteOnExit();
    String overridePackageName = "alias.txt";
    when(mockGcsUtil.getObjects(anyListOf(GcsPath.class))).thenReturn(ImmutableList.of(GcsUtil.StorageObjectOrIOException.create(new FileNotFoundException("some/path"))));
    DataflowPipelineOptions options = PipelineOptionsFactory.as(DataflowPipelineOptions.class);
    options.setFilesToStage(ImmutableList.of(temp1.getAbsolutePath(), overridePackageName + "=" + temp2.getAbsolutePath()));
    options.setStagingLocation(VALID_STAGING_BUCKET);
    options.setTempLocation(VALID_TEMP_BUCKET);
    options.setTempDatasetId(cloudDataflowDataset);
    options.setProject(PROJECT_ID);
    options.setRegion(REGION_ID);
    options.setJobName("job");
    options.setDataflowClient(buildMockDataflow());
    options.setGcsUtil(mockGcsUtil);
    options.setGcpCredential(new TestCredential());
    when(mockGcsUtil.create(any(GcsPath.class), anyString(), anyInt())).then(new Answer<SeekableByteChannel>() {

        @Override
        public SeekableByteChannel answer(InvocationOnMock invocation) throws Throwable {
            return FileChannel.open(Files.createTempFile("channel-", ".tmp"), StandardOpenOption.CREATE, StandardOpenOption.DELETE_ON_CLOSE);
        }
    });
    Pipeline p = buildDataflowPipeline(options);
    DataflowPipelineJob job = (DataflowPipelineJob) p.run();
    assertEquals("newid", job.getJobId());
    ArgumentCaptor<Job> jobCaptor = ArgumentCaptor.forClass(Job.class);
    Mockito.verify(mockJobs).create(eq(PROJECT_ID), eq(REGION_ID), jobCaptor.capture());
    Job workflowJob = jobCaptor.getValue();
    assertValidJob(workflowJob);
    assertEquals(2, workflowJob.getEnvironment().getWorkerPools().get(0).getPackages().size());
    DataflowPackage workflowPackage1 = workflowJob.getEnvironment().getWorkerPools().get(0).getPackages().get(0);
    assertThat(workflowPackage1.getName(), startsWith(temp1.getName()));
    DataflowPackage workflowPackage2 = workflowJob.getEnvironment().getWorkerPools().get(0).getPackages().get(1);
    assertEquals(overridePackageName, workflowPackage2.getName());
    assertEquals(GcsPath.fromUri(VALID_TEMP_BUCKET).toResourceName(), workflowJob.getEnvironment().getTempStoragePrefix());
    assertEquals(cloudDataflowDataset, workflowJob.getEnvironment().getDataset());
    assertEquals(ReleaseInfo.getReleaseInfo().getName(), workflowJob.getEnvironment().getUserAgent().get("name"));
    assertEquals(ReleaseInfo.getReleaseInfo().getVersion(), workflowJob.getEnvironment().getUserAgent().get("version"));
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) FileNotFoundException(java.io.FileNotFoundException) Matchers.containsString(org.hamcrest.Matchers.containsString) Matchers.anyString(org.mockito.Matchers.anyString) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) SeekableByteChannel(java.nio.channels.SeekableByteChannel) TestCredential(org.apache.beam.sdk.extensions.gcp.auth.TestCredential) InvocationOnMock(org.mockito.invocation.InvocationOnMock) GcsPath(org.apache.beam.sdk.util.gcsfs.GcsPath) Job(com.google.api.services.dataflow.model.Job) DataflowRunner.getContainerImageForJob(org.apache.beam.runners.dataflow.DataflowRunner.getContainerImageForJob) File(java.io.File) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Test(org.junit.Test)
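
The filesToStage entries exercised above come in two forms, a bare path or alias=path; a minimal sketch with placeholder paths:

// Each entry is either a path (staged under a name derived from the file name
// plus a content hash) or alias=path (staged under the explicit alias).
options.setFilesToStage(ImmutableList.of(
    "/tmp/deps/pipeline-deps.jar",          // placeholder path
    "alias.txt=/tmp/config/settings.txt")); // staged as alias.txt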

Aggregations

DataflowPackage (com.google.api.services.dataflow.model.DataflowPackage): 34
Test (org.junit.Test): 30
Job (com.google.api.services.dataflow.model.Job): 22
DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions): 22
Pipeline (org.apache.beam.sdk.Pipeline): 21
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 13
File (java.io.File): 11
Step (com.google.api.services.dataflow.model.Step): 8
ImmutableMap (com.google.common.collect.ImmutableMap): 6
Map (java.util.Map): 6
Structs.addObject (org.apache.beam.runners.dataflow.util.Structs.addObject): 6
FileNotFoundException (java.io.FileNotFoundException): 5
GcsPath (org.apache.beam.sdk.util.gcsfs.GcsPath): 4
Matchers.anyString (org.mockito.Matchers.anyString): 4
Pipe (java.nio.channels.Pipe): 3
LinkedList (java.util.LinkedList): 3
WorkerPool (com.google.api.services.dataflow.model.WorkerPool): 2
ImmutableList (com.google.common.collect.ImmutableList): 2
IOException (java.io.IOException): 2
ArrayList (java.util.ArrayList): 2