Search in sources :

Example 66 with DataflowPipelineOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineOptions in project beam by apache.

the class DataflowPipelineTranslatorTest method testPredefinedAddStep.

@Test
public void testPredefinedAddStep() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    DataflowPipelineTranslator.registerTransformTranslator(EmbeddedTransform.class, new EmbeddedTranslator());
    // Create a predefined step using another pipeline
    Step predefinedStep = createPredefinedStep();
    // Create a pipeline that the predefined step will be embedded into
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("ReadMyFile", TextIO.read().from("gs://bucket/in")).apply(ParDo.of(new NoOpFn())).apply(new EmbeddedTransform(predefinedStep.clone())).apply(ParDo.of(new NoOpFn()));
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    runner.replaceTransforms(pipeline);
    Job job = translator.translate(pipeline, runner, Collections.<DataflowPackage>emptyList()).getJob();
    assertAllStepOutputsHaveUniqueIds(job);
    List<Step> steps = job.getSteps();
    assertEquals(4, steps.size());
    // The input to the embedded step should match the output of the step before
    Map<String, Object> step1Out = getOutputPortReference(steps.get(1));
    Map<String, Object> step2In = getDictionary(steps.get(2).getProperties(), PropertyNames.PARALLEL_INPUT);
    assertEquals(step1Out, step2In);
    // The output from the embedded step should match the input of the step after
    Map<String, Object> step2Out = getOutputPortReference(steps.get(2));
    Map<String, Object> step3In = getDictionary(steps.get(3).getProperties(), PropertyNames.PARALLEL_INPUT);
    assertEquals(step2Out, step3In);
    // The step should not have been modified other than remapping the input
    Step predefinedStepClone = predefinedStep.clone();
    Step embeddedStepClone = steps.get(2).clone();
    predefinedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
    embeddedStepClone.getProperties().remove(PropertyNames.PARALLEL_INPUT);
    assertEquals(predefinedStepClone, embeddedStepClone);
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Step(com.google.api.services.dataflow.model.Step) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Pipeline(org.apache.beam.sdk.Pipeline) Structs.addObject(org.apache.beam.runners.dataflow.util.Structs.addObject) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Test(org.junit.Test)

Example 67 with DataflowPipelineOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineOptions in project beam by apache.

the class DataflowPipelineTranslatorTest method testSettingOfSdkPipelineOptions.

@Test
public void testSettingOfSdkPipelineOptions() throws IOException {
    DataflowPipelineOptions options = buildPipelineOptions();
    options.setRunner(DataflowRunner.class);
    Pipeline p = Pipeline.create(options);
    p.traverseTopologically(new RecordingPipelineVisitor());
    Job job = DataflowPipelineTranslator.fromOptions(options).translate(p, DataflowRunner.fromOptions(options), Collections.<DataflowPackage>emptyList()).getJob();
    Map<String, Object> sdkPipelineOptions = job.getEnvironment().getSdkPipelineOptions();
    assertThat(sdkPipelineOptions, hasKey("options"));
    Map<String, Object> optionsMap = (Map<String, Object>) sdkPipelineOptions.get("options");
    assertThat(optionsMap, hasEntry("appName", (Object) "DataflowPipelineTranslatorTest"));
    assertThat(optionsMap, hasEntry("project", (Object) "some-project"));
    assertThat(optionsMap, hasEntry("pathValidatorClass", (Object) GcsPathValidator.class.getName()));
    assertThat(optionsMap, hasEntry("runner", (Object) DataflowRunner.class.getName()));
    assertThat(optionsMap, hasEntry("jobName", (Object) "some-job-name"));
    assertThat(optionsMap, hasEntry("tempLocation", (Object) "gs://somebucket/some/path"));
    assertThat(optionsMap, hasEntry("stagingLocation", (Object) "gs://somebucket/some/path/staging/"));
    assertThat(optionsMap, hasEntry("stableUniqueNames", (Object) "WARNING"));
    assertThat(optionsMap, hasEntry("streaming", (Object) false));
    assertThat(optionsMap, hasEntry("numberOfWorkerHarnessThreads", (Object) 0));
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) GcsPathValidator(org.apache.beam.sdk.extensions.gcp.storage.GcsPathValidator) Structs.addObject(org.apache.beam.runners.dataflow.util.Structs.addObject) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Job(com.google.api.services.dataflow.model.Job) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 68 with DataflowPipelineOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineOptions in project beam by apache.

the class DataflowPipelineTranslatorTest method testNetworkConfig.

@Test
public void testNetworkConfig() throws IOException {
    final String testNetwork = "test-network";
    DataflowPipelineOptions options = buildPipelineOptions();
    options.setNetwork(testNetwork);
    Pipeline p = buildPipeline(options);
    p.traverseTopologically(new RecordingPipelineVisitor());
    Job job = DataflowPipelineTranslator.fromOptions(options).translate(p, DataflowRunner.fromOptions(options), Collections.<DataflowPackage>emptyList()).getJob();
    assertEquals(1, job.getEnvironment().getWorkerPools().size());
    assertEquals(testNetwork, job.getEnvironment().getWorkerPools().get(0).getNetwork());
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 69 with DataflowPipelineOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineOptions in project beam by apache.

the class DataflowPipelineTranslatorTest method testToIterableTranslationWithIsmSideInput.

@Test
public void testToIterableTranslationWithIsmSideInput() throws Exception {
    // A "change detector" test that makes sure the translation
    // of getting a PCollectionView<Iterable<T>> does not change
    // in bad ways during refactor
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply(Create.of(1, 2, 3)).apply(View.<Integer>asIterable());
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    runner.replaceTransforms(pipeline);
    Job job = translator.translate(pipeline, runner, Collections.<DataflowPackage>emptyList()).getJob();
    assertAllStepOutputsHaveUniqueIds(job);
    List<Step> steps = job.getSteps();
    assertEquals(3, steps.size());
    @SuppressWarnings("unchecked") List<Map<String, Object>> toIsmRecordOutputs = (List<Map<String, Object>>) steps.get(1).getProperties().get(PropertyNames.OUTPUT_INFO);
    assertTrue(Structs.getBoolean(Iterables.getOnlyElement(toIsmRecordOutputs), "use_indexed_format"));
    Step collectionToSingletonStep = steps.get(2);
    assertEquals("CollectionToSingleton", collectionToSingletonStep.getKind());
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Step(com.google.api.services.dataflow.model.Step) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Pipeline(org.apache.beam.sdk.Pipeline) List(java.util.List) TupleTagList(org.apache.beam.sdk.values.TupleTagList) ArrayList(java.util.ArrayList) ImmutableList(com.google.common.collect.ImmutableList) LinkedList(java.util.LinkedList) Structs.addObject(org.apache.beam.runners.dataflow.util.Structs.addObject) Job(com.google.api.services.dataflow.model.Job) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Test(org.junit.Test)

Example 70 with DataflowPipelineOptions

use of org.apache.beam.runners.dataflow.options.DataflowPipelineOptions in project beam by apache.

the class DataflowPipelineTranslatorTest method testNamesOverridden.

/**
   * Test that in translation the name for a collection (in this case just a Create output) is
   * overriden to be what the Dataflow service expects.
   */
@Test
public void testNamesOverridden() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowRunner runner = DataflowRunner.fromOptions(options);
    options.setStreaming(false);
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline pipeline = Pipeline.create(options);
    pipeline.apply("Jazzy", Create.of(3)).setName("foobizzle");
    runner.replaceTransforms(pipeline);
    Job job = translator.translate(pipeline, runner, Collections.<DataflowPackage>emptyList()).getJob();
    // The Create step
    Step step = job.getSteps().get(0);
    // This is the name that is "set by the user" that the Dataflow translator must override
    String userSpecifiedName = Structs.getString(Structs.getListOfMaps(step.getProperties(), PropertyNames.OUTPUT_INFO, null).get(0), PropertyNames.USER_NAME);
    // This is the calculated name that must actually be used
    String calculatedName = getString(step.getProperties(), PropertyNames.USER_NAME) + ".out0";
    assertThat(userSpecifiedName, equalTo(calculatedName));
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) Step(com.google.api.services.dataflow.model.Step) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) Job(com.google.api.services.dataflow.model.Job) DataflowPackage(com.google.api.services.dataflow.model.DataflowPackage) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)75 Test (org.junit.Test)66 Pipeline (org.apache.beam.sdk.Pipeline)39 Job (com.google.api.services.dataflow.model.Job)31 DataflowPackage (com.google.api.services.dataflow.model.DataflowPackage)22 Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString)14 TestCredential (org.apache.beam.sdk.extensions.gcp.auth.TestCredential)14 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)14 DataflowRunner.getContainerImageForJob (org.apache.beam.runners.dataflow.DataflowRunner.getContainerImageForJob)10 Step (com.google.api.services.dataflow.model.Step)8 Matchers.containsString (org.hamcrest.Matchers.containsString)7 Matchers.anyString (org.mockito.Matchers.anyString)7 ImmutableMap (com.google.common.collect.ImmutableMap)6 Map (java.util.Map)6 Structs.addObject (org.apache.beam.runners.dataflow.util.Structs.addObject)6 ImmutableList (com.google.common.collect.ImmutableList)4 ArrayList (java.util.ArrayList)4 LinkedList (java.util.LinkedList)4 List (java.util.List)4 TupleTagList (org.apache.beam.sdk.values.TupleTagList)4