
Example 36 with Coder

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

From the class SdkComponents, method registerCoder.

/**
 * Registers the provided {@link Coder} into this {@link SdkComponents}, returning a unique ID for
 * the {@link Coder}. Multiple registrations of the same {@link Coder} will return the same unique
 * ID.
 *
 * <p>Coders are stored by identity to ensure that coders with implementations of {@link
 * #equals(Object)} and {@link #hashCode()} but incompatible binary formats are not considered the
 * same coder.
 */
public String registerCoder(Coder<?> coder) throws IOException {
    String existing = coderIds.get(coder);
    if (existing != null) {
        return existing;
    }
    // Unlike StructuredCoder, custom coders may not have proper implementations of hashCode() and
    // equals(), which leads to unnecessary duplication. To avoid this, we check the already
    // registered coders for a matching proto and, if one is found, treat them as the same coder.
    RunnerApi.Coder coderProto = CoderTranslation.toProto(coder, this);
    if (coderProtoToId.containsKey(coderProto)) {
        return coderProtoToId.get(coderProto);
    }
    String baseName = NameUtils.approximateSimpleName(coder);
    String name = uniqify(baseName, coderIds.values());
    coderIds.put(coder, name);
    coderProtoToId.put(coderProto, name);
    componentsBuilder.putCoders(name, coderProto);
    return name;
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)
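
Taken together, the identity map and the proto lookup mean that registering two structurally identical coders yields a single ID. The sketch below illustrates that contract; the surrounding class and the SdkComponents.create() factory are assumptions made for illustration, not part of the example above.

import java.io.IOException;
import org.apache.beam.runners.core.construction.SdkComponents;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarIntCoder;

public class RegisterCoderDedupSketch {

    public static void main(String[] args) throws IOException {
        // Assumed factory method; the real entry point for obtaining an SdkComponents may differ.
        SdkComponents components = SdkComponents.create();
        // Two KvCoder instances that are structurally identical but distinct Java objects.
        String first = components.registerCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));
        String second = components.registerCoder(KvCoder.of(StringUtf8Coder.of(), VarIntCoder.of()));
        // Whether via the coder map or the proto lookup, both registrations resolve to one ID.
        System.out.println(first.equals(second)); // expected: true
    }
}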

Example 37 with Coder

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

From the class GreedyPipelineFuserTest, method flattenWithHeterogeneousInputsSingleEnvOutputPartiallyMaterialized.

/*
   * goImpulse -> .out -> goRead -> .out \
   *                                       -> flatten -> .out -> goParDo -> .out
   * pyImpulse -> .out -> pyRead -> .out /
   *
   * becomes
   * (goImpulse.out) -> goRead -> goRead.out -> flatten -> flatten.out -> goParDo
   * (pyImpulse.out) -> pyRead -> pyRead.out -> flatten -> (flatten.out)
   * (flatten.out) -> goParDo
   */
@Test
public void flattenWithHeterogeneousInputsSingleEnvOutputPartiallyMaterialized() {
    Components components = Components.newBuilder().putCoders("coder", Coder.newBuilder().build()).putCoders("windowCoder", Coder.newBuilder().build()).putWindowingStrategies("ws", WindowingStrategy.newBuilder().setWindowCoderId("windowCoder").build()).putTransforms("pyImpulse", PTransform.newBuilder().setUniqueName("PyImpulse").putOutputs("output", "pyImpulse.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN)).build()).putPcollections("pyImpulse.out", pc("pyImpulse.out")).putTransforms("pyRead", PTransform.newBuilder().setUniqueName("PyRead").putInputs("input", "pyImpulse.out").putOutputs("output", "pyRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build()).putPcollections("pyRead.out", pc("pyRead.out")).putTransforms("goImpulse", PTransform.newBuilder().setUniqueName("GoImpulse").putOutputs("output", "goImpulse.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN)).build()).putPcollections("goImpulse.out", pc("goImpulse.out")).putTransforms("goRead", PTransform.newBuilder().setUniqueName("GoRead").putInputs("input", "goImpulse.out").putOutputs("output", "goRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("go").build()).putPcollections("goRead.out", pc("goRead.out")).putTransforms("flatten", PTransform.newBuilder().setUniqueName("Flatten").putInputs("goReadInput", "goRead.out").putInputs("pyReadInput", "pyRead.out").putOutputs("output", "flatten.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN)).build()).putPcollections("flatten.out", pc("flatten.out")).putTransforms("goParDo", PTransform.newBuilder().setUniqueName("GoParDo").putInputs("input", "flatten.out").putOutputs("output", "goParDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("go").build()).putPcollections("goParDo.out", pc("goParDo.out")).putEnvironments("go", Environments.createDockerEnvironment("go")).putEnvironments("py", Environments.createDockerEnvironment("py")).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).build());
    assertThat(fused.getRunnerExecutedTransforms(), containsInAnyOrder(PipelineNode.pTransform("pyImpulse", components.getTransformsOrThrow("pyImpulse")), PipelineNode.pTransform("goImpulse", components.getTransformsOrThrow("goImpulse"))));
    assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput("goImpulse.out").withNoOutputs().withTransforms("goRead", "flatten", "goParDo"), ExecutableStageMatcher.withInput("pyImpulse.out").withOutputs("flatten.out").withTransforms("pyRead", "flatten"), ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("goParDo")));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) Test(org.junit.Test)
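
The test relies on a pc(...) helper that is not shown in this excerpt. A plausible reconstruction, inferred from the "coder" and "ws" components registered above, is sketched below; the actual helper in GreedyPipelineFuserTest may differ in detail.

import org.apache.beam.model.pipeline.v1.RunnerApi.PCollection;

// Hypothetical reconstruction of the pc(...) helper: builds a PCollection proto whose
// unique name is the given id and which references the "coder" and "ws" component IDs.
private static PCollection pc(String name) {
    return PCollection.newBuilder()
        .setUniqueName(name)
        .setCoderId("coder")
        .setWindowingStrategyId("ws")
        .build();
}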

Example 38 with Coder

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

From the class GreedyPipelineFuserTest, method sanitizedTransforms.

@Test
public void sanitizedTransforms() throws Exception {
    PCollection flattenOutput = pc("flatten.out");
    PCollection read1Output = pc("read1.out");
    PCollection read2Output = pc("read2.out");
    PCollection impulse1Output = pc("impulse1.out");
    PCollection impulse2Output = pc("impulse2.out");
    PTransform flattenTransform = PTransform.newBuilder().setUniqueName("Flatten").putInputs(read1Output.getUniqueName(), read1Output.getUniqueName()).putInputs(read2Output.getUniqueName(), read2Output.getUniqueName()).putOutputs(flattenOutput.getUniqueName(), flattenOutput.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build();
    PTransform read1Transform = PTransform.newBuilder().setUniqueName("read1").putInputs(impulse1Output.getUniqueName(), impulse1Output.getUniqueName()).putOutputs(read1Output.getUniqueName(), read1Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build();
    PTransform read2Transform = PTransform.newBuilder().setUniqueName("read2").putInputs(impulse2Output.getUniqueName(), impulse2Output.getUniqueName()).putOutputs(read2Output.getUniqueName(), read2Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build();
    PTransform impulse1Transform = PTransform.newBuilder().setUniqueName("impulse1").putOutputs(impulse1Output.getUniqueName(), impulse1Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).build();
    PTransform impulse2Transform = PTransform.newBuilder().setUniqueName("impulse2").putOutputs(impulse2Output.getUniqueName(), impulse2Output.getUniqueName()).setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN).setPayload(WindowIntoPayload.newBuilder().setWindowFn(FunctionSpec.newBuilder()).build().toByteString())).build();
    Pipeline impulse = Pipeline.newBuilder().addRootTransformIds(impulse1Transform.getUniqueName()).addRootTransformIds(impulse2Transform.getUniqueName()).addRootTransformIds(flattenTransform.getUniqueName()).setComponents(Components.newBuilder().putCoders("coder", Coder.newBuilder().build()).putCoders("windowCoder", Coder.newBuilder().build()).putWindowingStrategies("ws", WindowingStrategy.newBuilder().setWindowCoderId("windowCoder").build()).putEnvironments("py", Environments.createDockerEnvironment("py")).putPcollections(flattenOutput.getUniqueName(), flattenOutput).putTransforms(flattenTransform.getUniqueName(), flattenTransform).putPcollections(read1Output.getUniqueName(), read1Output).putTransforms(read1Transform.getUniqueName(), read1Transform).putPcollections(read2Output.getUniqueName(), read2Output).putTransforms(read2Transform.getUniqueName(), read2Transform).putPcollections(impulse1Output.getUniqueName(), impulse1Output).putTransforms(impulse1Transform.getUniqueName(), impulse1Transform).putPcollections(impulse2Output.getUniqueName(), impulse2Output).putTransforms(impulse2Transform.getUniqueName(), impulse2Transform).build()).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(impulse);
    assertThat(fused.getRunnerExecutedTransforms(), hasSize(2));
    assertThat(fused.getFusedStages(), hasSize(2));
    assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput(impulse1Output.getUniqueName()).withTransforms(flattenTransform.getUniqueName(), read1Transform.getUniqueName()), ExecutableStageMatcher.withInput(impulse2Output.getUniqueName()).withTransforms(flattenTransform.getUniqueName(), read2Transform.getUniqueName())));
    assertThat(fused.getFusedStages().stream().flatMap(s -> s.getComponents().getTransformsOrThrow(flattenTransform.getUniqueName()).getInputsMap().values().stream()).collect(Collectors.toList()), containsInAnyOrder(read1Output.getUniqueName(), read2Output.getUniqueName()));
}
Also used : PCollection(org.apache.beam.model.pipeline.v1.RunnerApi.PCollection) PTransform(org.apache.beam.model.pipeline.v1.RunnerApi.PTransform) Pipeline(org.apache.beam.model.pipeline.v1.RunnerApi.Pipeline) Test(org.junit.Test)
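
When an assertion like the last one fails, it helps to see exactly what each fused stage contains. The debugging sketch below walks the fused stages and prints their inputs and transforms; the ExecutableStage accessors (getInputPCollection, getTransforms) are assumed to mirror what ExecutableStageMatcher checks, so verify them against the actual API before relying on this.

import org.apache.beam.runners.core.construction.graph.ExecutableStage;
import org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode;

// Debugging sketch: dump each fused stage's input PCollection and the transforms it contains.
for (ExecutableStage stage : fused.getFusedStages()) {
    System.out.println("stage input: " + stage.getInputPCollection().getId());
    for (PTransformNode transform : stage.getTransforms()) {
        System.out.println("  transform: " + transform.getId());
    }
}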

Example 39 with Coder

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

From the class GreedyPipelineFuserTest, method flattenWithHeterogenousInputsAndOutputsEntirelyMaterialized.

/*
   * goImpulse -> .out -> goRead -> .out \                    -> goParDo -> .out
   *                                      -> flatten -> .out |
   * pyImpulse -> .out -> pyRead -> .out /                    -> pyParDo -> .out
   *
   * becomes
   * (goImpulse.out) -> goRead -> goRead.out -> flatten -> (flatten.out_synthetic0)
   * (pyImpulse.out) -> pyRead -> pyRead.out -> flatten -> (flatten.out_synthetic1)
   * flatten.out_synthetic0 & flatten.out_synthetic1 -> synthetic_flatten -> flatten.out
   * (flatten.out) -> goParDo
   * (flatten.out) -> pyParDo
   */
@Test
public void flattenWithHeterogenousInputsAndOutputsEntirelyMaterialized() {
    Components components = Components.newBuilder().putCoders("coder", Coder.newBuilder().build()).putCoders("windowCoder", Coder.newBuilder().build()).putWindowingStrategies("ws", WindowingStrategy.newBuilder().setWindowCoderId("windowCoder").build()).putTransforms("pyImpulse", PTransform.newBuilder().setUniqueName("PyImpulse").putOutputs("output", "pyImpulse.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN)).build()).putPcollections("pyImpulse.out", pc("pyImpulse.out")).putTransforms("pyRead", PTransform.newBuilder().setUniqueName("PyRead").putInputs("input", "pyImpulse.out").putOutputs("output", "pyRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build()).putPcollections("pyRead.out", pc("pyRead.out")).putTransforms("goImpulse", PTransform.newBuilder().setUniqueName("GoImpulse").putOutputs("output", "goImpulse.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.IMPULSE_TRANSFORM_URN)).build()).putPcollections("goImpulse.out", pc("goImpulse.out")).putTransforms("goRead", PTransform.newBuilder().setUniqueName("GoRead").putInputs("input", "goImpulse.out").putOutputs("output", "goRead.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("go").build()).putPcollections("goRead.out", pc("goRead.out")).putTransforms("flatten", PTransform.newBuilder().setUniqueName("Flatten").putInputs("goReadInput", "goRead.out").putInputs("pyReadInput", "pyRead.out").putOutputs("output", "flatten.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.FLATTEN_TRANSFORM_URN)).build()).putPcollections("flatten.out", pc("flatten.out")).putTransforms("pyParDo", PTransform.newBuilder().setUniqueName("PyParDo").putInputs("input", "flatten.out").putOutputs("output", "pyParDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("py").build()).putPcollections("pyParDo.out", pc("pyParDo.out")).putTransforms("goParDo", PTransform.newBuilder().setUniqueName("GoParDo").putInputs("input", "flatten.out").putOutputs("output", "goParDo.out").setSpec(FunctionSpec.newBuilder().setUrn(PTransformTranslation.PAR_DO_TRANSFORM_URN).setPayload(ParDoPayload.newBuilder().setDoFn(FunctionSpec.newBuilder()).build().toByteString())).setEnvironmentId("go").build()).putPcollections("goParDo.out", pc("goParDo.out")).putEnvironments("go", Environments.createDockerEnvironment("go")).putEnvironments("py", Environments.createDockerEnvironment("py")).build();
    FusedPipeline fused = GreedyPipelineFuser.fuse(Pipeline.newBuilder().setComponents(components).build());
    assertThat(fused.getRunnerExecutedTransforms(), hasSize(3));
    assertThat("The runner should include the impulses for both languages, plus an introduced flatten", fused.getRunnerExecutedTransforms(), hasItems(PipelineNode.pTransform("pyImpulse", components.getTransformsOrThrow("pyImpulse")), PipelineNode.pTransform("goImpulse", components.getTransformsOrThrow("goImpulse"))));
    PTransformNode flattenNode = null;
    for (PTransformNode runnerTransform : fused.getRunnerExecutedTransforms()) {
        if (getOnlyElement(runnerTransform.getTransform().getOutputsMap().values()).equals("flatten.out")) {
            flattenNode = runnerTransform;
        }
    }
    assertThat(flattenNode, not(nullValue()));
    assertThat(flattenNode.getTransform().getSpec().getUrn(), equalTo(PTransformTranslation.FLATTEN_TRANSFORM_URN));
    assertThat(new HashSet<>(flattenNode.getTransform().getInputsMap().values()), hasSize(2));
    Collection<String> introducedOutputs = flattenNode.getTransform().getInputsMap().values();
    AnyOf<String> anyIntroducedPCollection = anyOf(introducedOutputs.stream().map(Matchers::equalTo).collect(Collectors.toSet()));
    assertThat(fused.getFusedStages(), containsInAnyOrder(ExecutableStageMatcher.withInput("goImpulse.out").withOutputs(anyIntroducedPCollection).withTransforms("goRead", "flatten"), ExecutableStageMatcher.withInput("pyImpulse.out").withOutputs(anyIntroducedPCollection).withTransforms("pyRead", "flatten"), ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("goParDo"), ExecutableStageMatcher.withInput("flatten.out").withNoOutputs().withTransforms("pyParDo")));
    Set<String> materializedStageOutputs = fused.getFusedStages().stream().flatMap(executableStage -> executableStage.getOutputPCollections().stream()).map(PCollectionNode::getId).collect(Collectors.toSet());
    assertThat("All materialized stage outputs should be flattened, and no more", materializedStageOutputs, containsInAnyOrder(flattenNode.getTransform().getInputsMap().values().toArray(new String[0])));
}
Also used : Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) PTransformNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PTransformNode) Matchers(org.hamcrest.Matchers) Test(org.junit.Test)
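
The loop above locates the runner-side flatten by matching its single output against "flatten.out". An equivalent lookup can key off the transform URN instead, since the impulses are the only other runner-executed transforms in this pipeline. A minimal sketch, reusing only accessors already used in the test:

// Alternative lookup sketch: find the flatten left on the runner side by its URN rather
// than by its output PCollection name.
PTransformNode flattenByUrn = null;
for (PTransformNode runnerTransform : fused.getRunnerExecutedTransforms()) {
    if (PTransformTranslation.FLATTEN_TRANSFORM_URN.equals(
            runnerTransform.getTransform().getSpec().getUrn())) {
        flattenByUrn = runnerTransform;
    }
}
assertThat(flattenByUrn, not(nullValue()));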

Example 40 with Coder

Use of org.apache.beam.model.pipeline.v1.RunnerApi.Coder in project beam by apache.

From the class DataflowPipelineTranslatorTest, method testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker.

@Test
public void testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker() throws Exception {
    List<String> experiments = new ArrayList<>(ImmutableList.of(GcpOptions.STREAMING_ENGINE_EXPERIMENT, GcpOptions.WINDMILL_SERVICE_EXPERIMENT, "use_runner_v2"));
    JobSpecification jobSpec = runStreamingGroupIntoBatchesAndGetJobSpec(true, experiments);
    List<Step> steps = jobSpec.getJob().getSteps();
    Step shardedStateStep = steps.get(steps.size() - 1);
    Map<String, Object> properties = shardedStateStep.getProperties();
    assertTrue(properties.containsKey(PropertyNames.USES_KEYED_STATE));
    assertTrue(properties.containsKey(PropertyNames.ALLOWS_SHARDABLE_STATE));
    assertEquals("true", getString(properties, PropertyNames.ALLOWS_SHARDABLE_STATE));
    assertTrue(properties.containsKey(PropertyNames.PRESERVES_KEYS));
    assertEquals("true", getString(properties, PropertyNames.PRESERVES_KEYS));
    // Also checks the runner proto is correctly populated.
    Map<String, RunnerApi.PTransform> transformMap = jobSpec.getPipelineProto().getComponents().getTransformsMap();
    boolean transformFound = false;
    for (Map.Entry<String, RunnerApi.PTransform> transform : transformMap.entrySet()) {
        RunnerApi.FunctionSpec spec = transform.getValue().getSpec();
        if (spec.getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_WITH_SHARDED_KEY_URN)) {
            for (String subtransform : transform.getValue().getSubtransformsList()) {
                RunnerApi.PTransform ptransform = transformMap.get(subtransform);
                if (ptransform.getSpec().getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_URN)) {
                    transformFound = true;
                }
            }
        }
    }
    assertTrue(transformFound);
    boolean coderFound = false;
    Map<String, RunnerApi.Coder> coderMap = jobSpec.getPipelineProto().getComponents().getCodersMap();
    for (Map.Entry<String, RunnerApi.Coder> coder : coderMap.entrySet()) {
        if (coder.getValue().getSpec().getUrn().equals(ModelCoders.SHARDED_KEY_CODER_URN)) {
            coderFound = true;
        }
    }
    assertTrue(coderFound);
}
Also used : SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) ArrayList(java.util.ArrayList) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Step(com.google.api.services.dataflow.model.Step) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) PTransform(org.apache.beam.sdk.transforms.PTransform) Test(org.junit.Test)
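
The final coder scan can be factored into a small helper so the same check is reusable for other URNs. The sketch below mirrors the loop in the test one-for-one; containsCoderUrn is a hypothetical name, not an existing Beam utility.

import org.apache.beam.model.pipeline.v1.RunnerApi;

// Hypothetical helper mirroring the scan above: true if any coder registered in the
// pipeline components carries the given URN.
private static boolean containsCoderUrn(RunnerApi.Components components, String urn) {
    for (RunnerApi.Coder coder : components.getCodersMap().values()) {
        if (coder.getSpec().getUrn().equals(urn)) {
            return true;
        }
    }
    return false;
}

With it, the final assertion could read assertTrue(containsCoderUrn(jobSpec.getPipelineProto().getComponents(), ModelCoders.SHARDED_KEY_CODER_URN)).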

Aggregations

RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 48 usages
Coder (org.apache.beam.sdk.coders.Coder): 33 usages
WindowedValue (org.apache.beam.sdk.util.WindowedValue): 32 usages
KvCoder (org.apache.beam.sdk.coders.KvCoder): 30 usages
Test (org.junit.Test): 30 usages
Map (java.util.Map): 23 usages
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString): 23 usages
HashMap (java.util.HashMap): 21 usages
KV (org.apache.beam.sdk.values.KV): 20 usages
ArrayList (java.util.ArrayList): 19 usages
IOException (java.io.IOException): 18 usages
StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder): 17 usages
List (java.util.List): 16 usages
ExecutableStage (org.apache.beam.runners.core.construction.graph.ExecutableStage): 16 usages
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 15 usages
ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap): 15 usages
Collection (java.util.Collection): 13 usages
Pipeline (org.apache.beam.sdk.Pipeline): 13 usages
FusedPipeline (org.apache.beam.runners.core.construction.graph.FusedPipeline): 12 usages
ConcurrentHashMap (java.util.concurrent.ConcurrentHashMap): 11 usages