Example 21 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

From class BeamAggregationRel, method buildPTransform.

@Override
public PTransform<PCollectionList<Row>, PCollection<Row>> buildPTransform() {
    Schema outputSchema = CalciteUtils.toSchema(getRowType());
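    // Wrap each named Calcite aggregate call in a FieldAggregation adapter.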
    List<FieldAggregation> aggregationAdapters = getNamedAggCalls().stream().map(aggCall -> new FieldAggregation(aggCall.getKey(), aggCall.getValue())).collect(toList());
    return new Transform(windowFn, windowFieldIndex, getGroupSet(), aggregationAdapters, outputSchema);
}
Also used : AggregateCall(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.rel.core.AggregateCall) CombineFn(org.apache.beam.sdk.transforms.Combine.CombineFn) RelNode(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.rel.RelNode) NodeStats(org.apache.beam.sdk.extensions.sql.impl.planner.NodeStats) RelWriter(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.rel.RelWriter) Duration(org.joda.time.Duration) BeamCostModel(org.apache.beam.sdk.extensions.sql.impl.planner.BeamCostModel) BeamSqlPipelineOptions(org.apache.beam.sdk.extensions.sql.impl.BeamSqlPipelineOptions) PTransform(org.apache.beam.sdk.transforms.PTransform) Sessions(org.apache.beam.sdk.transforms.windowing.Sessions) SlidingWindows(org.apache.beam.sdk.transforms.windowing.SlidingWindows) PCollectionList(org.apache.beam.sdk.values.PCollectionList) ImmutableBitSet(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.util.ImmutableBitSet) Preconditions.checkArgument(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.base.Preconditions.checkArgument) Window(org.apache.beam.sdk.transforms.windowing.Window) BOUNDED(org.apache.beam.sdk.values.PCollection.IsBounded.BOUNDED) Row(org.apache.beam.sdk.values.Row) RelOptPlanner(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.plan.RelOptPlanner) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Field(org.apache.beam.sdk.schemas.Schema.Field) DoFn(org.apache.beam.sdk.transforms.DoFn) CalciteUtils(org.apache.beam.sdk.extensions.sql.impl.utils.CalciteUtils) GroupByKey(org.apache.beam.sdk.transforms.GroupByKey) WithTimestamps(org.apache.beam.sdk.transforms.WithTimestamps) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) WindowFn(org.apache.beam.sdk.transforms.windowing.WindowFn) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) PCollection(org.apache.beam.sdk.values.PCollection) Schema(org.apache.beam.sdk.schemas.Schema) Serializable(java.io.Serializable) RelOptCluster(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.plan.RelOptCluster) Collectors.toList(java.util.stream.Collectors.toList) List(java.util.List) DefaultTrigger(org.apache.beam.sdk.transforms.windowing.DefaultTrigger) ParDo(org.apache.beam.sdk.transforms.ParDo) Aggregate(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.rel.core.Aggregate) AggregationCombineFnAdapter(org.apache.beam.sdk.extensions.sql.impl.transform.agg.AggregationCombineFnAdapter) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) RelTraitSet(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.plan.RelTraitSet) BeamRelMetadataQuery(org.apache.beam.sdk.extensions.sql.impl.planner.BeamRelMetadataQuery) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy)
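For orientation, here is a minimal composite with the same input/output shape as the Transform returned above. The FlattenRows name is hypothetical, and where it merely flattens, the real Transform windows the input, groups by the grouping set, and applies the aggregation adapters; it also assumes Flatten (org.apache.beam.sdk.transforms.Flatten) on top of the imports listed.

public static class FlattenRows extends PTransform<PCollectionList<Row>, PCollection<Row>> {

    @Override
    public PCollection<Row> expand(PCollectionList<Row> input) {
        // Merge all input PCollections into one output; an aggregation
        // transform would window, group, and combine here instead.
        return input.apply(Flatten.pCollections());
    }
}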

Example 22 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

From class BeamSqlErrorTest, method testFailedExpression.

@Test
public void testFailedExpression() {
    Schema resultType = Schema.builder().addStringField(ID).addStringField(COUNTRY_CODE).addDoubleField(SUM_AMOUNT).build();
    Schema midResultType = Schema.builder().addStringField(ID).addStringField(COUNTRY_CODE).addStringField(CURRENCY).addInt64Field(F_3).build();
    String sql = "SELECT id,country_code,CalculatePrice(sum(CastUdf(amount)),currency) as sum_amount FROM PCOLLECTION group by id,country_code,currency";
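    // Spy on a pass-through PTransform so the rows routed to the error
    // transformer can be captured and verified once the graph is built.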
    PTransform mock = spy(PTransform.class);
    when(mock.expand(Matchers.any())).thenAnswer(invocationOnMock -> invocationOnMock.getArgument(0, PCollection.class));
    ArgumentCaptor<PCollection> captor = ArgumentCaptor.forClass(PCollection.class);
    PCollection<Row> validRowsResult = boundedInputBytes.apply("calculate", SqlTransform.query(sql).withAutoLoading(false).withErrorsTransformer(mock).registerUdf("CastUdf", CastUdf.class).registerUdf("CalculatePrice", CalculatePrice.class)).setCoder(SchemaCoder.of(resultType));
    PAssert.that(validRowsResult).containsInAnyOrder(TestUtils.RowsBuilder.of(resultType).addRows("1", "US", 100.0).getRows());
    Schema firstErrorSchema = Schema.builder().addRowField(ROW, inputType).addStringField(ERROR).build();
    Row failedOnFirstUdfElement = TestTableUtils.buildRows(firstErrorSchema, Arrays.asList(TestTableUtils.buildRows(inputType, Arrays.asList("2", invalidAmount, "US", "$")).get(0), "Found invalid value " + invalidAmount)).get(0);
    Schema secondErrorSchema = Schema.builder().addRowField(ROW, midResultType).addStringField(ERROR).build();
    Row failedOnSecondUdfElement = TestTableUtils.buildRows(secondErrorSchema, Arrays.asList(TestTableUtils.buildRows(midResultType, Arrays.asList("3", "US", invalidCurrency, 100L)).get(0), "Currency isn't supported " + invalidCurrency)).get(0);
    Mockito.verify(mock, times(2)).expand(captor.capture());
    PAssert.that(captor.getAllValues().get(0)).containsInAnyOrder(failedOnFirstUdfElement);
    PAssert.that(captor.getAllValues().get(1)).containsInAnyOrder(failedOnSecondUdfElement);
    pipeline.run().waitUntilFinish();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) Schema(org.apache.beam.sdk.schemas.Schema) Row(org.apache.beam.sdk.values.Row) PTransform(org.apache.beam.sdk.transforms.PTransform) Test(org.junit.Test)
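In place of the Mockito spy, a production pipeline would hand withErrorsTransformer a concrete transform. A minimal sketch, assuming the transformer maps PCollection<Row> to PCollection<Row> (the LogErrors name is hypothetical, and the exact generic bound accepted may differ across Beam versions):

public static class LogErrors extends PTransform<PCollection<Row>, PCollection<Row>> {

    @Override
    public PCollection<Row> expand(PCollection<Row> errors) {
        // Each error row pairs the failed input row with an error message;
        // log it and re-emit so a downstream sink could persist dead letters.
        return errors.apply("LogErrors", ParDo.of(new DoFn<Row, Row>() {

            @ProcessElement
            public void processElement(@Element Row row, OutputReceiver<Row> out) {
                System.err.println("Beam SQL error row: " + row);
                out.output(row);
            }
        }));
    }
}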

Example 23 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

From class WriteWithShardingFactoryTest, method withNoShardingSpecifiedReturnsNewTransform.

@Test
public void withNoShardingSpecifiedReturnsNewTransform() {
    ResourceId outputDirectory = LocalResources.fromString("/foo", true);
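    // Build a WriteFiles transform with no sharding configured; the sink's
    // createWriteOperation must never run during this construction-time test.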
    PTransform<PCollection<Object>, WriteFilesResult<Void>> original = WriteFiles.to(new FileBasedSink<Object, Void, Object>(StaticValueProvider.of(outputDirectory), DynamicFileDestinations.constant(new FakeFilenamePolicy())) {

        @Override
        public WriteOperation<Void, Object> createWriteOperation() {
            throw new IllegalArgumentException("Should not be used");
        }
    });
    @SuppressWarnings("unchecked") PCollection<Object> objs = (PCollection) p.apply(Create.empty(VoidCoder.of()));
    AppliedPTransform<PCollection<Object>, WriteFilesResult<Void>, PTransform<PCollection<Object>, WriteFilesResult<Void>>> originalApplication = AppliedPTransform.of("write", PValues.expandInput(objs), Collections.emptyMap(), original, ResourceHints.create(), p);
    assertThat(factory.getReplacementTransform(originalApplication).getTransform(), not(equalTo((Object) original)));
}
Also used : WriteFilesResult(org.apache.beam.sdk.io.WriteFilesResult) PCollection(org.apache.beam.sdk.values.PCollection) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) PTransform(org.apache.beam.sdk.transforms.PTransform) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) Test(org.junit.Test)
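For contrast, a pipeline author can pin the shard count explicitly rather than leave sharding to a replacement factory; a sketch under the assumption that sink stands for the anonymous FileBasedSink constructed inline above:

    // 'sink' is hypothetical shorthand for the FileBasedSink built above.
    PTransform<PCollection<Object>, WriteFilesResult<Void>> explicitlySharded = WriteFiles.to(sink).withNumShards(3);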

Example 24 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

From class DataflowPipelineTranslatorTest, method testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker.

@Test
public void testStreamingGroupIntoBatchesWithShardedKeyTranslationUnifiedWorker() throws Exception {
    List<String> experiments = new ArrayList<>(ImmutableList.of(GcpOptions.STREAMING_ENGINE_EXPERIMENT, GcpOptions.WINDMILL_SERVICE_EXPERIMENT, "use_runner_v2"));
    JobSpecification jobSpec = runStreamingGroupIntoBatchesAndGetJobSpec(true, experiments);
    List<Step> steps = jobSpec.getJob().getSteps();
    Step shardedStateStep = steps.get(steps.size() - 1);
    Map<String, Object> properties = shardedStateStep.getProperties();
    assertTrue(properties.containsKey(PropertyNames.USES_KEYED_STATE));
    assertTrue(properties.containsKey(PropertyNames.ALLOWS_SHARDABLE_STATE));
    assertEquals("true", getString(properties, PropertyNames.ALLOWS_SHARDABLE_STATE));
    assertTrue(properties.containsKey(PropertyNames.PRESERVES_KEYS));
    assertEquals("true", getString(properties, PropertyNames.PRESERVES_KEYS));
    // Also checks the runner proto is correctly populated.
    Map<String, RunnerApi.PTransform> transformMap = jobSpec.getPipelineProto().getComponents().getTransformsMap();
    boolean transformFound = false;
    for (Map.Entry<String, RunnerApi.PTransform> transform : transformMap.entrySet()) {
        RunnerApi.FunctionSpec spec = transform.getValue().getSpec();
        if (spec.getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_WITH_SHARDED_KEY_URN)) {
            for (String subtransform : transform.getValue().getSubtransformsList()) {
                RunnerApi.PTransform ptransform = transformMap.get(subtransform);
                if (ptransform.getSpec().getUrn().equals(PTransformTranslation.GROUP_INTO_BATCHES_URN)) {
                    transformFound = true;
                }
            }
        }
    }
    assertTrue(transformFound);
    boolean coderFound = false;
    Map<String, RunnerApi.Coder> coderMap = jobSpec.getPipelineProto().getComponents().getCodersMap();
    for (Map.Entry<String, RunnerApi.Coder> coder : coderMap.entrySet()) {
        if (coder.getValue().getSpec().getUrn().equals(ModelCoders.SHARDED_KEY_CODER_URN)) {
            coderFound = true;
        }
    }
    assertTrue(coderFound);
}
Also used : SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) ArrayList(java.util.ArrayList) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Step(com.google.api.services.dataflow.model.Step) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) JobSpecification(org.apache.beam.runners.dataflow.DataflowPipelineTranslator.JobSpecification) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) PTransform(org.apache.beam.sdk.transforms.PTransform) Test(org.junit.Test)
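The two URN scans above follow the same pattern and could be factored into a small helper; a sketch (containsTransformUrn is our name, not a Beam API):

private static boolean containsTransformUrn(RunnerApi.Components components, String urn) {
    // True if any transform in the pipeline proto carries the given URN.
    return components.getTransformsMap().values().stream().anyMatch(transform -> urn.equals(transform.getSpec().getUrn()));
}

With it, the coder check collapses to a one-liner over getCodersMap(); the transform loop keeps its nested walk only because it must also inspect the subtransforms of the matching composite.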

Example 25 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

From class ParDoTranslatorBatch, method translateTransform.

@Override
public void translateTransform(PTransform<PCollection<InputT>, PCollectionTuple> transform, AbstractTranslationContext context) {
    String stepName = context.getCurrentTransform().getFullName();
    // Check for not supported advanced features
    // TODO: add support of Splittable DoFn
    DoFn<InputT, OutputT> doFn = getDoFn(context);
    checkState(!DoFnSignatures.isSplittable(doFn), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
    // TODO: add support of states and timers
    checkState(!DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
    checkState(!DoFnSignatures.requiresTimeSortedInput(doFn), "@RequiresTimeSortedInput is not " + "supported for the moment");
    DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
    // Init main variables
    PValue input = context.getInput();
    Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
    Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
    TupleTag<?> mainOutputTag = getTupleTag(context);
    List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
    WindowingStrategy<?, ?> windowingStrategy = ((PCollection<InputT>) input).getWindowingStrategy();
    Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
    Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();
    // construct a map from side input to WindowingStrategy so that
    // the DoFn runner can map main-input windows to side input windows
    List<PCollectionView<?>> sideInputs = getSideInputs(context);
    Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
    for (PCollectionView<?> sideInput : sideInputs) {
        sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
    }
    SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);
    Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
    MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
    List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
    for (TupleTag<?> tag : outputTags) {
        if (!tag.equals(mainOutputTag)) {
            additionalOutputTags.add(tag);
        }
    }
    Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
    @SuppressWarnings("unchecked") DoFnFunction<InputT, OutputT> doFnWrapper = new DoFnFunction(metricsAccum, stepName, doFn, windowingStrategy, sideInputStrategies, context.getSerializableOptions(), additionalOutputTags, mainOutputTag, inputCoder, outputCoderMap, broadcastStateData, doFnSchemaInformation, sideInputMapping);
    MultiOutputCoder multipleOutputCoder = MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
    Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs = inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
    if (outputs.entrySet().size() > 1) {
        allOutputs.persist();
        for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
            pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
        }
    } else {
        Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
        Coder<WindowedValue<?>> windowedValueCoder = (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
        Dataset<WindowedValue<?>> outputDataset = allOutputs.map((MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2, EncoderHelpers.fromBeamCoder(windowedValueCoder));
        context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
    }
}
Also used : SideInputBroadcast(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.SideInputBroadcast) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Dataset(org.apache.spark.sql.Dataset) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) AbstractTranslationContext(org.apache.beam.runners.spark.structuredstreaming.translation.AbstractTranslationContext) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ArrayList(java.util.ArrayList) PTransform(org.apache.beam.sdk.transforms.PTransform) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) EncoderHelpers(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.EncoderHelpers) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) MultiOutputCoder(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.MultiOutputCoder) CoderHelpers(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.CoderHelpers) MapFunction(org.apache.spark.api.java.function.MapFunction) ParDoTranslation(org.apache.beam.runners.core.construction.ParDoTranslation) DoFn(org.apache.beam.sdk.transforms.DoFn) MetricsAccumulator(org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsAccumulator) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) Tuple2(scala.Tuple2) List(java.util.List) PValue(org.apache.beam.sdk.values.PValue) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) TransformTranslator(org.apache.beam.runners.spark.structuredstreaming.translation.TransformTranslator) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsContainerStepMapAccumulator) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) FilterFunction(org.apache.spark.api.java.function.FilterFunction)
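For reference, a sketch of what the per-tag pruning in the multi-output branch amounts to. pruneOutputFilteredByTag is internal to the translator; tag and taggedCoder below are illustrative placeholders for the output tag being selected and the coder of its windowed values.

Dataset<WindowedValue<?>> outputForTag = allOutputs.filter((FilterFunction<Tuple2<TupleTag<?>, WindowedValue<?>>>) value -> value._1.equals(tag)).map((MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2, EncoderHelpers.fromBeamCoder(taggedCoder));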

Aggregations

PTransform (org.apache.beam.sdk.transforms.PTransform): 41
PCollection (org.apache.beam.sdk.values.PCollection): 29
Test (org.junit.Test): 18
AppliedPTransform (org.apache.beam.sdk.runners.AppliedPTransform): 11
PBegin (org.apache.beam.sdk.values.PBegin): 11
IOException (java.io.IOException): 10
ArrayList (java.util.ArrayList): 10
List (java.util.List): 10
Map (java.util.Map): 10
TupleTag (org.apache.beam.sdk.values.TupleTag): 10
DoFn (org.apache.beam.sdk.transforms.DoFn): 9
Coder (org.apache.beam.sdk.coders.Coder): 8
Create (org.apache.beam.sdk.transforms.Create): 8
ParDo (org.apache.beam.sdk.transforms.ParDo): 7
PDone (org.apache.beam.sdk.values.PDone): 7
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 6
Collection (java.util.Collection): 5
HashMap (java.util.HashMap): 5
Collectors.toList (java.util.stream.Collectors.toList): 5
Schema (org.apache.beam.sdk.schemas.Schema): 5