Search in sources:

Example 96 with DoFn

use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.

Class ConfigGeneratorTest, method testUserStoreConfig.

/**
 * Checks the store configuration generated for a stateful {@link DoFn}: the factory and serde
 * entries for the {@code testState} store, and the changelog entry, which is absent until
 * {@code setStateDurable(true)} is applied to the options.
 */
@Test
public void testUserStoreConfig() {
    // Pipeline options for a Samza job named "TestStoreConfig".
    SamzaPipelineOptions pipelineOptions = PipelineOptionsFactory.create().as(SamzaPipelineOptions.class);
    pipelineOptions.setJobName("TestStoreConfig");
    pipelineOptions.setRunner(SamzaRunner.class);
    Pipeline pipeline = Pipeline.create(pipelineOptions);

    // A no-op DoFn whose only purpose is to declare one value state cell.
    DoFn<KV<String, String>, Void> statefulFn = new DoFn<KV<String, String>, Void>() {

        private static final String STATE_ID = "testState";

        @StateId(STATE_ID)
        private final StateSpec<ValueState<Integer>> state = StateSpecs.value();

        @ProcessElement
        public void processElement(ProcessContext context, @StateId(STATE_ID) ValueState<Integer> valueState) {
        }
    };
    pipeline
        .apply(Create.empty(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.strings())))
        .apply(ParDo.of(statefulFn));

    final Map<PValue, String> viewIds = PViewToIdMapper.buildIdMap(pipeline);
    final ConfigBuilder builder = new ConfigBuilder(pipelineOptions);

    // First pass: state durability has not been enabled on the options.
    SamzaPipelineTranslator.createConfig(pipeline, pipelineOptions, viewIds, builder);
    final Config initialConfig = builder.build();
    assertEquals(RocksDbKeyValueStorageEngineFactory.class.getName(), initialConfig.get("stores.testState.factory"));
    assertEquals("byteArraySerde", initialConfig.get("stores.testState.key.serde"));
    assertEquals("stateValueSerde", initialConfig.get("stores.testState.msg.serde"));
    assertNull(initialConfig.get("stores.testState.changelog"));

    // Second pass: with durable state enabled, a changelog entry is expected.
    pipelineOptions.setStateDurable(true);
    SamzaPipelineTranslator.createConfig(pipeline, pipelineOptions, viewIds, builder);
    final Config durableConfig = builder.build();
    assertEquals("TestStoreConfig-1-testState-changelog", durableConfig.get("stores.testState.changelog"));
}
Also used : ZkConfig(org.apache.samza.config.ZkConfig) JobCoordinatorConfig(org.apache.samza.config.JobCoordinatorConfig) Config(org.apache.samza.config.Config) RocksDbKeyValueStorageEngineFactory(org.apache.samza.storage.kv.RocksDbKeyValueStorageEngineFactory) PValue(org.apache.beam.sdk.values.PValue) Pipeline(org.apache.beam.sdk.Pipeline) StateSpec(org.apache.beam.sdk.state.StateSpec) DoFn(org.apache.beam.sdk.transforms.DoFn) ValueState(org.apache.beam.sdk.state.ValueState) SamzaPipelineOptions(org.apache.beam.runners.samza.SamzaPipelineOptions) Test(org.junit.Test)

Example 97 with DoFn

use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.

Class WindowTest, method testTimestampCombinerDefault.

/**
 * Groups two elements that share a key inside one ten-minute fixed window and asserts that the
 * grouped output carries the window's maximum timestamp, i.e. the default timestamp combiner
 * assigns the end of the window.
 */
@Test
@Category(ValidatesRunner.class)
public void testTimestampCombinerDefault() {
    pipeline.enableAbandonedNodeEnforcement(true);
    pipeline
        .apply(
            Create.timestamped(
                TimestampedValue.of(KV.of(0, "hello"), new Instant(0)),
                TimestampedValue.of(KV.of(0, "goodbye"), new Instant(10))))
        .apply(Window.into(FixedWindows.of(Duration.standardMinutes(10))))
        .apply(GroupByKey.create())
        .apply(
            ParDo.of(
                new DoFn<KV<Integer, Iterable<String>>, Void>() {

                    @ProcessElement
                    public void processElement(ProcessContext context) throws Exception {
                        // Both inputs (timestamps 0 and 10) lie in the window starting at 0.
                        Instant windowStart = new Instant(0);
                        Instant windowEnd = windowStart.plus(Duration.standardMinutes(10));
                        IntervalWindow expectedWindow = new IntervalWindow(windowStart, windowEnd);
                        assertThat(context.timestamp(), equalTo(expectedWindow.maxTimestamp()));
                    }
                }));
    pipeline.run();
}
Also used : DoFn(org.apache.beam.sdk.transforms.DoFn) Instant(org.joda.time.Instant) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)

Example 98 with DoFn

use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.

Class BeamWindowRel, method aggField.

/**
 * Builds the {@link DoFn} that evaluates one window aggregation over a partition of rows.
 *
 * <p>For every row of the input partition the function selects that row's frame (by position
 * when {@code fieldAgg.rows} is set, otherwise by order-by value), folds the frame through
 * {@code fieldAgg.combineFn}, and outputs a row built against {@code expectedSchema} holding the
 * original row's values followed by the aggregate result.
 *
 * @param expectedSchema schema used to build each output row
 * @param fieldAgg aggregation description; this method reads its {@code rows},
 *     {@code combineFn}, {@code inputFields}, {@code orderKeys}, {@code lowerLimit} and
 *     {@code upperLimit} members
 * @return a {@code DoFn} that emits one output row per row of each input partition
 */
private static DoFn<List<Row>, Row> aggField(final Schema expectedSchema, final FieldAggregation fieldAgg) {
    return new DoFn<List<Row>, Row>() {

        @ProcessElement
        public void processElement(@Element List<Row> inputPartition, OutputReceiver<Row> out, ProcessContext c) {
            // The value index is only needed for value-based frames; build it once per partition.
            final NavigableMap<BigDecimal, List<Row>> indexRange = fieldAgg.rows ? null : indexRows(inputPartition);
            // Neither of these depends on the current row, so resolve them once up front.
            final BeamBuiltinAnalyticFunctions.PositionAwareCombineFn positionAwareFn =
                fieldAgg.combineFn instanceof BeamBuiltinAnalyticFunctions.PositionAwareCombineFn
                    ? (BeamBuiltinAnalyticFunctions.PositionAwareCombineFn) fieldAgg.combineFn
                    : null;
            // If no inputs are needed, use a mock field index.
            final int aggFieldIndex = fieldAgg.inputFields.isEmpty() ? -1 : fieldAgg.inputFields.get(0);
            final int partitionSize = inputPartition.size();

            for (int idx = 0; idx < partitionSize; idx++) {
                final Row processingRow = inputPartition.get(idx);
                final List<Row> aggRange =
                    fieldAgg.rows ? getRows(inputPartition, idx) : getRange(indexRange, processingRow);

                Object accumulator = fieldAgg.combineFn.createAccumulator();
                // Zero-based position of the frame row currently being added.
                long count = 0;
                for (Row aggRow : aggRange) {
                    if (positionAwareFn != null) {
                        accumulator = positionAwareFn.addInput(accumulator, getOrderByValue(aggRow), count, (long) idx, (long) partitionSize);
                    } else {
                        accumulator = fieldAgg.combineFn.addInput(accumulator, aggRow.getBaseValue(aggFieldIndex));
                    }
                    count++;
                }
                Object result = fieldAgg.combineFn.extractOutput(accumulator);

                // Reserve one extra slot: the aggregate is appended after the row's own values.
                List<Object> fieldValues = Lists.newArrayListWithCapacity(processingRow.getFieldCount() + 1);
                fieldValues.addAll(processingRow.getValues());
                fieldValues.add(result);
                out.output(Row.withSchema(expectedSchema).addValues(fieldValues).build());
            }
        }

        /**
         * Groups the rows by order-by value in a sorted map, preserving input order within each
         * group.
         */
        private NavigableMap<BigDecimal, List<Row>> indexRows(List<Row> input) {
            NavigableMap<BigDecimal, List<Row>> map = new TreeMap<>();
            for (Row r : input) {
                BigDecimal orderByValue = getOrderByValue(r);
                if (orderByValue == null) {
                    // Special case agg(X) OVER (): no order key, so file every row under a dummy value.
                    orderByValue = BigDecimal.ZERO;
                }
                map.computeIfAbsent(orderByValue, key -> Lists.newArrayList()).add(r);
            }
            return map;
        }

        /**
         * Collects the rows whose order-by value lies within the inclusive range
         * {@code [value - lowerLimit, value + upperLimit]} around {@code aRow}'s value. A
         * {@code null} limit leaves that side open; a {@code null} order-by value selects every
         * indexed row.
         */
        private List<Row> getRange(NavigableMap<BigDecimal, List<Row>> indexRanges, Row aRow) {
            final BigDecimal currentRowValue = getOrderByValue(aRow);
            NavigableMap<BigDecimal, List<Row>> subMap = indexRanges;
            if (currentRowValue != null) {
                final BigDecimal ll = fieldAgg.lowerLimit != null ? currentRowValue.subtract(fieldAgg.lowerLimit) : null;
                final BigDecimal ul = fieldAgg.upperLimit != null ? currentRowValue.add(fieldAgg.upperLimit) : null;
                if (ll != null && ul != null) {
                    if (ll.compareTo(ul) > 0) {
                        // subMap() rejects an inverted range; treat it as an empty frame instead.
                        // NOTE(review): only reachable if the limits can be negative - confirm.
                        return Lists.newArrayList();
                    }
                    subMap = indexRanges.subMap(ll, true, ul, true);
                } else if (ll != null) {
                    subMap = indexRanges.tailMap(ll, true);
                } else if (ul != null) {
                    subMap = indexRanges.headMap(ul, true);
                }
            }
            List<Row> result = Lists.newArrayList();
            for (List<Row> partialList : subMap.values()) {
                result.addAll(partialList);
            }
            return result;
        }

        /**
         * Returns the value of the first order key of {@code r} as a {@link BigDecimal}, or
         * {@code null} when there are no order keys.
         */
        private BigDecimal getOrderByValue(Row r) {
            // Special case: this query is transformed by calcite as follows: agg(X) over () ->
            // agg(X) over (RANGE BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING). There are
            // no orderKeys, so return null.
            if (fieldAgg.orderKeys.size() == 0) {
                return null;
            } else {
                return new BigDecimal(((Number) r.getBaseValue(fieldAgg.orderKeys.get(0))).toString());
            }
        }

        /**
         * Returns the view {@code input[index - lowerLimit, index + upperLimit]} (inclusive),
         * clamped to the list bounds. A {@code null} limit, or one whose {@code intValue()} is
         * {@code Integer.MAX_VALUE}, leaves that side unbounded. A frame that falls entirely
         * outside the list yields an empty view instead of making {@code subList} throw.
         */
        private List<Row> getRows(List<Row> input, int index) {
            final int size = input.size();
            final int lowerOffset = fieldAgg.lowerLimit != null ? fieldAgg.lowerLimit.intValue() : Integer.MAX_VALUE;
            final int upperOffset = fieldAgg.upperLimit != null ? fieldAgg.upperLimit.intValue() : Integer.MAX_VALUE;
            // Use long arithmetic so index +/- offset cannot wrap around for extreme offsets.
            final long lower = lowerOffset == Integer.MAX_VALUE ? 0L : (long) index - lowerOffset;
            final long upper = upperOffset == Integer.MAX_VALUE ? size : (long) index + upperOffset + 1L;
            // Clamp both ends into [0, size] and never let the upper end fall below the lower one.
            final int lowerIndex = (int) Math.max(0L, Math.min(lower, size));
            final int upperIndex = (int) Math.max(lowerIndex, Math.min(upper, size));
            return input.subList(lowerIndex, upperIndex);
        }
    };
}
Also used : NavigableMap(java.util.NavigableMap) TreeMap(java.util.TreeMap) BeamBuiltinAnalyticFunctions(org.apache.beam.sdk.extensions.sql.impl.transform.BeamBuiltinAnalyticFunctions) BigDecimal(java.math.BigDecimal) DoFn(org.apache.beam.sdk.transforms.DoFn) PCollectionList(org.apache.beam.sdk.values.PCollectionList) List(java.util.List) Row(org.apache.beam.sdk.values.Row)

Example 99 with DoFn

use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.

Class DataflowPipelineTranslatorTest, method testStepDisplayData.

/**
 * Translates a pipeline of two {@link ParDo}s whose {@link DoFn}s register display data and
 * checks that the {@code display_data} property of each resulting step contains exactly the
 * expected items.
 */
@Test
public void testStepDisplayData() throws Exception {
    DataflowPipelineOptions options = buildPipelineOptions();
    DataflowPipelineTranslator translator = DataflowPipelineTranslator.fromOptions(options);
    Pipeline pipeline = Pipeline.create(options);

    // First identity DoFn: registers a string item and a labelled, linked class item.
    DoFn<Integer, Integer> fn1 = new DoFn<Integer, Integer>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            c.output(c.element());
        }

        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
            builder
                .add(DisplayData.item("foo", "bar"))
                .add(
                    DisplayData.item("foo2", DataflowPipelineTranslatorTest.class)
                        .withLabel("Test Class")
                        .withLinkUrl("http://www.google.com"));
        }
    };

    // Second identity DoFn: registers a single integer item.
    DoFn<Integer, Integer> fn2 = new DoFn<Integer, Integer>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            c.output(c.element());
        }

        @Override
        public void populateDisplayData(DisplayData.Builder builder) {
            builder.add(DisplayData.item("foo3", 1234));
        }
    };

    ParDo.SingleOutput<Integer, Integer> parDo1 = ParDo.of(fn1);
    ParDo.SingleOutput<Integer, Integer> parDo2 = ParDo.of(fn2);
    pipeline.apply(Create.of(1, 2, 3)).apply(parDo1).apply(parDo2);

    DataflowRunner runner = DataflowRunner.fromOptions(options);
    runner.replaceV1Transforms(pipeline);
    SdkComponents sdkComponents = createSdkComponents(options);
    RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(pipeline, sdkComponents, true);
    Job job =
        translator
            .translate(pipeline, pipelineProto, sdkComponents, runner, Collections.emptyList())
            .getJob();
    assertAllStepOutputsHaveUniqueIds(job);

    // Three steps are expected; the properties of the steps at index 1 and 2 are inspected.
    List<Step> steps = job.getSteps();
    assertEquals(3, steps.size());
    Map<String, Object> parDo1Properties = steps.get(1).getProperties();
    Map<String, Object> parDo2Properties = steps.get(2).getProperties();
    assertThat(parDo1Properties, hasKey("display_data"));

    @SuppressWarnings("unchecked")
    Collection<Map<String, String>> actualFn1Items =
        (Collection<Map<String, String>>) parDo1Properties.get("display_data");
    @SuppressWarnings("unchecked")
    Collection<Map<String, String>> actualFn2Items =
        (Collection<Map<String, String>>) parDo2Properties.get("display_data");

    // Expected items for the first step.
    ImmutableMap<String, Object> fooItem =
        ImmutableMap.<String, Object>builder()
            .put("key", "foo")
            .put("type", "STRING")
            .put("value", "bar")
            .put("namespace", fn1.getClass().getName())
            .build();
    ImmutableMap<String, Object> fn1Item =
        ImmutableMap.<String, Object>builder()
            .put("key", "fn")
            .put("label", "Transform Function")
            .put("type", "JAVA_CLASS")
            .put("value", fn1.getClass().getName())
            .put("shortValue", fn1.getClass().getSimpleName())
            .put("namespace", parDo1.getClass().getName())
            .build();
    ImmutableMap<String, Object> foo2Item =
        ImmutableMap.<String, Object>builder()
            .put("key", "foo2")
            .put("type", "JAVA_CLASS")
            .put("value", DataflowPipelineTranslatorTest.class.getName())
            .put("shortValue", DataflowPipelineTranslatorTest.class.getSimpleName())
            .put("namespace", fn1.getClass().getName())
            .put("label", "Test Class")
            .put("linkUrl", "http://www.google.com")
            .build();
    ImmutableSet<ImmutableMap<String, Object>> expectedFn1DisplayData =
        ImmutableSet.of(fooItem, fn1Item, foo2Item);

    // Expected items for the second step.
    ImmutableMap<String, Object> fn2Item =
        ImmutableMap.<String, Object>builder()
            .put("key", "fn")
            .put("label", "Transform Function")
            .put("type", "JAVA_CLASS")
            .put("value", fn2.getClass().getName())
            .put("shortValue", fn2.getClass().getSimpleName())
            .put("namespace", parDo2.getClass().getName())
            .build();
    ImmutableMap<String, Object> foo3Item =
        ImmutableMap.<String, Object>builder()
            .put("key", "foo3")
            .put("type", "INTEGER")
            .put("value", 1234L)
            .put("namespace", fn2.getClass().getName())
            .build();
    ImmutableSet<ImmutableMap<String, Object>> expectedFn2DisplayData =
        ImmutableSet.of(fn2Item, foo3Item);

    assertEquals(expectedFn1DisplayData, ImmutableSet.copyOf(actualFn1Items));
    assertEquals(expectedFn2DisplayData, ImmutableSet.copyOf(actualFn2Items));
}
Also used : Step(com.google.api.services.dataflow.model.Step) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) Job(com.google.api.services.dataflow.model.Job) DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) ParDo(org.apache.beam.sdk.transforms.ParDo) Collection(java.util.Collection) PCollection(org.apache.beam.sdk.values.PCollection) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) Test(org.junit.Test)

Example 100 with DoFn

use of org.apache.beam.sdk.transforms.DoFn in project beam by apache.

Class BatchStatefulParDoOverridesTest, method testFnApiSingleOutputOverrideNonCrashing.

/**
 * Applies a stateful {@link DoFn} through a single-output {@link ParDo}, lets the
 * {@link DataflowRunner} replace V1 transforms, and asserts that
 * {@code findBatchStatefulDoFn} then yields a {@code DoFn} equal to the one that was applied.
 */
@Test
public void testFnApiSingleOutputOverrideNonCrashing() throws Exception {
    DataflowPipelineOptions pipelineOptions = buildPipelineOptions();
    pipelineOptions.setRunner(DataflowRunner.class);
    Pipeline p = Pipeline.create(pipelineOptions);

    DummyStatefulDoFn statefulFn = new DummyStatefulDoFn();
    p.apply(Create.of(KV.of(1, 2))).apply(ParDo.of(statefulFn));

    // Run the runner's transform replacement over the constructed pipeline.
    DataflowRunner.fromOptions(pipelineOptions).replaceV1Transforms(p);

    assertThat(findBatchStatefulDoFn(p), equalTo((DoFn) statefulFn));
}
Also used : DataflowPipelineOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineOptions) DoFn(org.apache.beam.sdk.transforms.DoFn) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

DoFn (org.apache.beam.sdk.transforms.DoFn)154 Test (org.junit.Test)98 Pipeline (org.apache.beam.sdk.Pipeline)60 KV (org.apache.beam.sdk.values.KV)45 TupleTag (org.apache.beam.sdk.values.TupleTag)28 StateSpec (org.apache.beam.sdk.state.StateSpec)26 Instant (org.joda.time.Instant)26 ArrayList (java.util.ArrayList)23 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)23 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)22 PCollection (org.apache.beam.sdk.values.PCollection)21 TimerSpec (org.apache.beam.sdk.state.TimerSpec)19 WindowedValue (org.apache.beam.sdk.util.WindowedValue)18 PCollectionView (org.apache.beam.sdk.values.PCollectionView)18 HashMap (java.util.HashMap)17 Coder (org.apache.beam.sdk.coders.Coder)17 List (java.util.List)16 Map (java.util.Map)14 ValueState (org.apache.beam.sdk.state.ValueState)14 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)13