
Example 96 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

From class CombineTest, method testCombinePerKeyWithHotKeyFanoutPrimitiveDisplayData.

@Test
@Category(ValidatesRunner.class)
public void testCombinePerKeyWithHotKeyFanoutPrimitiveDisplayData() {
    int hotKeyFanout = 2;
    DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
    CombineTest.UniqueInts combineFn = new CombineTest.UniqueInts();
    PTransform<PCollection<KV<Integer, Integer>>, PCollection<KV<Integer, Set<Integer>>>> combine =
        Combine.<Integer, Integer, Set<Integer>>perKey(combineFn).withHotKeyFanout(hotKeyFanout);
    Set<DisplayData> displayData = evaluator.displayDataForPrimitiveTransforms(
        combine, KvCoder.of(VarIntCoder.of(), VarIntCoder.of()));
    assertThat(
        "Combine.perKey.withHotKeyFanout should include the combineFn in its primitive transform",
        displayData, hasItem(hasDisplayItem("combineFn", combineFn.getClass())));
    assertThat(
        "Combine.perKey.withHotKeyFanout(int) should include the fanout in its primitive transform",
        displayData, hasItem(hasDisplayItem("fanout", hotKeyFanout)));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) ImmutableSet(com.google.common.collect.ImmutableSet) Set(java.util.Set) HashSet(java.util.HashSet) DisplayDataEvaluator(org.apache.beam.sdk.transforms.display.DisplayDataEvaluator) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
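
A note on the API under test: Combine.perKey(...).withHotKeyFanout(n) spreads each hot key over n intermediate keys, partially combines, then merges the partial results. A minimal pipeline sketch using that call chain might look as follows; the Sum.ofIntegers() combiner and the literal input values are illustrative assumptions, not part of the test above.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Sum;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class HotKeyFanoutSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        // Illustrative keyed input; the test above only checks display data and uses
        // KvCoder.of(VarIntCoder.of(), VarIntCoder.of()) as the input coder.
        PCollection<KV<Integer, Integer>> scores =
            p.apply(Create.of(KV.of(1, 10), KV.of(1, 20), KV.of(2, 5)));
        // Pre-aggregate each hot key on 2 intermediate keys before the final per-key
        // combine, mirroring withHotKeyFanout(hotKeyFanout) in the test.
        PCollection<KV<Integer, Integer>> totals = scores.apply(
            Combine.<Integer, Integer, Integer>perKey(Sum.ofIntegers()).withHotKeyFanout(2));
        p.run().waitUntilFinish();
    }
}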

Example 97 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

From class CombineTest, method testCombinePerKeyPrimitiveDisplayData.

@Test
@Category(ValidatesRunner.class)
public void testCombinePerKeyPrimitiveDisplayData() {
    DisplayDataEvaluator evaluator = DisplayDataEvaluator.create();
    CombineTest.UniqueInts combineFn = new CombineTest.UniqueInts();
    PTransform<PCollection<KV<Integer, Integer>>, ? extends POutput> combine = Combine.perKey(combineFn);
    Set<DisplayData> displayData = evaluator.displayDataForPrimitiveTransforms(
        combine, KvCoder.of(VarIntCoder.of(), VarIntCoder.of()));
    assertThat(
        "Combine.perKey should include the combineFn in its primitive transform",
        displayData, hasItem(hasDisplayItem("combineFn", combineFn.getClass())));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) DisplayDataEvaluator(org.apache.beam.sdk.transforms.display.DisplayDataEvaluator) DisplayData(org.apache.beam.sdk.transforms.display.DisplayData) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
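
CombineTest.UniqueInts, referenced by both tests above, is a helper defined elsewhere in CombineTest and not shown here. A CombineFn of the same shape (collecting the distinct integers per key into a Set) could be sketched roughly as below; this is an illustrative stand-in, not the actual test helper.

import java.util.HashSet;
import java.util.Set;
import org.apache.beam.sdk.transforms.Combine;

// Illustrative stand-in for CombineTest.UniqueInts: accumulates the distinct
// integers seen for a key into a Set<Integer>.
public class UniqueIntsSketch extends Combine.CombineFn<Integer, Set<Integer>, Set<Integer>> {

    @Override
    public Set<Integer> createAccumulator() {
        return new HashSet<>();
    }

    @Override
    public Set<Integer> addInput(Set<Integer> accumulator, Integer input) {
        accumulator.add(input);
        return accumulator;
    }

    @Override
    public Set<Integer> mergeAccumulators(Iterable<Set<Integer>> accumulators) {
        Set<Integer> merged = new HashSet<>();
        for (Set<Integer> accumulator : accumulators) {
            merged.addAll(accumulator);
        }
        return merged;
    }

    @Override
    public Set<Integer> extractOutput(Set<Integer> accumulator) {
        return accumulator;
    }

    // In a real pipeline the Set<Integer> accumulator coder may need to be supplied
    // explicitly (for example by overriding getAccumulatorCoder); this sketch omits that.
}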

Example 98 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project DataflowJavaSDK-examples by GoogleCloudPlatform.

From class GameStats, method main.

public static void main(String[] args) throws Exception {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    // Enforce that this pipeline is always run in streaming mode.
    options.setStreaming(true);
    ExampleUtils exampleUtils = new ExampleUtils(options);
    Pipeline pipeline = Pipeline.create(options);
    // Read Events from Pub/Sub using custom timestamps
    // Read Events from Pub/Sub using custom timestamps
    PCollection<GameActionInfo> rawEvents = pipeline
        .apply(PubsubIO.readStrings()
            .withTimestampAttribute(TIMESTAMP_ATTRIBUTE)
            .fromTopic(options.getTopic()))
        .apply("ParseGameEvent", ParDo.of(new ParseEventFn()));
    // Extract username/score pairs from the event stream
    PCollection<KV<String, Integer>> userEvents = rawEvents.apply(
        "ExtractUserScore",
        MapElements
            .into(TypeDescriptors.kvs(TypeDescriptors.strings(), TypeDescriptors.integers()))
            .via((GameActionInfo gInfo) -> KV.of(gInfo.getUser(), gInfo.getScore())));
    // Calculate the total score per user over fixed windows, and
    // cumulative updates for late data.
    final PCollectionView<Map<String, Integer>> spammersView = userEvents
        .apply("FixedWindowsUser", Window.<KV<String, Integer>>into(
            FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
        .apply("CalculateSpammyUsers", new CalculateSpammyUsers())
        .apply("CreateSpammersView", View.<String, Integer>asMap());
    // [START DocInclude_FilterAndCalc]
    // Calculate the total score per team over fixed windows,
    // and emit cumulative updates for late data. Uses the side input derived above-- the set of
    // suspected robots-- to filter out scores from those users from the sum.
    // Write the results to BigQuery.
    rawEvents
        .apply("WindowIntoFixedWindows", Window.<GameActionInfo>into(
            FixedWindows.of(Duration.standardMinutes(options.getFixedWindowDuration()))))
        .apply("FilterOutSpammers", ParDo.of(new DoFn<GameActionInfo, GameActionInfo>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
                // If the user is not in the spammers Map, output the data element.
                if (c.sideInput(spammersView).get(c.element().getUser().trim()) == null) {
                    c.output(c.element());
                }
            }
        }).withSideInputs(spammersView))
        .apply("ExtractTeamScore", new ExtractAndSumScore("team"))
        .apply("WriteTeamSums", new WriteWindowedToBigQuery<KV<String, Integer>>(
            options.as(GcpOptions.class).getProject(), options.getDataset(),
            options.getGameStatsTablePrefix() + "_team", configureWindowedWrite()));
    // [START DocInclude_SessionCalc]
    // Detect user sessions-- that is, a burst of activity separated by a gap from further
    // activity. Find and record the mean session lengths.
    // This information could help the game designers track the changing user engagement
    // as their set of games changes.
    userEvents
        .apply("WindowIntoSessions", Window.<KV<String, Integer>>into(
                Sessions.withGapDuration(Duration.standardMinutes(options.getSessionGap())))
            .withTimestampCombiner(TimestampCombiner.END_OF_WINDOW))
        .apply(Combine.perKey(x -> 0))
        .apply("UserSessionActivity", ParDo.of(new UserSessionInfoFn()))
        .apply("WindowToExtractSessionMean", Window.<Integer>into(
            FixedWindows.of(Duration.standardMinutes(options.getUserActivityWindowDuration()))))
        .apply(Mean.<Integer>globally().withoutDefaults())
        .apply("WriteAvgSessionLength", new WriteWindowedToBigQuery<Double>(
            options.as(GcpOptions.class).getProject(), options.getDataset(),
            options.getGameStatsTablePrefix() + "_sessions", configureSessionWindowWrite()));
    // [END DocInclude_Rewindow]
    // Run the pipeline and wait for the pipeline to finish; capture cancellation requests from the
    // command line.
    PipelineResult result = pipeline.run();
    exampleUtils.waitToFinish(result);
}
Also used : KV(org.apache.beam.sdk.values.KV) DateTimeZone(org.joda.time.DateTimeZone) TimestampCombiner(org.apache.beam.sdk.transforms.windowing.TimestampCombiner) PipelineResult(org.apache.beam.sdk.PipelineResult) Default(org.apache.beam.sdk.options.Default) Combine(org.apache.beam.sdk.transforms.Combine) Duration(org.joda.time.Duration) LoggerFactory(org.slf4j.LoggerFactory) HashMap(java.util.HashMap) View(org.apache.beam.sdk.transforms.View) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) Metrics(org.apache.beam.sdk.metrics.Metrics) Description(org.apache.beam.sdk.options.Description) PTransform(org.apache.beam.sdk.transforms.PTransform) Sessions(org.apache.beam.sdk.transforms.windowing.Sessions) Map(java.util.Map) Window(org.apache.beam.sdk.transforms.windowing.Window) WriteWindowedToBigQuery(com.google.cloud.dataflow.examples.complete.game.utils.WriteWindowedToBigQuery) Pipeline(org.apache.beam.sdk.Pipeline) DoFn(org.apache.beam.sdk.transforms.DoFn) MapElements(org.apache.beam.sdk.transforms.MapElements) DateTimeFormat(org.joda.time.format.DateTimeFormat) Logger(org.slf4j.Logger) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) TimeZone(java.util.TimeZone) Counter(org.apache.beam.sdk.metrics.Counter) Sum(org.apache.beam.sdk.transforms.Sum) FixedWindows(org.apache.beam.sdk.transforms.windowing.FixedWindows) PCollection(org.apache.beam.sdk.values.PCollection) Mean(org.apache.beam.sdk.transforms.Mean) ExampleUtils(com.google.cloud.dataflow.examples.common.ExampleUtils) PubsubIO(org.apache.beam.sdk.io.gcp.pubsub.PubsubIO) ParDo(org.apache.beam.sdk.transforms.ParDo) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) Instant(org.joda.time.Instant) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Values(org.apache.beam.sdk.transforms.Values)
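
The central pattern in GameStats above is filtering a main input against a PCollectionView<Map<String, Integer>> side input. The same pattern in isolation, as a minimal batch sketch (the class name and sample values are illustrative, not taken from the example):

import java.util.Map;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.transforms.View;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionView;

public class SpamFilterSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.fromArgs(args).create());
        // Main input: user names to score.
        PCollection<String> users = p.apply("Users", Create.of("alice", "spambot", "bob"));
        // Side input: a map view of flagged users, analogous to spammersView above.
        final PCollectionView<Map<String, Integer>> spammersView =
            p.apply("Spammers", Create.of(KV.of("spambot", 1)))
                .apply(View.<String, Integer>asMap());
        // Keep only users absent from the side-input map, mirroring "FilterOutSpammers".
        users.apply("FilterOutSpammers", ParDo.of(new DoFn<String, String>() {
            @ProcessElement
            public void processElement(ProcessContext c) {
                if (c.sideInput(spammersView).get(c.element()) == null) {
                    c.output(c.element());
                }
            }
        }).withSideInputs(spammersView));
        p.run().waitUntilFinish();
    }
}

Unlike the streaming GameStats pipeline, which windows both the main input and the side input into fixed windows, this batch sketch relies on the single global window, so the side-input map is computed once over the whole bounded input.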

Example 99 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

From class PTransformTranslation, method toProto.

/**
   * Translates an {@link AppliedPTransform} into a runner API proto.
   *
   * <p>Does not register the {@code appliedPTransform} within the provided {@link SdkComponents}.
   */
static RunnerApi.PTransform toProto(
        AppliedPTransform<?, ?, ?> appliedPTransform,
        List<AppliedPTransform<?, ?, ?>> subtransforms,
        SdkComponents components) throws IOException {
    RunnerApi.PTransform.Builder transformBuilder = RunnerApi.PTransform.newBuilder();
    for (Map.Entry<TupleTag<?>, PValue> taggedInput : appliedPTransform.getInputs().entrySet()) {
        checkArgument(taggedInput.getValue() instanceof PCollection, "Unexpected input type %s", taggedInput.getValue().getClass());
        transformBuilder.putInputs(toProto(taggedInput.getKey()), components.registerPCollection((PCollection<?>) taggedInput.getValue()));
    }
    for (Map.Entry<TupleTag<?>, PValue> taggedOutput : appliedPTransform.getOutputs().entrySet()) {
        // TODO: Remove gating
        if (taggedOutput.getValue() instanceof PCollection) {
            checkArgument(taggedOutput.getValue() instanceof PCollection, "Unexpected output type %s", taggedOutput.getValue().getClass());
            transformBuilder.putOutputs(toProto(taggedOutput.getKey()), components.registerPCollection((PCollection<?>) taggedOutput.getValue()));
        }
    }
    for (AppliedPTransform<?, ?, ?> subtransform : subtransforms) {
        transformBuilder.addSubtransforms(components.getExistingPTransformId(subtransform));
    }
    transformBuilder.setUniqueName(appliedPTransform.getFullName());
    // TODO: Display Data
    PTransform<?, ?> transform = appliedPTransform.getTransform();
    if (KNOWN_PAYLOAD_TRANSLATORS.containsKey(transform.getClass())) {
        FunctionSpec payload = KNOWN_PAYLOAD_TRANSLATORS.get(transform.getClass()).translate(appliedPTransform, components);
        transformBuilder.setSpec(payload);
    }
    return transformBuilder.build();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) FunctionSpec(org.apache.beam.sdk.common.runner.v1.RunnerApi.FunctionSpec) TupleTag(org.apache.beam.sdk.values.TupleTag) PValue(org.apache.beam.sdk.values.PValue) ImmutableMap(com.google.common.collect.ImmutableMap) Map(java.util.Map) PTransform(org.apache.beam.sdk.transforms.PTransform) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform)
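
The final branch of toProto dispatches on the concrete transform class via KNOWN_PAYLOAD_TRANSLATORS. A toy sketch of that class-keyed dispatch pattern is below; the Translator interface, the map entries, and the String payloads are simplifications and assumptions for illustration, not the Beam API (which, as shown above, produces a RunnerApi FunctionSpec from the AppliedPTransform and SdkComponents).

import java.util.HashMap;
import java.util.Map;
import org.apache.beam.sdk.transforms.Combine;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;

// Toy illustration of class-keyed payload dispatch: each known PTransform subclass
// maps to a translator that knows how to build its payload. The interface, the map
// contents, and the String payload type are hypothetical simplifications.
public class PayloadDispatchSketch {

    interface Translator {
        String translate(PTransform<?, ?> transform);
    }

    private static final Map<Class<? extends PTransform>, Translator> KNOWN_TRANSLATORS =
        new HashMap<>();

    static {
        // The payload strings are placeholders, not real Beam URNs or payload protos.
        KNOWN_TRANSLATORS.put(ParDo.MultiOutput.class, t -> "example-pardo-payload");
        KNOWN_TRANSLATORS.put(Combine.PerKey.class, t -> "example-combine-payload");
    }

    static String payloadFor(PTransform<?, ?> transform) {
        Translator translator = KNOWN_TRANSLATORS.get(transform.getClass());
        // Unknown (e.g. user-defined composite) transforms get no payload, mirroring
        // the containsKey(...) gate in toProto.
        return translator == null ? null : translator.translate(transform);
    }
}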

Example 100 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

From class SplittableProcessElementsEvaluatorFactory, method createEvaluator.

@SuppressWarnings({ "unchecked", "rawtypes" })
private TransformEvaluator<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>> createEvaluator(
        AppliedPTransform<
            PCollection<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>>,
            PCollectionTuple,
            ProcessElements<InputT, OutputT, RestrictionT, TrackerT>> application,
        CommittedBundle<InputT> inputBundle) throws Exception {
    final ProcessElements<InputT, OutputT, RestrictionT, TrackerT> transform = application.getTransform();
    ProcessFn<InputT, OutputT, RestrictionT, TrackerT> processFn = transform.newProcessFn(transform.getFn());
    DoFnLifecycleManager fnManager = DoFnLifecycleManager.of(processFn);
    processFn = ((ProcessFn<InputT, OutputT, RestrictionT, TrackerT>) fnManager.<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>, OutputT>get());
    String stepName = evaluationContext.getStepName(application);
    final DirectExecutionContext.DirectStepContext stepContext = evaluationContext.getExecutionContext(application, inputBundle.getKey()).getStepContext(stepName);
    final ParDoEvaluator<KeyedWorkItem<String, ElementAndRestriction<InputT, RestrictionT>>> parDoEvaluator =
        delegateFactory.createParDoEvaluator(
            application, inputBundle.getKey(), transform.getSideInputs(),
            transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(),
            stepContext, processFn, fnManager);
    processFn.setStateInternalsFactory(new StateInternalsFactory<String>() {

        @SuppressWarnings({ "unchecked", "rawtypes" })
        @Override
        public StateInternals stateInternalsForKey(String key) {
            return (StateInternals) stepContext.stateInternals();
        }
    });
    processFn.setTimerInternalsFactory(new TimerInternalsFactory<String>() {

        @Override
        public TimerInternals timerInternalsForKey(String key) {
            return stepContext.timerInternals();
        }
    });
    OutputWindowedValue<OutputT> outputWindowedValue = new OutputWindowedValue<OutputT>() {

        private final OutputManager outputManager = parDoEvaluator.getOutputManager();

        @Override
        public void outputWindowedValue(OutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
            outputManager.output(transform.getMainOutputTag(), WindowedValue.of(output, timestamp, windows, pane));
        }

        @Override
        public <AdditionalOutputT> void outputWindowedValue(TupleTag<AdditionalOutputT> tag, AdditionalOutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
            outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
        }
    };
    processFn.setProcessElementInvoker(
        new OutputAndTimeBoundedSplittableProcessElementInvoker<InputT, OutputT, RestrictionT, TrackerT>(
            transform.getFn(),
            evaluationContext.getPipelineOptions(),
            outputWindowedValue,
            evaluationContext.createSideInputReader(transform.getSideInputs()),
            // DirectRunner.
            Executors.newSingleThreadScheduledExecutor(new ThreadFactoryBuilder()
                .setThreadFactory(MoreExecutors.platformThreadFactory())
                .setDaemon(true)
                .setNameFormat("direct-splittable-process-element-checkpoint-executor")
                .build()),
            10000,
            Duration.standardSeconds(10)));
    return DoFnLifecycleManagerRemovingTransformEvaluator.wrapping(parDoEvaluator, fnManager);
}
Also used : ProcessFn(org.apache.beam.runners.core.SplittableParDoViaKeyedWorkItems.ProcessFn) TupleTag(org.apache.beam.sdk.values.TupleTag) PaneInfo(org.apache.beam.sdk.transforms.windowing.PaneInfo) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ElementAndRestriction(org.apache.beam.runners.core.construction.ElementAndRestriction) OutputWindowedValue(org.apache.beam.runners.core.OutputWindowedValue) Instant(org.joda.time.Instant) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) TimerInternals(org.apache.beam.runners.core.TimerInternals) StateInternals(org.apache.beam.runners.core.StateInternals) Collection(java.util.Collection) PCollection(org.apache.beam.sdk.values.PCollection) OutputManager(org.apache.beam.runners.core.DoFnRunners.OutputManager)

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection): 199
Test (org.junit.Test): 133
KV (org.apache.beam.sdk.values.KV): 62
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 61
Map (java.util.Map): 59
List (java.util.List): 58
Rule (org.junit.Rule): 57
RunWith (org.junit.runner.RunWith): 54
PAssert (org.apache.beam.sdk.testing.PAssert): 52
Instant (org.joda.time.Instant): 46
Duration (org.joda.time.Duration): 45
JUnit4 (org.junit.runners.JUnit4): 45
ParDo (org.apache.beam.sdk.transforms.ParDo): 44
TupleTag (org.apache.beam.sdk.values.TupleTag): 42
Pipeline (org.apache.beam.sdk.Pipeline): 41
Create (org.apache.beam.sdk.transforms.Create): 41
ArrayList (java.util.ArrayList): 40
Serializable (java.io.Serializable): 39
PTransform (org.apache.beam.sdk.transforms.PTransform): 37
Row (org.apache.beam.sdk.values.Row): 37