
Example 1 with MapFunction

Use of org.apache.spark.api.java.function.MapFunction in project net.jgp.labs.spark by jgperrin.

From class PiComputeLambdaApp, the start method:

/**
 * The processing code.
 */
private void start(int slices) {
    int numberOfThrows = 100000 * slices;
    System.out.println("About to throw " + numberOfThrows + " darts, ready? Stay away from the target!");
    long t0 = System.currentTimeMillis();
    SparkSession spark = SparkSession.builder().appName("Spark Pi with lambdas").master("local[*]").getOrCreate();
    long t1 = System.currentTimeMillis();
    System.out.println("Session initialized in " + (t1 - t0) + " ms");
    List<Integer> l = new ArrayList<>(numberOfThrows);
    for (int i = 0; i < numberOfThrows; i++) {
        l.add(i);
    }
    Dataset<Row> incrementalDf = spark.createDataset(l, Encoders.INT()).toDF();
    long t2 = System.currentTimeMillis();
    System.out.println("Initial dataframe built in " + (t2 - t1) + " ms");
    Dataset<Integer> dotsDs = incrementalDf.map((MapFunction<Row, Integer>) status -> {
        // Throw a dart at a random point in the [-1, 1] x [-1, 1] square
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        // counter is an int field of the enclosing (Serializable) class;
        // this progress count is only reliable with a local master
        counter++;
        if (counter % 100000 == 0) {
            System.out.println(counter + " darts thrown so far");
        }
        // 1 if the dart landed inside the unit circle, 0 otherwise
        return (x * x + y * y <= 1) ? 1 : 0;
    }, Encoders.INT());
    long t3 = System.currentTimeMillis();
    System.out.println("Throwing darts done in " + (t3 - t2) + " ms");
    int dartsInCircle = dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);
    long t4 = System.currentTimeMillis();
    System.out.println("Analyzing result in " + (t4 - t3) + " ms");
    System.out.println("Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);
    spark.stop();
}
Also used: List (java.util.List), Dataset (org.apache.spark.sql.Dataset), Row (org.apache.spark.sql.Row), ReduceFunction (org.apache.spark.api.java.function.ReduceFunction), MapFunction (org.apache.spark.api.java.function.MapFunction), Encoders (org.apache.spark.sql.Encoders), Serializable (java.io.Serializable), ArrayList (java.util.ArrayList), SparkSession (org.apache.spark.sql.SparkSession)
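
A note on this example: incrementing counter from inside the lambda only behaves as expected with a local[*] master, because on a real cluster each executor would mutate its own deserialized copy of the field. Below is a minimal sketch of a variant that tracks progress with a LongAccumulator, Spark's supported mechanism for counting across executors; the class name PiWithAccumulator is hypothetical, not part of the project.

import java.util.ArrayList;
import java.util.List;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.util.LongAccumulator;

public class PiWithAccumulator {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("Spark Pi with accumulator").master("local[*]").getOrCreate();
        int numberOfThrows = 100000;
        List<Integer> l = new ArrayList<>(numberOfThrows);
        for (int i = 0; i < numberOfThrows; i++) {
            l.add(i);
        }
        // An accumulator aggregates correctly across executors, unlike a mutable field
        LongAccumulator thrown = spark.sparkContext().longAccumulator("darts thrown");
        Dataset<Integer> dotsDs = spark.createDataset(l, Encoders.INT()).map((MapFunction<Integer, Integer>) i -> {
            double x = Math.random() * 2 - 1;
            double y = Math.random() * 2 - 1;
            thrown.add(1);
            return (x * x + y * y <= 1) ? 1 : 0;
        }, Encoders.INT());
        int dartsInCircle = dotsDs.reduce((ReduceFunction<Integer>) (x, y) -> x + y);
        System.out.println(thrown.value() + " darts thrown, Pi is roughly " + 4.0 * dartsInCircle / numberOfThrows);
        spark.stop();
    }
}

Note that Spark only guarantees exactly-once accumulator updates inside actions; inside a transformation like map, retried tasks can double-count, which is acceptable for a progress indicator.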

Example 2 with MapFunction

Use of org.apache.spark.api.java.function.MapFunction in project beam by apache.

From class ParDoTranslatorBatch, the translateTransform method:

@Override
public void translateTransform(PTransform<PCollection<InputT>, PCollectionTuple> transform, AbstractTranslationContext context) {
    String stepName = context.getCurrentTransform().getFullName();
    // Check for not supported advanced features
    // TODO: add support of Splittable DoFn
    DoFn<InputT, OutputT> doFn = getDoFn(context);
    checkState(!DoFnSignatures.isSplittable(doFn), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
    // TODO: add support of states and timers
    checkState(!DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
    checkState(!DoFnSignatures.requiresTimeSortedInput(doFn), "@RequiresTimeSortedInput is not supported for the moment");
    DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
    // Init main variables
    PValue input = context.getInput();
    Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
    Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
    TupleTag<?> mainOutputTag = getTupleTag(context);
    List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
    WindowingStrategy<?, ?> windowingStrategy = ((PCollection<InputT>) input).getWindowingStrategy();
    Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
    Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();
    // construct a map from side input to WindowingStrategy so that
    // the DoFn runner can map main-input windows to side input windows
    List<PCollectionView<?>> sideInputs = getSideInputs(context);
    Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
    for (PCollectionView<?> sideInput : sideInputs) {
        sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
    }
    SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);
    Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
    MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
    List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
    for (TupleTag<?> tag : outputTags) {
        if (!tag.equals(mainOutputTag)) {
            additionalOutputTags.add(tag);
        }
    }
    Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
    @SuppressWarnings("unchecked") DoFnFunction<InputT, OutputT> doFnWrapper = new DoFnFunction(metricsAccum, stepName, doFn, windowingStrategy, sideInputStrategies, context.getSerializableOptions(), additionalOutputTags, mainOutputTag, inputCoder, outputCoderMap, broadcastStateData, doFnSchemaInformation, sideInputMapping);
    MultiOutputCoder multipleOutputCoder = MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
    Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs = inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
    if (outputs.entrySet().size() > 1) {
        allOutputs.persist();
        for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
            pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
        }
    } else {
        Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
        Coder<WindowedValue<?>> windowedValueCoder = (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
        Dataset<WindowedValue<?>> outputDataset = allOutputs.map((MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2, EncoderHelpers.fromBeamCoder(windowedValueCoder));
        context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
    }
}
Also used: SideInputBroadcast (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.SideInputBroadcast), SerializableCoder (org.apache.beam.sdk.coders.SerializableCoder), WindowedValue (org.apache.beam.sdk.util.WindowedValue), Dataset (org.apache.spark.sql.Dataset), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), Coder (org.apache.beam.sdk.coders.Coder), HashMap (java.util.HashMap), AbstractTranslationContext (org.apache.beam.runners.spark.structuredstreaming.translation.AbstractTranslationContext), DoFnSchemaInformation (org.apache.beam.sdk.transforms.DoFnSchemaInformation), ArrayList (java.util.ArrayList), PTransform (org.apache.beam.sdk.transforms.PTransform), DoFnSignatures (org.apache.beam.sdk.transforms.reflect.DoFnSignatures), EncoderHelpers (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.EncoderHelpers), TupleTag (org.apache.beam.sdk.values.TupleTag), Map (java.util.Map), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), MultiOutputCoder (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.MultiOutputCoder), CoderHelpers (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.CoderHelpers), MapFunction (org.apache.spark.api.java.function.MapFunction), ParDoTranslation (org.apache.beam.runners.core.construction.ParDoTranslation), DoFn (org.apache.beam.sdk.transforms.DoFn), MetricsAccumulator (org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsAccumulator), IOException (java.io.IOException), PCollection (org.apache.beam.sdk.values.PCollection), Tuple2 (scala.Tuple2), List (java.util.List), PValue (org.apache.beam.sdk.values.PValue), Preconditions.checkState (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState), PCollectionView (org.apache.beam.sdk.values.PCollectionView), TransformTranslator (org.apache.beam.runners.spark.structuredstreaming.translation.TransformTranslator), BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow), MetricsContainerStepMapAccumulator (org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsContainerStepMapAccumulator), WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy), FilterFunction (org.apache.spark.api.java.function.FilterFunction)
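
In this translator, the MapFunction appears only in the single-output branch, where it projects the dataset of (tag, value) tuples down to the values themselves. The following is a minimal, self-contained sketch of that projection pattern using plain Spark types; the class name TupleProjection and the String/Integer payloads are stand-ins for Beam's TupleTag and WindowedValue.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class TupleProjection {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("Tuple projection").master("local[*]").getOrCreate();
        // A dataset of (tag, payload) pairs, standing in for the translator's
        // Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>>
        Dataset<Tuple2<String, Integer>> tagged = spark.createDataset(
                Arrays.asList(new Tuple2<>("main", 1), new Tuple2<>("main", 2)),
                Encoders.tuple(Encoders.STRING(), Encoders.INT()));
        // The same projection the single-output branch performs: keep only the value
        Dataset<Integer> values = tagged.map(
                (MapFunction<Tuple2<String, Integer>, Integer>) pair -> pair._2,
                Encoders.INT());
        values.show();
        spark.stop();
    }
}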

Example 3 with MapFunction

Use of org.apache.spark.api.java.function.MapFunction in project beam by apache.

From class WindowingHelpers, the assignWindowsMapFunction method:

public static <T, W extends BoundedWindow> MapFunction<WindowedValue<T>, WindowedValue<T>> assignWindowsMapFunction(WindowFn<T, W> windowFn) {
    return (MapFunction<WindowedValue<T>, WindowedValue<T>>) windowedValue -> {
        final BoundedWindow boundedWindow = Iterables.getOnlyElement(windowedValue.getWindows());
        final T element = windowedValue.getValue();
        final Instant timestamp = windowedValue.getTimestamp();
        Collection<W> windows = windowFn.assignWindows(windowFn.new AssignContext() {

            @Override
            public T element() {
                return element;
            }

            @Override
            public Instant timestamp() {
                return timestamp;
            }

            @Override
            public BoundedWindow window() {
                return boundedWindow;
            }
        });
        return WindowedValue.of(element, timestamp, windows, windowedValue.getPane());
    };
}
Also used: Instant (org.joda.time.Instant), BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow), MapFunction (org.apache.spark.api.java.function.MapFunction)
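
assignWindowsMapFunction is a static factory: it captures the WindowFn and returns a serializable MapFunction that the caller can hand straight to Dataset.map. The sketch below shows the same factory shape with plain types; MapFunctionFactory and prefixMapFunction are hypothetical names, not part of the Beam codebase.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;

public class MapFunctionFactory {

    // Same shape as assignWindowsMapFunction: a static factory that captures
    // its argument and returns a reusable, serializable MapFunction
    public static MapFunction<String, String> prefixMapFunction(String prefix) {
        return (MapFunction<String, String>) value -> prefix + value;
    }

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("MapFunction factory").master("local[*]").getOrCreate();
        Dataset<String> names = spark.createDataset(Arrays.asList("alpha", "beta"), Encoders.STRING());
        // The returned MapFunction plugs directly into Dataset.map
        Dataset<String> prefixed = names.map(prefixMapFunction("win-"), Encoders.STRING());
        prefixed.show();
        spark.stop();
    }
}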

Aggregations

MapFunction (org.apache.spark.api.java.function.MapFunction): 3
ArrayList (java.util.ArrayList): 2
List (java.util.List): 2
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 2
Dataset (org.apache.spark.sql.Dataset): 2
IOException (java.io.IOException): 1
Serializable (java.io.Serializable): 1
HashMap (java.util.HashMap): 1
Map (java.util.Map): 1
ParDoTranslation (org.apache.beam.runners.core.construction.ParDoTranslation): 1
MetricsAccumulator (org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsAccumulator): 1
MetricsContainerStepMapAccumulator (org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsContainerStepMapAccumulator): 1
AbstractTranslationContext (org.apache.beam.runners.spark.structuredstreaming.translation.AbstractTranslationContext): 1
TransformTranslator (org.apache.beam.runners.spark.structuredstreaming.translation.TransformTranslator): 1
CoderHelpers (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.CoderHelpers): 1
EncoderHelpers (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.EncoderHelpers): 1
MultiOutputCoder (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.MultiOutputCoder): 1
SideInputBroadcast (org.apache.beam.runners.spark.structuredstreaming.translation.helpers.SideInputBroadcast): 1
Coder (org.apache.beam.sdk.coders.Coder): 1
SerializableCoder (org.apache.beam.sdk.coders.SerializableCoder): 1