Search in sources:

Example 1 with GroupReduceOperator

Use of org.apache.flink.api.java.operators.GroupReduceOperator in the project flink by apache.

Class DataSetUtils, method sampleWithSize.

/**
 * Draws a sample containing a fixed number of elements from a DataSet.
 *
 * <p>The input is first passed through a {@code SampleInPartition} function via
 * {@code mapPartition}; the result is then handed, together with a {@code SampleInCoordinator}
 * function, to a {@code GroupReduceOperator} that produces the returned DataSet.
 *
 * <p><strong>NOTE:</strong> Sampling with a fixed size is not as efficient as sampling with a
 * fraction; prefer sampling with a fraction unless exact precision is needed.
 *
 * @param input The DataSet to sample from.
 * @param withReplacement Whether an element can be selected more than once.
 * @param numSamples The expected sample size.
 * @param seed Random number generator seed.
 * @param <T> The element type of the input DataSet and of the returned sample.
 * @return The sampled DataSet
 */
public static <T> DataSet<T> sampleWithSize(DataSet<T> input, final boolean withReplacement, final int numSamples, final long seed) {
    // First stage: apply the per-partition sampling function.
    // NOTE(review): the operator is held as a raw type, as in the original code; its output
    // element type is not visible here, so it is intentionally not parameterized.
    MapPartitionOperator partitionSamples =
            input.mapPartition(new SampleInPartition<T>(withReplacement, numSamples, seed));

    // There is no previous group, so the parallelism of GroupReduceOperator is always 1.
    // The call location is resolved directly in this method (not in a helper) on purpose.
    String operatorName = Utils.getCallLocationName();

    // Second stage: the coordinator function is built from the same arguments as the first.
    SampleInCoordinator<T> coordinator =
            new SampleInCoordinator<>(withReplacement, numSamples, seed);

    return new GroupReduceOperator<>(partitionSamples, input.getType(), coordinator, operatorName);
}
Also used : GroupReduceOperator(org.apache.flink.api.java.operators.GroupReduceOperator) SampleInPartition(org.apache.flink.api.java.functions.SampleInPartition) SampleInCoordinator(org.apache.flink.api.java.functions.SampleInCoordinator) MapPartitionOperator(org.apache.flink.api.java.operators.MapPartitionOperator)

Example 2 with GroupReduceOperator

Use of org.apache.flink.api.java.operators.GroupReduceOperator in the project beam by apache.

Class FlinkBatchPortablePipelineTranslator, method translateGroupByKey.

/**
 * Translates a GroupByKey transform into a Flink {@code GroupCombineOperator} followed by a
 * {@code GroupReduceOperator}, and registers the result under the transform's only output id.
 *
 * <p>The transform's single input DataSet is grouped with a {@code KvKeySelector} built from the
 * input key coder. A {@code Concatenate} combine function collects the values of each key into a
 * {@code List<V>}: first partially (group-combine), then fully (group-reduce) after grouping
 * the intermediate result by key again.
 *
 * @param transform the GroupByKey transform node; it must have exactly one input and one output.
 * @param pipeline the pipeline proto whose components describe the input PCollection, its
 *     windowing strategy and its coders.
 * @param context the translation context supplying the input DataSet and pipeline options, and
 *     receiving the output DataSet.
 * @param <K> the key type of the grouped elements.
 * @param <V> the value type of the grouped elements.
 * @throws IllegalStateException if the windowing strategy proto cannot be rehydrated.
 * @throws RuntimeException if the wire coder of the input PCollection cannot be instantiated.
 */
private static <K, V> void translateGroupByKey(PTransformNode transform, RunnerApi.Pipeline pipeline, BatchTranslationContext context) {
    // Fetch the components once and reuse them for every lookup below.
    RunnerApi.Components components = pipeline.getComponents();
    String transformName = transform.getTransform().getUniqueName();

    String inputPCollectionId = Iterables.getOnlyElement(transform.getTransform().getInputsMap().values());
    PCollectionNode inputCollection = PipelineNode.pCollection(inputPCollectionId, components.getPcollectionsOrThrow(inputPCollectionId));
    DataSet<WindowedValue<KV<K, V>>> inputDataSet = context.getDataSetOrThrow(inputPCollectionId);

    // Resolve the windowing strategy attached to the input PCollection.
    RunnerApi.WindowingStrategy windowingStrategyProto = components.getWindowingStrategiesOrThrow(components.getPcollectionsOrThrow(inputPCollectionId).getWindowingStrategyId());
    RehydratedComponents rehydratedComponents = RehydratedComponents.forComponents(components);
    WindowingStrategy<Object, BoundedWindow> windowingStrategy;
    try {
        windowingStrategy = (WindowingStrategy<Object, BoundedWindow>) WindowingStrategyTranslation.fromProto(windowingStrategyProto, rehydratedComponents);
    } catch (InvalidProtocolBufferException e) {
        throw new IllegalStateException(String.format("Unable to hydrate GroupByKey windowing strategy %s.", windowingStrategyProto), e);
    }

    WindowedValueCoder<KV<K, V>> inputCoder;
    try {
        // The raw cast is kept as in the original code; the declared variable type narrows it.
        inputCoder = (WindowedValueCoder) WireCoders.instantiateRunnerWireCoder(inputCollection, components);
    } catch (IOException e) {
        // Say which PCollection failed instead of rethrowing a bare wrapper; the cause is kept.
        throw new RuntimeException(String.format("Unable to instantiate wire coder for GroupByKey input PCollection %s.", inputPCollectionId), e);
    }
    KvCoder<K, V> inputElementCoder = (KvCoder<K, V>) inputCoder.getValueCoder();

    // Values of one key are concatenated into a List<V>; derive the coders for that shape.
    Concatenate<V> combineFn = new Concatenate<>();
    Coder<List<V>> accumulatorCoder = combineFn.getAccumulatorCoder(CoderRegistry.createDefault(), inputElementCoder.getValueCoder());
    Coder<WindowedValue<KV<K, List<V>>>> outputCoder = WindowedValue.getFullCoder(KvCoder.of(inputElementCoder.getKeyCoder(), accumulatorCoder), windowingStrategy.getWindowFn().windowCoder());
    TypeInformation<WindowedValue<KV<K, List<V>>>> partialReduceTypeInfo = new CoderTypeInformation<>(outputCoder, context.getPipelineOptions());

    Grouping<WindowedValue<KV<K, V>>> inputGrouping = inputDataSet.groupBy(new KvKeySelector<>(inputElementCoder.getKeyCoder()));
    FlinkPartialReduceFunction<K, V, List<V>, ?> partialReduceFunction = new FlinkPartialReduceFunction<>(combineFn, windowingStrategy, Collections.emptyMap(), context.getPipelineOptions());
    FlinkReduceFunction<K, List<V>, List<V>, ?> reduceFunction = new FlinkReduceFunction<>(combineFn, windowingStrategy, Collections.emptyMap(), context.getPipelineOptions());

    // Partially GroupReduce the values into the intermediate format AccumT (combine)
    GroupCombineOperator<WindowedValue<KV<K, V>>, WindowedValue<KV<K, List<V>>>> groupCombine = new GroupCombineOperator<>(inputGrouping, partialReduceTypeInfo, partialReduceFunction, "GroupCombine: " + transformName);
    Grouping<WindowedValue<KV<K, List<V>>>> intermediateGrouping = groupCombine.groupBy(new KvKeySelector<>(inputElementCoder.getKeyCoder()));

    // Fully reduce the values and create output format VO
    GroupReduceOperator<WindowedValue<KV<K, List<V>>>, WindowedValue<KV<K, List<V>>>> outputDataSet = new GroupReduceOperator<>(intermediateGrouping, partialReduceTypeInfo, reduceFunction, transformName);
    context.addDataSet(Iterables.getOnlyElement(transform.getTransform().getOutputsMap().values()), outputDataSet);
}
Also used : RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) FlinkReduceFunction(org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction) List(java.util.List) GroupCombineOperator(org.apache.flink.api.java.operators.GroupCombineOperator) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) FlinkPartialReduceFunction(org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction) KvCoder(org.apache.beam.sdk.coders.KvCoder) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) PCollectionNode(org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode) GroupReduceOperator(org.apache.flink.api.java.operators.GroupReduceOperator) Concatenate(org.apache.beam.runners.core.Concatenate) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents)

Aggregations

GroupReduceOperator (org.apache.flink.api.java.operators.GroupReduceOperator): 2, IOException (java.io.IOException): 1, List (java.util.List): 1, RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi): 1, Concatenate (org.apache.beam.runners.core.Concatenate): 1, RehydratedComponents (org.apache.beam.runners.core.construction.RehydratedComponents): 1, PCollectionNode (org.apache.beam.runners.core.construction.graph.PipelineNode.PCollectionNode): 1, FlinkPartialReduceFunction (org.apache.beam.runners.flink.translation.functions.FlinkPartialReduceFunction): 1, FlinkReduceFunction (org.apache.beam.runners.flink.translation.functions.FlinkReduceFunction): 1, CoderTypeInformation (org.apache.beam.runners.flink.translation.types.CoderTypeInformation): 1, KvCoder (org.apache.beam.sdk.coders.KvCoder): 1, BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 1, WindowedValue (org.apache.beam.sdk.util.WindowedValue): 1, KV (org.apache.beam.sdk.values.KV): 1, InvalidProtocolBufferException (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException): 1, SampleInCoordinator (org.apache.flink.api.java.functions.SampleInCoordinator): 1, SampleInPartition (org.apache.flink.api.java.functions.SampleInPartition): 1, GroupCombineOperator (org.apache.flink.api.java.operators.GroupCombineOperator): 1, MapPartitionOperator (org.apache.flink.api.java.operators.MapPartitionOperator): 1