Use of io.cdap.cdap.etl.spark.function.AggregatorGroupByFunction in the project cdap by caskdata.
From the class BaseRDDCollection, the method aggregate.
/**
 * Runs the aggregator plugin described by {@code stageSpec} over this collection's RDD.
 *
 * <p>The records are first turned into key/value pairs by an {@link AggregatorGroupByFunction},
 * then grouped by key, and finally each group is passed through an
 * {@link AggregatorAggregateFunction}. The flattened output is wrapped in a new collection.
 *
 * @param stageSpec specification of the stage, used to build the plugin function context
 * @param partitions number of partitions to pass to {@code groupByKey}; when {@code null},
 *                   the no-argument {@code groupByKey()} is used instead
 * @param collector statistics collector handed to the plugin function context
 * @return a collection wrapping the RDD produced by the aggregate function
 */
@Override
public SparkCollection<RecordInfo<Object>> aggregate(StageSpec stageSpec, @Nullable Integer partitions,
                                                     StageStatisticsCollector collector) {
  // A single plugin context is shared by both phases; each phase gets its own function cache.
  PluginFunctionContext context = new PluginFunctionContext(stageSpec, sec, collector);

  // Phase 1: emit (key, record) pairs for every input record.
  PairFlatMapFunction<T, Object, T> keyingFunction =
    new AggregatorGroupByFunction<>(context, functionCacheFactory.newCache());
  JavaPairRDD<Object, T> keyedRecords = rdd.flatMapToPair(keyingFunction);

  // Phase 2: gather the records of each key, honoring the requested partition count if given.
  JavaPairRDD<Object, Iterable<T>> recordsByKey;
  if (partitions != null) {
    recordsByKey = keyedRecords.groupByKey(partitions);
  } else {
    recordsByKey = keyedRecords.groupByKey();
  }

  // Phase 3: feed each (key, records) group to the aggregate function and flatten its output.
  FlatMapFunction<Tuple2<Object, Iterable<T>>, RecordInfo<Object>> aggregatingFunction =
    new AggregatorAggregateFunction<>(context, functionCacheFactory.newCache());
  return wrap(recordsByKey.flatMap(aggregatingFunction));
}
Aggregations