Use of io.cdap.cdap.etl.spark.function.AggregatorMergePartitionFunction in project cdap by caskdata.
From the class BaseRDDCollection, method reduceAggregate.
@Override
public SparkCollection<RecordInfo<Object>> reduceAggregate(StageSpec stageSpec, @Nullable Integer partitions,
                                                           StageStatisticsCollector collector) {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  // Key each record by its group key.
  PairFlatMapFunction<T, Object, T> groupByFunction =
    new AggregatorReduceGroupByFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  JavaPairRDD<Object, T> keyedCollection = rdd.flatMapToPair(groupByFunction);
  // combineByKey takes three functions: initialize an accumulator from a value, merge a value into an
  // accumulator within a partition, and merge accumulators across partitions.
  Function<T, Object> initializeFunction =
    new AggregatorInitializeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  Function2<Object, T, Object> mergeValueFunction =
    new AggregatorMergeValueFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  Function2<Object, Object, Object> mergePartitionFunction =
    new AggregatorMergePartitionFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  JavaPairRDD<Object, Object> groupedCollection = partitions == null
    ? keyedCollection.combineByKey(initializeFunction, mergeValueFunction, mergePartitionFunction)
    : keyedCollection.combineByKey(initializeFunction, mergeValueFunction, mergePartitionFunction, partitions);
  // Finalize each aggregated group into output records.
  FlatMapFunction<Tuple2<Object, Object>, RecordInfo<Object>> postFunction =
    new AggregatorFinalizeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  return wrap(groupedCollection.flatMap(postFunction));
}
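
For context, the snippet above relies on Spark's combineByKey contract: a create-combiner (initialize) function, a merge-value function, and a merge-combiners (merge-partition) function. Below is a minimal, self-contained sketch of that contract on a plain JavaPairRDD; it is not CDAP code, the class name CombineByKeySketch and the sample data are illustrative only, and it assumes only the standard Spark Java API.

// Illustrative sketch, not part of CDAP: the three-function combineByKey contract.
import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CombineByKeySketch {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("combineByKey-sketch").setMaster("local[*]");
    try (JavaSparkContext jsc = new JavaSparkContext(conf)) {
      // Keyed records, analogous to the output of the group-by function above.
      JavaPairRDD<String, Integer> keyed = jsc.parallelizePairs(Arrays.asList(
          new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3)));

      // The same three roles as the aggregator functions:
      // initialize (value -> accumulator), merge value (accumulator + value),
      // and merge partitions (accumulator + accumulator).
      JavaPairRDD<String, Integer> combined = keyed.combineByKey(
          value -> value,               // initialize: first value seeds the accumulator
          (acc, value) -> acc + value,  // merge a value within a partition
          (acc1, acc2) -> acc1 + acc2); // merge accumulators across partitions

      combined.collect().forEach(t -> System.out.println(t._1() + " -> " + t._2()));
    }
  }
}

In reduceAggregate, the accumulator type is plugin-defined rather than a simple Integer, which is why the three functions are built from the stage's PluginFunctionContext instead of plain lambdas.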