Use of io.cdap.cdap.etl.spark.function.DatasetAggregationFinalizeFunction in project cdap by caskdata.
The method reduceDatasetAggregate of the class RDDCollection.
/**
 * Runs a reduce-style aggregation through the Spark Dataset API instead of operating directly on the RDD,
 * which lets Spark apply optimizations that are not available at the RDD level.
 *
 * <p>The pipeline is: emit (key, accumulator) pairs, group by key, reduce the accumulators of each group,
 * then pass every reduced (key, accumulator) pair through the finalize function.
 *
 * @param stageSpec specification of the stage, used to build the plugin function context
 * @param partitions number of partitions to coalesce the result to; not applied when {@code null} or when
 *   {@code ignorePartitionsDuringDatasetAggregation} is set
 * @param collector statistics collector passed to the plugin function context
 * @param <GROUP_KEY> type of the key the records are grouped by
 * @param <AGG_VALUE> type of the aggregated value held by the accumulator
 * @return a collection wrapping the records produced by the finalize function
 */
private <GROUP_KEY, AGG_VALUE> SparkCollection<RecordInfo<Object>> reduceDatasetAggregate(
  StageSpec stageSpec, @Nullable Integer partitions, StageStatisticsCollector collector) {
  PluginFunctionContext context = new PluginFunctionContext(stageSpec, sec, collector);

  // Every function receives its own cache; they are created in key -> reduce -> finalize order.
  DatasetAggregationGetKeyFunction<GROUP_KEY, T, AGG_VALUE> toKeyedAccumulators =
    new DatasetAggregationGetKeyFunction<>(context, functionCacheFactory.newCache());
  DatasetAggregationReduceFunction<T, AGG_VALUE> mergeAccumulators =
    new DatasetAggregationReduceFunction<>(context, functionCacheFactory.newCache());
  DatasetAggregationFinalizeFunction<GROUP_KEY, T, AGG_VALUE, ?> finalizer =
    new DatasetAggregationFinalizeFunction<>(context, functionCacheFactory.newCache());

  // Projections used to split the (key, accumulator) pair for grouping.
  MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>, GROUP_KEY> extractKey =
    pair -> pair._1();
  MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>,
    DatasetAggregationAccumulator<T, AGG_VALUE>> extractAccumulator = pair -> pair._2();

  Dataset<T> input = sqlContext.createDataset(rdd.rdd(), objectEncoder());

  // Step 1: turn each input record into (key, accumulator) pairs.
  Dataset<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>> keyed =
    input.flatMap(toKeyedAccumulators, tupleEncoder());

  // Step 2: group by key and merge the accumulators within each group.
  Dataset<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>> reduced =
    keyed.groupByKey(extractKey, objectEncoder())
      .mapValues(extractAccumulator, objectEncoder())
      .reduceGroups(mergeAccumulators);

  // Step 3: finalize each reduced group into output records.
  Dataset<RecordInfo<Object>> finalized = reduced.flatMap(finalizer, objectEncoder());

  // Leave the partitioning alone when it is explicitly ignored or was not requested.
  if (ignorePartitionsDuringDatasetAggregation || partitions == null) {
    return wrap(finalized.toJavaRDD());
  }
  return wrap(finalized.coalesce(partitions).toJavaRDD());
}
Aggregations