Search in sources :

Example 1 with DatasetAggregationAccumulator

Use of io.cdap.cdap.etl.spark.function.DatasetAggregationAccumulator in the cdap project by caskdata.

The method reduceDatasetAggregate of the class RDDCollection.

/**
 * Runs a reduce-style aggregation through the Dataset API instead of working on the RDD directly, which
 * allows SPARK to perform various optimizations that are not available on the RDD level.
 *
 * <p>The backing RDD is turned into a Dataset, expanded into (group key, accumulator) tuples, grouped by
 * key, reduced per group, and finally passed through the finalize function.
 *
 * @param stageSpec specification of the stage; handed to the {@link PluginFunctionContext}
 * @param partitions partition count to coalesce the result to; no coalescing happens when this is
 *                   {@code null} or when {@code ignorePartitionsDuringDatasetAggregation} is set
 * @param collector stage statistics collector; handed to the {@link PluginFunctionContext}
 * @param <GROUP_KEY> type of the grouping key
 * @param <AGG_VALUE> aggregate value type parameter of the accumulator
 * @return the aggregated records, wrapped as a {@link SparkCollection}
 */
private <GROUP_KEY, AGG_VALUE> SparkCollection<RecordInfo<Object>> reduceDatasetAggregate(
    StageSpec stageSpec,
    @Nullable Integer partitions,
    StageStatisticsCollector collector) {
    PluginFunctionContext context = new PluginFunctionContext(stageSpec, sec, collector);

    // newCache() is invoked once per plugin function, in the same order the functions are used below.
    DatasetAggregationGetKeyFunction<GROUP_KEY, T, AGG_VALUE> toKeyedAccumulators =
        new DatasetAggregationGetKeyFunction<>(context, functionCacheFactory.newCache());
    DatasetAggregationReduceFunction<T, AGG_VALUE> mergeAccumulators =
        new DatasetAggregationReduceFunction<>(context, functionCacheFactory.newCache());
    DatasetAggregationFinalizeFunction<GROUP_KEY, T, AGG_VALUE, ?> finalizeGroups =
        new DatasetAggregationFinalizeFunction<>(context, functionCacheFactory.newCache());

    // Projections used to split the (key, accumulator) tuples for grouping.
    MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>, GROUP_KEY> selectKey =
        Tuple2::_1;
    MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>,
        DatasetAggregationAccumulator<T, AGG_VALUE>> selectAccumulator = Tuple2::_2;

    Dataset<T> input = sqlContext.createDataset(rdd.rdd(), objectEncoder());

    // Kept as a single fluent chain so the encoder type arguments are inferred from each step.
    Dataset<RecordInfo<Object>> aggregated = input
        .flatMap(toKeyedAccumulators, tupleEncoder())
        .groupByKey(selectKey, objectEncoder())
        .mapValues(selectAccumulator, objectEncoder())
        .reduceGroups(mergeAccumulators)
        .flatMap(finalizeGroups, objectEncoder());

    // Leave the partitioning untouched when asked to ignore it or when no partition count was given.
    if (ignorePartitionsDuringDatasetAggregation || partitions == null) {
        return wrap(aggregated.toJavaRDD());
    }
    return wrap(aggregated.coalesce(partitions).toJavaRDD());
}
Also used: DatasetAggregationGetKeyFunction (io.cdap.cdap.etl.spark.function.DatasetAggregationGetKeyFunction), RecordInfo (io.cdap.cdap.etl.common.RecordInfo), PluginFunctionContext (io.cdap.cdap.etl.spark.function.PluginFunctionContext), Tuple2 (scala.Tuple2), DatasetAggregationFinalizeFunction (io.cdap.cdap.etl.spark.function.DatasetAggregationFinalizeFunction), DatasetAggregationReduceFunction (io.cdap.cdap.etl.spark.function.DatasetAggregationReduceFunction), DatasetAggregationAccumulator (io.cdap.cdap.etl.spark.function.DatasetAggregationAccumulator)

Aggregations

RecordInfo (io.cdap.cdap.etl.common.RecordInfo): 1, DatasetAggregationAccumulator (io.cdap.cdap.etl.spark.function.DatasetAggregationAccumulator): 1, DatasetAggregationFinalizeFunction (io.cdap.cdap.etl.spark.function.DatasetAggregationFinalizeFunction): 1, DatasetAggregationGetKeyFunction (io.cdap.cdap.etl.spark.function.DatasetAggregationGetKeyFunction): 1, DatasetAggregationReduceFunction (io.cdap.cdap.etl.spark.function.DatasetAggregationReduceFunction): 1, PluginFunctionContext (io.cdap.cdap.etl.spark.function.PluginFunctionContext): 1, Tuple2 (scala.Tuple2): 1