Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.
In class BaseRDDCollection, method aggregate.
@Override
public SparkCollection<RecordInfo<Object>> aggregate(StageSpec stageSpec, @Nullable Integer partitions,
                                                     StageStatisticsCollector collector) {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  PairFlatMapFunction<T, Object, T> groupByFunction =
    new AggregatorGroupByFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  JavaPairRDD<Object, T> keyedCollection = rdd.flatMapToPair(groupByFunction);
  JavaPairRDD<Object, Iterable<T>> groupedCollection = partitions == null
    ? keyedCollection.groupByKey() : keyedCollection.groupByKey(partitions);
  FlatMapFunction<Tuple2<Object, Iterable<T>>, RecordInfo<Object>> sparkAggregateFunction =
    new AggregatorAggregateFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  return wrap(groupedCollection.flatMap(sparkAggregateFunction));
}
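This is the classic two-phase shuffle aggregation: flatMapToPair emits a (group key, record) pair per input record, groupByKey shuffles records into groups, and a final flatMap turns each group into output records. Below is a minimal standalone sketch of the same shape using plain Spark types; the word-count logic and all class names are illustrative stand-ins for CDAP's Aggregator* functions, not the real plugin code.

import java.util.Arrays;
import java.util.Collections;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class GroupThenAggregateSketch {
  public static void main(String[] args) {
    try (JavaSparkContext jsc = new JavaSparkContext("local[2]", "group-then-aggregate")) {
      JavaRDD<String> records = jsc.parallelize(Arrays.asList("a", "b", "a", "c", "b", "a"));
      // Phase 1: emit (key, record) pairs -- the role AggregatorGroupByFunction plays above.
      JavaPairRDD<String, String> keyed =
        records.flatMapToPair(r -> Collections.singletonList(new Tuple2<>(r, r)).iterator());
      // Phase 2: shuffle by key, then emit output records per group --
      // the role AggregatorAggregateFunction plays above.
      JavaRDD<String> aggregated = keyed.groupByKey().flatMap(group -> {
        long count = 0;
        for (String ignored : group._2()) {
          count++;
        }
        return Collections.singletonList(group._1() + ":" + count).iterator();
      });
      aggregated.collect().forEach(System.out::println);
    }
  }
}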
Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.
In class BaseRDDCollection, method publishAlerts.
@Override
public void publishAlerts(StageSpec stageSpec, StageStatisticsCollector collector) throws Exception {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  AlertPublisher alertPublisher = pluginFunctionContext.createPlugin();
  PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
  AlertPublisherContext alertPublisherContext =
    new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, sec.getMessagingContext(), sec.getAdmin());
  alertPublisher.initialize(alertPublisherContext);
  StageMetrics stageMetrics = new DefaultStageMetrics(sec.getMetrics(), stageSpec.getName());
  TrackedIterator<Alert> trackedAlerts =
    new TrackedIterator<>(((JavaRDD<Alert>) rdd).collect().iterator(), stageMetrics, Constants.Metrics.RECORDS_IN);
  alertPublisher.publish(trackedAlerts);
  alertPublisher.destroy();
}
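Note that the alerts are brought back to the driver with collect(), so the publisher's initialize/publish/destroy lifecycle runs exactly once, driver-side, rather than per executor. A minimal sketch of that pattern, using a hypothetical Publisher interface as a stand-in for CDAP's real AlertPublisher API:

import java.util.Arrays;
import java.util.Iterator;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class DriverSidePublishSketch {

  // Stand-in for the initialize/publish/destroy lifecycle; NOT the real AlertPublisher API.
  interface Publisher<T> {
    void initialize();
    void publish(Iterator<T> records) throws Exception;
    void destroy();
  }

  public static void main(String[] args) throws Exception {
    try (JavaSparkContext jsc = new JavaSparkContext("local[2]", "driver-side-publish")) {
      JavaRDD<String> alerts = jsc.parallelize(Arrays.asList("disk-full", "cpu-high"));
      Publisher<String> publisher = new Publisher<String>() {
        @Override public void initialize() { System.out.println("initialized"); }
        @Override public void publish(Iterator<String> records) {
          records.forEachRemaining(alert -> System.out.println("published: " + alert));
        }
        @Override public void destroy() { System.out.println("destroyed"); }
      };
      publisher.initialize();
      // collect() pulls every alert to the driver before publishing, so this is only
      // safe when the alert volume is small, as in publishAlerts above.
      publisher.publish(alerts.collect().iterator());
      publisher.destroy();
    }
  }
}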
Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.
In class BaseRDDCollection, method reduceAggregate.
@Override
public SparkCollection<RecordInfo<Object>> reduceAggregate(StageSpec stageSpec, @Nullable Integer partitions,
                                                           StageStatisticsCollector collector) {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  PairFlatMapFunction<T, Object, T> groupByFunction =
    new AggregatorReduceGroupByFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  JavaPairRDD<Object, T> keyedCollection = rdd.flatMapToPair(groupByFunction);
  Function<T, Object> initializeFunction =
    new AggregatorInitializeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  Function2<Object, T, Object> mergeValueFunction =
    new AggregatorMergeValueFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  Function2<Object, Object, Object> mergePartitionFunction =
    new AggregatorMergePartitionFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  JavaPairRDD<Object, Object> groupedCollection = partitions == null
    ? keyedCollection.combineByKey(initializeFunction, mergeValueFunction, mergePartitionFunction)
    : keyedCollection.combineByKey(initializeFunction, mergeValueFunction, mergePartitionFunction, partitions);
  FlatMapFunction<Tuple2<Object, Object>, RecordInfo<Object>> postFunction =
    new AggregatorFinalizeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  return wrap(groupedCollection.flatMap(postFunction));
}
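Unlike aggregate above, reduceAggregate uses combineByKey, which merges values into per-key accumulators on the map side instead of shuffling and materializing the full group for each key. A minimal standalone sketch of the three-function contract with plain Spark types; the summing logic is illustrative only:

import java.util.Arrays;

import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public class CombineByKeySketch {
  public static void main(String[] args) {
    try (JavaSparkContext jsc = new JavaSparkContext("local[2]", "combine-by-key")) {
      JavaPairRDD<String, Integer> keyed = jsc.parallelizePairs(Arrays.asList(
        new Tuple2<>("a", 1), new Tuple2<>("a", 2), new Tuple2<>("b", 3)));
      JavaPairRDD<String, Integer> sums = keyed.combineByKey(
        value -> value,                // initialize: the first value becomes the accumulator
        (acc, value) -> acc + value,   // mergeValue: fold another value into the accumulator
        (accA, accB) -> accA + accB);  // mergePartition: combine accumulators across partitions
      sums.collect().forEach(pair -> System.out.println(pair._1() + " -> " + pair._2()));
    }
  }
}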
Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.
In class RDDCollection, method reduceDatasetAggregate.
/**
 * Performs the reduce aggregation using the Dataset API. This allows Spark to perform
 * various optimizations that are not available when working at the RDD level.
 */
private <GROUP_KEY, AGG_VALUE> SparkCollection<RecordInfo<Object>> reduceDatasetAggregate(
    StageSpec stageSpec, @Nullable Integer partitions, StageStatisticsCollector collector) {
  PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
  DatasetAggregationGetKeyFunction<GROUP_KEY, T, AGG_VALUE> groupByFunction =
    new DatasetAggregationGetKeyFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  DatasetAggregationReduceFunction<T, AGG_VALUE> reduceFunction =
    new DatasetAggregationReduceFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  DatasetAggregationFinalizeFunction<GROUP_KEY, T, AGG_VALUE, ?> postFunction =
    new DatasetAggregationFinalizeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
  MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>, GROUP_KEY> keyFromTuple = Tuple2::_1;
  MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>,
    DatasetAggregationAccumulator<T, AGG_VALUE>> valueFromTuple = Tuple2::_2;
  Dataset<T> dataset = sqlContext.createDataset(rdd.rdd(), objectEncoder());
  Dataset<RecordInfo<Object>> groupedDataset = dataset
    .flatMap(groupByFunction, tupleEncoder())
    .groupByKey(keyFromTuple, objectEncoder())
    .mapValues(valueFromTuple, objectEncoder())
    .reduceGroups(reduceFunction)
    .flatMap(postFunction, objectEncoder());
  if (!ignorePartitionsDuringDatasetAggregation && partitions != null) {
    groupedDataset = groupedDataset.coalesce(partitions);
  }
  return wrap(groupedDataset.toJavaRDD());
}
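The chain mirrors the RDD version in Dataset terms: flatMap keys each record, groupByKey and mapValues shape the (key, accumulator) pairs, reduceGroups merges accumulators per key, and the final flatMap produces the output records. Below is a minimal standalone sketch of the same groupByKey/reduceGroups shape with plain encoders, assuming nothing from CDAP; string concatenation stands in for the accumulator merge.

import java.util.Arrays;

import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.KeyValueGroupedDataset;
import org.apache.spark.sql.SparkSession;

import scala.Tuple2;

public class DatasetReduceGroupsSketch {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
      .master("local[2]").appName("dataset-reduce-groups").getOrCreate();
    Dataset<String> records = spark.createDataset(
      Arrays.asList("a", "b", "a", "c", "b", "a"), Encoders.STRING());
    // Explicitly typed functional interfaces avoid ambiguity with the Scala overloads in Java.
    MapFunction<String, String> keyOf = record -> record;
    ReduceFunction<String> concat = (left, right) -> left + right;
    KeyValueGroupedDataset<String, String> grouped = records.groupByKey(keyOf, Encoders.STRING());
    // reduceGroups merges values pairwise within each group, which can let the planner
    // apply a partial aggregation before the shuffle -- unavailable with an RDD groupByKey.
    Dataset<Tuple2<String, String>> reduced = grouped.reduceGroups(concat);
    reduced.show();
    spark.stop();
  }
}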
Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.
In class DynamicDriverContext, method readExternal.
@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
  serializationVersion = in.readUTF();
  stageSpec = (StageSpec) in.readObject();
  sec = (JavaSparkExecutionContext) in.readObject();
  // We intentionally do not serialize this context in order to ensure that the runtime arguments
  // and logical start time are picked up from the JavaSparkExecutionContext. If we serialized it,
  // the arguments and start time of the very first pipeline run would get serialized, then
  // used for every subsequent run that loads from the checkpoint.
  pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, new NoopStageStatisticsCollector());
}
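The same rebuild-on-read pattern in isolation: persist only the stable state, keep the derived field out of writeExternal, and reconstruct it in readExternal so it always reflects the current run. A minimal self-contained sketch; the class and its fields are hypothetical, with a string standing in for the rebuilt PluginFunctionContext:

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectInputStream;
import java.io.ObjectOutput;
import java.io.ObjectOutputStream;

public class RebuildOnReadSketch implements Externalizable {
  private String stageName;                 // persisted across serialization
  private transient String derivedContext;  // rebuilt on read, never written

  public RebuildOnReadSketch() { }          // public no-arg constructor required by Externalizable

  public RebuildOnReadSketch(String stageName) {
    this.stageName = stageName;
    this.derivedContext = buildContext(stageName);
  }

  // Stand-in for new PluginFunctionContext(stageSpec, sec, collector) above.
  private static String buildContext(String name) {
    return "context-for-" + name + "@" + System.currentTimeMillis();
  }

  @Override
  public void writeExternal(ObjectOutput out) throws IOException {
    out.writeUTF(stageName); // derivedContext is intentionally omitted
  }

  @Override
  public void readExternal(ObjectInput in) throws IOException {
    stageName = in.readUTF();
    derivedContext = buildContext(stageName); // always rebuilt from the current environment
  }

  public static void main(String[] args) throws Exception {
    RebuildOnReadSketch original = new RebuildOnReadSketch("parse");
    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
    try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
      out.writeObject(original);
    }
    try (ObjectInputStream in = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
      RebuildOnReadSketch copy = (RebuildOnReadSketch) in.readObject();
      System.out.println(copy.derivedContext); // a fresh context, not the one from write time
    }
  }
}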