
Example 1 with PluginFunctionContext

Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.

In the class BaseRDDCollection, the method aggregate:

@Override
public SparkCollection<RecordInfo<Object>> aggregate(StageSpec stageSpec, @Nullable Integer partitions, StageStatisticsCollector collector) {
    PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
    // Phase 1: emit a group key for every record using the aggregator's groupBy logic.
    PairFlatMapFunction<T, Object, T> groupByFunction = new AggregatorGroupByFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    JavaPairRDD<Object, T> keyedCollection = rdd.flatMapToPair(groupByFunction);
    // Group records by key, honoring an explicit partition count when one is given.
    JavaPairRDD<Object, Iterable<T>> groupedCollection = partitions == null ? keyedCollection.groupByKey() : keyedCollection.groupByKey(partitions);
    // Phase 2: run the aggregator over each group to produce the output records.
    FlatMapFunction<Tuple2<Object, Iterable<T>>, RecordInfo<Object>> sparkAggregateFunction = new AggregatorAggregateFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    return wrap(groupedCollection.flatMap(sparkAggregateFunction));
}
Also used : PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) RecordInfo(io.cdap.cdap.etl.common.RecordInfo) Tuple2(scala.Tuple2) AggregatorAggregateFunction(io.cdap.cdap.etl.spark.function.AggregatorAggregateFunction) AggregatorGroupByFunction(io.cdap.cdap.etl.spark.function.AggregatorGroupByFunction)
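
For readers less familiar with this two-phase shape (key every record, then aggregate each group), the following is a minimal, self-contained sketch in plain Spark Java with no CDAP classes. The GroupByAggregateSketch class, the sample data, and the length-based grouping are invented for illustration; they stand in for the roles that AggregatorGroupByFunction and AggregatorAggregateFunction play above. The iterator-returning lambdas assume the Spark 2.x Java API.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class GroupByAggregateSketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "group-by-aggregate-sketch");
        JavaRDD<String> records = jsc.parallelize(Arrays.asList("a", "bb", "cc", "ddd"));
        // Phase 1: key every record, as AggregatorGroupByFunction does.
        JavaPairRDD<Integer, String> keyed = records.flatMapToPair(
            s -> Arrays.asList(new Tuple2<>(s.length(), s)).iterator());
        // Phase 2: aggregate each group, as AggregatorAggregateFunction does.
        JavaRDD<String> aggregated = keyed.groupByKey().flatMap(group -> {
            long count = 0;
            for (String ignored : group._2()) {
                count++;
            }
            return Arrays.asList(group._1() + ":" + count).iterator();
        });
        aggregated.collect().forEach(System.out::println);
        jsc.stop();
    }
}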

Example 2 with PluginFunctionContext

Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.

In the class BaseRDDCollection, the method publishAlerts:

@Override
public void publishAlerts(StageSpec stageSpec, StageStatisticsCollector collector) throws Exception {
    PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
    // Instantiate the alert publisher plugin for this stage and initialize it with pipeline runtime state.
    AlertPublisher alertPublisher = pluginFunctionContext.createPlugin();
    PipelineRuntime pipelineRuntime = new SparkPipelineRuntime(sec);
    AlertPublisherContext alertPublisherContext = new DefaultAlertPublisherContext(pipelineRuntime, stageSpec, sec.getMessagingContext(), sec.getAdmin());
    alertPublisher.initialize(alertPublisherContext);
    // Collect the alerts to the driver and wrap the iterator so consumed records are counted as stage metrics.
    StageMetrics stageMetrics = new DefaultStageMetrics(sec.getMetrics(), stageSpec.getName());
    TrackedIterator<Alert> trackedAlerts = new TrackedIterator<>(((JavaRDD<Alert>) rdd).collect().iterator(), stageMetrics, Constants.Metrics.RECORDS_IN);
    alertPublisher.publish(trackedAlerts);
    alertPublisher.destroy();
}
Also used : PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) AlertPublisher(io.cdap.cdap.etl.api.AlertPublisher) PipelineRuntime(io.cdap.cdap.etl.common.PipelineRuntime) SparkPipelineRuntime(io.cdap.cdap.etl.spark.SparkPipelineRuntime) TrackedIterator(io.cdap.cdap.etl.common.TrackedIterator) Alert(io.cdap.cdap.etl.api.Alert) AlertPublisherContext(io.cdap.cdap.etl.api.AlertPublisherContext) DefaultAlertPublisherContext(io.cdap.cdap.etl.common.DefaultAlertPublisherContext) StageMetrics(io.cdap.cdap.etl.api.StageMetrics) DefaultStageMetrics(io.cdap.cdap.etl.common.DefaultStageMetrics) JavaRDD(org.apache.spark.api.java.JavaRDD)
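
The detail worth noting here is TrackedIterator: the RECORDS_IN metric is updated lazily, as the publisher consumes alerts, not by counting them up front. Below is a minimal sketch of that counting-iterator pattern; CountingIterator is a hypothetical stand-in for CDAP's TrackedIterator, not its actual implementation.

import java.util.Arrays;
import java.util.Iterator;
import java.util.concurrent.atomic.AtomicLong;

// Hypothetical stand-in for TrackedIterator: counts elements as they are consumed.
class CountingIterator<T> implements Iterator<T> {
    private final Iterator<T> delegate;
    private final AtomicLong counter;

    CountingIterator(Iterator<T> delegate, AtomicLong counter) {
        this.delegate = delegate;
        this.counter = counter;
    }

    @Override
    public boolean hasNext() {
        return delegate.hasNext();
    }

    @Override
    public T next() {
        T element = delegate.next();
        // The count only advances when the consumer actually pulls a record.
        counter.incrementAndGet();
        return element;
    }

    public static void main(String[] args) {
        AtomicLong recordsIn = new AtomicLong();
        Iterator<String> alerts = new CountingIterator<>(Arrays.asList("a", "b").iterator(), recordsIn);
        while (alerts.hasNext()) {
            alerts.next();
        }
        System.out.println(recordsIn.get()); // prints 2
    }
}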

Example 3 with PluginFunctionContext

Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.

In the class BaseRDDCollection, the method reduceAggregate:

@Override
public SparkCollection<RecordInfo<Object>> reduceAggregate(StageSpec stageSpec, @Nullable Integer partitions, StageStatisticsCollector collector) {
    PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
    // Key every record using the reduce aggregator's groupBy logic.
    PairFlatMapFunction<T, Object, T> groupByFunction = new AggregatorReduceGroupByFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    JavaPairRDD<Object, T> keyedCollection = rdd.flatMapToPair(groupByFunction);
    // The three combineByKey functions: create an accumulator from the first value, fold further
    // values into a partition-local accumulator, and merge accumulators across partitions.
    Function<T, Object> initializeFunction = new AggregatorInitializeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    Function2<Object, T, Object> mergeValueFunction = new AggregatorMergeValueFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    Function2<Object, Object, Object> mergePartitionFunction = new AggregatorMergePartitionFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    JavaPairRDD<Object, Object> groupedCollection = partitions == null ? keyedCollection.combineByKey(initializeFunction, mergeValueFunction, mergePartitionFunction) : keyedCollection.combineByKey(initializeFunction, mergeValueFunction, mergePartitionFunction, partitions);
    // Finalize each accumulator into output records.
    FlatMapFunction<Tuple2<Object, Object>, RecordInfo<Object>> postFunction = new AggregatorFinalizeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    return wrap(groupedCollection.flatMap(postFunction));
}
Also used : RecordInfo(io.cdap.cdap.etl.common.RecordInfo) AggregatorReduceGroupByFunction(io.cdap.cdap.etl.spark.function.AggregatorReduceGroupByFunction) AggregatorInitializeFunction(io.cdap.cdap.etl.spark.function.AggregatorInitializeFunction) PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) AggregatorMergeValueFunction(io.cdap.cdap.etl.spark.function.AggregatorMergeValueFunction) Tuple2(scala.Tuple2) AggregatorFinalizeFunction(io.cdap.cdap.etl.spark.function.AggregatorFinalizeFunction) AggregatorMergePartitionFunction(io.cdap.cdap.etl.spark.function.AggregatorMergePartitionFunction)
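
reduceAggregate trades groupByKey for combineByKey, so values are pre-aggregated within each partition before the shuffle instead of materializing entire groups in memory. The sketch below shows the same three-function contract in plain Spark Java by computing a per-key average; the class name and sample data are invented for illustration, and the three lambdas correspond to the roles of AggregatorInitializeFunction, AggregatorMergeValueFunction, and AggregatorMergePartitionFunction.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class CombineByKeySketch {
    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "combine-by-key-sketch");
        JavaPairRDD<String, Integer> keyed = jsc.parallelizePairs(Arrays.asList(
            new Tuple2<>("a", 1), new Tuple2<>("a", 3), new Tuple2<>("b", 5)));
        JavaPairRDD<String, int[]> sumAndCount = keyed.combineByKey(
            v -> new int[] { v, 1 },                           // initialize: accumulator from the first value
            (acc, v) -> new int[] { acc[0] + v, acc[1] + 1 },  // merge value: fold into a partition-local accumulator
            (a, b) -> new int[] { a[0] + b[0], a[1] + b[1] }); // merge partitions: combine accumulators after the shuffle
        sumAndCount.mapValues(acc -> (double) acc[0] / acc[1])
            .collect()
            .forEach(t -> System.out.println(t._1() + " -> " + t._2()));
        jsc.stop();
    }
}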

Example 4 with PluginFunctionContext

Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.

In the class RDDCollection, the method reduceDatasetAggregate:

/**
 * Performs a reduce aggregate using the Dataset API. This allows Spark to perform optimizations
 * that are not available when working at the RDD level.
 */
private <GROUP_KEY, AGG_VALUE> SparkCollection<RecordInfo<Object>> reduceDatasetAggregate(StageSpec stageSpec, @Nullable Integer partitions, StageStatisticsCollector collector) {
    PluginFunctionContext pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, collector);
    DatasetAggregationGetKeyFunction<GROUP_KEY, T, AGG_VALUE> groupByFunction = new DatasetAggregationGetKeyFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    DatasetAggregationReduceFunction<T, AGG_VALUE> reduceFunction = new DatasetAggregationReduceFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    DatasetAggregationFinalizeFunction<GROUP_KEY, T, AGG_VALUE, ?> postFunction = new DatasetAggregationFinalizeFunction<>(pluginFunctionContext, functionCacheFactory.newCache());
    MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>, GROUP_KEY> keyFromTuple = Tuple2::_1;
    MapFunction<Tuple2<GROUP_KEY, DatasetAggregationAccumulator<T, AGG_VALUE>>, DatasetAggregationAccumulator<T, AGG_VALUE>> valueFromTuple = Tuple2::_2;
    Dataset<T> dataset = sqlContext.createDataset(rdd.rdd(), objectEncoder());
    Dataset<RecordInfo<Object>> groupedDataset = dataset
        // extract the group key for every record
        .flatMap(groupByFunction, tupleEncoder())
        .groupByKey(keyFromTuple, objectEncoder())
        .mapValues(valueFromTuple, objectEncoder())
        // reduce each group to a single accumulator, then finalize it into output records
        .reduceGroups(reduceFunction)
        .flatMap(postFunction, objectEncoder());
    if (!ignorePartitionsDuringDatasetAggregation && partitions != null) {
        groupedDataset = groupedDataset.coalesce(partitions);
    }
    return wrap(groupedDataset.toJavaRDD());
}
Also used : DatasetAggregationGetKeyFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationGetKeyFunction) RecordInfo(io.cdap.cdap.etl.common.RecordInfo) PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) Tuple2(scala.Tuple2) DatasetAggregationFinalizeFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationFinalizeFunction) DatasetAggregationReduceFunction(io.cdap.cdap.etl.spark.function.DatasetAggregationReduceFunction) DatasetAggregationAccumulator(io.cdap.cdap.etl.spark.function.DatasetAggregationAccumulator)
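
Outside CDAP, the same Dataset-level shape (group by an extracted key, then reduce each group) can be written directly against the Spark SQL API, which lets Catalyst plan the shuffle. The sketch below is a minimal illustration assuming Spark 2.x; the class name, sample data, and keep-the-larger-string reduction are invented.

import java.util.Arrays;
import org.apache.spark.api.java.function.MapFunction;
import org.apache.spark.api.java.function.ReduceFunction;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Encoders;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

public class DatasetReduceGroupsSketch {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().master("local[*]").appName("dataset-sketch").getOrCreate();
        Dataset<String> words = spark.createDataset(Arrays.asList("a", "bb", "cc", "ddd"), Encoders.STRING());
        // Explicit function types avoid ambiguity between the Java and Scala overloads.
        MapFunction<String, Integer> byLength = String::length;
        ReduceFunction<String> keepLarger = (a, b) -> a.compareTo(b) >= 0 ? a : b;
        Dataset<Tuple2<Integer, String>> largestPerLength =
            words.groupByKey(byLength, Encoders.INT()).reduceGroups(keepLarger);
        largestPerLength.show();
        spark.stop();
    }
}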

Example 5 with PluginFunctionContext

Use of io.cdap.cdap.etl.spark.function.PluginFunctionContext in project cdap by caskdata.

In the class DynamicDriverContext, the method readExternal:

@Override
public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
    serializationVersion = in.readUTF();
    stageSpec = (StageSpec) in.readObject();
    sec = (JavaSparkExecutionContext) in.readObject();
    // we intentionally do not serialize this context in order to ensure that the runtime arguments
    // and logical start time are picked up from the JavaSparkExecutionContext. If we serialized it,
    // the arguments and start time of the very first pipeline run would get serialized, then
    // used for every subsequent run that loads from the checkpoint.
    pluginFunctionContext = new PluginFunctionContext(stageSpec, sec, new NoopStageStatisticsCollector());
}
Also used : PluginFunctionContext(io.cdap.cdap.etl.spark.function.PluginFunctionContext) NoopStageStatisticsCollector(io.cdap.cdap.etl.common.NoopStageStatisticsCollector)
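
The rebuild-instead-of-restore idea in readExternal applies to any Externalizable class whose derived state must reflect the current run rather than the run that was serialized. A minimal sketch follows; RebuildOnReadContext is a hypothetical class invented to illustrate the pattern, not part of CDAP.

import java.io.Externalizable;
import java.io.IOException;
import java.io.ObjectInput;
import java.io.ObjectOutput;

public class RebuildOnReadContext implements Externalizable {
    private String stageName;
    // Derived state: deliberately never serialized.
    private transient long resolvedAtMillis;

    // Externalizable requires a public no-arg constructor.
    public RebuildOnReadContext() {
    }

    public RebuildOnReadContext(String stageName) {
        this.stageName = stageName;
        this.resolvedAtMillis = System.currentTimeMillis();
    }

    @Override
    public void writeExternal(ObjectOutput out) throws IOException {
        // Persist only the identifying field; derived state is intentionally skipped.
        out.writeUTF(stageName);
    }

    @Override
    public void readExternal(ObjectInput in) throws IOException {
        stageName = in.readUTF();
        // Rebuild instead of restore, so each deserialized copy picks up current
        // values, just as DynamicDriverContext recreates its PluginFunctionContext.
        resolvedAtMillis = System.currentTimeMillis();
    }
}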

Aggregations

PluginFunctionContext (io.cdap.cdap.etl.spark.function.PluginFunctionContext) 10
RecordInfo (io.cdap.cdap.etl.common.RecordInfo) 6
Tuple2 (scala.Tuple2) 4
MacroEvaluator (io.cdap.cdap.api.macro.MacroEvaluator) 3
NoopStageStatisticsCollector (io.cdap.cdap.etl.common.NoopStageStatisticsCollector) 3
PipelineRuntime (io.cdap.cdap.etl.common.PipelineRuntime) 3
SparkPipelineRuntime (io.cdap.cdap.etl.spark.SparkPipelineRuntime) 3
TxRunnable (io.cdap.cdap.api.TxRunnable) 2
DatasetContext (io.cdap.cdap.api.data.DatasetContext) 2
PluginContext (io.cdap.cdap.api.plugin.PluginContext) 2
BasicArguments (io.cdap.cdap.etl.common.BasicArguments) 2
DefaultMacroEvaluator (io.cdap.cdap.etl.common.DefaultMacroEvaluator) 2
StageSpec (io.cdap.cdap.etl.proto.v2.spec.StageSpec) 2
BatchSinkFunction (io.cdap.cdap.etl.spark.function.BatchSinkFunction) 2
SparkPipelinePluginContext (io.cdap.cdap.etl.spark.plugin.SparkPipelinePluginContext) 2
ImmutableSet (com.google.common.collect.ImmutableSet) 1
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder) 1
DataTracer (io.cdap.cdap.api.preview.DataTracer) 1
JavaSparkExecutionContext (io.cdap.cdap.api.spark.JavaSparkExecutionContext) 1
Alert (io.cdap.cdap.etl.api.Alert) 1