Search in sources :

Example 1 with TupleSummaryAggregator

Use of org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator in project flink by apache.

The method summarize of the class DataSetUtils.

// --------------------------------------------------------------------------------------------
//  Summarize
// --------------------------------------------------------------------------------------------
/**
 * Summarizes a DataSet of Tuples by collecting single-pass statistics for all columns.
 *
 * <p>Example usage:
 * <pre>
 * {@code
 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
 * Tuple3<NumericColumnSummary, StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input);
 *
 * summary.f0.getStandardDeviation();
 * summary.f1.getMaxLength();
 * }
 * </pre>
 *
 * @param input the DataSet to summarize; its type must be a tuple type
 * @param <R> the Tuple type of the returned summary
 * @param <T> the Tuple type of the input rows
 * @return the summary as a Tuple the same width as input rows
 * @throws IllegalArgumentException if the type of {@code input} is not a tuple type
 * @throws Exception if building or collecting the aggregated result fails
 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
    if (!input.getType().isTupleType()) {
        throw new IllegalArgumentException("summarize() is only implemented for DataSet's of Tuples");
    }
    final TupleTypeInfoBase<?> tupleType = (TupleTypeInfoBase<?>) input.getType();

    // Step 1: fold every row of a partition into a single aggregator for that partition.
    final MapPartitionFunction<T, TupleSummaryAggregator<R>> summarizePartition = new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {

        @Override
        public void mapPartition(Iterable<T> rows, Collector<TupleSummaryAggregator<R>> collector) throws Exception {
            final TupleSummaryAggregator<R> partial = SummaryAggregatorFactory.create(tupleType);
            for (T row : rows) {
                partial.aggregate(row);
            }
            collector.collect(partial);
        }
    };

    // Step 2: merge the per-partition aggregators pairwise; the left one absorbs the right one.
    final ReduceFunction<TupleSummaryAggregator<R>> mergeAggregators = new ReduceFunction<TupleSummaryAggregator<R>>() {

        @Override
        public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> left, TupleSummaryAggregator<R> right) throws Exception {
            left.combine(right);
            return left;
        }
    };

    final DataSet<TupleSummaryAggregator<R>> merged = input.mapPartition(summarizePartition).reduce(mergeAggregators);

    // Collect the reduced DataSet and read the summary off its first element.
    final TupleSummaryAggregator<R> combined = merged.collect().get(0);
    return combined.result();
}
Also used : RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) TupleTypeInfoBase(org.apache.flink.api.java.typeutils.TupleTypeInfoBase) Collector(org.apache.flink.util.Collector) Tuple(org.apache.flink.api.java.tuple.Tuple) TupleSummaryAggregator(org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator)

Aggregations

MapPartitionFunction (org.apache.flink.api.common.functions.MapPartitionFunction)1 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)1 TupleSummaryAggregator (org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator)1 Tuple (org.apache.flink.api.java.tuple.Tuple)1 TupleTypeInfoBase (org.apache.flink.api.java.typeutils.TupleTypeInfoBase)1 Collector (org.apache.flink.util.Collector)1