Search in sources :

Example 1 with MapPartitionFunction

use of org.apache.flink.api.common.functions.MapPartitionFunction in project flink by apache.

From the class PartitionMapOperatorTest, method testMapPartitionWithRuntimeContext.

/**
 * Runs a {@link RichMapPartitionFunction} that parses strings to integers through
 * {@code MapPartitionOperatorBase.executeOnCollections}, once with object reuse disabled and
 * once with it enabled, and checks that:
 * <ul>
 *   <li>{@code open()} sees the subtask index, parallelism and task name given in the {@code TaskInfo},</li>
 *   <li>both executions produce the parsed integers in input order,</li>
 *   <li>{@code open()} and {@code close()} were each invoked.</li>
 * </ul>
 *
 * @throws Exception any unexpected exception is propagated so that the test framework reports
 *                   it with its full stack trace instead of a bare message
 */
@Test
public void testMapPartitionWithRuntimeContext() throws Exception {
    final String taskName = "Test Task";
    // Flags flipped by the lifecycle hooks of the function under test.
    final AtomicBoolean opened = new AtomicBoolean();
    final AtomicBoolean closed = new AtomicBoolean();
    final MapPartitionFunction<String, Integer> parser = new RichMapPartitionFunction<String, Integer>() {

        @Override
        public void open(Configuration parameters) throws Exception {
            opened.set(true);
            // The runtime context must reflect the TaskInfo handed to the operator below.
            RuntimeContext ctx = getRuntimeContext();
            assertEquals(0, ctx.getIndexOfThisSubtask());
            assertEquals(1, ctx.getNumberOfParallelSubtasks());
            assertEquals(taskName, ctx.getTaskName());
        }

        @Override
        public void mapPartition(Iterable<String> values, Collector<Integer> out) {
            for (String s : values) {
                out.collect(Integer.parseInt(s));
            }
        }

        @Override
        public void close() throws Exception {
            closed.set(true);
        }
    };
    MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>> op = new MapPartitionOperatorBase<String, Integer, MapPartitionFunction<String, Integer>>(parser, new UnaryOperatorInformation<String, Integer>(BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.INT_TYPE_INFO), taskName);
    List<String> input = new ArrayList<String>(asList("1", "2", "3", "4", "5", "6"));
    final TaskInfo taskInfo = new TaskInfo(taskName, 1, 0, 1, 0);
    ExecutionConfig executionConfig = new ExecutionConfig();

    // First run: object reuse disabled.
    executionConfig.disableObjectReuse();
    List<Integer> resultMutableSafe = op.executeOnCollections(input, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);

    // Second run: object reuse enabled; the output must be the same.
    executionConfig.enableObjectReuse();
    List<Integer> resultRegular = op.executeOnCollections(input, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);

    assertEquals(asList(1, 2, 3, 4, 5, 6), resultMutableSafe);
    assertEquals(asList(1, 2, 3, 4, 5, 6), resultRegular);
    assertTrue(opened.get());
    assertTrue(closed.get());
}
Also used : Path(org.apache.flink.core.fs.Path) UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) Configuration(org.apache.flink.configuration.Configuration) HashMap(java.util.HashMap) RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) ArrayList(java.util.ArrayList) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) TaskInfo(org.apache.flink.api.common.TaskInfo) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) Collector(org.apache.flink.util.Collector) RuntimeUDFContext(org.apache.flink.api.common.functions.util.RuntimeUDFContext) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext) Test(org.junit.Test)

Example 2 with MapPartitionFunction

use of org.apache.flink.api.common.functions.MapPartitionFunction in project flink by apache.

From the class DataSetUtils, method summarize.

// --------------------------------------------------------------------------------------------
//  Summarize
// --------------------------------------------------------------------------------------------
/**
	 * Computes single-pass summary statistics for every column of a DataSet of Tuples.
	 *
	 * <p>Every partition is folded into one {@link TupleSummaryAggregator}; the per-partition
	 * aggregators are then combined pairwise, and the result of the combined aggregator is returned.
	 *
	 * <p>Example usage:
	 * <pre>
	 * {@code
	 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
	 * Tuple3<NumericColumnSummary, StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input);
	 *
	 * summary.f0.getStandardDeviation();
	 * summary.f1.getMaxLength();
	 * }
	 * </pre>
	 *
	 * @param input the DataSet to summarize; its type must be a tuple type
	 * @return the summary as a Tuple the same width as input rows
	 * @throws IllegalArgumentException if the type of {@code input} is not a tuple type
	 * @throws Exception declared by the signature; may be raised while the result is collected
	 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
    if (!input.getType().isTupleType()) {
        throw new IllegalArgumentException("summarize() is only implemented for DataSet's of Tuples");
    }
    final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();

    // Step 1: fold all rows of one partition into a single aggregator.
    final MapPartitionFunction<T, TupleSummaryAggregator<R>> summarizePartition = new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {

        @Override
        public void mapPartition(Iterable<T> rows, Collector<TupleSummaryAggregator<R>> collector) throws Exception {
            TupleSummaryAggregator<R> partial = SummaryAggregatorFactory.create(inType);
            for (Tuple row : rows) {
                partial.aggregate(row);
            }
            collector.collect(partial);
        }
    };

    // Step 2: merge two partial aggregators, keeping the left one as the accumulator.
    final ReduceFunction<TupleSummaryAggregator<R>> mergePartials = new ReduceFunction<TupleSummaryAggregator<R>>() {

        @Override
        public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> left, TupleSummaryAggregator<R> right) throws Exception {
            left.combine(right);
            return left;
        }
    };

    DataSet<TupleSummaryAggregator<R>> result = input.mapPartition(summarizePartition).reduce(mergePartials);

    // The first collected element is taken as the fully merged aggregator.
    TupleSummaryAggregator<R> total = result.collect().get(0);
    return total.result();
}
Also used : RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) TupleTypeInfoBase(org.apache.flink.api.java.typeutils.TupleTypeInfoBase) Collector(org.apache.flink.util.Collector) Tuple(org.apache.flink.api.java.tuple.Tuple) TupleSummaryAggregator(org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator)

Aggregations

MapPartitionFunction (org.apache.flink.api.common.functions.MapPartitionFunction)2 RichMapPartitionFunction (org.apache.flink.api.common.functions.RichMapPartitionFunction)2 Collector (org.apache.flink.util.Collector)2 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)1 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)1 TaskInfo (org.apache.flink.api.common.TaskInfo)1 RuntimeContext (org.apache.flink.api.common.functions.RuntimeContext)1 RuntimeUDFContext (org.apache.flink.api.common.functions.util.RuntimeUDFContext)1 TupleSummaryAggregator (org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator)1 Tuple (org.apache.flink.api.java.tuple.Tuple)1 TupleTypeInfoBase (org.apache.flink.api.java.typeutils.TupleTypeInfoBase)1 Configuration (org.apache.flink.configuration.Configuration)1 Path (org.apache.flink.core.fs.Path)1 UnregisteredMetricsGroup (org.apache.flink.metrics.groups.UnregisteredMetricsGroup)1 Test (org.junit.Test)1