Use of org.apache.flink.util.Collector in project flink by apache.
The class GroupReduceOperatorTest, method testGroupReduceCollection.
@Test
public void testGroupReduceCollection() {
    try {
        final GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> reducer =
                new GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {

                    @Override
                    public void reduce(Iterable<Tuple2<String, Integer>> values, Collector<Tuple2<String, Integer>> out) throws Exception {
                        // sum field f1 over the group, reusing the first input tuple as the result
                        Iterator<Tuple2<String, Integer>> input = values.iterator();
                        Tuple2<String, Integer> result = input.next();
                        int sum = result.f1;
                        while (input.hasNext()) {
                            Tuple2<String, Integer> next = input.next();
                            sum += next.f1;
                        }
                        result.f1 = sum;
                        out.collect(result);
                    }
                };

        GroupReduceOperatorBase<Tuple2<String, Integer>, Tuple2<String, Integer>, GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>> op =
                new GroupReduceOperatorBase<Tuple2<String, Integer>, Tuple2<String, Integer>, GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>>(
                        reducer,
                        new UnaryOperatorInformation<Tuple2<String, Integer>, Tuple2<String, Integer>>(
                                TypeInfoParser.<Tuple2<String, Integer>>parse("Tuple2<String, Integer>"),
                                TypeInfoParser.<Tuple2<String, Integer>>parse("Tuple2<String, Integer>")),
                        new int[] { 0 },
                        "TestReducer");

        List<Tuple2<String, Integer>> input = new ArrayList<Tuple2<String, Integer>>(asList(
                new Tuple2<String, Integer>("foo", 1),
                new Tuple2<String, Integer>("foo", 3),
                new Tuple2<String, Integer>("bar", 2),
                new Tuple2<String, Integer>("bar", 4)));

        ExecutionConfig executionConfig = new ExecutionConfig();

        // run once with object reuse disabled (mutation-safe mode) ...
        executionConfig.disableObjectReuse();
        List<Tuple2<String, Integer>> resultMutableSafe = op.executeOnCollections(input, null, executionConfig);

        // ... and once with object reuse enabled
        executionConfig.enableObjectReuse();
        List<Tuple2<String, Integer>> resultRegular = op.executeOnCollections(input, null, executionConfig);

        // both runs must produce the same per-key sums
        Set<Tuple2<String, Integer>> resultSetMutableSafe = new HashSet<Tuple2<String, Integer>>(resultMutableSafe);
        Set<Tuple2<String, Integer>> resultSetRegular = new HashSet<Tuple2<String, Integer>>(resultRegular);
        Set<Tuple2<String, Integer>> expectedResult = new HashSet<Tuple2<String, Integer>>(asList(
                new Tuple2<String, Integer>("foo", 4),
                new Tuple2<String, Integer>("bar", 6)));

        assertEquals(expectedResult, resultSetMutableSafe);
        assertEquals(expectedResult, resultSetRegular);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
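Note that the reducer above mutates its first input tuple and emits it, which is exactly the pattern the disableObjectReuse/enableObjectReuse comparison exercises: with reuse enabled, the runtime may hand back the same tuple instance on every call to input.next(). A minimal sketch of a reuse-safe alternative (the class name SumReducer is ours, not from the test):

import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Sketch: accumulate into locals and emit a fresh tuple, so the
// function gives the same result whether or not objects are reused.
public class SumReducer implements GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> {

    @Override
    public void reduce(Iterable<Tuple2<String, Integer>> values, Collector<Tuple2<String, Integer>> out) {
        String key = null;
        int sum = 0;
        for (Tuple2<String, Integer> value : values) {
            key = value.f0;  // the key is constant within a group
            sum += value.f1;
        }
        out.collect(new Tuple2<String, Integer>(key, sum));
    }
}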
Use of org.apache.flink.util.Collector in project flink by apache.
The class BranchingPlansCompilerTest, method testBranchingWithMultipleDataSinks.
/**
 *
 * <pre>
 *                (SINK A)
 *                    |    (SINK B)    (SINK C)
 *                  CROSS    /          /
 *                 /     \   |  +------+
 *                /       \  | /
 *            REDUCE      MATCH2
 *               |    +---/    \
 *                \  /          |
 *                MAP           |
 *                 |            |
 *              COGROUP      MATCH1
 *             /     \       /     \
 *        (SRC A)    (SRC B)    (SRC C)
 * </pre>
 */
@Test
public void testBranchingWithMultipleDataSinks() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(DEFAULT_PARALLELISM);

        DataSet<Tuple2<Long, Long>> sourceA = env.generateSequence(1, 10000000).map(new Duplicator<Long>());
        DataSet<Tuple2<Long, Long>> sourceB = env.generateSequence(1, 10000000).map(new Duplicator<Long>());
        DataSet<Tuple2<Long, Long>> sourceC = env.generateSequence(1, 10000000).map(new Duplicator<Long>());

        DataSet<Tuple2<Long, Long>> mapped = sourceA.coGroup(sourceB).where(0).equalTo(1)
                .with(new CoGroupFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                    @Override
                    public void coGroup(Iterable<Tuple2<Long, Long>> first, Iterable<Tuple2<Long, Long>> second, Collector<Tuple2<Long, Long>> out) {
                        // intentionally empty: only the plan shape matters for this test
                    }
                })
                .map(new IdentityMapper<Tuple2<Long, Long>>());

        DataSet<Tuple2<Long, Long>> joined = sourceB.join(sourceC).where(0).equalTo(1)
                .with(new DummyFlatJoinFunction<Tuple2<Long, Long>>());
        DataSet<Tuple2<Long, Long>> joined2 = mapped.join(joined).where(1).equalTo(1)
                .with(new DummyFlatJoinFunction<Tuple2<Long, Long>>());
        DataSet<Tuple2<Long, Long>> reduced = mapped.groupBy(1).reduceGroup(new Top1GroupReducer<Tuple2<Long, Long>>());

        reduced.cross(joined2).output(new DiscardingOutputFormat<Tuple2<Tuple2<Long, Long>, Tuple2<Long, Long>>>());
        joined2.output(new DiscardingOutputFormat<Tuple2<Long, Long>>());
        joined2.output(new DiscardingOutputFormat<Tuple2<Long, Long>>());

        Plan plan = env.createProgramPlan();
        OptimizedPlan oPlan = compileNoStats(plan);
        new JobGraphGenerator().compileJobGraph(oPlan);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
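The plan relies on helpers from the surrounding test utilities (Duplicator, IdentityMapper, DummyFlatJoinFunction, Top1GroupReducer) whose bodies are not shown here. Judging from its use, Duplicator presumably pairs each value with itself; a minimal version under that assumption:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;

// Assumed behavior: turn a DataSet<T> into a DataSet<Tuple2<T, T>>
// by pairing every element with itself.
public class Duplicator<T> implements MapFunction<T, Tuple2<T, T>> {

    @Override
    public Tuple2<T, T> map(T value) {
        return new Tuple2<T, T>(value, value);
    }
}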
Use of org.apache.flink.util.Collector in project flink by apache.
The class BatchTask, method initOutputs.
/**
 * Creates a writer for each output. Creates an OutputCollector which forwards its input to all writers.
 * The output collector applies the configured shipping strategy.
 */
@SuppressWarnings("unchecked")
public static <T> Collector<T> initOutputs(
        AbstractInvokable containingTask,
        ClassLoader cl,
        TaskConfig config,
        List<ChainedDriver<?, ?>> chainedTasksTarget,
        List<RecordWriter<?>> eventualOutputs,
        ExecutionConfig executionConfig,
        Map<String, Accumulator<?, ?>> accumulatorMap) throws Exception {

    final int numOutputs = config.getNumOutputs();

    // check whether we got any chained tasks
    final int numChained = config.getNumberOfChainedStubs();
    if (numChained > 0) {
        // got chained stubs. that means that this one may only have a single forward connection
        if (numOutputs != 1 || config.getOutputShipStrategy(0) != ShipStrategyType.FORWARD) {
            throw new RuntimeException("Plan Generation Bug: Found a chained stub that is not connected via an only forward connection.");
        }

        // instantiate each task
        @SuppressWarnings("rawtypes")
        Collector previous = null;
        for (int i = numChained - 1; i >= 0; --i) {
            // get the task first
            final ChainedDriver<?, ?> ct;
            try {
                Class<? extends ChainedDriver<?, ?>> ctc = config.getChainedTask(i);
                ct = ctc.newInstance();
            } catch (Exception ex) {
                throw new RuntimeException("Could not instantiate chained task driver.", ex);
            }

            // get the configuration for the task
            final TaskConfig chainedStubConf = config.getChainedStubConfig(i);
            final String taskName = config.getChainedTaskName(i);

            if (i == numChained - 1) {
                // last in chain, instantiate the output collector for this task
                previous = getOutputCollector(containingTask, chainedStubConf, cl, eventualOutputs, 0, chainedStubConf.getNumOutputs());
            }

            ct.setup(chainedStubConf, taskName, previous, containingTask, cl, executionConfig, accumulatorMap);
            chainedTasksTarget.add(0, ct);

            if (i == numChained - 1) {
                ct.getIOMetrics().reuseOutputMetricsForTask();
            }

            previous = ct;
        }
        // the collector of the first in the chain is the collector for the task
        return (Collector<T>) previous;
    }

    // instantiate the output collector the default way from this configuration
    return getOutputCollector(containingTask, config, cl, eventualOutputs, 0, numOutputs);
}
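The fan-out behavior described in the Javadoc, one collector forwarding every record to several writers, can be sketched independently of Flink's internals. This is an illustrative stand-in only; the real OutputCollector additionally serializes records and applies the configured ship strategy per output channel:

import java.util.List;
import org.apache.flink.util.Collector;

// Illustrative sketch of a broadcasting collector: every record is
// forwarded to all downstream collectors, and close() is propagated.
public class BroadcastingCollector<T> implements Collector<T> {

    private final List<Collector<T>> targets;

    public BroadcastingCollector(List<Collector<T>> targets) {
        this.targets = targets;
    }

    @Override
    public void collect(T record) {
        for (Collector<T> target : targets) {
            target.collect(record);
        }
    }

    @Override
    public void close() {
        for (Collector<T> target : targets) {
            target.close();
        }
    }
}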
Use of org.apache.flink.util.Collector in project flink by apache.
The class GroupCombineITCase, method testCheckPartitionShuffleGroupBy.
@Test
// check that no shuffle is being executed
public void testCheckPartitionShuffleGroupBy() throws Exception {
    org.junit.Assume.assumeTrue(mode != TestExecutionMode.COLLECTION);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);

    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);

    List<Tuple2<Long, Integer>> result = partitionedDS
            .combineGroup(new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    int count = 0;
                    long key = 0;
                    for (Tuple3<Integer, Long, String> value : values) {
                        key = value.f1;
                        count++;
                    }
                    out.collect(new Tuple2<>(key, count));
                }
            })
            .collect();

    // The full per-key counts, sorted to match the sorted result below.
    // Because the data is hash-partitioned on field 0 but grouped on
    // field 1, groups are split across partitions and the local combine
    // must NOT reproduce these counts.
    String[] localExpected = new String[] { "(1,1)", "(2,2)", "(3,3)", "(4,4)", "(5,5)", "(6,6)" };

    String[] resultAsStringArray = new String[result.size()];
    for (int i = 0; i < resultAsStringArray.length; ++i) {
        resultAsStringArray[i] = result.get(i).toString();
    }
    Arrays.sort(resultAsStringArray);

    Assert.assertEquals("The two arrays were identical.", false, Arrays.equals(localExpected, resultAsStringArray));
}
Use of org.apache.flink.util.Collector in project flink by apache.
The class GroupCombineITCase, method testCheckPartitionShuffleDOP1.
@Test
// check that a parallelism of 1 yields the same data as a full shuffle would
public void testCheckPartitionShuffleDOP1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);

    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);

    List<Tuple2<Long, Integer>> result = partitionedDS
            .combineGroup(new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    int count = 0;
                    long key = 0;
                    for (Tuple3<Integer, Long, String> value : values) {
                        key = value.f1;
                        count++;
                    }
                    out.collect(new Tuple2<>(key, count));
                }
            })
            .collect();

    // with a single partition, every group is complete, so the combine
    // produces the full per-key counts
    String expected = "6,6\n" + "5,5\n" + "4,4\n" + "3,3\n" + "2,2\n" + "1,1\n";

    compareResultAsTuples(result, expected);
}