Use of org.apache.flink.util.Collector in project flink by apache.
The class GroupReduceOperatorTest, method testGroupReduceCollection.
@Test
public void testGroupReduceCollection() {
    try {
        final GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> reducer =
                new GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {

                    @Override
                    public void reduce(Iterable<Tuple2<String, Integer>> values, Collector<Tuple2<String, Integer>> out) throws Exception {
                        // sum field f1 over the group, reusing the first input tuple as the result
                        Iterator<Tuple2<String, Integer>> input = values.iterator();
                        Tuple2<String, Integer> result = input.next();
                        int sum = result.f1;
                        while (input.hasNext()) {
                            Tuple2<String, Integer> next = input.next();
                            sum += next.f1;
                        }
                        result.f1 = sum;
                        out.collect(result);
                    }
                };

        GroupReduceOperatorBase<Tuple2<String, Integer>, Tuple2<String, Integer>, GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>> op =
                new GroupReduceOperatorBase<Tuple2<String, Integer>, Tuple2<String, Integer>, GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>>(
                        reducer,
                        new UnaryOperatorInformation<Tuple2<String, Integer>, Tuple2<String, Integer>>(
                                TypeInfoParser.<Tuple2<String, Integer>>parse("Tuple2<String, Integer>"),
                                TypeInfoParser.<Tuple2<String, Integer>>parse("Tuple2<String, Integer>")),
                        new int[] { 0 },
                        "TestReducer");

        List<Tuple2<String, Integer>> input = new ArrayList<Tuple2<String, Integer>>(asList(
                new Tuple2<String, Integer>("foo", 1),
                new Tuple2<String, Integer>("foo", 3),
                new Tuple2<String, Integer>("bar", 2),
                new Tuple2<String, Integer>("bar", 4)));

        ExecutionConfig executionConfig = new ExecutionConfig();

        // run once with object reuse disabled (mutation-safe mode) ...
        executionConfig.disableObjectReuse();
        List<Tuple2<String, Integer>> resultMutableSafe = op.executeOnCollections(input, null, executionConfig);

        // ... and once with object reuse enabled
        executionConfig.enableObjectReuse();
        List<Tuple2<String, Integer>> resultRegular = op.executeOnCollections(input, null, executionConfig);

        // both runs must produce the same per-key sums
        Set<Tuple2<String, Integer>> resultSetMutableSafe = new HashSet<Tuple2<String, Integer>>(resultMutableSafe);
        Set<Tuple2<String, Integer>> resultSetRegular = new HashSet<Tuple2<String, Integer>>(resultRegular);
        Set<Tuple2<String, Integer>> expectedResult = new HashSet<Tuple2<String, Integer>>(asList(
                new Tuple2<String, Integer>("foo", 4),
                new Tuple2<String, Integer>("bar", 6)));

        assertEquals(expectedResult, resultSetMutableSafe);
        assertEquals(expectedResult, resultSetRegular);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
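Note that the reducer above mutates its first input tuple and emits it, which is exactly the pattern the disableObjectReuse/enableObjectReuse comparison exercises: with reuse enabled, the runtime may hand back the same tuple instance on every call to input.next(). A minimal sketch of a reuse-safe alternative (the class name SumReducer is ours, not from the test):

import org.apache.flink.api.common.functions.GroupReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Sketch: accumulate into locals and emit a fresh tuple, so the
// function gives the same result whether or not objects are reused.
public class SumReducer implements GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> {

    @Override
    public void reduce(Iterable<Tuple2<String, Integer>> values, Collector<Tuple2<String, Integer>> out) {
        String key = null;
        int sum = 0;
        for (Tuple2<String, Integer> value : values) {
            key = value.f0;  // the key is constant within a group
            sum += value.f1;
        }
        out.collect(new Tuple2<String, Integer>(key, sum));
    }
}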
Use of org.apache.flink.util.Collector in project flink by apache.
The class BranchingPlansCompilerTest, method testBranchingWithMultipleDataSinks.
/**
 *
 * <pre>
 *                (SINK A)
 *                    |    (SINK B)    (SINK C)
 *                  CROSS    /          /
 *                 /     \   |  +------+
 *                /       \  | /
 *            REDUCE      MATCH2
 *               |    +---/    \
 *                \  /          |
 *                MAP           |
 *                 |            |
 *              COGROUP      MATCH1
 *             /     \       /     \
 *        (SRC A)    (SRC B)    (SRC C)
 * </pre>
 */
@Test
public void testBranchingWithMultipleDataSinks() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(DEFAULT_PARALLELISM);

        DataSet<Tuple2<Long, Long>> sourceA = env.generateSequence(1, 10000000).map(new Duplicator<Long>());
        DataSet<Tuple2<Long, Long>> sourceB = env.generateSequence(1, 10000000).map(new Duplicator<Long>());
        DataSet<Tuple2<Long, Long>> sourceC = env.generateSequence(1, 10000000).map(new Duplicator<Long>());

        DataSet<Tuple2<Long, Long>> mapped = sourceA.coGroup(sourceB).where(0).equalTo(1)
                .with(new CoGroupFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {
                    @Override
                    public void coGroup(Iterable<Tuple2<Long, Long>> first, Iterable<Tuple2<Long, Long>> second, Collector<Tuple2<Long, Long>> out) {
                        // intentionally empty: only the plan shape matters for this test
                    }
                })
                .map(new IdentityMapper<Tuple2<Long, Long>>());

        DataSet<Tuple2<Long, Long>> joined = sourceB.join(sourceC).where(0).equalTo(1)
                .with(new DummyFlatJoinFunction<Tuple2<Long, Long>>());
        DataSet<Tuple2<Long, Long>> joined2 = mapped.join(joined).where(1).equalTo(1)
                .with(new DummyFlatJoinFunction<Tuple2<Long, Long>>());
        DataSet<Tuple2<Long, Long>> reduced = mapped.groupBy(1).reduceGroup(new Top1GroupReducer<Tuple2<Long, Long>>());

        reduced.cross(joined2).output(new DiscardingOutputFormat<Tuple2<Tuple2<Long, Long>, Tuple2<Long, Long>>>());
        joined2.output(new DiscardingOutputFormat<Tuple2<Long, Long>>());
        joined2.output(new DiscardingOutputFormat<Tuple2<Long, Long>>());

        Plan plan = env.createProgramPlan();
        OptimizedPlan oPlan = compileNoStats(plan);
        new JobGraphGenerator().compileJobGraph(oPlan);
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
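The plan relies on helpers from the surrounding test utilities (Duplicator, IdentityMapper, DummyFlatJoinFunction, Top1GroupReducer) whose bodies are not shown here. Judging from its use, Duplicator presumably pairs each value with itself; a minimal version under that assumption:

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;

// Assumed behavior: turn a DataSet<T> into a DataSet<Tuple2<T, T>>
// by pairing every element with itself.
public class Duplicator<T> implements MapFunction<T, Tuple2<T, T>> {

    @Override
    public Tuple2<T, T> map(T value) {
        return new Tuple2<T, T>(value, value);
    }
}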
Use of org.apache.flink.util.Collector in project flink by apache.
The class BatchTask, method initOutputs.
/**
 * Creates a writer for each output. Creates an OutputCollector which forwards its input to all writers.
 * The output collector applies the configured shipping strategy.
 */
@SuppressWarnings("unchecked")
public static <T> Collector<T> initOutputs(
        AbstractInvokable containingTask,
        ClassLoader cl,
        TaskConfig config,
        List<ChainedDriver<?, ?>> chainedTasksTarget,
        List<RecordWriter<?>> eventualOutputs,
        ExecutionConfig executionConfig,
        Map<String, Accumulator<?, ?>> accumulatorMap) throws Exception {

    final int numOutputs = config.getNumOutputs();

    // check whether we got any chained tasks
    final int numChained = config.getNumberOfChainedStubs();
    if (numChained > 0) {
        // got chained stubs. that means that this one may only have a single forward connection
        if (numOutputs != 1 || config.getOutputShipStrategy(0) != ShipStrategyType.FORWARD) {
            throw new RuntimeException("Plan Generation Bug: Found a chained stub that is not connected via an only forward connection.");
        }

        // instantiate each task
        @SuppressWarnings("rawtypes")
        Collector previous = null;
        for (int i = numChained - 1; i >= 0; --i) {
            // get the task first
            final ChainedDriver<?, ?> ct;
            try {
                Class<? extends ChainedDriver<?, ?>> ctc = config.getChainedTask(i);
                ct = ctc.newInstance();
            } catch (Exception ex) {
                throw new RuntimeException("Could not instantiate chained task driver.", ex);
            }

            // get the configuration for the task
            final TaskConfig chainedStubConf = config.getChainedStubConfig(i);
            final String taskName = config.getChainedTaskName(i);

            if (i == numChained - 1) {
                // last in chain, instantiate the output collector for this task
                previous = getOutputCollector(containingTask, chainedStubConf, cl, eventualOutputs, 0, chainedStubConf.getNumOutputs());
            }

            ct.setup(chainedStubConf, taskName, previous, containingTask, cl, executionConfig, accumulatorMap);
            chainedTasksTarget.add(0, ct);

            if (i == numChained - 1) {
                ct.getIOMetrics().reuseOutputMetricsForTask();
            }

            previous = ct;
        }
        // the collector of the first in the chain is the collector for the task
        return (Collector<T>) previous;
    }

    // instantiate the output collector the default way from this configuration
    return getOutputCollector(containingTask, config, cl, eventualOutputs, 0, numOutputs);
}
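The fan-out behavior described in the Javadoc, one collector forwarding every record to several writers, can be sketched independently of Flink's internals. This is an illustrative stand-in only; the real OutputCollector additionally serializes records and applies the configured ship strategy per output channel:

import java.util.List;
import org.apache.flink.util.Collector;

// Illustrative sketch of a broadcasting collector: every record is
// forwarded to all downstream collectors, and close() is propagated.
public class BroadcastingCollector<T> implements Collector<T> {

    private final List<Collector<T>> targets;

    public BroadcastingCollector(List<Collector<T>> targets) {
        this.targets = targets;
    }

    @Override
    public void collect(T record) {
        for (Collector<T> target : targets) {
            target.collect(record);
        }
    }

    @Override
    public void close() {
        for (Collector<T> target : targets) {
            target.close();
        }
    }
}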
Use of org.apache.flink.util.Collector in project flink by apache.
The class GroupCombineITCase, method testCheckPartitionShuffleGroupBy.
@Test
// check that no shuffle is being executed
public void testCheckPartitionShuffleGroupBy() throws Exception {
    org.junit.Assume.assumeTrue(mode != TestExecutionMode.COLLECTION);

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);

    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);

    List<Tuple2<Long, Integer>> result = partitionedDS
            .combineGroup(new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    int count = 0;
                    long key = 0;
                    for (Tuple3<Integer, Long, String> value : values) {
                        key = value.f1;
                        count++;
                    }
                    out.collect(new Tuple2<>(key, count));
                }
            })
            .collect();

    // The full per-key counts, sorted to match the sorted result below.
    // Because the data is hash-partitioned on field 0 but grouped on
    // field 1, groups are split across partitions and the local combine
    // must NOT reproduce these counts.
    String[] localExpected = new String[] { "(1,1)", "(2,2)", "(3,3)", "(4,4)", "(5,5)", "(6,6)" };

    String[] resultAsStringArray = new String[result.size()];
    for (int i = 0; i < resultAsStringArray.length; ++i) {
        resultAsStringArray[i] = result.get(i).toString();
    }
    Arrays.sort(resultAsStringArray);

    Assert.assertEquals("The two arrays were identical.", false, Arrays.equals(localExpected, resultAsStringArray));
}
Use of org.apache.flink.util.Collector in project flink by apache.
The class GroupCombineITCase, method testCheckPartitionShuffleDOP1.
@Test
// check that a parallelism of 1 yields the same data as a full shuffle would
public void testCheckPartitionShuffleDOP1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);

    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);

    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);

    List<Tuple2<Long, Integer>> result = partitionedDS
            .combineGroup(new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    int count = 0;
                    long key = 0;
                    for (Tuple3<Integer, Long, String> value : values) {
                        key = value.f1;
                        count++;
                    }
                    out.collect(new Tuple2<>(key, count));
                }
            })
            .collect();

    // with a single partition, every group is complete, so the combine
    // produces the full per-key counts
    String expected = "6,6\n" + "5,5\n" + "4,4\n" + "3,3\n" + "2,2\n" + "1,1\n";

    compareResultAsTuples(result, expected);
}