Use of org.apache.flink.util.Collector in project flink by apache.
Class BatchTask, method initOutputs.
/**
 * Creates a writer for each output. Creates an OutputCollector which forwards its input to all writers.
 * The output collector applies the configured shipping strategy.
 */
@SuppressWarnings("unchecked")
public static <T> Collector<T> initOutputs(
        AbstractInvokable containingTask,
        ClassLoader cl,
        TaskConfig config,
        List<ChainedDriver<?, ?>> chainedTasksTarget,
        List<RecordWriter<?>> eventualOutputs,
        ExecutionConfig executionConfig,
        Map<String, Accumulator<?, ?>> accumulatorMap) throws Exception {
    final int numOutputs = config.getNumOutputs();
    // check whether we got any chained tasks
    final int numChained = config.getNumberOfChainedStubs();
    if (numChained > 0) {
        // got chained stubs. that means that this one may only have a single forward connection
        if (numOutputs != 1 || config.getOutputShipStrategy(0) != ShipStrategyType.FORWARD) {
            throw new RuntimeException(
                    "Plan Generation Bug: Found a chained stub that is not connected via an only forward connection.");
        }
        // instantiate each task
        @SuppressWarnings("rawtypes")
        Collector previous = null;
        for (int i = numChained - 1; i >= 0; --i) {
            // get the task first
            final ChainedDriver<?, ?> ct;
            try {
                Class<? extends ChainedDriver<?, ?>> ctc = config.getChainedTask(i);
                ct = ctc.newInstance();
            } catch (Exception ex) {
                throw new RuntimeException("Could not instantiate chained task driver.", ex);
            }
            // get the configuration for the task
            final TaskConfig chainedStubConf = config.getChainedStubConfig(i);
            final String taskName = config.getChainedTaskName(i);
            if (i == numChained - 1) {
                // last in chain, instantiate the output collector for this task
                previous = getOutputCollector(containingTask, chainedStubConf, cl, eventualOutputs, 0, chainedStubConf.getNumOutputs());
            }
            ct.setup(chainedStubConf, taskName, previous, containingTask, cl, executionConfig, accumulatorMap);
            chainedTasksTarget.add(0, ct);
            if (i == numChained - 1) {
                ct.getIOMetrics().reuseOutputMetricsForTask();
            }
            previous = ct;
        }
        // the collector of the first in the chain is the collector for the task
        return (Collector<T>) previous;
    }
    // instantiate the output collector the default way from this configuration
    return getOutputCollector(containingTask, config, cl, eventualOutputs, 0, numOutputs);
}
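The Collector returned by initOutputs is what the task (or the first chained driver) writes its records into. As a reference for the contract involved, here is a minimal, hypothetical sketch of an org.apache.flink.util.Collector implementation (not part of the Flink sources; the class name is made up): downstream code calls collect() once per record and close() once when the producer is done.

import org.apache.flink.util.Collector;

// Hypothetical example: a Collector that only counts the records handed to it.
public class CountingCollector<T> implements Collector<T> {

    private long count;

    @Override
    public void collect(T record) {
        // a real output collector would forward the record to its writers or the next chained driver
        count++;
    }

    @Override
    public void close() {
        // called exactly once, after the last record has been emitted
        System.out.println("collected " + count + " records");
    }

    public long getCount() {
        return count;
    }
}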
Use of org.apache.flink.util.Collector in project flink by apache.
Class GroupCombineITCase, method testCheckPartitionShuffleGroupBy.
// check if no shuffle is being executed
@Test
public void testCheckPartitionShuffleGroupBy() throws Exception {
    org.junit.Assume.assumeTrue(mode != TestExecutionMode.COLLECTION);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);
    List<Tuple2<Long, Integer>> result = partitionedDS.combineGroup(
            new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    int count = 0;
                    long key = 0;
                    for (Tuple3<Integer, Long, String> value : values) {
                        key = value.f1;
                        count++;
                    }
                    out.collect(new Tuple2<>(key, count));
                }
            }).collect();
    String[] localExpected = new String[] { "(6,6)", "(5,5)", "(4,4)", "(3,3)", "(2,2)", "(1,1)" };
    String[] resultAsStringArray = new String[result.size()];
    for (int i = 0; i < resultAsStringArray.length; ++i) {
        resultAsStringArray[i] = result.get(i).toString();
    }
    Arrays.sort(resultAsStringArray);
    Assert.assertEquals("The two arrays were identical.", false, Arrays.equals(localExpected, resultAsStringArray));
}
Use of org.apache.flink.util.Collector in project flink by apache.
Class GroupCombineITCase, method testCheckPartitionShuffleDOP1.
// check if parallelism of 1 results in the same data as a shuffle
@Test
public void testCheckPartitionShuffleDOP1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);
    List<Tuple2<Long, Integer>> result = partitionedDS.combineGroup(
            new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {
                @Override
                public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
                    int count = 0;
                    long key = 0;
                    for (Tuple3<Integer, Long, String> value : values) {
                        key = value.f1;
                        count++;
                    }
                    out.collect(new Tuple2<>(key, count));
                }
            }).collect();
    String expected = "6,6\n" + "5,5\n" + "4,4\n" + "3,3\n" + "2,2\n" + "1,1\n";
    compareResultAsTuples(result, expected);
}
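Both tests use the same anonymous combiner. For readability it could be extracted into a named GroupCombineFunction; a minimal sketch (the class name CountPerKeyCombiner is hypothetical, the logic is the one from the tests above):

import org.apache.flink.api.common.functions.GroupCombineFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.util.Collector;

public class CountPerKeyCombiner
        implements GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>> {

    @Override
    public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) {
        int count = 0;
        long key = 0;
        for (Tuple3<Integer, Long, String> value : values) {
            key = value.f1;
            count++;
        }
        // emit one (key, count) record per (possibly partial) group
        out.collect(new Tuple2<>(key, count));
    }
}

With such a class, the calls above would read partitionedDS.combineGroup(new CountPerKeyCombiner()).collect().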
Use of org.apache.flink.util.Collector in project beam by apache.
Class FlinkDoFnFunction, method mapPartition.
@Override
public void mapPartition(Iterable<WindowedValue<InputT>> values, Collector<WindowedValue<OutputT>> out) throws Exception {
    RuntimeContext runtimeContext = getRuntimeContext();
    DoFnRunners.OutputManager outputManager;
    if (outputMap.size() == 1) {
        outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
    } else {
        // it has some additional outputs
        outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
    }
    List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
    DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(
            serializedOptions.getPipelineOptions(),
            doFn,
            new FlinkSideInputReader(sideInputs, runtimeContext),
            outputManager,
            mainOutputTag,
            additionalOutputTags,
            new FlinkNoOpStepContext(),
            windowingStrategy);
    if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
    }
    doFnRunner.startBundle();
    for (WindowedValue<InputT> value : values) {
        doFnRunner.processElement(value);
    }
    doFnRunner.finishBundle();
}
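In the single-output branch, the wrapped Flink Collector is all the output manager needs. A minimal sketch of that bridge (assumed class and package names for the Beam version this snippet targets; the real FlinkDoFnFunction.DoFnOutputManager may differ in details):

import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.flink.util.Collector;

// Hypothetical single-output manager that forwards every element to a Flink Collector.
class CollectorOutputManager<OutputT> implements DoFnRunners.OutputManager {

    private final Collector<WindowedValue<OutputT>> collector;

    CollectorOutputManager(Collector<WindowedValue<OutputT>> collector) {
        this.collector = collector;
    }

    @Override
    @SuppressWarnings("unchecked")
    public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
        // only one output tag exists, so the tag can be ignored and the
        // element handed straight to the wrapped Flink collector
        collector.collect((WindowedValue<OutputT>) value);
    }
}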
Use of org.apache.flink.util.Collector in project beam by apache.
Class FlinkStatefulDoFnFunction, method reduce.
@Override
public void reduce(Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<OutputT>> out) throws Exception {
    RuntimeContext runtimeContext = getRuntimeContext();
    DoFnRunners.OutputManager outputManager;
    if (outputMap.size() == 1) {
        outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
    } else {
        // it has some additional outputs
        outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
    }
    final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
    // get the first value, we need this for initializing the state internals with the key.
    // we are guaranteed to have a first value, otherwise reduce() would not have been called.
    WindowedValue<KV<K, V>> currentValue = iterator.next();
    final K key = currentValue.getValue().getKey();
    final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
    // Used with Batch, we know that all the data is available for this key. We can't use the
    // timer manager from the context because it doesn't exist. So we create one and advance
    // time to the end after processing all elements.
    final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
    timerInternals.advanceProcessingTime(Instant.now());
    timerInternals.advanceSynchronizedProcessingTime(Instant.now());
    List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
    DoFnRunner<KV<K, V>, OutputT> doFnRunner = DoFnRunners.simpleRunner(
            serializedOptions.getPipelineOptions(),
            dofn,
            new FlinkSideInputReader(sideInputs, runtimeContext),
            outputManager,
            mainOutputTag,
            additionalOutputTags,
            new FlinkNoOpStepContext() {
                @Override
                public StateInternals stateInternals() {
                    return stateInternals;
                }

                @Override
                public TimerInternals timerInternals() {
                    return timerInternals;
                }
            },
            windowingStrategy);
    if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
    }
    doFnRunner.startBundle();
    doFnRunner.processElement(currentValue);
    while (iterator.hasNext()) {
        currentValue = iterator.next();
        doFnRunner.processElement(currentValue);
    }
    // Finish any pending windows by advancing the input watermark to infinity.
    timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
    // Finally, advance the processing time to infinity to fire any timers.
    timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
    timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
    fireEligibleTimers(timerInternals, doFnRunner);
    doFnRunner.finishBundle();
}