
Example 1 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class BatchTask method initOutputs.

/**
	 * Creates a writer for each output. Creates an OutputCollector which forwards its input to all writers.
	 * The output collector applies the configured shipping strategy.
	 */
@SuppressWarnings("unchecked")
public static <T> Collector<T> initOutputs(AbstractInvokable containingTask, ClassLoader cl, TaskConfig config, List<ChainedDriver<?, ?>> chainedTasksTarget, List<RecordWriter<?>> eventualOutputs, ExecutionConfig executionConfig, Map<String, Accumulator<?, ?>> accumulatorMap) throws Exception {
    final int numOutputs = config.getNumOutputs();
    // check whether we got any chained tasks
    final int numChained = config.getNumberOfChainedStubs();
    if (numChained > 0) {
        // got chained stubs. that means that this one may only have a single forward connection
        if (numOutputs != 1 || config.getOutputShipStrategy(0) != ShipStrategyType.FORWARD) {
            throw new RuntimeException("Plan Generation Bug: Found a chained stub that is not connected via an only forward connection.");
        }
        // instantiate each task
        @SuppressWarnings("rawtypes") Collector previous = null;
        for (int i = numChained - 1; i >= 0; --i) {
            // get the task first
            final ChainedDriver<?, ?> ct;
            try {
                Class<? extends ChainedDriver<?, ?>> ctc = config.getChainedTask(i);
                ct = ctc.newInstance();
            } catch (Exception ex) {
                throw new RuntimeException("Could not instantiate chained task driver.", ex);
            }
            // get the configuration for the task
            final TaskConfig chainedStubConf = config.getChainedStubConfig(i);
            final String taskName = config.getChainedTaskName(i);
            if (i == numChained - 1) {
                // last in chain, instantiate the output collector for this task
                previous = getOutputCollector(containingTask, chainedStubConf, cl, eventualOutputs, 0, chainedStubConf.getNumOutputs());
            }
            ct.setup(chainedStubConf, taskName, previous, containingTask, cl, executionConfig, accumulatorMap);
            chainedTasksTarget.add(0, ct);
            if (i == numChained - 1) {
                ct.getIOMetrics().reuseOutputMetricsForTask();
            }
            previous = ct;
        }
        // the collector of the first in the chain is the collector for the task
        return (Collector<T>) previous;
    }
    // instantiate the output collector the default way from this configuration
    return getOutputCollector(containingTask, config, cl, eventualOutputs, 0, numOutputs);
}
Also used : OutputCollector(org.apache.flink.runtime.operators.shipping.OutputCollector) Collector(org.apache.flink.util.Collector) TaskConfig(org.apache.flink.runtime.operators.util.TaskConfig) ExceptionInChainedStubException(org.apache.flink.runtime.operators.chaining.ExceptionInChainedStubException) CancelTaskException(org.apache.flink.runtime.execution.CancelTaskException) IOException(java.io.IOException)
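
For context, the Collector contract that initOutputs ultimately returns is small: collect(T) emits a record and close() flushes and closes the chain. The sketch below shows a broadcasting collector in the spirit of what the configured OutputCollector does (forwarding each record to all writers); it is an illustration only, not Flink's actual OutputCollector, and the class name is made up.

import java.util.List;
import org.apache.flink.util.Collector;

// Illustrative only: forwards every record to all downstream collectors,
// loosely mirroring the "one writer per output" behavior described above.
public class BroadcastingCollector<T> implements Collector<T> {

    private final List<Collector<T>> downstream;

    public BroadcastingCollector(List<Collector<T>> downstream) {
        this.downstream = downstream;
    }

    @Override
    public void collect(T record) {
        for (Collector<T> collector : downstream) {
            collector.collect(record);
        }
    }

    @Override
    public void close() {
        for (Collector<T> collector : downstream) {
            collector.close();
        }
    }
}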

Example 2 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class GroupCombineITCase method testCheckPartitionShuffleGroupBy.

@Test
// check if no shuffle is being executed
public void testCheckPartitionShuffleGroupBy() throws Exception {
    org.junit.Assume.assumeTrue(mode != TestExecutionMode.COLLECTION);
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);
    List<Tuple2<Long, Integer>> result = partitionedDS.combineGroup(new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {

        @Override
        public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
            int count = 0;
            long key = 0;
            for (Tuple3<Integer, Long, String> value : values) {
                key = value.f1;
                count++;
            }
            out.collect(new Tuple2<>(key, count));
        }
    }).collect();
    String[] localExpected = new String[] { "(6,6)", "(5,5)", "(4,4)", "(3,3)", "(2,2)", "(1,1)" };
    String[] resultAsStringArray = new String[result.size()];
    for (int i = 0; i < resultAsStringArray.length; ++i) {
        resultAsStringArray[i] = result.get(i).toString();
    }
    Arrays.sort(resultAsStringArray);
    Assert.assertFalse("The two arrays were identical.", Arrays.equals(localExpected, resultAsStringArray));
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) GroupCombineFunction(org.apache.flink.api.common.functions.GroupCombineFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Collector(org.apache.flink.util.Collector) Test(org.junit.Test)
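
The combine above shows the core Collector pattern: read an Iterable of inputs, emit zero or more outputs via out.collect(...). The same pattern appears in FlatMapFunction; here is a minimal, self-contained sketch (the Tokenizer class name is illustrative, not part of the test above):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

// Splits each line into words and emits one (word, 1) pair per token.
public class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {

    @Override
    public void flatMap(String line, Collector<Tuple2<String, Integer>> out) {
        for (String token : line.toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}

Applied with text.flatMap(new Tokenizer()), this emits zero or more records per input element, which is exactly what the Collector abstraction exists for.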

Example 3 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class GroupCombineITCase method testCheckPartitionShuffleDOP1.

@Test
// check that parallelism of 1 yields the same data as a shuffle
public void testCheckPartitionShuffleDOP1() throws Exception {
    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    env.setParallelism(1);
    // data
    DataSet<Tuple3<Integer, Long, String>> ds = CollectionDataSets.get3TupleDataSet(env);
    // partition and group data
    UnsortedGrouping<Tuple3<Integer, Long, String>> partitionedDS = ds.partitionByHash(0).groupBy(1);
    List<Tuple2<Long, Integer>> result = partitionedDS.combineGroup(new GroupCombineFunction<Tuple3<Integer, Long, String>, Tuple2<Long, Integer>>() {

        @Override
        public void combine(Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) throws Exception {
            int count = 0;
            long key = 0;
            for (Tuple3<Integer, Long, String> value : values) {
                key = value.f1;
                count++;
            }
            out.collect(new Tuple2<>(key, count));
        }
    }).collect();
    String expected = "6,6\n" + "5,5\n" + "4,4\n" + "3,3\n" + "2,2\n" + "1,1\n";
    compareResultAsTuples(result, expected);
}
Also used : ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) GroupCombineFunction(org.apache.flink.api.common.functions.GroupCombineFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Collector(org.apache.flink.util.Collector) Test(org.junit.Test)
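
Because GroupCombineFunction is a single-method interface, the combine above can also be written as a lambda; Flink then needs an explicit type hint via returns(...), since Java erases the generic output type from lambdas. A sketch, assuming a Flink version that ships org.apache.flink.api.common.typeinfo.Types:

// Lambda form of the combine above; returns(...) restores the erased output type.
List<Tuple2<Long, Integer>> result = partitionedDS
    .combineGroup(
        (Iterable<Tuple3<Integer, Long, String>> values, Collector<Tuple2<Long, Integer>> out) -> {
            int count = 0;
            long key = 0;
            for (Tuple3<Integer, Long, String> value : values) {
                key = value.f1;
                count++;
            }
            out.collect(new Tuple2<>(key, count));
        })
    .returns(Types.TUPLE(Types.LONG, Types.INT))
    .collect();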

Example 4 with Collector

use of org.apache.flink.util.Collector in project beam by apache.

the class FlinkDoFnFunction method mapPartition.

@Override
public void mapPartition(Iterable<WindowedValue<InputT>> values, Collector<WindowedValue<OutputT>> out) throws Exception {
    RuntimeContext runtimeContext = getRuntimeContext();
    DoFnRunners.OutputManager outputManager;
    if (outputMap.size() == 1) {
        outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
    } else {
        // it has some additional outputs
        outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
    }
    List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
    DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(serializedOptions.getPipelineOptions(), doFn, new FlinkSideInputReader(sideInputs, runtimeContext), outputManager, mainOutputTag, additionalOutputTags, new FlinkNoOpStepContext(), windowingStrategy);
    if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
    }
    doFnRunner.startBundle();
    for (WindowedValue<InputT> value : values) {
        doFnRunner.processElement(value);
    }
    doFnRunner.finishBundle();
}
Also used : DoFnRunners(org.apache.beam.runners.core.DoFnRunners) TupleTag(org.apache.beam.sdk.values.TupleTag) Collector(org.apache.flink.util.Collector) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext)
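
The branch on outputMap.size() decides how DoFn outputs reach the Flink Collector. For the single-output case the OutputManager is essentially a forwarder; the sketch below illustrates the idea and is not Beam's actual DoFnOutputManager (the class name is made up; it assumes the Beam-2.x runners-core OutputManager interface):

import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.flink.util.Collector;

// Illustrative single-output manager: every tagged output goes to the one
// wrapped Flink collector.
class ForwardingOutputManager<OutputT> implements DoFnRunners.OutputManager {

    private final Collector<WindowedValue<OutputT>> collector;

    ForwardingOutputManager(Collector<WindowedValue<OutputT>> collector) {
        this.collector = collector;
    }

    @Override
    @SuppressWarnings("unchecked")
    public <T> void output(TupleTag<T> tag, WindowedValue<T> output) {
        // single-output case: the tag is ignored, there is only one destination
        collector.collect((WindowedValue<OutputT>) output);
    }
}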

Example 5 with Collector

use of org.apache.flink.util.Collector in project beam by apache.

the class FlinkStatefulDoFnFunction method reduce.

@Override
public void reduce(Iterable<WindowedValue<KV<K, V>>> values, Collector<WindowedValue<OutputT>> out) throws Exception {
    RuntimeContext runtimeContext = getRuntimeContext();
    DoFnRunners.OutputManager outputManager;
    if (outputMap.size() == 1) {
        outputManager = new FlinkDoFnFunction.DoFnOutputManager(out);
    } else {
        // it has some additional outputs
        outputManager = new FlinkDoFnFunction.MultiDoFnOutputManager((Collector) out, outputMap);
    }
    final Iterator<WindowedValue<KV<K, V>>> iterator = values.iterator();
    // get the first value, we need this for initializing the state internals with the key.
    // we are guaranteed to have a first value, otherwise reduce() would not have been called.
    WindowedValue<KV<K, V>> currentValue = iterator.next();
    final K key = currentValue.getValue().getKey();
    final InMemoryStateInternals<K> stateInternals = InMemoryStateInternals.forKey(key);
    // Used with Batch, we know that all the data is available for this key. We can't use the
    // timer manager from the context because it doesn't exist. So we create one and advance
    // time to the end after processing all elements.
    final InMemoryTimerInternals timerInternals = new InMemoryTimerInternals();
    timerInternals.advanceProcessingTime(Instant.now());
    timerInternals.advanceSynchronizedProcessingTime(Instant.now());
    List<TupleTag<?>> additionalOutputTags = Lists.newArrayList(outputMap.keySet());
    DoFnRunner<KV<K, V>, OutputT> doFnRunner = DoFnRunners.simpleRunner(serializedOptions.getPipelineOptions(), dofn, new FlinkSideInputReader(sideInputs, runtimeContext), outputManager, mainOutputTag, additionalOutputTags, new FlinkNoOpStepContext() {

        @Override
        public StateInternals stateInternals() {
            return stateInternals;
        }

        @Override
        public TimerInternals timerInternals() {
            return timerInternals;
        }
    }, windowingStrategy);
    if ((serializedOptions.getPipelineOptions().as(FlinkPipelineOptions.class)).getEnableMetrics()) {
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, getRuntimeContext());
    }
    doFnRunner.startBundle();
    doFnRunner.processElement(currentValue);
    while (iterator.hasNext()) {
        currentValue = iterator.next();
        doFnRunner.processElement(currentValue);
    }
    // Finish any pending windows by advancing the input watermark to infinity.
    timerInternals.advanceInputWatermark(BoundedWindow.TIMESTAMP_MAX_VALUE);
    // Finally, advance the processing time to infinity to fire any timers.
    timerInternals.advanceProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
    timerInternals.advanceSynchronizedProcessingTime(BoundedWindow.TIMESTAMP_MAX_VALUE);
    fireEligibleTimers(timerInternals, doFnRunner);
    doFnRunner.finishBundle();
}
Also used : DoFnRunners(org.apache.beam.runners.core.DoFnRunners) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) Collector(org.apache.flink.util.Collector) InMemoryTimerInternals(org.apache.beam.runners.core.InMemoryTimerInternals) TimerInternals(org.apache.beam.runners.core.TimerInternals) InMemoryStateInternals(org.apache.beam.runners.core.InMemoryStateInternals) StateInternals(org.apache.beam.runners.core.StateInternals) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext)
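
After the watermark and processing time are advanced to the end of time, fireEligibleTimers drains the timers that became eligible and replays them through the runner. A simplified sketch of what that plausibly looks like, assuming the Beam-2.0-era InMemoryTimerInternals drain methods and DoFnRunner#onTimer signature (only the event-time queue is shown; the package locations of these types vary across Beam versions):

// Simplified: drain eligible event-time timers and hand them to the runner.
private void fireEligibleTimers(InMemoryTimerInternals timerInternals,
                                DoFnRunner<KV<K, V>, OutputT> runner) {
    TimerInternals.TimerData timer;
    while ((timer = timerInternals.removeNextEventTimer()) != null) {
        // the timer's window is encoded in its state namespace
        BoundedWindow window =
            ((StateNamespaces.WindowNamespace) timer.getNamespace()).getWindow();
        runner.onTimer(timer.getTimerId(), window, timer.getTimestamp(), TimeDomain.EVENT_TIME);
    }
}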

Aggregations

Collector (org.apache.flink.util.Collector) 80
Test (org.junit.Test) 60
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) 33
Tuple2 (org.apache.flink.api.java.tuple.Tuple2) 32
Configuration (org.apache.flink.configuration.Configuration) 27
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig) 19
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) 18
ArrayList (java.util.ArrayList) 16
DataStream (org.apache.flink.streaming.api.datastream.DataStream) 16
TimeWindow (org.apache.flink.streaming.api.windowing.windows.TimeWindow) 16
HashMap (java.util.HashMap) 14
List (java.util.List) 14
RuntimeContext (org.apache.flink.api.common.functions.RuntimeContext) 14
Tuple3 (org.apache.flink.api.java.tuple.Tuple3) 12
IOException (java.io.IOException) 11
Arrays (java.util.Arrays) 11
Map (java.util.Map) 11
FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction) 11
Assert.assertTrue (org.junit.Assert.assertTrue) 11
InternalWindowFunction (org.apache.flink.streaming.runtime.operators.windowing.functions.InternalWindowFunction) 10