Example 36 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class TransitiveClosureNaive method main.

public static void main(String... args) throws Exception {
    // Checking input parameters
    final ParameterTool params = ParameterTool.fromArgs(args);
    // set up execution environment
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    // make parameters available in the web interface
    env.getConfig().setGlobalJobParameters(params);
    final int maxIterations = params.getInt("iterations", 10);
    DataSet<Tuple2<Long, Long>> edges;
    if (params.has("edges")) {
        edges = env.readCsvFile(params.get("edges")).fieldDelimiter(" ").types(Long.class, Long.class);
    } else {
        System.out.println("Executing TransitiveClosureNaive example with default edges data set.");
        System.out.println("Use --edges to specify file input.");
        edges = ConnectedComponentsData.getDefaultEdgeDataSet(env);
    }
    IterativeDataSet<Tuple2<Long, Long>> paths = edges.iterate(maxIterations);
    DataSet<Tuple2<Long, Long>> nextPaths = paths.join(edges).where(1).equalTo(0).with(new JoinFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {

        @Override
        public Tuple2<Long, Long> join(Tuple2<Long, Long> left, Tuple2<Long, Long> right) throws Exception {
            // left:  Path (z,x) - x is reachable from z
            // right: Edge (x,y) - the edge x-->y exists
            // out:   Path (z,y) - y is reachable from z
            return new Tuple2<Long, Long>(left.f0, right.f1);
        }
    }).withForwardedFieldsFirst("0").withForwardedFieldsSecond("1").union(paths).groupBy(0, 1).reduceGroup(new GroupReduceFunction<Tuple2<Long, Long>, Tuple2<Long, Long>>() {

        @Override
        public void reduce(Iterable<Tuple2<Long, Long>> values, Collector<Tuple2<Long, Long>> out) throws Exception {
            out.collect(values.iterator().next());
        }
    }).withForwardedFields("0;1");
    DataSet<Tuple2<Long, Long>> newPaths = paths.coGroup(nextPaths).where(0).equalTo(0).with(new CoGroupFunction<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple2<Long, Long>>() {

        @Override
        public void coGroup(Iterable<Tuple2<Long, Long>> prevPaths, Iterable<Tuple2<Long, Long>> nextPaths, Collector<Tuple2<Long, Long>> out) throws Exception {
            // build the lookup set per call, so no state leaks across groups
            Set<Tuple2<Long, Long>> prevSet = new HashSet<Tuple2<Long, Long>>();
            for (Tuple2<Long, Long> prev : prevPaths) {
                prevSet.add(prev);
            }
            for (Tuple2<Long, Long> next : nextPaths) {
                if (!prevSet.contains(next)) {
                    out.collect(next);
                }
            }
        }
    }).withForwardedFieldsFirst("0").withForwardedFieldsSecond("0");
    DataSet<Tuple2<Long, Long>> transitiveClosure = paths.closeWith(nextPaths, newPaths);
    // emit result
    if (params.has("output")) {
        transitiveClosure.writeAsCsv(params.get("output"), "\n", " ");
        // execute program explicitly, because file sinks are lazy
        env.execute("Transitive Closure Example");
    } else {
        System.out.println("Printing result to stdout. Use --output to specify output path.");
        transitiveClosure.print();
    }
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Collector(org.apache.flink.util.Collector) HashSet(java.util.HashSet)
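
A quick aside on the pattern above: closeWith(nextPaths, newPaths) feeds nextPaths back into the loop and terminates early once newPaths (the paths first discovered in the current round) becomes empty. Below is a minimal sketch of the same iterate/closeWith pattern on a toy DataSet; the class and variable names are illustrative, not part of the example.

import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.IterativeDataSet;

public class CloseWithSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        // bulk iteration with an upper bound of 10 rounds
        IterativeDataSet<Long> loop = env.fromElements(0L).iterate(10);
        DataSet<Long> next = loop.map(new MapFunction<Long, Long>() {

            @Override
            public Long map(Long value) {
                return value + 1;
            }
        });
        // termination criterion: the loop continues only while this DataSet is
        // non-empty, so it stops after 5 rounds even though 10 were allowed
        DataSet<Long> stillBelowFive = next.filter(new FilterFunction<Long>() {

            @Override
            public boolean filter(Long value) {
                return value < 5;
            }
        });
        DataSet<Long> result = loop.closeWith(next, stillBelowFive);
        result.print();
    }
}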

Example 37 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class SocketWindowWordCount method main.

public static void main(String[] args) throws Exception {
    // the host and the port to connect to
    final String hostname;
    final int port;
    try {
        final ParameterTool params = ParameterTool.fromArgs(args);
        hostname = params.has("hostname") ? params.get("hostname") : "localhost";
        port = params.getInt("port");
    } catch (Exception e) {
        System.err.println("No port specified. Please run 'SocketWindowWordCount " + "--hostname <hostname> --port <port>', where hostname (localhost by default) " + "and port is the address of the text server");
        System.err.println("To start a simple text server, run 'netcat -l <port>' and " + "type the input text into the command line");
        return;
    }
    // get the execution environment
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // get input data by connecting to the socket
    DataStream<String> text = env.socketTextStream(hostname, port, "\n");
    // parse the data, group it, window it, and aggregate the counts
    DataStream<WordWithCount> windowCounts = text.flatMap(new FlatMapFunction<String, WordWithCount>() {

        @Override
        public void flatMap(String value, Collector<WordWithCount> out) {
            for (String word : value.split("\\s")) {
                out.collect(new WordWithCount(word, 1L));
            }
        }
    }).keyBy("word").timeWindow(Time.seconds(5)).reduce(new ReduceFunction<WordWithCount>() {

        @Override
        public WordWithCount reduce(WordWithCount a, WordWithCount b) {
            return new WordWithCount(a.word, a.count + b.count);
        }
    });
    // print the results with a single thread, rather than in parallel
    windowCounts.print().setParallelism(1);
    env.execute("Socket Window WordCount");
}
Also used : ParameterTool(org.apache.flink.api.java.utils.ParameterTool) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) Collector(org.apache.flink.util.Collector) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)
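
One gap in the excerpt above: the WordWithCount type is referenced but never shown. For keyBy("word") and Flink's POJO serializer to work, it must be a public class with public fields and a no-argument constructor. A sketch of a definition consistent with that requirement, assumed to mirror the nested class omitted from the excerpt:

// assumption: this mirrors the WordWithCount nested class not shown above
public static class WordWithCount {

    public String word;
    public long count;

    // Flink's POJO rules require a public no-argument constructor
    public WordWithCount() {
    }

    public WordWithCount(String word, long count) {
        this.word = word;
        this.count = count;
    }

    @Override
    public String toString() {
        return word + " : " + count;
    }
}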

Example 38 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class DataSetUtils method summarize.

// --------------------------------------------------------------------------------------------
//  Summarize
// --------------------------------------------------------------------------------------------
/**
	 * Summarize a DataSet of Tuples by collecting single-pass statistics for all columns.
	 *
	 * Example usage:
	 * <pre>
	 * {@code
	 * DataSet<Tuple3<Double, String, Boolean>> input = // [...]
	 * Tuple3<NumericColumnSummary, StringColumnSummary, BooleanColumnSummary> summary = DataSetUtils.summarize(input);
	 *
	 * summary.f0.getStandardDeviation();
	 * summary.f1.getMaxLength();
	 * }
	 * </pre>
	 * @return the summary as a Tuple the same width as the input rows
	 */
public static <R extends Tuple, T extends Tuple> R summarize(DataSet<T> input) throws Exception {
    if (!input.getType().isTupleType()) {
        throw new IllegalArgumentException("summarize() is only implemented for DataSets of Tuples");
    }
    final TupleTypeInfoBase<?> inType = (TupleTypeInfoBase<?>) input.getType();
    DataSet<TupleSummaryAggregator<R>> result = input.mapPartition(new MapPartitionFunction<T, TupleSummaryAggregator<R>>() {

        @Override
        public void mapPartition(Iterable<T> values, Collector<TupleSummaryAggregator<R>> out) throws Exception {
            TupleSummaryAggregator<R> aggregator = SummaryAggregatorFactory.create(inType);
            for (Tuple value : values) {
                aggregator.aggregate(value);
            }
            out.collect(aggregator);
        }
    }).reduce(new ReduceFunction<TupleSummaryAggregator<R>>() {

        @Override
        public TupleSummaryAggregator<R> reduce(TupleSummaryAggregator<R> agg1, TupleSummaryAggregator<R> agg2) throws Exception {
            agg1.combine(agg2);
            return agg1;
        }
    });
    return result.collect().get(0).result();
}
Also used : RichMapPartitionFunction(org.apache.flink.api.common.functions.RichMapPartitionFunction) MapPartitionFunction(org.apache.flink.api.common.functions.MapPartitionFunction) TupleTypeInfoBase(org.apache.flink.api.java.typeutils.TupleTypeInfoBase) Collector(org.apache.flink.util.Collector) Tuple(org.apache.flink.api.java.tuple.Tuple) TupleSummaryAggregator(org.apache.flink.api.java.summarize.aggregation.TupleSummaryAggregator)
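
The Javadoc above already sketches the call pattern; here it is in runnable form, hedged: the input values are made up, and the summary types are assumed to be the ones in org.apache.flink.api.java.summarize, using only the two getters the Javadoc names.

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.summarize.BooleanColumnSummary;
import org.apache.flink.api.java.summarize.NumericColumnSummary;
import org.apache.flink.api.java.summarize.StringColumnSummary;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.DataSetUtils;

public class SummarizeSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple3<Double, String, Boolean>> input = env.fromElements(
                new Tuple3<>(1.0, "a", true),
                new Tuple3<>(2.0, "bb", false),
                new Tuple3<>(4.0, "ccc", true));
        // one column summary per field, same tuple width as the input rows
        Tuple3<NumericColumnSummary<Double>, StringColumnSummary, BooleanColumnSummary> summary =
                DataSetUtils.summarize(input);
        System.out.println(summary.f0.getStandardDeviation());
        System.out.println(summary.f1.getMaxLength());
    }
}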

Example 39 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class GroupReduceOperatorTest method testGroupReduceCollectionWithRuntimeContext.

@Test
public void testGroupReduceCollectionWithRuntimeContext() {
    try {
        final String taskName = "Test Task";
        final AtomicBoolean opened = new AtomicBoolean();
        final AtomicBoolean closed = new AtomicBoolean();
        final RichGroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> reducer = new RichGroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {

            @Override
            public void reduce(Iterable<Tuple2<String, Integer>> values, Collector<Tuple2<String, Integer>> out) throws Exception {
                Iterator<Tuple2<String, Integer>> input = values.iterator();
                Tuple2<String, Integer> result = input.next();
                int sum = result.f1;
                while (input.hasNext()) {
                    Tuple2<String, Integer> next = input.next();
                    sum += next.f1;
                }
                result.f1 = sum;
                out.collect(result);
            }

            @Override
            public void open(Configuration parameters) throws Exception {
                opened.set(true);
                RuntimeContext ctx = getRuntimeContext();
                assertEquals(0, ctx.getIndexOfThisSubtask());
                assertEquals(1, ctx.getNumberOfParallelSubtasks());
                assertEquals(taskName, ctx.getTaskName());
            }

            @Override
            public void close() throws Exception {
                closed.set(true);
            }
        };
        GroupReduceOperatorBase<Tuple2<String, Integer>, Tuple2<String, Integer>, GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>> op = new GroupReduceOperatorBase<Tuple2<String, Integer>, Tuple2<String, Integer>, GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>>(reducer, new UnaryOperatorInformation<Tuple2<String, Integer>, Tuple2<String, Integer>>(TypeInfoParser.<Tuple2<String, Integer>>parse("Tuple2<String, Integer>"), TypeInfoParser.<Tuple2<String, Integer>>parse("Tuple2<String, Integer>")), new int[] { 0 }, "TestReducer");
        List<Tuple2<String, Integer>> input = new ArrayList<Tuple2<String, Integer>>(asList(new Tuple2<String, Integer>("foo", 1), new Tuple2<String, Integer>("foo", 3), new Tuple2<String, Integer>("bar", 2), new Tuple2<String, Integer>("bar", 4)));
        final TaskInfo taskInfo = new TaskInfo(taskName, 1, 0, 1, 0);
        ExecutionConfig executionConfig = new ExecutionConfig();
        executionConfig.disableObjectReuse();
        List<Tuple2<String, Integer>> resultMutableSafe = op.executeOnCollections(input, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);
        executionConfig.enableObjectReuse();
        List<Tuple2<String, Integer>> resultRegular = op.executeOnCollections(input, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);
        Set<Tuple2<String, Integer>> resultSetMutableSafe = new HashSet<Tuple2<String, Integer>>(resultMutableSafe);
        Set<Tuple2<String, Integer>> resultSetRegular = new HashSet<Tuple2<String, Integer>>(resultRegular);
        Set<Tuple2<String, Integer>> expectedResult = new HashSet<Tuple2<String, Integer>>(asList(new Tuple2<String, Integer>("foo", 4), new Tuple2<String, Integer>("bar", 6)));
        assertEquals(expectedResult, resultSetMutableSafe);
        assertEquals(expectedResult, resultSetRegular);
        assertTrue(opened.get());
        assertTrue(closed.get());
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : RichGroupReduceFunction(org.apache.flink.api.common.functions.RichGroupReduceFunction) UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) Configuration(org.apache.flink.configuration.Configuration) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) TaskInfo(org.apache.flink.api.common.TaskInfo) Collector(org.apache.flink.util.Collector) RuntimeUDFContext(org.apache.flink.api.common.functions.util.RuntimeUDFContext) HashSet(java.util.HashSet) Path(org.apache.flink.core.fs.Path) RichGroupReduceFunction(org.apache.flink.api.common.functions.RichGroupReduceFunction) GroupReduceFunction(org.apache.flink.api.common.functions.GroupReduceFunction) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) Tuple2(org.apache.flink.api.java.tuple.Tuple2) RuntimeContext(org.apache.flink.api.common.functions.RuntimeContext) Test(org.junit.Test)
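
Worth noting about the reducer above: result.f1 = sum mutates the first input record, which is exactly the behavior the two executeOnCollections runs (object reuse disabled, then enabled) exercise. A reuse-safe sketch of the same aggregation that emits a fresh tuple instead of mutating an input; it relies on the same imports listed above, and the variable name is illustrative.

GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>> safeReducer =
        new GroupReduceFunction<Tuple2<String, Integer>, Tuple2<String, Integer>>() {

    @Override
    public void reduce(Iterable<Tuple2<String, Integer>> values, Collector<Tuple2<String, Integer>> out) {
        String key = null;
        int sum = 0;
        // never mutate the incoming records; accumulate into locals instead
        for (Tuple2<String, Integer> value : values) {
            key = value.f0;
            sum += value.f1;
        }
        out.collect(new Tuple2<String, Integer>(key, sum));
    }
};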

Example 40 with Collector

use of org.apache.flink.util.Collector in project flink by apache.

the class InnerJoinOperatorBaseTest method testTupleBaseJoiner.

@Test
public void testTupleBaseJoiner() {
    final FlatJoinFunction<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>> joiner = new FlatJoinFunction<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>>() {

        @Override
        public void join(Tuple3<String, Double, Integer> first, Tuple2<Integer, String> second, Collector<Tuple2<Double, String>> out) {
            assertEquals(first.f0, second.f1);
            assertEquals(first.f2, second.f0);
            out.collect(new Tuple2<>(first.f1, second.f0.toString()));
        }
    };
    final TupleTypeInfo<Tuple3<String, Double, Integer>> leftTypeInfo = TupleTypeInfo.getBasicTupleTypeInfo(String.class, Double.class, Integer.class);
    final TupleTypeInfo<Tuple2<Integer, String>> rightTypeInfo = TupleTypeInfo.getBasicTupleTypeInfo(Integer.class, String.class);
    final TupleTypeInfo<Tuple2<Double, String>> outTypeInfo = TupleTypeInfo.getBasicTupleTypeInfo(Double.class, String.class);
    final int[] leftKeys = new int[] { 0, 2 };
    final int[] rightKeys = new int[] { 1, 0 };
    final String taskName = "Collection based tuple joiner";
    final BinaryOperatorInformation<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>> binaryOpInfo = new BinaryOperatorInformation<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>>(leftTypeInfo, rightTypeInfo, outTypeInfo);
    final InnerJoinOperatorBase<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>, FlatJoinFunction<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>>> base = new InnerJoinOperatorBase<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>, FlatJoinFunction<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>>>(joiner, binaryOpInfo, leftKeys, rightKeys, taskName);
    final List<Tuple3<String, Double, Integer>> inputData1 = new ArrayList<Tuple3<String, Double, Integer>>(Arrays.asList(new Tuple3<>("foo", 42.0, 1), new Tuple3<>("bar", 1.0, 2), new Tuple3<>("bar", 2.0, 3), new Tuple3<>("foobar", 3.0, 4), new Tuple3<>("bar", 3.0, 3)));
    final List<Tuple2<Integer, String>> inputData2 = new ArrayList<Tuple2<Integer, String>>(Arrays.asList(new Tuple2<>(3, "bar"), new Tuple2<>(4, "foobar"), new Tuple2<>(2, "foo")));
    final Set<Tuple2<Double, String>> expected = new HashSet<Tuple2<Double, String>>(Arrays.asList(new Tuple2<>(2.0, "3"), new Tuple2<>(3.0, "3"), new Tuple2<>(3.0, "4")));
    try {
        final TaskInfo taskInfo = new TaskInfo("op", 1, 0, 1, 0);
        ExecutionConfig executionConfig = new ExecutionConfig();
        executionConfig.disableObjectReuse();
        List<Tuple2<Double, String>> resultSafe = base.executeOnCollections(inputData1, inputData2, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);
        executionConfig.enableObjectReuse();
        List<Tuple2<Double, String>> resultRegular = base.executeOnCollections(inputData1, inputData2, new RuntimeUDFContext(taskInfo, null, executionConfig, new HashMap<String, Future<Path>>(), new HashMap<String, Accumulator<?, ?>>(), new UnregisteredMetricsGroup()), executionConfig);
        assertEquals(expected, new HashSet<>(resultSafe));
        assertEquals(expected, new HashSet<>(resultRegular));
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) FlatJoinFunction(org.apache.flink.api.common.functions.FlatJoinFunction) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) TaskInfo(org.apache.flink.api.common.TaskInfo) Collector(org.apache.flink.util.Collector) RuntimeUDFContext(org.apache.flink.api.common.functions.util.RuntimeUDFContext) BinaryOperatorInformation(org.apache.flink.api.common.operators.BinaryOperatorInformation) HashSet(java.util.HashSet) Path(org.apache.flink.core.fs.Path) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Test(org.junit.Test)
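
For contrast with the operator-base construction the test uses, here is the same composite-key inner join written against the fluent DataSet API, the form user programs normally take. A sketch reusing a subset of the test's input data; the class name is illustrative.

import org.apache.flink.api.common.functions.FlatJoinFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.util.Collector;

public class FluentJoinSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Tuple3<String, Double, Integer>> left = env.fromElements(
                new Tuple3<>("bar", 2.0, 3), new Tuple3<>("foobar", 3.0, 4));
        DataSet<Tuple2<Integer, String>> right = env.fromElements(
                new Tuple2<>(3, "bar"), new Tuple2<>(4, "foobar"));
        DataSet<Tuple2<Double, String>> joined = left.join(right)
                // composite key: (f0, f2) on the left matches (f1, f0) on the right
                .where(0, 2).equalTo(1, 0)
                .with(new FlatJoinFunction<Tuple3<String, Double, Integer>, Tuple2<Integer, String>, Tuple2<Double, String>>() {

                    @Override
                    public void join(Tuple3<String, Double, Integer> first, Tuple2<Integer, String> second, Collector<Tuple2<Double, String>> out) {
                        out.collect(new Tuple2<>(first.f1, second.f0.toString()));
                    }
                });
        joined.print();
    }
}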

Aggregations

Collector (org.apache.flink.util.Collector): 51
Test (org.junit.Test): 38
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 20
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 16
Configuration (org.apache.flink.configuration.Configuration): 16
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 15
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 15
RuntimeContext (org.apache.flink.api.common.functions.RuntimeContext): 14
TimeWindow (org.apache.flink.streaming.api.windowing.windows.TimeWindow): 11
HashMap (java.util.HashMap): 9
ArrayList (java.util.ArrayList): 8
FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction): 8
Plan (org.apache.flink.api.common.Plan): 7
HashSet (java.util.HashSet): 6
RichGroupReduceFunction (org.apache.flink.api.common.functions.RichGroupReduceFunction): 6
Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 6
Map (java.util.Map): 5
GroupReduceFunction (org.apache.flink.api.common.functions.GroupReduceFunction): 5
TaskInfo (org.apache.flink.api.common.TaskInfo): 4
CoGroupFunction (org.apache.flink.api.common.functions.CoGroupFunction): 4