Example 26 with MapFunction

Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.

From class MusicProfiles, method main.

public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
    /* Read the user-song-play triplets. */
    DataSet<Tuple3<String, String, Integer>> triplets = getUserSongTripletsData(env);
    /* Read the mismatches dataset and extract the songIDs. */
    DataSet<Tuple1<String>> mismatches = getMismatchesData(env).map(new ExtractMismatchSongIds());
    /* Filter out the mismatches from the triplets dataset. */
    DataSet<Tuple3<String, String, Integer>> validTriplets = triplets.coGroup(mismatches).where(1).equalTo(0).with(new FilterOutMismatches());
    /*
     * Create a user -> song weighted bipartite graph where the edge weights
     * correspond to play counts.
     */
    Graph<String, NullValue, Integer> userSongGraph = Graph.fromTupleDataSet(validTriplets, env);
    /* Get the top track (most listened) for each user. */
    DataSet<Tuple2<String, String>> usersWithTopTrack = userSongGraph.groupReduceOnEdges(new GetTopSongPerUser(), EdgeDirection.OUT).filter(new FilterSongNodes());
    if (fileOutput) {
        usersWithTopTrack.writeAsCsv(topTracksOutputPath, "\n", "\t");
    } else {
        usersWithTopTrack.print();
    }
    /*
     * Create a user-user similarity graph, based on common songs, i.e. two
     * users that listen to the same song are connected. For each song, we
     * create an edge between each pair of its in-neighbors.
     */
    DataSet<Edge<String, NullValue>> similarUsers = userSongGraph.getEdges().filter(new FilterFunction<Edge<String, Integer>>() {

        @Override
        public boolean filter(Edge<String, Integer> edge) {
            return (edge.getValue() > playcountThreshold);
        }
    }).groupBy(1).reduceGroup(new CreateSimilarUserEdges()).distinct();
    Graph<String, Long, NullValue> similarUsersGraph = Graph.fromDataSet(similarUsers, new MapFunction<String, Long>() {

        @Override
        public Long map(String value) {
            return 1L;
        }
    }, env).getUndirected();
    /* Detect user communities using the label propagation library method. */
    // Initialize each vertex with a unique numeric label and run the label propagation algorithm
    DataSet<Tuple2<String, Long>> idsWithInitialLabels = DataSetUtils.zipWithUniqueId(similarUsersGraph.getVertexIds()).map(new MapFunction<Tuple2<Long, String>, Tuple2<String, Long>>() {

        @Override
        public Tuple2<String, Long> map(Tuple2<Long, String> tuple2) throws Exception {
            return new Tuple2<String, Long>(tuple2.f1, tuple2.f0);
        }
    });
    DataSet<Vertex<String, Long>> verticesWithCommunity = similarUsersGraph.joinWithVertices(idsWithInitialLabels, new VertexJoinFunction<Long, Long>() {

        @Override
        public Long vertexJoin(Long vertexValue, Long inputValue) {
            return inputValue;
        }
    }).run(new LabelPropagation<String, Long, NullValue>(maxIterations));
    if (fileOutput) {
        verticesWithCommunity.writeAsCsv(communitiesOutputPath, "\n", "\t");
        // since file sinks are lazy, we trigger the execution explicitly
        env.execute();
    } else {
        verticesWithCommunity.print();
    }
}
Also used: VertexJoinFunction (org.apache.flink.graph.VertexJoinFunction) Vertex (org.apache.flink.graph.Vertex) ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) FilterFunction (org.apache.flink.api.common.functions.FilterFunction) MapFunction (org.apache.flink.api.common.functions.MapFunction) NullValue (org.apache.flink.types.NullValue) Tuple1 (org.apache.flink.api.java.tuple.Tuple1) Tuple2 (org.apache.flink.api.java.tuple.Tuple2) Tuple3 (org.apache.flink.api.java.tuple.Tuple3) Edge (org.apache.flink.graph.Edge)
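
Each example on this page wires a MapFunction into a different API, but the contract itself is a single method, O map(T value) throws Exception. Below is a minimal, self-contained sketch of that contract in use; the class name and data are invented for illustration and are not part of the Flink codebase.

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;

// Hypothetical standalone example showing the MapFunction contract.
public class MapFunctionContractSketch {

    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        DataSet<Integer> numbers = env.fromElements(1, 2, 3);
        // Anonymous-class form, as used throughout the examples on this page.
        DataSet<String> labels = numbers.map(new MapFunction<Integer, String>() {

            @Override
            public String map(Integer value) {
                return "n=" + value;
            }
        });
        // print() eagerly executes the program and prints the mapped results.
        labels.print();
    }
}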

Example 27 with MapFunction

Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.

From class CEPITCase, method testSimpleKeyedPatternEventTime.

@Test
public void testSimpleKeyedPatternEventTime() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    env.setParallelism(2);
    // (Event, timestamp)
    DataStream<Event> input = env.fromElements(
            Tuple2.of(new Event(1, "start", 1.0), 5L),
            Tuple2.of(new Event(1, "middle", 2.0), 1L),
            Tuple2.of(new Event(2, "middle", 2.0), 4L),
            Tuple2.of(new Event(2, "start", 2.0), 3L),
            Tuple2.of(new Event(1, "end", 3.0), 3L),
            Tuple2.of(new Event(3, "start", 4.1), 5L),
            Tuple2.of(new Event(1, "end", 4.0), 10L),
            Tuple2.of(new Event(2, "end", 2.0), 8L),
            Tuple2.of(new Event(1, "middle", 5.0), 7L),
            Tuple2.of(new Event(3, "middle", 6.0), 9L),
            Tuple2.of(new Event(3, "end", 7.0), 7L)
    ).assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Tuple2<Event, Long>>() {

        @Override
        public long extractTimestamp(Tuple2<Event, Long> element, long currentTimestamp) {
            return element.f1;
        }

        @Override
        public Watermark checkAndGetNextWatermark(Tuple2<Event, Long> lastElement, long extractedTimestamp) {
            return new Watermark(lastElement.f1 - 5);
        }
    }).map(new MapFunction<Tuple2<Event, Long>, Event>() {

        @Override
        public Event map(Tuple2<Event, Long> value) throws Exception {
            return value.f0;
        }
    }).keyBy(new KeySelector<Event, Integer>() {

        @Override
        public Integer getKey(Event value) throws Exception {
            return value.getId();
        }
    });
    Pattern<Event, ?> pattern = Pattern.<Event>begin("start").where(new FilterFunction<Event>() {

        @Override
        public boolean filter(Event value) throws Exception {
            return value.getName().equals("start");
        }
    }).followedBy("middle").where(new FilterFunction<Event>() {

        @Override
        public boolean filter(Event value) throws Exception {
            return value.getName().equals("middle");
        }
    }).followedBy("end").where(new FilterFunction<Event>() {

        @Override
        public boolean filter(Event value) throws Exception {
            return value.getName().equals("end");
        }
    });
    DataStream<String> result = CEP.pattern(input, pattern).select(new PatternSelectFunction<Event, String>() {

        @Override
        public String select(Map<String, Event> pattern) {
            StringBuilder builder = new StringBuilder();
            builder.append(pattern.get("start").getId()).append(",").append(pattern.get("middle").getId()).append(",").append(pattern.get("end").getId());
            return builder.toString();
        }
    });
    result.writeAsText(resultPath, FileSystem.WriteMode.OVERWRITE);
    // the expected sequences of matching event ids
    expected = "1,1,1\n2,2,2";
    env.execute();
}
Also used: MapFunction (org.apache.flink.api.common.functions.MapFunction) Tuple2 (org.apache.flink.api.java.tuple.Tuple2) StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Watermark (org.apache.flink.streaming.api.watermark.Watermark) Test (org.junit.Test)
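
The assigner in this test emits a punctuated watermark that lags each element's timestamp by 5 units, which is why the deliberately out-of-order fixture still produces the event-time-ordered matches "1,1,1" and "2,2,2". A hedged standalone sketch of the same idea, with the element value serving as its own timestamp; the class name is illustrative and not from the test:

import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks;
import org.apache.flink.streaming.api.watermark.Watermark;

// Illustrative sketch: the same 5-unit watermark lag, extracted into a
// named class. Elements arriving more than 5 time units behind the
// current watermark would be considered late.
public class LaggingPunctuatedAssigner implements AssignerWithPunctuatedWatermarks<Long> {

    @Override
    public long extractTimestamp(Long element, long previousElementTimestamp) {
        return element; // in this sketch the element value is its own timestamp
    }

    @Override
    public Watermark checkAndGetNextWatermark(Long lastElement, long extractedTimestamp) {
        return new Watermark(extractedTimestamp - 5); // tolerate 5 units of out-of-orderness
    }
}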

Example 28 with MapFunction

Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.

From class Graph, method fromDataSet.

/**
 * Creates a graph from a DataSet of edges.
 * Vertices are created automatically and their values are set
 * by applying the provided map function to the vertex IDs.
 *
 * @param edges a DataSet of edges.
 * @param vertexValueInitializer the mapper function that initializes the vertex values.
 * It allows applying a map transformation to the vertex ID to produce an initial vertex value.
 * @param context the Flink execution environment.
 * @return the newly created graph.
 */
public static <K, VV, EV> Graph<K, VV, EV> fromDataSet(DataSet<Edge<K, EV>> edges, final MapFunction<K, VV> vertexValueInitializer, ExecutionEnvironment context) {
    TypeInformation<K> keyType = ((TupleTypeInfo<?>) edges.getType()).getTypeAt(0);
    TypeInformation<VV> valueType = TypeExtractor.createTypeInfo(MapFunction.class, vertexValueInitializer.getClass(), 1, keyType, null);
    @SuppressWarnings({ "unchecked", "rawtypes" }) TypeInformation<Vertex<K, VV>> returnType = (TypeInformation<Vertex<K, VV>>) new TupleTypeInfo(Vertex.class, keyType, valueType);
    DataSet<Vertex<K, VV>> vertices = edges.flatMap(new EmitSrcAndTargetAsTuple1<K, EV>())
        .name("Source and target IDs").distinct().name("IDs")
        .map(new MapFunction<Tuple1<K>, Vertex<K, VV>>() {

        private Vertex<K, VV> output = new Vertex<>();

        @Override
        public Vertex<K, VV> map(Tuple1<K> value) throws Exception {
            output.f0 = value.f0;
            output.f1 = vertexValueInitializer.map(value.f0);
            return output;
        }
    }).returns(returnType).withForwardedFields("f0").name("Initialize vertex values");
    return new Graph<>(vertices, edges, context);
}
Also used: MapFunction (org.apache.flink.api.common.functions.MapFunction) FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction) TupleTypeInfo (org.apache.flink.api.java.typeutils.TupleTypeInfo) TypeInformation (org.apache.flink.api.common.typeinfo.TypeInformation) Tuple1 (org.apache.flink.api.java.tuple.Tuple1)
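
A hedged usage sketch of this factory method, consistent with the signature above; the edge data and the double-valued vertices are invented for illustration, and env is assumed to be an ExecutionEnvironment:

// Hypothetical usage, not from the Flink sources.
DataSet<Edge<Long, Double>> edges = env.fromElements(
        new Edge<>(1L, 2L, 0.5),
        new Edge<>(2L, 3L, 0.25));

// Vertices 1, 2 and 3 are discovered from the edge set; each vertex
// value is initialized by applying the MapFunction to the vertex ID.
Graph<Long, Double, Double> graph = Graph.fromDataSet(edges,
        new MapFunction<Long, Double>() {
            @Override
            public Double map(Long vertexId) {
                return (double) vertexId;
            }
        }, env);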

Example 29 with MapFunction

Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.

From class PregelCompilerTest, method testPregelCompiler.

@SuppressWarnings("serial")
@Test
public void testPregelCompiler() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(DEFAULT_PARALLELISM);
        // compose test program
        {
            DataSet<Vertex<Long, Long>> initialVertices = env.fromElements(new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L)).map(new Tuple2ToVertexMap<Long, Long>());
            DataSet<Edge<Long, NullValue>> edges = env.fromElements(new Tuple2<>(1L, 2L)).map(new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {

                @Override
                public Edge<Long, NullValue> map(Tuple2<Long, Long> edge) {
                    return new Edge<>(edge.f0, edge.f1, NullValue.getInstance());
                }
            });
            Graph<Long, Long, NullValue> graph = Graph.fromDataSet(initialVertices, edges, env);
            DataSet<Vertex<Long, Long>> result = graph.runVertexCentricIteration(new CCCompute(), null, 100).getVertices();
            result.output(new DiscardingOutputFormat<Vertex<Long, Long>>());
        }
        Plan p = env.createProgramPlan("Pregel Connected Components");
        OptimizedPlan op = compileNoStats(p);
        // check the sink
        SinkPlanNode sink = op.getDataSinks().iterator().next();
        assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());
        assertEquals(DEFAULT_PARALLELISM, sink.getParallelism());
        // check the iteration
        WorksetIterationPlanNode iteration = (WorksetIterationPlanNode) sink.getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, iteration.getParallelism());
        // check the solution set delta
        PlanNode ssDelta = iteration.getSolutionSetDeltaPlanNode();
        assertTrue(ssDelta instanceof SingleInputPlanNode);
        SingleInputPlanNode ssFlatMap = (SingleInputPlanNode) ((SingleInputPlanNode) ssDelta).getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, ssFlatMap.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, ssFlatMap.getInput().getShipStrategy());
        // check the computation coGroup
        DualInputPlanNode computationCoGroup = (DualInputPlanNode) (ssFlatMap.getInput().getSource());
        assertEquals(DEFAULT_PARALLELISM, computationCoGroup.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, computationCoGroup.getInput1().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_HASH, computationCoGroup.getInput2().getShipStrategy());
        assertTrue(computationCoGroup.getInput2().getTempMode().isCached());
        assertEquals(new FieldList(0), computationCoGroup.getInput2().getShipStrategyKeys());
        // check that the initial partitioning is pushed out of the loop
        assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput1().getShipStrategy());
        assertEquals(new FieldList(0), iteration.getInput1().getShipStrategyKeys());
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used: ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) Tuple2ToVertexMap (org.apache.flink.graph.utils.Tuple2ToVertexMap) DataSet (org.apache.flink.api.java.DataSet) WorksetIterationPlanNode (org.apache.flink.optimizer.plan.WorksetIterationPlanNode) MapFunction (org.apache.flink.api.common.functions.MapFunction) Plan (org.apache.flink.api.common.Plan) OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan) DiscardingOutputFormat (org.apache.flink.api.java.io.DiscardingOutputFormat) FieldList (org.apache.flink.api.common.operators.util.FieldList) SingleInputPlanNode (org.apache.flink.optimizer.plan.SingleInputPlanNode) DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode) NullValue (org.apache.flink.types.NullValue) Graph (org.apache.flink.graph.Graph) PlanNode (org.apache.flink.optimizer.plan.PlanNode) SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode) Tuple2 (org.apache.flink.api.java.tuple.Tuple2) Edge (org.apache.flink.graph.Edge) Test (org.junit.Test)
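
Tuple2ToVertexMap, used to build the initial vertices above, is itself a MapFunction. A sketch of the equivalent inline form, assumed from the utility's name and its usage here rather than copied from its source:

// Illustrative inline equivalent of the Tuple2ToVertexMap step.
DataSet<Vertex<Long, Long>> initialVertices = env
        .fromElements(new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L))
        .map(new MapFunction<Tuple2<Long, Long>, Vertex<Long, Long>>() {
            @Override
            public Vertex<Long, Long> map(Tuple2<Long, Long> tuple) {
                return new Vertex<>(tuple.f0, tuple.f1);
            }
        });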

Example 30 with MapFunction

Use of org.apache.flink.api.common.functions.MapFunction in project flink by apache.

From class PregelCompilerTest, method testPregelWithCombiner.

@SuppressWarnings("serial")
@Test
public void testPregelWithCombiner() {
    try {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(DEFAULT_PARALLELISM);
        // compose test program
        {
            DataSet<Vertex<Long, Long>> initialVertices = env.fromElements(new Tuple2<>(1L, 1L), new Tuple2<>(2L, 2L)).map(new Tuple2ToVertexMap<Long, Long>());
            DataSet<Edge<Long, NullValue>> edges = env.fromElements(new Tuple2<>(1L, 2L)).map(new MapFunction<Tuple2<Long, Long>, Edge<Long, NullValue>>() {

                @Override
                public Edge<Long, NullValue> map(Tuple2<Long, Long> edge) {
                    return new Edge<>(edge.f0, edge.f1, NullValue.getInstance());
                }
            });
            Graph<Long, Long, NullValue> graph = Graph.fromDataSet(initialVertices, edges, env);
            DataSet<Vertex<Long, Long>> result = graph.runVertexCentricIteration(new CCCompute(), new CCCombiner(), 100).getVertices();
            result.output(new DiscardingOutputFormat<Vertex<Long, Long>>());
        }
        Plan p = env.createProgramPlan("Pregel Connected Components");
        OptimizedPlan op = compileNoStats(p);
        // check the sink
        SinkPlanNode sink = op.getDataSinks().iterator().next();
        assertEquals(ShipStrategyType.FORWARD, sink.getInput().getShipStrategy());
        assertEquals(DEFAULT_PARALLELISM, sink.getParallelism());
        // check the iteration
        WorksetIterationPlanNode iteration = (WorksetIterationPlanNode) sink.getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, iteration.getParallelism());
        // check the combiner
        SingleInputPlanNode combiner = (SingleInputPlanNode) iteration.getInput2().getSource();
        assertEquals(ShipStrategyType.FORWARD, combiner.getInput().getShipStrategy());
        // check the solution set delta
        PlanNode ssDelta = iteration.getSolutionSetDeltaPlanNode();
        assertTrue(ssDelta instanceof SingleInputPlanNode);
        SingleInputPlanNode ssFlatMap = (SingleInputPlanNode) ((SingleInputPlanNode) ssDelta).getInput().getSource();
        assertEquals(DEFAULT_PARALLELISM, ssFlatMap.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, ssFlatMap.getInput().getShipStrategy());
        // check the computation coGroup
        DualInputPlanNode computationCoGroup = (DualInputPlanNode) (ssFlatMap.getInput().getSource());
        assertEquals(DEFAULT_PARALLELISM, computationCoGroup.getParallelism());
        assertEquals(ShipStrategyType.FORWARD, computationCoGroup.getInput1().getShipStrategy());
        assertEquals(ShipStrategyType.PARTITION_HASH, computationCoGroup.getInput2().getShipStrategy());
        assertTrue(computationCoGroup.getInput2().getTempMode().isCached());
        assertEquals(new FieldList(0), computationCoGroup.getInput2().getShipStrategyKeys());
        // check that the initial partitioning is pushed out of the loop
        assertEquals(ShipStrategyType.PARTITION_HASH, iteration.getInput1().getShipStrategy());
        assertEquals(new FieldList(0), iteration.getInput1().getShipStrategyKeys());
    } catch (Exception e) {
        System.err.println(e.getMessage());
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used: ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment) Tuple2ToVertexMap (org.apache.flink.graph.utils.Tuple2ToVertexMap) DataSet (org.apache.flink.api.java.DataSet) WorksetIterationPlanNode (org.apache.flink.optimizer.plan.WorksetIterationPlanNode) MapFunction (org.apache.flink.api.common.functions.MapFunction) Plan (org.apache.flink.api.common.Plan) OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan) DiscardingOutputFormat (org.apache.flink.api.java.io.DiscardingOutputFormat) FieldList (org.apache.flink.api.common.operators.util.FieldList) SingleInputPlanNode (org.apache.flink.optimizer.plan.SingleInputPlanNode) DualInputPlanNode (org.apache.flink.optimizer.plan.DualInputPlanNode) NullValue (org.apache.flink.types.NullValue) Graph (org.apache.flink.graph.Graph) PlanNode (org.apache.flink.optimizer.plan.PlanNode) SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode) Tuple2 (org.apache.flink.api.java.tuple.Tuple2) Edge (org.apache.flink.graph.Edge) Test (org.junit.Test)
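
The CCCombiner passed to runVertexCentricIteration above is defined elsewhere in the test class. The following is a sketch consistent with Gelly's Pregel API (org.apache.flink.graph.pregel.MessageCombiner and MessageIterator), combining connected-components messages by taking their minimum; the body is assumed from the algorithm, not copied from the test:

// Illustrative min-combiner: forwards only the smallest component ID
// received by a vertex in a superstep, reducing network traffic.
public static final class CCCombiner extends MessageCombiner<Long, Long> {

    @Override
    public void combineMessages(MessageIterator<Long> messages) {
        long minMessage = Long.MAX_VALUE;
        while (messages.hasNext()) {
            minMessage = Math.min(minMessage, messages.next());
        }
        sendCombinedMessage(minMessage);
    }
}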

Aggregations

MapFunction (org.apache.flink.api.common.functions.MapFunction): 48
Test (org.junit.Test): 31
Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 29
ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 19
StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment): 19
Configuration (org.apache.flink.configuration.Configuration): 10
FlatMapFunction (org.apache.flink.api.common.functions.FlatMapFunction): 9
Plan (org.apache.flink.api.common.Plan): 8
RichMapFunction (org.apache.flink.api.common.functions.RichMapFunction): 8
OptimizedPlan (org.apache.flink.optimizer.plan.OptimizedPlan): 8
RichFlatMapFunction (org.apache.flink.api.common.functions.RichFlatMapFunction): 7
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 7
DiscardingOutputFormat (org.apache.flink.api.java.io.DiscardingOutputFormat): 6
Edge (org.apache.flink.graph.Edge): 6
SinkPlanNode (org.apache.flink.optimizer.plan.SinkPlanNode): 6
NullValue (org.apache.flink.types.NullValue): 6
FilterFunction (org.apache.flink.api.common.functions.FilterFunction): 5
FieldList (org.apache.flink.api.common.operators.util.FieldList): 5
DataSet (org.apache.flink.api.java.DataSet): 5
Tuple1 (org.apache.flink.api.java.tuple.Tuple1): 5