Search in sources :

Example 1 with VertexJoinFunction

Use of org.apache.flink.graph.VertexJoinFunction in the Apache Flink project.

The main method of the class MusicProfiles.

/**
 * Entry point of the music-profiles example.
 *
 * <p>The method performs the following steps, in order:
 * <ol>
 *   <li>parses the arguments via {@code parseParameters} and returns immediately if
 *       that call reports failure;</li>
 *   <li>reads the user-song-play triplets and the mismatches dataset, and removes the
 *       mismatched songs from the triplets;</li>
 *   <li>builds a user -> song graph from the remaining triplets and emits the top
 *       track of each user;</li>
 *   <li>builds a user-user similarity graph from edges whose value exceeds
 *       {@code playcountThreshold};</li>
 *   <li>assigns every vertex a unique numeric label, runs {@code LabelPropagation}
 *       for {@code maxIterations} and emits the resulting vertices.</li>
 * </ol>
 *
 * <p>Results are written as CSV files when {@code fileOutput} is set and printed
 * otherwise. {@code fileOutput}, the output paths, {@code playcountThreshold} and
 * {@code maxIterations} are defined outside this method (presumably populated by
 * {@code parseParameters} - confirm against the enclosing class).
 *
 * @param args command-line arguments, forwarded to {@code parseParameters}
 * @throws Exception any exception raised by the calls made here is propagated unchanged
 */
public static void main(String[] args) throws Exception {
    if (!parseParameters(args)) {
        return;
    }
    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Read the user-song-play triplets.
    DataSet<Tuple3<String, String, Integer>> triplets = getUserSongTripletsData(env);

    // Read the mismatches dataset and extract the song IDs.
    DataSet<Tuple1<String>> mismatches = getMismatchesData(env).map(new ExtractMismatchSongIds());

    // Filter out the mismatches from the triplets dataset: field 1 of a triplet is
    // matched against field 0 of a mismatch record.
    DataSet<Tuple3<String, String, Integer>> validTriplets = triplets
            .coGroup(mismatches)
            .where(1)
            .equalTo(0)
            .with(new FilterOutMismatches());

    // Create a user -> song weighted bipartite graph where the edge weights
    // correspond to play counts.
    Graph<String, NullValue, Integer> userSongGraph = Graph.fromTupleDataSet(validTriplets, env);

    // Get the top track (most listened) for each user.
    DataSet<Tuple2<String, String>> usersWithTopTrack = userSongGraph
            .groupReduceOnEdges(new GetTopSongPerUser(), EdgeDirection.OUT)
            .filter(new FilterSongNodes());
    if (fileOutput) {
        usersWithTopTrack.writeAsCsv(topTracksOutputPath, "\n", "\t");
    } else {
        usersWithTopTrack.print();
    }

    // Create a user-user similarity graph, based on common songs, i.e. two
    // users that listen to the same song are connected. For each song, we
    // create an edge between each pair of its in-neighbors.
    DataSet<Edge<String, NullValue>> similarUsers = userSongGraph.getEdges()
            .filter(new FilterFunction<Edge<String, Integer>>() {

                @Override
                public boolean filter(Edge<String, Integer> edge) {
                    // Keep only edges whose value is strictly above the threshold.
                    return (edge.getValue() > playcountThreshold);
                }
            })
            .groupBy(1)
            .reduceGroup(new CreateSimilarUserEdges())
            .distinct();

    // Every vertex starts with the placeholder value 1; it is replaced by a
    // unique label in the joinWithVertices call below.
    Graph<String, Long, NullValue> similarUsersGraph = Graph.fromDataSet(similarUsers, new MapFunction<String, Long>() {

        @Override
        public Long map(String value) {
            return 1L;
        }
    }, env).getUndirected();

    // Detect user communities using the label propagation library method.
    // Initialize each vertex with a unique numeric label and run the label propagation algorithm.
    DataSet<Tuple2<String, Long>> idsWithInitialLabels = DataSetUtils
            .zipWithUniqueId(similarUsersGraph.getVertexIds())
            .map(new MapFunction<Tuple2<Long, String>, Tuple2<String, Long>>() {

                @Override
                public Tuple2<String, Long> map(Tuple2<Long, String> tuple2) throws Exception {
                    // Swap (uniqueId, vertexId) into (vertexId, uniqueId).
                    return new Tuple2<String, Long>(tuple2.f1, tuple2.f0);
                }
            });
    DataSet<Vertex<String, Long>> verticesWithCommunity = similarUsersGraph
            .joinWithVertices(idsWithInitialLabels, new VertexJoinFunction<Long, Long>() {

                @Override
                public Long vertexJoin(Long vertexValue, Long inputValue) {
                    // The joined label replaces the current vertex value.
                    return inputValue;
                }
            })
            .run(new LabelPropagation<String, Long, NullValue>(maxIterations));
    if (fileOutput) {
        verticesWithCommunity.writeAsCsv(communitiesOutputPath, "\n", "\t");
        // since file sinks are lazy, we trigger the execution explicitly
        env.execute();
    } else {
        verticesWithCommunity.print();
    }
}
Also used : VertexJoinFunction(org.apache.flink.graph.VertexJoinFunction) Vertex(org.apache.flink.graph.Vertex) ExecutionEnvironment(org.apache.flink.api.java.ExecutionEnvironment) FilterFunction(org.apache.flink.api.common.functions.FilterFunction) MapFunction(org.apache.flink.api.common.functions.MapFunction) NullValue(org.apache.flink.types.NullValue) Tuple1(org.apache.flink.api.java.tuple.Tuple1) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Tuple3(org.apache.flink.api.java.tuple.Tuple3) Edge(org.apache.flink.graph.Edge)

Aggregations

FilterFunction (org.apache.flink.api.common.functions.FilterFunction): 1, MapFunction (org.apache.flink.api.common.functions.MapFunction): 1, ExecutionEnvironment (org.apache.flink.api.java.ExecutionEnvironment): 1, Tuple1 (org.apache.flink.api.java.tuple.Tuple1): 1, Tuple2 (org.apache.flink.api.java.tuple.Tuple2): 1, Tuple3 (org.apache.flink.api.java.tuple.Tuple3): 1, Edge (org.apache.flink.graph.Edge): 1, Vertex (org.apache.flink.graph.Vertex): 1, VertexJoinFunction (org.apache.flink.graph.VertexJoinFunction): 1, NullValue (org.apache.flink.types.NullValue): 1