
Example 11 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project cdap by caskdata.

The class SparkPageRankProgram, method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    LOG.info("Processing backlinkURLs data");
    JavaPairRDD<Long, String> backlinkURLs = sec.fromStream("backlinkURLStream", String.class);
    int iterationCount = getIterationCount(sec);
    LOG.info("Grouping data by key");
    // Group backlinks by unique URL key
    JavaPairRDD<String, Iterable<String>> links = backlinkURLs.values().mapToPair(new PairFunction<String, String, String>() {

        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();
    // Initialize a default rank of 1.0 for each URL key
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {

        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });
    // Iteratively calculate and update URL ranks using the PageRank algorithm.
    for (int current = 0; current < iterationCount; current++) {
        LOG.debug("Processing data with PageRank algorithm. Iteration {}/{}", current + 1, iterationCount);
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values().flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {

            @Override
            public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                LOG.debug("Processing {} with rank {}", s._1(), s._2());
                int urlCount = Iterables.size(s._1());
                List<Tuple2<String, Double>> results = new ArrayList<>();
                for (String n : s._1()) {
                    results.add(new Tuple2<>(n, s._2() / urlCount));
                }
                return results;
            }
        });
        // Recalculate URL ranks based on the backlink contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {

            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }
    LOG.info("Writing ranks data");
    final ServiceDiscoverer discoveryServiceContext = sec.getServiceDiscoverer();
    final Metrics sparkMetrics = sec.getMetrics();
    JavaPairRDD<byte[], Integer> ranksRaw = ranks.mapToPair(new PairFunction<Tuple2<String, Double>, byte[], Integer>() {

        @Override
        public Tuple2<byte[], Integer> call(Tuple2<String, Double> tuple) throws Exception {
            LOG.debug("URL {} has rank {}", Arrays.toString(tuple._1().getBytes(Charsets.UTF_8)), tuple._2());
            URL serviceURL = discoveryServiceContext.getServiceURL(SparkPageRankApp.SERVICE_HANDLERS);
            if (serviceURL == null) {
                throw new RuntimeException("Failed to discover service: " + SparkPageRankApp.SERVICE_HANDLERS);
            }
            try {
                URLConnection connection = new URL(serviceURL, String.format("%s/%s", SparkPageRankApp.SparkPageRankServiceHandler.TRANSFORM_PATH, tuple._2().toString())).openConnection();
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), Charsets.UTF_8))) {
                    // Parse the transformed rank once instead of re-parsing it for every comparison.
                    int rank = Integer.parseInt(reader.readLine());
                    if (rank == POPULAR_PAGE_THRESHOLD) {
                        sparkMetrics.count(POPULAR_PAGES, 1);
                    } else if (rank <= UNPOPULAR_PAGE_THRESHOLD) {
                        sparkMetrics.count(UNPOPULAR_PAGES, 1);
                    } else {
                        sparkMetrics.count(REGULAR_PAGES, 1);
                    }
                    return new Tuple2<>(tuple._1().getBytes(Charsets.UTF_8), rank);
                }
            } catch (Exception e) {
                LOG.warn("Failed to read the Stream for service {}", SparkPageRankApp.SERVICE_HANDLERS, e);
                throw Throwables.propagate(e);
            }
        }
    });
    // Store the calculated results in the output Dataset.
    // All results go into a single row: each entry of the row is keyed by the URL bytes,
    // and the entry's value is that URL's calculated rank.
    sec.saveAsDataset(ranksRaw, "ranks");
    LOG.info("PageRanks successfuly computed and written to \"ranks\" dataset");
}
Also used : URL (java.net.URL) PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction) Function (org.apache.spark.api.java.function.Function) PairFunction (org.apache.spark.api.java.function.PairFunction) Metrics (co.cask.cdap.api.metrics.Metrics) ArrayList (java.util.ArrayList) List (java.util.List) JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) ServiceDiscoverer (co.cask.cdap.api.ServiceDiscoverer) InputStreamReader (java.io.InputStreamReader) URLConnection (java.net.URLConnection) Tuple2 (scala.Tuple2) BufferedReader (java.io.BufferedReader)
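
On Java 8+, the same transformation pipeline can be written with lambdas instead of anonymous PairFunction/PairFlatMapFunction classes. The sketch below covers only the core rank loop, which implements the standard damped PageRank update rank(u) = 0.15 + 0.85 * sum(rank(v) / outDegree(v)); it reuses the links and iterationCount variables from the example above and is not the full CDAP program. Note that on Spark 2.x, PairFlatMapFunction returns an Iterator rather than the Iterable used above.

JavaPairRDD<String, Double> ranks = links.mapValues(urls -> 1.0);
for (int i = 0; i < iterationCount; i++) {
    // Each URL splits its current rank evenly among its outgoing links.
    JavaPairRDD<String, Double> contribs = links.join(ranks).values()
        .flatMapToPair(pair -> {
            int urlCount = Iterables.size(pair._1());
            List<Tuple2<String, Double>> out = new ArrayList<>();
            for (String url : pair._1()) {
                out.add(new Tuple2<>(url, pair._2() / urlCount));
            }
            return out.iterator(); // Spark 2.x signature; on Spark 1.x return the List itself
        });
    // Damped update: 0.15 base rank plus 85% of the received contributions.
    ranks = contribs.reduceByKey(Double::sum).mapValues(sum -> 0.15 + sum * 0.85);
}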

Example 12 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project ignite by apache.

The class SharedRDDExample, method main.

/**
 * Executes the example.
 * @param args Command line arguments, none required.
 */
public static void main(String[] args) {
    // Spark Configuration.
    SparkConf sparkConf = new SparkConf().setAppName("JavaIgniteRDDExample").setMaster("local").set("spark.executor.instances", "2");
    // Spark context.
    JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
    // Adjust the loggers to exclude logs of no interest.
    Logger.getRootLogger().setLevel(Level.ERROR);
    Logger.getLogger("org.apache.ignite").setLevel(Level.INFO);
    // Create an Ignite context with the given configuration and run Ignite in embedded mode.
    JavaIgniteContext<Integer, Integer> igniteContext = new JavaIgniteContext<Integer, Integer>(sparkContext, "examples/config/spark/example-shared-rdd.xml", false);
    // Create a Java Ignite RDD of (Integer, Integer) pairs.
    JavaIgniteRDD<Integer, Integer> sharedRDD = igniteContext.<Integer, Integer>fromCache("sharedRDD");
    // Define data to be stored in the Ignite RDD (cache).
    List<Integer> data = new ArrayList<>(20);
    for (int i = 0; i < 20; i++) {
        data.add(i);
    }
    // Prepare a Java RDD.
    JavaRDD<Integer> javaRDD = sparkContext.<Integer>parallelize(data);
    // Fill the Ignite RDD with Integer pairs; pairs are represented as Scala Tuple2.
    sharedRDD.savePairs(javaRDD.<Integer, Integer>mapToPair(new PairFunction<Integer, Integer, Integer>() {

        @Override
        public Tuple2<Integer, Integer> call(Integer val) throws Exception {
            return new Tuple2<Integer, Integer>(val, val);
        }
    }));
    System.out.println(">>> Iterating over Ignite Shared RDD...");
    // Iterate over the Ignite RDD.
    sharedRDD.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Transforming values stored in Ignite Shared RDD...");
    // Keep only the pairs whose value is even, as a transformed RDD.
    JavaPairRDD<Integer, Integer> transformedValues = sharedRDD.filter(new Function<Tuple2<Integer, Integer>, Boolean>() {

        @Override
        public Boolean call(Tuple2<Integer, Integer> tuple) throws Exception {
            return tuple._2() % 2 == 0;
        }
    });
    // Print out the transformed values.
    transformedValues.foreach(new VoidFunction<Tuple2<Integer, Integer>>() {

        @Override
        public void call(Tuple2<Integer, Integer> tuple) throws Exception {
            System.out.println("(" + tuple._1 + "," + tuple._2 + ")");
        }
    });
    System.out.println(">>> Executing SQL query over Ignite Shared RDD...");
    // Execute SQL query over the Ignite RDD.
    Dataset df = sharedRDD.sql("select _val from Integer where _key < 9");
    // Show the result of the execution.
    df.show();
    // Close IgniteContext on all the workers.
    igniteContext.close(true);
}
Also used : Dataset(org.apache.spark.sql.Dataset) ArrayList(java.util.ArrayList) JavaIgniteContext(org.apache.ignite.spark.JavaIgniteContext) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
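
The pair-building and filtering steps shrink considerably with Java 8 lambdas. A minimal sketch, reusing the sharedRDD and javaRDD variables from the example above:

// Save (value, value) pairs into the Ignite cache.
sharedRDD.savePairs(javaRDD.mapToPair(val -> new Tuple2<>(val, val)));

// Keep only the pairs whose value is even, then print them.
JavaPairRDD<Integer, Integer> evens = sharedRDD.filter(t -> t._2() % 2 == 0);
evens.foreach(t -> System.out.println("(" + t._1() + "," + t._2() + ")"));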

Aggregations

PairFunction (org.apache.spark.api.java.function.PairFunction): 12
Tuple2 (scala.Tuple2): 12
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 9
ArrayList (java.util.ArrayList): 6
Function (org.apache.spark.api.java.function.Function): 5
List (java.util.List): 4
SparkConf (org.apache.spark.SparkConf): 4
ServiceDiscoverer (co.cask.cdap.api.ServiceDiscoverer): 2
Metrics (co.cask.cdap.api.metrics.Metrics): 2
BufferedReader (java.io.BufferedReader): 2
InputStreamReader (java.io.InputStreamReader): 2
Iterable (java.lang.Iterable): 2
URL (java.net.URL): 2
URLConnection (java.net.URLConnection): 2
Comparator (java.util.Comparator): 2
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 2
JavaRDD (org.apache.spark.api.java.JavaRDD): 2
PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction): 2
Tuple4 (scala.Tuple4): 2
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 1