Example 6 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project cdap by caskdata.

From the class SparkPageRankProgram, method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    LOG.info("Processing backlinkURLs data");
    JavaPairRDD<Long, String> backlinkURLs = sec.fromStream("backlinkURLStream", String.class);
    int iterationCount = getIterationCount(sec);
    LOG.info("Grouping data by key");
    // Group backlinks by their unique URL key
    JavaPairRDD<String, Iterable<String>> links = backlinkURLs.values().mapToPair(new PairFunction<String, String, String>() {

        @Override
        public Tuple2<String, String> call(String s) {
            String[] parts = SPACES.split(s);
            return new Tuple2<>(parts[0], parts[1]);
        }
    }).distinct().groupByKey().cache();
    // Initialize default rank for each key URL
    JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {

        @Override
        public Double call(Iterable<String> rs) {
            return 1.0;
        }
    });
    // Iteratively calculate and update URL ranks using the PageRank algorithm.
    for (int current = 0; current < iterationCount; current++) {
        LOG.debug("Processing data with PageRank algorithm. Iteration {}/{}", current + 1, iterationCount);
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values().flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {

            @Override
            public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
                LOG.debug("Processing {} with rank {}", s._1(), s._2());
                int urlCount = Iterables.size(s._1());
                List<Tuple2<String, Double>> results = new ArrayList<>();
                for (String n : s._1()) {
                    results.add(new Tuple2<>(n, s._2() / urlCount));
                }
                return results;
            }
        });
        // Re-calculates URL ranks based on backlink contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {

            @Override
            public Double call(Double sum) {
                return 0.15 + sum * 0.85;
            }
        });
    }
    LOG.info("Writing ranks data");
    final ServiceDiscoverer discoveryServiceContext = sec.getServiceDiscoverer();
    final Metrics sparkMetrics = sec.getMetrics();
    JavaPairRDD<byte[], Integer> ranksRaw = ranks.mapToPair(new PairFunction<Tuple2<String, Double>, byte[], Integer>() {

        @Override
        public Tuple2<byte[], Integer> call(Tuple2<String, Double> tuple) throws Exception {
            LOG.debug("URL {} has rank {}", Arrays.toString(tuple._1().getBytes(Charsets.UTF_8)), tuple._2());
            URL serviceURL = discoveryServiceContext.getServiceURL(SparkPageRankApp.SERVICE_HANDLERS);
            if (serviceURL == null) {
                throw new RuntimeException("Failed to discover service: " + SparkPageRankApp.SERVICE_HANDLERS);
            }
            try {
                URLConnection connection = new URL(serviceURL, String.format("%s/%s", SparkPageRankApp.SparkPageRankServiceHandler.TRANSFORM_PATH, tuple._2().toString())).openConnection();
                try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream(), Charsets.UTF_8))) {
                    String pr = reader.readLine();
                    // Parse the transformed rank once and bucket the page by popularity.
                    int rank = Integer.parseInt(pr);
                    if (rank == POPULAR_PAGE_THRESHOLD) {
                        sparkMetrics.count(POPULAR_PAGES, 1);
                    } else if (rank <= UNPOPULAR_PAGE_THRESHOLD) {
                        sparkMetrics.count(UNPOPULAR_PAGES, 1);
                    } else {
                        sparkMetrics.count(REGULAR_PAGES, 1);
                    }
                    return new Tuple2<>(tuple._1().getBytes(Charsets.UTF_8), rank);
                }
            } catch (Exception e) {
                LOG.warn("Failed to read the Stream for service {}", SparkPageRankApp.SERVICE_HANDLERS, e);
                throw Throwables.propagate(e);
            }
        }
    });
    // Store calculated results in output Dataset.
    // All calculated results are stored in one row.
    // Each result, the calculated URL rank based on backlink contributions, is an entry of the row.
    // The value of the entry is the URL rank.
    sec.saveAsDataset(ranksRaw, "ranks");
    LOG.info("PageRanks successfuly computed and written to \"ranks\" dataset");
}
Also used: URL (java.net.URL), PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction), Function (org.apache.spark.api.java.function.Function), PairFunction (org.apache.spark.api.java.function.PairFunction), Metrics (co.cask.cdap.api.metrics.Metrics), ArrayList (java.util.ArrayList), List (java.util.List), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), ServiceDiscoverer (co.cask.cdap.api.ServiceDiscoverer), InputStreamReader (java.io.InputStreamReader), URLConnection (java.net.URLConnection), Tuple2 (scala.Tuple2), BufferedReader (java.io.BufferedReader)
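
Since PairFunction is a single-method interface, the first mapToPair step above can be written much more compactly with a Java 8 lambda. A minimal sketch for comparison, assuming the SPACES pattern and backlinkURLs RDD from the example; this is not how the CDAP source itself is written:

JavaPairRDD<String, Iterable<String>> links = backlinkURLs.values()
    .mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    })
    .distinct()
    .groupByKey()
    .cache();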

Example 7 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project cdap by caskdata.

From the class ClassicSparkProgram, method main.

public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());
    Schema schema = Schema.recordOf("record", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    List<StructuredRecord> records = new ArrayList<>();
    for (int i = 1; i <= 10; i++) {
        records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
    }
    // This tests serialization of StructuredRecord as well as the use of a custom Kryo serializer
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int result = jsc.parallelize(records).mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {

        @Override
        public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
            return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
        }
    }).map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {

        @Override
        public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
            return tuple._1;
        }
    }).reduce(new Function2<MyInt, MyInt, MyInt>() {

        @Override
        public MyInt call(MyInt v1, MyInt v2) throws Exception {
            return new MyInt(v1.toInt() + v2.toInt());
        }
    }).toInt();
    if (result != 55) {
        throw new Exception("Expected result to be 55, but got " + result);
    }
}
Also used: Schema (co.cask.cdap.api.data.schema.Schema), ArrayList (java.util.ArrayList), Function2 (org.apache.spark.api.java.function.Function2), StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), PairFunction (org.apache.spark.api.java.function.PairFunction), SparkConf (org.apache.spark.SparkConf)
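
The spark.kryo.registrator setting above points Spark at an implementation of org.apache.spark.serializer.KryoRegistrator. A hedged sketch of the shape such a registrator takes; the actual MyKryoRegistrator in the CDAP repository may register other classes or attach custom serializers:

import com.esotericsoftware.kryo.Kryo;
import org.apache.spark.serializer.KryoRegistrator;

public class MyKryoRegistrator implements KryoRegistrator {

    @Override
    public void registerClasses(Kryo kryo) {
        // Registering the custom key type up front lets Kryo serialize it without
        // writing the full class name into every record.
        kryo.register(MyInt.class);
    }
}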

Example 8 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.

From the class ChapterSixExample, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 4) {
        throw new Exception("Usage: ChapterSixExample sparkMaster inputFile inputFile2 outputDir");
    }
    String sparkMaster = args[0];
    String inputFile = args[1];
    String inputFile2 = args[2];
    String outputDir = args[3];
    JavaSparkContext sc = new JavaSparkContext(sparkMaster, "ChapterSixExample", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.textFile(inputFile);
    // Count the number of lines with KK6JKQ
    final Accumulator<Integer> count = sc.accumulator(0);
    rdd.foreach(new VoidFunction<String>() {

        public void call(String line) {
            if (line.contains("KK6JKQ")) {
                count.add(1);
            }
        }
    });
    System.out.println("Lines with 'KK6JKQ': " + count.value());
    // Create Accumulators initialized at 0
    final Accumulator<Integer> blankLines = sc.accumulator(0);
    JavaRDD<String> callSigns = rdd.flatMap(new FlatMapFunction<String, String>() {

        public Iterable<String> call(String line) {
            if (line.equals("")) {
                blankLines.add(1);
            }
            return Arrays.asList(line.split(" "));
        }
    });
    callSigns.saveAsTextFile(outputDir + "/callsigns");
    System.out.println("Blank lines: " + blankLines.value());
    // Start validating the call signs
    final Accumulator<Integer> validSignCount = sc.accumulator(0);
    final Accumulator<Integer> invalidSignCount = sc.accumulator(0);
    JavaRDD<String> validCallSigns = callSigns.filter(new Function<String, Boolean>() {

        public Boolean call(String callSign) {
            Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
            Matcher m = p.matcher(callSign);
            boolean b = m.matches();
            if (b) {
                validSignCount.add(1);
            } else {
                invalidSignCount.add(1);
            }
            return b;
        }
    });
    JavaPairRDD<String, Integer> contactCounts = validCallSigns.mapToPair(new PairFunction<String, String, Integer>() {

        public Tuple2<String, Integer> call(String callSign) {
            return new Tuple2<>(callSign, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    });
    // Force evaluation so the counters are populated
    contactCounts.count();
    if (invalidSignCount.value() < 0.1 * validSignCount.value()) {
        contactCounts.saveAsTextFile(outputDir + "/contactCount");
    } else {
        System.out.println("Too many errors " + invalidSignCount.value() + " for " + validSignCount.value());
        System.exit(1);
    }
    // Read in the call sign table
    // Lookup the countries for each call sign in the
    // contactCounts RDD.
    final Broadcast<String[]> signPrefixes = sc.broadcast(loadCallSignTable());
    JavaPairRDD<String, Integer> countryContactCounts = contactCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Integer>() {

        public Tuple2<String, Integer> call(Tuple2<String, Integer> callSignCount) {
            String sign = callSignCount._1();
            String country = lookupCountry(sign, signPrefixes.value());
            return new Tuple2<>(country, callSignCount._2());
        }
    }).reduceByKey(new SumInts());
    countryContactCounts.saveAsTextFile(outputDir + "/countries.txt");
    System.out.println("Saved country contact counts as a file");
    // Use mapPartitions to re-use setup work.
    JavaPairRDD<String, CallLog[]> contactsContactLists = validCallSigns.mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, CallLog[]>() {

        public Iterable<Tuple2<String, CallLog[]>> call(Iterator<String> input) {
            // List for our results.
            ArrayList<Tuple2<String, CallLog[]>> callsignQsos = new ArrayList<>();
            ArrayList<Tuple2<String, ContentExchange>> requests = new ArrayList<>();
            ObjectMapper mapper = createMapper();
            HttpClient client = new HttpClient();
            try {
                client.start();
                while (input.hasNext()) {
                    requests.add(createRequestForSign(input.next(), client));
                }
                for (Tuple2<String, ContentExchange> signExchange : requests) {
                    callsignQsos.add(fetchResultFromRequest(mapper, signExchange));
                }
            } catch (Exception e) {
                // Errors from the HTTP fetch are swallowed; failed call signs are simply omitted.
            }
            return callsignQsos;
        }
    });
    System.out.println(StringUtils.join(contactsContactLists.collect(), ","));
    // Compute the distance of each call using an external R program
    // adds our script to a list of files for each node to download with this job
    String distScript = System.getProperty("user.dir") + "/src/R/finddistance.R";
    String distScriptName = "finddistance.R";
    sc.addFile(distScript);
    JavaRDD<String> pipeInputs = contactsContactLists.values().map(new VerifyCallLogs()).flatMap(new FlatMapFunction<CallLog[], String>() {

        public Iterable<String> call(CallLog[] calls) {
            ArrayList<String> latLons = new ArrayList<>();
            for (CallLog call : calls) {
                latLons.add(call.mylat + "," + call.mylong + "," + call.contactlat + "," + call.contactlong);
            }
            return latLons;
        }
    });
    JavaRDD<String> distances = pipeInputs.pipe(SparkFiles.get(distScriptName));
    // First we need to convert our RDD of String to a DoubleRDD so we can
    // access the stats function
    JavaDoubleRDD distanceDoubles = distances.mapToDouble(new DoubleFunction<String>() {

        public double call(String value) {
            return Double.parseDouble(value);
        }
    });
    final StatCounter stats = distanceDoubles.stats();
    final Double stddev = stats.stdev();
    final Double mean = stats.mean();
    JavaDoubleRDD reasonableDistances = distanceDoubles.filter(new Function<Double, Boolean>() {

        public Boolean call(Double x) {
            return (Math.abs(x - mean) < 3 * stddev);
        }
    });
    System.out.println(StringUtils.join(reasonableDistances.collect(), ","));
    sc.stop();
    System.exit(0);
}
Also used: ArrayList (java.util.ArrayList), Iterator (java.util.Iterator), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), StatCounter (org.apache.spark.util.StatCounter), PairFunction (org.apache.spark.api.java.function.PairFunction), ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper), JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD), FileNotFoundException (java.io.FileNotFoundException), Tuple2 (scala.Tuple2), HttpClient (org.eclipse.jetty.client.HttpClient), ContentExchange (org.eclipse.jetty.client.ContentExchange)
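
On Spark 2.x the Accumulator<Integer> API used above is deprecated in favor of LongAccumulator. A minimal sketch of the blank-line counter rewritten against that API, assuming the same sc and rdd as in the example:

import org.apache.spark.util.LongAccumulator;

LongAccumulator blankLines = sc.sc().longAccumulator("blankLines");
rdd.foreach(line -> {
    if (line.isEmpty()) {
        blankLines.add(1);
    }
});
System.out.println("Blank lines: " + blankLines.value());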

Example 9 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.

From the class WordCount, method main.

public static void main(String[] args) throws Exception {
    String master = args[0];
    JavaSparkContext sc = new JavaSparkContext(master, "wordcount", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.textFile(args[1]);
    JavaPairRDD<String, Integer> counts = rdd.flatMap(new FlatMapFunction<String, String>() {

        public Iterable<String> call(String x) {
            return Arrays.asList(x.split(" "));
        }
    }).mapToPair(new PairFunction<String, String, Integer>() {

        public Tuple2<String, Integer> call(String x) {
            return new Tuple2<>(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    });
    counts.saveAsTextFile(args[2]);
}
Also used: Iterable (java.lang.Iterable), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), PairFunction (org.apache.spark.api.java.function.PairFunction)
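
For comparison, a sketch of the same pipeline written with Java 8 lambdas against the Spark 2.x API, where FlatMapFunction returns an Iterator rather than an Iterable; the learning-spark code above targets the older signature:

JavaPairRDD<String, Integer> counts = rdd
    .flatMap(x -> Arrays.asList(x.split(" ")).iterator())
    .mapToPair(x -> new Tuple2<>(x, 1))
    .reduceByKey((x, y) -> x + y);
counts.saveAsTextFile(args[2]);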

Example 10 with PairFunction

Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.

From the class WordCount, method main (the SparkConf-based variant).

public static void main(String[] args) throws Exception {
    String inputFile = args[0];
    String outputFile = args[1];
    // Create a Java Spark Context.
    SparkConf conf = new SparkConf().setAppName("wordCount");
    JavaSparkContext sc = new JavaSparkContext(conf);
    // Load our input data.
    JavaRDD<String> input = sc.textFile(inputFile);
    // Split up into words.
    JavaRDD<String> words = input.flatMap(new FlatMapFunction<String, String>() {

        public Iterable<String> call(String x) {
            return Arrays.asList(x.split(" "));
        }
    });
    // Transform into word and count.
    JavaPairRDD<String, Integer> counts = words.mapToPair(new PairFunction<String, String, Integer>() {

        public Tuple2<String, Integer> call(String x) {
            return new Tuple2<>(x, 1);
        }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        public Integer call(Integer x, Integer y) {
            return x + y;
        }
    });
    // Save the word count back out to a text file, causing evaluation.
    counts.saveAsTextFile(outputFile);
}
Also used: Iterable (java.lang.Iterable), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), PairFunction (org.apache.spark.api.java.function.PairFunction), SparkConf (org.apache.spark.SparkConf)
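
When the set of distinct words is small enough to collect on the driver, the same counts can be obtained without a PairFunction at all. A sketch using JavaRDD.countByValue(), assuming the words RDD from the example above; note that the result is a local java.util.Map, not an RDD:

Map<String, Long> wordCounts = words.countByValue();
wordCounts.forEach((word, n) -> System.out.println(word + ": " + n));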

Aggregations

PairFunction (org.apache.spark.api.java.function.PairFunction): 12
Tuple2 (scala.Tuple2): 12
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 9
ArrayList (java.util.ArrayList): 6
Function (org.apache.spark.api.java.function.Function): 5
List (java.util.List): 4
SparkConf (org.apache.spark.SparkConf): 4
ServiceDiscoverer (co.cask.cdap.api.ServiceDiscoverer): 2
Metrics (co.cask.cdap.api.metrics.Metrics): 2
BufferedReader (java.io.BufferedReader): 2
InputStreamReader (java.io.InputStreamReader): 2
Iterable (java.lang.Iterable): 2
URL (java.net.URL): 2
URLConnection (java.net.URLConnection): 2
Comparator (java.util.Comparator): 2
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 2
JavaRDD (org.apache.spark.api.java.JavaRDD): 2
PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction): 2
Tuple4 (scala.Tuple4): 2
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord): 1