Search in sources :

Example 1 with StatCounter

use of org.apache.spark.util.StatCounter in project learning-spark by databricks.

The main method of the class ChapterSixExample.

public static void main(String[] args) throws Exception {
    if (args.length != 4) {
        throw new Exception("Usage AccumulatorExample sparkMaster inputFile outDirectory");
    String sparkMaster = args[0];
    String inputFile = args[1];
    String inputFile2 = args[2];
    String outputDir = args[3];
    JavaSparkContext sc = new JavaSparkContext(sparkMaster, "ChapterSixExample", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.textFile(inputFile);
    // Count the number of lines with KK6JKQ
    final Accumulator<Integer> count = sc.accumulator(0);
    rdd.foreach(new VoidFunction<String>() {

        public void call(String line) {
            if (line.contains("KK6JKQ")) {
    System.out.println("Lines with 'KK6JKQ': " + count.value());
    // Create Accumulators initialized at 0
    final Accumulator<Integer> blankLines = sc.accumulator(0);
    JavaRDD<String> callSigns = rdd.flatMap(new FlatMapFunction<String, String>() {

        public Iterable<String> call(String line) {
            if (line.equals("")) {
            return Arrays.asList(line.split(" "));
    callSigns.saveAsTextFile(outputDir + "/callsigns");
    System.out.println("Blank lines: " + blankLines.value());
    // Start validating the call signs
    final Accumulator<Integer> validSignCount = sc.accumulator(0);
    final Accumulator<Integer> invalidSignCount = sc.accumulator(0);
    JavaRDD<String> validCallSigns = callSigns.filter(new Function<String, Boolean>() {

        public Boolean call(String callSign) {
            Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
            Matcher m = p.matcher(callSign);
            boolean b = m.matches();
            if (b) {
            } else {
            return b;
    JavaPairRDD<String, Integer> contactCounts = validCallSigns.mapToPair(new PairFunction<String, String, Integer>() {

        public Tuple2<String, Integer> call(String callSign) {
            return new Tuple2(callSign, 1);
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {

        public Integer call(Integer x, Integer y) {
            return x + y;
    // Force evaluation so the counters are populated
    if (invalidSignCount.value() < 0.1 * validSignCount.value()) {
        contactCounts.saveAsTextFile(outputDir + "/contactCount");
    } else {
        System.out.println("Too many errors " + invalidSignCount.value() + " for " + validSignCount.value());
    // Read in the call sign table
    // Lookup the countries for each call sign in the
    // contactCounts RDD.
    final Broadcast<String[]> signPrefixes = sc.broadcast(loadCallSignTable());
    JavaPairRDD<String, Integer> countryContactCounts = contactCounts.mapToPair(new PairFunction<Tuple2<String, Integer>, String, Integer>() {

        public Tuple2<String, Integer> call(Tuple2<String, Integer> callSignCount) {
            String sign = callSignCount._1();
            String country = lookupCountry(sign, signPrefixes.value());
            return new Tuple2(country, callSignCount._2());
    }).reduceByKey(new SumInts());
    countryContactCounts.saveAsTextFile(outputDir + "/countries.txt");
    System.out.println("Saved country contact counts as a file");
    // Use mapPartitions to re-use setup work.
    JavaPairRDD<String, CallLog[]> contactsContactLists = validCallSigns.mapPartitionsToPair(new PairFlatMapFunction<Iterator<String>, String, CallLog[]>() {

        public Iterable<Tuple2<String, CallLog[]>> call(Iterator<String> input) {
            // List for our results.
            ArrayList<Tuple2<String, CallLog[]>> callsignQsos = new ArrayList<Tuple2<String, CallLog[]>>();
            ArrayList<Tuple2<String, ContentExchange>> requests = new ArrayList<Tuple2<String, ContentExchange>>();
            ObjectMapper mapper = createMapper();
            HttpClient client = new HttpClient();
            try {
                while (input.hasNext()) {
                    requests.add(createRequestForSign(, client));
                for (Tuple2<String, ContentExchange> signExchange : requests) {
                    callsignQsos.add(fetchResultFromRequest(mapper, signExchange));
            } catch (Exception e) {
            return callsignQsos;
    System.out.println(StringUtils.join(contactsContactLists.collect(), ","));
    // Computer the distance of each call using an external R program
    // adds our script to a list of files for each node to download with this job
    String distScript = System.getProperty("user.dir") + "/src/R/finddistance.R";
    String distScriptName = "finddistance.R";
    JavaRDD<String> pipeInputs = contactsContactLists.values().map(new VerifyCallLogs()).flatMap(new FlatMapFunction<CallLog[], String>() {

        public Iterable<String> call(CallLog[] calls) {
            ArrayList<String> latLons = new ArrayList<String>();
            for (CallLog call : calls) {
                latLons.add(call.mylat + "," + call.mylong + "," + call.contactlat + "," + call.contactlong);
            return latLons;
    JavaRDD<String> distances = pipeInputs.pipe(SparkFiles.get(distScriptName));
    // First we need to convert our RDD of String to a DoubleRDD so we can
    // access the stats function
    JavaDoubleRDD distanceDoubles = distances.mapToDouble(new DoubleFunction<String>() {

        public double call(String value) {
            return Double.parseDouble(value);
    final StatCounter stats = distanceDoubles.stats();
    final Double stddev = stats.stdev();
    final Double mean = stats.mean();
    JavaDoubleRDD reasonableDistances = distanceDoubles.filter(new Function<Double, Boolean>() {

        public Boolean call(Double x) {
            return (Math.abs(x - mean) < 3 * stddev);
    System.out.println(StringUtils.join(reasonableDistances.collect(), ","));
Also used : ArrayList(java.util.ArrayList) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StatCounter(org.apache.spark.util.StatCounter) PairFunction(org.apache.spark.api.java.function.PairFunction) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JavaDoubleRDD(org.apache.spark.api.java.JavaDoubleRDD) FileNotFoundException(java.io.FileNotFoundException) Tuple2(scala.Tuple2) HttpClient(org.eclipse.jetty.client.HttpClient) ContentExchange(org.eclipse.jetty.client.ContentExchange)

Example 2 with StatCounter

use of org.apache.spark.util.StatCounter in project learning-spark by databricks.

The removeOutliers method of the class RemoveOutliers.

static JavaDoubleRDD removeOutliers(JavaDoubleRDD rdd) {
    final StatCounter summaryStats = rdd.stats();
    final Double stddev = Math.sqrt(summaryStats.variance());
    return rdd.filter(new Function<Double, Boolean>() {

        public Boolean call(Double x) {
            return (Math.abs(x - summaryStats.mean()) < 3 * stddev);
Also used : StatCounter(org.apache.spark.util.StatCounter)


StatCounter (org.apache.spark.util.StatCounter)2 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)1 FileNotFoundException (java.io.FileNotFoundException)1 ArrayList (java.util.ArrayList)1 Iterator (java.util.Iterator)1 JavaDoubleRDD (org.apache.spark.api.java.JavaDoubleRDD)1 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)1 PairFunction (org.apache.spark.api.java.function.PairFunction)1 ContentExchange (org.eclipse.jetty.client.ContentExchange)1 HttpClient (org.eclipse.jetty.client.HttpClient)1 Tuple2 (scala.Tuple2)1