Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class LogAnalyzerTotal, method processAccessLogs.
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and update the static variables to track this.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of response code counts.
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
            public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
                return Functions.responseCodeCount(rdd);
            }
        }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {
        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of IP address counts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
            public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
                return Functions.ipAddressCount(rdd);
            }
        });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // A DStream of IP address counts without transform...
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // ...and joining it with the transfer amount.
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our DStream of IP address request counts.
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(
        new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {
            public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
                return new Tuple2(new Text(e._1()), new LongWritable(e._2()));
            }
        });
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All IPs with more than 10 requests.
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(
        new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {
            public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
                return Functions.filterIPAddress(rdd);
            }
        });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {
        public Void call(JavaRDD<String> rdd) {
            List<String> currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {
            public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
                return Functions.endpointCount(rdd);
            }
        }).updateStateByKey(new Functions.ComputeRunningSum());
    Object ordering = Ordering.natural();
    final Comparator<Long> cmp = (Comparator<Long>) ordering;
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {
        public Void call(JavaPairRDD<String, Long> rdd) {
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
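The stateful totals above all go through Functions.ComputeRunningSum, which is not shown here. As a rough guide only, a running-sum update function for updateStateByKey looks something like the sketch below; it assumes the Spark 1.x Java streaming API (which wraps the state in Guava's Optional), and the class name ComputeRunningSumSketch is illustrative, not the project's actual helper.

import java.util.List;
import com.google.common.base.Optional;
import org.apache.spark.api.java.function.Function2;

// Sketch of a running-sum state update function for updateStateByKey (assumed shape,
// not the learning-spark implementation).
public class ComputeRunningSumSketch
        implements Function2<List<Long>, Optional<Long>, Optional<Long>> {
    @Override
    public Optional<Long> call(List<Long> newCounts, Optional<Long> currentTotal) {
        long total = currentTotal.or(0L); // previous total, or 0 the first time the key is seen
        for (Long count : newCounts) {
            total += count;               // add the counts from the current batch
        }
        return Optional.of(total);        // returned value becomes the new state for this key
    }
}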
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class LogAnalyzerWindowed, method processAccessLogs.
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {
        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {
        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {
        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(
        new PairFunction<ApacheAccessLog, String, Long>() {
            public Tuple2<String, Long> call(ApacheAccessLog entry) {
                return new Tuple2(entry.getIpAddress(), 1L);
            }
        });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
        // Adding elements in the new slice
        new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
                return v1 + v2;
            }
        },
        // Removing elements from the oldest slice
        new Function2<Long, Long, Long>() {
            public Long call(Long v1, Long v2) {
                return v1 - v2;
            }
        },
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(
        Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // Use a transform for the response code count.
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(
        new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {
            public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
                return Functions.responseCodeCount(logs);
            }
        });
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {
        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            List<String> ip = Functions.filterIPAddress(ipAddressCounts).take(100);
            Object ordering = Ordering.natural();
            Comparator<Long> cmp = (Comparator<Long>) ordering;
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10,
                new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ip, topEndpoints);
            return null;
        }
    });
}
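For comparison, the reduceByKeyAndWindow step can be written more compactly with Java 8 lambdas on Spark releases that accept them. The helper below is only a sketch: the method name windowedIpCounts and the explicit Duration parameters are illustrative stand-ins for Flags.getInstance().getWindowLength()/getSlideInterval(). Note that the inverse-reduce overload used here (and above) requires checkpointing to be enabled on the streaming context.

// Sketch: windowed per-IP request counts with lambdas (illustrative helper, not from the project).
static JavaPairDStream<String, Long> windowedIpCounts(JavaDStream<ApacheAccessLog> accessLogsDStream,
                                                      Duration windowLength, Duration slideInterval) {
    return accessLogsDStream
        .mapToPair(entry -> new Tuple2<>(entry.getIpAddress(), 1L))
        .reduceByKeyAndWindow(
            (v1, v2) -> v1 + v2,   // add counts for batches entering the window
            (v1, v2) -> v1 - v2,   // subtract counts for batches leaving the window
            windowLength, slideInterval);
}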
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class KeyValueMapFilter, method main.
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage: KeyValueMapFilter sparkMaster inputFile");
    }
    String master = args[0];
    String inputFile = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "KeyValueMapFilter",
        System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> input = sc.textFile(inputFile);
    // Key each line by its first word, keeping the whole line as the value.
    PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {
        @Override
        public Tuple2<String, String> call(String x) {
            return new Tuple2(x.split(" ")[0], x);
        }
    };
    // Keep only pairs whose value (the full line) is shorter than 20 characters.
    Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() {
        @Override
        public Boolean call(Tuple2<String, String> input) {
            return (input._2().length() < 20);
        }
    };
    JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
    JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
    Map<String, String> resultMap = result.collectAsMap();
    for (Entry<String, String> entry : resultMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue());
    }
}
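On a Spark release that accepts Java 8 lambdas, the same PairFunction and filter can be written inline. The fragment below is a sketch of that equivalent, reusing the input RDD from the example above; the variable names keyed and shortLines are illustrative.

// Sketch: the same key/filter pipeline written with lambdas.
JavaPairRDD<String, String> keyed = input.mapToPair(x -> new Tuple2<>(x.split(" ")[0], x));
JavaPairRDD<String, String> shortLines = keyed.filter(pair -> pair._2().length() < 20);
shortLines.collectAsMap().forEach((k, v) -> System.out.println(k + ":" + v));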
Use of org.apache.spark.api.java.function.PairFunction in project geode by apache.
The class RDDSaveJavaDemo, method main.
public static void main(String[] argv) {
    if (argv.length != 1) {
        System.err.printf("Usage: RDDSaveJavaDemo <locators>\n");
        return;
    }
    SparkConf conf = new SparkConf().setAppName("RDDSaveJavaDemo");
    conf.set(GeodeLocatorPropKey, argv[0]);
    JavaSparkContext sc = new JavaSparkContext(conf);
    List<String> data = new ArrayList<String>();
    data.add("abcdefg");
    data.add("abcdefgh");
    data.add("abcdefghi");
    JavaRDD<String> rdd = sc.parallelize(data);
    GeodeConnectionConf connConf = GeodeConnectionConf.apply(conf);
    // Pair each string with its length before writing it to the Geode region.
    PairFunction<String, String, Integer> func = new PairFunction<String, String, Integer>() {
        @Override
        public Tuple2<String, Integer> call(String s) throws Exception {
            return new Tuple2<String, Integer>(s, s.length());
        }
    };
    javaFunctions(rdd).saveToGeode("str_int_region", func, connConf);
    sc.stop();
}
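With Java 8 lambdas (assuming a Spark version that accepts them), only the PairFunction definition changes; the saveToGeode call from the example stays the same. A minimal sketch:

// Sketch: pair each string with its length, written as a lambda.
PairFunction<String, String, Integer> func = s -> new Tuple2<>(s, s.length());
javaFunctions(rdd).saveToGeode("str_int_region", func, connConf);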
Use of org.apache.spark.api.java.function.PairFunction in project incubator-systemml by apache.
The class ApplyTfCSVSPARK, method runSparkJob.
/**
* Apply transformation metadata and generate the result in CSV format, as a
* JavaRDD of Strings.
*
* @param sec spark execution context
* @param inputRDD input rdd
* @param tfMtdPath transform metadata path
* @param spec transform specification as json string
* @param tmpPath temporary file path
* @param prop csv file format properties
* @param numCols number of columns
* @param headerLine header line
* @return JavaPairRDD of (Long, String) pairs
* @throws IOException if IOException occurs
* @throws ClassNotFoundException if ClassNotFoundException occurs
* @throws InterruptedException if InterruptedException occurs
* @throws IllegalArgumentException if IllegalArgumentException occurs
* @throws JSONException if JSONException occurs
*/
public static JavaPairRDD<Long, String> runSparkJob(SparkExecutionContext sec, JavaRDD<Tuple2<LongWritable, Text>> inputRDD, String tfMtdPath, String spec, String tmpPath, CSVFileFormatProperties prop, int numCols, String headerLine) throws IOException, ClassNotFoundException, InterruptedException, IllegalArgumentException, JSONException {
    // Load transformation metadata and broadcast it
    String[] naStrings = TfUtils.parseNAStrings(prop.getNAStrings());
    JSONObject jspec = new JSONObject(spec);
    TfUtils _tfmapper = new TfUtils(headerLine, prop.hasHeader(), prop.getDelim(), naStrings, jspec, numCols, tfMtdPath, null, tmpPath);
    _tfmapper.loadTfMetadata();
    Broadcast<TfUtils> bcast_tf = sec.getSparkContext().broadcast(_tfmapper);
    /*
     * Construct transformation metadata (map-side) -- the logic is similar
     * to GTFMTDMapper
     *
     * Note: The result of mapPartitionsWithIndex is cached so that the
     * transformed data is not redundantly computed multiple times
     */
    JavaPairRDD<Long, String> applyRDD = inputRDD
        .mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true)
        .mapToPair(new PairFunction<String, Long, String>() {
            private static final long serialVersionUID = 3868143093999082931L;

            @Override
            public Tuple2<Long, String> call(String t) throws Exception {
                return new Tuple2<Long, String>(new Long(1), t);
            }
        }).cache();
    /*
     * An action to force execution of apply()
     *
     * We need to trigger the execution of this RDD so as to ensure the
     * creation of a few metadata files (headers, dummycoded information,
     * etc.), which are referenced in the caller function.
     */
    applyRDD.count();
    return applyRDD;
}
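The anonymous PairFunction here only tags every transformed row with the constant key 1 before caching. On a Spark version that accepts Java 8 lambdas, the same keying step could be sketched as below; this is an illustrative rewrite, not the project's code.

// Sketch: key each transformed CSV row with the constant 1L, then cache.
JavaPairRDD<Long, String> applyRDD = inputRDD
    .mapPartitionsWithIndex(new ApplyTfCSVMap(bcast_tf), true)
    .mapToPair(t -> new Tuple2<>(1L, t))
    .cache();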