Example 16 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in the learning-spark project by Databricks: class PerKeyAvg, method main.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("coffee", 1));
    input.add(new Tuple2<>("coffee", 2));
    input.add(new Tuple2<>("pandas", 3));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    Function<Integer, AvgCount> createAcc = new Function<Integer, AvgCount>() {

        @Override
        public AvgCount call(Integer x) {
            return new AvgCount(x, 1);
        }
    };
    Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, Integer x) {
            a.total_ += x;
            a.num_ += 1;
            return a;
        }
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    // combineByKey needs no separate initial value: createAcc builds the accumulator from
    // the first value seen for each key, addAndCount folds in further values within a
    // partition, and combine merges accumulators across partitions.
    JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(createAcc, addAndCount, combine);
    Map<String, AvgCount> countMap = avgCounts.collectAsMap();
    for (Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
Also used : ArrayList(java.util.ArrayList) Function2(org.apache.spark.api.java.function.Function2) Function(org.apache.spark.api.java.function.Function) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
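The AvgCount accumulator used above is defined elsewhere in PerKeyAvg and not shown in this snippet. A minimal sketch, assuming only what the example itself relies on (public total_ and num_ fields, a two-argument constructor, and an avg() method), might look like this:

public static class AvgCount implements java.io.Serializable {
    public int total_;
    public int num_;

    public AvgCount(int total, int num) {
        total_ = total;
        num_ = num;
    }

    public float avg() {
        // num_ is never zero here: createAcc always seeds the accumulator with one element.
        return total_ / (float) num_;
    }
}

The class must be Serializable because Spark ships the per-key accumulators between partitions when combine merges them.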

Example 17 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in the learning-spark project by Databricks: class BasicLoadSequenceFile, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
    }
    String master = args[0];
    String fileName = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
    JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
    List<Tuple2<String, Integer>> resultList = result.collect();
    for (Tuple2<String, Integer> record : resultList) {
        System.out.println(record);
    }
}
Also used : Tuple2(scala.Tuple2) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) IntWritable(org.apache.hadoop.io.IntWritable)
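ConvertToNativeTypes is defined elsewhere in the example; from its use in mapToPair it must turn each (Text, IntWritable) pair into a (String, Integer) pair. A hedged sketch of such a PairFunction (org.apache.spark.api.java.function.PairFunction), consistent with the call above:

public static class ConvertToNativeTypes implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
    public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
        // Convert Hadoop writables into plain Java types so downstream code does not
        // have to deal with Writable object reuse or Hadoop serialization.
        return new Tuple2<>(record._1().toString(), record._2().get());
    }
}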

Example 18 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in the learning-spark project by Databricks: class BasicLoadWholeCsv, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 4) {
        throw new Exception("Usage BasicLoadWholeCsv sparkMaster csvInputFile csvOutputFile key");
    }
    String master = args[0];
    String csvInput = args[1];
    String outputFile = args[2];
    final String key = args[3];
    JavaSparkContext sc = new JavaSparkContext(master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaPairRDD<String, String> csvData = sc.wholeTextFiles(csvInput);
    JavaRDD<String[]> keyedRDD = csvData.flatMap(new ParseLine());
    JavaRDD<String[]> result = keyedRDD.filter(new Function<String[], Boolean>() {

        public Boolean call(String[] input) {
            return input[0].equals(key);
        }
    });
    result.saveAsTextFile(outputFile);
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
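ParseLine is defined elsewhere in BasicLoadWholeCsv and not shown here. Judging from the types above, it takes a (fileName, fileContents) pair produced by wholeTextFiles and emits one String[] per CSV row. A minimal sketch using opencsv's CSVReader (the choice of parser is an assumption), written against the same Spark 1.x FlatMapFunction API the other examples use:

public static class ParseLine implements FlatMapFunction<Tuple2<String, String>, String[]> {
    public Iterable<String[]> call(Tuple2<String, String> file) throws Exception {
        // file._1() is the file name, file._2() is the entire file contents.
        CSVReader reader = new CSVReader(new StringReader(file._2()));
        // readAll() returns a List<String[]>, one array of fields per CSV record.
        return reader.readAll();
    }
}

Note that result.saveAsTextFile(outputFile) writes each String[] using its default toString representation (an array identity string), so in practice the filtered rows would typically be joined back into CSV text before saving.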

Example 19 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in the learning-spark project by Databricks: class BasicMap, method main.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> result = rdd.map(new Function<Integer, Integer>() {

        public Integer call(Integer x) {
            return x * x;
        }
    });
    System.out.println(StringUtils.join(result.collect(), ","));
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
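With Java 8, the anonymous Function above can be written as a lambda, since org.apache.spark.api.java.function.Function declares a single abstract method. A sketch of the same squaring map:

JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
// Same transformation as above, expressed as a lambda.
JavaRDD<Integer> result = rdd.map(x -> x * x);
// Prints 1,4,9,16
System.out.println(StringUtils.join(result.collect(), ","));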

Example 20 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in the learning-spark project by Databricks: class BasicMapPartitions, method main.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "basicmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.parallelize(Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"));
    JavaRDD<String> result = rdd.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {

        public Iterable<String> call(Iterator<String> input) {
            ArrayList<String> content = new ArrayList<String>();
            ArrayList<ContentExchange> cea = new ArrayList<ContentExchange>();
            HttpClient client = new HttpClient();
            try {
                client.start();
                while (input.hasNext()) {
                    ContentExchange exchange = new ContentExchange(true);
                    exchange.setURL("http://qrzcq.com/call/" + input.next());
                    client.send(exchange);
                    cea.add(exchange);
                }
                for (ContentExchange exchange : cea) {
                    exchange.waitForDone();
                    content.add(exchange.getResponseContent());
                }
            } catch (Exception e) {
                // Ignore failed lookups; call signs that could not be fetched are simply skipped.
            }
            return content;
        }
    });
    System.out.println(StringUtils.join(result.collect(), ","));
}
Also used : ArrayList(java.util.ArrayList) HttpClient(org.eclipse.jetty.client.HttpClient) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ContentExchange(org.eclipse.jetty.client.ContentExchange)
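The point of mapPartitions here is that the Jetty HttpClient is started once per partition and shared by every call-sign lookup in that partition, and all requests are sent before any response is awaited. For contrast, a per-element version (a sketch, not part of the original example) would pay the client setup cost for every record:

// For contrast only: a plain map creates and starts one HttpClient per record.
JavaRDD<String> perElement = rdd.map(new Function<String, String>() {
    public String call(String sign) throws Exception {
        HttpClient client = new HttpClient();
        client.start();
        ContentExchange exchange = new ContentExchange(true);
        exchange.setURL("http://qrzcq.com/call/" + sign);
        client.send(exchange);
        exchange.waitForDone();
        String content = exchange.getResponseContent();
        client.stop();
        return content;
    }
});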
