
Example 11 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The class BasicSaveSequenceFile, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage BasicSaveSequenceFile [sparkMaster] [output]");
    }
    String master = args[0];
    String fileName = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("coffee", 1));
    input.add(new Tuple2<>("coffee", 2));
    input.add(new Tuple2<>("pandas", 3));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    JavaPairRDD<Text, IntWritable> result = rdd.mapToPair(new ConvertToWritableTypes());
    result.saveAsHadoopFile(fileName, Text.class, IntWritable.class, SequenceFileOutputFormat.class);
}
Also used : Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) IntWritable(org.apache.hadoop.io.IntWritable)
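
The snippet references a ConvertToWritableTypes helper that is defined elsewhere in the learning-spark project. A minimal sketch consistent with the types used above (a hypothetical reconstruction; the repository's exact implementation may differ):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical reconstruction: wraps a (String, Integer) pair in the Hadoop
// Writable types that SequenceFileOutputFormat expects. Intended as a static
// nested class inside the example class.
public static class ConvertToWritableTypes
        implements PairFunction<Tuple2<String, Integer>, Text, IntWritable> {
    public Tuple2<Text, IntWritable> call(Tuple2<String, Integer> record) {
        return new Tuple2<>(new Text(record._1), new IntWritable(record._2));
    }
}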

Example 12 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The class BasicMap, method main.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> result = rdd.map(new Function<Integer, Integer>() {

        public Integer call(Integer x) {
            return x * x;
        }
    });
    System.out.println(StringUtils.join(result.collect(), ","));
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
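
The four-argument JavaSparkContext constructor used above is a legacy form. A minimal sketch of the same program built from a SparkConf and a Java 8 lambda (the class name BasicMapLambda is hypothetical; assumes a Spark version with Java 8 support):

import java.util.Arrays;
import org.apache.commons.lang3.StringUtils;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class BasicMapLambda {
    public static void main(String[] args) {
        String master = args.length > 0 ? args[0] : "local";
        // Build the context from a SparkConf instead of the legacy constructor.
        SparkConf conf = new SparkConf().setMaster(master).setAppName("basicmap");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
        // Square each element with a lambda instead of an anonymous Function.
        JavaRDD<Integer> result = rdd.map(x -> x * x);
        System.out.println(StringUtils.join(result.collect(), ","));
        sc.stop();
    }
}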

Example 13 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The class BasicAvgMapPartitions, method run.

public void run(String master) {
    JavaSparkContext sc = new JavaSparkContext(master, "basicavgmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    FlatMapFunction<Iterator<Integer>, AvgCount> setup = new FlatMapFunction<Iterator<Integer>, AvgCount>() {

        @Override
        public Iterable<AvgCount> call(Iterator<Integer> input) {
            AvgCount a = new AvgCount(0, 0);
            while (input.hasNext()) {
                a.total_ += input.next();
                a.num_ += 1;
            }
            ArrayList<AvgCount> ret = new ArrayList<AvgCount>();
            ret.add(a);
            return ret;
        }
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    AvgCount result = rdd.mapPartitions(setup).reduce(combine);
    System.out.println(result.avg());
}
Also used : FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Function2(org.apache.spark.api.java.function.Function2)
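
The AvgCount accumulator is defined elsewhere in the project; only its total_ and num_ fields, two-argument constructor, and avg() method appear above. A minimal sketch matching that usage (an assumption, not the repository's exact class):

import java.io.Serializable;

// Hypothetical reconstruction of AvgCount. It must be Serializable because
// Spark ships instances between the driver and executors.
public static class AvgCount implements Serializable {
    public int total_;
    public int num_;

    public AvgCount(int total, int num) {
        total_ = total;
        num_ = num;
    }

    public double avg() {
        return total_ / (double) num_;
    }
}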

Example 14 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The class BasicFlatMap, method main.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage BasicFlatMap sparkMaster inputFile");
    }
    JavaSparkContext sc = new JavaSparkContext(args[0], "basicflatmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.textFile(args[1]);
    JavaRDD<String> words = rdd.flatMap(new FlatMapFunction<String, String>() {

        public Iterable<String> call(String x) {
            return Arrays.asList(x.split(" "));
        }
    });
    Map<String, Long> result = words.countByValue();
    for (Entry<String, Long> entry : result.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue());
    }
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
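
This snippet targets Spark 1.x, where FlatMapFunction.call returns an Iterable. In Spark 2.x the contract changed so that call returns an Iterator, and the same flatMap would be written as follows (a sketch, assuming Spark 2.x):

// Spark 2.x: call() returns an Iterator rather than an Iterable.
JavaRDD<String> words = rdd.flatMap(new FlatMapFunction<String, String>() {
    public Iterator<String> call(String x) {
        return Arrays.asList(x.split(" ")).iterator();
    }
});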

Example 15 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The class BasicJoinCsv, method run.

public void run(String master, String csv1, String csv2) throws Exception {
    JavaSparkContext sc = new JavaSparkContext(master, "basicjoincsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> csvFile1 = sc.textFile(csv1);
    JavaRDD<String> csvFile2 = sc.textFile(csv2);
    JavaPairRDD<Integer, String[]> keyedRDD1 = csvFile1.mapToPair(new ParseLine());
    JavaPairRDD<Integer, String[]> keyedRDD2 = csvFile2.mapToPair(new ParseLine());
    JavaPairRDD<Integer, Tuple2<String[], String[]>> result = keyedRDD1.join(keyedRDD2);
    // Materialize the joined pairs on the driver so they can be inspected.
    List<Tuple2<Integer, Tuple2<String[], String[]>>> resultCollection = result.collect();
}
Also used : Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
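
ParseLine is defined elsewhere in the project; the join only requires that it produce (Integer, String[]) pairs. A minimal sketch that keys each CSV line by its first field parsed as an integer (an assumption; the repository's version may parse CSV differently):

import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical reconstruction: splits a CSV line on commas and keys the
// resulting field array by the first field, parsed as an Integer.
public static class ParseLine implements PairFunction<String, Integer, String[]> {
    public Tuple2<Integer, String[]> call(String line) {
        String[] fields = line.split(",");
        return new Tuple2<>(Integer.parseInt(fields[0]), fields);
    }
}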

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 251
Test (org.testng.annotations.Test): 65
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 64
Tuple2 (scala.Tuple2): 48
SparkConf (org.apache.spark.SparkConf): 46
Test (org.junit.Test): 43
ArrayList (java.util.ArrayList): 41
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 32
List (java.util.List): 26
Configuration (org.apache.hadoop.conf.Configuration): 23
JavaRDD (org.apache.spark.api.java.JavaRDD): 23
File (java.io.File): 22
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 20
Collectors (java.util.stream.Collectors): 16
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline): 15
DataSet (org.nd4j.linalg.dataset.DataSet): 15
IOException (java.io.IOException): 13
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 12
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 12
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 11