Example 6 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The main method of the class BasicLoadSequenceFile.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage BasicLoadSequenceFile [sparkMaster] [input]");
    }
    String master = args[0];
    String fileName = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
    JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
    List<Tuple2<String, Integer>> resultList = result.collect();
    for (Tuple2<String, Integer> record : resultList) {
        System.out.println(record);
    }
}
Also used : Tuple2(scala.Tuple2) Text(org.apache.hadoop.io.Text) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) IntWritable(org.apache.hadoop.io.IntWritable)
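
The ConvertToNativeTypes helper is referenced above but not shown. A minimal sketch consistent with the RDD types in this example (the class name comes from the call site; the body is an assumption):

// Hypothetical sketch of ConvertToNativeTypes: converts Hadoop Writable pairs
// to native Java types. The signature is implied by the mapToPair call above;
// requires org.apache.spark.api.java.function.PairFunction.
public static class ConvertToNativeTypes implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
    public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
        return new Tuple2<String, Integer>(record._1().toString(), record._2().get());
    }
}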

Example 7 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The main method of the class BasicLoadWholeCsv.

public static void main(String[] args) throws Exception {
    if (args.length != 4) {
        throw new Exception("Usage BasicLoadWholeCsv sparkMaster csvInputFile csvOutputFile key");
    }
    String master = args[0];
    String csvInput = args[1];
    String outputFile = args[2];
    final String key = args[3];
    JavaSparkContext sc = new JavaSparkContext(master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaPairRDD<String, String> csvData = sc.wholeTextFiles(csvInput);
    JavaRDD<String[]> keyedRDD = csvData.flatMap(new ParseLine());
    JavaRDD<String[]> result = keyedRDD.filter(new Function<String[], Boolean>() {

        public Boolean call(String[] input) {
            return input[0].equals(key);
        }
    });
    result.saveAsTextFile(outputFile);
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
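
The ParseLine helper is referenced above but not shown. A plausible sketch, assuming opencsv's CSVReader is on the classpath (wholeTextFiles yields (filename, file contents) pairs, so each call parses one whole file into rows):

// Hypothetical sketch of ParseLine. Assumes au.com.bytecode.opencsv.CSVReader
// and java.io.StringReader; requires org.apache.spark.api.java.function.FlatMapFunction.
public static class ParseLine implements FlatMapFunction<Tuple2<String, String>, String[]> {
    public Iterable<String[]> call(Tuple2<String, String> file) throws Exception {
        // file._2() is the entire contents of one CSV file
        CSVReader reader = new CSVReader(new StringReader(file._2()));
        return reader.readAll();
    }
}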

Example 8 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The main method of the class BasicMap.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "basicmap", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaRDD<Integer> result = rdd.map(new Function<Integer, Integer>() {

        public Integer call(Integer x) {
            return x * x;
        }
    });
    System.out.println(StringUtils.join(result.collect(), ","));
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
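
With Java 8, the anonymous Function class can be replaced by a lambda, since Function has a single abstract method. A sketch of the equivalent map call:

// Equivalent squaring transformation as a lambda (sketch)
JavaRDD<Integer> result = rdd.map(x -> x * x);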

Example 9 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The main method of the class BasicMapPartitions.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "basicmappartitions", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> rdd = sc.parallelize(Arrays.asList("KK6JKQ", "Ve3UoW", "kk6jlk", "W6BB"));
    JavaRDD<String> result = rdd.mapPartitions(new FlatMapFunction<Iterator<String>, String>() {

        public Iterable<String> call(Iterator<String> input) {
            ArrayList<String> content = new ArrayList<String>();
            ArrayList<ContentExchange> cea = new ArrayList<ContentExchange>();
            HttpClient client = new HttpClient();
            try {
                client.start();
                while (input.hasNext()) {
                    ContentExchange exchange = new ContentExchange(true);
                    exchange.setURL("http://qrzcq.com/call/" + input.next());
                    client.send(exchange);
                    cea.add(exchange);
                }
                for (ContentExchange exchange : cea) {
                    exchange.waitForDone();
                    content.add(exchange.getResponseContent());
                }
            } catch (Exception e) {
                // swallow network errors; return whatever responses were collected so far
            }
            return content;
        }
    });
    System.out.println(StringUtils.join(result.collect(), ","));
}
Also used : ArrayList(java.util.ArrayList) HttpClient(org.eclipse.jetty.client.HttpClient) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) ContentExchange(org.eclipse.jetty.client.ContentExchange)
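
The reason to use mapPartitions here is that the expensive HttpClient is created and started once per partition instead of once per element. The same per-partition setup pattern as a Java 8 lambda (a sketch; fetchCallSign is a hypothetical helper standing in for the Jetty ContentExchange calls above):

// Sketch of per-partition setup with a lambda; fetchCallSign is hypothetical.
JavaRDD<String> result = rdd.mapPartitions(input -> {
    List<String> content = new ArrayList<String>();
    HttpClient client = new HttpClient(); // one client per partition, not per element
    client.start();
    while (input.hasNext()) {
        content.add(fetchCallSign(client, input.next()));
    }
    client.stop();
    return content; // under Spark 2.x this would be content.iterator()
});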

Example 10 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks.

The main method of the class BasicMapToDouble.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "basicmaptodouble", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4));
    JavaDoubleRDD result = rdd.mapToDouble(new DoubleFunction<Integer>() {

        public double call(Integer x) {
            double y = (double) x;
            return y * y;
        }
    });
    System.out.println(StringUtils.join(result.collect(), ","));
}
Also used : JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) JavaDoubleRDD(org.apache.spark.api.java.JavaDoubleRDD)
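
Returning a JavaDoubleRDD (rather than a JavaRDD<Double>) is what makes Spark's numeric helpers available on the result, for example:

// JavaDoubleRDD adds numeric aggregations such as mean(), sum(), and stats()
System.out.println("mean: " + result.mean());
System.out.println("stats: " + result.stats());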

Aggregations

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 251
Test (org.testng.annotations.Test): 65
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 64
Tuple2 (scala.Tuple2): 48
SparkConf (org.apache.spark.SparkConf): 46
Test (org.junit.Test): 43
ArrayList (java.util.ArrayList): 41
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 32
List (java.util.List): 26
Configuration (org.apache.hadoop.conf.Configuration): 23
JavaRDD (org.apache.spark.api.java.JavaRDD): 23
File (java.io.File): 22
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 20
Collectors (java.util.stream.Collectors): 16
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline): 15
DataSet (org.nd4j.linalg.dataset.DataSet): 15
IOException (java.io.IOException): 13
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 12
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 12
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 11