Example 1 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project hbase by apache: the main method of the class JavaHBaseBulkPutExample.

public static void main(String[] args) {
    if (args.length < 2) {
        System.out.println("JavaHBaseBulkPutExample  " + "{tableName} {columnFamily}");
        return;
    }
    String tableName = args[0];
    String columnFamily = args[1];
    SparkConf sparkConf = new SparkConf().setAppName("JavaHBaseBulkPutExample " + tableName);
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    try {
        // Five sample records of the form "rowKey,columnFamily,qualifier,value".
        List<String> list = new ArrayList<>(5);
        list.add("1," + columnFamily + ",a,1");
        list.add("2," + columnFamily + ",a,2");
        list.add("3," + columnFamily + ",a,3");
        list.add("4," + columnFamily + ",a,4");
        list.add("5," + columnFamily + ",a,5");
        JavaRDD<String> rdd = jsc.parallelize(list);
        Configuration conf = HBaseConfiguration.create();
        JavaHBaseContext hbaseContext = new JavaHBaseContext(jsc, conf);
        // PutFunction turns each record string into an HBase Put (see the sketch after this example).
        hbaseContext.bulkPut(rdd, TableName.valueOf(tableName), new PutFunction());
    } finally {
        jsc.stop();
    }
}
Also used: HBaseConfiguration (org.apache.hadoop.hbase.HBaseConfiguration), Configuration (org.apache.hadoop.conf.Configuration), ArrayList (java.util.ArrayList), JavaHBaseContext (org.apache.hadoop.hbase.spark.JavaHBaseContext), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), SparkConf (org.apache.spark.SparkConf)
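
The PutFunction passed to bulkPut is defined elsewhere in the example class and is not shown above. A minimal sketch of what it plausibly looks like, assuming the "rowKey,columnFamily,qualifier,value" record format built in main; Put, Bytes, and Function are the standard HBase and Spark classes, but the parsing logic here is an assumption:

import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.spark.api.java.function.Function;

// Hypothetical sketch of the PutFunction referenced above.
public static class PutFunction implements Function<String, Put> {

    @Override
    public Put call(String v) throws Exception {
        // Split "rowKey,columnFamily,qualifier,value" into its four cells.
        String[] cells = v.split(",");
        Put put = new Put(Bytes.toBytes(cells[0]));
        // addColumn(family, qualifier, value) writes one cell for this row.
        put.addColumn(Bytes.toBytes(cells[1]), Bytes.toBytes(cells[2]), Bytes.toBytes(cells[3]));
        return put;
    }
}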

Example 2 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks: the main method of the class IntersectByKey.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "IntersectByKey", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    List<Tuple2<String, Integer>> input1 = new ArrayList<>();
    input1.add(new Tuple2<>("coffee", 1));
    input1.add(new Tuple2<>("coffee", 2));
    input1.add(new Tuple2<>("pandas", 3));
    List<Tuple2<String, Integer>> input2 = new ArrayList<>();
    input2.add(new Tuple2<>("pandas", 20));
    JavaPairRDD<String, Integer> rdd1 = sc.parallelizePairs(input1);
    JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(input2);
    // intersectByKey is a static helper in this class (see the sketch after this example).
    JavaPairRDD<String, Integer> result = intersectByKey(rdd1, rdd2);
    for (Tuple2<String, Integer> entry : result.collect()) {
        System.out.println(entry._1() + ":" + entry._2());
    }
    System.out.println("Done");
    sc.stop();
}
Also used: Tuple2 (scala.Tuple2), ArrayList (java.util.ArrayList), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
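
The intersectByKey helper is a static method of the same class and is omitted from this snippet. One way to write it, sketched with cogroup against the Spark 1.x Java API these examples use; the actual implementation in the repository may differ:

import java.util.ArrayList;
import java.util.List;
import org.apache.spark.api.java.function.Function;

public static JavaPairRDD<String, Integer> intersectByKey(
        JavaPairRDD<String, Integer> rdd1, JavaPairRDD<String, Integer> rdd2) {
    // Group both RDDs by key: each key maps to its values from rdd1 and from rdd2.
    JavaPairRDD<String, Tuple2<Iterable<Integer>, Iterable<Integer>>> grouped = rdd1.cogroup(rdd2);
    return grouped.flatMapValues(
        new Function<Tuple2<Iterable<Integer>, Iterable<Integer>>, Iterable<Integer>>() {
            @Override
            public Iterable<Integer> call(Tuple2<Iterable<Integer>, Iterable<Integer>> input) {
                List<Integer> result = new ArrayList<>();
                // Keep rdd1's values only when the key also has at least one value in rdd2.
                if (input._2().iterator().hasNext()) {
                    for (Integer value : input._1()) {
                        result.add(value);
                    }
                }
                return result;
            }
        });
}

With this sketch, the sample inputs above would print pandas:3 followed by Done.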

Example 3 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks: the main method of the class KeyValueMapFilter.

public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile");
    }
    String master = args[0];
    String inputFile = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaRDD<String> input = sc.textFile(inputFile);
    PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {

        @Override
        public Tuple2<String, String> call(String x) {
            return new Tuple2<>(x.split(" ")[0], x);
        }
    };
    // Despite its name, this filter keeps pairs whose line is shorter than 20 characters.
    Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() {

        @Override
        public Boolean call(Tuple2<String, String> input) {
            return (input._2().length() < 20);
        }
    };
    JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
    JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
    Map<String, String> resultMap = result.collectAsMap();
    for (Entry<String, String> entry : resultMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue());
    }
    sc.stop();
}
Also used: Function (org.apache.spark.api.java.function.Function), PairFunction (org.apache.spark.api.java.function.PairFunction), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
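
On Java 8 and later, the two anonymous classes above collapse into lambdas. The same pipeline sketched with lambda syntax, behavior unchanged:

// Equivalent of keyData and longWordFilter written as Java 8 lambdas.
JavaPairRDD<String, String> result = sc.textFile(inputFile)
    // Key each line by its first whitespace-delimited token.
    .mapToPair(x -> new Tuple2<>(x.split(" ")[0], x))
    // Keep only pairs whose full line is shorter than 20 characters.
    .filter(pair -> pair._2().length() < 20);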

Example 4 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks: the main method of the class MLlib.

public static void main(String[] args) {
    SparkConf sparkConf = new SparkConf().setAppName("JavaBookExample");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Load 2 types of emails from text files: spam and ham (non-spam).
    // Each line has text from one email.
    JavaRDD<String> spam = sc.textFile("files/spam.txt");
    JavaRDD<String> ham = sc.textFile("files/ham.txt");
    // Create a HashingTF instance to map email text to vectors of 100 features.
    final HashingTF tf = new HashingTF(100);
    // Each email is split into words, and each word is mapped to one feature.
    // Create LabeledPoint datasets for positive (spam) and negative (ham) examples.
    JavaRDD<LabeledPoint> positiveExamples = spam.map(new Function<String, LabeledPoint>() {

        @Override
        public LabeledPoint call(String email) {
            return new LabeledPoint(1, tf.transform(Arrays.asList(email.split(" "))));
        }
    });
    JavaRDD<LabeledPoint> negativeExamples = ham.map(new Function<String, LabeledPoint>() {

        @Override
        public LabeledPoint call(String email) {
            return new LabeledPoint(0, tf.transform(Arrays.asList(email.split(" "))));
        }
    });
    JavaRDD<LabeledPoint> trainingData = positiveExamples.union(negativeExamples);
    // Cache data since Logistic Regression is an iterative algorithm.
    trainingData.cache();
    // Create a Logistic Regression learner which uses the SGD optimizer.
    LogisticRegressionWithSGD lrLearner = new LogisticRegressionWithSGD();
    // Run the actual learning algorithm on the training data.
    LogisticRegressionModel model = lrLearner.run(trainingData.rdd());
    // Test on a positive example (spam) and a negative one (ham).
    // First apply the same HashingTF feature transformation used on the training data.
    Vector posTestExample = tf.transform(Arrays.asList("O M G GET cheap stuff by sending money to ...".split(" ")));
    Vector negTestExample = tf.transform(Arrays.asList("Hi Dad, I started studying Spark the other ...".split(" ")));
    // Now use the learned model to predict spam/ham for new emails.
    System.out.println("Prediction for positive test example: " + model.predict(posTestExample));
    System.out.println("Prediction for negative test example: " + model.predict(negTestExample));
    sc.stop();
}
Also used: HashingTF (org.apache.spark.mllib.feature.HashingTF), LogisticRegressionWithSGD (org.apache.spark.mllib.classification.LogisticRegressionWithSGD), LogisticRegressionModel (org.apache.spark.mllib.classification.LogisticRegressionModel), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), LabeledPoint (org.apache.spark.mllib.regression.LabeledPoint), SparkConf (org.apache.spark.SparkConf), Vector (org.apache.spark.mllib.linalg.Vector)
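
MLlib also provides an LBFGS-based trainer, LogisticRegressionWithLBFGS, which is a drop-in replacement for the SGD learner above; swapping it in is a suggestion here, not part of the published example:

import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS;

// LBFGS typically converges in fewer iterations than plain SGD on a problem like this.
LogisticRegressionModel model = new LogisticRegressionWithLBFGS().run(trainingData.rdd());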

Example 5 with JavaSparkContext

Use of org.apache.spark.api.java.JavaSparkContext in project learning-spark by databricks: the main method of the class PerKeyAvg.

public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "PerKeyAvg", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    List<Tuple2<String, Integer>> input = new ArrayList<>();
    input.add(new Tuple2<>("coffee", 1));
    input.add(new Tuple2<>("coffee", 2));
    input.add(new Tuple2<>("pandas", 3));
    JavaPairRDD<String, Integer> rdd = sc.parallelizePairs(input);
    Function<Integer, AvgCount> createAcc = new Function<Integer, AvgCount>() {

        @Override
        public AvgCount call(Integer x) {
            return new AvgCount(x, 1);
        }
    };
    Function2<AvgCount, Integer, AvgCount> addAndCount = new Function2<AvgCount, Integer, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, Integer x) {
            a.total_ += x;
            a.num_ += 1;
            return a;
        }
    };
    Function2<AvgCount, AvgCount, AvgCount> combine = new Function2<AvgCount, AvgCount, AvgCount>() {

        @Override
        public AvgCount call(AvgCount a, AvgCount b) {
            a.total_ += b.total_;
            a.num_ += b.num_;
            return a;
        }
    };
    // combineByKey builds one AvgCount per key: createAcc creates the accumulator,
    // addAndCount folds in a value, and combine merges partial accumulators across partitions.
    JavaPairRDD<String, AvgCount> avgCounts = rdd.combineByKey(createAcc, addAndCount, combine);
    Map<String, AvgCount> countMap = avgCounts.collectAsMap();
    for (Entry<String, AvgCount> entry : countMap.entrySet()) {
        System.out.println(entry.getKey() + ":" + entry.getValue().avg());
    }
}
Also used: ArrayList (java.util.ArrayList), Function2 (org.apache.spark.api.java.function.Function2), Function (org.apache.spark.api.java.function.Function), Tuple2 (scala.Tuple2), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)
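
AvgCount is a small helper class defined alongside main and omitted here. Its shape follows from the total_ and num_ fields and the avg() call used above; a plausible sketch, with Serializable assumed so Spark can ship instances to executors:

public static class AvgCount implements java.io.Serializable {
    public int total_;
    public int num_;

    public AvgCount(int total, int num) {
        total_ = total;
        num_ = num;
    }

    // Running average of all values folded into this accumulator.
    public float avg() {
        return total_ / (float) num_;
    }
}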

Aggregations

Classes most often used together with JavaSparkContext across the indexed examples, with usage counts:

JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 251
Test (org.testng.annotations.Test): 65
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 64
Tuple2 (scala.Tuple2): 48
SparkConf (org.apache.spark.SparkConf): 46
Test (org.junit.Test): 43
ArrayList (java.util.ArrayList): 41
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 32
List (java.util.List): 26
Configuration (org.apache.hadoop.conf.Configuration): 23
JavaRDD (org.apache.spark.api.java.JavaRDD): 23
File (java.io.File): 22
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 20
Collectors (java.util.stream.Collectors): 16
TextPipeline (org.deeplearning4j.spark.text.functions.TextPipeline): 15
DataSet (org.nd4j.linalg.dataset.DataSet): 15
IOException (java.io.IOException): 13
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 12
RealMatrix (org.apache.commons.math3.linear.RealMatrix): 12
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 11