
Example 96 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project BD2017 by achintya-kumar.

the class WordCount method main.

public static void main(String[] args) {
    // create Spark context with Spark configuration
    SparkConf sparkConf = new SparkConf().setAppName("Spark Count").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // read in text file and split each document into words
    JavaRDD<String> lines = sc.textFile("hdfs://localhost:8020/user/cloudera/source/MarinerMissionToMars.txt");
    JavaPairRDD<String, Integer> counts = lines
            .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<String, Integer>(word, 1))
            .reduceByKey((x, y) -> x + y);
    counts.saveAsTextFile("hdfs://localhost:8020/user/cloudera/results/MarinerMissionToMarsResults");
    sc.close();
}
Also used : Arrays(java.util.Arrays) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) JavaRDD(org.apache.spark.api.java.JavaRDD)
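
The counts RDD above goes straight to HDFS. For quick inspection, a small follow-up could be added before sc.close(). This is only a sketch, not part of the original project; it swaps key and value and sorts on the count, and it assumes an additional java.util.List import.

    // Sketch (not from the original project): list the ten most frequent words
    // by swapping key and value, then sorting on the count in descending order.
    List<Tuple2<Integer, String>> top10 = counts
            .mapToPair(t -> new Tuple2<>(t._2(), t._1()))
            .sortByKey(false)
            .take(10);
    top10.forEach(t -> System.out.println(t._2() + ": " + t._1()));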

Example 97 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project mm-dev by sbl-sdsc.

the class ReducedEncoderNewTest method test.

@Test
public void test() {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // List<String> pdbIds = Arrays.asList("1STP","4HHB","2ONX","1JLP","5X6H","5L2G","2MK1");
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "2ONX", "2CCV");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    // pdb.foreach(t -> System.out.println(t._1 + "o :" + t._2.getNumBonds()));
    // List<String> chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("full: " + chainIds);
    // List<String> chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("full: " + chainNames);
    // List<String> numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("full: " + numGroups);
    // List<String> altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("full: " + altlocs);
    pdb = pdb.mapValues(v -> ReducedEncoder.getReduced(v)).cache();
    // chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("reduced: " + chainIds);
    // chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("reduced: " + chainNames);
    // altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("reduced: " + altlocs);
    // 1STP # groups 121 CA + 1 BTN = 122
    // 4HHB # groups 141x2 + 146x2 CA +  4 HEM + 2P (from PO4) = 580
    // 2ONX # groups 4 CA = 4
    // 2CCV # groups 99 CA + 4 altloc CA + 1 A2G (sugar) + 1 NAG (orig 15) + 1 GOL + 1 ZN, 1 ACE = 108
    // TODO (4 altlocs missing?)
    // numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("reduced: " + numGroups);
    List<String> atoms = pdb.map(t -> t._1 + "_atoms_" + t._2.getNumAtoms()).collect();
    // System.out.println(atoms);
    // 1STP # atoms 121 CA + 16 BTN
    // 4HHB # atoms 141x2 + 146x2 CA +  43x4 HEM + 2P (from PO4) = 748
    // 2ONX # atoms 4 CA
    // 2CCV # atoms 99 CA + 4 (5?) altloc CA + 15 A2G (sugar) + 14 NAG (orig 15) + 6 GOL + 1 ZN, ACE 4 = 143
    assertTrue(atoms.contains("1STP_atoms_137"));
    assertTrue(atoms.contains("4HHB_atoms_748"));
    assertTrue(atoms.contains("2ONX_atoms_4"));
    assertTrue(atoms.contains("2CCV_atoms_143"));
    List<String> bonds = pdb.map(t -> t._1 + "_bonds_" + t._2.getNumBonds()).collect();
    // 1STP # bond 17 BTN
    // 4HHB # bonds 50 x 4 HEM = 200
    // 2ONX # bonds 0
    // 2CCV # bonds 15 A2G + 14 NAG (-O) + 5 GOL + 3 ACE + 2 disulfide bridges + 1 covalent bond to NAG = 40
    assertTrue(bonds.contains("1STP_bonds_17"));
    assertTrue(bonds.contains("4HHB_bonds_200"));
    assertTrue(bonds.contains("2ONX_bonds_0"));
    assertTrue(bonds.contains("2CCV_bonds_40"));
    sc.close();
}
Also used : Arrays(java.util.Arrays) List(java.util.List) ReducedEncoder(org.rcsb.mmtf.encoder.ReducedEncoder) StructureDataInterface(org.rcsb.mmtf.api.StructureDataInterface) FilterByRFree(edu.sdsc.mmtf.spark.filters.demos.FilterByRFree) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) MmtfReader(edu.sdsc.mmtf.spark.io.MmtfReader)
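
Because the assertions above depend on downloading MMTF files at test time, the same pattern (cache a pair RDD, transform values with mapValues, collect, assert) can be exercised with local data. The fragment below is a hypothetical stand-in that does not use the mmtf-spark API; it only mirrors the test structure.

    // Hypothetical local-data version of the same test pattern:
    // build a pair RDD, transform the values with mapValues, collect, assert.
    List<Tuple2<String, Integer>> input = Arrays.asList(
            new Tuple2<>("1STP", 137),
            new Tuple2<>("2ONX", 4));
    JavaPairRDD<String, Integer> pdbLike = sc.parallelizePairs(input).cache();
    List<String> labels = pdbLike
            .mapValues(v -> v * 2)                 // stand-in for ReducedEncoder.getReduced
            .map(t -> t._1 + "_atoms_" + t._2)
            .collect();
    assertTrue(labels.contains("1STP_atoms_274"));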

Example 98 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project net.jgp.labs.spark by jgperrin.

the class PageRankApp method start.

public void start(String file, int numberOfIterations) {
    showWarning();
    SparkSession spark = SparkSession.builder().appName("JavaPageRank").getOrCreate();
    // Loads in input file. It should be in format of:
    // URL neighbor URL
    // URL neighbor URL
    // URL neighbor URL
    // ...
    JavaRDD<String> lines = spark.read().textFile(file).javaRDD();
    // Loads all URLs from the input file and initializes their neighbors.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    }).distinct().groupByKey().cache();
    // Loads all URLs that other URLs link to from the input file and
    // initializes their ranks to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);
    // Calculates and updates URL ranks iteratively using the PageRank algorithm.
    for (int current = 0; current < numberOfIterations; current++) {
        // Calculates URL contributions to the rank of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values().flatMapToPair(s -> {
            int urlCount = Iterables.size(s._1());
            List<Tuple2<String, Double>> results = new ArrayList<>();
            for (String n : s._1) {
                results.add(new Tuple2<>(n, s._2() / urlCount));
            }
            return results.iterator();
        });
        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
    }
    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }
    spark.stop();
}
Also used : Iterables(com.google.common.collect.Iterables) Arrays(java.util.Arrays) Function2(org.apache.spark.api.java.function.Function2) Dataset(org.apache.spark.sql.Dataset) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ReduceFunction(org.apache.spark.api.java.function.ReduceFunction) Encoders(org.apache.spark.sql.Encoders) ArrayList(java.util.ArrayList) Serializable(java.io.Serializable) List(java.util.List) Pattern(java.util.regex.Pattern) JavaRDD(org.apache.spark.api.java.JavaRDD) SparkSession(org.apache.spark.sql.SparkSession)
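
The method refers to a SPACES pattern and a Sum function that are not shown in this excerpt. A plausible definition, consistent with Spark's JavaPageRank example that this code appears to follow, is sketched below; the actual project may differ.

    // Assumed helpers (not shown in the excerpt above):
    private static final Pattern SPACES = Pattern.compile("\\s+");

    // Sums the rank contributions arriving for the same URL in reduceByKey.
    private static class Sum implements Function2<Double, Double, Double> {
        @Override
        public Double call(Double a, Double b) {
            return a + b;
        }
    }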

Example 99 with JavaPairRDD

use of org.apache.spark.api.java.JavaPairRDD in project auratrainingproject by liuqinghua666.

the class JavaKafkaShopCityAnalytics method main.

public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("JavaKafkaShopCityAnalytics");
    if (args.length == 0) {
        conf.setMaster("local[1]");
    } else {
        dataPath = args[0];
    }
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5));
    // Read the shop-to-city mapping from MySQL or a text file and share it as a broadcast variable
    Map<String, String> shopCityMap = getShopCityMap(dataPath);
    JavaSparkContext jsc = ssc.sparkContext();
    Broadcast<Map<String, String>> broadcastCountryMap = jsc.broadcast(shopCityMap);
    // Kafka configurations
    String[] topics = KafkaRedisConfig.KAFKA_USER_PAY_TOPIC.split("\\,");
    System.out.println("Topics: " + Arrays.toString(topics));
    String brokers = KafkaRedisConfig.KAFKA_ADDR;
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);
    kafkaParams.put("serializer.class", "kafka.serializer.StringEncoder");
    final String clickHashKey = "app::shop::paycount";
    // Create a direct stream
    JavaPairInputDStream<String, String> kafkaStream = KafkaUtils.createDirectStream(ssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, new HashSet<String>(Arrays.asList(topics)));
    JavaDStream events = kafkaStream.map(new Function<Tuple2<String, String>, String[]>() {

        @Override
        public String[] call(Tuple2<String, String> line) throws Exception {
            System.out.println("line:" + line._1() + "=>" + line._2().split(",")[0]);
            String[] data = new String[] { line._1(), line._2().split(",")[0] };
            return data;
        }
    });
    // Compute user click times
    JavaPairDStream<String, Long> shopClicks = events.mapToPair(new PairFunction<String[], String, Long>() {

        @Override
        public Tuple2<String, Long> call(String[] x) {
            return new Tuple2<>(x[1], new Long(1));
        }
    }).reduceByKey(new Function2<Long, Long, Long>() {

        @Override
        public Long call(Long i1, Long i2) {
            return i1 + i2;
        }
    });
    shopClicks.foreachRDD(new VoidFunction<JavaPairRDD<String, Long>>() {

        @Override
        public void call(JavaPairRDD<String, Long> rdd) throws Exception {
            rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Long>>>() {

                @Override
                public void call(Iterator<Tuple2<String, Long>> partitionOfRecords) throws Exception {
                    Jedis jedis = JavaRedisClient.get().getResource();
                    while (partitionOfRecords.hasNext()) {
                        try {
                            Tuple2<String, Long> pair = partitionOfRecords.next();
                            String shopidKey = "jiaoyi" + pair._1();
                            // Look up the cityName by shopId in the broadcast map
                            String cityName = broadcastCountryMap.getValue().get(pair._1());
                            String cityKey = "交易" + cityName;
                            // String cityKey = "交易"+getCityOfShop(pair._1 (),dataPath);
                            // transaction count
                            long clickCount = pair._2();
                            // Write the shop's transaction increment to Redis
                            jedis.incrBy(shopidKey, clickCount);
                            System.out.println("Update shop " + shopidKey + " inc " + clickCount);
                            // Write the city's transaction increment to Redis
                            jedis.incrBy(cityKey, clickCount);
                            System.out.println("Update city " + cityKey + " inc " + clickCount);
                        } catch (Exception e) {
                            System.out.println("error:" + e);
                        }
                    }
                    jedis.close();
                }
            });
        }
    });
    ssc.start();
    ssc.awaitTermination();
}
Also used : JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) Jedis(redis.clients.jedis.Jedis) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) IOException(java.io.IOException) Tuple2(scala.Tuple2) VoidFunction(org.apache.spark.api.java.function.VoidFunction) SparkConf(org.apache.spark.SparkConf)
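
The anonymous inner classes above can be collapsed with Java 8 lambdas. The fragment below is an illustrative rewrite of the counting step only (same map / mapToPair / reduceByKey topology), not a change taken from the project.

    // Lambda version of the counting step (illustrative only).
    JavaDStream<String[]> events = kafkaStream
            .map(line -> new String[] { line._1(), line._2().split(",")[0] });
    JavaPairDStream<String, Long> shopClicks = events
            .mapToPair(x -> new Tuple2<>(x[1], 1L))
            .reduceByKey((i1, i2) -> i1 + i2);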

Aggregations

JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 99
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 44
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 42
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 42
MatrixCharacteristics (org.apache.sysml.runtime.matrix.MatrixCharacteristics): 41
Tuple2 (scala.Tuple2): 35
DMLRuntimeException (org.apache.sysml.runtime.DMLRuntimeException): 33
JavaRDD (org.apache.spark.api.java.JavaRDD): 28
List (java.util.List): 27
SparkExecutionContext (org.apache.sysml.runtime.controlprogram.context.SparkExecutionContext): 24
FrameBlock (org.apache.sysml.runtime.matrix.data.FrameBlock): 23
Collectors (java.util.stream.Collectors): 22
IOException (java.io.IOException): 17
RDDObject (org.apache.sysml.runtime.instructions.spark.data.RDDObject): 16
LongWritable (org.apache.hadoop.io.LongWritable): 15
Broadcast (org.apache.spark.broadcast.Broadcast): 15
Text (org.apache.hadoop.io.Text): 12
UserException (org.broadinstitute.hellbender.exceptions.UserException): 12
Function (org.apache.spark.api.java.function.Function): 11
MatrixObject (org.apache.sysml.runtime.controlprogram.caching.MatrixObject): 11