Use of org.apache.spark.api.java.JavaPairRDD in project BD2017 by achintya-kumar.
The class WordCount, method main.
public static void main(String[] args) {
    // Create the Spark context with a Spark configuration
    SparkConf sparkConf = new SparkConf().setAppName("Spark Count").setMaster("local[2]");
    JavaSparkContext sc = new JavaSparkContext(sparkConf);
    // Read in the text file and split each line into words
    JavaRDD<String> lines = sc.textFile("hdfs://localhost:8020/user/cloudera/source/MarinerMissionToMars.txt");
    // Emit (word, 1) pairs and sum the counts per word
    JavaPairRDD<String, Integer> counts = lines
            .flatMap(line -> Arrays.asList(line.split(" ")).iterator())
            .mapToPair(word -> new Tuple2<String, Integer>(word, 1))
            .reduceByKey((x, y) -> x + y);
    counts.saveAsTextFile("hdfs://localhost:8020/user/cloudera/results/MarinerMissionToMarsResults");
    sc.close();
}
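A possible follow-up, not part of the original example: if the counts are to be inspected by frequency rather than only written to HDFS, the pairs can be swapped and sorted. This is a minimal sketch that assumes the same counts RDD as above and the same Java 8 lambda style; the choice of taking the top ten is arbitrary.
// Sketch only: swap (word, count) to (count, word), sort descending, print the ten most frequent words.
List<Tuple2<Integer, String>> top10 = counts
        .mapToPair(t -> new Tuple2<>(t._2(), t._1()))
        .sortByKey(false)
        .take(10);
top10.forEach(t -> System.out.println(t._2() + ": " + t._1()));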
Use of org.apache.spark.api.java.JavaPairRDD in project mm-dev by sbl-sdsc.
The class ReducedEncoderNewTest, method test.
@Test
public void test() {
    SparkConf conf = new SparkConf().setMaster("local[*]").setAppName(FilterByRFree.class.getSimpleName());
    JavaSparkContext sc = new JavaSparkContext(conf);
    // List<String> pdbIds = Arrays.asList("1STP","4HHB","2ONX","1JLP","5X6H","5L2G","2MK1");
    List<String> pdbIds = Arrays.asList("1STP", "4HHB", "2ONX", "2CCV");
    JavaPairRDD<String, StructureDataInterface> pdb = MmtfReader.downloadFullMmtfFiles(pdbIds, sc).cache();
    // pdb.foreach(t -> System.out.println(t._1 + "o :" + t._2.getNumBonds()));
    // List<String> chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("full: " + chainIds);
    // List<String> chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("full: " + chainNames);
    // List<String> numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("full: " + numGroups);
    // List<String> altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("full: " + altlocs);
    // Convert each structure to the reduced representation
    pdb = pdb.mapValues(v -> ReducedEncoder.getReduced(v)).cache();
    // chainIds = pdb.map(t -> t._1 + "_chainId_" + Arrays.toString(t._2.getChainIds())).collect();
    // System.out.println("reduced: " + chainIds);
    // chainNames = pdb.map(t -> t._1 + "_chainNames_" + Arrays.toString(t._2.getChainNames())).collect();
    // System.out.println("reduced: " + chainNames);
    // altlocs = pdb.map(t -> t._1 + "_altLocs_" + Arrays.toString(t._2.getAltLocIds())).collect();
    // System.out.println("reduced: " + altlocs);
    // 1STP # groups 121 CA + 1 BTN = 122
    // 4HHB # groups 141x2 + 146x2 CA + 4 HEM + 2P (from PO4) = 580
    // 2ONX # groups 4 CA = 4
    // 2CCV # groups 99 CA + 4 altloc CA + 1 A2G (sugar) + 1 NAG (orig 15) + 1 GOL + 1 ZN, 1 ACE = 108
    // TODO (4 altlocs missing?)
    // numGroups = pdb.map(t -> t._1 + "_numGroups_" + t._2.getNumGroups()).collect();
    // System.out.println("reduced: " + numGroups);
    List<String> atoms = pdb.map(t -> t._1 + "_atoms_" + t._2.getNumAtoms()).collect();
    // System.out.println(atoms);
    // 1STP # atoms 121 CA + 16 BTN
    // 4HHB # atoms 141x2 + 146x2 CA + 43x4 HEM + 2P (from PO4) = 748
    // 2ONX # atoms 4 CA
    // 2CCV # atoms 99 CA + 4 (5?) altloc CA + 15 A2G (sugar) + 14 NAG (orig 15) + 6 GOL + 1 ZN, ACE 4 = 143
    assertTrue(atoms.contains("1STP_atoms_137"));
    assertTrue(atoms.contains("4HHB_atoms_748"));
    assertTrue(atoms.contains("2ONX_atoms_4"));
    assertTrue(atoms.contains("2CCV_atoms_143"));
    List<String> bonds = pdb.map(t -> t._1 + "_bonds_" + t._2.getNumBonds()).collect();
    // 1STP # bonds 17 BTN
    // 4HHB # bonds 50 x 4 HEM = 200
    // 2ONX # bonds 0
    // 2CCV # bonds 15 A2G + 14 NAG (-O) + 5 GOL + 3 ACE + 2 disulfide bridges + 1 covalent bond to NAG = 40
    assertTrue(bonds.contains("1STP_bonds_17"));
    assertTrue(bonds.contains("4HHB_bonds_200"));
    assertTrue(bonds.contains("2ONX_bonds_0"));
    assertTrue(bonds.contains("2CCV_bonds_40"));
    sc.close();
}
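The assertions above compare formatted strings collected from the pair RDD. A variant sketch, not taken from the project, that gathers the reduced atom counts into a map keyed by PDB ID instead: it reuses the pdb JavaPairRDD from the test, and assumes assertEquals is statically imported alongside assertTrue.
// Sketch only: collect atom counts of the reduced structures into a Map<pdbId, numAtoms>.
// Assumes the same pdb JavaPairRDD<String, StructureDataInterface> as in the test above.
Map<String, Integer> atomCounts = pdb
        .mapValues(StructureDataInterface::getNumAtoms)
        .collectAsMap();
assertEquals(748, (int) atomCounts.get("4HHB"));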
Use of org.apache.spark.api.java.JavaPairRDD in project net.jgp.labs.spark by jgperrin.
The class PageRankApp, method start.
public void start(String file, int numberOfIterations) {
    showWarning();
    SparkSession spark = SparkSession.builder().appName("JavaPageRank").getOrCreate();
    // Loads the input file. It should be in the format:
    // URL neighbor URL
    // URL neighbor URL
    // URL neighbor URL
    // ...
    JavaRDD<String> lines = spark.read().textFile(file).javaRDD();
    // Loads all URLs from the input file and initializes their neighbor lists.
    JavaPairRDD<String, Iterable<String>> links = lines.mapToPair(s -> {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
    }).distinct().groupByKey().cache();
    // Initializes the rank of every URL with outgoing links to one.
    JavaPairRDD<String, Double> ranks = links.mapValues(rs -> 1.0);
    // Calculates and updates URL ranks iteratively using the PageRank algorithm.
    for (int current = 0; current < numberOfIterations; current++) {
        // Calculates URL contributions to the ranks of other URLs.
        JavaPairRDD<String, Double> contribs = links.join(ranks).values().flatMapToPair(s -> {
            int urlCount = Iterables.size(s._1());
            List<Tuple2<String, Double>> results = new ArrayList<>();
            for (String n : s._1()) {
                results.add(new Tuple2<>(n, s._2() / urlCount));
            }
            return results.iterator();
        });
        // Re-calculates URL ranks based on neighbor contributions.
        ranks = contribs.reduceByKey(new Sum()).mapValues(sum -> 0.15 + sum * 0.85);
    }
    // Collects all URL ranks and dumps them to the console.
    List<Tuple2<String, Double>> output = ranks.collect();
    for (Tuple2<?, ?> tuple : output) {
        System.out.println(tuple._1() + " has rank: " + tuple._2() + ".");
    }
    spark.stop();
}
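The Sum reducer passed to reduceByKey above is defined elsewhere in PageRankApp and is not part of this excerpt. A minimal sketch consistent with how it is used (adding two contribution values) would be:
// Sketch of the reducer referenced as new Sum() above; the real class lives
// elsewhere in the project and may differ in detail.
private static class Sum implements Function2<Double, Double, Double> {
    @Override
    public Double call(Double a, Double b) {
        return a + b;
    }
}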
Use of org.apache.spark.api.java.JavaPairRDD in project auratrainingproject by liuqinghua666.
The class JavaKafkaShopCityAnalytics, method main.
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("JavaKafkaShopCityAnalytics");
    if (args.length == 0) {
        conf.setMaster("local[1]");
    } else {
        dataPath = args[0];
    }
    JavaStreamingContext ssc = new JavaStreamingContext(conf, Durations.seconds(5));
    // Read the shop-to-city mapping from MySQL or a text file and share it as a broadcast variable
    Map<String, String> shopCityMap = getShopCityMap(dataPath);
    JavaSparkContext jsc = ssc.sparkContext();
    Broadcast<Map<String, String>> broadcastCountryMap = jsc.broadcast(shopCityMap);
    // Kafka configuration
    String[] topics = KafkaRedisConfig.KAFKA_USER_PAY_TOPIC.split("\\,");
    System.out.println("Topics: " + Arrays.toString(topics));
    String brokers = KafkaRedisConfig.KAFKA_ADDR;
    Map<String, String> kafkaParams = new HashMap<>();
    kafkaParams.put("metadata.broker.list", brokers);
    kafkaParams.put("serializer.class", "kafka.serializer.StringEncoder");
    final String clickHashKey = "app::shop::paycount";
    // Create a direct stream
    JavaPairInputDStream<String, String> kafkaStream = KafkaUtils.createDirectStream(ssc, String.class, String.class, StringDecoder.class, StringDecoder.class, kafkaParams, new HashSet<String>(Arrays.asList(topics)));
    // Extract (kafkaKey, shopId) pairs from each Kafka record
    JavaDStream<String[]> events = kafkaStream.map(new Function<Tuple2<String, String>, String[]>() {
        @Override
        public String[] call(Tuple2<String, String> line) throws Exception {
            System.out.println("line:" + line._1() + "=>" + line._2().split(",")[0]);
            String[] data = new String[] { line._1(), line._2().split(",")[0] };
            return data;
        }
    });
    // Count pay events per shop
    JavaPairDStream<String, Long> shopClicks = events.mapToPair(new PairFunction<String[], String, Long>() {
        @Override
        public Tuple2<String, Long> call(String[] x) {
            return new Tuple2<>(x[1], 1L);
        }
    }).reduceByKey(new Function2<Long, Long, Long>() {
        @Override
        public Long call(Long i1, Long i2) {
            return i1 + i2;
        }
    });
    shopClicks.foreachRDD(new VoidFunction<JavaPairRDD<String, Long>>() {
        @Override
        public void call(JavaPairRDD<String, Long> rdd) throws Exception {
            rdd.foreachPartition(new VoidFunction<Iterator<Tuple2<String, Long>>>() {
                @Override
                public void call(Iterator<Tuple2<String, Long>> partitionOfRecords) throws Exception {
                    Jedis jedis = JavaRedisClient.get().getResource();
                    while (partitionOfRecords.hasNext()) {
                        try {
                            Tuple2<String, Long> pair = partitionOfRecords.next();
                            String shopidKey = "jiaoyi" + pair._1();
                            // Look up the city name for this shop id in the broadcast map
                            String cityName = broadcastCountryMap.getValue().get(pair._1());
                            String cityKey = "交易" + cityName;
                            // String cityKey = "交易"+getCityOfShop(pair._1 (),dataPath);
                            // Transaction count for this batch
                            long clickCount = pair._2();
                            // Write the shop's transaction increment to Redis
                            jedis.incrBy(shopidKey, clickCount);
                            System.out.println("Update shop " + shopidKey + " inc " + clickCount);
                            // Write the city's transaction increment to Redis
                            jedis.incrBy(cityKey, clickCount);
                            System.out.println("Update city " + cityKey + " inc " + clickCount);
                        } catch (Exception e) {
                            System.out.println("error:" + e);
                        }
                    }
                    jedis.close();
                }
            });
        }
    });
    ssc.start();
    ssc.awaitTermination();
}
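Once the streaming job has been running, the accumulated counters can be read back from Redis using the same key scheme. A minimal sketch, assuming the project's pooled JavaRedisClient used above; the shop id and city name here are placeholders, not values from the original project.
// Sketch only: read the accumulated counters back using the key scheme above.
// shopId and cityName are placeholders.
String shopId = "someShopId";
String cityName = "someCity";
try (Jedis jedis = JavaRedisClient.get().getResource()) {
    System.out.println("shop total: " + jedis.get("jiaoyi" + shopId));
    System.out.println("city total: " + jedis.get("交易" + cityName));
}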