Use of org.apache.spark.api.java.function.PairFunction in project cdap by caskdata.
The class SparkPageRankProgram, method run().
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  LOG.info("Processing backlinkURLs data");
  JavaPairRDD<Long, String> backlinkURLs = sec.fromStream("backlinkURLStream", String.class);
  int iterationCount = getIterationCount(sec);

  LOG.info("Grouping data by key");
  // Group backlinks by unique URL in key
  JavaPairRDD<String, Iterable<String>> links = backlinkURLs.values()
    .mapToPair(new PairFunction<String, String, String>() {
      @Override
      public Tuple2<String, String> call(String s) {
        String[] parts = SPACES.split(s);
        return new Tuple2<>(parts[0], parts[1]);
      }
    }).distinct().groupByKey().cache();

  // Initialize the default rank for each key URL
  JavaPairRDD<String, Double> ranks = links.mapValues(new Function<Iterable<String>, Double>() {
    @Override
    public Double call(Iterable<String> rs) {
      return 1.0;
    }
  });

  // Calculate and update URL ranks iteratively using the PageRank algorithm.
  for (int current = 0; current < iterationCount; current++) {
    LOG.debug("Processing data with PageRank algorithm. Iteration {}/{}", current + 1, iterationCount);
    // Calculate each URL's contribution to the ranks of the URLs it links to.
    JavaPairRDD<String, Double> contribs = links.join(ranks).values()
      .flatMapToPair(new PairFlatMapFunction<Tuple2<Iterable<String>, Double>, String, Double>() {
        @Override
        public Iterable<Tuple2<String, Double>> call(Tuple2<Iterable<String>, Double> s) {
          LOG.debug("Processing {} with rank {}", s._1(), s._2());
          int urlCount = Iterables.size(s._1());
          List<Tuple2<String, Double>> results = new ArrayList<>();
          for (String n : s._1()) {
            results.add(new Tuple2<>(n, s._2() / urlCount));
          }
          return results;
        }
      });
    // Re-calculate URL ranks based on backlink contributions.
    ranks = contribs.reduceByKey(new Sum()).mapValues(new Function<Double, Double>() {
      @Override
      public Double call(Double sum) {
        return 0.15 + sum * 0.85;
      }
    });
  }

  LOG.info("Writing ranks data");
  final ServiceDiscoverer discoveryServiceContext = sec.getServiceDiscoverer();
  final Metrics sparkMetrics = sec.getMetrics();
  JavaPairRDD<byte[], Integer> ranksRaw = ranks.mapToPair(new PairFunction<Tuple2<String, Double>, byte[], Integer>() {
    @Override
    public Tuple2<byte[], Integer> call(Tuple2<String, Double> tuple) throws Exception {
      LOG.debug("URL {} has rank {}", Arrays.toString(tuple._1().getBytes(Charsets.UTF_8)), tuple._2());
      URL serviceURL = discoveryServiceContext.getServiceURL(SparkPageRankApp.SERVICE_HANDLERS);
      if (serviceURL == null) {
        throw new RuntimeException("Failed to discover service: " + SparkPageRankApp.SERVICE_HANDLERS);
      }
      try {
        URLConnection connection = new URL(serviceURL,
          String.format("%s/%s", SparkPageRankApp.SparkPageRankServiceHandler.TRANSFORM_PATH,
                        tuple._2().toString())).openConnection();
        try (BufferedReader reader = new BufferedReader(
               new InputStreamReader(connection.getInputStream(), Charsets.UTF_8))) {
          // Parse the transformed rank once, then bucket it for metrics.
          int pr = Integer.parseInt(reader.readLine());
          if (pr == POPULAR_PAGE_THRESHOLD) {
            sparkMetrics.count(POPULAR_PAGES, 1);
          } else if (pr <= UNPOPULAR_PAGE_THRESHOLD) {
            sparkMetrics.count(UNPOPULAR_PAGES, 1);
          } else {
            sparkMetrics.count(REGULAR_PAGES, 1);
          }
          return new Tuple2<>(tuple._1().getBytes(Charsets.UTF_8), pr);
        }
      } catch (Exception e) {
        LOG.warn("Failed to read the response from service {}", SparkPageRankApp.SERVICE_HANDLERS, e);
        throw Throwables.propagate(e);
      }
    }
  });

  // Store calculated results in the output Dataset.
  // All calculated results are stored in one row; each entry of the row is
  // one URL's rank as calculated from its backlink contributions.
  sec.saveAsDataset(ranksRaw, "ranks");
  LOG.info("PageRanks successfully computed and written to \"ranks\" dataset");
}
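For comparison, the contribution and re-rank loop above can be written with Java 8 lambdas instead of anonymous PairFlatMapFunction and Function classes. A minimal sketch, assuming the links and ranks RDDs are built as above and a Spark 2.x API, where flat-map functions return an Iterator rather than the Iterable used in the original:

for (int current = 0; current < iterationCount; current++) {
  JavaPairRDD<String, Double> contribs = links.join(ranks).values()
    .flatMapToPair(pair -> {
      int urlCount = Iterables.size(pair._1());
      List<Tuple2<String, Double>> results = new ArrayList<>();
      for (String neighbor : pair._1()) {
        // Each outgoing link receives an equal share of this page's rank.
        results.add(new Tuple2<>(neighbor, pair._2() / urlCount));
      }
      return results.iterator();
    });
  // Same damping constants (0.15 / 0.85) as the anonymous-class version.
  ranks = contribs.reduceByKey((a, b) -> a + b).mapValues(sum -> 0.15 + sum * 0.85);
}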
Use of org.apache.spark.api.java.function.PairFunction in project cdap by caskdata.
The class ClassicSparkProgram, method main().
public static void main(String[] args) throws Exception {
  SparkConf sparkConf = new SparkConf();
  sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
  sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());

  Schema schema = Schema.recordOf("record",
    Schema.Field.of("name", Schema.of(Schema.Type.STRING)),
    Schema.Field.of("id", Schema.of(Schema.Type.INT)));
  List<StructuredRecord> records = new ArrayList<>();
  for (int i = 1; i <= 10; i++) {
    records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
  }

  // This tests serialization of StructuredRecord as well as use of a custom Kryo serializer
  JavaSparkContext jsc = new JavaSparkContext(sparkConf);
  int result = jsc.parallelize(records).mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {
    @Override
    public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
      return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
    }
  }).map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {
    @Override
    public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
      return tuple._1;
    }
  }).reduce(new Function2<MyInt, MyInt, MyInt>() {
    @Override
    public MyInt call(MyInt v1, MyInt v2) throws Exception {
      return new MyInt(v1.toInt() + v2.toInt());
    }
  }).toInt();

  if (result != 55) {
    throw new Exception("Expected result to be 55");
  }
}
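The snippet depends on a MyInt value class and a MyKryoRegistrator, neither of which is shown here. A minimal sketch of what they could look like; both are reconstructions for illustration, not the project's actual definitions:

public static final class MyInt {
  private final int value;

  public MyInt(int value) {
    this.value = value;
  }

  public int toInt() {
    return value;
  }
}

public static final class MyKryoRegistrator implements org.apache.spark.serializer.KryoRegistrator {
  @Override
  public void registerClasses(com.esotericsoftware.kryo.Kryo kryo) {
    // Registering the custom type up front lets Kryo serialize it compactly
    // instead of writing the fully-qualified class name into every record.
    kryo.register(MyInt.class);
  }
}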
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class ChapterSixExample, method main().
public static void main(String[] args) throws Exception {
  if (args.length != 4) {
    throw new Exception("Usage: ChapterSixExample sparkMaster inputFile inputFile2 outDirectory");
  }
  String sparkMaster = args[0];
  String inputFile = args[1];
  String inputFile2 = args[2];
  String outputDir = args[3];

  JavaSparkContext sc = new JavaSparkContext(sparkMaster, "ChapterSixExample",
    System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> rdd = sc.textFile(inputFile);

  // Count the number of lines containing KK6JKQ
  final Accumulator<Integer> count = sc.accumulator(0);
  rdd.foreach(new VoidFunction<String>() {
    public void call(String line) {
      if (line.contains("KK6JKQ")) {
        count.add(1);
      }
    }
  });
  System.out.println("Lines with 'KK6JKQ': " + count.value());

  // Create an Accumulator initialized to 0
  final Accumulator<Integer> blankLines = sc.accumulator(0);
  JavaRDD<String> callSigns = rdd.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String line) {
      if (line.equals("")) {
        blankLines.add(1);
      }
      return Arrays.asList(line.split(" "));
    }
  });
  callSigns.saveAsTextFile(outputDir + "/callsigns");
  System.out.println("Blank lines: " + blankLines.value());

  // Start validating the call signs
  final Accumulator<Integer> validSignCount = sc.accumulator(0);
  final Accumulator<Integer> invalidSignCount = sc.accumulator(0);
  JavaRDD<String> validCallSigns = callSigns.filter(new Function<String, Boolean>() {
    public Boolean call(String callSign) {
      Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
      Matcher m = p.matcher(callSign);
      boolean b = m.matches();
      if (b) {
        validSignCount.add(1);
      } else {
        invalidSignCount.add(1);
      }
      return b;
    }
  });
  JavaPairRDD<String, Integer> contactCounts = validCallSigns.mapToPair(
    new PairFunction<String, String, Integer>() {
      public Tuple2<String, Integer> call(String callSign) {
        return new Tuple2<>(callSign, 1);
      }
    }).reduceByKey(new Function2<Integer, Integer, Integer>() {
      public Integer call(Integer x, Integer y) {
        return x + y;
      }
    });
  // Force evaluation so the counters are populated
  contactCounts.count();
  if (invalidSignCount.value() < 0.1 * validSignCount.value()) {
    contactCounts.saveAsTextFile(outputDir + "/contactCount");
  } else {
    System.out.println("Too many errors " + invalidSignCount.value() + " for " + validSignCount.value());
    System.exit(1);
  }

  // Read in the call sign table and look up the countries for each
  // call sign in the contactCounts RDD.
  final Broadcast<String[]> signPrefixes = sc.broadcast(loadCallSignTable());
  JavaPairRDD<String, Integer> countryContactCounts = contactCounts.mapToPair(
    new PairFunction<Tuple2<String, Integer>, String, Integer>() {
      public Tuple2<String, Integer> call(Tuple2<String, Integer> callSignCount) {
        String sign = callSignCount._1();
        String country = lookupCountry(sign, signPrefixes.value());
        return new Tuple2<>(country, callSignCount._2());
      }
    }).reduceByKey(new SumInts());
  countryContactCounts.saveAsTextFile(outputDir + "/countries.txt");
  System.out.println("Saved country contact counts as a file");

  // Use mapPartitions to reuse setup work (one HTTP client per partition).
  JavaPairRDD<String, CallLog[]> contactsContactLists = validCallSigns.mapPartitionsToPair(
    new PairFlatMapFunction<Iterator<String>, String, CallLog[]>() {
      public Iterable<Tuple2<String, CallLog[]>> call(Iterator<String> input) {
        // List for our results.
        ArrayList<Tuple2<String, CallLog[]>> callsignQsos = new ArrayList<>();
        ArrayList<Tuple2<String, ContentExchange>> requests = new ArrayList<>();
        ObjectMapper mapper = createMapper();
        HttpClient client = new HttpClient();
        try {
          client.start();
          while (input.hasNext()) {
            requests.add(createRequestForSign(input.next(), client));
          }
          for (Tuple2<String, ContentExchange> signExchange : requests) {
            callsignQsos.add(fetchResultFromRequest(mapper, signExchange));
          }
        } catch (Exception e) {
          // Swallow lookup failures; call signs that could not be fetched are skipped.
        }
        return callsignQsos;
      }
    });
  System.out.println(StringUtils.join(contactsContactLists.collect(), ","));

  // Compute the distance of each call using an external R program;
  // addFile distributes the script to every node along with this job.
  String distScript = System.getProperty("user.dir") + "/src/R/finddistance.R";
  String distScriptName = "finddistance.R";
  sc.addFile(distScript);
  JavaRDD<String> pipeInputs = contactsContactLists.values().map(new VerifyCallLogs())
    .flatMap(new FlatMapFunction<CallLog[], String>() {
      public Iterable<String> call(CallLog[] calls) {
        ArrayList<String> latLons = new ArrayList<>();
        for (CallLog call : calls) {
          latLons.add(call.mylat + "," + call.mylong + "," + call.contactlat + "," + call.contactlong);
        }
        return latLons;
      }
    });
  JavaRDD<String> distances = pipeInputs.pipe(SparkFiles.get(distScriptName));

  // Convert the RDD of Strings to a JavaDoubleRDD so we can use the stats() function
  JavaDoubleRDD distanceDoubles = distances.mapToDouble(new DoubleFunction<String>() {
    public double call(String value) {
      return Double.parseDouble(value);
    }
  });
  final StatCounter stats = distanceDoubles.stats();
  final Double stddev = stats.stdev();
  final Double mean = stats.mean();
  JavaDoubleRDD reasonableDistances = distanceDoubles.filter(new Function<Double, Boolean>() {
    public Boolean call(Double x) {
      return (Math.abs(x - mean) < 3 * stddev);
    }
  });
  System.out.println(StringUtils.join(reasonableDistances.collect(), ","));

  sc.stop();
  System.exit(0);
}
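The method relies on helpers defined elsewhere in the chapter's sources (loadCallSignTable, lookupCountry, createMapper, createRequestForSign, fetchResultFromRequest, VerifyCallLogs). The easiest to reconstruct is SumInts, the reusable reducer passed to reduceByKey; a plausible sketch, assuming it simply adds its arguments like the inline reducer used for contactCounts:

public static class SumInts implements Function2<Integer, Integer, Integer> {
  @Override
  public Integer call(Integer x, Integer y) throws Exception {
    // Same logic as the anonymous Function2 above, factored into a
    // named class so it can be reused across reduceByKey calls.
    return x + y;
  }
}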
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class WordCount, method main().
public static void main(String[] args) throws Exception {
  String master = args[0];
  JavaSparkContext sc = new JavaSparkContext(master, "wordcount",
    System.getenv("SPARK_HOME"), System.getenv("JARS"));
  JavaRDD<String> rdd = sc.textFile(args[1]);
  JavaPairRDD<String, Integer> counts = rdd.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String x) {
      return Arrays.asList(x.split(" "));
    }
  }).mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String x) {
      return new Tuple2<>(x, 1);
    }
  }).reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer x, Integer y) {
      return x + y;
    }
  });
  counts.saveAsTextFile(args[2]);
}
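With Java 8 lambdas the same pipeline shrinks considerably. A sketch assuming a Spark 2.x API, where the FlatMapFunction passed to flatMap returns an Iterator:

JavaPairRDD<String, Integer> counts = rdd
  .flatMap(line -> Arrays.asList(line.split(" ")).iterator())  // line -> words
  .mapToPair(word -> new Tuple2<>(word, 1))                    // word -> (word, 1)
  .reduceByKey((x, y) -> x + y);                               // sum per word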
Use of org.apache.spark.api.java.function.PairFunction in project learning-spark by databricks.
The class WordCount, method main().
public static void main(String[] args) throws Exception {
  String inputFile = args[0];
  String outputFile = args[1];
  // Create a Java Spark Context.
  SparkConf conf = new SparkConf().setAppName("wordCount");
  JavaSparkContext sc = new JavaSparkContext(conf);
  // Load our input data.
  JavaRDD<String> input = sc.textFile(inputFile);
  // Split up into words.
  JavaRDD<String> words = input.flatMap(new FlatMapFunction<String, String>() {
    public Iterable<String> call(String x) {
      return Arrays.asList(x.split(" "));
    }
  });
  // Transform into pairs of (word, count).
  JavaPairRDD<String, Integer> counts = words.mapToPair(new PairFunction<String, String, Integer>() {
    public Tuple2<String, Integer> call(String x) {
      return new Tuple2<>(x, 1);
    }
  }).reduceByKey(new Function2<Integer, Integer, Integer>() {
    public Integer call(Integer x, Integer y) {
      return x + y;
    }
  });
  // Save the word count back out to a text file, causing evaluation.
  counts.saveAsTextFile(outputFile);
}
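If the counts are only needed on the driver rather than as an RDD, countByValue() on the words RDD is a shorter alternative; a sketch (java.util.Map import assumed):

// Collects every distinct word and its count to the driver,
// so this only suits modest vocabularies.
Map<String, Long> localCounts = words.countByValue();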