use of org.apache.spark.api.java.function.Function in project systemml by apache.
the class RDDConverterUtilsExt method stringDataFrameToVectorDataFrame.
/**
* Convert a dataframe of comma-separated string rows to a dataframe of
* ml.linalg.Vector rows.
*
* <p>
* Example input rows:<br>
*
* <code>
* ((1.2, 4.3, 3.4))<br>
* (1.2, 3.4, 2.2)<br>
* [[1.2, 34.3, 1.2, 1.25]]<br>
* [1.2, 3.4]<br>
* </code>
*
* @param sparkSession
* Spark Session
* @param inputDF
* dataframe of comma-separated row strings to convert to
* dataframe of ml.linalg.Vector rows
* @return dataframe of ml.linalg.Vector rows
*/
public static Dataset<Row> stringDataFrameToVectorDataFrame(SparkSession sparkSession, Dataset<Row> inputDF) {
StructField[] oldSchema = inputDF.schema().fields();
StructField[] newSchema = new StructField[oldSchema.length];
for (int i = 0; i < oldSchema.length; i++) {
String colName = oldSchema[i].name();
newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
}
// converter
class StringToVector implements Function<Tuple2<Row, Long>, Row> {
private static final long serialVersionUID = -4733816995375745659L;
@Override
public Row call(Tuple2<Row, Long> arg0) throws Exception {
Row oldRow = arg0._1;
int oldNumCols = oldRow.length();
if (oldNumCols > 1) {
throw new DMLRuntimeException("The row must have at most one column");
}
// parse the various string forms, e.g.
// ((1.2, 4.3, 3.4)) or (1.2, 3.4, 2.2)
// [[1.2, 34.3, 1.2, 1.2]] or [1.2, 3.4]
ArrayList<Object> fieldsArr = new ArrayList<Object>();
for (int i = 0; i < oldRow.length(); i++) {
Object ci = oldRow.get(i);
if (ci == null) {
fieldsArr.add(null);
} else if (ci instanceof String) {
String cis = (String) ci;
StringBuffer sb = new StringBuffer(cis.trim());
for (int nid = 0; nid < 2; nid++) {
// strip up to two levels of wrapping, e.g. ((...)) or [[...]]
if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') || (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')) {
sb.deleteCharAt(0);
sb.setLength(sb.length() - 1);
}
}
// normalize separators so NumericParser can read the values
String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
try {
// the bracketed form of ncis always parses to a double[]
double[] doubles = (double[]) NumericParser.parse(ncis);
Vector dense = Vectors.dense(doubles);
fieldsArr.add(dense);
} catch (Exception e) {
// can't catch SparkException here in Java apparently
throw new DMLRuntimeException("Error converting to double array. " + e.getMessage(), e);
}
} else {
throw new DMLRuntimeException("Only String is supported");
}
}
Row row = RowFactory.create(fieldsArr.toArray());
return row;
}
}
// output DF
JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
Dataset<Row> outDF = sparkSession.createDataFrame(newRows.rdd(), DataTypes.createStructType(newSchema));
return outDF;
}
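A minimal usage sketch, assuming a live SparkSession named spark and the usual org.apache.spark.sql imports; the column name and sample values below are illustrative, not taken from the project:
// build a one-column DataFrame of comma-separated string rows (illustrative data)
List<Row> rows = Arrays.asList(RowFactory.create("(1.2, 3.4, 2.2)"), RowFactory.create("[1.2, 3.4]"));
StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("features", DataTypes.StringType, true) });
Dataset<Row> stringDF = spark.createDataFrame(rows, schema);
// convert to a DataFrame whose rows hold ml.linalg.Vector values
Dataset<Row> vectorDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(spark, stringDF);
vectorDF.show(false);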
use of org.apache.spark.api.java.function.Function in project microservices by pwillhan.
the class GeoLocationJob method main.
public static void main(String[] args) throws Exception {
SparkConf conf = new SparkConf().setAppName("geolocationJob").setMaster("local[1]");
JavaStreamingContext context = new JavaStreamingContext(conf, new Duration(2000));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "192.168.99.100:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "geolocationJob");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
Collection<String> topics = Arrays.asList("geolocationJob");
final JavaInputDStream<ConsumerRecord<String, String>> dstream = KafkaUtils.createDirectStream(context, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
dstream.map(new Function<ConsumerRecord<String, String>, GeoLocation>() { // map each record to a GeoLocation
private static final long serialVersionUID = -5289370913799710097L;
@Override
public GeoLocation call(ConsumerRecord<String, String> record) throws Exception {
return new Gson().fromJson(record.value(), GeoLocation.class);
}
}).filter(new Function<GeoLocation, Boolean>() { // filter out invalid geolocations
private static final long serialVersionUID = 6980980875802694946L;
@Override
public Boolean call(GeoLocation geolocation) throws Exception {
System.out.println("Spark Job received => " + geolocation);
return geolocation.getLatitude() >= -90 && geolocation.getLatitude() < 90 && geolocation.getLongitude() >= -180 && geolocation.getLongitude() < 180;
}
}).foreachRDD(new VoidFunction<JavaRDD<GeoLocation>>() { // iterate over each RDD
private static final long serialVersionUID = -4161320579495422870L;
@Override
public void call(JavaRDD<GeoLocation> rdd) throws Exception {
rdd.foreach(new VoidFunction<GeoLocation>() { // send valid geolocations to another topic
private static final long serialVersionUID = -3282778715126743482L;
@Override
public void call(GeoLocation geolocation) throws Exception {
ProducerRecord<String, String> record = new ProducerRecord<>("geolocations", geolocation.toString());
getProducer().send(record);
}
});
}
});
context.start();
context.awaitTermination();
}
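The GeoLocation class itself is not shown here; a hypothetical minimal sketch of the shape the Gson mapping and the range filter above assume might look like the following (the real class in the project may carry more fields and behavior):
// hypothetical minimal POJO; Serializable because instances travel through Spark closures
public class GeoLocation implements Serializable {
private double latitude;
private double longitude;
public double getLatitude() { return latitude; }
public double getLongitude() { return longitude; }
@Override
public String toString() { return latitude + "," + longitude; }
}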
use of org.apache.spark.api.java.function.Function in project incubator-sdap-mudrod by apache.
the class SimilarityUtil method calculateSimilarityFromVector.
/**
* Calculate term similarity from vector.
*
* @param importRDD the {@link org.apache.spark.api.java.JavaPairRDD}
* data structure containing the vectors.
* @param simType the similarity calculation to execute e.g.
* <ul>
* <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_COSINE} - 3,</li>
* <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_HELLINGER} - 2,</li>
* <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_PEARSON} - 1</li>
* </ul>
* @return a new {@link org.apache.spark.api.java.JavaPairRDD}
*/
public static JavaRDD<LinkageTriple> calculateSimilarityFromVector(JavaPairRDD<String, Vector> importRDD, int simType) {
JavaRDD<Tuple2<String, Vector>> importRDD1 = importRDD.map(f -> new Tuple2<String, Vector>(f._1, f._2));
JavaPairRDD<Tuple2<String, Vector>, Tuple2<String, Vector>> cartesianRDD = importRDD1.cartesian(importRDD1);
return cartesianRDD.map(new Function<Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>>, LinkageTriple>() {
private static final long serialVersionUID = 1L;
@Override
public LinkageTriple call(Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>> arg) {
String keyA = arg._1._1;
String keyB = arg._2._1;
if (keyA.equals(keyB)) {
return null;
}
Vector vecA = arg._1._2;
Vector vecB = arg._2._2;
Double weight = 0.0;
if (simType == SimilarityUtil.SIM_PEARSON) {
weight = SimilarityUtil.pearsonDistance(vecA, vecB);
} else if (simType == SimilarityUtil.SIM_HELLINGER) {
weight = SimilarityUtil.hellingerDistance(vecA, vecB);
}
LinkageTriple triple = new LinkageTriple();
triple.keyA = keyA;
triple.keyB = keyB;
triple.weight = weight;
return triple;
}
}).filter(new Function<LinkageTriple, Boolean>() {
private static final long serialVersionUID = 1L;
@Override
public Boolean call(LinkageTriple arg0) throws Exception {
return arg0 != null;
}
});
}
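Note that in the snippet above only SIM_PEARSON and SIM_HELLINGER assign a weight, so a cosine request would fall through with weight 0.0. A usage sketch, assuming a JavaPairRDD<String, Vector> of term vectors named termVectors built elsewhere (an illustrative name, not from the project):
// compute pairwise Pearson-based linkages between terms (sketch only)
JavaRDD<LinkageTriple> triples = SimilarityUtil.calculateSimilarityFromVector(termVectors, SimilarityUtil.SIM_PEARSON);
// LinkageTriple exposes keyA, keyB and weight as fields, as used in the mapper above
for (LinkageTriple t : triples.take(10)) {
System.out.println(t.keyA + " <-> " + t.keyB + " : " + t.weight);
}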
use of org.apache.spark.api.java.function.Function in project rocketmq-externals by apache.
the class RocketMqUtilsTest method testGetOffsets.
@Test
public void testGetOffsets() throws MQBrokerException, MQClientException, InterruptedException, UnsupportedEncodingException {
Map<String, String> optionParams = new HashMap<>();
optionParams.put(RocketMQConfig.NAME_SERVER_ADDR, NAME_SERVER);
SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver").setMaster("local[*]");
JavaStreamingContext sc = new JavaStreamingContext(sparkConf, new Duration(1000));
List<String> topics = new ArrayList<>();
topics.add(TOPIC_DEFAULT);
LocationStrategy locationStrategy = LocationStrategy.PreferConsistent();
JavaInputDStream<MessageExt> dStream = RocketMqUtils.createJavaMQPullStream(sc, UUID.randomUUID().toString(), topics, ConsumerStrategy.earliest(), false, false, false, locationStrategy, optionParams);
// hold a reference to the current offset ranges, so it can be used downstream
final AtomicReference<Map<TopicQueueId, OffsetRange[]>> offsetRanges = new AtomicReference<>();
final Set<MessageExt> result = Collections.synchronizedSet(new HashSet<MessageExt>());
dStream.transform(new Function<JavaRDD<MessageExt>, JavaRDD<MessageExt>>() {
@Override
public JavaRDD<MessageExt> call(JavaRDD<MessageExt> v1) throws Exception {
Map<TopicQueueId, OffsetRange[]> offsets = ((HasOffsetRanges) v1.rdd()).offsetRanges();
offsetRanges.set(offsets);
return v1;
}
}).foreachRDD(new VoidFunction<JavaRDD<MessageExt>>() {
@Override
public void call(JavaRDD<MessageExt> messageExtJavaRDD) throws Exception {
result.addAll(messageExtJavaRDD.collect());
}
});
sc.start();
long startTime = System.currentTimeMillis();
boolean matches = false;
while (!matches && System.currentTimeMillis() - startTime < 10000) {
matches = MESSAGE_NUM == result.size();
Thread.sleep(50);
}
sc.stop();
}
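Because Function and VoidFunction are single-method interfaces, the same transform/foreachRDD chain can be written with Java 8 lambdas; this is an equivalent sketch of the test body above, not code from the project:
dStream.transform(rdd -> {
// capture the offset ranges of the current batch, exactly as in the anonymous class above
offsetRanges.set(((HasOffsetRanges) rdd.rdd()).offsetRanges());
return rdd;
}).foreachRDD(rdd -> result.addAll(rdd.collect()));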
use of org.apache.spark.api.java.function.Function in project azure-tools-for-java by Microsoft.
the class JavaSparkPi method main.
public static void main(String[] args) throws Exception {
// use this line if you want to run your application in the cluster
// SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
JavaSparkContext jsc = new JavaSparkContext(sparkConf);
int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
int n = 100000 * slices;
List<Integer> l = new ArrayList<Integer>(n);
for (int i = 0; i < n; i++) {
l.add(i);
}
JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
int count = dataSet.map(new Function<Integer, Integer>() {
@Override
public Integer call(Integer integer) {
double x = Math.random() * 2 - 1;
double y = Math.random() * 2 - 1;
return (x * x + y * y < 1) ? 1 : 0;
}
}).reduce(new Function2<Integer, Integer, Integer>() {
@Override
public Integer call(Integer integer, Integer integer2) {
return integer + integer2;
}
});
System.out.println("Pi is roughly " + 4.0 * count / n);
jsc.stop();
}
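Each mapped element draws one point uniformly from the square [-1, 1] x [-1, 1]; a point lands inside the unit circle with probability pi/4, so 4.0 * count / n converges on pi as n grows. Since Function and Function2 are single-method interfaces, the sampling and reduction above can also be expressed with lambdas, shown here as an equivalent sketch:
// lambda form of the map/reduce above; count and n refer to the same values as in main
int count = dataSet.map(i -> {
double x = Math.random() * 2 - 1;
double y = Math.random() * 2 - 1;
return (x * x + y * y < 1) ? 1 : 0;
}).reduce((a, b) -> a + b);
System.out.println("Pi is roughly " + 4.0 * count / n);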