
Example 26 with Function

use of org.apache.spark.api.java.function.Function in project systemml by apache.

the class RDDConverterUtilsExt method stringDataFrameToVectorDataFrame.

/**
 * Convert a dataframe of comma-separated string rows to a dataframe of
 * ml.linalg.Vector rows.
 *
 * <p>
 * Example input rows:<br>
 *
 * <code>
 * ((1.2, 4.3, 3.4))<br>
 * (1.2, 3.4, 2.2)<br>
 * [[1.2, 34.3, 1.2, 1.25]]<br>
 * [1.2, 3.4]<br>
 * </code>
 *
 * @param sparkSession
 *            Spark Session
 * @param inputDF
 *            dataframe of comma-separated row strings to convert to
 *            dataframe of ml.linalg.Vector rows
 * @return dataframe of ml.linalg.Vector rows
 */
public static Dataset<Row> stringDataFrameToVectorDataFrame(SparkSession sparkSession, Dataset<Row> inputDF) {
    StructField[] oldSchema = inputDF.schema().fields();
    StructField[] newSchema = new StructField[oldSchema.length];
    for (int i = 0; i < oldSchema.length; i++) {
        String colName = oldSchema[i].name();
        newSchema[i] = DataTypes.createStructField(colName, new VectorUDT(), true);
    }
    // converter: parse each comma-separated string row into a dense ml.linalg.Vector row
    class StringToVector implements Function<Tuple2<Row, Long>, Row> {

        private static final long serialVersionUID = -4733816995375745659L;

        @Override
        public Row call(Tuple2<Row, Long> arg0) throws Exception {
            Row oldRow = arg0._1;
            int oldNumCols = oldRow.length();
            if (oldNumCols > 1) {
                throw new DMLRuntimeException("The row must have at most one column");
            }
            // parse the various string formats, e.g.
            // ((1.2, 4.3, 3.4)) or (1.2, 3.4, 2.2)
            // [[1.2, 34.3, 1.2, 1.2]] or [1.2, 3.4]
            ArrayList<Object> fieldsArr = new ArrayList<Object>();
            for (int i = 0; i < oldRow.length(); i++) {
                Object ci = oldRow.get(i);
                if (ci == null) {
                    fieldsArr.add(null);
                } else if (ci instanceof String) {
                    String cis = (String) ci;
                    StringBuffer sb = new StringBuffer(cis.trim());
                    for (int nid = 0; nid < 2; nid++) {
                        // strip up to two levels of nesting, e.g. ((...)) or [[...]]
                        if ((sb.charAt(0) == '(' && sb.charAt(sb.length() - 1) == ')') || (sb.charAt(0) == '[' && sb.charAt(sb.length() - 1) == ']')) {
                            sb.deleteCharAt(0);
                            sb.setLength(sb.length() - 1);
                        }
                    }
                    // normalize delimiters so the string parses as a numeric array
                    String ncis = "[" + sb.toString().replaceAll(" *, *", ",") + "]";
                    try {
                        // the [ ] wrapper makes NumericParser.parse return a double array
                        double[] doubles = (double[]) NumericParser.parse(ncis);
                        Vector dense = Vectors.dense(doubles);
                        fieldsArr.add(dense);
                    } catch (Exception e) {
                        // can't catch SparkException here in Java apparently
                        throw new DMLRuntimeException("Error converting to double array. " + e.getMessage(), e);
                    }
                } else {
                    throw new DMLRuntimeException("Only String is supported");
                }
            }
            Row row = RowFactory.create(fieldsArr.toArray());
            return row;
        }
    }
    // output DF
    JavaRDD<Row> newRows = inputDF.rdd().toJavaRDD().zipWithIndex().map(new StringToVector());
    Dataset<Row> outDF = sparkSession.createDataFrame(newRows.rdd(), DataTypes.createStructType(newSchema));
    return outDF;
}
Also used : VectorUDT(org.apache.spark.ml.linalg.VectorUDT) ArrayList(java.util.ArrayList) DMLRuntimeException(org.apache.sysml.runtime.DMLRuntimeException) IOException(java.io.IOException) PairFlatMapFunction(org.apache.spark.api.java.function.PairFlatMapFunction) Function(org.apache.spark.api.java.function.Function) StructField(org.apache.spark.sql.types.StructField) Tuple2(scala.Tuple2) Row(org.apache.spark.sql.Row) Vector(org.apache.spark.ml.linalg.Vector)
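For reference, a minimal usage sketch of the converter above, assuming a local SparkSession and that the SystemML class RDDConverterUtilsExt is on the classpath; the column name C1 and the sample values are made up for illustration.

import java.util.Arrays;
import java.util.List;
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.SparkSession;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructField;
import org.apache.spark.sql.types.StructType;

public class StringToVectorDemo {

    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().appName("StringToVectorDemo").master("local[*]").getOrCreate();
        // a single string column; each row uses one of the accepted formats
        StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("C1", DataTypes.StringType, true) });
        List<Row> rows = Arrays.asList(RowFactory.create("((1.2, 4.3, 3.4))"), RowFactory.create("[1.2, 3.4]"));
        Dataset<Row> inputDF = spark.createDataFrame(rows, schema);
        // convert the string rows to ml.linalg.Vector rows using the method shown above
        Dataset<Row> vectorDF = RDDConverterUtilsExt.stringDataFrameToVectorDataFrame(spark, inputDF);
        vectorDF.show(false);
        spark.stop();
    }
}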

Example 27 with Function

use of org.apache.spark.api.java.function.Function in project microservices by pwillhan.

the class GeoLocationJob method main.

public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("geolocationJob").setMaster("local[1]");
    JavaStreamingContext context = new JavaStreamingContext(conf, new Duration(2000));
    Map<String, Object> kafkaParams = new HashMap<>();
    kafkaParams.put("bootstrap.servers", "192.168.99.100:9092");
    kafkaParams.put("key.deserializer", StringDeserializer.class);
    kafkaParams.put("value.deserializer", StringDeserializer.class);
    kafkaParams.put("group.id", "geolocationJob");
    kafkaParams.put("auto.offset.reset", "latest");
    kafkaParams.put("enable.auto.commit", false);
    Collection<String> topics = Arrays.asList("geolocationJob");
    final JavaInputDStream<ConsumerRecord<String, String>> dstream = KafkaUtils.createDirectStream(context, LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams));
    dstream.map(new Function<ConsumerRecord<String, String>, GeoLocation>() { // map to GeoLocation

        private static final long serialVersionUID = -5289370913799710097L;

        @Override
        public GeoLocation call(ConsumerRecord<String, String> record) throws Exception {
            return new Gson().fromJson(record.value(), GeoLocation.class);
        }
    }).filter(new Function<GeoLocation, Boolean>() { // filter out invalid geolocations

        private static final long serialVersionUID = 6980980875802694946L;

        @Override
        public Boolean call(GeoLocation geolocation) throws Exception {
            System.out.println("Spark Job received => " + geolocation);
            return geolocation.getLatitude() >= -90 && geolocation.getLatitude() < 90 && geolocation.getLongitude() >= -180 && geolocation.getLongitude() < 180;
        }
    }).foreachRDD(new VoidFunction<JavaRDD<GeoLocation>>() { // iterate over each micro-batch RDD

        private static final long serialVersionUID = -4161320579495422870L;

        @Override
        public void call(JavaRDD<GeoLocation> rdd) throws Exception {
            rdd.foreach(new VoidFunction<GeoLocation>() { // send valid geolocations to another topic

                private static final long serialVersionUID = -3282778715126743482L;

                @Override
                public void call(GeoLocation geolocation) throws Exception {
                    ProducerRecord<String, String> record = new ProducerRecord<>("geolocations", geolocation.toString());
                    getProducer().send(record);
                }
            });
        }
    });
    context.start();
    context.awaitTermination();
}
Also used : HashMap(java.util.HashMap) Gson(com.google.gson.Gson) Duration(org.apache.spark.streaming.Duration) ConsumerRecord(org.apache.kafka.clients.consumer.ConsumerRecord) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) ProducerRecord(org.apache.kafka.clients.producer.ProducerRecord) SparkConf(org.apache.spark.SparkConf)
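Since org.apache.spark.api.java.function.Function and VoidFunction are single-method interfaces, the same pipeline can also be written with Java 8 lambdas. A minimal sketch, assuming the same dstream, the GeoLocation class, and the getProducer() helper used above:

dstream
    // map each Kafka record's JSON payload to a GeoLocation
    .map(record -> new Gson().fromJson(record.value(), GeoLocation.class))
    // keep only coordinates inside the valid latitude/longitude ranges
    .filter(geolocation -> geolocation.getLatitude() >= -90 && geolocation.getLatitude() < 90
        && geolocation.getLongitude() >= -180 && geolocation.getLongitude() < 180)
    // forward each valid geolocation to the "geolocations" topic
    .foreachRDD(rdd -> rdd.foreach(geolocation ->
        getProducer().send(new ProducerRecord<>("geolocations", geolocation.toString()))));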

Example 28 with Function

use of org.apache.spark.api.java.function.Function in project incubator-sdap-mudrod by apache.

the class SimilarityUtil method calculateSimilarityFromVector.

/**
 * Calculate term similarity from vector.
 *
 * @param importRDD the {@link org.apache.spark.api.java.JavaPairRDD}
 *                  data structure containing the vectors.
 * @param simType   the similarity calculation to execute e.g.
 * <ul>
 * <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_COSINE} - 3,</li>
 * <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_HELLINGER} - 2,</li>
 * <li>{@link org.apache.sdap.mudrod.utils.SimilarityUtil#SIM_PEARSON} - 1</li>
 * </ul>
 * @return a new {@link org.apache.spark.api.java.JavaPairRDD}
 */
public static JavaRDD<LinkageTriple> calculateSimilarityFromVector(JavaPairRDD<String, Vector> importRDD, int simType) {
    JavaRDD<Tuple2<String, Vector>> importRDD1 = importRDD.map(f -> new Tuple2<String, Vector>(f._1, f._2));
    JavaPairRDD<Tuple2<String, Vector>, Tuple2<String, Vector>> cartesianRDD = importRDD1.cartesian(importRDD1);
    return cartesianRDD.map(new Function<Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>>, LinkageTriple>() {

        private static final long serialVersionUID = 1L;

        @Override
        public LinkageTriple call(Tuple2<Tuple2<String, Vector>, Tuple2<String, Vector>> arg) {
            String keyA = arg._1._1;
            String keyB = arg._2._1;
            if (keyA.equals(keyB)) {
                return null;
            }
            Vector vecA = arg._1._2;
            Vector vecB = arg._2._2;
            Double weight = 0.0;
            if (simType == SimilarityUtil.SIM_PEARSON) {
                weight = SimilarityUtil.pearsonDistance(vecA, vecB);
            } else if (simType == SimilarityUtil.SIM_HELLINGER) {
                weight = SimilarityUtil.hellingerDistance(vecA, vecB);
            }
            LinkageTriple triple = new LinkageTriple();
            triple.keyA = keyA;
            triple.keyB = keyB;
            triple.weight = weight;
            return triple;
        }
    }).filter(new Function<LinkageTriple, Boolean>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Boolean call(LinkageTriple arg0) throws Exception {
            // drop the null triples emitted above for self-pairs (keyA equals keyB)
            return arg0 != null;
        }
    });
}
Also used : Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) Tuple2(scala.Tuple2) Vector(org.apache.spark.mllib.linalg.Vector)
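A minimal usage sketch for the method above, assuming a local JavaSparkContext, that MUDROD's SimilarityUtil and LinkageTriple are on the classpath, that LinkageTriple exposes the keyA/keyB/weight fields assigned in the snippet, and toy vector values:

import java.util.Arrays;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.mllib.linalg.Vector;
import org.apache.spark.mllib.linalg.Vectors;
import scala.Tuple2;

public class SimilarityDemo {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("similarityDemo").setMaster("local[*]"));
        // term -> term vector (toy data)
        JavaPairRDD<String, Vector> termVectors = jsc.parallelizePairs(Arrays.asList(
            new Tuple2<String, Vector>("ocean", Vectors.dense(0.1, 0.9, 0.3)),
            new Tuple2<String, Vector>("sea", Vectors.dense(0.2, 0.8, 0.4)),
            new Tuple2<String, Vector>("wind", Vectors.dense(0.9, 0.1, 0.0))));
        // pairwise similarity; SIM_PEARSON selects pearsonDistance in the method above
        JavaRDD<LinkageTriple> triples = SimilarityUtil.calculateSimilarityFromVector(termVectors, SimilarityUtil.SIM_PEARSON);
        triples.collect().forEach(t -> System.out.println(t.keyA + " ~ " + t.keyB + " : " + t.weight));
        jsc.stop();
    }
}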

Example 29 with Function

use of org.apache.spark.api.java.function.Function in project rocketmq-externals by apache.

the class RocketMqUtilsTest method testGetOffsets.

@Test
public void testGetOffsets() throws MQBrokerException, MQClientException, InterruptedException, UnsupportedEncodingException {
    Map<String, String> optionParams = new HashMap<>();
    optionParams.put(RocketMQConfig.NAME_SERVER_ADDR, NAME_SERVER);
    SparkConf sparkConf = new SparkConf().setAppName("JavaCustomReceiver").setMaster("local[*]");
    JavaStreamingContext sc = new JavaStreamingContext(sparkConf, new Duration(1000));
    List<String> topics = new ArrayList<>();
    topics.add(TOPIC_DEFAULT);
    LocationStrategy locationStrategy = LocationStrategy.PreferConsistent();
    JavaInputDStream<MessageExt> dStream = RocketMqUtils.createJavaMQPullStream(sc, UUID.randomUUID().toString(), topics, ConsumerStrategy.earliest(), false, false, false, locationStrategy, optionParams);
    // hold a reference to the current offset ranges, so it can be used downstream
    final AtomicReference<Map<TopicQueueId, OffsetRange[]>> offsetRanges = new AtomicReference<>();
    final Set<MessageExt> result = Collections.synchronizedSet(new HashSet<MessageExt>());
    dStream.transform(new Function<JavaRDD<MessageExt>, JavaRDD<MessageExt>>() {

        @Override
        public JavaRDD<MessageExt> call(JavaRDD<MessageExt> v1) throws Exception {
            Map<TopicQueueId, OffsetRange[]> offsets = ((HasOffsetRanges) v1.rdd()).offsetRanges();
            offsetRanges.set(offsets);
            return v1;
        }
    }).foreachRDD(new VoidFunction<JavaRDD<MessageExt>>() {

        @Override
        public void call(JavaRDD<MessageExt> messageExtJavaRDD) throws Exception {
            result.addAll(messageExtJavaRDD.collect());
        }
    });
    sc.start();
    long startTime = System.currentTimeMillis();
    boolean matches = false;
    while (!matches && System.currentTimeMillis() - startTime < 10000) {
        matches = MESSAGE_NUM == result.size();
        Thread.sleep(50);
    }
    sc.stop();
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HasOffsetRanges(org.apache.rocketmq.spark.HasOffsetRanges) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) Duration(org.apache.spark.streaming.Duration) AtomicReference(java.util.concurrent.atomic.AtomicReference) MQClientException(org.apache.rocketmq.client.exception.MQClientException) MQBrokerException(org.apache.rocketmq.client.exception.MQBrokerException) UnsupportedEncodingException(java.io.UnsupportedEncodingException) JavaRDD(org.apache.spark.api.java.JavaRDD) OffsetRange(org.apache.rocketmq.spark.OffsetRange) MessageExt(org.apache.rocketmq.common.message.MessageExt) LocationStrategy(org.apache.rocketmq.spark.LocationStrategy) SparkConf(org.apache.spark.SparkConf) Map(java.util.Map) TopicQueueId(org.apache.rocketmq.spark.TopicQueueId) Test(org.junit.Test)
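The offsetRanges reference captured in the transform(...) step could also be inspected per batch. A minimal sketch of an extra output action registered before sc.start(), assuming the same offsetRanges variable and an additional java.util.Arrays import; the ranges are only printed via their toString() here:

    dStream.foreachRDD(new VoidFunction<JavaRDD<MessageExt>>() {

        @Override
        public void call(JavaRDD<MessageExt> rdd) throws Exception {
            // offsetRanges was populated by the transform(...) step for this batch
            Map<TopicQueueId, OffsetRange[]> ranges = offsetRanges.get();
            if (ranges != null) {
                for (Map.Entry<TopicQueueId, OffsetRange[]> entry : ranges.entrySet()) {
                    System.out.println(entry.getKey() + " -> " + Arrays.toString(entry.getValue()));
                }
            }
        }
    });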

Example 30 with Function

use of org.apache.spark.api.java.function.Function in project azure-tools-for-java by Microsoft.

the class JavaSparkPi method main.

public static void main(String[] args) throws Exception {
    // use this line if you want to run your application in the cluster
    // SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi");
    SparkConf sparkConf = new SparkConf().setAppName("JavaSparkPi").setMaster("local[*]");
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int slices = (args.length == 1) ? Integer.parseInt(args[0]) : 2;
    int n = 100000 * slices;
    List<Integer> l = new ArrayList<Integer>(n);
    for (int i = 0; i < n; i++) {
        l.add(i);
    }
    JavaRDD<Integer> dataSet = jsc.parallelize(l, slices);
    int count = dataSet.map(new Function<Integer, Integer>() {

        @Override
        public Integer call(Integer integer) {
            // the input element is ignored; each call samples one random point in the unit square
            double x = Math.random() * 2 - 1;
            double y = Math.random() * 2 - 1;
            return (x * x + y * y < 1) ? 1 : 0;
        }
    }).reduce(new Function2<Integer, Integer, Integer>() {

        @Override
        public Integer call(Integer integer, Integer integer2) {
            return integer + integer2;
        }
    });
    System.out.println("Pi is roughly " + 4.0 * count / n);
    jsc.stop();
}
Also used : Function(org.apache.spark.api.java.function.Function) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SparkConf(org.apache.spark.SparkConf)
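The same Monte Carlo estimate can be written with Java 8 lambdas in place of the anonymous Function and Function2 classes. A minimal sketch, assuming the same dataSet RDD and count n from the method above:

    int count = dataSet.map(i -> {
        // sample one random point in the unit square; count it if it falls inside the unit circle
        double x = Math.random() * 2 - 1;
        double y = Math.random() * 2 - 1;
        return (x * x + y * y < 1) ? 1 : 0;
    }).reduce((a, b) -> a + b);
    System.out.println("Pi is roughly " + 4.0 * count / n);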

Aggregations

Function (org.apache.spark.api.java.function.Function): 30
Tuple2 (scala.Tuple2): 17
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 15
ArrayList (java.util.ArrayList): 11
PairFunction (org.apache.spark.api.java.function.PairFunction): 9
JavaRDD (org.apache.spark.api.java.JavaRDD): 8
List (java.util.List): 7
SparkConf (org.apache.spark.SparkConf): 6
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 6
VoidFunction (org.apache.spark.api.java.function.VoidFunction): 5
JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext): 5
IOException (java.io.IOException): 4
FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction): 4
WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue): 3
VariantContext (htsjdk.variant.variantcontext.VariantContext): 3
HashMap (java.util.HashMap): 3
Map (java.util.Map): 3
Collectors (java.util.stream.Collectors): 3
Function2 (org.apache.spark.api.java.function.Function2): 3
PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction): 3