
Example 1 with Function2

use of org.apache.spark.api.java.function.Function2 in project cdap by caskdata.

the class SparkLogParser method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    Map<String, String> runtimeArguments = sec.getRuntimeArguments();
    String inputFileSet = runtimeArguments.get("input");
    final String outputTable = runtimeArguments.get("output");
    JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
    final JavaPairRDD<String, String> aggregated = input.mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {

        @Override
        public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
            return SparkAppUsingGetDataset.parse(input._2());
        }
    }).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {

        @Override
        public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
            return stats1.aggregate(stats2);
        }
    }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {

        @Override
        public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
            final Gson gson = new Gson();
            return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {

                @Override
                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                    return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                }
            }));
        }
    });
    // Collect all data to the driver and write to the dataset directly. That's the intent of the test.
    sec.execute(new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable kvTable = context.getDataset(outputTable);
            for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
                kvTable.write(entry.getKey(), entry.getValue());
            }
        }
    });
}
Also used : Gson(com.google.gson.Gson) TxRunnable(co.cask.cdap.api.TxRunnable) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) DatasetContext(co.cask.cdap.api.data.DatasetContext) LogKey(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogKey) Text(org.apache.hadoop.io.Text) Function2(org.apache.spark.api.java.function.Function2) LogStats(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogStats) Tuple2(scala.Tuple2) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable)
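
A minimal, self-contained sketch of the same reduceByKey pattern, boiled down to a word count. The class name, input data, and local master setting are illustrative assumptions, not part of the CDAP example:

import java.util.Arrays;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;

import scala.Tuple2;

public class ReduceByKeySketch {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("sketch").setMaster("local[*]"));
        // Pair each word with a count of 1, then merge the counts per key with a Function2.
        JavaPairRDD<String, Integer> counts = jsc.parallelize(Arrays.asList("a", "b", "a")).mapToPair(new PairFunction<String, String, Integer>() {

            @Override
            public Tuple2<String, Integer> call(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        }).reduceByKey(new Function2<Integer, Integer, Integer>() {

            @Override
            public Integer call(Integer v1, Integer v2) throws Exception {
                return v1 + v2;
            }
        });
        // Expected output: {a=2, b=1}
        System.out.println(counts.collectAsMap());
        jsc.stop();
    }
}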

Example 2 with Function2

use of org.apache.spark.api.java.function.Function2 in project cdap by caskdata.

the class ClassicSparkProgram method main.

public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());
    Schema schema = Schema.recordOf("record", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    List<StructuredRecord> records = new ArrayList<>();
    for (int i = 1; i <= 10; i++) {
        records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
    }
    // This tests serialization of StructuredRecord as well as the use of a custom Kryo serializer
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int result = jsc.parallelize(records).mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {

        @Override
        public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
            return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
        }
    }).map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {

        @Override
        public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
            return tuple._1;
        }
    }).reduce(new Function2<MyInt, MyInt, MyInt>() {

        @Override
        public MyInt call(MyInt v1, MyInt v2) throws Exception {
            return new MyInt(v1.toInt() + v2.toInt());
        }
    }).toInt();
    if (result != 55) {
        throw new Exception("Expected result to be 55");
    }
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) Function2(org.apache.spark.api.java.function.Function2) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
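
MyKryoRegistrator itself is not shown on this page. A Spark Kryo registrator implements org.apache.spark.serializer.KryoRegistrator and registers classes inside registerClasses(Kryo); the sketch below is a hypothetical version, and the classes it registers are assumptions rather than the actual cdap implementation:

import com.esotericsoftware.kryo.Kryo;

import org.apache.spark.serializer.KryoRegistrator;

import co.cask.cdap.api.data.format.StructuredRecord;

// Hypothetical registrator: the real MyKryoRegistrator in cdap may register
// different classes or attach custom serializers.
public class MyKryoRegistrator implements KryoRegistrator {

    @Override
    public void registerClasses(Kryo kryo) {
        kryo.register(StructuredRecord.class);
        // MyInt is the wrapper class used in the example above; its package is assumed here.
        // kryo.register(MyInt.class);
    }
}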

Example 3 with Function2

use of org.apache.spark.api.java.function.Function2 in project incubator-sdap-mudrod by apache.

the class CrawlerDetection method checkByRateInParallel.

void checkByRateInParallel() throws InterruptedException, IOException {
    JavaRDD<String> userRDD = getUserRDD(this.httpType);
    LOG.info("Original User count: {}", userRDD.count());
    int userCount = 0;
    userCount = userRDD.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
        ESDriver tmpES = new ESDriver(props);
        tmpES.createBulkProcessor();
        List<Integer> realUserNums = new ArrayList<>();
        while (iterator.hasNext()) {
            String s = iterator.next();
            Integer realUser = checkByRate(tmpES, s);
            realUserNums.add(realUser);
        }
        tmpES.destroyBulkProcessor();
        tmpES.close();
        return realUserNums.iterator();
    }).reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);
    LOG.info("User count: {}", Integer.toString(userCount));
}
Also used : java.util(java.util) Function2(org.apache.spark.api.java.function.Function2) AggregationBuilder(org.elasticsearch.search.aggregations.AggregationBuilder) MudrodConstants(org.apache.sdap.mudrod.main.MudrodConstants) DiscoveryStepAbstract(org.apache.sdap.mudrod.discoveryengine.DiscoveryStepAbstract) Histogram(org.elasticsearch.search.aggregations.bucket.histogram.Histogram) LoggerFactory(org.slf4j.LoggerFactory) QueryBuilders(org.elasticsearch.index.query.QueryBuilders) IndexRequest(org.elasticsearch.action.index.IndexRequest) Matcher(java.util.regex.Matcher) Seconds(org.joda.time.Seconds) TimeValue(org.elasticsearch.common.unit.TimeValue) SearchResponse(org.elasticsearch.action.search.SearchResponse) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) DateHistogramInterval(org.elasticsearch.search.aggregations.bucket.histogram.DateHistogramInterval) SearchHit(org.elasticsearch.search.SearchHit) ISODateTimeFormat(org.joda.time.format.ISODateTimeFormat) Logger(org.slf4j.Logger) DateTimeFormatter(org.joda.time.format.DateTimeFormatter) Terms(org.elasticsearch.search.aggregations.bucket.terms.Terms) DateTime(org.joda.time.DateTime) AggregationBuilders(org.elasticsearch.search.aggregations.AggregationBuilders) IOException(java.io.IOException) ESDriver(org.apache.sdap.mudrod.driver.ESDriver) SparkDriver(org.apache.sdap.mudrod.driver.SparkDriver) Pattern(java.util.regex.Pattern) BoolQueryBuilder(org.elasticsearch.index.query.BoolQueryBuilder) Order(org.elasticsearch.search.aggregations.bucket.histogram.Histogram.Order)
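
Stripped of the Elasticsearch details, the structure above is: create one expensive resource per partition inside mapPartitions, emit one number per element, then sum the per-partition numbers with a Function2. A hypothetical standalone sketch of that structure (class name, data, and local master are illustrative):

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function2;

public class MapPartitionsReduceSketch {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("sketch").setMaster("local[*]"));
        JavaRDD<String> users = jsc.parallelize(Arrays.asList("u1", "u2", "u3", "u4"));
        int count = users.mapPartitions((FlatMapFunction<Iterator<String>, Integer>) iterator -> {
            // An expensive resource (such as the ESDriver above) would be created here, once per partition.
            List<Integer> results = new ArrayList<>();
            while (iterator.hasNext()) {
                iterator.next();
                // Stand-in for a per-element check such as checkByRate(tmpES, s).
                results.add(1);
            }
            return results.iterator();
        }).reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);
        // Expected output: 4
        System.out.println(count);
        jsc.stop();
    }
}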

Example 4 with Function2

use of org.apache.spark.api.java.function.Function2 in project incubator-sdap-mudrod by apache.

the class SessionGenerator method genSessionByRefererInParallel.

public void genSessionByRefererInParallel(int timeThres) throws InterruptedException, IOException {
    JavaRDD<String> userRDD = getUserRDD(this.cleanupType);
    int sessionCount = 0;
    sessionCount = userRDD.mapPartitions(new FlatMapFunction<Iterator<String>, Integer>() {

        /**
         * Serial version UID; Spark function objects must be Serializable.
         */
        private static final long serialVersionUID = 1L;

        @Override
        public Iterator<Integer> call(Iterator<String> arg0) throws Exception {
            ESDriver tmpES = new ESDriver(props);
            tmpES.createBulkProcessor();
            List<Integer> sessionNums = new ArrayList<>();
            while (arg0.hasNext()) {
                String s = arg0.next();
                Integer sessionNum = genSessionByReferer(tmpES, s, timeThres);
                sessionNums.add(sessionNum);
            }
            tmpES.destroyBulkProcessor();
            tmpES.close();
            return sessionNums.iterator();
        }
    }).reduce(new Function2<Integer, Integer, Integer>() {

        /**
         * Serial version UID; Spark function objects must be Serializable.
         */
        private static final long serialVersionUID = 1L;

        @Override
        public Integer call(Integer a, Integer b) {
            return a + b;
        }
    });
    LOG.info("Initial Session count: {}", Integer.toString(sessionCount));
}
Also used : ESDriver(org.apache.sdap.mudrod.driver.ESDriver) Function2(org.apache.spark.api.java.function.Function2) ElasticsearchException(org.elasticsearch.ElasticsearchException) IOException(java.io.IOException)
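
Function2 is a single-abstract-method interface, so on Java 8+ the anonymous reducer above can also be written as a lambda (as in the previous example) or as a method reference. A small hypothetical sketch of both forms:

import java.util.Arrays;

import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class LambdaReduceSketch {

    public static void main(String[] args) {
        JavaSparkContext jsc = new JavaSparkContext("local[*]", "sketch");
        // Lambda form of the sum reducer.
        Integer viaLambda = jsc.parallelize(Arrays.asList(1, 2, 3)).reduce((Function2<Integer, Integer, Integer>) (a, b) -> a + b);
        // Method-reference form of the same reducer.
        Integer viaMethodRef = jsc.parallelize(Arrays.asList(1, 2, 3)).reduce(Integer::sum);
        // Both print 6.
        System.out.println(viaLambda + " " + viaMethodRef);
        jsc.stop();
    }
}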

Example 5 with Function2

use of org.apache.spark.api.java.function.Function2 in project java_study by aloyschen.

the class RDD method fold_ep.

/**
 * Spark fold example.
 */
public void fold_ep() {
    JavaSparkContext sc = getSc();
    sc.setLogLevel("ERROR");
    JavaRDD<Integer> fold_ep = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
    Integer sum = fold_ep.fold(0, (Function2<Integer, Integer, Integer>) (v1, v2) -> v1 + v2);
    System.out.println("The sum is " + sum);
}
Also used : Arrays(java.util.Arrays) Function2(org.apache.spark.api.java.function.Function2) Serializable(scala.Serializable) SparkConf(org.apache.spark.SparkConf) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction)
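
fold differs from reduce only in taking a zero value that must be neutral for the operation (0 for addition); the zero is applied within each partition and again when merging partition results, and the same Function2 works with both methods. A small hypothetical sketch of the comparison:

import java.util.Arrays;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;

public class FoldVsReduceSketch {

    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "sketch");
        JavaRDD<Integer> nums = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5));
        Function2<Integer, Integer, Integer> sum = (v1, v2) -> v1 + v2;
        // fold needs a neutral zero element; reduce starts from the elements themselves.
        // Both print 15.
        System.out.println(nums.fold(0, sum));
        System.out.println(nums.reduce(sum));
        sc.stop();
    }
}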

Aggregations

Function2 (org.apache.spark.api.java.function.Function2) 13
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 10
Tuple2 (scala.Tuple2) 7
ArrayList (java.util.ArrayList) 6
SparkConf (org.apache.spark.SparkConf) 5
FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction) 5
Function (org.apache.spark.api.java.function.Function) 4
PairFunction (org.apache.spark.api.java.function.PairFunction) 4
IOException (java.io.IOException) 3
Map (java.util.Map) 3
ESDriver (org.apache.sdap.mudrod.driver.ESDriver) 3
JavaRDD (org.apache.spark.api.java.JavaRDD) 3
Arrays (java.util.Arrays) 2
Iterator (java.util.Iterator) 2
List (java.util.List) 2
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD) 2
TxRunnable (co.cask.cdap.api.TxRunnable) 1
DatasetContext (co.cask.cdap.api.data.DatasetContext) 1
StructuredRecord (co.cask.cdap.api.data.format.StructuredRecord) 1
Schema (co.cask.cdap.api.data.schema.Schema) 1