Search in sources :

Example 31 with Tuple2

use of scala.Tuple2 in project flink by apache.

the class JobSubmitTest method setupJobManager.

/**
 * One-time fixture for the test class: builds the JobManager configuration, starts an
 * actor system bound to {@code localhost} on a free port, launches the JobManager actors
 * (no ResourceManager) and resolves the JobManager gateway into {@code jmGateway}.
 *
 * @throws AssertionError if the JobManager gateway cannot be retrieved; the original
 *         exception is attached as the cause
 */
@BeforeClass
public static void setupJobManager() {
    jmConfig = new Configuration();
    int port = NetUtils.getAvailablePort();
    jmConfig.setString(ConfigConstants.JOB_MANAGER_IPC_ADDRESS_KEY, "localhost");
    jmConfig.setInteger(ConfigConstants.JOB_MANAGER_IPC_PORT_KEY, port);
    // The actor system listens on the same host/port that was just written into the config.
    scala.Option<Tuple2<String, Object>> listeningAddress = scala.Option.apply(new Tuple2<String, Object>("localhost", port));
    jobManagerSystem = AkkaUtils.createActorSystem(jmConfig, listeningAddress);
    // only start JobManager (no ResourceManager)
    JobManager.startJobManagerActors(jmConfig, jobManagerSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), JobManager.class, MemoryArchivist.class)._1();
    try {
        LeaderRetrievalService lrs = LeaderRetrievalUtils.createLeaderRetrievalService(jmConfig);
        jmGateway = LeaderRetrievalUtils.retrieveLeaderGateway(lrs, jobManagerSystem, timeout);
    } catch (Exception e) {
        // Same message as before, but the cause is chained so the original stack trace
        // shows up in the test report (fail(String) cannot carry a cause).
        throw new AssertionError("Could not retrieve the JobManager gateway. " + e.getMessage(), e);
    }
}
Also used : Configuration(org.apache.flink.configuration.Configuration) Tuple2(scala.Tuple2) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) IOException(java.io.IOException) BeforeClass(org.junit.BeforeClass)

Example 32 with Tuple2

use of scala.Tuple2 in project learning-spark by databricks.

the class LogAnalyzerTotal method processAccessLogs.

/**
 * Wires up the streaming computations over the access-log stream: running content-size
 * statistics, cumulative response-code counts, per-IP counts, a Hadoop sequence-file dump
 * of the per-IP pairs, the filtered IP list and the top endpoints.
 *
 * <p>Results are published by assigning to fields of the enclosing class
 * ({@code runningCount}, {@code runningSum}, {@code runningMin}, {@code runningMax},
 * {@code currentResponseCodeCounts}, {@code currentTopEndpoints}).
 *
 * @param outDir            prefix passed to {@code saveAsHadoopFiles} for the per-IP output
 * @param accessLogsDStream stream of parsed access-log entries
 */
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    // Calculate statistics based on the content size, and update the running totals to track this.
    accessLogsDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> stats = Functions.contentSizeStats(accessLogs);
            // A null result is skipped so the running values stay untouched for that batch.
            if (stats != null) {
                runningCount.getAndAdd(stats._1());
                runningSum.getAndAdd(stats._2());
                runningMin.set(Math.min(runningMin.get(), stats._3()));
                runningMax.set(Math.max(runningMax.get(), stats._4()));
            }
            return null;
        }
    });
    // A DStream of Response Code Counts, accumulated across batches via updateStateByKey.
    JavaPairDStream<Integer, Long> responseCodeCountDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {

        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.responseCodeCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    responseCodeCountDStream.foreachRDD(new Function<JavaPairRDD<Integer, Long>, Void>() {

        public Void call(JavaPairRDD<Integer, Long> rdd) {
            currentResponseCodeCounts = rdd.take(100);
            return null;
        }
    });
    // A DStream of ipAddressCounts.
    JavaPairDStream<String, Long> ipRawDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {

        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.ipAddressCount(rdd);
        }
    });
    JavaPairDStream<String, Long> ipCumDStream = ipRawDStream.updateStateByKey(new Functions.ComputeRunningSum());
    // A DStream of ipAddressCounts without transform
    JavaPairDStream<String, Long> ipDStream = accessLogsDStream.mapToPair(new Functions.IpTuple());
    JavaPairDStream<String, Long> ipCountsDStream = ipDStream.reduceByKey(new Functions.LongSumReducer());
    // and joining it with the transfer amount
    JavaPairDStream<String, Long> ipBytesDStream = accessLogsDStream.mapToPair(new Functions.IpContentTuple());
    JavaPairDStream<String, Long> ipBytesSumDStream = ipBytesDStream.reduceByKey(new Functions.LongSumReducer());
    // NOTE(review): the joined stream is never consumed in this method - confirm it is only illustrative.
    JavaPairDStream<String, Tuple2<Long, Long>> ipBytesRequestCountDStream = ipBytesSumDStream.join(ipCountsDStream);
    // Save our dstream of ip address request counts, converted to Hadoop writable types first.
    JavaPairDStream<Text, LongWritable> writableDStream = ipDStream.mapToPair(new PairFunction<Tuple2<String, Long>, Text, LongWritable>() {

        public Tuple2<Text, LongWritable> call(Tuple2<String, Long> e) {
            // Explicit type arguments instead of a raw Tuple2, so no unchecked conversion is needed.
            return new Tuple2<Text, LongWritable>(new Text(e._1()), new LongWritable(e._2()));
        }
    });
    // Concrete (non-generic) subclass so a plain Class object can be handed to saveAsHadoopFiles.
    class OutFormat extends SequenceFileOutputFormat<Text, LongWritable> {
    }
    writableDStream.saveAsHadoopFiles(outDir, "pandas", Text.class, LongWritable.class, OutFormat.class);
    // All ips more than 10
    JavaDStream<String> ipAddressDStream = ipCumDStream.transform(new Function<JavaPairRDD<String, Long>, JavaRDD<String>>() {

        public JavaRDD<String> call(JavaPairRDD<String, Long> rdd) {
            return Functions.filterIPAddress(rdd);
        }
    });
    ipAddressDStream.foreachRDD(new Function<JavaRDD<String>, Void>() {

        public Void call(JavaRDD<String> rdd) {
            // NOTE(review): the result is only held in a local and then dropped; the take(100)
            // call itself is kept because it is what forces evaluation of this stream.
            List<String> currentIPAddresses = rdd.take(100);
            return null;
        }
    });
    // A DStream of endpoint to count.
    JavaPairDStream<String, Long> endpointCountsDStream = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<String, Long>>() {

        public JavaPairRDD<String, Long> call(JavaRDD<ApacheAccessLog> rdd) {
            return Functions.endpointCount(rdd);
        }
    }).updateStateByKey(new Functions.ComputeRunningSum());
    // The natural ordering is routed through Object so it can be cast to Comparator<Long>.
    Object ordering = Ordering.natural();
    final Comparator<Long> cmp = (Comparator<Long>) ordering;
    endpointCountsDStream.foreachRDD(new Function<JavaPairRDD<String, Long>, Void>() {

        public Void call(JavaPairRDD<String, Long> rdd) {
            currentTopEndpoints = rdd.takeOrdered(10, new Functions.ValueComparator<String, Long>(cmp));
            return null;
        }
    });
}
Also used : SequenceFileOutputFormat(org.apache.hadoop.mapred.SequenceFileOutputFormat) Comparator(java.util.Comparator) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) List(java.util.List) LongWritable(org.apache.hadoop.io.LongWritable) Text(org.apache.hadoop.io.Text) JavaRDD(org.apache.spark.api.java.JavaRDD) Tuple4(scala.Tuple4) Tuple2(scala.Tuple2) AtomicLong(java.util.concurrent.atomic.AtomicLong)

Example 33 with Tuple2

use of scala.Tuple2 in project learning-spark by databricks.

the class LogAnalyzerWindowed method processAccessLogs.

/**
 * Sets up the windowed computations over the access-log stream: request counts via
 * {@code reduceByWindow} and {@code countByWindow}, per-IP counts via
 * {@code reduceByKeyAndWindow} and {@code countByValueAndWindow}, and a per-window
 * statistics snapshot stored in {@code logStatistics}.
 *
 * <p>Window length and slide interval are read from {@code Flags.getInstance()}.
 *
 * @param outDir            not used by this method
 * @param accessLogsDStream stream of parsed access-log entries
 */
public void processAccessLogs(String outDir, JavaDStream<ApacheAccessLog> accessLogsDStream) {
    JavaDStream<ApacheAccessLog> windowDStream = accessLogsDStream.window(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaDStream<String> ip = accessLogsDStream.map(new Function<ApacheAccessLog, String>() {

        public String call(ApacheAccessLog entry) {
            return entry.getIpAddress();
        }
    });
    // reduceByWindow: map every entry to 1, then add values entering the window and
    // subtract values leaving it.
    JavaDStream<Long> requestCountRBW = accessLogsDStream.map(new Function<ApacheAccessLog, Long>() {

        public Long call(ApacheAccessLog entry) {
            return 1L;
        }
    }).reduceByWindow(new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    }, new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCountRBW.print();
    // reduceByKeyAndWindow
    JavaPairDStream<String, Long> ipAddressPairDStream = accessLogsDStream.mapToPair(new PairFunction<ApacheAccessLog, String, Long>() {

        public Tuple2<String, Long> call(ApacheAccessLog entry) {
            // Explicit type arguments instead of a raw Tuple2, so no unchecked conversion is needed.
            return new Tuple2<String, Long>(entry.getIpAddress(), 1L);
        }
    });
    JavaPairDStream<String, Long> ipCountDStream = ipAddressPairDStream.reduceByKeyAndWindow(
    // Adding elements in the new slice
    new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 + v2;
        }
    },
    // Removing elements from the oldest slice
    new Function2<Long, Long, Long>() {

        public Long call(Long v1, Long v2) {
            return v1 - v2;
        }
    }, Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    ipCountDStream.print();
    // Use countByWindow
    JavaDStream<Long> requestCount = accessLogsDStream.countByWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    JavaPairDStream<String, Long> ipAddressRequestCount = ip.countByValueAndWindow(Flags.getInstance().getWindowLength(), Flags.getInstance().getSlideInterval());
    requestCount.print();
    ipAddressRequestCount.print();
    // use a transform for the response code count
    // NOTE(review): this stream is never consumed in this method - confirm it is only illustrative.
    JavaPairDStream<Integer, Long> responseCodeCountTransform = accessLogsDStream.transformToPair(new Function<JavaRDD<ApacheAccessLog>, JavaPairRDD<Integer, Long>>() {

        public JavaPairRDD<Integer, Long> call(JavaRDD<ApacheAccessLog> logs) {
            return Functions.responseCodeCount(logs);
        }
    });
    // For every windowed RDD, recompute all statistics and publish them as one snapshot.
    windowDStream.foreachRDD(new Function<JavaRDD<ApacheAccessLog>, Void>() {

        public Void call(JavaRDD<ApacheAccessLog> accessLogs) {
            Tuple4<Long, Long, Long, Long> contentSizeStats = Functions.contentSizeStats(accessLogs);
            List<Tuple2<Integer, Long>> responseCodeToCount = Functions.responseCodeCount(accessLogs).take(100);
            JavaPairRDD<String, Long> ipAddressCounts = Functions.ipAddressCount(accessLogs);
            List<String> ip = Functions.filterIPAddress(ipAddressCounts).take(100);
            // The natural ordering is routed through Object so it can be cast to Comparator<Long>.
            Object ordering = Ordering.natural();
            Comparator<Long> cmp = (Comparator<Long>) ordering;
            List<Tuple2<String, Long>> topEndpoints = Functions.endpointCount(accessLogs).top(10, new Functions.ValueComparator<String, Long>(cmp));
            logStatistics = new LogStatistics(contentSizeStats, responseCodeToCount, ip, topEndpoints);
            return null;
        }
    });
}
Also used : Comparator(java.util.Comparator) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) List(java.util.List) JavaRDD(org.apache.spark.api.java.JavaRDD) Tuple4(scala.Tuple4) Tuple2(scala.Tuple2)

Example 34 with Tuple2

use of scala.Tuple2 in project learning-spark by databricks.

the class IntersectByKey method main.

/**
 * Builds two small pair RDDs, intersects them by key with {@code intersectByKey} and prints
 * each resulting {@code key:value} pair followed by {@code Done}.
 *
 * @param args optional; {@code args[0]} is the Spark master, defaulting to {@code "local"}
 * @throws Exception declared for compatibility with the original signature
 */
public static void main(String[] args) throws Exception {
    String master;
    if (args.length > 0) {
        master = args[0];
    } else {
        master = "local";
    }
    JavaSparkContext sc = new JavaSparkContext(master, "IntersectByKey", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    try {
        // Fully parameterized collections and tuples: no raw types, no unchecked warnings.
        List<Tuple2<String, Integer>> input1 = new ArrayList<Tuple2<String, Integer>>();
        input1.add(new Tuple2<String, Integer>("coffee", 1));
        input1.add(new Tuple2<String, Integer>("coffee", 2));
        input1.add(new Tuple2<String, Integer>("pandas", 3));
        List<Tuple2<String, Integer>> input2 = new ArrayList<Tuple2<String, Integer>>();
        input2.add(new Tuple2<String, Integer>("pandas", 20));
        JavaPairRDD<String, Integer> rdd1 = sc.parallelizePairs(input1);
        JavaPairRDD<String, Integer> rdd2 = sc.parallelizePairs(input2);
        JavaPairRDD<String, Integer> result = intersectByKey(rdd1, rdd2);
        for (Tuple2<String, Integer> entry : result.collect()) {
            System.out.println(entry._1() + ":" + entry._2());
        }
        System.out.println("Done");
    } finally {
        // Stop the context even when the job throws, so the driver does not leak it.
        sc.stop();
    }
}
Also used : Tuple2(scala.Tuple2) ArrayList(java.util.ArrayList) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)

Example 35 with Tuple2

use of scala.Tuple2 in project learning-spark by databricks.

the class KeyValueMapFilter method main.

/**
 * Reads a text file, keys every line by its first space-separated token, keeps the pairs
 * whose line is shorter than 20 characters and prints the surviving {@code key:value} pairs.
 *
 * @param args {@code args[0]} is the Spark master, {@code args[1]} the input file
 * @throws Exception if the argument count is not exactly two
 */
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage KeyValueMapFilter sparkMaster inputFile");
    }
    String master = args[0];
    String inputFile = args[1];
    JavaSparkContext sc = new JavaSparkContext(master, "KeyValueMapFilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    try {
        JavaRDD<String> input = sc.textFile(inputFile);
        // Key each line by the text before its first space; the value is the whole line.
        PairFunction<String, String, String> keyData = new PairFunction<String, String, String>() {

            @Override
            public Tuple2<String, String> call(String x) {
                return new Tuple2<String, String>(x.split(" ")[0], x);
            }
        };
        // NOTE(review): despite the name, this keeps pairs whose value is SHORTER than 20
        // characters - confirm the intended direction.
        Function<Tuple2<String, String>, Boolean> longWordFilter = new Function<Tuple2<String, String>, Boolean>() {

            @Override
            public Boolean call(Tuple2<String, String> pair) {
                return (pair._2().length() < 20);
            }
        };
        JavaPairRDD<String, String> rdd = input.mapToPair(keyData);
        JavaPairRDD<String, String> result = rdd.filter(longWordFilter);
        Map<String, String> resultMap = result.collectAsMap();
        for (Entry<String, String> entry : resultMap.entrySet()) {
            System.out.println(entry.getKey() + ":" + entry.getValue());
        }
    } finally {
        // Stop the context on every exit path so the driver does not leak it.
        sc.stop();
    }
}
Also used : Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)

Aggregations

Tuple2 (scala.Tuple2)183 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)57 ArrayList (java.util.ArrayList)44 IOException (java.io.IOException)32 Test (org.junit.Test)32 INDArray (org.nd4j.linalg.api.ndarray.INDArray)28 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)23 List (java.util.List)22 Function (org.apache.spark.api.java.function.Function)19 File (java.io.File)18 Collectors (java.util.stream.Collectors)18 MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock)18 MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes)18 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)18 Configuration (org.apache.hadoop.conf.Configuration)17 UserException (org.broadinstitute.hellbender.exceptions.UserException)17 Broadcast (org.apache.spark.broadcast.Broadcast)16 SparkConf (org.apache.spark.SparkConf)15 JavaRDD (org.apache.spark.api.java.JavaRDD)15 VisibleForTesting (com.google.common.annotations.VisibleForTesting)14