
Example 91 with Tuple2

use of scala.Tuple2 in project cdap by caskdata.

the class ClassicSparkProgram method main.

public static void main(String[] args) throws Exception {
    SparkConf sparkConf = new SparkConf();
    sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    sparkConf.set("spark.kryo.registrator", MyKryoRegistrator.class.getName());
    Schema schema = Schema.recordOf("record", Schema.Field.of("name", Schema.of(Schema.Type.STRING)), Schema.Field.of("id", Schema.of(Schema.Type.INT)));
    List<StructuredRecord> records = new ArrayList<>();
    for (int i = 1; i <= 10; i++) {
        records.add(StructuredRecord.builder(schema).set("name", "Name" + i).set("id", i).build());
    }
    // This tests serialization of StructuredRecord as well as the use of a custom Kryo serializer
    JavaSparkContext jsc = new JavaSparkContext(sparkConf);
    int result = jsc.parallelize(records).mapToPair(new PairFunction<StructuredRecord, MyInt, StructuredRecord>() {

        @Override
        public Tuple2<MyInt, StructuredRecord> call(StructuredRecord record) throws Exception {
            return new Tuple2<>(new MyInt((Integer) record.get("id")), record);
        }
    }).map(new Function<Tuple2<MyInt, StructuredRecord>, MyInt>() {

        @Override
        public MyInt call(Tuple2<MyInt, StructuredRecord> tuple) throws Exception {
            return tuple._1;
        }
    }).reduce(new Function2<MyInt, MyInt, MyInt>() {

        @Override
        public MyInt call(MyInt v1, MyInt v2) throws Exception {
            return new MyInt(v1.toInt() + v2.toInt());
        }
    }).toInt();
    if (result != 55) {
        throw new Exception("Expected result to be 55");
    }
}
Also used : Schema(co.cask.cdap.api.data.schema.Schema) ArrayList(java.util.ArrayList) Function2(org.apache.spark.api.java.function.Function2) StructuredRecord(co.cask.cdap.api.data.format.StructuredRecord) Tuple2(scala.Tuple2) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) PairFunction(org.apache.spark.api.java.function.PairFunction) SparkConf(org.apache.spark.SparkConf)
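
For reference, a minimal self-contained sketch (not part of the cdap project) showing how scala.Tuple2 is created and read from Java code like the example above; both the public _1/_2 fields and the _1()/_2() accessors exist on the Scala class:

import scala.Tuple2;

public class Tuple2Demo {
    public static void main(String[] args) {
        // Construct a pair and read back each element.
        Tuple2<String, Integer> pair = new Tuple2<>("answer", 42);
        String key = pair._1();   // first element; the field pair._1 works as well
        int value = pair._2();    // second element; the field pair._2 works as well
        System.out.println(key + " -> " + value);
    }
}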

Example 92 with Tuple2

use of scala.Tuple2 in project cdap by caskdata.

the class SparkAppUsingGetDataset method parse.

@Nullable
static Tuple2<LogKey, LogStats> parse(Text log) {
    Matcher matcher = CLF_LOG_PATTERN.matcher(log.toString());
    if (matcher.find()) {
        String ip = matcher.group(1);
        String user = matcher.group(3);
        String request = matcher.group(5);
        int code = Integer.parseInt(matcher.group(6));
        int size = Integer.parseInt(matcher.group(7));
        return new Tuple2<>(new LogKey(ip, user, request, code), new LogStats(1, size));
    }
    return null;
}
Also used : Matcher(java.util.regex.Matcher) Tuple2(scala.Tuple2) Nullable(javax.annotation.Nullable)
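
The CLF_LOG_PATTERN constant is defined elsewhere in the class; as a rough, hypothetical illustration of a Common Log Format regex whose group numbering matches the indices read above (the project's actual pattern may differ):

import java.util.regex.Pattern;

// Hypothetical pattern, shown only to make the group indices above readable.
// Groups: 1=ip, 2=identity, 3=user, 4=timestamp, 5=request, 6=status code, 7=response size.
private static final Pattern CLF_LOG_PATTERN =
    Pattern.compile("^([\\d.]+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]*)\" (\\d{3}) (\\d+)");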

Example 93 with Tuple2

use of scala.Tuple2 in project cdap by caskdata.

the class SparkLogParser method run.

@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
    JavaSparkContext jsc = new JavaSparkContext();
    Map<String, String> runtimeArguments = sec.getRuntimeArguments();
    String inputFileSet = runtimeArguments.get("input");
    final String outputTable = runtimeArguments.get("output");
    JavaPairRDD<LongWritable, Text> input = sec.fromDataset(inputFileSet);
    final JavaPairRDD<String, String> aggregated = input.mapToPair(new PairFunction<Tuple2<LongWritable, Text>, LogKey, LogStats>() {

        @Override
        public Tuple2<LogKey, LogStats> call(Tuple2<LongWritable, Text> input) throws Exception {
            return SparkAppUsingGetDataset.parse(input._2());
        }
    }).reduceByKey(new Function2<LogStats, LogStats, LogStats>() {

        @Override
        public LogStats call(LogStats stats1, LogStats stats2) throws Exception {
            return stats1.aggregate(stats2);
        }
    }).mapPartitionsToPair(new PairFlatMapFunction<Iterator<Tuple2<LogKey, LogStats>>, String, String>() {

        @Override
        public Iterable<Tuple2<String, String>> call(Iterator<Tuple2<LogKey, LogStats>> itor) throws Exception {
            final Gson gson = new Gson();
            return Lists.newArrayList(Iterators.transform(itor, new Function<Tuple2<LogKey, LogStats>, Tuple2<String, String>>() {

                @Override
                public Tuple2<String, String> apply(Tuple2<LogKey, LogStats> input) {
                    return new Tuple2<>(gson.toJson(input._1()), gson.toJson(input._2()));
                }
            }));
        }
    });
    // Collect all data to the driver and write to the dataset directly. That's the intent of the test.
    sec.execute(new TxRunnable() {

        @Override
        public void run(DatasetContext context) throws Exception {
            KeyValueTable kvTable = context.getDataset(outputTable);
            for (Map.Entry<String, String> entry : aggregated.collectAsMap().entrySet()) {
                kvTable.write(entry.getKey(), entry.getValue());
            }
        }
    });
}
Also used : Gson(com.google.gson.Gson) TxRunnable(co.cask.cdap.api.TxRunnable) Iterator(java.util.Iterator) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LongWritable(org.apache.hadoop.io.LongWritable) DatasetContext(co.cask.cdap.api.data.DatasetContext) LogKey(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogKey) Text(org.apache.hadoop.io.Text) Function2(org.apache.spark.api.java.function.Function2) LogStats(co.cask.cdap.spark.app.SparkAppUsingGetDataset.LogStats) Tuple2(scala.Tuple2) KeyValueTable(co.cask.cdap.api.dataset.lib.KeyValueTable)
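
The anonymous inner classes above predate Java 8; on a Spark and Java version that accept lambdas, the pairing and aggregation steps could be sketched as follows (assuming the same parse and aggregate methods; this is not code from the project):

// Parse each line into a (LogKey, LogStats) pair, then combine the stats per key.
JavaPairRDD<LogKey, LogStats> stats = input
    .mapToPair(in -> SparkAppUsingGetDataset.parse(in._2()))
    .reduceByKey((s1, s2) -> s1.aggregate(s2));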

Example 94 with Tuple2

use of scala.Tuple2 in project spark-dataflow by cloudera.

the class TransformTranslator method readHadoop.

private static <K, V> TransformEvaluator<HadoopIO.Read.Bound<K, V>> readHadoop() {
    return new TransformEvaluator<HadoopIO.Read.Bound<K, V>>() {

        @Override
        public void evaluate(HadoopIO.Read.Bound<K, V> transform, EvaluationContext context) {
            String pattern = transform.getFilepattern();
            JavaSparkContext jsc = context.getSparkContext();
            @SuppressWarnings("unchecked") JavaPairRDD<K, V> file = jsc.newAPIHadoopFile(pattern, transform.getFormatClass(), transform.getKeyClass(), transform.getValueClass(), new Configuration());
            JavaRDD<WindowedValue<KV<K, V>>> rdd = file.map(new Function<Tuple2<K, V>, KV<K, V>>() {

                @Override
                public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
                    return KV.of(t2._1(), t2._2());
                }
            }).map(WindowingHelpers.<KV<K, V>>windowFunction());
            context.setOutputRDD(transform, rdd);
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) KV(com.google.cloud.dataflow.sdk.values.KV) Tuple2(scala.Tuple2) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext)
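
With lambdas, the Tuple2-to-KV conversion above collapses to a single line; a sketch of just that step, keeping the subsequent windowing map unchanged (not code from spark-dataflow itself):

// Convert each Hadoop (key, value) Tuple2 into a Dataflow KV.
JavaRDD<KV<K, V>> kvs = file.map(t2 -> KV.of(t2._1(), t2._2()));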

Example 95 with Tuple2

use of scala.Tuple2 in project spark-dataflow by cloudera.

the class TransformTranslator method writeHadoop.

private static <K, V> TransformEvaluator<HadoopIO.Write.Bound<K, V>> writeHadoop() {
    return new TransformEvaluator<HadoopIO.Write.Bound<K, V>>() {

        @Override
        public void evaluate(HadoopIO.Write.Bound<K, V> transform, EvaluationContext context) {
            @SuppressWarnings("unchecked") JavaPairRDD<K, V> last = ((JavaRDDLike<WindowedValue<KV<K, V>>, ?>) context.getInputRDD(transform)).map(WindowingHelpers.<KV<K, V>>unwindowFunction()).mapToPair(new PairFunction<KV<K, V>, K, V>() {

                @Override
                public Tuple2<K, V> call(KV<K, V> t) throws Exception {
                    return new Tuple2<>(t.getKey(), t.getValue());
                }
            });
            ShardTemplateInformation shardTemplateInfo = new ShardTemplateInformation(transform.getNumShards(), transform.getShardTemplate(), transform.getFilenamePrefix(), transform.getFilenameSuffix());
            Configuration conf = new Configuration();
            for (Map.Entry<String, String> e : transform.getConfigurationProperties().entrySet()) {
                conf.set(e.getKey(), e.getValue());
            }
            writeHadoopFile(last, conf, shardTemplateInfo, transform.getKeyClass(), transform.getValueClass(), transform.getFormatClass());
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) KV(com.google.cloud.dataflow.sdk.values.KV) HadoopIO(com.cloudera.dataflow.hadoop.HadoopIO) CannotProvideCoderException(com.google.cloud.dataflow.sdk.coders.CannotProvideCoderException) IOException(java.io.IOException) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) Tuple2(scala.Tuple2) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)
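
The reverse conversion, from Dataflow KV back to Tuple2 for the pair RDD, likewise reduces to one lambda; a minimal sketch of that step under the same assumptions, where rdd stands for the unwindowed JavaRDD<KV<K, V>> (not code from spark-dataflow itself):

// Turn each KV back into a Tuple2 so Spark can treat the result as a pair RDD.
JavaPairRDD<K, V> pairs = rdd.mapToPair(kv -> new Tuple2<>(kv.getKey(), kv.getValue()));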

Aggregations

Tuple2 (scala.Tuple2): 181 usages
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 57 usages
ArrayList (java.util.ArrayList): 43 usages
IOException (java.io.IOException): 32 usages
Test (org.junit.Test): 32 usages
INDArray (org.nd4j.linalg.api.ndarray.INDArray): 28 usages
JavaPairRDD (org.apache.spark.api.java.JavaPairRDD): 23 usages
List (java.util.List): 22 usages
Function (org.apache.spark.api.java.function.Function): 19 usages
File (java.io.File): 18 usages
Collectors (java.util.stream.Collectors): 18 usages
GATKException (org.broadinstitute.hellbender.exceptions.GATKException): 18 usages
Configuration (org.apache.hadoop.conf.Configuration): 17 usages
UserException (org.broadinstitute.hellbender.exceptions.UserException): 17 usages
Broadcast (org.apache.spark.broadcast.Broadcast): 16 usages
MatrixBlock (org.apache.sysml.runtime.matrix.data.MatrixBlock): 16 usages
MatrixIndexes (org.apache.sysml.runtime.matrix.data.MatrixIndexes): 16 usages
SparkConf (org.apache.spark.SparkConf): 15 usages
JavaRDD (org.apache.spark.api.java.JavaRDD): 15 usages
VisibleForTesting (com.google.common.annotations.VisibleForTesting): 14 usages