Example 16 with Function

use of org.apache.spark.api.java.function.Function in project beam by apache.

the class SparkCompat method extractOutput.

/**
 * Extracts the output for a given collection of WindowedAccumulators.
 *
 * <p>This is required because the API of JavaPairRDD.flatMapValues is different among Spark
 * versions. See https://issues.apache.org/jira/browse/SPARK-19287
 */
public static <K, InputT, AccumT, OutputT> JavaPairRDD<K, WindowedValue<OutputT>> extractOutput(
        JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>> accumulatePerKey,
        SparkCombineFn<KV<K, InputT>, InputT, AccumT, OutputT> sparkCombineFn) {
    try {
        if (accumulatePerKey.context().version().startsWith("3")) {
            FlatMapFunction<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, WindowedValue<OutputT>> flatMapFunction =
                    windowedAccumulator -> sparkCombineFn.extractOutputStream(windowedAccumulator).iterator();
            // This invokes by reflection the equivalent of:
            // return accumulatePerKey.flatMapValues(flatMapFunction);
            Method method = accumulatePerKey.getClass().getDeclaredMethod("flatMapValues", FlatMapFunction.class);
            Object result = method.invoke(accumulatePerKey, flatMapFunction);
            return (JavaPairRDD<K, WindowedValue<OutputT>>) result;
        }
        Function<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, Iterable<WindowedValue<OutputT>>> flatMapFunction =
                windowedAccumulator -> sparkCombineFn.extractOutputStream(windowedAccumulator).collect(Collectors.toList());
        // This invokes by reflection the equivalent of:
        // return accumulatePerKey.flatMapValues(flatMapFunction);
        Method method = accumulatePerKey.getClass().getDeclaredMethod("flatMapValues", Function.class);
        Object result = method.invoke(accumulatePerKey, flatMapFunction);
        return (JavaPairRDD<K, WindowedValue<OutputT>>) result;
    } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
        throw new RuntimeException("Error invoking Spark flatMapValues", e);
    }
}
Also used : SparkListenerApplicationStart(org.apache.spark.scheduler.SparkListenerApplicationStart) SparkCombineFn(org.apache.beam.runners.spark.translation.SparkCombineFn) KV(org.apache.beam.sdk.values.KV) WindowedValue(org.apache.beam.sdk.util.WindowedValue) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) PipelineResult(org.apache.beam.sdk.PipelineResult) ApplicationNameOptions(org.apache.beam.sdk.options.ApplicationNameOptions) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Option(scala.Option) Constructor(java.lang.reflect.Constructor) Collectors(java.util.stream.Collectors) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) InvocationTargetException(java.lang.reflect.InvocationTargetException) SparkBeamMetric(org.apache.beam.runners.spark.metrics.SparkBeamMetric) List(java.util.List) JavaConverters(scala.collection.JavaConverters) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) Function(org.apache.spark.api.java.function.Function) Method(java.lang.reflect.Method) SparkPipelineOptions(org.apache.beam.runners.spark.SparkPipelineOptions) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction)
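For context, here is a minimal sketch (not from the Beam source) of the direct calls that the reflection above emulates. The helper names and the JavaPairRDD<String, String> input are illustrative; the first method compiles only against Spark 2.x and the second only against Spark 3.x (SPARK-19287), which is exactly why SparkCompat resolves flatMapValues at runtime instead of calling it directly.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;

// Spark 2.x signature: flatMapValues(Function<V, Iterable<U>>)
static JavaPairRDD<String, Integer> lengthsSpark2(JavaPairRDD<String, String> pairs) {
    return pairs.flatMapValues(
        (Function<String, Iterable<Integer>>) v -> Arrays.asList(v.length()));
}

// Spark 3.x signature: flatMapValues(FlatMapFunction<V, U>), whose call() returns an Iterator<U>
static JavaPairRDD<String, Integer> lengthsSpark3(JavaPairRDD<String, String> pairs) {
    return pairs.flatMapValues(
        (FlatMapFunction<String, Integer>) v -> Arrays.asList(v.length()).iterator());
}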

Example 17 with Function

use of org.apache.spark.api.java.function.Function in project net.jgp.labs.spark by jgperrin.

the class StreamingIngestionFileSystemTextFileToDataframeApp method start.

private void start() {
    // Create a local StreamingContext with two working threads and a batch
    // interval of 5 seconds
    SparkConf conf = new SparkConf().setMaster("local[2]").setAppName("Streaming Ingestion File System Text File to Dataframe");
    JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
    JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
    msgDataStream.print();
    // Create JavaRDD<Row>
    msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

        private static final long serialVersionUID = -590010339928376829L;

        @Override
        public void call(JavaRDD<String> rdd) {
            JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {

                private static final long serialVersionUID = 5167089361335095997L;

                @Override
                public Row call(String msg) {
                    return RowFactory.create(msg);
                }
            });
            // Create Schema
            StructType schema = DataTypes.createStructType(new StructField[] { DataTypes.createStructField("Message", DataTypes.StringType, true) });
            // Get Spark 2.0 session
            SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
            Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
            msgDataFrame.show();
        }
    });
    jssc.start();
    try {
        jssc.awaitTermination();
    } catch (InterruptedException e) {
        // Restore the interrupt status so the shutdown request is not silently swallowed
        Thread.currentThread().interrupt();
    }
}
Also used : SparkSession(org.apache.spark.sql.SparkSession) StructType(org.apache.spark.sql.types.StructType) Dataset(org.apache.spark.sql.Dataset) JavaRDD(org.apache.spark.api.java.JavaRDD) JavaStreamingContext(org.apache.spark.streaming.api.java.JavaStreamingContext) VoidFunction(org.apache.spark.api.java.function.VoidFunction) Function(org.apache.spark.api.java.function.Function) StructField(org.apache.spark.sql.types.StructField) Row(org.apache.spark.sql.Row) SparkConf(org.apache.spark.SparkConf)
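The JavaSparkSessionSingleton helper referenced above is not shown in this example. A minimal sketch of what such a helper typically looks like, assuming it lazily builds one SparkSession per JVM from the streaming job's SparkConf (the actual class in net.jgp.labs.spark may differ):

import org.apache.spark.SparkConf;
import org.apache.spark.sql.SparkSession;

class JavaSparkSessionSingleton {

    private static transient SparkSession instance = null;

    // Lazily create a single SparkSession so every micro-batch reuses it
    static SparkSession getInstance(SparkConf sparkConf) {
        if (instance == null) {
            instance = SparkSession.builder().config(sparkConf).getOrCreate();
        }
        return instance;
    }
}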

Example 18 with Function

use of org.apache.spark.api.java.function.Function in project spark-dataflow by cloudera.

the class TransformTranslator method readAvro.

private static <T> TransformEvaluator<AvroIO.Read.Bound<T>> readAvro() {
    return new TransformEvaluator<AvroIO.Read.Bound<T>>() {

        @Override
        public void evaluate(AvroIO.Read.Bound<T> transform, EvaluationContext context) {
            String pattern = transform.getFilepattern();
            JavaSparkContext jsc = context.getSparkContext();
            @SuppressWarnings("unchecked")
            JavaRDD<AvroKey<T>> avroFile = (JavaRDD<AvroKey<T>>) (JavaRDD<?>) jsc
                    .newAPIHadoopFile(pattern, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class, new Configuration())
                    .keys();
            JavaRDD<WindowedValue<T>> rdd = avroFile.map(new Function<AvroKey<T>, T>() {

                @Override
                public T call(AvroKey<T> key) {
                    return key.datum();
                }
            }).map(WindowingHelpers.<T>windowFunction());
            context.setOutputRDD(transform, rdd);
        }
    };
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) AvroIO(com.google.cloud.dataflow.sdk.io.AvroIO) AvroKey(org.apache.avro.mapred.AvroKey) NullWritable(org.apache.hadoop.io.NullWritable) JavaRDD(org.apache.spark.api.java.JavaRDD) Function(org.apache.spark.api.java.function.Function) PairFunction(org.apache.spark.api.java.function.PairFunction) WindowedValue(com.google.cloud.dataflow.sdk.util.WindowedValue) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) AvroKeyInputFormat(org.apache.avro.mapreduce.AvroKeyInputFormat)
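The WindowingHelpers.<T>windowFunction() helper is referenced but not shown. A plausible sketch, assuming it simply wraps each raw value into a WindowedValue in the global window (the actual spark-dataflow helper may differ):

import com.google.cloud.dataflow.sdk.util.WindowedValue;
import org.apache.spark.api.java.function.Function;

class WindowingHelpersSketch {

    // Assigns every element to the global window, matching the final
    // .map(WindowingHelpers.<T>windowFunction()) step in readAvro()
    static <T> Function<T, WindowedValue<T>> windowFunction() {
        return new Function<T, WindowedValue<T>>() {
            @Override
            public WindowedValue<T> call(T t) {
                return WindowedValue.valueInGlobalWindow(t);
            }
        };
    }
}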

Example 19 with Function

use of org.apache.spark.api.java.function.Function in project gatk by broadinstitute.

the class ExampleVariantWalkerSpark method variantFunction.

private static Function<VariantWalkerContext, String> variantFunction(FeatureInput<VariantContext> auxiliaryVariants) {
    return (Function<VariantWalkerContext, String>) context -> {
        VariantContext variant = context.getVariant();
        ReadsContext readsContext = context.getReadsContext();
        ReferenceContext referenceContext = context.getReferenceContext();
        FeatureContext featureContext = context.getFeatureContext();
        StringBuilder sb = new StringBuilder();
        sb.append(String.format("Current variant: " + variant));
        sb.append("\n");
        if (referenceContext.hasBackingDataSource()) {
            sb.append(String.format("\tOverlapping reference bases: %s\n\n", new String(referenceContext.getBases())));
        }
        if (readsContext.hasBackingDataSource()) {
            for (final GATKRead read : readsContext) {
                sb.append(String.format("\tOverlapping read at %s:%d-%d\n", read.getContig(), read.getStart(), read.getEnd()));
            }
            sb.append("\n");
        }
        if (featureContext.hasBackingDataSource()) {
            for (final VariantContext variant1 : featureContext.getValues(auxiliaryVariants)) {
                sb.append(String.format("\tOverlapping variant at %s:%d-%d. Ref: %s Alt(s): %s\n", variant1.getContig(), variant1.getStart(), variant1.getEnd(), variant1.getReference(), variant1.getAlternateAlleles()));
            }
            sb.append("\n");
        }
        return sb.toString();
    };
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Function(org.apache.spark.api.java.function.Function) VariantContext(htsjdk.variant.variantcontext.VariantContext)
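A hedged sketch of how such a per-variant formatter is typically applied: map it over an RDD of contexts and persist the resulting strings. The method name, RDD source, and output path below are illustrative placeholders, not GATK's exact API:

import org.apache.spark.api.java.JavaRDD;

static void writeVariantSummaries(JavaRDD<VariantWalkerContext> contexts,
        FeatureInput<VariantContext> auxiliaryVariants, String outputPath) {
    // variantFunction returns Spark's serializable Function, so it can be
    // handed straight to JavaRDD.map and shipped to executors
    contexts.map(variantFunction(auxiliaryVariants)).saveAsTextFile(outputPath);
}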

Example 20 with Function

use of org.apache.spark.api.java.function.Function in project gatk by broadinstitute.

the class ExampleAssemblyRegionWalkerSpark method assemblyFunction.

private static Function<AssemblyRegionWalkerContext, String> assemblyFunction(FeatureInput<VariantContext> knownVariants) {
    return (Function<AssemblyRegionWalkerContext, String>) context -> {
        AssemblyRegion region = context.getAssemblyRegion();
        ReferenceContext referenceContext = context.getReferenceContext();
        FeatureContext featureContext = context.getFeatureContext();
        StringBuilder sb = new StringBuilder();
        sb.append(String.format("%s assembly region at %s (%s with padding), containing %d reads.\n\n", region.isActive() ? "ACTIVE" : "INACTIVE", region.getSpan(), region.getExtendedSpan(), region.getReads().size()));
        sb.append(String.format("\tOverlapping reference bases: %s\n\n", new String(referenceContext.getBases())));
        if (featureContext.hasBackingDataSource()) {
            for (final VariantContext variant : featureContext.getValues(knownVariants)) {
                sb.append(String.format("\tOverlapping variant at %s:%d-%d. Ref: %s Alt(s): %s\n\n", variant.getContig(), variant.getStart(), variant.getEnd(), variant.getReference(), variant.getAlternateAlleles()));
            }
        }
        return sb.toString();
    };
}
Also used : Function(org.apache.spark.api.java.function.Function) VariantContext(htsjdk.variant.variantcontext.VariantContext)
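Note the cast to (Function<AssemblyRegionWalkerContext, String>) in both GATK examples: org.apache.spark.api.java.function.Function extends java.io.Serializable, so targeting it (rather than java.util.function.Function) yields a lambda whose closure Spark can serialize and ship to executors. A minimal standalone sketch of the same pattern, with illustrative types:

import org.apache.spark.api.java.function.Function;

class SerializableFunctionSketch {

    // The cast makes the lambda's target type explicit: Spark's Function is
    // Serializable, while java.util.function.Function is not
    static Function<String, Integer> lengthFn() {
        return (Function<String, Integer>) s -> s.length();
    }
}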

Aggregations

Function (org.apache.spark.api.java.function.Function)30 Tuple2 (scala.Tuple2)17 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)15 ArrayList (java.util.ArrayList)11 PairFunction (org.apache.spark.api.java.function.PairFunction)9 JavaRDD (org.apache.spark.api.java.JavaRDD)8 List (java.util.List)7 SparkConf (org.apache.spark.SparkConf)6 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)6 VoidFunction (org.apache.spark.api.java.function.VoidFunction)5 JavaStreamingContext (org.apache.spark.streaming.api.java.JavaStreamingContext)5 IOException (java.io.IOException)4 FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction)4 WindowedValue (com.google.cloud.dataflow.sdk.util.WindowedValue)3 VariantContext (htsjdk.variant.variantcontext.VariantContext)3 HashMap (java.util.HashMap)3 Map (java.util.Map)3 Collectors (java.util.stream.Collectors)3 Function2 (org.apache.spark.api.java.function.Function2)3 PairFlatMapFunction (org.apache.spark.api.java.function.PairFlatMapFunction)3