Use of org.apache.spark.api.java.function.Function in project beam by apache.
The class SparkCompat, method extractOutput.
/**
* Extracts the output for a given collection of WindowedAccumulators.
*
* <p>This is required because the API of JavaPairRDD.flatMapValues is different among Spark
* versions. See https://issues.apache.org/jira/browse/SPARK-19287
*/
public static <K, InputT, AccumT, OutputT> JavaPairRDD<K, WindowedValue<OutputT>> extractOutput(
    JavaPairRDD<K, SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>> accumulatePerKey,
    SparkCombineFn<KV<K, InputT>, InputT, AccumT, OutputT> sparkCombineFn) {
  try {
    if (accumulatePerKey.context().version().startsWith("3")) {
      FlatMapFunction<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, WindowedValue<OutputT>>
          flatMapFunction =
              windowedAccumulator -> sparkCombineFn.extractOutputStream(windowedAccumulator).iterator();
      // This invokes by reflection the equivalent of:
      // return accumulatePerKey.flatMapValues(flatMapFunction);
      Method method = accumulatePerKey.getClass().getDeclaredMethod("flatMapValues", FlatMapFunction.class);
      Object result = method.invoke(accumulatePerKey, flatMapFunction);
      return (JavaPairRDD<K, WindowedValue<OutputT>>) result;
    }
    Function<SparkCombineFn.WindowedAccumulator<KV<K, InputT>, InputT, AccumT, ?>, Iterable<WindowedValue<OutputT>>>
        flatMapFunction =
            windowedAccumulator ->
                sparkCombineFn.extractOutputStream(windowedAccumulator).collect(Collectors.toList());
    // This invokes by reflection the equivalent of:
    // return accumulatePerKey.flatMapValues(flatMapFunction);
    Method method = accumulatePerKey.getClass().getDeclaredMethod("flatMapValues", Function.class);
    Object result = method.invoke(accumulatePerKey, flatMapFunction);
    return (JavaPairRDD<K, WindowedValue<OutputT>>) result;
  } catch (NoSuchMethodException | IllegalAccessException | InvocationTargetException e) {
    throw new RuntimeException("Error invoking Spark flatMapValues", e);
  }
}
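For reference, the reflection stands in for a direct call that compiles against only one Spark major version at a time: Spark 2.x declares flatMapValues(Function<V, Iterable<U>>), while Spark 3.x changed it to flatMapValues(FlatMapFunction<V, U>) (the SPARK-19287 change cited above). A minimal sketch of the two overloads, where pairs stands for any JavaPairRDD<String, String>:

// Spark 2.x overload: the mapper returns an Iterable.
Function<String, Iterable<Integer>> f2 = v -> java.util.Arrays.asList(v.length());
// JavaPairRDD<String, Integer> out = pairs.flatMapValues(f2);  // compiles on Spark 2.x only

// Spark 3.x overload: the mapper returns an Iterator.
FlatMapFunction<String, Integer> f3 = v -> java.util.Arrays.asList(v.length()).iterator();
// JavaPairRDD<String, Integer> out = pairs.flatMapValues(f3);  // compiles on Spark 3.x only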
Use of org.apache.spark.api.java.function.Function in project net.jgp.labs.spark by jgperrin.
The class StreamingIngestionFileSystemTextFileToDataframeApp, method start.
private void start() {
  // Create a local StreamingContext with two working threads and a batch
  // interval of 5 seconds
  SparkConf conf = new SparkConf()
      .setMaster("local[2]")
      .setAppName("Streaming Ingestion File System Text File to Dataframe");
  JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(5));
  JavaDStream<String> msgDataStream = jssc.textFileStream(StreamingUtils.getInputDirectory());
  msgDataStream.print();
  // Create JavaRDD<Row>
  msgDataStream.foreachRDD(new VoidFunction<JavaRDD<String>>() {

    private static final long serialVersionUID = -590010339928376829L;

    @Override
    public void call(JavaRDD<String> rdd) {
      JavaRDD<Row> rowRDD = rdd.map(new Function<String, Row>() {

        private static final long serialVersionUID = 5167089361335095997L;

        @Override
        public Row call(String msg) {
          return RowFactory.create(msg);
        }
      });
      // Create the schema
      StructType schema = DataTypes.createStructType(new StructField[] {
          DataTypes.createStructField("Message", DataTypes.StringType, true) });
      // Get the Spark 2.0 session
      SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
      Dataset<Row> msgDataFrame = spark.createDataFrame(rowRDD, schema);
      msgDataFrame.show();
    }
  });
  jssc.start();
  try {
    jssc.awaitTermination();
  } catch (InterruptedException e) {
    e.printStackTrace();
  }
}
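Since Function and VoidFunction are single-method interfaces, the two anonymous classes above can be collapsed into lambdas. A sketch of the equivalent foreachRDD body (same behavior; the lambdas remain serializable because Spark's functional interfaces extend Serializable):

msgDataStream.foreachRDD(rdd -> {
  JavaRDD<Row> rowRDD = rdd.map(msg -> RowFactory.create(msg));
  StructType schema = DataTypes.createStructType(new StructField[] {
      DataTypes.createStructField("Message", DataTypes.StringType, true) });
  SparkSession spark = JavaSparkSessionSingleton.getInstance(rdd.context().getConf());
  spark.createDataFrame(rowRDD, schema).show();
});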
Use of org.apache.spark.api.java.function.Function in project spark-dataflow by cloudera.
The class TransformTranslator, method readAvro.
private static <T> TransformEvaluator<AvroIO.Read.Bound<T>> readAvro() {
  return new TransformEvaluator<AvroIO.Read.Bound<T>>() {

    @Override
    public void evaluate(AvroIO.Read.Bound<T> transform, EvaluationContext context) {
      String pattern = transform.getFilepattern();
      JavaSparkContext jsc = context.getSparkContext();
      @SuppressWarnings("unchecked")
      JavaRDD<AvroKey<T>> avroFile = (JavaRDD<AvroKey<T>>) (JavaRDD<?>) jsc
          .newAPIHadoopFile(pattern, AvroKeyInputFormat.class, AvroKey.class, NullWritable.class,
              new Configuration())
          .keys();
      JavaRDD<WindowedValue<T>> rdd = avroFile.map(new Function<AvroKey<T>, T>() {

        @Override
        public T call(AvroKey<T> key) {
          return key.datum();
        }
      }).map(WindowingHelpers.<T>windowFunction());
      context.setOutputRDD(transform, rdd);
    }
  };
}
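The trailing map applies WindowingHelpers.windowFunction(), which wraps each raw element as a WindowedValue before it enters the windowed pipeline. A hypothetical sketch of such a helper, assuming elements are placed in the global window (the real implementation and the exact WindowedValue import live in spark-dataflow's Dataflow SDK dependency):

public static <T> Function<T, WindowedValue<T>> windowFunction() {
  return new Function<T, WindowedValue<T>>() {
    @Override
    public WindowedValue<T> call(T t) {
      // Assumption: values carry no meaningful timestamp and belong to the global window.
      return WindowedValue.valueInGlobalWindow(t);
    }
  };
}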
Use of org.apache.spark.api.java.function.Function in project gatk by broadinstitute.
The class ExampleVariantWalkerSpark, method variantFunction.
private static Function<VariantWalkerContext, String> variantFunction(FeatureInput<VariantContext> auxiliaryVariants) {
  return (Function<VariantWalkerContext, String>) context -> {
    VariantContext variant = context.getVariant();
    ReadsContext readsContext = context.getReadsContext();
    ReferenceContext referenceContext = context.getReferenceContext();
    FeatureContext featureContext = context.getFeatureContext();
    StringBuilder sb = new StringBuilder();
    sb.append(String.format("Current variant: %s", variant));
    sb.append("\n");
    if (referenceContext.hasBackingDataSource()) {
      sb.append(String.format("\tOverlapping reference bases: %s\n\n", new String(referenceContext.getBases())));
    }
    if (readsContext.hasBackingDataSource()) {
      for (final GATKRead read : readsContext) {
        sb.append(String.format("\tOverlapping read at %s:%d-%d\n",
            read.getContig(), read.getStart(), read.getEnd()));
      }
      sb.append("\n");
    }
    if (featureContext.hasBackingDataSource()) {
      for (final VariantContext variant1 : featureContext.getValues(auxiliaryVariants)) {
        sb.append(String.format("\tOverlapping variant at %s:%d-%d. Ref: %s Alt(s): %s\n",
            variant1.getContig(), variant1.getStart(), variant1.getEnd(),
            variant1.getReference(), variant1.getAlternateAlleles()));
      }
      sb.append("\n");
    }
    return sb.toString();
  };
}
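A function built this way is meant to be handed to a Spark map stage. A usage sketch, where variants (a JavaRDD<VariantWalkerContext>) and outputPath are placeholders rather than names taken from this class:

JavaRDD<String> reports = variants.map(variantFunction(auxiliaryVariants));
reports.saveAsTextFile(outputPath);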
Use of org.apache.spark.api.java.function.Function in project gatk by broadinstitute.
The class ExampleAssemblyRegionWalkerSpark, method assemblyFunction.
private static Function<AssemblyRegionWalkerContext, String> assemblyFunction(FeatureInput<VariantContext> knownVariants) {
  return (Function<AssemblyRegionWalkerContext, String>) context -> {
    AssemblyRegion region = context.getAssemblyRegion();
    ReferenceContext referenceContext = context.getReferenceContext();
    FeatureContext featureContext = context.getFeatureContext();
    StringBuilder sb = new StringBuilder();
    sb.append(String.format("%s assembly region at %s (%s with padding), containing %d reads.\n\n",
        region.isActive() ? "ACTIVE" : "INACTIVE", region.getSpan(), region.getExtendedSpan(),
        region.getReads().size()));
    sb.append(String.format("\tOverlapping reference bases: %s\n\n", new String(referenceContext.getBases())));
    if (featureContext.hasBackingDataSource()) {
      for (final VariantContext variant : featureContext.getValues(knownVariants)) {
        sb.append(String.format("\tOverlapping variant at %s:%d-%d. Ref: %s Alt(s): %s\n\n",
            variant.getContig(), variant.getStart(), variant.getEnd(),
            variant.getReference(), variant.getAlternateAlleles()));
      }
    }
    return sb.toString();
  };
}
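Because org.apache.spark.api.java.function.Function declares a single call method (which throws Exception), the returned object can also be exercised directly, for example in a unit test, without a Spark context. A sketch with a hypothetical context object:

Function<AssemblyRegionWalkerContext, String> fn = assemblyFunction(knownVariants);
// 'context' is a hypothetical AssemblyRegionWalkerContext; call() declares throws Exception,
// so the enclosing test method would declare it as well.
String report = fn.call(context);
System.out.println(report);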