Use of org.apache.spark.api.java.function.Function in project beam by apache.
Class SparkGroupAlsoByWindowViaWindowSet, method groupAlsoByWindow:
public static <K, InputT, W extends BoundedWindow>
    JavaDStream<WindowedValue<KV<K, Iterable<InputT>>>> groupAlsoByWindow(
        JavaDStream<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>> inputDStream,
        final Coder<K> keyCoder,
        final Coder<WindowedValue<InputT>> wvCoder,
        final WindowingStrategy<?, W> windowingStrategy,
        final SparkRuntimeContext runtimeContext,
        final List<Integer> sourceIds) {
final IterableCoder<WindowedValue<InputT>> itrWvCoder = IterableCoder.of(wvCoder);
final Coder<InputT> iCoder = ((FullWindowedValueCoder<InputT>) wvCoder).getValueCoder();
final Coder<? extends BoundedWindow> wCoder = ((FullWindowedValueCoder<InputT>) wvCoder).getWindowCoder();
final Coder<WindowedValue<KV<K, Iterable<InputT>>>> wvKvIterCoder = FullWindowedValueCoder.of(KvCoder.of(keyCoder, IterableCoder.of(iCoder)), wCoder);
final TimerInternals.TimerDataCoder timerDataCoder = TimerInternals.TimerDataCoder.of(windowingStrategy.getWindowFn().windowCoder());
long checkpointDurationMillis = runtimeContext.getPipelineOptions().as(SparkPipelineOptions.class).getCheckpointDurationMillis();
// we have to switch to the Scala API to avoid Optional in the Java API, see: SPARK-4819.
// the Scala API is also broader (it gives access to the actual key and the entire iterator).
// we use coders to convert objects in the PCollection to byte arrays, so they
// can be transferred over the network for the shuffle and be in serialized form
// for checkpointing.
// for readability, we add comments with actual type next to byte[].
// to shorten line length, we use:
//---- WV: WindowedValue
//---- Itr: Iterable
//---- A: AccumT
//---- I: InputT
DStream<Tuple2<ByteArray, byte[]>> /*Itr<WV<I>>*/
pairDStream = inputDStream.transformToPair(new Function<JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>>, JavaPairRDD<ByteArray, byte[]>>() {
// we use mapPartitions with the RDD API because it's the only available API
// that allows us to preserve partitioning.
@Override
public JavaPairRDD<ByteArray, byte[]> call(JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<InputT>>>>> rdd) throws Exception {
return rdd
    .mapPartitions(
        TranslationUtils.functionToFlatMapFunction(
            WindowingHelpers.<KV<K, Iterable<WindowedValue<InputT>>>>unwindowFunction()),
        true)
    .mapPartitionsToPair(
        TranslationUtils.<K, Iterable<WindowedValue<InputT>>>toPairFlatMapFunction(), true)
    .mapPartitionsToPair(
        TranslationUtils.pairFunctionToPairFlatMapFunction(
            CoderHelpers.toByteFunction(keyCoder, itrWvCoder)),
        true);
}
}).dstream();
PairDStreamFunctions<ByteArray, byte[]> pairDStreamFunctions =
    DStream.toPairDStreamFunctions(
        pairDStream,
        JavaSparkContext$.MODULE$.<ByteArray>fakeClassTag(),
        JavaSparkContext$.MODULE$.<byte[]>fakeClassTag(),
        null);
int defaultNumPartitions = pairDStreamFunctions.defaultPartitioner$default$1();
Partitioner partitioner = pairDStreamFunctions.defaultPartitioner(defaultNumPartitions);
// use updateStateByKey to scan through the state and update elements and timers.
DStream<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> /*WV<KV<K, Itr<I>>>*/
firedStream = pairDStreamFunctions.updateStateByKey(new SerializableFunction1<scala.collection.Iterator<Tuple3</*K*/
ByteArray, Seq<byte[]>, Option<Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>>, scala.collection.Iterator<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>>() {
@Override
public scala.collection.Iterator<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>> apply(final scala.collection.Iterator<Tuple3</*K*/
ByteArray, Seq<byte[]>, Option<Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>> iter) {
//--- ACTUAL STATEFUL OPERATION:
//
// Input Iterator: the partition (~bundle) of a cogrouping of the input
// and the previous state (if exists).
//
// Output Iterator: the output key, and the updated state.
//
// possible input scenarios for (K, Seq, Option<S>):
// (1) Option<S>.isEmpty: new data with no previous state.
// (2) Seq.isEmpty: no new data, but evaluating previous state (timer-like behaviour).
// (3) Seq.nonEmpty && Option<S>.isDefined: new data with previous state.
final SystemReduceFn<K, InputT, Iterable<InputT>, Iterable<InputT>, W> reduceFn = SystemReduceFn.buffering(((FullWindowedValueCoder<InputT>) wvCoder).getValueCoder());
final OutputWindowedValueHolder<K, InputT> outputHolder = new OutputWindowedValueHolder<>();
// use in-memory Aggregators (metric cells) since Spark Accumulators are not resilient
// inside stateful operators; their values are logged once this partition is done.
final MetricsContainerImpl cellProvider = new MetricsContainerImpl("cellProvider");
final CounterCell droppedDueToClosedWindow =
    cellProvider.getCounter(
        MetricName.named(
            SparkGroupAlsoByWindowViaWindowSet.class,
            GroupAlsoByWindowsAggregators.DROPPED_DUE_TO_CLOSED_WINDOW_COUNTER));
final CounterCell droppedDueToLateness =
    cellProvider.getCounter(
        MetricName.named(
            SparkGroupAlsoByWindowViaWindowSet.class,
            GroupAlsoByWindowsAggregators.DROPPED_DUE_TO_LATENESS_COUNTER));
AbstractIterator<Tuple2<ByteArray, Tuple2<StateAndTimers, List<byte[]>>>> /*WV<KV<K, Itr<I>>>*/
outIter = new AbstractIterator<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>>() {
@Override
protected Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>> computeNext() {
// (possibly) previous-state and (possibly) new data.
while (iter.hasNext()) {
// for each element in the partition:
Tuple3<ByteArray, Seq<byte[]>, Option<Tuple2<StateAndTimers, List<byte[]>>>> next = iter.next();
ByteArray encodedKey = next._1();
K key = CoderHelpers.fromByteArray(encodedKey.getValue(), keyCoder);
Seq<byte[]> seq = next._2();
Option<Tuple2<StateAndTimers, List<byte[]>>> prevStateAndTimersOpt = next._3();
SparkStateInternals<K> stateInternals;
SparkTimerInternals timerInternals = SparkTimerInternals.forStreamFromSources(sourceIds, GlobalWatermarkHolder.get());
// get state(internals) per key.
if (prevStateAndTimersOpt.isEmpty()) {
// no previous state.
stateInternals = SparkStateInternals.forKey(key);
} else {
// with pre-existing state.
StateAndTimers prevStateAndTimers = prevStateAndTimersOpt.get()._1();
stateInternals = SparkStateInternals.forKeyAndState(key, prevStateAndTimers.getState());
Collection<byte[]> serTimers = prevStateAndTimers.getTimers();
timerInternals.addTimers(SparkTimerInternals.deserializeTimers(serTimers, timerDataCoder));
}
ReduceFnRunner<K, InputT, Iterable<InputT>, W> reduceFnRunner =
    new ReduceFnRunner<>(
        key,
        windowingStrategy,
        ExecutableTriggerStateMachine.create(
            TriggerStateMachines.stateMachineForTrigger(
                TriggerTranslation.toProto(windowingStrategy.getTrigger()))),
        stateInternals,
        timerInternals,
        outputHolder,
        new UnsupportedSideInputReader("GroupAlsoByWindow"),
        reduceFn,
        runtimeContext.getPipelineOptions());
// clear before potential use.
outputHolder.clear();
if (!seq.isEmpty()) {
// new input for key.
try {
Iterable<WindowedValue<InputT>> elementsIterable = CoderHelpers.fromByteArray(seq.head(), itrWvCoder);
Iterable<WindowedValue<InputT>> validElements = LateDataUtils.dropExpiredWindows(key, elementsIterable, timerInternals, windowingStrategy, droppedDueToLateness);
reduceFnRunner.processElements(validElements);
} catch (Exception e) {
throw new RuntimeException("Failed to process element with ReduceFnRunner", e);
}
} else if (stateInternals.getState().isEmpty()) {
// no input and no state -> GC evict now.
continue;
}
try {
// advance the watermark to the high-water mark so that eligible timers can fire.
timerInternals.advanceWatermark();
// call on timers that are ready.
reduceFnRunner.onTimers(timerInternals.getTimersReadyToProcess());
} catch (Exception e) {
throw new RuntimeException("Failed to process ReduceFnRunner onTimer.", e);
}
// this is mostly symbolic since actual persist is done by emitting output.
reduceFnRunner.persist();
// obtain output, if fired.
List<WindowedValue<KV<K, Iterable<InputT>>>> outputs = outputHolder.get();
if (!outputs.isEmpty() || !stateInternals.getState().isEmpty()) {
StateAndTimers updated = new StateAndTimers(stateInternals.getState(), SparkTimerInternals.serializeTimers(timerInternals.getTimers(), timerDataCoder));
// persist Spark's state by outputting.
List<byte[]> serOutput = CoderHelpers.toByteArrays(outputs, wvKvIterCoder);
return new Tuple2<>(encodedKey, new Tuple2<>(updated, serOutput));
}
// empty state with no output can be evicted completely; do nothing.
}
return endOfData();
}
};
// log if there's something to log.
long lateDropped = droppedDueToLateness.getCumulative();
if (lateDropped > 0) {
LOG.info(String.format("Dropped %d elements due to lateness.", lateDropped));
droppedDueToLateness.inc(-droppedDueToLateness.getCumulative());
}
long closedWindowDropped = droppedDueToClosedWindow.getCumulative();
if (closedWindowDropped > 0) {
LOG.info(String.format("Dropped %d elements due to closed window.", closedWindowDropped));
droppedDueToClosedWindow.inc(-droppedDueToClosedWindow.getCumulative());
}
return scala.collection.JavaConversions.asScalaIterator(outIter);
}
}, partitioner, true, JavaSparkContext$.MODULE$.<Tuple2<StateAndTimers, List<byte[]>>>fakeClassTag());
if (checkpointDurationMillis > 0) {
firedStream.checkpoint(new Duration(checkpointDurationMillis));
}
// go back to Java now.
JavaPairDStream<ByteArray, Tuple2<StateAndTimers, List<byte[]>>> /*WV<KV<K, Itr<I>>>*/
javaFiredStream = JavaPairDStream.fromPairDStream(firedStream, JavaSparkContext$.MODULE$.<ByteArray>fakeClassTag(), JavaSparkContext$.MODULE$.<Tuple2<StateAndTimers, List<byte[]>>>fakeClassTag());
// filter state-only output (nothing to fire) and remove the state from the output.
return javaFiredStream.filter(new Function<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>, Boolean>() {
@Override
public Boolean call(Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>> t2) throws Exception {
// keep this record only if it has output to fire.
return !t2._2()._2().isEmpty();
}
}).flatMap(new FlatMapFunction<Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>>, WindowedValue<KV<K, Iterable<InputT>>>>() {
@Override
public Iterable<WindowedValue<KV<K, Iterable<InputT>>>> call(Tuple2</*K*/
ByteArray, Tuple2<StateAndTimers, /*WV<KV<K, Itr<I>>>*/
List<byte[]>>> t2) throws Exception {
// decode the fired output (the state itself is dropped here).
return CoderHelpers.fromByteArrays(t2._2()._2(), wvKvIterCoder);
}
});
}
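The core trick in the snippet above is that both keys and values cross the shuffle and the checkpoint in coder-encoded byte form (the key bytes are wrapped in the runner's ByteArray so they get value-based equals/hashCode, which raw byte[] lacks). The sketch below illustrates that round trip with Beam's CoderUtils; the class name CoderRoundTrip is made up here, and this is a simplified stand-in for the CoderHelpers.toByteFunction and CoderHelpers.fromByteArrays calls used above, not their actual source.

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.util.CoderUtils;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Simplified stand-in for the coder round trip: encode a (key, value) pair to byte
// arrays before the shuffle, and decode it back afterwards.
class CoderRoundTrip {
  static <K, V> PairFunction<Tuple2<K, V>, byte[], byte[]> encode(
      final Coder<K> keyCoder, final Coder<V> valueCoder) {
    return kv -> new Tuple2<>(
        CoderUtils.encodeToByteArray(keyCoder, kv._1()),
        CoderUtils.encodeToByteArray(valueCoder, kv._2()));
  }

  static <K, V> PairFunction<Tuple2<byte[], byte[]>, K, V> decode(
      final Coder<K> keyCoder, final Coder<V> valueCoder) {
    return bytes -> new Tuple2<>(
        CoderUtils.decodeFromByteArray(keyCoder, bytes._1()),
        CoderUtils.decodeFromByteArray(valueCoder, bytes._2()));
  }
}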
Use of org.apache.spark.api.java.function.Function in project spark-dataflow by cloudera.
Class TransformTranslator, method readHadoop:
private static <K, V> TransformEvaluator<HadoopIO.Read.Bound<K, V>> readHadoop() {
return new TransformEvaluator<HadoopIO.Read.Bound<K, V>>() {
@Override
public void evaluate(HadoopIO.Read.Bound<K, V> transform, EvaluationContext context) {
String pattern = transform.getFilepattern();
JavaSparkContext jsc = context.getSparkContext();
@SuppressWarnings("unchecked")
JavaPairRDD<K, V> file =
    jsc.newAPIHadoopFile(
        pattern,
        transform.getFormatClass(),
        transform.getKeyClass(),
        transform.getValueClass(),
        new Configuration());
JavaRDD<WindowedValue<KV<K, V>>> rdd = file.map(new Function<Tuple2<K, V>, KV<K, V>>() {
@Override
public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
return KV.of(t2._1(), t2._2());
}
}).map(WindowingHelpers.<KV<K, V>>windowFunction());
context.setOutputRDD(transform, rdd);
}
};
}
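org.apache.spark.api.java.function.Function has a single method, R call(T), and extends Serializable, so the anonymous Tuple2-to-KV adapter above can be written as a lambda on Java 8. Below is a minimal, self-contained sketch of the same map pattern; the class name and data are made up for illustration and are not part of the spark-dataflow code.

import java.util.Arrays;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

public class TupleMapExample {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "tuple-map-example");
    JavaPairRDD<String, Integer> pairs = jsc.parallelizePairs(
        Arrays.asList(new Tuple2<>("a", 1), new Tuple2<>("b", 2)));
    // Same shape as the readHadoop adapter: map each Tuple2 into a single value.
    JavaRDD<String> rendered = pairs.map(t2 -> t2._1() + "=" + t2._2());
    rendered.collect().forEach(System.out::println);
    jsc.stop();
  }
}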
Use of org.apache.spark.api.java.function.Function in project spark-dataflow by cloudera.
Class StreamingTransformTranslator, method kafka:
private static <K, V> TransformEvaluator<KafkaIO.Read.Unbound<K, V>> kafka() {
return new TransformEvaluator<KafkaIO.Read.Unbound<K, V>>() {
@Override
public void evaluate(KafkaIO.Read.Unbound<K, V> transform, EvaluationContext context) {
StreamingEvaluationContext sec = (StreamingEvaluationContext) context;
JavaStreamingContext jssc = sec.getStreamingContext();
Class<K> keyClazz = transform.getKeyClass();
Class<V> valueClazz = transform.getValueClass();
Class<? extends Decoder<K>> keyDecoderClazz = transform.getKeyDecoderClass();
Class<? extends Decoder<V>> valueDecoderClazz = transform.getValueDecoderClass();
Map<String, String> kafkaParams = transform.getKafkaParams();
Set<String> topics = transform.getTopics();
JavaPairInputDStream<K, V> inputPairStream = KafkaUtils.createDirectStream(jssc, keyClazz, valueClazz, keyDecoderClazz, valueDecoderClazz, kafkaParams, topics);
JavaDStream<WindowedValue<KV<K, V>>> inputStream = inputPairStream.map(new Function<Tuple2<K, V>, KV<K, V>>() {
@Override
public KV<K, V> call(Tuple2<K, V> t2) throws Exception {
return KV.of(t2._1(), t2._2());
}
}).map(WindowingHelpers.<KV<K, V>>windowFunction());
sec.setStream(transform, inputStream);
}
};
}
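WindowingHelpers.windowFunction() is internal to spark-dataflow, so the following is only a plausible sketch of what such a wrapper does: lift each raw element into a WindowedValue assigned to the global window. WindowedValue.valueInGlobalWindow is part of the Dataflow SDK, but the helper's real implementation may differ, and the class name below is hypothetical.

import com.google.cloud.dataflow.sdk.util.WindowedValue;
import org.apache.spark.api.java.function.Function;

// Hypothetical equivalent of a window-wrapping helper: every element is placed in the
// global window with default timestamp and pane information.
class GlobalWindowWrapper {
  static <T> Function<T, WindowedValue<T>> windowFunction() {
    return new Function<T, WindowedValue<T>>() {
      @Override
      public WindowedValue<T> call(T t) {
        return WindowedValue.valueInGlobalWindow(t);
      }
    };
  }
}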
Use of org.apache.spark.api.java.function.Function in project gatk by broadinstitute.
Class PileupSpark, method pileupFunction:
private static Function<LocusWalkerContext, String> pileupFunction(List<FeatureInput<Feature>> metadata, boolean outputInsertLength, boolean showVerbose) {
return (Function<LocusWalkerContext, String>) context -> {
AlignmentContext alignmentContext = context.getAlignmentContext();
ReferenceContext referenceContext = context.getReferenceContext();
FeatureContext featureContext = context.getFeatureContext();
final String features = getFeaturesString(featureContext, metadata);
final ReadPileup basePileup = alignmentContext.getBasePileup();
final StringBuilder s = new StringBuilder();
s.append(String.format("%s %s", basePileup.getPileupString((referenceContext.hasBackingDataSource()) ? (char) referenceContext.getBase() : 'N'), features));
if (outputInsertLength) {
s.append(" ").append(insertLengthOutput(basePileup));
}
if (showVerbose) {
s.append(" ").append(createVerboseOutput(basePileup));
}
s.append("\n");
return s.toString();
};
}
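The lambda in pileupFunction is cast to Spark's Function rather than java.util.function.Function; Spark's interface extends Serializable, which is what lets the closure be shipped to executors. Here is a standalone illustration of the same serializable-lambda map; the class name and data are hypothetical and not GATK code.

import java.util.Arrays;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;

public class SerializableLambdaMap {
  public static void main(String[] args) {
    JavaSparkContext jsc = new JavaSparkContext("local[2]", "pileup-style-map");
    // The target type pins the lambda to Spark's serializable Function interface.
    Function<Integer, String> render = depth -> "depth=" + depth;
    JavaRDD<String> lines = jsc.parallelize(Arrays.asList(3, 7, 11)).map(render);
    lines.collect().forEach(System.out::println);
    jsc.stop();
  }
}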
Use of org.apache.spark.api.java.function.Function in project gatk-protected by broadinstitute.
Class CoverageModelEMWorkspace, method updateSampleUnexplainedVariance:
/**
* E-step update of the sample-specific unexplained variance
*
* @return a {@link SubroutineSignal} containing the update size (key: "error_norm") and the average
* number of function evaluations per sample (key: "iterations")
*/
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
public SubroutineSignal updateSampleUnexplainedVariance() {
mapWorkers(cb -> cb.cloneWithUpdatedCachesByTag(CoverageModelEMComputeBlock.CoverageModelICGCacheTag.E_STEP_GAMMA));
cacheWorkers("after E-step for sample unexplained variance initialization");
/* create a compound objective function for simultaneous multi-sample queries */
final java.util.function.Function<Map<Integer, Double>, Map<Integer, Double>> objFunc = arg -> {
if (arg.isEmpty()) {
return Collections.emptyMap();
}
final int[] sampleIndices = arg.keySet().stream().mapToInt(i -> i).toArray();
final INDArray gammaValues = Nd4j.create(Arrays.stream(sampleIndices).mapToDouble(arg::get).toArray(), new int[] { sampleIndices.length, 1 });
final INDArray eval = mapWorkersAndReduce(cb -> cb.calculateSampleSpecificVarianceObjectiveFunctionMultiSample(sampleIndices, gammaValues), INDArray::add);
final Map<Integer, Double> output = new HashMap<>();
IntStream.range(0, sampleIndices.length).forEach(evalIdx -> output.put(sampleIndices[evalIdx], eval.getDouble(evalIdx)));
return output;
};
final java.util.function.Function<UnivariateSolverSpecifications, AbstractUnivariateSolver> solverFactory =
    spec -> new RobustBrentSolver(
        spec.getRelativeAccuracy(),
        spec.getAbsoluteAccuracy(),
        spec.getFunctionValueAccuracy(),
        null,
        config.getSampleSpecificVarianceSolverNumBisections(),
        config.getSampleSpecificVarianceSolverRefinementDepth());
/* instantiate a synchronized multi-sample root finder and add jobs */
final SynchronizedUnivariateSolver syncSolver = new SynchronizedUnivariateSolver(objFunc, solverFactory, numSamples);
IntStream.range(0, numSamples).forEach(si -> {
final double x0 = 0.5 * config.getSampleSpecificVarianceUpperLimit();
syncSolver.add(si, 0, config.getSampleSpecificVarianceUpperLimit(), x0, config.getSampleSpecificVarianceAbsoluteTolerance(), config.getSampleSpecificVarianceRelativeTolerance(), config.getSampleSpecificVarianceMaximumIterations());
});
/* solve and collect statistics */
final INDArray newSampleUnexplainedVariance = Nd4j.create(numSamples, 1);
final List<Integer> numberOfEvaluations = new ArrayList<>(numSamples);
try {
final Map<Integer, SynchronizedUnivariateSolver.UnivariateSolverSummary> newSampleSpecificVarianceMap = syncSolver.solve();
newSampleSpecificVarianceMap.entrySet().forEach(entry -> {
final int sampleIndex = entry.getKey();
final SynchronizedUnivariateSolver.UnivariateSolverSummary summary = entry.getValue();
double val = 0;
switch(summary.status) {
case SUCCESS:
val = summary.x;
break;
case TOO_MANY_EVALUATIONS:
logger.warn("Could not locate the root of gamma -- increase the maximum number of " + "function evaluations");
break;
}
newSampleUnexplainedVariance.put(sampleIndex, 0, val);
numberOfEvaluations.add(summary.evaluations);
});
} catch (final InterruptedException ex) {
throw new RuntimeException("The update of sample unexplained variance was interrupted -- can not continue");
}
/* admix */
final INDArray newSampleUnexplainedVarianceAdmixed = newSampleUnexplainedVariance.mul(config.getMeanFieldAdmixingRatio()).addi(sampleUnexplainedVariance.mul(1 - config.getMeanFieldAdmixingRatio()));
/* calculate the error */
final double errorNormInfinity = CoverageModelEMWorkspaceMathUtils.getINDArrayNormInfinity(newSampleUnexplainedVarianceAdmixed.sub(sampleUnexplainedVariance));
/* update local copy */
sampleUnexplainedVariance.assign(newSampleUnexplainedVarianceAdmixed);
/* push to workers */
pushToWorkers(newSampleUnexplainedVarianceAdmixed, (arr, cb) -> cb.cloneWithUpdatedPrimitive(CoverageModelEMComputeBlock.CoverageModelICGCacheNode.gamma_s, newSampleUnexplainedVarianceAdmixed));
final int iterations = (int) (numberOfEvaluations.stream().mapToDouble(d -> d).sum() / numSamples);
return SubroutineSignal.builder().put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity).put(StandardSubroutineSignals.ITERATIONS, iterations).build();
}
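This last snippet resolves Function to java.util.function.Function (written fully qualified, presumably to avoid clashing with Spark's Function elsewhere in the class). The heart of the update is one-dimensional root finding per sample. The sketch below shows that step with the stock commons-math BrentSolver standing in for GATK's RobustBrentSolver and with a made-up objective function, so it illustrates only the pattern, not the coverage model itself; the class name is hypothetical.

import org.apache.commons.math3.analysis.UnivariateFunction;
import org.apache.commons.math3.analysis.solvers.BrentSolver;

public class SampleVarianceRootExample {
  public static void main(String[] args) {
    // Hypothetical objective for one sample, with its root at gamma = 0.25.
    UnivariateFunction objective = gamma -> Math.log1p(gamma) - Math.log1p(0.25);
    BrentSolver solver = new BrentSolver(1e-8, 1e-10, 1e-12);
    double gammaUpperLimit = 10.0;      // analogous to getSampleSpecificVarianceUpperLimit()
    double x0 = 0.5 * gammaUpperLimit;  // same starting guess as in the workspace code
    double root = solver.solve(1000, objective, 0.0, gammaUpperLimit, x0);
    System.out.printf("estimated gamma = %.6f%n", root);
  }
}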