use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.
the class CoverageModelEMWorkspace method updateCopyRatioPosteriorExpectationsSpark.
/**
* The Spark implementation of the E-step update of copy ratio posteriors
*
* @return a {@link SubroutineSignal} containing the update size
*/
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private SubroutineSignal updateCopyRatioPosteriorExpectationsSpark(final double admixingRatio) {
/* local final member variables for lambda capture */
final List<LinearlySpacedIndexBlock> targetBlocks = new ArrayList<>();
targetBlocks.addAll(this.targetBlocks);
final List<Target> targetList = new ArrayList<>();
targetList.addAll(processedTargetList);
final List<String> sampleNameList = new ArrayList<>();
sampleNameList.addAll(processedSampleNameList);
final List<SexGenotypeData> sampleSexGenotypeData = new ArrayList<>();
sampleSexGenotypeData.addAll(processedSampleSexGenotypeData);
final int numTargetBlocks = targetBlocks.size();
final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator = this.copyRatioExpectationsCalculator;
final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
/* make an RDD of copy ratio posterior expectations */
final JavaPairRDD<Integer, CopyRatioExpectations> copyRatioPosteriorExpectationsPairRDD =
        /* fetch copy ratio emission data from workers */
        fetchCopyRatioEmissionDataSpark().mapPartitionsToPair(it -> {
            final List<Tuple2<Integer, CopyRatioExpectations>> newPartitionData = new ArrayList<>();
            while (it.hasNext()) {
                final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
                final int si = prevDatum._1;
                final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder()
                        .sampleName(sampleNameList.get(si))
                        .sampleSexGenotypeData(sampleSexGenotypeData.get(si))
                        .sampleCoverageDepth(sampleReadDepths.getDouble(si))
                        .emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN)
                        .build();
                newPartitionData.add(new Tuple2<>(prevDatum._1,
                        calculator.getCopyRatioPosteriorExpectations(copyRatioCallingMetadata, targetList, prevDatum._2)));
            }
            return newPartitionData.iterator();
        }, true);
/* we need to do two things with copyRatioPosteriorExpectationsPairRDD, so we cache it */
copyRatioPosteriorExpectationsPairRDD.cache();
/* step 1. update log chain posterior expectation on the driver node */
final double[] newSampleLogChainPosteriors = copyRatioPosteriorExpectationsPairRDD
        .mapValues(CopyRatioExpectations::getLogChainPosteriorProbability)
        .collect().stream()
        .sorted(Comparator.comparingInt(t -> t._1))
        .mapToDouble(t -> t._2)
        .toArray();
sampleLogChainPosteriors.assign(Nd4j.create(newSampleLogChainPosteriors, new int[] { numSamples, 1 }));
/* step 2. repartition in target space */
final JavaPairRDD<LinearlySpacedIndexBlock, ImmutablePair<INDArray, INDArray>> blockifiedCopyRatioPosteriorResultsPairRDD =
        copyRatioPosteriorExpectationsPairRDD
                .flatMapToPair(dat -> targetBlocks.stream()
                        .map(tb -> new Tuple2<>(tb, new Tuple2<>(dat._1,
                                ImmutablePair.of(dat._2.getLogCopyRatioMeans(tb), dat._2.getLogCopyRatioVariances(tb)))))
                        .iterator())
                .combineByKey(
                        /* recipe to create a singleton list */
                        Collections::singletonList,
                        /* recipe to add an element to the list */
                        (list, element) -> Stream.concat(list.stream(), Stream.of(element)).collect(Collectors.toList()),
                        /* recipe to concatenate two lists */
                        (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()),
                        /* repartition with respect to target-space blocks */
                        new HashPartitioner(numTargetBlocks))
                .mapValues(list -> list.stream()
                        .sorted(Comparator.comparingInt(t -> t._1))
                        .map(p -> p._2)
                        .map(t -> ImmutablePair.of(Nd4j.create(t.left), Nd4j.create(t.right)))
                        .collect(Collectors.toList()))
                .mapValues(CoverageModelEMWorkspace::stackCopyRatioPosteriorDataForAllSamples);
/* we do not need copy ratio expectations anymore */
copyRatioPosteriorExpectationsPairRDD.unpersist();
/* step 3. merge with computeRDD and update */
computeRDD = computeRDD.join(blockifiedCopyRatioPosteriorResultsPairRDD)
        .mapValues(t -> t._1.cloneWithUpdatedCopyRatioPosteriors(t._2.left, t._2.right, admixingRatio));
cacheWorkers("after E-step for copy ratio update");
/* collect subroutine signals */
final List<SubroutineSignal> sigs = mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
final double errorNormInfinity = Collections.max(sigs.stream()
        .map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM))
        .collect(Collectors.toList()));
return SubroutineSignal.builder().put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity).build();
}
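
The repartitioning idiom in step 2 above can be isolated into a small, self-contained sketch: explode records into (block, payload) pairs, gather them into per-block lists with combineByKey, and give the shuffle a HashPartitioner with one partition per block so that each block's data lands on a single worker. The sketch below is illustrative only and not part of the GATK code; the names (blockId, payload, numBlocks) are placeholders.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import scala.Tuple2;

/**
 * Minimal sketch of the "group by block and repartition" pattern: (blockId, payload) pairs are
 * combined into per-block lists and placed onto numBlocks partitions with a HashPartitioner.
 */
public final class BlockRepartitionSketch {

    public static JavaPairRDD<Integer, List<String>> groupByBlock(final JavaPairRDD<Integer, String> blockKeyedRDD,
                                                                  final int numBlocks) {
        return blockKeyedRDD.combineByKey(
                /* create a singleton list from the first payload seen for a block */
                payload -> {
                    final List<String> list = new ArrayList<>();
                    list.add(payload);
                    return list;
                },
                /* add a payload to an existing per-block list */
                (list, payload) -> { list.add(payload); return list; },
                /* merge two per-block lists coming from different partitions */
                (list1, list2) -> { list1.addAll(list2); return list1; },
                /* one partition per block, so each block ends up on a single worker */
                new HashPartitioner(numBlocks));
    }

    public static void main(final String[] args) {
        try (JavaSparkContext ctx = new JavaSparkContext("local[*]", "BlockRepartitionSketch")) {
            final JavaPairRDD<Integer, String> input = ctx.parallelizePairs(Arrays.asList(
                    new Tuple2<>(0, "sample-0/block-0"),
                    new Tuple2<>(1, "sample-0/block-1"),
                    new Tuple2<>(0, "sample-1/block-0"),
                    new Tuple2<>(1, "sample-1/block-1")));
            groupByBlock(input, 2).collect()
                    .forEach(t -> System.out.println("block " + t._1 + " -> " + t._2));
        }
    }
}

Mutable ArrayList accumulators are used here for brevity; the GATK code builds immutable lists with Collections::singletonList and Stream.concat, which is an equally valid set of combiners.
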
use of org.apache.spark.HashPartitioner in project gatk-protected by broadinstitute.
the class CoverageModelEMWorkspace method updateCopyRatioPosteriorExpectationsSpark (the method body is identical to the gatk implementation shown above).
use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.
the class FindBadGenomicKmersSpark method processRefRDD.
/**
* Do a map/reduce on an RDD of genomic sequences:
* kmerize each sequence, map each kmer to a pair <kmer,1>, reduce by summing values by key, filter out
* <kmer,N> where N <= MAX_KMER_FREQ, and collect the high-frequency kmers back in the driver.
*/
@VisibleForTesting
static List<SVKmer> processRefRDD(final int kSize, final int maxDUSTScore, final int maxKmerFreq, final JavaRDD<byte[]> refRDD) {
final int nPartitions = refRDD.getNumPartitions();
final int hashSize = 2 * REF_RECORDS_PER_PARTITION;
final int arrayCap = REF_RECORDS_PER_PARTITION / 100;
return refRDD.mapPartitions(seqItr -> {
final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts = new HopscotchMap<>(hashSize);
while (seqItr.hasNext()) {
final byte[] seq = seqItr.next();
SVDUSTFilteredKmerizer.stream(seq, kSize, maxDUSTScore, new SVKmerLong())
        .map(kmer -> kmer.canonical(kSize))
        .forEach(kmer -> {
final KmerAndCount entry = kmerCounts.find(kmer);
if (entry == null)
kmerCounts.add(new KmerAndCount((SVKmerLong) kmer));
else
entry.bumpCount();
});
}
return kmerCounts.iterator();
}).mapToPair(entry -> new Tuple2<>(entry.getKey(), entry.getValue()))
        .partitionBy(new HashPartitioner(nPartitions))
        .mapPartitions(pairItr -> {
final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts = new HopscotchMap<>(hashSize);
while (pairItr.hasNext()) {
final Tuple2<SVKmer, Integer> pair = pairItr.next();
final SVKmer kmer = pair._1();
final int count = pair._2();
KmerAndCount entry = kmerCounts.find(kmer);
if (entry == null)
kmerCounts.add(new KmerAndCount((SVKmerLong) kmer, count));
else
entry.bumpCount(count);
}
final List<SVKmer> highFreqKmers = new ArrayList<>(arrayCap);
for (KmerAndCount kmerAndCount : kmerCounts) {
if (kmerAndCount.grabCount() > maxKmerFreq)
highFreqKmers.add(kmerAndCount.getKey());
}
return highFreqKmers.iterator();
}).collect();
}
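
Stripped of the Hopscotch collections and the SVKmer machinery, the structure of processRefRDD is a two-phase count: tally keys locally within each partition, shuffle the (key, count) pairs once with a HashPartitioner so that all partial counts for a key meet in the same partition, merge them, and keep only the keys above a frequency threshold. The sketch below is illustrative only, with HashMap and String standing in for HopscotchMap and SVKmer; the class and method names are hypothetical.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaRDD;
import scala.Tuple2;

/** Illustrative two-phase frequency filter over an RDD of string keys. */
public final class HighFrequencyKeySketch {

    public static List<String> highFrequencyKeys(final JavaRDD<String> keys, final int maxFreq) {
        final int nPartitions = keys.getNumPartitions();
        return keys
                /* phase 1: partial counts within each input partition */
                .mapPartitions(it -> {
                    final Map<String, Integer> counts = new HashMap<>();
                    while (it.hasNext()) {
                        counts.merge(it.next(), 1, Integer::sum);
                    }
                    return counts.entrySet().stream()
                            .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
                            .iterator();
                })
                .mapToPair(t -> new Tuple2<>(t._1(), t._2()))
                /* one shuffle: all partial counts for a key land in the same partition */
                .partitionBy(new HashPartitioner(nPartitions))
                /* phase 2: merge partial counts and keep only keys above the threshold */
                .mapPartitions(it -> {
                    final Map<String, Integer> totals = new HashMap<>();
                    while (it.hasNext()) {
                        final Tuple2<String, Integer> pair = it.next();
                        totals.merge(pair._1(), pair._2(), Integer::sum);
                    }
                    final List<String> frequent = new ArrayList<>();
                    totals.forEach((key, count) -> {
                        if (count > maxFreq) {
                            frequent.add(key);
                        }
                    });
                    return frequent.iterator();
                })
                .collect();
    }
}

The real method follows the same shape: the per-partition HopscotchMap keeps the pre-shuffle data small, and the HashPartitioner guarantees that the second, purely local pass sees every partial count for a given kmer.
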
use of org.apache.spark.HashPartitioner in project beam by apache.
the class GroupCombineFunctions method groupByKeyOnly.
/**
* An implementation of
* {@link org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly}
* for the Spark runner.
*/
public static <K, V> JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupByKeyOnly(
        JavaRDD<WindowedValue<KV<K, V>>> rdd, Coder<K> keyCoder, WindowedValueCoder<V> wvCoder) {
// we use coders to convert objects in the PCollection to byte arrays, so they
// can be transferred over the network for the shuffle.
JavaPairRDD<ByteArray, byte[]> pairRDD = rdd
        .map(new ReifyTimestampsAndWindowsFunction<K, V>())
        .map(WindowingHelpers.<KV<K, WindowedValue<V>>>unwindowFunction())
        .mapToPair(TranslationUtils.<K, WindowedValue<V>>toPairFunction())
        .mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));
// use a default parallelism HashPartitioner.
Partitioner partitioner = new HashPartitioner(rdd.rdd().sparkContext().defaultParallelism());
// using mapPartitions here preserves the partitioner and avoids an unnecessary shuffle downstream.
return pairRDD.groupByKey(partitioner)
        .mapPartitionsToPair(TranslationUtils.pairFunctionToPairFlatMapFunction(
                CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder)), true)
        .mapPartitions(TranslationUtils.<K, Iterable<WindowedValue<V>>>fromPairFlatMapFunction(), true)
        .mapPartitions(TranslationUtils.functionToFlatMapFunction(
                WindowingHelpers.<KV<K, Iterable<WindowedValue<V>>>>windowFunction()), true);
}
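
Beyond the Beam-specific coder plumbing, the HashPartitioner usage above rests on two points: the shuffle key is a coder-encoded byte array wrapped in a ByteArray class (a raw byte[] cannot be a shuffle key because its equals/hashCode use object identity), and the partitioner is sized to the context's default parallelism so the grouped output carries a known partitioner that downstream stages can reuse without another shuffle. A minimal stand-alone sketch of that idea follows; ByteArrayKey is a hypothetical stand-in for Beam's ByteArray, and the encoding of keys and values to bytes is assumed to happen upstream.

import java.io.Serializable;
import java.util.Arrays;
import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;

/** Illustrative grouping of coder-encoded pairs with an explicit HashPartitioner. */
public final class GroupByEncodedKeySketch {

    /** A byte[] wrapper whose equals/hashCode compare contents rather than identity. */
    public static final class ByteArrayKey implements Serializable {
        private final byte[] bytes;

        public ByteArrayKey(final byte[] bytes) {
            this.bytes = bytes;
        }

        @Override
        public boolean equals(final Object other) {
            return other instanceof ByteArrayKey && Arrays.equals(bytes, ((ByteArrayKey) other).bytes);
        }

        @Override
        public int hashCode() {
            return Arrays.hashCode(bytes);
        }
    }

    public static JavaPairRDD<ByteArrayKey, Iterable<byte[]>> groupByEncodedKey(
            final JavaPairRDD<ByteArrayKey, byte[]> encodedPairs) {
        /* size the shuffle to the context's default parallelism; the explicit partitioner also
         * lets later stages reuse the same layout instead of shuffling again */
        final Partitioner partitioner =
                new HashPartitioner(encodedPairs.rdd().sparkContext().defaultParallelism());
        return encodedPairs.groupByKey(partitioner);
    }
}
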
use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.
the class CoverageModelEMWorkspace method joinWithWorkersAndMap.
/**
* A generic function for dispatching a blockified list of objects to their corresponding compute block(s) and applying a map function
*
* If Spark is enabled:
*
* Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the provided
* map function {@code mapper} on the joined RDD, and replaces the reference to the old RDD with the new RDD.
*
* If Spark is disabled:
*
* Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
* {@code mapper} will be called on the value contained in {@code data} and {@link #localComputeBlock}, and
* the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
* by {@code mapper}.
*
* @param data the list to be joined and mapped together with the compute block(s)
* @param mapper a mapper binary function that takes a compute block together with an object of type {@code V} and
* returns a new compute block
* @param <V> the type of the object to be broadcast
*/
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data,
                                       @Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
if (sparkContextIsAvailable) {
final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD = ctx.parallelizePairs(data, numTargetBlocks)
        .partitionBy(new HashPartitioner(numTargetBlocks));
computeRDD = computeRDD.join(newRDD).mapValues(mapper);
} else {
try {
Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
} catch (Exception e) {
throw new RuntimeException("Can not apply the map function to the local compute block: " + e.getMessage());
}
}
}
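
In the Spark branch above, the freshly parallelized per-block list is laid out with the same HashPartitioner scheme as computeRDD, so the join moves only the small new data rather than reshuffling the compute blocks. A generic sketch of that join-and-update step is shown below; it assumes the state RDD is already hash-partitioned with the same number of partitions (as computeRDD is in this workspace), and the names are placeholders rather than GATK types.

import java.util.List;
import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function2;
import scala.Tuple2;

/** Illustrative join of a small driver-side update list against a co-partitioned state RDD. */
public final class JoinWithWorkersSketch {

    public static <K, S, U> JavaPairRDD<K, S> joinAndUpdate(final JavaSparkContext ctx,
                                                            final JavaPairRDD<K, S> stateRDD,
                                                            final List<Tuple2<K, U>> updates,
                                                            final int numPartitions,
                                                            final Function2<S, U, S> updater) {
        /* co-partition the small update RDD with the state RDD so the join shuffles only the updates */
        final JavaPairRDD<K, U> updateRDD = ctx.parallelizePairs(updates, numPartitions)
                .partitionBy(new HashPartitioner(numPartitions));
        return stateRDD.join(updateRDD)
                .mapValues(pair -> updater.call(pair._1(), pair._2()));
    }
}

If the state RDD does not already carry a compatible partitioner, Spark will shuffle it as well; in the workspace, computeRDD is built with the same HashPartitioner, which is what keeps this step cheap.
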