Example 1 with HashPartitioner

use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.

the class CoverageModelEMWorkspace method updateCopyRatioPosteriorExpectationsSpark.

/**
 * The Spark implementation of the E-step update of copy ratio posteriors
 *
 * @return a {@link SubroutineSignal} containing the update size
 */
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private SubroutineSignal updateCopyRatioPosteriorExpectationsSpark(final double admixingRatio) {
    /* local final member variables for lambda capture */
    final List<LinearlySpacedIndexBlock> targetBlocks = new ArrayList<>();
    targetBlocks.addAll(this.targetBlocks);
    final List<Target> targetList = new ArrayList<>();
    targetList.addAll(processedTargetList);
    final List<String> sampleNameList = new ArrayList<>();
    sampleNameList.addAll(processedSampleNameList);
    final List<SexGenotypeData> sampleSexGenotypeData = new ArrayList<>();
    sampleSexGenotypeData.addAll(processedSampleSexGenotypeData);
    final int numTargetBlocks = targetBlocks.size();
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator = this.copyRatioExpectationsCalculator;
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    /* make an RDD of copy ratio posterior expectations */
    final JavaPairRDD<Integer, CopyRatioExpectations> copyRatioPosteriorExpectationsPairRDD = /* fetch copy ratio emission data from workers */
    fetchCopyRatioEmissionDataSpark().mapPartitionsToPair(it -> {
        final List<Tuple2<Integer, CopyRatioExpectations>> newPartitionData = new ArrayList<>();
        while (it.hasNext()) {
            final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
            final int si = prevDatum._1;
            final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder().sampleName(sampleNameList.get(si)).sampleSexGenotypeData(sampleSexGenotypeData.get(si)).sampleCoverageDepth(sampleReadDepths.getDouble(si)).emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN).build();
            newPartitionData.add(new Tuple2<>(prevDatum._1, calculator.getCopyRatioPosteriorExpectations(copyRatioCallingMetadata, targetList, prevDatum._2)));
        }
        return newPartitionData.iterator();
    }, true);
    /* we need to do two things with copyRatioPosteriorExpectationsPairRDD, so we cache it */
    /* step 1. update log chain posterior expectation on the driver node */
    final double[] newSampleLogChainPosteriors = copyRatioPosteriorExpectationsPairRDD.mapValues(CopyRatioExpectations::getLogChainPosteriorProbability).collect().stream().sorted(Comparator.comparingInt(t -> t._1)).mapToDouble(t -> t._2).toArray();
    sampleLogChainPosteriors.assign(Nd4j.create(newSampleLogChainPosteriors, new int[] { numSamples, 1 }));
    /* step 2. repartition in target space */
    final JavaPairRDD<LinearlySpacedIndexBlock, ImmutablePair<INDArray, INDArray>> blockifiedCopyRatioPosteriorResultsPairRDD = copyRatioPosteriorExpectationsPairRDD.flatMapToPair(dat -> targetBlocks.stream().map(tb -> new Tuple2<>(tb, new Tuple2<>(dat._1, ImmutablePair.of(dat._2.getLogCopyRatioMeans(tb), dat._2.getLogCopyRatioVariances(tb))))).iterator()).combineByKey(/* recipe to create a singleton list */
    Collections::singletonList, /* recipe to add an element to the list */
    (list, element) -> Stream.concat(list.stream(), Stream.of(element)).collect(Collectors.toList()), /* recipe to concatenate two lists */
    (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()), /* repartition with respect to target space blocks */
    new HashPartitioner(numTargetBlocks)).mapValues(list -> list.stream().sorted(Comparator.comparingInt(t -> t._1)).map(p -> p._2).map(t -> ImmutablePair.of(Nd4j.create(t.left), Nd4j.create(t.right))).collect(Collectors.toList())).mapValues(CoverageModelEMWorkspace::stackCopyRatioPosteriorDataForAllSamples);
    /* we do not need copy ratio expectations anymore */
    copyRatioPosteriorExpectationsPairRDD.unpersist();
    /* step 3. merge with computeRDD and update */
    computeRDD = computeRDD.join(blockifiedCopyRatioPosteriorResultsPairRDD).mapValues(t -> t._1.cloneWithUpdatedCopyRatioPosteriors(t._2.left, t._2.right, admixingRatio));
    cacheWorkers("after E-step for copy ratio update");
    /* collect subroutine signals */
    final List<SubroutineSignal> sigs = mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
    final double errorNormInfinity = Collections.max(sigs.stream().map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM)).collect(Collectors.toList()));
    return SubroutineSignal.builder().put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity).build();
}
Also used : ScalarProducer(org.broadinstitute.hellbender.utils.hmm.interfaces.ScalarProducer) Function2(org.apache.spark.api.java.function.Function2) HMMSegmentProcessor(org.broadinstitute.hellbender.utils.hmm.segmentation.HMMSegmentProcessor) GermlinePloidyAnnotatedTargetCollection(org.broadinstitute.hellbender.tools.exome.sexgenotyper.GermlinePloidyAnnotatedTargetCollection) HiddenStateSegmentRecordWriter(org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecordWriter) BiFunction(java.util.function.BiFunction) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) SexGenotypeData(org.broadinstitute.hellbender.tools.exome.sexgenotyper.SexGenotypeData) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) CallStringProducer(org.broadinstitute.hellbender.utils.hmm.interfaces.CallStringProducer) StorageLevel(org.apache.spark.storage.StorageLevel) SynchronizedUnivariateSolver(org.broadinstitute.hellbender.tools.coveragemodel.math.SynchronizedUnivariateSolver) CopyRatioExpectationsCalculator(org.broadinstitute.hellbender.tools.coveragemodel.interfaces.CopyRatioExpectationsCalculator) UnivariateSolverSpecifications(org.broadinstitute.hellbender.tools.coveragemodel.math.UnivariateSolverSpecifications) IndexRange(org.broadinstitute.hellbender.utils.IndexRange) Broadcast(org.apache.spark.broadcast.Broadcast) ExitStatus(org.broadinstitute.hellbender.tools.coveragemodel.linalg.IterativeLinearSolverNDArray.ExitStatus) SexGenotypeDataCollection(org.broadinstitute.hellbender.tools.exome.sexgenotyper.SexGenotypeDataCollection) HashPartitioner(org.apache.spark.HashPartitioner) Predicate(java.util.function.Predicate) GeneralLinearOperator(org.broadinstitute.hellbender.tools.coveragemodel.linalg.GeneralLinearOperator) Nd4j(org.nd4j.linalg.factory.Nd4j) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) FastMath(org.apache.commons.math3.util.FastMath) org.broadinstitute.hellbender.tools.exome(org.broadinstitute.hellbender.tools.exome) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) AbstractUnivariateSolver(org.apache.commons.math3.analysis.solvers.AbstractUnivariateSolver) FourierLinearOperatorNDArray(org.broadinstitute.hellbender.tools.coveragemodel.linalg.FourierLinearOperatorNDArray) Logger(org.apache.logging.log4j.Logger) Stream(java.util.stream.Stream) UserException(org.broadinstitute.hellbender.exceptions.UserException) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) TooManyEvaluationsException(org.apache.commons.math3.exception.TooManyEvaluationsException) Utils(org.broadinstitute.hellbender.utils.Utils) Function(org.apache.spark.api.java.function.Function) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) IntStream(java.util.stream.IntStream) java.util(java.util) NDArrayIndex(org.nd4j.linalg.indexing.NDArrayIndex) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) AlleleMetadataProducer(org.broadinstitute.hellbender.utils.hmm.interfaces.AlleleMetadataProducer) EmissionCalculationStrategy(org.broadinstitute.hellbender.tools.coveragemodel.CoverageModelCopyRatioEmissionProbabilityCalculator.EmissionCalculationStrategy) RobustBrentSolver(org.broadinstitute.hellbender.tools.coveragemodel.math.RobustBrentSolver) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) HiddenStateSegmentRecord(org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord) 
ImmutableTriple(org.apache.commons.lang3.tuple.ImmutableTriple) IterativeLinearSolverNDArray(org.broadinstitute.hellbender.tools.coveragemodel.linalg.IterativeLinearSolverNDArray) GATKProtectedMathUtils(org.broadinstitute.hellbender.utils.GATKProtectedMathUtils) Nd4jIOUtils(org.broadinstitute.hellbender.tools.coveragemodel.nd4jutils.Nd4jIOUtils) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) File(java.io.File) INDArray(org.nd4j.linalg.api.ndarray.INDArray) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Transforms(org.nd4j.linalg.ops.transforms.Transforms) LogManager(org.apache.logging.log4j.LogManager) NoBracketingException(org.apache.commons.math3.exception.NoBracketingException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) Tuple2(scala.Tuple2) SexGenotypeData(org.broadinstitute.hellbender.tools.exome.sexgenotyper.SexGenotypeData) HashPartitioner(org.apache.spark.HashPartitioner)
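
The core of the repartitioning step above is combineByKey with an explicit HashPartitioner: the values for each target-space block are gathered into a list that lands in a deterministic partition. Below is a minimal standalone sketch of that recipe; the class name, sample data, and block count are hypothetical and stand in for the workspace's per-sample posteriors.

import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public final class CombineByKeyWithHashPartitionerSketch {

    public static void main(final String[] args) {
        final JavaSparkContext ctx =
                new JavaSparkContext("local[*]", "combineByKey-hash-partitioner-demo");

        // records keyed by a block index, analogous to (target block, per-sample data)
        final JavaPairRDD<Integer, Double> records = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>(0, 1.0), new Tuple2<>(1, 2.5),
                new Tuple2<>(0, 3.0), new Tuple2<>(1, 4.5)));

        final int numBlocks = 2;

        // combineByKey with an explicit HashPartitioner: values for the same key are
        // collected into a list and placed in a deterministic partition
        final JavaPairRDD<Integer, List<Double>> grouped = records.combineByKey(
                Collections::singletonList,
                (list, element) -> Stream.concat(list.stream(), Stream.of(element))
                        .collect(Collectors.toList()),
                (list1, list2) -> Stream.concat(list1.stream(), list2.stream())
                        .collect(Collectors.toList()),
                new HashPartitioner(numBlocks));

        grouped.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));
        ctx.stop();
    }
}

Passing the partitioner directly to combineByKey groups and repartitions in a single shuffle, which is why the GATK code above needs no separate partitionBy call.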

Example 2 with HashPartitioner

use of org.apache.spark.HashPartitioner in project gatk-protected by broadinstitute.

the class CoverageModelEMWorkspace method updateCopyRatioPosteriorExpectationsSpark.

/**
 * The Spark implementation of the E-step update of copy ratio posteriors
 *
 * @return a {@link SubroutineSignal} containing the update size
 */
@EvaluatesRDD
@UpdatesRDD
@CachesRDD
private SubroutineSignal updateCopyRatioPosteriorExpectationsSpark(final double admixingRatio) {
    /* local final member variables for lambda capture */
    final List<LinearlySpacedIndexBlock> targetBlocks = new ArrayList<>();
    targetBlocks.addAll(this.targetBlocks);
    final List<Target> targetList = new ArrayList<>();
    targetList.addAll(processedTargetList);
    final List<String> sampleNameList = new ArrayList<>();
    sampleNameList.addAll(processedSampleNameList);
    final List<SexGenotypeData> sampleSexGenotypeData = new ArrayList<>();
    sampleSexGenotypeData.addAll(processedSampleSexGenotypeData);
    final int numTargetBlocks = targetBlocks.size();
    final CopyRatioExpectationsCalculator<CoverageModelCopyRatioEmissionData, STATE> calculator = this.copyRatioExpectationsCalculator;
    final INDArray sampleReadDepths = Transforms.exp(sampleMeanLogReadDepths, true);
    /* make an RDD of copy ratio posterior expectations */
    final JavaPairRDD<Integer, CopyRatioExpectations> copyRatioPosteriorExpectationsPairRDD = /* fetch copy ratio emission data from workers */
    fetchCopyRatioEmissionDataSpark().mapPartitionsToPair(it -> {
        final List<Tuple2<Integer, CopyRatioExpectations>> newPartitionData = new ArrayList<>();
        while (it.hasNext()) {
            final Tuple2<Integer, List<CoverageModelCopyRatioEmissionData>> prevDatum = it.next();
            final int si = prevDatum._1;
            final CopyRatioCallingMetadata copyRatioCallingMetadata = CopyRatioCallingMetadata.builder().sampleName(sampleNameList.get(si)).sampleSexGenotypeData(sampleSexGenotypeData.get(si)).sampleCoverageDepth(sampleReadDepths.getDouble(si)).emissionCalculationStrategy(EmissionCalculationStrategy.HYBRID_POISSON_GAUSSIAN).build();
            newPartitionData.add(new Tuple2<>(prevDatum._1, calculator.getCopyRatioPosteriorExpectations(copyRatioCallingMetadata, targetList, prevDatum._2)));
        }
        return newPartitionData.iterator();
    }, true);
    /* we need to do two things with copyRatioPosteriorExpectationsPairRDD, so we cache it */
    /* step 1. update log chain posterior expectation on the driver node */
    final double[] newSampleLogChainPosteriors = copyRatioPosteriorExpectationsPairRDD.mapValues(CopyRatioExpectations::getLogChainPosteriorProbability).collect().stream().sorted(Comparator.comparingInt(t -> t._1)).mapToDouble(t -> t._2).toArray();
    sampleLogChainPosteriors.assign(Nd4j.create(newSampleLogChainPosteriors, new int[] { numSamples, 1 }));
    /* step 2. repartition in target space */
    final JavaPairRDD<LinearlySpacedIndexBlock, ImmutablePair<INDArray, INDArray>> blockifiedCopyRatioPosteriorResultsPairRDD = copyRatioPosteriorExpectationsPairRDD.flatMapToPair(dat -> targetBlocks.stream().map(tb -> new Tuple2<>(tb, new Tuple2<>(dat._1, ImmutablePair.of(dat._2.getLogCopyRatioMeans(tb), dat._2.getLogCopyRatioVariances(tb))))).iterator()).combineByKey(/* recipe to create a singleton list */
    Collections::singletonList, /* recipe to add an element to the list */
    (list, element) -> Stream.concat(list.stream(), Stream.of(element)).collect(Collectors.toList()), /* recipe to concatenate two lists */
    (list1, list2) -> Stream.concat(list1.stream(), list2.stream()).collect(Collectors.toList()), /* repartition with respect to target space blocks */
    new HashPartitioner(numTargetBlocks)).mapValues(list -> list.stream().sorted(Comparator.comparingInt(t -> t._1)).map(p -> p._2).map(t -> ImmutablePair.of(Nd4j.create(t.left), Nd4j.create(t.right))).collect(Collectors.toList())).mapValues(CoverageModelEMWorkspace::stackCopyRatioPosteriorDataForAllSamples);
    /* we do not need copy ratio expectations anymore */
    copyRatioPosteriorExpectationsPairRDD.unpersist();
    /* step 3. merge with computeRDD and update */
    computeRDD = computeRDD.join(blockifiedCopyRatioPosteriorResultsPairRDD).mapValues(t -> t._1.cloneWithUpdatedCopyRatioPosteriors(t._2.left, t._2.right, admixingRatio));
    cacheWorkers("after E-step for copy ratio update");
    /* collect subroutine signals */
    final List<SubroutineSignal> sigs = mapWorkersAndCollect(CoverageModelEMComputeBlock::getLatestMStepSignal);
    final double errorNormInfinity = Collections.max(sigs.stream().map(sig -> sig.<Double>get(StandardSubroutineSignals.RESIDUAL_ERROR_NORM)).collect(Collectors.toList()));
    return SubroutineSignal.builder().put(StandardSubroutineSignals.RESIDUAL_ERROR_NORM, errorNormInfinity).build();
}
Also used : ScalarProducer(org.broadinstitute.hellbender.utils.hmm.interfaces.ScalarProducer) Function2(org.apache.spark.api.java.function.Function2) HMMSegmentProcessor(org.broadinstitute.hellbender.utils.hmm.segmentation.HMMSegmentProcessor) GermlinePloidyAnnotatedTargetCollection(org.broadinstitute.hellbender.tools.exome.sexgenotyper.GermlinePloidyAnnotatedTargetCollection) HiddenStateSegmentRecordWriter(org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecordWriter) BiFunction(java.util.function.BiFunction) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) SexGenotypeData(org.broadinstitute.hellbender.tools.exome.sexgenotyper.SexGenotypeData) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) CallStringProducer(org.broadinstitute.hellbender.utils.hmm.interfaces.CallStringProducer) StorageLevel(org.apache.spark.storage.StorageLevel) SynchronizedUnivariateSolver(org.broadinstitute.hellbender.tools.coveragemodel.math.SynchronizedUnivariateSolver) CopyRatioExpectationsCalculator(org.broadinstitute.hellbender.tools.coveragemodel.interfaces.CopyRatioExpectationsCalculator) UnivariateSolverSpecifications(org.broadinstitute.hellbender.tools.coveragemodel.math.UnivariateSolverSpecifications) IndexRange(org.broadinstitute.hellbender.utils.IndexRange) Broadcast(org.apache.spark.broadcast.Broadcast) ExitStatus(org.broadinstitute.hellbender.tools.coveragemodel.linalg.IterativeLinearSolverNDArray.ExitStatus) SexGenotypeDataCollection(org.broadinstitute.hellbender.tools.exome.sexgenotyper.SexGenotypeDataCollection) HashPartitioner(org.apache.spark.HashPartitioner) Predicate(java.util.function.Predicate) GeneralLinearOperator(org.broadinstitute.hellbender.tools.coveragemodel.linalg.GeneralLinearOperator) Nd4j(org.nd4j.linalg.factory.Nd4j) INDArrayIndex(org.nd4j.linalg.indexing.INDArrayIndex) FastMath(org.apache.commons.math3.util.FastMath) org.broadinstitute.hellbender.tools.exome(org.broadinstitute.hellbender.tools.exome) Tuple2(scala.Tuple2) Collectors(java.util.stream.Collectors) Sets(com.google.common.collect.Sets) AbstractUnivariateSolver(org.apache.commons.math3.analysis.solvers.AbstractUnivariateSolver) FourierLinearOperatorNDArray(org.broadinstitute.hellbender.tools.coveragemodel.linalg.FourierLinearOperatorNDArray) Logger(org.apache.logging.log4j.Logger) Stream(java.util.stream.Stream) UserException(org.broadinstitute.hellbender.exceptions.UserException) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) TooManyEvaluationsException(org.apache.commons.math3.exception.TooManyEvaluationsException) Utils(org.broadinstitute.hellbender.utils.Utils) Function(org.apache.spark.api.java.function.Function) DataBuffer(org.nd4j.linalg.api.buffer.DataBuffer) IntStream(java.util.stream.IntStream) java.util(java.util) NDArrayIndex(org.nd4j.linalg.indexing.NDArrayIndex) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) AlleleMetadataProducer(org.broadinstitute.hellbender.utils.hmm.interfaces.AlleleMetadataProducer) EmissionCalculationStrategy(org.broadinstitute.hellbender.tools.coveragemodel.CoverageModelCopyRatioEmissionProbabilityCalculator.EmissionCalculationStrategy) RobustBrentSolver(org.broadinstitute.hellbender.tools.coveragemodel.math.RobustBrentSolver) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) Nonnull(javax.annotation.Nonnull) Nullable(javax.annotation.Nullable) HiddenStateSegmentRecord(org.broadinstitute.hellbender.utils.hmm.segmentation.HiddenStateSegmentRecord) 
ImmutableTriple(org.apache.commons.lang3.tuple.ImmutableTriple) IterativeLinearSolverNDArray(org.broadinstitute.hellbender.tools.coveragemodel.linalg.IterativeLinearSolverNDArray) GATKProtectedMathUtils(org.broadinstitute.hellbender.utils.GATKProtectedMathUtils) Nd4jIOUtils(org.broadinstitute.hellbender.tools.coveragemodel.nd4jutils.Nd4jIOUtils) IOException(java.io.IOException) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) File(java.io.File) INDArray(org.nd4j.linalg.api.ndarray.INDArray) VisibleForTesting(com.google.common.annotations.VisibleForTesting) Transforms(org.nd4j.linalg.ops.transforms.Transforms) LogManager(org.apache.logging.log4j.LogManager) NoBracketingException(org.apache.commons.math3.exception.NoBracketingException) INDArray(org.nd4j.linalg.api.ndarray.INDArray) ImmutablePair(org.apache.commons.lang3.tuple.ImmutablePair) Tuple2(scala.Tuple2) SexGenotypeData(org.broadinstitute.hellbender.tools.exome.sexgenotyper.SexGenotypeData) HashPartitioner(org.apache.spark.HashPartitioner)

Example 3 with HashPartitioner

use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.

the class FindBadGenomicKmersSpark method processRefRDD.

/**
 * Do a map/reduce on an RDD of genomic sequences:
 * kmerize, map each kmer to a pair <kmer,1>, reduce by summing values by key, filter out pairs <kmer,N> where
 * N <= MAX_KMER_FREQ, and collect the high-frequency kmers back in the driver.
 */
@VisibleForTesting
static List<SVKmer> processRefRDD(final int kSize, final int maxDUSTScore, final int maxKmerFreq, final JavaRDD<byte[]> refRDD) {
    final int nPartitions = refRDD.getNumPartitions();
    final int hashSize = 2 * REF_RECORDS_PER_PARTITION;
    final int arrayCap = REF_RECORDS_PER_PARTITION / 100;
    return refRDD.mapPartitions(seqItr -> {
        final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts = new HopscotchMap<>(hashSize);
        while (seqItr.hasNext()) {
            final byte[] seq = seqItr.next();
            SVDUSTFilteredKmerizer.stream(seq, kSize, maxDUSTScore, new SVKmerLong()).map(kmer -> kmer.canonical(kSize)).forEach(kmer -> {
                final KmerAndCount entry = kmerCounts.find(kmer);
                if (entry == null)
                    kmerCounts.add(new KmerAndCount((SVKmerLong) kmer));
                else
                    entry.bumpCount();
            });
        }
        return kmerCounts.iterator();
    }).mapToPair(entry -> new Tuple2<>(entry.getKey(), entry.getValue())).partitionBy(new HashPartitioner(nPartitions)).mapPartitions(pairItr -> {
        final HopscotchMap<SVKmer, Integer, KmerAndCount> kmerCounts = new HopscotchMap<>(hashSize);
        while (pairItr.hasNext()) {
            final Tuple2<SVKmer, Integer> pair = pairItr.next();
            final SVKmer kmer = pair._1();
            final int count = pair._2();
            KmerAndCount entry = kmerCounts.find(kmer);
            if (entry == null)
                kmerCounts.add(new KmerAndCount((SVKmerLong) kmer, count));
            else
                entry.bumpCount(count);
        }
        final List<SVKmer> highFreqKmers = new ArrayList<>(arrayCap);
        for (KmerAndCount kmerAndCount : kmerCounts) {
            if (kmerAndCount.grabCount() > maxKmerFreq)
                highFreqKmers.add(kmerAndCount.getKey());
        }
        return highFreqKmers.iterator();
    }).collect();
}
Also used : Output(com.esotericsoftware.kryo.io.Output) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SAMFileHeader(htsjdk.samtools.SAMFileHeader) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) Kryo(com.esotericsoftware.kryo.Kryo) BucketUtils(org.broadinstitute.hellbender.utils.gcs.BucketUtils) HopscotchMap(org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap) Input(com.esotericsoftware.kryo.io.Input) HopscotchSet(org.broadinstitute.hellbender.tools.spark.utils.HopscotchSet) JavaRDD(org.apache.spark.api.java.JavaRDD) DefaultSerializer(com.esotericsoftware.kryo.DefaultSerializer) HashPartitioner(org.apache.spark.HashPartitioner) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) IOException(java.io.IOException) Tuple2(scala.Tuple2) InputStreamReader(java.io.InputStreamReader) StructuralVariationSparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.StructuralVariationSparkProgramGroup) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) VisibleForTesting(com.google.common.annotations.VisibleForTesting) BufferedReader(java.io.BufferedReader) HopscotchMap(org.broadinstitute.hellbender.tools.spark.utils.HopscotchMap) Tuple2(scala.Tuple2) HashPartitioner(org.apache.spark.HashPartitioner) VisibleForTesting(com.google.common.annotations.VisibleForTesting)
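
The method above follows a two-pass pattern: partial counts within each partition, a single shuffle via partitionBy with a HashPartitioner so all partial counts for a kmer meet in the same partition, then a second per-partition reduction and a frequency filter. Below is a reduced sketch of that pattern on plain strings; the class name, token data, and maxFreq threshold are hypothetical.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public final class PartitionByThenAggregateSketch {

    public static void main(final String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[*]", "partitionBy-demo");
        final JavaRDD<String> tokens =
                ctx.parallelize(Arrays.asList("acgt", "acgt", "ttta", "acgt", "ttta", "cccc"), 3);
        final int nPartitions = tokens.getNumPartitions();
        // hypothetical frequency threshold, standing in for MAX_KMER_FREQ
        final int maxFreq = 2;

        // pass 1: partial counts within each partition
        final JavaPairRDD<String, Integer> partialCounts = tokens.mapPartitions(it -> {
            final Map<String, Integer> counts = new HashMap<>();
            while (it.hasNext()) {
                counts.merge(it.next(), 1, Integer::sum);
            }
            return counts.entrySet().stream()
                    .map(e -> new Tuple2<>(e.getKey(), e.getValue()))
                    .iterator();
        }).mapToPair(t -> new Tuple2<>(t._1(), t._2()));

        // shuffle once: all partial counts for a token land in a deterministic partition
        final List<String> frequent = partialCounts
                .partitionBy(new HashPartitioner(nPartitions))
                .mapPartitions(pairItr -> {
                    // pass 2: final counts and frequency filter, entirely within the partition
                    final Map<String, Integer> counts = new HashMap<>();
                    while (pairItr.hasNext()) {
                        final Tuple2<String, Integer> pair = pairItr.next();
                        counts.merge(pair._1(), pair._2(), Integer::sum);
                    }
                    final List<String> highFreq = new ArrayList<>();
                    counts.forEach((token, count) -> {
                        if (count > maxFreq) {
                            highFreq.add(token);
                        }
                    });
                    return highFreq.iterator();
                })
                .collect();

        // prints [acgt]: the only token seen more than maxFreq times
        System.out.println(frequent);
        ctx.stop();
    }
}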

Example 4 with HashPartitioner

use of org.apache.spark.HashPartitioner in project beam by apache.

the class GroupCombineFunctions method groupByKeyOnly.

/**
   * An implementation of
   * {@link org.apache.beam.runners.core.GroupByKeyViaGroupByKeyOnly.GroupByKeyOnly}
   * for the Spark runner.
   */
public static <K, V> JavaRDD<WindowedValue<KV<K, Iterable<WindowedValue<V>>>>> groupByKeyOnly(JavaRDD<WindowedValue<KV<K, V>>> rdd, Coder<K> keyCoder, WindowedValueCoder<V> wvCoder) {
    // we use coders to convert objects in the PCollection to byte arrays, so they
    // can be transferred over the network for the shuffle.
    JavaPairRDD<ByteArray, byte[]> pairRDD = rdd.map(new ReifyTimestampsAndWindowsFunction<K, V>()).map(WindowingHelpers.<KV<K, WindowedValue<V>>>unwindowFunction()).mapToPair(TranslationUtils.<K, WindowedValue<V>>toPairFunction()).mapToPair(CoderHelpers.toByteFunction(keyCoder, wvCoder));
    // use a default parallelism HashPartitioner.
    Partitioner partitioner = new HashPartitioner(rdd.rdd().sparkContext().defaultParallelism());
    // the subsequent mapPartitions calls preserve the partitioning (preservesPartitioning = true)
    // and avoid an unnecessary shuffle downstream.
    return pairRDD.groupByKey(partitioner).mapPartitionsToPair(TranslationUtils.pairFunctionToPairFlatMapFunction(CoderHelpers.fromByteFunctionIterable(keyCoder, wvCoder)), true).mapPartitions(TranslationUtils.<K, Iterable<WindowedValue<V>>>fromPairFlatMapFunction(), true).mapPartitions(TranslationUtils.functionToFlatMapFunction(WindowingHelpers.<KV<K, Iterable<WindowedValue<V>>>>windowFunction()), true);
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) KV(org.apache.beam.sdk.values.KV) ByteArray(org.apache.beam.runners.spark.util.ByteArray) HashPartitioner(org.apache.spark.HashPartitioner) KV(org.apache.beam.sdk.values.KV) HashPartitioner(org.apache.spark.HashPartitioner) Partitioner(org.apache.spark.Partitioner)
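
A minimal sketch of the same idea in plain Spark Java, assuming a local context: size a HashPartitioner from the default parallelism and hand it to groupByKey, as the Beam runner does above. The class name and sample pairs are hypothetical.

import java.util.Arrays;

import org.apache.spark.HashPartitioner;
import org.apache.spark.Partitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public final class GroupByKeyWithDefaultParallelismSketch {

    public static void main(final String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[*]", "groupByKey-demo");

        final JavaPairRDD<String, Integer> pairs = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>("a", 1), new Tuple2<>("b", 2), new Tuple2<>("a", 3)));

        // size the partitioner from the cluster's default parallelism, as the Beam runner does
        final Partitioner partitioner =
                new HashPartitioner(ctx.sc().defaultParallelism());

        final JavaPairRDD<String, Iterable<Integer>> grouped = pairs.groupByKey(partitioner);

        grouped.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));
        ctx.stop();
    }
}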

Example 5 with HashPartitioner

use of org.apache.spark.HashPartitioner in project gatk by broadinstitute.

the class CoverageModelEMWorkspace method joinWithWorkersAndMap.

/**
 * A generic function for joining a blockified list of objects with their corresponding compute block(s)
 * and applying a map
 *
 * If Spark is enabled:
 *
 *      Joins an instance of {@code List<Tuple2<LinearlySpacedIndexBlock, V>>} with {@link #computeRDD}, calls the provided
 *      mapper {@code mapper} on the joined RDD, and replaces the reference to the old RDD with the new RDD.
 *
 * If Spark is disabled:
 *
 *      Only a single target-space block is assumed, such that {@code data} is a singleton. The map function
 *      {@code mapper} is called on the value contained in {@code data} and {@link #localComputeBlock}, and
 *      the old instance of {@link CoverageModelEMComputeBlock} is replaced with the new instance returned
 *      by {@code mapper}.
 *
 * @param data the list to be joined and mapped together with the compute block(s)
 * @param mapper a binary mapper function that takes a compute block together with an object of type {@code V} and
 *               returns a new compute block
 * @param <V> the type of the objects to be broadcast
 */
@UpdatesRDD
private <V> void joinWithWorkersAndMap(@Nonnull final List<Tuple2<LinearlySpacedIndexBlock, V>> data, @Nonnull final Function<Tuple2<CoverageModelEMComputeBlock, V>, CoverageModelEMComputeBlock> mapper) {
    if (sparkContextIsAvailable) {
        final JavaPairRDD<LinearlySpacedIndexBlock, V> newRDD = ctx.parallelizePairs(data, numTargetBlocks).partitionBy(new HashPartitioner(numTargetBlocks));
        computeRDD = computeRDD.join(newRDD).mapValues(mapper);
    } else {
        try {
            Utils.validateArg(data.size() == 1, "Only a single data block is expected in the local mode");
            localComputeBlock = mapper.call(new Tuple2<>(localComputeBlock, data.get(0)._2));
        } catch (Exception e) {
            throw new RuntimeException("Can not apply the map function to the local compute block: " + e.getMessage());
        }
    }
}
Also used : Tuple2(scala.Tuple2) HashPartitioner(org.apache.spark.HashPartitioner) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) UserException(org.broadinstitute.hellbender.exceptions.UserException) TooManyEvaluationsException(org.apache.commons.math3.exception.TooManyEvaluationsException) IOException(java.io.IOException) NoBracketingException(org.apache.commons.math3.exception.NoBracketingException)
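
In the Spark branch above, the freshly parallelized data is partitioned with the same HashPartitioner (and the same partition count) as computeRDD, so the join can line the two RDDs up without reshuffling the large, cached side. Below is a self-contained sketch of that co-partitioning idea, with hypothetical keys and values.

import java.util.Arrays;

import org.apache.spark.HashPartitioner;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

public final class PartitionBeforeJoinSketch {

    public static void main(final String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext("local[*]", "join-demo");
        final int numBlocks = 4;
        final HashPartitioner partitioner = new HashPartitioner(numBlocks);

        // a long-lived "compute" RDD keyed by block index, partitioned once and cached
        final JavaPairRDD<Integer, String> computeRDD = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>(0, "block-0"), new Tuple2<>(1, "block-1"),
                new Tuple2<>(2, "block-2"), new Tuple2<>(3, "block-3")))
                .partitionBy(partitioner)
                .cache();

        // small per-iteration updates, parallelized and partitioned with the same partitioner
        // so the join does not need to reshuffle the cached side
        final JavaPairRDD<Integer, Double> updates = ctx.parallelizePairs(Arrays.asList(
                new Tuple2<>(0, 0.1), new Tuple2<>(1, 0.2),
                new Tuple2<>(2, 0.3), new Tuple2<>(3, 0.4)), numBlocks)
                .partitionBy(partitioner);

        final JavaPairRDD<Integer, String> updated = computeRDD.join(updates)
                .mapValues(t -> t._1 + " updated with " + t._2);

        updated.collect().forEach(t -> System.out.println(t._1 + " -> " + t._2));
        ctx.stop();
    }
}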

Aggregations

HashPartitioner (org.apache.spark.HashPartitioner)11 IOException (java.io.IOException)8 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)8 Tuple2 (scala.Tuple2)8 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)7 UserException (org.broadinstitute.hellbender.exceptions.UserException)7 VisibleForTesting (com.google.common.annotations.VisibleForTesting)6 java.util (java.util)6 NoBracketingException (org.apache.commons.math3.exception.NoBracketingException)6 TooManyEvaluationsException (org.apache.commons.math3.exception.TooManyEvaluationsException)6 File (java.io.File)5 Collectors (java.util.stream.Collectors)5 IntStream (java.util.stream.IntStream)5 LogManager (org.apache.logging.log4j.LogManager)5 Logger (org.apache.logging.log4j.Logger)5 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)5 Broadcast (org.apache.spark.broadcast.Broadcast)5 StorageLevel (org.apache.spark.storage.StorageLevel)5 Sets (com.google.common.collect.Sets)4 BiFunction (java.util.function.BiFunction)4