Search in sources :

Example 1 with ReferenceShard

use of org.broadinstitute.hellbender.engine.ReferenceShard in project gatk by broadinstitute.

the class ShuffleJoinReadsWithRefBases method addBases.

/**
     * Joins each read of an RDD<GATKRead, T> with key's corresponding reference sequence.
     *
     * @param referenceDataflowSource The source of the reference sequence information
     * @param keyedByRead The read-keyed RDD for which to extract reference sequence information
     * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object and the value
     */
public static <T> JavaPairRDD<GATKRead, Tuple2<T, ReferenceBases>> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaPairRDD<GATKRead, T> keyedByRead) {
    SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();
    JavaPairRDD<ReferenceShard, Tuple2<GATKRead, T>> shardRead = keyedByRead.mapToPair(pair -> {
        ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(windowFunction.apply(pair._1()));
        return new Tuple2<>(shard, pair);
    });
    JavaPairRDD<ReferenceShard, Iterable<Tuple2<GATKRead, T>>> shardiRead = shardRead.groupByKey();
    return shardiRead.flatMapToPair(in -> {
        List<Tuple2<GATKRead, Tuple2<T, ReferenceBases>>> out = Lists.newArrayList();
        Iterable<Tuple2<GATKRead, T>> iReads = in._2();
        final List<SimpleInterval> readWindows = Utils.stream(iReads).map(pair -> windowFunction.apply(pair._1())).collect(Collectors.toList());
        SimpleInterval interval = IntervalUtils.getSpanningInterval(readWindows);
        ReferenceBases bases = referenceDataflowSource.getReferenceBases(null, interval);
        for (Tuple2<GATKRead, T> p : iReads) {
            final ReferenceBases subset = bases.getSubset(windowFunction.apply(p._1()));
            out.add(new Tuple2<>(p._1(), new Tuple2<>(p._2(), subset)));
        }
        return out.iterator();
    });
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) List(java.util.List) Lists(com.google.common.collect.Lists) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Utils(org.broadinstitute.hellbender.utils.Utils) StreamSupport(java.util.stream.StreamSupport) SerializableFunction(org.broadinstitute.hellbender.utils.SerializableFunction) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) JavaRDD(org.apache.spark.api.java.JavaRDD) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Tuple2(scala.Tuple2) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 2 with ReferenceShard

use of org.broadinstitute.hellbender.engine.ReferenceShard in project gatk by broadinstitute.

the class ShuffleJoinReadsWithRefBases method addBases.

/**
     * Joins each read of an RDD<GATKRead> with that read's corresponding reference sequence.
     *
     * @param referenceDataflowSource The source of the reference sequence information
     * @param reads The reads for which to extract reference sequence information
     * @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
     */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    // TODO: reimpl this method by calling out to the more complex version?
    SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();
    JavaPairRDD<ReferenceShard, GATKRead> shardRead = reads.mapToPair(gatkRead -> {
        ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(windowFunction.apply(gatkRead));
        return new Tuple2<>(shard, gatkRead);
    });
    JavaPairRDD<ReferenceShard, Iterable<GATKRead>> shardiRead = shardRead.groupByKey();
    return shardiRead.flatMapToPair(in -> {
        List<Tuple2<GATKRead, ReferenceBases>> out = Lists.newArrayList();
        Iterable<GATKRead> iReads = in._2();
        final List<SimpleInterval> readWindows = Utils.stream(iReads).map(read -> windowFunction.apply(read)).collect(Collectors.toList());
        SimpleInterval interval = IntervalUtils.getSpanningInterval(readWindows);
        ReferenceBases bases = referenceDataflowSource.getReferenceBases(null, interval);
        for (GATKRead r : iReads) {
            final ReferenceBases subset = bases.getSubset(windowFunction.apply(r));
            out.add(new Tuple2<>(r, subset));
        }
        return out.iterator();
    });
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Tuple2(scala.Tuple2) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) List(java.util.List) Lists(com.google.common.collect.Lists) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Utils(org.broadinstitute.hellbender.utils.Utils) StreamSupport(java.util.stream.StreamSupport) SerializableFunction(org.broadinstitute.hellbender.utils.SerializableFunction) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) JavaRDD(org.apache.spark.api.java.JavaRDD) ReferenceShard(org.broadinstitute.hellbender.engine.ReferenceShard) ReferenceBases(org.broadinstitute.hellbender.utils.reference.ReferenceBases) Tuple2(scala.Tuple2) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Aggregations

Lists (com.google.common.collect.Lists)2 List (java.util.List)2 Collectors (java.util.stream.Collectors)2 StreamSupport (java.util.stream.StreamSupport)2 JavaPairRDD (org.apache.spark.api.java.JavaPairRDD)2 JavaRDD (org.apache.spark.api.java.JavaRDD)2 ReferenceShard (org.broadinstitute.hellbender.engine.ReferenceShard)2 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)2 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)2 SerializableFunction (org.broadinstitute.hellbender.utils.SerializableFunction)2 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)2 Utils (org.broadinstitute.hellbender.utils.Utils)2 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)2 ReferenceBases (org.broadinstitute.hellbender.utils.reference.ReferenceBases)2 Tuple2 (scala.Tuple2)2