Use of org.broadinstitute.hellbender.engine.ReferenceShard in project gatk by broadinstitute.
From the class ShuffleJoinReadsWithRefBases, method addBases.
/**
 * Pairs every read of a read-keyed RDD with the reference bases covering that read's reference window,
 * carrying the value originally associated with the read along.
 *
 * <p>Each entry is keyed by the {@link ReferenceShard} computed from its read's reference window, and the
 * entries are then grouped by shard. For every group the reference source is queried once, over the interval
 * spanning all windows in the group; each read then receives the subset of those bases matching its own window.
 *
 * @param referenceDataflowSource supplies both the reference window function and the reference bases
 * @param keyedByRead the RDD, keyed by read, whose entries should be joined with reference bases
 * @param <T> type of the value associated with each read
 * @return a JavaPairRDD keyed by read whose value holds the original value together with the read's ReferenceBases
 */
public static <T> JavaPairRDD<GATKRead, Tuple2<T, ReferenceBases>> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaPairRDD<GATKRead, T> keyedByRead) {
    final SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();

    // Key every (read, value) entry by the shard of the read's reference window, then gather entries per shard.
    final JavaPairRDD<ReferenceShard, Tuple2<GATKRead, T>> entriesByShard = keyedByRead.mapToPair(
            entry -> new Tuple2<>(ReferenceShard.getShardNumberFromInterval(windowFunction.apply(entry._1())), entry));
    final JavaPairRDD<ReferenceShard, Iterable<Tuple2<GATKRead, T>>> groupedByShard = entriesByShard.groupByKey();

    return groupedByShard.flatMapToPair(shardGroup -> {
        final Iterable<Tuple2<GATKRead, T>> entries = shardGroup._2();

        // One reference lookup per group, over the interval spanning every read window in the group.
        final List<SimpleInterval> windows = Utils.stream(entries)
                .map(Tuple2::_1)
                .map(windowFunction::apply)
                .collect(Collectors.toList());
        final SimpleInterval spanningInterval = IntervalUtils.getSpanningInterval(windows);
        final ReferenceBases spanningBases = referenceDataflowSource.getReferenceBases(null, spanningInterval);

        // Hand each read the slice of the fetched bases that corresponds to its own window.
        final List<Tuple2<GATKRead, Tuple2<T, ReferenceBases>>> joined = Utils.stream(entries)
                .map(entry -> {
                    final GATKRead read = entry._1();
                    final ReferenceBases readBases = spanningBases.getSubset(windowFunction.apply(read));
                    final Tuple2<T, ReferenceBases> valueWithBases = new Tuple2<>(entry._2(), readBases);
                    return new Tuple2<>(read, valueWithBases);
                })
                .collect(Collectors.toList());
        return joined.iterator();
    });
}
Use of org.broadinstitute.hellbender.engine.ReferenceShard in project gatk by broadinstitute.
From the class ShuffleJoinReadsWithRefBases, method addBases.
/**
 * Pairs every read of an RDD with the reference bases covering that read's reference window.
 *
 * <p>Reads are keyed by the {@link ReferenceShard} computed from their reference window and grouped by shard.
 * For every group the reference source is queried once, over the interval spanning all windows in the group;
 * each read then receives the subset of those bases matching its own window.
 *
 * @param referenceDataflowSource supplies both the reference window function and the reference bases
 * @param reads the reads to join with reference bases
 * @return a JavaPairRDD mapping each read to its corresponding ReferenceBases object
 */
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource, final JavaRDD<GATKRead> reads) {
    // TODO: reimpl this method by calling out to the more complex version?
    final SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();

    // Key every read by the shard of its reference window, then gather the reads of each shard together.
    final JavaPairRDD<ReferenceShard, GATKRead> readsByShard = reads.mapToPair(
            read -> new Tuple2<>(ReferenceShard.getShardNumberFromInterval(windowFunction.apply(read)), read));
    final JavaPairRDD<ReferenceShard, Iterable<GATKRead>> groupedByShard = readsByShard.groupByKey();

    return groupedByShard.flatMapToPair(shardGroup -> {
        final Iterable<GATKRead> shardReads = shardGroup._2();

        // One reference lookup per group, over the interval spanning every read window in the group.
        final List<SimpleInterval> windows = Utils.stream(shardReads)
                .map(windowFunction::apply)
                .collect(Collectors.toList());
        final SimpleInterval spanningInterval = IntervalUtils.getSpanningInterval(windows);
        final ReferenceBases spanningBases = referenceDataflowSource.getReferenceBases(null, spanningInterval);

        // Hand each read the slice of the fetched bases that corresponds to its own window.
        final List<Tuple2<GATKRead, ReferenceBases>> joined = Utils.stream(shardReads)
                .map(read -> {
                    final ReferenceBases readBases = spanningBases.getSubset(windowFunction.apply(read));
                    return new Tuple2<>(read, readBases);
                })
                .collect(Collectors.toList());
        return joined.iterator();
    });
}
Aggregations