use of htsjdk.samtools.util.OverlapDetector in project gatk by broadinstitute.
the class RefFlatReader method load.
OverlapDetector<Gene> load() {
final OverlapDetector<Gene> overlapDetector = new OverlapDetector<>(0, 0);
final int expectedColumns = RefFlatColumns.values().length;
final TabbedTextFileWithHeaderParser parser = new TabbedTextFileWithHeaderParser(refFlatFile, RefFlatColumnLabels);
final Map<String, List<TabbedTextFileWithHeaderParser.Row>> refFlatLinesByGene = new LinkedHashMap<>();
for (final TabbedTextFileWithHeaderParser.Row row : parser) {
// getCurrentLineNumber returns the number of the next line
final int lineNumber = parser.getCurrentLineNumber();
if (row.getFields().length != expectedColumns) {
throw new GeneAnnotationException("Wrong number of fields in refFlat file " + refFlatFile + " at line " + lineNumber);
}
final String geneName = row.getField(RefFlatColumns.GENE_NAME.name());
final String transcriptName = row.getField(RefFlatColumns.TRANSCRIPT_NAME.name());
final String transcriptDescription = geneName + ":" + transcriptName;
final String chromosome = row.getField(RefFlatColumns.CHROMOSOME.name());
if (!isSequenceRecognized(chromosome)) {
LOG.debug("Skipping " + transcriptDescription + " due to unrecognized sequence " + chromosome);
} else {
List<TabbedTextFileWithHeaderParser.Row> transcriptLines = refFlatLinesByGene.get(geneName);
if (transcriptLines == null) {
transcriptLines = new ArrayList<>();
refFlatLinesByGene.put(geneName, transcriptLines);
}
transcriptLines.add(row);
}
}
int longestInterval = 0;
int numIntervalsOver1MB = 0;
for (final List<TabbedTextFileWithHeaderParser.Row> transcriptLines : refFlatLinesByGene.values()) {
try {
final Gene gene = makeGeneFromRefFlatLines(transcriptLines);
overlapDetector.addLhs(gene, gene);
if (gene.length() > longestInterval)
longestInterval = gene.length();
if (gene.length() > 1000000)
++numIntervalsOver1MB;
} catch (Exception e) {
LOG.debug(e.getMessage() + " -- skipping");
}
}
LOG.debug("Longest gene: " + longestInterval + "; number of genes > 1MB: " + numIntervalsOver1MB);
return overlapDetector;
}
use of htsjdk.samtools.util.OverlapDetector in project gatk-protected by broadinstitute.
the class HaplotypeCallerSpark method createReadShards.
/**
* Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
* @param shardBoundariesBroadcast broadcast of an {@link OverlapDetector} loaded with the intervals that should be used for creating ReadShards
* @param reads Rdd of {@link GATKRead}
* @return a Rdd of reads grouped into potentially overlapping shards
*/
private static JavaRDD<Shard<GATKRead>> createReadShards(final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast, final JavaRDD<GATKRead> reads) {
final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
final Collection<ShardBoundary> overlappingShards = shardBoundariesBroadcast.value().getOverlaps(read);
return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
});
final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
use of htsjdk.samtools.util.OverlapDetector in project gatk by broadinstitute.
the class HaplotypeCallerSpark method createReadShards.
/**
* Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
* @param shardBoundariesBroadcast broadcast of an {@link OverlapDetector} loaded with the intervals that should be used for creating ReadShards
* @param reads Rdd of {@link GATKRead}
* @return a Rdd of reads grouped into potentially overlapping shards
*/
private static JavaRDD<Shard<GATKRead>> createReadShards(final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast, final JavaRDD<GATKRead> reads) {
final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
final Collection<ShardBoundary> overlappingShards = shardBoundariesBroadcast.value().getOverlaps(read);
return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
});
final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
use of htsjdk.samtools.util.OverlapDetector in project gatk by broadinstitute.
the class RnaSeqMetricsCollector method makeOverlapDetector.
public static OverlapDetector<Interval> makeOverlapDetector(final File samFile, final SAMFileHeader header, final File ribosomalIntervalsFile) {
OverlapDetector<Interval> ribosomalSequenceOverlapDetector = new OverlapDetector<>(0, 0);
if (ribosomalIntervalsFile != null) {
final IntervalList ribosomalIntervals = IntervalList.fromFile(ribosomalIntervalsFile);
try {
SequenceUtil.assertSequenceDictionariesEqual(header.getSequenceDictionary(), ribosomalIntervals.getHeader().getSequenceDictionary());
} catch (SequenceUtil.SequenceListsDifferException e) {
throw new UserException("Sequence dictionaries differ in " + samFile.getAbsolutePath() + " and " + ribosomalIntervalsFile.getAbsolutePath(), e);
}
final IntervalList uniquedRibosomalIntervals = ribosomalIntervals.uniqued();
final List<Interval> intervals = uniquedRibosomalIntervals.getIntervals();
ribosomalSequenceOverlapDetector.addAll(intervals, intervals);
}
return ribosomalSequenceOverlapDetector;
}
Aggregations