use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class RnaSeqMetricsCollector method makeIgnoredSequenceIndicesSet.
public static Set<Integer> makeIgnoredSequenceIndicesSet(final SAMFileHeader header, final Set<String> ignoredSequence) {
final Set<Integer> ignoredSequenceIndices = new HashSet<>();
for (final String sequenceName : ignoredSequence) {
final SAMSequenceRecord sequenceRecord = header.getSequence(sequenceName);
if (sequenceRecord == null) {
throw new UserException("Unrecognized sequence " + sequenceName + " passed as argument to IGNORE_SEQUENCE");
}
ignoredSequenceIndices.add(sequenceRecord.getSequenceIndex());
}
return ignoredSequenceIndices;
}
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class BedToIntervalList method doWork.
@Override
protected Object doWork() {
IOUtil.assertFileIsReadable(INPUT);
IOUtil.assertFileIsReadable(SEQUENCE_DICTIONARY);
IOUtil.assertFileIsWritable(OUTPUT);
try {
final SAMFileHeader header = SamReaderFactory.makeDefault().referenceSequence(REFERENCE_SEQUENCE).getFileHeader(SEQUENCE_DICTIONARY);
final IntervalList intervalList = new IntervalList(header);
/**
* NB: BED is zero-based, but a BEDCodec by default (since it is returns tribble Features) has an offset of one,
* so it returns 1-based starts. Ugh. Set to zero.
*/
final FeatureReader<BEDFeature> bedReader = AbstractFeatureReader.getFeatureReader(INPUT.getAbsolutePath(), new BEDCodec(BEDCodec.StartOffset.ZERO), false);
final CloseableTribbleIterator<BEDFeature> iterator = bedReader.iterator();
final ProgressLogger progressLogger = new ProgressLogger(logger, (int) 1e6);
while (iterator.hasNext()) {
final BEDFeature bedFeature = iterator.next();
final String sequenceName = bedFeature.getContig();
/**
* NB: BED is zero-based, so we need to add one here to make it one-based. Please observe we set the start
* offset to zero when creating the BEDCodec.
*/
final int start = bedFeature.getStart() + 1;
/**
* NB: BED is 0-based OPEN (which, for the end is equivalent to 1-based closed).
*/
final int end = bedFeature.getEnd();
// NB: do not use an empty name within an interval
String name = bedFeature.getName();
if (name.isEmpty())
name = null;
final SAMSequenceRecord sequenceRecord = header.getSequenceDictionary().getSequence(sequenceName);
// Do some validation
if (null == sequenceRecord) {
throw new GATKException(String.format("Sequence '%s' was not found in the sequence dictionary", sequenceName));
} else if (start < 1) {
throw new GATKException(String.format("Start on sequence '%s' was less than one: %d", sequenceName, start));
} else if (sequenceRecord.getSequenceLength() < start) {
throw new GATKException(String.format("Start on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), start));
} else if (end < 1) {
throw new GATKException(String.format("End on sequence '%s' was less than one: %d", sequenceName, end));
} else if (sequenceRecord.getSequenceLength() < end) {
throw new GATKException(String.format("End on sequence '%s' was past the end: %d < %d", sequenceName, sequenceRecord.getSequenceLength(), end));
} else if (end < start - 1) {
throw new GATKException(String.format("On sequence '%s', end < start-1: %d <= %d", sequenceName, end, start));
}
final Interval interval = new Interval(sequenceName, start, end, bedFeature.getStrand() == Strand.POSITIVE, name);
intervalList.add(interval);
progressLogger.record(sequenceName, start);
}
CloserUtil.close(bedReader);
// Sort and write the output
intervalList.uniqued().write(OUTPUT);
} catch (final IOException e) {
throw new RuntimeException(e);
}
return null;
}
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class AddContextDataToReadSparkUnitTest method addContextDataTest.
@Test(dataProvider = "bases", groups = "spark")
public void addContextDataTest(List<GATKRead> reads, List<GATKVariant> variantList, List<KV<GATKRead, ReadContextData>> expectedReadContextData, JoinStrategy joinStrategy) throws IOException {
JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
JavaRDD<GATKRead> rddReads = ctx.parallelize(reads);
JavaRDD<GATKVariant> rddVariants = ctx.parallelize(variantList);
ReferenceMultiSource mockSource = mock(ReferenceMultiSource.class, withSettings().serializable());
when(mockSource.getReferenceBases(any(PipelineOptions.class), any())).then(new ReferenceBasesAnswer());
when(mockSource.getReferenceWindowFunction()).thenReturn(ReferenceWindowFunctions.IDENTITY_FUNCTION);
SAMSequenceDictionary sd = new SAMSequenceDictionary(Lists.newArrayList(new SAMSequenceRecord("1", 100000), new SAMSequenceRecord("2", 100000)));
when(mockSource.getReferenceSequenceDictionary(null)).thenReturn(sd);
JavaPairRDD<GATKRead, ReadContextData> rddActual = AddContextDataToReadSpark.add(ctx, rddReads, mockSource, rddVariants, joinStrategy, sd, 10000, 1000);
Map<GATKRead, ReadContextData> actual = rddActual.collectAsMap();
Assert.assertEquals(actual.size(), expectedReadContextData.size());
for (KV<GATKRead, ReadContextData> kv : expectedReadContextData) {
ReadContextData readContextData = actual.get(kv.getKey());
Assert.assertNotNull(readContextData);
Assert.assertTrue(CollectionUtils.isEqualCollection(Lists.newArrayList(readContextData.getOverlappingVariants()), Lists.newArrayList(kv.getValue().getOverlappingVariants())));
SimpleInterval minimalInterval = kv.getValue().getOverlappingReferenceBases().getInterval();
ReferenceBases subset = readContextData.getOverlappingReferenceBases().getSubset(minimalInterval);
Assert.assertEquals(subset, kv.getValue().getOverlappingReferenceBases());
}
}
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class FindBadGenomicKmersSparkUnitTest method miniRefTest.
@Test(groups = "spark")
public void miniRefTest() throws IOException {
final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
final ReferenceMultiSource ref = new ReferenceMultiSource((PipelineOptions) null, REFERENCE_FILE_NAME, ReferenceWindowFunctions.IDENTITY_FUNCTION);
final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(null);
if (dict == null)
throw new GATKException("No reference dictionary available.");
final Map<SVKmer, Long> kmerMap = new LinkedHashMap<>();
for (final SAMSequenceRecord rec : dict.getSequences()) {
final SimpleInterval interval = new SimpleInterval(rec.getSequenceName(), 1, rec.getSequenceLength());
final byte[] bases = ref.getReferenceBases(null, interval).getBases();
final SVKmerizer kmerizer = new SVKmerizer(bases, KMER_SIZE, new SVKmerLong());
while (kmerizer.hasNext()) {
final SVKmer kmer = kmerizer.next().canonical(KMER_SIZE);
final Long currentCount = kmerMap.getOrDefault(kmer, 0L);
kmerMap.put(kmer, currentCount + 1);
}
}
final Iterator<Map.Entry<SVKmer, Long>> kmerIterator = kmerMap.entrySet().iterator();
while (kmerIterator.hasNext()) {
if (kmerIterator.next().getValue() <= FindBadGenomicKmersSpark.MAX_KMER_FREQ)
kmerIterator.remove();
}
final List<SVKmer> badKmers = FindBadGenomicKmersSpark.findBadGenomicKmers(ctx, KMER_SIZE, Integer.MAX_VALUE, ref, null, null);
final Set<SVKmer> badKmerSet = new HashSet<>(badKmers);
Assert.assertEquals(badKmers.size(), badKmerSet.size());
Assert.assertEquals(badKmerSet, kmerMap.keySet());
}
use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.
the class SATagBuilderUnitTests method testGetArtificalReadsBasedOnSATag.
@Test
public void testGetArtificalReadsBasedOnSATag() {
// setup the record
final SAMFileHeader header = new SAMFileHeader();
header.addSequence(new SAMSequenceRecord("1", 1000));
header.addSequence(new SAMSequenceRecord("2", 1000));
GATKRead read = ArtificialReadUtils.createArtificialRead(header, "name", "2", 1, "AAAAAAAAAA".getBytes(), "##########".getBytes());
read.setIsFirstOfPair();
read.setCigar(TextCigarCodec.decode("10M"));
read.setMatePosition("2", 1);
//spec says first 'SA' record will be the primary record
read.setIsSupplementaryAlignment(true);
SATagBuilder builder = new SATagBuilder(read);
// check no alignments if no SA tag */
Assert.assertEquals(builder.getArtificialReadsBasedOnSATag(header).size(), 0);
// Tests that parsing existing tags works properly
read.setAttribute("SA", "2,500,+,3S2=1X2=2S,60,1;" + "1,191,-,8M2S,60,0;");
builder = new SATagBuilder(read);
List<GATKRead> suppl = builder.getArtificialReadsBasedOnSATag(header);
Assert.assertNotNull(suppl);
Assert.assertEquals(suppl.size(), 2);
for (final GATKRead other : suppl) {
Assert.assertFalse(other.isUnmapped());
Assert.assertTrue(other.isPaired());
Assert.assertFalse(other.mateIsUnmapped());
Assert.assertEquals(other.getMateContig(), read.getMateContig());
Assert.assertEquals(other.getName(), read.getName());
}
GATKRead other = suppl.get(0);
//1st of suppl and 'record' is supplementary
Assert.assertFalse(other.isSupplementaryAlignment());
Assert.assertEquals(other.getContig(), "2");
Assert.assertEquals(other.getStart(), 500);
Assert.assertEquals(other.getMappingQuality(), 60);
Assert.assertEquals((int) other.getAttributeAsInteger("NM"), 1);
Assert.assertEquals(other.getCigar().toString(), "3S2=1X2=2S");
other = suppl.get(1);
Assert.assertTrue(other.isSupplementaryAlignment());
Assert.assertEquals(other.getContig(), "1");
Assert.assertEquals(other.getStart(), 191);
Assert.assertTrue(other.isReverseStrand());
Assert.assertEquals(other.getMappingQuality(), 60);
Assert.assertEquals((int) other.getAttributeAsInteger("NM"), 0);
Assert.assertEquals(other.getCigar().toString(), "8M2S");
}
Aggregations