use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class GATKTool method initializeIntervals.
/**
* Initialize our intervals for traversal.
*
* If intervals were specified, requires that another data source (reads or reference) be present
* to supply a sequence dictionary. This requirement may be removed in the future.
*
* Must be called after other data sources have been initialized because of the sequence dictionary
* requirement.
*
* Package-private so that engine classes can access it, but concrete tool child classes cannot.
* May be overridden by traversals that require custom initialization of intervals.
*/
void initializeIntervals() {
if (intervalArgumentCollection.intervalsSpecified()) {
final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
if (sequenceDictionary == null) {
throw new UserException("We require a sequence dictionary from a reference, a source of reads, or a source of variants to process intervals. " + "Since reference and reads files generally contain sequence dictionaries, this error most commonly occurs " + "for VariantWalkers that do not require a reference or reads. You can fix the problem by passing a reference file with a sequence dictionary " + "via the -R argument or you can run the tool UpdateVCFSequenceDictionary on your vcf.");
}
intervalsForTraversal = intervalArgumentCollection.getIntervals(sequenceDictionary);
}
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class ReferenceAPISource method getReferenceSequenceDictionaryFromMap.
/**
* Given a reference map, get a SAMSequenceDictionary that lists its contigs and their length.
* Note that this does NOT use the referenceContig order from the API. Instead, we'll sort them
* and try to match the read sequence dictionary order, if given.
*
* @param referenceMap - return value from getReferenceNameToReferenceTable
* @param optReadSequenceDictionaryToMatch - (optional) the sequence dictionary of the reads, we'll match its order if possible.
* @return a SAMSequenceDictionary that lists the referenceset's contigs and their length.
*/
public SAMSequenceDictionary getReferenceSequenceDictionaryFromMap(final Map<String, Reference> referenceMap, final SAMSequenceDictionary optReadSequenceDictionaryToMatch) {
SAMSequenceDictionary refDictionary = new SAMSequenceDictionary();
ArrayList<SAMSequenceRecord> refContigs = new ArrayList<>();
for (Map.Entry<String, Reference> e : referenceMap.entrySet()) {
if (e.getKey() != null && e.getValue().getLength() != null) {
refContigs.add(new SAMSequenceRecord(e.getKey(), e.getValue().getLength().intValue()));
}
}
HashMap<String, Integer> indexBuilder = null;
if (null != optReadSequenceDictionaryToMatch) {
indexBuilder = new LinkedHashMap<>();
for (int i = 0; i < optReadSequenceDictionaryToMatch.size(); i++) {
final SAMSequenceRecord sequence = optReadSequenceDictionaryToMatch.getSequence(i);
indexBuilder.put(sequence.getSequenceName(), i);
}
}
final Map<String, Integer> optReadSequenceDictionaryToMatchIndex = indexBuilder;
// GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs.
// So we sort them.
Collections.sort(refContigs, new Comparator<SAMSequenceRecord>() {
@Override
public int compare(SAMSequenceRecord o1, SAMSequenceRecord o2) {
// if those are ordered in the readDictionary, then match that order
if (null != optReadSequenceDictionaryToMatchIndex) {
if (optReadSequenceDictionaryToMatchIndex.containsKey(o1.getSequenceName()) && optReadSequenceDictionaryToMatchIndex.containsKey(o2.getSequenceName())) {
return optReadSequenceDictionaryToMatchIndex.get(o1.getSequenceName()).compareTo(optReadSequenceDictionaryToMatchIndex.get(o2.getSequenceName()));
}
}
// otherwise, order them in karyotypic order.
int r1 = getRank(o1.getSequenceName());
int r2 = getRank(o2.getSequenceName());
if (r1 < r2)
return -1;
if (r2 < r1)
return 1;
return o1.getSequenceName().compareTo(o2.getSequenceName());
}
private int getRank(String name) {
if (name.equalsIgnoreCase("x"))
return 23;
if (name.equalsIgnoreCase("y"))
return 24;
if (name.equalsIgnoreCase("m"))
return 25;
if (name.equalsIgnoreCase("mt"))
return 25;
StringBuilder b = new StringBuilder();
for (char c : name.toCharArray()) {
if (Character.isDigit(c)) {
b.append(c);
}
}
String numsOnly = b.toString();
if (numsOnly.isEmpty())
return 0;
return Integer.parseInt(numsOnly);
}
});
for (SAMSequenceRecord s : refContigs) {
refDictionary.addSequence(s);
}
return refDictionary;
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class LocusWalkerSpark method getAlignmentsFunction.
/**
* Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
* @param bReferenceSource the reference source broadcast
* @param bFeatureManager the feature manager broadcast
* @param sequenceDictionary the sequence dictionary for the reads
* @param header the reads header
* @param downsamplingInfo the downsampling method for the reads
* @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
*/
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, SAMFileHeader header, LIBSDownsamplingInfo downsamplingInfo) {
return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
SimpleInterval interval = shardedRead.getInterval();
SimpleInterval paddedInterval = shardedRead.getPaddedInterval();
Iterator<GATKRead> readIterator = shardedRead.iterator();
ReferenceDataSource reference = bReferenceSource == null ? null : new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();
final Set<String> samples = header.getReadGroups().stream().map(SAMReadGroupRecord::getSample).collect(Collectors.toSet());
LocusIteratorByState libs = new LocusIteratorByState(readIterator, downsamplingInfo, false, samples, header, true, false);
IntervalOverlappingIterator<AlignmentContext> alignmentContexts = new IntervalOverlappingIterator<>(libs, ImmutableList.of(interval), sequenceDictionary);
final Spliterator<AlignmentContext> alignmentContextSpliterator = Spliterators.spliteratorUnknownSize(alignmentContexts, 0);
return StreamSupport.stream(alignmentContextSpliterator, false).map(alignmentContext -> {
final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
return new LocusWalkerContext(alignmentContext, new ReferenceContext(reference, alignmentInterval), new FeatureContext(fm, alignmentInterval));
}).iterator();
};
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class ReadWalkerSpark method getReadsFunction.
private static FlatMapFunction<Shard<GATKRead>, ReadWalkerContext> getReadsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, int readShardPadding) {
return (FlatMapFunction<Shard<GATKRead>, ReadWalkerContext>) shard -> {
SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(readShardPadding, sequenceDictionary);
ReferenceDataSource reference = bReferenceSource == null ? null : new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
return StreamSupport.stream(shard.spliterator(), false).map(r -> {
final SimpleInterval readInterval = getReadInterval(r);
return new ReadWalkerContext(r, new ReferenceContext(reference, readInterval), new FeatureContext(features, readInterval));
}).iterator();
};
}
use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.
the class PSPairedUnpairedSplitterSparkTest method testReadPairing.
@Test(groups = "spark")
public void testReadPairing() {
final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
final List<GATKRead> readList = new ArrayList<>(2);
final SAMSequenceDictionary seq = new SAMSequenceDictionary();
seq.addSequence(new SAMSequenceRecord("test_seq", 1000));
final SAMFileHeader header = new SAMFileHeader(seq);
final int numReadPairs = 300;
final int numUnpairedReads = 205;
final int numFormerlyPairedReads = 400;
final List<String> pairedReadNames = new ArrayList<>(numReadPairs * 2);
final List<String> unpairedReadNames = new ArrayList<>(numUnpairedReads + numFormerlyPairedReads);
for (int i = 0; i < numReadPairs; i++) {
final String readName = "paired_" + i;
final List<GATKRead> readPair = ArtificialReadUtils.createPair(header, readName, 101, 1, 102, false, false);
readList.addAll(readPair);
pairedReadNames.add(readName);
pairedReadNames.add(readName);
}
for (int i = 0; i < numUnpairedReads; i++) {
final String readName = "unpaired_" + i;
final GATKRead unpairedRead = ArtificialReadUtils.createRandomRead(101);
unpairedRead.setName(readName);
readList.add(unpairedRead);
unpairedReadNames.add(readName);
}
for (int i = 0; i < numFormerlyPairedReads; i++) {
final String readName = "formerly_paired_" + i;
final List<GATKRead> readPair = ArtificialReadUtils.createPair(header, readName, 101, 1, 102, false, false);
final GATKRead formerlyPairedRead = readPair.get(0);
readList.add(formerlyPairedRead);
unpairedReadNames.add(readName);
}
Collections.shuffle(readList);
final int numPartitions = 3;
final JavaRDD<GATKRead> readRDD = ctx.parallelize(readList, numPartitions);
final int readsPerPartitionGuess = (numReadPairs + numUnpairedReads + numFormerlyPairedReads) / numPartitions;
final PSPairedUnpairedSplitterSpark splitter = new PSPairedUnpairedSplitterSpark(readRDD, readsPerPartitionGuess);
final List<GATKRead> resultPaired = splitter.getPairedReads().collect();
final List<GATKRead> resultUnpaired = splitter.getUnpairedReads().collect();
splitter.close();
Assert.assertEquals(resultPaired.size(), pairedReadNames.size());
for (final GATKRead read : resultPaired) {
final String readName = read.getName();
Assert.assertTrue(pairedReadNames.contains(readName));
pairedReadNames.remove(readName);
}
Assert.assertEquals(resultUnpaired.size(), unpairedReadNames.size());
for (final GATKRead read : resultUnpaired) {
final String readName = read.getName();
Assert.assertTrue(unpairedReadNames.contains(readName));
unpairedReadNames.remove(readName);
}
}
Aggregations