Search in sources :

Example 86 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class GATKTool method initializeIntervals.

/**
     * Initialize our intervals for traversal.
     *
     * If intervals were specified, requires that another data source (reads or reference) be present
     * to supply a sequence dictionary. This requirement may be removed in the future.
     *
     * Must be called after other data sources have been initialized because of the sequence dictionary
     * requirement.
     *
     * Package-private so that engine classes can access it, but concrete tool child classes cannot.
     * May be overridden by traversals that require custom initialization of intervals.
     */
void initializeIntervals() {
    if (intervalArgumentCollection.intervalsSpecified()) {
        final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
        if (sequenceDictionary == null) {
            throw new UserException("We require a sequence dictionary from a reference, a source of reads, or a source of variants to process intervals.  " + "Since reference and reads files generally contain sequence dictionaries, this error most commonly occurs " + "for VariantWalkers that do not require a reference or reads.  You can fix the problem by passing a reference file with a sequence dictionary " + "via the -R argument or you can run the tool UpdateVCFSequenceDictionary on your vcf.");
        }
        intervalsForTraversal = intervalArgumentCollection.getIntervals(sequenceDictionary);
    }
}
Also used : UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 87 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class ReferenceAPISource method getReferenceSequenceDictionaryFromMap.

/**
     * Given a reference map, get a SAMSequenceDictionary that lists its contigs and their length.
     * Note that this does NOT use the referenceContig order from the API. Instead, we'll sort them
     * and try to match the read sequence dictionary order, if given.
     *
     * @param referenceMap - return value from getReferenceNameToReferenceTable
     * @param optReadSequenceDictionaryToMatch - (optional) the sequence dictionary of the reads, we'll match its order if possible.
     * @return a SAMSequenceDictionary that lists the referenceset's contigs and their length.
     */
public SAMSequenceDictionary getReferenceSequenceDictionaryFromMap(final Map<String, Reference> referenceMap, final SAMSequenceDictionary optReadSequenceDictionaryToMatch) {
    SAMSequenceDictionary refDictionary = new SAMSequenceDictionary();
    ArrayList<SAMSequenceRecord> refContigs = new ArrayList<>();
    for (Map.Entry<String, Reference> e : referenceMap.entrySet()) {
        if (e.getKey() != null && e.getValue().getLength() != null) {
            refContigs.add(new SAMSequenceRecord(e.getKey(), e.getValue().getLength().intValue()));
        }
    }
    HashMap<String, Integer> indexBuilder = null;
    if (null != optReadSequenceDictionaryToMatch) {
        indexBuilder = new LinkedHashMap<>();
        for (int i = 0; i < optReadSequenceDictionaryToMatch.size(); i++) {
            final SAMSequenceRecord sequence = optReadSequenceDictionaryToMatch.getSequence(i);
            indexBuilder.put(sequence.getSequenceName(), i);
        }
    }
    final Map<String, Integer> optReadSequenceDictionaryToMatchIndex = indexBuilder;
    // GATK requires human contigs in karyotypic order: 1, 2, ..., 10, 11, ..., 20, 21, 22, X, Y with M either leading or trailing these contigs.
    // So we sort them.
    Collections.sort(refContigs, new Comparator<SAMSequenceRecord>() {

        @Override
        public int compare(SAMSequenceRecord o1, SAMSequenceRecord o2) {
            // if those are ordered in the readDictionary, then match that order
            if (null != optReadSequenceDictionaryToMatchIndex) {
                if (optReadSequenceDictionaryToMatchIndex.containsKey(o1.getSequenceName()) && optReadSequenceDictionaryToMatchIndex.containsKey(o2.getSequenceName())) {
                    return optReadSequenceDictionaryToMatchIndex.get(o1.getSequenceName()).compareTo(optReadSequenceDictionaryToMatchIndex.get(o2.getSequenceName()));
                }
            }
            // otherwise, order them in karyotypic order.
            int r1 = getRank(o1.getSequenceName());
            int r2 = getRank(o2.getSequenceName());
            if (r1 < r2)
                return -1;
            if (r2 < r1)
                return 1;
            return o1.getSequenceName().compareTo(o2.getSequenceName());
        }

        private int getRank(String name) {
            if (name.equalsIgnoreCase("x"))
                return 23;
            if (name.equalsIgnoreCase("y"))
                return 24;
            if (name.equalsIgnoreCase("m"))
                return 25;
            if (name.equalsIgnoreCase("mt"))
                return 25;
            StringBuilder b = new StringBuilder();
            for (char c : name.toCharArray()) {
                if (Character.isDigit(c)) {
                    b.append(c);
                }
            }
            String numsOnly = b.toString();
            if (numsOnly.isEmpty())
                return 0;
            return Integer.parseInt(numsOnly);
        }
    });
    for (SAMSequenceRecord s : refContigs) {
        refDictionary.addSequence(s);
    }
    return refDictionary;
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 88 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class LocusWalkerSpark method getAlignmentsFunction.

/**
     * Return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
     * @param bReferenceSource the reference source broadcast
     * @param bFeatureManager the feature manager broadcast
     * @param sequenceDictionary the sequence dictionary for the reads
     * @param header the reads header
     * @param downsamplingInfo the downsampling method for the reads
     * @return a function that maps a {@link Shard} of reads into a tuple of alignments and their corresponding reference and features.
     */
private static FlatMapFunction<Shard<GATKRead>, LocusWalkerContext> getAlignmentsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, SAMFileHeader header, LIBSDownsamplingInfo downsamplingInfo) {
    return (FlatMapFunction<Shard<GATKRead>, LocusWalkerContext>) shardedRead -> {
        SimpleInterval interval = shardedRead.getInterval();
        SimpleInterval paddedInterval = shardedRead.getPaddedInterval();
        Iterator<GATKRead> readIterator = shardedRead.iterator();
        ReferenceDataSource reference = bReferenceSource == null ? null : new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
        FeatureManager fm = bFeatureManager == null ? null : bFeatureManager.getValue();
        final Set<String> samples = header.getReadGroups().stream().map(SAMReadGroupRecord::getSample).collect(Collectors.toSet());
        LocusIteratorByState libs = new LocusIteratorByState(readIterator, downsamplingInfo, false, samples, header, true, false);
        IntervalOverlappingIterator<AlignmentContext> alignmentContexts = new IntervalOverlappingIterator<>(libs, ImmutableList.of(interval), sequenceDictionary);
        final Spliterator<AlignmentContext> alignmentContextSpliterator = Spliterators.spliteratorUnknownSize(alignmentContexts, 0);
        return StreamSupport.stream(alignmentContextSpliterator, false).map(alignmentContext -> {
            final SimpleInterval alignmentInterval = new SimpleInterval(alignmentContext);
            return new LocusWalkerContext(alignmentContext, new ReferenceContext(reference, alignmentInterval), new FeatureContext(fm, alignmentInterval));
        }).iterator();
    };
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Broadcast(org.apache.spark.broadcast.Broadcast) java.util(java.util) IntervalOverlappingIterator(org.broadinstitute.hellbender.utils.iterators.IntervalOverlappingIterator) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) ImmutableList(com.google.common.collect.ImmutableList) StreamSupport(java.util.stream.StreamSupport) LIBSDownsamplingInfo(org.broadinstitute.hellbender.utils.locusiterator.LIBSDownsamplingInfo) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) CommandLineException(org.broadinstitute.barclay.argparser.CommandLineException) IntervalOverlappingIterator(org.broadinstitute.hellbender.utils.iterators.IntervalOverlappingIterator) LocusIteratorByState(org.broadinstitute.hellbender.utils.locusiterator.LocusIteratorByState) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 89 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class ReadWalkerSpark method getReadsFunction.

private static FlatMapFunction<Shard<GATKRead>, ReadWalkerContext> getReadsFunction(Broadcast<ReferenceMultiSource> bReferenceSource, Broadcast<FeatureManager> bFeatureManager, SAMSequenceDictionary sequenceDictionary, int readShardPadding) {
    return (FlatMapFunction<Shard<GATKRead>, ReadWalkerContext>) shard -> {
        SimpleInterval paddedInterval = shard.getInterval().expandWithinContig(readShardPadding, sequenceDictionary);
        ReferenceDataSource reference = bReferenceSource == null ? null : new ReferenceMemorySource(bReferenceSource.getValue().getReferenceBases(null, paddedInterval), sequenceDictionary);
        FeatureManager features = bFeatureManager == null ? null : bFeatureManager.getValue();
        return StreamSupport.stream(shard.spliterator(), false).map(r -> {
            final SimpleInterval readInterval = getReadInterval(r);
            return new ReadWalkerContext(r, new ReferenceContext(reference, readInterval), new FeatureContext(features, readInterval));
        }).iterator();
    };
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Broadcast(org.apache.spark.broadcast.Broadcast) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) org.broadinstitute.hellbender.engine(org.broadinstitute.hellbender.engine) List(java.util.List) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) StreamSupport(java.util.stream.StreamSupport) JavaRDD(org.apache.spark.api.java.JavaRDD) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) FlatMapFunction(org.apache.spark.api.java.function.FlatMapFunction) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval)

Example 90 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class PSPairedUnpairedSplitterSparkTest method testReadPairing.

@Test(groups = "spark")
public void testReadPairing() {
    final JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    final List<GATKRead> readList = new ArrayList<>(2);
    final SAMSequenceDictionary seq = new SAMSequenceDictionary();
    seq.addSequence(new SAMSequenceRecord("test_seq", 1000));
    final SAMFileHeader header = new SAMFileHeader(seq);
    final int numReadPairs = 300;
    final int numUnpairedReads = 205;
    final int numFormerlyPairedReads = 400;
    final List<String> pairedReadNames = new ArrayList<>(numReadPairs * 2);
    final List<String> unpairedReadNames = new ArrayList<>(numUnpairedReads + numFormerlyPairedReads);
    for (int i = 0; i < numReadPairs; i++) {
        final String readName = "paired_" + i;
        final List<GATKRead> readPair = ArtificialReadUtils.createPair(header, readName, 101, 1, 102, false, false);
        readList.addAll(readPair);
        pairedReadNames.add(readName);
        pairedReadNames.add(readName);
    }
    for (int i = 0; i < numUnpairedReads; i++) {
        final String readName = "unpaired_" + i;
        final GATKRead unpairedRead = ArtificialReadUtils.createRandomRead(101);
        unpairedRead.setName(readName);
        readList.add(unpairedRead);
        unpairedReadNames.add(readName);
    }
    for (int i = 0; i < numFormerlyPairedReads; i++) {
        final String readName = "formerly_paired_" + i;
        final List<GATKRead> readPair = ArtificialReadUtils.createPair(header, readName, 101, 1, 102, false, false);
        final GATKRead formerlyPairedRead = readPair.get(0);
        readList.add(formerlyPairedRead);
        unpairedReadNames.add(readName);
    }
    Collections.shuffle(readList);
    final int numPartitions = 3;
    final JavaRDD<GATKRead> readRDD = ctx.parallelize(readList, numPartitions);
    final int readsPerPartitionGuess = (numReadPairs + numUnpairedReads + numFormerlyPairedReads) / numPartitions;
    final PSPairedUnpairedSplitterSpark splitter = new PSPairedUnpairedSplitterSpark(readRDD, readsPerPartitionGuess);
    final List<GATKRead> resultPaired = splitter.getPairedReads().collect();
    final List<GATKRead> resultUnpaired = splitter.getUnpairedReads().collect();
    splitter.close();
    Assert.assertEquals(resultPaired.size(), pairedReadNames.size());
    for (final GATKRead read : resultPaired) {
        final String readName = read.getName();
        Assert.assertTrue(pairedReadNames.contains(readName));
        pairedReadNames.remove(readName);
    }
    Assert.assertEquals(resultUnpaired.size(), unpairedReadNames.size());
    for (final GATKRead read : resultUnpaired) {
        final String readName = read.getName();
        Assert.assertTrue(unpairedReadNames.contains(readName));
        unpairedReadNames.remove(readName);
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Test(org.testng.annotations.Test)

Aggregations

SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)110 Test (org.testng.annotations.Test)41 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)37 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)37 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)35 File (java.io.File)31 UserException (org.broadinstitute.hellbender.exceptions.UserException)24 VariantContext (htsjdk.variant.variantcontext.VariantContext)23 Argument (org.broadinstitute.barclay.argparser.Argument)21 Collectors (java.util.stream.Collectors)20 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)20 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)18 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)17 VCFHeader (htsjdk.variant.vcf.VCFHeader)16 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)16 SAMFileHeader (htsjdk.samtools.SAMFileHeader)14 List (java.util.List)14 JavaRDD (org.apache.spark.api.java.JavaRDD)14 Broadcast (org.apache.spark.broadcast.Broadcast)12 StreamSupport (java.util.stream.StreamSupport)11