Search in sources :

Example 21 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class IndexUtils method getSamSequenceDictionaryFromIndex.

private static SAMSequenceDictionary getSamSequenceDictionaryFromIndex(final Index index) {
    final List<String> seqNames = index.getSequenceNames();
    if (seqNames == null || seqNames.isEmpty()) {
        return null;
    }
    final SAMSequenceDictionary dict = new SAMSequenceDictionary();
    //use UNKNOWN_SEQUENCE_LENGTH to indicate contigs that will not be compared by length (see SequenceDictionaryUtils.sequenceRecordsAreEquivalent)
    seqNames.forEach(seqName -> dict.addSequence(new SAMSequenceRecord(seqName, SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH)));
    return dict;
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 22 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class AnnotateTargetsIntegrationTest method checkOutputFileContent.

private void checkOutputFileContent(final File outputFile, final boolean mustHaveGCContent, final boolean mustHaveRepeats) throws IOException {
    try (final TargetTableReader outputReader = new TargetTableReader(outputFile);
        final TargetTableReader inputReader = new TargetTableReader(TARGET_FILE);
        final ReferenceFileSource reference = new ReferenceFileSource(REFERENCE)) {
        final SAMSequenceDictionary dictionary = reference.getSequenceDictionary();
        Target outputTarget;
        Target inputTarget;
        do {
            outputTarget = outputReader.readRecord();
            inputTarget = inputReader.readRecord();
            if (outputTarget == inputTarget) {
                Assert.assertNull(outputTarget);
                break;
            }
            Assert.assertNotNull(outputTarget, "too few targets in output");
            Assert.assertNotNull(inputTarget, "too many targets in output");
            Assert.assertEquals(outputTarget.getName(), inputTarget.getName());
            Assert.assertEquals(outputTarget.getInterval(), inputTarget.getInterval());
            final TargetAnnotationCollection annotations = outputTarget.getAnnotations();
            if (mustHaveGCContent) {
                Assert.assertTrue(annotations.hasAnnotation(TargetAnnotation.GC_CONTENT));
                checkOutputGCContent(reference, outputTarget.getInterval(), annotations.getDouble(TargetAnnotation.GC_CONTENT));
            } else {
                Assert.assertFalse(annotations.hasAnnotation(TargetAnnotation.GC_CONTENT));
            }
        } while (true);
    }
}
Also used : SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ReferenceFileSource(org.broadinstitute.hellbender.engine.ReferenceFileSource)

Example 23 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class AnnotateTargetsIntegrationTest method createTargetFile.

@BeforeClass
public void createTargetFile() throws IOException {
    final SAMSequenceDictionary referenceDictionary = resolveReferenceDictionary();
    final List<SimpleInterval> targetIntervals = createRandomIntervals(referenceDictionary, NUMBER_OF_TARGETS, MIN_TARGET_SIZE, MAX_TARGET_SIZE, MEAN_TARGET_SIZE, TARGET_SIZE_STDEV);
    final List<Target> targets = targetIntervals.stream().map(Target::new).collect(Collectors.toList());
    TargetWriter.writeTargetsToFile(TARGET_FILE, targets);
    final Index index = IndexFactory.createIndex(TARGET_FILE, new TargetCodec(), IndexFactory.IndexType.LINEAR);
    final LittleEndianOutputStream stream = new LittleEndianOutputStream(new FileOutputStream(TARGET_FILE_IDX));
    index.write(stream);
    stream.close();
}
Also used : LittleEndianOutputStream(htsjdk.tribble.util.LittleEndianOutputStream) TargetCodec(org.broadinstitute.hellbender.utils.codecs.TargetCodec) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Index(htsjdk.tribble.index.Index) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BeforeClass(org.testng.annotations.BeforeClass)

Example 24 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class FeatureDataSource method getSequenceDictionary.

/**
     * Returns the sequence dictionary for this source of Features.
     * Uses the dictionary from the VCF header (if present) for variant inputs,
     * otherwise attempts to create a sequence dictionary from the index file (if present).
     * Returns null if no dictionary could be created from either the header or the index.
     */
public SAMSequenceDictionary getSequenceDictionary() {
    SAMSequenceDictionary dict = null;
    final Object header = getHeader();
    if (header instanceof VCFHeader) {
        dict = ((VCFHeader) header).getSequenceDictionary();
    }
    if (dict != null && !dict.isEmpty()) {
        return dict;
    }
    if (hasIndex) {
        return IndexUtils.createSequenceDictionaryFromFeatureIndex(new File(featureInput.getFeaturePath()));
    }
    return null;
}
Also used : VCFHeader(htsjdk.variant.vcf.VCFHeader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) File(java.io.File) IndexFeatureFile(org.broadinstitute.hellbender.tools.IndexFeatureFile)

Example 25 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class SparkGenomeReadCounts method collectReads.

private void collectReads() {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("This tool only accepts a single bam/sam/cram as input");
    }
    final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
    if (sampleCollection.sampleCount() > 1) {
        throw new UserException.BadInput("We do not support bams with more than one sample.");
    }
    final String sampleName = sampleCollection.sampleIds().get(0);
    final String[] commentsForRawCoverage = { "##fileFormat  = tsv", "##commandLine = " + getCommandLine(), String.format("##title = Coverage counts in %d base bins for WGS", binsize) };
    final ReadFilter filter = makeGenomeReadFilter();
    final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();
    logger.info("Starting Spark coverage collection...");
    final long coverageCollectionStartTime = System.currentTimeMillis();
    final JavaRDD<GATKRead> rawReads = getReads();
    final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));
    //Note: using a field inside a closure will pull in the whole enclosing object to serialization
    // (which leads to bad performance and can blow up if some objects in the fields are not
    // Serializable - closures always use java Serializable and not Kryo)
    //Solution here is to use a temp variable for binsize because it's just an int.
    final int binsize_tmp = binsize;
    final JavaRDD<SimpleInterval> readIntervals = reads.filter(read -> sequenceDictionary.getSequence(read.getContig()) != null).map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
    final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
    final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
    final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
    final long coverageCollectionEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds", readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));
    final String[] commentsForProportionalCoverage = { commentsForRawCoverage[0], commentsForRawCoverage[1], String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)", binsize, totalReads) };
    logger.info("Creating full genome bins...");
    final long createGenomeBinsStartTime = System.currentTimeMillis();
    final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
    List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
    TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
    final long createGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating genome bins. Elapse of %d seconds", (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));
    logger.info("Creating missing genome bins...");
    final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
    logger.info("Creating missing genome bins: Creating a mutable mapping...");
    final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
    byKeyMutable.putAll(byKey);
    logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
    fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));
    final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds", (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));
    logger.info("Creating final map...");
    final long createFinalMapStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.putAll(byKeyMutable);
    final long createFinalMapEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating final map. Elapse of %d seconds", (createFinalMapEndTime - createFinalMapStartTime) / 1000));
    logger.info("Creating proportional coverage... ");
    final long pCovFileStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
    final long pCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds", (pCovFileEndTime - pCovFileStartTime) / 1000));
    logger.info("Writing raw coverage file ...");
    final long writingCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
    final long writingCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing coverage file. Elapse of %d seconds", (writingCovFileEndTime - writingCovFileStartTime) / 1000));
    logger.info("Writing proportional coverage file ...");
    final long writingPCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted, commentsForProportionalCoverage);
    final long writingPCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds", (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DocumentedFeature(org.broadinstitute.barclay.help.DocumentedFeature) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CopyNumberProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) Logger(org.apache.logging.log4j.Logger) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) UserException(org.broadinstitute.hellbender.exceptions.UserException) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) Target(org.broadinstitute.hellbender.tools.exome.Target) ReadCountCollectionUtils(org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils) TargetWriter(org.broadinstitute.hellbender.tools.exome.TargetWriter) LogManager(org.apache.logging.log4j.LogManager) SampleCollection(org.broadinstitute.hellbender.tools.exome.SampleCollection) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadFilterLibrary(org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Target(org.broadinstitute.hellbender.tools.exome.Target) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) SampleCollection(org.broadinstitute.hellbender.tools.exome.SampleCollection) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) File(java.io.File)

Aggregations

SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)110 Test (org.testng.annotations.Test)41 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)37 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)37 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)35 File (java.io.File)31 UserException (org.broadinstitute.hellbender.exceptions.UserException)24 VariantContext (htsjdk.variant.variantcontext.VariantContext)23 Argument (org.broadinstitute.barclay.argparser.Argument)21 Collectors (java.util.stream.Collectors)20 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)20 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)18 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)17 VCFHeader (htsjdk.variant.vcf.VCFHeader)16 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)16 SAMFileHeader (htsjdk.samtools.SAMFileHeader)14 List (java.util.List)14 JavaRDD (org.apache.spark.api.java.JavaRDD)14 Broadcast (org.apache.spark.broadcast.Broadcast)12 StreamSupport (java.util.stream.StreamSupport)11