Search in sources :

Example 41 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class ReferenceUtilsUnitTest method testLoadFastaDictionaryFromStream.

@Test
public void testLoadFastaDictionaryFromStream() throws IOException {
    try (final ClosingAwareFileInputStream referenceDictionaryStream = new ClosingAwareFileInputStream(new File(ReferenceUtils.getFastaDictionaryFileName(hg19MiniReference)))) {
        final SAMSequenceDictionary dictionary = ReferenceUtils.loadFastaDictionary(referenceDictionaryStream);
        Assert.assertNotNull(dictionary, "Sequence dictionary null after loading");
        Assert.assertEquals(dictionary.size(), 4, "Wrong sequence dictionary size after loading");
        Assert.assertFalse(referenceDictionaryStream.isClosed(), "InputStream was improperly closed by ReferenceUtils.loadFastaDictionary()");
    }
}
Also used : SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 42 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class ReferenceUtilsUnitTest method testLoadFastaDictionaryFromGCSBucket.

@Test(groups = { "bucket" })
public void testLoadFastaDictionaryFromGCSBucket() throws IOException {
    final String bucketDictionary = getGCPTestInputPath() + "org/broadinstitute/hellbender/utils/ReferenceUtilsTest.dict";
    final PipelineOptions popts = getAuthenticatedPipelineOptions();
    try (final InputStream referenceDictionaryStream = BucketUtils.openFile(bucketDictionary)) {
        final SAMSequenceDictionary dictionary = ReferenceUtils.loadFastaDictionary(referenceDictionaryStream);
        Assert.assertNotNull(dictionary, "Sequence dictionary null after loading");
        Assert.assertEquals(dictionary.size(), 4, "Wrong sequence dictionary size after loading");
    }
}
Also used : PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Example 43 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk-protected by broadinstitute.

the class SparkGenomeReadCounts method collectReads.

private void collectReads() {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("This tool only accepts a single bam/sam/cram as input");
    }
    final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
    if (sampleCollection.sampleCount() > 1) {
        throw new UserException.BadInput("We do not support bams with more than one sample.");
    }
    final String sampleName = sampleCollection.sampleIds().get(0);
    final String[] commentsForRawCoverage = { "##fileFormat  = tsv", "##commandLine = " + getCommandLine(), String.format("##title = Coverage counts in %d base bins for WGS", binsize) };
    final ReadFilter filter = makeGenomeReadFilter();
    final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();
    logger.info("Starting Spark coverage collection...");
    final long coverageCollectionStartTime = System.currentTimeMillis();
    final JavaRDD<GATKRead> rawReads = getReads();
    final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));
    //Note: using a field inside a closure will pull in the whole enclosing object to serialization
    // (which leads to bad performance and can blow up if some objects in the fields are not
    // Serializable - closures always use java Serializable and not Kryo)
    //Solution here is to use a temp variable for binsize because it's just an int.
    final int binsize_tmp = binsize;
    final JavaRDD<SimpleInterval> readIntervals = reads.filter(read -> sequenceDictionary.getSequence(read.getContig()) != null).map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
    final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
    final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
    final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
    final long coverageCollectionEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds", readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));
    final String[] commentsForProportionalCoverage = { commentsForRawCoverage[0], commentsForRawCoverage[1], String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)", binsize, totalReads) };
    logger.info("Creating full genome bins...");
    final long createGenomeBinsStartTime = System.currentTimeMillis();
    final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
    List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
    TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
    final long createGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating genome bins. Elapse of %d seconds", (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));
    logger.info("Creating missing genome bins...");
    final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
    logger.info("Creating missing genome bins: Creating a mutable mapping...");
    final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
    byKeyMutable.putAll(byKey);
    logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
    fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));
    final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds", (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));
    logger.info("Creating final map...");
    final long createFinalMapStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.putAll(byKeyMutable);
    final long createFinalMapEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating final map. Elapse of %d seconds", (createFinalMapEndTime - createFinalMapStartTime) / 1000));
    logger.info("Creating proportional coverage... ");
    final long pCovFileStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
    final long pCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds", (pCovFileEndTime - pCovFileStartTime) / 1000));
    logger.info("Writing raw coverage file ...");
    final long writingCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
    final long writingCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing coverage file. Elapse of %d seconds", (writingCovFileEndTime - writingCovFileStartTime) / 1000));
    logger.info("Writing proportional coverage file ...");
    final long writingPCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted, commentsForProportionalCoverage);
    final long writingPCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds", (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DocumentedFeature(org.broadinstitute.barclay.help.DocumentedFeature) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CopyNumberProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) Logger(org.apache.logging.log4j.Logger) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) UserException(org.broadinstitute.hellbender.exceptions.UserException) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) Target(org.broadinstitute.hellbender.tools.exome.Target) ReadCountCollectionUtils(org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils) TargetWriter(org.broadinstitute.hellbender.tools.exome.TargetWriter) LogManager(org.apache.logging.log4j.LogManager) SampleCollection(org.broadinstitute.hellbender.tools.exome.SampleCollection) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadFilterLibrary(org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Target(org.broadinstitute.hellbender.tools.exome.Target) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) SampleCollection(org.broadinstitute.hellbender.tools.exome.SampleCollection) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) File(java.io.File)

Example 44 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk-protected by broadinstitute.

the class SplitIntervals method onTraversalStart.

@Override
public void onTraversalStart() {
    ParamUtils.isPositive(scatterCount, "scatter count must be > 0.");
    if (!outputDir.exists() && !outputDir.mkdir()) {
        throw new RuntimeIOException("Unable to create directory: " + outputDir.getAbsolutePath());
    }
    // in general dictionary will be from the reference, but using -I reads.bam or -F variants.vcf
    // to use the sequence dict from a bam or vcf is also supported
    final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    final List<SimpleInterval> intervals = hasIntervals() ? intervalArgumentCollection.getIntervals(sequenceDictionary) : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final IntervalList intervalList = new IntervalList(sequenceDictionary);
    intervals.stream().map(si -> new Interval(si.getContig(), si.getStart(), si.getEnd())).forEach(intervalList::add);
    final IntervalListScatterer scatterer = new IntervalListScatterer(subdivisionMode);
    final List<IntervalList> scattered = scatterer.scatter(intervalList, scatterCount, false);
    final DecimalFormat formatter = new DecimalFormat("0000");
    IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals")));
}
Also used : IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) IntervalListScatterer(org.broadinstitute.hellbender.tools.picard.interval.IntervalListScatterer) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) DecimalFormat(java.text.DecimalFormat) IntervalList(htsjdk.samtools.util.IntervalList) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) VariantProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup) IntervalArgumentCollection(org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) GATKTool(org.broadinstitute.hellbender.engine.GATKTool) Interval(htsjdk.samtools.util.Interval) List(java.util.List) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IntervalListScatterer(org.broadinstitute.hellbender.tools.picard.interval.IntervalListScatterer) IntervalList(htsjdk.samtools.util.IntervalList) DecimalFormat(java.text.DecimalFormat) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) File(java.io.File) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Interval(htsjdk.samtools.util.Interval)

Example 45 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk-protected by broadinstitute.

the class PlotACNVResults method doWork.

@Override
protected Object doWork() {
    checkRegularReadableUserFiles();
    //get sample name from input files (consistency check is performed)
    final String sampleName = getSampleName();
    //load contig names and lengths from the sequence dictionary into a LinkedHashMap
    final SAMSequenceDictionary sequenceDictionary = ReferenceUtils.loadFastaDictionary(sequenceDictionaryFile);
    Utils.validateArg(sequenceDictionary.getSequences().stream().map(SAMSequenceRecord::getSequenceName).noneMatch(n -> n.contains(CONTIG_DELIMITER)), String.format("Contig names cannot contain \"%s\".", CONTIG_DELIMITER));
    final Map<String, Integer> contigLengthMap = sequenceDictionary.getSequences().stream().filter(s -> s.getSequenceLength() >= minContigLength).collect(Collectors.toMap(SAMSequenceRecord::getSequenceName, SAMSequenceRecord::getSequenceLength, (c, l) -> {
        throw new IllegalArgumentException(String.format("Duplicate contig in sequence dictionary: %s", c));
    }, LinkedHashMap::new));
    Utils.validateArg(contigLengthMap.size() > 0, "There must be at least one contig above the threshold length in the sequence dictionary.");
    logger.info("Contigs above length threshold: " + contigLengthMap.toString());
    //check that contigs in input files are present in sequence dictionary and that data points are valid given lengths
    validateContigs(contigLengthMap);
    //generate the plots
    final List<String> contigNames = new ArrayList<>(contigLengthMap.keySet());
    final List<Integer> contigLengths = new ArrayList<>(contigLengthMap.values());
    writeSegmentedAlleleFractionPlot(sampleName, contigNames, contigLengths);
    return "SUCCESS";
}
Also used : DocumentedFeature(org.broadinstitute.barclay.help.DocumentedFeature) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) Resource(org.broadinstitute.hellbender.utils.io.Resource) java.util(java.util) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) CopyNumberProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) org.broadinstitute.hellbender.cmdline(org.broadinstitute.hellbender.cmdline) AllelicCount(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount) org.broadinstitute.hellbender.tools.exome(org.broadinstitute.hellbender.tools.exome) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) UserException(org.broadinstitute.hellbender.exceptions.UserException) ReferenceUtils(org.broadinstitute.hellbender.utils.reference.ReferenceUtils) Utils(org.broadinstitute.hellbender.utils.Utils) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) AllelicCountCollection(org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCountCollection) RScriptExecutor(org.broadinstitute.hellbender.utils.R.RScriptExecutor) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Aggregations

SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)110 Test (org.testng.annotations.Test)41 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)37 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)37 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)35 File (java.io.File)31 UserException (org.broadinstitute.hellbender.exceptions.UserException)24 VariantContext (htsjdk.variant.variantcontext.VariantContext)23 Argument (org.broadinstitute.barclay.argparser.Argument)21 Collectors (java.util.stream.Collectors)20 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)20 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)18 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)17 VCFHeader (htsjdk.variant.vcf.VCFHeader)16 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)16 SAMFileHeader (htsjdk.samtools.SAMFileHeader)14 List (java.util.List)14 JavaRDD (org.apache.spark.api.java.JavaRDD)14 Broadcast (org.apache.spark.broadcast.Broadcast)12 StreamSupport (java.util.stream.StreamSupport)11