Search in sources :

Example 1 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class FindBadGenomicKmersSpark method runTool.

/** Get the list of high copy number kmers in the reference, and write them to a file. */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final SAMFileHeader hdr = getHeaderForReads();
    SAMSequenceDictionary dict = null;
    if (hdr != null)
        dict = hdr.getSequenceDictionary();
    final PipelineOptions options = getAuthenticatedGCSOptions();
    final ReferenceMultiSource referenceMultiSource = getReference();
    Collection<SVKmer> killList = findBadGenomicKmers(ctx, kSize, maxDUSTScore, referenceMultiSource, options, dict);
    if (highCopyFastaFilename != null) {
        killList = uniquify(killList, processFasta(kSize, maxDUSTScore, highCopyFastaFilename, options));
    }
    SVUtils.writeKmersFile(kSize, outputFile, killList);
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) PipelineOptions(com.google.cloud.dataflow.sdk.options.PipelineOptions) SAMFileHeader(htsjdk.samtools.SAMFileHeader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 2 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class SplitIntervals method onTraversalStart.

@Override
public void onTraversalStart() {
    ParamUtils.isPositive(scatterCount, "scatter count must be > 0.");
    if (!outputDir.exists() && !outputDir.mkdir()) {
        throw new RuntimeIOException("Unable to create directory: " + outputDir.getAbsolutePath());
    }
    // in general dictionary will be from the reference, but using -I reads.bam or -F variants.vcf
    // to use the sequence dict from a bam or vcf is also supported
    final SAMSequenceDictionary sequenceDictionary = getBestAvailableSequenceDictionary();
    final List<SimpleInterval> intervals = hasIntervals() ? intervalArgumentCollection.getIntervals(sequenceDictionary) : IntervalUtils.getAllIntervalsForReference(sequenceDictionary);
    final IntervalList intervalList = new IntervalList(sequenceDictionary);
    intervals.stream().map(si -> new Interval(si.getContig(), si.getStart(), si.getEnd())).forEach(intervalList::add);
    final IntervalListScatterer scatterer = new IntervalListScatterer(subdivisionMode);
    final List<IntervalList> scattered = scatterer.scatter(intervalList, scatterCount, false);
    final DecimalFormat formatter = new DecimalFormat("0000");
    IntStream.range(0, scattered.size()).forEach(n -> scattered.get(n).write(new File(outputDir, formatter.format(n) + "-scattered.intervals")));
}
Also used : IntStream(java.util.stream.IntStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) IntervalListScatterer(org.broadinstitute.hellbender.tools.picard.interval.IntervalListScatterer) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) DecimalFormat(java.text.DecimalFormat) IntervalList(htsjdk.samtools.util.IntervalList) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) VariantProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.VariantProgramGroup) IntervalArgumentCollection(org.broadinstitute.hellbender.cmdline.argumentcollections.IntervalArgumentCollection) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) GATKTool(org.broadinstitute.hellbender.engine.GATKTool) Interval(htsjdk.samtools.util.Interval) List(java.util.List) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IntervalListScatterer(org.broadinstitute.hellbender.tools.picard.interval.IntervalListScatterer) IntervalList(htsjdk.samtools.util.IntervalList) DecimalFormat(java.text.DecimalFormat) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) File(java.io.File) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Interval(htsjdk.samtools.util.Interval)

Example 3 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class SVUtils method getRefRDD.

/**
     * Create an RDD from the reference sequences.
     * The reference sequences are transformed into a single, large collection of byte arrays. The collection is then
     * parallelized into an RDD.
     * Each contig that exceeds a size given by REF_RECORD_LEN is broken into a series of REF_RECORD_LEN chunks with a
     * K-1 base overlap between successive chunks. (I.e., for K=63, the last 62 bases in chunk n match the first 62
     * bases in chunk n+1) so that we don't miss any kmers due to the chunking -- we can just kmerize each record
     * independently.
     */
public static JavaRDD<byte[]> getRefRDD(final JavaSparkContext ctx, final int kSize, final ReferenceMultiSource ref, final PipelineOptions options, final SAMSequenceDictionary readsDict, final int ref_record_len, final int ref_records_per_partition) {
    final SAMSequenceDictionary dict = ref.getReferenceSequenceDictionary(readsDict);
    if (dict == null)
        throw new GATKException("No reference dictionary available");
    final int effectiveRecLen = ref_record_len - kSize + 1;
    final List<byte[]> sequenceChunks = new ArrayList<>();
    for (final SAMSequenceRecord rec : dict.getSequences()) {
        final String seqName = rec.getSequenceName();
        final int seqLen = rec.getSequenceLength();
        final SimpleInterval interval = new SimpleInterval(seqName, 1, seqLen);
        try {
            final byte[] bases = ref.getReferenceBases(options, interval).getBases();
            for (int start = 0; start < seqLen; start += effectiveRecLen) {
                sequenceChunks.add(Arrays.copyOfRange(bases, start, Math.min(start + ref_record_len, seqLen)));
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't get reference sequence bases for " + interval, ioe);
        }
    }
    return ctx.parallelize(sequenceChunks, sequenceChunks.size() / ref_records_per_partition + 1);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) GATKException(org.broadinstitute.hellbender.exceptions.GATKException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 4 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class SVVCFWriter method writeVCF.

/**
     * FASTA and Broadcast references are both required because 2bit Broadcast references currently order their
     * sequence dictionaries in a scrambled order, see https://github.com/broadinstitute/gatk/issues/2037.
     */
public static void writeVCF(final PipelineOptions pipelineOptions, final String vcfFileName, final String fastaReference, final JavaRDD<VariantContext> variantContexts, final Logger logger) {
    final SAMSequenceDictionary referenceSequenceDictionary = new ReferenceMultiSource(pipelineOptions, fastaReference, ReferenceWindowFunctions.IDENTITY_FUNCTION).getReferenceSequenceDictionary(null);
    final List<VariantContext> sortedVariantsList = sortVariantsByCoordinate(variantContexts.collect(), referenceSequenceDictionary);
    logNumOfVarByTypes(sortedVariantsList, logger);
    writeVariants(pipelineOptions, vcfFileName, sortedVariantsList, referenceSequenceDictionary);
}
Also used : ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)

Example 5 with SAMSequenceDictionary

use of htsjdk.samtools.SAMSequenceDictionary in project gatk by broadinstitute.

the class GenomeLocParserUnitTest method testParseUnknownSequenceLength.

@Test
public void testParseUnknownSequenceLength() {
    SAMSequenceDictionary seqDict = new SAMSequenceDictionary();
    seqDict.addSequence(new SAMSequenceRecord("1", SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH));
    Assert.assertEquals(seqDict.getSequence("1").getSequenceLength(), SAMSequenceRecord.UNKNOWN_SEQUENCE_LENGTH);
    GenomeLocParser myLocParser = new GenomeLocParser(seqDict);
    GenomeLoc genomeLoc = myLocParser.parseGenomeLoc("1:1-99");
    Assert.assertEquals(genomeLoc.getEnd(), 99);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)

Aggregations

SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)110 Test (org.testng.annotations.Test)41 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)37 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)37 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)35 File (java.io.File)31 UserException (org.broadinstitute.hellbender.exceptions.UserException)24 VariantContext (htsjdk.variant.variantcontext.VariantContext)23 Argument (org.broadinstitute.barclay.argparser.Argument)21 Collectors (java.util.stream.Collectors)20 ReferenceMultiSource (org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource)20 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)18 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)17 VCFHeader (htsjdk.variant.vcf.VCFHeader)16 IntervalUtils (org.broadinstitute.hellbender.utils.IntervalUtils)16 SAMFileHeader (htsjdk.samtools.SAMFileHeader)14 List (java.util.List)14 JavaRDD (org.apache.spark.api.java.JavaRDD)14 Broadcast (org.apache.spark.broadcast.Broadcast)12 StreamSupport (java.util.stream.StreamSupport)11