
Example 61 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

In class SparkSharderUnitTest, method testPartitionReadExtents.

@Test
public void testPartitionReadExtents() throws IOException {
    JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
    // Consider the following reads.
    // This test checks the partition read extents when the reads are divided into
    // different numbers of partitions (1, 2, or 3), and with different sequence dictionaries.
    //                      1                   2
    //    1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7
    // ---------------------------------------------------------
    // Reads
    //   [-----] chr 1
    //           [-----] chr 1
    //               [-----] chr 1
    //                       [-----] chr 2
    //                         [-----] chr 2
    //                           [-----] chr 2
    // ---------------------------------------------------------
    //    1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7
    ImmutableList<TestRead> reads = ImmutableList.of(
            new TestRead("1", 1, 3), new TestRead("1", 5, 7), new TestRead("1", 7, 9),
            new TestRead("2", 11, 13), new TestRead("2", 12, 14), new TestRead("2", 13, 15));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 1), sequenceDictionary, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 100)),
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("2", 1, 50))));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 2), sequenceDictionary, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 100)),
                    // since last read of partition 0 _could_ end at start of first read in partition 1 + max read length
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("2", 1, 14)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("2", 11, 50))));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 3), sequenceDictionary, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 10)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("1", 7, 100)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("2", 1, 15)),
                    new SparkSharder.PartitionLocatable<>(2, new SimpleInterval("2", 12, 50))));
    // Use a different dictionary with contig 3 at the end
    SAMSequenceDictionary sequenceDictionary123 = new SAMSequenceDictionary(ImmutableList.of(
            new SAMSequenceRecord("1", 100), new SAMSequenceRecord("2", 50), new SAMSequenceRecord("3", 25)));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 1), sequenceDictionary123, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 100)),
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("2", 1, 50)),
                    // partition could contain contig 3 reads
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("3", 1, 25))));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 2), sequenceDictionary123, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 100)),
                    // since last read of partition 0 _could_ end at start of first read in partition 1 + max read length
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("2", 1, 14)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("2", 11, 50)),
                    // partition could contain contig 3 reads
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("3", 1, 25))));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 3), sequenceDictionary123, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 10)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("1", 7, 100)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("2", 1, 15)),
                    new SparkSharder.PartitionLocatable<>(2, new SimpleInterval("2", 12, 50)),
                    // partition could contain contig 3 reads
                    new SparkSharder.PartitionLocatable<>(2, new SimpleInterval("3", 1, 25))));
    // Use a different dictionary with contig X between contigs 1 and 2
    SAMSequenceDictionary sequenceDictionary1X2 = new SAMSequenceDictionary(ImmutableList.of(
            new SAMSequenceRecord("1", 100), new SAMSequenceRecord("X", 75), new SAMSequenceRecord("2", 50)));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 1), sequenceDictionary1X2, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 100)),
                    // partition could contain contig X reads
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("X", 1, 75)),
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("2", 1, 50))));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 2), sequenceDictionary1X2, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 100)),
                    // partition could contain contig X reads
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("X", 1, 75)),
                    // since last read of partition 0 _could_ end at start of first read in partition 1 + max read length
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("2", 1, 14)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("2", 11, 50))));
    assertEquals(
            SparkSharder.computePartitionReadExtents(ctx.parallelize(reads, 3), sequenceDictionary1X2, STANDARD_READ_LENGTH),
            ImmutableList.of(
                    new SparkSharder.PartitionLocatable<>(0, new SimpleInterval("1", 1, 10)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("1", 7, 100)),
                    // partition could contain contig X reads
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("X", 1, 75)),
                    new SparkSharder.PartitionLocatable<>(1, new SimpleInterval("2", 1, 15)),
                    new SparkSharder.PartitionLocatable<>(2, new SimpleInterval("2", 12, 50))));
}
Also used : SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
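
The test refers to a sequenceDictionary field, a STANDARD_READ_LENGTH constant, and a TestRead helper class that are defined elsewhere in SparkSharderUnitTest and not shown here. A plausible reconstruction of the first two, inferred from the asserted extents (contig "1" spans 1-100, contig "2" spans 1-50, and every read above is 3 bases long):

// Hypothetical reconstruction of fixtures referenced above; the real
// definitions live elsewhere in SparkSharderUnitTest.
private static final int STANDARD_READ_LENGTH = 3;

private static final SAMSequenceDictionary sequenceDictionary = new SAMSequenceDictionary(
        ImmutableList.of(new SAMSequenceRecord("1", 100), new SAMSequenceRecord("2", 50)));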

Example 62 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

In class GATKVariantContextUtilsUnitTest, method makeSimpleSequenceDictionary.

private SAMSequenceDictionary makeSimpleSequenceDictionary() {
    final SAMSequenceDictionary seqDictionary = new SAMSequenceDictionary();
    seqDictionary.addSequence(new SAMSequenceRecord("chr1", 10));
    return seqDictionary;
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)
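
A brief usage sketch (not part of the test) showing how records in such a dictionary are looked up; all calls are standard htsjdk accessors:

final SAMSequenceDictionary dict = makeSimpleSequenceDictionary();
final SAMSequenceRecord chr1 = dict.getSequence("chr1");  // lookup by contig name
System.out.println(chr1.getSequenceLength());             // 10, the length passed above
System.out.println(dict.getSequenceIndex("chr1"));        // 0, the insertion index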

Example 63 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

In class VcfUtilsUnitTest, method createSequenceDictonary.

private SAMSequenceDictionary createSequenceDictonary() {
    List<SAMSequenceRecord> seqRecList = new ArrayList<>(2);
    seqRecList.add(new SAMSequenceRecord("contig1", 100));
    seqRecList.add(new SAMSequenceRecord("contig2", 200));
    return new SAMSequenceDictionary(seqRecList);
}
Also used : SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary)
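
As a usage sketch (assumed, not from the test), iterating the resulting dictionary shows that insertion order fixes each contig's index, which is what coordinate-sorted comparisons rely on:

final SAMSequenceDictionary dict = createSequenceDictonary();
for (final SAMSequenceRecord rec : dict.getSequences()) {
    // prints: 0 contig1 100, then 1 contig2 200
    System.out.println(rec.getSequenceIndex() + " " + rec.getSequenceName() + " " + rec.getSequenceLength());
}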

Example 64 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk-protected by broadinstitute.

In class AnnotateTargetsIntegrationTest, method createRandomIntervals.

private List<SimpleInterval> createRandomIntervals(final SAMSequenceDictionary referenceDictionary,
                                                   final int numberOfIntervals,
                                                   final int minIntervalSize,
                                                   final int maxIntervalSize,
                                                   final int meanIntervalSize,
                                                   final double intervalSizeStdev) {
    final List<SimpleInterval> result = new ArrayList<>(numberOfIntervals);
    final int numberOfSequences = referenceDictionary.getSequences().size();
    for (int i = 0; i < numberOfIntervals; i++) {
        final SAMSequenceRecord contig = referenceDictionary.getSequence(RANDOM.nextInt(numberOfSequences));
        final String contigName = contig.getSequenceName();
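        // NOTE: despite the intervalSizeStdev name, the size below is drawn uniformly
        // from [meanIntervalSize, meanIntervalSize + intervalSizeStdev) and then
        // clamped to [minIntervalSize, maxIntervalSize].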
        final int intervalSize = Math.min(maxIntervalSize, (int) Math.max(minIntervalSize, Math.round(RANDOM.nextDouble() * intervalSizeStdev + meanIntervalSize)));
        final int intervalStart = 1 + RANDOM.nextInt(contig.getSequenceLength() - intervalSize);
        final int intervalEnd = intervalStart + intervalSize - 1;
        final SimpleInterval interval = new SimpleInterval(contigName, intervalStart, intervalEnd);
        result.add(interval);
    }
    final Comparator<SimpleInterval> comparator = Comparator
            .comparing(SimpleInterval::getContig,
                    (a, b) -> Integer.compare(referenceDictionary.getSequenceIndex(a),
                                              referenceDictionary.getSequenceIndex(b)))
            .thenComparingInt(SimpleInterval::getStart)
            .thenComparingInt(SimpleInterval::getEnd);
    result.sort(comparator);
    return result;
}
Also used : SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord)
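
A hypothetical invocation (parameter values assumed for illustration): generate 100 intervals with sizes clamped to [10, 1000] over the given reference dictionary; the comparator above returns them sorted in dictionary order, then by start and end.

// Hypothetical call, values assumed: 100 intervals, size in [10, 1000],
// drawn around a mean of 100 with "stdev" 50, sorted in dictionary order.
final List<SimpleInterval> intervals =
        createRandomIntervals(referenceDictionary, 100, 10, 1000, 100, 50.0);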

Example 65 with SAMSequenceRecord

Use of htsjdk.samtools.SAMSequenceRecord in project gatk by broadinstitute.

In class LiftOverVcf, method doWork.

@Override
protected Object doWork() {
    IOUtil.assertFileIsReadable(INPUT);
    IOUtil.assertFileIsReadable(REFERENCE_SEQUENCE);
    IOUtil.assertFileIsReadable(CHAIN);
    IOUtil.assertFileIsWritable(OUTPUT);
    IOUtil.assertFileIsWritable(REJECT);
    ////////////////////////////////////////////////////////////////////////
    // Setup the inputs
    ////////////////////////////////////////////////////////////////////////
    final LiftOver liftOver = new LiftOver(CHAIN);
    final VCFFileReader in = new VCFFileReader(INPUT, false);
    logger.info("Loading up the target reference genome.");
    final ReferenceSequenceFileWalker walker = new ReferenceSequenceFileWalker(REFERENCE_SEQUENCE);
    final Map<String, byte[]> refSeqs = new HashMap<>();
    for (final SAMSequenceRecord rec : walker.getSequenceDictionary().getSequences()) {
        refSeqs.put(rec.getSequenceName(), walker.get(rec.getSequenceIndex()).getBases());
    }
    ////////////////////////////////////////////////////////////////////////
    // Setup the outputs
    ////////////////////////////////////////////////////////////////////////
    final VCFHeader inHeader = in.getFileHeader();
    final VCFHeader outHeader = new VCFHeader(inHeader);
    outHeader.setSequenceDictionary(walker.getSequenceDictionary());
    final VariantContextWriter out = new VariantContextWriterBuilder()
            .setOption(Options.INDEX_ON_THE_FLY)
            .setOutputFile(OUTPUT)
            .setReferenceDictionary(walker.getSequenceDictionary())
            .build();
    out.writeHeader(outHeader);
    // Close the walker only now, after its sequence dictionary has been used for the output header
    CloserUtil.close(walker);
    final VariantContextWriter rejects = new VariantContextWriterBuilder()
            .setOutputFile(REJECT)
            .unsetOption(Options.INDEX_ON_THE_FLY)
            .build();
    final VCFHeader rejectHeader = new VCFHeader(in.getFileHeader());
    for (final VCFFilterHeaderLine line : FILTERS) rejectHeader.addMetaDataLine(line);
    rejects.writeHeader(rejectHeader);
    ////////////////////////////////////////////////////////////////////////
    // Read the input VCF, lift the records over and write to the sorting
    // collection.
    ////////////////////////////////////////////////////////////////////////
    long failedLiftover = 0, failedAlleleCheck = 0, total = 0;
    logger.info("Lifting variants over and sorting.");
    final SortingCollection<VariantContext> sorter = SortingCollection.newInstance(
            VariantContext.class, new VCFRecordCodec(outHeader), outHeader.getVCFRecordComparator(),
            MAX_RECORDS_IN_RAM, TMP_DIR);
    ProgressLogger progress = new ProgressLogger(logger, 1000000, "read");
    for (final VariantContext ctx : in) {
        ++total;
        final Interval source = new Interval(ctx.getContig(), ctx.getStart(), ctx.getEnd(), false, ctx.getContig() + ":" + ctx.getStart() + "-" + ctx.getEnd());
        final Interval target = liftOver.liftOver(source, 1.0);
        if (target == null) {
            rejects.add(new VariantContextBuilder(ctx).filter(FILTER_CANNOT_LIFTOVER).make());
            failedLiftover++;
        } else {
            // Fix the alleles if we went from positive to negative strand
            final List<Allele> alleles = new ArrayList<>();
            for (final Allele oldAllele : ctx.getAlleles()) {
                if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
                    alleles.add(oldAllele);
                } else {
                    alleles.add(Allele.create(SequenceUtil.reverseComplement(oldAllele.getBaseString()), oldAllele.isReference()));
                }
            }
            // Build the new variant context
            final VariantContextBuilder builder = new VariantContextBuilder(ctx.getSource(), target.getContig(), target.getStart(), target.getEnd(), alleles);
            builder.id(ctx.getID());
            builder.attributes(ctx.getAttributes());
            builder.genotypes(ctx.getGenotypes());
            builder.filters(ctx.getFilters());
            builder.log10PError(ctx.getLog10PError());
            // Check that the reference allele still agrees with the reference sequence
            boolean mismatchesReference = false;
            for (final Allele allele : builder.getAlleles()) {
                if (allele.isReference()) {
                    final byte[] ref = refSeqs.get(target.getContig());
                    final String refString = StringUtil.bytesToString(ref, target.getStart() - 1, target.length());
                    if (!refString.equalsIgnoreCase(allele.getBaseString())) {
                        mismatchesReference = true;
                    }
                    break;
                }
            }
            if (mismatchesReference) {
                rejects.add(new VariantContextBuilder(ctx).filter(FILTER_MISMATCHING_REF_ALLELE).make());
                failedAlleleCheck++;
            } else {
                sorter.add(builder.make());
            }
        }
        progress.record(ctx.getContig(), ctx.getStart());
    }
    final NumberFormat pfmt = new DecimalFormat("0.0000%");
    final String pct = pfmt.format((failedLiftover + failedAlleleCheck) / (double) total);
    logger.info("Processed ", total, " variants.");
    logger.info(Long.toString(failedLiftover), " variants failed to liftover.");
    logger.info(Long.toString(failedAlleleCheck), " variants lifted over but had mismatching reference alleles after lift over.");
    logger.info(pct, " of variants were not successfully lifted over and written to the output.");
    rejects.close();
    in.close();
    ////////////////////////////////////////////////////////////////////////
    // Write the sorted outputs to the final output file
    ////////////////////////////////////////////////////////////////////////
    sorter.doneAdding();
    progress = new ProgressLogger(logger, 1000000, "written");
    logger.info("Writing out sorted records to final VCF.");
    for (final VariantContext ctx : sorter) {
        out.add(ctx);
        progress.record(ctx.getContig(), ctx.getStart());
    }
    out.close();
    sorter.cleanup();
    return null;
}
Also used : LiftOver(htsjdk.samtools.liftover.LiftOver) HashMap(java.util.HashMap) DecimalFormat(java.text.DecimalFormat) VCFFileReader(htsjdk.variant.vcf.VCFFileReader) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ProgressLogger(org.broadinstitute.hellbender.utils.runtime.ProgressLogger) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) VCFHeader(htsjdk.variant.vcf.VCFHeader) Allele(htsjdk.variant.variantcontext.Allele) VariantContextWriterBuilder(htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) NumberFormat(java.text.NumberFormat)
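
The strand fix-up inside the main loop is the step most specific to liftover. Extracted as a standalone sketch for clarity (a hedged refactoring; the tool itself inlines this logic rather than defining such a helper):

// Sketch of the allele strand fix-up above. When liftover lands a variant on
// the negative strand, each non-symbolic allele is reverse-complemented so it
// is expressed relative to the forward strand of the target reference.
static Allele liftAllele(final Allele oldAllele, final Interval target) {
    if (target.isPositiveStrand() || oldAllele.isSymbolic()) {
        return oldAllele;  // same strand, or no literal bases to flip
    }
    return Allele.create(
            SequenceUtil.reverseComplement(oldAllele.getBaseString()),
            oldAllele.isReference());
}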

Aggregations

SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord): 72 uses
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary): 35
Test (org.testng.annotations.Test): 26
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 24
SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 13
File (java.io.File): 10
SAMFileHeader (htsjdk.samtools.SAMFileHeader): 9
UserException (org.broadinstitute.hellbender.exceptions.UserException): 8
DataProvider (org.testng.annotations.DataProvider): 8
IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile): 7
GATKException (org.broadinstitute.hellbender.exceptions.GATKException): 7
IOException (java.io.IOException): 6
ArrayList (java.util.ArrayList): 6
QueryInterval (htsjdk.samtools.QueryInterval): 5
Allele (htsjdk.variant.variantcontext.Allele): 4
VariantContext (htsjdk.variant.variantcontext.VariantContext): 4
VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder): 4
java.util (java.util): 4
Collectors (java.util.stream.Collectors): 4
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 4