
Example 36 with SimpleInterval

Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk-protected by broadinstitute.

From class AssemblyResultSetUnitTest, method trimmingData.

@DataProvider(name = "trimmingData")
public Iterator<Object[]> trimmingData() {
    final AssemblyRegion activeRegion = new AssemblyRegion(new SimpleInterval("1", 1000, 1100), 25, header);
    final int length = activeRegion.getExtendedSpan().size();
    // keep it reproducible by fixing the seed to lucky 13.
    final RandomDNA rnd = new RandomDNA(13);
    final AssemblyRegionTestDataSet actd = new AssemblyRegionTestDataSet(10, new String(rnd.nextBases(length)), new String[] { "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]);
    final List<Haplotype> haplotypes = actd.haplotypeList();
    for (final Haplotype h : haplotypes) h.setGenomeLocation(activeRegion.getExtendedSpan());
    final ReadThreadingGraph rtg = new ReadThreadingGraph(10);
    for (final Haplotype h : haplotypes) rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), h.isReference());
    final SeqGraph seqGraph = rtg.toSequenceGraph();
    final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph, rtg);
    final Map<Haplotype, AssemblyResult> result = new HashMap<>();
    for (final Haplotype h : haplotypes) result.put(h, ar);
    return Collections.singleton(new Object[] { result, activeRegion }).iterator();
}
Also used : ReadThreadingGraph(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.ReadThreadingGraph) TestingReadThreadingGraph(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.readthreading.TestingReadThreadingGraph) SeqGraph(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.graphs.SeqGraph) RandomDNA(org.broadinstitute.hellbender.utils.RandomDNA) AssemblyRegion(org.broadinstitute.hellbender.engine.AssemblyRegion) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Haplotype(org.broadinstitute.hellbender.utils.haplotype.Haplotype) DataProvider(org.testng.annotations.DataProvider)
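
A data provider like this is only useful together with a test method that names it. The following is a minimal sketch, not taken from the GATK test suite, of how TestNG would wire the "trimmingData" rows into a test. It assumes the usual java.util.Map, org.testng.Assert, and org.testng.annotations.Test imports, and that Haplotype exposes getGenomeLocation() to match the setGenomeLocation call above; the test name and assertions are purely illustrative.

// Hypothetical consumer of the "trimmingData" provider; lives in the same test class
// and relies on the same imports listed above.
@Test(dataProvider = "trimmingData")
public void testTrimmingDataProvider(final Map<Haplotype, AssemblyResult> haplotypesAndResults, final AssemblyRegion originalRegion) {
    Assert.assertFalse(haplotypesAndResults.isEmpty());
    for (final Haplotype h : haplotypesAndResults.keySet()) {
        // Every haplotype was pinned to the region's extended span by the provider.
        Assert.assertEquals(h.getGenomeLocation(), originalRegion.getExtendedSpan());
    }
    // A real test would build an AssemblyResultSet from these entries and exercise its
    // trimming behavior; that call is omitted because its exact API is not shown here.
}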

Example 37 with SimpleInterval

Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk-protected by broadinstitute.

From class ReferenceConfidenceVariantContextMergerUnitTest, method makeReferenceConfidenceMergeData.

@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {
    final List<Object[]> tests = new ArrayList<>();
    final int start = 10;
    final SimpleInterval loc = new SimpleInterval("20", start, start);
    final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
    final VariantContext VCbase2 = new VariantContextBuilder("test2", "20", start, start, Arrays.asList(Aref)).make();
    final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start - 1, start - 1, Arrays.asList(Aref)).make();
    final int[] standardPLs = new int[] { 30, 20, 10, 71, 72, 73 };
    final int[] reorderedSecondAllelePLs = new int[] { 30, 71, 73, 20, 72, 10 };
    final List<Allele> noCalls = new ArrayList<>(2);
    noCalls.add(Allele.NO_CALL);
    noCalls.add(Allele.NO_CALL);
    final List<Allele> A_ALT = Arrays.asList(Aref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[] { 0, 100, 1000 }).alleles(noCalls).make();
    final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();
    final Allele AAref = Allele.create("AA", true);
    final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[] { 0, 80, 800 }).alleles(noCalls).make();
    final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();
    final List<Allele> A_C = Arrays.asList(Aref, C);
    final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[] { 30, 20, 10 }).alleles(noCalls).make();
    final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make();
    final VariantContext vcA_C = new VariantContextBuilder(VCbase2).alleles(A_C_ALT).genotypes(gA_C).make();
    final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();
    final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make();
    final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();
    final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
    final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[] { 40, 20, 30, 20, 10, 30 }).alleles(noCalls).make();
    final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[] { 40, 20, 30, 20, 10, 30, 71, 72, 73, 74 }).alleles(noCalls).make();
    final VariantContext vcA_C_G = new VariantContextBuilder(VCbase2).alleles(A_C_G_ALT).genotypes(gA_C_G).make();
    final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();
    final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make();
    final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();
    final Allele A = Allele.create("A", false);
    final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
    final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
    final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
    final List<Allele> A_C_del = Arrays.asList(Aref, C, del);
    // first test the case of a single record
    tests.add(new Object[] { "test00", Arrays.asList(vcA_C_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make() });
    // now, test pairs:
    // a SNP with another SNP
    tests.add(new Object[] { "test01", Arrays.asList(vcA_C_ALT, vcA_G_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make() });
    // a SNP with an indel
    tests.add(new Object[] { "test02", Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make() });
    // a SNP with 2 SNPs
    tests.add(new Object[] { "test03", Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make() });
    // a SNP with a ref record
    tests.add(new Object[] { "test04", Arrays.asList(vcA_C_ALT, vcA_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make() });
    // spanning records:
    // a SNP with a spanning ref record
    tests.add(new Object[] { "test05", Arrays.asList(vcA_C_ALT, vcAA_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make() });
    // a SNP with a spanning deletion
    tests.add(new Object[] { "test06", Arrays.asList(vcA_C_ALT, vcAA_A_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_del).genotypes(new GenotypeBuilder("A_C").PL(new int[] { 30, 20, 10, 71, 72, 73 }).alleles(noCalls).make(), new GenotypeBuilder("AA_A").PL(new int[] { 30, 71, 73, 20, 72, 10 }).alleles(noCalls).make()).make() });
    // combination of all
    tests.add(new Object[] { "test07", Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC, del)).genotypes(new GenotypeBuilder("A_C").PL(new int[] { 30, 20, 10, 71, 72, 73, 71, 72, 73, 73, 71, 72, 73, 73, 73 }).alleles(noCalls).make(), new GenotypeBuilder("A_G").PL(new int[] { 30, 71, 73, 20, 72, 10, 71, 73, 72, 73, 71, 73, 72, 73, 73 }).alleles(noCalls).make(), new GenotypeBuilder("A_ATC").PL(new int[] { 30, 71, 73, 71, 73, 73, 20, 72, 72, 10, 71, 73, 73, 72, 73 }).alleles(noCalls).make(), new GenotypeBuilder("A_C_G").PL(new int[] { 40, 20, 30, 20, 10, 30, 71, 72, 73, 74, 71, 72, 73, 74, 74 }).alleles(noCalls).make(), new GenotypeBuilder("A").PL(new int[] { 0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000, 100, 1000, 1000, 1000, 1000 }).alleles(noCalls).make(), new GenotypeBuilder("AA").PL(new int[] { 0, 80, 800, 80, 800, 800, 80, 800, 800, 800, 80, 800, 800, 800, 800 }).alleles(noCalls).make(), new GenotypeBuilder("AA_A").PL(new int[] { 30, 71, 73, 71, 73, 73, 71, 73, 73, 73, 20, 72, 72, 72, 10 }).alleles(noCalls).make()).make() });
    // just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
    tests.add(new Object[] { "test08", Arrays.asList(vcAA_ALT), loc, false, false, null });
    tests.add(new Object[] { "test09", Arrays.asList(vcAA_ALT), loc, true, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[] { 0 }).alleles(noCalls).make()).make() });
    // test uniquification of sample names
    tests.add(new Object[] { "test10", Arrays.asList(vcA_C, vcA_C_ALT), loc, false, true, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(new GenotypeBuilder("A_C.test2").PL(new int[] { 30, 20, 10 }).alleles(noCalls).make(), new GenotypeBuilder("A_C.test").PL(new int[] { 30, 20, 10 }).alleles(noCalls).make()).make() });
    tests.add(new Object[] { "test11", Arrays.asList(vcA_C_G, vcA_C_G_ALT), loc, false, true, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(new GenotypeBuilder("A_C_G.test2").PL(new int[] { 40, 20, 30, 20, 10, 30 }).alleles(noCalls).make(), new GenotypeBuilder("A_C_G.test").PL(new int[] { 40, 20, 30, 20, 10, 30 }).alleles(noCalls).make()).make() });
    return tests.toArray(new Object[][] {});
}
Also used : ArrayList(java.util.ArrayList) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) DataProvider(org.testng.annotations.DataProvider)
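
Each row produced above is {test name, input VariantContexts, target locus, a flag that appears to control whether ref-only (monomorphic) sites are returned, a flag for sample-name uniquification, expected merged VariantContext or null}. The skeleton below is a hypothetical consumer, not the actual ReferenceConfidenceVariantContextMergerUnitTest; the parameter names are inferred from the test cases, and the call to the merger itself is deliberately elided because its exact signature is not shown in this snippet.

// Hypothetical skeleton showing how the provider rows would reach a TestNG test.
@Test(dataProvider = "referenceConfidenceMergeData")
public void testReferenceConfidenceMerge(final String testName, final List<VariantContext> toMerge, final SimpleInterval loc, final boolean keepRefOnlySites, final boolean uniquifySamples, final VariantContext expected) {
    // A real test would call ReferenceConfidenceVariantContextMerger.merge(...) here and
    // compare the result against 'expected'; the exact call is omitted on purpose.
    if (expected != null) {
        Assert.assertEquals(expected.getContig(), loc.getContig());
        Assert.assertEquals(expected.getStart(), loc.getStart());
    }
}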

Example 38 with SimpleInterval

Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk-protected by broadinstitute.

From class ReferenceBasesUnitTest, method test.

@Test
public void test() {
    final File refFasta = new File(b37_reference_20_21);
    final ReferenceDataSource refDataSource = new ReferenceFileSource(refFasta);
    final ReferenceContext ref = new ReferenceContext(refDataSource, new SimpleInterval("20", 10_000_000, 10_000_200));
    final VariantContext vc = new VariantContextBuilder("source", "20", 10_000_100, 10_000_100, Collections.singleton(Allele.create((byte) 'A', true))).make();
    final String refBases = (String) new ReferenceBases().annotate(ref, vc, null).get(ReferenceBases.REFERENCE_BASES_KEY);
    Assert.assertEquals(refBases, "ACTGCATCCCTTGCATTTCC");
}
Also used : VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ReferenceDataSource(org.broadinstitute.hellbender.engine.ReferenceDataSource) ReferenceContext(org.broadinstitute.hellbender.engine.ReferenceContext) VariantContext(htsjdk.variant.variantcontext.VariantContext) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) File(java.io.File) ReferenceFileSource(org.broadinstitute.hellbender.engine.ReferenceFileSource) BaseTest(org.broadinstitute.hellbender.utils.test.BaseTest) Test(org.testng.annotations.Test)
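
Since every example on this page centers on SimpleInterval, a short illustrative sketch of its contract may help: it is a 1-based, closed interval keyed by contig, start, and end. The assertions below use standard Locatable-style accessors (getContig, getStart, getEnd, size, overlaps) and are meant as a sketch, not as GATK test code.

// Illustrative sketch of SimpleInterval semantics: 1-based, closed coordinates.
final SimpleInterval window = new SimpleInterval("20", 10_000_000, 10_000_200);
Assert.assertEquals(window.getContig(), "20");
Assert.assertEquals(window.getStart(), 10_000_000);
Assert.assertEquals(window.getEnd(), 10_000_200);
// Both endpoints are included, so the window spans 201 bases.
Assert.assertEquals(window.size(), 201);
// The variant position annotated in the test above lies inside this window.
Assert.assertTrue(window.overlaps(new SimpleInterval("20", 10_000_100, 10_000_100)));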

Example 39 with SimpleInterval

Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.

From class AllelicCountCollector, method collect.

/**
     * Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
     * in a sorted BAM file.  Reads and bases below the specified mapping quality and base quality, respectively,
     * are filtered out of the pileup.  The alt count is defined as the total count minus the ref count, and the
     * alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
     * bases in {@link AllelicCountCollector#BASES}.
     * @param bamFile           sorted BAM file
     * @param siteIntervals     interval list of sites
     * @param minMappingQuality minimum mapping quality required for reads to be included in pileup
     * @param minBaseQuality    minimum base quality required for bases to be included in pileup
     * @return                  AllelicCountCollection of ref/alt counts at sites in BAM file
     */
public AllelicCountCollection collect(final File bamFile, final IntervalList siteIntervals, final int minMappingQuality, final int minBaseQuality) {
    try (final SamReader reader = readerFactory.open(bamFile)) {
        ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
        ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
        if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
            throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
        }
        final int numberOfSites = siteIntervals.size();
        final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
        final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);
        //set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
        final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
        locusIterator.setSamFilters(samFilters);
        locusIterator.setEmitUncoveredLoci(true);
        locusIterator.setIncludeNonPfReads(false);
        locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
        locusIterator.setQualityScoreCutoff(minBaseQuality);
        logger.info("Examining " + numberOfSites + " sites in total...");
        int locusCount = 0;
        final AllelicCountCollection counts = new AllelicCountCollection();
        for (final SamLocusIterator.LocusInfo locus : locusIterator) {
            if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
                logger.info("Examined " + locusCount + " sites.");
            }
            locusCount++;
            final Nucleotide refBase = Nucleotide.valueOf(referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
            if (!BASES.contains(refBase)) {
                logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...", locus.getPosition(), refBase.toString()));
                continue;
            }
            final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
            //only include total ACGT counts in binomial test (exclude N, etc.)
            final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
            final int refReadCount = (int) baseCounts.get(refBase);
            //we take alt = total - ref instead of the actual alt count
            final int altReadCount = totalBaseCount - refReadCount;
            final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);
            counts.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount, refBase, altBase));
        }
        logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
        return counts;
    } catch (final IOException | SAMFormatException e) {
        throw new UserException("Unable to collect allelic counts from " + bamFile);
    }
}
Also used : Arrays(java.util.Arrays) IOUtils(org.broadinstitute.hellbender.utils.io.IOUtils) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) IOException(java.io.IOException) Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) ParamUtils(org.broadinstitute.hellbender.utils.param.ParamUtils) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) NotPrimaryAlignmentFilter(htsjdk.samtools.filter.NotPrimaryAlignmentFilter) List(java.util.List) Logger(org.apache.logging.log4j.Logger) UserException(org.broadinstitute.hellbender.exceptions.UserException) DuplicateReadFilter(htsjdk.samtools.filter.DuplicateReadFilter) ReferenceSequenceFileWalker(htsjdk.samtools.reference.ReferenceSequenceFileWalker) Utils(org.broadinstitute.hellbender.utils.Utils) htsjdk.samtools(htsjdk.samtools) LogManager(org.apache.logging.log4j.LogManager) Collections(java.util.Collections)
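
The javadoc above states the rule for the alternate allele: the alt count is the total ACGT count minus the ref count, and the alt nucleotide is the non-ref base with the highest count, with ties broken by the order of BASES. The helper below is a hypothetical re-implementation of that rule for illustration; it is not the actual inferAltFromPileupBaseCounts, and its name, parameters, and the choice to return N when no non-ref base is observed are assumptions of this sketch.

// Hypothetical sketch of the documented alt-base rule (not the GATK implementation).
private static Nucleotide inferAltBase(final Nucleotide.Counter baseCounts, final Nucleotide refBase, final List<Nucleotide> bases) {
    // Fallback when no non-ref base is observed; a choice made for this sketch.
    Nucleotide altBase = Nucleotide.N;
    long bestCount = 0;
    for (final Nucleotide base : bases) {
        if (base == refBase) {
            continue;
        }
        final long count = baseCounts.get(base);
        // Strict '>' keeps the earlier base on ties, matching the documented tie-break.
        if (count > bestCount) {
            bestCount = count;
            altBase = base;
        }
    }
    return altBase;
}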

Example 40 with SimpleInterval

Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.

From class AllelicCountReader, method createRecord.

@Override
protected AllelicCount createRecord(final DataLine dataLine) {
    try {
        final String contig = dataLine.get(AllelicCountTableColumn.CONTIG);
        final int position = dataLine.getInt(AllelicCountTableColumn.POSITION);
        final int refReadCount = dataLine.getInt(AllelicCountTableColumn.REF_COUNT);
        final int altReadCount = dataLine.getInt(AllelicCountTableColumn.ALT_COUNT);
        final Nucleotide refNucleotide = Nucleotide.valueOf(dataLine.get(AllelicCountTableColumn.REF_NUCLEOTIDE.name()).getBytes()[0]);
        final Nucleotide altNucleotide = Nucleotide.valueOf(dataLine.get(AllelicCountTableColumn.ALT_NUCLEOTIDE.name()).getBytes()[0]);
        final SimpleInterval interval = new SimpleInterval(contig, position, position);
        return new AllelicCount(interval, refReadCount, altReadCount, refNucleotide, altNucleotide);
    } catch (final IllegalArgumentException e) {
        throw new UserException.BadInput("AllelicCountCollection file must have all columns specified.");
    }
}
Also used : Nucleotide(org.broadinstitute.hellbender.utils.Nucleotide) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException)
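
For illustration, the same parsing can be written without the DataLine/TableReader machinery. The sketch below assumes a tab-separated record whose columns appear in the order CONTIG, POSITION, REF_COUNT, ALT_COUNT, REF_NUCLEOTIDE, ALT_NUCLEOTIDE; that ordering and the method name are assumptions of this sketch, not part of the GATK table contract.

// Hypothetical framework-free equivalent of createRecord for a single tab-separated line.
private static AllelicCount plainParse(final String line) {
    final String[] fields = line.split("\t");
    final String contig = fields[0];
    final int position = Integer.parseInt(fields[1]);
    final int refReadCount = Integer.parseInt(fields[2]);
    final int altReadCount = Integer.parseInt(fields[3]);
    final Nucleotide refNucleotide = Nucleotide.valueOf(fields[4].getBytes()[0]);
    final Nucleotide altNucleotide = Nucleotide.valueOf(fields[5].getBytes()[0]);
    // Allelic-count sites are single-base positions, hence start == end.
    return new AllelicCount(new SimpleInterval(contig, position, position), refReadCount, altReadCount, refNucleotide, altNucleotide);
}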

Aggregations

SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval) 545
Test (org.testng.annotations.Test) 287
BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest) 202
File (java.io.File) 102
ArrayList (java.util.ArrayList) 66
DataProvider (org.testng.annotations.DataProvider) 64
GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead) 60
Collectors (java.util.stream.Collectors) 53
java.util (java.util) 41
SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary) 40
AllelicCount (org.broadinstitute.hellbender.tools.exome.alleliccount.AllelicCount) 40
UserException (org.broadinstitute.hellbender.exceptions.UserException) 39
VariantContext (htsjdk.variant.variantcontext.VariantContext) 36
IntStream (java.util.stream.IntStream) 34
Target (org.broadinstitute.hellbender.tools.exome.Target) 34
IOException (java.io.IOException) 32
JavaSparkContext (org.apache.spark.api.java.JavaSparkContext) 28
Assert (org.testng.Assert) 27
Locatable (htsjdk.samtools.util.Locatable) 26
List (java.util.List) 26