Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk-protected by broadinstitute.
The class AssemblyResultSetUnitTest, method trimmingData.
@DataProvider(name = "trimmingData")
public Iterator<Object[]> trimmingData() {
final AssemblyRegion activeRegion = new AssemblyRegion(new SimpleInterval("1", 1000, 1100), 25, header);
final int length = activeRegion.getExtendedSpan().size();
// keep it reproducible by fixing the seed to lucky 13.
final RandomDNA rnd = new RandomDNA(13);
final AssemblyRegionTestDataSet actd = new AssemblyRegionTestDataSet(10, new String(rnd.nextBases(length)), new String[] { "Civar:*1T*" }, new String[0], new byte[0], new byte[0], new byte[0]);
final List<Haplotype> haplotypes = actd.haplotypeList();
for (final Haplotype h : haplotypes) h.setGenomeLocation(activeRegion.getExtendedSpan());
final ReadThreadingGraph rtg = new ReadThreadingGraph(10);
for (final Haplotype h : haplotypes) rtg.addSequence("seq-" + Math.abs(h.hashCode()), h.getBases(), h.isReference());
final SeqGraph seqGraph = rtg.toSequenceGraph();
final AssemblyResult ar = new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, seqGraph, rtg);
final Map<Haplotype, AssemblyResult> result = new HashMap<>();
for (final Haplotype h : haplotypes) result.put(h, ar);
return Collections.singleton(new Object[] { result, activeRegion }).iterator();
}
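For orientation, here is a minimal sketch of a TestNG test that could consume the provider above. The AssemblyResultSet calls used (the no-arg constructor, add, setRegionForGenotyping, getHaplotypeCount) are assumptions inferred from the provider's shape; they are not shown in this excerpt.

@Test(dataProvider = "trimmingData")
public void testTrimmingDataConsumer(final Map<Haplotype, AssemblyResult> haplotypesAndResults, final AssemblyRegion originalRegion) {
    // Build a result set from the provider output; add(...) and setRegionForGenotyping(...) are assumed APIs.
    final AssemblyResultSet subject = new AssemblyResultSet();
    for (final Map.Entry<Haplotype, AssemblyResult> entry : haplotypesAndResults.entrySet()) {
        subject.add(entry.getKey(), entry.getValue());
    }
    subject.setRegionForGenotyping(originalRegion);
    // Sanity check: every haplotype produced by the provider is registered in the result set.
    Assert.assertEquals(subject.getHaplotypeCount(), haplotypesAndResults.size());
}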
Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk-protected by broadinstitute.
The class ReferenceConfidenceVariantContextMergerUnitTest, method makeReferenceConfidenceMergeData.
@DataProvider(name = "referenceConfidenceMergeData")
public Object[][] makeReferenceConfidenceMergeData() {
final List<Object[]> tests = new ArrayList<>();
final int start = 10;
final SimpleInterval loc = new SimpleInterval("20", start, start);
final VariantContext VCbase = new VariantContextBuilder("test", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCbase2 = new VariantContextBuilder("test2", "20", start, start, Arrays.asList(Aref)).make();
final VariantContext VCprevBase = new VariantContextBuilder("test", "20", start - 1, start - 1, Arrays.asList(Aref)).make();
final int[] standardPLs = new int[] { 30, 20, 10, 71, 72, 73 };
final int[] reorderedSecondAllelePLs = new int[] { 30, 71, 73, 20, 72, 10 };
final List<Allele> noCalls = new ArrayList<>(2);
noCalls.add(Allele.NO_CALL);
noCalls.add(Allele.NO_CALL);
final List<Allele> A_ALT = Arrays.asList(Aref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ALT = new GenotypeBuilder("A").PL(new int[] { 0, 100, 1000 }).alleles(noCalls).make();
final VariantContext vcA_ALT = new VariantContextBuilder(VCbase).alleles(A_ALT).genotypes(gA_ALT).make();
final Allele AAref = Allele.create("AA", true);
final List<Allele> AA_ALT = Arrays.asList(AAref, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_ALT = new GenotypeBuilder("AA").PL(new int[] { 0, 80, 800 }).alleles(noCalls).make();
final VariantContext vcAA_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_ALT).genotypes(gAA_ALT).make();
final List<Allele> A_C = Arrays.asList(Aref, C);
final Genotype gA_C = new GenotypeBuilder("A_C").PL(new int[] { 30, 20, 10 }).alleles(noCalls).make();
final List<Allele> A_C_ALT = Arrays.asList(Aref, C, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_ALT = new GenotypeBuilder("A_C").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_C = new VariantContextBuilder(VCbase2).alleles(A_C_ALT).genotypes(gA_C).make();
final VariantContext vcA_C_ALT = new VariantContextBuilder(VCbase).alleles(A_C_ALT).genotypes(gA_C_ALT).make();
final List<Allele> A_G_ALT = Arrays.asList(Aref, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_G_ALT = new GenotypeBuilder("A_G").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_G_ALT = new VariantContextBuilder(VCbase).alleles(A_G_ALT).genotypes(gA_G_ALT).make();
final List<Allele> A_C_G = Arrays.asList(Aref, C, G);
final Genotype gA_C_G = new GenotypeBuilder("A_C_G").PL(new int[] { 40, 20, 30, 20, 10, 30 }).alleles(noCalls).make();
final List<Allele> A_C_G_ALT = Arrays.asList(Aref, C, G, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_C_G_ALT = new GenotypeBuilder("A_C_G").PL(new int[] { 40, 20, 30, 20, 10, 30, 71, 72, 73, 74 }).alleles(noCalls).make();
final VariantContext vcA_C_G = new VariantContextBuilder(VCbase2).alleles(A_C_G_ALT).genotypes(gA_C_G).make();
final VariantContext vcA_C_G_ALT = new VariantContextBuilder(VCbase).alleles(A_C_G_ALT).genotypes(gA_C_G_ALT).make();
final List<Allele> A_ATC_ALT = Arrays.asList(Aref, ATC, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gA_ATC_ALT = new GenotypeBuilder("A_ATC").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcA_ATC_ALT = new VariantContextBuilder(VCbase).alleles(A_ATC_ALT).genotypes(gA_ATC_ALT).make();
final Allele A = Allele.create("A", false);
final List<Allele> AA_A_ALT = Arrays.asList(AAref, A, GATKVCFConstants.NON_REF_SYMBOLIC_ALLELE);
final Genotype gAA_A_ALT = new GenotypeBuilder("AA_A").PL(standardPLs).alleles(noCalls).make();
final VariantContext vcAA_A_ALT = new VariantContextBuilder(VCprevBase).alleles(AA_A_ALT).genotypes(gAA_A_ALT).make();
final List<Allele> A_C_del = Arrays.asList(Aref, C, del);
// first test the case of a single record
tests.add(new Object[] { "test00", Arrays.asList(vcA_C_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C).make() });
// now, test pairs:
// a SNP with another SNP
tests.add(new Object[] { "test01", Arrays.asList(vcA_C_ALT, vcA_G_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, new GenotypeBuilder("A_G").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make() });
// a SNP with an indel
tests.add(new Object[] { "test02", Arrays.asList(vcA_C_ALT, vcA_ATC_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, ATC)).genotypes(gA_C_ALT, new GenotypeBuilder("A_ATC").PL(reorderedSecondAllelePLs).alleles(noCalls).make()).make() });
// a SNP with 2 SNPs
tests.add(new Object[] { "test03", Arrays.asList(vcA_C_ALT, vcA_C_G_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(gA_C_ALT, gA_C_G).make() });
// a SNP with a ref record
tests.add(new Object[] { "test04", Arrays.asList(vcA_C_ALT, vcA_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gA_ALT).make() });
// spanning records:
// a SNP with a spanning ref record
tests.add(new Object[] { "test05", Arrays.asList(vcA_C_ALT, vcAA_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(gA_C, gAA_ALT).make() });
// a SNP with a spanning deletion
tests.add(new Object[] { "test06", Arrays.asList(vcA_C_ALT, vcAA_A_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(A_C_del).genotypes(new GenotypeBuilder("A_C").PL(new int[] { 30, 20, 10, 71, 72, 73 }).alleles(noCalls).make(), new GenotypeBuilder("AA_A").PL(new int[] { 30, 71, 73, 20, 72, 10 }).alleles(noCalls).make()).make() });
// combination of all
tests.add(new Object[] { "test07", Arrays.asList(vcA_C_ALT, vcA_G_ALT, vcA_ATC_ALT, vcA_C_G_ALT, vcA_ALT, vcAA_ALT, vcAA_A_ALT), loc, false, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Aref, C, G, ATC, del)).genotypes(new GenotypeBuilder("A_C").PL(new int[] { 30, 20, 10, 71, 72, 73, 71, 72, 73, 73, 71, 72, 73, 73, 73 }).alleles(noCalls).make(), new GenotypeBuilder("A_G").PL(new int[] { 30, 71, 73, 20, 72, 10, 71, 73, 72, 73, 71, 73, 72, 73, 73 }).alleles(noCalls).make(), new GenotypeBuilder("A_ATC").PL(new int[] { 30, 71, 73, 71, 73, 73, 20, 72, 72, 10, 71, 73, 73, 72, 73 }).alleles(noCalls).make(), new GenotypeBuilder("A_C_G").PL(new int[] { 40, 20, 30, 20, 10, 30, 71, 72, 73, 74, 71, 72, 73, 74, 74 }).alleles(noCalls).make(), new GenotypeBuilder("A").PL(new int[] { 0, 100, 1000, 100, 1000, 1000, 100, 1000, 1000, 1000, 100, 1000, 1000, 1000, 1000 }).alleles(noCalls).make(), new GenotypeBuilder("AA").PL(new int[] { 0, 80, 800, 80, 800, 800, 80, 800, 800, 800, 80, 800, 800, 800, 800 }).alleles(noCalls).make(), new GenotypeBuilder("AA_A").PL(new int[] { 30, 71, 73, 71, 73, 73, 71, 73, 73, 73, 20, 72, 72, 72, 10 }).alleles(noCalls).make()).make() });
// just spanning ref contexts, trying both instances where we want/do not want ref-only contexts
tests.add(new Object[] { "test08", Arrays.asList(vcAA_ALT), loc, false, false, null });
tests.add(new Object[] { "test09", Arrays.asList(vcAA_ALT), loc, true, false, new VariantContextBuilder(VCbase).alleles(Arrays.asList(Allele.create("A", true))).genotypes(new GenotypeBuilder("AA").PL(new int[] { 0 }).alleles(noCalls).make()).make() });
// test uniquification of sample names
tests.add(new Object[] { "test10", Arrays.asList(vcA_C, vcA_C_ALT), loc, false, true, new VariantContextBuilder(VCbase).alleles(A_C).genotypes(new GenotypeBuilder("A_C.test2").PL(new int[] { 30, 20, 10 }).alleles(noCalls).make(), new GenotypeBuilder("A_C.test").PL(new int[] { 30, 20, 10 }).alleles(noCalls).make()).make() });
tests.add(new Object[] { "test11", Arrays.asList(vcA_C_G, vcA_C_G_ALT), loc, false, true, new VariantContextBuilder(VCbase).alleles(A_C_G).genotypes(new GenotypeBuilder("A_C_G.test2").PL(new int[] { 40, 20, 30, 20, 10, 30 }).alleles(noCalls).make(), new GenotypeBuilder("A_C_G.test").PL(new int[] { 40, 20, 30, 20, 10, 30 }).alleles(noCalls).make()).make() });
return tests.toArray(new Object[][] {});
}
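The reorderedSecondAllelePLs array above encodes how a sample's PLs move when its own alleles are embedded in a larger merged allele list, with merged alleles the sample never saw falling back to its <NON_REF> likelihoods. A self-contained sketch of that index arithmetic follows, using the standard VCF diploid genotype ordering; the helper name remapDiploidPLs and the newToOld mapping are illustrative only, not the merger's actual code.

// Remap diploid PLs from an original allele order to a merged allele order.
// VCF stores the diploid genotype (j, k) with j <= k at index k * (k + 1) / 2 + j.
// newToOld[n] is the original allele index for merged allele n; merged alleles that the
// original record lacked should point at its <NON_REF> allele index.
static int[] remapDiploidPLs(final int[] oldPLs, final int[] newToOld, final int newAlleleCount) {
    final int[] newPLs = new int[newAlleleCount * (newAlleleCount + 1) / 2];
    for (int k = 0; k < newAlleleCount; k++) {
        for (int j = 0; j <= k; j++) {
            final int oldA = Math.min(newToOld[j], newToOld[k]);
            final int oldB = Math.max(newToOld[j], newToOld[k]);
            newPLs[k * (k + 1) / 2 + j] = oldPLs[oldB * (oldB + 1) / 2 + oldA];
        }
    }
    return newPLs;
}

For example, sample "A_G" starts with alleles (A, G, <NON_REF>) and PLs {30, 20, 10, 71, 72, 73}. In the merged allele list (A, C, G), A stays at index 0, C falls back to <NON_REF> (old index 2), and G maps to old index 1, so remapDiploidPLs(standardPLs, new int[] {0, 2, 1}, 3) yields {30, 71, 73, 20, 72, 10}, which is exactly reorderedSecondAllelePLs.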
Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk-protected by broadinstitute.
The class ReferenceBasesUnitTest, method test.
@Test
public void test() {
final File refFasta = new File(b37_reference_20_21);
final ReferenceDataSource refDataSource = new ReferenceFileSource(refFasta);
final ReferenceContext ref = new ReferenceContext(refDataSource, new SimpleInterval("20", 10_000_000, 10_000_200));
final VariantContext vc = new VariantContextBuilder("source", "20", 10_000_100, 10_000_100, Collections.singleton(Allele.create((byte) 'A', true))).make();
final String refBases = (String) new ReferenceBases().annotate(ref, vc, null).get(ReferenceBases.REFERENCE_BASES_KEY);
Assert.assertEquals(refBases, "ACTGCATCCCTTGCATTTCC");
}
Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
The class AllelicCountCollector, method collect.
/**
* Returns an {@link AllelicCountCollection} based on the pileup at sites (specified by an interval list)
* in a sorted BAM file. Reads and bases below the specified mapping quality and base quality, respectively,
* are filtered out of the pileup. The alt count is defined as the total count minus the ref count, and the
* alt nucleotide is defined as the non-ref base with the highest count, with ties broken by the order of the
* bases in {@link AllelicCountCollector#BASES}.
* @param bamFile sorted BAM file
* @param siteIntervals interval list of sites
* @param minMappingQuality minimum mapping quality required for reads to be included in pileup
* @param minBaseQuality minimum base quality required for bases to be included in pileup
* @return AllelicCountCollection of ref/alt counts at sites in BAM file
*/
public AllelicCountCollection collect(final File bamFile, final IntervalList siteIntervals, final int minMappingQuality, final int minBaseQuality) {
try (final SamReader reader = readerFactory.open(bamFile)) {
ParamUtils.isPositiveOrZero(minMappingQuality, "Minimum mapping quality must be nonnegative.");
ParamUtils.isPositiveOrZero(minBaseQuality, "Minimum base quality must be nonnegative.");
if (reader.getFileHeader().getSortOrder() != SAMFileHeader.SortOrder.coordinate) {
throw new UserException.BadInput("BAM file " + bamFile.toString() + " must be coordinate sorted.");
}
final int numberOfSites = siteIntervals.size();
final boolean useIndex = numberOfSites < MAX_INTERVALS_FOR_INDEX;
final SamLocusIterator locusIterator = new SamLocusIterator(reader, siteIntervals, useIndex);
//set read and locus filters [note: read counts match IGV, but off by a few from pysam.mpileup]
final List<SamRecordFilter> samFilters = Arrays.asList(new NotPrimaryAlignmentFilter(), new DuplicateReadFilter());
locusIterator.setSamFilters(samFilters);
locusIterator.setEmitUncoveredLoci(true);
locusIterator.setIncludeNonPfReads(false);
locusIterator.setMappingQualityScoreCutoff(minMappingQuality);
locusIterator.setQualityScoreCutoff(minBaseQuality);
logger.info("Examining " + numberOfSites + " sites in total...");
int locusCount = 0;
final AllelicCountCollection counts = new AllelicCountCollection();
for (final SamLocusIterator.LocusInfo locus : locusIterator) {
if (locusCount % NUMBER_OF_SITES_PER_LOGGED_STATUS_UPDATE == 0) {
logger.info("Examined " + locusCount + " sites.");
}
locusCount++;
final Nucleotide refBase = Nucleotide.valueOf(referenceWalker.get(locus.getSequenceIndex()).getBases()[locus.getPosition() - 1]);
if (!BASES.contains(refBase)) {
logger.warn(String.format("The reference position at %d has an unknown base call (value: %s). Skipping...", locus.getPosition(), refBase.toString()));
continue;
}
final Nucleotide.Counter baseCounts = getPileupBaseCounts(locus);
//only include total ACGT counts in binomial test (exclude N, etc.)
final int totalBaseCount = BASES.stream().mapToInt(b -> (int) baseCounts.get(b)).sum();
final int refReadCount = (int) baseCounts.get(refBase);
//we take alt = total - ref instead of the actual alt count
final int altReadCount = totalBaseCount - refReadCount;
final Nucleotide altBase = inferAltFromPileupBaseCounts(baseCounts, refBase);
counts.add(new AllelicCount(new SimpleInterval(locus.getSequenceName(), locus.getPosition(), locus.getPosition()), refReadCount, altReadCount, refBase, altBase));
}
logger.info(locusCount + " sites out of " + numberOfSites + " total sites were examined.");
return counts;
} catch (final IOException | SAMFormatException e) {
throw new UserException("Unable to collect allelic counts from " + bamFile);
}
}
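The Javadoc above defines the alt base as the non-ref base with the highest pileup count, with ties broken by the order of BASES. A minimal standalone sketch of that rule follows; the method name inferAltFromCounts is hypothetical and this is not the actual inferAltFromPileupBaseCounts implementation used in collect.

// Illustrative sketch of the alt-base rule described in the Javadoc: pick the non-ref base with the
// highest count, keeping the earliest base in BASES on ties (strict '>' comparison). Hypothetical helper.
private static Nucleotide inferAltFromCounts(final Nucleotide.Counter baseCounts, final Nucleotide refBase) {
    // N is only a placeholder initial value; any non-ref base with count >= 0 replaces it.
    Nucleotide altBase = Nucleotide.N;
    long bestCount = -1;
    for (final Nucleotide base : BASES) {
        if (base == refBase) {
            // only non-ref bases are alt candidates
            continue;
        }
        final long count = baseCounts.get(base);
        if (count > bestCount) {
            bestCount = count;
            altBase = base;
        }
    }
    return altBase;
}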
Use of org.broadinstitute.hellbender.utils.SimpleInterval in project gatk by broadinstitute.
The class AllelicCountReader, method createRecord.
@Override
protected AllelicCount createRecord(final DataLine dataLine) {
try {
final String contig = dataLine.get(AllelicCountTableColumn.CONTIG);
final int position = dataLine.getInt(AllelicCountTableColumn.POSITION);
final int refReadCount = dataLine.getInt(AllelicCountTableColumn.REF_COUNT);
final int altReadCount = dataLine.getInt(AllelicCountTableColumn.ALT_COUNT);
final Nucleotide refNucleotide = Nucleotide.valueOf(dataLine.get(AllelicCountTableColumn.REF_NUCLEOTIDE.name()).getBytes()[0]);
final Nucleotide altNucleotide = Nucleotide.valueOf(dataLine.get(AllelicCountTableColumn.ALT_NUCLEOTIDE.name()).getBytes()[0]);
final SimpleInterval interval = new SimpleInterval(contig, position, position);
return new AllelicCount(interval, refReadCount, altReadCount, refNucleotide, altNucleotide);
} catch (final IllegalArgumentException e) {
throw new UserException.BadInput("AllelicCountCollection file must have all columns specified.");
}
}
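As a usage illustration, here is a standalone parse of one record into the same fields createRecord reads, without the DataLine/TableReader machinery. The tab-separated layout, the column order, and the sample line are assumptions made for illustration; only the column constants (CONTIG, POSITION, REF_COUNT, ALT_COUNT, REF_NUCLEOTIDE, ALT_NUCLEOTIDE) come from the excerpt above.

// Hypothetical standalone equivalent of createRecord: split a tab-separated row and build an AllelicCount.
// Column order (contig, position, ref count, alt count, ref nucleotide, alt nucleotide) is assumed.
static AllelicCount parseAllelicCountLine(final String line) {
    final String[] fields = line.split("\t");
    final String contig = fields[0];
    final int position = Integer.parseInt(fields[1]);
    final int refReadCount = Integer.parseInt(fields[2]);
    final int altReadCount = Integer.parseInt(fields[3]);
    final Nucleotide refNucleotide = Nucleotide.valueOf(fields[4].getBytes()[0]);
    final Nucleotide altNucleotide = Nucleotide.valueOf(fields[5].getBytes()[0]);
    return new AllelicCount(new SimpleInterval(contig, position, position), refReadCount, altReadCount, refNucleotide, altNucleotide);
}

For instance, parseAllelicCountLine("20\t10736\t14\t9\tG\tT") would build an AllelicCount at 20:10736 with 14 ref reads and 9 alt reads.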