use of org.broadinstitute.hellbender.utils.haplotype.Haplotype in project gatk by broadinstitute.
the class HaplotypeCallerGenotypingEngineUnitTest method makeConstructPhaseSetMappingData.
@DataProvider(name = "ConstructPhaseSetMappingProvider")
public Object[][] makeConstructPhaseSetMappingData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altT = Allele.create("T", false);
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(2).stop(2).alleles(Arrays.asList(ref, altC)).make();
final VariantContext vc3 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altT)).make();
final VariantContext vc4 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altC)).make();
final List<VariantContext> calls = Arrays.asList(vc2, vc3, vc4);
final Haplotype pos1 = new Haplotype("CAAAA".getBytes());
pos1.setEventMap(new EventMap(Arrays.asList(vc1)));
pos1.getEventMap().put(1, vc1);
final Haplotype pos2 = new Haplotype("ACAAA".getBytes());
pos2.setEventMap(new EventMap(Arrays.asList(vc2)));
pos2.getEventMap().put(2, vc2);
final Haplotype pos3 = new Haplotype("AACAA".getBytes());
pos3.setEventMap(new EventMap(Arrays.asList(vc3)));
pos3.getEventMap().put(3, vc3);
final Haplotype pos4 = new Haplotype("AAACA".getBytes());
pos4.setEventMap(new EventMap(Arrays.asList(vc4)));
pos4.getEventMap().put(4, vc4);
final Haplotype pos24 = new Haplotype("ACACA".getBytes());
pos24.setEventMap(new EventMap(Arrays.asList(vc2, vc4)));
pos24.getEventMap().put(2, vc2);
pos24.getEventMap().put(4, vc4);
final Haplotype pos34 = new Haplotype("AACCA".getBytes());
pos34.setEventMap(new EventMap(Arrays.asList(vc3, vc4)));
pos34.getEventMap().put(3, vc3);
pos34.getEventMap().put(4, vc4);
final Haplotype pos234 = new Haplotype("ACCCA".getBytes());
pos234.setEventMap(new EventMap(Arrays.asList(vc2, vc3, vc4)));
pos234.getEventMap().put(2, vc2);
pos234.getEventMap().put(3, vc3);
pos234.getEventMap().put(4, vc4);
final Map<VariantContext, Set<Haplotype>> haplotypeMap = new HashMap<>();
// test no phased variants #1
final Set<Haplotype> haplotypes2 = new HashSet<>();
haplotypes2.add(pos2);
haplotypeMap.put(vc2, haplotypes2);
tests.add(new Object[] { Arrays.asList(vc2), new HashMap<>(haplotypeMap), 2, 0, 0, 0, 0 });
// test no phased variants #2
final Set<Haplotype> haplotypes3 = new HashSet<>();
haplotypes3.add(pos3);
haplotypeMap.put(vc3, haplotypes3);
tests.add(new Object[] { Arrays.asList(vc2, vc3), new HashMap<>(haplotypeMap), 3, 0, 0, 0, 0 });
// test opposite phase
tests.add(new Object[] { Arrays.asList(vc2, vc3), new HashMap<>(haplotypeMap), 2, 2, 1, 1, 1 });
// test no phased variants #3
final Set<Haplotype> haplotypes4 = new HashSet<>();
haplotypes4.add(pos4);
haplotypeMap.put(vc4, haplotypes4);
tests.add(new Object[] { calls, new HashMap<>(haplotypeMap), 3, 0, 0, 0, 0 });
// test mixture
final Set<Haplotype> haplotypes24 = new HashSet<>();
haplotypes24.add(pos24);
haplotypeMap.put(vc2, haplotypes24);
haplotypeMap.put(vc4, haplotypes24);
tests.add(new Object[] { calls, new HashMap<>(haplotypeMap), 2, 3, 1, 2, 1 });
// test 2 hets
haplotypeMap.remove(vc3);
tests.add(new Object[] { Arrays.asList(vc2, vc4), new HashMap<>(haplotypeMap), 1, 2, 1, 2, 0 });
// test 2 with opposite phase
final Set<Haplotype> haplotypes1 = new HashSet<>();
haplotypes1.add(pos1);
haplotypeMap.put(vc1, haplotypes1);
tests.add(new Object[] { Arrays.asList(vc1, vc2, vc4), new HashMap<>(haplotypeMap), 2, 3, 1, 1, 2 });
// test homs around a het
final Set<Haplotype> haplotypes2hom = new HashSet<>();
haplotypes2hom.add(pos24);
haplotypes2hom.add(pos234);
final Set<Haplotype> haplotypes4hom = new HashSet<>();
haplotypes4hom.add(pos24);
haplotypes4hom.add(pos234);
final Set<Haplotype> haplotypes3het = new HashSet<>();
haplotypes3het.add(pos234);
haplotypeMap.put(vc2, haplotypes2hom);
haplotypeMap.put(vc3, haplotypes3het);
haplotypeMap.put(vc4, haplotypes4hom);
tests.add(new Object[] { calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0 });
// test hets around a hom
final Set<Haplotype> haplotypes2het = new HashSet<>();
haplotypes2het.add(pos234);
final Set<Haplotype> haplotypes4het = new HashSet<>();
haplotypes4het.add(pos234);
final Set<Haplotype> haplotypes3hom = new HashSet<>();
haplotypes3hom.add(pos3);
haplotypes3hom.add(pos234);
haplotypeMap.put(vc2, haplotypes2het);
haplotypeMap.put(vc3, haplotypes3hom);
haplotypeMap.put(vc4, haplotypes4het);
tests.add(new Object[] { calls, new HashMap<>(haplotypeMap), 2, 3, 1, 3, 0 });
// test no phased variants around a hom
final Set<Haplotype> haplotypes2incomplete = new HashSet<>();
haplotypes2incomplete.add(pos24);
final Set<Haplotype> haplotypes3incomplete = new HashSet<>();
haplotypes3incomplete.add(pos34);
final Set<Haplotype> haplotypes4complete = new HashSet<>();
haplotypes4complete.add(pos24);
haplotypes4complete.add(pos34);
haplotypes4complete.add(pos234);
haplotypeMap.put(vc2, haplotypes2incomplete);
haplotypeMap.put(vc3, haplotypes3incomplete);
haplotypeMap.put(vc4, haplotypes4complete);
tests.add(new Object[] { calls, new HashMap<>(haplotypeMap), 0, 0, 0, 0, 0 });
return tests.toArray(new Object[][] {});
}
use of org.broadinstitute.hellbender.utils.haplotype.Haplotype in project gatk by broadinstitute.
the class HaplotypeCallerGenotypingEngineUnitTest method makeCreateHaplotypeMappingData.
@DataProvider(name = "CreateHaplotypeMappingProvider")
public Object[][] makeCreateHaplotypeMappingData() {
List<Object[]> tests = new ArrayList<Object[]>();
final Set<Haplotype> haplotypes = new HashSet<>();
final Allele ref = Allele.create("A", true);
final Allele altC = Allele.create("C", false);
final Allele altT = Allele.create("T", false);
final Haplotype AtoC1 = new Haplotype("AACAA".getBytes());
final VariantContext vc1 = new VariantContextBuilder().chr("20").start(3).stop(3).alleles(Arrays.asList(ref, altC)).make();
AtoC1.setEventMap(new EventMap(Arrays.asList(vc1)));
AtoC1.getEventMap().put(3, vc1);
haplotypes.add(AtoC1);
final Haplotype AtoC2 = new Haplotype("AAACA".getBytes());
final VariantContext vc2 = new VariantContextBuilder().chr("20").start(4).stop(4).alleles(Arrays.asList(ref, altT)).make();
AtoC2.setEventMap(new EventMap(Arrays.asList(vc2)));
AtoC2.getEventMap().put(4, vc2);
haplotypes.add(AtoC2);
tests.add(new Object[] { vc1, haplotypes, AtoC1 });
tests.add(new Object[] { vc2, haplotypes, AtoC2 });
tests.add(new Object[] { new VariantContextBuilder().chr("20").start(1).stop(1).alleles(Arrays.asList(ref, altT)).make(), haplotypes, null });
return tests.toArray(new Object[][] {});
}
use of org.broadinstitute.hellbender.utils.haplotype.Haplotype in project gatk by broadinstitute.
the class AlignmentUtilsUnitTest method makeComplexReadAlignedToRef.
@DataProvider(name = "ComplexReadAlignedToRef")
public Object[][] makeComplexReadAlignedToRef() {
List<Object[]> tests = new ArrayList<>();
final List<Mutation> allMutations = Arrays.asList(new Mutation(1, 1, CigarOperator.M), new Mutation(2, 1, CigarOperator.M), new Mutation(3, 1, CigarOperator.I), new Mutation(7, 1, CigarOperator.D));
int i = 0;
final String referenceBases = "ACTGACTGACTG";
final String paddedReference = "NNNN" + referenceBases + "NNNN";
for (final List<Mutation> mutations : Utils.makePermutations(allMutations, 3, false)) {
final MutatedSequence hap = mutateSequence(referenceBases, mutations);
final Haplotype haplotype = new Haplotype(hap.seq.getBytes());
final SWPairwiseAlignment align = new SWPairwiseAlignment(paddedReference.getBytes(), hap.seq.getBytes());
haplotype.setAlignmentStartHapwrtRef(align.getAlignmentStart2wrt1());
haplotype.setCigar(align.getCigar());
for (final List<Mutation> readMutations : Utils.makePermutations(allMutations, 3, false)) {
final MutatedSequence readBases = mutateSequence(hap.seq, readMutations);
final GATKRead read = makeReadForAlignedToRefTest(readBases.seq);
tests.add(new Object[] { i++, read, paddedReference, haplotype, hap.numMismatches + readBases.numMismatches });
}
}
return tests.toArray(new Object[][] {});
}
use of org.broadinstitute.hellbender.utils.haplotype.Haplotype in project gatk-protected by broadinstitute.
the class ReadThreadingAssembler method runLocalAssembly.
/**
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
* @param assemblyRegion AssemblyRegion object holding the reads which are to be used during assembly
* @param refHaplotype reference haplotype object
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
* @param givenAlleles the alleles to inject into the haplotypes during GGA mode
* @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used.
* @return the resulting assembly-result-set
*/
public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final SimpleInterval refLoc, final List<VariantContext> givenAlleles, final ReadErrorCorrector readErrorCorrector, final SAMFileHeader header) {
Utils.nonNull(assemblyRegion, "Assembly engine cannot be used with a null AssemblyRegion.");
Utils.nonNull(assemblyRegion.getExtendedSpan(), "Active region must have an extended location.");
Utils.nonNull(refHaplotype, "Reference haplotype cannot be null.");
Utils.nonNull(fullReferenceWithPadding, "fullReferenceWithPadding");
Utils.nonNull(refLoc, "refLoc");
Utils.validateArg(fullReferenceWithPadding.length == refLoc.size(), "Reference bases and reference loc must be the same size.");
ParamUtils.isPositiveOrZero(pruneFactor, "Pruning factor cannot be negative");
// create the list of artificial haplotypes that should be added to the graph for GGA mode
final List<Haplotype> givenHaplotypes = composeGivenHaplotypes(refHaplotype, givenAlleles, assemblyRegion.getExtendedSpan());
// error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them
final List<GATKRead> correctedReads;
if (readErrorCorrector != null) {
// now correct all reads in active region after filtering/downsampling
// Note that original reads in active region are NOT modified by default, since they will be used later for GL computation,
// and we only want the read-error corrected reads for graph building.
readErrorCorrector.addReadsToKmers(assemblyRegion.getReads());
correctedReads = new ArrayList<>(readErrorCorrector.correctReads(assemblyRegion.getReads()));
} else {
correctedReads = assemblyRegion.getReads();
}
final List<SeqGraph> nonRefGraphs = new LinkedList<>();
final AssemblyResultSet resultSet = new AssemblyResultSet();
resultSet.setRegionForGenotyping(assemblyRegion);
resultSet.setFullReferenceWithPadding(fullReferenceWithPadding);
resultSet.setPaddedReferenceLoc(refLoc);
final SimpleInterval activeRegionExtendedLocation = assemblyRegion.getExtendedSpan();
refHaplotype.setGenomeLocation(activeRegionExtendedLocation);
resultSet.add(refHaplotype);
final Map<SeqGraph, AssemblyResult> assemblyResultByGraph = new HashMap<>();
// create the graphs by calling our subclass assemble method
for (final AssemblyResult result : assemble(correctedReads, refHaplotype, givenHaplotypes, header)) {
if (result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION) {
// do some QC on the graph
sanityCheckGraph(result.getGraph(), refHaplotype);
// add it to graphs with meaningful non-reference features
assemblyResultByGraph.put(result.getGraph(), result);
nonRefGraphs.add(result.getGraph());
}
}
findBestPaths(nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet);
// print the graphs if the appropriate debug option has been turned on
if (graphOutputPath != null) {
printGraphs(nonRefGraphs);
}
return resultSet;
}
use of org.broadinstitute.hellbender.utils.haplotype.Haplotype in project gatk-protected by broadinstitute.
the class ReadThreadingAssembler method findBestPaths.
private List<Haplotype> findBestPaths(final Collection<SeqGraph> graphs, final Haplotype refHaplotype, final SimpleInterval refLoc, final SimpleInterval activeRegionWindow, final Map<SeqGraph, AssemblyResult> assemblyResultByGraph, final AssemblyResultSet assemblyResultSet) {
// add the reference haplotype separately from all the others to ensure that it is present in the list of haplotypes
final Set<Haplotype> returnHaplotypes = new LinkedHashSet<>();
final int activeRegionStart = refHaplotype.getAlignmentStartHapwrtRef();
final Collection<KBestHaplotypeFinder> finders = new ArrayList<>(graphs.size());
int failedCigars = 0;
for (final SeqGraph graph : graphs) {
final SeqVertex source = graph.getReferenceSourceVertex();
final SeqVertex sink = graph.getReferenceSinkVertex();
Utils.validateArg(source != null && sink != null, () -> "Both source and sink cannot be null but got " + source + " and sink " + sink + " for graph " + graph);
final KBestHaplotypeFinder haplotypeFinder = new KBestHaplotypeFinder(graph, source, sink);
finders.add(haplotypeFinder);
final Iterator<KBestHaplotype> bestHaplotypes = haplotypeFinder.iterator(numBestHaplotypesPerGraph);
while (bestHaplotypes.hasNext()) {
final KBestHaplotype kBestHaplotype = bestHaplotypes.next();
final Haplotype h = kBestHaplotype.haplotype();
if (!returnHaplotypes.contains(h)) {
final Cigar cigar = CigarUtils.calculateCigar(refHaplotype.getBases(), h.getBases());
if (cigar == null) {
// couldn't produce a meaningful alignment of haplotype to reference, fail quietly
failedCigars++;
continue;
} else if (cigar.isEmpty()) {
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength());
} else if (pathIsTooDivergentFromReference(cigar) || cigar.getReferenceLength() < MIN_HAPLOTYPE_REFERENCE_LENGTH) {
// N cigar elements means that a bubble was too divergent from the reference so skip over this path
continue;
} else if (cigar.getReferenceLength() != refHaplotype.getCigar().getReferenceLength()) {
// SW failure
throw new IllegalStateException("Smith-Waterman alignment failure. Cigar = " + cigar + " with reference length " + cigar.getReferenceLength() + " but expecting reference length of " + refHaplotype.getCigar().getReferenceLength() + " ref = " + refHaplotype + " path " + new String(h.getBases()));
}
h.setCigar(cigar);
h.setAlignmentStartHapwrtRef(activeRegionStart);
h.setGenomeLocation(activeRegionWindow);
returnHaplotypes.add(h);
assemblyResultSet.add(h, assemblyResultByGraph.get(graph));
if (debug) {
logger.info("Adding haplotype " + h.getCigar() + " from graph with kmer " + graph.getKmerSize());
}
}
}
}
// the first returned by any finder.
if (!returnHaplotypes.contains(refHaplotype)) {
double refScore = Double.NaN;
for (final KBestHaplotypeFinder finder : finders) {
final double candidate = finder.score(refHaplotype);
if (Double.isNaN(candidate)) {
continue;
}
refScore = candidate;
break;
}
refHaplotype.setScore(refScore);
returnHaplotypes.add(refHaplotype);
}
if (failedCigars != 0) {
logger.debug(String.format("failed to align some haplotypes (%d) back to the reference (loc=%s); these will be ignored.", failedCigars, refLoc.toString()));
}
if (debug) {
if (returnHaplotypes.size() > 1) {
logger.info("Found " + returnHaplotypes.size() + " candidate haplotypes of " + returnHaplotypes.size() + " possible combinations to evaluate every read against.");
} else {
logger.info("Found only the reference haplotype in the assembly graph.");
}
for (final Haplotype h : returnHaplotypes) {
logger.info(h.toString());
logger.info("> Cigar = " + h.getCigar() + " : " + h.getCigar().getReferenceLength() + " score " + h.getScore() + " ref " + h.isReference());
}
}
return new ArrayList<>(returnHaplotypes);
}
Aggregations