use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk-protected by broadinstitute.
the class ReadThreadingAssembler method runLocalAssembly.
/**
* Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
* @param assemblyRegion AssemblyRegion object holding the reads which are to be used during assembly
* @param refHaplotype reference haplotype object
* @param fullReferenceWithPadding byte array holding the reference sequence with padding
* @param refLoc GenomeLoc object corresponding to the reference sequence with padding
* @param givenAlleles the alleles to inject into the haplotypes during GGA mode
* @param readErrorCorrector a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used.
* @return the resulting assembly-result-set
*/
public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final SimpleInterval refLoc, final List<VariantContext> givenAlleles, final ReadErrorCorrector readErrorCorrector, final SAMFileHeader header) {
Utils.nonNull(assemblyRegion, "Assembly engine cannot be used with a null AssemblyRegion.");
Utils.nonNull(assemblyRegion.getExtendedSpan(), "Active region must have an extended location.");
Utils.nonNull(refHaplotype, "Reference haplotype cannot be null.");
Utils.nonNull(fullReferenceWithPadding, "fullReferenceWithPadding");
Utils.nonNull(refLoc, "refLoc");
Utils.validateArg(fullReferenceWithPadding.length == refLoc.size(), "Reference bases and reference loc must be the same size.");
ParamUtils.isPositiveOrZero(pruneFactor, "Pruning factor cannot be negative");
// create the list of artificial haplotypes that should be added to the graph for GGA mode
final List<Haplotype> givenHaplotypes = composeGivenHaplotypes(refHaplotype, givenAlleles, assemblyRegion.getExtendedSpan());
// error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them
final List<GATKRead> correctedReads;
if (readErrorCorrector != null) {
// now correct all reads in active region after filtering/downsampling
// Note that original reads in active region are NOT modified by default, since they will be used later for GL computation,
// and we only want the read-error corrected reads for graph building.
readErrorCorrector.addReadsToKmers(assemblyRegion.getReads());
correctedReads = new ArrayList<>(readErrorCorrector.correctReads(assemblyRegion.getReads()));
} else {
correctedReads = assemblyRegion.getReads();
}
final List<SeqGraph> nonRefGraphs = new LinkedList<>();
final AssemblyResultSet resultSet = new AssemblyResultSet();
resultSet.setRegionForGenotyping(assemblyRegion);
resultSet.setFullReferenceWithPadding(fullReferenceWithPadding);
resultSet.setPaddedReferenceLoc(refLoc);
final SimpleInterval activeRegionExtendedLocation = assemblyRegion.getExtendedSpan();
refHaplotype.setGenomeLocation(activeRegionExtendedLocation);
resultSet.add(refHaplotype);
final Map<SeqGraph, AssemblyResult> assemblyResultByGraph = new HashMap<>();
// create the graphs by calling our subclass assemble method
for (final AssemblyResult result : assemble(correctedReads, refHaplotype, givenHaplotypes, header)) {
if (result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION) {
// do some QC on the graph
sanityCheckGraph(result.getGraph(), refHaplotype);
// add it to graphs with meaningful non-reference features
assemblyResultByGraph.put(result.getGraph(), result);
nonRefGraphs.add(result.getGraph());
}
}
findBestPaths(nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet);
// print the graphs if the appropriate debug option has been turned on
if (graphOutputPath != null) {
printGraphs(nonRefGraphs);
}
return resultSet;
}
use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk-protected by broadinstitute.
the class ReadThreadingAssembler method createGraph.
/**
* Creates the sequence graph for the given kmerSize
*
* @param reads reads to use
* @param refHaplotype reference haplotype
* @param kmerSize kmer size
* @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
* @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
* @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
*/
private AssemblyResult createGraph(final Iterable<GATKRead> reads, final Haplotype refHaplotype, final int kmerSize, final Iterable<Haplotype> activeAlleleHaplotypes, final boolean allowLowComplexityGraphs, final boolean allowNonUniqueKmersInRef, final SAMFileHeader header) {
if (refHaplotype.length() < kmerSize) {
// happens in cases where the assembled region is just too small
return new AssemblyResult(AssemblyResult.Status.FAILED, null, null);
}
if (!allowNonUniqueKmersInRef && !ReadThreadingGraph.determineNonUniqueKmers(new ReadThreadingGraph.SequenceForKmers("ref", refHaplotype.getBases(), 0, refHaplotype.getBases().length, 1, true), kmerSize).isEmpty()) {
if (debug) {
logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because reference contains non-unique kmers");
}
return null;
}
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, MIN_BASE_QUALITY_TO_USE_IN_ASSEMBLY, numPruningSamples);
rtgraph.setThreadingStartOnlyAtExistingVertex(!recoverDanglingBranches);
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), true);
// add the artificial GGA haplotypes to the graph
int hapCount = 0;
for (final Haplotype h : activeAlleleHaplotypes) {
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false);
}
// Next pull kmers out of every read and throw them on the graph
for (final GATKRead read : reads) {
rtgraph.addRead(read, header);
}
// actually build the read threading graph
rtgraph.buildGraphIfNecessary();
// sanity check: make sure there are no cycles in the graph
if (rtgraph.hasCycles()) {
if (debug) {
logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
}
return null;
}
// sanity check: make sure the graph had enough complexity with the given kmer
if (!allowLowComplexityGraphs && rtgraph.isLowComplexity()) {
if (debug) {
logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
}
return null;
}
return getAssemblyResult(refHaplotype, kmerSize, rtgraph);
}
use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk-protected by broadinstitute.
the class ReadThreadingAssembler method getAssemblyResult.
private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
// prune all of the chains where all edges have multiplicity < pruneFactor. This must occur
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
rtgraph.pruneLowWeightChains(pruneFactor);
// we can recover them by merging some N bases from the chain back into the reference
if (recoverDanglingBranches) {
rtgraph.recoverDanglingTails(pruneFactor, minDanglingBranchLength);
rtgraph.recoverDanglingHeads(pruneFactor, minDanglingBranchLength);
}
// remove all heading and trailing paths
if (removePathsNotConnectedToRef) {
rtgraph.removePathsNotConnectedToRef();
}
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot");
final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph();
if (debugGraphTransformations) {
initialSeqGraph.printGraph(new File(debugGraphOutputPath, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"), 10000);
}
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
if (justReturnRawGraph) {
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph, null);
}
if (debug) {
logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
}
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot");
// TODO -- I don't this is possible by construction
initialSeqGraph.cleanNonRefPaths();
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
final AssemblyResult.Status status = cleaned.getStatus();
return new AssemblyResult(status, cleaned.getGraph(), rtgraph);
}
use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk by broadinstitute.
the class ReadThreadingAssembler method createGraph.
/**
* Creates the sequence graph for the given kmerSize
*
* @param reads reads to use
* @param refHaplotype reference haplotype
* @param kmerSize kmer size
* @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
* @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
* @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers
* @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
*/
private AssemblyResult createGraph(final Iterable<GATKRead> reads, final Haplotype refHaplotype, final int kmerSize, final Iterable<Haplotype> activeAlleleHaplotypes, final boolean allowLowComplexityGraphs, final boolean allowNonUniqueKmersInRef, final SAMFileHeader header) {
if (refHaplotype.length() < kmerSize) {
// happens in cases where the assembled region is just too small
return new AssemblyResult(AssemblyResult.Status.FAILED, null, null);
}
if (!allowNonUniqueKmersInRef && !ReadThreadingGraph.determineNonUniqueKmers(new ReadThreadingGraph.SequenceForKmers("ref", refHaplotype.getBases(), 0, refHaplotype.getBases().length, 1, true), kmerSize).isEmpty()) {
if (debug) {
logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because reference contains non-unique kmers");
}
return null;
}
final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, MIN_BASE_QUALITY_TO_USE_IN_ASSEMBLY, numPruningSamples);
rtgraph.setThreadingStartOnlyAtExistingVertex(!recoverDanglingBranches);
// add the reference sequence to the graph
rtgraph.addSequence("ref", refHaplotype.getBases(), true);
// add the artificial GGA haplotypes to the graph
int hapCount = 0;
for (final Haplotype h : activeAlleleHaplotypes) {
rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false);
}
// Next pull kmers out of every read and throw them on the graph
for (final GATKRead read : reads) {
rtgraph.addRead(read, header);
}
// actually build the read threading graph
rtgraph.buildGraphIfNecessary();
// sanity check: make sure there are no cycles in the graph
if (rtgraph.hasCycles()) {
if (debug) {
logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
}
return null;
}
// sanity check: make sure the graph had enough complexity with the given kmer
if (!allowLowComplexityGraphs && rtgraph.isLowComplexity()) {
if (debug) {
logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
}
return null;
}
return getAssemblyResult(refHaplotype, kmerSize, rtgraph);
}
use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk by broadinstitute.
the class ReadThreadingAssembler method getAssemblyResult.
private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph) {
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
// prune all of the chains where all edges have multiplicity < pruneFactor. This must occur
// before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
// tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
rtgraph.pruneLowWeightChains(pruneFactor);
// we can recover them by merging some N bases from the chain back into the reference
if (recoverDanglingBranches) {
rtgraph.recoverDanglingTails(pruneFactor, minDanglingBranchLength);
rtgraph.recoverDanglingHeads(pruneFactor, minDanglingBranchLength);
}
// remove all heading and trailing paths
if (removePathsNotConnectedToRef) {
rtgraph.removePathsNotConnectedToRef();
}
printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot");
final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph();
if (debugGraphTransformations) {
initialSeqGraph.printGraph(new File(debugGraphOutputPath, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"), 10000);
}
// if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
if (justReturnRawGraph) {
return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph, null);
}
if (debug) {
logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
}
printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot");
// TODO -- I don't this is possible by construction
initialSeqGraph.cleanNonRefPaths();
final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
final AssemblyResult.Status status = cleaned.getStatus();
return new AssemblyResult(status, cleaned.getGraph(), rtgraph);
}
Aggregations