Search in sources :

Example 1 with AssemblyResult

use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk-protected by broadinstitute.

the class ReadThreadingAssembler method runLocalAssembly.

/**
     * Main entry point into the assembly engine. Build a set of deBruijn graphs out of the provided reference sequence and list of reads
     * @param assemblyRegion              AssemblyRegion object holding the reads which are to be used during assembly
     * @param refHaplotype              reference haplotype object
     * @param fullReferenceWithPadding  byte array holding the reference sequence with padding
     * @param refLoc                    GenomeLoc object corresponding to the reference sequence with padding
     * @param givenAlleles   the alleles to inject into the haplotypes during GGA mode
     * @param readErrorCorrector        a ReadErrorCorrector object, if read are to be corrected before assembly. Can be null if no error corrector is to be used.
     * @return                          the resulting assembly-result-set
     */
public AssemblyResultSet runLocalAssembly(final AssemblyRegion assemblyRegion, final Haplotype refHaplotype, final byte[] fullReferenceWithPadding, final SimpleInterval refLoc, final List<VariantContext> givenAlleles, final ReadErrorCorrector readErrorCorrector, final SAMFileHeader header) {
    Utils.nonNull(assemblyRegion, "Assembly engine cannot be used with a null AssemblyRegion.");
    Utils.nonNull(assemblyRegion.getExtendedSpan(), "Active region must have an extended location.");
    Utils.nonNull(refHaplotype, "Reference haplotype cannot be null.");
    Utils.nonNull(fullReferenceWithPadding, "fullReferenceWithPadding");
    Utils.nonNull(refLoc, "refLoc");
    Utils.validateArg(fullReferenceWithPadding.length == refLoc.size(), "Reference bases and reference loc must be the same size.");
    ParamUtils.isPositiveOrZero(pruneFactor, "Pruning factor cannot be negative");
    // create the list of artificial haplotypes that should be added to the graph for GGA mode
    final List<Haplotype> givenHaplotypes = composeGivenHaplotypes(refHaplotype, givenAlleles, assemblyRegion.getExtendedSpan());
    // error-correct reads before clipping low-quality tails: some low quality bases might be good and we want to recover them
    final List<GATKRead> correctedReads;
    if (readErrorCorrector != null) {
        // now correct all reads in active region after filtering/downsampling
        // Note that original reads in active region are NOT modified by default, since they will be used later for GL computation,
        // and we only want the read-error corrected reads for graph building.
        readErrorCorrector.addReadsToKmers(assemblyRegion.getReads());
        correctedReads = new ArrayList<>(readErrorCorrector.correctReads(assemblyRegion.getReads()));
    } else {
        correctedReads = assemblyRegion.getReads();
    }
    final List<SeqGraph> nonRefGraphs = new LinkedList<>();
    final AssemblyResultSet resultSet = new AssemblyResultSet();
    resultSet.setRegionForGenotyping(assemblyRegion);
    resultSet.setFullReferenceWithPadding(fullReferenceWithPadding);
    resultSet.setPaddedReferenceLoc(refLoc);
    final SimpleInterval activeRegionExtendedLocation = assemblyRegion.getExtendedSpan();
    refHaplotype.setGenomeLocation(activeRegionExtendedLocation);
    resultSet.add(refHaplotype);
    final Map<SeqGraph, AssemblyResult> assemblyResultByGraph = new HashMap<>();
    // create the graphs by calling our subclass assemble method
    for (final AssemblyResult result : assemble(correctedReads, refHaplotype, givenHaplotypes, header)) {
        if (result.getStatus() == AssemblyResult.Status.ASSEMBLED_SOME_VARIATION) {
            // do some QC on the graph
            sanityCheckGraph(result.getGraph(), refHaplotype);
            // add it to graphs with meaningful non-reference features
            assemblyResultByGraph.put(result.getGraph(), result);
            nonRefGraphs.add(result.getGraph());
        }
    }
    findBestPaths(nonRefGraphs, refHaplotype, refLoc, activeRegionExtendedLocation, assemblyResultByGraph, resultSet);
    // print the graphs if the appropriate debug option has been turned on
    if (graphOutputPath != null) {
        printGraphs(nonRefGraphs);
    }
    return resultSet;
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) AssemblyResultSet(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResultSet) AssemblyResult(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Haplotype(org.broadinstitute.hellbender.utils.haplotype.Haplotype)

Example 2 with AssemblyResult

use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk-protected by broadinstitute.

the class ReadThreadingAssembler method createGraph.

/**
     * Creates the sequence graph for the given kmerSize
     *
     * @param reads            reads to use
     * @param refHaplotype     reference haplotype
     * @param kmerSize         kmer size
     * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
     * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
     * @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers
     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
     */
private AssemblyResult createGraph(final Iterable<GATKRead> reads, final Haplotype refHaplotype, final int kmerSize, final Iterable<Haplotype> activeAlleleHaplotypes, final boolean allowLowComplexityGraphs, final boolean allowNonUniqueKmersInRef, final SAMFileHeader header) {
    if (refHaplotype.length() < kmerSize) {
        // happens in cases where the assembled region is just too small
        return new AssemblyResult(AssemblyResult.Status.FAILED, null, null);
    }
    if (!allowNonUniqueKmersInRef && !ReadThreadingGraph.determineNonUniqueKmers(new ReadThreadingGraph.SequenceForKmers("ref", refHaplotype.getBases(), 0, refHaplotype.getBases().length, 1, true), kmerSize).isEmpty()) {
        if (debug) {
            logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because reference contains non-unique kmers");
        }
        return null;
    }
    final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, MIN_BASE_QUALITY_TO_USE_IN_ASSEMBLY, numPruningSamples);
    rtgraph.setThreadingStartOnlyAtExistingVertex(!recoverDanglingBranches);
    // add the reference sequence to the graph
    rtgraph.addSequence("ref", refHaplotype.getBases(), true);
    // add the artificial GGA haplotypes to the graph
    int hapCount = 0;
    for (final Haplotype h : activeAlleleHaplotypes) {
        rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false);
    }
    // Next pull kmers out of every read and throw them on the graph
    for (final GATKRead read : reads) {
        rtgraph.addRead(read, header);
    }
    // actually build the read threading graph
    rtgraph.buildGraphIfNecessary();
    // sanity check: make sure there are no cycles in the graph
    if (rtgraph.hasCycles()) {
        if (debug) {
            logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
        }
        return null;
    }
    // sanity check: make sure the graph had enough complexity with the given kmer
    if (!allowLowComplexityGraphs && rtgraph.isLowComplexity()) {
        if (debug) {
            logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
        }
        return null;
    }
    return getAssemblyResult(refHaplotype, kmerSize, rtgraph);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) AssemblyResult(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult) Haplotype(org.broadinstitute.hellbender.utils.haplotype.Haplotype)

Example 3 with AssemblyResult

use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk-protected by broadinstitute.

the class ReadThreadingAssembler method getAssemblyResult.

private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph) {
    printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
    // prune all of the chains where all edges have multiplicity < pruneFactor.  This must occur
    // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
    // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
    rtgraph.pruneLowWeightChains(pruneFactor);
    // we can recover them by merging some N bases from the chain back into the reference
    if (recoverDanglingBranches) {
        rtgraph.recoverDanglingTails(pruneFactor, minDanglingBranchLength);
        rtgraph.recoverDanglingHeads(pruneFactor, minDanglingBranchLength);
    }
    // remove all heading and trailing paths
    if (removePathsNotConnectedToRef) {
        rtgraph.removePathsNotConnectedToRef();
    }
    printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot");
    final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph();
    if (debugGraphTransformations) {
        initialSeqGraph.printGraph(new File(debugGraphOutputPath, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"), 10000);
    }
    // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
    if (justReturnRawGraph) {
        return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph, null);
    }
    if (debug) {
        logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
    }
    printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot");
    // TODO -- I don't this is possible by construction
    initialSeqGraph.cleanNonRefPaths();
    final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
    final AssemblyResult.Status status = cleaned.getStatus();
    return new AssemblyResult(status, cleaned.getGraph(), rtgraph);
}
Also used : AssemblyResult(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult) File(java.io.File)

Example 4 with AssemblyResult

use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk by broadinstitute.

the class ReadThreadingAssembler method createGraph.

/**
     * Creates the sequence graph for the given kmerSize
     *
     * @param reads            reads to use
     * @param refHaplotype     reference haplotype
     * @param kmerSize         kmer size
     * @param activeAlleleHaplotypes the GGA haplotypes to inject into the graph
     * @param allowLowComplexityGraphs if true, do not check for low-complexity graphs
     * @param allowNonUniqueKmersInRef if true, do not fail if the reference has non-unique kmers
     * @return sequence graph or null if one could not be created (e.g. because it contains cycles or too many paths or is low complexity)
     */
private AssemblyResult createGraph(final Iterable<GATKRead> reads, final Haplotype refHaplotype, final int kmerSize, final Iterable<Haplotype> activeAlleleHaplotypes, final boolean allowLowComplexityGraphs, final boolean allowNonUniqueKmersInRef, final SAMFileHeader header) {
    if (refHaplotype.length() < kmerSize) {
        // happens in cases where the assembled region is just too small
        return new AssemblyResult(AssemblyResult.Status.FAILED, null, null);
    }
    if (!allowNonUniqueKmersInRef && !ReadThreadingGraph.determineNonUniqueKmers(new ReadThreadingGraph.SequenceForKmers("ref", refHaplotype.getBases(), 0, refHaplotype.getBases().length, 1, true), kmerSize).isEmpty()) {
        if (debug) {
            logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because reference contains non-unique kmers");
        }
        return null;
    }
    final ReadThreadingGraph rtgraph = new ReadThreadingGraph(kmerSize, debugGraphTransformations, MIN_BASE_QUALITY_TO_USE_IN_ASSEMBLY, numPruningSamples);
    rtgraph.setThreadingStartOnlyAtExistingVertex(!recoverDanglingBranches);
    // add the reference sequence to the graph
    rtgraph.addSequence("ref", refHaplotype.getBases(), true);
    // add the artificial GGA haplotypes to the graph
    int hapCount = 0;
    for (final Haplotype h : activeAlleleHaplotypes) {
        rtgraph.addSequence("activeAllele" + hapCount++, h.getBases(), GGA_MODE_ARTIFICIAL_COUNTS, false);
    }
    // Next pull kmers out of every read and throw them on the graph
    for (final GATKRead read : reads) {
        rtgraph.addRead(read, header);
    }
    // actually build the read threading graph
    rtgraph.buildGraphIfNecessary();
    // sanity check: make sure there are no cycles in the graph
    if (rtgraph.hasCycles()) {
        if (debug) {
            logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it contains a cycle");
        }
        return null;
    }
    // sanity check: make sure the graph had enough complexity with the given kmer
    if (!allowLowComplexityGraphs && rtgraph.isLowComplexity()) {
        if (debug) {
            logger.info("Not using kmer size of " + kmerSize + " in read threading assembler because it does not produce a graph with enough complexity");
        }
        return null;
    }
    return getAssemblyResult(refHaplotype, kmerSize, rtgraph);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) AssemblyResult(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult) Haplotype(org.broadinstitute.hellbender.utils.haplotype.Haplotype)

Example 5 with AssemblyResult

use of org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult in project gatk by broadinstitute.

the class ReadThreadingAssembler method getAssemblyResult.

private AssemblyResult getAssemblyResult(final Haplotype refHaplotype, final int kmerSize, final ReadThreadingGraph rtgraph) {
    printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.0.raw_readthreading_graph.dot");
    // prune all of the chains where all edges have multiplicity < pruneFactor.  This must occur
    // before recoverDanglingTails in the graph, so that we don't spend a ton of time recovering
    // tails that we'll ultimately just trim away anyway, as the dangling tail edges have weight of 1
    rtgraph.pruneLowWeightChains(pruneFactor);
    // we can recover them by merging some N bases from the chain back into the reference
    if (recoverDanglingBranches) {
        rtgraph.recoverDanglingTails(pruneFactor, minDanglingBranchLength);
        rtgraph.recoverDanglingHeads(pruneFactor, minDanglingBranchLength);
    }
    // remove all heading and trailing paths
    if (removePathsNotConnectedToRef) {
        rtgraph.removePathsNotConnectedToRef();
    }
    printDebugGraphTransform(rtgraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.cleaned_readthreading_graph.dot");
    final SeqGraph initialSeqGraph = rtgraph.toSequenceGraph();
    if (debugGraphTransformations) {
        initialSeqGraph.printGraph(new File(debugGraphOutputPath, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.1.initial_seqgraph.dot"), 10000);
    }
    // if the unit tests don't want us to cleanup the graph, just return the raw sequence graph
    if (justReturnRawGraph) {
        return new AssemblyResult(AssemblyResult.Status.ASSEMBLED_SOME_VARIATION, initialSeqGraph, null);
    }
    if (debug) {
        logger.info("Using kmer size of " + rtgraph.getKmerSize() + " in read threading assembler");
    }
    printDebugGraphTransform(initialSeqGraph, refHaplotype.getLocation() + "-sequenceGraph." + kmerSize + ".0.2.initial_seqgraph.dot");
    // TODO -- I don't this is possible by construction
    initialSeqGraph.cleanNonRefPaths();
    final AssemblyResult cleaned = cleanupSeqGraph(initialSeqGraph);
    final AssemblyResult.Status status = cleaned.getStatus();
    return new AssemblyResult(status, cleaned.getGraph(), rtgraph);
}
Also used : AssemblyResult(org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult) File(java.io.File)

Aggregations

AssemblyResult (org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResult)6 Haplotype (org.broadinstitute.hellbender.utils.haplotype.Haplotype)4 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)4 File (java.io.File)2 AssemblyResultSet (org.broadinstitute.hellbender.tools.walkers.haplotypecaller.AssemblyResultSet)2 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)2