use of au.edu.wehi.idsv.DirectedEvidence in project gridss by PapenfussLab.
the class NonReferenceContigAssembler method callContig.
/**
* Calls the contig.
* Note: to call the contig, additional graph nodes may be required to be loaded. No loading checks
* (such as flushing excessively dense intervals) are performed on these additional loaded nodes.
* @param contig to call
* @return SAMRecord containing breakend assembly, null if a valid break-end assembly contig
* could not be created from this contig
*/
private SAMRecord callContig(ArrayDeque<KmerPathSubnode> rawcontig) {
if (rawcontig == null)
return null;
ArrayDeque<KmerPathSubnode> contig = rawcontig;
if (containsKmerRepeat(contig)) {
// recalculate the called contig, this may break the contig at the repeated kmer
MisassemblyFixer fixed = new MisassemblyFixer(contig);
contig = new ArrayDeque<KmerPathSubnode>(fixed.correctMisassignedEvidence(evidenceTracker.support(contig)));
}
if (contig.isEmpty())
return null;
int contigLength = contig.stream().mapToInt(sn -> sn.length()).sum();
int targetAnchorLength = Math.max(Math.min(contigLength, maxExpectedBreakendLength()), maxAnchorLength);
KmerPathNodePath startAnchorPath = new KmerPathNodePath(contig.getFirst(), false, targetAnchorLength + maxEvidenceSupportIntervalWidth + contig.getFirst().length());
startAnchorPath.greedyTraverse(true, false);
ArrayDeque<KmerPathSubnode> startingAnchor = startAnchorPath.headNode().asSubnodes();
startingAnchor.removeLast();
// make sure we have enough of the graph loaded so that when
// we traverse forward, our anchor sequence will be fully defined
advanceUnderlying(contig.getLast().lastEnd() + targetAnchorLength + minDistanceFromNextPositionForEvidenceToBeFullyLoaded());
KmerPathNodePath endAnchorPath = new KmerPathNodePath(contig.getLast(), true, targetAnchorLength + maxEvidenceSupportIntervalWidth + contig.getLast().length());
endAnchorPath.greedyTraverse(true, false);
ArrayDeque<KmerPathSubnode> endingAnchor = endAnchorPath.headNode().asSubnodes();
endingAnchor.removeFirst();
List<KmerPathSubnode> fullContig = new ArrayList<KmerPathSubnode>(contig.size() + startingAnchor.size() + endingAnchor.size());
fullContig.addAll(startingAnchor);
fullContig.addAll(contig);
fullContig.addAll(endingAnchor);
byte[] bases = KmerEncodingHelper.baseCalls(fullContig.stream().flatMap(sn -> sn.node().pathKmers().stream()).collect(Collectors.toList()), k);
byte[] quals = DeBruijnGraphBase.kmerWeightsToBaseQuals(k, fullContig.stream().flatMapToInt(sn -> sn.node().pathWeights().stream().mapToInt(Integer::intValue)).toArray());
assert (quals.length == bases.length);
// left aligned anchor position although it shouldn't matter since anchoring should be a single base wide
int startAnchorPosition = startingAnchor.size() == 0 ? 0 : startingAnchor.getLast().lastStart() + k - 1;
int endAnchorPosition = endingAnchor.size() == 0 ? 0 : endingAnchor.getFirst().firstStart();
int startAnchorBaseCount = startingAnchor.size() == 0 ? 0 : startingAnchor.stream().mapToInt(n -> n.length()).sum() + k - 1;
int endAnchorBaseCount = endingAnchor.size() == 0 ? 0 : endingAnchor.stream().mapToInt(n -> n.length()).sum() + k - 1;
int startBasesToTrim = Math.max(0, startAnchorBaseCount - targetAnchorLength);
int endingBasesToTrim = Math.max(0, endAnchorBaseCount - targetAnchorLength);
bases = Arrays.copyOfRange(bases, startBasesToTrim, bases.length - endingBasesToTrim);
quals = Arrays.copyOfRange(quals, startBasesToTrim, quals.length - endingBasesToTrim);
Set<KmerEvidence> evidence = evidenceTracker.untrack(contig);
List<DirectedEvidence> evidenceIds = evidence.stream().map(e -> e.evidence()).collect(Collectors.toList());
SAMRecord assembledContig;
if (startingAnchor.size() == 0 && endingAnchor.size() == 0) {
assert (startBasesToTrim == 0);
assert (endingBasesToTrim == 0);
// unanchored
if (evidence.size() > 0) {
BreakendSummary be = Models.calculateBreakend(aes.getContext().getLinear(), evidence.stream().map(e -> e.evidence().getBreakendSummary()).collect(Collectors.toList()), evidence.stream().map(e -> ScalingHelper.toScaledWeight(e.evidenceQuality())).collect(Collectors.toList()));
assembledContig = AssemblyFactory.createUnanchoredBreakend(aes.getContext(), aes, assemblyNameGenerator, be, evidenceIds, bases, quals);
if (evidence.stream().anyMatch(e -> e.isAnchored())) {
log.debug(String.format("Unanchored assembly %s at %s:%d contains anchored evidence", assembledContig.getReadName(), contigName, contig.getFirst().firstStart()));
}
} else {
// Can't call contig because we don't known the supporting breakend positions
assembledContig = null;
}
} else if (startingAnchor.size() == 0) {
// end anchored
assembledContig = AssemblyFactory.createAnchoredBreakend(aes.getContext(), aes, assemblyNameGenerator, BreakendDirection.Backward, evidenceIds, referenceIndex, endAnchorPosition, endAnchorBaseCount - endingBasesToTrim, bases, quals);
} else if (endingAnchor.size() == 0) {
// start anchored
assembledContig = AssemblyFactory.createAnchoredBreakend(aes.getContext(), aes, assemblyNameGenerator, BreakendDirection.Forward, evidenceIds, referenceIndex, startAnchorPosition, startAnchorBaseCount - startBasesToTrim, bases, quals);
} else {
if (startAnchorBaseCount + endAnchorBaseCount >= quals.length) {
// no unanchored bases - not an SV assembly
assembledContig = null;
} else {
assembledContig = AssemblyFactory.createAnchoredBreakpoint(aes.getContext(), aes, assemblyNameGenerator, evidenceIds, referenceIndex, startAnchorPosition, startAnchorBaseCount - startBasesToTrim, referenceIndex, endAnchorPosition, endAnchorBaseCount - endingBasesToTrim, bases, quals);
}
}
if (contigLength > maxExpectedBreakendLength()) {
log.debug(String.format("Called breakend contig %s at %s:%d-%d of length %d when maximum expected breakend contig length is %d", assembledContig == null ? "" : assembledContig.getReadName(), contigName, rawcontig.getFirst().firstStart(), rawcontig.getLast().lastEnd(), contigLength, maxExpectedBreakendLength()));
}
if (assembledContig != null) {
if (aes.getContext().getConfig().getVisualisation().assemblyGraph) {
try {
PositionalExporter.exportDot(new File(aes.getContext().getConfig().getVisualisation().directory, "assembly." + contigName + "." + assembledContig.getReadName() + ".dot"), k, graphByPosition, fullContig);
} catch (Exception ex) {
log.debug(ex, "Error exporting assembly ", assembledContig != null ? assembledContig.getReadName() : "(null)", " ", contigName);
}
}
if (aes.getContext().getConfig().getVisualisation().assemblyGraphFullSize) {
try {
PositionalExporter.exportNodeDot(new File(aes.getContext().getConfig().getVisualisation().directory, "assembly.fullsize." + contigName + "." + assembledContig.getReadName() + ".dot"), k, graphByPosition, fullContig);
} catch (Exception ex) {
log.debug(ex, "Error exporting assembly ", assembledContig != null ? assembledContig.getReadName() : "(null)", " ", contigName);
}
}
if (aes.getContext().getConfig().getVisualisation().assemblyContigMemoization) {
File file = new File(aes.getContext().getConfig().getVisualisation().directory, "assembly.path.memoization." + contigName + "." + Integer.toString(pathExportCount.incrementAndGet()) + ".csv");
try {
bestContigCaller.exportState(file);
} catch (IOException e) {
log.debug(e, " Unable to export assembly path memoization to ", file, " ", contigName);
}
}
}
stats.contigNodes = contig.size();
stats.truncatedNodes = rawcontig.size() - contig.size();
stats.contigStartPosition = contig.getFirst().firstStart();
stats.startAnchorNodes = startingAnchor.size();
stats.endAnchorNodes = endingAnchor.size();
if (exportTracker != null) {
exportTracker.trackAssembly(bestContigCaller);
}
// remove all evidence contributing to this assembly from the graph
if (evidence.size() > 0) {
removeFromGraph(evidence);
if (Defaults.SANITY_CHECK_MEMOIZATION) {
bestContigCaller.sanityCheck(graphByPosition);
}
} else {
if (!MessageThrottler.Current.shouldSupress(log, "unsupported paths")) {
log.error("Sanity check failure: found path with no support. Attempting to recover by direct node removal ", contigName);
}
for (KmerPathSubnode n : contig) {
removeFromGraph(n.node(), true);
}
}
contigsCalled++;
if (assembledContig != null) {
called.add(assembledContig);
}
return assembledContig;
}
use of au.edu.wehi.idsv.DirectedEvidence in project gridss by PapenfussLab.
the class AllocateEvidenceTest method should_uniquely_assign.
@Test
public void should_uniquely_assign() throws IOException, InterruptedException, ExecutionException {
final int fragSize = 4;
final int testSize = 64;
final List<SAMRecord> in = new ArrayList<SAMRecord>();
final ProcessingContext pc = getCommandlineContext();
pc.getVariantCallingParameters().writeFiltered = true;
pc.getVariantCallingParameters().minScore = 0;
StubSAMEvidenceSource ses = new StubSAMEvidenceSource(pc, input, 0, 0, fragSize);
for (int i = 1; i < testSize; i++) {
for (int j = 1; j < testSize; j++) {
SAMRecord[] dp = withReadName(String.format("read-%d-%d", i, j), DP(0, i, "1M", true, 1, j, "1M", false));
ses.evidence.add(NonReferenceReadPair.create(dp[0], dp[1], ses));
ses.evidence.add(NonReferenceReadPair.create(dp[1], dp[0], ses));
in.add(dp[0]);
in.add(dp[1]);
}
}
StubAssemblyEvidenceSource aes = new StubAssemblyEvidenceSource(pc);
aes.fragSize = fragSize;
Collections.sort(ses.evidence, DirectedEvidenceOrder.ByNatural);
createInput(in);
VariantCaller vc = new VariantCaller(pc, ImmutableList.<SAMEvidenceSource>of(ses), aes);
ExecutorService threadpool = Executors.newSingleThreadExecutor();
vc.callBreakends(output, threadpool);
threadpool.shutdown();
AllocateEvidence cmd = new AllocateEvidence();
cmd.INPUT_VCF = output;
cmd.setContext(pc);
cmd.setAssemblySource(aes);
cmd.setSamEvidenceSources(ImmutableList.of(ses));
cmd.OUTPUT_VCF = new File(testFolder.getRoot(), "annotated.vcf");
cmd.ASSEMBLY = new File(testFolder.getRoot(), "assembly.bam");
// to shut up the command-line parsing
createBAM(cmd.ASSEMBLY, SortOrder.coordinate, Collections.emptyList());
cmd.doWork(null);
// List<IdsvVariantContext> annotated = getVcf(new File(testFolder.getRoot(), "out.vcf"), null);
List<IdsvVariantContext> rawcalls = getVcf(output, null);
List<IdsvVariantContext> calls = getVcf(cmd.OUTPUT_VCF, null);
assertSymmetrical(rawcalls.stream().map(x -> (VariantContextDirectedBreakpoint) x).collect(Collectors.toList()));
assertSymmetrical(calls.stream().map(x -> (VariantContextDirectedBreakpoint) x).collect(Collectors.toList()));
// with no filtering, annotation should not change call set
double expectedEvidence = 0;
for (DirectedEvidence e : ses.evidence) {
DiscordantReadPair bp = (DiscordantReadPair) e;
expectedEvidence += bp.getBreakpointQual();
}
double annotatedEvidence = 0;
for (IdsvVariantContext e : calls) {
annotatedEvidence += e.getPhredScaledQual();
}
double rawEvidence = rawcalls.stream().mapToDouble(e -> e.getPhredScaledQual()).sum();
// unique assignment must result in the evidence total being, at most, the same
assertTrue(annotatedEvidence <= rawEvidence);
// each piece of evidence should be assigned to a single breakpoint so totals should match
int rpCalls = calls.stream().mapToInt(v -> ((VariantContextDirectedBreakpoint) v).getBreakpointEvidenceCount()).sum();
assertEquals(ses.evidence.size(), rpCalls);
// floating point truncation on VCF is severe!
assertEquals(expectedEvidence, annotatedEvidence, 20);
}
use of au.edu.wehi.idsv.DirectedEvidence in project gridss by PapenfussLab.
the class CallVariantsTest method should_handle_unpaired_libraries.
@Test
public void should_handle_unpaired_libraries() throws IOException {
List<DirectedEvidence> in = new ArrayList<DirectedEvidence>();
in.add(SCE(FWD, withSequence("AACCGGTTCTA", Read(0, 15, "5M6S"))));
List<SAMRecord> insam = in.stream().flatMap(de -> {
if (de instanceof SoftClipEvidence)
return ImmutableList.<SAMRecord>of(((SoftClipEvidence) de).getSAMRecord()).stream();
if (de instanceof NonReferenceReadPair)
return ImmutableList.<SAMRecord>of(((NonReferenceReadPair) de).getNonReferenceRead(), ((NonReferenceReadPair) de).getLocalledMappedRead()).stream();
throw new RuntimeException("NYI");
}).collect(Collectors.toList());
createInput(insam);
File assembly = new File(testFolder.getRoot(), "assembly.bam");
createBAM(assembly, SortOrder.coordinate);
String[] args = new String[] { "INPUT=" + input.toString(), "ASSEMBLY=" + assembly.toString(), "REFERENCE_SEQUENCE=" + reference.toString(), "OUTPUT=" + output.toString(), "TMP_DIR=" + super.testFolder.getRoot().toString(), "WORKING_DIR=" + super.testFolder.getRoot().toString() };
assertEquals(0, new CallVariants().instanceMain(args));
assertTrue(output.exists());
assembly.delete();
}
use of au.edu.wehi.idsv.DirectedEvidence in project gridss by PapenfussLab.
the class NonReferenceContigAssemblerTest method should_not_assembly_when_graph_density_exceeds_maximum.
@Test
public void should_not_assembly_when_graph_density_exceeds_maximum() {
ProcessingContext pc = getContext();
MockSAMEvidenceSource ses = SES(10, 10);
pc.getAssemblyParameters().k = 4;
pc.getAssemblyParameters().maxExpectedBreakendLengthMultiple = 1;
pc.getAssemblyParameters().positional.maximumNodeDensity = 0.1f;
List<DirectedEvidence> e = new ArrayList<>();
for (int i = 1; i < 101; i++) {
e.add(SCE(FWD, ses, withReadName(String.format("%d-%d", i, 0), withSequence("AAAATTGG", Read(0, i, "4M4S")))[0]));
e.add(SCE(FWD, ses, withReadName(String.format("%d-%d", i, 1), withSequence("AAAACCGG", Read(0, i, "4M4S")))[0]));
}
List<SAMRecord> output = go(pc, false, e.toArray(new DirectedEvidence[0]));
// the final advance doesn't get flushed since the underlying stream
// is considered to advance to Integer.MAX_VALUE at end of input
assertTrue(output.size() <= 20);
// shouldn't get throttled
pc.getAssemblyParameters().positional.maximumNodeDensity = 10;
output = go(pc, false, e.toArray(new DirectedEvidence[0]));
assertEquals(2 * 100, output.size());
}
use of au.edu.wehi.idsv.DirectedEvidence in project gridss by PapenfussLab.
the class NonReferenceContigAssemblerTest method should_remove_misassembled_partial_paths.
@Test
public void should_remove_misassembled_partial_paths() {
ProcessingContext pc = getContext();
MockSAMEvidenceSource ses = SES(10, 10);
pc.getAssemblyParameters().k = 4;
pc.getAssemblyParameters().maxExpectedBreakendLengthMultiple = 1;
List<DirectedEvidence> e = new ArrayList<>();
for (int i = 1; i < 100; i++) {
e.add(SCE(FWD, ses, withReadName(String.format("%d-%d", 0, i), withSequence("TAAAAAAAAAAAAAAAAAAAAAAAAAAAA", Read(0, i * 20, "4M25S")))[0]));
}
List<SAMRecord> output = go(pc, false, e.toArray(new DirectedEvidence[0]));
// the rest should have been removed
assertEquals(1, output.size());
}
Aggregations