Use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
The class GappedAlignmentSplitter, method split.
/**
* Splits a gapped alignment into multiple alignment regions, based on the input sensitivity: i.e. if a gap in the input alignment
* is equal to or larger than the size specified by {@code sensitivity}, the alignment is split at that gap into two alignment regions.
*
* To fulfill this functionality, the method needs to accomplish three key tasks correctly:
* <ul>
* <li>generate appropriate cigar,</li>
* <li>infer reference coordinates,</li>
* <li>infer contig coordinates</li>
* </ul>
* As an example, an alignment with CIGAR "397S118M2D26M6I50M7I26M1I8M13D72M398S", when {@code sensitivity} is set to "1", should be split into 6 alignments
* <ul>
* <li>"397S118M594S"</li>
* <li>"515S26M568S"</li>
* <li>"547S50M512S"</li>
* <li>"631S8M470S"</li>
* <li>"639S72M398S"</li>
* </ul>
* On the other hand, an alignment with CIGAR "10M10D10M60I10M10I10M50D10M", when {@code sensitivity} is set to "50", should be split into 3 alignments
* <ul>
* <li>"10M10D10M100S"</li>
* <li>"80S10M10I10M10S"</li>
* <li>"110S10M"</li>
* </ul>
* And when an alignment has hard clipping adjacent to soft clipping, e.g. "1H2S3M5I10M20D6M7S8H" (again with {@code sensitivity} set to "1"), it should be split into alignments whose CIGARs resemble the original CIGAR as much as possible:
* <ul>
* <li>"1H2S3M28S8H"</li>
* <li>"1H10S10M13S8H"</li>
* <li>"1H20S6M7S8H"</li>
* </ul>
*
* @return an iterable of size >= 1; if the size is 1, the returned iterable contains only the input (i.e. there is either no gap, or no gap reaches the {@code sensitivity} threshold)
*/
@VisibleForTesting
public static Iterable<AlignedAssembly.AlignmentInterval> split(final AlignedAssembly.AlignmentInterval oneRegion, final int sensitivity, final int unclippedContigLen) {
    final List<CigarElement> cigarElements = checkCigarAndConvertTerminalInsertionToSoftClip(oneRegion.cigarAlong5to3DirectionOfContig);
    if (cigarElements.size() == 1)
        return new ArrayList<>(Collections.singletonList(oneRegion));

    // blunt guess
    final List<AlignedAssembly.AlignmentInterval> result = new ArrayList<>(3);
    final int originalMapQ = oneRegion.mapQual;
    final List<CigarElement> cigarMemoryList = new ArrayList<>();
    final int clippedNBasesFromStart = SVVariantDiscoveryUtils.getNumClippedBases(true, cigarElements);

    final int hardClippingAtBeginning = cigarElements.get(0).getOperator() == CigarOperator.H ? cigarElements.get(0).getLength() : 0;
    final int hardClippingAtEnd = (cigarElements.get(cigarElements.size() - 1).getOperator() == CigarOperator.H) ? cigarElements.get(cigarElements.size() - 1).getLength() : 0;
    final CigarElement hardClippingAtBeginningMaybeNull = hardClippingAtBeginning == 0 ? null : new CigarElement(hardClippingAtBeginning, CigarOperator.H);

    int contigIntervalStart = 1 + clippedNBasesFromStart;
    // we are walking along the contig following the cigar, which indicates that we might be walking backwards on the reference if oneRegion.forwardStrand==false
    int refBoundary1stInTheDirectionOfContig = oneRegion.forwardStrand ? oneRegion.referenceInterval.getStart() : oneRegion.referenceInterval.getEnd();

    for (final CigarElement cigarElement : cigarElements) {
        final CigarOperator op = cigarElement.getOperator();
        final int operatorLen = cigarElement.getLength();
        switch (op) {
            case M:
            case EQ:
            case X:
            case S:
            case H:
                cigarMemoryList.add(cigarElement);
                break;
            case I:
            case D:
                if (operatorLen < sensitivity) {
                    cigarMemoryList.add(cigarElement);
                    break;
                }

                // collapse cigar memory list into a single cigar for ref & contig interval computation
                final Cigar memoryCigar = new Cigar(cigarMemoryList);
                final int effectiveReadLen = memoryCigar.getReadLength() + SVVariantDiscoveryUtils.getTotalHardClipping(memoryCigar) - SVVariantDiscoveryUtils.getNumClippedBases(true, memoryCigar);

                // task 1: infer reference interval taking into account of strand
                final SimpleInterval referenceInterval;
                if (oneRegion.forwardStrand) {
                    referenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(),
                            refBoundary1stInTheDirectionOfContig,
                            refBoundary1stInTheDirectionOfContig + (memoryCigar.getReferenceLength() - 1));
                } else {
                    referenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), // step backward
                            refBoundary1stInTheDirectionOfContig - (memoryCigar.getReferenceLength() - 1),
                            refBoundary1stInTheDirectionOfContig);
                }

                // task 2: infer contig interval
                final int contigIntervalEnd = contigIntervalStart + effectiveReadLen - 1;

                // task 3: now add trailing cigar element and create the real cigar for the to-be-returned AR
                cigarMemoryList.add(new CigarElement(unclippedContigLen - contigIntervalEnd - hardClippingAtEnd, CigarOperator.S));
                if (hardClippingAtEnd != 0) {
                    // be faithful to hard clipping (as the accompanying bases have been hard-clipped away)
                    cigarMemoryList.add(new CigarElement(hardClippingAtEnd, CigarOperator.H));
                }
                final Cigar cigarForNewAlignmentInterval = new Cigar(cigarMemoryList);

                final AlignedAssembly.AlignmentInterval split = new AlignedAssembly.AlignmentInterval(referenceInterval, contigIntervalStart, contigIntervalEnd,
                        cigarForNewAlignmentInterval, oneRegion.forwardStrand, originalMapQ, SVConstants.DiscoveryStepConstants.ARTIFICIAL_MISMATCH);
                result.add(split);

                // update cigar memory
                cigarMemoryList.clear();
                if (hardClippingAtBeginningMaybeNull != null) {
                    // be faithful about hard clippings
                    cigarMemoryList.add(hardClippingAtBeginningMaybeNull);
                }
                cigarMemoryList.add(new CigarElement(contigIntervalEnd - hardClippingAtBeginning + (op.consumesReadBases() ? operatorLen : 0), CigarOperator.S));

                // update pointers into reference and contig
                final int refBoundaryAdvance = op.consumesReadBases() ? memoryCigar.getReferenceLength() : memoryCigar.getReferenceLength() + operatorLen;
                refBoundary1stInTheDirectionOfContig += oneRegion.forwardStrand ? refBoundaryAdvance : -refBoundaryAdvance;
                contigIntervalStart += op.consumesReadBases() ? effectiveReadLen + operatorLen : effectiveReadLen;

                break;
            default:
                // TODO: 1/20/17 still not quite sure if this is quite right, it doesn't blow up on NA12878 WGS, but who knows what happens in the future
                throw new GATKException("Alignment CIGAR contains an unexpected N or P element: " + oneRegion.toPackedString());
        }
    }

    if (result.isEmpty()) {
        return new ArrayList<>(Collections.singletonList(oneRegion));
    }

    final SimpleInterval lastReferenceInterval;
    if (oneRegion.forwardStrand) {
        lastReferenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), refBoundary1stInTheDirectionOfContig, oneRegion.referenceInterval.getEnd());
    } else {
        lastReferenceInterval = new SimpleInterval(oneRegion.referenceInterval.getContig(), oneRegion.referenceInterval.getStart(), refBoundary1stInTheDirectionOfContig);
    }

    final Cigar lastForwardStrandCigar = new Cigar(cigarMemoryList);
    int clippedNBasesFromEnd = SVVariantDiscoveryUtils.getNumClippedBases(false, cigarElements);
    result.add(new AlignedAssembly.AlignmentInterval(lastReferenceInterval, contigIntervalStart, unclippedContigLen - clippedNBasesFromEnd,
            lastForwardStrandCigar, oneRegion.forwardStrand, originalMapQ, SVConstants.DiscoveryStepConstants.ARTIFICIAL_MISMATCH));
    return result;
}
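For orientation, here is a minimal, hypothetical driver for the method above, exercising the second javadoc example ("10M10D10M60I10M10I10M50D10M" with sensitivity 50). The seven-argument AlignmentInterval constructor and the field names mirror the calls made inside split(); the reference contig, coordinates, mapping quality, and the trailing mismatch argument (0) are placeholder values, not taken from GATK.

// Hypothetical usage sketch; coordinates, mapQ, and the trailing mismatch count (0) are placeholders.
final Cigar cigar = TextCigarCodec.decode("10M10D10M60I10M10I10M50D10M"); // read length 120, reference span 110
final SimpleInterval refSpan = new SimpleInterval("chr1", 1000, 1109);
final AlignedAssembly.AlignmentInterval gapped =
        new AlignedAssembly.AlignmentInterval(refSpan, 1, 120, cigar, true, 60, 0);
// expect three pieces: "10M10D10M100S", "80S10M10I10M10S", "110S10M"
for (final AlignedAssembly.AlignmentInterval piece : GappedAlignmentSplitter.split(gapped, 50, 120)) {
    System.out.println(piece.cigarAlong5to3DirectionOfContig + "  " + piece.referenceInterval);
}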
Use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
The class MarkDuplicatesSparkUtils, method generateMetrics.
static JavaPairRDD<String, DuplicationMetrics> generateMetrics(final SAMFileHeader header, final JavaRDD<GATKRead> reads) {
    return reads.filter(read -> !read.isSecondaryAlignment() && !read.isSupplementaryAlignment())
            .mapToPair(read -> {
                final String library = LibraryIdGenerator.getLibraryName(header, read.getReadGroup());
                DuplicationMetrics metrics = new DuplicationMetrics();
                metrics.LIBRARY = library;
                if (read.isUnmapped()) {
                    ++metrics.UNMAPPED_READS;
                } else if (!read.isPaired() || read.mateIsUnmapped()) {
                    ++metrics.UNPAIRED_READS_EXAMINED;
                } else {
                    ++metrics.READ_PAIRS_EXAMINED;
                }
                if (read.isDuplicate()) {
                    if (!read.isPaired() || read.mateIsUnmapped()) {
                        ++metrics.UNPAIRED_READ_DUPLICATES;
                    } else {
                        ++metrics.READ_PAIR_DUPLICATES;
                    }
                }
                if (read.hasAttribute(OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME)) {
                    metrics.READ_PAIR_OPTICAL_DUPLICATES += read.getAttributeAsInteger(OPTICAL_DUPLICATE_TOTAL_ATTRIBUTE_NAME);
                }
                return new Tuple2<>(library, metrics);
            })
            .foldByKey(new DuplicationMetrics(), (metricsSum, m) -> {
                if (metricsSum.LIBRARY == null) {
                    metricsSum.LIBRARY = m.LIBRARY;
                }
                // This should never happen, as we grouped by key using library as the key.
                if (!metricsSum.LIBRARY.equals(m.LIBRARY)) {
                    throw new GATKException("Two different libraries encountered while summing metrics: " + metricsSum.LIBRARY + " and " + m.LIBRARY);
                }
                metricsSum.UNMAPPED_READS += m.UNMAPPED_READS;
                metricsSum.UNPAIRED_READS_EXAMINED += m.UNPAIRED_READS_EXAMINED;
                metricsSum.READ_PAIRS_EXAMINED += m.READ_PAIRS_EXAMINED;
                metricsSum.UNPAIRED_READ_DUPLICATES += m.UNPAIRED_READ_DUPLICATES;
                metricsSum.READ_PAIR_DUPLICATES += m.READ_PAIR_DUPLICATES;
                metricsSum.READ_PAIR_OPTICAL_DUPLICATES += m.READ_PAIR_OPTICAL_DUPLICATES;
                return metricsSum;
            })
            .mapValues(metrics -> {
                DuplicationMetrics copy = metrics.copy();
                copy.READ_PAIRS_EXAMINED = metrics.READ_PAIRS_EXAMINED / 2;
                copy.READ_PAIR_DUPLICATES = metrics.READ_PAIR_DUPLICATES / 2;
                copy.calculateDerivedMetrics();
                if (copy.ESTIMATED_LIBRARY_SIZE == null) {
                    copy.ESTIMATED_LIBRARY_SIZE = 0L;
                }
                return copy;
            });
}
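A small, hypothetical caller could collect the per-library metrics on the Spark driver and print a summary. Here 'header' and 'reads' are assumed to be an in-scope SAMFileHeader and JavaRDD<GATKRead>, and the field names are the Picard DuplicationMetrics fields used above.

// Hypothetical usage sketch: summarize duplication metrics per library on the driver.
final Map<String, DuplicationMetrics> byLibrary =
        MarkDuplicatesSparkUtils.generateMetrics(header, reads).collectAsMap();
byLibrary.forEach((library, metrics) ->
        System.out.println(library + ": " + metrics.READ_PAIRS_EXAMINED + " pairs examined, "
                + metrics.READ_PAIR_DUPLICATES + " duplicate pairs, estimated library size "
                + metrics.ESTIMATED_LIBRARY_SIZE));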
Use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
The class SVUtils, method readKmersFile.
/**
* Read a file of kmers.
* Each line must be exactly {@code kSize} characters long, and must match [ACGT]*.
*/
public static Set<SVKmer> readKmersFile(final int kSize, final String kmersFile, final SVKmer kmer) {
    final Set<SVKmer> kmers;
    try (final BufferedReader rdr = new BufferedReader(new InputStreamReader(BucketUtils.openFile(kmersFile)))) {
        final long fileLength = BucketUtils.fileSize(kmersFile);
        kmers = new HopscotchSet<>((int) (fileLength / (kSize + 1)));
        String line;
        while ((line = rdr.readLine()) != null) {
            if (line.length() != kSize) {
                throw new GATKException("SVKmer kill set contains a line of length " + line.length() + " but we were expecting K=" + kSize);
            }
            final SVKmerizer kmerizer = new SVKmerizer(line, kSize, 1, new SVKmerLong(kSize));
            if (!kmerizer.hasNext()) {
                throw new GATKException("Unable to kmerize the kmer kill set string '" + line + "'.");
            }
            kmers.add(kmerizer.next());
        }
    } catch (final IOException ioe) {
        throw new GATKException("Unable to read kmers from " + kmersFile, ioe);
    }
    return kmers;
}
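As a usage sketch (the file path, K value, and candidate sequence below are placeholders), the kill set returned above can be queried by kmerizing a candidate sequence the same way the reader does:

// Hypothetical usage sketch: load a kmer kill list and screen a candidate sequence against it.
final int kSize = 11;                             // placeholder K; must equal the line length in the file
final Set<SVKmer> killSet = SVUtils.readKmersFile(kSize, "kill_kmers.txt", new SVKmerLong(kSize));
final String candidate = "ACGTACGTACGTACGTACGT";  // placeholder sequence to screen
final SVKmerizer kmerizer = new SVKmerizer(candidate, kSize, 1, new SVKmerLong(kSize));
while (kmerizer.hasNext()) {
    if (killSet.contains(kmerizer.next())) {
        System.out.println("candidate sequence contains a kmer from the kill set");
        break;
    }
}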
Use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
The class SVUtils, method readIntervalsFile.
/** Read intervals from file. */
public static List<SVInterval> readIntervalsFile(final String intervalsFile, final Map<String, Integer> contigNameMap) {
    final List<SVInterval> intervals;
    try (final BufferedReader rdr = new BufferedReader(new InputStreamReader(BucketUtils.openFile(intervalsFile)))) {
        final int INTERVAL_FILE_LINE_LENGTH_GUESS = 25;
        final long sizeGuess = BucketUtils.fileSize(intervalsFile) / INTERVAL_FILE_LINE_LENGTH_GUESS;
        intervals = new ArrayList<>((int) sizeGuess);
        String line;
        int lineNo = 0;
        while ((line = rdr.readLine()) != null) {
            ++lineNo;
            if (line.startsWith(REFERENCE_GAP_INTERVAL_FILE_COMMENT_LINE_PROMPT)) {
                continue;
            }
            final String[] tokens = line.split("\t");
            if (tokens.length != 3) {
                throw new GATKException("Interval file " + intervalsFile + " line " + lineNo + " did not contain 3 columns: " + line);
            }
            try {
                final Integer contigId = contigNameMap.get(tokens[0]);
                if (contigId == null)
                    throw new GATKException("contig name " + tokens[0] + " not in dictionary");
                final int start = Integer.valueOf(tokens[1]) - 1;
                final int end = Integer.valueOf(tokens[2]);
                intervals.add(new SVInterval(contigId, start, end));
            } catch (final Exception e) {
                throw new GATKException("Unable to parse interval file " + intervalsFile + " line " + lineNo + ": " + line, e);
            }
        }
    } catch (final IOException ioe) {
        throw new GATKException("Unable to read intervals from " + intervalsFile, ioe);
    }
    return intervals;
}
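A hypothetical caller would typically derive the contig-name-to-index map from a sequence dictionary before reading the file; 'dictionary' below is assumed to be an in-scope htsjdk SAMSequenceDictionary, and the file name is a placeholder. Each non-comment line is expected to hold three tab-separated columns: contig name, 1-based start (converted to 0-based by the method above), and end.

// Hypothetical usage sketch: map contig names to indices via a sequence dictionary, then load intervals.
final Map<String, Integer> contigNameMap = new HashMap<>();
for (final SAMSequenceRecord rec : dictionary.getSequences()) {   // 'dictionary' assumed in scope
    contigNameMap.put(rec.getSequenceName(), rec.getSequenceIndex());
}
final List<SVInterval> gaps = SVUtils.readIntervalsFile("reference_gaps.intervals", contigNameMap);
System.out.println("read " + gaps.size() + " intervals");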
Use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
The class SVUtils, method writeKmersFile.
/** Write kmers to file. */
public static <KType extends SVKmer> void writeKmersFile(final int kSize, final String kmersFile, final Collection<KType> kmers) {
    try (final Writer writer = new BufferedWriter(new OutputStreamWriter(BucketUtils.createFile(kmersFile)))) {
        for (final KType kmer : kmers) {
            writer.write(kmer.toString(kSize));
            writer.write('\n');
        }
    } catch (final IOException ioe) {
        throw new GATKException("Unable to write kmers to " + kmersFile, ioe);
    }
}
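The writer pairs naturally with readKmersFile above; a minimal round-trip sketch (K, the sequence, and the output path are placeholders) might look like this:

// Hypothetical round trip: kmerize a short sequence, write the kmers, then read them back.
final int kSize = 11;
final String sequence = "ACGTACGTACGTACGTACGT";        // 20 bases -> 10 overlapping 11-mers
final Set<SVKmer> kmers = new HopscotchSet<>(16);
final SVKmerizer kmerizer = new SVKmerizer(sequence, kSize, 1, new SVKmerLong(kSize));
while (kmerizer.hasNext()) {
    kmers.add(kmerizer.next());
}
SVUtils.writeKmersFile(kSize, "kmers.txt", kmers);     // one kmer string per line
final Set<SVKmer> reloaded = SVUtils.readKmersFile(kSize, "kmers.txt", new SVKmerLong(kSize));
System.out.println("wrote " + kmers.size() + " kmers, reloaded " + reloaded.size());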