use of htsjdk.samtools.CigarElement in project gatk by broadinstitute.
the class SVVariantDiscoveryUtils method getNumSoftClippingBases.
/**
* @return the number of soft clipped bases as indicated in the input {@code cigarElements}, either from the beginning
* of the list ({@code fromStart==true}) or from the end of the list ({@code fromStart==false}).
*
* @throws IllegalArgumentException if fails check by {@link #validateCigar(List)}
*/
@VisibleForTesting
public static int getNumSoftClippingBases(final boolean fromStart, final List<CigarElement> cigarElements) {
validateCigar(cigarElements);
// because 'H' can only be the 1st/last operation according to the spec, and also
// "S may only have H operations between them and the ends of the CIGAR string",
// 'S' could only be the 1st operation or the 2nd operation next to the 'H' sitting at the end
// no two 'S' operations should sit next to each other
final int endIndex = fromStart ? 0 : cigarElements.size() - 1;
CigarElement element = cigarElements.get(endIndex);
if (element.getOperator().isClipping()) {
if (element.getOperator() == CigarOperator.S) {
return element.getLength();
} else {
final CigarElement mayBeSoftClipping = cigarElements.get(fromStart ? 1 : cigarElements.size() - 2);
return mayBeSoftClipping.getOperator() == CigarOperator.S ? mayBeSoftClipping.getLength() : 0;
}
} else {
return 0;
}
}
use of htsjdk.samtools.CigarElement in project gatk by broadinstitute.
the class ReadClassifier method hasInitialSoftClip.
private static boolean hasInitialSoftClip(final List<CigarElement> cigarElements, final GATKRead read) {
final ListIterator<CigarElement> itr = cigarElements.listIterator();
if (!itr.hasNext())
return false;
CigarElement firstEle = itr.next();
if (firstEle.getOperator() == CigarOperator.HARD_CLIP && itr.hasNext()) {
firstEle = itr.next();
}
final int clipStart = firstEle.getLength() - MIN_SOFT_CLIP_LEN;
return firstEle.getOperator() == CigarOperator.SOFT_CLIP && clipStart >= 0 && isHighQualityRegion(read.getBaseQualities(), clipStart);
}
use of htsjdk.samtools.CigarElement in project gatk by broadinstitute.
the class AlignmentUtils method createIndelString.
/**
* Create the string (really a byte array) representation of an indel-containing cigar against the reference.
*
* @param cigar the indel-containing cigar
* @param indexOfIndel the index of the indel cigar element
* @param refSeq the reference sequence
* @param readSeq the read sequence for the cigar
* @param refIndex the starting reference index into refSeq
* @param readIndex the starting read index into readSeq
* @return non-null byte array which is the indel representation against the reference
*/
private static byte[] createIndelString(final Cigar cigar, final int indexOfIndel, final byte[] refSeq, final byte[] readSeq, int refIndex, int readIndex) {
CigarElement indel = cigar.getCigarElement(indexOfIndel);
int indelLength = indel.getLength();
int totalRefBases = 0;
for (int i = 0; i < indexOfIndel; i++) {
CigarElement ce = cigar.getCigarElement(i);
int length = ce.getLength();
switch(ce.getOperator()) {
case M:
case EQ:
case X:
readIndex += length;
refIndex += length;
totalRefBases += length;
break;
case S:
readIndex += length;
break;
case N:
refIndex += length;
totalRefBases += length;
break;
default:
break;
}
}
// sometimes, when there are very large known indels, we won't have enough reference sequence to cover them
if (totalRefBases + indelLength > refSeq.length)
indelLength -= (totalRefBases + indelLength - refSeq.length);
// the indel-based reference string
byte[] alt = new byte[refSeq.length + (indelLength * (indel.getOperator() == CigarOperator.D ? -1 : 1))];
// add the bases before the indel, making sure it's not aligned off the end of the reference
if (refIndex > alt.length || refIndex > refSeq.length)
return null;
System.arraycopy(refSeq, 0, alt, 0, refIndex);
int currentPos = refIndex;
// take care of the indel
if (indel.getOperator() == CigarOperator.D) {
refIndex += indelLength;
} else {
System.arraycopy(readSeq, readIndex, alt, currentPos, indelLength);
currentPos += indelLength;
}
// add the bases after the indel, making sure it's not aligned off the end of the reference
if (refSeq.length - refIndex > alt.length - currentPos)
return null;
System.arraycopy(refSeq, refIndex, alt, currentPos, refSeq.length - refIndex);
return alt;
}
use of htsjdk.samtools.CigarElement in project gatk by broadinstitute.
the class AlignmentUtils method calcFirstBaseMatchingReferenceInCigar.
/**
* Get the offset (base 0) of the first reference aligned base in Cigar that occurs after readStartByBaseOfCigar base of the cigar
*
* The main purpose of this routine is to find a good start position for a read given it's cigar. The real
* challenge is that the starting base might be inside an insertion, in which case the read actually starts
* at the next M/EQ/X operator.
*
* @param cigar a non-null cigar
* @param readStartByBaseOfCigar finds the first base after this (0 indexed) that aligns to the reference genome (M, EQ, X)
* @throws IllegalStateException if no such base can be found
* @return an offset into cigar
*/
public static int calcFirstBaseMatchingReferenceInCigar(final Cigar cigar, int readStartByBaseOfCigar) {
if (cigar == null)
throw new IllegalArgumentException("cigar cannot be null");
if (readStartByBaseOfCigar >= cigar.getReadLength())
throw new IllegalArgumentException("readStartByBaseOfCigar " + readStartByBaseOfCigar + " must be <= readLength " + cigar.getReadLength());
int hapOffset = 0, refOffset = 0;
for (final CigarElement ce : cigar.getCigarElements()) {
for (int i = 0; i < ce.getLength(); i++) {
switch(ce.getOperator()) {
case M:
case EQ:
case X:
if (hapOffset >= readStartByBaseOfCigar)
return refOffset;
hapOffset++;
refOffset++;
break;
case I:
case S:
hapOffset++;
break;
case D:
refOffset++;
break;
default:
throw new IllegalStateException("calcFirstBaseMatchingReferenceInCigar does not support cigar " + ce.getOperator() + " in cigar " + cigar);
}
}
}
throw new IllegalStateException("Never found appropriate matching state for cigar " + cigar + " given start of " + readStartByBaseOfCigar);
}
use of htsjdk.samtools.CigarElement in project gatk by broadinstitute.
the class AlignmentUtils method getMismatchCount.
/**
* Count how many bases mismatch the reference. Indels are not considered mismatching.
*
* @param r the sam record to check against
* @param refSeq the byte array representing the reference sequence
* @param refIndex the index in the reference byte array of the read's first base (the reference index
* is matching the alignment start, there may be tons of soft-clipped bases before/after
* that so it's wrong to compare with getLength() here.). Note that refIndex is
* zero based, not 1 based
* @param startOnRead the index in the read's bases from which we start counting
* @param nReadBases the number of bases after (but including) startOnRead that we check
* @return non-null object representing the mismatch count
*/
@SuppressWarnings("fallthrough")
public static MismatchCount getMismatchCount(GATKRead r, byte[] refSeq, int refIndex, int startOnRead, int nReadBases) {
Utils.nonNull(r);
Utils.nonNull(refSeq);
if (refIndex < 0)
throw new IllegalArgumentException("attempting to calculate the mismatch count with a reference index that is negative");
if (startOnRead < 0)
throw new IllegalArgumentException("attempting to calculate the mismatch count with a read start that is negative");
if (nReadBases < 0)
throw new IllegalArgumentException("attempting to calculate the mismatch count for a negative number of read bases");
if (refSeq.length - refIndex < (r.getEnd() - r.getStart()))
throw new IllegalArgumentException("attempting to calculate the mismatch count against a reference string that is smaller than the read");
MismatchCount mc = new MismatchCount();
int readIdx = 0;
// index of the last base on read we want to count (note we are including soft-clipped bases with this math)
final int endOnRead = startOnRead + nReadBases - 1;
final byte[] readSeq = r.getBases();
final Cigar c = r.getCigar();
final byte[] readQuals = r.getBaseQualities();
for (final CigarElement ce : c.getCigarElements()) {
if (readIdx > endOnRead)
break;
final int elementLength = ce.getLength();
switch(ce.getOperator()) {
case X:
mc.numMismatches += elementLength;
for (int j = 0; j < elementLength; j++) mc.mismatchQualities += readQuals[readIdx + j];
case EQ:
refIndex += elementLength;
readIdx += elementLength;
break;
case M:
for (int j = 0; j < elementLength; j++, refIndex++, readIdx++) {
if (refIndex >= refSeq.length)
// TODO : It should never happen, we should throw exception here
continue;
if (readIdx < startOnRead)
continue;
if (readIdx > endOnRead)
break;
byte refChr = refSeq[refIndex];
byte readChr = readSeq[readIdx];
// continue; // do not count Ns/Xs/etc ?
if (readChr != refChr) {
mc.numMismatches++;
mc.mismatchQualities += readQuals[readIdx];
}
}
break;
case I:
case S:
readIdx += elementLength;
break;
case D:
case N:
refIndex += elementLength;
break;
case H:
case P:
break;
default:
throw new GATKException("The " + ce.getOperator() + " cigar element is not currently supported");
}
}
return mc;
}
Aggregations