use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class AlignmentUtils method createReadAlignedToRef.
/**
 * Aligns the read to the haplotype, and then projects this read -> hap alignment onto the reference
 * via the haplotype's own alignment to the reference (via its getCigar() method).
 *
 * @param originalRead the read we want to write aligned to the reference genome
 * @param haplotype the haplotype that the read should be aligned to, before aligning to the reference
 * @param refHaplotype the reference haplotype
 * @param referenceStart the start of the reference that haplotype is aligned to. Provides global coordinate frame.
 * @param isInformative true if the read is differentially informative for one of the haplotypes
 *
 * @throws IllegalArgumentException if {@code originalRead} is {@code null} or {@code haplotype} is {@code null} or it
 * does not have a Cigar or the {@code referenceStart} is invalid (less than 1).
 *
 * @return a GATKRead aligned to the reference. Never {@code null}.
 */
public static GATKRead createReadAlignedToRef(final GATKRead originalRead, final Haplotype haplotype, final Haplotype refHaplotype, final int referenceStart, final boolean isInformative) {
    Utils.nonNull(originalRead);
    Utils.nonNull(haplotype);
    Utils.nonNull(refHaplotype);
    Utils.nonNull(haplotype.getCigar());
    if (referenceStart < 1) {
        throw new IllegalArgumentException("reference start must be >= 1 but got " + referenceStart);
    }
    // compute the smith-waterman alignment of read -> haplotype
    final SWPairwiseAlignment swPairwiseAlignment = new SWPairwiseAlignment(haplotype.getBases(), originalRead.getBases(), CigarUtils.NEW_SW_PARAMETERS);
    if (swPairwiseAlignment.getAlignmentStart2wrt1() == -1) {
        // sw can fail (reasons not clear) so if it happens just don't realign the read
        return originalRead;
    }
    final Cigar swCigar = consolidateCigar(swPairwiseAlignment.getCigar());
    // since we're modifying the read we need to clone it
    final GATKRead read = originalRead.copy();
    // only informative reads are given the haplotype tag to enhance visualization
    if (isInformative) {
        read.setAttribute(HAPLOTYPE_TAG, haplotype.hashCode());
    }
    // compute where the read starts w.r.t. the reference from the SW result and the hap -> ref cigar
    final Cigar extendedHaplotypeCigar = haplotype.getConsolidatedPaddedCigar(1000);
    final int readStartOnHaplotype = calcFirstBaseMatchingReferenceInCigar(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1());
    final int readStartOnReference = referenceStart + haplotype.getAlignmentStartHapwrtRef() + readStartOnHaplotype;
    read.setPosition(read.getContig(), readStartOnReference);
    // compute the read -> ref alignment by mapping read -> hap -> ref, i.e. the
    // SW of read -> hap mapped through the cigar given by hap -> ref
    final Cigar haplotypeToRef = trimCigarByBases(extendedHaplotypeCigar, swPairwiseAlignment.getAlignmentStart2wrt1(), extendedHaplotypeCigar.getReadLength() - 1);
    final Cigar readToRefCigarRaw = applyCigarToCigar(swCigar, haplotypeToRef);
    final Cigar readToRefCigarClean = cleanUpCigar(readToRefCigarRaw);
    final Cigar readToRefCigar = leftAlignIndel(readToRefCigarClean, refHaplotype.getBases(), originalRead.getBases(), swPairwiseAlignment.getAlignmentStart2wrt1(), 0, true);
    read.setCigar(readToRefCigar);
    if (readToRefCigar.getReadLength() != read.getLength()) {
        throw new GATKException("Cigar " + readToRefCigar + " with read length " + readToRefCigar.getReadLength() + " != read length " + read.getLength() + " for read " + read.toString() + "\nhapToRef " + haplotypeToRef + " length " + haplotypeToRef.getReadLength() + "/" + haplotypeToRef.getReferenceLength() + "\nreadToHap " + swCigar + " length " + swCigar.getReadLength() + "/" + swCigar.getReferenceLength());
    }
    return read;
}
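A minimal usage sketch, hedged: the variable names reads, bestHaplotype, refHaplotype, paddedRefStart, and realignedReads are illustrative stand-ins for values the assembly engine would normally supply, not names from the GATK source.

// project each read onto the reference through its best-supporting haplotype
for (final GATKRead rawRead : reads) {
    final GATKRead realigned = AlignmentUtils.createReadAlignedToRef(rawRead, bestHaplotype, refHaplotype, paddedRefStart, true);
    realignedReads.add(realigned);
}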
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class AlignmentUtils method calcAlignmentByteArrayOffset.
/**
* Calculate the index into the read's bases of the beginning of the encompassing cigar element for a given cigar and offset
*
* @param cigar the read's CIGAR -- cannot be null
* @param offset the offset to use for the calculation or -1 if in the middle of a deletion
* @param isDeletion are we in the middle of a deletion?
* @param alignmentStart the alignment start of the read
* @param refLocus the reference position of the offset
* @return a non-negative int index
*/
public static int calcAlignmentByteArrayOffset(final Cigar cigar, final int offset, final boolean isDeletion, final int alignmentStart, final int refLocus) {
    if (cigar == null)
        throw new IllegalArgumentException("attempting to find the alignment position from a CIGAR that is null");
    if (offset < -1)
        throw new IllegalArgumentException("attempting to find the alignment position with an offset that is negative (and not -1)");
    if (alignmentStart < 0)
        throw new IllegalArgumentException("attempting to find the alignment position from an alignment start that is negative");
    if (refLocus < 0)
        throw new IllegalArgumentException("attempting to find the alignment position from a reference position that is negative");
    if (offset >= cigar.getReadLength())
        throw new IllegalArgumentException("attempting to find the alignment position of an offset that is larger than the read length");
    int pileupOffset = offset;
    // Reassign the offset if we are in the middle of a deletion because of the modified representation of the read bases
    if (isDeletion) {
        pileupOffset = refLocus - alignmentStart;
        final CigarElement ce = cigar.getCigarElement(0);
        if (ce.getOperator() == CigarOperator.S) {
            pileupOffset += ce.getLength();
        }
    }
    int pos = 0;
    int alignmentPos = 0;
    for (int iii = 0; iii < cigar.numCigarElements(); iii++) {
        final CigarElement ce = cigar.getCigarElement(iii);
        final int elementLength = ce.getLength();
        switch (ce.getOperator()) {
            case I:
            case S: // TODO -- I don't think that soft clips should be treated the same as inserted bases here. Investigation needed.
                pos += elementLength;
                if (pos >= pileupOffset) {
                    return alignmentPos;
                }
                break;
            case D:
                if (!isDeletion) {
                    alignmentPos += elementLength;
                } else {
                    if (pos + elementLength - 1 >= pileupOffset) {
                        return alignmentPos + (pileupOffset - pos);
                    } else {
                        pos += elementLength;
                        alignmentPos += elementLength;
                    }
                }
                break;
            case M:
            case EQ:
            case X:
                if (pos + elementLength - 1 >= pileupOffset) {
                    return alignmentPos + (pileupOffset - pos);
                } else {
                    pos += elementLength;
                    alignmentPos += elementLength;
                }
                break;
            case H:
            case P:
            case N:
                break;
            default:
                throw new GATKException("Unsupported cigar operator: " + ce.getOperator());
        }
    }
    return alignmentPos;
}
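For intuition, a hand-traced example (hypothetical values; TextCigarCodec is htsjdk's CIGAR parser). With CIGAR 4M1I4M, read offset 6 sits one base past the insertion, and the inserted base consumed a read base with no reference counterpart:

final Cigar cigar = TextCigarCodec.decode("4M1I4M");
final int idx = AlignmentUtils.calcAlignmentByteArrayOffset(cigar, 6, false, 0, 0);
// idx == 5: after 4M pos=4/alignmentPos=4, the 1I advances pos to 5,
// then the final 4M returns 4 + (6 - 5) = 5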
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class AlignmentUtils method isInsideDeletion.
/**
* Is the offset inside a deletion?
*
* @param cigar the read's CIGAR -- cannot be null
* @param offset the offset into the CIGAR
* @return true if the offset is inside a deletion, false otherwise
*/
public static boolean isInsideDeletion(final Cigar cigar, final int offset) {
    Utils.nonNull(cigar);
    if (offset < 0)
        return false;
    // pos counts read bases
    int pos = 0;
    int prevPos = 0;
    for (final CigarElement ce : cigar.getCigarElements()) {
        switch (ce.getOperator()) {
            case I:
            case S:
            case D:
            case M:
            case EQ:
            case X:
                prevPos = pos;
                pos += ce.getLength();
                break;
            case H:
            case P:
            case N:
                break;
            default:
                throw new GATKException("Unsupported cigar operator: " + ce.getOperator());
        }
        // Is the offset inside a deletion?
        if (prevPos < offset && pos >= offset && ce.getOperator() == CigarOperator.D) {
            return true;
        }
    }
    return false;
}
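As a quick check, a hand-traced example (hypothetical, again building the Cigar via htsjdk's TextCigarCodec):

final Cigar cigar = TextCigarCodec.decode("2M1D2M");
AlignmentUtils.isInsideDeletion(cigar, 3); // true: on the 1D element, prevPos=2 < 3 and pos=3 >= 3
AlignmentUtils.isInsideDeletion(cigar, 2); // false: offset 2 is reached by the leading 2M, not the deletion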
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class FindBadGenomicKmersSpark method processFasta.
@VisibleForTesting
static List<SVKmer> processFasta(final int kSize, final int maxDUSTScore, final String fastaFilename, final PipelineOptions options) {
    try (BufferedReader rdr = new BufferedReader(new InputStreamReader(BucketUtils.openFile(fastaFilename)))) {
        final List<SVKmer> kmers = new ArrayList<>((int) BucketUtils.fileSize(fastaFilename));
        String line;
        final StringBuilder sb = new StringBuilder();
        final SVKmer kmerSeed = new SVKmerLong();
        while ((line = rdr.readLine()) != null) {
            if (line.charAt(0) != '>')
                sb.append(line);
            else if (sb.length() > 0) {
                SVDUSTFilteredKmerizer.stream(sb, kSize, maxDUSTScore, kmerSeed).map(kmer -> kmer.canonical(kSize)).forEach(kmers::add);
                sb.setLength(0);
            }
        }
        if (sb.length() > 0) {
            SVDUSTFilteredKmerizer.stream(sb, kSize, maxDUSTScore, kmerSeed).map(kmer -> kmer.canonical(kSize)).forEach(kmers::add);
        }
        return kmers;
    } catch (IOException ioe) {
        throw new GATKException("Can't read high copy kmers fasta file " + fastaFilename, ioe);
    }
}
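A hedged usage sketch: the k-size and DUST-score values below are illustrative rather than the pipeline's defaults, the file path is hypothetical, and passing a null PipelineOptions is assumed to be acceptable for a local file.

// collect canonical kmers from a high-copy-number FASTA to use as a kill set
final List<SVKmer> highCopyKmers = FindBadGenomicKmersSpark.processFasta(51, 142, "/path/to/high_copy.fasta", null);
final Set<SVKmer> kmerKillSet = new HashSet<>(highCopyKmers);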
use of org.broadinstitute.hellbender.exceptions.GATKException in project gatk by broadinstitute.
the class FindBreakpointEvidenceSpark method getKmerIntervals.
/** find kmers for each interval */
@VisibleForTesting
static Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> getKmerIntervals(final Params params, final JavaSparkContext ctx, final HopscotchUniqueMultiMap<String, Integer, QNameAndInterval> qNamesMultiMap, final int nIntervals, final Set<SVKmer> kmerKillSet, final JavaRDD<GATKRead> reads, final Locations locations) {
    final Broadcast<Set<SVKmer>> broadcastKmerKillSet = ctx.broadcast(kmerKillSet);
    final Broadcast<HopscotchUniqueMultiMap<String, Integer, QNameAndInterval>> broadcastQNameAndIntervalsMultiMap = ctx.broadcast(qNamesMultiMap);
    // given a set of template names with interval IDs and a kill set of ubiquitous kmers,
    // produce a set of interesting kmers for each interval ID
    final int kmersPerPartitionGuess = params.cleanerKmersPerPartitionGuess;
    final int minKmers = params.cleanerMinKmerCount;
    final int maxKmers = params.cleanerMaxKmerCount;
    final int maxIntervals = params.cleanerMaxIntervals;
    final int kSize = params.kSize;
    final int maxDUSTScore = params.maxDUSTScore;
    final List<KmerAndInterval> kmerIntervals =
            reads.mapPartitionsToPair(readItr ->
                    new MapPartitioner<>(readItr,
                            new QNameKmerizer(broadcastQNameAndIntervalsMultiMap.value(),
                                    broadcastKmerKillSet.value(), kSize, maxDUSTScore)).iterator(), false)
                 .reduceByKey(Integer::sum)
                 .mapPartitions(itr -> new KmerCleaner(itr, kmersPerPartitionGuess, minKmers, maxKmers, maxIntervals).iterator())
                 .collect();
    broadcastQNameAndIntervalsMultiMap.destroy();
    broadcastKmerKillSet.destroy();
    final int[] intervalKmerCounts = new int[nIntervals];
    for (final KmerAndInterval kmerAndInterval : kmerIntervals) {
        intervalKmerCounts[kmerAndInterval.getIntervalId()] += 1;
    }
    final Set<Integer> intervalsToKill = new HashSet<>();
    final List<AlignedAssemblyOrExcuse> intervalDispositions = new ArrayList<>();
    for (int idx = 0; idx != nIntervals; ++idx) {
        if (intervalKmerCounts[idx] < params.minKmersPerInterval) {
            intervalsToKill.add(idx);
            intervalDispositions.add(new AlignedAssemblyOrExcuse(idx, "FASTQ not written -- too few kmers"));
        }
    }
    qNamesMultiMap.removeIf(qNameAndInterval -> intervalsToKill.contains(qNameAndInterval.getIntervalId()));
    final List<KmerAndInterval> filteredKmerIntervals = kmerIntervals.stream()
            .filter(kmerAndInterval -> !intervalsToKill.contains(kmerAndInterval.getIntervalId()))
            .collect(SVUtils.arrayListCollector(kmerIntervals.size()));
    // record the kmers with their interval IDs
    if (locations.kmerFile != null) {
        try (final OutputStreamWriter writer = new OutputStreamWriter(new BufferedOutputStream(BucketUtils.createFile(locations.kmerFile)))) {
            for (final KmerAndInterval kmerAndInterval : filteredKmerIntervals) {
                writer.write(kmerAndInterval.toString(kSize) + " " + kmerAndInterval.getIntervalId() + "\n");
            }
        } catch (final IOException ioe) {
            throw new GATKException("Can't write kmer intervals file " + locations.kmerFile, ioe);
        }
    }
    return new Tuple2<>(intervalDispositions, filteredKmerIntervals);
}
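A hedged sketch of consuming the returned pair (variable names are illustrative; Tuple2 is Scala's pair type, accessed via _1() and _2()):

final Tuple2<List<AlignedAssemblyOrExcuse>, List<KmerAndInterval>> result =
        getKmerIntervals(params, ctx, qNamesMultiMap, nIntervals, kmerKillSet, reads, locations);
final List<AlignedAssemblyOrExcuse> excuses = result._1(); // intervals skipped for having too few kmers
final List<KmerAndInterval> keptKmers = result._2(); // surviving kmer -> interval records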