use of org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator in project gatk-protected by broadinstitute.
the class AssemblyBasedCallerUtils method finalizeRegion.
public static void finalizeRegion(final AssemblyRegion region, final boolean errorCorrectReads, final boolean dontUseSoftClippedBases, final byte minTailQuality, final SAMFileHeader readsHeader, final SampleList samplesList) {
if (region.isFinalized()) {
return;
}
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKRead> readsToUse = new ArrayList<>(region.getReads().size());
for (final GATKRead myRead : region.getReads()) {
final byte minTailQualityToUse = errorCorrectReads ? HaplotypeCallerEngine.MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION : minTailQuality;
GATKRead clippedRead = ReadClipper.hardClipLowQualEnds(myRead, minTailQualityToUse);
// remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
// otherwie revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = dontUseSoftClippedBases || !ReadUtils.hasWellDefinedFragmentSize(clippedRead) ? ReadClipper.hardClipSoftClippedBases(clippedRead) : ReadClipper.revertSoftClippedBases(clippedRead);
clippedRead = clippedRead.isUnmapped() ? clippedRead : ReadClipper.hardClipAdaptorSequence(clippedRead);
if (!clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0) {
clippedRead = ReadClipper.hardClipToRegion(clippedRead, region.getExtendedSpan().getStart(), region.getExtendedSpan().getEnd());
if (region.readOverlapsRegion(clippedRead) && clippedRead.getLength() > 0) {
readsToUse.add(clippedRead);
}
}
}
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
// final List<GATKRead> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
// TODO: sort may be unnecessary here
Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader));
// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader);
region.clearReads();
region.addAll(readsToUse);
region.setFinalized(true);
}
use of org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator in project gatk by broadinstitute.
the class AssemblyRegion method trim.
/**
* Trim this region to no more than the span, producing a new assembly region with properly trimmed reads that
* attempts to provide the best possible representation of this region covering the span.
*
* The challenge here is that span may (1) be larger than can be represented by this assembly region
* + its original extension and (2) the extension must be symmetric on both sides. This algorithm
* therefore determines how best to represent span as a subset of the span of this
* region with a padding value that captures as much of the span as possible.
*
* For example, suppose this active region is
*
* Active: 100-200 with extension of 50, so that the true span is 50-250
* NewExtent: 150-225 saying that we'd ideally like to just have bases 150-225
*
* Here we represent the assembly region as a region from 150-200 with 25 bp of padding.
*
* The overall constraint is that the region can never exceed the original region, and
* the extension is chosen to maximize overlap with the desired region
*
* @param span the new extend of the active region we want
* @return a non-null, empty active region
*/
public AssemblyRegion trim(final SimpleInterval span, final SimpleInterval extendedSpan) {
Utils.nonNull(span, "Active region extent cannot be null");
Utils.nonNull(extendedSpan, "Active region extended span cannot be null");
Utils.validateArg(extendedSpan.contains(span), "The requested extended span must fully contain the requested span");
final SimpleInterval subActive = getSpan().intersect(span);
final int requiredOnRight = Math.max(extendedSpan.getEnd() - subActive.getEnd(), 0);
final int requiredOnLeft = Math.max(subActive.getStart() - extendedSpan.getStart(), 0);
final int requiredExtension = Math.min(Math.max(requiredOnLeft, requiredOnRight), getExtension());
final AssemblyRegion result = new AssemblyRegion(subActive, Collections.<ActivityProfileState>emptyList(), isActive, requiredExtension, header);
final List<GATKRead> myReads = getReads();
final SimpleInterval resultExtendedLoc = result.getExtendedSpan();
final int resultExtendedLocStart = resultExtendedLoc.getStart();
final int resultExtendedLocStop = resultExtendedLoc.getEnd();
final List<GATKRead> trimmedReads = new ArrayList<>(myReads.size());
for (final GATKRead read : myReads) {
final GATKRead clippedRead = ReadClipper.hardClipToRegion(read, resultExtendedLocStart, resultExtendedLocStop);
if (result.readOverlapsRegion(clippedRead) && clippedRead.getLength() > 0) {
trimmedReads.add(clippedRead);
}
}
result.clearReads();
trimmedReads.sort(new ReadCoordinateComparator(header));
result.addAll(trimmedReads);
return result;
}
use of org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator in project gatk by broadinstitute.
the class ReadsSparkSinkUnitTest method readsSinkADAMTest.
@Test(enabled = false, dataProvider = "loadReadsADAM", groups = "spark")
public void readsSinkADAMTest(String inputBam, String outputDirectoryName) throws IOException {
// Since the test requires that we not create the actual output directory in advance,
// we instead create its parent directory and mark it for deletion on exit. This protects
// us from naming collisions across multiple instances of the test suite.
final File outputParentDirectory = createTempDir(outputDirectoryName + "_parent");
final File outputDirectory = new File(outputParentDirectory, outputDirectoryName);
JavaSparkContext ctx = SparkContextFactory.getTestSparkContext();
ReadsSparkSource readSource = new ReadsSparkSource(ctx);
JavaRDD<GATKRead> rddParallelReads = readSource.getParallelReads(inputBam, null).filter(// filter out unmapped reads (see comment below)
r -> !r.isUnmapped());
SAMFileHeader header = readSource.getHeader(inputBam, null);
ReadsSparkSink.writeReads(ctx, outputDirectory.getAbsolutePath(), null, rddParallelReads, header, ReadsWriteFormat.ADAM);
JavaRDD<GATKRead> rddParallelReads2 = readSource.getADAMReads(outputDirectory.getAbsolutePath(), null, header);
Assert.assertEquals(rddParallelReads.count(), rddParallelReads2.count());
// Test the round trip
//make a mutable copy for sort
List<GATKRead> samList = new ArrayList<>(rddParallelReads.collect());
//make a mutable copy for sort
List<GATKRead> adamList = new ArrayList<>(rddParallelReads2.collect());
Comparator<GATKRead> comparator = new ReadCoordinateComparator(header);
samList.sort(comparator);
adamList.sort(comparator);
for (int i = 0; i < samList.size(); i++) {
SAMRecord expected = samList.get(i).convertToSAMRecord(header);
SAMRecord observed = adamList.get(i).convertToSAMRecord(header);
// manually test equality of some fields, as there are issues with roundtrip BAM -> ADAM -> BAM
// see https://github.com/bigdatagenomics/adam/issues/823
Assert.assertEquals(observed.getReadName(), expected.getReadName(), "readname");
Assert.assertEquals(observed.getAlignmentStart(), expected.getAlignmentStart(), "getAlignmentStart");
Assert.assertEquals(observed.getAlignmentEnd(), expected.getAlignmentEnd(), "getAlignmentEnd");
Assert.assertEquals(observed.getFlags(), expected.getFlags(), "getFlags");
Assert.assertEquals(observed.getMappingQuality(), expected.getMappingQuality(), "getMappingQuality");
Assert.assertEquals(observed.getMateAlignmentStart(), expected.getMateAlignmentStart(), "getMateAlignmentStart");
Assert.assertEquals(observed.getCigar(), expected.getCigar(), "getCigar");
}
}
use of org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator in project gatk by broadinstitute.
the class AssemblyBasedCallerUtils method finalizeRegion.
public static void finalizeRegion(final AssemblyRegion region, final boolean errorCorrectReads, final boolean dontUseSoftClippedBases, final byte minTailQuality, final SAMFileHeader readsHeader, final SampleList samplesList) {
if (region.isFinalized()) {
return;
}
// Loop through the reads hard clipping the adaptor and low quality tails
final List<GATKRead> readsToUse = new ArrayList<>(region.getReads().size());
for (final GATKRead myRead : region.getReads()) {
final byte minTailQualityToUse = errorCorrectReads ? HaplotypeCallerEngine.MIN_TAIL_QUALITY_WITH_ERROR_CORRECTION : minTailQuality;
GATKRead clippedRead = ReadClipper.hardClipLowQualEnds(myRead, minTailQualityToUse);
// remove soft clips if we cannot reliably clip off adapter sequence or if the user doesn't want to use soft clips at all
// otherwie revert soft clips so that we see the alignment start and end assuming the soft clips are all matches
// TODO -- WARNING -- still possibility that unclipping the soft clips will introduce bases that aren't
// TODO -- truly in the extended region, as the unclipped bases might actually include a deletion
// TODO -- w.r.t. the reference. What really needs to happen is that kmers that occur before the
// TODO -- reference haplotype start must be removed
clippedRead = dontUseSoftClippedBases || !ReadUtils.hasWellDefinedFragmentSize(clippedRead) ? ReadClipper.hardClipSoftClippedBases(clippedRead) : ReadClipper.revertSoftClippedBases(clippedRead);
clippedRead = clippedRead.isUnmapped() ? clippedRead : ReadClipper.hardClipAdaptorSequence(clippedRead);
if (!clippedRead.isEmpty() && clippedRead.getCigar().getReadLength() > 0) {
clippedRead = ReadClipper.hardClipToRegion(clippedRead, region.getExtendedSpan().getStart(), region.getExtendedSpan().getEnd());
if (region.readOverlapsRegion(clippedRead) && clippedRead.getLength() > 0) {
readsToUse.add(clippedRead);
}
}
}
// TODO -- Performance optimization: we partition the reads by sample 4 times right now; let's unify that code.
// final List<GATKRead> downsampledReads = DownsamplingUtils.levelCoverageByPosition(ReadUtils.sortReadsByCoordinate(readsToUse), maxReadsInRegionPerSample, minReadsPerAlignmentStart);
// TODO: sort may be unnecessary here
Collections.sort(readsToUse, new ReadCoordinateComparator(readsHeader));
// handle overlapping read pairs from the same fragment
cleanOverlappingReadPairs(readsToUse, samplesList, readsHeader);
region.clearReads();
region.addAll(readsToUse);
region.setFinalized(true);
}
use of org.broadinstitute.hellbender.utils.read.ReadCoordinateComparator in project gatk by broadinstitute.
the class SortReadFileSpark method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
JavaRDD<GATKRead> reads = getReads();
int numReducers = getRecommendedNumReducers();
logger.info("Using %s reducers" + numReducers);
final SAMFileHeader readsHeader = getHeaderForReads();
ReadCoordinateComparator comparator = new ReadCoordinateComparator(readsHeader);
JavaRDD<GATKRead> sortedReads;
if (shardedOutput) {
sortedReads = reads.mapToPair(read -> new Tuple2<>(read, null)).sortByKey(comparator, true, numReducers).keys();
} else {
// sorting is done by writeReads below
sortedReads = reads;
}
readsHeader.setSortOrder(SAMFileHeader.SortOrder.coordinate);
writeReads(ctx, outputFile, sortedReads);
}
Aggregations