use of org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder in project gatk by broadinstitute.
the class MarkDuplicatesSpark method runTool.
/**
 * Marks duplicates in the input reads, optionally saves per-library duplication metrics,
 * and writes the resulting reads to {@code output}.
 *
 * <p>An {@link OpticalDuplicateFinder} is only constructed when
 * {@code opticalDuplicatesArgumentCollection.READ_NAME_REGEX} is non-null; otherwise
 * {@code null} is passed to {@code mark} in its place.
 *
 * @param ctx the Spark context handed to {@code writeReads}
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    JavaRDD<GATKRead> inputReads = getReads();

    final OpticalDuplicateFinder opticalFinder;
    if (opticalDuplicatesArgumentCollection.READ_NAME_REGEX == null) {
        opticalFinder = null;
    } else {
        opticalFinder = new OpticalDuplicateFinder(
                opticalDuplicatesArgumentCollection.READ_NAME_REGEX,
                opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE,
                null);
    }

    final JavaRDD<GATKRead> markedForMetrics = mark(
            inputReads,
            getHeaderForReads(),
            duplicatesScoringStrategy,
            opticalFinder,
            getRecommendedNumReducers());

    // Metrics are generated from the marked reads before cleanupTemporaryAttributes is applied.
    if (metricsFile != null) {
        final JavaPairRDD<String, DuplicationMetrics> perLibraryMetrics =
                MarkDuplicatesSparkUtils.generateMetrics(getHeaderForReads(), markedForMetrics);
        final MetricsFile<DuplicationMetrics, Double> metricsOut = getMetricsFile();
        MarkDuplicatesSparkUtils.saveMetricsRDD(
                metricsOut, getHeaderForReads(), perLibraryMetrics, metricsFile, getAuthHolder());
    }

    final JavaRDD<GATKRead> cleanedReads = cleanupTemporaryAttributes(markedForMetrics);
    writeReads(ctx, output, cleanedReads);
}
use of org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder in project gatk by broadinstitute.
the class MarkDuplicatesSparkUnitTest method markDupesTest.
/**
 * Loads the reads from {@code input}, runs {@code MarkDuplicatesSpark.mark} on them with a
 * single reducer, and checks both the total read count and the number of reads flagged
 * as duplicates.
 *
 * @param input         path of the reads file to load
 * @param totalExpected expected number of reads before and after marking
 * @param dupsExpected  expected number of reads for which {@code isDuplicate} is true
 * @throws IOException declared by the test signature
 */
@Test(dataProvider = "md", groups = "spark")
public void markDupesTest(final String input, final long totalExpected, final long dupsExpected) throws IOException {
    final JavaSparkContext sparkContext = SparkContextFactory.getTestSparkContext();
    final ReadsSparkSource source = new ReadsSparkSource(sparkContext);

    final JavaRDD<GATKRead> loadedReads = source.getParallelReads(input, null);
    Assert.assertEquals(loadedReads.count(), totalExpected);

    final SAMFileHeader readsHeader = source.getHeader(input, null);

    // Build the finder from a default argument collection, mirroring the null-regex check used by the tool.
    final OpticalDuplicatesArgumentCollection opticalArgs = new OpticalDuplicatesArgumentCollection();
    final OpticalDuplicateFinder opticalFinder;
    if (opticalArgs.READ_NAME_REGEX == null) {
        opticalFinder = null;
    } else {
        opticalFinder = new OpticalDuplicateFinder(
                opticalArgs.READ_NAME_REGEX,
                opticalArgs.OPTICAL_DUPLICATE_PIXEL_DISTANCE,
                null);
    }

    final JavaRDD<GATKRead> marked = MarkDuplicatesSpark.mark(
            loadedReads,
            readsHeader,
            MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES,
            opticalFinder,
            1);

    // Marking must not add or drop reads.
    Assert.assertEquals(marked.count(), totalExpected);
    Assert.assertEquals(marked.filter(GATKRead::isDuplicate).count(), dupsExpected);
}
use of org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder in project gatk by broadinstitute.
the class PathSeqFilterSpark method runTool.
/**
 * Runs the read-filtering pipeline: drops secondary/supplementary/vendor-failed reads,
 * marks and filters optical duplicates, applies DUST masking, quality clipping, length,
 * low-quality-base and ambiguous-base filtering, kmer filtering, pair retention and host
 * BWA alignment, then writes the reads whose read and mate are both unmapped to
 * {@code OUTPUT_PATH} with the header sort order set to queryname.
 *
 * <p>The number of reads remaining is logged after most stages; each such log line calls
 * {@code count()} on the corresponding RDD.
 *
 * @param ctx the Spark context used for kmer filtering, host alignment and writing
 * @throws GATKException if writing the output fails with an {@link IOException}
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    //Filter secondary/supplementary reads and reads that fail the vendor quality check
    final JavaRDD<GATKRead> primaryReads = reads.filter(read -> !(read.isSecondaryAlignment() || read.failsVendorQualityCheck() || read.isSupplementaryAlignment()));
    logger.info("Loaded " + reads.count() + " reads");
    //Mark and filter optical duplicates
    final OpticalDuplicateFinder finder = new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null);
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(primaryReads, getHeaderForReads(), MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedFilteredReads = markedReads.filter(new ReadFilterSparkifier(new MarkedOpticalDuplicateReadFilter()));
    logger.info("Reads remaining after de-duplication: " + markedFilteredReads.count());
    //Apply DUST masking
    final JavaRDD<GATKRead> readsDUSTMasked = markedFilteredReads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(DUST_MASK, DUST_W, DUST_T)));
    //Apply base quality hard clipping
    final JavaRDD<GATKRead> readsClipped = readsDUSTMasked.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(READ_TRIM_THRESH)));
    //Filter reads with less than MIN_READ_LENGTH bases
    final JavaRDD<GATKRead> readsLengthFiltered = readsClipped.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(MIN_READ_LENGTH, Integer.MAX_VALUE)));
    logger.info("Reads remaining after clipping: " + readsLengthFiltered.count());
    //Change low-quality bases to 'N'
    final JavaRDD<GATKRead> readsBQFiltered = readsLengthFiltered.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(QUAL_PHRED_THRESH)));
    //Filter reads with too many 'N's
    final JavaRDD<GATKRead> readsAmbigFiltered = readsBQFiltered.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(FRAC_N_THRESHOLD)));
    logger.info("Reads remaining after ambiguous base filtering: " + readsAmbigFiltered.count());
    //Load Kmer hopscotch set and filter reads containing > 0 matching kmers
    final JavaRDD<GATKRead> readsKmerFiltered = doKmerFiltering(ctx, readsAmbigFiltered);
    logger.info("Reads remaining after kmer filtering: " + readsKmerFiltered.count());
    //Filter unpaired reads
    final JavaRDD<GATKRead> readsFilteredPaired = retainPairs(readsKmerFiltered);
    logger.info("Reads remaining after unpaired filtering: " + readsFilteredPaired.count());
    //BWA filtering against user-specified host organism reference
    //Note: 'header' is declared outside this method and is assigned here
    header = getHeaderForReads();
    final JavaRDD<GATKRead> readsAligned = doHostBWA(ctx, header, readsFilteredPaired);
    //Get unmapped reads (note these always come in pairs)
    //TODO: retain read pairs by alignment score instead of flags
    final JavaRDD<GATKRead> readsNonHost = readsAligned.filter(read -> read.isUnmapped() && read.mateIsUnmapped());
    //Report the post-BWA set (readsNonHost), not the pre-BWA paired set already logged above
    logger.info("Reads remaining after BWA filtering: " + readsNonHost.count());
    //TODO: repeat BWA with seed size 11
    //Write output
    header.setSortOrder(SAMFileHeader.SortOrder.queryname);
    try {
        ReadsSparkSink.writeReads(ctx, OUTPUT_PATH, null, readsNonHost, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE);
    } catch (final IOException e) {
        throw new GATKException("Unable to write bam", e);
    }
}
use of org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder in project gatk by broadinstitute.
the class ReadsPipelineSpark method runTool.
/**
 * Runs mark-duplicates, base quality score recalibration (BQSR) and ApplyBQSR over the
 * input reads and writes the recalibrated reads to {@code output}.
 *
 * @param ctx the Spark context used to load known variants, broadcast the recalibration
 *            report and write the reads
 * @throws UserException.Require2BitReferenceForBroadcast if the join strategy is
 *         {@code BROADCAST} and the reference is not compatible with Spark broadcast
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST) {
        if (!getReference().isCompatibleWithSparkBroadcast()) {
            throw new UserException.Require2BitReferenceForBroadcast();
        }
    }

    //TODO: should this use getUnfilteredReads? getReads will apply default and command line filters
    final JavaRDD<GATKRead> inputReads = getReads();

    // Mark duplicates, then strip the temporary attributes left on the reads by marking.
    final JavaRDD<GATKRead> dupMarkedWithTempAttrs = MarkDuplicatesSpark.mark(
            inputReads,
            getHeaderForReads(),
            duplicatesScoringStrategy,
            new OpticalDuplicateFinder(),
            getRecommendedNumReducers());
    final JavaRDD<GATKRead> dupMarked = MarkDuplicatesSpark.cleanupTemporaryAttributes(dupMarkedWithTempAttrs);

    // The marked reads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrOnlyFilter = ReadFilter.fromList(
            BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> readsForRecalibration = dupMarked.filter(read -> bqsrOnlyFilter.test(read));

    // Pair each filtered read with its context data built from the reference and known variants.
    VariantsSparkSource knownSitesSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> knownSites =
            knownSitesSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    JavaPairRDD<GATKRead, ReadContextData> readsWithContext = AddContextDataToReadSpark.add(
            ctx,
            readsForRecalibration,
            getReference(),
            knownSites,
            joinStrategy,
            getReferenceSequenceDictionary(),
            readShardSize,
            readShardPadding);

    final RecalibrationReport recalReport = BaseRecalibratorSparkFn.apply(
            readsWithContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> recalReportBroadcast = ctx.broadcast(recalReport);

    // Recalibration is applied to all marked reads, not only the BQSR-filtered subset.
    final JavaRDD<GATKRead> recalibratedReads = ApplyBQSRSparkFn.apply(
            dupMarked,
            recalReportBroadcast,
            getHeaderForReads(),
            applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, recalibratedReads);
}
use of org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder in project gatk by broadinstitute.
the class BwaAndMarkDuplicatesPipelineSpark method runTool.
/**
 * Aligns the input reads with a {@code BwaSparkEngine}, marks duplicates on the aligned
 * reads, removes the temporary attributes left by marking, and writes the result to
 * {@code output} using the engine's header.
 *
 * <p>The engine is opened in a try-with-resources block so it is closed when the method
 * exits, whether normally or by exception.
 *
 * @param ctx the Spark context used for alignment and writing
 * @throws GATKException if writing the output fails with an {@link IOException}; the
 *         original exception is attached as the cause
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    try (final BwaSparkEngine engine = new BwaSparkEngine(ctx, indexImageFile, getHeaderForReads(), getReferenceSequenceDictionary())) {
        final JavaRDD<GATKRead> alignedReads = engine.align(getReads());
        final JavaRDD<GATKRead> markedReadsWithOD = MarkDuplicatesSpark.mark(alignedReads, engine.getHeader(), duplicatesScoringStrategy, new OpticalDuplicateFinder(), getRecommendedNumReducers());
        final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.cleanupTemporaryAttributes(markedReadsWithOD);
        try {
            ReadsSparkSink.writeReads(ctx, output, referenceArguments.getReferenceFile().getAbsolutePath(), markedReads, engine.getHeader(), shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE, getRecommendedNumReducers());
        } catch (final IOException e) {
            // Pass the IOException as the cause so its stack trace is not lost.
            throw new GATKException("unable to write bam: " + e, e);
        }
    }
}
Aggregations