use of org.broadinstitute.hellbender.tools.spark.utils.ReadTransformerSparkifier in project gatk by broadinstitute.
the class PathSeqFilterSpark method runTool.
@Override
protected void runTool(final JavaSparkContext ctx) {
final JavaRDD<GATKRead> reads = getReads();
//Filter secondary/supplementary reads and reads that fail the vendor quality check
final JavaRDD<GATKRead> primaryReads = reads.filter(read -> !(read.isSecondaryAlignment() || read.failsVendorQualityCheck() || read.isSupplementaryAlignment()));
logger.info("Loaded " + reads.count() + " reads");
//Mark and filter optical duplicates
final OpticalDuplicateFinder finder = new OpticalDuplicateFinder(opticalDuplicatesArgumentCollection.READ_NAME_REGEX, opticalDuplicatesArgumentCollection.OPTICAL_DUPLICATE_PIXEL_DISTANCE, null);
final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.mark(primaryReads, getHeaderForReads(), MarkDuplicatesScoringStrategy.SUM_OF_BASE_QUALITIES, finder, getRecommendedNumReducers());
final JavaRDD<GATKRead> markedFilteredReads = markedReads.filter(new ReadFilterSparkifier(new MarkedOpticalDuplicateReadFilter()));
logger.info("Reads remaining after de-duplication: " + markedFilteredReads.count());
//Apply DUST masking
final JavaRDD<GATKRead> readsDUSTMasked = markedFilteredReads.map(new ReadTransformerSparkifier(new DUSTReadTransformer(DUST_MASK, DUST_W, DUST_T)));
//Apply base quality hard clipping
final JavaRDD<GATKRead> readsClipped = readsDUSTMasked.map(new ReadTransformerSparkifier(new BaseQualityClipReadTransformer(READ_TRIM_THRESH)));
//Filter reads with less than MIN_READ_LENGTH bases
final JavaRDD<GATKRead> readsLengthFiltered = readsClipped.filter(new ReadFilterSparkifier(new ReadLengthReadFilter(MIN_READ_LENGTH, Integer.MAX_VALUE)));
logger.info("Reads remaining after clipping: " + readsLengthFiltered.count());
//Change low-quality bases to 'N'
final JavaRDD<GATKRead> readsBQFiltered = readsLengthFiltered.map(new ReadTransformerSparkifier(new BaseQualityReadTransformer(QUAL_PHRED_THRESH)));
//Filter reads with too many 'N's
final JavaRDD<GATKRead> readsAmbigFiltered = readsBQFiltered.filter(new ReadFilterSparkifier(new AmbiguousBaseReadFilter(FRAC_N_THRESHOLD)));
logger.info("Reads remaining after ambiguous base filtering: " + readsAmbigFiltered.count());
//Load Kmer hopscotch set and filter reads containing > 0 matching kmers
final JavaRDD<GATKRead> readsKmerFiltered = doKmerFiltering(ctx, readsAmbigFiltered);
logger.info("Reads remaining after kmer filtering: " + readsKmerFiltered.count());
//Filter unpaired reads
final JavaRDD<GATKRead> readsFilteredPaired = retainPairs(readsKmerFiltered);
logger.info("Reads remaining after unpaired filtering: " + readsFilteredPaired.count());
//BWA filtering against user-specified host organism reference
header = getHeaderForReads();
final JavaRDD<GATKRead> readsAligned = doHostBWA(ctx, header, readsFilteredPaired);
//Get unmapped reads (note these always come in pairs)
//TODO: retain read pairs by alignment score instead of flags
final JavaRDD<GATKRead> readsNonHost = readsAligned.filter(read -> read.isUnmapped() && read.mateIsUnmapped());
logger.info("Reads remaining after BWA filtering: " + readsFilteredPaired.count());
//TODO: repeat BWA with seed size 11
//Write output
header.setSortOrder(SAMFileHeader.SortOrder.queryname);
try {
ReadsSparkSink.writeReads(ctx, OUTPUT_PATH, null, readsNonHost, header, shardedOutput ? ReadsWriteFormat.SHARDED : ReadsWriteFormat.SINGLE);
} catch (final IOException e) {
throw new GATKException("Unable to write bam", e);
}
}
Aggregations