Search in sources :

Example 6 with ReadFilter

use of org.broadinstitute.hellbender.engine.filters.ReadFilter in project gatk-protected by broadinstitute.

the class SparkGenomeReadCounts method collectReads.

private void collectReads() {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("This tool only accepts a single bam/sam/cram as input");
    }
    final SampleCollection sampleCollection = new SampleCollection(getHeaderForReads());
    if (sampleCollection.sampleCount() > 1) {
        throw new UserException.BadInput("We do not support bams with more than one sample.");
    }
    final String sampleName = sampleCollection.sampleIds().get(0);
    final String[] commentsForRawCoverage = { "##fileFormat  = tsv", "##commandLine = " + getCommandLine(), String.format("##title = Coverage counts in %d base bins for WGS", binsize) };
    final ReadFilter filter = makeGenomeReadFilter();
    final SAMSequenceDictionary sequenceDictionary = getReferenceSequenceDictionary();
    logger.info("Starting Spark coverage collection...");
    final long coverageCollectionStartTime = System.currentTimeMillis();
    final JavaRDD<GATKRead> rawReads = getReads();
    final JavaRDD<GATKRead> reads = rawReads.filter(read -> filter.test(read));
    //Note: using a field inside a closure will pull in the whole enclosing object to serialization
    // (which leads to bad performance and can blow up if some objects in the fields are not
    // Serializable - closures always use java Serializable and not Kryo)
    //Solution here is to use a temp variable for binsize because it's just an int.
    final int binsize_tmp = binsize;
    final JavaRDD<SimpleInterval> readIntervals = reads.filter(read -> sequenceDictionary.getSequence(read.getContig()) != null).map(read -> SparkGenomeReadCounts.createKey(read, sequenceDictionary, binsize_tmp));
    final Map<SimpleInterval, Long> byKey = readIntervals.countByValue();
    final Set<SimpleInterval> readIntervalKeySet = byKey.keySet();
    final long totalReads = byKey.values().stream().mapToLong(v -> v).sum();
    final long coverageCollectionEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished the spark coverage collection with %d targets and %d reads. Elapse of %d seconds", readIntervalKeySet.size(), totalReads, (coverageCollectionEndTime - coverageCollectionStartTime) / 1000));
    final String[] commentsForProportionalCoverage = { commentsForRawCoverage[0], commentsForRawCoverage[1], String.format("##title = Proportional coverage counts in %d base bins for WGS (total reads: %d)", binsize, totalReads) };
    logger.info("Creating full genome bins...");
    final long createGenomeBinsStartTime = System.currentTimeMillis();
    final List<SimpleInterval> fullGenomeBins = createFullGenomeBins(binsize);
    List<Target> fullGenomeTargetCollection = createTargetListFromSimpleInterval(fullGenomeBins);
    TargetWriter.writeTargetsToFile(new File(outputFile.getAbsolutePath() + ".targets.tsv"), fullGenomeTargetCollection);
    final long createGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating genome bins. Elapse of %d seconds", (createGenomeBinsEndTime - createGenomeBinsStartTime) / 1000));
    logger.info("Creating missing genome bins...");
    final long createMissingGenomeBinsStartTime = System.currentTimeMillis();
    logger.info("Creating missing genome bins: Creating a mutable mapping...");
    final Map<SimpleInterval, Long> byKeyMutable = new HashMap<>();
    byKeyMutable.putAll(byKey);
    logger.info("Creating missing genome bins: Populating mutable mapping with zero counts for empty regions...");
    fullGenomeBins.stream().forEach(b -> byKeyMutable.putIfAbsent(b, 0l));
    final long createMissingGenomeBinsEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating missing genome bins. Elapse of %d seconds", (createMissingGenomeBinsEndTime - createMissingGenomeBinsStartTime) / 1000));
    logger.info("Creating final map...");
    final long createFinalMapStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Long> byKeySorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.putAll(byKeyMutable);
    final long createFinalMapEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating final map. Elapse of %d seconds", (createFinalMapEndTime - createFinalMapStartTime) / 1000));
    logger.info("Creating proportional coverage... ");
    final long pCovFileStartTime = System.currentTimeMillis();
    final SortedMap<SimpleInterval, Double> byKeyProportionalSorted = new TreeMap<>(IntervalUtils.LEXICOGRAPHICAL_ORDER_COMPARATOR);
    byKeySorted.entrySet().stream().forEach(e -> byKeyProportionalSorted.put(e.getKey(), (double) e.getValue() / totalReads));
    final long pCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished creating proportional coverage map. Elapse of %d seconds", (pCovFileEndTime - pCovFileStartTime) / 1000));
    logger.info("Writing raw coverage file ...");
    final long writingCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(new File(outputFile.getAbsolutePath() + RAW_COV_OUTPUT_EXTENSION), sampleName, byKeySorted, commentsForRawCoverage);
    final long writingCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing coverage file. Elapse of %d seconds", (writingCovFileEndTime - writingCovFileStartTime) / 1000));
    logger.info("Writing proportional coverage file ...");
    final long writingPCovFileStartTime = System.currentTimeMillis();
    ReadCountCollectionUtils.writeReadCountsFromSimpleInterval(outputFile, sampleName, byKeyProportionalSorted, commentsForProportionalCoverage);
    final long writingPCovFileEndTime = System.currentTimeMillis();
    logger.info(String.format("Finished writing proportional coverage file. Elapse of %d seconds", (writingPCovFileEndTime - writingPCovFileStartTime) / 1000));
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DocumentedFeature(org.broadinstitute.barclay.help.DocumentedFeature) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) CopyNumberProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.CopyNumberProgramGroup) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) Collectors(java.util.stream.Collectors) File(java.io.File) Logger(org.apache.logging.log4j.Logger) IntervalUtils(org.broadinstitute.hellbender.utils.IntervalUtils) UserException(org.broadinstitute.hellbender.exceptions.UserException) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) Target(org.broadinstitute.hellbender.tools.exome.Target) ReadCountCollectionUtils(org.broadinstitute.hellbender.tools.exome.ReadCountCollectionUtils) TargetWriter(org.broadinstitute.hellbender.tools.exome.TargetWriter) LogManager(org.apache.logging.log4j.LogManager) SampleCollection(org.broadinstitute.hellbender.tools.exome.SampleCollection) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadFilterLibrary(org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Target(org.broadinstitute.hellbender.tools.exome.Target) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) SampleCollection(org.broadinstitute.hellbender.tools.exome.SampleCollection) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) File(java.io.File)

Example 7 with ReadFilter

use of org.broadinstitute.hellbender.engine.filters.ReadFilter in project gatk by broadinstitute.

the class PileupSpark method getDefaultReadFilters.

@Override
public List<ReadFilter> getDefaultReadFilters() {
    List<ReadFilter> filterList = new ArrayList<>(5);
    filterList.add(ReadFilterLibrary.MAPPED);
    filterList.add(ReadFilterLibrary.NOT_DUPLICATE);
    filterList.add(ReadFilterLibrary.PASSES_VENDOR_QUALITY_CHECK);
    filterList.add(ReadFilterLibrary.PRIMARY_ALIGNMENT);
    filterList.add(new WellformedReadFilter());
    return filterList;
}
Also used : WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter) ArrayList(java.util.ArrayList) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) WellformedReadFilter(org.broadinstitute.hellbender.engine.filters.WellformedReadFilter)

Example 8 with ReadFilter

use of org.broadinstitute.hellbender.engine.filters.ReadFilter in project gatk by broadinstitute.

the class BaseRecalibratorSparkSharded method runPipeline.

@Override
protected void runPipeline(JavaSparkContext ctx) {
    if (readArguments.getReadFilesNames().size() != 1) {
        throw new UserException("Sorry, we only support a single reads input for now.");
    }
    final String bam = readArguments.getReadFilesNames().get(0);
    final String referenceURL = referenceArguments.getReferenceFileName();
    auth = getAuthHolder();
    final ReferenceMultiSource rds = new ReferenceMultiSource(auth, referenceURL, BaseRecalibrationEngine.BQSR_REFERENCE_WINDOW_FUNCTION);
    SAMFileHeader readsHeader = new ReadsSparkSource(ctx, readArguments.getReadValidationStringency()).getHeader(bam, referenceURL);
    final SAMSequenceDictionary readsDictionary = readsHeader.getSequenceDictionary();
    final SAMSequenceDictionary refDictionary = rds.getReferenceSequenceDictionary(readsDictionary);
    final ReadFilter readFilterToApply = ReadFilter.fromList(BaseRecalibrator.getStandardBQSRReadFilterList(), readsHeader);
    SequenceDictionaryUtils.validateDictionaries("reference", refDictionary, "reads", readsDictionary);
    Broadcast<SAMFileHeader> readsHeaderBcast = ctx.broadcast(readsHeader);
    Broadcast<SAMSequenceDictionary> refDictionaryBcast = ctx.broadcast(refDictionary);
    List<SimpleInterval> intervals = intervalArgumentCollection.intervalsSpecified() ? intervalArgumentCollection.getIntervals(readsHeader.getSequenceDictionary()) : IntervalUtils.getAllIntervalsForReference(readsHeader.getSequenceDictionary());
    List<String> localVariants = knownVariants;
    localVariants = hackilyCopyFromGCSIfNecessary(localVariants);
    List<GATKVariant> variants = VariantsSource.getVariantsList(localVariants);
    // get reads, reference, variants
    JavaRDD<ContextShard> readsWithContext = AddContextDataToReadSparkOptimized.add(ctx, intervals, bam, variants, readFilterToApply, rds);
    // run BaseRecalibratorEngine.
    BaseRecalibratorEngineSparkWrapper recal = new BaseRecalibratorEngineSparkWrapper(readsHeaderBcast, refDictionaryBcast, bqsrArgs);
    JavaRDD<RecalibrationTables> tables = readsWithContext.mapPartitions(s -> recal.apply(s));
    final RecalibrationTables emptyRecalibrationTable = new RecalibrationTables(new StandardCovariateList(bqsrArgs, readsHeader));
    final RecalibrationTables table = tables.treeAggregate(emptyRecalibrationTable, RecalibrationTables::inPlaceCombine, RecalibrationTables::inPlaceCombine, Math.max(1, (int) (Math.log(tables.partitions().size()) / Math.log(2))));
    BaseRecalibrationEngine.finalizeRecalibrationTables(table);
    try {
        BaseRecalibratorEngineSparkWrapper.saveTextualReport(outputTablesPath, readsHeader, table, bqsrArgs, auth);
    } catch (IOException e) {
        throw new UserException.CouldNotCreateOutputFile(new File(outputTablesPath), e);
    }
}
Also used : ContextShard(org.broadinstitute.hellbender.engine.ContextShard) ReadsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) ReferenceMultiSource(org.broadinstitute.hellbender.engine.datasources.ReferenceMultiSource) IOException(java.io.IOException) RecalibrationTables(org.broadinstitute.hellbender.utils.recalibration.RecalibrationTables) StandardCovariateList(org.broadinstitute.hellbender.utils.recalibration.covariates.StandardCovariateList) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) BaseRecalibratorEngineSparkWrapper(org.broadinstitute.hellbender.tools.spark.transforms.bqsr.BaseRecalibratorEngineSparkWrapper) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) SimpleInterval(org.broadinstitute.hellbender.utils.SimpleInterval) UserException(org.broadinstitute.hellbender.exceptions.UserException) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File)

Example 9 with ReadFilter

use of org.broadinstitute.hellbender.engine.filters.ReadFilter in project gatk by broadinstitute.

the class CollectMultipleMetricsSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> unFilteredReads = getUnfilteredReads();
    List<SparkCollectorProvider> collectorsToRun = getCollectorsToRun();
    if (collectorsToRun.size() > 1) {
        // if there is more than one collector to run, cache the
        // unfiltered RDD so we don't recompute it
        unFilteredReads.cache();
    }
    for (final SparkCollectorProvider provider : collectorsToRun) {
        MetricsCollectorSpark<? extends MetricsArgumentCollection> metricsCollector = provider.createCollector(outputBaseName, metricAccumulationLevel.accumulationLevels, getDefaultHeaders(), getHeaderForReads());
        validateCollector(metricsCollector, collectorsToRun.get(collectorsToRun.indexOf(provider)).getClass().getName());
        // Execute the collector's lifecycle
        //Bypass the framework merging of command line filters and just apply the default
        //ones specified by the collector
        ReadFilter readFilter = ReadFilter.fromList(metricsCollector.getDefaultReadFilters(), getHeaderForReads());
        metricsCollector.collectMetrics(unFilteredReads.filter(r -> readFilter.test(r)), getHeaderForReads());
        metricsCollector.saveMetrics(getReadSourceName(), getAuthHolder());
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DocumentedFeature(org.broadinstitute.barclay.help.DocumentedFeature) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) java.util(java.util) SparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.SparkProgramGroup) Header(htsjdk.samtools.metrics.Header) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) ArgumentCollection(org.broadinstitute.barclay.argparser.ArgumentCollection) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReadUtils(org.broadinstitute.hellbender.utils.read.ReadUtils) org.broadinstitute.hellbender.metrics(org.broadinstitute.hellbender.metrics) MetricAccumulationLevelArgumentCollection(org.broadinstitute.hellbender.cmdline.argumentcollections.MetricAccumulationLevelArgumentCollection) JavaRDD(org.apache.spark.api.java.JavaRDD) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter)

Example 10 with ReadFilter

use of org.broadinstitute.hellbender.engine.filters.ReadFilter in project gatk by broadinstitute.

the class ReadsPipelineSpark method runTool.

@Override
protected void runTool(final JavaSparkContext ctx) {
    if (joinStrategy == JoinStrategy.BROADCAST && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }
    //TOOO: should this use getUnfilteredReads? getReads will apply default and command line filters
    final JavaRDD<GATKRead> initialReads = getReads();
    final JavaRDD<GATKRead> markedReadsWithOD = MarkDuplicatesSpark.mark(initialReads, getHeaderForReads(), duplicatesScoringStrategy, new OpticalDuplicateFinder(), getRecommendedNumReducers());
    final JavaRDD<GATKRead> markedReads = MarkDuplicatesSpark.cleanupTemporaryAttributes(markedReadsWithOD);
    // The markedReads have already had the WellformedReadFilter applied to them, which
    // is all the filtering that MarkDupes and ApplyBQSR want. BQSR itself wants additional
    // filtering performed, so we do that here.
    //NOTE: this doesn't honor enabled/disabled commandline filters
    final ReadFilter bqsrReadFilter = ReadFilter.fromList(BaseRecalibrator.getBQSRSpecificReadFilterList(), getHeaderForReads());
    final JavaRDD<GATKRead> markedFilteredReadsForBQSR = markedReads.filter(read -> bqsrReadFilter.test(read));
    VariantsSparkSource variantsSparkSource = new VariantsSparkSource(ctx);
    JavaRDD<GATKVariant> bqsrKnownVariants = variantsSparkSource.getParallelVariants(baseRecalibrationKnownVariants, getIntervals());
    JavaPairRDD<GATKRead, ReadContextData> rddReadContext = AddContextDataToReadSpark.add(ctx, markedFilteredReadsForBQSR, getReference(), bqsrKnownVariants, joinStrategy, getReferenceSequenceDictionary(), readShardSize, readShardPadding);
    final RecalibrationReport bqsrReport = BaseRecalibratorSparkFn.apply(rddReadContext, getHeaderForReads(), getReferenceSequenceDictionary(), bqsrArgs);
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(bqsrReport);
    final JavaRDD<GATKRead> finalReads = ApplyBQSRSparkFn.apply(markedReads, reportBroadcast, getHeaderForReads(), applyBqsrArgs.toApplyBQSRArgumentCollection(bqsrArgs.PRESERVE_QSCORES_LESS_THAN));
    writeReads(ctx, output, finalReads);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) OpticalDuplicateFinder(org.broadinstitute.hellbender.utils.read.markduplicates.OpticalDuplicateFinder) ReadFilter(org.broadinstitute.hellbender.engine.filters.ReadFilter) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)

Aggregations

ReadFilter (org.broadinstitute.hellbender.engine.filters.ReadFilter)25 WellformedReadFilter (org.broadinstitute.hellbender.engine.filters.WellformedReadFilter)12 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)10 SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval)7 File (java.io.File)6 ReadFilterLibrary (org.broadinstitute.hellbender.engine.filters.ReadFilterLibrary)6 CountingReadFilter (org.broadinstitute.hellbender.engine.filters.CountingReadFilter)5 MappingQualityReadFilter (org.broadinstitute.hellbender.engine.filters.MappingQualityReadFilter)5 ArrayList (java.util.ArrayList)4 JavaRDD (org.apache.spark.api.java.JavaRDD)4 JavaSparkContext (org.apache.spark.api.java.JavaSparkContext)4 SAMFileHeader (htsjdk.samtools.SAMFileHeader)3 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)3 java.util (java.util)3 Argument (org.broadinstitute.barclay.argparser.Argument)3 CommandLineProgramProperties (org.broadinstitute.barclay.argparser.CommandLineProgramProperties)3 DocumentedFeature (org.broadinstitute.barclay.help.DocumentedFeature)3 GATKSparkTool (org.broadinstitute.hellbender.engine.spark.GATKSparkTool)3 UserException (org.broadinstitute.hellbender.exceptions.UserException)3 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)3