Search in sources :

Example 1 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class PSPairedUnpairedSplitterSpark method mapPartitionsToPairedAndUnpairedLists.

/**
 * Splits the reads of a single partition into paired and unpaired reads, matching reads by name.
 *
 * @param iter                   iterator over the reads of one partition
 * @param readsPerPartitionGuess initial capacity used for the working list and map
 * @return an iterator over exactly one Tuple2 whose first element is the list of paired reads and whose
 *         second element is the list of reads for which no same-named read was found
 */
private static Iterator<Tuple2<List<GATKRead>, List<GATKRead>>> mapPartitionsToPairedAndUnpairedLists(final Iterator<GATKRead> iter, final int readsPerPartitionGuess) {
    final List<GATKRead> paired = new ArrayList<>(readsPerPartitionGuess);
    // Reads still waiting for a same-named read, keyed by read name
    final Map<String, GATKRead> pendingByName = new HashMap<>(readsPerPartitionGuess);
    while (iter.hasNext()) {
        final GATKRead current = iter.next();
        final String name = current.getName();
        // Stored values are never null (getName() was invoked on each before it was stored),
        // so a null result here means no read with this name is waiting.
        final GATKRead waiting = pendingByName.remove(name);
        if (waiting == null) {
            pendingByName.put(name, current);
        } else {
            // Found a pair: the read just seen goes first, followed by the one seen earlier
            paired.add(current);
            paired.add(waiting);
        }
    }
    // Whatever is left in the map never found a partner
    final List<GATKRead> unpaired = new ArrayList<>(pendingByName.values());
    // Copy into a list sized to the actual content rather than to readsPerPartitionGuess
    final List<GATKRead> pairedCompact = new ArrayList<>(paired);
    return Collections.singletonList(new Tuple2<>(pairedCompact, unpaired)).iterator();
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) Tuple2(scala.Tuple2)

Example 2 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class CountBasesSpark method runTool.

/**
 * Sums the lengths of all reads, prints the total to standard output and, when {@code out} is
 * non-null, also writes the total to the file created for {@code out}.
 *
 * @param ctx the Spark context supplied by the framework (not used directly here)
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    // fold with a zero value instead of reduce: Spark's reduce throws UnsupportedOperationException
    // on an empty RDD, whereas fold yields 0 when there are no reads.
    final long count = reads.map(r -> (long) r.getLength()).fold(0L, Long::sum);
    System.out.println(count);
    if (out != null) {
        try (final PrintStream ps = new PrintStream(BucketUtils.createFile(out))) {
            ps.print(count);
        }
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) DocumentedFeature(org.broadinstitute.barclay.help.DocumentedFeature) PrintStream(java.io.PrintStream) CommandLineProgramProperties(org.broadinstitute.barclay.argparser.CommandLineProgramProperties) BucketUtils(org.broadinstitute.hellbender.utils.gcs.BucketUtils) SparkProgramGroup(org.broadinstitute.hellbender.cmdline.programgroups.SparkProgramGroup) Argument(org.broadinstitute.barclay.argparser.Argument) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) GATKSparkTool(org.broadinstitute.hellbender.engine.spark.GATKSparkTool) StandardArgumentDefinitions(org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions) GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) JavaRDD(org.apache.spark.api.java.JavaRDD) PrintStream(java.io.PrintStream)

Example 3 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class ApplyBQSRSpark method runTool.

/**
 * Loads the recalibration report from {@code bqsrRecalFile}, broadcasts it, applies it to the input
 * reads via {@code ApplyBQSRSparkFn} and writes the resulting reads to {@code output}.
 *
 * @param ctx the Spark context used for broadcasting the report and writing the reads
 */
@Override
protected void runTool(JavaSparkContext ctx) {
    final JavaRDD<GATKRead> inputReads = getReads();
    // null if we have no api key (the value itself is not used further in this method)
    final GCSOptions gcsOptions = getAuthenticatedGCSOptions();
    final RecalibrationReport report = new RecalibrationReport(BucketUtils.openFile(bqsrRecalFile));
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(report);
    final JavaRDD<GATKRead> outputReads = ApplyBQSRSparkFn.apply(
            inputReads,
            reportBroadcast,
            getHeaderForReads(),
            applyBQSRArgs);
    writeReads(ctx, output, outputReads);
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport) GCSOptions(com.google.cloud.genomics.dataflow.utils.GCSOptions)

Example 4 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class BaseRecalibratorSpark method runTool.

/**
 * Builds a recalibration report from the input reads and the known variants, then writes it to
 * {@code outputTablesPath}.
 *
 * @param ctx the Spark context used to load variants and attach context data to reads
 * @throws UserException.Require2BitReferenceForBroadcast if the BROADCAST join strategy is selected
 *         but the reference reports that it is not compatible with Spark broadcast
 */
@Override
protected void runTool(JavaSparkContext ctx) {
    // The reference is only consulted when the broadcast strategy was requested
    if (joinStrategy == JoinStrategy.BROADCAST) {
        if (!getReference().isCompatibleWithSparkBroadcast()) {
            throw new UserException.Require2BitReferenceForBroadcast();
        }
    }
    final JavaRDD<GATKRead> inputReads = getReads();
    final JavaRDD<GATKVariant> knownSites = new VariantsSparkSource(ctx).getParallelVariants(knownVariants, getIntervals());
    // TODO: Look into broadcasting the reference to all of the workers. This would make AddContextDataToReadSpark
    // TODO: and ApplyBQSRStub simpler (#855).
    final JavaPairRDD<GATKRead, ReadContextData> readsWithContext = AddContextDataToReadSpark.add(
            ctx,
            inputReads,
            getReference(),
            knownSites,
            joinStrategy,
            getReferenceSequenceDictionary(),
            readShardSize,
            readShardPadding);
    // TODO: broadcast the reads header?
    final RecalibrationReport report = BaseRecalibratorSparkFn.apply(
            readsWithContext,
            getHeaderForReads(),
            getReferenceSequenceDictionary(),
            bqsrArgs);
    try (final PrintStream tableOut = new PrintStream(BucketUtils.createFile(outputTablesPath))) {
        RecalUtils.outputRecalibrationReport(
                tableOut,
                bqsrArgs,
                report.getQuantizationInfo(),
                report.getRecalibrationTables(),
                report.getCovariates());
    }
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) ReadContextData(org.broadinstitute.hellbender.engine.ReadContextData) GATKVariant(org.broadinstitute.hellbender.utils.variant.GATKVariant) PrintStream(java.io.PrintStream) VariantsSparkSource(org.broadinstitute.hellbender.engine.spark.datasources.VariantsSparkSource) RecalibrationReport(org.broadinstitute.hellbender.utils.recalibration.RecalibrationReport)

Example 5 with GATKRead

use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.

the class CollectBaseDistributionByCycleSpark method calculateBaseDistributionByCycle.

/**
 * Builds a {@code BaseDistributionByCycleMetrics} metrics file from the given reads.
 * Reads are first filtered with a {@code MetricsReadFilter} configured from {@code pfReadsOnly} and
 * {@code alignedReadsOnly}; the surviving reads are aggregated into a {@code HistogramGenerator},
 * which is then added to the metrics file.
 *
 * @param reads the reads to analyse
 * @return the metrics file obtained from {@code getMetricsFile()} after the histogram has been added to it
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads) {
    final MetricsReadFilter readFilter = new MetricsReadFilter(this.pfReadsOnly, this.alignedReadsOnly);
    final HistogramGenerator histogram = reads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(
                    new HistogramGenerator(),
                    (partial, candidate) -> partial.addRead(candidate),
                    (left, right) -> left.merge(right));
    final MetricsFile<BaseDistributionByCycleMetrics, Integer> result = getMetricsFile();
    histogram.addToMetricsFile(result);
    return result;
}
Also used : GATKRead(org.broadinstitute.hellbender.utils.read.GATKRead) BaseDistributionByCycleMetrics(org.broadinstitute.hellbender.tools.picard.analysis.BaseDistributionByCycleMetrics) MetricsReadFilter(org.broadinstitute.hellbender.engine.filters.MetricsReadFilter)

Aggregations

GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead): 457; Test (org.testng.annotations.Test): 286; BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest): 163; SAMFileHeader (htsjdk.samtools.SAMFileHeader): 87; SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval): 59; JavaSparkContext (org.apache.spark.api.java.JavaSparkContext): 40; ArrayList (java.util.ArrayList): 34; Collectors (java.util.stream.Collectors): 34; List (java.util.List): 30; Cigar (htsjdk.samtools.Cigar): 29; File (java.io.File): 28; java.util (java.util): 28; DataProvider (org.testng.annotations.DataProvider): 28; JavaRDD (org.apache.spark.api.java.JavaRDD): 26; Haplotype (org.broadinstitute.hellbender.utils.haplotype.Haplotype): 26; Assert (org.testng.Assert): 25; ReadPileup (org.broadinstitute.hellbender.utils.pileup.ReadPileup): 24; SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord): 22; Argument (org.broadinstitute.barclay.argparser.Argument): 18; UserException (org.broadinstitute.hellbender.exceptions.UserException): 18