use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.
the class PSPairedUnpairedSplitterSpark method mapPartitionsToPairedAndUnpairedLists.
/**
 * Splits the reads of a single partition into paired and unpaired reads by matching read names.
 *
 * Reads are consumed in iteration order. The first time a name is seen the read is held back; when a
 * second read with the same name arrives, both are moved to the paired list (the newly encountered
 * read first, immediately followed by the mate that was held back). Names are matched two at a time,
 * so a name occurring an odd number of times leaves one read in the unpaired list.
 *
 * @param iter                    reads of the partition
 * @param readsPerPartitionGuess  initial capacity used for the working collections
 * @return an iterator over exactly one Tuple: the first List holds the paired reads, the second List
 *         holds the unpaired reads
 */
private static Iterator<Tuple2<List<GATKRead>, List<GATKRead>>> mapPartitionsToPairedAndUnpairedLists(final Iterator<GATKRead> iter, final int readsPerPartitionGuess) {
    final List<GATKRead> matchedReads = new ArrayList<>(readsPerPartitionGuess);
    final Map<String, GATKRead> waitingForMate = new HashMap<>(readsPerPartitionGuess);

    while (iter.hasNext()) {
        final GATKRead current = iter.next();
        final String name = current.getName();
        //Try to claim a previously seen read with the same name; stored values are never null, so a
        //null result means no mate has been seen yet
        final GATKRead mate = waitingForMate.remove(name);
        if (mate == null) {
            waitingForMate.put(name, current);
        } else {
            //Keep mates adjacent in the ordered List: the later read first, then the earlier one
            matchedReads.add(current);
            matchedReads.add(mate);
        }
    }

    //Whatever is still waiting for a mate is unpaired
    final List<GATKRead> unpairedReadsList = new ArrayList<>(waitingForMate.values());

    //Copy the paired reads into an exactly-sized list so the result does not retain the capacity
    //that was allocated from readsPerPartitionGuess
    final List<GATKRead> compactPairedReads = new ArrayList<>(matchedReads);

    //Wrap both lists in a single-element iterator
    final Tuple2<List<GATKRead>, List<GATKRead>> pairedAndUnpaired = new Tuple2<>(compactPairedReads, unpairedReadsList);
    return Collections.singletonList(pairedAndUnpaired).iterator();
}
use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.
the class CountBasesSpark method runTool.
/**
 * Sums the lengths of all reads returned by {@code getReads()}, prints the total to standard output
 * and, when {@code out} is set, also writes it to the file created for {@code out}.
 *
 * An input with no reads yields a count of 0.
 *
 * @param ctx the Spark context this tool runs in
 */
@Override
protected void runTool(final JavaSparkContext ctx) {
    final JavaRDD<GATKRead> reads = getReads();
    // fold with a zero value instead of reduce: reduce throws UnsupportedOperationException on an
    // empty RDD, whereas an input without reads should simply report 0 bases
    final long count = reads.map(r -> (long) r.getLength()).fold(0L, Long::sum);
    System.out.println(count);
    if (out != null) {
        // try-with-resources so the output stream is flushed and closed even if printing fails
        try (final PrintStream ps = new PrintStream(BucketUtils.createFile(out))) {
            ps.print(count);
        }
    }
}
use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.
the class ApplyBQSRSpark method runTool.
/**
 * Applies base quality score recalibration to the input reads and writes the result.
 *
 * The recalibration report is loaded from {@code bqsrRecalFile}, broadcast through the Spark context,
 * handed to {@code ApplyBQSRSparkFn.apply} together with the reads and their header, and the
 * resulting reads are written to {@code output}.
 *
 * @param ctx the Spark context used for broadcasting the report and writing the reads
 */
@Override
protected void runTool(JavaSparkContext ctx) {
    final JavaRDD<GATKRead> inputReads = getReads();

    // null if we have no api key
    // NOTE(review): the value is not used below; the call is kept in case it has side effects
    final GCSOptions gcsOptions = getAuthenticatedGCSOptions();

    // Load the report first, then share it through a broadcast variable
    final RecalibrationReport recalibrationReport = new RecalibrationReport(BucketUtils.openFile(bqsrRecalFile));
    final Broadcast<RecalibrationReport> reportBroadcast = ctx.broadcast(recalibrationReport);

    final JavaRDD<GATKRead> recalibratedReads = ApplyBQSRSparkFn.apply(
            inputReads,
            reportBroadcast,
            getHeaderForReads(),
            applyBQSRArgs);

    writeReads(ctx, output, recalibratedReads);
}
use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.
the class BaseRecalibratorSpark method runTool.
/**
 * Builds a recalibration report from the input reads and the known variants, then writes it to
 * {@code outputTablesPath}.
 *
 * @param ctx the Spark context used to load the known variants and to attach context data to reads
 * @throws UserException.Require2BitReferenceForBroadcast if the BROADCAST join strategy is selected
 *         and the reference reports that it is not compatible with Spark broadcast
 */
@Override
protected void runTool(JavaSparkContext ctx) {
    // Fail fast: the reference is only queried when the BROADCAST strategy was actually requested
    final boolean broadcastRequested = joinStrategy == JoinStrategy.BROADCAST;
    if (broadcastRequested && !getReference().isCompatibleWithSparkBroadcast()) {
        throw new UserException.Require2BitReferenceForBroadcast();
    }

    final JavaRDD<GATKRead> reads = getReads();
    final JavaRDD<GATKVariant> knownSites =
            new VariantsSparkSource(ctx).getParallelVariants(knownVariants, getIntervals());

    // TODO: Look into broadcasting the reference to all of the workers. This would make AddContextDataToReadSpark
    // TODO: and ApplyBQSRStub simpler (#855).
    final JavaPairRDD<GATKRead, ReadContextData> readsWithContext = AddContextDataToReadSpark.add(
            ctx,
            reads,
            getReference(),
            knownSites,
            joinStrategy,
            getReferenceSequenceDictionary(),
            readShardSize,
            readShardPadding);

    // TODO: broadcast the reads header?
    final RecalibrationReport report = BaseRecalibratorSparkFn.apply(
            readsWithContext,
            getHeaderForReads(),
            getReferenceSequenceDictionary(),
            bqsrArgs);

    // try-with-resources closes the report stream once the tables have been written
    try (final PrintStream tablesOut = new PrintStream(BucketUtils.createFile(outputTablesPath))) {
        RecalUtils.outputRecalibrationReport(
                tablesOut,
                bqsrArgs,
                report.getQuantizationInfo(),
                report.getRecalibrationTables(),
                report.getCovariates());
    }
}
use of org.broadinstitute.hellbender.utils.read.GATKRead in project gatk by broadinstitute.
the class CollectBaseDistributionByCycleSpark method calculateBaseDistributionByCycle.
/**
 * Computes the base distribution by cycle for the given reads and adds it to a metrics file.
 *
 * Reads are first restricted by a {@code MetricsReadFilter} configured from {@code pfReadsOnly} and
 * {@code alignedReadsOnly}. The remaining reads are aggregated into a {@code HistogramGenerator}
 * ({@code addRead} for each read, {@code merge} to combine partial results), which then adds its
 * contents to the metrics file obtained from {@code getMetricsFile()}.
 *
 * @param reads the reads to collect the distribution from
 * @return the metrics file populated from the aggregated histogram
 */
public MetricsFile<BaseDistributionByCycleMetrics, Integer> calculateBaseDistributionByCycle(final JavaRDD<GATKRead> reads) {
    final MetricsReadFilter readFilter = new MetricsReadFilter(pfReadsOnly, alignedReadsOnly);

    // Filter and aggregate in one pipeline; the zero value is a fresh, empty generator
    final HistogramGenerator aggregated = reads
            .filter(candidate -> readFilter.test(candidate))
            .aggregate(
                    new HistogramGenerator(),
                    (generator, candidate) -> generator.addRead(candidate),
                    (left, right) -> left.merge(right));

    final MetricsFile<BaseDistributionByCycleMetrics, Integer> result = getMetricsFile();
    aggregated.addToMetricsFile(result);
    return result;
}
Aggregations