Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.
The class ReadsSparkSink, method writeReadsADAM.
private static void writeReadsADAM(final JavaSparkContext ctx, final String outputFile,
                                   final JavaRDD<SAMRecord> reads, final SAMFileHeader header) throws IOException {
    final SequenceDictionary seqDict = SequenceDictionary.fromSAMSequenceDictionary(header.getSequenceDictionary());
    final RecordGroupDictionary readGroups = RecordGroupDictionary.fromSAMHeader(header);
    final JavaPairRDD<Void, AlignmentRecord> rddAlignmentRecords = reads.map(read -> {
        read.setHeaderStrict(header);
        AlignmentRecord alignmentRecord = GATKReadToBDGAlignmentRecordConverter.convert(read, seqDict, readGroups);
        read.setHeaderStrict(null);
        return alignmentRecord;
    }).mapToPair(alignmentRecord -> new Tuple2<>(null, alignmentRecord));
    // Instantiating a Job is necessary here in order to set the Hadoop Configuration...
    final Job job = Job.getInstance(ctx.hadoopConfiguration());
    // ...which carries a config property that the AvroParquetOutputFormat needs when writing data. Specifically,
    // we are writing the Avro schema to the Configuration as a JSON string. The AvroParquetOutputFormat class knows
    // how to translate objects in the Avro data model to the Parquet primitives that get written.
    AvroParquetOutputFormat.setSchema(job, AlignmentRecord.getClassSchema());
    deleteHadoopFile(outputFile, ctx.hadoopConfiguration());
    rddAlignmentRecords.saveAsNewAPIHadoopFile(outputFile, Void.class, AlignmentRecord.class,
            AvroParquetOutputFormat.class, job.getConfiguration());
}
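The same Void-keyed Parquet save pattern generalizes to any Avro-generated record type. The following is a minimal sketch, not part of the GATK source above: the class MyAvroRecord and the helper method name are illustrative assumptions, while the Job, setSchema, and saveAsNewAPIHadoopFile calls mirror the snippet.

// Hedged sketch: persist an RDD of a hypothetical Avro-generated class "MyAvroRecord" as Parquet,
// following the pattern above. AvroParquetOutputFormat ignores the Void key.
static void saveAvroRddAsParquet(final JavaSparkContext ctx,
                                 final JavaRDD<MyAvroRecord> records,
                                 final String outputFile) throws IOException {
    final JavaPairRDD<Void, MyAvroRecord> keyed = records.mapToPair(r -> new Tuple2<>(null, r));
    final Job job = Job.getInstance(ctx.hadoopConfiguration());
    // Publish the Avro schema into the Configuration so the output format can map Avro objects to Parquet.
    AvroParquetOutputFormat.setSchema(job, MyAvroRecord.getClassSchema());
    keyed.saveAsNewAPIHadoopFile(outputFile, Void.class, MyAvroRecord.class,
            AvroParquetOutputFormat.class, job.getConfiguration());
}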
Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.
The class ReadsSparkSource, method getParallelReads.
/**
* Loads reads using Hadoop-BAM. For local files, the BAM must be given with a fully-qualified path,
* e.g., file:///path/to/bam.bam.
* @param readFileName file to load
* @param referencePath Reference path or null if not available. Reference is required for CRAM files.
* @param intervals intervals of reads to include. If <code>null</code> then all the reads (both mapped and unmapped) will be returned.
* @param splitSize maximum bytes of BAM file to read into a single partition; increasing this results in fewer partitions. A value of zero means
* use the default split size (determined by the Hadoop input format, typically the size of one HDFS block).
* @return RDD of (SAMRecord-backed) GATKReads from the file.
*/
public JavaRDD<GATKRead> getParallelReads(final String readFileName, final String referencePath,
                                          final List<SimpleInterval> intervals, final long splitSize) {
    SAMFileHeader header = getHeader(readFileName, referencePath);
    // use the Hadoop configuration attached to the Spark context to maintain cumulative settings
    final Configuration conf = ctx.hadoopConfiguration();
    if (splitSize > 0) {
        conf.set("mapreduce.input.fileinputformat.split.maxsize", Long.toString(splitSize));
    }
    final JavaPairRDD<LongWritable, SAMRecordWritable> rdd2;
    setHadoopBAMConfigurationProperties(readFileName, referencePath);
    boolean isBam = IOUtils.isBamFileName(readFileName);
    if (isBam && intervals != null && !intervals.isEmpty()) {
        BAMInputFormat.setIntervals(conf, intervals);
    } else {
        conf.unset(BAMInputFormat.INTERVALS_PROPERTY);
    }
    rdd2 = ctx.newAPIHadoopFile(readFileName, AnySAMInputFormat.class, LongWritable.class,
            SAMRecordWritable.class, conf);
    JavaRDD<GATKRead> reads = rdd2.map(v1 -> {
        SAMRecord sam = v1._2().get();
        if (isBam || samRecordOverlaps(sam, intervals)) {
            return (GATKRead) SAMRecordToGATKReadAdapter.headerlessReadAdapter(sam);
        }
        return null;
    }).filter(v1 -> v1 != null);
    return putPairsInSamePartition(header, reads);
}
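A hedged usage sketch of the method above. The ReadsSparkSource constructor form and the file path are assumptions for illustration; the argument meanings follow the javadoc.

// Hedged sketch: load all reads from a local BAM with the default split size.
final ReadsSparkSource readsSource = new ReadsSparkSource(ctx);  // constructor form is an assumption
final JavaRDD<GATKRead> reads = readsSource.getParallelReads(
        "file:///path/to/sample.bam",  // fully-qualified path, as required for local files
        null,                          // reference path; only required for CRAM inputs
        null,                          // null intervals: return all mapped and unmapped reads
        0);                            // 0: use the input format's default split size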
Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.
The class HaplotypeCallerSpark, method createReadShards.
/**
* Create an RDD of {@link Shard} from an RDD of {@link GATKRead}
* @param shardBoundariesBroadcast broadcast of an {@link OverlapDetector} loaded with the intervals that should be used for creating ReadShards
* @param reads RDD of {@link GATKRead}
* @return an RDD of reads grouped into potentially overlapping shards
*/
private static JavaRDD<Shard<GATKRead>> createReadShards(final Broadcast<OverlapDetector<ShardBoundary>> shardBoundariesBroadcast,
                                                         final JavaRDD<GATKRead> reads) {
    final JavaPairRDD<ShardBoundary, GATKRead> paired = reads.flatMapToPair(read -> {
        final Collection<ShardBoundary> overlappingShards = shardBoundariesBroadcast.value().getOverlaps(read);
        return overlappingShards.stream().map(key -> new Tuple2<>(key, read)).iterator();
    });
    final JavaPairRDD<ShardBoundary, Iterable<GATKRead>> shardsWithReads = paired.groupByKey();
    return shardsWithReads.map(shard -> new SparkReadShard(shard._1(), shard._2()));
}
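The pair-then-group idea above can be shown in isolation with plain types. The following standalone sketch is not GATK code: integers stand in for reads and 10-wide windows stand in for shard boundaries (assuming the usual java.util and scala.Tuple2 imports). A value whose 5-base span crosses a window boundary is emitted into both windows, just as a read overlapping two shards is emitted into both shards above.

// Hedged sketch of the flatMapToPair + groupByKey sharding pattern with toy data.
final JavaRDD<Integer> starts = ctx.parallelize(Arrays.asList(3, 8, 12, 27));
final JavaPairRDD<Integer, Integer> windowed = starts.flatMapToPair(v -> {
    final List<Tuple2<Integer, Integer>> pairs = new ArrayList<>();
    for (int window = v / 10; window <= (v + 4) / 10; window++) {  // every 10-wide window overlapping [v, v+4]
        pairs.add(new Tuple2<>(window, v));
    }
    return pairs.iterator();
});
final JavaPairRDD<Integer, Iterable<Integer>> byWindow = windowed.groupByKey();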
Use of org.apache.spark.api.java.JavaPairRDD in project gatk by broadinstitute.
The class ShuffleJoinReadsWithRefBases, method addBases.
/**
* Joins each read of an RDD<GATKRead> with that read's corresponding reference sequence.
*
* @param referenceDataflowSource The source of the reference sequence information
* @param reads The reads for which to extract reference sequence information
* @return The JavaPairRDD that contains each read along with the corresponding ReferenceBases object
*/
public static JavaPairRDD<GATKRead, ReferenceBases> addBases(final ReferenceMultiSource referenceDataflowSource,
                                                             final JavaRDD<GATKRead> reads) {
    // TODO: reimpl this method by calling out to the more complex version?
    SerializableFunction<GATKRead, SimpleInterval> windowFunction = referenceDataflowSource.getReferenceWindowFunction();
    JavaPairRDD<ReferenceShard, GATKRead> shardRead = reads.mapToPair(gatkRead -> {
        ReferenceShard shard = ReferenceShard.getShardNumberFromInterval(windowFunction.apply(gatkRead));
        return new Tuple2<>(shard, gatkRead);
    });
    JavaPairRDD<ReferenceShard, Iterable<GATKRead>> shardiRead = shardRead.groupByKey();
    return shardiRead.flatMapToPair(in -> {
        List<Tuple2<GATKRead, ReferenceBases>> out = Lists.newArrayList();
        Iterable<GATKRead> iReads = in._2();
        final List<SimpleInterval> readWindows = Utils.stream(iReads).map(read -> windowFunction.apply(read)).collect(Collectors.toList());
        SimpleInterval interval = IntervalUtils.getSpanningInterval(readWindows);
        ReferenceBases bases = referenceDataflowSource.getReferenceBases(null, interval);
        for (GATKRead r : iReads) {
            final ReferenceBases subset = bases.getSubset(windowFunction.apply(r));
            out.add(new Tuple2<>(r, subset));
        }
        return out.iterator();
    });
}
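A hedged usage sketch of the join above. The referenceSource and reads variables are assumed to exist already, and the per-pair work is left as a placeholder.

// Hedged sketch: pair each read with its reference window and iterate over the results.
final JavaPairRDD<GATKRead, ReferenceBases> readsWithRef =
        ShuffleJoinReadsWithRefBases.addBases(referenceSource, reads);
readsWithRef.foreach(pair -> {
    final GATKRead read = pair._1();
    final ReferenceBases refBases = pair._2();
    // per-read work against its reference bases goes here
});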
Use of org.apache.spark.api.java.JavaPairRDD in project gatk-protected by broadinstitute.
The class CoverageModelEMWorkspace, method instantiateWorkers.
/**
* Instantiate compute block(s). If Spark is disabled, a single {@link CoverageModelEMComputeBlock} is
* instantiated. Otherwise, a {@link JavaPairRDD} of compute nodes will be created.
*/
private void instantiateWorkers() {
    if (sparkContextIsAvailable) {
        /* initialize the RDD */
        logger.info("Initializing an RDD of compute blocks");
        computeRDD = ctx.parallelizePairs(
                targetBlockStream()
                        .map(tb -> new Tuple2<>(tb, new CoverageModelEMComputeBlock(tb, numSamples, numLatents, ardEnabled)))
                        .collect(Collectors.toList()),
                numTargetBlocks)
                .partitionBy(new HashPartitioner(numTargetBlocks))
                .cache();
    } else {
        logger.info("Initializing a local compute block");
        localComputeBlock = new CoverageModelEMComputeBlock(targetBlocks.get(0), numSamples, numLatents, ardEnabled);
    }
    prevCheckpointedComputeRDD = null;
    cacheCallCounter = 0;
}
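The Spark branch above combines parallelizePairs with an explicit HashPartitioner, presumably so that each target block maps to a deterministic partition, and cache() keeps the blocks in memory between EM passes. A standalone sketch of that pattern with toy data follows; none of the names below come from the class above.

// Hedged sketch: key three toy blocks, pin each key to a fixed partition, and cache the result.
final List<Tuple2<Integer, String>> blocks = Arrays.asList(
        new Tuple2<>(0, "block-0"), new Tuple2<>(1, "block-1"), new Tuple2<>(2, "block-2"));
final JavaPairRDD<Integer, String> keyedBlocks = ctx.parallelizePairs(blocks, 3)
        .partitionBy(new HashPartitioner(3))  // co-locate each key on a deterministic partition
        .cache();                             // keep blocks in memory across iterations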