Use of org.seqdoop.hadoop_bam.SAMRecordWritable in project gatk by broadinstitute.
From the class ReadsSparkSource, method getParallelReads.
/**
 * Loads reads using Hadoop-BAM. For local files, the bam must be given as a fully-qualified path,
 * i.e., file:///path/to/bam.bam.
 * @param readFileName file to load
 * @param referencePath reference path, or null if not available. A reference is required for CRAM files.
 * @param intervals intervals of reads to include. If <code>null</code>, all reads (both mapped and unmapped) are returned.
 * @param splitSize maximum number of bytes of the bam file to read into a single partition; increasing this results in
 *                  fewer partitions. A value of zero means use the default split size (determined by the Hadoop input
 *                  format, typically the size of one HDFS block).
 * @return RDD of (SAMRecord-backed) GATKReads from the file.
 */
public JavaRDD<GATKRead> getParallelReads(final String readFileName, final String referencePath, final List<SimpleInterval> intervals, final long splitSize) {
    SAMFileHeader header = getHeader(readFileName, referencePath);

    // Use the Hadoop configuration attached to the Spark context to maintain cumulative settings.
    final Configuration conf = ctx.hadoopConfiguration();
    if (splitSize > 0) {
        conf.set("mapreduce.input.fileinputformat.split.maxsize", Long.toString(splitSize));
    }

    final JavaPairRDD<LongWritable, SAMRecordWritable> rdd2;

    setHadoopBAMConfigurationProperties(readFileName, referencePath);

    boolean isBam = IOUtils.isBamFileName(readFileName);
    if (isBam && intervals != null && !intervals.isEmpty()) {
        // For BAM inputs, interval filtering can be pushed down into the Hadoop-BAM input format.
        BAMInputFormat.setIntervals(conf, intervals);
    } else {
        conf.unset(BAMInputFormat.INTERVALS_PROPERTY);
    }

    rdd2 = ctx.newAPIHadoopFile(readFileName, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, conf);

    JavaRDD<GATKRead> reads = rdd2.map(v1 -> {
        SAMRecord sam = v1._2().get();
        // BAM records were already restricted to the requested intervals by the input format;
        // records from other formats are checked against the intervals here instead.
        if (isBam || samRecordOverlaps(sam, intervals)) {
            return (GATKRead) SAMRecordToGATKReadAdapter.headerlessReadAdapter(sam);
        }
        return null;
    }).filter(v1 -> v1 != null);

    return putPairsInSamePartition(header, reads);
}
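Below is a minimal sketch of how getParallelReads might be invoked from a Spark application. The Spark master, file path, interval coordinates, and split size are illustrative assumptions, not values from the GATK source; it assumes the single-argument ReadsSparkSource(JavaSparkContext) constructor from the GATK codebase.

import java.util.Collections;
import java.util.List;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.read.GATKRead;

public class GetParallelReadsExample {
    public static void main(String[] args) {
        // Local Spark context for the sketch; a real tool would get this from its engine.
        final JavaSparkContext ctx = new JavaSparkContext("local[*]", "getParallelReadsExample");
        final ReadsSparkSource readsSource = new ReadsSparkSource(ctx);

        // Restrict the load to a single interval on contig 20 (hypothetical coordinates).
        final List<SimpleInterval> intervals =
                Collections.singletonList(new SimpleInterval("20", 1_000_000, 2_000_000));

        // Local files need a fully-qualified file:// path. A zero split size would fall
        // back to the input format's default (typically one HDFS block per partition).
        final JavaRDD<GATKRead> reads = readsSource.getParallelReads(
                "file:///path/to/example.bam",   // hypothetical path
                null,                            // reference path; required only for CRAM inputs
                intervals,
                128L * 1024 * 1024);             // ~128 MB per partition

        System.out.println("Reads loaded: " + reads.count());
        ctx.stop();
    }
}

Note that the final putPairsInSamePartition step rearranges reads so that mates of a pair do not straddle a partition boundary, which lets downstream per-pair processing work within a single partition.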