
Example 1 with SAMRecordWritable

Use of org.seqdoop.hadoop_bam.SAMRecordWritable in the project gatk by broadinstitute.

From the class ReadsSparkSource, the method getParallelReads:

/**
     * Loads Reads using Hadoop-BAM. For local files, the bam must have a fully-qualified path,
     * e.g., file:///path/to/bam.bam.
     * @param readFileName file to load
     * @param referencePath Reference path or null if not available. Reference is required for CRAM files.
     * @param intervals intervals of reads to include. If <code>null</code> then all the reads (both mapped and unmapped) will be returned.
     * @param splitSize maximum bytes of bam file to read into a single partition, increasing this will result in fewer partitions. A value of zero means
     *                  use the default split size (determined by the Hadoop input format, typically the size of one HDFS block).
     * @return RDD of (SAMRecord-backed) GATKReads from the file.
     */
public JavaRDD<GATKRead> getParallelReads(final String readFileName, final String referencePath, final List<SimpleInterval> intervals, final long splitSize) {
    SAMFileHeader header = getHeader(readFileName, referencePath);
    // use the Hadoop configuration attached to the Spark context to maintain cumulative settings
    final Configuration conf = ctx.hadoopConfiguration();
    if (splitSize > 0) {
        conf.set("mapreduce.input.fileinputformat.split.maxsize", Long.toString(splitSize));
    }
    final JavaPairRDD<LongWritable, SAMRecordWritable> rdd2;
    setHadoopBAMConfigurationProperties(readFileName, referencePath);
    boolean isBam = IOUtils.isBamFileName(readFileName);
    if (isBam && intervals != null && !intervals.isEmpty()) {
        // For BAM inputs, push interval filtering down into the input format so
        // that only overlapping regions of the file are read.
        BAMInputFormat.setIntervals(conf, intervals);
    } else {
        conf.unset(BAMInputFormat.INTERVALS_PROPERTY);
    }
    rdd2 = ctx.newAPIHadoopFile(readFileName, AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, conf);
    JavaRDD<GATKRead> reads = rdd2.map(v1 -> {
        SAMRecord sam = v1._2().get();
        // BAM records were already restricted by the input format; records from
        // other formats are filtered against the intervals here, one by one.
        if (isBam || samRecordOverlaps(sam, intervals)) {
            return (GATKRead) SAMRecordToGATKReadAdapter.headerlessReadAdapter(sam);
        }
        return null;
    }).filter(v1 -> v1 != null);
    return putPairsInSamePartition(header, reads);
}
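
For context, a minimal driver sketch of how getParallelReads might be invoked follows. The ReadsSparkSource import path, its single-argument constructor, the file path, and the interval are assumptions for illustration and are not part of the excerpt above.

// Hypothetical driver: runs getParallelReads on a local BAM over one interval.
import java.util.Collections;
import java.util.List;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.broadinstitute.hellbender.engine.spark.datasources.ReadsSparkSource; // assumed package
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.read.GATKRead;

public class GetParallelReadsExample {
    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("getParallelReadsExample").setMaster("local[*]"));
        final ReadsSparkSource source = new ReadsSparkSource(ctx); // assumed constructor
        // Local files need a fully-qualified path, per the Javadoc above.
        final String bam = "file:///data/example.bam"; // hypothetical path
        final List<SimpleInterval> intervals =
                Collections.singletonList(new SimpleInterval("chr1", 1, 1000000));
        // splitSize of 0 keeps the input format's default split size.
        final JavaRDD<GATKRead> reads = source.getParallelReads(bam, null, intervals, 0);
        System.out.println("reads overlapping interval: " + reads.count());
        ctx.stop();
    }
}

Passing null for referencePath is acceptable here because the input is a BAM; per the Javadoc, a reference is required only for CRAM inputs.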
Also used: ReadsDataSource (org.broadinstitute.hellbender.engine.ReadsDataSource), SAMHeaderReader (org.seqdoop.hadoop_bam.util.SAMHeaderReader), FileSystem (org.apache.hadoop.fs.FileSystem), JavaSparkContext (org.apache.spark.api.java.JavaSparkContext), PathFilter (org.apache.hadoop.fs.PathFilter), FlatMapFunction2 (org.apache.spark.api.java.function.FlatMapFunction2), FileStatus (org.apache.hadoop.fs.FileStatus), BDGAlignmentRecordToGATKReadAdapter (org.broadinstitute.hellbender.utils.read.BDGAlignmentRecordToGATKReadAdapter), GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead), PeekingIterator (com.google.common.collect.PeekingIterator), LongWritable (org.apache.hadoop.io.LongWritable), Iterators (com.google.common.collect.Iterators), SAMRecordWritable (org.seqdoop.hadoop_bam.SAMRecordWritable), ArrayList (java.util.ArrayList), AvroParquetInputFormat (org.apache.parquet.avro.AvroParquetInputFormat), BucketUtils (org.broadinstitute.hellbender.utils.gcs.BucketUtils), Configuration (org.apache.hadoop.conf.Configuration), Path (org.apache.hadoop.fs.Path), AlignmentRecord (org.bdgenomics.formats.avro.AlignmentRecord), JavaRDD (org.apache.spark.api.java.JavaRDD), FlatMapFunction (org.apache.spark.api.java.function.FlatMapFunction), Broadcast (org.apache.spark.broadcast.Broadcast), ReadConstants (org.broadinstitute.hellbender.utils.read.ReadConstants), Iterator (java.util.Iterator), IOUtils (org.broadinstitute.hellbender.utils.io.IOUtils), SAMRecordToGATKReadAdapter (org.broadinstitute.hellbender.utils.read.SAMRecordToGATKReadAdapter), BAMInputFormat (org.seqdoop.hadoop_bam.BAMInputFormat), IOException (java.io.IOException), JavaPairRDD (org.apache.spark.api.java.JavaPairRDD), SimpleInterval (org.broadinstitute.hellbender.utils.SimpleInterval), File (java.io.File), Serializable (java.io.Serializable), CRAMInputFormat (org.seqdoop.hadoop_bam.CRAMInputFormat), List (java.util.List), Logger (org.apache.logging.log4j.Logger), UserException (org.broadinstitute.hellbender.exceptions.UserException), Job (org.apache.hadoop.mapreduce.Job), AnySAMInputFormat (org.seqdoop.hadoop_bam.AnySAMInputFormat), SparkUtils (org.broadinstitute.hellbender.utils.spark.SparkUtils), htsjdk.samtools (htsjdk.samtools), LogManager (org.apache.logging.log4j.LogManager)
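
The imports above also show the bare Hadoop-BAM pattern the method is built on: newAPIHadoopFile with AnySAMInputFormat producing (LongWritable, SAMRecordWritable) pairs. A rough standalone sketch follows, assuming only Spark and Hadoop-BAM on the classpath; the path and split size are hypothetical.

// Hypothetical standalone job: reads a BAM into an RDD with Hadoop-BAM directly.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.seqdoop.hadoop_bam.AnySAMInputFormat;
import org.seqdoop.hadoop_bam.SAMRecordWritable;

public class HadoopBamDirectExample {
    public static void main(String[] args) {
        final JavaSparkContext ctx = new JavaSparkContext(
                new SparkConf().setAppName("hadoopBamDirect").setMaster("local[*]"));
        final Configuration conf = ctx.hadoopConfiguration();
        // Same knob the method sets: a smaller max split size yields more partitions.
        conf.set("mapreduce.input.fileinputformat.split.maxsize", Long.toString(64L * 1024 * 1024));
        final JavaPairRDD<LongWritable, SAMRecordWritable> pairs = ctx.newAPIHadoopFile(
                "file:///data/example.bam", // hypothetical path
                AnySAMInputFormat.class, LongWritable.class, SAMRecordWritable.class, conf);
        // Unwrap each SAMRecordWritable into a plain htsjdk SAMRecord field.
        final JavaRDD<String> readNames = pairs.map(t -> t._2().get().getReadName());
        System.out.println("total reads: " + readNames.count());
        ctx.stop();
    }
}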
