Search in sources :

Example 81 with SAMReadGroupRecord

use of htsjdk.samtools.SAMReadGroupRecord in project jvarkit by lindenb.

the class BamStats04 method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.bedFile == null || !this.bedFile.exists()) {
        LOG.error("undefined option -B (bed file)");
        return -1;
    }
    if (args.isEmpty()) {
        LOG.error("Bam files missing");
        return -1;
    }
    if (this.minCoverages.isEmpty()) {
        this.minCoverages.add(0);
    }
    final String NO_PARTITION = "N/A";
    BufferedReader bedIn = null;
    final List<SamReader> samReaders = new ArrayList<>(args.size());
    PrintWriter pw = null;
    ReferenceGenome referenceGenome = null;
    ReferenceContig referenceContig = null;
    try {
        final BedLineCodec codec = new BedLineCodec();
        final Set<String> all_partitions = new TreeSet<>();
        bedIn = IOUtils.openFileForBufferedReading(this.bedFile);
        SAMSequenceDictionary dict = null;
        for (final String filename : IOUtils.unrollFiles(args)) {
            LOG.info(filename);
            final SamReader samReader = super.openSamReader(filename);
            if (!samReader.hasIndex()) {
                LOG.error(filename + " is not indexed");
                samReader.close();
                return -1;
            }
            final SAMFileHeader samFileheader = samReader.getFileHeader();
            if (samFileheader == null) {
                LOG.error("SAM file is missing a header " + filename);
                return -1;
            }
            final List<SAMReadGroupRecord> readGroups = samFileheader.getReadGroups();
            if (readGroups == null || readGroups.isEmpty()) {
                LOG.warn("No Read group (RG) in the header of " + filename);
                all_partitions.add(NO_PARTITION);
            } else {
                for (final SAMReadGroupRecord rg : readGroups) {
                    all_partitions.add(this.partition.apply(rg, NO_PARTITION));
                }
            }
            final SAMSequenceDictionary d = samFileheader.getSequenceDictionary();
            if (d == null) {
                samReader.close();
                LOG.error(JvarkitException.BamDictionaryMissing.getMessage(filename));
                return -1;
            }
            samReaders.add(samReader);
            if (dict == null) {
                dict = d;
            } else if (SequenceUtil.areSequenceDictionariesEqual(d, dict)) {
                LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(d, dict));
                return -1;
            }
        }
        if (samReaders.isEmpty()) {
            LOG.error("No Bam defined");
            return -1;
        }
        if (!StringUtil.isBlank(this.faidxUri)) {
            referenceGenome = new ReferenceGenomeFactory().open(this.faidxUri);
        }
        pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        pw.print("#chrom\tstart\tend\tlength\t" + this.partition.name() + (referenceGenome == null ? "" : "\tgc_percent"));
        pw.print("\tmincov\tmaxcov");
        for (final int MIN_COVERAGE : this.minCoverages) {
            pw.print("\tmeancov_" + MIN_COVERAGE + "\tmediancov_" + MIN_COVERAGE + "\tnocoveragebp_" + MIN_COVERAGE + "\tpercentcovered_" + MIN_COVERAGE);
        }
        pw.println();
        String line = null;
        while ((line = bedIn.readLine()) != null) {
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final BedLine bedLine = codec.decode(line);
            if (bedLine == null)
                continue;
            if (dict.getSequence(bedLine.getContig()) == null) {
                LOG.error("Unknown contig in " + line);
                return -1;
            }
            if (bedLine.getStart() > bedLine.getEnd()) {
                LOG.info("ignoring " + bedLine);
                continue;
            }
            if (referenceGenome != null && (referenceContig == null || !referenceContig.hasName(bedLine.getContig()))) {
                referenceContig = referenceGenome.getContig(bedLine.getContig());
            }
            final Map<String, IntervalStat> sample2stats = new HashMap<>(all_partitions.size());
            for (final String rgId : all_partitions) {
                sample2stats.put(rgId, new IntervalStat(bedLine));
            }
            for (final SamReader samReader : samReaders) {
                /**
                 *     start - 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
                 *	   end - 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
                 */
                final SAMRecordIterator r = samReader.queryOverlapping(bedLine.getContig(), bedLine.getStart(), bedLine.getEnd());
                while (r.hasNext()) {
                    final SAMRecord rec = r.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (this.filter.filterOut(rec))
                        continue;
                    if (!rec.getReferenceName().equals(bedLine.getContig()))
                        continue;
                    final String partition;
                    final SAMReadGroupRecord group = rec.getReadGroup();
                    if (group == null) {
                        partition = NO_PARTITION;
                    } else {
                        final String name = this.partition.apply(group);
                        partition = (StringUtil.isBlank(name) ? NO_PARTITION : name);
                    }
                    IntervalStat stat = sample2stats.get(partition);
                    if (stat == null) {
                        stat = new IntervalStat(bedLine);
                        sample2stats.put(partition, stat);
                    }
                    stat.visit(rec);
                }
                r.close();
            }
            // end of loop over sam Readers
            final OptionalInt gcPercentInt = (referenceContig == null ? OptionalInt.empty() : referenceContig.getGCPercent(bedLine.getStart() - 1, bedLine.getEnd()).getGCPercentAsInteger());
            for (final String partitionName : sample2stats.keySet()) {
                final IntervalStat stat = sample2stats.get(partitionName);
                Arrays.sort(stat.counts);
                pw.print(bedLine.getContig() + "\t" + (bedLine.getStart() - 1) + "\t" + (bedLine.getEnd()) + "\t" + stat.counts.length + "\t" + partitionName);
                if (referenceGenome != null) {
                    pw.print("\t");
                    if (gcPercentInt.isPresent())
                        pw.print(gcPercentInt.getAsInt());
                }
                pw.print("\t" + stat.counts[0] + "\t" + stat.counts[stat.counts.length - 1]);
                for (final int MIN_COVERAGE : this.minCoverages) {
                    /**
                     * map depth to 0 if depth <= MIN_COVERAGE
                     */
                    final IntUnaryOperator depthAdjuster = (D) -> (D <= MIN_COVERAGE ? 0 : D);
                    final int count_no_coverage = (int) Arrays.stream(stat.counts).filter(D -> depthAdjuster.applyAsInt(D) <= 0).count();
                    final double mean = Percentile.average().evaluate(Arrays.stream(stat.counts).map(depthAdjuster));
                    final double median_depth = Percentile.median().evaluate(Arrays.stream(stat.counts).map(depthAdjuster));
                    pw.print("\t" + mean + "\t" + median_depth + "\t" + count_no_coverage + "\t" + (int) (((stat.counts.length - count_no_coverage) / (double) stat.counts.length) * 100.0));
                }
                pw.println();
            }
        }
        pw.flush();
        pw.close();
        pw = null;
        LOG.info("done");
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(referenceGenome);
        CloserUtil.close(pw);
        CloserUtil.close(bedIn);
        CloserUtil.close(samReaders);
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IntUnaryOperator(java.util.function.IntUnaryOperator) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) HashMap(java.util.HashMap) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) OptionalInt(java.util.OptionalInt) SAMFileHeader(htsjdk.samtools.SAMFileHeader) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) StringUtil(htsjdk.samtools.util.StringUtil) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) SamReader(htsjdk.samtools.SamReader) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Percentile(com.github.lindenb.jvarkit.math.stats.Percentile) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) List(java.util.List) SamRecordJEXLFilter(com.github.lindenb.jvarkit.util.samtools.SamRecordJEXLFilter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) BufferedReader(java.io.BufferedReader) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) IntUnaryOperator(java.util.function.IntUnaryOperator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) TreeSet(java.util.TreeSet) PrintWriter(java.io.PrintWriter) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) OptionalInt(java.util.OptionalInt) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SAMRecord(htsjdk.samtools.SAMRecord) BufferedReader(java.io.BufferedReader) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)81 SAMFileHeader (htsjdk.samtools.SAMFileHeader)48 SAMRecord (htsjdk.samtools.SAMRecord)33 Test (org.testng.annotations.Test)31 SamReader (htsjdk.samtools.SamReader)29 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)26 File (java.io.File)23 ArrayList (java.util.ArrayList)22 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)20 GATKRead (org.broadinstitute.hellbender.utils.read.GATKRead)20 HashMap (java.util.HashMap)18 CigarElement (htsjdk.samtools.CigarElement)17 Cigar (htsjdk.samtools.Cigar)16 HashSet (java.util.HashSet)16 SAMFileWriter (htsjdk.samtools.SAMFileWriter)15 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)15 CigarOperator (htsjdk.samtools.CigarOperator)14 IOException (java.io.IOException)14 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)13 List (java.util.List)12