Search in sources :

Example 21 with CigarOperator

use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.

the class ExtendReferenceWithReads method scanRegion.

/**
 *scanRegion
 * @param contig    Reference sequence of interest.
 * @param start     0-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
 * @param end       0-based, exclusive end of interval of interest. Zero implies end of the reference sequence.
 */
private Map<Integer, Counter<Byte>> scanRegion(final SAMSequenceRecord contig, final int chromStart, final int chromEnd, final Rescue rescue) {
    final Map<Integer, Counter<Byte>> pos2bases = new HashMap<>(1 + chromEnd - chromStart);
    LOG.info("Scanning :" + contig.getSequenceName() + ":" + chromStart + "-" + chromEnd);
    for (int side = 0; side < 2; ++side) {
        if (// 5' or 3'
        !rescue.equals(Rescue.CENTER) && side > 0) {
            // already done
            break;
        }
        for (final SamReader samReader : samReaders) {
            final SAMSequenceDictionary dict2 = samReader.getFileHeader().getSequenceDictionary();
            final SAMSequenceRecord ssr2 = dict2.getSequence(contig.getSequenceName());
            if (ssr2 == null || ssr2.getSequenceLength() != contig.getSequenceLength()) {
                LOG.info("No contig " + contig.getSequenceName() + " with L=" + contig.getSequenceLength() + " bases in " + samReader.getResourceDescription());
                continue;
            }
            int mappedPos = -1;
            switch(rescue) {
                case LEFT:
                    mappedPos = 1;
                    break;
                case RIGHT:
                    mappedPos = contig.getSequenceLength();
                    break;
                case CENTER:
                    mappedPos = (side == 0 ? chromStart + 1 : chromEnd);
                    break;
                default:
                    throw new IllegalStateException();
            }
            final SAMRecordIterator iter = samReader.query(contig.getSequenceName(), mappedPos, mappedPos, false);
            while (iter.hasNext()) {
                final SAMRecord rec = iter.next();
                if (rec.getReadUnmappedFlag())
                    continue;
                if (this.filter.filterOut(rec))
                    continue;
                final Cigar cigar = rec.getCigar();
                if (cigar == null)
                    continue;
                final byte[] bases = rec.getReadBases();
                if (bases == null || bases.length == 0)
                    continue;
                int refPos1 = rec.getUnclippedStart();
                int readpos = 0;
                for (final CigarElement ce : cigar) {
                    final CigarOperator op = ce.getOperator();
                    for (int L = 0; L < ce.getLength(); ++L) {
                        if (op.consumesReadBases()) {
                            Counter<Byte> count = pos2bases.get(refPos1 - 1);
                            if (count == null) {
                                count = new Counter<>();
                                pos2bases.put(refPos1 - 1, count);
                            }
                            count.incr((byte) Character.toLowerCase(bases[readpos]));
                            readpos++;
                        }
                        if (op.consumesReferenceBases()) {
                            refPos1++;
                        }
                    }
                }
            }
            iter.close();
        }
    }
    return pos2bases;
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) CigarOperator(htsjdk.samtools.CigarOperator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) CigarElement(htsjdk.samtools.CigarElement) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord)

Example 22 with CigarOperator

use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.

the class FindAllCoverageAtPosition method doWork.

@Override
public int doWork(final List<String> args) {
    final Set<Mutation> mutations = new TreeSet<>();
    BufferedReader r = null;
    try {
        if (this.referenceFileFile != null) {
            this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.referenceFileFile);
        }
        mutations.addAll(Arrays.asList(this.positionStr.split("[  ]")).stream().filter(S -> !S.trim().isEmpty()).map(S -> new Mutation(S)).collect(Collectors.toSet()));
        for (final String s : positionStr.split("[  ]")) {
            if (s.trim().isEmpty())
                continue;
            mutations.add(new Mutation(s));
        }
        if (this.positionFile != null) {
            String line;
            r = IOUtils.openFileForBufferedReading(positionFile);
            if (positionFile.getName().endsWith(".bed")) {
                final BedLineCodec codec = new BedLineCodec();
                while ((line = r.readLine()) != null) {
                    final BedLine bedLine = codec.decode(line);
                    if (bedLine == null)
                        continue;
                    for (int x = bedLine.getStart(); x <= bedLine.getEnd(); ++x) {
                        mutations.add(new Mutation(bedLine.getContig(), x));
                    }
                }
            } else {
                while ((line = r.readLine()) != null) {
                    if (line.trim().isEmpty() || line.startsWith("#"))
                        continue;
                    mutations.add(new Mutation(line));
                }
            }
            r.close();
            r = null;
        }
        if (mutations.isEmpty()) {
            LOG.fatal("undefined position \'str\'");
            return -1;
        }
        LOG.info("number of mutations " + mutations.size());
        this.out = this.openFileOrStdoutAsPrintWriter(this.outputFile);
        out.print("#File");
        out.print('\t');
        out.print("CHROM");
        out.print('\t');
        out.print("POS");
        if (this.indexedFastaSequenceFile != null) {
            out.print('\t');
            out.print("REF");
        }
        out.print('\t');
        out.print(this.groupBy.name().toUpperCase());
        out.print('\t');
        out.print("DEPTH");
        for (final CigarOperator op : CigarOperator.values()) {
            out.print('\t');
            out.print(op.name());
        }
        for (char c : BASES_To_PRINT) {
            out.print('\t');
            out.print("Base(" + c + ")");
        }
        out.println();
        if (args.isEmpty()) {
            LOG.info("Reading from stdin");
            r = new BufferedReader(new InputStreamReader(stdin()));
            scan(r, mutations);
            r.close();
            r = null;
        } else {
            for (final String filename : args) {
                LOG.info("Reading from " + filename);
                r = IOUtils.openURIForBufferedReading(filename);
                scan(r, mutations);
                r.close();
                r = null;
            }
        }
        this.out.flush();
        return 0;
    } catch (Exception err) {
        LOG.severe(err);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        CloserUtil.close(this.out);
        CloserUtil.close(r);
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) Arrays(java.util.Arrays) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) CigarElement(htsjdk.samtools.CigarElement) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ValidationStringency(htsjdk.samtools.ValidationStringency) TreeSet(java.util.TreeSet) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) Counter(com.github.lindenb.jvarkit.util.Counter) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SamReader(htsjdk.samtools.SamReader) InputStreamReader(java.io.InputStreamReader) Collectors(java.util.stream.Collectors) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) List(java.util.List) SamRecordJEXLFilter(com.github.lindenb.jvarkit.util.samtools.SamRecordJEXLFilter) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) TreeMap(java.util.TreeMap) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) BufferedReader(java.io.BufferedReader) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SamReaderFactory(htsjdk.samtools.SamReaderFactory) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) InputStreamReader(java.io.InputStreamReader) CigarOperator(htsjdk.samtools.CigarOperator) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) TreeSet(java.util.TreeSet) BufferedReader(java.io.BufferedReader)

Example 23 with CigarOperator

use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.

the class FindAllCoverageAtPosition method scan.

private void scan(final BufferedReader in, final Set<Mutation> mutations) throws Exception {
    final String DEFAULT_SAMPLE_NAME = "(undefined)";
    String line;
    while ((line = in.readLine()) != null) {
        if (out.checkError())
            break;
        if (line.isEmpty() || line.startsWith("#"))
            continue;
        File f = new File(line);
        if (!f.exists())
            continue;
        if (!f.isFile())
            continue;
        if (!f.canRead())
            continue;
        String filename = f.getName();
        if (filename.endsWith(".cram")) {
            LOG.warn("Sorry CRAM is not supported " + filename);
            continue;
        }
        if (!filename.endsWith(".bam"))
            continue;
        SamReader samReader = null;
        SAMRecordIterator iter = null;
        try {
            samReader = this.samReaderFactory.open(f);
            if (!samReader.hasIndex()) {
                LOG.warn("no index for " + f);
                continue;
            }
            final SAMFileHeader header = samReader.getFileHeader();
            for (final Mutation src : mutations) {
                final Map<String, CigarAndBases> sample2count = new TreeMap<String, CigarAndBases>();
                for (SAMReadGroupRecord rg : header.getReadGroups()) {
                    if (rg != null) {
                        String sn = this.groupBy.apply(rg);
                        if (sn != null && !sn.trim().isEmpty()) {
                            sample2count.put(sn, new CigarAndBases());
                        }
                    }
                }
                if (sample2count.isEmpty()) {
                    sample2count.put(DEFAULT_SAMPLE_NAME, new CigarAndBases());
                }
                final Mutation m = convertFromSamHeader(f, header, src);
                if (m == null)
                    continue;
                iter = samReader.query(m.chrom, m.pos - 1, m.pos + 1, false);
                while (iter.hasNext()) {
                    final SAMRecord rec = iter.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (this.filter.filterOut(rec))
                        continue;
                    final Cigar cigar = rec.getCigar();
                    if (cigar == null)
                        continue;
                    final String readString = rec.getReadString().toUpperCase();
                    String sampleName = DEFAULT_SAMPLE_NAME;
                    final SAMReadGroupRecord rg = rec.getReadGroup();
                    if (rg != null) {
                        String sn = groupBy.apply(rg);
                        if (!StringUtil.isBlank(sn)) {
                            sampleName = sn;
                        }
                    }
                    CigarAndBases counter = sample2count.get(sampleName);
                    if (counter == null) {
                        counter = new CigarAndBases();
                        sample2count.put(sampleName, counter);
                    }
                    int ref = rec.getUnclippedStart();
                    int readPos = 0;
                    for (int k = 0; k < cigar.numCigarElements() && ref < m.pos + 1; ++k) {
                        final CigarElement ce = cigar.getCigarElement(k);
                        final CigarOperator op = ce.getOperator();
                        switch(op) {
                            case P:
                                break;
                            case I:
                                {
                                    if (ref == m.pos) {
                                        counter.operators.incr(op);
                                        counter.bases.incr(INSERTION_CHAR);
                                    }
                                    readPos += ce.getLength();
                                    break;
                                }
                            case D:
                            case N:
                            case M:
                            case X:
                            case EQ:
                            case H:
                            case S:
                                {
                                    for (int i = 0; i < ce.getLength(); ++i) {
                                        if (ref == m.pos) {
                                            counter.operators.incr(op);
                                            switch(op) {
                                                case M:
                                                case X:
                                                case EQ:
                                                    counter.bases.incr(readString.charAt(readPos));
                                                    break;
                                                case D:
                                                case N:
                                                    counter.bases.incr(DELETION_CHAR);
                                                    break;
                                                default:
                                                    break;
                                            }
                                            break;
                                        }
                                        if (op.consumesReadBases())
                                            ++readPos;
                                        ref++;
                                    }
                                    break;
                                }
                            default:
                                throw new RuntimeException("unknown operator:" + op);
                        }
                    }
                }
                iter.close();
                iter = null;
                for (final String sample : sample2count.keySet()) {
                    final CigarAndBases counter = sample2count.get(sample);
                    out.print(f);
                    out.print('\t');
                    out.print(m.chrom);
                    out.print('\t');
                    out.print(m.pos);
                    if (this.indexedFastaSequenceFile != null) {
                        out.print('\t');
                        out.print(getReferenceAt(m.chrom, m.pos));
                    }
                    out.print('\t');
                    out.print(sample);
                    out.print('\t');
                    out.print(counter.operators.count(CigarOperator.M) + counter.operators.count(CigarOperator.EQ) + counter.operators.count(CigarOperator.X));
                    for (final CigarOperator op : CigarOperator.values()) {
                        out.print('\t');
                        out.print(counter.operators.count(op));
                    }
                    for (char c : BASES_To_PRINT) {
                        out.print('\t');
                        out.print(counter.bases.count(c));
                    }
                    out.println();
                }
            }
        // end of loop over mutations
        } catch (final Exception err) {
            LOG.error(err);
            throw err;
        } finally {
            CloserUtil.close(iter);
            CloserUtil.close(samReader);
        }
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) CigarOperator(htsjdk.samtools.CigarOperator) TreeMap(java.util.TreeMap) CigarElement(htsjdk.samtools.CigarElement) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile)

Example 24 with CigarOperator

use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.

the class BamToSql method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidxFile == null) {
        LOG.error("ref sequence faidx not defined");
        return -1;
    }
    SAMRecordIterator iter = null;
    SamReader sfr = null;
    PrintWriter out = null;
    GenomicSequence genomicSequence = null;
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    args = new ArrayList<String>(IOUtils.unrollFiles(args));
    try {
        out = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
        out.println("CREATE TABLE IF NOT EXISTS SamFile");
        out.println("(");
        out.println("id INTEGER PRIMARY KEY,");
        out.println("filename TEXT");
        out.println(");");
        out.println("CREATE TABLE IF NOT EXISTS Dictionary");
        out.println("(");
        out.println("id INTEGER PRIMARY KEY,");
        out.println("name TEXT NOT NULL,");
        out.println("length INT NOT NULL,");
        out.println("tid INT NOT NULL,");
        out.println("samfile_id INT NOT NULL,");
        out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id)");
        out.println(");");
        out.println("CREATE TABLE IF NOT EXISTS ReadGroup");
        out.println("(");
        out.println("id INTEGER PRIMARY KEY,");
        out.println("groupId TEXT NOT NULL,");
        out.println("sample TEXT NOT NULL,");
        out.println("samfile_id INT NOT NULL,");
        out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id)");
        out.println(");");
        out.println("CREATE TABLE IF NOT EXISTS Read");
        out.println("(");
        out.println("id INTEGER PRIMARY KEY,");
        out.println("name TEXT NOT NULL,");
        out.println("flag INTEGER NOT NULL,");
        if (this.printflag) {
            for (final SAMFlag flg : SAMFlag.values()) {
                out.println(flg.name() + " INTEGER NOT NULL,");
            }
        }
        out.println("rname TEXT,");
        out.println("pos INTEGER,");
        out.println("mapq INTEGER NOT NULL,");
        out.println("cigar TEXT,");
        out.println("rnext TEXT,");
        out.println("pnext INTEGER,");
        out.println("tlen INTEGER,");
        out.println("sequence TEXT NOT NULL,");
        out.println("qualities TEXT NOT NULL,");
        out.println("samfile_id INT NOT NULL,");
        out.println("group_id INT,");
        out.println("FOREIGN KEY(samfile_id) REFERENCES SamFile(id),");
        out.println("FOREIGN KEY(group_id) REFERENCES ReadGroup(id)");
        out.println(");");
        out.println("CREATE TABLE IF NOT EXISTS Cigar");
        out.println("(");
        out.println("id INTEGER PRIMARY KEY,");
        out.println("read_pos INT ,");
        out.println("read_base TEXT,");
        out.println("read_qual INT ,");
        out.println("ref_pos INT ,");
        out.println("ref_base TEXT,");
        out.println("operator TEXT NOT NULL,");
        out.println("read_id INT NOT NULL,");
        out.println("FOREIGN KEY(read_id) REFERENCES Read(id)");
        out.println(");");
        out.println("begin transaction;");
        int samIndex = 0;
        do {
            final String inputName;
            if (samIndex == 0 && args.isEmpty()) {
                sfr = openSamReader(null);
                inputName = "<stdin>";
            } else {
                inputName = args.get(samIndex);
                sfr = openSamReader(inputName);
            }
            final SAMFileHeader header1 = sfr.getFileHeader();
            if (header1 == null) {
                throw new JvarkitException.FileFormatError("File header missing");
            }
            final SAMSequenceDictionary dict = header1.getSequenceDictionary();
            if (dict == null) {
                throw new JvarkitException.DictionaryMissing("No Dictionary in input");
            }
            final IntervalParser intervalParser = new IntervalParser(dict);
            final Interval userInterval;
            iter = null;
            if (this.regionStr == null || this.regionStr.isEmpty()) {
                LOG.warn("You're currently scanning the whole BAM ???!!!");
                iter = sfr.iterator();
                userInterval = null;
            } else {
                userInterval = intervalParser.parse(this.regionStr);
                if (userInterval == null) {
                    throw new JvarkitException.UserError("cannot parse interval " + this.regionStr);
                }
                iter = sfr.query(userInterval.getContig(), userInterval.getStart(), userInterval.getEnd(), false);
            }
            out.println(String.join(" ", "insert into SamFile(filename) values(", quote(inputName), ");"));
            for (int i = 0; i < dict.size(); ++i) {
                final SAMSequenceRecord ssr = dict.getSequence(i);
                out.println("insert into Dictionary(name,length,tid,samfile_id) select " + quote(inputName) + "," + ssr.getSequenceLength() + "," + i + ",max(id) from SamFile;");
            }
            for (final SAMReadGroupRecord g : header1.getReadGroups()) {
                out.println("insert into ReadGroup(groupId,sample,samfile_id) select " + quote(g.getId()) + "," + quote(g.getSample()) + "," + "max(id) from SamFile;");
            }
            final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
            while (iter.hasNext()) {
                final SAMRecord rec = progress.watch(iter.next());
                final StringBuilder sql = new StringBuilder();
                sql.append("insert into Read(" + "name,flag,");
                if (this.printflag) {
                    for (final SAMFlag flg : SAMFlag.values()) {
                        sql.append(flg.name()).append(",");
                    }
                }
                sql.append("rname,pos,mapq,cigar,rnext,pnext,tlen,sequence,qualities,group_id,samfile_id) select ");
                sql.append(quote(rec.getReadName())).append(",");
                sql.append(rec.getFlags()).append(",");
                if (this.printflag) {
                    for (final SAMFlag flg : SAMFlag.values()) {
                        sql.append(flg.isSet(rec.getFlags()) ? 1 : 0);
                        sql.append(",");
                    }
                }
                if (rec.getReferenceName() == null || rec.getReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
                    sql.append("NULL,NULL");
                } else {
                    sql.append(quote(rec.getReferenceName()));
                    sql.append(",");
                    sql.append(rec.getAlignmentStart());
                }
                sql.append(",");
                sql.append(rec.getMappingQuality());
                sql.append(",");
                // cigar
                if (rec.getCigarString() == null || rec.getCigarString().equals(SAMRecord.NO_ALIGNMENT_CIGAR)) {
                    sql.append("NULL");
                } else {
                    sql.append(quote(rec.getCigarString()));
                }
                sql.append(",");
                // rnext
                if (rec.getMateReferenceName() == null || rec.getMateReferenceName().equals(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME)) {
                    sql.append("NULL,NULL");
                } else {
                    sql.append(quote(rec.getMateReferenceName()));
                    sql.append(",");
                    sql.append(rec.getMateAlignmentStart());
                }
                sql.append(",");
                // tlen
                sql.append(rec.getInferredInsertSize());
                sql.append(",");
                // sequence
                sql.append(quote(rec.getReadString()));
                sql.append(",");
                // qualities
                sql.append(quote(rec.getBaseQualityString()));
                sql.append(",");
                if (rec.getReadGroup() == null) {
                    sql.append("NULL");
                } else {
                    sql.append("G.id");
                }
                sql.append(",F.id FROM SamFile as F");
                if (rec.getReadGroup() != null) {
                    sql.append(" , ReadGroup as G where G.groupId=").append(quote(rec.getReadGroup().getId())).append(" and F.id = G.samfile_id ");
                }
                sql.append("  ORDER BY F.id DESC LIMIT 1;");
                out.println(sql.toString());
                if (this.printcigar && !rec.getReadUnmappedFlag() && rec.getCigar() != null) {
                    if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                        genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
                    }
                    int ref = rec.getUnclippedStart();
                    final byte[] bases = rec.getReadBases();
                    final byte[] quals = rec.getBaseQualities();
                    int read = 0;
                    for (final CigarElement ce : rec.getCigar()) {
                        final CigarOperator op = ce.getOperator();
                        if (op.equals(CigarOperator.P))
                            continue;
                        for (int i = 0; i < ce.getLength(); ++i) {
                            sql.setLength(0);
                            boolean in_user_interval = true;
                            sql.append("insert into Cigar(operator,read_pos,read_base,read_qual,ref_pos,ref_base,read_id) ");
                            sql.append("select '");
                            sql.append(op.name());
                            sql.append("',");
                            if (userInterval != null && !(rec.getReferenceName().equals(userInterval.getContig()) && ref >= userInterval.getStart() && ref <= userInterval.getEnd())) {
                                in_user_interval = false;
                            }
                            switch(op) {
                                case I:
                                    {
                                        sql.append(read);
                                        sql.append(",");
                                        sql.append("'" + (char) bases[read] + "',");
                                        sql.append("" + quals[read] + "");
                                        sql.append(",");
                                        sql.append("NULL,NULL");
                                        read++;
                                        break;
                                    }
                                case D:
                                case N:
                                case // yes H (hard clip)
                                H:
                                    {
                                        sql.append("NULL,NULL,NULL,");
                                        sql.append(ref);
                                        sql.append(",'");
                                        sql.append((ref < 1 || ref - 1 >= genomicSequence.length()) ? '*' : genomicSequence.charAt(ref - 1));
                                        sql.append("'");
                                        ref++;
                                        break;
                                    }
                                case M:
                                case X:
                                case EQ:
                                case // yes S, soft clip
                                S:
                                    {
                                        sql.append(read);
                                        sql.append(",");
                                        sql.append("'" + (char) bases[read] + "',");
                                        sql.append("" + quals[read] + "");
                                        sql.append(",");
                                        sql.append(ref);
                                        sql.append(",'");
                                        sql.append((ref < 1 || ref - 1 >= genomicSequence.length()) ? '*' : genomicSequence.charAt(ref - 1));
                                        sql.append("'");
                                        ref++;
                                        read++;
                                        break;
                                    }
                                default:
                                    throw new IllegalStateException();
                            }
                            sql.append(", id from Read ORDER BY id DESC LIMIT 1;");
                            if (in_user_interval)
                                out.println(sql.toString());
                        }
                    }
                }
            }
            iter.close();
            iter = null;
            sfr.close();
            sfr = null;
            progress.finish();
            samIndex++;
        } while (samIndex < args.size());
        out.println("COMMIT;");
        out.flush();
        out.close();
        LOG.info("done");
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sfr);
        CloserUtil.close(out);
        CloserUtil.close(indexedFastaSequenceFile);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) PrintWriter(java.io.PrintWriter) IntervalParser(com.github.lindenb.jvarkit.util.bio.IntervalParser) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFlag(htsjdk.samtools.SAMFlag) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval)

Example 25 with CigarOperator

use of htsjdk.samtools.CigarOperator in project jvarkit by lindenb.

the class SamClipIndelFraction method doWork.

@Override
public int doWork(final List<String> args) {
    SamReader sfr = null;
    SAMRecordIterator iter = null;
    PrintWriter pw = null;
    try {
        sfr = openSamReader(oneFileOrNull(args));
        pw = super.openFileOrStdoutAsPrintWriter(outputFile);
        long total_bases_count = 0L;
        long count_clipped_reads = 0L;
        long count_clipped_left_reads = 0L;
        long count_clipped_right_reads = 0L;
        long count_unclipped_reads = 0L;
        long count_unmapped_reads = 0L;
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(sfr.getFileHeader()).logger(LOG);
        Counter<Integer> counter = new Counter<>();
        iter = sfr.iterator();
        while (iter.hasNext()) {
            final SAMRecord record = progress.watch(iter.next());
            if (record.getReadUnmappedFlag()) {
                ++count_unmapped_reads;
                continue;
            }
            final Cigar cigar = record.getCigar();
            int left_clip_length = 0;
            int right_clip_length = 0;
            int deletion_N_length = 0;
            int deletion_D_length = 0;
            int insertion_length = 0;
            boolean onLeft = true;
            for (int i = 0; i < cigar.numCigarElements(); ++i) {
                final CigarElement ce = cigar.getCigarElement(i);
                final CigarOperator op = ce.getOperator();
                switch(op) {
                    case N:
                        {
                            onLeft = false;
                            deletion_D_length += ce.getLength();
                            total_bases_count += ce.getLength();
                            break;
                        }
                    case D:
                        {
                            onLeft = false;
                            deletion_N_length += ce.getLength();
                            total_bases_count += ce.getLength();
                            break;
                        }
                    case I:
                        {
                            onLeft = false;
                            insertion_length += ce.getLength();
                            total_bases_count += ce.getLength();
                            break;
                        }
                    case S:
                    case H:
                        {
                            if (onLeft) {
                                if (record.getReadNegativeStrandFlag()) {
                                    right_clip_length += ce.getLength();
                                } else {
                                    left_clip_length += ce.getLength();
                                }
                            } else {
                                if (record.getReadNegativeStrandFlag()) {
                                    left_clip_length += ce.getLength();
                                } else {
                                    right_clip_length += ce.getLength();
                                }
                            }
                            total_bases_count += ce.getLength();
                            break;
                        }
                    default:
                        {
                            onLeft = false;
                            if (op.consumesReadBases()) {
                                total_bases_count += ce.getLength();
                            }
                            break;
                        }
                }
            }
            if (left_clip_length + right_clip_length == 0) {
                count_unclipped_reads++;
            } else {
                if (left_clip_length > 0)
                    count_clipped_left_reads++;
                if (right_clip_length > 0)
                    count_clipped_right_reads++;
                count_clipped_reads++;
            }
            switch(type) {
                case leftclip:
                    counter.incr(left_clip_length);
                    break;
                case rightclip:
                    counter.incr(right_clip_length);
                    break;
                case allclip:
                    counter.incr(left_clip_length + right_clip_length);
                    break;
                case deletion:
                    counter.incr(deletion_D_length + deletion_N_length);
                    break;
                case insert:
                    counter.incr(insertion_length);
                    break;
                default:
                    LOG.error("Bad type: " + type);
                    return -1;
            }
        }
        progress.finish();
        pw.println("##UNMAPPED_READS=" + count_unmapped_reads);
        pw.println("##MAPPED_READS=" + (count_clipped_reads + count_unclipped_reads));
        pw.println("##CLIPPED_READS=" + count_clipped_reads);
        pw.println("##CLIPPED_READS_5_PRIME=" + count_clipped_left_reads);
        pw.println("##CLIPPED_READS_3_PRIME=" + count_clipped_right_reads);
        pw.println("##UNCLIPPED_READS=" + count_unclipped_reads);
        pw.println("##COUNT_BASES=" + total_bases_count);
        pw.print("#");
        switch(type) {
            case leftclip:
                pw.print("CLIP_5_PRIME");
                break;
            case rightclip:
                pw.print("CLIP_3_PRIME");
                break;
            case allclip:
                pw.print("CLIP");
                break;
            case deletion:
                pw.print("DELETION");
                break;
            case insert:
                pw.print("INSERTION");
                break;
            default:
                LOG.error("Bad type: " + type);
                return -1;
        }
        pw.println("\tCOUNT\tFRACTION_OF_MAPPED_READS");
        for (final Integer size : new TreeSet<Integer>(counter.keySet())) {
            pw.print(size);
            pw.print('\t');
            pw.print(counter.count(size));
            pw.print('\t');
            pw.println(counter.count(size) / (double) (count_unclipped_reads + count_unclipped_reads));
        }
        pw.flush();
        pw.close();
        pw = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(sfr);
        CloserUtil.close(pw);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) Cigar(htsjdk.samtools.Cigar) TreeSet(java.util.TreeSet) SAMRecord(htsjdk.samtools.SAMRecord) PrintWriter(java.io.PrintWriter)

Aggregations

CigarOperator (htsjdk.samtools.CigarOperator)48 CigarElement (htsjdk.samtools.CigarElement)31 Cigar (htsjdk.samtools.Cigar)24 SAMRecord (htsjdk.samtools.SAMRecord)22 ArrayList (java.util.ArrayList)17 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)15 SamReader (htsjdk.samtools.SamReader)15 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)13 SAMFileHeader (htsjdk.samtools.SAMFileHeader)12 File (java.io.File)12 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)11 List (java.util.List)11 Parameter (com.beust.jcommander.Parameter)9 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)9 Logger (com.github.lindenb.jvarkit.util.log.Logger)9 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)9 CloserUtil (htsjdk.samtools.util.CloserUtil)9 Program (com.github.lindenb.jvarkit.util.jcommander.Program)8 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)8 HashMap (java.util.HashMap)8