Search in sources :

Example 66 with CigarElement

use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.

the class Biostar154220 method doWork.

@SuppressWarnings("resource")
private int doWork(final SamReader in) throws IOException {
    SAMFileHeader header = in.getFileHeader();
    if (header.getSortOrder() != SAMFileHeader.SortOrder.unsorted) {
        LOG.error("input should be unsorted, reads sorted on REF/query-name e.g: see https://github.com/lindenb/jvarkit/wiki/SortSamRefName");
        return -1;
    }
    SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null) {
        LOG.error("no dict !");
        return -1;
    }
    SAMFileWriter out = null;
    SAMRecordIterator iter = null;
    int prev_tid = -1;
    int[] depth_array = null;
    try {
        SAMFileHeader header2 = header.clone();
        header2.addComment("Biostar154220" + " " + getVersion() + " " + getProgramCommandLine());
        out = this.writingBams.openSAMFileWriter(outputFile, header2, true);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        iter = in.iterator();
        List<SAMRecord> buffer = new ArrayList<>();
        for (; ; ) {
            SAMRecord rec = null;
            if (iter.hasNext()) {
                rec = progress.watch(iter.next());
            }
            if (rec != null && rec.getReadUnmappedFlag()) {
                out.addAlignment(rec);
                continue;
            }
            // no more record or
            if (!buffer.isEmpty() && rec != null && buffer.get(0).getReadName().equals(rec.getReadName()) && buffer.get(0).getReferenceIndex().equals(rec.getReferenceIndex())) {
                buffer.add(progress.watch(rec));
            } else if (buffer.isEmpty() && rec != null) {
                buffer.add(progress.watch(rec));
            } else // dump buffer
            {
                if (!buffer.isEmpty()) {
                    final int tid = buffer.get(0).getReferenceIndex();
                    if (prev_tid == -1 || prev_tid != tid) {
                        SAMSequenceRecord ssr = dict.getSequence(tid);
                        prev_tid = tid;
                        depth_array = null;
                        System.gc();
                        LOG.info("Alloc memory for contig " + ssr.getSequenceName() + " N=" + ssr.getSequenceLength() + "*sizeof(int)");
                        // use a +1 pos
                        depth_array = new int[ssr.getSequenceLength() + 1];
                        Arrays.fill(depth_array, 0);
                    }
                    // position->coverage for this set of reads
                    Counter<Integer> readposition2coverage = new Counter<Integer>();
                    boolean dump_this_buffer = true;
                    for (final SAMRecord sr : buffer) {
                        if (!dump_this_buffer)
                            break;
                        if (this.filter.filterOut(sr))
                            continue;
                        final Cigar cigar = sr.getCigar();
                        if (cigar == null) {
                            throw new IOException("Cigar missing in " + rec.getSAMString());
                        }
                        int refPos1 = sr.getAlignmentStart();
                        for (final CigarElement ce : cigar.getCigarElements()) {
                            final CigarOperator op = ce.getOperator();
                            if (!op.consumesReferenceBases())
                                continue;
                            if (op.consumesReadBases()) {
                                for (int x = 0; x < ce.getLength() && refPos1 + x < depth_array.length; ++x) {
                                    int cov = (int) readposition2coverage.incr(refPos1 + x);
                                    if (depth_array[refPos1 + x] + cov > this.capDepth) {
                                        dump_this_buffer = false;
                                        break;
                                    }
                                }
                            }
                            if (!dump_this_buffer)
                                break;
                            refPos1 += ce.getLength();
                        }
                    }
                    if (dump_this_buffer) {
                        // consumme this coverage
                        for (Integer pos : readposition2coverage.keySet()) {
                            depth_array[pos] += (int) readposition2coverage.count(pos);
                        }
                        for (SAMRecord sr : buffer) {
                            out.addAlignment(sr);
                        }
                    }
                    buffer.clear();
                }
                if (rec == null)
                    break;
                buffer.add(rec);
            }
        }
        depth_array = null;
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(out);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IOException(java.io.IOException) CigarOperator(htsjdk.samtools.CigarOperator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) CigarElement(htsjdk.samtools.CigarElement) IOException(java.io.IOException) Counter(com.github.lindenb.jvarkit.util.Counter) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Example 67 with CigarElement

use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.

the class BamStats05 method doWork.

protected int doWork(final PrintWriter pw, final Map<String, List<Interval>> gene2interval, final String filename, final SamReader IN) throws Exception {
    try {
        LOG.info("Scanning " + filename);
        final SAMFileHeader header = IN.getFileHeader();
        final List<SAMReadGroupRecord> rgs = header.getReadGroups();
        if (rgs == null || rgs.isEmpty())
            throw new IOException("No read groups in " + filename);
        final Set<String> groupNames = this.groupBy.getPartitions(rgs);
        for (final String partition : groupNames) {
            if (partition.isEmpty())
                throw new IOException("Empty " + groupBy.name());
            for (final String gene : gene2interval.keySet()) {
                int geneStart = Integer.MAX_VALUE;
                int geneEnd = 0;
                final List<Integer> counts = new ArrayList<>();
                for (final Interval interval : gene2interval.get(gene)) {
                    geneStart = Math.min(geneStart, interval.getStart() - 1);
                    geneEnd = Math.max(geneEnd, interval.getEnd());
                    /* picard javadoc:  - Sequence name - Start position (1-based) - End position (1-based, end inclusive)  */
                    int[] interval_counts = new int[interval.getEnd() - interval.getStart() + 1];
                    if (interval_counts.length == 0)
                        continue;
                    Arrays.fill(interval_counts, 0);
                    if (IN.getFileHeader().getSequenceIndex(interval.getContig()) == -1) {
                        throw new IllegalArgumentException("NO DICT FOR \"" + interval.getContig() + "\"");
                    }
                    /**
                     *     start - 1-based, inclusive start of interval of interest. Zero implies start of the reference sequence.
                     *	   end - 1-based, inclusive end of interval of interest. Zero implies end of the reference sequence.
                     */
                    SAMRecordIterator r = IN.query(new QueryInterval[] { new QueryInterval(header.getSequenceIndex(interval.getContig()), interval.getStart(), interval.getEnd()) }, false);
                    while (r.hasNext()) {
                        final SAMRecord rec = r.next();
                        if (rec.getReadUnmappedFlag())
                            continue;
                        if (filter.filterOut(rec))
                            continue;
                        if (!rec.getReferenceName().equals(interval.getContig()))
                            continue;
                        final SAMReadGroupRecord rg = rec.getReadGroup();
                        if (rg == null || !partition.equals(this.groupBy.apply(rg)))
                            continue;
                        final Cigar cigar = rec.getCigar();
                        if (cigar == null)
                            continue;
                        int refpos1 = rec.getAlignmentStart();
                        for (final CigarElement ce : cigar.getCigarElements()) {
                            final CigarOperator op = ce.getOperator();
                            if (!op.consumesReferenceBases())
                                continue;
                            if (op.consumesReadBases()) {
                                for (int i = 0; i < ce.getLength(); ++i) {
                                    if (refpos1 + i >= interval.getStart() && refpos1 + i <= interval.getEnd()) {
                                        interval_counts[refpos1 + i - interval.getStart()]++;
                                    }
                                }
                            }
                            refpos1 += ce.getLength();
                        }
                    }
                    /* end while r */
                    r.close();
                    for (int d : interval_counts) {
                        counts.add(d);
                    }
                }
                /* end interval */
                Collections.sort(counts);
                int count_no_coverage = 0;
                double mean = 0;
                for (int cov : counts) {
                    if (cov <= MIN_COVERAGE)
                        ++count_no_coverage;
                    mean += cov;
                }
                mean /= counts.size();
                pw.println(gene2interval.get(gene).get(0).getContig() + "\t" + geneStart + "\t" + geneEnd + "\t" + gene + "\t" + partition + "\t" + counts.size() + "\t" + counts.get(0) + "\t" + counts.get(counts.size() - 1) + "\t" + mean + "\t" + count_no_coverage + "\t" + (int) (((counts.size() - count_no_coverage) / (double) counts.size()) * 100.0));
            }
        // end gene
        }
        // end sample
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(IN);
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) QueryInterval(htsjdk.samtools.QueryInterval) IOException(java.io.IOException) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) IOException(java.io.IOException) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval)

Example 68 with CigarElement

use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.

the class HaloplexParasite method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    SamReader samReader = null;
    final List<Mutation> mutations = new ArrayList<>();
    try {
        final Set<File> bamFiles = Files.lines(this.bamList.toPath()).filter(T -> !(T.isEmpty() || T.startsWith("#"))).map(T -> new File(T)).collect(Collectors.toSet());
        final VCFHeader header = new VCFHeader();
        header.setSequenceDictionary(in.getHeader().getSequenceDictionary());
        final VCFFilterHeaderLine filter = new VCFFilterHeaderLine("HALOPLEXPARASITE", "fails Parasite Haloplex Sequence");
        final VCFInfoHeaderLine infoWord = new VCFInfoHeaderLine(filter.getID(), 1, VCFHeaderLineType.String, "Parasite Haloplex Sequence (Word|Count|Fraction)");
        super.addMetaData(header);
        out.writeHeader(header);
        header.addMetaDataLine(filter);
        header.addMetaDataLine(infoWord);
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
            if (!(ctx.isIndel() || ctx.isMixed())) {
                out.add(vcb.make());
                continue;
            }
            if (!vcb.getAlleles().stream().filter(A -> !(A.isSymbolic() || A.isNoCall() || A.length() < this.minClipSize)).findAny().isPresent()) {
                out.add(vcb.make());
                continue;
            }
            final Mutation mut = new Mutation(ctx);
            mutations.add(mut);
        }
        final Counter<String> words = new Counter<>();
        for (final File bamFile : bamFiles) {
            LOG.info("Opening " + bamFile);
            samReader = createSamReaderFactory().open(bamFile);
            for (final Mutation mut : mutations) {
                // words seen in that mutation : don't use a Counter
                final Set<String> mutWords = new HashSet<>();
                /* loop over reads overlapping that mutation */
                final SAMRecordIterator sri = samReader.queryOverlapping(mut.contig, mut.start, mut.end);
                while (sri.hasNext()) {
                    final SAMRecord rec = sri.next();
                    if (rec.getReadUnmappedFlag())
                        continue;
                    if (rec.isSecondaryOrSupplementary())
                        continue;
                    if (rec.getDuplicateReadFlag())
                        continue;
                    if (rec.getReadFailsVendorQualityCheckFlag())
                        continue;
                    final Cigar cigar = rec.getCigar();
                    if (cigar.numCigarElements() == 1)
                        continue;
                    final byte[] bases = rec.getReadBases();
                    int refpos = rec.getUnclippedStart();
                    int readpos = 0;
                    /* scan cigar overlapping that mutation */
                    for (final CigarElement ce : cigar) {
                        final CigarOperator op = ce.getOperator();
                        final int ref_end = refpos + (op.consumesReferenceBases() || op.isClipping() ? ce.getLength() : 0);
                        final int read_end = readpos + (op.consumesReadBases() ? ce.getLength() : 0);
                        /* check clip is large enough */
                        if (op.equals(CigarOperator.S) && ce.getLength() >= this.minClipSize) {
                            /* check clip overlap mutation */
                            if (!(ref_end < mut.start || refpos > mut.end)) {
                                /* break read of soft clip into words */
                                for (int x = 0; x + this.minClipSize <= ce.getLength(); ++x) {
                                    final String substr = new String(bases, readpos + x, this.minClipSize);
                                    if (!substr.contains("N")) {
                                        final String revcomp = AcidNucleics.reverseComplement(substr);
                                        mutWords.add(substr);
                                        if (!revcomp.equals(substr))
                                            mutWords.add(revcomp);
                                    }
                                }
                            }
                        }
                        refpos = ref_end;
                        readpos = read_end;
                    }
                }
                sri.close();
                for (final String w : mutWords) {
                    words.incr(w);
                }
            }
            // end of for(mutations)
            samReader.close();
            samReader = null;
        }
        LOG.info("mutations:" + mutations.size());
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        for (final Mutation mut : mutations) {
            progress.watch(mut.contig, mut.start);
            final VariantContextBuilder vcb = new VariantContextBuilder(inputName, mut.contig, mut.start, mut.end, mut.alleles);
            String worstString = null;
            Double worstFraction = null;
            final double totalWords = words.getTotal();
            for (final Allele a : mut.alleles) {
                if (a.isSymbolic() || a.isNoCall() || a.length() < this.minClipSize)
                    continue;
                for (int x = 0; x + this.minClipSize <= a.length(); ++x) {
                    final String substr = new String(a.getBases(), x, this.minClipSize);
                    final long count = words.count(substr);
                    final double fraction = count / totalWords;
                    if (worstFraction == null || worstFraction < fraction) {
                        worstString = substr + "|" + count + "|" + fraction;
                        worstFraction = fraction;
                    }
                }
            }
            if (worstString != null) {
                vcb.attribute(infoWord.getID(), worstString);
            }
            if (worstFraction != null && worstFraction.doubleValue() >= this.tresholdFraction) {
                vcb.filter(filter.getID());
            }
            out.add(vcb.make());
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(samReader);
    }
}
Also used : Cigar(htsjdk.samtools.Cigar) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) Counter(com.github.lindenb.jvarkit.util.Counter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Files(java.nio.file.Files) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SamReader(htsjdk.samtools.SamReader) Collectors(java.util.stream.Collectors) File(java.io.File) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) Cigar(htsjdk.samtools.Cigar) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMRecord(htsjdk.samtools.SAMRecord) File(java.io.File)

Example 69 with CigarElement

use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.

the class Biostar59647 method doWork.

@Override
public int doWork(final List<String> args) {
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samFileReader = null;
    PrintStream pout;
    try {
        GenomicSequence genomicSequence = null;
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
        samFileReader = null;
        final String bamFile = oneFileOrNull(args);
        samFileReader = super.openSamReader(bamFile);
        if (!SequenceUtil.areSequenceDictionariesEqual(indexedFastaSequenceFile.getSequenceDictionary(), samFileReader.getFileHeader().getSequenceDictionary())) {
            LOG.warning("Not the same sequence dictionaries");
        }
        final XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
        pout = (outputFile == null ? stdout() : new PrintStream(this.outputFile));
        final XMLStreamWriter w = xmlfactory.createXMLStreamWriter(pout, "UTF-8");
        w.writeStartDocument("UTF-8", "1.0");
        w.writeStartElement("sam");
        w.writeAttribute("bam", (bamFile == null ? "stdin" : bamFile));
        w.writeAttribute("ref", refFile.getPath());
        w.writeComment(getProgramCommandLine());
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(samFileReader.getFileHeader().getSequenceDictionary());
        final SAMRecordIterator iter = samFileReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            progess.watch(rec);
            final byte[] readbases = rec.getReadBases();
            w.writeStartElement("read");
            w.writeStartElement("name");
            w.writeCharacters(rec.getReadName());
            w.writeEndElement();
            w.writeStartElement("sequence");
            w.writeCharacters(new String(readbases));
            w.writeEndElement();
            w.writeStartElement("flags");
            for (SAMFlag f : SAMFlag.values()) {
                w.writeAttribute(f.name(), String.valueOf(f.isSet(rec.getFlags())));
            }
            w.writeCharacters(String.valueOf(rec.getFlags()));
            // flags
            w.writeEndElement();
            if (!rec.getReadUnmappedFlag()) {
                w.writeStartElement("qual");
                w.writeCharacters(String.valueOf(rec.getMappingQuality()));
                w.writeEndElement();
                w.writeStartElement("chrom");
                w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
                w.writeCharacters(rec.getReferenceName());
                w.writeEndElement();
                w.writeStartElement("pos");
                w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
                w.writeEndElement();
                w.writeStartElement("cigar");
                w.writeCharacters(rec.getCigarString());
                w.writeEndElement();
            }
            if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
                w.writeStartElement("mate-chrom");
                w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
                w.writeCharacters(rec.getMateReferenceName());
                w.writeEndElement();
                w.writeStartElement("mate-pos");
                w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
                w.writeEndElement();
            }
            if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
                if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                    genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
                }
                w.writeStartElement("align");
                int readIndex = 0;
                int refIndex = rec.getAlignmentStart();
                for (final CigarElement e : rec.getCigar().getCigarElements()) {
                    switch(e.getOperator()) {
                        // ignore hard clips
                        case H:
                            break;
                        // ignore pads
                        case P:
                            break;
                        // cont.
                        case I:
                        case S:
                            {
                                final int length = e.getLength();
                                for (int i = 0; i < length; ++i) {
                                    w.writeEmptyElement(e.getOperator().name());
                                    w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                                    if (readIndex >= 0 && readIndex < readbases.length) {
                                        w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
                                    }
                                    readIndex++;
                                }
                                break;
                            }
                        // cont. -- reference skip
                        case N:
                        case D:
                            {
                                final int length = e.getLength();
                                for (int i = 0; i < length; ++i) {
                                    w.writeEmptyElement(e.getOperator().name());
                                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                                        w.writeAttribute("ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
                                    }
                                    refIndex++;
                                }
                                break;
                            }
                        case M:
                        case EQ:
                        case X:
                            {
                                final int length = e.getLength();
                                for (int i = 0; i < length; ++i) {
                                    w.writeEmptyElement(e.getOperator().name());
                                    char baseRead = '\0';
                                    if (readIndex >= 0 && readIndex < readbases.length) {
                                        baseRead = (char) (rec.getReadBases()[readIndex]);
                                        w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                                        w.writeAttribute("read-base", String.valueOf(baseRead));
                                    }
                                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                                        char baseRef = genomicSequence.charAt(refIndex - 1);
                                        w.writeAttribute("ref-base", String.valueOf(baseRef));
                                        if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
                                            w.writeAttribute("mismatch", "true");
                                        }
                                    }
                                    refIndex++;
                                    readIndex++;
                                }
                                break;
                            }
                        default:
                            throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
                    }
                }
                w.writeEndElement();
            }
            w.writeEndElement();
        }
        iter.close();
        w.writeEndElement();
        w.writeEndDocument();
        w.flush();
        pout.flush();
        CloserUtil.close(w);
        CloserUtil.close(pout);
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(samFileReader);
        CloserUtil.close(indexedFastaSequenceFile);
    }
    return 0;
}
Also used : PrintStream(java.io.PrintStream) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMFlag(htsjdk.samtools.SAMFlag) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) SAMRecord(htsjdk.samtools.SAMRecord)

Example 70 with CigarElement

use of htsjdk.samtools.CigarElement in project jvarkit by lindenb.

the class MiniCaller method doWork.

@Override
public int doWork(final List<String> args) {
    ConcatSam.ConcatSamIterator iter = null;
    try {
        if (this.fastaFile == null) {
            LOG.error("no REF");
            return -1;
        }
        /* load faid */
        final ReferenceGenomeFactory referenceGenomeFactory = new ReferenceGenomeFactory();
        this.referenceGenome = referenceGenomeFactory.openFastaFile(this.fastaFile);
        this.dictionary = this.referenceGenome.getDictionary();
        if (this.dictionary == null) {
            LOG.error(JvarkitException.FastaDictionaryMissing.getMessage(this.fastaFile.getPath()));
        }
        /* create sam record iterator */
        iter = new ConcatSam.Factory().addInterval(this.rgnStr).setEnableUnrollList(true).open(args);
        final SAMFileHeader samFileheader = iter.getFileHeader();
        final SAMSequenceDictionary dict = samFileheader.getSequenceDictionary();
        if (dict == null) {
            LOG.error(JvarkitException.BamDictionaryMissing.getMessage(String.join(", ", args)));
            return -1;
        }
        if (!SequenceUtil.areSequenceDictionariesEqual(dict, this.dictionary)) {
            LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict, this.dictionary));
            return -1;
        }
        final List<SAMReadGroupRecord> groups = samFileheader.getReadGroups();
        if (groups == null || groups.isEmpty()) {
            LOG.error("No group defined in input");
            return -1;
        }
        final Set<String> sampleSet = groups.stream().map(srgr -> this.samRecordPartition.apply(srgr, samRecordPartition.name())).collect(Collectors.toSet());
        /* create VCF metadata */
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY));
        metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY));
        metaData.add(new VCFFormatHeaderLine("DPG", // one value of each genotype
        VCFHeaderLineCount.G, VCFHeaderLineType.Integer, "Depth for each allele"));
        metaData.add(new VCFFormatHeaderLine("DP4", 4, VCFHeaderLineType.Integer, "Depth ReforAlt|Strand : RF,RR,AF,AR"));
        metaData.add(new VCFInfoHeaderLine("INDEL", 1, VCFHeaderLineType.Flag, "Variant is indel"));
        // addMetaData(metaData);
        final VCFHeader vcfHeader = new VCFHeader(metaData, sampleSet);
        vcfHeader.setSequenceDictionary(this.dictionary);
        /* create variant context */
        this.variantContextWriter = super.openVariantContextWriter(outputFile);
        this.variantContextWriter.writeHeader(vcfHeader);
        ReferenceContig genomicSeq = null;
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(this.dictionary);
        for (; ; ) {
            SAMRecord rec = null;
            if (iter.hasNext()) {
                rec = progress.watch(iter.next());
                if (rec.getReadUnmappedFlag())
                    continue;
                if (this.readFilter.filterOut(rec))
                    continue;
                /* flush buffer if needed */
                while (!this.buffer.isEmpty() && (this.buffer.get(0).tid < rec.getReferenceIndex() || (this.buffer.get(0).tid == rec.getReferenceIndex() && (this.buffer.get(0).getEnd()) < rec.getAlignmentStart()))) {
                    this.buffer.remove(0).print();
                }
                /* get genomic sequence at this position */
                if (genomicSeq == null || !genomicSeq.getContig().equals(rec.getContig())) {
                    genomicSeq = this.referenceGenome.getContig(rec.getContig());
                }
                final Cigar cigar = rec.getCigar();
                if (cigar == null)
                    continue;
                int readPos = 0;
                // 0 based-reference
                int refPos0 = rec.getAlignmentStart() - 1;
                final byte[] bases = rec.getReadBases();
                final byte[] quals = rec.getBaseQualities();
                final String sampleName = this.samRecordPartition.getPartion(rec, samRecordPartition.name());
                for (final CigarElement ce : cigar.getCigarElements()) {
                    final CigarOperator op = ce.getOperator();
                    switch(op) {
                        case P:
                            break;
                        case H:
                            break;
                        case S:
                            readPos += ce.getLength();
                            break;
                        // go
                        case N:
                        case D:
                            {
                                if (// we need base before deletion
                                refPos0 > 0) {
                                    char refBase = genomicSeq.charAt(refPos0 - 1);
                                    /* we use base before deletion */
                                    final StringBuilder sb = new StringBuilder(ce.getLength());
                                    sb.append(refBase);
                                    for (int i = 0; i < ce.getLength(); ++i) {
                                        sb.append(genomicSeq.charAt(refPos0 + i));
                                    }
                                    findContext(rec.getReferenceIndex(), // we use base *before deletion */
                                    refPos0 - 1, Allele.create(sb.toString(), true)).getSample(sampleName).getAllele(Allele.create(String.valueOf(refBase), false)).incr(rec.getReadNegativeStrandFlag());
                                }
                                refPos0 += ce.getLength();
                                break;
                            }
                        case I:
                            {
                                if (refPos0 > 0) {
                                    // float qual=0;
                                    char refBase = Character.toUpperCase(genomicSeq.charAt(refPos0 - 1));
                                    final StringBuilder sb = new StringBuilder(1 + ce.getLength());
                                    sb.append(refBase);
                                    for (int i = 0; i < ce.getLength(); ++i) {
                                        sb.append((char) bases[readPos + i]);
                                    // qual+=(readPos + i < quals.length?quals[ readPos + i]:0);
                                    }
                                    findContext(rec.getReferenceIndex(), // we use base *before deletion */
                                    refPos0 - 1, Allele.create(String.valueOf(refBase), true)).getSample(sampleName).getAllele(Allele.create(sb.toString().toUpperCase(), false)).incr(rec.getReadNegativeStrandFlag());
                                }
                                readPos += ce.getLength();
                                break;
                            }
                        case EQ:
                        case M:
                        case X:
                            {
                                for (int i = 0; i < ce.getLength(); ++i) {
                                    findContext(rec.getReferenceIndex(), refPos0 + i, Allele.create(String.valueOf(genomicSeq.charAt(refPos0 + i)), true)).getSample(sampleName).getAllele(Allele.create(String.valueOf((char) bases[readPos + i]), false)).incr(rec.getReadNegativeStrandFlag());
                                }
                                readPos += ce.getLength();
                                refPos0 += ce.getLength();
                                break;
                            }
                        default:
                            throw new IllegalStateException("Case statement didn't deal with cigar op: " + op);
                    }
                }
            } else {
                break;
            }
        }
        while (!buffer.isEmpty()) buffer.remove(0).print();
        progress.finish();
        iter.close();
        iter = null;
        this.variantContextWriter.close();
        this.variantContextWriter = null;
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(this.referenceGenome);
        CloserUtil.close(this.variantContextWriter);
    }
}
Also used : Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) CigarOperator(htsjdk.samtools.CigarOperator) SAMRecordPartition(com.github.lindenb.jvarkit.util.samtools.SAMRecordPartition) IntervalParser(com.github.lindenb.jvarkit.util.bio.IntervalParser) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) Map(java.util.Map) CloserUtil(htsjdk.samtools.util.CloserUtil) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) Cigar(htsjdk.samtools.Cigar) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Parameter(com.beust.jcommander.Parameter) HashMap(java.util.HashMap) Term(com.github.lindenb.semontology.Term) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) Locatable(htsjdk.samtools.util.Locatable) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) SamRecordJEXLFilter(com.github.lindenb.jvarkit.util.samtools.SamRecordJEXLFilter) ConcatSam(com.github.lindenb.jvarkit.tools.misc.ConcatSam) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) ReferenceGenome(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenome) ReferenceContig(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceContig) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ReferenceGenomeFactory(com.github.lindenb.jvarkit.util.bio.fasta.ReferenceGenomeFactory) CigarOperator(htsjdk.samtools.CigarOperator) CigarElement(htsjdk.samtools.CigarElement) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Cigar(htsjdk.samtools.Cigar) SAMRecord(htsjdk.samtools.SAMRecord) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ConcatSam(com.github.lindenb.jvarkit.tools.misc.ConcatSam)

Aggregations

CigarElement (htsjdk.samtools.CigarElement)164 Cigar (htsjdk.samtools.Cigar)97 ArrayList (java.util.ArrayList)50 SAMRecord (htsjdk.samtools.SAMRecord)49 CigarOperator (htsjdk.samtools.CigarOperator)34 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)32 SamReader (htsjdk.samtools.SamReader)31 SAMFileHeader (htsjdk.samtools.SAMFileHeader)24 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)22 Test (org.testng.annotations.Test)19 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)17 BaseTest (org.broadinstitute.hellbender.utils.test.BaseTest)17 File (java.io.File)16 SAMFileWriter (htsjdk.samtools.SAMFileWriter)15 SAMReadGroupRecord (htsjdk.samtools.SAMReadGroupRecord)14 Interval (htsjdk.samtools.util.Interval)14 IOException (java.io.IOException)14 VisibleForTesting (com.google.common.annotations.VisibleForTesting)13 List (java.util.List)13 GATKException (org.broadinstitute.hellbender.exceptions.GATKException)13