Search in sources :

Example 1 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class BamToSVG method doWork.

@Override
public int doWork(List<String> args) {
    /* parse interval */
    if (this.intervalStr == null) {
        LOG.error("bed.interval0.undefined");
        return -1;
    }
    int colon = this.intervalStr.indexOf(':');
    int hyphen = this.intervalStr.indexOf('-', colon + 1);
    if (colon < 1 || hyphen <= colon || hyphen + 1 == intervalStr.length()) {
        LOG.error("Bad interval " + this.intervalStr);
        return -1;
    }
    this.interval = new Interval();
    this.interval.chrom = this.intervalStr.substring(0, colon);
    this.interval.start = Integer.parseInt(this.intervalStr.substring(colon + 1, hyphen)) + 1;
    this.interval.end = Integer.parseInt(this.intervalStr.substring(hyphen + 1));
    this.drawinAreaWidth = Math.max(100, this.drawinAreaWidth);
    SamReader in = null;
    SAMRecordIterator iter = null;
    SamReaderFactory sfrf = SamReaderFactory.makeDefault();
    sfrf.validationStringency(ValidationStringency.SILENT);
    XMLStreamWriter w = null;
    FileOutputStream fout = null;
    try {
        /* get genomic sequence */
        if (this.referenceFile != null) {
            LOG.info("opening " + this.referenceFile);
            this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.referenceFile);
            this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, this.interval.chrom);
        }
        for (String vcf : this.vcfFileSet) {
            readVariantFile(vcf);
        }
        /* read SAM data */
        if (args.isEmpty()) {
            LOG.info("Reading from stdin");
            in = sfrf.open(SamInputResource.of(stdin()));
            iter = in.iterator();
            readBamStream(iter);
            iter.close();
            in.close();
        } else {
            for (String arg : args) {
                File filename = new File(arg);
                LOG.info("Reading from " + filename);
                in = sfrf.open(SamInputResource.of(filename));
                if (in.hasIndex()) {
                    iter = in.query(this.interval.getChrom(), this.interval.getStart(), this.interval.getEnd(), false);
                } else {
                    LOG.info("Bam file not indexed !! " + filename);
                    iter = in.iterator();
                }
                readBamStream(iter);
                iter.close();
                in.close();
            }
        }
        this.featureWidth = this.drawinAreaWidth / (double) ((this.interval.end - this.interval.start) + 1);
        this.featureHeight = Math.min(Math.max(5.0, this.featureWidth), 30);
        this.HEIGHT_RULER = (int) (this.niceIntFormat.format(this.interval.end).length() * this.featureHeight + 5);
        LOG.info("Feature height:" + this.featureHeight);
        XMLOutputFactory xof = XMLOutputFactory.newFactory();
        if (this.outputFile == null) {
            w = xof.createXMLStreamWriter(stdout(), "UTF-8");
        } else {
            fout = new FileOutputStream(this.outputFile);
            w = xof.createXMLStreamWriter(fout, "UTF-8");
        }
        w.writeStartDocument("UTF-8", "1.0");
        printDocument(w, intervalStr);
        w.writeEndDocument();
        w.flush();
        w.close();
        return RETURN_OK;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(iter);
        CloserUtil.close(fout);
        CloserUtil.close(in);
        CloserUtil.close(this.indexedFastaSequenceFile);
        this.indexedFastaSequenceFile = null;
        this.interval = null;
    }
}
Also used : XMLOutputFactory(javax.xml.stream.XMLOutputFactory) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SamReaderFactory(htsjdk.samtools.SamReaderFactory) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) XMLStreamException(javax.xml.stream.XMLStreamException) IOException(java.io.IOException) SamReader(htsjdk.samtools.SamReader) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) FileOutputStream(java.io.FileOutputStream) File(java.io.File) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile)

Example 2 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class Biostar175929 method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidx == null) {
        LOG.error("fasta reference was not defined.");
        return -1;
    }
    IndexedFastaSequenceFile reference = null;
    VcfIterator iter = null;
    try {
        reference = new IndexedFastaSequenceFile(this.faidx);
        iter = super.openVcfIterator(oneFileOrNull(args));
        this.pw = openFileOrStdoutAsPrintWriter(this.outputFile);
        final List<VariantContext> variants = new ArrayList<>();
        for (; ; ) {
            VariantContext ctx = null;
            if (iter.hasNext()) {
                ctx = iter.next();
            }
            if (ctx == null || (!variants.isEmpty() && !ctx.getContig().equals(variants.get(0).getContig()))) {
                if (!variants.isEmpty()) {
                    LOG.info("chrom:" + variants.get(0).getContig() + " N=" + variants.size());
                    final GenomicSequence genomic = new GenomicSequence(reference, variants.get(0).getContig());
                    final StringBuilder title = new StringBuilder();
                    final StringBuilder sequence = new StringBuilder();
                    recursive(genomic, variants, 0, title, sequence);
                    variants.clear();
                }
                if (ctx == null)
                    break;
            }
            variants.add(ctx);
        }
        iter.close();
        iter = null;
        this.pw.flush();
        this.pw.close();
        return RETURN_OK;
    } catch (Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(reference);
        CloserUtil.close(iter);
        CloserUtil.close(pw);
    }
}
Also used : VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile)

Example 3 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class Biostar251649 method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter w) {
    try {
        final VCFHeader header = new VCFHeader(in.getHeader());
        VCFInfoHeaderLine info5 = new VCFInfoHeaderLine(leftTag + extend, 1, VCFHeaderLineType.String, "Sequence on the 5' of mutation");
        VCFInfoHeaderLine info3 = new VCFInfoHeaderLine(rightTag + extend, 1, VCFHeaderLineType.String, "Sequence on the 3' of mutation");
        if (header.getInfoHeaderLine(info5.getID()) != null) {
            LOG.error("tag " + info5.getID() + " already present in VCF header");
            return -1;
        }
        if (header.getInfoHeaderLine(info3.getID()) != null) {
            LOG.error("tag " + info3.getID() + " already present in VCF header");
            return -1;
        }
        header.addMetaDataLine(info5);
        header.addMetaDataLine(info3);
        GenomicSequence chrom = null;
        w.writeHeader(header);
        while (in.hasNext()) {
            final VariantContext ctx = in.next();
            if (chrom == null || !chrom.getChrom().equals(ctx.getContig())) {
                chrom = new GenomicSequence(this.faidx, ctx.getContig());
            }
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            if (ctx.getStart() > 0) {
                final StringBuilder sb = new StringBuilder(this.extend);
                for (int i = 0; i < this.extend; ++i) {
                    final int pos0 = (ctx.getStart() - 2) - i;
                    if (pos0 <= 0)
                        continue;
                    sb.insert(0, chrom.charAt(pos0));
                }
                if (sb.length() > 0)
                    vcb.attribute(info5.getID(), sb.toString());
            }
            {
                final StringBuilder sb = new StringBuilder(this.extend);
                for (int i = 0; i < this.extend; ++i) {
                    int pos0 = ctx.getEnd() + i;
                    if (pos0 >= chrom.length())
                        break;
                    sb.append(chrom.charAt(pos0));
                }
                if (sb.length() > 0)
                    vcb.attribute(info3.getID(), sb.toString());
            }
            w.add(vcb.make());
        }
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(faidx);
    }
}
Also used : VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine)

Example 4 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class Biostar59647 method doWork.

@Override
public int doWork(final List<String> args) {
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samFileReader = null;
    PrintStream pout;
    try {
        GenomicSequence genomicSequence = null;
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
        samFileReader = null;
        final String bamFile = oneFileOrNull(args);
        samFileReader = super.openSamReader(bamFile);
        if (!SequenceUtil.areSequenceDictionariesEqual(indexedFastaSequenceFile.getSequenceDictionary(), samFileReader.getFileHeader().getSequenceDictionary())) {
            LOG.warning("Not the same sequence dictionaries");
        }
        final XMLOutputFactory xmlfactory = XMLOutputFactory.newInstance();
        pout = (outputFile == null ? stdout() : new PrintStream(this.outputFile));
        final XMLStreamWriter w = xmlfactory.createXMLStreamWriter(pout, "UTF-8");
        w.writeStartDocument("UTF-8", "1.0");
        w.writeStartElement("sam");
        w.writeAttribute("bam", (bamFile == null ? "stdin" : bamFile));
        w.writeAttribute("ref", refFile.getPath());
        w.writeComment(getProgramCommandLine());
        final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(samFileReader.getFileHeader().getSequenceDictionary());
        final SAMRecordIterator iter = samFileReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = iter.next();
            progess.watch(rec);
            final byte[] readbases = rec.getReadBases();
            w.writeStartElement("read");
            w.writeStartElement("name");
            w.writeCharacters(rec.getReadName());
            w.writeEndElement();
            w.writeStartElement("sequence");
            w.writeCharacters(new String(readbases));
            w.writeEndElement();
            w.writeStartElement("flags");
            for (SAMFlag f : SAMFlag.values()) {
                w.writeAttribute(f.name(), String.valueOf(f.isSet(rec.getFlags())));
            }
            w.writeCharacters(String.valueOf(rec.getFlags()));
            // flags
            w.writeEndElement();
            if (!rec.getReadUnmappedFlag()) {
                w.writeStartElement("qual");
                w.writeCharacters(String.valueOf(rec.getMappingQuality()));
                w.writeEndElement();
                w.writeStartElement("chrom");
                w.writeAttribute("index", String.valueOf(rec.getReferenceIndex()));
                w.writeCharacters(rec.getReferenceName());
                w.writeEndElement();
                w.writeStartElement("pos");
                w.writeCharacters(String.valueOf(rec.getAlignmentStart()));
                w.writeEndElement();
                w.writeStartElement("cigar");
                w.writeCharacters(rec.getCigarString());
                w.writeEndElement();
            }
            if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag()) {
                w.writeStartElement("mate-chrom");
                w.writeAttribute("index", String.valueOf(rec.getMateReferenceIndex()));
                w.writeCharacters(rec.getMateReferenceName());
                w.writeEndElement();
                w.writeStartElement("mate-pos");
                w.writeCharacters(String.valueOf(rec.getMateAlignmentStart()));
                w.writeEndElement();
            }
            if (!rec.getReadUnmappedFlag() && rec.getCigar() != null) {
                if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                    genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
                }
                w.writeStartElement("align");
                int readIndex = 0;
                int refIndex = rec.getAlignmentStart();
                for (final CigarElement e : rec.getCigar().getCigarElements()) {
                    switch(e.getOperator()) {
                        // ignore hard clips
                        case H:
                            break;
                        // ignore pads
                        case P:
                            break;
                        // cont.
                        case I:
                        case S:
                            {
                                final int length = e.getLength();
                                for (int i = 0; i < length; ++i) {
                                    w.writeEmptyElement(e.getOperator().name());
                                    w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                                    if (readIndex >= 0 && readIndex < readbases.length) {
                                        w.writeAttribute("read-base", String.valueOf((char) (readbases[readIndex])));
                                    }
                                    readIndex++;
                                }
                                break;
                            }
                        // cont. -- reference skip
                        case N:
                        case D:
                            {
                                final int length = e.getLength();
                                for (int i = 0; i < length; ++i) {
                                    w.writeEmptyElement(e.getOperator().name());
                                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                                        w.writeAttribute("ref-base", String.valueOf(genomicSequence.charAt(refIndex - 1)));
                                    }
                                    refIndex++;
                                }
                                break;
                            }
                        case M:
                        case EQ:
                        case X:
                            {
                                final int length = e.getLength();
                                for (int i = 0; i < length; ++i) {
                                    w.writeEmptyElement(e.getOperator().name());
                                    char baseRead = '\0';
                                    if (readIndex >= 0 && readIndex < readbases.length) {
                                        baseRead = (char) (rec.getReadBases()[readIndex]);
                                        w.writeAttribute("read-index", String.valueOf(readIndex + 1));
                                        w.writeAttribute("read-base", String.valueOf(baseRead));
                                    }
                                    w.writeAttribute("ref-index", String.valueOf(refIndex));
                                    if (refIndex >= 1 && refIndex <= genomicSequence.length()) {
                                        char baseRef = genomicSequence.charAt(refIndex - 1);
                                        w.writeAttribute("ref-base", String.valueOf(baseRef));
                                        if (Character.toUpperCase(baseRef) != Character.toUpperCase(baseRead)) {
                                            w.writeAttribute("mismatch", "true");
                                        }
                                    }
                                    refIndex++;
                                    readIndex++;
                                }
                                break;
                            }
                        default:
                            throw new IllegalStateException("Case statement didn't deal with cigar op: " + e.getOperator());
                    }
                }
                w.writeEndElement();
            }
            w.writeEndElement();
        }
        iter.close();
        w.writeEndElement();
        w.writeEndDocument();
        w.flush();
        pout.flush();
        CloserUtil.close(w);
        CloserUtil.close(pout);
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(samFileReader);
        CloserUtil.close(indexedFastaSequenceFile);
    }
    return 0;
}
Also used : PrintStream(java.io.PrintStream) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMFlag(htsjdk.samtools.SAMFlag) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) SAMRecord(htsjdk.samtools.SAMRecord)

Example 5 with GenomicSequence

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence in project jvarkit by lindenb.

the class ExtendReferenceWithReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidx == null) {
        LOG.error("No REF defined");
        return -1;
    }
    this.samReaders.clear();
    PrintStream out = null;
    try {
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(faidx);
        SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null) {
            LOG.error("Reference file is missing a sequence dictionary (use picard)");
            return -1;
        }
        final SamReaderFactory srf = super.createSamReaderFactory();
        srf.setOption(Option.CACHE_FILE_BASED_INDEXES, true);
        for (final String uri : IOUtils.unrollFiles(args)) {
            LOG.info("opening BAM " + uri);
            final SamReader sr = srf.open(SamInputResource.of(uri));
            /* doesn't work with remote ?? */
            if (!sr.hasIndex()) {
                LOG.error("file " + uri + " is not indexed");
                return -1;
            }
            final SAMFileHeader header = sr.getFileHeader();
            if (!header.getSortOrder().equals(SortOrder.coordinate)) {
                LOG.error("file " + uri + " not sorted on coordinate but " + header.getSortOrder());
                return -1;
            }
            final SAMSequenceDictionary dict2 = header.getSequenceDictionary();
            if (dict2 == null) {
                LOG.error("file " + uri + " needs a sequence dictionary");
                return -1;
            }
            samReaders.add(sr);
        }
        if (samReaders.isEmpty()) {
            LOG.error("No BAM defined");
            return -1;
        }
        out = super.openFileOrStdoutAsPrintStream(this.outputFile);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            final GenomicSequence genomic = new GenomicSequence(this.indexedFastaSequenceFile, ssr.getSequenceName());
            int chromStart = 0;
            int nPrinted = 0;
            out.print(">");
            out.print(ssr.getSequenceName());
            for (final Rescue side : Rescue.values()) {
                switch(side) {
                    case LEFT:
                        /* look before position 0 of chromosome */
                        {
                            final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, -1, -1, side);
                            int newstart = 0;
                            for (final Integer pos : pos2bases.keySet()) {
                                if (pos >= 0)
                                    continue;
                                newstart = Math.min(newstart, pos);
                            }
                            while (newstart < 0) {
                                final Counter<Byte> count = pos2bases.get(newstart);
                                if (nPrinted % 60 == 0)
                                    out.println();
                                out.print(consensus(count));
                                newstart++;
                                ++nPrinted;
                            }
                            break;
                        }
                    case RIGHT:
                        /* look after position > length(chromosome) */
                        {
                            final Map<Integer, Counter<Byte>> pos2bases = this.scanRegion(ssr, -1, -1, side);
                            int newend = ssr.getSequenceLength();
                            for (final Integer pos : pos2bases.keySet()) {
                                if (pos < ssr.getSequenceLength())
                                    continue;
                                newend = Math.max(newend, pos + 1);
                            }
                            for (int i = ssr.getSequenceLength(); i < newend; i++) {
                                final Counter<Byte> count = pos2bases.get(i);
                                if (nPrinted % 60 == 0)
                                    out.println();
                                out.print(consensus(count));
                                ++nPrinted;
                            }
                            break;
                        }
                    case CENTER:
                        /* 0 to chromosome-length */
                        {
                            chromStart = 0;
                            while (chromStart < genomic.length()) {
                                final char base = Character.toUpperCase(genomic.charAt(chromStart));
                                if (base != 'N') {
                                    progress.watch(ssr.getSequenceName(), chromStart);
                                    if (nPrinted % 60 == 0)
                                        out.println();
                                    out.print(base);
                                    ++chromStart;
                                    ++nPrinted;
                                    continue;
                                }
                                int chromEnd = chromStart + 1;
                                while (chromEnd < genomic.length() && Character.toUpperCase(genomic.charAt(chromEnd)) == 'N') {
                                    ++chromEnd;
                                }
                                if (chromEnd - chromStart < this.minLenNNNNContig) {
                                    while (chromStart < chromEnd) {
                                        progress.watch(ssr.getSequenceName(), chromStart);
                                        if (nPrinted % 60 == 0)
                                            out.println();
                                        out.print(base);
                                        ++chromStart;
                                        ++nPrinted;
                                    }
                                    continue;
                                }
                                final Map<Integer, Counter<Byte>> pos2bases = scanRegion(ssr, chromStart, chromEnd, side);
                                while (chromStart < chromEnd) {
                                    final Counter<Byte> count = pos2bases.get(chromStart);
                                    if (nPrinted % 60 == 0)
                                        out.println();
                                    if (count == null) {
                                        out.print('N');
                                    } else {
                                        out.print(consensus(count));
                                    }
                                    ++chromStart;
                                    ++nPrinted;
                                    continue;
                                }
                            }
                            break;
                        }
                }
            // end switch type
            }
            out.println();
        }
        progress.finish();
        out.flush();
        out.close();
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(this.indexedFastaSequenceFile);
        for (final SamReader r : samReaders) CloserUtil.close(r);
        samReaders.clear();
    }
}
Also used : PrintStream(java.io.PrintStream) SamReaderFactory(htsjdk.samtools.SamReaderFactory) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Counter(com.github.lindenb.jvarkit.util.Counter) SAMFileHeader(htsjdk.samtools.SAMFileHeader) HashMap(java.util.HashMap) Map(java.util.Map)

Aggregations

GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)24 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)15 ArrayList (java.util.ArrayList)12 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)10 CigarElement (htsjdk.samtools.CigarElement)10 SAMRecord (htsjdk.samtools.SAMRecord)10 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)10 SamReader (htsjdk.samtools.SamReader)10 VCFHeader (htsjdk.variant.vcf.VCFHeader)10 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)9 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)8 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)8 SAMFileHeader (htsjdk.samtools.SAMFileHeader)7 Allele (htsjdk.variant.variantcontext.Allele)7 Cigar (htsjdk.samtools.Cigar)6 VariantContext (htsjdk.variant.variantcontext.VariantContext)6 HashSet (java.util.HashSet)6 CigarOperator (htsjdk.samtools.CigarOperator)5 Genotype (htsjdk.variant.variantcontext.Genotype)5 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)5