Search in sources :

Example 6 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class KnownGenesToBed method scan.

private void scan(final BufferedReader r) throws IOException {
    String line;
    final Pattern tab = Pattern.compile("[\t]");
    while ((line = r.readLine()) != null) {
        if (out.checkError())
            break;
        final String[] tokens = tab.split(line);
        final KnownGene kg = new KnownGene(tokens);
        if (!this.hide_transcripts)
            print(kg, kg.getTxStart(), kg.getTxEnd(), "TRANSCRIPT", kg.getName());
        for (int i = 0; i < kg.getExonCount(); ++i) {
            final KnownGene.Exon exon = kg.getExon(i);
            if (!this.hide_exons)
                print(kg, exon.getStart(), exon.getEnd(), "EXON", exon.getName());
            if (!this.hide_utr && kg.getCdsStart() > exon.getStart()) {
                print(kg, exon.getStart(), Math.min(kg.getCdsStart(), exon.getEnd()), "UTR", "UTR" + (kg.isPositiveStrand() ? "5" : "3"));
            }
            if (!this.hide_cds && !(kg.getCdsStart() >= exon.getEnd() || kg.getCdsEnd() < exon.getStart())) {
                print(kg, Math.max(kg.getCdsStart(), exon.getStart()), Math.min(kg.getCdsEnd(), exon.getEnd()), "CDS", exon.getName());
            }
            KnownGene.Intron intron = exon.getNextIntron();
            if (!this.hide_introns && intron != null) {
                print(kg, intron.getStart(), intron.getEnd(), "INTRON", intron.getName());
            }
            if (!this.hide_utr && kg.getCdsEnd() < exon.getEnd()) {
                print(kg, Math.max(kg.getCdsEnd(), exon.getStart()), exon.getEnd(), "UTR", "UTR" + (kg.isPositiveStrand() ? "3" : "5"));
            }
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene)

Example 7 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class Gff2KnownGeneTest method test01.

@Test(dataProvider = "gff-data")
public void test01(final String inputFile) throws IOException {
    final File out = super.createTmpFile(".kg");
    Assert.assertEquals(0, new Gff2KnownGene().instanceMain(new String[] { "-o", out.getPath(), inputFile }));
    final BufferedReader r = IOUtils.openFileForBufferedReading(out);
    Assert.assertTrue(r.lines().map(L -> new KnownGene(L.split("[\t]"))).count() > 0);
    r.close();
}
Also used : Assert(org.testng.Assert) DataProvider(org.testng.annotations.DataProvider) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) IOException(java.io.IOException) Test(org.testng.annotations.Test) BufferedReader(java.io.BufferedReader) TestUtils(com.github.lindenb.jvarkit.tools.tests.TestUtils) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) File(java.io.File) BufferedReader(java.io.BufferedReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) File(java.io.File) Test(org.testng.annotations.Test)

Example 8 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VcfToPostscript method run.

private void run(final VcfIterator in) {
    for (; ; ) {
        VariantContext ctx = null;
        if (in.hasNext())
            ctx = in.next();
        if (ctx == null || this.genes.isEmpty() || (!this.genes.isEmpty() && !this.genes.get(0).getContig().equals(ctx.getContig())) || (!this.genes.isEmpty() && this.chromEnd <= ctx.getStart())) {
            this.print();
            if (this.outw.checkError())
                return;
            if (ctx == null)
                return;
            this.clear();
            if (chrom2knownGenes.containsKey(ctx.getContig())) {
                for (KnownGene g : chrom2knownGenes.get(ctx.getContig())) {
                    if (this.genes.isEmpty()) {
                        if (g.getTxEnd() <= ctx.getStart() || g.getTxStart() > ctx.getEnd()) {
                            continue;
                        }
                        this.addGene(g);
                    } else {
                        if (!(g.getTxStart() > this.chromEnd || g.getTxEnd() <= this.chromStart)) {
                            this.addGene(g);
                        }
                    }
                }
                if (genes.isEmpty()) {
                    LOG.debug("no gene for " + ctx.getContig() + ":" + ctx.getStart());
                }
            } else {
                LOG.debug("not any gene for " + ctx.getContig());
            }
        }
        if (!genes.isEmpty() && ctx.getStart() - 1 >= this.chromStart && ctx.getStart() <= this.chromEnd) {
            this.addVariant(ctx);
        }
    }
}
Also used : VariantContext(htsjdk.variant.variantcontext.VariantContext) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene)

Example 9 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class VcfToSvg method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.outputFile != null && !outputFile.getName().contains(SEGMENT)) {
        LOG.error("output file must contain the word " + SEGMENT + " :" + this.outputFile);
        return -1;
    }
    TabixKnownGeneFileReader tabix = null;
    VcfIterator r = null;
    OutputStream outputStream = null;
    XMLStreamWriter w = null;
    PrintWriter manifestW = null;
    try {
        LOG.info("opening knownGene ");
        tabix = new TabixKnownGeneFileReader(knownGeneUri);
        if (manifestFile != null && this.outputFile != null) {
            manifestW = new PrintWriter(manifestFile);
        } else {
            manifestW = new PrintWriter(new NullOuputStream());
        }
        final Set<String> chromosomes = tabix.getChromosomes();
        final XMLOutputFactory xof = XMLOutputFactory.newInstance();
        r = super.openVcfIterator(super.oneFileOrNull(args));
        final VCFHeader header = r.getHeader();
        while (r.hasNext()) {
            final VariantContext ctx = r.next();
            String tabixContig = ctx.getContig();
            if (!chromosomes.contains(tabixContig)) {
                if (tabixContig.startsWith("chr")) {
                    tabixContig = tabixContig.substring(3);
                } else if (!tabixContig.startsWith("chr")) {
                    tabixContig = "chr" + tabixContig;
                }
                if (!chromosomes.contains(tabixContig)) {
                    while (r.hasNext()) {
                        final VariantContext ctx2 = r.peek();
                        if (!ctx2.getContig().equals(ctx.getContig()))
                            break;
                        r.next();
                    }
                    LOG.error("No chromosome " + ctx.getContig() + " in " + knownGeneUri + ". Check the chromosome nomenclature.");
                    continue;
                }
            }
            final List<VariantContext> variants = new ArrayList<>();
            final List<KnownGene> genes = new ArrayList<>();
            variants.add(ctx);
            int chromStart = ctx.getStart() - 1;
            int chromEnd = ctx.getEnd();
            /* walk over know gene, loop until there is no overapping transcript 
			 * over that region */
            for (; ; ) {
                genes.clear();
                /* the max chromEnd, let's see if we can get a bigger */
                int newStart = chromStart;
                int newEnd = chromEnd;
                final Iterator<KnownGene> kgr = tabix.iterator(tabixContig, chromStart, chromEnd);
                while (kgr.hasNext()) {
                    final KnownGene g = kgr.next();
                    if (this.removeNonCoding && g.isNonCoding())
                        continue;
                    genes.add(g);
                    newStart = Math.min(g.getTxStart(), newStart);
                    newEnd = Math.max(g.getTxEnd(), newEnd);
                }
                if (newStart >= chromStart && newEnd <= chromEnd) {
                    break;
                }
                chromStart = newStart;
                chromEnd = newEnd;
            }
            // intergenic, no gene over that variant
            if (genes.isEmpty())
                continue;
            // fill the variant for that region
            while (r.hasNext()) {
                final VariantContext ctx2 = r.peek();
                if (!ctx2.getContig().equals(ctx.getContig()))
                    break;
                if (ctx2.getStart() > chromEnd)
                    break;
                variants.add(r.next());
            }
            if (this.variantsInExonOnly) {
                variants.removeIf(V -> {
                    for (final KnownGene gene : genes) {
                        for (final KnownGene.Exon exon : gene.getExons()) {
                            if (V.getEnd() < exon.getStart() || V.getStart() >= exon.getEnd()) {
                            // rien
                            } else {
                                return false;
                            }
                        }
                    }
                    return true;
                });
            }
            if (this.variantFILTEREDOpacity <= 0) {
                variants.removeIf(V -> V.isFiltered());
            }
            if (this.variantIndelOpacity <= 0) {
                variants.removeIf(V -> V.isIndel());
            }
            if (variants.isEmpty())
                continue;
            LOG.info("Variants (" + variants.size() + ") Transcripts (" + genes.size() + ") " + tabixContig + ":" + chromStart + "-" + chromEnd);
            if (outputFile != null) {
                File fname = new File(outputFile.getParentFile(), outputFile.getName().replaceAll("__SEGMENT__", ctx.getContig() + "_" + chromStart + "_" + chromEnd));
                LOG.info("saving as " + fname);
                outputStream = IOUtils.openFileForWriting(fname);
                w = xof.createXMLStreamWriter(outputStream);
                manifestW.println(ctx.getContig() + "\t" + chromStart + "\t" + chromEnd + "\t" + genes.stream().map(G -> G.getName()).collect(Collectors.joining(",")) + "\t" + genes.size() + "\t" + variants.size() + "\t" + fname);
            } else {
                w = xof.createXMLStreamWriter(stdout());
            }
            double featureHeight = 10;
            double TRANSCRIPT_HEIGHT = featureHeight;
            final int all_genotypes_width = variants.size() * this.genotype_width;
            if (trimToVariants) {
                chromStart = variants.stream().map(V -> V.getStart() - 1).min((A, B) -> A.compareTo(B)).get();
                chromEnd = variants.stream().map(V -> V.getEnd() + 1).max((A, B) -> A.compareTo(B)).get();
            }
            final int drawinAreaWidth = Math.max(all_genotypes_width, 1000);
            final Interval interval = new Interval(ctx.getContig(), chromStart, chromEnd);
            final int interline_weight = 6;
            final int margin_top = 10;
            final int margin_bottom = 10;
            final int margin_right = 100;
            final int margin_left = 100;
            w.writeStartDocument("UTF-8", "1.0");
            w.writeStartElement("svg");
            w.writeDefaultNamespace(SVG.NS);
            w.writeNamespace("xlink", XLINK.NS);
            w.writeAttribute("version", "1.1");
            w.writeAttribute("width", String.valueOf(margin_right + margin_right + drawinAreaWidth));
            w.writeAttribute("height", String.valueOf(margin_top + margin_bottom + genes.size() * TRANSCRIPT_HEIGHT + interline_weight * featureHeight + header.getSampleNamesInOrder().size() * this.genotype_width));
            title(w, ctx.getContig() + ":" + chromStart + "-" + chromEnd);
            w.writeStartElement("desc");
            w.writeCharacters("generated with " + getProgramName() + "\n" + "Author: Pierre Lindenbaum PhD. @yokofakun .");
            w.writeEndElement();
            // defs
            w.writeStartElement("defs");
            // genotypes
            w.writeStartElement("g");
            // 
            w.writeAttribute("id", "g_" + GenotypeType.HOM_REF);
            w.writeEmptyElement("rect");
            w.writeAttribute("style", "fill:lime;stroke;none;");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", "0");
            w.writeAttribute("width", String.valueOf(this.genotype_width));
            w.writeAttribute("height", String.valueOf(this.genotype_width));
            w.writeEndElement();
            w.writeStartElement("g");
            // 
            w.writeAttribute("id", "g_" + GenotypeType.NO_CALL);
            w.writeEmptyElement("rect");
            w.writeAttribute("style", "fill:silver;stroke;gray;");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", "0");
            w.writeAttribute("width", String.valueOf(this.genotype_width));
            w.writeAttribute("height", String.valueOf(this.genotype_width));
            w.writeEndElement();
            w.writeStartElement("g");
            // 
            w.writeAttribute("id", "g_" + GenotypeType.HOM_VAR);
            w.writeEmptyElement("rect");
            w.writeAttribute("style", "fill:crimson;stroke;none;");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", "0");
            w.writeAttribute("width", String.valueOf(this.genotype_width));
            w.writeAttribute("height", String.valueOf(this.genotype_width));
            w.writeEndElement();
            w.writeStartElement("g");
            // 
            w.writeAttribute("id", "g_" + GenotypeType.MIXED);
            w.writeEmptyElement("rect");
            w.writeAttribute("style", "fill:pink;stroke;none;");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", "0");
            w.writeAttribute("width", String.valueOf(this.genotype_width));
            w.writeAttribute("height", String.valueOf(this.genotype_width));
            w.writeEndElement();
            w.writeStartElement("g");
            // 
            w.writeAttribute("id", "g_" + GenotypeType.UNAVAILABLE);
            w.writeEmptyElement("rect");
            w.writeAttribute("style", "fill:gray;stroke;none;");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", "0");
            w.writeAttribute("width", String.valueOf(this.genotype_width));
            w.writeAttribute("height", String.valueOf(this.genotype_width));
            w.writeEndElement();
            w.writeStartElement("g");
            // 
            w.writeAttribute("id", "g_" + GenotypeType.HET);
            w.writeEmptyElement("rect");
            w.writeAttribute("style", "fill:lime;stroke;black;");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", "0");
            w.writeAttribute("width", String.valueOf(genotype_width));
            w.writeAttribute("height", String.valueOf(genotype_width));
            w.writeEmptyElement("polygon");
            w.writeAttribute("style", "fill:crimson;stroke;black;");
            w.writeAttribute("points", "0,0 " + genotype_width + ",0 0," + genotype_width + " 0,0");
            w.writeEndElement();
            // strand
            w.writeEmptyElement("polyline");
            w.writeAttribute("id", "strandF");
            w.writeAttribute("points", "-5,-5 0,0 -5,5");
            w.writeEmptyElement("polyline");
            w.writeAttribute("id", "strandR");
            w.writeAttribute("points", "5,-5 0,0 5,5");
            // gradients
            w.writeStartElement("linearGradient");
            w.writeAttribute("id", "grad01");
            w.writeAttribute("x1", "50%");
            w.writeAttribute("x2", "50%");
            w.writeAttribute("y1", "0%");
            w.writeAttribute("y2", "100%");
            w.writeEmptyElement("stop");
            w.writeAttribute("offset", "0%");
            w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
            w.writeEmptyElement("stop");
            w.writeAttribute("offset", "50%");
            w.writeAttribute("style", "stop-color:white;stop-opacity:1;");
            w.writeEmptyElement("stop");
            w.writeAttribute("offset", "100%");
            w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
            w.writeEndElement();
            // defs
            w.writeEndElement();
            w.writeStartElement("style");
            w.writeCharacters("svg {fill:none; stroke:black;}\n" + "text {fill:black;stroke:none;font-size:" + (featureHeight / 1.5) + "px;}\n" + ".ruler-label { stroke:red;}\n" + ".frame { stroke:black;fill:none;}\n" + ".kgexon {fill:url(#grad01);stroke:black;}\n" + ".gcpercent {fill:url(#grad02);stroke:black;}" + ".coverage {fill:url(#grad03);stroke:black;}" + ".kgcds {fill:yellow;stroke:black;opacity:0.7;}\n" + ".variant{stroke:none;fill:red;opacity:0.2;}\n" + ".xaxis{stroke:gray;fill:none;opacity:0.2;}\n" + ".postick{font-size:9px;stroke:black;stroke-width:1;}");
            // style
            w.writeEndElement();
            final Function<Integer, Integer> trim = new Function<Integer, Integer>() {

                @Override
                public Integer apply(final Integer t) {
                    return Math.max(interval.getStart(), Math.min(interval.getEnd(), t));
                }
            };
            final Function<Integer, Double> baseToPixel = new Function<Integer, Double>() {

                @Override
                public Double apply(final Integer t) {
                    return margin_left + drawinAreaWidth * (t - (double) interval.getStart()) / ((double) interval.length());
                }
            };
            final Function<Integer, Double> variantIndexToPixel = new Function<Integer, Double>() {

                @Override
                public Double apply(final Integer idx) {
                    final double variant_width = drawinAreaWidth / (double) variants.size();
                    final double midx = variant_width * idx + variant_width / 2.0;
                    return margin_left + midx - genotype_width / 2.0;
                }
            };
            final Function<VariantContext, String> variantTitle = V -> (V.getContig().startsWith("chr") ? V.getContig().substring(3) : V.getContig()) + ":" + V.getStart() + " " + V.getReference().getDisplayString();
            /**
             * title
             */
            double y = 0;
            w.writeStartElement("text");
            w.writeAttribute("x", "0");
            w.writeAttribute("y", String.valueOf(featureHeight));
            w.writeCharacters(interval.toString());
            w.writeEndElement();
            y += featureHeight;
            for (final KnownGene g : genes) {
                int cdsHeigh = 5;
                double exonHeight = TRANSCRIPT_HEIGHT - 5;
                double midY = TRANSCRIPT_HEIGHT / 2;
                w.writeStartElement("g");
                w.writeAttribute("transform", "translate(0," + y + ")");
                title(w, g.getName());
                w.writeStartElement("text");
                w.writeAttribute("x", String.valueOf(margin_left - 10));
                w.writeAttribute("y", String.valueOf(featureHeight));
                w.writeAttribute("style", "text-anchor:end;");
                w.writeCharacters(g.getName());
                w.writeEndElement();
                /* transcript line */
                w.writeEmptyElement("line");
                w.writeAttribute("class", "kgtr");
                w.writeAttribute("x1", String.valueOf(baseToPixel.apply(trim.apply(g.getTxStart()))));
                w.writeAttribute("y1", String.valueOf(midY));
                w.writeAttribute("x2", String.valueOf(baseToPixel.apply(trim.apply(g.getTxEnd()))));
                w.writeAttribute("y2", String.valueOf(midY));
                /* strand symbols */
                for (double pixX = 0; pixX < drawinAreaWidth; pixX += 30) {
                    double pos0 = interval.getStart() + (pixX / (double) drawinAreaWidth) * interval.length();
                    if (pos0 + 1 < g.getTxStart())
                        continue;
                    if (pos0 > g.getTxEnd())
                        break;
                    w.writeEmptyElement("use");
                    w.writeAttribute("class", "kgstrand");
                    w.writeAttribute("xlink", XLINK.NS, "href", "#strand" + (g.isPositiveStrand() ? "F" : "R"));
                    w.writeAttribute("x", String.valueOf(margin_left + pixX));
                    w.writeAttribute("y", String.valueOf(midY));
                }
                /* exons */
                for (KnownGene.Exon exon : g.getExons()) {
                    if (exon.getStart() + 1 >= interval.getEnd())
                        continue;
                    if (exon.getEnd() <= interval.getStart())
                        continue;
                    w.writeStartElement("rect");
                    w.writeAttribute("class", "kgexon");
                    w.writeAttribute("x", String.valueOf(baseToPixel.apply(trim.apply(exon.getStart()))));
                    w.writeAttribute("y", String.valueOf(midY - exonHeight / 2));
                    w.writeAttribute("width", String.valueOf(baseToPixel.apply(trim.apply(exon.getEnd())) - baseToPixel.apply((trim.apply(exon.getStart())))));
                    w.writeAttribute("height", String.valueOf(exonHeight));
                    title(w, exon.getName());
                    w.writeEndElement();
                }
                /* coding line */
                if (!g.isNonCoding()) {
                    w.writeEmptyElement("rect");
                    w.writeAttribute("class", "kgcds");
                    w.writeAttribute("x", String.valueOf(baseToPixel.apply(trim.apply(g.getCdsStart()))));
                    w.writeAttribute("y", String.valueOf(midY - cdsHeigh / 4.0));
                    w.writeAttribute("width", String.valueOf(baseToPixel.apply(trim.apply(g.getCdsEnd())) - baseToPixel.apply((trim.apply((g.getCdsStart()))))));
                    w.writeAttribute("height", String.valueOf(cdsHeigh / 2.0));
                }
                // String label=String.format("%15s", g.getName());
                // w.writeEmptyElement("path");
                // double fontHeight=Math.min(10,0.8*TRANSCRIPT_HEIGHT);
                // w.writeAttribute("d",this.hershey.svgPath(label,-insets.left,midY-fontHeight/2,insets.left*0.9,fontHeight));
                w.writeEndElement();
                w.writeCharacters("\n");
                y += featureHeight;
            }
            /* draw lines to variants */
            for (int vidx = 0; vidx < variants.size(); ++vidx) {
                final VariantContext vc = variants.get(vidx);
                double x1 = baseToPixel.apply(vc.getStart());
                double x2 = baseToPixel.apply(vc.getEnd());
                final double y2 = y + featureHeight * interline_weight;
                w.writeStartElement("polygon");
                w.writeAttribute("style", "fill:" + (vidx % 2 == 0 ? "ghostwhite" : "lavender") + ";stroke:black;opacity:0.6;stroke-width:0.5;");
                w.writeAttribute("points", "" + x1 + "," + (y - featureHeight / 2.0) + " " + x2 + "," + (y - featureHeight / 2.0) + " " + variantIndexToPixel.apply(vidx) + "," + y2 + " " + (variantIndexToPixel.apply(vidx) + this.genotype_width) + "," + y2);
                title(w, variantTitle.apply(vc));
                w.writeEndElement();
            }
            for (int vidx = 0; vidx < variants.size(); ++vidx) {
                final VariantContext vc = variants.get(vidx);
                final double y2 = y + featureHeight * interline_weight;
                w.writeStartElement("text");
                w.writeAttribute("transform", "translate(" + (String.valueOf(variantIndexToPixel.apply(vidx) + genotype_width / 2.0)) + "," + String.valueOf(y2 - 5) + ") " + "rotate(-45)");
                w.writeAttribute("x", "0");
                w.writeAttribute("y", "0");
                w.writeAttribute("class", "postick");
                w.writeCharacters(variantTitle.apply(vc));
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            y += featureHeight * interline_weight;
            w.writeStartElement("g");
            for (final String sample : header.getSampleNamesInOrder()) {
                for (int vidx = 0; vidx < variants.size(); ++vidx) {
                    final VariantContext vc = variants.get(vidx);
                    final Genotype g = vc.getGenotype(sample);
                    double opacity = 1.0;
                    if (vc.isIndel())
                        opacity *= this.variantIndelOpacity;
                    if (vc.isFiltered())
                        opacity *= this.variantFILTEREDOpacity;
                    if (opacity > 1)
                        opacity = 1;
                    if (opacity <= 0)
                        continue;
                    if (opacity < 1) {
                        w.writeStartElement("g");
                        w.writeAttribute("style", "opacity:" + opacity + ";");
                    }
                    w.writeEmptyElement("use");
                    w.writeAttribute("x", "" + variantIndexToPixel.apply(vidx));
                    w.writeAttribute("y", String.valueOf(y));
                    w.writeAttribute("xlink", XLINK.NS, "href", "#g_" + g.getType());
                    if (opacity < 1) {
                        w.writeEndElement();
                    }
                }
                w.writeCharacters("\n");
                w.writeStartElement("text");
                w.writeAttribute("x", String.valueOf(margin_left - 10));
                w.writeAttribute("y", String.valueOf(y + this.genotype_width / 2.0));
                w.writeAttribute("style", "text-anchor:end;");
                w.writeCharacters(sample);
                w.writeEndElement();
                y += this.genotype_width;
            }
            w.writeCharacters("\n");
            w.writeEndDocument();
            w.writeCharacters("\n");
            w.flush();
            w.close();
            if (outputFile != null) {
                outputStream.flush();
                outputStream.close();
                outputStream = null;
            }
            if (stop_at_first) {
                LOG.info("Stop after first SVG document");
                break;
            }
        }
        r.close();
        manifestW.flush();
        manifestW.close();
        manifestW = null;
        return 0;
    } catch (Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(r);
        CloserUtil.close(tabix);
        CloserUtil.close(outputStream);
        CloserUtil.close(manifestW);
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) XLINK(com.github.lindenb.jvarkit.util.ns.XLINK) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VCFHeader(htsjdk.variant.vcf.VCFHeader) Function(java.util.function.Function) SVG(com.github.lindenb.jvarkit.util.svg.SVG) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) XMLStreamException(javax.xml.stream.XMLStreamException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) CloserUtil(htsjdk.samtools.util.CloserUtil) OutputStream(java.io.OutputStream) PrintWriter(java.io.PrintWriter) Iterator(java.util.Iterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) GenotypeType(htsjdk.variant.variantcontext.GenotypeType) Set(java.util.Set) Collectors(java.util.stream.Collectors) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) TabixKnownGeneFileReader(com.github.lindenb.jvarkit.util.ucsc.TabixKnownGeneFileReader) File(java.io.File) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) List(java.util.List) VariantContext(htsjdk.variant.variantcontext.VariantContext) XMLOutputFactory(javax.xml.stream.XMLOutputFactory) OutputStream(java.io.OutputStream) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Function(java.util.function.Function) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) Genotype(htsjdk.variant.variantcontext.Genotype) TabixKnownGeneFileReader(com.github.lindenb.jvarkit.util.ucsc.TabixKnownGeneFileReader) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) File(java.io.File) Interval(htsjdk.samtools.util.Interval)

Example 10 with KnownGene

use of com.github.lindenb.jvarkit.util.ucsc.KnownGene in project jvarkit by lindenb.

the class MapUniProtFeatures method doWork.

@Override
public int doWork(List<String> args) {
    PrintWriter pw = null;
    try {
        JAXBContext jc = JAXBContext.newInstance("org.uniprot");
        Unmarshaller uniprotUnmarshaller = jc.createUnmarshaller();
        Marshaller uniprotMarshaller = jc.createMarshaller();
        pw = super.openFileOrStdoutAsPrintWriter(OUT);
        LOG.info("read " + REF);
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(REF);
        if (this.indexedFastaSequenceFile.getSequenceDictionary() == null) {
            LOG.error("fasta sequence is not indexed");
            return -1;
        }
        LOG.info("readubf " + kgUri);
        String line;
        Pattern tab = Pattern.compile("[\t]");
        BufferedReader r = IOUtils.openURIForBufferedReading(kgUri);
        while ((line = r.readLine()) != null) {
            String[] tokens = tab.split(line);
            KnownGene kg = new KnownGene();
            kg.setName(tokens[0]);
            kg.setChrom(tokens[1]);
            kg.setStrand(tokens[2].charAt(0));
            kg.setTxStart(Integer.parseInt(tokens[3]));
            kg.setTxEnd(Integer.parseInt(tokens[4]));
            kg.setCdsStart(Integer.parseInt(tokens[5]));
            kg.setCdsEnd(Integer.parseInt(tokens[6]));
            kg.setExonBounds(Integer.parseInt(tokens[7]), tokens[8], tokens[9]);
            List<KnownGene> L = prot2genes.get(tokens[10]);
            if (L == null) {
                L = new ArrayList<KnownGene>();
                prot2genes.put(tokens[10], L);
            }
            if (indexedFastaSequenceFile.getSequenceDictionary().getSequence(kg.getContig()) == null) {
                LOG.info("ignoring " + line);
                continue;
            }
            L.add(kg);
        }
        r.close();
        if (OUT != null) {
            LOG.info("opening " + OUT);
            pw = new PrintWriter(OUT);
        }
        LOG.info("knownGenes: " + this.prot2genes.size());
        XMLInputFactory xmlInputFactory = XMLInputFactory.newFactory();
        xmlInputFactory.setProperty(XMLInputFactory.IS_NAMESPACE_AWARE, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_COALESCING, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_REPLACING_ENTITY_REFERENCES, Boolean.TRUE);
        xmlInputFactory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);
        xmlInputFactory.setXMLResolver(new XMLResolver() {

            @Override
            public Object resolveEntity(String arg0, String arg1, String arg2, String arg3) throws XMLStreamException {
                LOG.info("resolveEntity:" + arg0 + "/" + arg1 + "/" + arg2);
                return null;
            }
        });
        // SortingCollection<UBed> mappedFeatures=SortingCollection.newInstance(UBed.class, new UBedCodec(),new UBedCmp(),super.MAX_RECORDS_IN_RAM);
        // mappedFeatures.setDestructiveIteration(true);
        LOG.info("Scanning " + UNIPROT);
        Reader fr = IOUtils.openURIForBufferedReading(UNIPROT);
        XMLEventReader rx = xmlInputFactory.createXMLEventReader(fr);
        final QName uEntry = new QName(UNIPROT_NS, "entry");
        GenomicSequence genomic = null;
        while (rx.hasNext()) {
            XMLEvent evt = rx.peek();
            if (!(evt.isStartElement() && evt.asStartElement().getName().equals(uEntry))) {
                rx.next();
                continue;
            }
            JAXBElement<Entry> jaxbElement = uniprotUnmarshaller.unmarshal(rx, Entry.class);
            Entry entry = jaxbElement.getValue();
            if (entry.getFeature().isEmpty())
                continue;
            List<KnownGene> genes = null;
            for (String acn : entry.getAccession()) {
                genes = prot2genes.get(acn);
                if (genes != null)
                    break;
            }
            if (genes == null)
                continue;
            for (KnownGene kg : genes) {
                if (genomic == null || !genomic.getChrom().equals(kg.getChromosome())) {
                    genomic = new GenomicSequence(this.indexedFastaSequenceFile, kg.getChromosome());
                }
                KnownGene.Peptide pep = kg.getCodingRNA(genomic).getPeptide();
                int sameSequenceLength = 0;
                while (sameSequenceLength < entry.getSequence().getValue().length() && sameSequenceLength < pep.length()) {
                    if (Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength)) != Character.toUpperCase(entry.getSequence().getValue().charAt(sameSequenceLength))) {
                        break;
                    }
                    sameSequenceLength++;
                }
                if (sameSequenceLength != pep.length()) {
                    System.err.println("Not Same sequence " + kg.getName() + " strand " + kg.getStrand() + " ok-up-to-" + sameSequenceLength);
                    System.err.println("P:" + pep.toString() + " " + pep.length());
                    System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
                    if (pep.toString().contains("?")) {
                        System.err.println("RNA:" + pep.getCodingRNA().toString());
                    }
                }
                if (sameSequenceLength == 0)
                    continue;
                for (FeatureType feat : entry.getFeature()) {
                    if (feat.getType() == null || feat.getType().isEmpty())
                        continue;
                    LocationType locType = feat.getLocation();
                    if (locType == null)
                        continue;
                    int pepStart, pepEnd;
                    if (locType.getPosition() != null && locType.getPosition().getPosition() != null) {
                        pepEnd = locType.getPosition().getPosition().intValue();
                        pepStart = pepEnd - 1;
                    } else if (locType.getBegin() != null && locType.getEnd() != null && locType.getBegin().getPosition() != null && locType.getEnd().getPosition() != null) {
                        pepStart = locType.getBegin().getPosition().intValue() - 1;
                        pepEnd = locType.getEnd().getPosition().intValue();
                    } else {
                        continue;
                    }
                    if (pepEnd >= sameSequenceLength)
                        continue;
                    List<Integer> genomicPos = new ArrayList<Integer>();
                    while (pepStart < pepEnd) {
                        if (pepStart >= pep.length()) {
                            System.err.println();
                            System.err.println("P:" + pep.toString() + " " + pep.length() + " " + kg.getStrand());
                            System.err.println("Q:" + entry.getSequence().getValue() + " " + entry.getSequence().getLength());
                            uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
                        }
                        int[] codon = pep.convertToGenomicCoordinates(pepStart);
                        pepStart++;
                        for (int gP : codon) {
                            if (gP == -1) {
                                uniprotMarshaller.marshal(new JAXBElement<FeatureType>(new QName(UNIPROT_NS, "feature"), FeatureType.class, feat), System.err);
                                LOG.error("error in genomoc for (" + pepStart + "/" + pepEnd + "):" + entry.getName());
                                System.exit(-1);
                            }
                            genomicPos.add(gP);
                        }
                    }
                    Collections.sort(genomicPos);
                    UBed ubed = new UBed();
                    ubed.tid = this.indexedFastaSequenceFile.getSequenceDictionary().getSequenceIndex(genomic.getChrom());
                    int k0 = 0;
                    while (k0 < genomicPos.size()) {
                        Range range = new Range(genomicPos.get(k0), genomicPos.get(k0) + 1);
                        k0++;
                        while (k0 < genomicPos.size() && range.end == genomicPos.get(k0)) {
                            range = new Range(range.start, range.end + 1);
                            ++k0;
                        }
                        ubed.positions.add(range);
                    }
                    ubed.strand = (byte) (kg.isPositiveStrand() ? '+' : '-');
                    ubed.name = feat.getType();
                    ubed.print(pw);
                }
            }
        }
        rx.close();
        fr.close();
        LOG.info("End scan uniprot");
    /*
			mappedFeatures.doneAdding();
			
			
			CloseableIterator<UBed> iter=mappedFeatures.iterator();
			while(iter.hasNext())
				{
				UBed ubed=iter.next();
				ubed.print();
				}
			mappedFeatures.cleanup();
			*/
    } catch (Exception err) {
        err.printStackTrace();
        if (OUT != null)
            OUT.deleteOnExit();
        return -1;
    } finally {
        pw.flush();
        pw.close();
        CloserUtil.close(this.indexedFastaSequenceFile);
    }
    return 0;
}
Also used : FeatureType(org.uniprot.FeatureType) ArrayList(java.util.ArrayList) XMLEventReader(javax.xml.stream.XMLEventReader) Reader(java.io.Reader) BufferedReader(java.io.BufferedReader) JAXBContext(javax.xml.bind.JAXBContext) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) Entry(org.uniprot.Entry) XMLEventReader(javax.xml.stream.XMLEventReader) XMLResolver(javax.xml.stream.XMLResolver) Unmarshaller(javax.xml.bind.Unmarshaller) PrintWriter(java.io.PrintWriter) Pattern(java.util.regex.Pattern) Marshaller(javax.xml.bind.Marshaller) QName(javax.xml.namespace.QName) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) XMLStreamException(javax.xml.stream.XMLStreamException) XMLStreamException(javax.xml.stream.XMLStreamException) BufferedReader(java.io.BufferedReader) XMLEvent(javax.xml.stream.events.XMLEvent) KnownGene(com.github.lindenb.jvarkit.util.ucsc.KnownGene) LocationType(org.uniprot.LocationType) XMLInputFactory(javax.xml.stream.XMLInputFactory)

Aggregations

KnownGene (com.github.lindenb.jvarkit.util.ucsc.KnownGene)22 Interval (htsjdk.samtools.util.Interval)11 ArrayList (java.util.ArrayList)9 BufferedReader (java.io.BufferedReader)8 Pattern (java.util.regex.Pattern)8 IOException (java.io.IOException)7 VariantContext (htsjdk.variant.variantcontext.VariantContext)6 VCFHeader (htsjdk.variant.vcf.VCFHeader)5 File (java.io.File)5 HashSet (java.util.HashSet)5 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)4 List (java.util.List)4 Parameter (com.beust.jcommander.Parameter)3 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)3 GeneticCode (com.github.lindenb.jvarkit.util.bio.GeneticCode)3 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)3 Logger (com.github.lindenb.jvarkit.util.log.Logger)3 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)3 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)3 VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)3