Search in sources :

Example 11 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class Biostar398854 method doWork.

/**
 * Prints, as FASTA, the CDS sequence of each transcript after applying the variants of a VCF.
 *
 * <p>Only transcripts that have a CDS and at least one variant starting inside that CDS are
 * reported. For each of them one record is written per VCF sample, followed by one extra
 * record labelled {@code ALL}. Substituted ALT bases are written in upper case, untouched
 * reference bases in lower case, and the sequence is reverse-complemented for transcripts on
 * the negative strand.
 *
 * @param args command line arguments; exactly one VCF path is expected
 * @return 0 on success, -1 if any error was thrown (the error is logged)
 */
@Override
public int doWork(final List<String> args) {
    PrintWriter out = null;
    try {
        this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceSequenceFile);
        try (VCFReader in = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true)) {
            out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
            // effectively-final alias: 'out' is reassigned below and cannot be captured by the lambda
            final PrintWriter final_out = out;
            final List<String> samples = in.getHeader().getSampleNamesInOrder();
            try (GtfReader gtfReader = new GtfReader(this.gtfIn)) {
                // when the VCF declares a dictionary it must match the one of the reference
                final SAMSequenceDictionary dict2 = in.getHeader().getSequenceDictionary();
                if (dict2 != null)
                    SequenceUtil.assertSequenceDictionariesEqual(dict, dict2);
                gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasCDS()).forEach(transcript -> {
                    // variants overlapping the transcript having a plain ATGCN REF and at least one ATGCN ALT
                    final List<VariantContext> variants1;
                    // the stream wraps a CloseableIterator: close it to release the underlying query
                    try (java.util.stream.Stream<VariantContext> queried = in.query(transcript).stream()) {
                        variants1 = queried.filter(V -> V.isVariant() && AcidNucleics.isATGCN(V.getReference()) && V.getAlternateAlleles().stream().anyMatch(A -> AcidNucleics.isATGCN(A))).collect(Collectors.toCollection(ArrayList::new));
                    }
                    if (variants1.isEmpty())
                        return;
                    // every genomic position covered by a CDS of this transcript
                    final int[] positions1 = transcript.getAllCds().stream().flatMapToInt(CDS -> IntStream.rangeClosed(CDS.getStart(), CDS.getEnd())).toArray();
                    // keep the variants starting on a CDS base.
                    // NOTE(review): binarySearch requires a sorted array, this assumes getAllCds() is ordered by position - confirm
                    final List<VariantContext> variants = variants1.stream().filter(V -> Arrays.binarySearch(positions1, V.getStart()) >= 0).collect(Collectors.toCollection(ArrayList::new));
                    if (variants.isEmpty())
                        return;
                    final ReferenceSequence refSeq = this.referenceSequenceFile.getSubsequenceAt(transcript.getContig(), transcript.getStart(), transcript.getEnd());
                    // '<=' is intended: the extra iteration (nSample == samples.size()) produces the "ALL" record
                    for (int nSample = 0; nSample <= samples.size(); nSample++) {
                        final String fastaName = transcript.getId() + " " + transcript.getGene().getId() + " " + transcript.getGene().getGeneName() + " " + (nSample < samples.size() ? samples.get(nSample) : "ALL") + " " + transcript.getContig() + ":" + transcript.getStart() + "-" + transcript.getEnd() + "(" + transcript.getStrand() + ")";
                        final StringBuilder sb = new StringBuilder();
                        int array_index = 0;
                        while (array_index < positions1.length) {
                            final int x1 = positions1[array_index];
                            // 0-based offset of the position in the fetched reference segment
                            final int refseqidx0 = x1 - transcript.getStart();
                            char refbase = (refseqidx0 < 0 || refseqidx0 >= refSeq.length() ? 'N' : (char) refSeq.getBases()[refseqidx0]);
                            final int x1_final = x1;
                            String base = String.valueOf(refbase);
                            final VariantContext ctx = variants.stream().filter(V -> V.getStart() == x1_final).findFirst().orElse(null);
                            // "ALL" record: first ATGCN ALT allele of the variant, whatever the genotypes are
                            Allele alt = ctx == null ? null : ctx.getAlternateAlleles().stream().filter(A -> AcidNucleics.isATGCN(A)).findFirst().orElse(null);
                            if (ctx != null && nSample < samples.size()) {
                                // sample record: first called non-reference ATGCN allele of this sample's genotype
                                final Genotype gt = ctx.getGenotype(nSample);
                                alt = gt.getAlleles().stream().filter(A -> !A.isReference() && !A.isNoCall() && AcidNucleics.isATGCN(A)).findFirst().orElse(null);
                            }
                            if (alt != null) {
                                base = alt.getBaseString().toUpperCase();
                                // skip as many CDS positions as there are bases in the REF allele
                                int i = 0;
                                while (i < ctx.getReference().length() && array_index < positions1.length) {
                                    array_index++;
                                    i++;
                                }
                            } else {
                                base = base.toLowerCase();
                                array_index++;
                            }
                            sb.append(base);
                        }
                        String fastaSeq = transcript.isNegativeStrand() ? AcidNucleics.reverseComplement(sb.toString()) : sb.toString();
                        final_out.print(">");
                        final_out.println(fastaName);
                        final_out.println(fastaSeq);
                    }
                });
            }
            out.flush();
            out.close();
            out = null;
        }
        return 0;
    } catch (final Throwable e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(out);
        CloserUtil.close(this.referenceSequenceFile);
    }
}
Also used : IntStream(java.util.stream.IntStream) Genotype(htsjdk.variant.variantcontext.Genotype) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) ArrayList(java.util.ArrayList) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Collectors(java.util.stream.Collectors) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) Paths(java.nio.file.Paths) VariantContext(htsjdk.variant.variantcontext.VariantContext) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) ReferenceSequence(htsjdk.samtools.reference.ReferenceSequence) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Allele(htsjdk.variant.variantcontext.Allele) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) VCFReader(htsjdk.variant.vcf.VCFReader) PrintWriter(java.io.PrintWriter)

Example 12 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class BamMatrix method doWork.

/**
 * Draws a square matrix comparing two genomic regions of one or two indexed SAM/BAM inputs.
 *
 * <p>Each axis is cut into {@code matrix_size} bins. The value of a cell is the number of
 * names shared by the two sets returned by the {@code ReadCounter} for the bin on the Y axis
 * and the bin on the X axis. The image also carries, in its margins, the sample name(s),
 * optional highlighted BED regions, an optional coverage plot, coordinate labels and, when a
 * GTF is provided, the exons. It is written to {@code outputFile} (PNG when the file name
 * ends with {@code .png}, JPG otherwise) or to stdout as PNG.
 *
 * @param args one input (X and Y share the same reader) or two inputs (X then Y)
 * @return 0 on success, -1 on invalid arguments or if any error was thrown (errors are logged)
 */
@Override
public int doWork(final List<String> args) {
    if (pixel_size < 1) {
        LOG.error("pixel size is too small (" + this.pixel_size + ")");
        return -1;
    }
    // no second region: compare the first region with itself
    if (StringUtils.isBlank(region2Str)) {
        this.region2Str = region1Str;
    }
    try {
        final SamReaderFactory srf = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT);
        if (this.faidx != null)
            srf.referenceSequence(this.faidx);
        final String inputX;
        final String inputY;
        if (args.size() == 1) {
            inputX = args.get(0);
            inputY = null;
        } else if (args.size() == 2) {
            inputX = args.get(0);
            inputY = args.get(1);
        } else {
            LOG.error("illegal number of arguments.");
            return -1;
        }
        this.samReaderX = srf.open(SamInputResource.of(inputX));
        if (!this.samReaderX.hasIndex()) {
            LOG.error("Input " + inputX + " is not indexed");
            return -1;
        }
        this.dict = SequenceDictionaryUtils.extractRequired(this.samReaderX.getFileHeader());
        if (inputY != null) {
            // a second input was given: open it and require the same dictionary as the first one
            this.samReaderY = srf.open(SamInputResource.of(inputY));
            if (!this.samReaderY.hasIndex()) {
                LOG.error("Input " + inputY + " is not indexed");
                return -1;
            }
            SequenceUtil.assertSequenceDictionariesEqual(SequenceDictionaryUtils.extractRequired(this.samReaderY.getFileHeader()), this.dict);
        } else {
            // single input: both axes read from the same reader
            this.samReaderY = this.samReaderX;
        }
        final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(this.dict);
        final Function<String, Optional<SimpleInterval>> intervalParser = IntervalParserFactory.newInstance().dictionary(dict).enableWholeContig().make();
        this.userIntervalX = intervalParser.apply(this.region1Str).orElseThrow(IntervalParserFactory.exception(this.region1Str));
        this.userIntervalY = intervalParser.apply(this.region2Str).orElseThrow(IntervalParserFactory.exception(this.region2Str));
        // adjust intervals so they have the same length: the shortest one is re-centered and
        // widened. Coordinates are closed, hence end = start + length - 1.
        if (this.userIntervalX.getLengthOnReference() > this.userIntervalY.getLengthOnReference()) {
            final int mid = this.userIntervalY.getStart() + this.userIntervalY.getLengthOnReference() / 2;
            final int start = Math.max(1, mid - this.userIntervalX.getLengthOnReference() / 2);
            this.userIntervalY = new SimpleInterval(this.userIntervalY.getContig(), start, start + this.userIntervalX.getLengthOnReference() - 1);
            LOG.warn("Adjusting interval Y to " + this.userIntervalY + " so both intervals have the same length");
        } else if (this.userIntervalY.getLengthOnReference() > this.userIntervalX.getLengthOnReference()) {
            final int mid = this.userIntervalX.getStart() + this.userIntervalX.getLengthOnReference() / 2;
            final int start = Math.max(1, mid - this.userIntervalY.getLengthOnReference() / 2);
            this.userIntervalX = new SimpleInterval(this.userIntervalX.getContig(), start, start + this.userIntervalY.getLengthOnReference() - 1);
            LOG.warn("Adjusting interval X to " + this.userIntervalX + " so both intervals have the same length");
        }
        LOG.info("One pixel is " + (this.userIntervalX.getLengthOnReference() / (double) matrix_size) + " bases");
        final int distance = Math.max(this.userIntervalX.getLengthOnReference(), this.userIntervalY.getLengthOnReference());
        // number of bases covered by one bin of the matrix
        final double pixel2base = distance / (double) matrix_size;
        short max_count = 1;
        // row-major matrix: counts[pixY * matrix_size + pixX]
        final short[] counts = new short[this.matrix_size * this.matrix_size];
        final ReadCounter counter = new MemoryReadCounter();
        /* loop over each pixel 1st axis */
        for (int pixY = 0; pixY < this.matrix_size; pixY++) {
            final int start1 = (int) (this.userIntervalY.getStart() + pixY * pixel2base);
            final int end1 = start1 + (int) pixel2base;
            final Interval qy = new Interval(this.userIntervalY.getContig(), start1, end1);
            if (!qy.overlaps(this.userIntervalY))
                continue;
            final Set<String> set1 = counter.getNamesMatching(1, qy);
            // nothing on this row: no cell can have a common name
            if (set1.isEmpty())
                continue;
            /* loop over each pixel 2nd axis */
            for (int pixX = 0; pixX < this.matrix_size; pixX++) {
                final int start2 = (int) (this.userIntervalX.getStart() + pixX * pixel2base);
                final int end2 = start2 + (int) pixel2base;
                final Interval qx = new Interval(this.userIntervalX.getContig(), start2, end2);
                if (!qx.overlaps(this.userIntervalX))
                    continue;
                if (!validateDisance(qy, qx))
                    continue;
                final int count_common;
                if (qx.compareTo(qy) == 0) {
                    // same bin on both axes: reuse the size of the row set
                    count_common = set1.size();
                } else {
                    final HashSet<String> common = new HashSet<>(set1);
                    common.retainAll(counter.getNamesMatching(0, qx));
                    count_common = common.size();
                }
                // saturate instead of overflowing the short storage
                final short count = count_common > Short.MAX_VALUE ? Short.MAX_VALUE : (short) count_common;
                max_count = (short) Math.max(count, max_count);
                counts[pixY * this.matrix_size + pixX] = count;
            }
        }
        counter.dispose();
        final int font_size = 10;
        final int cov_height = (this.hide_coverage ? 0 : 50);
        final int gene_height = 25;
        // room needed above/left of the matrix for labels, coverage and genes
        final int margin = font_size + cov_height + (this.gtfPath == null ? 0 : gene_height);
        final Insets margins = new Insets(margin, margin, 10, 10);
        final Dimension drawingAreaDim = new Dimension(this.matrix_size + margins.left + margins.right, this.matrix_size + margins.top + margins.bottom);
        final BufferedImage img = new BufferedImage(drawingAreaDim.width, drawingAreaDim.height, BufferedImage.TYPE_INT_RGB);
        final Graphics2D g = img.createGraphics();
        g.setRenderingHint(RenderingHints.KEY_ANTIALIASING, RenderingHints.VALUE_ANTIALIAS_ON);
        g.setColor(Color.WHITE);
        g.fillRect(0, 0, drawingAreaDim.width, drawingAreaDim.height);
        // draw sample
        final Hershey herschey = new Hershey();
        // sample name: first non-blank sample of the read groups, the input name otherwise
        final String sampleX = samReaderX.getFileHeader().getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(inputX);
        final String sampleY = (samReaderX == samReaderY ? sampleX : samReaderY.getFileHeader().getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(inputY));
        final String sample = (sampleX.equals(sampleY) ? sampleX : String.join(" ", sampleX, sampleY));
        g.setColor(Color.DARK_GRAY);
        herschey.paint(g, sample, new Rectangle2D.Double(0, 1, margins.left - 1, font_size));
        // highlight the regions of the BED file, once per axis (0: X, 1: Y)
        for (int side = 0; side < 2 && !StringUtils.isBlank(this.highlightPath); ++side) {
            final int curr_side = side;
            final SimpleInterval r = (side == 0 ? this.userIntervalX : this.userIntervalY);
            final BedLineCodec bedCodec = new BedLineCodec();
            final Composite oldComposite = g.getComposite();
            g.setComposite(AlphaComposite.getInstance(AlphaComposite.SRC_OVER, 0.3f));
            try (BufferedReader br = IOUtils.openURIForBufferedReading(this.highlightPath)) {
                br.lines().filter(L -> !(StringUtils.isBlank(L) || L.startsWith("#"))).map(L -> bedCodec.decode(L)).filter(B -> B != null).filter(K -> converter.apply(K.getContig()) != null && r.getContig().equals(converter.apply(K.getContig()))).filter(K -> CoordMath.overlaps(K.getStart(), K.getEnd(), r.getStart(), r.getEnd())).map(E -> new Interval(converter.apply(E.getContig()), E.getStart() + 1, E.getEnd())).filter(E -> CoordMath.overlaps(E.getStart(), E.getEnd(), r.getStart(), r.getEnd())).map(E -> new Interval(E.getContig(), Math.max(r.getStart(), E.getStart()), Math.min(r.getEnd(), E.getEnd()))).forEach(E -> {
                    // position and length of the clipped region, in pixels
                    double d = ((E.getStart() - r.getStart()) / (double) r.getLengthOnReference()) * matrix_size;
                    double dL = ((E.getLengthOnReference()) / (double) r.getLengthOnReference()) * matrix_size;
                    g.setColor(Color.YELLOW);
                    if (curr_side == 0) {
                        g.fill(new Rectangle2D.Double(d, 0, dL, margins.left));
                    } else {
                        g.fill(new Rectangle2D.Double(0, d, margins.top, dL));
                    }
                });
            }
            g.setComposite(oldComposite);
        }
        // draw the matrix itself, origin moved to its top-left corner
        g.translate(margins.left, margins.top);
        final double logMaxV = Math.log(max_count);
        for (int pix1 = 0; pix1 < this.matrix_size; pix1++) {
            for (int pix2 = 0; pix2 < this.matrix_size; pix2++) {
                final short count = counts[pix1 * this.matrix_size + pix2];
                if (count == 0 || count < this.min_common_names)
                    continue;
                // the higher the count, the darker the cell
                final int gray;
                switch(color_scale) {
                    case LINEAR:
                        gray = 255 - (int) (255 * (count / (double) max_count));
                        break;
                    case LOG:
                        gray = 255 - (int) (255 * ((Math.log(count)) / logMaxV));
                        break;
                    default:
                        throw new IllegalStateException(color_scale.name());
                }
                g.setColor(new Color(gray, 0, 0));
                g.fill(new Rectangle2D.Double(pix1 - pixel_size / 2.0, pix2 - pixel_size / 2.0, pixel_size, pixel_size));
            }
        }
        // draw frame
        g.setColor(Color.GRAY);
        g.drawRect(0, 0, this.matrix_size, this.matrix_size);
        g.translate(-margins.left, -margins.top);
        // used to plot depth
        final double[] coverage = new double[matrix_size];
        // distinct exons overlapping one of the two regions
        final List<SimpleInterval> exonsList;
        if (this.gtfPath == null) {
            exonsList = Collections.emptyList();
        } else {
            try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
                gtfReader.setContigNameConverter(converter);
                exonsList = gtfReader.getAllGenes().stream().filter(K -> K.overlaps(this.userIntervalX) || K.overlaps(this.userIntervalY)).flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasExon()).flatMap(K -> K.getExons().stream()).filter(E -> E.overlaps(this.userIntervalX) || E.overlaps(this.userIntervalY)).map(E -> new SimpleInterval(E)).collect(Collectors.toSet()).stream().collect(Collectors.toList());
            }
        }
        // draw the margin of each axis (0: X on top, 1: Y on the left, rotated by 90 degrees)
        for (int side = 0; side < 2; ++side) {
            final SimpleInterval r = (side == 0 ? this.userIntervalX : this.userIntervalY);
            final AffineTransform oldtr = g.getTransform();
            AffineTransform tr;
            if (side == 0) {
                // horizonal axis
                tr = AffineTransform.getTranslateInstance(margins.left, 1);
            } else {
                // vertical
                tr = AffineTransform.getTranslateInstance(margins.left, margins.top);
                tr.concatenate(AffineTransform.getRotateInstance(Math.PI / 2.0));
            }
            g.setTransform(tr);
            // calculate coverage , do this only once if regionX==regionY
            // NOTE(review): the depth is always read from samReaderX, even for the Y axis -
            // confirm this is intended when two inputs are given
            if (!hide_coverage && !(side == 1 && this.userIntervalX.equals(this.userIntervalY))) {
                Arrays.fill(coverage, 0);
                final int[] count = new int[this.matrix_size];
                final IntervalList intervalList = new IntervalList(this.dict);
                intervalList.add(new Interval(r));
                try (final SamLocusIterator sli = new SamLocusIterator(this.samReaderX, intervalList, true)) {
                    while (sli.hasNext()) {
                        final LocusInfo locusInfo = sli.next();
                        final int pos = locusInfo.getPosition();
                        if (pos < r.getStart() || pos > r.getEnd())
                            continue;
                        final int depth = locusInfo.getRecordAndOffsets().size();
                        final int array_index = (int) (((pos - r.getStart()) / (double) r.getLengthOnReference()) * matrix_size);
                        coverage[array_index] += depth;
                        count[array_index]++;
                    }
                }
                // sum of depths -> mean depth per bin
                for (int i = 0; i < coverage.length; ++i) {
                    if (count[i] == 0)
                        continue;
                    coverage[i] /= count[i];
                }
            }
            // draw ruler
            int y = 0;
            if (!this.hide_coverage) {
                final double observed_max = Arrays.stream(coverage).max().orElse(1);
                // an uncovered region has a max of 0: avoid 0/0 = NaN coordinates in the path
                final double max_cov = observed_max > 0 ? observed_max : 1;
                final GeneralPath gp = new GeneralPath();
                gp.moveTo(0, cov_height);
                for (int x = 0; x < coverage.length; ++x) {
                    gp.lineTo(x, y + cov_height - (coverage[x] / max_cov) * cov_height);
                }
                gp.lineTo(coverage.length, cov_height);
                gp.closePath();
                g.setColor(Color.GRAY);
                g.fill(gp);
                // string for max cov
                String label = StringUtils.niceInt((int) Arrays.stream(coverage).max().orElse(9));
                g.setColor(Color.DARK_GRAY);
                herschey.paint(g, label, new Rectangle2D.Double(matrix_size - label.length() * font_size, y, label.length() * font_size, font_size));
                y += cov_height;
            }
            // draw label
            g.setColor(Color.DARK_GRAY);
            // label is 'start position'
            String label = StringUtils.niceInt(r.getStart());
            herschey.paint(g, label, new Rectangle2D.Double(0, y, label.length() * font_size, font_size));
            // label is 'end position'
            label = StringUtils.niceInt(r.getEnd());
            herschey.paint(g, label, new Rectangle2D.Double(matrix_size - (label.length() * font_size), y, label.length() * font_size, font_size));
            // label is 'chromosome and length'
            label = r.getContig() + " ( " + StringUtils.niceInt(r.getLengthOnReference()) + " bp )";
            herschey.paint(g, label, new Rectangle2D.Double(matrix_size / 2.0 - (label.length() * font_size) / 2.0, y, label.length() * font_size, font_size));
            y += font_size;
            // draw genes
            if (this.gtfPath != null) {
                final double curr_y = y;
                double midy = y + gene_height / 2.0;
                g.setColor(Color.CYAN);
                g.draw(new Line2D.Double(0, midy, matrix_size, midy));
                // exons are clipped to the region before being converted to pixels
                exonsList.stream().filter(E -> E.overlaps(r)).map(E -> new SimpleInterval(E.getContig(), Math.max(r.getStart(), E.getStart()), Math.min(r.getEnd(), E.getEnd()))).forEach(E -> {
                    final double x = ((E.getStart() - r.getStart()) / (double) r.getLengthOnReference()) * matrix_size;
                    final double width = ((E.getLengthOnReference()) / (double) r.getLengthOnReference()) * matrix_size;
                    g.setColor(Color.BLUE);
                    g.fill(new Rectangle2D.Double(x, curr_y, width, gene_height));
                });
            }
            g.setTransform(oldtr);
        }
        g.dispose();
        try {
            if (this.outputFile == null) {
                ImageIO.write(img, "PNG", stdout());
            } else {
                ImageIO.write(img, this.outputFile.getName().endsWith(".png") ? "PNG" : "JPG", this.outputFile);
            }
        } catch (final IOException err) {
            throw new RuntimeIOException(err);
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(this.samReaderX);
        CloserUtil.close(this.samReaderY);
    }
}
Also used : Color(java.awt.Color) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Rectangle2D(java.awt.geom.Rectangle2D) RenderingHints(java.awt.RenderingHints) IntervalParserFactory(com.github.lindenb.jvarkit.samtools.util.IntervalParserFactory) AlignmentBlock(htsjdk.samtools.AlignmentBlock) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) ImageIO(javax.imageio.ImageIO) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Composite(java.awt.Composite) BufferedImage(java.awt.image.BufferedImage) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) LocusInfo(htsjdk.samtools.util.SamLocusIterator.LocusInfo) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) AffineTransform(java.awt.geom.AffineTransform) Collectors(java.util.stream.Collectors) SAMRecord(htsjdk.samtools.SAMRecord) Dimension(java.awt.Dimension) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) CoordMath(htsjdk.samtools.util.CoordMath) Optional(java.util.Optional) GeneralPath(java.awt.geom.GeneralPath) SamReaderFactory(htsjdk.samtools.SamReaderFactory) Insets(java.awt.Insets) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) SAMUtils(htsjdk.samtools.SAMUtils) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) AlphaComposite(java.awt.AlphaComposite) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) Graphics2D(java.awt.Graphics2D) 
RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Line2D(java.awt.geom.Line2D) Locatable(htsjdk.samtools.util.Locatable) Hershey(com.github.lindenb.jvarkit.util.hershey.Hershey) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) IntervalList(htsjdk.samtools.util.IntervalList) IOException(java.io.IOException) SamReader(htsjdk.samtools.SamReader) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SamInputResource(htsjdk.samtools.SamInputResource) QueryInterval(htsjdk.samtools.QueryInterval) BufferedReader(java.io.BufferedReader) Collections(java.util.Collections) Insets(java.awt.Insets) GeneralPath(java.awt.geom.GeneralPath) Line2D(java.awt.geom.Line2D) BufferedImage(java.awt.image.BufferedImage) IntervalList(htsjdk.samtools.util.IntervalList) LocusInfo(htsjdk.samtools.util.SamLocusIterator.LocusInfo) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) HashSet(java.util.HashSet) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) SamReaderFactory(htsjdk.samtools.SamReaderFactory) Optional(java.util.Optional) Composite(java.awt.Composite) AlphaComposite(java.awt.AlphaComposite) Color(java.awt.Color) Hershey(com.github.lindenb.jvarkit.util.hershey.Hershey) Rectangle2D(java.awt.geom.Rectangle2D) Dimension(java.awt.Dimension) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) Graphics2D(java.awt.Graphics2D) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) SamLocusIterator(htsjdk.samtools.util.SamLocusIterator) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) BufferedReader(java.io.BufferedReader) AffineTransform(java.awt.geom.AffineTransform) 
SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Interval(htsjdk.samtools.util.Interval) QueryInterval(htsjdk.samtools.QueryInterval)

Example 13 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class VcfGtfSplitter method doWork.

/**
 * Splits an indexed VCF into one output per gene, or per transcript when
 * {@code split_by_transcript} is set, using the genes of a GTF file.
 *
 * <p>The method parses {@code featuresString} to enable the requested features, loads and
 * sorts the genes (optionally restricted to protein-coding genes and/or to
 * {@code limitToContigs}), then delegates each gene or transcript to {@code split(...)}
 * which receives the archive, a temporary VCF path and the manifest writer.
 *
 * @param args command line arguments; exactly one VCF path is expected
 * @return {@code RETURN_OK} on success, -1 on an unknown feature code, an unknown contig or
 *         any exception (errors are logged)
 */
@Override
public int doWork(final List<String> args) {
    ArchiveFactory archiveFactory = null;
    PrintWriter manifest = null;
    VCFReader vcfFileReader = null;
    // declared here so the temporary file can be removed in 'finally' on every exit path
    Path tmpVcf = null;
    try {
        this.attCleaner = AttributeCleaner.compile(this.xannotatePattern);
        // feature codes are separated by ';', ',' or a space
        for (final String s : featuresString.split("[;, ]")) {
            if (StringUtils.isBlank(s))
                continue;
            if (s.equals("cds")) {
                use_cds = true;
            } else if (s.equals("intron")) {
                use_intron = true;
            } else if (s.equals("exon")) {
                use_exon = true;
            } else if (s.equals("stop")) {
                use_stop = true;
            } else if (s.equals("start")) {
                use_start = true;
            } else if (s.equals("transcript")) {
                // a whole transcript is its exons plus its introns
                use_exon = true;
                use_intron = true;
            } else if (s.equals("utr5")) {
                use_utr5 = true;
            } else if (s.equals("utr3")) {
                use_utr3 = true;
            } else if (s.equals("utr")) {
                use_utr3 = true;
                use_utr5 = true;
            } else if (s.equals("upstream")) {
                use_upstream = true;
            } else if (s.equals("downstream")) {
                use_downstream = true;
            } else if (s.equals("splice")) {
                use_splice = true;
            } else if (s.equals("cds_utr5")) {
                use_cds_utr5 = true;
            } else if (s.equals("cds_utr3")) {
                use_cds_utr3 = true;
            } else if (s.equals("cds_utr")) {
                use_cds_utr3 = true;
                use_cds_utr5 = true;
            } else {
                LOG.error("unknown code " + s + " in " + this.featuresString);
                return -1;
            }
        }
        tmpVcf = Files.createTempFile("tmp.", (use_bcf ? FileExtensions.BCF : FileExtensions.COMPRESSED_VCF));
        String input = oneAndOnlyOneFile(args);
        vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
        final VCFHeader header1 = vcfFileReader.getHeader();
        final SAMSequenceDictionary dict = header1.getSequenceDictionary();
        // the dictionary is only mandatory for BCF output
        if (dict == null && this.use_bcf) {
            throw new JvarkitException.VcfDictionaryMissing(input);
        }
        // normalize the user's contig names to the names used by the VCF dictionary
        if (dict != null && !limitToContigs.isEmpty()) {
            final ContigNameConverter ctgNameConverter = ContigNameConverter.fromOneDictionary(dict);
            final Set<String> set2 = new HashSet<>(this.limitToContigs.size());
            for (final String ctg : this.limitToContigs) {
                final String ctg2 = ctgNameConverter.apply(ctg);
                if (StringUtils.isBlank(ctg2)) {
                    LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(ctg, dict));
                    return -1;
                }
                set2.add(ctg2);
            }
            this.limitToContigs = set2;
        }
        final List<Gene> all_genes;
        try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
            final Comparator<Gene> cmp;
            if (dict != null) {
                // sort using the order of the contigs in the dictionary
                gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
                cmp = new ContigDictComparator(dict).createLocatableComparator();
            } else {
                // no dictionary: sort on the contig name, then on the start position
                cmp = (A, B) -> {
                    final int i = A.getContig().compareTo(B.getContig());
                    if (i != 0)
                        return i;
                    return Integer.compare(A.getStart(), B.getStart());
                };
            }
            all_genes = gtfReader.getAllGenes().stream().filter(G -> {
                if (this.protein_coding_only && !"protein_coding".equals(G.getGeneBiotype()))
                    return false;
                if (this.limitToContigs.isEmpty())
                    return true;
                return this.limitToContigs.contains(G.getContig());
            }).sorted(cmp).collect(Collectors.toList());
        }
        archiveFactory = ArchiveFactory.open(this.outputFile);
        archiveFactory.setCompressionLevel(0);
        // without a manifest file the lines are sent to a null stream
        manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
        manifest.println("#chrom\tstart\tend\tGene-Id\tGene-Name\tGene-Biotype\tTranscript-Id\tpath\tCount_Variants");
        if (this.split_by_transcript) {
            final Iterator<Transcript> triter = all_genes.stream().flatMap(G -> G.getTranscripts().stream()).iterator();
            while (triter.hasNext()) {
                final Transcript tr = triter.next();
                final AbstractSplitter splitter = new TranscriptSplitter(tr);
                this.split(splitter, vcfFileReader, header1, dict, archiveFactory, tmpVcf, manifest);
            }
        } else {
            for (Gene gene : all_genes) {
                final AbstractSplitter splitter = new GeneSplitter(gene);
                this.split(splitter, vcfFileReader, header1, dict, archiveFactory, tmpVcf, manifest);
            }
        }
        vcfFileReader.close();
        vcfFileReader = null;
        manifest.flush();
        manifest.close();
        manifest = null;
        archiveFactory.close();
        // on the success path a failure to delete is still reported as an error
        Files.deleteIfExists(tmpVcf);
        tmpVcf = null;
        return RETURN_OK;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(vcfFileReader);
        CloserUtil.close(archiveFactory);
        CloserUtil.close(manifest);
        // error or early-return path: best-effort removal of the temporary VCF
        if (tmpVcf != null) {
            try {
                Files.deleteIfExists(tmpVcf);
            } catch (final IOException err) {
                LOG.warn("cannot delete temporary file " + tmpVcf);
            }
        }
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) AttributeCleaner(com.github.lindenb.jvarkit.variant.variantcontext.AttributeCleaner) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) TabixIndexCreator(htsjdk.tribble.index.tabix.TabixIndexCreator) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VariantContextWriterBuilder(htsjdk.variant.variantcontext.writer.VariantContextWriterBuilder) HashSet(java.util.HashSet) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) OutputStream(java.io.OutputStream) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Files(java.nio.file.Files) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SimplePosition(com.github.lindenb.jvarkit.samtools.util.SimplePosition) List(java.util.List) 
Intron(com.github.lindenb.jvarkit.util.bio.structure.Intron) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) FileExtensions(htsjdk.samtools.util.FileExtensions) Options(htsjdk.variant.variantcontext.writer.Options) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Comparator(java.util.Comparator) TabixFormat(htsjdk.tribble.index.tabix.TabixFormat) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) VCFReader(htsjdk.variant.vcf.VCFReader) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) Path(java.nio.file.Path) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) IOException(java.io.IOException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader)

Example 14 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class VCFCombineTwoSnvs method loadTranscripts.

/**
 * Loads the transcripts of the GTF file {@code this.gtfPath} into {@code this.knownGenes}.
 *
 * <p>Only transcripts that have both a strand and a CDS are kept. Each one is stored under an
 * {@link Interval} built from the transcript itself; transcripts that map to the same interval
 * share one list. Contig names of the GTF are converted with a {@link ContigNameConverter}
 * created from the sequence dictionary of {@code this.indexedFastaSequenceFile}.
 *
 * @throws IOException declared for failures raised while opening, reading or closing the GTF reader
 */
private void loadTranscripts() throws IOException {
    final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
    final ContigNameConverter ctgNameConverter = ContigNameConverter.fromOneDictionary(dict);
    LOG.info("loading genes from " + this.gtfPath);
    // try-with-resources guarantees the reader is closed and, unlike a silent close in
    // 'finally', lets a failure of close() reach the caller.
    try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
        gtfReader.setContigNameConverter(ctgNameConverter);
        gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasStrand() && T.hasCDS()).forEach(T -> {
            // use 1 based interval
            final Interval interval = new Interval(T);
            List<Transcript> lkg = this.knownGenes.get(interval);
            if (lkg == null) {
                // first transcript seen for this interval: register a new bucket
                lkg = new ArrayList<>(2);
                this.knownGenes.put(interval, lkg);
            }
            lkg.add(T);
        });
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) IOUtil(htsjdk.samtools.util.IOUtil) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) CigarOperator(htsjdk.samtools.CigarOperator) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) DataOutputStream(java.io.DataOutputStream) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Map(java.util.Map) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) GranthamScore(com.github.lindenb.jvarkit.util.bio.GranthamScore) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Collection(java.util.Collection) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) SamReaderFactory(htsjdk.samtools.SamReaderFactory) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) Genotype(htsjdk.variant.variantcontext.Genotype) 
VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) VCFUtils(com.github.lindenb.jvarkit.util.vcf.VCFUtils) Cigar(htsjdk.samtools.Cigar) CloseableIterator(htsjdk.samtools.util.CloseableIterator) PeptideSequence(com.github.lindenb.jvarkit.util.bio.structure.PeptideSequence) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) AcidNucleics(com.github.lindenb.jvarkit.util.bio.AcidNucleics) HashMap(java.util.HashMap) OptionalInt(java.util.OptionalInt) ValidationStringency(htsjdk.samtools.ValidationStringency) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) RNASequenceFactory(com.github.lindenb.jvarkit.util.bio.structure.RNASequenceFactory) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) LinkedHashMap(java.util.LinkedHashMap) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) Interval(htsjdk.samtools.util.Interval) DelegateCharSequence(com.github.lindenb.jvarkit.lang.DelegateCharSequence) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) WeakHashMap(java.util.WeakHashMap) VCFConstants(htsjdk.variant.vcf.VCFConstants) Locatable(htsjdk.samtools.util.Locatable) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) RNASequence(com.github.lindenb.jvarkit.util.bio.structure.RNASequence) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) 
File(java.io.File) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) BufferedReader(java.io.BufferedReader) Comparator(java.util.Comparator) Collections(java.util.Collections) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Interval(htsjdk.samtools.util.Interval)

Example 15 with GtfReader

use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.

the class GtfUpstreamOrfTest method testBasicVcf.

/**
 * Runs {@link GtfUpstreamOrf} on the test resource {@code Homo_sapiens.GRCh37.87.gtf.gz} and
 * checks that the command returns 0 and that its output file can be read back with
 * {@link GtfReader}.
 *
 * <p>The test returns without asserting anything when {@code support.getGRCh37Path()} is empty.
 * Temporary files are removed in every case.
 */
@Test
public void testBasicVcf() throws IOException {
    try {
        final Optional<Path> ref = support.getGRCh37Path();
        if (!ref.isPresent())
            return;
        final Path out = support.createTmpPath(".gtf");
        Assert.assertEquals(new GtfUpstreamOrf().instanceMain(new String[] { "-o", out.toString(), "-R", ref.get().toString(), support.resource("Homo_sapiens.GRCh37.87.gtf.gz") }), 0);
        // try-with-resources: the reader is closed even if getAllGenes() throws
        try (GtfReader gf = new GtfReader(out)) {
            gf.getAllGenes();
        }
    } finally {
        support.removeTmpFiles();
    }
}
Also used : Path(java.nio.file.Path) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) GtfReaderTest(com.github.lindenb.jvarkit.util.bio.structure.GtfReaderTest) Test(org.testng.annotations.Test) KozakSequenceTest(com.github.lindenb.jvarkit.util.bio.KozakSequenceTest) LauncherTest(com.github.lindenb.jvarkit.util.jcommander.LauncherTest) AlsoTest(com.github.lindenb.jvarkit.tests.AlsoTest) KnownGeneTest(com.github.lindenb.jvarkit.util.ucsc.KnownGeneTest) VCFUtilsTest(com.github.lindenb.jvarkit.util.vcf.VCFUtilsTest)

Aggregations

GtfReader (com.github.lindenb.jvarkit.util.bio.structure.GtfReader)23 Path (java.nio.file.Path)22 Parameter (com.beust.jcommander.Parameter)20 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)20 Program (com.github.lindenb.jvarkit.util.jcommander.Program)20 Logger (com.github.lindenb.jvarkit.util.log.Logger)20 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)20 List (java.util.List)20 Collectors (java.util.stream.Collectors)18 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)15 Interval (htsjdk.samtools.util.Interval)15 ArrayList (java.util.ArrayList)15 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)14 Transcript (com.github.lindenb.jvarkit.util.bio.structure.Transcript)14 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)14 CloserUtil (htsjdk.samtools.util.CloserUtil)14 IntervalTreeMap (htsjdk.samtools.util.IntervalTreeMap)13 Locatable (htsjdk.samtools.util.Locatable)13 Set (java.util.Set)13 VariantContext (htsjdk.variant.variantcontext.VariantContext)12