Search in sources :

Example 1 with GTFLine

use of com.github.lindenb.jvarkit.util.bio.gtf.GTFLine in project jvarkit by lindenb.

the class Gff2KnownGene method doWork.

/**
 * Reads a GTF/GFF stream, groups its records by (contig, transcript) and prints one
 * tab-delimited row per transcript.
 *
 * <p>Each row contains, in order: an optional bin (when {@code writeBin} is set), the
 * transcript name, the contig, the strand, the 0-based transcript start, the transcript end,
 * the 0-based CDS start, the CDS end (both equal to the transcript start when there is no
 * CDS record), the exon count, the comma-separated 0-based exon starts, the comma-separated
 * exon ends, a ';'-separated list of selected attribute values, and the transcript name again.
 * Given the class name, this is presumably the UCSC "knownGene" layout; confirm against the
 * consumer of the output.
 *
 * @param args command-line arguments; at most one input path/URI, standard input otherwise
 * @return {@code RETURN_OK} on success, {@code -1} when the input is inconsistent (bad strand,
 *         conflicting contig/name, transcript defined twice), when writing fails, or when any
 *         exception is raised
 */
@Override
public int doWork(final List<String> args) {
    this.gtfCodec = this.formatChooser.makeCodec();
    LineIterator in = null;
    EqualRangeIterator<GffLine> eq = null;
    CloseableIterator<GffLine> iter = null;
    SortingCollection<GffLine> sorting = null;
    PrintWriter pw = null;
    // feature types that designate the transcript record itself
    final Set<String> transcriptIdentifiersSet = new HashSet<>(Arrays.asList(semicolon.split(this.transcriptIdentifiersStr)));
    final GffLineComparator comparator = new GffLineComparator();
    try {
        final String input = oneFileOrNull(args);
        in = (input == null ? IOUtils.openStdinForLineIterator() : IOUtils.openURIForLineIterator(input));
        sorting = SortingCollection.newInstance(GffLine.class, new GffLineCodec(), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sorting.setDestructiveIteration(true);
        int nRead = 0;
        this.gtfCodec.readActualHeader(in);
        while (in.hasNext()) {
            ++nRead;
            final String line = in.next();
            if (line.isEmpty() || line.startsWith("#"))
                continue;
            final GTFLine delegate = this.gtfCodec.decode(line);
            // The codec may yield null for a line it cannot/should not decode (the sibling
            // Gtf2Xml tool guards against this too); skip instead of failing on a NPE.
            if (delegate == null) {
                if (verbose)
                    LOG.info("skipping " + line);
                continue;
            }
            // gene records carry no transcript; constant-first equals is null-safe
            if ("gene".equals(delegate.getType())) {
                if (verbose)
                    LOG.info("skipping " + line);
                continue;
            }
            final GffLine gffLine = new GffLine(delegate);
            if (!gffLine.hasTranscript()) {
                if (verbose)
                    LOG.info("skipping " + line);
                continue;
            }
            sorting.add(gffLine);
            if (nRead % 50000 == 0)
                LOG.info("Read " + nRead + " lines. Last: " + line);
        }
        sorting.doneAdding();
        LOG.info("sorting...." + nRead);
        pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        iter = sorting.iterator();
        // group consecutive sorted records sharing the same contig and transcript
        eq = new EqualRangeIterator<>(iter, (o1, o2) -> {
            final int i = o1.getContig().compareTo(o2.getContig());
            if (i != 0)
                return i;
            return o1.getTranscript().compareTo(o2.getTranscript());
        });
        while (eq.hasNext()) {
            final List<GffLine> L = eq.next();
            final GffLine first = L.get(0);
            final String firstContig = first.getContig();
            final String firstTranscriptName = first.getTranscript();
            if (verbose)
                LOG.info("processing " + firstTranscriptName);
            final char strand = first.delegate.getStrand();
            if (!(strand == '+' || strand == '-')) {
                LOG.error("Bad strand in " + first.delegate.getLine());
                return -1;
            }
            final List<Interval> exons = new ArrayList<>();
            Interval mainTranscriptInterval = null;
            final List<Interval> cds = new ArrayList<>();
            for (final GffLine item : L) {
                // sanity checks: the grouping above should make these impossible
                if (!firstContig.equals(item.getContig())) {
                    LOG.error("Conflict in contig!!");
                    return -1;
                }
                if (!firstTranscriptName.equals(item.getTranscript())) {
                    LOG.error("Conflict in name!! " + firstTranscriptName + ":" + item.getTranscript());
                    return -1;
                }
                final String type = item.delegate.getType();
                if ("gene".equals(type)) {
                    if (verbose)
                        LOG.info("ignore line " + item);
                } else if (transcriptIdentifiersSet.contains(type)) {
                    // the same transcript may be repeated, but only with the same interval
                    if (mainTranscriptInterval != null && !mainTranscriptInterval.equals(item.interval)) {
                        LOG.error("Transcript found twice for " + firstTranscriptName);
                        return -1;
                    }
                    mainTranscriptInterval = item.interval;
                } else if ("exon".equals(type)) {
                    exons.add(item.interval);
                } else if ("CDS".equals(type)) {
                    cds.add(item.interval);
                } else {
                    // UTR , stop_codon, etc...
                    if (verbose)
                        LOG.info("ignore line " + firstTranscriptName + ":" + type);
                }
            }
            // Integer.compare instead of subtraction: no overflow, same ordering
            exons.sort((o1, o2) -> Integer.compare(o1.getStart(), o2.getStart()));
            if (mainTranscriptInterval == null) {
                LOG.warn("main transcript not found for " + firstTranscriptName + " " + first + " available feature type where:" + L.stream().map(T -> T.delegate.getType()).collect(Collectors.toSet()));
                continue;
            }
            if (this.writeBin) {
                pw.print(reg2bin(mainTranscriptInterval.getStart() - 1, mainTranscriptInterval.getEnd()));
                pw.print("\t");
            }
            pw.print(firstTranscriptName);
            pw.print("\t");
            pw.print(firstContig);
            pw.print("\t");
            pw.print(strand);
            pw.print("\t");
            // starts are printed as (start - 1), ends are printed unchanged
            pw.print(mainTranscriptInterval.getStart() - 1);
            pw.print("\t");
            pw.print(mainTranscriptInterval.getEnd());
            pw.print("\t");
            if (cds.isEmpty()) {
                // no CDS: both CDS bounds collapse on the transcript start
                pw.print(mainTranscriptInterval.getStart() - 1);
                pw.print("\t");
                pw.print(mainTranscriptInterval.getStart() - 1);
                pw.print("\t");
            } else {
                // CDS bounds are the envelope of all CDS records
                int minCds = cds.get(0).getStart();
                int maxCds = cds.get(0).getEnd();
                for (int i = 1; i < cds.size(); ++i) {
                    minCds = Math.min(cds.get(i).getStart(), minCds);
                    maxCds = Math.max(cds.get(i).getEnd(), maxCds);
                }
                pw.print(minCds - 1);
                pw.print("\t");
                pw.print(maxCds);
                pw.print("\t");
            }
            pw.print(exons.size());
            pw.print("\t");
            for (int i = 0; i < exons.size(); ++i) {
                if (i > 0)
                    pw.print(",");
                pw.print(exons.get(i).getStart() - 1);
            }
            pw.print("\t");
            for (int i = 0; i < exons.size(); ++i) {
                if (i > 0)
                    pw.print(",");
                pw.print(exons.get(i).getEnd());
            }
            pw.print("\t");
            // dump the values of a fixed set of attributes taken from the first record of the group
            for (final Iterator<Map.Entry<String, String>> metainfoiter = first.delegate.iterator(); metainfoiter.hasNext(); ) {
                final Map.Entry<String, String> entry = metainfoiter.next();
                if (entry == null)
                    continue;
                final String s = entry.getKey();
                if (s.equals("gene_id") || s.equals("transcript_type") || s.equals("gene_name") || s.equals("gene_status") || s.equals("gene_type") || s.equals("transcript_id") || s.equals("havana_gene") || s.equals("havana_transcript") || s.equals("transcript_name") || s.equals("protein_id") || s.equals("ccdsid") || s.equals("Parent")) {
                    pw.print(entry.getValue());
                    pw.print(";");
                }
            }
            pw.print("\t");
            pw.print(firstTranscriptName);
            pw.println();
        }
        eq.close();
        eq = null;
        iter.close();
        iter = null;
        pw.flush();
        // PrintWriter never throws on write failure; it only raises an internal error flag
        if (pw.checkError()) {
            LOG.error("I/O error while writing output");
            return -1;
        }
        pw.close();
        pw = null;
        LOG.info("done");
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        // iterators first: they read from the sorting collection's temporary files
        CloserUtil.close(eq);
        CloserUtil.close(iter);
        CloserUtil.close(pw);
        CloserUtil.close(in);
        // SortingCollection has no close(); cleanup() is what deletes its temporary files
        if (sorting != null) {
            try {
                sorting.cleanup();
            } catch (final RuntimeException e) {
                // best effort: never let cleanup mask the method's return value
                LOG.warn("cannot clean up temporary files: " + e.getMessage());
            }
        }
    }
}
Also used : DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) LineIterator(htsjdk.tribble.readers.LineIterator) Parameter(com.beust.jcommander.Parameter) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) GTFLine(com.github.lindenb.jvarkit.util.bio.gtf.GTFLine) Interval(htsjdk.samtools.util.Interval) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SortingCollection(htsjdk.samtools.util.SortingCollection) Iterator(java.util.Iterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) IOException(java.io.IOException) EOFException(java.io.EOFException) Collectors(java.util.stream.Collectors) GTFCodec(com.github.lindenb.jvarkit.util.bio.gtf.GTFCodec) File(java.io.File) List(java.util.List) GenomicIndexUtil(htsjdk.samtools.GenomicIndexUtil) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) Pattern(java.util.regex.Pattern) Comparator(java.util.Comparator) ArrayList(java.util.ArrayList) LineIterator(htsjdk.tribble.readers.LineIterator) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet) IOException(java.io.IOException) EOFException(java.io.EOFException) GTFLine(com.github.lindenb.jvarkit.util.bio.gtf.GTFLine) Map(java.util.Map) Interval(htsjdk.samtools.util.Interval)

Example 2 with GTFLine

use of com.github.lindenb.jvarkit.util.bio.gtf.GTFLine in project jvarkit by lindenb.

the class Gtf2Xml method doWork.

/**
 * Converts a GTF stream to an XML document written to {@code outputFile} or standard output.
 *
 * <p>The document root is {@code <gtf>}; "#!key value" header lines become attributes of the
 * root, every decoded record is serialized through {@code write(w, gtfline)}, and optional
 * summary sections ({@code attributes}, {@code types}, {@code sources}, {@code dict}) are
 * appended unless disabled. The collections backing those sections are presumably filled by
 * {@code write(...)}; confirm against that method.
 *
 * @param args command-line arguments; at most one input path/URI, standard input otherwise
 * @return {@code 0} on success, {@code -1} if any exception is raised
 */
@Override
public int doWork(final List<String> args) {
    LineIterator r = null;
    XMLStreamWriter w = null;
    // byte stream rather than FileWriter: the XML writer must do the UTF-8 encoding itself,
    // otherwise the file is encoded with the platform charset while declaring UTF-8
    java.io.OutputStream fout = null;
    try {
        final String inputName = oneFileOrNull(args);
        r = (StringUtil.isBlank(inputName) ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(inputName));
        final XMLOutputFactory xof = XMLOutputFactory.newFactory();
        if (this.outputFile == null) {
            w = xof.createXMLStreamWriter(stdout(), "UTF-8");
        } else {
            fout = new java.io.BufferedOutputStream(new java.io.FileOutputStream(this.outputFile));
            w = xof.createXMLStreamWriter(fout, "UTF-8");
        }
        final GTFCodec codec = this.formatChooser.makeCodec();
        w.writeStartDocument("UTF-8", "1.0");
        w.writeStartElement("gtf");
        final GTFCodec.GTFHeader header = codec.readActualHeader(r);
        for (final String headerLine : header.getLines()) {
            if (!headerLine.startsWith("#!"))
                continue;
            final int ws = headerLine.indexOf(' ');
            // no space (-1) or a space right after "#!" (2): there is no usable key, and an
            // empty attribute name would produce malformed XML
            if (ws <= 2)
                continue;
            w.writeAttribute(headerLine.substring(2, ws), headerLine.substring(ws + 1).trim());
        }
        while (r.hasNext()) {
            final String line = r.next();
            final GTFLine gtfline = codec.decode(line);
            // the codec returns null for lines that do not hold a record
            if (gtfline == null)
                continue;
            write(w, gtfline);
        }
        if (!this.disable_att_keys) {
            w.writeStartElement("attributes");
            for (final String k : this.att_keys) {
                w.writeStartElement("attribute");
                w.writeCharacters(k);
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        if (!this.disable_feature_type) {
            w.writeStartElement("types");
            for (final String k : this.types) {
                w.writeStartElement("type");
                w.writeCharacters(k);
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        if (!this.disable_sources) {
            w.writeStartElement("sources");
            for (final String k : this.sources) {
                w.writeStartElement("source");
                w.writeCharacters(k);
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        if (!this.disable_dict) {
            w.writeStartElement("dict");
            for (final String k : this.seqdict.keySet()) {
                w.writeEmptyElement("chrom");
                w.writeAttribute("name", k);
                w.writeAttribute("length", String.valueOf(this.seqdict.get(k)));
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        // closes <gtf>
        w.writeEndElement();
        w.writeEndDocument();
        w.flush();
        return 0;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        // XMLStreamWriter.close() releases the writer but does not close the underlying
        // stream, so the file stream is closed separately afterwards
        if (w != null) {
            try {
                w.close();
            } catch (final XMLStreamException e) {
                LOG.error(e);
            }
        }
        CloserUtil.close(r);
        CloserUtil.close(fout);
    }
}
Also used : XMLOutputFactory(javax.xml.stream.XMLOutputFactory) XMLStreamWriter(javax.xml.stream.XMLStreamWriter) FileWriter(java.io.FileWriter) GTFCodec(com.github.lindenb.jvarkit.util.bio.gtf.GTFCodec) LineIterator(htsjdk.tribble.readers.LineIterator) GTFLine(com.github.lindenb.jvarkit.util.bio.gtf.GTFLine) IOException(java.io.IOException) XMLStreamException(javax.xml.stream.XMLStreamException)

Aggregations

GTFCodec (com.github.lindenb.jvarkit.util.bio.gtf.GTFCodec)2 GTFLine (com.github.lindenb.jvarkit.util.bio.gtf.GTFLine)2 LineIterator (htsjdk.tribble.readers.LineIterator)2 IOException (java.io.IOException)2 Parameter (com.beust.jcommander.Parameter)1 ParametersDelegate (com.beust.jcommander.ParametersDelegate)1 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 EqualRangeIterator (com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator)1 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)1 Program (com.github.lindenb.jvarkit.util.jcommander.Program)1 Logger (com.github.lindenb.jvarkit.util.log.Logger)1 AbstractDataCodec (com.github.lindenb.jvarkit.util.picard.AbstractDataCodec)1 GenomicIndexUtil (htsjdk.samtools.GenomicIndexUtil)1 CloseableIterator (htsjdk.samtools.util.CloseableIterator)1 CloserUtil (htsjdk.samtools.util.CloserUtil)1 Interval (htsjdk.samtools.util.Interval)1 SortingCollection (htsjdk.samtools.util.SortingCollection)1 StringUtil (htsjdk.samtools.util.StringUtil)1 DataInputStream (java.io.DataInputStream)1 DataOutputStream (java.io.DataOutputStream)1