Use of com.github.lindenb.jvarkit.util.bio.gtf.GTFLine in project jvarkit by lindenb.
Class Gff2KnownGene, method doWork.
/**
 * Reads GTF/GFF lines, groups them by (contig, transcript) and prints one
 * tab-delimited record per transcript.
 *
 * <p>Columns written, in order: optional bin (only when {@code writeBin} is set),
 * transcript name, contig, strand, transcript start minus one, transcript end,
 * CDS start minus one, CDS end, exon count, comma-separated exon starts minus one,
 * comma-separated exon ends, a selection of attribute values each followed by
 * {@code ';'}, and the transcript name again. When a transcript has no CDS, both CDS
 * columns are the transcript start minus one. The "minus one" presumably converts
 * 1-based starts to 0-based ones (the class name suggests a knownGene-like layout)
 * -- TODO confirm.
 *
 * @param args command-line arguments, handed to {@code oneFileOrNull}; when that
 *             returns {@code null} the input is read from stdin
 * @return {@code RETURN_OK} on success, {@code -1} on any error (bad strand,
 *         inconsistent group, conflicting transcript intervals, or an exception)
 */
@Override
public int doWork(final List<String> args) {
    this.gtfCodec = this.formatChooser.makeCodec();
    LineIterator in = null;
    EqualRangeIterator<GffLine> eq = null;
    CloseableIterator<GffLine> iter = null;
    SortingCollection<GffLine> sorting = null;
    PrintWriter pw = null;
    final Set<String> transcriptIdentifiersSet = new HashSet<>(Arrays.asList(semicolon.split(this.transcriptIdentifiersStr)));
    // Attribute keys whose values are copied into the output; built once instead of
    // re-evaluating a long equals() chain for every attribute of every transcript.
    final Set<String> exportedAttributeKeys = new HashSet<>(Arrays.asList(
            "gene_id", "transcript_type", "gene_name", "gene_status", "gene_type",
            "transcript_id", "havana_gene", "havana_transcript", "transcript_name",
            "protein_id", "ccdsid", "Parent"));
    final GffLineComparator comparator = new GffLineComparator();
    try {
        final String input = oneFileOrNull(args);
        in = (input == null ? IOUtils.openStdinForLineIterator() : IOUtils.openURIForLineIterator(input));
        sorting = SortingCollection.newInstance(GffLine.class, new GffLineCodec(), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sorting.setDestructiveIteration(true);
        int nRead = 0;
        this.gtfCodec.readActualHeader(in);

        // Pass 1: decode every usable line and push it into the sorting collection.
        while (in.hasNext()) {
            ++nRead;
            final String line = in.next();
            if (line.isEmpty() || line.startsWith("#")) {
                continue;
            }
            final GTFLine delegate = this.gtfCodec.decode(line);
            // Gtf2Xml treats a null result from the same codec as "nothing to do";
            // guard here too instead of failing with a NullPointerException.
            if (delegate == null) {
                continue;
            }
            if (delegate.getType().equals("gene")) {
                if (verbose) {
                    LOG.info("skipping " + line);
                }
                continue;
            }
            final GffLine gffLine = new GffLine(delegate);
            if (!gffLine.hasTranscript()) {
                if (verbose) {
                    LOG.info("skipping " + line);
                }
                continue;
            }
            sorting.add(gffLine);
            if (nRead % 50000 == 0) {
                LOG.info("Read " + nRead + " lines. Last: " + line);
            }
        }
        sorting.doneAdding();
        LOG.info("sorting...." + nRead);

        // Pass 2: walk the sorted records one (contig, transcript) group at a time.
        pw = super.openFileOrStdoutAsPrintWriter(this.outputFile);
        iter = sorting.iterator();
        eq = new EqualRangeIterator<>(iter, (o1, o2) -> {
            final int i = o1.getContig().compareTo(o2.getContig());
            if (i != 0) {
                return i;
            }
            return o1.getTranscript().compareTo(o2.getTranscript());
        });
        while (eq.hasNext()) {
            final List<GffLine> group = eq.next();
            final GffLine first = group.get(0);
            final String firstContig = first.getContig();
            final String firstTranscriptName = first.getTranscript();
            if (verbose) {
                LOG.info("processing " + firstTranscriptName);
            }
            final char strand = first.delegate.getStrand();
            if (!(strand == '+' || strand == '-')) {
                LOG.error("Bad strand in " + first.delegate.getLine());
                return -1;
            }
            final List<Interval> exons = new ArrayList<>();
            Interval mainTranscriptInterval = null;
            final List<Interval> cds = new ArrayList<>();

            // Classify each record of the group by its feature type.
            for (final GffLine item : group) {
                if (!firstContig.equals(item.getContig())) {
                    LOG.error("Conflict in contig!!");
                    return -1;
                }
                if (!firstTranscriptName.equals(item.getTranscript())) {
                    LOG.error("Conflict in name!! " + firstTranscriptName + ":" + item.getTranscript());
                    return -1;
                }
                final String type = item.delegate.getType();
                if (type.equals("gene")) {
                    if (verbose) {
                        LOG.info("ignore line " + item);
                    }
                } else if (transcriptIdentifiersSet.contains(type)) {
                    // Several transcript-level records are tolerated only if they
                    // all describe the same interval.
                    if (mainTranscriptInterval != null && !mainTranscriptInterval.equals(item.interval)) {
                        LOG.error("Transcript found twice for " + firstTranscriptName);
                        return -1;
                    }
                    mainTranscriptInterval = item.interval;
                } else if (type.equals("exon")) {
                    exons.add(item.interval);
                } else if (type.equals("CDS")) {
                    cds.add(item.interval);
                } else {
                    // UTR, stop_codon, etc...
                    if (verbose) {
                        LOG.info("ignore line " + firstTranscriptName + ":" + type);
                    }
                }
            }

            // Integer.compare avoids the overflow that subtraction-based comparators can hit.
            exons.sort((o1, o2) -> Integer.compare(o1.getStart(), o2.getStart()));

            if (mainTranscriptInterval == null) {
                LOG.warn("main transcript not found for " + firstTranscriptName + " " + first + " available feature type where:" + group.stream().map(t -> t.delegate.getType()).collect(Collectors.toSet()));
                continue;
            }

            if (this.writeBin) {
                pw.print(reg2bin(mainTranscriptInterval.getStart() - 1, mainTranscriptInterval.getEnd()));
                pw.print("\t");
            }
            pw.print(firstTranscriptName);
            pw.print("\t");
            pw.print(firstContig);
            pw.print("\t");
            pw.print(strand);
            pw.print("\t");
            pw.print(mainTranscriptInterval.getStart() - 1);
            pw.print("\t");
            pw.print(mainTranscriptInterval.getEnd());
            pw.print("\t");

            if (cds.isEmpty()) {
                // No CDS: both CDS columns carry the transcript start minus one.
                pw.print(mainTranscriptInterval.getStart() - 1);
                pw.print("\t");
                pw.print(mainTranscriptInterval.getStart() - 1);
                pw.print("\t");
            } else {
                // The CDS columns span from the smallest start to the largest end.
                int minCds = cds.get(0).getStart();
                int maxCds = cds.get(0).getEnd();
                for (int i = 1; i < cds.size(); ++i) {
                    minCds = Math.min(cds.get(i).getStart(), minCds);
                    maxCds = Math.max(cds.get(i).getEnd(), maxCds);
                }
                pw.print(minCds - 1);
                pw.print("\t");
                pw.print(maxCds);
                pw.print("\t");
            }

            pw.print(exons.size());
            pw.print("\t");
            for (int i = 0; i < exons.size(); ++i) {
                if (i > 0) {
                    pw.print(",");
                }
                pw.print(exons.get(i).getStart() - 1);
            }
            pw.print("\t");
            for (int i = 0; i < exons.size(); ++i) {
                if (i > 0) {
                    pw.print(",");
                }
                pw.print(exons.get(i).getEnd());
            }
            pw.print("\t");

            // Attributes are taken from the first record of the group only.
            for (final Iterator<Map.Entry<String, String>> metainfoiter = first.delegate.iterator(); metainfoiter.hasNext(); ) {
                final Map.Entry<String, String> entry = metainfoiter.next();
                if (entry == null) {
                    continue;
                }
                if (exportedAttributeKeys.contains(entry.getKey())) {
                    pw.print(entry.getValue());
                    pw.print(";");
                }
            }
            pw.print("\t");
            pw.print(firstTranscriptName);
            pw.println();
        }
        eq.close();
        iter.close();
        iter = null;
        sorting = null;
        pw.flush();
        pw.close();
        pw = null;
        LOG.info("done");
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        // Also reached on the early "return -1" paths above.
        CloserUtil.close(eq);
        CloserUtil.close(pw);
        CloserUtil.close(in);
        CloserUtil.close(iter);
        CloserUtil.close(sorting);
    }
}
Use of com.github.lindenb.jvarkit.util.bio.gtf.GTFLine in project jvarkit by lindenb.
Class Gtf2Xml, method doWork.
/**
 * Converts GTF input into an XML document with a {@code <gtf>} root element.
 *
 * <p>Header lines starting with {@code "#!"} become attributes of the root element
 * (name = text between {@code "#!"} and the first space, value = the trimmed
 * remainder). Every decoded line is handed to {@code write(w, gtfline)}. After the
 * records, up to four summary sections ({@code attributes}, {@code types},
 * {@code sources}, {@code dict}) are written unless disabled by their respective
 * {@code disable_*} flag; their content comes from the {@code att_keys},
 * {@code types}, {@code sources} and {@code seqdict} fields (presumably filled by
 * {@code write} -- TODO confirm).
 *
 * @param args command-line arguments, handed to {@code oneFileOrNull}; a blank
 *             result means the input is read from stdin
 * @return {@code 0} on success, {@code -1} if any exception was raised
 */
@Override
public int doWork(final List<String> args) {
    LineIterator r = null;
    XMLStreamWriter w = null;
    java.io.OutputStream fileOut = null;
    try {
        final String inputName = oneFileOrNull(args);
        r = (StringUtil.isBlank(inputName) ? IOUtils.openStreamForLineIterator(stdin()) : IOUtils.openURIForLineIterator(inputName));
        final XMLOutputFactory xof = XMLOutputFactory.newFactory();
        if (this.outputFile == null) {
            w = xof.createXMLStreamWriter(stdout(), "UTF-8");
        } else {
            // Write bytes with an explicit UTF-8 encoding so the file content agrees
            // with the encoding announced by writeStartDocument below; a FileWriter
            // would silently use the platform default charset.
            fileOut = new java.io.BufferedOutputStream(new java.io.FileOutputStream(this.outputFile));
            w = xof.createXMLStreamWriter(fileOut, "UTF-8");
        }
        final GTFCodec codec = this.formatChooser.makeCodec();
        w.writeStartDocument("UTF-8", "1.0");
        w.writeStartElement("gtf");

        // "#!key value" header lines become attributes of the root element.
        final GTFCodec.GTFHeader header = codec.readActualHeader(r);
        for (final String headerLine : header.getLines()) {
            if (!headerLine.startsWith("#!")) {
                continue;
            }
            final int ws = headerLine.indexOf(' ');
            // ws == -1: no value; ws == 2: empty attribute name, which is not valid XML.
            if (ws <= 2) {
                continue;
            }
            w.writeAttribute(headerLine.substring(2, ws), headerLine.substring(ws + 1).trim());
        }

        while (r.hasNext()) {
            final String line = r.next();
            final GTFLine gtfline = codec.decode(line);
            if (gtfline == null) {
                continue;
            }
            write(w, gtfline);
        }

        if (!this.disable_att_keys) {
            w.writeStartElement("attributes");
            for (final String k : this.att_keys) {
                w.writeStartElement("attribute");
                w.writeCharacters(k);
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        if (!this.disable_feature_type) {
            w.writeStartElement("types");
            for (final String k : this.types) {
                w.writeStartElement("type");
                w.writeCharacters(k);
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        if (!this.disable_sources) {
            w.writeStartElement("sources");
            for (final String k : this.sources) {
                w.writeStartElement("source");
                w.writeCharacters(k);
                w.writeEndElement();
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }
        if (!this.disable_dict) {
            w.writeStartElement("dict");
            for (final String k : this.seqdict.keySet()) {
                w.writeEmptyElement("chrom");
                w.writeAttribute("name", k);
                w.writeAttribute("length", String.valueOf(this.seqdict.get(k)));
                w.writeCharacters("\n");
            }
            w.writeEndElement();
            w.writeCharacters("\n");
        }

        w.writeEndElement();
        w.writeEndDocument();
        w.flush();
        // Closing the XML writer releases its resources; per the StAX contract it
        // does not close the underlying stream, so stdout stays usable.
        w.close();
        w = null;
        if (fileOut != null) {
            // Surface write errors here; the close in 'finally' is best-effort only.
            fileOut.flush();
        }
        return 0;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        if (w != null) {
            try {
                w.close();
            } catch (final Exception ignored) {
                // Already on a failure path; the original error has been logged.
            }
        }
        CloserUtil.close(r);
        CloserUtil.close(fileOut);
    }
}
Aggregations