use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VcfToSvg method doWork.
@Override
public int doWork(final List<String> args) {
VCFReader r = null;
OutputStream outputStream = null;
XMLStreamWriter w = null;
PrintWriter manifestW = null;
ArchiveFactory archiveFactory = null;
try {
r = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true);
final VCFHeader header = r.getHeader();
final List<String> samples = new ArrayList<>(header.getSampleNamesInOrder());
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
intervalListProvider.dictionary(dict);
/* read gtf if any */
final IntervalTreeMap<Gene> geneMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().filter(G -> !this.removeNonCoding || G.getTranscripts().stream().anyMatch(T -> T.isCoding())).forEach(G -> geneMap.put(new Interval(G), G));
}
}
archiveFactory = ArchiveFactory.open(this.outputPath);
if (manifestFile != null) {
manifestW = IOUtils.openPathForPrintWriter(this.manifestFile);
} else {
manifestW = new PrintWriter(new NullOuputStream());
}
final Pedigree pedigree;
if (this.pedPath == null) {
pedigree = PedigreeParser.empty();
} else {
pedigree = new PedigreeParser().parse(this.pedPath);
}
final Path tmpSvg = Files.createTempFile("vcf.", ".svg");
final XMLOutputFactory xof = XMLOutputFactory.newInstance();
for (final Locatable interval : intervalListProvider.dictionary(dict).stream().collect(Collectors.toList())) {
final List<VariantContext> variants = r.query(interval).stream().filter(V -> this.variantFILTEREDOpacity > 0 || !V.isFiltered()).filter(V -> this.variantIndelOpacity > 0 || !V.isIndel()).collect(Collectors.toCollection(ArrayList::new));
if (variants.isEmpty())
continue;
final List<Transcript> transcripts = geneMap.getOverlapping(interval).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> !this.removeNonCoding || T.isCoding()).collect(Collectors.toList());
variants.removeIf(V -> this.gtfPath != null && this.variantsInExonOnly && transcripts.stream().flatMap(T -> T.getExons().stream()).noneMatch(EX -> EX.overlaps(V)));
if (variants.isEmpty())
continue;
final String geneId = transcripts.stream().map(T -> T.getGene().getId()).collect(Collectors.toSet()).stream().collect(HtsCollectors.oneAndOnlyOne()).orElse(null);
final String geneName = transcripts.stream().map(T -> T.getGene().getGeneName()).collect(Collectors.toSet()).stream().collect(HtsCollectors.oneAndOnlyOne()).orElse(null);
outputStream = IOUtils.openPathForWriting(tmpSvg);
w = xof.createXMLStreamWriter(outputStream);
double featureHeight = 10;
double TRANSCRIPT_HEIGHT = featureHeight;
final int all_genotypes_width = variants.size() * this.genotype_width;
final int drawinAreaWidth = Math.max(all_genotypes_width, 1000);
final int interline_weight = 6;
final int margin_top = 10;
final int margin_bottom = 10;
final int margin_right = 100;
final int margin_left = 100;
w.writeStartDocument("UTF-8", "1.0");
w.writeStartElement("svg");
w.writeDefaultNamespace(SVG.NS);
w.writeNamespace("xlink", XLINK.NS);
w.writeAttribute("version", "1.1");
w.writeAttribute("width", String.valueOf(margin_right + margin_right + drawinAreaWidth));
w.writeAttribute("height", String.valueOf(margin_top + margin_bottom + transcripts.size() * TRANSCRIPT_HEIGHT + interline_weight * featureHeight + samples.size() * this.genotype_width));
title(w, interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
w.writeStartElement("desc");
w.writeCharacters("generated with " + getProgramName() + "\n" + "Author: Pierre Lindenbaum PhD. @yokofakun .");
w.writeEndElement();
// defs
w.writeStartElement("defs");
// genotypes
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HOM_REF);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:lime;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.NO_CALL);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:silver;stroke;gray;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HOM_VAR);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:crimson;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.MIXED);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:pink;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.UNAVAILABLE);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:gray;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HET);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:lime;stroke;black;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(genotype_width));
w.writeAttribute("height", String.valueOf(genotype_width));
w.writeEmptyElement("polygon");
w.writeAttribute("style", "fill:crimson;stroke;black;");
w.writeAttribute("points", "0,0 " + genotype_width + ",0 0," + genotype_width + " 0,0");
w.writeEndElement();
// strand
w.writeEmptyElement("polyline");
w.writeAttribute("id", "strandF");
w.writeAttribute("points", "-5,-5 0,0 -5,5");
w.writeEmptyElement("polyline");
w.writeAttribute("id", "strandR");
w.writeAttribute("points", "5,-5 0,0 5,5");
// gradients
w.writeStartElement("linearGradient");
w.writeAttribute("id", "grad01");
w.writeAttribute("x1", "50%");
w.writeAttribute("x2", "50%");
w.writeAttribute("y1", "0%");
w.writeAttribute("y2", "100%");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "0%");
w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "50%");
w.writeAttribute("style", "stop-color:white;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "100%");
w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
w.writeEndElement();
// defs
w.writeEndElement();
w.writeStartElement("style");
w.writeCharacters("svg {fill:none; stroke:black;}\n" + "text {fill:black;stroke:none;font-size:" + (featureHeight / 1.5) + "px;}\n" + ".ruler-label { stroke:red;}\n" + ".frame { stroke:black;fill:none;}\n" + ".kgexon {fill:url(#grad01);stroke:black;}\n" + ".gcpercent {fill:url(#grad02);stroke:black;}" + ".coverage {fill:url(#grad03);stroke:black;}" + ".kgcds {fill:yellow;stroke:black;opacity:0.7;}\n" + ".variant{stroke:none;fill:red;opacity:0.2;}\n" + ".xaxis{stroke:gray;fill:none;opacity:0.2;}\n" + ".postick{font-size:9px;stroke:black;stroke-width:1;}");
// style
w.writeEndElement();
final IntFunction<Integer> trim = t -> Math.max(interval.getStart(), Math.min(interval.getEnd(), t));
final IntFunction<Double> baseToPixel = t -> margin_left + drawinAreaWidth * (t - (double) interval.getStart()) / ((double) interval.getLengthOnReference());
final IntFunction<Double> variantIndexToPixel = idx -> {
final double variant_width = drawinAreaWidth / (double) variants.size();
final double midx = variant_width * idx + variant_width / 2.0;
return margin_left + midx - genotype_width / 2.0;
};
final Function<VariantContext, String> variantTitle = V -> (V.getContig().startsWith("chr") ? V.getContig().substring(3) : V.getContig()) + ":" + V.getStart() + " " + V.getReference().getDisplayString();
/**
* title
*/
double y = 0;
w.writeStartElement("text");
w.writeAttribute("x", "0");
w.writeAttribute("y", String.valueOf(featureHeight));
w.writeCharacters(interval.toString());
w.writeEndElement();
y += featureHeight;
for (final Transcript g : transcripts) {
int cdsHeigh = 5;
double exonHeight = TRANSCRIPT_HEIGHT - 5;
double midY = TRANSCRIPT_HEIGHT / 2;
w.writeStartElement("g");
w.writeAttribute("transform", "translate(0," + y + ")");
title(w, g.getId());
w.writeStartElement("text");
w.writeAttribute("x", String.valueOf(margin_left - 10));
w.writeAttribute("y", String.valueOf(featureHeight));
w.writeAttribute("style", "text-anchor:end;");
w.writeCharacters(g.getId());
w.writeEndElement();
/* transcript line */
w.writeEmptyElement("line");
w.writeAttribute("class", "kgtr");
w.writeAttribute("x1", String.valueOf(baseToPixel.apply(trim.apply(g.getTxStart()))));
w.writeAttribute("y1", String.valueOf(midY));
w.writeAttribute("x2", String.valueOf(baseToPixel.apply(trim.apply(g.getTxEnd()))));
w.writeAttribute("y2", String.valueOf(midY));
/* strand symbols */
for (double pixX = 0; pixX < drawinAreaWidth; pixX += 30) {
double pos0 = interval.getStart() + (pixX / (double) drawinAreaWidth) * interval.getLengthOnReference();
if (pos0 + 1 < g.getTxStart())
continue;
if (pos0 > g.getTxEnd())
break;
w.writeEmptyElement("use");
w.writeAttribute("class", "kgstrand");
w.writeAttribute("xlink", XLINK.NS, "href", "#strand" + (g.isPositiveStrand() ? "F" : "R"));
w.writeAttribute("x", String.valueOf(margin_left + pixX));
w.writeAttribute("y", String.valueOf(midY));
}
/* exons */
for (final Exon exon : g.getExons()) {
if (exon.getStart() + 1 >= interval.getEnd())
continue;
if (exon.getEnd() <= interval.getStart())
continue;
w.writeStartElement("rect");
w.writeAttribute("class", "kgexon");
w.writeAttribute("x", String.valueOf(baseToPixel.apply(trim.apply(exon.getStart()))));
w.writeAttribute("y", String.valueOf(midY - exonHeight / 2));
w.writeAttribute("width", String.valueOf(baseToPixel.apply(trim.apply(exon.getEnd())) - baseToPixel.apply((trim.apply(exon.getStart())))));
w.writeAttribute("height", String.valueOf(exonHeight));
title(w, exon.getName());
w.writeEndElement();
}
/* coding line */
if (!g.isNonCoding() && g.hasCodonStartDefined() && g.hasCodonStopDefined()) {
final double codonx1 = baseToPixel.apply(trim.apply(g.getLeftmostCodon().get().getStart()));
final double codonx2 = baseToPixel.apply(trim.apply(g.getRightmostCodon().get().getEnd()));
w.writeEmptyElement("rect");
w.writeAttribute("class", "kgcds");
w.writeAttribute("x", String.valueOf(codonx1));
w.writeAttribute("y", String.valueOf(midY - cdsHeigh / 4.0));
w.writeAttribute("width", String.valueOf(baseToPixel.apply((int) (codonx2 - codonx1))));
w.writeAttribute("height", String.valueOf(cdsHeigh / 2.0));
}
// String label=String.format("%15s", g.getName());
// w.writeEmptyElement("path");
// double fontHeight=Math.min(10,0.8*TRANSCRIPT_HEIGHT);
// w.writeAttribute("d",this.hershey.svgPath(label,-insets.left,midY-fontHeight/2,insets.left*0.9,fontHeight));
w.writeEndElement();
w.writeCharacters("\n");
y += featureHeight;
}
/* draw lines to variants */
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
double x1 = baseToPixel.apply(vc.getStart());
double x2 = baseToPixel.apply(vc.getEnd());
final double y2 = y + featureHeight * interline_weight;
w.writeStartElement("polygon");
w.writeAttribute("style", "fill:" + (vidx % 2 == 0 ? "ghostwhite" : "lavender") + ";stroke:black;opacity:0.6;stroke-width:0.5;");
w.writeAttribute("points", "" + x1 + "," + (y - featureHeight / 2.0) + " " + x2 + "," + (y - featureHeight / 2.0) + " " + variantIndexToPixel.apply(vidx) + "," + y2 + " " + (variantIndexToPixel.apply(vidx) + this.genotype_width) + "," + y2);
title(w, variantTitle.apply(vc));
w.writeEndElement();
}
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
final double y2 = y + featureHeight * interline_weight;
w.writeStartElement("text");
w.writeAttribute("transform", "translate(" + (String.valueOf(variantIndexToPixel.apply(vidx) + genotype_width / 2.0)) + "," + String.valueOf(y2 - 5) + ") " + "rotate(-45)");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("class", "postick");
w.writeCharacters(variantTitle.apply(vc));
w.writeEndElement();
w.writeCharacters("\n");
}
y += featureHeight * interline_weight;
w.writeStartElement("g");
/* step 0: affected, 1: unaffected, 2: others */
for (int step = 0; step < 3; ++step) {
for (final String sample : samples) {
final Sample individual = pedigree.getSampleById(sample);
if (step == 0 && (individual == null || !individual.isAffected()))
continue;
if (step == 1 && (individual == null || !individual.isUnaffected()))
continue;
if (step == 2 && individual != null && individual.isStatusSet())
continue;
// sample
w.writeStartElement("g");
switch(step) {
case 0:
w.writeAttribute("style", "hue-rotate(195deg);");
break;
case 1:
w.writeAttribute("style", "hue-rotate(45deg);");
break;
default:
break;
}
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
final Genotype g = vc.getGenotype(sample);
double opacity = 1.0;
if (vc.isIndel())
opacity *= this.variantIndelOpacity;
if (vc.isFiltered())
opacity *= this.variantFILTEREDOpacity;
if (opacity > 1)
opacity = 1;
if (opacity <= 0)
continue;
if (opacity < 1) {
w.writeStartElement("g");
w.writeAttribute("style", "opacity:" + opacity + ";");
}
w.writeEmptyElement("use");
w.writeAttribute("x", String.valueOf(variantIndexToPixel.apply(vidx)));
w.writeAttribute("y", String.valueOf(y));
w.writeAttribute("xlink", XLINK.NS, "href", "#g_" + g.getType());
if (opacity < 1) {
w.writeEndElement();
}
}
w.writeCharacters("\n");
w.writeStartElement("text");
w.writeAttribute("x", String.valueOf(margin_left - 10));
w.writeAttribute("y", String.valueOf(y + this.genotype_width / 2.0));
w.writeAttribute("style", "text-anchor:end;");
w.writeCharacters(sample);
// text
w.writeEndElement();
// g for sample
w.writeEndElement();
y += this.genotype_width;
}
}
w.writeCharacters("\n");
w.writeEndDocument();
w.writeCharacters("\n");
w.flush();
w.close();
final String md5 = StringUtils.md5(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + interval.getContig().replaceAll("[/\\:]", "_") + "_" + interval.getStart() + "_" + interval.getEnd() + (StringUtils.isBlank(geneName) ? "" : "." + geneName.replaceAll("[/\\:]", "")) + (StringUtils.isBlank(geneId) ? "" : "." + geneId.replaceAll("[/\\:]", "")) + ".svg";
OutputStream os = archiveFactory.openOuputStream(filename);
IOUtils.copyTo(tmpSvg, os);
os.flush();
os.close();
Files.delete(tmpSvg);
manifestW.print(interval.getContig());
manifestW.print('\t');
manifestW.print(interval.getStart() - 1);
manifestW.print('\t');
manifestW.print(interval.getEnd());
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getGene().getId()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getGene().getGeneName()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getId()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputPath.toString() + File.separator) + filename);
manifestW.print('\t');
manifestW.println(variants.size());
}
r.close();
manifestW.flush();
manifestW.close();
manifestW = null;
archiveFactory.close();
archiveFactory = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(archiveFactory);
CloserUtil.close(r);
CloserUtil.close(outputStream);
CloserUtil.close(manifestW);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class Biostar81455 method doWork.
@Override
public int doWork(final List<String> args) {
BufferedReader r = null;
String line;
PrintStream out = null;
final CharSplitter tab = CharSplitter.TAB;
try {
try (final GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.getAllGenes().stream().forEach(G -> this.geneMap.put(new Interval(G), G));
}
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
}
final ContigNameConverter contigNameConverter = ContigNameConverter.fromIntervalTreeMap(this.geneMap);
try {
r = super.openBufferedReader(oneFileOrNull(args));
out = openPathOrStdoutAsPrintStream(this.outputFile);
while ((line = r.readLine()) != null) {
if (line.startsWith("#")) {
out.println(line);
continue;
}
boolean found = false;
final String[] tokens = tab.split(line);
final int pos1 = Integer.parseInt(tokens[1]) + (this.one_based ? 0 : 1);
final String convertCtg = contigNameConverter.apply(tokens[0]);
if (convertCtg == null) {
LOG.error("CANNOT FIND contig " + tokens[0]);
out.println("##UNKNOWN CONTIG " + line);
continue;
}
final SimplePosition position = new SimplePosition(convertCtg, pos1);
final List<Transcript> transcripts = this.geneMap.getOverlapping(position).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(position)).collect(Collectors.toList());
if (transcripts.isEmpty()) {
LOG.info("no gene found in chromosome " + tokens[0] + " (check chrom prefix?)");
} else {
for (final Transcript kg : transcripts) {
Exon bestExon = null;
for (final Exon exon : kg.getExons()) {
if (bestExon == null || Math.abs(distance(position.getPosition(), exon)) < Math.abs(distance(position.getPosition(), bestExon))) {
bestExon = exon;
}
}
if (bestExon != null) {
out.print(line);
out.print("\t");
out.print(bestExon.getTranscript().getId());
out.print("\t");
out.print(kg.getStart() - 1);
out.print("\t");
out.print(kg.getEnd());
out.print("\t");
out.print(kg.getStrand());
out.print("\t");
out.print(bestExon.getName());
out.print("\t");
out.print(bestExon.getStart() - 1);
out.print("\t");
out.print(bestExon.getEnd());
out.print("\t");
out.print(distance(position.getPosition(), bestExon));
out.println();
found = true;
}
}
}
if (!found) {
out.println(line + "\tNULL");
}
}
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VcfGtfSplitter method testTranscript.
private boolean testTranscript(final Transcript transcript, final VariantContext ctx) {
if (!transcript.overlaps(ctx)) {
if (this.use_upstream) {
final SimplePosition pos = new SimplePosition(transcript.getContig(), transcript.isPositiveStrand() ? transcript.getStart() : transcript.getEnd());
if (ctx.withinDistanceOf(pos, this.xxxxstream_length))
return true;
}
if (this.use_downstream) {
final SimplePosition pos = new SimplePosition(transcript.getContig(), transcript.isPositiveStrand() ? transcript.getEnd() : transcript.getStart());
if (ctx.withinDistanceOf(pos, this.xxxxstream_length))
return true;
}
return false;
}
if (this.use_exon && transcript.hasExon() && transcript.getExons().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_intron && transcript.hasIntron() && transcript.getIntrons().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_cds && transcript.hasCDS() && transcript.getAllCds().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_utr5) {
if (transcript.isPositiveStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().overlaps(ctx))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().overlaps(ctx))
return true;
}
if (this.use_utr3) {
if (transcript.isPositiveStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().overlaps(ctx))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().overlaps(ctx))
return true;
}
if (this.use_cds_utr5) {
if (transcript.isPositiveStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
}
if (this.use_cds_utr3) {
if (transcript.isPositiveStrand() && transcript.getUTR3().isPresent() && transcript.getUTR3().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (transcript.isNegativeStrand() && transcript.getUTR5().isPresent() && transcript.getUTR5().get().getIntervals().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
}
if (this.use_stop && transcript.hasCodonStopDefined() && transcript.getCodonStop().get().getBlocks().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_start && transcript.hasCodonStartDefined() && transcript.getCodonStart().get().getBlocks().stream().anyMatch(FEAT -> FEAT.overlaps(ctx)))
return true;
if (this.use_splice && transcript.hasIntron()) {
for (final Intron intron : transcript.getIntrons()) {
final SimpleInterval splice1 = new SimpleInterval(intron.getContig(), intron.getStart() - 1, intron.getStart());
if (ctx.withinDistanceOf(splice1, this.split_length))
return true;
final SimpleInterval splice2 = new SimpleInterval(intron.getContig(), intron.getEnd(), intron.getEnd() + 1);
if (ctx.withinDistanceOf(splice2, this.split_length))
return true;
}
}
return false;
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class GtfUpstreamOrf method doWork.
@Override
public int doWork(final List<String> args) {
GtfReader gtfReader = null;
PrintWriter pw = null;
try {
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
final ContigDictComparator ctgDictComparator = new ContigDictComparator(refDict);
final String input = oneFileOrNull(args);
gtfReader = input == null ? new GtfReader(stdin()) : new GtfReader(input);
gtfReader.setContigNameConverter(this.refCtgNameConverter);
final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.hasStrand()).sorted((A, B) -> {
int i = ctgDictComparator.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
}).collect(Collectors.toList());
gtfReader.close();
gtfReader = null;
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
for (final KozakSequence.Strength f : KozakSequence.Strength.values()) {
pw.println("#kozak." + f.name() + "=" + kozakStrengthToScore(f));
}
final String gtfSource = getProgramName().toLowerCase();
for (final SAMSequenceRecord ssr : refDict.getSequences()) {
pw.println("##contig " + ssr.getSequenceName() + ": length:" + ssr.getSequenceLength());
}
pw.println("#" + gtfSource + ":" + JVarkitVersion.getInstance().toString());
if (!StringUtils.isBlank(input)) {
pw.println("#gtf:" + input);
}
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().dictionary(refDict).logger(LOG).build();
for (final Gene gene : genes) {
progress.apply(gene);
/* new reference sequence */
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(gene.getContig())) {
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
}
final List<RNASequence> rnas = gene.getTranscripts().stream().filter(T -> T.isCoding()).filter(T -> T.hasStrand()).filter(T -> T.hasCodonStartDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).map(T -> new RNASequence(T)).collect(Collectors.toList());
if (rnas.isEmpty())
continue;
final Set<OpenReadingFrame> orfs = rnas.stream().flatMap(R -> R.getUpstreamOpenReadingFrames().stream()).collect(Collectors.toSet());
if (orfs.isEmpty())
continue;
boolean gene_printed = false;
for (final OpenReadingFrame uORF : orfs) {
/* is there any other RNA containing this uORF ?*/
if (rnas.stream().filter(other -> !other.getTranscript().getId().equals(uORF.mRNA.getTranscript().getId())).anyMatch(other -> {
// other must have atg in frame and ATG before the observed one
final int other_atg0 = other.getATG0InRNA();
if (uORF.in_rna_atg0 % 3 != other_atg0 % 3)
return false;
return uORF.in_rna_atg0 >= other_atg0;
}))
continue;
final String transcript_id = uORF.getTranscript().getId() + ".uorf" + (1 + uORF.in_rna_atg0);
if (!gene_printed) {
gene_printed = true;
pw.print(gene.getContig());
pw.print("\t");
// source
pw.print(gtfSource);
pw.print("\t");
pw.print("gene");
pw.print("\t");
// start
pw.print(gene.getStart());
pw.print("\t");
// end
pw.print(gene.getEnd());
pw.print("\t");
// score
pw.print(".");
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(".");
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.println();
}
// TRANSCRIPT
final UTR utr_5_prime = uORF.getTranscript().getTranscriptUTR5().get();
pw.print(gene.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("transcript");
pw.print("\t");
if (gene.isPositiveStrand()) {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
pw.print("\t");
// pw.print(transcriptSequence.mrnaIndex0ToBase[uORF.in_rna_stop0]+1);
if (uORF.in_rna_stop0 == NPOS) {
pw.print(uORF.mRNA.getTranscript().getTxEnd());
} else {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
}
} else {
if (uORF.in_rna_stop0 == NPOS) {
pw.print(uORF.mRNA.getTranscript().getTxStart());
} else {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
}
pw.print("\t");
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
}
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print("0");
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("transcript_biotype", "uORF"));
pw.print(keyvalue("kozak-seq", uORF.kozak.getString()));
pw.print(keyvalue("kozak-strength", uORF.kozak.getStrength()));
pw.print(keyvalue("translation", uORF.peptide));
pw.print(keyvalue("uORF-atg-in-frame-with-transcript-atg", uORF.uorf_atg_in_frame));
pw.print(keyvalue("utr", utr_5_prime.toString() + " " + utr_5_prime.getStart() + "-" + utr_5_prime.getEnd()));
pw.println();
// Exon
for (final Exon exon : uORF.getTranscript().getExons()) {
pw.print(exon.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("exon");
pw.print("\t");
pw.print(exon.getStart());
pw.print("\t");
pw.print(exon.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(exon.getStrand());
pw.print("\t");
// phase
pw.print(0);
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.println();
}
final List<Interval> startBlocks = uORF.mRNA.getCodonBlocks(uORF.in_rna_atg0, uORF.in_rna_atg0 + 1, uORF.in_rna_atg0 + 2);
final List<Interval> stopBlocks = uORF.in_rna_stop0 != NPOS ? uORF.mRNA.getCodonBlocks(uORF.in_rna_stop0, uORF.in_rna_stop0 + 1, uORF.in_rna_stop0 + 2) : Collections.emptyList();
// CDS
if (!stopBlocks.isEmpty()) {
final int cdsStart = startBlocks.stream().mapToInt(B -> B.getStart()).min().orElseThrow(IllegalStateException::new);
final int cdsEnd = stopBlocks.stream().mapToInt(B -> B.getEnd()).max().orElseThrow(IllegalStateException::new);
for (final Exon exon : uORF.getTranscript().getExons()) {
if (exon.getEnd() < cdsStart)
continue;
if (exon.getStart() > cdsEnd)
break;
pw.print(exon.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("CDS");
pw.print("\t");
pw.print(Math.max(cdsStart, exon.getStart()));
pw.print("\t");
pw.print(Math.min(cdsEnd, exon.getEnd()));
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(exon.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(Math.max(cdsStart, exon.getStart())));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.println();
}
}
// CODON START
for (final Interval startc : startBlocks) {
PARANOID.assertLe(startc.getStart(), startc.getEnd());
pw.print(startc.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("start_codon");
pw.print("\t");
pw.print(startc.getStart());
pw.print("\t");
pw.print(startc.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(startc.getStart()));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("distance-mrna-atg", uORF.mRNA.getATG0InRNA() - uORF.in_rna_atg0));
pw.print(keyvalue("pos0-in-mrna", uORF.in_rna_atg0));
pw.print(keyvalue("spliced", startBlocks.size() > 1));
pw.println();
}
// CODON END
for (final Interval stopc : stopBlocks) /* might be empty */
{
PARANOID.assertLe(stopc.getStart(), stopc.getEnd());
pw.print(stopc.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("stop_codon");
pw.print("\t");
pw.print(stopc.getStart());
pw.print("\t");
pw.print(stopc.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(stopc.getStart()));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("spliced", stopBlocks.size() > 1));
pw.println();
}
}
}
progress.close();
pw.flush();
pw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VcfScanUpstreamOrf method beforeVcf.
@Override
protected int beforeVcf() {
try {
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(this.refCtgNameConverter);
// tmp IntervalTreeMap for gene, will be used to remove uORF overlapping alternate transcript with CDS */
final IntervalTreeMap<List<Transcript>> tmpTreeMap = new IntervalTreeMap<>();
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasCodonStartDefined() && T.hasCodonStopDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).filter(T -> (!this.plus_strand_only || T.isPositiveStrand())).forEach(T -> {
final Interval interval = new Interval(T);
List<Transcript> L = tmpTreeMap.get(interval);
if (L == null) {
L = new ArrayList<>();
tmpTreeMap.put(interval, L);
}
L.add(T);
});
for (final Transcript transcript : tmpTreeMap.values().stream().flatMap(L -> L.stream()).collect(Collectors.toList())) {
final Interval interval = transcript.getTranscriptUTR5().get().toInterval();
if (this.exclude_cds_overlaping_alternative) {
if (tmpTreeMap.getOverlapping(interval).stream().flatMap(L -> L.stream()).filter(// same object in memory
G -> G != transcript).anyMatch(K -> overlapCDS(transcript, K))) {
continue;
}
}
List<Transcript> L = this.transcriptMap.get(interval);
if (L == null) {
L = new ArrayList<>();
this.transcriptMap.put(interval, L);
}
if (this.canonical_utr) {
if (L.stream().map(K -> K.getTranscriptUTR5().get().toInterval()).anyMatch(R -> R.isNegativeStrand() == interval.isNegativeStrand() && R.equals(interval)))
continue;
}
L.add(transcript);
}
LOG.info("number of transcripts :" + this.transcriptMap.values().stream().flatMap(L -> L.stream()).count());
}
if (this.transcriptMap.isEmpty()) {
LOG.error("no transcripts found in " + this.gtfPath);
return -1;
}
if (this.archivePath != null) {
try (final ArchiveFactory archive = ArchiveFactory.open(this.archivePath)) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
final ContigDictComparator ctgCmp = new ContigDictComparator(dict);
final Comparator<Locatable> locCmp1 = (A, B) -> {
int i = ctgCmp.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
};
final PrintWriter pw1 = archive.openWriter("uorf-dna.fa");
final PrintWriter pw2 = archive.openWriter("uorf-pep.fa");
final PrintWriter pw3 = archive.openWriter("uorf.bed");
pw3.println("track name=\"uORF\" description=\"uORF for " + this.gtfPath + "\"");
pw3.println("#chrom\tchromStart\tchromEnd\tname\tscore\tstrand\tthickStart\tthickEnd\titemRgb\tblockCount\tblockSizes\tblockStarts");
final ProgressFactory.Watcher<Locatable> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
for (final Transcript kg : this.transcriptMap.values().stream().flatMap(L -> L.stream()).sorted(locCmp1).collect(Collectors.toList())) {
/* new reference sequence */
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(kg.getContig())) {
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, kg.getContig());
}
final TranscriptRNA mRNA = new TranscriptRNA(kg);
final UpstreamORF uorf = new UpstreamORF(mRNA);
progress.apply(new Interval(uorf.getContig(), uorf.getChromStart() + 1, uorf.getChromEnd()));
final OpenReadingFrame orf = uorf.getBestOpenReadingFrame();
if (orf == null)
continue;
orf.printFastaDNA(pw1);
orf.printFastaPep(pw2);
orf.printBed(pw3);
}
;
pw1.flush();
pw1.close();
pw2.flush();
pw2.close();
pw3.flush();
pw3.close();
progress.close();
}
}
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
Aggregations