use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VcfToSvg method doWork.
@Override
public int doWork(final List<String> args) {
VCFReader r = null;
OutputStream outputStream = null;
XMLStreamWriter w = null;
PrintWriter manifestW = null;
ArchiveFactory archiveFactory = null;
try {
r = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true);
final VCFHeader header = r.getHeader();
final List<String> samples = new ArrayList<>(header.getSampleNamesInOrder());
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
intervalListProvider.dictionary(dict);
/* read gtf if any */
final IntervalTreeMap<Gene> geneMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().filter(G -> !this.removeNonCoding || G.getTranscripts().stream().anyMatch(T -> T.isCoding())).forEach(G -> geneMap.put(new Interval(G), G));
}
}
archiveFactory = ArchiveFactory.open(this.outputPath);
if (manifestFile != null) {
manifestW = IOUtils.openPathForPrintWriter(this.manifestFile);
} else {
manifestW = new PrintWriter(new NullOuputStream());
}
final Pedigree pedigree;
if (this.pedPath == null) {
pedigree = PedigreeParser.empty();
} else {
pedigree = new PedigreeParser().parse(this.pedPath);
}
final Path tmpSvg = Files.createTempFile("vcf.", ".svg");
final XMLOutputFactory xof = XMLOutputFactory.newInstance();
for (final Locatable interval : intervalListProvider.dictionary(dict).stream().collect(Collectors.toList())) {
final List<VariantContext> variants = r.query(interval).stream().filter(V -> this.variantFILTEREDOpacity > 0 || !V.isFiltered()).filter(V -> this.variantIndelOpacity > 0 || !V.isIndel()).collect(Collectors.toCollection(ArrayList::new));
if (variants.isEmpty())
continue;
final List<Transcript> transcripts = geneMap.getOverlapping(interval).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> !this.removeNonCoding || T.isCoding()).collect(Collectors.toList());
variants.removeIf(V -> this.gtfPath != null && this.variantsInExonOnly && transcripts.stream().flatMap(T -> T.getExons().stream()).noneMatch(EX -> EX.overlaps(V)));
if (variants.isEmpty())
continue;
final String geneId = transcripts.stream().map(T -> T.getGene().getId()).collect(Collectors.toSet()).stream().collect(HtsCollectors.oneAndOnlyOne()).orElse(null);
final String geneName = transcripts.stream().map(T -> T.getGene().getGeneName()).collect(Collectors.toSet()).stream().collect(HtsCollectors.oneAndOnlyOne()).orElse(null);
outputStream = IOUtils.openPathForWriting(tmpSvg);
w = xof.createXMLStreamWriter(outputStream);
double featureHeight = 10;
double TRANSCRIPT_HEIGHT = featureHeight;
final int all_genotypes_width = variants.size() * this.genotype_width;
final int drawinAreaWidth = Math.max(all_genotypes_width, 1000);
final int interline_weight = 6;
final int margin_top = 10;
final int margin_bottom = 10;
final int margin_right = 100;
final int margin_left = 100;
w.writeStartDocument("UTF-8", "1.0");
w.writeStartElement("svg");
w.writeDefaultNamespace(SVG.NS);
w.writeNamespace("xlink", XLINK.NS);
w.writeAttribute("version", "1.1");
w.writeAttribute("width", String.valueOf(margin_right + margin_right + drawinAreaWidth));
w.writeAttribute("height", String.valueOf(margin_top + margin_bottom + transcripts.size() * TRANSCRIPT_HEIGHT + interline_weight * featureHeight + samples.size() * this.genotype_width));
title(w, interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
w.writeStartElement("desc");
w.writeCharacters("generated with " + getProgramName() + "\n" + "Author: Pierre Lindenbaum PhD. @yokofakun .");
w.writeEndElement();
// defs
w.writeStartElement("defs");
// genotypes
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HOM_REF);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:lime;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.NO_CALL);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:silver;stroke;gray;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HOM_VAR);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:crimson;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.MIXED);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:pink;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.UNAVAILABLE);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:gray;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HET);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:lime;stroke;black;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(genotype_width));
w.writeAttribute("height", String.valueOf(genotype_width));
w.writeEmptyElement("polygon");
w.writeAttribute("style", "fill:crimson;stroke;black;");
w.writeAttribute("points", "0,0 " + genotype_width + ",0 0," + genotype_width + " 0,0");
w.writeEndElement();
// strand
w.writeEmptyElement("polyline");
w.writeAttribute("id", "strandF");
w.writeAttribute("points", "-5,-5 0,0 -5,5");
w.writeEmptyElement("polyline");
w.writeAttribute("id", "strandR");
w.writeAttribute("points", "5,-5 0,0 5,5");
// gradients
w.writeStartElement("linearGradient");
w.writeAttribute("id", "grad01");
w.writeAttribute("x1", "50%");
w.writeAttribute("x2", "50%");
w.writeAttribute("y1", "0%");
w.writeAttribute("y2", "100%");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "0%");
w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "50%");
w.writeAttribute("style", "stop-color:white;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "100%");
w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
w.writeEndElement();
// defs
w.writeEndElement();
w.writeStartElement("style");
w.writeCharacters("svg {fill:none; stroke:black;}\n" + "text {fill:black;stroke:none;font-size:" + (featureHeight / 1.5) + "px;}\n" + ".ruler-label { stroke:red;}\n" + ".frame { stroke:black;fill:none;}\n" + ".kgexon {fill:url(#grad01);stroke:black;}\n" + ".gcpercent {fill:url(#grad02);stroke:black;}" + ".coverage {fill:url(#grad03);stroke:black;}" + ".kgcds {fill:yellow;stroke:black;opacity:0.7;}\n" + ".variant{stroke:none;fill:red;opacity:0.2;}\n" + ".xaxis{stroke:gray;fill:none;opacity:0.2;}\n" + ".postick{font-size:9px;stroke:black;stroke-width:1;}");
// style
w.writeEndElement();
final IntFunction<Integer> trim = t -> Math.max(interval.getStart(), Math.min(interval.getEnd(), t));
final IntFunction<Double> baseToPixel = t -> margin_left + drawinAreaWidth * (t - (double) interval.getStart()) / ((double) interval.getLengthOnReference());
final IntFunction<Double> variantIndexToPixel = idx -> {
final double variant_width = drawinAreaWidth / (double) variants.size();
final double midx = variant_width * idx + variant_width / 2.0;
return margin_left + midx - genotype_width / 2.0;
};
final Function<VariantContext, String> variantTitle = V -> (V.getContig().startsWith("chr") ? V.getContig().substring(3) : V.getContig()) + ":" + V.getStart() + " " + V.getReference().getDisplayString();
/**
* title
*/
double y = 0;
w.writeStartElement("text");
w.writeAttribute("x", "0");
w.writeAttribute("y", String.valueOf(featureHeight));
w.writeCharacters(interval.toString());
w.writeEndElement();
y += featureHeight;
for (final Transcript g : transcripts) {
int cdsHeigh = 5;
double exonHeight = TRANSCRIPT_HEIGHT - 5;
double midY = TRANSCRIPT_HEIGHT / 2;
w.writeStartElement("g");
w.writeAttribute("transform", "translate(0," + y + ")");
title(w, g.getId());
w.writeStartElement("text");
w.writeAttribute("x", String.valueOf(margin_left - 10));
w.writeAttribute("y", String.valueOf(featureHeight));
w.writeAttribute("style", "text-anchor:end;");
w.writeCharacters(g.getId());
w.writeEndElement();
/* transcript line */
w.writeEmptyElement("line");
w.writeAttribute("class", "kgtr");
w.writeAttribute("x1", String.valueOf(baseToPixel.apply(trim.apply(g.getTxStart()))));
w.writeAttribute("y1", String.valueOf(midY));
w.writeAttribute("x2", String.valueOf(baseToPixel.apply(trim.apply(g.getTxEnd()))));
w.writeAttribute("y2", String.valueOf(midY));
/* strand symbols */
for (double pixX = 0; pixX < drawinAreaWidth; pixX += 30) {
double pos0 = interval.getStart() + (pixX / (double) drawinAreaWidth) * interval.getLengthOnReference();
if (pos0 + 1 < g.getTxStart())
continue;
if (pos0 > g.getTxEnd())
break;
w.writeEmptyElement("use");
w.writeAttribute("class", "kgstrand");
w.writeAttribute("xlink", XLINK.NS, "href", "#strand" + (g.isPositiveStrand() ? "F" : "R"));
w.writeAttribute("x", String.valueOf(margin_left + pixX));
w.writeAttribute("y", String.valueOf(midY));
}
/* exons */
for (final Exon exon : g.getExons()) {
if (exon.getStart() + 1 >= interval.getEnd())
continue;
if (exon.getEnd() <= interval.getStart())
continue;
w.writeStartElement("rect");
w.writeAttribute("class", "kgexon");
w.writeAttribute("x", String.valueOf(baseToPixel.apply(trim.apply(exon.getStart()))));
w.writeAttribute("y", String.valueOf(midY - exonHeight / 2));
w.writeAttribute("width", String.valueOf(baseToPixel.apply(trim.apply(exon.getEnd())) - baseToPixel.apply((trim.apply(exon.getStart())))));
w.writeAttribute("height", String.valueOf(exonHeight));
title(w, exon.getName());
w.writeEndElement();
}
/* coding line */
if (!g.isNonCoding() && g.hasCodonStartDefined() && g.hasCodonStopDefined()) {
final double codonx1 = baseToPixel.apply(trim.apply(g.getLeftmostCodon().get().getStart()));
final double codonx2 = baseToPixel.apply(trim.apply(g.getRightmostCodon().get().getEnd()));
w.writeEmptyElement("rect");
w.writeAttribute("class", "kgcds");
w.writeAttribute("x", String.valueOf(codonx1));
w.writeAttribute("y", String.valueOf(midY - cdsHeigh / 4.0));
w.writeAttribute("width", String.valueOf(baseToPixel.apply((int) (codonx2 - codonx1))));
w.writeAttribute("height", String.valueOf(cdsHeigh / 2.0));
}
// String label=String.format("%15s", g.getName());
// w.writeEmptyElement("path");
// double fontHeight=Math.min(10,0.8*TRANSCRIPT_HEIGHT);
// w.writeAttribute("d",this.hershey.svgPath(label,-insets.left,midY-fontHeight/2,insets.left*0.9,fontHeight));
w.writeEndElement();
w.writeCharacters("\n");
y += featureHeight;
}
/* draw lines to variants */
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
double x1 = baseToPixel.apply(vc.getStart());
double x2 = baseToPixel.apply(vc.getEnd());
final double y2 = y + featureHeight * interline_weight;
w.writeStartElement("polygon");
w.writeAttribute("style", "fill:" + (vidx % 2 == 0 ? "ghostwhite" : "lavender") + ";stroke:black;opacity:0.6;stroke-width:0.5;");
w.writeAttribute("points", "" + x1 + "," + (y - featureHeight / 2.0) + " " + x2 + "," + (y - featureHeight / 2.0) + " " + variantIndexToPixel.apply(vidx) + "," + y2 + " " + (variantIndexToPixel.apply(vidx) + this.genotype_width) + "," + y2);
title(w, variantTitle.apply(vc));
w.writeEndElement();
}
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
final double y2 = y + featureHeight * interline_weight;
w.writeStartElement("text");
w.writeAttribute("transform", "translate(" + (String.valueOf(variantIndexToPixel.apply(vidx) + genotype_width / 2.0)) + "," + String.valueOf(y2 - 5) + ") " + "rotate(-45)");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("class", "postick");
w.writeCharacters(variantTitle.apply(vc));
w.writeEndElement();
w.writeCharacters("\n");
}
y += featureHeight * interline_weight;
w.writeStartElement("g");
/* step 0: affected, 1: unaffected, 2: others */
for (int step = 0; step < 3; ++step) {
for (final String sample : samples) {
final Sample individual = pedigree.getSampleById(sample);
if (step == 0 && (individual == null || !individual.isAffected()))
continue;
if (step == 1 && (individual == null || !individual.isUnaffected()))
continue;
if (step == 2 && individual != null && individual.isStatusSet())
continue;
// sample
w.writeStartElement("g");
switch(step) {
case 0:
w.writeAttribute("style", "hue-rotate(195deg);");
break;
case 1:
w.writeAttribute("style", "hue-rotate(45deg);");
break;
default:
break;
}
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
final Genotype g = vc.getGenotype(sample);
double opacity = 1.0;
if (vc.isIndel())
opacity *= this.variantIndelOpacity;
if (vc.isFiltered())
opacity *= this.variantFILTEREDOpacity;
if (opacity > 1)
opacity = 1;
if (opacity <= 0)
continue;
if (opacity < 1) {
w.writeStartElement("g");
w.writeAttribute("style", "opacity:" + opacity + ";");
}
w.writeEmptyElement("use");
w.writeAttribute("x", String.valueOf(variantIndexToPixel.apply(vidx)));
w.writeAttribute("y", String.valueOf(y));
w.writeAttribute("xlink", XLINK.NS, "href", "#g_" + g.getType());
if (opacity < 1) {
w.writeEndElement();
}
}
w.writeCharacters("\n");
w.writeStartElement("text");
w.writeAttribute("x", String.valueOf(margin_left - 10));
w.writeAttribute("y", String.valueOf(y + this.genotype_width / 2.0));
w.writeAttribute("style", "text-anchor:end;");
w.writeCharacters(sample);
// text
w.writeEndElement();
// g for sample
w.writeEndElement();
y += this.genotype_width;
}
}
w.writeCharacters("\n");
w.writeEndDocument();
w.writeCharacters("\n");
w.flush();
w.close();
final String md5 = StringUtils.md5(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + interval.getContig().replaceAll("[/\\:]", "_") + "_" + interval.getStart() + "_" + interval.getEnd() + (StringUtils.isBlank(geneName) ? "" : "." + geneName.replaceAll("[/\\:]", "")) + (StringUtils.isBlank(geneId) ? "" : "." + geneId.replaceAll("[/\\:]", "")) + ".svg";
OutputStream os = archiveFactory.openOuputStream(filename);
IOUtils.copyTo(tmpSvg, os);
os.flush();
os.close();
Files.delete(tmpSvg);
manifestW.print(interval.getContig());
manifestW.print('\t');
manifestW.print(interval.getStart() - 1);
manifestW.print('\t');
manifestW.print(interval.getEnd());
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getGene().getId()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getGene().getGeneName()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getId()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputPath.toString() + File.separator) + filename);
manifestW.print('\t');
manifestW.println(variants.size());
}
r.close();
manifestW.flush();
manifestW.close();
manifestW = null;
archiveFactory.close();
archiveFactory = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(archiveFactory);
CloserUtil.close(r);
CloserUtil.close(outputStream);
CloserUtil.close(manifestW);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class SVPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
try {
final VCFHeader header = r.getHeader();
final SAMSequenceDictionary dict = header.getSequenceDictionary();
try (final GtfReader gtfReader = new GtfReader(this.gtfPath)) {
if (dict != null)
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().forEach(G -> this.all_gene.put(new Interval(G), G));
}
final VCFHeader h2 = new VCFHeader(header);
/* split 'limit' string */
if (StringUtils.isBlank(this.whereStr)) {
// add all
Arrays.stream(WhereInGene.values()).forEach(C -> this.limitWhere.add(C));
} else {
for (final String ss : this.whereStr.split("[ \t,;]+")) {
if (StringUtils.isBlank(ss))
continue;
// gene and transcript expand to everything but intergenic
if (ss.toLowerCase().equals("gene") || ss.toLowerCase().equals("transcript")) {
Arrays.stream(WhereInGene.values()).filter(C -> !C.equals(WhereInGene.intergenic)).forEach(C -> this.limitWhere.add(C));
} else {
final WhereInGene g = Arrays.stream(WhereInGene.values()).filter(A -> A.name().equalsIgnoreCase(ss)).findFirst().orElseThrow(() -> new IllegalArgumentException("Bad identifier expected one of :" + Arrays.stream(WhereInGene.values()).map(X -> X.name()).collect(Collectors.joining(", "))));
this.limitWhere.add(g);
}
}
if (this.limitWhere.isEmpty()) {
LOG.error("--where option provided but no identifier was found.");
return -1;
}
}
final VCFFilterHeaderLine filterHeader;
if (!StringUtils.isBlank(this.filterStr)) {
filterHeader = new VCFFilterHeaderLine(this.filterStr, "variant failing locations: " + this.limitWhere.stream().map(V -> V.name()).collect(Collectors.joining(",")));
h2.addMetaDataLine(filterHeader);
} else {
filterHeader = null;
}
h2.addMetaDataLine(new VCFInfoHeaderLine(this.info_tag, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Structural variant consequence."));
JVarkitVersion.getInstance().addMetaData(this, h2);
w.writeHeader(h2);
while (r.hasNext()) {
final VariantContext ctx = r.next();
final int ctx_bnd_end;
if (this.ignore_bnd_end && ctx.hasAttribute(VCFConstants.SVTYPE) && ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND")) {
ctx_bnd_end = ctx.getStart();
} else {
ctx_bnd_end = ctx.getEnd();
}
final Collection<Gene> genes = this.all_gene.getOverlapping(new Interval(ctx.getContig(), Math.max(1, ctx.getStart() - this.upstream_size), ctx_bnd_end + this.upstream_size));
if (// intergenic
genes.isEmpty()) {
// discard anyway
if (!this.limitWhere.contains(WhereInGene.intergenic) && filterHeader == null)
continue;
Gene leftGene = null;
for (final Gene g : this.all_gene.getOverlapping(new Interval(ctx.getContig(), 1, ctx.getStart()))) {
if (leftGene == null || leftGene.getEnd() < g.getEnd())
leftGene = g;
}
final String leftId = (leftGene == null ? "" : leftGene.getId());
final String leftName = (leftGene == null ? "" : leftGene.getGeneName());
Gene rightGene = null;
for (final Gene g : this.all_gene.getOverlapping(new Interval(ctx.getContig(), ctx.getStart(), Integer.MAX_VALUE))) {
if (rightGene == null || rightGene.getStart() > g.getStart())
rightGene = g;
}
final String rightId = (rightGene == null ? "" : rightGene.getId());
final String rightName = (rightGene == null ? "" : rightGene.getGeneName());
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
// FILTER
if (!this.limitWhere.contains(WhereInGene.intergenic) && filterHeader != null) {
vcb.filter(filterHeader.getID());
}
if (!(leftGene == null && rightGene == null)) {
vcb.attribute(this.info_tag, "intergenic|" + leftId + "-" + rightId + "|" + leftName + "-" + rightName);
}
w.add(vcb.make());
} else {
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
final List<List<Annotation>> annotations = genes.stream().map(G -> annotGene(G, ctx)).collect(Collectors.toList());
final boolean match_user_filter = annotations.stream().flatMap(L -> L.stream()).flatMap(A -> A.where.stream()).anyMatch(A -> this.limitWhere.contains(A));
if (!match_user_filter) {
if (filterHeader == null)
continue;
vcb.filter(filterHeader.getID());
}
if (this.max_genes_count != -1 && genes.size() > this.max_genes_count) {
final String prefix = annotations.stream().flatMap(L -> L.stream()).flatMap(A -> A.where.stream()).map(A -> A.name()).collect(Collectors.toCollection(TreeSet::new)).stream().collect(Collectors.joining("&"));
vcb.attribute(this.info_tag, (StringUtils.isBlank(prefix) ? "." : prefix) + "|multiple_genes|" + genes.size());
} else {
final List<String> annots = annotations.stream().flatMap(L -> L.stream()).map(A -> A.where.stream().map(X -> X.name()).collect(Collectors.toCollection(TreeSet::new)).stream().collect(Collectors.joining("&")) + A.label).collect(Collectors.toSet()).stream().collect(Collectors.toList());
if (!annots.isEmpty())
vcb.attribute(this.info_tag, annots);
}
w.add(vcb.make());
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VCFPredictions method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VCFIterator r, VariantContextWriter w) {
try {
this.referenceGenome = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidxPath);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceGenome);
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
loadGtf(dict);
final VCFHeader header = r.getHeader();
final VCFHeader h2 = new VCFHeader(header);
JVarkitVersion.getInstance().addMetaData(this, h2);
switch(this.outputSyntax) {
case Vep:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("CSQ", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Consequence type as predicted by VEP" + ". Format: Allele|Feature|Feature_type|Consequence|CDS_position|Protein_position|Amino_acids|Codons"));
break;
}
case SnpEff:
{
h2.addMetaDataLine(new VCFInfoHeaderLine("ANN", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Functional annotations: 'Allele | Annotation | Annotation_Impact | Gene_Name | Gene_ID | Feature_Type | Feature_ID | Transcript_BioType | Rank | HGVS.c | HGVS.p | cDNA.pos / cDNA.length | CDS.pos / CDS.length | AA.pos / AA.length | Distance | ERRORS / WARNINGS / INFO'"));
break;
}
default:
{
final StringBuilder format = new StringBuilder();
for (FORMAT1 f : FORMAT1.values()) {
if (format.length() > 0)
format.append("|");
format.append(f.name());
}
h2.addMetaDataLine(new VCFInfoHeaderLine(TAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Prediction from " + getClass().getSimpleName() + ". Format: " + format));
break;
}
}
w.writeHeader(h2);
final RNASequenceFactory rnaSeqFactory = new RNASequenceFactory();
rnaSeqFactory.setContigToGenomicSequence(S -> getGenomicSequence(S));
while (r.hasNext()) {
final VariantContext ctx = r.next();
final String normalizedContig = contigNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(normalizedContig)) {
w.add(ctx);
continue;
}
final List<Transcript> transcripts = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd())).stream().flatMap(L -> L.stream()).collect(Collectors.toList());
final List<Annotation> all_annotations = new ArrayList<>();
final List<Allele> alternateAlleles;
if (ctx.getNAlleles() <= 1) {
// not a variant, just REF
alternateAlleles = Arrays.asList(Allele.NO_CALL);
} else {
alternateAlleles = ctx.getAlternateAlleles();
}
for (final Allele altAllele : alternateAlleles) {
if (altAllele.isReference() || altAllele.equals(Allele.SPAN_DEL) || altAllele.equals(Allele.NON_REF_ALLELE))
continue;
/* intergenic ====================================================== */
if (transcripts.isEmpty()) {
Transcript leftGene = null;
String leftId = "";
String leftName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, 1, ctx.getStart())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (leftGene == null || leftGene.getEnd() < t.getEnd()) {
leftGene = t;
leftId = t.getGene().getId();
leftName = t.getGene().getGeneName();
}
}
Transcript rightGene = null;
String rightId = "";
String rightName = "";
for (Iterator<Transcript> iter = this.transcriptTreeMap.getOverlapping(new SimpleInterval(normalizedContig, ctx.getEnd(), dict.getSequence(normalizedContig).getSequenceLength())).stream().flatMap(L -> L.stream()).iterator(); iter.hasNext(); ) {
final Transcript t = iter.next();
if (rightGene == null || t.getStart() < rightGene.getStart()) {
rightGene = t;
rightId = t.getGene().getId();
rightName = t.getGene().getGeneName();
}
}
// intergenic
final Annotation annot = new Annotation(altAllele);
annot.seqont.add("intergenic");
annot.geneId = leftId + "-" + rightId;
annot.geneName = leftName + "-" + rightName;
all_annotations.add(annot);
} else {
for (final Transcript transcript : transcripts) {
final Annotation annotation = new Annotation(altAllele, transcript);
all_annotations.add(annotation);
if (!transcript.overlaps(ctx)) {
if (((ctx.getEnd() < transcript.getStart() && transcript.isNegativeStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isPositiveStrand()))) {
if (ctx.withinDistanceOf(transcript, 500)) {
annotation.seqont.add("500B_downstream_variant");
} else if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_downstream_variant");
}
} else if (((ctx.getEnd() < transcript.getStart() && transcript.isPositiveStrand()) || (ctx.getStart() > transcript.getEnd() && transcript.isNegativeStrand()))) {
if (ctx.withinDistanceOf(transcript, 2_000)) {
annotation.seqont.add("2KB_upstream_variant");
} else if (ctx.withinDistanceOf(transcript, 5_000)) {
annotation.seqont.add("5KB_upstream_variant");
}
}
continue;
}
if (CoordMath.encloses(ctx.getStart(), ctx.getEnd(), transcript.getStart(), transcript.getEnd())) {
// TODO can be inversion ,etc...
annotation.seqont.add("transcript_ablation");
continue;
}
for (int side = 0; side < 2; ++side) {
final Optional<UTR> opt_utr = (side == 0 ? transcript.getTranscriptUTR5() : transcript.getTranscriptUTR3());
if (!opt_utr.isPresent())
continue;
final UTR utr = opt_utr.get();
if (CoordMath.overlaps(utr.getStart(), utr.getEnd(), ctx.getStart(), ctx.getEnd())) {
annotation.seqont.add(side == 0 ? "5_prime_UTR_variant" : "3_prime_UTR_variant");
}
}
for (int side = 0; side < 2; ++side) {
final Optional<? extends ExonOrIntron> opt_ex;
if (side == 0) {
opt_ex = transcript.getExons().stream().filter(E -> E.overlaps(ctx)).findFirst();
} else {
opt_ex = transcript.getIntrons().stream().filter(E -> E.overlaps(ctx)).findFirst();
}
if (!opt_ex.isPresent())
continue;
final ExonOrIntron ei = opt_ex.get();
if (side == 0) {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_exon_variant");
} else {
if (transcript.isNonCoding())
annotation.seqont.add("non_coding_transcript_intron_variant");
annotation.seqont.add("intron");
}
if (ctx.getStart() == ctx.getEnd() && ei.isSplicing(ctx.getStart())) {
if (ei.isSplicingAcceptor(ctx.getStart())) {
// SPLICING_ACCEPTOR
annotation.seqont.add("splice_acceptor");
} else if (ei.isSplicingDonor(ctx.getStart())) {
// SPLICING_DONOR
annotation.seqont.add("splice_donor");
} else // ??
{
annotation.seqont.add("splicing_variant");
}
}
}
final StructuralVariantType svType = ctx.getStructuralVariantType();
if (svType != null) {
continue;
}
if (transcript.isNonCoding()) {
// TODO
annotation.seqont.add("non_coding_transcript_variant");
continue;
}
RNASequence cDNA = this.transcriptId2cdna.get(transcript.getId());
if (cDNA == null) {
cDNA = rnaSeqFactory.getCodingRNA(transcript);
this.transcriptId2cdna.put(transcript.getId(), cDNA);
}
final OptionalInt opt_pos_cdna0 = cDNA.convertGenomic0ToRnaIndex0(ctx.getStart() - 1);
if (!opt_pos_cdna0.isPresent())
continue;
final int pos_cdna0 = opt_pos_cdna0.getAsInt();
final int pos_aa = pos_cdna0 / 3;
final GeneticCode geneticCode = GeneticCode.getStandard();
if (AcidNucleics.isATGC(altAllele.getBaseString())) {
String bases = altAllele.getBaseString().toUpperCase();
if (transcript.isNegativeStrand()) {
bases = AcidNucleics.reverseComplement(bases);
}
final MutedSequence mutRNA = new MutedSequence(cDNA, pos_cdna0, ctx.getReference().length(), bases);
final PeptideSequence<CharSequence> wildProt = PeptideSequence.of(cDNA, geneticCode);
final PeptideSequence<CharSequence> mutProt = PeptideSequence.of(mutRNA, geneticCode);
final int mod = pos_cdna0 % 3;
annotation.wildCodon = ("" + cDNA.charAt(pos_cdna0 - mod + 0) + cDNA.charAt(pos_cdna0 - mod + 1) + cDNA.charAt(pos_cdna0 - mod + 2));
annotation.mutCodon = ("" + mutRNA.charAt(pos_cdna0 - mod + 0) + mutRNA.charAt(pos_cdna0 - mod + 1) + mutRNA.charAt(pos_cdna0 - mod + 2));
annotation.position_protein = (pos_aa + 1);
annotation.wildAA = String.valueOf(wildProt.charAt(pos_aa));
annotation.mutAA = (String.valueOf(mutProt.charAt(pos_aa)));
if (isStop(wildProt.charAt(pos_aa)) && !isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_lost");
} else if (!isStop(wildProt.charAt(pos_aa)) && isStop(mutProt.charAt(pos_aa))) {
annotation.seqont.add("stop_gained");
} else if (wildProt.charAt(pos_aa) == mutProt.charAt(pos_aa)) {
annotation.seqont.add("synonymous");
} else {
annotation.seqont.add("missense_variant");
}
}
}
}
}
final Set<String> info = new HashSet<String>(all_annotations.size());
for (final Annotation a : all_annotations) {
info.add(a.toString());
}
final VariantContextBuilder vb = new VariantContextBuilder(ctx);
final String thetag;
switch(this.outputSyntax) {
case Vep:
thetag = "CSQ";
break;
case SnpEff:
thetag = "ANN";
break;
default:
thetag = TAG;
break;
}
vb.attribute(thetag, info.toArray());
w.add(vb.make());
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
CloserUtil.close(r);
CloserUtil.close(this.referenceGenome);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VCFPredictions method loadGtf.
private void loadGtf(final SAMSequenceDictionary dict) throws IOException {
GtfReader in = null;
try {
this.transcriptTreeMap = new IntervalTreeMap<>();
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
in = new GtfReader(this.gtfPath);
in.setContigNameConverter(contigNameConverter);
in.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(G -> G.hasCDS() && G.hasStrand()).forEach(g -> {
// because we want to set
final int extend_gene_search = 5000;
// SO:5KB_upstream_variant
final Interval interval = new Interval(g.getContig(), Math.max(1, g.getTxStart() + 1 - extend_gene_search), g.getTxEnd() + extend_gene_search);
List<Transcript> L = this.transcriptTreeMap.get(interval);
if (L == null) {
L = new ArrayList<>(2);
this.transcriptTreeMap.put(interval, L);
}
L.add(g);
});
} finally {
CloserUtil.close(in);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Transcript in project jvarkit by lindenb.
the class VcfBurdenGtf method runBurden.
@Override
protected void runBurden(PrintWriter pw, VCFReader vcfReader, VariantContextWriter vcw) throws IOException {
final SAMSequenceDictionary vcfDict = SequenceDictionaryUtils.extractRequired(vcfReader.getHeader());
final List<Gene> all_genes;
try (GtfReader gtfReader = new GtfReader(this.gtfFile)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(vcfDict));
all_genes = gtfReader.getAllGenes().stream().filter(G -> StringUtil.isBlank(this.intergenic_contig) || this.intergenic_contig.equals("*") || this.intergenic_contig.equals(G.getContig())).sorted(new ContigDictComparator(vcfDict).createLocatableComparator()).collect(Collectors.toCollection(ArrayList::new));
}
pw.print("#chrom");
pw.print("\t");
pw.print("start0");
pw.print("\t");
pw.print("end");
pw.print("\t");
pw.print("name");
pw.print("\t");
pw.print("length");
pw.print("\t");
pw.print("gene");
pw.print("\t");
pw.print("type");
pw.print("\t");
pw.print("strand");
pw.print("\t");
pw.print("transcript");
pw.print("\t");
pw.print("gene-id");
pw.print("\t");
pw.print("intervals");
pw.print("\t");
pw.print("p-value");
pw.print("\t");
pw.print("affected_alt");
pw.print("\t");
pw.print("affected_hom");
pw.print("\t");
pw.print("unaffected_alt");
pw.print("\t");
pw.print("unaffected_hom");
pw.print("\t");
pw.print("variants.count");
pw.println();
final List<SimpleInterval> all_intergenic = new ArrayList<>();
if (!StringUtil.isBlank(this.intergenic_contig)) {
for (final SAMSequenceRecord ssr : vcfDict.getSequences()) {
if (!(this.intergenic_contig.equals("*") || this.intergenic_contig.equals(ssr.getSequenceName())))
continue;
final BitSet filled = new BitSet(ssr.getSequenceLength() + 2);
all_genes.stream().filter(G -> G.getContig().equals(ssr.getSequenceName())).forEach(G -> filled.set(G.getStart(), 1 + /* bit set is 0 based */
Math.min(G.getEnd(), ssr.getSequenceLength())));
int i = 1;
while (i < ssr.getSequenceLength()) {
if (filled.get(i)) {
i++;
continue;
}
int j = i;
while (j < ssr.getSequenceLength() && !filled.get(j)) {
j++;
}
all_intergenic.add(new SimpleInterval(ssr.getSequenceName(), i, j));
i = j + 1;
}
all_genes.removeIf(G -> G.getContig().equals(ssr.getSequenceName()));
}
all_genes.clear();
}
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
/* run genes */
for (final Gene gene : all_genes) {
progress.apply(gene);
final IntervalTree<VariantContext> intervalTree = new IntervalTree<>();
vcfReader.query(gene).stream().filter(V -> accept(V)).forEach(V -> intervalTree.put(V.getStart(), V.getEnd(), V));
if (intervalTree.size() == 0)
continue;
for (final Transcript transcript : gene.getTranscripts()) {
final List<SubPartOfTranscript> parts = new ArrayList<>();
parts.addAll(transcript.getExons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
parts.addAll(transcript.getIntrons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
final int intron_window_size = 1000;
final int intron_window_shift = 500;
for (final Intron intron : transcript.getIntrons()) {
if (intron.getLengthOnReference() <= intron_window_size)
continue;
int start_pos = intron.getStart();
while (start_pos + intron_window_size <= intron.getEnd()) {
int xend = Math.min(intron.getEnd(), start_pos + intron_window_size - 1);
int xstart = xend - intron_window_size - 1;
parts.add(new SubPartOfTranscript(transcript, intron.getName() + ".Sliding", Collections.singletonList(new SimpleInterval(intron.getContig(), xstart, xend))));
start_pos += intron_window_shift;
}
}
for (final UTR utr : transcript.getUTRs()) {
parts.add(new SubPartOfTranscript(transcript, utr.getName(), utr.getIntervals()));
}
if (transcript.getExonCount() > 1) {
parts.add(new SubPartOfTranscript(transcript, "AllExons", transcript.getExons().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
}
if (transcript.hasCodonStartDefined() && transcript.hasCodonStopDefined() && transcript.getAllCds().size() > 1) {
parts.add(new SubPartOfTranscript(transcript, "AllCds", transcript.getAllCds().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
}
final int L = transcript.getTranscriptLength();
final int[] index2genomic = new int[L];
int pos = 0;
for (final Exon exon : transcript.getExons()) {
for (int i = exon.getStart(); i <= exon.getEnd(); i++) {
index2genomic[pos] = i;
pos++;
}
}
final int window_size = 200;
final int window_shift = 100;
int array_index = 0;
while (array_index < index2genomic.length) {
final List<Locatable> intervals = new ArrayList<>();
int prev_pos = -1;
int start_pos = index2genomic[array_index];
int i = 0;
while (i < window_size && array_index + i < index2genomic.length) {
final int curr_pos = index2genomic[array_index + i];
if (i > 0 && prev_pos + 1 != curr_pos) {
intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
start_pos = curr_pos;
}
prev_pos = curr_pos;
i++;
}
intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
parts.add(new SubPartOfTranscript(transcript, "Sliding", intervals));
array_index += window_shift;
}
for (final SubPartOfTranscript part : parts) {
final List<VariantContext> variants = new ArrayList<>();
for (final Locatable loc : part.intervals) {
Iterator<IntervalTree.Node<VariantContext>> iter = intervalTree.overlappers(loc.getStart(), loc.getEnd());
while (iter.hasNext()) variants.add(iter.next().getValue());
}
if (variants.isEmpty())
continue;
final FisherResult fisher = runFisher(variants);
if (fisher.p_value > this.fisherTreshold)
continue;
if (vcw != null) {
for (final VariantContext ctx : variants) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(part.label)).make());
}
}
pw.print(part.getContig());
pw.print("\t");
pw.print(part.getStart() - 1);
pw.print("\t");
pw.print(part.getEnd());
pw.print("\t");
pw.print(part.label);
pw.print("\t");
pw.print(part.getLengthOnReference());
pw.print("\t");
pw.print(transcript.getProperties().getOrDefault("gene_name", "."));
pw.print("\t");
pw.print(transcript.getProperties().getOrDefault("transcript_type", "."));
pw.print("\t");
pw.print(gene.getStrand());
pw.print("\t");
pw.print(transcript.getId());
pw.print("\t");
pw.print(gene.getId());
pw.print("\t");
pw.print(part.intervals.stream().map(R -> String.valueOf(R.getStart()) + "-" + R.getEnd()).collect(Collectors.joining(";")));
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(variants.size());
pw.println();
}
}
}
progress.close();
final ProgressFactory.Watcher<SimpleInterval> progress2 = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
/**
* scan intergenics ...
*/
for (final SimpleInterval intergenic : all_intergenic) {
progress2.apply(intergenic);
final int intergenic_window_size = 2000;
final int intergenic_window_shifr = 100;
final List<SimpleInterval> parts = new ArrayList<>();
if (intergenic.getLengthOnReference() <= intergenic_window_size)
continue;
int start_pos = intergenic.getStart();
while (start_pos + intergenic_window_size <= intergenic.getEnd()) {
int xend = Math.min(intergenic.getEnd(), start_pos + intergenic_window_size - 1);
int xstart = xend - intergenic_window_size - 1;
parts.add(new SimpleInterval(intergenic.getContig(), xstart, xend));
start_pos += intergenic_window_shifr;
}
for (final SimpleInterval part : parts) {
final List<VariantContext> variants = vcfReader.query(part).stream().filter(V -> accept(V)).collect(Collectors.toList());
if (variants.isEmpty())
continue;
final FisherResult fisher = runFisher(variants);
if (fisher.p_value > this.fisherTreshold)
continue;
final String label = "intergenic_" + part.getStart() + "_" + part.getEnd();
if (vcw != null) {
for (final VariantContext ctx : variants) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(label)).make());
}
}
pw.print(part.getContig());
pw.print("\t");
pw.print(part.getStart() - 1);
pw.print("\t");
pw.print(part.getEnd());
pw.print("\t");
pw.print(label);
pw.print("\t");
pw.print(part.getLengthOnReference());
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print("intergenic");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print("" + part.getStart() + "-" + part.getEnd());
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(variants.size());
pw.println();
}
}
progress2.close();
}
Aggregations