use of com.github.lindenb.jvarkit.util.bio.structure.Gene in project jvarkit by lindenb.
the class VcfBurdenGtf method runBurden.
@Override
protected void runBurden(PrintWriter pw, VCFReader vcfReader, VariantContextWriter vcw) throws IOException {
final SAMSequenceDictionary vcfDict = SequenceDictionaryUtils.extractRequired(vcfReader.getHeader());
final List<Gene> all_genes;
try (GtfReader gtfReader = new GtfReader(this.gtfFile)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(vcfDict));
all_genes = gtfReader.getAllGenes().stream().filter(G -> StringUtil.isBlank(this.intergenic_contig) || this.intergenic_contig.equals("*") || this.intergenic_contig.equals(G.getContig())).sorted(new ContigDictComparator(vcfDict).createLocatableComparator()).collect(Collectors.toCollection(ArrayList::new));
}
pw.print("#chrom");
pw.print("\t");
pw.print("start0");
pw.print("\t");
pw.print("end");
pw.print("\t");
pw.print("name");
pw.print("\t");
pw.print("length");
pw.print("\t");
pw.print("gene");
pw.print("\t");
pw.print("type");
pw.print("\t");
pw.print("strand");
pw.print("\t");
pw.print("transcript");
pw.print("\t");
pw.print("gene-id");
pw.print("\t");
pw.print("intervals");
pw.print("\t");
pw.print("p-value");
pw.print("\t");
pw.print("affected_alt");
pw.print("\t");
pw.print("affected_hom");
pw.print("\t");
pw.print("unaffected_alt");
pw.print("\t");
pw.print("unaffected_hom");
pw.print("\t");
pw.print("variants.count");
pw.println();
final List<SimpleInterval> all_intergenic = new ArrayList<>();
if (!StringUtil.isBlank(this.intergenic_contig)) {
for (final SAMSequenceRecord ssr : vcfDict.getSequences()) {
if (!(this.intergenic_contig.equals("*") || this.intergenic_contig.equals(ssr.getSequenceName())))
continue;
final BitSet filled = new BitSet(ssr.getSequenceLength() + 2);
all_genes.stream().filter(G -> G.getContig().equals(ssr.getSequenceName())).forEach(G -> filled.set(G.getStart(), 1 + /* bit set is 0 based */
Math.min(G.getEnd(), ssr.getSequenceLength())));
int i = 1;
while (i < ssr.getSequenceLength()) {
if (filled.get(i)) {
i++;
continue;
}
int j = i;
while (j < ssr.getSequenceLength() && !filled.get(j)) {
j++;
}
all_intergenic.add(new SimpleInterval(ssr.getSequenceName(), i, j));
i = j + 1;
}
all_genes.removeIf(G -> G.getContig().equals(ssr.getSequenceName()));
}
all_genes.clear();
}
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
/* run genes */
for (final Gene gene : all_genes) {
progress.apply(gene);
final IntervalTree<VariantContext> intervalTree = new IntervalTree<>();
vcfReader.query(gene).stream().filter(V -> accept(V)).forEach(V -> intervalTree.put(V.getStart(), V.getEnd(), V));
if (intervalTree.size() == 0)
continue;
for (final Transcript transcript : gene.getTranscripts()) {
final List<SubPartOfTranscript> parts = new ArrayList<>();
parts.addAll(transcript.getExons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
parts.addAll(transcript.getIntrons().stream().map(R -> new SubPartOfTranscript(R)).collect(Collectors.toList()));
final int intron_window_size = 1000;
final int intron_window_shift = 500;
for (final Intron intron : transcript.getIntrons()) {
if (intron.getLengthOnReference() <= intron_window_size)
continue;
int start_pos = intron.getStart();
while (start_pos + intron_window_size <= intron.getEnd()) {
int xend = Math.min(intron.getEnd(), start_pos + intron_window_size - 1);
int xstart = xend - intron_window_size - 1;
parts.add(new SubPartOfTranscript(transcript, intron.getName() + ".Sliding", Collections.singletonList(new SimpleInterval(intron.getContig(), xstart, xend))));
start_pos += intron_window_shift;
}
}
for (final UTR utr : transcript.getUTRs()) {
parts.add(new SubPartOfTranscript(transcript, utr.getName(), utr.getIntervals()));
}
if (transcript.getExonCount() > 1) {
parts.add(new SubPartOfTranscript(transcript, "AllExons", transcript.getExons().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
}
if (transcript.hasCodonStartDefined() && transcript.hasCodonStopDefined() && transcript.getAllCds().size() > 1) {
parts.add(new SubPartOfTranscript(transcript, "AllCds", transcript.getAllCds().stream().map(E -> E.toInterval()).collect(Collectors.toList())));
}
final int L = transcript.getTranscriptLength();
final int[] index2genomic = new int[L];
int pos = 0;
for (final Exon exon : transcript.getExons()) {
for (int i = exon.getStart(); i <= exon.getEnd(); i++) {
index2genomic[pos] = i;
pos++;
}
}
final int window_size = 200;
final int window_shift = 100;
int array_index = 0;
while (array_index < index2genomic.length) {
final List<Locatable> intervals = new ArrayList<>();
int prev_pos = -1;
int start_pos = index2genomic[array_index];
int i = 0;
while (i < window_size && array_index + i < index2genomic.length) {
final int curr_pos = index2genomic[array_index + i];
if (i > 0 && prev_pos + 1 != curr_pos) {
intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
start_pos = curr_pos;
}
prev_pos = curr_pos;
i++;
}
intervals.add(new SimpleInterval(transcript.getContig(), start_pos, prev_pos));
parts.add(new SubPartOfTranscript(transcript, "Sliding", intervals));
array_index += window_shift;
}
for (final SubPartOfTranscript part : parts) {
final List<VariantContext> variants = new ArrayList<>();
for (final Locatable loc : part.intervals) {
Iterator<IntervalTree.Node<VariantContext>> iter = intervalTree.overlappers(loc.getStart(), loc.getEnd());
while (iter.hasNext()) variants.add(iter.next().getValue());
}
if (variants.isEmpty())
continue;
final FisherResult fisher = runFisher(variants);
if (fisher.p_value > this.fisherTreshold)
continue;
if (vcw != null) {
for (final VariantContext ctx : variants) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(part.label)).make());
}
}
pw.print(part.getContig());
pw.print("\t");
pw.print(part.getStart() - 1);
pw.print("\t");
pw.print(part.getEnd());
pw.print("\t");
pw.print(part.label);
pw.print("\t");
pw.print(part.getLengthOnReference());
pw.print("\t");
pw.print(transcript.getProperties().getOrDefault("gene_name", "."));
pw.print("\t");
pw.print(transcript.getProperties().getOrDefault("transcript_type", "."));
pw.print("\t");
pw.print(gene.getStrand());
pw.print("\t");
pw.print(transcript.getId());
pw.print("\t");
pw.print(gene.getId());
pw.print("\t");
pw.print(part.intervals.stream().map(R -> String.valueOf(R.getStart()) + "-" + R.getEnd()).collect(Collectors.joining(";")));
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(variants.size());
pw.println();
}
}
}
progress.close();
final ProgressFactory.Watcher<SimpleInterval> progress2 = ProgressFactory.newInstance().logger(LOG).dictionary(vcfDict).build();
/**
* scan intergenics ...
*/
for (final SimpleInterval intergenic : all_intergenic) {
progress2.apply(intergenic);
final int intergenic_window_size = 2000;
final int intergenic_window_shifr = 100;
final List<SimpleInterval> parts = new ArrayList<>();
if (intergenic.getLengthOnReference() <= intergenic_window_size)
continue;
int start_pos = intergenic.getStart();
while (start_pos + intergenic_window_size <= intergenic.getEnd()) {
int xend = Math.min(intergenic.getEnd(), start_pos + intergenic_window_size - 1);
int xstart = xend - intergenic_window_size - 1;
parts.add(new SimpleInterval(intergenic.getContig(), xstart, xend));
start_pos += intergenic_window_shifr;
}
for (final SimpleInterval part : parts) {
final List<VariantContext> variants = vcfReader.query(part).stream().filter(V -> accept(V)).collect(Collectors.toList());
if (variants.isEmpty())
continue;
final FisherResult fisher = runFisher(variants);
if (fisher.p_value > this.fisherTreshold)
continue;
final String label = "intergenic_" + part.getStart() + "_" + part.getEnd();
if (vcw != null) {
for (final VariantContext ctx : variants) {
vcw.add(new VariantContextBuilder(ctx).attribute(BURDEN_KEY, VCFUtils.escapeInfoField(label)).make());
}
}
pw.print(part.getContig());
pw.print("\t");
pw.print(part.getStart() - 1);
pw.print("\t");
pw.print(part.getEnd());
pw.print("\t");
pw.print(label);
pw.print("\t");
pw.print(part.getLengthOnReference());
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print("intergenic");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print(".");
pw.print("\t");
pw.print("" + part.getStart() + "-" + part.getEnd());
pw.print("\t");
pw.print(fisher.p_value);
pw.print("\t");
pw.print(fisher.affected_alt);
pw.print("\t");
pw.print(fisher.affected_hom);
pw.print("\t");
pw.print(fisher.unaffected_alt);
pw.print("\t");
pw.print(fisher.unaffected_hom);
pw.print("\t");
pw.print(variants.size());
pw.println();
}
}
progress2.close();
}
use of com.github.lindenb.jvarkit.util.bio.structure.Gene in project jvarkit by lindenb.
the class VcfGtfSplitter method doWork.
@Override
public int doWork(final List<String> args) {
ArchiveFactory archiveFactory = null;
PrintWriter manifest = null;
VCFReader vcfFileReader = null;
try {
this.attCleaner = AttributeCleaner.compile(this.xannotatePattern);
for (final String s : featuresString.split("[;, ]")) {
if (StringUtils.isBlank(s))
continue;
if (s.equals("cds")) {
use_cds = true;
} else if (s.equals("intron")) {
use_cds = true;
} else if (s.equals("exon")) {
use_exon = true;
} else if (s.equals("stop")) {
use_stop = true;
} else if (s.equals("start")) {
use_start = true;
} else if (s.equals("transcript")) {
use_exon = true;
use_intron = true;
} else if (s.equals("utr5")) {
use_utr5 = true;
} else if (s.equals("utr3")) {
use_utr3 = true;
} else if (s.equals("utr")) {
use_utr3 = true;
use_utr5 = true;
} else if (s.equals("upstream")) {
use_upstream = true;
} else if (s.equals("downstream")) {
use_downstream = true;
} else if (s.equals("splice")) {
use_splice = true;
} else if (s.equals("cds_utr5")) {
use_cds_utr5 = true;
} else if (s.equals("cds_utr3")) {
use_cds_utr3 = true;
} else if (s.equals("cds_utr")) {
use_cds_utr3 = true;
use_cds_utr5 = true;
} else {
LOG.error("unknown code " + s + " in " + this.featuresString);
return -1;
}
}
final Path tmpVcf = Files.createTempFile("tmp.", (use_bcf ? FileExtensions.BCF : FileExtensions.COMPRESSED_VCF));
String input = oneAndOnlyOneFile(args);
vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
final VCFHeader header1 = vcfFileReader.getHeader();
final SAMSequenceDictionary dict = header1.getSequenceDictionary();
if (dict == null && this.use_bcf) {
throw new JvarkitException.VcfDictionaryMissing(input);
}
if (dict != null && !limitToContigs.isEmpty()) {
final ContigNameConverter ctgNameConverter = ContigNameConverter.fromOneDictionary(dict);
final Set<String> set2 = new HashSet<>(this.limitToContigs.size());
for (final String ctg : this.limitToContigs) {
final String ctg2 = ctgNameConverter.apply(ctg);
if (StringUtils.isBlank(ctg2)) {
LOG.error(JvarkitException.ContigNotFoundInDictionary.getMessage(ctg, dict));
return -1;
}
set2.add(ctg2);
}
this.limitToContigs = set2;
}
final List<Gene> all_genes;
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
final Comparator<Gene> cmp;
if (dict != null) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
cmp = new ContigDictComparator(dict).createLocatableComparator();
} else {
cmp = (A, B) -> {
final int i = A.getContig().compareTo(B.getContig());
if (i != 0)
return i;
return Integer.compare(A.getStart(), B.getStart());
};
}
all_genes = gtfReader.getAllGenes().stream().filter(G -> {
if (this.protein_coding_only && !"protein_coding".equals(G.getGeneBiotype()))
return false;
if (this.limitToContigs.isEmpty())
return true;
return this.limitToContigs.contains(G.getContig());
}).sorted(cmp).collect(Collectors.toList());
}
archiveFactory = ArchiveFactory.open(this.outputFile);
archiveFactory.setCompressionLevel(0);
manifest = new PrintWriter(this.manifestFile == null ? new NullOuputStream() : IOUtils.openPathForWriting(manifestFile));
manifest.println("#chrom\tstart\tend\tGene-Id\tGene-Name\tGene-Biotype\tTranscript-Id\tpath\tCount_Variants");
if (this.split_by_transcript) {
final Iterator<Transcript> triter = all_genes.stream().flatMap(G -> G.getTranscripts().stream()).iterator();
while (triter.hasNext()) {
final Transcript tr = triter.next();
final AbstractSplitter splitter = new TranscriptSplitter(tr);
this.split(splitter, vcfFileReader, header1, dict, archiveFactory, tmpVcf, manifest);
}
} else {
for (Gene gene : all_genes) {
final AbstractSplitter splitter = new GeneSplitter(gene);
this.split(splitter, vcfFileReader, header1, dict, archiveFactory, tmpVcf, manifest);
}
}
vcfFileReader.close();
vcfFileReader = null;
manifest.flush();
manifest.close();
manifest = null;
archiveFactory.close();
Files.deleteIfExists(tmpVcf);
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(vcfFileReader);
CloserUtil.close(archiveFactory);
CloserUtil.close(manifest);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Gene in project jvarkit by lindenb.
the class PlotSashimi method plotSashimi.
/**
* create the SVG itself
*/
private void plotSashimi(final ArchiveFactory archive, final SamReader samReader, final Locatable interval, final Path bamPath, final PrintWriter manifest) {
final int drawing_width = Math.max(100, this.image_width_pixel);
final int coverageHeight = Math.max(100, Integer.parseInt(this.dynamicParams.getOrDefault("coverage.height", "300")));
final double pixelperbase = drawing_width / (double) interval.getLengthOnReference();
final SAMFileHeader header = samReader.getFileHeader();
final Collection<Gene> genes = this.geneMap.getOverlapping(interval);
final Set<String> geneNames = genes.stream().map(G -> G.getGeneName()).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
/**
* extract the sample name or just use the filename
*/
final String sampleName = StringUtils.ifBlank(header.getReadGroups().stream().map(G -> this.partition.apply(G)).filter(S -> !StringUtils.isBlank(S)).sorted().collect(Collectors.joining(";")), bamPath.getFileName().toString());
final Function<Integer, Double> pos2pixel = POS -> (POS - interval.getStart()) / (double) interval.getLengthOnReference() * drawing_width;
final Counter<SimpleInterval> gaps = new Counter<>();
final int[] coverage = new int[interval.getLengthOnReference()];
try (SAMRecordIterator iter = samReader.queryOverlapping(interval.getContig(), interval.getStart(), interval.getEnd())) {
/**
* no read here, skip
*/
boolean got_one = false;
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.getMappingQuality() < this.min_mapq)
continue;
final Cigar cigar = rec.getCigar();
if (cigar == null || cigar.isEmpty())
continue;
got_one = true;
int ref = rec.getAlignmentStart();
for (final CigarElement ce : cigar) {
if (ref > interval.getEnd())
break;
final CigarOperator op = ce.getOperator();
if (op.equals(CigarOperator.N) || (use_D_operator && op.equals(CigarOperator.D))) {
gaps.incr(new SimpleInterval(rec.getContig(), ref, ref + ce.getLength() - 1));
}
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
final int pos1 = ref + x;
if (pos1 < interval.getStart())
continue;
if (pos1 > interval.getEnd())
break;
coverage[pos1 - interval.getStart()]++;
}
}
ref += ce.getLength();
}
}
}
if (!got_one && this.skip_region_without_read)
return;
}
final int max_coverage;
if (this.force_max_coverage > 0) {
max_coverage = this.force_max_coverage;
} else {
max_coverage = Math.max(1, Arrays.stream(coverage).max().orElse(0));
}
while (this.document.hasChildNodes()) {
this.document.removeChild(this.document.getFirstChild());
}
final Element svgRoot = element("svg");
this.document.appendChild(svgRoot);
/* SVG title */
{
final Element title = element("title");
svgRoot.appendChild(title);
title.appendChild(text(interval.toString() + (!geneNames.isEmpty() && geneNames.size() < 3 ? " " + String.join(" ", geneNames) : "")));
}
/* SVG style */
{
final Element style = element("style");
svgRoot.appendChild(style);
style.appendChild(text(this.cssPath == null ? ".coverage { fill:red;fill:url('#grad01')} " + ".maintitle {text-anchor:middle;fill:blue} " + ".sample {fill:blue;font-size:7px;} " + ".frame { fill:none; stroke: darkgray;} " + ".arcK { fill:none; stroke: blue; stroke-linecap:round;opacity:0.8;} " + ".arcU { fill:none; stroke: red; stroke-linecap:round;opacity:0.8;} " + ".transcript { fill:darkgray; stroke: darkgray;} " + ".exon { fill:green; stroke: darkgray;} " + ".frame { fill:none; stroke: darkgray;} " + ".rulerline {stroke:lightgray;stroke-width:0.5px;}\n" + ".exonline {stroke:green;stroke-width:0.5px;opacity:0.5;}\n" + ".rulerlabel {stroke:gray;stroke-width:0.5px;font-size:7px;}\n" + "a {cursor: pointer;}\n" : IOUtils.slurpPath(this.cssPath)));
}
// SVG def
{
final Element defs = element("defs");
svgRoot.appendChild(defs);
// linear gradient
{
Element grad = element("linearGradient");
defs.appendChild(grad);
grad.setAttribute("id", "grad01");
grad.setAttribute("gradientTransform", "rotate(90)");
Element stop = element("stop");
grad.appendChild(stop);
stop.setAttribute("offset", "0%");
stop.setAttribute("stop-color", (max_coverage > 50 ? "red" : max_coverage > 20 ? "green" : "blue"));
stop = element("stop");
grad.appendChild(stop);
stop.setAttribute("offset", "100%");
stop.setAttribute("stop-color", "darkblue");
}
}
final Element descr = element("desc");
svgRoot.appendChild(descr);
descr.appendChild(text("Author: Pierre Lindenbaum\n" + JVarkitVersion.getInstance().getCompilationDate() + "\n" + JVarkitVersion.getInstance().getGitHash()));
final Element maing = element("g");
svgRoot.appendChild(maing);
int y = 0;
// main title
Element gtitle = element("text", new SimpleInterval(interval).toNiceString() + (StringUtils.isBlank(sampleName) ? "" : " " + sampleName) + (geneNames.isEmpty() ? "" : " " + String.join(" ", geneNames)));
gtitle.setAttribute("class", "maintitle");
gtitle.setAttribute("x", format(drawing_width / 2));
gtitle.setAttribute("y", "15");
svgRoot.appendChild(gtitle);
y += 20;
// sample name
if (!StringUtils.isBlank(sampleName)) {
gtitle = element("text", sampleName);
gtitle.setAttribute("class", "sample");
gtitle.setAttribute("x", "5");
gtitle.setAttribute("y", "20");
svgRoot.appendChild(gtitle);
}
y += 50;
final int prev_y = y;
/**
* horizontal ruler
*/
{
final Element ruler_gh = element("g");
maing.appendChild(ruler_gh);
final int sep = bestTicks(interval.getLengthOnReference());
for (int pos = interval.getStart(); pos <= interval.getEnd(); ++pos) {
if (pos % sep != 0)
continue;
double x = pos2pixel.apply(pos);
final Element line = element("line");
ruler_gh.appendChild(line);
line.setAttribute("class", "rulerline");
line.appendChild(element("title", StringUtils.niceInt(pos)));
line.setAttribute("x1", format(x));
line.setAttribute("x2", format(x));
line.setAttribute("y1", format(y));
line.setAttribute("y2", format(y + coverageHeight));
final Element label = element("text", StringUtils.niceInt(pos));
label.setAttribute("class", "rulerlabel");
label.setAttribute("x", "0");
label.setAttribute("y", "0");
label.setAttribute("transform", "translate(" + format(x) + "," + y + ") rotate(90) ");
ruler_gh.appendChild(label);
}
}
/**
* vertical ruler
*/
{
final Element ruler_gv = element("g");
maing.appendChild(ruler_gv);
final int sep = bestTicks(max_coverage);
for (int pos = 0; pos <= max_coverage; ++pos) {
if (pos % sep != 0)
continue;
double ry = (int) (y + coverageHeight - (pos / (double) max_coverage) * coverageHeight);
final Element line = element("line");
ruler_gv.appendChild(line);
line.setAttribute("class", "rulerline");
line.appendChild(element("title", StringUtils.niceInt(pos)));
line.setAttribute("x1", format(0));
line.setAttribute("x2", format(drawing_width));
line.setAttribute("y1", format(ry));
line.setAttribute("y2", format(ry));
final Element label = element("text", StringUtils.niceInt(pos));
label.setAttribute("class", "rulerlabel");
label.setAttribute("x", "1");
label.setAttribute("y", format(ry));
ruler_gv.appendChild(label);
}
}
/**
* vertical lines of exons
*/
final Element exon_v = element("g");
final Element covPath = element("path");
covPath.setAttribute("class", "coverage");
maing.appendChild(covPath);
final StringBuilder sb = new StringBuilder();
sb.append("M 0 " + format(y + coverageHeight));
for (int k = 0; k < coverage.length; k++) {
// if(k+1< coverage.length && coverage[k]==coverage[k+1]) continue;
final double dpy = y + coverageHeight - coverageHeight * (coverage[k] / (double) max_coverage);
sb.append(" L " + format(pixelperbase * k) + " " + format(dpy));
}
sb.append(" L " + format(drawing_width) + " " + format(y + coverageHeight));
sb.append(" Z");
covPath.setAttribute("d", sb.toString());
covPath.appendChild(element("title", "Coverage. Max:" + StringUtils.niceInt(max_coverage)));
int next_y = y + coverageHeight;
/* plot arc */
if (!gaps.isEmpty()) {
boolean drawAbove = true;
int max_occurence = (int) gaps.count(gaps.getMostFrequent());
for (final SimpleInterval intron : gaps.keySet()) {
final int occurence = (int) gaps.count(intron);
boolean is_known_intron = genes.stream().flatMap(G -> G.getTranscripts().stream()).flatMap(T -> T.getIntrons().stream()).filter(E -> E.overlaps(intron)).anyMatch(I -> I.getStart() == intron.getStart() && I.getEnd() == intron.getEnd());
final int junctionStart = intron.getStart() - 1;
final int junctionEnd = intron.getEnd() + 1;
if (!CoordMath.encloses(interval.getStart(), interval.getEnd(), junctionStart, junctionEnd))
continue;
final double xstart = pos2pixel.apply(junctionStart);
final double xend = pos2pixel.apply(junctionEnd);
double ystart = y + coverageHeight - coverageHeight * (coverage[junctionStart - interval.getStart()] / (double) max_coverage);
double yend = y + coverageHeight - coverageHeight * (coverage[junctionEnd - interval.getStart()] / (double) max_coverage);
final Element arc = element("path");
sb.setLength(0);
double x_mid = (xend - xstart) / 2.0;
double x2 = xstart + x_mid;
final double y2;
// small gap: always print it under xaxis
if (xend - xstart < 30)
drawAbove = true;
if (drawAbove) {
ystart = y + coverageHeight;
yend = y + coverageHeight;
y2 = y + coverageHeight + x_mid;
next_y = (int) Math.max(next_y, y + coverageHeight + x_mid / 2 + 10);
} else {
y2 = Math.max(0, ystart + (yend - ystart) / 2.0 - x_mid);
}
sb.append("M " + format(xstart) + " " + format(ystart));
sb.append(" Q " + format(x2) + " " + format(y2) + " " + format(xend) + " " + format(yend));
arc.setAttribute("d", sb.toString());
arc.setAttribute("class", "arc" + (is_known_intron ? "K" : "U"));
final double stroke_width = 1 + /* show very small one */
(occurence / (double) max_occurence) * 10;
arc.setAttribute("style", "stroke-width:" + format(stroke_width) + "px;");
arc.appendChild(element("title", new SimpleInterval(interval.getContig(), junctionStart, junctionEnd).toNiceString() + " (" + StringUtils.niceInt(occurence) + ") " + (is_known_intron ? "known" : "unknown")));
maing.appendChild(wrapLoc(arc, intron));
drawAbove = !drawAbove;
}
}
y = next_y;
y += 2;
/**
* pileup transcripts
*/
final List<List<Transcript>> transcriptRows = new ArrayList<>();
for (final Transcript transcript : genes.stream().flatMap(L -> L.getTranscripts().stream()).filter(T -> T.overlaps(interval)).sorted((A, B) -> Integer.compare(A.getStart(), B.getStart())).collect(Collectors.toList())) {
int rowidx = 0;
while (rowidx < transcriptRows.size()) {
final List<Transcript> row = transcriptRows.get(rowidx);
final Transcript last = row.get(row.size() - 1);
if (!last.overlaps(transcript)) {
row.add(transcript);
break;
}
rowidx++;
}
if (rowidx == transcriptRows.size()) {
final List<Transcript> row = new ArrayList<>();
row.add(transcript);
transcriptRows.add(row);
}
}
/**
* plot transcripts
*/
final Element transcripts_g = element("g");
maing.appendChild(transcripts_g);
final int transcript_height = Math.max(10, Integer.parseInt(this.dynamicParams.getOrDefault("transcript.height", "12")));
for (final List<Transcript> row : transcriptRows) {
final Element grow = element("g");
transcripts_g.appendChild(grow);
for (final Transcript transcript : row) {
final Element transcript_g = element("g");
grow.appendChild(transcript_g);
final Element tr = element("line");
final double midy = y + transcript_height / 2.0;
tr.setAttribute("class", "transcript");
double tr_x1 = Math.max(1.0, pos2pixel.apply(transcript.getStart()));
tr.setAttribute("x1", format(tr_x1));
tr.setAttribute("y1", format(midy));
tr.setAttribute("x2", format(Math.min(drawing_width, pos2pixel.apply(transcript.getEnd()))));
tr.setAttribute("y2", format(midy));
tr.appendChild(element("title", transcript.getId() + " " + transcript.getGene().getGeneName()));
transcript_g.appendChild(wrapLoc(tr, transcript));
final Element label = element("text", transcript.getId() + " " + transcript.getGene().getGeneName());
label.setAttribute("class", "rulerlabel");
label.setAttribute("x", format(tr_x1));
label.setAttribute("y", format(midy));
transcript_g.appendChild(label);
for (final Exon exon : transcript.getExons()) {
if (!exon.overlaps(interval))
continue;
final Element exon_rect = element("rect");
exon_rect.setAttribute("class", "exon");
final double exonx1 = Math.max(1.0, pos2pixel.apply(exon.getStart()));
final double exonx2 = Math.min(drawing_width, pos2pixel.apply(exon.getEnd()));
exon_rect.setAttribute("x", format(exonx1));
exon_rect.setAttribute("y", format(y));
exon_rect.setAttribute("height", format(transcript_height));
exon_rect.setAttribute("width", format(exonx2 - exonx1));
exon_rect.appendChild(element("title", exon.getName() + " " + transcript.getId() + " " + transcript.getGene().getGeneName()));
transcript_g.appendChild(wrapLoc(exon_rect, exon));
for (int side = 0; side < 2; ++side) {
final double x = pos2pixel.apply(side == 0 ? exon.getStart() : exon.getEnd());
final Element line = element("line");
exon_v.appendChild(line);
line.setAttribute("class", "exonline");
line.setAttribute("x1", format(x));
line.setAttribute("x2", format(x));
line.setAttribute("y2", format(y + transcript_height));
line.setAttribute("y1", format(prev_y));
}
}
}
y += transcript_height + 2;
}
maing.appendChild(exon_v);
/* final frame */
final Element frame_rect = element("rect");
frame_rect.setAttribute("class", "frame");
frame_rect.setAttribute("x", "0");
frame_rect.setAttribute("y", "0");
frame_rect.setAttribute("width", format(drawing_width));
frame_rect.setAttribute("height", format(y));
svgRoot.appendChild(frame_rect);
svgRoot.setAttribute("width", format(drawing_width + 1));
svgRoot.setAttribute("height", format(y + 1));
try {
final Transformer tr = TransformerFactory.newInstance().newTransformer();
final String md5 = StringUtils.md5(interval.getContig() + ":" + interval.getStart() + ":" + interval.getEnd() + ":" + bamPath.toString());
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + interval.getContig() + "_" + interval.getStart() + "_" + interval.getEnd() + (StringUtils.isBlank(sampleName) ? "" : "." + sampleName.replaceAll("[/\\:]", "_")) + ".svg" + (this.compressed_svg ? ".gz" : "");
if (this.compressed_svg) {
try (final OutputStream pw = archive.openOuputStream(filename)) {
try (GZIPOutputStream gzout = new GZIPOutputStream(pw)) {
tr.transform(new DOMSource(this.document), new StreamResult(gzout));
gzout.finish();
gzout.flush();
}
pw.flush();
}
} else {
try (final PrintWriter pw = archive.openWriter(filename)) {
tr.transform(new DOMSource(this.document), new StreamResult(pw));
pw.flush();
}
}
manifest.print(interval.getContig());
manifest.print('\t');
manifest.print(interval.getStart() - 1);
manifest.print('\t');
manifest.print(interval.getEnd());
manifest.print('\t');
manifest.print(bamPath.toString());
manifest.print('\t');
manifest.print(geneNames.isEmpty() ? "." : String.join(",", geneNames));
manifest.print('\t');
manifest.print(StringUtils.isBlank(sampleName) ? "." : sampleName);
manifest.print('\t');
manifest.print((archive.isTarOrZipArchive() ? "" : this.outputFile.toString() + File.separator) + filename);
manifest.println();
} catch (final Exception err) {
throw new RuntimeException(err);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.Gene in project jvarkit by lindenb.
the class GtfRetroCopy method doWork.
@Override
public int doWork(final List<String> args) {
VCFReader vcfFileReader = null;
VariantContextWriter vcw0 = null;
try {
/* load the reference genome */
/* create a contig name converter from the REF */
final Set<String> knownGeneIds;
if (this.knownPath != null) {
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.knownPath)) {
knownGeneIds = br.lines().filter(L -> !StringUtils.isBlank(L)).map(S -> S.trim()).filter(S -> !(S.equals("-") || S.equals(".") || S.startsWith("#"))).collect(Collectors.toSet());
}
} else {
knownGeneIds = Collections.emptySet();
}
// open the sam file
final String input = oneAndOnlyOneFile(args);
vcfFileReader = VCFReaderFactory.makeDefault().open(Paths.get(input), true);
final VCFHeader header = vcfFileReader.getHeader();
final SAMSequenceDictionary dict = header.getSequenceDictionary();
final Comparator<String> contigCmp = dict == null ? (A, B) -> A.compareTo(B) : new ContigDictComparator(dict);
final Comparator<Gene> geneCmp = (A, B) -> {
int i = contigCmp.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
};
final GtfReader gtfReader = new GtfReader(this.gtfPath);
if (dict != null && !dict.isEmpty()) {
this.writingVcf.dictionary(dict);
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
}
final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.getTranscripts().stream().count() > 0L).filter(G -> G.getTranscripts().stream().anyMatch(T -> T.getIntronCount() >= this.min_intron_count)).sorted(geneCmp).collect(Collectors.toList());
gtfReader.close();
/**
* build vcf header
*/
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
metaData.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Variation Length"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_BOUNDS, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Introns boundaries"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_SIZES, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.Integer, "Introns sizes"));
metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
metaData.add(new VCFInfoHeaderLine(att, 1, VCFHeaderLineType.String, "Value for the attribute '" + att + "' in the gtf"));
}
// metaData.add(new VCFFormatHeaderLine(ATT_COUNT_SUPPORTING_READS, 2,VCFHeaderLineType.Integer,"Count supporting reads [intron-left/intron-right]"));
// metaData.add(new VCFInfoHeaderLine(ATT_RETRO_DESC, VCFHeaderLineCount.UNBOUNDED,VCFHeaderLineType.String,
// "Retrocopy attributes: transcript-id|strand|exon-left|exon-left-bases|exon-right-bases|exon-right"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns for the Transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_COUNT, 1, VCFHeaderLineType.Integer, "Number of introns found retrocopied for the transcript"));
metaData.add(new VCFInfoHeaderLine(ATT_INTRONS_CANDIDATE_FRACTION, 1, VCFHeaderLineType.Float, "Fraction of introns found retrocopied for the transcript"));
metaData.add(new VCFFilterHeaderLine(ATT_NOT_ALL_INTRONS, "Not all introns were found retrocopied"));
metaData.add(new VCFFilterHeaderLine(ATT_KNOWN, "Known RetroGenes. " + (this.knownPath == null ? "" : " Source: " + this.knownPath)));
final VCFHeader header2 = new VCFHeader(header);
metaData.stream().forEach(H -> header2.addMetaDataLine(H));
JVarkitVersion.getInstance().addMetaData(this, header2);
final Allele ref = Allele.create((byte) 'N', true);
final Allele alt = Allele.create("<RETROCOPY>", false);
/* open vcf for writing*/
vcw0 = this.writingVcf.open(this.outputFile);
vcw0.writeHeader(header2);
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().logger(LOG).dictionary(dict).build();
for (final Gene gene : genes) {
progress.apply(gene);
final List<VariantContext> variants = new ArrayList<>();
final CloseableIterator<VariantContext> iter2 = vcfFileReader.query(gene.getContig(), gene.getStart(), gene.getEnd());
while (iter2.hasNext()) {
final VariantContext ctx = iter2.next();
// SNV
if (ctx.getStart() == ctx.getEnd())
continue;
StructuralVariantType svType = ctx.getStructuralVariantType();
if (StructuralVariantType.BND.equals(svType))
continue;
if (StructuralVariantType.INS.equals(svType))
continue;
variants.add(ctx);
}
iter2.close();
if (variants.isEmpty())
continue;
for (final Transcript transcript : gene.getTranscripts()) {
if (!transcript.hasIntron())
continue;
if (transcript.getIntronCount() < this.min_intron_count)
continue;
final Counter<String> samples = new Counter<>();
for (final Intron intron : transcript.getIntrons()) {
for (final VariantContext ctx : variants) {
if (!isWithinDistance(intron.getStart(), ctx.getStart()))
continue;
if (!isWithinDistance(intron.getEnd(), ctx.getEnd()))
continue;
if (ctx.hasGenotypes()) {
for (final Genotype g : ctx.getGenotypes()) {
if (g.isNoCall() || g.isHomRef())
continue;
samples.incr(g.getSampleName());
}
} else {
samples.incr("*");
}
}
// end iter2
}
// end intron
final long max_count = samples.stream().mapToLong(E -> E.getValue()).max().orElse(0L);
if (max_count == 0)
continue;
if (this.only_all_introns && max_count != transcript.getIntronCount())
continue;
// ok good candidate
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(transcript.getContig());
vcb.start(transcript.getStart());
vcb.stop(transcript.getEnd());
switch(this.idKey) {
case gene_name:
final String gn = transcript.getGene().getGeneName();
vcb.id(StringUtils.isBlank(gn) ? transcript.getId() : gn);
break;
case gene_id:
vcb.id(transcript.getGene().getId());
break;
case transcript_id:
vcb.id(transcript.getId());
break;
default:
throw new IllegalStateException();
}
final List<Allele> alleles = Arrays.asList(ref, alt);
// vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY,2);
// vcb.attribute(VCFConstants.ALLELE_COUNT_KEY,1);
// vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY,0.5);
vcb.attribute(VCFConstants.SVTYPE, "DEL");
vcb.attribute(VCFConstants.END_KEY, transcript.getEnd());
vcb.attribute("SVLEN", transcript.getLengthOnReference());
vcb.attribute(ATT_INTRONS_BOUNDS, transcript.getIntrons().stream().map(S -> "" + S.getStart() + "-" + S.getEnd()).collect(Collectors.toList()));
vcb.attribute(ATT_INTRONS_SIZES, transcript.getIntrons().stream().mapToInt(S -> S.getLengthOnReference()).toArray());
for (final String att : ENSEMBL_TRANSCRIPT_ATTS) {
final String v = transcript.getProperties().get(att);
if (StringUtils.isBlank(v))
continue;
vcb.attribute(att, v);
}
vcb.alleles(alleles);
boolean pass_filter = true;
// introns sequences
vcb.attribute(ATT_INTRONS_CANDIDATE_COUNT, max_count);
vcb.attribute(ATT_INTRONS_COUNT, transcript.getIntronCount());
vcb.attribute(ATT_INTRONS_CANDIDATE_FRACTION, max_count / (float) transcript.getIntronCount());
if (transcript.getIntronCount() != max_count) {
vcb.filter(ATT_NOT_ALL_INTRONS);
pass_filter = false;
}
if (knownGeneIds.contains(transcript.getGene().getId())) {
vcb.filter(ATT_KNOWN);
pass_filter = false;
}
if (header.hasGenotypingData()) {
final List<Genotype> genotypes = new ArrayList<>();
for (final String sn : header.getSampleNamesInOrder()) {
final List<Allele> gtalleles;
if (samples.count(sn) == 0L) {
gtalleles = Arrays.asList(ref, ref);
} else {
gtalleles = Arrays.asList(ref, alt);
}
final GenotypeBuilder gb = new GenotypeBuilder(sn, gtalleles);
genotypes.add(gb.make());
}
vcb.genotypes(genotypes);
}
if (pass_filter)
vcb.passFilters();
vcw0.add(vcb.make());
}
}
progress.close();
vcw0.close();
vcfFileReader.close();
vcfFileReader = null;
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(vcfFileReader);
CloserUtil.close(vcw0);
}
}
Aggregations