use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.
the class VcfToSvg method doWork.
@Override
public int doWork(final List<String> args) {
VCFReader r = null;
OutputStream outputStream = null;
XMLStreamWriter w = null;
PrintWriter manifestW = null;
ArchiveFactory archiveFactory = null;
try {
r = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)), true);
final VCFHeader header = r.getHeader();
final List<String> samples = new ArrayList<>(header.getSampleNamesInOrder());
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
intervalListProvider.dictionary(dict);
/* read gtf if any */
final IntervalTreeMap<Gene> geneMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().filter(G -> !this.removeNonCoding || G.getTranscripts().stream().anyMatch(T -> T.isCoding())).forEach(G -> geneMap.put(new Interval(G), G));
}
}
archiveFactory = ArchiveFactory.open(this.outputPath);
if (manifestFile != null) {
manifestW = IOUtils.openPathForPrintWriter(this.manifestFile);
} else {
manifestW = new PrintWriter(new NullOuputStream());
}
final Pedigree pedigree;
if (this.pedPath == null) {
pedigree = PedigreeParser.empty();
} else {
pedigree = new PedigreeParser().parse(this.pedPath);
}
final Path tmpSvg = Files.createTempFile("vcf.", ".svg");
final XMLOutputFactory xof = XMLOutputFactory.newInstance();
for (final Locatable interval : intervalListProvider.dictionary(dict).stream().collect(Collectors.toList())) {
final List<VariantContext> variants = r.query(interval).stream().filter(V -> this.variantFILTEREDOpacity > 0 || !V.isFiltered()).filter(V -> this.variantIndelOpacity > 0 || !V.isIndel()).collect(Collectors.toCollection(ArrayList::new));
if (variants.isEmpty())
continue;
final List<Transcript> transcripts = geneMap.getOverlapping(interval).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> !this.removeNonCoding || T.isCoding()).collect(Collectors.toList());
variants.removeIf(V -> this.gtfPath != null && this.variantsInExonOnly && transcripts.stream().flatMap(T -> T.getExons().stream()).noneMatch(EX -> EX.overlaps(V)));
if (variants.isEmpty())
continue;
final String geneId = transcripts.stream().map(T -> T.getGene().getId()).collect(Collectors.toSet()).stream().collect(HtsCollectors.oneAndOnlyOne()).orElse(null);
final String geneName = transcripts.stream().map(T -> T.getGene().getGeneName()).collect(Collectors.toSet()).stream().collect(HtsCollectors.oneAndOnlyOne()).orElse(null);
outputStream = IOUtils.openPathForWriting(tmpSvg);
w = xof.createXMLStreamWriter(outputStream);
double featureHeight = 10;
double TRANSCRIPT_HEIGHT = featureHeight;
final int all_genotypes_width = variants.size() * this.genotype_width;
final int drawinAreaWidth = Math.max(all_genotypes_width, 1000);
final int interline_weight = 6;
final int margin_top = 10;
final int margin_bottom = 10;
final int margin_right = 100;
final int margin_left = 100;
w.writeStartDocument("UTF-8", "1.0");
w.writeStartElement("svg");
w.writeDefaultNamespace(SVG.NS);
w.writeNamespace("xlink", XLINK.NS);
w.writeAttribute("version", "1.1");
w.writeAttribute("width", String.valueOf(margin_right + margin_right + drawinAreaWidth));
w.writeAttribute("height", String.valueOf(margin_top + margin_bottom + transcripts.size() * TRANSCRIPT_HEIGHT + interline_weight * featureHeight + samples.size() * this.genotype_width));
title(w, interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
w.writeStartElement("desc");
w.writeCharacters("generated with " + getProgramName() + "\n" + "Author: Pierre Lindenbaum PhD. @yokofakun .");
w.writeEndElement();
// defs
w.writeStartElement("defs");
// genotypes
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HOM_REF);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:lime;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.NO_CALL);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:silver;stroke;gray;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HOM_VAR);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:crimson;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.MIXED);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:pink;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.UNAVAILABLE);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:gray;stroke;none;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(this.genotype_width));
w.writeAttribute("height", String.valueOf(this.genotype_width));
w.writeEndElement();
w.writeStartElement("g");
//
w.writeAttribute("id", "g_" + GenotypeType.HET);
w.writeEmptyElement("rect");
w.writeAttribute("style", "fill:lime;stroke;black;");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(genotype_width));
w.writeAttribute("height", String.valueOf(genotype_width));
w.writeEmptyElement("polygon");
w.writeAttribute("style", "fill:crimson;stroke;black;");
w.writeAttribute("points", "0,0 " + genotype_width + ",0 0," + genotype_width + " 0,0");
w.writeEndElement();
// strand
w.writeEmptyElement("polyline");
w.writeAttribute("id", "strandF");
w.writeAttribute("points", "-5,-5 0,0 -5,5");
w.writeEmptyElement("polyline");
w.writeAttribute("id", "strandR");
w.writeAttribute("points", "5,-5 0,0 5,5");
// gradients
w.writeStartElement("linearGradient");
w.writeAttribute("id", "grad01");
w.writeAttribute("x1", "50%");
w.writeAttribute("x2", "50%");
w.writeAttribute("y1", "0%");
w.writeAttribute("y2", "100%");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "0%");
w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "50%");
w.writeAttribute("style", "stop-color:white;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "100%");
w.writeAttribute("style", "stop-color:black;stop-opacity:1;");
w.writeEndElement();
// defs
w.writeEndElement();
w.writeStartElement("style");
w.writeCharacters("svg {fill:none; stroke:black;}\n" + "text {fill:black;stroke:none;font-size:" + (featureHeight / 1.5) + "px;}\n" + ".ruler-label { stroke:red;}\n" + ".frame { stroke:black;fill:none;}\n" + ".kgexon {fill:url(#grad01);stroke:black;}\n" + ".gcpercent {fill:url(#grad02);stroke:black;}" + ".coverage {fill:url(#grad03);stroke:black;}" + ".kgcds {fill:yellow;stroke:black;opacity:0.7;}\n" + ".variant{stroke:none;fill:red;opacity:0.2;}\n" + ".xaxis{stroke:gray;fill:none;opacity:0.2;}\n" + ".postick{font-size:9px;stroke:black;stroke-width:1;}");
// style
w.writeEndElement();
final IntFunction<Integer> trim = t -> Math.max(interval.getStart(), Math.min(interval.getEnd(), t));
final IntFunction<Double> baseToPixel = t -> margin_left + drawinAreaWidth * (t - (double) interval.getStart()) / ((double) interval.getLengthOnReference());
final IntFunction<Double> variantIndexToPixel = idx -> {
final double variant_width = drawinAreaWidth / (double) variants.size();
final double midx = variant_width * idx + variant_width / 2.0;
return margin_left + midx - genotype_width / 2.0;
};
final Function<VariantContext, String> variantTitle = V -> (V.getContig().startsWith("chr") ? V.getContig().substring(3) : V.getContig()) + ":" + V.getStart() + " " + V.getReference().getDisplayString();
/**
* title
*/
double y = 0;
w.writeStartElement("text");
w.writeAttribute("x", "0");
w.writeAttribute("y", String.valueOf(featureHeight));
w.writeCharacters(interval.toString());
w.writeEndElement();
y += featureHeight;
for (final Transcript g : transcripts) {
int cdsHeigh = 5;
double exonHeight = TRANSCRIPT_HEIGHT - 5;
double midY = TRANSCRIPT_HEIGHT / 2;
w.writeStartElement("g");
w.writeAttribute("transform", "translate(0," + y + ")");
title(w, g.getId());
w.writeStartElement("text");
w.writeAttribute("x", String.valueOf(margin_left - 10));
w.writeAttribute("y", String.valueOf(featureHeight));
w.writeAttribute("style", "text-anchor:end;");
w.writeCharacters(g.getId());
w.writeEndElement();
/* transcript line */
w.writeEmptyElement("line");
w.writeAttribute("class", "kgtr");
w.writeAttribute("x1", String.valueOf(baseToPixel.apply(trim.apply(g.getTxStart()))));
w.writeAttribute("y1", String.valueOf(midY));
w.writeAttribute("x2", String.valueOf(baseToPixel.apply(trim.apply(g.getTxEnd()))));
w.writeAttribute("y2", String.valueOf(midY));
/* strand symbols */
for (double pixX = 0; pixX < drawinAreaWidth; pixX += 30) {
double pos0 = interval.getStart() + (pixX / (double) drawinAreaWidth) * interval.getLengthOnReference();
if (pos0 + 1 < g.getTxStart())
continue;
if (pos0 > g.getTxEnd())
break;
w.writeEmptyElement("use");
w.writeAttribute("class", "kgstrand");
w.writeAttribute("xlink", XLINK.NS, "href", "#strand" + (g.isPositiveStrand() ? "F" : "R"));
w.writeAttribute("x", String.valueOf(margin_left + pixX));
w.writeAttribute("y", String.valueOf(midY));
}
/* exons */
for (final Exon exon : g.getExons()) {
if (exon.getStart() + 1 >= interval.getEnd())
continue;
if (exon.getEnd() <= interval.getStart())
continue;
w.writeStartElement("rect");
w.writeAttribute("class", "kgexon");
w.writeAttribute("x", String.valueOf(baseToPixel.apply(trim.apply(exon.getStart()))));
w.writeAttribute("y", String.valueOf(midY - exonHeight / 2));
w.writeAttribute("width", String.valueOf(baseToPixel.apply(trim.apply(exon.getEnd())) - baseToPixel.apply((trim.apply(exon.getStart())))));
w.writeAttribute("height", String.valueOf(exonHeight));
title(w, exon.getName());
w.writeEndElement();
}
/* coding line */
if (!g.isNonCoding() && g.hasCodonStartDefined() && g.hasCodonStopDefined()) {
final double codonx1 = baseToPixel.apply(trim.apply(g.getLeftmostCodon().get().getStart()));
final double codonx2 = baseToPixel.apply(trim.apply(g.getRightmostCodon().get().getEnd()));
w.writeEmptyElement("rect");
w.writeAttribute("class", "kgcds");
w.writeAttribute("x", String.valueOf(codonx1));
w.writeAttribute("y", String.valueOf(midY - cdsHeigh / 4.0));
w.writeAttribute("width", String.valueOf(baseToPixel.apply((int) (codonx2 - codonx1))));
w.writeAttribute("height", String.valueOf(cdsHeigh / 2.0));
}
// String label=String.format("%15s", g.getName());
// w.writeEmptyElement("path");
// double fontHeight=Math.min(10,0.8*TRANSCRIPT_HEIGHT);
// w.writeAttribute("d",this.hershey.svgPath(label,-insets.left,midY-fontHeight/2,insets.left*0.9,fontHeight));
w.writeEndElement();
w.writeCharacters("\n");
y += featureHeight;
}
/* draw lines to variants */
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
double x1 = baseToPixel.apply(vc.getStart());
double x2 = baseToPixel.apply(vc.getEnd());
final double y2 = y + featureHeight * interline_weight;
w.writeStartElement("polygon");
w.writeAttribute("style", "fill:" + (vidx % 2 == 0 ? "ghostwhite" : "lavender") + ";stroke:black;opacity:0.6;stroke-width:0.5;");
w.writeAttribute("points", "" + x1 + "," + (y - featureHeight / 2.0) + " " + x2 + "," + (y - featureHeight / 2.0) + " " + variantIndexToPixel.apply(vidx) + "," + y2 + " " + (variantIndexToPixel.apply(vidx) + this.genotype_width) + "," + y2);
title(w, variantTitle.apply(vc));
w.writeEndElement();
}
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
final double y2 = y + featureHeight * interline_weight;
w.writeStartElement("text");
w.writeAttribute("transform", "translate(" + (String.valueOf(variantIndexToPixel.apply(vidx) + genotype_width / 2.0)) + "," + String.valueOf(y2 - 5) + ") " + "rotate(-45)");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("class", "postick");
w.writeCharacters(variantTitle.apply(vc));
w.writeEndElement();
w.writeCharacters("\n");
}
y += featureHeight * interline_weight;
w.writeStartElement("g");
/* step 0: affected, 1: unaffected, 2: others */
for (int step = 0; step < 3; ++step) {
for (final String sample : samples) {
final Sample individual = pedigree.getSampleById(sample);
if (step == 0 && (individual == null || !individual.isAffected()))
continue;
if (step == 1 && (individual == null || !individual.isUnaffected()))
continue;
if (step == 2 && individual != null && individual.isStatusSet())
continue;
// sample
w.writeStartElement("g");
switch(step) {
case 0:
w.writeAttribute("style", "hue-rotate(195deg);");
break;
case 1:
w.writeAttribute("style", "hue-rotate(45deg);");
break;
default:
break;
}
for (int vidx = 0; vidx < variants.size(); ++vidx) {
final VariantContext vc = variants.get(vidx);
final Genotype g = vc.getGenotype(sample);
double opacity = 1.0;
if (vc.isIndel())
opacity *= this.variantIndelOpacity;
if (vc.isFiltered())
opacity *= this.variantFILTEREDOpacity;
if (opacity > 1)
opacity = 1;
if (opacity <= 0)
continue;
if (opacity < 1) {
w.writeStartElement("g");
w.writeAttribute("style", "opacity:" + opacity + ";");
}
w.writeEmptyElement("use");
w.writeAttribute("x", String.valueOf(variantIndexToPixel.apply(vidx)));
w.writeAttribute("y", String.valueOf(y));
w.writeAttribute("xlink", XLINK.NS, "href", "#g_" + g.getType());
if (opacity < 1) {
w.writeEndElement();
}
}
w.writeCharacters("\n");
w.writeStartElement("text");
w.writeAttribute("x", String.valueOf(margin_left - 10));
w.writeAttribute("y", String.valueOf(y + this.genotype_width / 2.0));
w.writeAttribute("style", "text-anchor:end;");
w.writeCharacters(sample);
// text
w.writeEndElement();
// g for sample
w.writeEndElement();
y += this.genotype_width;
}
}
w.writeCharacters("\n");
w.writeEndDocument();
w.writeCharacters("\n");
w.flush();
w.close();
final String md5 = StringUtils.md5(interval.getContig() + ":" + interval.getStart() + "-" + interval.getEnd());
final String filename = md5.substring(0, 2) + File.separatorChar + md5.substring(2) + File.separator + interval.getContig().replaceAll("[/\\:]", "_") + "_" + interval.getStart() + "_" + interval.getEnd() + (StringUtils.isBlank(geneName) ? "" : "." + geneName.replaceAll("[/\\:]", "")) + (StringUtils.isBlank(geneId) ? "" : "." + geneId.replaceAll("[/\\:]", "")) + ".svg";
OutputStream os = archiveFactory.openOuputStream(filename);
IOUtils.copyTo(tmpSvg, os);
os.flush();
os.close();
Files.delete(tmpSvg);
manifestW.print(interval.getContig());
manifestW.print('\t');
manifestW.print(interval.getStart() - 1);
manifestW.print('\t');
manifestW.print(interval.getEnd());
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getGene().getId()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getGene().getGeneName()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print(transcripts.stream().map(G -> G.getId()).collect(Collectors.toSet()).stream().collect(Collectors.joining(";")));
manifestW.print('\t');
manifestW.print((archiveFactory.isTarOrZipArchive() ? "" : this.outputPath.toString() + File.separator) + filename);
manifestW.print('\t');
manifestW.println(variants.size());
}
r.close();
manifestW.flush();
manifestW.close();
manifestW = null;
archiveFactory.close();
archiveFactory = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(archiveFactory);
CloserUtil.close(r);
CloserUtil.close(outputStream);
CloserUtil.close(manifestW);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.
the class Biostar81455 method doWork.
@Override
public int doWork(final List<String> args) {
BufferedReader r = null;
String line;
PrintStream out = null;
final CharSplitter tab = CharSplitter.TAB;
try {
try (final GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.getAllGenes().stream().forEach(G -> this.geneMap.put(new Interval(G), G));
}
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
}
final ContigNameConverter contigNameConverter = ContigNameConverter.fromIntervalTreeMap(this.geneMap);
try {
r = super.openBufferedReader(oneFileOrNull(args));
out = openPathOrStdoutAsPrintStream(this.outputFile);
while ((line = r.readLine()) != null) {
if (line.startsWith("#")) {
out.println(line);
continue;
}
boolean found = false;
final String[] tokens = tab.split(line);
final int pos1 = Integer.parseInt(tokens[1]) + (this.one_based ? 0 : 1);
final String convertCtg = contigNameConverter.apply(tokens[0]);
if (convertCtg == null) {
LOG.error("CANNOT FIND contig " + tokens[0]);
out.println("##UNKNOWN CONTIG " + line);
continue;
}
final SimplePosition position = new SimplePosition(convertCtg, pos1);
final List<Transcript> transcripts = this.geneMap.getOverlapping(position).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(position)).collect(Collectors.toList());
if (transcripts.isEmpty()) {
LOG.info("no gene found in chromosome " + tokens[0] + " (check chrom prefix?)");
} else {
for (final Transcript kg : transcripts) {
Exon bestExon = null;
for (final Exon exon : kg.getExons()) {
if (bestExon == null || Math.abs(distance(position.getPosition(), exon)) < Math.abs(distance(position.getPosition(), bestExon))) {
bestExon = exon;
}
}
if (bestExon != null) {
out.print(line);
out.print("\t");
out.print(bestExon.getTranscript().getId());
out.print("\t");
out.print(kg.getStart() - 1);
out.print("\t");
out.print(kg.getEnd());
out.print("\t");
out.print(kg.getStrand());
out.print("\t");
out.print(bestExon.getName());
out.print("\t");
out.print(bestExon.getStart() - 1);
out.print("\t");
out.print(bestExon.getEnd());
out.print("\t");
out.print(distance(position.getPosition(), bestExon));
out.println();
found = true;
}
}
}
if (!found) {
out.println(line + "\tNULL");
}
}
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(r);
CloserUtil.close(out);
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.
the class SamFindClippedRegions method doWork.
@Override
public int doWork(final List<String> args) {
final int bad_mapq = 30;
try {
if (this.min_clip_depth > this.min_depth) {
LOG.error("this.min_clip_depth>this.min_depth");
return -1;
}
if (this.fraction < 0 || this.fraction > 1.0) {
LOG.error("bad ratio: " + fraction);
return -1;
}
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceFai);
final List<Path> inputs = IOUtils.unrollPaths(args);
if (inputs.isEmpty()) {
LOG.error("input is missing");
return -1;
}
final IntervalTreeMap<Interval> intronMap = new IntervalTreeMap<>();
if (this.gtfPath != null) {
try (GtfReader gtfReader = new GtfReader(this.gtfPath)) {
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.hasIntron()).flatMap(T -> T.getIntrons().stream()).map(I -> new Interval(I.getContig(), I.getStart(), I.getEnd(), I.isNegativeStrand(), I.getTranscript().getId())).forEach(I -> intronMap.put(I, I));
}
}
final List<String> sample_list = new ArrayList<>();
final QueryInterval[] queryIntervals;
if (this.bedPath == null) {
queryIntervals = null;
} else {
try (BedLineReader br = new BedLineReader(this.bedPath).setValidationStringency(ValidationStringency.LENIENT).setContigNameConverter(ContigNameConverter.fromOneDictionary(dict))) {
queryIntervals = br.optimizeIntervals(dict);
}
}
SortingCollection<Base> sortingCollection = SortingCollection.newInstance(Base.class, new BaseCodec(), (A, B) -> A.compare1(B), writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPath());
sortingCollection.setDestructiveIteration(true);
final Predicate<Base> acceptBase = B -> {
return B.clip() > 0;
};
final SamReaderFactory srf = super.createSamReaderFactory().referenceSequence(this.referenceFai);
for (final Path path : inputs) {
try (SamReader sr = srf.open(path)) {
final SAMFileHeader header = sr.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sample_name = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
if (sample_list.contains(sample_name)) {
LOG.error("duplicate sample " + sample_name + " in " + path);
return -1;
}
final int sample_idx = sample_list.size();
sample_list.add(sample_name);
int prev_tid = -1;
final SortedMap<Integer, Base> pos2base = new TreeMap<>();
/* get base in pos2base, create it if needed */
final BiFunction<Integer, Integer, Base> baseAt = (TID, POS) -> {
Base b = pos2base.get(POS);
if (b == null) {
b = new Base(sample_idx, TID, POS);
pos2base.put(POS, b);
}
return b;
};
try (CloseableIterator<SAMRecord> iter = queryIntervals == null ? sr.iterator() : sr.query(queryIntervals, false)) {
for (; ; ) {
final SAMRecord rec = (iter.hasNext() ? iter.next() : null);
if (rec != null && !SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
if (rec == null || prev_tid != rec.getReferenceIndex().intValue()) {
for (final Integer pos : pos2base.keySet()) {
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
}
if (rec == null)
break;
pos2base.clear();
prev_tid = rec.getReferenceIndex().intValue();
}
/* pop back previous bases */
for (Iterator<Integer> rpos = pos2base.keySet().iterator(); rpos.hasNext(); ) {
final Integer pos = rpos.next();
if (pos.intValue() + this.max_clip_length >= rec.getUnclippedStart())
break;
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
rpos.remove();
}
final Cigar cigar = rec.getCigar();
int refPos = rec.getAlignmentStart();
for (final CigarElement ce : cigar.getCigarElements()) {
final CigarOperator op = ce.getOperator();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int x = 0; x < ce.getLength(); ++x) {
final Base gt = baseAt.apply(prev_tid, refPos + x);
gt.noClip++;
gt.noClip_sum_mapq += rec.getMappingQuality();
}
} else if (op.equals(CigarOperator.D) || op.equals(CigarOperator.N)) {
baseAt.apply(prev_tid, refPos).del++;
baseAt.apply(prev_tid, refPos + ce.getLength() - 1).del++;
}
refPos += ce.getLength();
}
}
CigarElement ce = cigar.getFirstCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(prev_tid, rec.getStart() - 1).leftClip++;
}
ce = cigar.getLastCigarElement();
if (ce != null && ce.getOperator().isClipping() && ce.getLength() >= this.min_clip_operator_length) {
baseAt.apply(prev_tid, rec.getEnd() + 1).rightClip++;
}
}
}
/* write last bases */
for (final Integer pos : pos2base.keySet()) {
final Base b = pos2base.get(pos);
if (acceptBase.test(b))
sortingCollection.add(b);
}
}
// end open reader
}
// end loop sam files
sortingCollection.doneAdding();
/* build VCF header */
final Allele reference_allele = Allele.create("N", true);
final Allele alt_allele = Allele.create("<CLIP>", false);
final Set<VCFHeaderLine> vcfHeaderLines = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(vcfHeaderLines, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
VCFStandardHeaderLines.addStandardInfoLines(vcfHeaderLines, true, VCFConstants.DEPTH_KEY, VCFConstants.ALLELE_COUNT_KEY, VCFConstants.ALLELE_NUMBER_KEY, VCFConstants.ALLELE_FREQUENCY_KEY);
final VCFFormatHeaderLine leftClip = new VCFFormatHeaderLine("CL", 1, VCFHeaderLineType.Integer, "Left Clip");
vcfHeaderLines.add(leftClip);
final VCFFormatHeaderLine rightClip = new VCFFormatHeaderLine("RL", 1, VCFHeaderLineType.Integer, "Right Clip");
vcfHeaderLines.add(rightClip);
final VCFFormatHeaderLine totalCip = new VCFFormatHeaderLine("TL", 1, VCFHeaderLineType.Integer, "Total Clip");
vcfHeaderLines.add(totalCip);
final VCFFormatHeaderLine totalDel = new VCFFormatHeaderLine("DL", 1, VCFHeaderLineType.Integer, "Total Deletions");
vcfHeaderLines.add(totalDel);
final VCFFormatHeaderLine noClipMAPQ = new VCFFormatHeaderLine("MQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for reads without clip at this position.");
vcfHeaderLines.add(noClipMAPQ);
final VCFInfoHeaderLine averageMAPQ = new VCFInfoHeaderLine("AVG_MAPQ", 1, VCFHeaderLineType.Integer, "Average MAPQ for called genotypes");
vcfHeaderLines.add(averageMAPQ);
final VCFInfoHeaderLine infoRetrogene = new VCFInfoHeaderLine("RETROGENE", 1, VCFHeaderLineType.String, "transcript name for Possible retrogene.");
vcfHeaderLines.add(infoRetrogene);
final VCFFilterHeaderLine filterRetrogene = new VCFFilterHeaderLine("POSSIBLE_RETROGENE", "Junction is a possible Retrogene.");
vcfHeaderLines.add(filterRetrogene);
final VCFFilterHeaderLine filterlowMapq = new VCFFilterHeaderLine("LOW_MAPQ", "Low average mapq (< " + bad_mapq + ")");
vcfHeaderLines.add(filterlowMapq);
final VCFHeader vcfHeader = new VCFHeader(vcfHeaderLines, sample_list);
vcfHeader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
this.writingVcfConfig.dictionary(dict);
try (VariantContextWriter w = this.writingVcfConfig.open(this.outputFile)) {
w.writeHeader(vcfHeader);
try (CloseableIterator<Base> r1 = sortingCollection.iterator()) {
try (EqualRangeIterator<Base> r2 = new EqualRangeIterator<>(r1, (A, B) -> A.compare2(B))) {
while (r2.hasNext()) {
final List<Base> array = r2.next();
final Base first = array.get(0);
if (first.pos < 1)
continue;
// no clip
if (array.stream().mapToInt(G -> G.clip()).sum() == 0)
continue;
if (array.stream().allMatch(G -> G.clip() < min_clip_depth))
continue;
if (array.stream().allMatch(G -> G.dp() < min_depth))
continue;
if (array.stream().allMatch(G -> G.ratio() < fraction))
continue;
final VariantContextBuilder vcb = new VariantContextBuilder();
final String chrom = dict.getSequence(first.tid).getSequenceName();
vcb.chr(chrom);
vcb.start(first.pos);
vcb.stop(first.pos);
vcb.alleles(Arrays.asList(reference_allele, alt_allele));
vcb.attribute(VCFConstants.DEPTH_KEY, array.stream().mapToInt(G -> G.dp()).sum());
final List<Genotype> genotypes = new ArrayList<>(array.size());
int AC = 0;
int AN = 0;
int max_clip = 1;
double sum_mapq = 0.0;
int count_mapq = 0;
for (final Base gt : array) {
final GenotypeBuilder gb = new GenotypeBuilder(sample_list.get(gt.sample_idx));
if (gt.clip() == 0 && gt.noClip == 0) {
gb.alleles(Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
} else if (gt.noClip == 0) {
gb.alleles(Arrays.asList(alt_allele, alt_allele));
AC += 2;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
} else if (gt.clip() == 0) {
gb.alleles(Arrays.asList(reference_allele, reference_allele));
AN += 2;
} else {
gb.alleles(Arrays.asList(reference_allele, alt_allele));
AC++;
sum_mapq += gt.noClipMapq();
count_mapq++;
AN += 2;
}
gb.DP(gt.dp());
gb.attribute(leftClip.getID(), gt.leftClip);
gb.attribute(rightClip.getID(), gt.rightClip);
gb.attribute(totalCip.getID(), gt.clip());
gb.attribute(totalDel.getID(), gt.del);
gb.attribute(noClipMAPQ.getID(), gt.noClipMapq());
gb.AD(new int[] { gt.noClip, gt.clip() });
genotypes.add(gb.make());
max_clip = Math.max(max_clip, gt.clip());
}
if (count_mapq > 0) {
final int avg_mapq = (int) (sum_mapq / count_mapq);
vcb.attribute(averageMAPQ.getID(), avg_mapq);
if (avg_mapq < bad_mapq)
vcb.filter(filterlowMapq.getID());
}
if (gtfPath != null) {
final Locatable bounds1 = new SimpleInterval(chrom, Math.max(1, first.pos - max_intron_distance), first.pos + max_intron_distance);
intronMap.getOverlapping(bounds1).stream().filter(I -> Math.abs(I.getStart() - first.pos) <= this.max_intron_distance || Math.abs(I.getEnd() - first.pos) <= this.max_intron_distance).map(I -> I.getName()).findFirst().ifPresent(transcript_id -> {
vcb.attribute(infoRetrogene.getID(), transcript_id);
vcb.filter(filterRetrogene.getID());
});
;
}
vcb.log10PError(max_clip / -10.0);
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, AC);
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, AN);
if (AN > 0)
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, AC / (float) AN);
vcb.genotypes(genotypes);
w.add(vcb.make());
}
// end while r2
}
// end r2
}
// end r1
}
// end writer
sortingCollection.cleanup();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.
the class VcfStrechToSvg method doWork.
@Override
public int doWork(final List<String> args) {
try {
this.afExtractor = new AFExtractorFactory().parseFieldExtractor(this.afExtractorFactoryStr);
final String input = super.oneAndOnlyOneFile(args);
if (!this.bamListPath.isEmpty()) {
LOG.info("reading bam list");
for (Path bamPath : IOUtils.unrollPaths(this.bamListPath)) {
try (SamReader sr = openSamReader(bamPath)) {
final SAMFileHeader hdr = sr.getFileHeader();
for (final String sn : hdr.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet())) {
if (this.sample2bam.containsKey(sn)) {
LOG.info("duplicate sample in bam " + bamPath + " and " + this.sample2bam.get(sn));
return -1;
}
this.sample2bam.put(sn, bamPath);
}
}
}
}
try (VCFReader r = VCFReaderFactory.makeDefault().open(input, true)) {
final VCFHeader header = r.getHeader();
this.afExtractor.validateHeader(header);
final String searchFormat;
switch(this.useFormat) {
case PL:
searchFormat = VCFConstants.GENOTYPE_PL_KEY;
break;
// through
case AD:
default:
searchFormat = VCFConstants.GENOTYPE_ALLELE_DEPTHS;
break;
}
if (header.getFormatHeaderLine(searchFormat) == null) {
LOG.error("FORMAT/" + searchFormat + " undefined in " + input);
return -1;
}
if (!header.hasGenotypingData()) {
LOG.error("No genotype in input");
return -1;
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
if (dict != null && this.refFaidx != null) {
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(this.refFaidx));
}
if (this.gff3Path != null) {
LOG.info("reading gtf" + this.gff3Path);
try (GtfReader gtfReader = new GtfReader(this.gff3Path)) {
if (dict != null)
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().forEach(G -> {
this.all_genes.put(new Interval(G), G);
});
}
}
try (BufferedReader br = IOUtils.openPathForBufferedReading(this.bedFile)) {
try (PrintWriter manifest = (this.manifestPath == null ? new PrintWriter(new NullOuputStream()) : IOUtils.openPathForPrintWriter(this.manifestPath))) {
try (ArchiveFactory out = ArchiveFactory.open(this.outputFile)) {
final BedLineCodec codec = new BedLineCodec();
br.lines().map(L -> codec.decode(L)).filter(B -> B != null).forEach(B -> {
run(out, B, header, r, manifest);
});
}
manifest.flush();
}
}
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
}
}
use of com.github.lindenb.jvarkit.util.bio.structure.GtfReader in project jvarkit by lindenb.
the class GtfUpstreamOrf method doWork.
@Override
public int doWork(final List<String> args) {
GtfReader gtfReader = null;
PrintWriter pw = null;
try {
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
this.refCtgNameConverter = ContigNameConverter.fromOneDictionary(refDict);
final ContigDictComparator ctgDictComparator = new ContigDictComparator(refDict);
final String input = oneFileOrNull(args);
gtfReader = input == null ? new GtfReader(stdin()) : new GtfReader(input);
gtfReader.setContigNameConverter(this.refCtgNameConverter);
final List<Gene> genes = gtfReader.getAllGenes().stream().filter(G -> G.hasStrand()).sorted((A, B) -> {
int i = ctgDictComparator.compare(A.getContig(), B.getContig());
if (i != 0)
return i;
i = Integer.compare(A.getStart(), B.getStart());
if (i != 0)
return i;
return Integer.compare(A.getEnd(), B.getEnd());
}).collect(Collectors.toList());
gtfReader.close();
gtfReader = null;
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
for (final KozakSequence.Strength f : KozakSequence.Strength.values()) {
pw.println("#kozak." + f.name() + "=" + kozakStrengthToScore(f));
}
final String gtfSource = getProgramName().toLowerCase();
for (final SAMSequenceRecord ssr : refDict.getSequences()) {
pw.println("##contig " + ssr.getSequenceName() + ": length:" + ssr.getSequenceLength());
}
pw.println("#" + gtfSource + ":" + JVarkitVersion.getInstance().toString());
if (!StringUtils.isBlank(input)) {
pw.println("#gtf:" + input);
}
final ProgressFactory.Watcher<Gene> progress = ProgressFactory.newInstance().dictionary(refDict).logger(LOG).build();
for (final Gene gene : genes) {
progress.apply(gene);
/* new reference sequence */
if (this.genomicSequence == null || !this.genomicSequence.getChrom().equals(gene.getContig())) {
this.genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, gene.getContig());
}
final List<RNASequence> rnas = gene.getTranscripts().stream().filter(T -> T.isCoding()).filter(T -> T.hasStrand()).filter(T -> T.hasCodonStartDefined()).filter(T -> T.getTranscriptUTR5().isPresent()).map(T -> new RNASequence(T)).collect(Collectors.toList());
if (rnas.isEmpty())
continue;
final Set<OpenReadingFrame> orfs = rnas.stream().flatMap(R -> R.getUpstreamOpenReadingFrames().stream()).collect(Collectors.toSet());
if (orfs.isEmpty())
continue;
boolean gene_printed = false;
for (final OpenReadingFrame uORF : orfs) {
/* is there any other RNA containing this uORF ?*/
if (rnas.stream().filter(other -> !other.getTranscript().getId().equals(uORF.mRNA.getTranscript().getId())).anyMatch(other -> {
// other must have atg in frame and ATG before the observed one
final int other_atg0 = other.getATG0InRNA();
if (uORF.in_rna_atg0 % 3 != other_atg0 % 3)
return false;
return uORF.in_rna_atg0 >= other_atg0;
}))
continue;
final String transcript_id = uORF.getTranscript().getId() + ".uorf" + (1 + uORF.in_rna_atg0);
if (!gene_printed) {
gene_printed = true;
pw.print(gene.getContig());
pw.print("\t");
// source
pw.print(gtfSource);
pw.print("\t");
pw.print("gene");
pw.print("\t");
// start
pw.print(gene.getStart());
pw.print("\t");
// end
pw.print(gene.getEnd());
pw.print("\t");
// score
pw.print(".");
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(".");
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.println();
}
// TRANSCRIPT
final UTR utr_5_prime = uORF.getTranscript().getTranscriptUTR5().get();
pw.print(gene.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("transcript");
pw.print("\t");
if (gene.isPositiveStrand()) {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
pw.print("\t");
// pw.print(transcriptSequence.mrnaIndex0ToBase[uORF.in_rna_stop0]+1);
if (uORF.in_rna_stop0 == NPOS) {
pw.print(uORF.mRNA.getTranscript().getTxEnd());
} else {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
}
} else {
if (uORF.in_rna_stop0 == NPOS) {
pw.print(uORF.mRNA.getTranscript().getTxStart());
} else {
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_stop0] + 1);
}
pw.print("\t");
pw.print(uORF.mRNA.mrnaIndex0ToGenomic0[uORF.in_rna_atg0] + 1);
}
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print("0");
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("transcript_biotype", "uORF"));
pw.print(keyvalue("kozak-seq", uORF.kozak.getString()));
pw.print(keyvalue("kozak-strength", uORF.kozak.getStrength()));
pw.print(keyvalue("translation", uORF.peptide));
pw.print(keyvalue("uORF-atg-in-frame-with-transcript-atg", uORF.uorf_atg_in_frame));
pw.print(keyvalue("utr", utr_5_prime.toString() + " " + utr_5_prime.getStart() + "-" + utr_5_prime.getEnd()));
pw.println();
// Exon
for (final Exon exon : uORF.getTranscript().getExons()) {
pw.print(exon.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("exon");
pw.print("\t");
pw.print(exon.getStart());
pw.print("\t");
pw.print(exon.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(exon.getStrand());
pw.print("\t");
// phase
pw.print(0);
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.println();
}
final List<Interval> startBlocks = uORF.mRNA.getCodonBlocks(uORF.in_rna_atg0, uORF.in_rna_atg0 + 1, uORF.in_rna_atg0 + 2);
final List<Interval> stopBlocks = uORF.in_rna_stop0 != NPOS ? uORF.mRNA.getCodonBlocks(uORF.in_rna_stop0, uORF.in_rna_stop0 + 1, uORF.in_rna_stop0 + 2) : Collections.emptyList();
// CDS
if (!stopBlocks.isEmpty()) {
final int cdsStart = startBlocks.stream().mapToInt(B -> B.getStart()).min().orElseThrow(IllegalStateException::new);
final int cdsEnd = stopBlocks.stream().mapToInt(B -> B.getEnd()).max().orElseThrow(IllegalStateException::new);
for (final Exon exon : uORF.getTranscript().getExons()) {
if (exon.getEnd() < cdsStart)
continue;
if (exon.getStart() > cdsEnd)
break;
pw.print(exon.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("CDS");
pw.print("\t");
pw.print(Math.max(cdsStart, exon.getStart()));
pw.print("\t");
pw.print(Math.min(cdsEnd, exon.getEnd()));
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(exon.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(Math.max(cdsStart, exon.getStart())));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.println();
}
}
// CODON START
for (final Interval startc : startBlocks) {
PARANOID.assertLe(startc.getStart(), startc.getEnd());
pw.print(startc.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("start_codon");
pw.print("\t");
pw.print(startc.getStart());
pw.print("\t");
pw.print(startc.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(startc.getStart()));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("distance-mrna-atg", uORF.mRNA.getATG0InRNA() - uORF.in_rna_atg0));
pw.print(keyvalue("pos0-in-mrna", uORF.in_rna_atg0));
pw.print(keyvalue("spliced", startBlocks.size() > 1));
pw.println();
}
// CODON END
for (final Interval stopc : stopBlocks) /* might be empty */
{
PARANOID.assertLe(stopc.getStart(), stopc.getEnd());
pw.print(stopc.getContig());
pw.print("\t");
pw.print(gtfSource);
pw.print("\t");
pw.print("stop_codon");
pw.print("\t");
pw.print(stopc.getStart());
pw.print("\t");
pw.print(stopc.getEnd());
pw.print("\t");
// score
pw.print(kozakStrengthToScore(uORF.kozak.getStrength()));
pw.print("\t");
// strand
pw.print(gene.getStrand());
pw.print("\t");
// phase
pw.print(uORF.getFrameAt(stopc.getStart()));
pw.print("\t");
pw.print(keyvalue("gene_id", gene.getId()));
pw.print(keyvalue("transcript_id", transcript_id));
pw.print(keyvalue("spliced", stopBlocks.size() > 1));
pw.println();
}
}
}
progress.close();
pw.flush();
pw.close();
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(this.indexedFastaSequenceFile);
}
}
Aggregations