use of com.github.lindenb.jvarkit.samtools.CoverageFactory in project jvarkit by lindenb.
the class BamStats05 method doWork.
protected int doWork(final PrintWriter pw, final Map<String, List<SimpleInterval>> gene2interval, final String filename, final SamReader IN) throws Exception {
try {
LOG.info("Scanning " + filename);
final SAMFileHeader header = IN.getFileHeader();
final List<SAMReadGroupRecord> rgs = header.getReadGroups();
if (rgs == null || rgs.isEmpty())
throw new IOException("No read groups in " + filename);
final Set<String> groupNames = this.groupBy.getPartitions(rgs);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mapq).setPartition(this.groupBy).setRecordFilter(R -> !filter.filterOut(R));
for (final String partition : groupNames) {
if (StringUtils.isBlank(partition))
throw new IOException("Empty read group: " + groupBy.name() + " for " + filename);
for (final String gene : gene2interval.keySet()) {
final List<Integer> counts = new ArrayList<>();
final List<SimpleInterval> intervals = gene2interval.get(gene);
final String newContig = contigNameConverter.apply(intervals.get(0).getContig());
if (StringUtil.isBlank(newContig)) {
throw new JvarkitException.ContigNotFoundInDictionary(intervals.get(0).getContig(), dict);
}
final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(IN, intervals.stream().map(R -> R.renameContig(newContig)).collect(Collectors.toList()), partition);
for (final SimpleInterval interval : intervals) {
for (int i = interval.getStart(); i <= interval.getEnd() && i <= coverage.getEnd(); i++) {
final int d = coverage.get(i - coverage.getStart());
counts.add(d);
}
}
Collections.sort(counts);
pw.print(intervals.get(0).getContig() + "\t" + (coverage.getStart() - 1) + "\t" + coverage.getEnd() + "\t" + gene + "\t" + partition + "\t" + intervals.size() + "\t" + counts.size() + "\t" + counts.get(0) + "\t" + counts.get(counts.size() - 1));
for (final int mc : this.min_coverages) {
final DiscreteMedian<Integer> discreteMedian = new DiscreteMedian<>();
int count_no_coverage = 0;
for (int cov : counts) {
if (cov <= mc)
++count_no_coverage;
discreteMedian.add(cov);
}
final OptionalDouble average = discreteMedian.getAverage();
final OptionalDouble median = discreteMedian.getMedian();
pw.print("\t" + (average.isPresent() ? String.format("%.2f", average.orElse(0.0)) : ".") + "\t" + (median.isPresent() ? String.format("%.2f", median.orElse(-1.0)) : ".") + "\t" + count_no_coverage + "\t" + (int) (((counts.size() - count_no_coverage) / (double) counts.size()) * 100.0));
}
pw.println();
}
// end gene
}
// end sample
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(IN);
}
}
use of com.github.lindenb.jvarkit.samtools.CoverageFactory in project jvarkit by lindenb.
the class VcfStrechToSvg method run.
private void run(final ArchiveFactory archive, final BedLine bed, final VCFHeader header, final VCFReader in, final PrintWriter manifest) {
LOG.info("processing " + bed);
final Set<String> limitSamples;
if (StringUtils.isBlank(this.sampleStr)) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
} else if (this.sampleStr.startsWith("^")) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
limitSamples.removeAll(Arrays.stream(this.sampleStr.substring(1).split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
} else {
limitSamples = Arrays.stream(this.sampleStr.split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
try (CloseableIterator<VariantContext> iter = in.query(bed)) {
final List<VariantSet> L = iter.stream().filter(V -> acceptVariant(V)).map(V -> new VariantSet(V)).collect(Collectors.toCollection(ArrayList::new));
if (L.isEmpty()) {
LOG.warn("No valid variant found for \"" + bed + "\"");
return;
}
int i = 0;
while (i + 1 < L.size()) {
if (L.get(i).withinDistanceOf(L.get(i + 1), this.withinDistance)) {
L.get(i).variants.addAll(L.get(i + 1).variants);
L.remove(i + 1);
} else {
i++;
}
}
final int margin_left = 50;
final int margin_right = 10;
final double drawingAreaWidth = image_width_pixel - (margin_left + margin_right);
final int intervalLength = L.stream().mapToInt(V -> V.getLengthOnReference()).sum();
double x = 0;
for (i = 0; i < L.size(); i++) {
L.get(i).x = x;
x += (L.get(i).getLengthOnReference() / (double) intervalLength) * drawingAreaWidth;
}
for (i = 0; i < L.size(); i++) {
L.get(i).width = (i + 1 < L.size() ? L.get(i + 1).x : drawingAreaWidth) - L.get(i).x;
}
try {
final DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
final DocumentBuilder dom = db.newDocumentBuilder();
this.document = dom.newDocument();
final Element svgRoot = element("svg");
this.document.appendChild(svgRoot);
final String mainTitleStr = SequenceDictionaryUtils.getBuildName(dict).orElse("") + " " + new SimpleInterval(bed).toNiceString() + " length:" + StringUtils.niceInt(bed.getLengthOnReference());
/* SVG title */
{
final Element title = element("title");
svgRoot.appendChild(title);
title.appendChild(text(mainTitleStr));
}
/* SVG style */
{
final String gtopacity = this.dynamicParams.getOrDefault("gt.opacity", "0.7");
final Element style = element("style");
svgRoot.appendChild(style);
style.appendChild(text(".maintitle {text-anchor:middle;fill:blue} " + ".vc {stroke-width:0.5px;} " + ".transcript {stroke:black;stroke-width:1px;opacity:1;}" + ".exon {stroke:black;stroke-width:0.5px;fill:blue;opacity:1;}" + ".sample {fill:blue;font-size:7px;} " + ".samplelabel {stroke:gray;stroke-width:0.5px;font-size:" + this.dynamicParams.getOrDefault("sample.fontsize", "7") + "px;}\n" + ".coverage { fill:gray; stroke:yellow;opacity:0.2;} " + ".frame { fill:none; stroke: darkgray;} " + ".area0 {fill:white;}\n" + ".area1 {fill:floralwhite;}\n" + "circle.HOM_REF {fill:green;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HET {fill:blue;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HOM_VAR {fill:red;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "a {cursor: pointer;}\n"));
}
/* desc */
{
final Element descr = element("desc");
svgRoot.appendChild(descr);
descr.appendChild(text("Author: Pierre Lindenbaum\n" + JVarkitVersion.getInstance().getCompilationDate() + "\n" + JVarkitVersion.getInstance().getGitHash()));
}
// main title
{
final Element gtitle = element("text", mainTitleStr);
gtitle.setAttribute("class", "maintitle");
gtitle.setAttribute("x", format(this.image_width_pixel / 2.0));
gtitle.setAttribute("y", "15");
svgRoot.appendChild(wrapLoc(gtitle, bed));
}
int margin_top = 50;
double y = margin_top;
final double min_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r1", "1"));
final double max_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r2", "7"));
final Element main_g = element("g");
svgRoot.appendChild(main_g);
/**
* plot genes
*/
if (!this.all_genes.isEmpty()) {
final double transcript_height = 5;
final double exon_height = (transcript_height - 1);
final double save_y = y;
final Element g_genes = element("g");
g_genes.setAttribute("transform", "translate(" + margin_left + ",0)");
main_g.appendChild(g_genes);
/* loop over each vset */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
// find transcript in this vset
final List<Transcript> transcripts = this.all_genes.getOverlapping(vset).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(vset)).collect(Collectors.toList());
if (transcripts.isEmpty())
continue;
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_genes.appendChild(g_vset);
// y in this vset
double y2 = save_y;
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
/* loop over transcripts */
for (final Transcript tr : transcripts) {
final Element g_tr = element("g");
g_tr.setAttribute("transform", "translate(0," + format(y2) + ")");
g_vset.appendChild(g_tr);
final Element line = element("line");
line.setAttribute("class", "transcript");
line.setAttribute("x1", format(Math.max(0, base2pixel.applyAsDouble(tr.getStart()))));
line.setAttribute("y1", format(transcript_height / 2.0));
line.setAttribute("x2", format(Math.min(vset.width, base2pixel.applyAsDouble(tr.getEnd()))));
line.setAttribute("y2", format(transcript_height / 2.0));
line.appendChild(element("title", tr.getId()));
g_tr.appendChild(wrapLoc(line, tr));
/* loop over exons */
for (final Exon exon : tr.getExons()) {
if (!exon.overlaps(vset))
continue;
final Element exRect = element("rect");
exRect.setAttribute("class", "exon");
final double x_start = Math.max(0, base2pixel.applyAsDouble(exon.getStart()));
exRect.setAttribute("x", format(x_start));
exRect.setAttribute("y", format(transcript_height / 2.0 - exon_height / 2.0));
final double x_end = Math.min(vset.width, base2pixel.applyAsDouble(exon.getEnd()));
exRect.setAttribute("width", format(x_end - x_start));
exRect.setAttribute("height", format(exon_height));
exRect.appendChild(element("title", exon.getName()));
g_tr.appendChild(wrapLoc(exRect, exon));
}
y2 += transcript_height + 0.5;
}
y = Math.max(y, y2);
}
y++;
}
final double sample_height = Double.parseDouble(this.dynamicParams.getOrDefault("sample.height", "25"));
final double sample_height2 = sample_height - (max_circle_radius * 2.0);
int space_between_samples = 2;
int got_n_samples = 0;
for (final String sn : header.getSampleNamesInOrder()) {
if (!limitSamples.contains(sn))
continue;
boolean got_this_sample = false;
final Element g_sample = element("g");
g_sample.setAttribute("transform", "translate(" + margin_left + "," + format(y) + ")");
/* get coverage */
final int maxCoverage;
if (this.sample2bam.containsKey(sn)) {
final CoverageFactory coverageFactory = new CoverageFactory();
try (SamReader sr = this.openSamReader(this.sample2bam.get(sn))) {
/* loop over each variant set */
for (final VariantSet vset : L) {
vset.coverage = coverageFactory.getSimpleCoverage(sr, vset, sn);
}
}
maxCoverage = L.stream().flatMapToInt(V -> V.coverage.stream()).max().orElse(0);
} else {
maxCoverage = 0;
for (final VariantSet vset : L) {
vset.coverage = null;
}
}
/* loop over each variant set */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_sample.appendChild(g_vset);
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
// plot set length
final Element rect = element("rect");
rect.setAttribute("class", "area" + (i % 2));
rect.setAttribute("x", "0");
rect.setAttribute("y", "0");
rect.setAttribute("width", format(vset.width));
rect.setAttribute("height", format(sample_height));
if (!remove_tooltip)
rect.appendChild(element("title", vset.toString()));
g_vset.appendChild(rect);
// plot coverage
if (maxCoverage > 0 && this.sample2bam.containsKey(sn)) {
final double[] scaled = vset.coverage.scaleAverage((int) vset.width);
final StringBuilder sb = new StringBuilder();
sb.append("0," + sample_height);
for (int t = 0; t < scaled.length; t++) {
if (t > 1 && t + 1 < scaled.length && format(scaled[t - 1]).equals(format(scaled[t + 1])) && format(scaled[t - 1]).equals(format(scaled[t])))
continue;
sb.append(" ").append(t).append(",");
sb.append(format(sample_height * (1.0 - scaled[t] / maxCoverage)));
}
sb.append(" " + format(vset.width) + "," + sample_height);
final Element polyline = element("polyline");
polyline.setAttribute("class", "coverage");
polyline.setAttribute("points", sb.toString());
g_vset.appendChild(polyline);
vset.coverage = null;
}
// plot vertical line if colorTag defined
if (!StringUtils.isBlank(this.colorTag)) {
for (final VariantContext vc : vset.variants) {
if (!vc.hasAttribute(this.colorTag))
continue;
final String cssColor = vc.getAttributeAsString(this.colorTag, "");
if (StringUtils.isBlank(cssColor))
continue;
final double x0 = base2pixel.applyAsDouble(vc.getStart());
final Element line = element("line");
line.setAttribute("class", "vc");
line.setAttribute("style", "stroke:" + cssColor);
line.setAttribute("x1", format(x0));
line.setAttribute("y1", "0");
line.setAttribute("x2", format(x0));
line.setAttribute("y2", format(sample_height));
g_vset.appendChild(line);
}
}
// print all variants in this vcfset for this sample
for (final VariantContext vc : vset.variants) {
final Genotype gt = vc.getGenotype(sn);
if (gt.isNoCall())
continue;
if (hide_hom_ref && gt.isHomRef())
continue;
if (gt.hasGQ() && gt.getGQ() < this.minGQ)
continue;
final OptionalDouble alt_ratio = getAltRatio(gt);
if (!alt_ratio.isPresent())
continue;
final OptionalDouble af = getAF(vc);
final double circle_radius = min_circle_radius + (max_circle_radius - min_circle_radius) * (1.0 - af.orElse(1.0));
// HOMREF=0; HET =0.5; HOMVAR = 1;
final double gtx = base2pixel.applyAsDouble(vc.getStart());
final double gty = sample_height - (sample_height2 * alt_ratio.getAsDouble() + (sample_height - sample_height2) / 2.0);
final Element circle = element("circle");
circle.setAttribute("class", gt.getType().name());
circle.setAttribute("cx", format(gtx));
circle.setAttribute("cy", format(gty));
circle.setAttribute("r", format(circle_radius));
if (!remove_tooltip)
circle.appendChild(element("title", vc.getStart() + " " + (vc.hasID() ? vc.getID() : "") + " " + vc.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/")) + " " + gt.getType().name() + " AF=" + format(af.orElse(-1))));
g_vset.appendChild(wrapLoc(circle, vc));
got_this_sample = true;
}
}
final Element frame_sample = element("rect");
frame_sample.setAttribute("class", "frame");
frame_sample.setAttribute("x", "0");
frame_sample.setAttribute("y", "0");
frame_sample.setAttribute("width", format(drawingAreaWidth));
frame_sample.setAttribute("height", format(sample_height));
g_sample.appendChild(frame_sample);
final Element label = element("text", sn + (maxCoverage == 0 ? "" : " Max Cov. " + maxCoverage));
label.setAttribute("class", "samplelabel");
label.setAttribute("x", "0");
label.setAttribute("y", "0");
// label.setAttribute("transform", "translate("+format(-10)+","+0+") rotate(90) ");
label.setAttribute("transform", "translate(12,12)");
if (!remove_tooltip)
label.appendChild(element("title", sn));
g_sample.appendChild(label);
if (got_this_sample) {
got_n_samples++;
main_g.appendChild(g_sample);
y += sample_height + space_between_samples;
} else {
LOG.warn("no valid data for sample " + sn + " in " + bed);
}
}
// remove extra sample space
y -= space_between_samples;
svgRoot.setAttribute("width", format(this.image_width_pixel + 1));
svgRoot.setAttribute("height", format(y + 1));
if (got_n_samples == 0) {
LOG.info("no sample/genotype found for " + bed);
return;
}
// save
final Transformer tr = TransformerFactory.newInstance().newTransformer();
final String filename = bed.getContig() + "_" + bed.getStart() + "_" + bed.getEnd() + ".svg" + (this.compressed_svg ? ".gz" : "");
LOG.info("writing " + filename);
if (this.compressed_svg) {
try (final OutputStream pw = archive.openOuputStream(filename)) {
try (GZIPOutputStream gzout = new GZIPOutputStream(pw)) {
tr.transform(new DOMSource(this.document), new StreamResult(gzout));
gzout.finish();
gzout.flush();
}
pw.flush();
}
} else {
try (final PrintWriter pw = archive.openWriter(filename)) {
tr.transform(new DOMSource(this.document), new StreamResult(pw));
pw.flush();
}
}
manifest.print(bed.getContig());
manifest.print("\t");
manifest.print(bed.getStart() - 1);
manifest.print("\t");
manifest.print(bed.getEnd());
manifest.print("\t");
manifest.print(filename);
manifest.println();
} catch (final Throwable err) {
throw new RuntimeException(err);
} finally {
this.document = null;
}
}
}
use of com.github.lindenb.jvarkit.samtools.CoverageFactory in project jvarkit by lindenb.
the class CnvTView method runInterval.
private SampleInfo runInterval(final AbstractViewWriter w, final Path baminput, final Locatable interval0) throws IOException {
final Locatable interval = extendInterval(interval0);
try (SamReader samReader = samReaderFactory.open(baminput)) {
final SAMFileHeader hdr = samReader.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(this.dictionary, hdr.getSequenceDictionary());
JvarkitException.BamHasIndex.verify(samReader);
final String sample = samReader.getFileHeader().getReadGroups().stream().map(V -> V.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(baminput));
final SampleInfo si = new SampleInfo(sample);
si.pixel_coverage = new double[this.terminalWidth - LEFT_MARGIN];
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
final CoverageFactory.SimpleCoverage cov = coverageFactory.getSimpleCoverage(samReader, interval, null);
si.pixel_coverage = cov.scale(this.percentile, si.pixel_coverage.length);
w.sampleInfos.add(si);
return si;
}
}
use of com.github.lindenb.jvarkit.samtools.CoverageFactory in project jvarkit by lindenb.
the class CopyNumber01 method doWork.
@Override
public int doWork(final List<String> args) {
ReferenceSequenceFile indexedFastaSequenceFile = null;
try {
this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
/* loading REF Reference */
indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
final List<Locatable> intervals = new ArrayList<>();
if (this.bedFile == null) {
for (final Locatable loc : dict.getSequences()) {
intervals.add(loc);
}
} else {
final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.bedFile)) {
br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
final String ctg = converter.apply(B.getContig());
intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
});
}
}
if (intervals.isEmpty()) {
LOG.error("no interval defined.");
return -1;
}
LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
final List<GCAndDepth> user_items = new ArrayList<>();
// split intervals
for (final Locatable loc : intervals) {
int pos = loc.getStart();
while (pos < loc.getEnd()) {
final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
if (dataRow.getLengthOnReference() < this.windowMin) {
break;
}
user_items.add(dataRow);
pos += this.windowShift;
}
}
// free memory
intervals.clear();
LOG.info("sorting N=" + user_items.size());
Collections.sort(user_items, locComparator);
// fill gc percent
LOG.info("fill gc% N=" + user_items.size());
for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
for (final GCAndDepth dataRow : user_items) {
if (!dataRow.getContig().equals(ctg))
continue;
final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
if (gc.isEmpty())
continue;
dataRow.gc = gc.getGCPercent();
}
}
// remove strange gc
user_items.removeIf(B -> B.gc < this.minGC);
user_items.removeIf(B -> B.gc > this.maxGC);
LOG.info("remove high/low gc% N=" + user_items.size());
if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
LOG.error("All chromosomes are defined as sexual. Cannot normalize");
return -1;
}
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
/* header */
pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
for (final Path bamPath : IOUtils.unrollPaths(args)) {
// open this samReader
try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
if (!samReader.hasIndex()) {
LOG.error("file is not indexed " + bamPath);
return -1;
}
final SAMFileHeader header = samReader.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
/* loop over contigs */
for (final SAMSequenceRecord ssr : dict.getSequences()) {
/* create a **COPY** of the intervals */
final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
if (ctgitems.isEmpty())
continue;
LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
// get coverage
final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
// fill coverage
for (final GCAndDepth gc : ctgitems) {
final OptionalDouble optCov;
switch(this.univariateDepth) {
case median:
optCov = coverage.getMedian(gc);
break;
case mean:
optCov = coverage.getAverage(gc);
break;
default:
throw new IllegalStateException();
}
gc.raw_depth = optCov.orElse(-1.0);
gc.norm_depth = gc.raw_depth;
}
ctgitems.removeIf(V -> V.raw_depth < 0);
ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
if (ctgitems.isEmpty())
continue;
bam_items.addAll(ctgitems);
}
double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
Collections.sort(bam_items, (a, b) -> {
final int i = Double.compare(a.getX(), b.getX());
if (i != 0)
return i;
return Double.compare(a.getY(), b.getY());
});
double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
// get min GC
final double min_x = x[0];
// get max GC
final double max_x = x[x.length - 1];
LOG.info("min/max gc " + min_x + " " + max_x);
/* merge adjacent x (GC%) having same values */
int i = 0;
int k = 0;
while (i < x.length) {
int j = i + 1;
while (j < x.length && Precision.equals(x[i], x[j])) {
++j;
}
x[k] = x[i];
y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
++k;
i = j;
}
LOG.info("merged n=" + (x.length - k) + " items.");
/* reduce size of x et y */
final List<XY> xyL = new ArrayList<>(k);
for (int t = 0; t < k; ++t) {
xyL.add(new XYImpl(x[t], y[t]));
}
/* sort on Y */
Collections.sort(xyL, (a, b) -> {
final int d = Double.compare(a.getX(), b.getX());
if (d != 0)
return d;
return Double.compare(a.getY(), b.getY());
});
x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
final UnivariateInterpolator interpolator = createInterpolator();
UnivariateFunction spline = null;
try {
spline = interpolator.interpolate(x, y);
} catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
spline = null;
LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
}
// min depth cal
int points_removed = 0;
i = 0;
while (i < bam_items.size()) {
final GCAndDepth r = bam_items.get(i);
if (spline == null) {
++i;
} else if (r.getX() < min_x || r.getX() > max_x) {
bam_items.remove(i);
++points_removed;
} else {
final double norm;
if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
norm = r.getY();
} else {
norm = spline.value(r.getX());
}
if (Double.isNaN(norm) || Double.isInfinite(norm)) {
LOG.info("NAN " + r);
bam_items.remove(i);
++points_removed;
continue;
}
r.norm_depth = norm;
++i;
}
}
LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
if (bam_items.isEmpty())
continue;
spline = null;
// DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
// normalize on median
y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
LOG.info("median norm depth : " + median_depth);
for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
gc.norm_depth /= median_depth;
}
// restore genomic order
Collections.sort(bam_items, locComparator);
// smoothing values with neighbours
y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
for (i = 0; i < bam_items.size(); ++i) {
final GCAndDepth gc = bam_items.get(i);
int left = i;
for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
continue;
left = j;
break;
}
int right = i;
for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
final GCAndDepth gc2 = bam_items.get(j);
if (!gc2.withinDistanceOf(gc, this.smoothDistance))
break;
right = j;
// no break;
}
gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
}
/* print data */
for (final GCAndDepth r : bam_items) {
pw.print(r.getContig());
pw.print('\t');
pw.print(r.getStart() - 1);
pw.print('\t');
pw.print(r.getEnd());
pw.print('\t');
pw.print(sampleName);
pw.print('\t');
pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
pw.print('\t');
pw.printf("%.3f", r.gc);
pw.print('\t');
pw.printf("%.3f", r.raw_depth);
pw.print('\t');
pw.printf("%.3f", r.norm_depth);
pw.println();
}
pw.flush();
}
// samReader
}
// end loop bamPath
pw.flush();
}
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(indexedFastaSequenceFile);
}
}
use of com.github.lindenb.jvarkit.samtools.CoverageFactory in project jvarkit by lindenb.
the class WesCnvSvg method doWork.
@Override
public int doWork(final List<String> args) {
XMLStreamWriter w = null;
BufferedReader r = null;
OutputStream fout = null;
VCFReader vcfReader = null;
try {
this.indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(this.indexedFastaSequenceFile);
final IntervalExtender extender = IntervalExtender.of(refDict, this.extendWhat);
if (extender.isShriking()) {
LOG.error("extend factor <1.0");
return -1;
}
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(refDict);
final ContigDictComparator contigDictCompare = new ContigDictComparator(refDict);
final List<CaptureInterval> userIntervals = this.intervalListProvider.stream().map(loc -> contigNameConverter.convertToSimpleInterval(loc).<RuntimeException>orElseThrow(() -> new RuntimeException(new JvarkitException.ContigNotFoundInDictionary(loc.getContig(), refDict)))).map(extender).collect(HtsCollectors.mergeIntervals()).map(L -> new CaptureInterval(L)).sorted(contigDictCompare.createLocatableComparator()).collect(Collectors.toCollection(ArrayList::new));
if (userIntervals.isEmpty()) {
LOG.error("no interval or bed defined");
return -1;
}
this.countBasesToBeDisplayed = userIntervals.stream().mapToInt(R -> R.getLengthOnReference()).sum();
if (this.countBasesToBeDisplayed < 1) {
LOG.error("Nothing to display. BED count==0");
return -1;
} else {
double x1 = 0;
for (int i = 0; i < userIntervals.size(); ++i) {
final CaptureInterval ci = userIntervals.get(i);
ci.pixelx = x1;
x1 += ci.getPixelWidth();
}
}
/* distinct ordered contigs */
final List<String> distinctContigs = userIntervals.stream().map(R -> R.getContig()).collect(Collectors.toSet()).stream().sorted(contigDictCompare).collect(Collectors.toList());
final SamReaderFactory srf = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.LENIENT).referenceSequence(this.faidx);
for (final Path bamFile : IOUtils.unrollPaths(args)) {
final BamInput bi = new BamInput();
bi.bamPath = bamFile;
try (SamReader samReader = srf.open(bamFile)) {
final SAMSequenceDictionary samDict = SequenceDictionaryUtils.extractRequired(samReader.getFileHeader());
if (!SequenceUtil.areSequenceDictionariesEqual(refDict, samDict)) {
throw new JvarkitException.DictionariesAreNotTheSame(refDict, samDict);
}
bi.sample = samReader.getFileHeader().getReadGroups().stream().map(V -> V.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(bamFile));
final CoverageFactory covFactory = new CoverageFactory().setMappingQuality(this.minMappingQuality);
for (final String contig : distinctContigs) {
final List<CaptureInterval> contig_intervals = userIntervals.stream().filter(R -> R.getContig().equals(contig)).collect(Collectors.toList());
final CoverageFactory.SimpleCoverage coverage = covFactory.getSimpleCoverage(samReader, contig_intervals, null);
for (CaptureInterval rgn : contig_intervals) {
final double[] array = coverage.getSubCoverage(rgn).scale(this.scaleType, (int) rgn.getPixelWidth());
bi.coverages.add(array);
}
}
}
this.bamInputs.add(bi);
}
if (this.bamInputs.isEmpty()) {
LOG.error("no bam input");
return -1;
}
if (this.vcfFile != null) {
vcfReader = VCFReaderFactory.makeDefault().open(this.vcfFile, true);
final SAMSequenceDictionary vcfDict = vcfReader.getHeader().getSequenceDictionary();
if (vcfDict != null)
SequenceUtil.assertSequenceDictionariesEqual(refDict, vcfDict);
}
final XMLOutputFactory xof = XMLOutputFactory.newFactory();
if (this.outputFile == null) {
w = xof.createXMLStreamWriter(stdout(), "UTF-8");
} else {
fout = Files.newOutputStream(this.outputFile);
w = xof.createXMLStreamWriter(fout, "UTF-8");
}
final Function<List<Point2D.Double>, String> points2str = (L) -> L.stream().map(S -> format(S.getX()) + "," + format(S.getY())).collect(Collectors.joining(" "));
final Consumer<List<Point2D.Double>> simplifyPoints = (L) -> {
for (int z = 0; z + 1 < L.size(); ++z) {
if (L.get(z).getY() == L.get(z + 1).getY()) {
L.get(z).x = L.get(z + 1).x;
L.remove(z + 1);
}
}
};
w.writeStartDocument("UTF-8", "1.0");
final Dimension dim = new Dimension(this.drawinAreaWidth, 0);
final int bed_header_height = 20;
dim.height += bed_header_height;
dim.height += (int) this.bamInputs.stream().mapToDouble(B -> B.getPixelHeight()).sum();
LOG.debug("drawing area: " + dim);
w.writeStartElement("svg");
w.writeAttribute("width", String.valueOf(dim.width));
w.writeAttribute("height", String.valueOf(dim.height));
w.writeDefaultNamespace(SVG.NS);
w.writeNamespace("xlink", XLINK.NS);
// https://stackoverflow.com/questions/15717970
w.writeStartElement("style");
if (this.cssFile != null) {
w.writeCharacters(IOUtil.slurp(this.cssFile));
} else {
w.writeCharacters("g.maing {stroke:black;stroke-width:0.5px;fill:whitesmoke;font-size:10pt;}\n" + "text.sampleLabel {stroke:none;stroke-width:0.5px;fill:blue;}" + "text.captureLabel {stroke:none;stroke-width:0.5px;fill:slategrey;text-anchor:middle;}" + "polygon.area {stroke:darkgray;stroke-width:0.5px;fill:url('#grad01');}" + "line.linedp {stroke:darkcyan;stroke-width:0.3px;opacity:0.4;}" + "text.linedp {fill-opacity:0.6;font-size:7px;stroke:none;stroke-width:0.5px;fill:darkcyan;}" + "rect.sampleFrame { fill:none;stroke:slategray;stroke-width:0.3px;}" + "rect.clickRgn {fill:none;stroke:none;pointer-events:all;}" + "polyline.gc {stroke:lightcoral;stroke-width:0.3px;fill:none;}" + "polyline.clipping {stroke:orange;stroke-width:0.8px;fill:none;}" + "circle.ar {fill:orange;stroke:none;}" + "circle.aa {fill:red;stroke:none;}" + "circle.rr {fill:green;stroke:none;}");
}
// style
w.writeEndElement();
w.writeStartElement("title");
w.writeCharacters(this.domSvgTitle);
w.writeEndElement();
w.writeStartElement("defs");
// alleles
final double genotype_radius = Double.parseDouble(this.dynaParams.getOrDefault("gt.radius", "1.5"));
w.writeEmptyElement("circle");
w.writeAttribute("r", format(genotype_radius));
w.writeAttribute("id", "rr");
w.writeAttribute("class", "rr");
w.writeEmptyElement("circle");
w.writeAttribute("r", format(genotype_radius));
w.writeAttribute("id", "ar");
w.writeAttribute("class", "ar");
w.writeEmptyElement("circle");
w.writeAttribute("r", format(genotype_radius));
w.writeAttribute("id", "aa");
w.writeAttribute("class", "aa");
// gradient
w.writeStartElement("linearGradient");
w.writeAttribute("id", "grad01");
w.writeAttribute("x1", "50%");
w.writeAttribute("x2", "50%");
w.writeAttribute("y1", "0%");
w.writeAttribute("y2", "100%");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "0%");
w.writeAttribute("style", "stop-color:lightgray;stop-opacity:1;");
w.writeEmptyElement("stop");
w.writeAttribute("offset", "100%");
w.writeAttribute("style", "stop-color:gray;stop-opacity:1;");
w.writeEndElement();
// gc percent
for (final CaptureInterval ci : userIntervals) {
final GenomicSequence genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, ci.getContig());
final int gc_percent_width = (int) ci.getPixelWidth();
final List<Point2D.Double> points = new ArrayList<>(gc_percent_width);
for (int x = 0; x < gc_percent_width; ++x) {
int pos1 = ci.getStart() + (int) (((x + 0) / ci.getPixelWidth()) * ci.getLengthOnReference());
int pos2 = ci.getStart() + (int) (((x + 1) / ci.getPixelWidth()) * ci.getLengthOnReference());
double gc_percent = getGcPercent(genomicSequence, pos1, pos2);
double y = this.sampleTrackHeight - this.sampleTrackHeight * gc_percent;
points.add(new Point2D.Double(x, y));
}
simplifyPoints.accept(points);
w.writeStartElement("polyline");
w.writeAttribute("class", "gc");
w.writeAttribute("id", "z" + ci.getId());
w.writeAttribute("points", points2str.apply(points));
w.writeStartElement("title");
w.writeCharacters("GC %");
w.writeEndElement();
w.writeEndElement();
}
// defs
w.writeEndElement();
w.writeStartElement("script");
final StringBuilder openBrowserFunction = new StringBuilder("function openGenomeBrowser(contig,chromStart,chromEnd) {\n");
if (!this.hyperlinkType.isEmpty()) {
openBrowserFunction.append("var url=\"" + this.hyperlinkType.getPattern() + "\".replace(/__CHROM__/g,contig).replace(/__START__/g,chromStart).replace(/__END__/g,chromEnd);\n");
openBrowserFunction.append("window.open(url,'_blank');\n");
} else {
// nothing
}
openBrowserFunction.append("}\n");
w.writeCData(openBrowserFunction.toString() + "function clicked(evt,contig,chromStart,chromEnd){\n" + " var e = evt.target;\n" + " var dim = e.getBoundingClientRect();\n" + " var x = 1.0 * evt.clientX - dim.left;\n" + " var cLen = 1.0* (chromEnd - chromStart); if(cLen<1) cLen=1.0;\n" + " var pos1 = chromStart + parseInt(((x+0)/dim.width)*cLen);\n" + " var pos2 = chromStart + parseInt(((x+1)/dim.width)*cLen);\n" + " openGenomeBrowser(contig,pos1,pos2);\n" + "}\n");
// script
w.writeEndElement();
w.writeStartElement("g");
w.writeAttribute("class", "maing");
int y = 0;
w.writeStartElement("g");
w.writeComment("interval background");
for (final CaptureInterval ci : userIntervals) {
w.writeStartElement("text");
w.writeAttribute("class", "captureLabel");
w.writeAttribute("textLength", String.valueOf(ci.getPixelWidth() * 0.8));
w.writeAttribute("lengthAdjust", "spacing");
w.writeAttribute("x", String.valueOf(ci.getPixelX1() + ci.getPixelWidth() / 2.0));
w.writeAttribute("y", String.valueOf(bed_header_height - 2));
w.writeCharacters(ci.getName());
w.writeStartElement("title");
w.writeCharacters(ci.toNiceString());
// title
w.writeEndElement();
// text
w.writeEndElement();
w.writeStartElement("rect");
w.writeAttribute("style", "stroke:black;fill:none;");
w.writeAttribute("x", String.valueOf(ci.getPixelX1()));
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(ci.getPixelWidth()));
w.writeAttribute("height", String.valueOf(dim.height));
w.writeStartElement("title");
w.writeCharacters(ci.getName());
w.writeEndElement();
w.writeEndElement();
}
// interval background
w.writeEndElement();
y += bed_header_height;
for (final BamInput bi : this.bamInputs) {
w.writeComment(bi.bamPath.toString());
w.writeStartElement("g");
w.writeAttribute("transform", "translate(0," + y + ")");
if (normalize_on_median_flag) {
final double medianDepth = Math.max(1.0, Percentile.median().evaluate(bi.coverages.stream().flatMapToDouble(B -> Arrays.stream(B)).toArray()).orElse(1.0));
LOG.info("median" + medianDepth);
for (final double[] coverage_array : bi.coverages) {
for (int px = 0; px < coverage_array.length; px++) {
coverage_array[px] /= medianDepth;
}
}
}
final double maxDepth = bi.coverages.stream().flatMapToDouble(B -> Arrays.stream(B)).max().orElse(1.0);
for (int ridx = 0; ridx < userIntervals.size(); ridx++) {
final CaptureInterval ci = userIntervals.get(ridx);
final String clickedAttribute = "clicked(evt,\"" + ci.getContig() + "\"," + ci.getStart() + "," + ci.getEnd() + ")";
final double[] coverage_array = bi.coverages.get(ridx);
final double leftx = ci.getPixelX1();
w.writeStartElement("g");
w.writeAttribute("transform", "translate(" + leftx + ",0)");
final int segment_width = (int) ci.getPixelWidth();
// coverage
{
final List<Point2D.Double> points = new ArrayList<>(segment_width);
points.add(new Point2D.Double(0, bi.getPixelHeight()));
for (int px = 0; px < coverage_array.length; px++) {
final double y_avg_cov = coverage_array[px];
final double new_y = bi.getPixelHeight() - (y_avg_cov / maxDepth) * bi.getPixelHeight();
points.add(new Point2D.Double(px, new_y));
}
points.add(new Point2D.Double(ci.getPixelWidth(), bi.getPixelHeight()));
simplifyPoints.accept(points);
// close
points.add(new Point2D.Double(leftx, bi.getPixelHeight()));
w.writeEmptyElement("polygon");
w.writeAttribute("class", "area");
w.writeAttribute("title", ci.toNiceString());
// w.writeAttribute("onclick", clickedAttribute);
w.writeAttribute("points", points2str.apply(points));
}
// w.writeEndElement();//g
int depthshift = 1;
for (; ; ) {
final int numdiv = (int) (maxDepth / depthshift);
if (numdiv <= 10)
break;
depthshift *= 10;
}
int depth = depthshift;
while (depth < maxDepth) {
double new_y = bi.getPixelHeight() - (depth / maxDepth) * bi.getPixelHeight();
w.writeStartElement("text");
w.writeAttribute("class", "linedp");
w.writeAttribute("x", "1");
w.writeAttribute("y", String.valueOf(new_y + 1));
w.writeCharacters(String.valueOf(depth));
// text
w.writeEndElement();
w.writeStartElement("line");
w.writeAttribute("class", "linedp");
w.writeAttribute("stroke-dasharray", "4");
w.writeAttribute("x1", "0");
w.writeAttribute("x2", String.valueOf(ci.getPixelWidth()));
w.writeAttribute("y1", String.valueOf(new_y));
w.writeAttribute("y2", String.valueOf(new_y));
w.writeStartElement("title");
w.writeCharacters(String.valueOf(depth));
w.writeEndElement();
// line
w.writeEndElement();
depth += depthshift;
}
// polyline
w.writeEmptyElement("use");
w.writeAttribute("xlink", XLINK.NS, "href", "#z" + ci.getId());
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
// click
w.writeStartElement("rect");
w.writeAttribute("class", "clickRgn");
w.writeAttribute("onclick", clickedAttribute);
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(ci.getPixelWidth()));
w.writeAttribute("height", String.valueOf(bi.getPixelHeight()));
w.writeEndElement();
// genotype
if (vcfReader != null) {
try (CloseableIterator<VariantContext> iter = vcfReader.query(ci)) {
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
final Genotype gt = ctx.getGenotype(bi.sample);
if (gt == null)
break;
String allele_id = null;
switch(gt.getType()) {
case HET:
allele_id = "ar";
break;
case HOM_REF:
allele_id = "rr";
break;
case HOM_VAR:
allele_id = "aa";
break;
default:
allele_id = null;
break;
}
if (allele_id != null) {
w.writeEmptyElement("use");
w.writeAttribute("xlink", XLINK.NS, "href", "#" + allele_id);
w.writeAttribute("x", format(((ctx.getStart() - ci.getStart()) / (double) ci.getLengthOnReference()) * ci.getPixelWidth()));
w.writeAttribute("y", format(bi.getPixelHeight() - 2 * genotype_radius));
}
}
}
}
// g
w.writeEndElement();
}
// frame for this sample
w.writeStartElement("rect");
w.writeAttribute("class", "sampleFrame");
w.writeAttribute("x", "0");
w.writeAttribute("y", "0");
w.writeAttribute("width", String.valueOf(dim.width));
w.writeAttribute("height", String.valueOf(bi.getPixelHeight()));
// rect
w.writeEndElement();
w.writeStartElement("text");
w.writeAttribute("class", "sampleLabel");
w.writeAttribute("x", "5");
w.writeAttribute("y", "12");
w.writeStartElement("title");
w.writeCharacters(bi.bamPath.toString());
w.writeEndElement();
w.writeCharacters(bi.sample);
// text
w.writeEndElement();
// g
w.writeEndElement();
y += bi.getPixelHeight();
}
w.writeStartElement("g");
w.writeComment("interval lines");
for (int n = 0; n <= userIntervals.size(); n++) {
w.writeEmptyElement("line");
String x1 = n < userIntervals.size() ? String.valueOf(userIntervals.get(n).getPixelX1()) : String.valueOf(dim.width);
w.writeAttribute("x1", x1);
w.writeAttribute("y1", "0");
w.writeAttribute("x2", x1);
w.writeAttribute("y2", String.valueOf(dim.height));
}
// interval lines
w.writeEndElement();
// g
w.writeEndElement();
// svg
w.writeEndElement();
w.writeEndDocument();
w.flush();
w.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(vcfReader);
CloserUtil.close(w);
CloserUtil.close(fout);
CloserUtil.close(r);
CloserUtil.close(this.indexedFastaSequenceFile);
CloserUtil.close(this.bamInputs);
}
}
Aggregations