Use of com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent in project jvarkit by lindenb: the doWork method of the class CopyNumber01. The method tiles the reference (or a BED file) into windows, computes each window's GC%, measures per-window depth in one or more indexed BAMs, normalizes depth on GC% and on the sample median, smooths the result, and writes one tab-delimited row per window.
@Override
public int doWork(final List<String> args) {
    ReferenceSequenceFile indexedFastaSequenceFile = null;
    try {
        this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
        /* load the indexed FASTA reference */
        indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
        final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
        final List<Locatable> intervals = new ArrayList<>();
        if (this.bedFile == null) {
            for (final Locatable loc : dict.getSequences()) {
                intervals.add(loc);
            }
        } else {
            final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
            try (BedLineReader br = new BedLineReader(this.bedFile)) {
                br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
                    final String ctg = converter.apply(B.getContig());
                    intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
                });
            }
        }
        if (intervals.isEmpty()) {
            LOG.error("no interval defined.");
            return -1;
        }
        LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
        final List<GCAndDepth> user_items = new ArrayList<>();
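        // tile each interval into windows: a window spans at most windowSize bases and
        // successive starts are windowShift apart (windows overlap when windowShift < windowSize);
        // the loop stops at the first window shorter than windowMin, dropping the interval's tail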
        // split intervals
        for (final Locatable loc : intervals) {
            int pos = loc.getStart();
            while (pos < loc.getEnd()) {
                final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
                final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
                if (dataRow.getLengthOnReference() < this.windowMin) {
                    break;
                }
                user_items.add(dataRow);
                pos += this.windowShift;
            }
        }
        // free memory
        intervals.clear();
        LOG.info("sorting N=" + user_items.size());
        Collections.sort(user_items, locComparator);
        // fill gc percent
        LOG.info("fill gc% N=" + user_items.size());
        for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
            final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
            for (final GCAndDepth dataRow : user_items) {
                if (!dataRow.getContig().equals(ctg))
                    continue;
                final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
                if (gc.isEmpty())
                    continue;
                dataRow.gc = gc.getGCPercent();
            }
        }
        // remove windows with outlier GC content (outside [minGC, maxGC])
        user_items.removeIf(B -> B.gc < this.minGC);
        user_items.removeIf(B -> B.gc > this.maxGC);
        LOG.info("remove high/low gc% N=" + user_items.size());
        if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
            LOG.error("All chromosomes are defined as sexual. Cannot normalize");
            return -1;
        }
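        // sex chromosomes (sexContigStr) are excluded from the GC/depth model below,
        // since their expected copy number depends on the sample's sex; at least one
        // non-sexual contig is therefore required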
        final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
        try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            /* header */
            pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
            for (final Path bamPath : IOUtils.unrollPaths(args)) {
                // open this samReader
                try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
                    if (!samReader.hasIndex()) {
                        LOG.error("file is not indexed " + bamPath);
                        return -1;
                    }
                    final SAMFileHeader header = samReader.getFileHeader();
                    SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
                    final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
                    final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
                    /* loop over contigs */
                    for (final SAMSequenceRecord ssr : dict.getSequences()) {
                        /* create a **COPY** of the intervals */
                        final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
                        if (ctgitems.isEmpty())
                            continue;
                        LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
                        // get coverage
                        final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
                        // fill coverage
                        for (final GCAndDepth gc : ctgitems) {
                            final OptionalDouble optCov;
                            switch (this.univariateDepth) {
                                case median:
                                    optCov = coverage.getMedian(gc);
                                    break;
                                case mean:
                                    optCov = coverage.getAverage(gc);
                                    break;
                                default:
                                    throw new IllegalStateException();
                            }
                            gc.raw_depth = optCov.orElse(-1.0);
                            gc.norm_depth = gc.raw_depth;
                        }
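                        // drop windows with no computed coverage (raw_depth < 0) or with
                        // depth outside [weirdMinDepth, weirdMaxDepth]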
                        ctgitems.removeIf(V -> V.raw_depth < 0);
                        ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
                        ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
                        if (ctgitems.isEmpty())
                            continue;
                        bam_items.addAll(ctgitems);
                    }
                    double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
                    LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
                    Collections.sort(bam_items, (a, b) -> {
                        final int i = Double.compare(a.getX(), b.getX());
                        if (i != 0)
                            return i;
                        return Double.compare(a.getY(), b.getY());
                    });
                    double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
                    y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
                    // get min GC
                    final double min_x = x[0];
                    // get max GC
                    final double max_x = x[x.length - 1];
                    LOG.info("min/max gc " + min_x + " " + max_x);
                    /* merge adjacent x (GC%) having the same value */
                    int i = 0;
                    int k = 0;
                    while (i < x.length) {
                        int j = i + 1;
                        while (j < x.length && Precision.equals(x[i], x[j])) {
                            ++j;
                        }
                        x[k] = x[i];
                        y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
                        ++k;
                        i = j;
                    }
                    LOG.info("merged n=" + (x.length - k) + " items.");
                    /* reduce the size of x and y */
                    final List<XY> xyL = new ArrayList<>(k);
                    for (int t = 0; t < k; ++t) {
                        xyL.add(new XYImpl(x[t], y[t]));
                    }
                    /* sort on X (GC%), then on Y */
                    Collections.sort(xyL, (a, b) -> {
                        final int d = Double.compare(a.getX(), b.getX());
                        if (d != 0)
                            return d;
                        return Double.compare(a.getY(), b.getY());
                    });
                    x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
                    y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
                    final UnivariateInterpolator interpolator = createInterpolator();
                    UnivariateFunction spline = null;
                    try {
                        spline = interpolator.interpolate(x, y);
                    } catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
                        spline = null;
                        LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
                    }
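                    // if interpolation failed (e.g. too few distinct GC values), spline stays
                    // null and the loop below keeps every window with norm_depth = raw_depth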
                    // GC-normalization: drop windows whose GC% lies outside the fitted range,
                    // otherwise set norm_depth from the spline
                    int points_removed = 0;
                    i = 0;
                    while (i < bam_items.size()) {
                        final GCAndDepth r = bam_items.get(i);
                        if (spline == null) {
                            ++i;
                        } else if (r.getX() < min_x || r.getX() > max_x) {
                            bam_items.remove(i);
                            ++points_removed;
                        } else {
                            final double norm;
                            if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
                                norm = r.getY();
                            } else {
                                norm = spline.value(r.getX());
                            }
                            if (Double.isNaN(norm) || Double.isInfinite(norm)) {
                                LOG.info("NAN " + r);
                                bam_items.remove(i);
                                ++points_removed;
                                continue;
                            }
                            r.norm_depth = norm;
                            ++i;
                        }
                    }
                    LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
                    if (bam_items.isEmpty())
                        continue;
                    spline = null;
                    // DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
                    // normalize on median
                    y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
                    final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
                    LOG.info("median norm depth : " + median_depth);
                    for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        gc.norm_depth /= median_depth;
                    }
                    // restore genomic order
                    Collections.sort(bam_items, locComparator);
                    // smooth values with their neighbours
                    y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
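                    // for each window, take up to smooth_window neighbours on each side that
                    // lie within smoothDistance, and replace the value with the univariateSmooth
                    // statistic of that neighbourhood; y was snapshotted above, so smoothing
                    // always reads the un-smoothed values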
                    for (i = 0; i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        int left = i;
                        for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                continue;
                            left = j;
                            break;
                        }
                        int right = i;
                        for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                break;
                            right = j;
                            // no break
                        }
                        gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
                    }
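                    // one row per window: START is written 0-based (BED-like), IDX is the
                    // 0-based genome-wide index of the window start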
                    /* print data */
                    for (final GCAndDepth r : bam_items) {
                        pw.print(r.getContig());
                        pw.print('\t');
                        pw.print(r.getStart() - 1);
                        pw.print('\t');
                        pw.print(r.getEnd());
                        pw.print('\t');
                        pw.print(sampleName);
                        pw.print('\t');
                        pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
                        pw.print('\t');
                        pw.printf("%.3f", r.gc);
                        pw.print('\t');
                        pw.printf("%.3f", r.raw_depth);
                        pw.print('\t');
                        pw.printf("%.3f", r.norm_depth);
                        pw.println();
                    }
                    pw.flush();
                } // samReader
            } // end loop bamPath
            pw.flush();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(indexedFastaSequenceFile);
    }
}
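For reference, here is a minimal, self-contained sketch of the GenomicSequence.GCPercent usage that this page documents, outside of CopyNumber01. It assumes only the API visible in the listing above (a GenomicSequence built from an htsjdk ReferenceSequenceFile, and getGCPercent(start, end) returning a GCPercent with isEmpty() and getGCPercent()); the FASTA path, contig name and coordinates are placeholders.

import com.github.lindenb.jvarkit.util.picard.GenomicSequence;
import com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import java.nio.file.Paths;

public class GCPercentDemo {
    public static void main(final String[] args) throws Exception {
        // placeholder reference; any indexed FASTA containing a "chr1" contig would do
        try (ReferenceSequenceFile ref =
                ReferenceSequenceFileFactory.getReferenceSequenceFile(Paths.get("ref.fa"))) {
            final GenomicSequence gseq = new GenomicSequence(ref, "chr1");
            // GC content of a single window, using the same start/end convention
            // as the CopyNumber01 loop above
            final GCPercent gc = gseq.getGCPercent(10_001, 11_000);
            if (!gc.isEmpty()) {
                System.out.printf("GC%% = %.3f%n", gc.getGCPercent());
            }
        }
    }
}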