Use of com.github.lindenb.jvarkit.tools.structvar.indexcov.IndexCovUtils in project jvarkit by lindenb.
The class DepthAnomaly, method doWork:
@Override
public int doWork(final List<String> args) {
    PrintWriter pw = null;
    try {
        final IndexCovUtils indexCovUtils = new IndexCovUtils(this.treshold);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
        final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().referenceSequence(DepthAnomaly.this.refPath).validationStringency(ValidationStringency.LENIENT);
        final List<Path> inputBams = IOUtils.unrollPaths(this.bamsPath);
        if (inputBams.isEmpty()) {
            LOG.error("input bam file missing.");
            return -1;
        }
        Iterator<? extends Locatable> iter;
        final String input = oneFileOrNull(args);
        if (input == null) {
            final BedLineCodec codec = new BedLineCodec();
            final LineIterator liter = new LineIterator(stdin());
            iter = new AbstractIterator<Locatable>() {
                @Override
                protected Locatable advance() {
                    while (liter.hasNext()) {
                        final String line = liter.next();
                        final BedLine bed = codec.decode(line);
                        if (bed == null) {
                            continue;
                        }
                        return bed;
                    }
                    liter.close();
                    return null;
                }
            };
        } else {
            iter = IntervalListProvider.from(input).dictionary(dict).stream().iterator();
        }
        pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
        while (iter.hasNext()) {
            final SimpleInterval locatable = new SimpleInterval(iter.next());
            boolean found_anomaly_here = false;
            if (this.min_anomaly_length * 3 >= locatable.getLengthOnReference()) {
                LOG.warning("interval " + locatable.toNiceString() + " is too short. skipping.");
                continue;
            }
            final int[] depth = new int[locatable.getLengthOnReference()];
            final int[] copy = new int[depth.length];
            for (final Path path : inputBams) {
                try (SamReader sr = samReaderFactory.open(path)) {
                    final SAMFileHeader header = sr.getFileHeader();
                    final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
                    SequenceUtil.assertSequenceDictionariesEqual(dict, header.getSequenceDictionary());
                    Arrays.fill(depth, 0);
                    try (CloseableIterator<SAMRecord> siter = sr.queryOverlapping(locatable.getContig(), locatable.getStart(), locatable.getEnd())) {
                        while (siter.hasNext()) {
                            final SAMRecord rec = siter.next();
                            if (rec.getReadUnmappedFlag())
                                continue;
                            if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                                continue;
                            int ref = rec.getStart();
                            final Cigar cigar = rec.getCigar();
                            if (cigar == null)
                                continue;
                            for (final CigarElement ce : cigar) {
                                final CigarOperator op = ce.getOperator();
                                final int len = ce.getLength();
                                if (op.consumesReferenceBases()) {
                                    if (op.consumesReadBases()) {
                                        for (int i = 0; i < len; i++) {
                                            final int pos = ref + i;
                                            if (pos < locatable.getStart())
                                                continue;
                                            if (pos > locatable.getEnd())
                                                break;
                                            depth[pos - locatable.getStart()]++;
                                        }
                                    }
                                    ref += len;
                                }
                            } // end cigar loop
                        }
                    } // end SAM iterator
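                    // compute the interval's median depth on a copy; median() presumably reorders its input, and depth[] is still needed for the scan below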
                    System.arraycopy(depth, 0, copy, 0, depth.length);
                    final double median = median(copy);
                    final List<CovInterval> anomalies = new ArrayList<>();
                    // int minDepth = Arrays.stream(depth).min().orElse(0);
                    int x0 = 0;
                    while (x0 < depth.length && median > 0.0) {
                        final int xi = x0;
                        double total = 0;
                        double count = 0;
                        IndexCovUtils.SvType prevType = null;
                        while (x0 < depth.length) {
                            final IndexCovUtils.SvType type;
                            final int depthHere = depth[x0];
                            final double normDepth = depthHere / (median == 0 ? 1.0 : median);
                            if (depthHere > this.max_depth) {
                                type = null;
                            } else {
                                type = indexCovUtils.getType(normDepth);
                            }
                            x0++;
                            if (type == null || !type.isVariant())
                                break;
                            if (prevType != null && !type.equals(prevType))
                                break;
                            if (prevType == null)
                                prevType = type;
                            total += depthHere;
                            count++;
                        }
                        if (prevType != null && count >= this.min_anomaly_length) {
                            anomalies.add(new CovInterval(locatable.getContig(), locatable.getStart() + xi, locatable.getStart() + x0 - 1, prevType, Collections.singletonList(total / count)));
                        }
                    }
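                    // report: merge adjacent anomalies of the same type, draw an optional ASCII depth histogram, then print one row per anomaly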
                    if (!anomalies.isEmpty() || force_screen) {
                        int i = 0;
                        while (i + 1 < anomalies.size() && this.merge_intervals) {
                            final CovInterval loc1 = anomalies.get(i);
                            final CovInterval loc2 = anomalies.get(i + 1);
                            if (loc1.svtype.equals(loc2.svtype) && loc1.withinDistanceOf(loc2, this.min_anomaly_length)) {
                                final List<Double> newdepths = new ArrayList<>(loc1.depths);
                                newdepths.addAll(loc2.depths);
                                anomalies.set(i, new CovInterval(loc1.getContig(), loc1.getStart(), loc2.getEnd(), loc1.svtype, newdepths));
                                anomalies.remove(i + 1);
                            } else {
                                i++;
                            }
                        }
                        if (!found_anomaly_here) {
                            pw.println(">>> " + locatable.toNiceString() + " length:" + StringUtils.niceInt(locatable.getLengthOnReference()));
                            found_anomaly_here = true;
                        }
                        if (screen_width > 0) {
                            pw.print("#");
                            pw.print(String.format("%-15s", sample));
                            pw.print("[");
                            for (i = 0; i < screen_width; i++) {
                                double t = 0;
                                double n = 0;
                                final int x1 = (int) (((i + 0) / (double) screen_width) * depth.length);
                                final int x2 = (int) (((i + 1) / (double) screen_width) * depth.length);
                                for (int x3 = x1; x3 <= x2 && x3 < depth.length; ++x3) {
                                    // average the depth over this screen column
                                    t += depth[x3];
                                    n++;
                                }
                                double normDepth = t / n;
                                if (median > 0)
                                    normDepth /= median;
                                // centered
                                normDepth /= 2.0;
                                final boolean is_anomaly = anomalies.stream().anyMatch(R -> CoordMath.overlaps(R.getStart(), R.getEnd(), locatable.getStart() + x1, locatable.getStart() + x2));
                                final AnsiUtils.AnsiColor color = is_anomaly ? AnsiColor.RED : null;
                                if (color != null)
                                    pw.print(color.begin());
                                pw.print(AnsiUtils.getHistogram(normDepth));
                                if (color != null)
                                    pw.print(color.end());
                            }
                            pw.print("]");
                            pw.println();
                        }
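                        // one tab-delimited row per anomaly: contig, 0-based start, end, length, type, sample, bam path, rank, count, interval, median depth, mean anomaly depth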
                        for (i = 0; i < anomalies.size(); i++) {
                            final CovInterval anomalie = anomalies.get(i);
                            pw.print(anomalie.getContig());
                            pw.print("\t");
                            pw.print(anomalie.getStart() - 1);
                            pw.print("\t");
                            pw.print(anomalie.getEnd());
                            pw.print("\t");
                            pw.print(anomalie.getLengthOnReference());
                            pw.print("\t");
                            pw.print(anomalie.svtype.name());
                            pw.print("\t");
                            pw.print(sample);
                            pw.print("\t");
                            pw.print(path);
                            pw.print("\t");
                            pw.print(i + 1);
                            pw.print("\t");
                            pw.print(anomalies.size());
                            pw.print("\t");
                            pw.print(locatable.toNiceString());
                            pw.print("\t");
                            pw.print((int) median);
                            pw.print("\t");
                            pw.print((int) anomalie.depths.stream().mapToDouble(X -> X.doubleValue()).average().orElse(0));
                            pw.print("\t");
                            pw.println();
                        }
                    }
                }
            }
            if (found_anomaly_here) {
                pw.println("<<< " + locatable.toNiceString() + " length:" + StringUtils.niceInt(locatable.getLengthOnReference()));
                pw.println();
            }
        } // end while iter
        pw.flush();
        pw.close();
        pw = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(pw);
    }
}
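Both methods reduce raw coverage to a normalized depth (observed depth divided by a median) and delegate the copy-number call to IndexCovUtils. A minimal standalone sketch of that classification step, using only the constructor, getType() and SvType.isVariant() calls visible above; the 0.05 treshold, the class name and the sample depths are illustrative assumptions, not values taken from either tool:

import com.github.lindenb.jvarkit.tools.structvar.indexcov.IndexCovUtils;

public class ClassifyDepthSketch {
    public static void main(final String[] args) {
        // illustrative treshold; both tools read theirs from a command-line option
        final IndexCovUtils indexCovUtils = new IndexCovUtils(0.05);
        // assumed median depth of the surrounding interval or contig
        final double median = 30.0;
        for (final int rawDepth : new int[] { 30, 15, 0, 45, 60 }) {
            final double normDepth = rawDepth / median;
            final IndexCovUtils.SvType type = indexCovUtils.getType(normDepth);
            // isVariant() is false for REF and AMBIGOUS calls (enum spelling as in IndexCovUtils)
            System.out.println(rawDepth + "\t" + normDepth + "\t" + type + "\t" + type.isVariant());
        }
    }
}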
Use of com.github.lindenb.jvarkit.tools.structvar.indexcov.IndexCovUtils in project jvarkit by lindenb.
The class CoverageMatrix, method doWork:
@Override
public int doWork(final List<String> args) {
    VariantContextWriter w = null;
    try {
        final IndexCovUtils indexCovUtils = new IndexCovUtils(this.indexCovTreshold);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
        final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().referenceSequence(CoverageMatrix.this.refPath).validationStringency(ValidationStringency.LENIENT);
        final List<Path> inputBams = IOUtils.unrollPaths(args);
        if (inputBams.size() < 3) {
            LOG.error("not enough input bam files defined.");
            return -1;
        }
        final Set<String> sampleSet = new TreeSet<>();
        final List<String> idx2samples = new ArrayList<>(inputBams.size());
        for (final Path path : inputBams) {
            try (SamReader sr = samReaderFactory.open(path)) {
                final SAMFileHeader header = sr.getFileHeader();
                final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
                if (sampleSet.contains(sample)) {
                    LOG.error("duplicate sample " + sample);
                    return -1;
                }
                sampleSet.add(sample);
                idx2samples.add(sample);
            }
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        w = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
        final VCFFormatHeaderLine fmtNormDepth = new VCFFormatHeaderLine("D", 1, VCFHeaderLineType.Float, "norm Depth");
        metaData.add(fmtNormDepth);
        final VCFFormatHeaderLine fmtStdDev = new VCFFormatHeaderLine("STDDEV", 1, VCFHeaderLineType.Float, "standard deviation");
        metaData.add(fmtStdDev);
        final VCFInfoHeaderLine infoStdDev = new VCFInfoHeaderLine(fmtStdDev.getID(), 1, VCFHeaderLineType.Float, "standard deviation");
        metaData.add(infoStdDev);
        final VCFInfoHeaderLine infoMedianD = new VCFInfoHeaderLine("MEDIAN", 1, VCFHeaderLineType.Float, "median depth");
        metaData.add(infoMedianD);
        final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "number of samples");
        metaData.add(infoNSamples);
        final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples");
        metaData.add(infoSamples);
        final VCFFilterHeaderLine filterAll = new VCFFilterHeaderLine("ALL_AFFECTED", "All Samples carry a variant");
        metaData.add(filterAll);
        VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        final VCFHeader vcfheader = new VCFHeader(metaData, idx2samples);
        vcfheader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, vcfheader);
        w.writeHeader(vcfheader);
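        // process one contig at a time: a per-base depth array over the whole contig, one candidate VCF record per bin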
        for (final SAMSequenceRecord ssr : dict.getSequences()) {
            if (!StringUtils.isBlank(restrictContig) && !restrictContig.equals(ssr.getSequenceName()))
                continue;
            final int[] depth = new int[ssr.getSequenceLength()];
            final BitSet blackListedPositions = new BitSet(depth.length);
            // fill black listed regions
            if (this.blackListedPath != null) {
                try (TabixReader tbr = new TabixReader(this.blackListedPath.toString())) {
                    final ContigNameConverter cvt = ContigNameConverter.fromContigSet(tbr.getChromosomes());
                    final String ctg = cvt.apply(ssr.getSequenceName());
                    if (!StringUtils.isBlank(ctg)) {
                        final BedLineCodec codec = new BedLineCodec();
                        final TabixReader.Iterator tbxr = tbr.query(ctg, 1, ssr.getSequenceLength());
                        for (;;) {
                            final String line = tbxr.next();
                            if (line == null)
                                break;
                            final BedLine bed = codec.decode(line);
                            if (bed == null)
                                continue;
                            int p1 = Math.max(bed.getStart(), 1);
                            while (p1 <= ssr.getSequenceLength() && p1 <= bed.getEnd()) {
                                blackListedPositions.set(p1 - 1);
                                ++p1;
                            }
                        }
                    }
                } catch (Throwable err) {
                    LOG.warn(err);
                }
            }
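            // one CovItem per (bin, sample); items are sorted by position so the bins can be regrouped across samples below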
            final SortingCollection<CovItem> sorter = SortingCollection.newInstance(CovItem.class, new CovItemCodec(), (A, B) -> A.compare0(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
            for (int bam_idx = 0; bam_idx < inputBams.size(); ++bam_idx) {
                final Path path = inputBams.get(bam_idx);
                LOG.info(ssr.getContig() + ":" + path + " " + bam_idx + "/" + inputBams.size());
                try (SamReader sr = samReaderFactory.open(path)) {
                    final SAMFileHeader header = sr.getFileHeader();
                    SequenceUtil.assertSequenceDictionariesEqual(dict, header.getSequenceDictionary());
                    Arrays.fill(depth, 0);
                    try (CloseableIterator<SAMRecord> siter = sr.queryOverlapping(ssr.getContig(), 1, ssr.getLengthOnReference())) {
                        while (siter.hasNext()) {
                            final SAMRecord rec = siter.next();
                            if (rec.getReadUnmappedFlag())
                                continue;
                            if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
                                continue;
                            int ref = rec.getStart();
                            final Cigar cigar = rec.getCigar();
                            if (cigar == null)
                                continue;
                            for (final CigarElement ce : cigar) {
                                final CigarOperator op = ce.getOperator();
                                final int len = ce.getLength();
                                if (op.consumesReferenceBases()) {
                                    if (op.consumesReadBases()) {
                                        for (int i = 0; i < len; i++) {
                                            final int pos = ref + i;
                                            if (pos < 1)
                                                continue;
                                            if (pos > ssr.getLengthOnReference())
                                                break;
                                            depth[pos - 1]++;
                                        }
                                    }
                                    ref += len;
                                }
                            } // end cigar loop
                        }
                    } // end SAM iterator
                    final DiscreteMedian<Integer> discreteMedian = new DiscreteMedian<>();
                    int pos = 0;
                    while (pos < depth.length) {
                        if (!blackListedPositions.get(pos) && depth[pos] <= this.max_depth) {
                            discreteMedian.add(depth[pos]);
                        }
                        ++pos;
                    }
                    final double median = discreteMedian.getMedian().orElse(1.0);
                    LOG.info(idx2samples.get(bam_idx) + " :" + ssr.getSequenceName() + " median depth:" + median);
                    final DiscreteMedian<Integer> localMedian = new DiscreteMedian<>();
                    pos = 0;
                    while (pos < depth.length) {
                        if (blackListedPositions.get(pos)) { /* blacklist only, not max depth */
                            ++pos;
                            continue;
                        }
                        int pos2 = pos;
                        localMedian.clear();
                        while (pos2 - pos < this.bin_size && pos2 < depth.length && !blackListedPositions.get(pos2)) {
                            // consider this.max_depth here?
                            localMedian.add(depth[pos2]);
                            ++pos2;
                        }
                        if (pos2 - pos == this.bin_size) {
                            final double localMed = localMedian.getMedian().orElse(0.0);
                            final CovItem item = new CovItem();
                            item.pos = pos;
                            item.sample_idx = bam_idx;
                            item.depth = (float) (localMed / median);
                            item.stddev = (float) localMedian.getStandardDeviation().orElse(-1.0);
                            sorter.add(item);
                        }
                        pos = pos2;
                    }
                } // end SamReader
            } // end loop over bams
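            // all BAMs scanned for this contig: regroup the sorted items per bin and emit one VCF record per bin with at least one affected sample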
            sorter.doneAdding();
            sorter.setDestructiveIteration(true);
            final CloseableIterator<CovItem> iter = sorter.iterator();
            final EqualRangeIterator<CovItem> iter2 = new EqualRangeIterator<>(iter, (A, B) -> Integer.compare(A.pos, B.pos));
            final Allele REF = Allele.create("N", true);
            final Allele DEL = Allele.create("<DEL>", false);
            final Allele DUP = Allele.create("<DUP>", false);
            while (iter2.hasNext()) {
                final List<CovItem> list = iter2.next();
                final CovItem first = list.get(0);
                final double avg_depth = list.stream().mapToDouble(F -> F.depth).average().orElse(0);
                final double sum = list.stream().mapToDouble(F -> F.depth).map(D -> Math.pow(D - avg_depth, 2.0)).sum();
                final double stdDev = Math.sqrt(sum / list.size());
                final OptionalDouble optMedianOfmedian = Percentile.median().evaluate(list.stream().mapToDouble(I -> I.depth));
                final double medianOfmedian = optMedianOfmedian.orElse(1.0);
                if (medianOfmedian <= 0)
                    continue;
                for (int i = 0; i < list.size(); i++) {
                    list.get(i).depth /= medianOfmedian;
                }
                if (list.stream().allMatch(F -> Float.isNaN(F.depth) || Float.isInfinite(F.depth)))
                    continue;
                final VariantContextBuilder vcb = new VariantContextBuilder();
                vcb.chr(ssr.getContig());
                vcb.start(first.pos + 1);
                vcb.stop(first.pos + this.bin_size);
                vcb.attribute(VCFConstants.END_KEY, first.pos + this.bin_size);
                vcb.attribute(infoStdDev.getID(), stdDev);
                vcb.attribute(infoMedianD.getID(), medianOfmedian);
                final Set<Allele> alleles = new HashSet<>();
                alleles.add(REF);
                final List<Genotype> genotypes = new ArrayList<>(list.size());
                final Set<String> affected = new TreeSet<>();
                for (int i = 0; i < list.size(); i++) {
                    final CovItem item = list.get(i);
                    final String sn = idx2samples.get(item.sample_idx);
                    final GenotypeBuilder gb;
                    switch (indexCovUtils.getType(item.depth)) {
                        case AMBIGOUS:
                            gb = new GenotypeBuilder(sn, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
                            break;
                        case HET_DEL:
                            alleles.add(DEL);
                            gb = new GenotypeBuilder(sn, Arrays.asList(REF, DEL));
                            affected.add(sn);
                            break;
                        case HOM_DEL:
                            alleles.add(DEL);
                            gb = new GenotypeBuilder(sn, Arrays.asList(DEL, DEL));
                            affected.add(sn);
                            break;
                        case HET_DUP:
                            alleles.add(DUP);
                            gb = new GenotypeBuilder(sn, Arrays.asList(REF, DUP));
                            affected.add(sn);
                            break;
                        case HOM_DUP:
                            alleles.add(DUP);
                            gb = new GenotypeBuilder(sn, Arrays.asList(DUP, DUP));
                            affected.add(sn);
                            break;
                        case REF:
                            gb = new GenotypeBuilder(sn, Arrays.asList(REF, REF));
                            break;
                        default:
                            throw new IllegalStateException();
                    }
                    gb.attribute(fmtNormDepth.getID(), item.depth);
                    gb.attribute(fmtStdDev.getID(), item.stddev);
                    genotypes.add(gb.make());
                }
                if (affected.isEmpty())
                    continue;
                if (affected.size() == inputBams.size()) {
                    vcb.filter(filterAll.getID());
                } else {
                    vcb.passFilters();
                }
                vcb.attribute(infoSamples.getID(), new ArrayList<>(affected));
                vcb.attribute(infoNSamples.getID(), affected.size());
                vcb.genotypes(genotypes);
                vcb.alleles(alleles);
                w.add(vcb.make());
            }
            iter2.close();
            iter.close();
            sorter.cleanup();
            System.gc();
        } // end loop over contigs
        w.close();
        w = null;
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
    }
}
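Each genotype's D value is the result of a two-stage normalization: a bin's local median is first divided by the sample's contig-wide median (item.depth above), then by the median of those values across samples (medianOfmedian), so a copy-neutral bin lands near 1.0, a heterozygous deletion near 0.5 and a duplication near 1.5. A small self-contained sketch of that second stage and the accompanying standard deviation, mirroring the arithmetic at the top of the emit loop; the class name and per-sample values are invented for illustration:

import java.util.Arrays;

public class BinNormalizationSketch {
    public static void main(final String[] args) {
        // per-sample depths for one bin, already divided by each sample's contig median (invented values)
        final double[] depths = { 1.02, 0.98, 0.51, 1.05 };
        final double avg = Arrays.stream(depths).average().orElse(0);
        // population standard deviation, as in the stdDev computation above
        final double stdDev = Math.sqrt(Arrays.stream(depths).map(d -> Math.pow(d - avg, 2.0)).sum() / depths.length);
        final double[] sorted = depths.clone();
        Arrays.sort(sorted);
        // median of an even-sized list: mean of the two middle values
        final double medianOfmedian = (sorted[sorted.length / 2 - 1] + sorted[sorted.length / 2]) / 2.0;
        for (int i = 0; i < depths.length; i++) {
            // after this rescaling, ~1.0 suggests two copies, ~0.5 a heterozygous deletion, ~1.5 a duplication
            System.out.printf("sample %d: norm depth %.3f%n", i, depths[i] / medianOfmedian);
        }
        System.out.printf("stddev=%.3f medianOfmedian=%.3f%n", stdDev, medianOfmedian);
    }
}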