use of com.github.lindenb.jvarkit.math.DiscreteMedian in project jvarkit by lindenb.
the class BamStats05 method doWork.
protected int doWork(final PrintWriter pw, final Map<String, List<SimpleInterval>> gene2interval, final String filename, final SamReader IN) throws Exception {
try {
LOG.info("Scanning " + filename);
final SAMFileHeader header = IN.getFileHeader();
final List<SAMReadGroupRecord> rgs = header.getReadGroups();
if (rgs == null || rgs.isEmpty())
throw new IOException("No read groups in " + filename);
final Set<String> groupNames = this.groupBy.getPartitions(rgs);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mapq).setPartition(this.groupBy).setRecordFilter(R -> !filter.filterOut(R));
for (final String partition : groupNames) {
if (StringUtils.isBlank(partition))
throw new IOException("Empty read group: " + groupBy.name() + " for " + filename);
for (final String gene : gene2interval.keySet()) {
final List<Integer> counts = new ArrayList<>();
final List<SimpleInterval> intervals = gene2interval.get(gene);
final String newContig = contigNameConverter.apply(intervals.get(0).getContig());
if (StringUtil.isBlank(newContig)) {
throw new JvarkitException.ContigNotFoundInDictionary(intervals.get(0).getContig(), dict);
}
final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(IN, intervals.stream().map(R -> R.renameContig(newContig)).collect(Collectors.toList()), partition);
for (final SimpleInterval interval : intervals) {
for (int i = interval.getStart(); i <= interval.getEnd() && i <= coverage.getEnd(); i++) {
final int d = coverage.get(i - coverage.getStart());
counts.add(d);
}
}
Collections.sort(counts);
pw.print(intervals.get(0).getContig() + "\t" + (coverage.getStart() - 1) + "\t" + coverage.getEnd() + "\t" + gene + "\t" + partition + "\t" + intervals.size() + "\t" + counts.size() + "\t" + counts.get(0) + "\t" + counts.get(counts.size() - 1));
for (final int mc : this.min_coverages) {
final DiscreteMedian<Integer> discreteMedian = new DiscreteMedian<>();
int count_no_coverage = 0;
for (int cov : counts) {
// 'no coverage' counts positions at or below the current threshold mc
if (cov <= mc)
++count_no_coverage;
discreteMedian.add(cov);
}
final OptionalDouble average = discreteMedian.getAverage();
final OptionalDouble median = discreteMedian.getMedian();
pw.print("\t" + (average.isPresent() ? String.format("%.2f", average.orElse(0.0)) : ".") + "\t" + (median.isPresent() ? String.format("%.2f", median.orElse(-1.0)) : ".") + "\t" + count_no_coverage + "\t" + (int) (((counts.size() - count_no_coverage) / (double) counts.size()) * 100.0));
}
pw.println();
}
// end gene
}
// end sample
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(IN);
}
}
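The pattern above reduces to a small core: feed integer depths into a DiscreteMedian<Integer>, then read back getAverage() and getMedian(), both of which return an OptionalDouble that is empty when nothing was added. A minimal standalone sketch using only the methods seen in the snippet; the depth values and threshold are invented for illustration, and jvarkit is assumed on the classpath:

import com.github.lindenb.jvarkit.math.DiscreteMedian;
import java.util.OptionalDouble;

public class DiscreteMedianDemo {
    public static void main(final String[] args) {
        final DiscreteMedian<Integer> dm = new DiscreteMedian<>();
        final int minCoverage = 10; // invented threshold, plays the role of 'mc' above
        int countBelow = 0;
        for (final int depth : new int[] { 0, 4, 12, 33, 12, 7 }) { // invented depths
            if (depth <= minCoverage) ++countBelow;
            dm.add(depth);
        }
        // both accessors return OptionalDouble: empty when no value was added
        final OptionalDouble average = dm.getAverage();
        final OptionalDouble median = dm.getMedian();
        System.out.println("average=" + (average.isPresent() ? String.format("%.2f", average.getAsDouble()) : "."));
        System.out.println("median=" + (median.isPresent() ? String.format("%.2f", median.getAsDouble()) : "."));
        System.out.println("positions at or below " + minCoverage + ": " + countBelow);
    }
}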
use of com.github.lindenb.jvarkit.math.DiscreteMedian in project jvarkit by lindenb.
the class SetFileTools method doStats.
/**
* statistics for setFile
*/
private int doStats(final List<String> args) throws IOException {
final Counter<String> chrom2count = new Counter<>();
final DiscreteMedian<Integer> d_size = new DiscreteMedian<>();
final DiscreteMedian<Integer> d_nitems = new DiscreteMedian<>();
final DiscreteMedian<Integer> d_distance = new DiscreteMedian<>();
final DiscreteMedian<Integer> d_item_size = new DiscreteMedian<>();
for (final SAMSequenceRecord ssr : this.theDict.getSequences()) {
chrom2count.initializeIfNotExists(noChr(ssr.getSequenceName()));
}
chrom2count.initializeIfNotExists("*multiple*");
chrom2count.initializeIfNotExists("*empty*");
try (CloseableIterator<SetFileRecord> iter = openSetFileIterator(args)) {
while (iter.hasNext()) {
final SetFileRecord rec = iter.next();
final Set<String> chroms = rec.getChromosomes();
switch(chroms.size()) {
case 0:
chrom2count.incr("*empty*");
break;
case 1:
chrom2count.incr(noChr(chroms.iterator().next()));
break;
default:
chrom2count.incr("*multiple*");
break;
}
int len = rec.stream().mapToInt(B -> B.getLengthOnReference()).sum();
d_size.add(len);
d_nitems.add(rec.size());
if (rec.size() > 0) {
len = len / rec.size();
d_item_size.add(len);
}
if (rec.size() > 1 && chroms.size() == 1) {
final List<Locatable> L = sortAndMerge(rec);
// merging may collapse everything into a single interval: guard against a division by zero
if (L.size() > 1) {
int d = 0;
for (int i = 0; i + 1 < L.size(); i++) {
// gap between consecutive sorted/merged intervals
d += (L.get(i + 1).getStart() - L.get(i).getEnd());
}
d = d / (L.size() - 1);
d_distance.add(d);
}
}
}
}
try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
for (final String key : chrom2count.keySetDecreasing()) {
pw.println("C\trecords-per-chrom\t" + key + "\t" + chrom2count.count(key));
}
pw.println("AS\taverage-size\t" + (d_size.isEmpty() ? "." : String.valueOf(d_size.getAverage().orElse(0.0))));
pw.println("MS\tmedian-size\t" + (d_size.isEmpty() ? "." : String.valueOf(d_size.getMedian().orElse(0.0))));
pw.println("AIS\taverage-item-size\t" + (d_item_size.isEmpty() ? "." : String.valueOf(d_item_size.getAverage().orElse(0.0))));
pw.println("MIS\tmedian-item-size\t" + (d_item_size.isEmpty() ? "." : String.valueOf(d_item_size.getMedian().orElse(0.0))));
pw.println("AN\taverage-nitems\t" + (d_nitems.isEmpty() ? "." : String.valueOf(d_nitems.getAverage().orElse(0.0))));
pw.println("MN\tmedian-nitems\t" + (d_nitems.isEmpty() ? "." : String.valueOf(d_nitems.getMedian().orElse(0.0))));
pw.println("AD\taverage-distance-between-items\t" + (d_distance.isEmpty() ? "." : String.valueOf(d_distance.getAverage().orElse(0.0))));
pw.println("MD\tmedian-distance-between-items\t" + (d_distance.isEmpty() ? "." : String.valueOf(d_distance.getMedian().orElse(0.0))));
pw.flush();
}
return 0;
}
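The mean-gap computation feeding d_distance can be isolated as below. To keep the sketch self-contained, the sorted/merged intervals that sortAndMerge(rec) would return are modeled as plain {start, end} pairs; all values are invented:

import com.github.lindenb.jvarkit.math.DiscreteMedian;

public class MeanGapDemo {
    public static void main(final String[] args) {
        final DiscreteMedian<Integer> dDistance = new DiscreteMedian<>();
        // stand-in for sortAndMerge(rec): already sorted, non-overlapping, one chromosome
        final int[][] merged = { { 100, 200 }, { 500, 600 }, { 900, 950 } }; // invented
        if (merged.length > 1) {
            int gapSum = 0;
            for (int i = 0; i + 1 < merged.length; i++) {
                // gap between the end of one interval and the start of the next
                gapSum += merged[i + 1][0] - merged[i][1];
            }
            dDistance.add(gapSum / (merged.length - 1)); // mean gap for this record
        }
        System.out.println("median of mean gaps: " + dDistance.getMedian().orElse(0.0));
    }
}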
use of com.github.lindenb.jvarkit.math.DiscreteMedian in project jvarkit by lindenb.
the class SamReadLengthDistribution method scan.
private void scan(final SamReader in, Path pathName) throws IOException {
final String defName = (pathName == null ? "STDIN" : pathName.toString()) + "#" + this.partition.name();
in.getFileHeader().getReadGroups().stream().map(RG -> this.partition.apply(RG)).map(S -> StringUtils.isBlank(S) ? defName : S).filter(S -> !this.sample2discreteMedian.containsKey(S)).forEach(S -> this.sample2discreteMedian.put(S, new DiscreteMedian<Integer>()));
final CloseableIterator<SAMRecord> iter = openSamIterator(in);
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadFailsVendorQualityCheckFlag())
continue;
if (rec.getDuplicateReadFlag())
continue;
if (rec.isSecondaryOrSupplementary())
continue;
if (!rec.getReadUnmappedFlag() && rec.getMappingQuality() < this.mapq)
continue;
final String sampleName = this.partition.getPartion(rec, defName);
DiscreteMedian<Integer> counter = this.sample2discreteMedian.get(sampleName);
if (counter == null) {
counter = new DiscreteMedian<>();
this.sample2discreteMedian.put(sampleName, counter);
}
final int len;
switch(this.method) {
case SEQ_LENGTH:
len = rec.getReadLength();
break;
case CIGAR_REF_LENGTH:
{
if (rec.getReadUnmappedFlag())
continue;
final Cigar c = rec.getCigar();
if (c == null || c.isEmpty())
continue;
len = c.getReferenceLength();
break;
}
case CIGAR_PADDED_REF_LENGTH:
{
if (rec.getReadUnmappedFlag())
continue;
final Cigar c = rec.getCigar();
if (c == null || c.isEmpty())
continue;
len = c.getPaddedReferenceLength();
break;
}
case INSERT_LENGTH:
{
if (rec.getReadUnmappedFlag())
continue;
if (!rec.getReadPairedFlag())
continue;
if (rec.getMateUnmappedFlag())
continue;
if (!rec.getContig().equals(rec.getMateReferenceName()))
continue;
// count each pair once: skip the second-in-pair read
if (!rec.getFirstOfPairFlag())
continue;
len = Math.abs(rec.getInferredInsertSize());
break;
}
default:
throw new IllegalStateException("unsupported method " + this.method);
}
counter.add(len);
}
iter.close();
}
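The get-or-create step for sample2discreteMedian above can also be written with Map.computeIfAbsent. A condensed sketch; the record() helper, sample names, and lengths are hypothetical, invented for the demo:

import com.github.lindenb.jvarkit.math.DiscreteMedian;
import java.util.HashMap;
import java.util.Map;

public class PerSampleLengthDemo {
    private final Map<String, DiscreteMedian<Integer>> sample2discreteMedian = new HashMap<>();

    // hypothetical helper: one call per accepted record, with the length already chosen
    void record(final String sampleName, final int length) {
        this.sample2discreteMedian
            .computeIfAbsent(sampleName, S -> new DiscreteMedian<>())
            .add(length);
    }

    public static void main(final String[] args) {
        final PerSampleLengthDemo demo = new PerSampleLengthDemo();
        demo.record("S1", 151); // invented sample names and read lengths
        demo.record("S1", 149);
        demo.record("S2", 100);
        demo.sample2discreteMedian.forEach((sn, dm) ->
            System.out.println(sn + " median length=" + dm.getMedian().orElse(0.0)));
    }
}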
use of com.github.lindenb.jvarkit.math.DiscreteMedian in project jvarkit by lindenb.
the class CoverageMatrix method doWork.
@Override
public int doWork(final List<String> args) {
VariantContextWriter w = null;
try {
final IndexCovUtils indexCovUtils = new IndexCovUtils(this.indexCovTreshold);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.refPath);
final SamReaderFactory samReaderFactory = SamReaderFactory.makeDefault().referenceSequence(CoverageMatrix.this.refPath).validationStringency(ValidationStringency.LENIENT);
final List<Path> inputBams = IOUtils.unrollPaths(args);
if (inputBams.size() < 3) {
LOG.error("not enough input bam file defined.");
return -1;
}
final Set<String> sampleSet = new TreeSet<>();
final List<String> idx2samples = new ArrayList<String>(inputBams.size());
for (final Path path : inputBams) {
try (SamReader sr = samReaderFactory.open(path)) {
final SAMFileHeader header = sr.getFileHeader();
final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(IOUtils.getFilenameWithoutCommonSuffixes(path));
if (sampleSet.contains(sample)) {
LOG.error("duplicate sample " + sample);
return -1;
}
sampleSet.add(sample);
idx2samples.add(sample);
}
}
final Set<VCFHeaderLine> metaData = new HashSet<>();
w = this.writingVariantsDelegate.dictionary(dict).open(this.outputFile);
final VCFFormatHeaderLine fmtNormDepth = new VCFFormatHeaderLine("D", 1, VCFHeaderLineType.Float, "norm Depth");
metaData.add(fmtNormDepth);
final VCFFormatHeaderLine fmtStdDev = new VCFFormatHeaderLine("STDDEV", 1, VCFHeaderLineType.Float, "standard deviation");
metaData.add(fmtStdDev);
final VCFInfoHeaderLine infoStdDev = new VCFInfoHeaderLine(fmtStdDev.getID(), 1, VCFHeaderLineType.Float, "standard deviation");
metaData.add(infoStdDev);
final VCFInfoHeaderLine infoMedianD = new VCFInfoHeaderLine("MEDIAN", 1, VCFHeaderLineType.Float, "median depth");
metaData.add(infoMedianD);
final VCFInfoHeaderLine infoNSamples = new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "number of samples");
metaData.add(infoNSamples);
final VCFInfoHeaderLine infoSamples = new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples");
metaData.add(infoSamples);
final VCFFilterHeaderLine filterAll = new VCFFilterHeaderLine("ALL_AFFECTED", "All Samples carry a variant");
metaData.add(filterAll);
VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.END_KEY);
VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
final VCFHeader vcfheader = new VCFHeader(metaData, idx2samples);
vcfheader.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, vcfheader);
w.writeHeader(vcfheader);
for (final SAMSequenceRecord ssr : dict.getSequences()) {
if (!StringUtils.isBlank(restrictContig) && !restrictContig.equals(ssr.getSequenceName()))
continue;
final int[] depth = new int[ssr.getSequenceLength()];
final BitSet blackListedPositions = new BitSet(depth.length);
// fill black listed regions
if (this.blackListedPath != null) {
try (TabixReader tbr = new TabixReader(this.blackListedPath.toString())) {
final ContigNameConverter cvt = ContigNameConverter.fromContigSet(tbr.getChromosomes());
final String ctg = cvt.apply(ssr.getSequenceName());
if (!StringUtils.isBlank(ctg)) {
final BedLineCodec codec = new BedLineCodec();
final TabixReader.Iterator tbxr = tbr.query(ctg, 1, ssr.getSequenceLength());
for (; ; ) {
final String line = tbxr.next();
if (line == null)
break;
final BedLine bed = codec.decode(line);
if (bed == null)
continue;
int p1 = Math.max(bed.getStart(), 1);
while (p1 <= ssr.getSequenceLength() && p1 <= bed.getEnd()) {
blackListedPositions.set(p1 - 1);
++p1;
}
}
}
} catch (Throwable err) {
LOG.warn(err);
}
}
final SortingCollection<CovItem> sorter = SortingCollection.newInstance(CovItem.class, new CovItemCodec(), (A, B) -> A.compare0(B), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
for (int bam_idx = 0; bam_idx < inputBams.size(); ++bam_idx) {
final Path path = inputBams.get(bam_idx);
LOG.info(ssr.getContig() + ":" + path + " " + bam_idx + "/" + inputBams.size());
try (SamReader sr = samReaderFactory.open(path)) {
final SAMFileHeader header = sr.getFileHeader();
SequenceUtil.assertSequenceDictionariesEqual(dict, header.getSequenceDictionary());
Arrays.fill(depth, 0);
try (CloseableIterator<SAMRecord> siter = sr.queryOverlapping(ssr.getContig(), 1, ssr.getLengthOnReference())) {
while (siter.hasNext()) {
final SAMRecord rec = siter.next();
if (rec.getReadUnmappedFlag())
continue;
if (!SAMRecordDefaultFilter.accept(rec, this.min_mapq))
continue;
int ref = rec.getStart();
final Cigar cigar = rec.getCigar();
if (cigar == null)
continue;
for (CigarElement ce : cigar) {
final CigarOperator op = ce.getOperator();
final int len = ce.getLength();
if (op.consumesReferenceBases()) {
if (op.consumesReadBases()) {
for (int i = 0; i < len; i++) {
final int pos = ref + i;
if (pos < 1)
continue;
if (pos > ssr.getLengthOnReference())
break;
depth[pos - 1]++;
}
}
ref += len;
}
}
// loop cigar
}
// end sam iterator
}
// end try
final DiscreteMedian<Integer> discreteMedian = new DiscreteMedian<>();
int pos = 0;
while (pos < depth.length) {
if (!blackListedPositions.get(pos) && depth[pos] <= this.max_depth) {
discreteMedian.add(depth[pos]);
}
++pos;
}
final double median = discreteMedian.getMedian().orElse(1.0);
LOG.info(idx2samples.get(bam_idx) + " :" + ssr.getSequenceName() + " median depth:" + median);
final DiscreteMedian<Integer> localMedian = new DiscreteMedian<>();
pos = 0;
while (pos < depth.length) {
if (blackListedPositions.get(pos)) /* blacklisted only, not max_depth */
{
++pos;
continue;
}
int pos2 = pos;
localMedian.clear();
while (pos2 - pos < this.bin_size && pos2 < depth.length && !blackListedPositions.get(pos2)) {
// consider this.max_depth here ?
localMedian.add(depth[pos2]);
++pos2;
}
if (pos2 - pos == this.bin_size) {
final double localMed = localMedian.getMedian().orElse(0.0);
final CovItem item = new CovItem();
item.pos = pos;
item.sample_idx = bam_idx;
item.depth = (float) (localMed / median);
item.stddev = (float) localMedian.getStandardDeviation().orElse(-1.0);
sorter.add(item);
}
pos = pos2;
}
}
// end loop over samples
}
// end loop over bams
sorter.doneAdding();
sorter.setDestructiveIteration(true);
final CloseableIterator<CovItem> iter = sorter.iterator();
final EqualRangeIterator<CovItem> iter2 = new EqualRangeIterator<>(iter, (A, B) -> Integer.compare(A.pos, B.pos));
final Allele REF = Allele.create("N", true);
final Allele DEL = Allele.create("<DEL>", false);
final Allele DUP = Allele.create("<DUP>", false);
while (iter2.hasNext()) {
final List<CovItem> list = iter2.next();
final CovItem first = list.get(0);
final double avg_depth = list.stream().mapToDouble(F -> F.depth).average().orElse(0);
final double sum = list.stream().mapToDouble(F -> F.depth).map(D -> Math.pow(D - avg_depth, 2.0)).sum();
final double stdDev = Math.sqrt(sum / list.size());
final OptionalDouble optMedianOfmedian = Percentile.median().evaluate(list.stream().mapToDouble(I -> I.depth));
final double medianOfmedian = optMedianOfmedian.orElse(1.0);
if (medianOfmedian <= 0)
continue;
for (int i = 0; i < list.size(); i++) {
list.get(i).depth /= medianOfmedian;
}
if (list.stream().allMatch(F -> Float.isNaN(F.depth) || Float.isInfinite(F.depth)))
continue;
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(ssr.getContig());
vcb.start(first.pos + 1);
vcb.stop(first.pos + this.bin_size);
vcb.attribute(VCFConstants.END_KEY, first.pos + this.bin_size);
vcb.attribute(infoStdDev.getID(), stdDev);
vcb.attribute(infoMedianD.getID(), medianOfmedian);
final Set<Allele> alleles = new HashSet<>();
alleles.add(REF);
final List<Genotype> genotypes = new ArrayList<>(list.size());
final Set<String> affected = new TreeSet<>();
for (int i = 0; i < list.size(); i++) {
final CovItem item = list.get(i);
final String sn = idx2samples.get(item.sample_idx);
final GenotypeBuilder gb;
switch(indexCovUtils.getType(item.depth)) {
case AMBIGOUS:
gb = new GenotypeBuilder(sn, Arrays.asList(Allele.NO_CALL, Allele.NO_CALL));
break;
case HET_DEL:
alleles.add(DEL);
gb = new GenotypeBuilder(sn, Arrays.asList(REF, DEL));
affected.add(sn);
break;
case HOM_DEL:
alleles.add(DEL);
gb = new GenotypeBuilder(sn, Arrays.asList(DEL, DEL));
affected.add(sn);
break;
case HET_DUP:
alleles.add(DUP);
gb = new GenotypeBuilder(sn, Arrays.asList(REF, DUP));
affected.add(sn);
break;
case HOM_DUP:
alleles.add(DUP);
gb = new GenotypeBuilder(sn, Arrays.asList(DUP, DUP));
affected.add(sn);
break;
case REF:
gb = new GenotypeBuilder(sn, Arrays.asList(REF, REF));
break;
default:
throw new IllegalStateException();
}
gb.attribute(fmtNormDepth.getID(), item.depth);
gb.attribute(fmtStdDev.getID(), item.stddev);
genotypes.add(gb.make());
}
if (affected.isEmpty())
continue;
if (affected.size() == inputBams.size()) {
vcb.filter(filterAll.getID());
} else {
vcb.passFilters();
}
vcb.attribute(infoSamples.getID(), new ArrayList<>(affected));
vcb.attribute(infoNSamples.getID(), affected.size());
vcb.genotypes(genotypes);
vcb.alleles(alleles);
w.add(vcb.make());
}
iter2.close();
iter.close();
sorter.cleanup();
System.gc();
}
// end loop over contigs
w.close();
w = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(w);
}
}
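The per-bin normalization above boils down to: compute a chromosome-wide median once, then divide each fixed-size window's median by it, reusing a single DiscreteMedian accumulator via clear(). A sketch with invented depths and a shortened bin size, using only the methods observed in the snippet (add, clear, getMedian, getStandardDeviation):

import com.github.lindenb.jvarkit.math.DiscreteMedian;

public class BinNormalizeDemo {
    public static void main(final String[] args) {
        final int binSize = 4; // shortened for the demo; the tool uses this.bin_size
        final int[] depth = { 10, 11, 9, 10, 20, 22, 19, 21, 10, 10, 11, 9 }; // invented
        final DiscreteMedian<Integer> chromMedian = new DiscreteMedian<>();
        for (final int d : depth) chromMedian.add(d);
        final double median = chromMedian.getMedian().orElse(1.0); // 1.0 avoids dividing by zero
        final DiscreteMedian<Integer> localMedian = new DiscreteMedian<>();
        for (int pos = 0; pos + binSize <= depth.length; pos += binSize) {
            localMedian.clear(); // reuse one accumulator per window, as in the tool
            for (int i = 0; i < binSize; i++) localMedian.add(depth[pos + i]);
            final double norm = localMedian.getMedian().orElse(0.0) / median;
            System.out.printf("bin@%d normalized-depth=%.2f stddev=%.2f%n",
                pos, norm, localMedian.getStandardDeviation().orElse(-1.0));
        }
    }
}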
use of com.github.lindenb.jvarkit.math.DiscreteMedian in project jvarkit by lindenb.
the class DepthOfCoverage method doWork.
@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
if (this.auto_mask && this.faidx == null) {
LOG.error("Cannot auto mask if REF is not defined");
return -1;
}
if (this.maskBed != null && this.includeBed != null) {
LOG.error("both --mask and --bed both defined");
return -1;
}
ReferenceSequenceFile referenceSequenceFile = null;
try {
final Predicate<String> isRejectContigRegex;
if (!StringUtils.isBlank(this.skipContigExpr)) {
final Pattern pat = Pattern.compile(this.skipContigExpr);
isRejectContigRegex = S -> pat.matcher(S).matches();
} else {
isRejectContigRegex = S -> false;
}
final SamReaderFactory srf = super.createSamReaderFactory();
if (this.faidx != null) {
srf.referenceSequence(this.faidx);
srf.setUseAsyncIo(this.asyncIo);
referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
}
out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
out.print("#BAM\tSample\tContig\tContig-Length\tMasked-Contig-Length\tCount\tDepth\tMedian\tMin\tMax\tMaxPos");
for (RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
if (r.getMinInclusive() == null)
continue;
out.print("\t");
out.print(r.toString());
}
out.println();
for (final Path path : IOUtils.unrollPaths(args)) {
try (final SamReader sr = srf.open(path)) {
if (!sr.hasIndex()) {
LOG.error("File " + path + " is not indexed.");
return -1;
}
final SAMFileHeader header = sr.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
final Set<String> rejectContigSet = dict.getSequences().stream().map(SSR -> SSR.getSequenceName()).filter(isRejectContigRegex).collect(Collectors.toCollection(HashSet::new));
rejectContigSet.addAll(dict.getSequences().stream().filter(SSR -> SSR.getSequenceLength() < this.skipContigLength).map(SSR -> SSR.getSequenceName()).collect(Collectors.toCollection(HashSet::new)));
if (!header.getSortOrder().equals(SAMFileHeader.SortOrder.coordinate)) {
LOG.error("file is not sorted on coordinate :" + header.getSortOrder() + " " + path);
return -1;
}
final QueryInterval[] intervals;
if (this.useBamIndexFlag && this.includeBed != null) {
if (!sr.hasIndex()) {
LOG.error("Bam is not indexed. " + path);
return -1;
}
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
final List<QueryInterval> L = new ArrayList<>();
try (BedLineReader br = new BedLineReader(this.includeBed)) {
while (br.hasNext()) {
final BedLine bed = br.next();
final String ctg = contigNameConverter.apply(bed.getContig());
if (StringUtils.isBlank(ctg))
continue;
final int tid = dict.getSequenceIndex(ctg);
if (tid < 0)
continue;
L.add(new QueryInterval(tid, bed.getStart(), bed.getEnd()));
}
}
intervals = QueryInterval.optimizeIntervals(L.toArray(new QueryInterval[L.size()]));
} else {
intervals = null;
}
Integer minCov = null;
Integer maxCov = null;
ContigPos maxCovPosition = null;
long count_raw_bases = 0L;
long count_bases = 0L;
long sum_coverage = 0L;
final DiscreteMedian<Integer> discreteMedian_wg = new DiscreteMedian<>();
final Counter<RangeOfIntegers.Range> countMap_wg = new Counter<>();
final String sample = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtils.isBlank(S)).findFirst().orElse(path.toString());
int[] coverage = null;
String prevContig = null;
BitSet mask = null;
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
try (CloseableIterator<SAMRecord> iter = intervals == null ? sr.iterator() : sr.queryOverlapping(intervals)) {
for (; ; ) {
final SAMRecord rec = iter.hasNext() ? progress.apply(iter.next()) : null;
if (rec != null) {
if (!SAMRecordDefaultFilter.accept(rec, this.mapping_quality))
continue;
if (rejectContigSet.contains(rec.getContig()))
continue;
}
if (rec == null || !rec.getContig().equals(prevContig)) {
if (coverage != null) {
// DUMP
long count_bases_ctg = 0L;
long sum_coverage_ctg = 0L;
Integer minV_ctg = null;
Integer maxV_ctg = null;
ContigPos maxPos_ctg = null;
final DiscreteMedian<Integer> discreteMedian_ctg = new DiscreteMedian<>();
final Counter<RangeOfIntegers.Range> countMap_ctg = new Counter<>();
for (int i = 0; i < coverage.length; i++) {
if (mask.get(i))
continue;
final int covi = coverage[i];
if (covi > this.max_depth)
continue;
if (minV_ctg == null || minV_ctg.intValue() > covi)
minV_ctg = covi;
if (maxV_ctg == null || maxV_ctg.intValue() < covi) {
maxV_ctg = covi;
maxPos_ctg = new ContigPos(prevContig, i + 1);
}
countMap_ctg.incr(this.summaryCov.getRange(covi));
count_bases_ctg++;
sum_coverage_ctg += covi;
discreteMedian_ctg.add(covi);
}
out.print(path);
out.print("\t");
out.print(sample);
out.print("\t");
out.print(prevContig);
out.print("\t");
out.print(coverage.length);
out.print("\t");
out.print(count_bases_ctg);
out.print("\t");
out.print(sum_coverage_ctg);
out.print("\t");
if (count_bases_ctg > 0) {
out.printf("%.2f", sum_coverage_ctg / (double) count_bases_ctg);
} else {
out.print("N/A");
}
out.print("\t");
final OptionalDouble median = discreteMedian_ctg.getMedian();
if (median.isPresent()) {
out.print(median.getAsDouble());
} else {
out.print("N/A");
}
out.print("\t");
if (minV_ctg != null) {
out.print(minV_ctg);
} else {
out.print("N/A");
}
out.print("\t");
if (maxV_ctg != null) {
out.print(maxV_ctg);
out.print("\t");
out.print(maxPos_ctg);
} else {
out.print("N/A\tN/A");
}
for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
if (r.getMinInclusive() == null)
continue;
out.print("\t");
out.print(countMap_ctg.count(r));
if (!countMap_ctg.isEmpty()) {
out.print(" ");
out.printf("(%.2f%%)", (countMap_ctg.count(r) / (countMap_ctg.getTotal() * 1.0)) * 100.0);
}
}
out.println();
if (minCov == null || (minV_ctg != null && minV_ctg.compareTo(minCov) < 0))
minCov = minV_ctg;
if (maxCov == null || (maxV_ctg != null && maxV_ctg.compareTo(maxCov) > 0)) {
maxCov = maxV_ctg;
maxCovPosition = maxPos_ctg;
}
count_bases += count_bases_ctg;
sum_coverage += sum_coverage_ctg;
count_raw_bases += coverage.length;
discreteMedian_wg.add(discreteMedian_ctg);
countMap_wg.putAll(countMap_ctg);
}
coverage = null;
mask = null;
// release the per-contig buffers before moving to the next contig
System.gc();
if (rec == null)
break;
final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(rec.getContig()));
coverage = new int[ssr.getSequenceLength()];
mask = new BitSet(ssr.getSequenceLength());
if (this.auto_mask && referenceSequenceFile != null) {
final byte[] refSeq = Objects.requireNonNull(referenceSequenceFile.getSequence(ssr.getSequenceName())).getBases();
for (int i = 0; i < refSeq.length; i++) {
if (AcidNucleics.isATGC(refSeq[i]))
continue;
mask.set(i);
}
}
/* read mask */
if (this.maskBed != null) {
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.maskBed)) {
while (br.hasNext()) {
final BedLine bed = br.next();
if (bed == null)
continue;
String ctg = contigNameConverter.apply(bed.getContig());
if (StringUtils.isBlank(ctg))
continue;
if (!rec.getContig().equals(ctg))
continue;
for (int p1 = bed.getStart(); p1 <= bed.getEnd() && p1 <= coverage.length; ++p1) {
mask.set(p1 - 1);
}
}
}
} else if (this.includeBed != null) {
final List<Locatable> list = new ArrayList<>();
final ContigNameConverter contigNameConverter = ContigNameConverter.fromOneDictionary(dict);
try (BedLineReader br = new BedLineReader(this.includeBed)) {
while (br.hasNext()) {
final BedLine bed = br.next();
if (bed == null)
continue;
final String ctg = contigNameConverter.apply(bed.getContig());
if (StringUtils.isBlank(ctg))
continue;
if (!rec.getContig().equals(ctg))
continue;
list.add(new SimpleInterval(ctg, bed.getStart(), bed.getEnd()));
}
}
// sort on starts
Collections.sort(list, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
int p1 = 1;
while (p1 <= coverage.length) {
while (!list.isEmpty() && list.get(0).getEnd() < p1) {
list.remove(0);
}
if (!list.isEmpty() && list.get(0).getStart() <= p1 && p1 <= list.get(0).getEnd()) {
++p1;
continue;
}
mask.set(p1 - 1);
p1++;
}
}
prevContig = rec.getContig();
}
int max_end1 = coverage.length;
if (!this.disable_paired_overlap_flag && rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getReferenceIndex().equals(rec.getMateReferenceIndex()) && rec.getAlignmentStart() < rec.getMateAlignmentStart() && rec.getAlignmentEnd() > rec.getMateAlignmentStart()) {
max_end1 = rec.getMateAlignmentStart() - 1;
}
for (final AlignmentBlock block : rec.getAlignmentBlocks()) {
final int pos1 = block.getReferenceStart();
final int len = block.getLength();
for (int i = 0; i < len; i++) {
if (pos1 + i - 1 >= 0 && pos1 + i <= max_end1) {
coverage[pos1 + i - 1]++;
}
}
}
}
/* end rec */
}
/* end iter */
progress.close();
out.print(path);
out.print("\t");
out.print(sample);
out.print("\t");
out.print(SAMRecord.NO_ALIGNMENT_REFERENCE_NAME);
out.print("\t");
out.print(count_raw_bases);
out.print("\t");
out.print(count_bases);
out.print("\t");
out.print(sum_coverage);
out.print("\t");
if (count_bases > 0) {
out.printf("%.2f", sum_coverage / (double) count_bases);
} else {
out.print("N/A");
}
out.print("\t");
final OptionalDouble median = discreteMedian_wg.getMedian();
if (median.isPresent()) {
out.print(median.getAsDouble());
} else {
out.print("N/A");
}
out.print("\t");
if (minCov != null) {
out.print(minCov);
} else {
out.print("N/A");
}
out.print("\t");
if (maxCov != null) {
out.print(maxCov + "\t" + maxCovPosition);
} else {
out.print("N/A\tN/A");
}
for (final RangeOfIntegers.Range r : this.summaryCov.getRanges()) {
if (r.getMinInclusive() == null)
continue;
out.print("\t");
out.print(countMap_wg.count(r));
if (!countMap_wg.isEmpty()) {
out.print(" ");
out.printf("(%.2f%%)", (countMap_wg.count(r) / (countMap_wg.getTotal() * 1.0)) * 100.0);
}
}
out.println();
}
}
out.flush();
out.close();
return 0;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(referenceSequenceFile);
}
}
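Note how discreteMedian_wg.add(discreteMedian_ctg) above merges one accumulator into another, folding per-contig statistics into the whole-genome total without re-reading any data. A minimal sketch of that merge; all depth values are invented:

import com.github.lindenb.jvarkit.math.DiscreteMedian;

public class MergeAccumulatorsDemo {
    public static void main(final String[] args) {
        final DiscreteMedian<Integer> wholeGenome = new DiscreteMedian<>();
        final DiscreteMedian<Integer> contig1 = new DiscreteMedian<>();
        final DiscreteMedian<Integer> contig2 = new DiscreteMedian<>();
        for (final int d : new int[] { 10, 12, 14 }) contig1.add(d); // invented depths
        for (final int d : new int[] { 30, 31 }) contig2.add(d);
        wholeGenome.add(contig1); // merge the per-contig counts into the global accumulator
        wholeGenome.add(contig2);
        System.out.println("whole-genome median=" + wholeGenome.getMedian().orElse(0.0));
    }
}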