Search in sources :

Example 1 with GCPercent

use of com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent in project jvarkit by lindenb.

the class CopyNumber01 method doWork.

@Override
public int doWork(final List<String> args) {
    ReferenceSequenceFile indexedFastaSequenceFile = null;
    try {
        this.sexContigSet.addAll(Arrays.stream(this.sexContigStr.split("[, \t]")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
        /* loading REF Reference */
        indexedFastaSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(refFile);
        final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(indexedFastaSequenceFile);
        final Comparator<Locatable> locComparator = new ContigDictComparator(dict).createLocatableComparator();
        final List<Locatable> intervals = new ArrayList<>();
        if (this.bedFile == null) {
            for (final Locatable loc : dict.getSequences()) {
                intervals.add(loc);
            }
        } else {
            final ContigNameConverter converter = ContigNameConverter.fromOneDictionary(dict);
            try (BedLineReader br = new BedLineReader(this.bedFile)) {
                br.stream().filter(L -> !StringUtil.isBlank(converter.apply(L.getContig()))).forEach(B -> {
                    final String ctg = converter.apply(B.getContig());
                    intervals.add(new SimpleInterval(ctg, B.getStart(), B.getEnd()));
                });
            }
        }
        if (intervals.isEmpty()) {
            LOG.error("no interval defined.");
            return -1;
        }
        LOG.info("intervals N=" + intervals.size() + " mean-size:" + intervals.stream().mapToInt(R -> R.getLengthOnReference()).average().orElse(0.0));
        final List<GCAndDepth> user_items = new ArrayList<>();
        // split intervals
        for (final Locatable loc : intervals) {
            int pos = loc.getStart();
            while (pos < loc.getEnd()) {
                final int pos_end = Math.min(pos + this.windowSize, loc.getEnd());
                final GCAndDepth dataRow = new GCAndDepth(loc.getContig(), pos, pos_end);
                if (dataRow.getLengthOnReference() < this.windowMin) {
                    break;
                }
                user_items.add(dataRow);
                pos += this.windowShift;
            }
        }
        // free memory
        intervals.clear();
        LOG.info("sorting N=" + user_items.size());
        Collections.sort(user_items, locComparator);
        // fill gc percent
        LOG.info("fill gc% N=" + user_items.size());
        for (final String ctg : user_items.stream().map(T -> T.getContig()).collect(Collectors.toSet())) {
            final GenomicSequence gseq = new GenomicSequence(indexedFastaSequenceFile, ctg);
            for (final GCAndDepth dataRow : user_items) {
                if (!dataRow.getContig().equals(ctg))
                    continue;
                final GCPercent gc = gseq.getGCPercent(dataRow.getStart(), dataRow.getEnd());
                if (gc.isEmpty())
                    continue;
                dataRow.gc = gc.getGCPercent();
            }
        }
        // remove strange gc
        user_items.removeIf(B -> B.gc < this.minGC);
        user_items.removeIf(B -> B.gc > this.maxGC);
        LOG.info("remove high/low gc% N=" + user_items.size());
        if (user_items.stream().allMatch(P -> isSex(P.getContig()))) {
            LOG.error("All chromosomes are defined as sexual. Cannot normalize");
            return -1;
        }
        final CoverageFactory coverageFactory = new CoverageFactory().setMappingQuality(this.mappingQuality);
        try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
            /* header */
            pw.println("#CHROM\tSTART\tEND\tSample\tIDX\tGC\tRAW-DEPTH\tNORM-DEPTH");
            for (final Path bamPath : IOUtils.unrollPaths(args)) {
                // open this samReader
                try (SamReader samReader = super.createSamReaderFactory().referenceSequence(this.refFile).open(bamPath)) {
                    if (!samReader.hasIndex()) {
                        LOG.error("file is not indexed " + bamPath);
                        return -1;
                    }
                    final SAMFileHeader header = samReader.getFileHeader();
                    SequenceUtil.assertSequenceDictionariesEqual(dict, SequenceDictionaryUtils.extractRequired(header));
                    final String sampleName = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet()).stream().collect(HtsCollectors.toSingleton());
                    final List<GCAndDepth> bam_items = new ArrayList<>(user_items.size());
                    /* loop over contigs */
                    for (final SAMSequenceRecord ssr : dict.getSequences()) {
                        /* create a **COPY** of the intervals */
                        final List<GCAndDepth> ctgitems = user_items.stream().filter(T -> T.contigsMatch(ssr)).collect(Collectors.toList());
                        if (ctgitems.isEmpty())
                            continue;
                        LOG.info("Getting coverage for " + ssr.getSequenceName() + " N=" + ctgitems.size());
                        // get coverage
                        final CoverageFactory.SimpleCoverage coverage = coverageFactory.getSimpleCoverage(samReader, ctgitems, sampleName);
                        // fill coverage
                        for (final GCAndDepth gc : ctgitems) {
                            final OptionalDouble optCov;
                            switch(this.univariateDepth) {
                                case median:
                                    optCov = coverage.getMedian(gc);
                                    break;
                                case mean:
                                    optCov = coverage.getAverage(gc);
                                    break;
                                default:
                                    throw new IllegalStateException();
                            }
                            gc.raw_depth = optCov.orElse(-1.0);
                            gc.norm_depth = gc.raw_depth;
                        }
                        ctgitems.removeIf(V -> V.raw_depth < 0);
                        ctgitems.removeIf(V -> V.raw_depth > this.weirdMaxDepth);
                        ctgitems.removeIf(V -> V.raw_depth < this.weirdMinDepth);
                        if (ctgitems.isEmpty())
                            continue;
                        bam_items.addAll(ctgitems);
                    }
                    double[] y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.raw_depth).toArray();
                    LOG.info("median raw depth " + new Median().evaluate(y, 0, y.length));
                    Collections.sort(bam_items, (a, b) -> {
                        final int i = Double.compare(a.getX(), b.getX());
                        if (i != 0)
                            return i;
                        return Double.compare(a.getY(), b.getY());
                    });
                    double[] x = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getX()).toArray();
                    y = bam_items.stream().filter(R -> !isSex(R.getContig())).mapToDouble(R -> R.getY()).toArray();
                    // get min GC
                    final double min_x = x[0];
                    // get max GC
                    final double max_x = x[x.length - 1];
                    LOG.info("min/max gc " + min_x + " " + max_x);
                    /* merge adjacent x (GC%) having same values */
                    int i = 0;
                    int k = 0;
                    while (i < x.length) {
                        int j = i + 1;
                        while (j < x.length && Precision.equals(x[i], x[j])) {
                            ++j;
                        }
                        x[k] = x[i];
                        y[k] = this.univariateGCLoess.create().evaluate(y, i, j - i);
                        ++k;
                        i = j;
                    }
                    LOG.info("merged n=" + (x.length - k) + " items.");
                    /* reduce size of x et y */
                    final List<XY> xyL = new ArrayList<>(k);
                    for (int t = 0; t < k; ++t) {
                        xyL.add(new XYImpl(x[t], y[t]));
                    }
                    /* sort on Y */
                    Collections.sort(xyL, (a, b) -> {
                        final int d = Double.compare(a.getX(), b.getX());
                        if (d != 0)
                            return d;
                        return Double.compare(a.getY(), b.getY());
                    });
                    x = xyL.stream().mapToDouble(R -> R.getX()).toArray();
                    y = xyL.stream().mapToDouble(R -> R.getY()).toArray();
                    final UnivariateInterpolator interpolator = createInterpolator();
                    UnivariateFunction spline = null;
                    try {
                        spline = interpolator.interpolate(x, y);
                    } catch (final org.apache.commons.math3.exception.NumberIsTooSmallException err) {
                        spline = null;
                        LOG.error("Cannot use " + interpolator.getClass().getName() + ":" + err.getMessage());
                    }
                    // min depth cal
                    int points_removed = 0;
                    i = 0;
                    while (i < bam_items.size()) {
                        final GCAndDepth r = bam_items.get(i);
                        if (spline == null) {
                            ++i;
                        } else if (r.getX() < min_x || r.getX() > max_x) {
                            bam_items.remove(i);
                            ++points_removed;
                        } else {
                            final double norm;
                            if (this.gcDepthInterpolation.equals(UnivariateInerpolation.identity)) {
                                norm = r.getY();
                            } else {
                                norm = spline.value(r.getX());
                            }
                            if (Double.isNaN(norm) || Double.isInfinite(norm)) {
                                LOG.info("NAN " + r);
                                bam_items.remove(i);
                                ++points_removed;
                                continue;
                            }
                            r.norm_depth = norm;
                            ++i;
                        }
                    }
                    LOG.info("removed " + points_removed + ". now N=" + bam_items.size());
                    if (bam_items.isEmpty())
                        continue;
                    spline = null;
                    // DO NOT NORMALIZE ON MINIMUM DEPTH, STUPID.
                    // normalize on median
                    y = bam_items.stream().mapToDouble(G -> G.getY()).toArray();
                    final double median_depth = this.univariateMid.create().evaluate(y, 0, y.length);
                    LOG.info("median norm depth : " + median_depth);
                    for (i = 0; median_depth > 0 && i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        gc.norm_depth /= median_depth;
                    }
                    // restore genomic order
                    Collections.sort(bam_items, locComparator);
                    // smoothing values with neighbours
                    y = bam_items.stream().mapToDouble(V -> V.getY()).toArray();
                    for (i = 0; i < bam_items.size(); ++i) {
                        final GCAndDepth gc = bam_items.get(i);
                        int left = i;
                        for (int j = Math.max(0, i - this.smooth_window); j <= i; ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                continue;
                            left = j;
                            break;
                        }
                        int right = i;
                        for (int j = i; j <= i + this.smooth_window && j < bam_items.size(); ++j) {
                            final GCAndDepth gc2 = bam_items.get(j);
                            if (!gc2.withinDistanceOf(gc, this.smoothDistance))
                                break;
                            right = j;
                        // no break;
                        }
                        gc.norm_depth = this.univariateSmooth.create().evaluate(y, left, (right - left) + 1);
                    }
                    /* print data */
                    for (final GCAndDepth r : bam_items) {
                        pw.print(r.getContig());
                        pw.print('\t');
                        pw.print(r.getStart() - 1);
                        pw.print('\t');
                        pw.print(r.getEnd());
                        pw.print('\t');
                        pw.print(sampleName);
                        pw.print('\t');
                        pw.print(getGenomicIndex(dict, r.getContig(), r.getStart()) - 1);
                        pw.print('\t');
                        pw.printf("%.3f", r.gc);
                        pw.print('\t');
                        pw.printf("%.3f", r.raw_depth);
                        pw.print('\t');
                        pw.printf("%.3f", r.norm_depth);
                        pw.println();
                    }
                    pw.flush();
                }
            // samReader
            }
            // end loop bamPath
            pw.flush();
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(indexedFastaSequenceFile);
    }
}
Also used : Arrays(java.util.Arrays) Precision(org.apache.commons.math3.util.Precision) NevilleInterpolator(org.apache.commons.math3.analysis.interpolation.NevilleInterpolator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) UnivariateInterpolator(org.apache.commons.math3.analysis.interpolation.UnivariateInterpolator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Median(org.apache.commons.math3.stat.descriptive.rank.Median) Path(java.nio.file.Path) MathIllegalArgumentException(org.apache.commons.math3.exception.MathIllegalArgumentException) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) LinearInterpolator(org.apache.commons.math3.analysis.interpolation.LinearInterpolator) LoessInterpolator(org.apache.commons.math3.analysis.interpolation.LoessInterpolator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) ReferenceSequenceFileFactory(htsjdk.samtools.reference.ReferenceSequenceFileFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) SplineInterpolator(org.apache.commons.math3.analysis.interpolation.SplineInterpolator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) OptionalDouble(java.util.OptionalDouble) HtsCollectors(com.github.lindenb.jvarkit.stream.HtsCollectors) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) AbstractUnivariateStatistic(org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Locatable(htsjdk.samtools.util.Locatable) GCPercent(com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) DividedDifferenceInterpolator(org.apache.commons.math3.analysis.interpolation.DividedDifferenceInterpolator) Identity(org.apache.commons.math3.analysis.function.Identity) DimensionMismatchException(org.apache.commons.math3.exception.DimensionMismatchException) SamReader(htsjdk.samtools.SamReader) DynamicParameter(com.beust.jcommander.DynamicParameter) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Collections(java.util.Collections) ArrayList(java.util.ArrayList) Median(org.apache.commons.math3.stat.descriptive.rank.Median) ReferenceSequenceFile(htsjdk.samtools.reference.ReferenceSequenceFile) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ContigDictComparator(com.github.lindenb.jvarkit.util.samtools.ContigDictComparator) SamReader(htsjdk.samtools.SamReader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) UnivariateInterpolator(org.apache.commons.math3.analysis.interpolation.UnivariateInterpolator) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) PrintWriter(java.io.PrintWriter) Path(java.nio.file.Path) GCPercent(com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) OptionalDouble(java.util.OptionalDouble) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Locatable(htsjdk.samtools.util.Locatable)

Aggregations

DynamicParameter (com.beust.jcommander.DynamicParameter)1 Parameter (com.beust.jcommander.Parameter)1 BedLineReader (com.github.lindenb.jvarkit.bed.BedLineReader)1 IOUtils (com.github.lindenb.jvarkit.io.IOUtils)1 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)1 CoverageFactory (com.github.lindenb.jvarkit.samtools.CoverageFactory)1 SimpleInterval (com.github.lindenb.jvarkit.samtools.util.SimpleInterval)1 HtsCollectors (com.github.lindenb.jvarkit.stream.HtsCollectors)1 DistanceParser (com.github.lindenb.jvarkit.util.bio.DistanceParser)1 SequenceDictionaryUtils (com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils)1 ContigNameConverter (com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter)1 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)1 NoSplitter (com.github.lindenb.jvarkit.util.jcommander.NoSplitter)1 Program (com.github.lindenb.jvarkit.util.jcommander.Program)1 Logger (com.github.lindenb.jvarkit.util.log.Logger)1 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)1 GCPercent (com.github.lindenb.jvarkit.util.picard.GenomicSequence.GCPercent)1 ContigDictComparator (com.github.lindenb.jvarkit.util.samtools.ContigDictComparator)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)1