Search in sources :

Example 86 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project SeqMonk by s-andrews.

the class IntronRegressionPipeline method startPipeline.

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    int minDensity = optionsPanel.minDensity();
    int minLength = optionsPanel.minLength();
    double maxPValue = optionsPanel.maxPValue();
    int binSize = optionsPanel.measurementBinSize();
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        Vector<Probe> probesForThisChromosome = new Vector<Probe>();
        progressUpdated("Making probes", c, chrs.length);
        Feature[] features = getValidFeatures(chrs[c]);
        for (int f = 0; f < features.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            // There are no introns here
            if (!(features[f].location() instanceof SplitLocation))
                continue;
            Location[] subLocations = ((SplitLocation) features[f].location()).subLocations();
            // TODO: Reverse the subLocations if its a reverse feature
            for (int intron = 1; intron < subLocations.length; intron++) {
                int start = subLocations[intron - 1].end();
                int end = subLocations[intron].start();
                if ((end - start) + 1 < minLength) {
                    // This intron is too short.
                    continue;
                }
                // TODO: We could throw away any probes which didn't have enough reads in any feature
                Probe p = new Probe(chrs[c], start, end, features[f].location().strand(), features[f].name() + "_" + intron);
                probesForThisChromosome.add(p);
            }
        }
        // Now we can deduplicate the probes for this chromosome and add them to the main collection
        Probe[] dupProbes = probesForThisChromosome.toArray(new Probe[0]);
        Arrays.sort(dupProbes);
        for (int p = 0; p < dupProbes.length; p++) {
            if (p > 0 && dupProbes[p].packedPosition() == dupProbes[p - 1].packedPosition())
                continue;
            probes.add(dupProbes[p]);
        }
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    collection().setProbeSet(new ProbeSet("Features over " + optionsPanel.getSelectedFeatureType(), allProbes));
    // Now we go back through the probes and quantitate them
    for (int p = 0; p < allProbes.length; p++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        if (p % 1000 == 0) {
            progressUpdated("Quantitated " + p + " out of " + allProbes.length + " probes", p, allProbes.length);
        }
        for (int d = 0; d < data.length; d++) {
            long[] reads = data[d].getReadsForProbe(allProbes[p]);
            int[] countsPerSite = new int[allProbes[p].length()];
            int usableCounts = 0;
            for (int r = 0; r < reads.length; r++) {
                if (readFilter.useRead(allProbes[p], reads[r])) {
                    ++usableCounts;
                    for (int pos = Math.max(0, SequenceRead.start(reads[r]) - allProbes[p].start()); pos <= Math.min(countsPerSite.length - 1, SequenceRead.end(reads[r]) - allProbes[p].start()); pos++) {
                        ++countsPerSite[pos];
                    }
                }
            }
            if (usableCounts / (allProbes[p].length() / 1000d) >= minDensity) {
                // We're going to do a linear regression rather than a correlation
                // We're analysing in bins so we'll work out the bin counts and
                // add them dynamically to the regression.
                SimpleRegression regression = new SimpleRegression();
                int binCount = 0;
                for (int i = 0; i < countsPerSite.length; i++) {
                    if (i > 0 && i % binSize == 0) {
                        regression.addData(i, binCount);
                        binCount = 0;
                    }
                    binCount += countsPerSite[i];
                }
                float slope = (float) (regression.getSlope() * 1000000);
                double pValue = regression.getSignificance();
                if (allProbes[p].strand() == Location.REVERSE) {
                    slope = 0 - slope;
                }
                if (pValue <= maxPValue) {
                    data[d].setValueForProbe(allProbes[p], slope);
                } else {
                    data[d].setValueForProbe(allProbes[p], Float.NaN);
                }
            } else {
                data[d].setValueForProbe(allProbes[p], Float.NaN);
            }
        }
    }
    StringBuffer quantitationDescription = new StringBuffer();
    quantitationDescription.append("Intron regression pipeline quantitation ");
    quantitationDescription.append(". Directionality was ");
    quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
    quantitationDescription.append(". Min intron length was ");
    quantitationDescription.append(minLength);
    quantitationDescription.append(". Min read density was ");
    quantitationDescription.append(minDensity);
    quantitationDescription.append(". Max slope p-value was ");
    quantitationDescription.append(maxPValue);
    collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
    quantitatonComplete();
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) SimpleRegression(org.apache.commons.math3.stat.regression.SimpleRegression) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) QuantitationStrandType(uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType) Vector(java.util.Vector) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 87 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project tetrad by cmu-phil.

the class Lofs2 method optimizeRow.

private void optimizeRow(final int rowIndex, final TetradMatrix data, final double range, final List<List<Integer>> rows, final List<List<Double>> parameters) {
    System.out.println("A");
    final int numParams = rows.get(rowIndex).size();
    final double[] dLeftMin = new double[numParams];
    final double[] dRightMin = new double[numParams];
    double[] values = new double[numParams];
    double delta = 0.1;
    if (false) {
        // isEdgeCorrected()) {
        double min = -2;
        double max = 2;
        int[] dims = new int[values.length];
        int numBins = 5;
        for (int i = 0; i < values.length; i++) dims[i] = numBins;
        CombinationGenerator gen = new CombinationGenerator(dims);
        int[] comb;
        List<Double> maxParams = new ArrayList<>();
        for (int i = 0; i < values.length; i++) maxParams.add(0.0);
        double maxV = Double.NEGATIVE_INFINITY;
        while ((comb = gen.next()) != null) {
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
            List<Double> params = new ArrayList<>();
            for (int i = 0; i < values.length; i++) {
                params.add(min + (max - min) * (comb[i] / (double) numBins));
            }
            parameters.set(rowIndex, params);
            double v = scoreRow(rowIndex, data, rows, parameters);
            if (v > maxV) {
                maxV = v;
                maxParams = params;
            }
        }
        System.out.println("maxparams = " + maxParams);
        parameters.set(rowIndex, maxParams);
        for (int i = 0; i < values.length; i++) {
            dLeftMin[i] = -range;
            dRightMin[i] = range;
            values[i] = maxParams.get(i);
        }
    } else if (false) {
        for (int i = 0; i < numParams; i++) {
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
            parameters.get(rowIndex).set(i, -range);
            double vLeft = scoreRow(rowIndex, data, rows, parameters);
            double dLeft = -range;
            // Search from the left for the first valley; mark that as dleft.
            for (double d = -range + delta; d < range; d += delta) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                parameters.get(rowIndex).set(i, d);
                double v = scoreRow(rowIndex, data, rows, parameters);
                if (Double.isNaN(v))
                    continue;
                if (v > vLeft)
                    break;
                vLeft = v;
                dLeft = d;
            }
            parameters.get(rowIndex).set(i, range);
            double vRight = scoreRow(rowIndex, data, rows, parameters);
            double dRight = range;
            // to avoid high scores at the boundaries.
            for (double d = range - delta; d > -range; d -= delta) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                parameters.get(rowIndex).set(i, d);
                double v = scoreRow(rowIndex, data, rows, parameters);
                if (Double.isNaN(v))
                    continue;
                if (v > vRight)
                    break;
                vRight = v;
                dRight = d;
            }
            // If dleft dright ended up reversed, re-reverse them.
            if (dLeft > dRight) {
                double temp = dRight;
                dLeft = dRight;
                dRight = temp;
            }
            System.out.println("dLeft = " + dLeft + " dRight = " + dRight);
            dLeftMin[i] = dLeft;
            dRightMin[i] = dRight;
            values[i] = (dLeft + dRight) / 2.0;
        }
    } else {
        // Default case: search for the maximum score over the entire range.
        for (int i = 0; i < numParams; i++) {
            if (Thread.currentThread().isInterrupted()) {
                break;
            }
            dLeftMin[i] = -range;
            dRightMin[i] = range;
            values[i] = 0;
        }
    }
    MultivariateFunction function = new MultivariateFunction() {

        public double value(double[] values) {
            System.out.println(Arrays.toString(values));
            for (int i = 0; i < values.length; i++) {
                if (Thread.currentThread().isInterrupted()) {
                    break;
                }
                parameters.get(rowIndex).set(i, values[i]);
            }
            double v = scoreRow(rowIndex, data, rows, parameters);
            if (Double.isNaN(v)) {
                // was 10000
                return Double.POSITIVE_INFINITY;
            }
            return -v;
        }
    };
    try {
        MultivariateOptimizer search = new PowellOptimizer(1e-7, 1e-7);
        PointValuePair pair = search.optimize(new InitialGuess(values), new ObjectiveFunction(function), GoalType.MINIMIZE, new MaxEval(100000));
        values = pair.getPoint();
    } catch (Exception e) {
        e.printStackTrace();
        for (int i = 0; i < values.length; i++) {
            parameters.get(rowIndex).set(i, Double.NaN);
        }
    }
}
Also used : MultivariateOptimizer(org.apache.commons.math3.optim.nonlinear.scalar.MultivariateOptimizer) InitialGuess(org.apache.commons.math3.optim.InitialGuess) MaxEval(org.apache.commons.math3.optim.MaxEval) ObjectiveFunction(org.apache.commons.math3.optim.nonlinear.scalar.ObjectiveFunction) PowellOptimizer(org.apache.commons.math3.optim.nonlinear.scalar.noderiv.PowellOptimizer) PointValuePair(org.apache.commons.math3.optim.PointValuePair) MultivariateFunction(org.apache.commons.math3.analysis.MultivariateFunction)

Example 88 with Max

use of org.apache.commons.math3.stat.descriptive.rank.Max in project jvarkit by lindenb.

the class CopyNumber01 method doWork.

@Override
public int doWork(final List<String> args) {
    if (this.refFile == null) {
        LOG.error("Undefined REF file");
        return -1;
    }
    if (this.archiveFile == null) {
        LOG.error("Undefined output file.");
        return -1;
    }
    if (!StringUtil.isBlank(prefix)) {
        if (!(prefix.endsWith(".") || prefix.endsWith("_"))) {
            prefix = prefix + ".";
        }
    }
    SamReader samReader = null;
    ArchiveFactory archive = null;
    try {
        final String input = oneAndOnlyOneFile(args);
        samReader = super.openSamReader(input);
        if (!samReader.hasIndex()) {
            LOG.error("file is not indexed " + input);
            return -1;
        }
        final String sampleName = samReader.getFileHeader().getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse("SAMPLE");
        final SAMFileHeader header = samReader.getFileHeader();
        this.samDictionary = header.getSequenceDictionary();
        if (this.samDictionary == null || this.samDictionary.isEmpty()) {
            throw new JvarkitException.DictionaryMissing(input);
        }
        /* loading REF Reference */
        this.indexedFastaSequenceFile = new IndexedFastaSequenceFile(refFile);
        final SAMSequenceDictionary dict = this.indexedFastaSequenceFile.getSequenceDictionary();
        if (dict == null) {
            throw new JvarkitException.DictionaryMissing(refFile.getPath());
        }
        this.sam2faiContigNameConverter = ContigNameConverter.fromDictionaries(this.samDictionary, dict);
        this.sam2faiContigNameConverter.setOnNotFound(OnNotFound.SKIP);
        archive = ArchiveFactory.open(this.archiveFile);
        PrintWriter mkw = archive.openWriter(this.prefix + "cnv01.mk");
        mkw.println("## " + this.getProgramCommandLine());
        mkw.println(".PHONY=all all2");
        mkw.println("TARGETS=");
        mkw.println("SCREEN_WIDTH?=2600");
        mkw.println("SCREEN_HEIGHT?=1000");
        mkw.println("GNUPLOT?=gnuplot");
        mkw.println("all : all2");
        final List<String> all_chrom_files = new ArrayList<>();
        double maxDepth = 0.0;
        for (final SAMSequenceRecord ssr : this.samDictionary.getSequences()) {
            if (!StringUtil.isBlank(this.limitToChrom) && !this.limitToChrom.equals(ssr.getSequenceName())) {
                LOG.info("Skipping " + ssr.getSequenceName() + "....");
                continue;
            }
            if (this.sam2faiContigNameConverter.apply(ssr.getSequenceName()) == null) {
                LOG.info("Ignoring " + ssr.getSequenceName() + " because it's not in REF");
                continue;
            }
            if (ignoreChromosomeName(ssr.getSequenceName())) {
                LOG.info("Ignoring " + ssr.getSequenceName());
                continue;
            }
            LOG.info("processing chromosome " + ssr.getSequenceName());
            ContigProcessor proc = new ContigProcessor(samReader, ssr, sampleName);
            proc.run();
            final String tsvFilename = this.prefix + proc.getContig() + "." + proc.sampleName + ".bed";
            final String pngFilename = "$(addsuffix .png,$(basename " + tsvFilename + "))";
            final double depth = proc.items.stream().mapToDouble(I -> I.norm_depth).max().orElse(4);
            maxDepth = Math.max(maxDepth, depth);
            all_chrom_files.add(tsvFilename);
            mkw.println("TARGETS+=" + pngFilename);
            mkw.println(pngFilename + ":" + tsvFilename);
            mkw.println("\techo 'set terminal png truecolor size ${SCREEN_WIDTH},${SCREEN_HEIGHT};" + "set key autotitle columnhead;" + "set datafile separator \"\t\";" + "set title \"" + proc.sampleName + " " + ssr.getSequenceName() + "\";" + "set ylabel \"Normalized Depth - Number of copies\";" + "set xlabel \"Position on " + ssr.getSequenceName() + "\";" + // + "set style circle radius 0.02;"
            "set nokey;" + "set yrange [0:" + Math.min(4, Math.ceil(depth)) + "];" + "set xrange [1:" + ssr.getSequenceLength() + "];" + "set xtic rotate by 90 right;" + "set output \"$@\";" + // + "plot \"$<\" using 1:2:1 w points linecolor variable "
            "plot \"$<\" using 2:7 w lines " + "' | ${GNUPLOT}");
            PrintWriter pw = archive.openWriter(tsvFilename);
            proc.saveCoverage(pw);
            pw.flush();
            pw.close();
            proc = null;
        }
        samReader.close();
        final String pngFilename = this.prefix + "wholeGenome.png";
        mkw.println("TARGETS+=" + pngFilename);
        mkw.println(pngFilename + ":" + String.join(" ", all_chrom_files));
        mkw.println("\tgrep -v '^#' $^ | cut -f 1,4,7 | sed " + samDictionary.getSequences().stream().map(SSR -> " -e 's/^" + SSR.getSequenceName() + "\t/" + SSR.getSequenceIndex() + "\t/' ").collect(Collectors.joining()) + "| sort -t '\t' -k2,2n > $(addsuffix .tmp.tsv,$@)");
        mkw.println("\techo 'set terminal png truecolor size ${SCREEN_WIDTH},${SCREEN_HEIGHT};" + "set datafile separator \"\t\";" + "set title \"" + sampleName + " (Whole Genome)\";" + "set ylabel \"Normalized Depth - Number of copies\";" + "set xlabel \"Genomic Position\";" + "set yrange [0:" + Math.min(4, Math.ceil(maxDepth)) + "];" + "set xrange [1:" + this.samDictionary.getReferenceLength() + ".0];" + "set xtic rotate by 90 right;" + "set nokey;" + "set output \"$@\";" + "plot \"$(addsuffix .tmp.tsv,$@)\" using 2:3:1 w points linecolor variable " + "' | ${GNUPLOT}");
        mkw.println("\trm -f $(addsuffix .tmp.tsv,$@)");
        mkw.println("all2: ${TARGETS}");
        mkw.flush();
        mkw.close();
        mkw = null;
        archive.close();
        archive = null;
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(samReader);
        CloserUtil.close(archive);
        CloserUtil.close(this.indexedFastaSequenceFile);
    }
}
Also used : Arrays(java.util.Arrays) NevilleInterpolator(org.apache.commons.math3.analysis.interpolation.NevilleInterpolator) LineIterator(htsjdk.tribble.readers.LineIterator) Program(com.github.lindenb.jvarkit.util.jcommander.Program) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) UnivariateInterpolator(org.apache.commons.math3.analysis.interpolation.UnivariateInterpolator) SAMFileHeader(htsjdk.samtools.SAMFileHeader) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) Median(org.apache.commons.math3.stat.descriptive.rank.Median) MathIllegalArgumentException(org.apache.commons.math3.exception.MathIllegalArgumentException) CloserUtil(htsjdk.samtools.util.CloserUtil) PrintWriter(java.io.PrintWriter) LinearInterpolator(org.apache.commons.math3.analysis.interpolation.LinearInterpolator) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) LoessInterpolator(org.apache.commons.math3.analysis.interpolation.LoessInterpolator) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SamFilterParser(com.github.lindenb.jvarkit.util.bio.samfilter.SamFilterParser) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) SAMRecord(htsjdk.samtools.SAMRecord) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) UnivariateFunction(org.apache.commons.math3.analysis.UnivariateFunction) GZIPOutputStream(java.util.zip.GZIPOutputStream) Pattern(java.util.regex.Pattern) BBFileReader(org.broad.igv.bbfile.BBFileReader) SamReaderFactory(htsjdk.samtools.SamReaderFactory) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) SplineInterpolator(org.apache.commons.math3.analysis.interpolation.SplineInterpolator) Cigar(htsjdk.samtools.Cigar) SequenceUtil(htsjdk.samtools.util.SequenceUtil) GcPercentAndDepth(com.github.lindenb.jvarkit.tools.misc.GcPercentAndDepth) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) HashMap(java.util.HashMap) Function(java.util.function.Function) ValidationStringency(htsjdk.samtools.ValidationStringency) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) OnNotFound(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter.OnNotFound) AbstractUnivariateStatistic(org.apache.commons.math3.stat.descriptive.AbstractUnivariateStatistic) Mean(org.apache.commons.math3.stat.descriptive.moment.Mean) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) BEDCodec(htsjdk.tribble.bed.BEDCodec) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) DividedDifferenceInterpolator(org.apache.commons.math3.analysis.interpolation.DividedDifferenceInterpolator) FileOutputStream(java.io.FileOutputStream) IOException(java.io.IOException) IntervalTree(htsjdk.samtools.util.IntervalTree) Identity(org.apache.commons.math3.analysis.function.Identity) DimensionMismatchException(org.apache.commons.math3.exception.DimensionMismatchException) SamReader(htsjdk.samtools.SamReader) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) WigItem(org.broad.igv.bbfile.WigItem) BufferedReader(java.io.BufferedReader) BitSet(java.util.BitSet) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Comparator(java.util.Comparator) Collections(java.util.Collections) BigWigIterator(org.broad.igv.bbfile.BigWigIterator) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) ArrayList(java.util.ArrayList) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) MathIllegalArgumentException(org.apache.commons.math3.exception.MathIllegalArgumentException) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) IOException(java.io.IOException) DimensionMismatchException(org.apache.commons.math3.exception.DimensionMismatchException) SamReader(htsjdk.samtools.SamReader) SAMFileHeader(htsjdk.samtools.SAMFileHeader) PrintWriter(java.io.PrintWriter)

Aggregations

ArrayList (java.util.ArrayList)26 List (java.util.List)19 Collectors (java.util.stream.Collectors)13 DescriptiveStatistics (org.apache.commons.math3.stat.descriptive.DescriptiveStatistics)13 Arrays (java.util.Arrays)11 Map (java.util.Map)11 IntStream (java.util.stream.IntStream)10 RandomGenerator (org.apache.commons.math3.random.RandomGenerator)10 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)9 RealMatrix (org.apache.commons.math3.linear.RealMatrix)9 Plot2 (ij.gui.Plot2)8 File (java.io.File)8 IOException (java.io.IOException)8 TooManyEvaluationsException (org.apache.commons.math3.exception.TooManyEvaluationsException)7 Test (org.testng.annotations.Test)7 StoredDataStatistics (gdsc.core.utils.StoredDataStatistics)6 Collections (java.util.Collections)6 HashMap (java.util.HashMap)6 Random (java.util.Random)6 UnivariateFunction (org.apache.commons.math3.analysis.UnivariateFunction)6