Search in sources :

Example 91 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class RNASeqPipeline method startPipeline.

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    boolean mergeTranscripts = optionsPanel.mergeTranscripts();
    boolean pairedEnd = optionsPanel.pairedEnd();
    boolean logTransform = optionsPanel.logTransform();
    boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
    boolean rawCounts = optionsPanel.rawCounts();
    boolean noValueForZeroCounts = optionsPanel.noValueForZeroCounts();
    boolean correctDNAContamination = optionsPanel.correctForDNAContamination();
    boolean correctDuplication = optionsPanel.correctForDNADuplication();
    if (rawCounts) {
        logTransform = false;
        applyTranscriptLengthCorrection = false;
        noValueForZeroCounts = false;
    }
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        // System.err.println("Processing chr "+chrs[c].name());
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
        Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
        Arrays.sort(features);
        FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
        for (int f = 0; f < mergedTranscripts.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            probes.add(new Probe(chrs[c], mergedTranscripts[f].start(), mergedTranscripts[f].end(), mergedTranscripts[f].strand(), mergedTranscripts[f].name()));
        }
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    Arrays.sort(allProbes);
    if (collection().probeSet() == null) {
        collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
    } else {
        Probe[] existingProbes = collection().probeSet().getAllProbes();
        Arrays.sort(existingProbes);
        if (allProbes.length != existingProbes.length) {
            collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
        } else {
            // Check the positions against the new ones
            boolean areTheyTheSame = true;
            for (int p = 0; p < allProbes.length; p++) {
                if (allProbes[p].packedPosition() != existingProbes[p].packedPosition()) {
                    areTheyTheSame = false;
                    break;
                }
            }
            if (areTheyTheSame) {
                allProbes = existingProbes;
            } else {
                collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
            }
        }
    }
    // If we're correcting for DNA contamination we need to work out the average density of
    // reads in intergenic regions
    float[] dnaDensityPerKb = new float[data.length];
    int[] correctedTotalCounts = new int[data.length];
    if (correctDNAContamination) {
        // We need to make interstitial probes to the set we already have, ignoring those at the end of chromosomes
        Vector<Probe> intergenicProbes = new Vector<Probe>();
        Chromosome lastChr = allProbes[0].chromosome();
        for (int p = 1; p < allProbes.length; p++) {
            if (allProbes[p].chromosome() != lastChr) {
                lastChr = allProbes[p].chromosome();
                continue;
            }
            // See if there's a gap back to the last probe
            if (allProbes[p].start() > allProbes[p - 1].end()) {
                if (allProbes[p].start() - allProbes[p - 1].end() < 1000) {
                    // Don't bother with really short probes
                    continue;
                }
                intergenicProbes.add(new Probe(lastChr, allProbes[p - 1].end() + 1, allProbes[p].start() - 1));
            }
        }
        Probe[] allIntergenicProbes = intergenicProbes.toArray(new Probe[0]);
        for (int d = 0; d < data.length; d++) {
            progressUpdated("Quantitating DNA contamination", 1, 2);
            float[] densities = new float[allIntergenicProbes.length];
            for (int p = 0; p < allIntergenicProbes.length; p++) {
                densities[p] = data[d].getReadsForProbe(allIntergenicProbes[p]).length / (allIntergenicProbes[p].length() / 1000f);
            }
            dnaDensityPerKb[d] = SimpleStats.median(densities);
        }
        // Work out adjusted total counts having subtracted the DNA contamination
        for (int d = 0; d < data.length; d++) {
            int predictedContamination = (int) (dnaDensityPerKb[d] * (SeqMonkApplication.getInstance().dataCollection().genome().getTotalGenomeLength() / 1000));
            int correctedTotalReadCount = data[d].getTotalReadCount() - predictedContamination;
            correctedTotalCounts[d] = correctedTotalReadCount;
        }
        // Halve the density if they're doing a directional quantitation
        if (optionsPanel.isDirectional()) {
            for (int i = 0; i < dnaDensityPerKb.length; i++) {
                dnaDensityPerKb[i] /= 2;
            }
        }
        // Halve the density if the libraries are paired end
        if (pairedEnd) {
            for (int i = 0; i < dnaDensityPerKb.length; i++) {
                dnaDensityPerKb[i] /= 2;
            }
        }
    }
    // If we're correcting for duplication we need to work out the modal count depth in
    // intergenic regions
    int[] modalDuplicationLevels = new int[data.length];
    if (correctDuplication) {
        for (int d = 0; d < data.length; d++) {
            progressUpdated("Quantitating DNA duplication", 1, 2);
            // We're not going to look at depths which are > 200.  If it's that duplicated
            // then there's no point continuing anyway.
            int[] depthCount = new int[200];
            for (int p = 0; p < allProbes.length; p++) {
                long[] reads = data[d].getReadsForProbe(allProbes[p]);
                int currentCount = 0;
                for (int r = 1; r < reads.length; r++) {
                    if (reads[r] == reads[r - 1]) {
                        ++currentCount;
                    } else {
                        if (currentCount > 0 && currentCount < 200) {
                            ++depthCount[currentCount];
                        }
                        currentCount = 1;
                    }
                }
            }
            // Find the modal depth in intergenic regions. This is the best estimate
            // of duplication
            // Since unique reads turn up all over the place even in duplicated
            // data we say that if unique reads are higher than the sum of 2-10 there
            // is no duplication
            int twoTenSum = 0;
            for (int i = 2; i <= 10; i++) {
                twoTenSum += depthCount[i];
            }
            if (depthCount[1] > twoTenSum) {
                modalDuplicationLevels[d] = 1;
            } else {
                int highestDepth = 0;
                int bestDupGuess = 1;
                for (int i = 2; i < depthCount.length; i++) {
                    // System.err.println("For depth "+i+" count was "+depthCount[i]);
                    if (depthCount[i] > highestDepth) {
                        bestDupGuess = i;
                        highestDepth = depthCount[i];
                    }
                }
                modalDuplicationLevels[d] = bestDupGuess;
            }
        }
    }
    // Having made probes we now need to quantitate them.  We'll fetch the
    // probes overlapping each sub-feature and then aggregate these together
    // to get the final quantitation.
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    int currentIndex = 0;
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
        Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
        Arrays.sort(features);
        FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
        int[] readLengths = new int[data.length];
        for (int d = 0; d < data.length; d++) {
            readLengths[d] = data[d].getMaxReadLength();
            // actual length.
            if (pairedEnd) {
                readLengths[d] *= 2;
            }
        }
        for (int f = 0; f < mergedTranscripts.length; f++) {
            Location[] subLocations = mergedTranscripts[f].getSubLocations();
            int totalLength = 0;
            // Find the total length of all of the exons
            for (int s = 0; s < subLocations.length; s++) {
                totalLength += subLocations[s].length();
            }
            for (int d = 0; d < data.length; d++) {
                if (cancel) {
                    progressCancelled();
                    return;
                }
                long totalCount = 0;
                for (int s = 0; s < subLocations.length; s++) {
                    long[] reads = data[d].getReadsForProbe(new Probe(chrs[c], subLocations[s].start(), subLocations[s].end()));
                    for (int r = 0; r < reads.length; r++) {
                        if (!readFilter.useRead(subLocations[s], reads[r])) {
                            continue;
                        }
                        int overlap = (Math.min(subLocations[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocations[s].start(), SequenceRead.start(reads[r]))) + 1;
                        totalCount += overlap;
                    }
                }
                // Now we correct the count by the total length of reads in the data and by
                // the length of the split parts of the probe, and assign this to the probe.
                // As we're correcting for read length then we work out the whole number of
                // reads which this count could comprise, rounding down to a whole number.
                totalCount /= readLengths[d];
                // We can now subtract the DNA contamination prediction.
                if (correctDNAContamination) {
                    int predictedContamination = (int) ((totalLength / 1000f) * dnaDensityPerKb[d]);
                    totalCount -= predictedContamination;
                    // Makes no sense to have negative counts
                    if (totalCount < 0)
                        totalCount = 0;
                }
                // ..and we can divide by the duplication level if we know it.
                if (correctDuplication) {
                    totalCount /= modalDuplicationLevels[d];
                }
                // System.err.println("Total read count for "+mergedTranscripts[f].name+" is "+totalCount);
                float value = totalCount;
                if (value == 0 && noValueForZeroCounts) {
                    value = Float.NaN;
                }
                // If we're log transforming then we need to set zero values to 0.9
                if (logTransform && value == 0 && !noValueForZeroCounts) {
                    value = 0.9f;
                }
                // been asked to.
                if (applyTranscriptLengthCorrection) {
                    value /= (totalLength / 1000f);
                }
                // We also correct by the total read count
                if (!rawCounts) {
                    // System.err.println("True total is "+data[d].getTotalReadCount()+" corrected total is "+correctedTotalCounts[d]);
                    // If these libraries are paired end then the total number of
                    // reads is also effectively halved.
                    float totalReadCount;
                    // calculated this already, but otherwise we'll take the total count (total length/read length)
                    if (correctDNAContamination) {
                        totalReadCount = correctedTotalCounts[d];
                    } else {
                        totalReadCount = data[d].getTotalReadLength() / readLengths[d];
                    }
                    // If we're correcting for duplication we divide by the duplication level.
                    if (correctDuplication) {
                        totalReadCount /= modalDuplicationLevels[d];
                    }
                    // Finally we work out millions of reads (single end) or fragments (paired end)
                    if (pairedEnd) {
                        totalReadCount /= 2000000f;
                    } else {
                        totalReadCount /= 1000000f;
                    }
                    // Lastly we divide the value by the total millions of reads to get the globally corrected count.
                    value /= totalReadCount;
                }
                // Finally we do the log transform if we've been asked to
                if (logTransform) {
                    value = (float) Math.log(value) / log2;
                }
                data[d].setValueForProbe(allProbes[currentIndex], value);
            }
            currentIndex++;
        }
    }
    collection().probeSet().setCurrentQuantitation(getQuantitationDescription(mergeTranscripts, applyTranscriptLengthCorrection, correctDNAContamination, logTransform, rawCounts));
    // If we estimated any parameters let's report them.
    if (correctDNAContamination || correctDuplication) {
        float[] dna = null;
        if (correctDNAContamination) {
            dna = dnaDensityPerKb;
        }
        int[] dup = null;
        if (correctDuplication) {
            dup = modalDuplicationLevels;
        }
        RNASeqParametersModel model = new RNASeqParametersModel(data, dna, dup);
        ReportTableDialog report = new ReportTableDialog(SeqMonkApplication.getInstance(), new Report(null, null) {

            @Override
            public void run() {
            }

            @Override
            public String name() {
                return "RNA-Seq parameter";
            }

            @Override
            public boolean isReady() {
                return true;
            }

            @Override
            public JPanel getOptionsPanel() {
                return null;
            }

            @Override
            public void generateReport() {
            }
        }, model);
    }
    quantitatonComplete();
}
Also used : JPanel(javax.swing.JPanel) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) ReportTableDialog(uk.ac.babraham.SeqMonk.Displays.Report.ReportTableDialog) QuantitationStrandType(uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType) Vector(java.util.Vector) Report(uk.ac.babraham.SeqMonk.Reports.Report) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 92 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class ZScoreScatterPlotPanel method calculateNonredundantSet.

/**
 * This collapses individual points which are over the same
 * pixel when redrawing the plot at a different scale
 */
private synchronized void calculateNonredundantSet() {
    closestPoint = null;
    ProbePairValue[][] grid = new ProbePairValue[getWidth()][getHeight()];
    try {
        for (int p = 0; p < probes.length; p++) {
            /* x value is the mean intensity */
            float xValue = (xStore.getValueForProbe(probes[p]) + yStore.getValueForProbe(probes[p])) / 2;
            /* y value is the z-score */
            float yValue = probeZScoreLookupTable.get(probes[p]).floatValue();
            if (Float.isNaN(xValue) || Float.isInfinite(xValue) || Float.isNaN(yValue) || Float.isInfinite(yValue)) {
                continue;
            }
            int x = getX(xValue);
            int y = getY(yValue);
            if (grid[x][y] == null) {
                grid[x][y] = new ProbePairValue(xValue, yValue, x, y);
                grid[x][y].setProbe(probes[p]);
            } else {
                // belong to
                if (subLists == null)
                    grid[x][y].count++;
                // As we have multiple probes at this point we remove the
                // specific probe annotation.
                grid[x][y].setProbe(null);
            }
        }
        if (subLists != null) {
            for (int s = 0; s < subLists.length; s++) {
                Probe[] subListProbes = subLists[s].getAllProbes();
                for (int p = 0; p < subListProbes.length; p++) {
                    // TODO: this needs to be sorted out properly - it's to do with the filtering out of NA probes
                    if (probeZScoreLookupTable.containsKey(subListProbes[p])) {
                        // System.err.println("p = " + p + ", name = " + subListProbes[p].name());
                        /* x value is the mean intensity */
                        float xValue = (xStore.getValueForProbe(subListProbes[p]) + yStore.getValueForProbe(subListProbes[p])) / 2;
                        /* y value is the z-score */
                        float yValue = probeZScoreLookupTable.get(subListProbes[p]).floatValue();
                        // System.err.println("for " + probes[p].name() + ", xValue = " + xValue+ ", yValue = " + yValue);
                        int x = getX(xValue);
                        int y = getY(yValue);
                        if (grid[x][y] == null) {
                            // This messes up where we catch it in the middle of a redraw
                            continue;
                        // throw new IllegalArgumentException("Found subList position not in main list");
                        }
                        // 1 = no list so 2 is the lowest sublist index
                        grid[x][y].count = s + 2;
                    }
                }
            }
        }
    } catch (SeqMonkException e) {
        throw new IllegalStateException(e);
    }
    // Now we need to put all of the ProbePairValues into
    // a single array;
    int count = 0;
    for (int x = 0; x < grid.length; x++) {
        for (int y = 0; y < grid[x].length; y++) {
            if (grid[x][y] != null)
                count++;
        }
    }
    ProbePairValue[] nonred = new ProbePairValue[count];
    count--;
    for (int x = 0; x < grid.length; x++) {
        for (int y = 0; y < grid[x].length; y++) {
            if (grid[x][y] != null) {
                nonred[count] = grid[x][y];
                count--;
            }
        }
    }
    Arrays.sort(nonred);
    // Work out the 95% percentile count
    int minCount = 1;
    int maxCount = 2;
    if (nonred.length > 0) {
        minCount = nonred[0].count;
        maxCount = nonred[((nonred.length - 1) * 95) / 100].count;
    }
    // Go through every nonred assigning a suitable colour
    ColourGradient gradient = new HotColdColourGradient();
    for (int i = 0; i < nonred.length; i++) {
        if (subLists == null) {
            nonred[i].color = gradient.getColor(nonred[i].count, minCount, maxCount);
        } else {
            if (nonred[i].count > subLists.length + 1) {
                throw new IllegalArgumentException("Count above threshold when showing sublists");
            }
            if (nonred[i].count == 1) {
                nonred[i].color = Color.LIGHT_GRAY;
            } else {
                nonred[i].color = ColourIndexSet.getColour(nonred[i].count - 2);
            }
        }
    }
    nonRedundantValues = nonred;
    lastNonredWidth = getWidth();
    lastNonredHeight = getHeight();
}
Also used : HotColdColourGradient(uk.ac.babraham.SeqMonk.Gradients.HotColdColourGradient) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) ColourGradient(uk.ac.babraham.SeqMonk.Gradients.ColourGradient) HotColdColourGradient(uk.ac.babraham.SeqMonk.Gradients.HotColdColourGradient)

Example 93 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class IntensityReplicateFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
protected void generateProbeList() {
    applyMultipleTestingCorrection = optionsPanel.multipleTestingBox.isSelected();
    Probe[] probes = startingList.getAllProbes();
    // We need to work out how many probes are going to be put into
    // each sub-distribution we calculate.  The rule is going to be that
    // we use 1% of the total, or 100 probes whichever is the higher
    probesPerSet = probes.length / 100;
    if (probesPerSet < 100)
        probesPerSet = 100;
    if (probesPerSet > probes.length)
        probesPerSet = probes.length;
    ProbeList newList = new ProbeList(startingList, "Filtered Probes", "", "Diff p-value");
    // We'll build up a set of p-values as we go along
    float[] lowestPValues = new float[probes.length];
    for (int p = 0; p < lowestPValues.length; p++) {
        lowestPValues[p] = 1;
    }
    // These arrays are going to hold the real SDs for the replicate
    // sets we're going to analyse.
    double[] realSDFrom = new double[probes.length];
    double[] realSDTo = new double[probes.length];
    // These arrays are going to hold the averaged SDs for the replicate
    // sets we're going to analyse.
    double[] averageSDFrom = new double[probes.length];
    double[] averageSDTo = new double[probes.length];
    // This is going to be the temporary array we populate with the set of
    // differences we are going to analyse.
    double[] localSDset = new double[probesPerSet];
    // First work out the set of comparisons we're going to make
    Vector<SingleComparison> comparisons = new Vector<IntensityReplicateFilter.SingleComparison>();
    for (int fromIndex = 0; fromIndex < fromStores.length; fromIndex++) {
        for (int toIndex = 0; toIndex < toStores.length; toIndex++) {
            if (fromStores[fromIndex] == toStores[toIndex])
                continue;
            // If we can find the fromStore in the toStores we've already done and the
            // toStore anywhere in the fromStores then we can skip this.
            boolean canSkip = false;
            for (int i = 0; i < fromIndex; i++) {
                if (fromStores[i] == toStores[toIndex]) {
                    for (int j = 0; j < toStores.length; j++) {
                        if (toStores[j] == fromStores[fromIndex]) {
                            canSkip = true;
                            break;
                        }
                    }
                    break;
                }
            }
            if (canSkip)
                continue;
            comparisons.add(new SingleComparison(fromIndex, toIndex));
        }
    }
    for (int comparisonIndex = 0; comparisonIndex < comparisons.size(); comparisonIndex++) {
        int fromIndex = comparisons.elementAt(comparisonIndex).fromIndex;
        int toIndex = comparisons.elementAt(comparisonIndex).toIndex;
        // We need to generate a set of probe indices ordered by their average intensity
        Integer[] fromIndices = new Integer[probes.length];
        Integer[] toIndices = new Integer[probes.length];
        for (int i = 0; i < probes.length; i++) {
            fromIndices[i] = i;
            toIndices[i] = i;
        }
        // This isn't ideal.  We're sorting the probes by average intensity which puts together
        // probes which should probably have different standard deviations.  It would be better
        // to sort on the intensities in each data store separately, but we get such a problem
        // from very low values contaminating the set by giving artificially low average SDs that
        // I don't think we can get away with this.
        Comparator<Integer> fromComp = new AveragePairedIntensityComparator(fromStores[fromIndex], toStores[toIndex], probes);
        Comparator<Integer> toComp = new AveragePairedIntensityComparator(toStores[toIndex], fromStores[fromIndex], probes);
        // Comparator<Integer> fromComp = new AverageIntensityComparator(fromStores[fromIndex], probes);
        // Comparator<Integer> toComp = new AverageIntensityComparator(fromStores[fromIndex], probes);
        Arrays.sort(fromIndices, fromComp);
        Arrays.sort(toIndices, toComp);
        // We also need to get the raw SDs for the two replicate sets
        double[] fromValues = new double[fromStores[fromIndex].dataStores().length];
        double[] toValues = new double[toStores[toIndex].dataStores().length];
        try {
            for (int p = 0; p < probes.length; p++) {
                for (int f = 0; f < fromValues.length; f++) {
                    fromValues[f] = fromStores[fromIndex].dataStores()[f].getValueForProbe(probes[p]);
                }
                for (int t = 0; t < toValues.length; t++) {
                    toValues[t] = toStores[toIndex].dataStores()[t].getValueForProbe(probes[p]);
                }
                realSDFrom[p] = SimpleStats.stdev(fromValues);
                realSDTo[p] = SimpleStats.stdev(toValues);
            }
        } catch (SeqMonkException sme) {
            progressExceptionReceived(sme);
        }
        progressUpdated("Made " + comparisonIndex + " out of " + comparisons.size() + " comparisons", comparisonIndex, comparisons.size());
        IndexTTestValue[] currentPValues = new IndexTTestValue[probes.length];
        for (int i = 0; i < probes.length; i++) {
            if (cancel) {
                cancel = false;
                progressCancelled();
                return;
            }
            if (i % 1000 == 0) {
                int progress = (i * 100) / probes.length;
                progress += 100 * comparisonIndex;
                progressUpdated("Made " + comparisonIndex + " out of " + comparisons.size() + " comparisons", progress, comparisons.size() * 100);
            }
            // We need to make up the set of SDs to represent this probe
            int startingIndex = i - (probesPerSet / 2);
            if (startingIndex < 0)
                startingIndex = 0;
            if (startingIndex + probesPerSet >= probes.length)
                startingIndex = probes.length - probesPerSet;
            for (int j = startingIndex; j < startingIndex + probesPerSet; j++) {
                localSDset[j - startingIndex] = realSDFrom[fromIndices[j]];
            }
            // averageSDFrom[fromIndices[i]] = SimpleStats.percentile(localSDset,90);
            averageSDFrom[fromIndices[i]] = SimpleStats.mean(localSDset);
            for (int j = startingIndex; j < startingIndex + probesPerSet; j++) {
                localSDset[j - startingIndex] = realSDTo[toIndices[j]];
            }
            // averageSDTo[toIndices[i]] = SimpleStats.percentile(localSDset,90);
            averageSDTo[toIndices[i]] = SimpleStats.mean(localSDset);
        }
        for (int p = 0; p < probes.length; p++) {
            double fromValue = 0;
            double toValue = 0;
            try {
                fromValue = fromStores[fromIndex].getValueForProbe(probes[p]);
                toValue = toStores[toIndex].getValueForProbe(probes[p]);
            } catch (SeqMonkException sme) {
            }
            double fromSD = Math.max(realSDFrom[p], averageSDFrom[p]);
            double toSD = Math.max(realSDTo[p], averageSDTo[p]);
            // double fromSD = averageSDFrom[p];
            // double toSD = averageSDTo[p];
            currentPValues[p] = new IndexTTestValue(p, TTest.calculatePValue(fromValue, toValue, fromSD, toSD, fromStores[fromIndex].dataStores().length, toStores[toIndex].dataStores().length));
        // System.err.println("P value was "+currentPValues[p].p+" from "+fromValue+" "+toValue+" "+fromSD+" "+toSD+" "+fromStores[fromIndex].dataStores().length+" "+toStores[toIndex].dataStores().length);
        }
        // We now need to correct the set of pValues
        if (applyMultipleTestingCorrection) {
            BenjHochFDR.calculateQValues(currentPValues);
        }
        // the combined set
        if (applyMultipleTestingCorrection) {
            for (int i = 0; i < currentPValues.length; i++) {
                if (currentPValues[i].q < lowestPValues[currentPValues[i].index]) {
                    lowestPValues[currentPValues[i].index] = (float) currentPValues[i].q;
                }
            }
        } else {
            for (int i = 0; i < currentPValues.length; i++) {
                if (currentPValues[i].p < lowestPValues[currentPValues[i].index]) {
                    lowestPValues[currentPValues[i].index] = (float) currentPValues[i].p;
                }
            }
        }
    }
    // pass the filter.
    for (int i = 0; i < lowestPValues.length; i++) {
        if (lowestPValues[i] < pValueLimit) {
            newList.addProbe(probes[i], lowestPValues[i]);
        }
    }
    filterFinished(newList);
}
Also used : ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) IndexTTestValue(uk.ac.babraham.SeqMonk.Analysis.Statistics.IndexTTestValue)

Example 94 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class LogisticRegressionSplicingFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
@Override
protected void generateProbeList() {
    // We need to make a temporary directory, save the data into it, write out the R script
    // and then run it an collect the list of results, then clean up.
    // Make up the list of DataStores in each replicate set
    DataStore[] fromStores = replicateSets[0].dataStores();
    DataStore[] toStores = replicateSets[1].dataStores();
    File tempDir;
    try {
        Probe[] probes = startingList.getAllProbes();
        probes = filterProbesByCount(probes, fromStores, toStores);
        progressUpdated("Pairing Probes", 0, 1);
        // Make pairs of probes to test
        ProbePair[] pairs = makeProbePairs(probes, fromStores, toStores);
        System.err.println("Found " + pairs.length + " pairs to test");
        progressUpdated("Creating temp directory", 0, 1);
        tempDir = TempDirectory.createTempDirectory();
        System.err.println("Temp dir is " + tempDir.getAbsolutePath());
        progressUpdated("Writing R script", 0, 1);
        // Get the template script
        Template template = new Template(ClassLoader.getSystemResource("uk/ac/babraham/SeqMonk/Filters/LogisticRegressionFilter/logistic_regression_template.r"));
        // Substitute in the variables we need to change
        template.setValue("WORKING", tempDir.getAbsolutePath().replace("\\", "/"));
        template.setValue("PVALUE", "" + pValueCutoff);
        template.setValue("CORRECTCOUNT", "" + pairs.length);
        if (multiTest) {
            template.setValue("MULTITEST", "TRUE");
        } else {
            template.setValue("MULTITEST", "FALSE");
        }
        // Write the script file
        File scriptFile = new File(tempDir.getAbsoluteFile() + "/script.r");
        PrintWriter pr = new PrintWriter(scriptFile);
        pr.print(template.toString());
        pr.close();
        // Write the count data
        // Sort these so we can get probes from the same chromosome together
        Arrays.sort(probes);
        pr = null;
        String lastChr = "";
        for (int p = 0; p < pairs.length; p++) {
            if (!pairs[p].probe1.chromosome().name().equals(lastChr)) {
                if (pr != null)
                    pr.close();
                File outFile = new File(tempDir.getAbsoluteFile() + "/data_chr" + pairs[p].probe1.chromosome().name() + ".txt");
                pr = new PrintWriter(outFile);
                lastChr = pairs[p].probe1.chromosome().name();
                pr.println("id\tgroup\treplicate\tstate\tcount");
            }
            if (p % 1000 == 0) {
                progressUpdated("Writing data for chr" + lastChr, p, probes.length);
            }
            int[] fromProbe1Counts = new int[fromStores.length];
            int[] fromProbe2Counts = new int[fromStores.length];
            int[] toProbe1Counts = new int[toStores.length];
            int[] toProbe2Counts = new int[toStores.length];
            for (int i = 0; i < fromStores.length; i++) {
                // TODO: For the moment we'll expect they've quantitated this themselves
                fromProbe1Counts[i] = (int) fromStores[i].getValueForProbe(pairs[p].probe1);
                fromProbe2Counts[i] = (int) fromStores[i].getValueForProbe(pairs[p].probe2);
            }
            for (int i = 0; i < toStores.length; i++) {
                // TODO: For the moment we'll expect they've quantitated this themselves
                toProbe1Counts[i] = (int) toStores[i].getValueForProbe(pairs[p].probe1);
                toProbe2Counts[i] = (int) toStores[i].getValueForProbe(pairs[p].probe2);
            }
            for (int i = 0; i < fromProbe1Counts.length; i++) {
                pr.println(p + "\tfrom\tfrom" + i + "\tmeth\t" + fromProbe1Counts[i]);
                pr.println(p + "\tfrom\tfrom" + i + "\tunmeth\t" + fromProbe2Counts[i]);
            }
            for (int i = 0; i < toProbe1Counts.length; i++) {
                pr.println(p + "\tto\tto" + i + "\tmeth\t" + toProbe1Counts[i]);
                pr.println(p + "\tto\tto" + i + "\tunmeth\t" + toProbe2Counts[i]);
            }
        }
        if (pr != null)
            pr.close();
        progressUpdated("Running R Script", 0, 1);
        RScriptRunner runner = new RScriptRunner(tempDir);
        RProgressListener listener = new RProgressListener(runner);
        runner.addProgressListener(new ProgressRecordDialog("R Session", runner));
        runner.runScript();
        while (true) {
            if (listener.cancelled()) {
                progressCancelled();
                pr.close();
                return;
            }
            if (listener.exceptionReceived()) {
                progressExceptionReceived(new SeqMonkException("R Script failed"));
                pr.close();
                return;
            }
            if (listener.complete())
                break;
            Thread.sleep(500);
        }
        // We can now parse the results and put the hits into a new probe list
        ProbeList newList;
        if (multiTest) {
            newList = new ProbeList(startingList, "", "", "FDR");
        } else {
            newList = new ProbeList(startingList, "", "", "p-value");
        }
        File hitsFile = new File(tempDir.getAbsolutePath() + "/hits.txt");
        BufferedReader br = new BufferedReader(new FileReader(hitsFile));
        String line = br.readLine();
        // In case the same probe is found multiple times we'll cache the p-values
        // we see and report the best one.
        HashMap<Probe, Float> cachedHits = new HashMap<Probe, Float>();
        while ((line = br.readLine()) != null) {
            String[] sections = line.split("\t");
            String[] indexSections = sections[0].split("\\.");
            int probeIndex = Integer.parseInt(indexSections[indexSections.length - 1]);
            float pValue = Float.parseFloat(sections[sections.length - 1]);
            if (!cachedHits.containsKey(pairs[probeIndex].probe1)) {
                cachedHits.put(pairs[probeIndex].probe1, pValue);
            }
            if (!cachedHits.containsKey(pairs[probeIndex].probe2)) {
                cachedHits.put(pairs[probeIndex].probe2, pValue);
            }
            // See if the pvalue we've got is better than the one we're storing
            if (pValue < cachedHits.get(pairs[probeIndex].probe1)) {
                cachedHits.put(pairs[probeIndex].probe1, pValue);
            }
            if (pValue < cachedHits.get(pairs[probeIndex].probe2)) {
                cachedHits.put(pairs[probeIndex].probe2, pValue);
            }
        }
        br.close();
        for (Probe probeHit : cachedHits.keySet()) {
            newList.addProbe(probeHit, cachedHits.get(probeHit));
        }
        runner.cleanUp();
        filterFinished(newList);
    } catch (Exception ioe) {
        progressExceptionReceived(ioe);
        return;
    }
}
Also used : HashMap(java.util.HashMap) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) RProgressListener(uk.ac.babraham.SeqMonk.R.RProgressListener) Template(uk.ac.babraham.SeqMonk.Utilities.Templates.Template) ProgressRecordDialog(uk.ac.babraham.SeqMonk.Dialogs.ProgressRecordDialog) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) PrintWriter(java.io.PrintWriter) ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) BufferedReader(java.io.BufferedReader) File(java.io.File) RScriptRunner(uk.ac.babraham.SeqMonk.R.RScriptRunner)

Example 95 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class MonteCarloFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
protected void generateProbeList() {
    Probe[] probes = toList.getAllProbes();
    ArrayList<Probe> allProbes = new ArrayList<Probe>();
    Probe[] fromProbes = startingList.getAllProbes();
    try {
        for (int i = 0; i < fromProbes.length; i++) {
            allProbes.add(fromProbes[i]);
        }
        int passedCount = 0;
        float targetValue = getValue(probes);
        for (int iteration = 0; iteration < iterationCount; iteration++) {
            if (iteration % 100 == 0) {
                progressUpdated("Performed " + iteration + " iterations", iteration, iterationCount);
            }
            if (cancel) {
                progressCancelled();
                return;
            }
            Probe[] theseProbes = makeRandomProbes(allProbes, probes.length);
            float value = getValue(theseProbes);
            if (value >= targetValue) {
                ++passedCount;
            }
        }
        float pValue = ((float) passedCount) / iterationCount;
        ProbeList newList = new ProbeList(toList, "", "", "P-value");
        for (int i = 0; i < probes.length; i++) {
            newList.addProbe(probes[i], pValue);
        }
        filterFinished(newList);
    } catch (SeqMonkException sme) {
        progressExceptionReceived(sme);
        return;
    }
}
Also used : ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) ArrayList(java.util.ArrayList) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)

Aggregations

Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)125 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)54 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)52 Vector (java.util.Vector)48 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)47 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)26 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)21 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)20 HashSet (java.util.HashSet)9 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)9 File (java.io.File)8 PrintWriter (java.io.PrintWriter)8 ProbeTTestValue (uk.ac.babraham.SeqMonk.Analysis.Statistics.ProbeTTestValue)8 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)7 HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore)7 BufferedReader (java.io.BufferedReader)6 FileReader (java.io.FileReader)6 Hashtable (java.util.Hashtable)6 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)6 HiCHitCollection (uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection)6