Search in sources :

Example 86 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class GeneSetIntensityDifferenceFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
protected void generateProbeList() {
    try {
        applyMultipleTestingCorrection = optionsPanel.multipleTestingBox.isSelected();
        calculateCustomRegression = optionsPanel.calculateCustomRegressionBox.isSelected();
        if (calculateCustomRegression == false) {
            customRegressionValues = null;
        }
        if (calculateLinearRegression) {
            simpleRegression = new SimpleRegression();
        }
        Probe[] allProbes = startingList.getAllProbes();
        // We're not allowing multiple comparisons - this is a bit of a messy workaround so it's compatible with other methods.
        fromStore = fromStores[0];
        toStore = toStores[0];
        ArrayList<Probe> probeArrayList = new ArrayList<Probe>();
        int NaNcount = 0;
        // remove the invalid probes - the ones without a value
        for (int i = 0; i < allProbes.length; i++) {
            if ((Float.isNaN(fromStore.getValueForProbe(allProbes[i]))) || (Float.isNaN(toStore.getValueForProbe(allProbes[i])))) {
                NaNcount++;
            } else {
                probeArrayList.add(allProbes[i]);
            }
        }
        System.err.println("Found " + NaNcount + " probes that were invalid.");
        probes = probeArrayList.toArray(new Probe[0]);
        if (calculateCustomRegression == true) {
            customRegressionValues = new float[2][probes.length];
        }
        // We'll pull the number of probes to sample from the preferences if they've changed it
        Integer updatedProbesPerSet = optionsPanel.probesPerSet();
        if (updatedProbesPerSet != null)
            probesPerSet = updatedProbesPerSet;
        // we want a set of z-scores using the local distribution.
        probeZScoreLookupTable = new Hashtable<Probe, Double>();
        // Put something in the progress whilst we're ordering the probe values to make the comparison.
        progressUpdated("Generating background model", 0, 1);
        Comparator<Integer> comp = new AverageIntensityComparator(fromStore, toStore, probes);
        // We need to generate a set of probe indices that can be ordered by their average intensity
        Integer[] indices = new Integer[probes.length];
        for (int i = 0; i < probes.length; i++) {
            indices[i] = i;
            /* add the data to the linear regression object */
            if (calculateLinearRegression) {
                simpleRegression.addData((double) fromStore.getValueForProbe(probes[i]), (double) toStore.getValueForProbe(probes[i]));
            }
        }
        if (calculateLinearRegression) {
            System.out.println("intercept = " + simpleRegression.getIntercept() + ", slope = " + simpleRegression.getSlope());
        }
        Arrays.sort(indices, comp);
        /* This holds the indices to get the deduplicated probe from the original list of probes */
        deduplicatedIndices = new Integer[probes.length];
        /* The number of probes with different values */
        int dedupProbeCounter = 0;
        // the probes deduplicated by value
        ArrayList<Probe> deduplicatedProbes = new ArrayList<Probe>();
        // populate the first one so that we have something to compare to in the loop
        deduplicatedIndices[0] = 0;
        deduplicatedProbes.add(probes[indices[0]]);
        progressUpdated("Made 0 of 1 comparisons", 0, 1);
        for (int i = 1; i < indices.length; i++) {
            /* indices have been sorted, now we need to check whether adjacent pair values are identical */
            if ((fromStore.getValueForProbe(probes[indices[i]]) == fromStore.getValueForProbe(probes[indices[i - 1]])) && (toStore.getValueForProbe(probes[indices[i]]) == toStore.getValueForProbe(probes[indices[i - 1]]))) {
                /* If they are identical, do not add the probe to the deduplicatedProbes object, but have a reference for it so we can look up which deduplicated probe and 
					 * therefore which distribution slice to use for the duplicated probe. */
                deduplicatedIndices[i] = dedupProbeCounter;
            } else {
                deduplicatedProbes.add(probes[indices[i]]);
                dedupProbeCounter++;
                deduplicatedIndices[i] = dedupProbeCounter;
            }
        }
        Probe[] dedupProbes = deduplicatedProbes.toArray(new Probe[0]);
        // make sure we're not trying to use more probes than we've got in the analysis
        if (probesPerSet > dedupProbes.length) {
            probesPerSet = dedupProbes.length;
        }
        System.out.println("total number of probe values = " + probes.length);
        System.out.println("number of deduplicated probe values = " + dedupProbes.length);
        System.out.println("probesPerSet = " + probesPerSet);
        // I want this to contain all the differences, then from that I'm going to get the z-scores.
        double[] currentDiffSet = new double[probesPerSet];
        for (int i = 0; i < indices.length; i++) {
            if (cancel) {
                cancel = false;
                progressCancelled();
                return;
            }
            if (i % 1000 == 0) {
                int progress = (i * 100) / indices.length;
                // progress += 100*comparisonIndex;
                progressUpdated("Made 0 out of 1 comparisons", progress, 100);
            }
            /**
             * There are +1s in here because we skip over j when j == startingIndex.
             */
            // We need to make up the set of differences to represent this probe
            int startingIndex = deduplicatedIndices[i] - (probesPerSet / 2);
            if (startingIndex < 0)
                startingIndex = 0;
            if (startingIndex + (probesPerSet + 1) >= dedupProbes.length)
                startingIndex = dedupProbes.length - (probesPerSet + 1);
            try {
                for (int j = startingIndex; j < startingIndex + (probesPerSet + 1); j++) {
                    if (j == startingIndex) {
                        // Don't include the point being tested in the background model
                        continue;
                    }
                    double diff;
                    if (calculateLinearRegression == true) {
                        if (j > dedupProbes.length) {
                            System.err.println(" j is too big, it's " + j + " and dedupProbes.length = " + dedupProbes.length + ", starting index = " + startingIndex);
                        }
                        double x = fromStore.getValueForProbe(dedupProbes[j]);
                        double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
                        diff = toStore.getValueForProbe(dedupProbes[j]) - expectedY;
                    } else {
                        diff = toStore.getValueForProbe(dedupProbes[j]) - fromStore.getValueForProbe(dedupProbes[j]);
                    }
                    if (j < startingIndex) {
                        currentDiffSet[j - startingIndex] = diff;
                    } else {
                        currentDiffSet[(j - startingIndex) - 1] = diff;
                    }
                }
                if (calculateCustomRegression == true) {
                    // the average/ kind of centre line
                    float z = ((fromStore.getValueForProbe(probes[indices[i]]) + toStore.getValueForProbe(probes[indices[i]])) / 2);
                    customRegressionValues[0][indices[i]] = z - ((float) SimpleStats.mean(currentDiffSet) / 2);
                    customRegressionValues[1][indices[i]] = z + ((float) SimpleStats.mean(currentDiffSet) / 2);
                }
                double mean = 0;
                // Get the difference for this point
                double diff;
                if (calculateLinearRegression == true) {
                    double x = fromStore.getValueForProbe(probes[indices[i]]);
                    double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
                    diff = toStore.getValueForProbe(probes[indices[i]]) - expectedY;
                } else if (calculateCustomRegression == true) {
                    diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
                    mean = SimpleStats.mean(currentDiffSet);
                } else {
                    diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
                }
                double stdev;
                stdev = SimpleStats.stdev(currentDiffSet, mean);
                // if there are no reads in the probe for either of the datasets, should we set the zscore to 0??
                double zScore = (diff - mean) / stdev;
                // modified z score
                // median absolute deviation
                /*		double[] madArray = new double[currentDiffSet.length];
						double median = SimpleStats.median(currentDiffSet);					
						
						for(int d=0; d<currentDiffSet.length; d++){							
							madArray[d] = Math.abs(currentDiffSet[d] - median);							
						}
						
						double mad = SimpleStats.median(madArray);							
						zScore = (0.6745 * (diff - median))/mad;
					}
				*/
                probeZScoreLookupTable.put(probes[indices[i]], zScore);
            } catch (SeqMonkException sme) {
                progressExceptionReceived(sme);
                return;
            }
        }
        // make this an array list as we're kicking out the mapped gene sets that have zscores with variance of 0.
        ArrayList<MappedGeneSetTTestValue> pValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
        MappedGeneSet[] mappedGeneSets = null;
        /* if we're using the gene set from a file, map the gene sets to the probes */
        if (optionsPanel.geneSetsFileRadioButton.isSelected()) {
            GeneSetCollectionParser geneSetCollectionParser = new GeneSetCollectionParser(minGenesInSet, maxGenesInSet);
            GeneSetCollection geneSetCollection = geneSetCollectionParser.parseGeneSetInformation(validGeneSetFilepath);
            MappedGeneSet[] allMappedGeneSets = geneSetCollection.getMappedGeneSets(probes);
            if (allMappedGeneSets == null) {
                JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes could be matched to probes.\nCheck that the gmt file is for the right species. \nTo use gene sets from a file, probe names must contain the gene name. \nTry defining new probes over genes (e.g. using the RNA-seq quantitation pipeline) or use existing probes lists instead of a gene set file.", "No gene sets identified", JOptionPane.ERROR_MESSAGE);
                throw new SeqMonkException("No sets of genes could be matched to probes.\nTo use gene sets from a file, probe names must contain the gene name.\nTry defining new probes over genes or use existing probes lists instead of a gene set file.");
            } else {
                ArrayList<MappedGeneSet> mgsArrayList = new ArrayList<MappedGeneSet>();
                /* get rid of those that have fewer probes in the set than minGenesInSet. We shouldn't exceed maxGenesInSet unless probes have been made over something other than genes */
                for (int i = 0; i < allMappedGeneSets.length; i++) {
                    if (allMappedGeneSets[i].getProbes().length >= minGenesInSet) {
                        mgsArrayList.add(allMappedGeneSets[i]);
                    }
                }
                mappedGeneSets = mgsArrayList.toArray(new MappedGeneSet[0]);
            }
        } else /* or if we're using existing probelists, create mappedGeneSets from them */
        if (optionsPanel.probeListRadioButton.isSelected() && selectedProbeLists != null) {
            mappedGeneSets = new MappedGeneSet[selectedProbeLists.length];
            for (int i = 0; i < selectedProbeLists.length; i++) {
                mappedGeneSets[i] = new MappedGeneSet(selectedProbeLists[i]);
            }
        } else {
            throw new SeqMonkException("Haven't got any genesets to use, shouldn't have got here without having any selected.");
        }
        if (mappedGeneSets == null || mappedGeneSets.length == 0) {
            throw new SeqMonkException("Couldn't map gene sets to the probes, try again with a different probe set");
        } else {
            System.err.println("there are " + mappedGeneSets.length + " mappedGeneSets");
            System.err.println("size of zScore lookup table = " + probeZScoreLookupTable.size());
        }
        System.err.println("total number of mapped gene sets = " + mappedGeneSets.length);
        // deduplicate our mappedGeneSets
        if (optionsPanel.deduplicateGeneSetBox.isSelected()) {
            mappedGeneSets = deduplicateMappedGeneSets(mappedGeneSets);
        }
        System.err.println("deduplicated mapped gene sets = " + mappedGeneSets.length);
        /* 
			 * we need to go through the mapped gene set and get all the values for the matched probes 
			 * 
			 */
        for (int i = 0; i < mappedGeneSets.length; i++) {
            Probe[] geneSetProbes = mappedGeneSets[i].getProbes();
            // to contain all the z-scores for the gene set
            double[] geneSetZscores = new double[geneSetProbes.length];
            // Find the z-scores for each of the probes in the mappedGeneSet
            for (int gsp = 0; gsp < geneSetProbes.length; gsp++) {
                if (probeZScoreLookupTable.containsKey(geneSetProbes[gsp])) {
                    geneSetZscores[gsp] = probeZScoreLookupTable.get(geneSetProbes[gsp]);
                }
            }
            // this is just temporary to check the stats - there's some duplication with this.... is there??
            mappedGeneSets[i].zScores = geneSetZscores;
            if (geneSetZscores.length > 1) {
                // the number of probes in the mappedGeneSet should always be > 1 anyway as the mappedGeneSet shouldn't be created if there are < 2 matched probes.
                double pVal;
                if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("t-test")) {
                    pVal = TTest.calculatePValue(geneSetZscores, 0);
                } else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov test")) {
                    double[] allZscores = getAllZScores(probeZScoreLookupTable);
                    KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
                    pVal = ksTest.kolmogorovSmirnovTest(geneSetZscores, allZscores);
                } else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov non-directional test")) {
                    double[] allZscores = getAllZScores(probeZScoreLookupTable);
                    KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
                    pVal = ksTest.kolmogorovSmirnovTest(convertToAbsoluteValues(geneSetZscores), convertToAbsoluteValues(allZscores));
                } else {
                    throw new IllegalArgumentException("Didn't recognise statistical test " + optionsPanel.statisticalTestBox.getSelectedItem().toString());
                }
                mappedGeneSets[i].meanZScore = SimpleStats.mean(geneSetZscores);
                // check the variance - we don't want variances of 0.
                double stdev = SimpleStats.stdev(geneSetZscores);
                if ((stdev * stdev) < 0.00000000000001) {
                    continue;
                } else // if all the differences between the datasets are 0 then just get rid of them
                if (Double.isNaN(pVal)) {
                    continue;
                } else {
                    pValueArrayList.add(new MappedGeneSetTTestValue(mappedGeneSets[i], pVal));
                }
            } else {
                System.err.println("Fell through the net somewhere, why does the set of zscores contain fewer than 2 values?");
                continue;
            }
        }
        MappedGeneSetTTestValue[] filterResultpValues = pValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
        ArrayList<MappedGeneSetTTestValue> filteredPValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
        // Now we've got all the p values they need to be corrected.
        if (applyMultipleTestingCorrection) {
            BenjHochFDR.calculateQValues(filterResultpValues);
        }
        System.err.println(filterResultpValues.length + " p-values calculated, multtest = " + applyMultipleTestingCorrection + ", pval limit = " + pValueLimit);
        for (int i = 0; i < filterResultpValues.length; i++) {
            double pOrQvalue;
            if (applyMultipleTestingCorrection) {
                pOrQvalue = filterResultpValues[i].q;
            } else {
                pOrQvalue = filterResultpValues[i].p;
            }
            // check whether it passes the p/q-value and z-score cut-offs
            if (optionsPanel.reportAllResults.isSelected()) {
                filteredPValueArrayList.add(filterResultpValues[i]);
            } else {
                if ((pOrQvalue < pValueLimit) && (Math.abs(filterResultpValues[i].mappedGeneSet.meanZScore) > zScoreThreshold)) {
                    filteredPValueArrayList.add(filterResultpValues[i]);
                }
            }
        }
        // convert the ArrayList to MappedGeneSetTTestValue
        filterResultpValues = filteredPValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
        if (filterResultpValues.length == 0) {
            JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes were identified using the selected parameters, \ntry changing the gene sets or relaxing the p-value/z-score thresholds.", "No gene sets identified", JOptionPane.INFORMATION_MESSAGE);
        } else {
            geneSetDisplay = new GeneSetDisplay(dataCollection, listDescription(), fromStore, toStore, probes, probeZScoreLookupTable, filterResultpValues, startingList, customRegressionValues, simpleRegression, // optionsPanel.bidirectionalRadioButton.isSelected());
            false);
            geneSetDisplay.addWindowListener(this);
        }
        // We don't want to save the probe list here, we're bringing up the intermediate display from which probe lists can be saved.
        progressCancelled();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : GeneSetCollection(uk.ac.babraham.SeqMonk.GeneSets.GeneSetCollection) ArrayList(java.util.ArrayList) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) MappedGeneSetTTestValue(uk.ac.babraham.SeqMonk.Analysis.Statistics.MappedGeneSetTTestValue) KolmogorovSmirnovTest(org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GeneSetCollectionParser(uk.ac.babraham.SeqMonk.GeneSets.GeneSetCollectionParser) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SimpleRegression(org.apache.commons.math3.stat.regression.SimpleRegression) MappedGeneSet(uk.ac.babraham.SeqMonk.GeneSets.MappedGeneSet)

Example 87 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class ContigProbeGenerator method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> newProbes = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
        // We'll merge together the reads for all of the selected DataStores and
        // compute a single set of probes which covers all of them.
        ReadsWithCounts[] v = new ReadsWithCounts[selectedStores.length];
        for (int s = 0; s < selectedStores.length; s++) {
            v[s] = selectedStores[s].getReadsForChromosome(chromosomes[c]);
        }
        ReadsWithCounts rawReads = new ReadsWithCounts(v);
        v = null;
        // We now want to convert this list into a non-redundant set of
        // read positions with counts.  If we don't do this then we get
        // appalling performance where we have many reads mapped at the
        // same position
        // Our default is to do all strands at once
        int[] strandsToTry = new int[] { 100 };
        if (separateStrands.isSelected()) {
            strandsToTry = new int[] { Location.FORWARD, Location.REVERSE, Location.UNKNOWN };
        }
        for (int strand = 0; strand < strandsToTry.length; strand++) {
            ReadsWithCounts reads = getNonRedundantReads(rawReads, strandsToTry[strand]);
            if (reads.totalCount() == 0) {
                // System.err.println("Skipping strand "+strandsToTry[strand]+" on chr "+chromosomes[c]);
                continue;
            }
            int strandForNewProbes = Location.UNKNOWN;
            if (strandsToTry.length > 1) {
                strandForNewProbes = strandsToTry[strand];
            }
            int start = -1;
            // We now start a process where we work out at what point we cross the
            // threshold of having more than depthCutoff reads overlapping at any
            // point
            LinkedList<SequenceReadWithCount> currentSet = new LinkedList<SequenceReadWithCount>();
            int currentSetSize = 0;
            for (int r = 0; r < reads.reads.length; r++) {
                // See if we need to quit
                if (cancel) {
                    generationCancelled();
                    return;
                }
                while (currentSetSize > 0 && SequenceRead.end(currentSet.getFirst().read) < SequenceRead.start(reads.reads[r])) {
                    SequenceReadWithCount lastRead = currentSet.removeFirst();
                    currentSetSize -= lastRead.count;
                    if (start > 0 && currentSetSize < depthCutoff) {
                        // We just got to the end of a probe
                        Probe p = new Probe(chromosomes[c], start, SequenceRead.end(lastRead.read), strandForNewProbes);
                        // Check to see if we have a previous probe against which we can check
                        Probe lastProbe = null;
                        if (!newProbes.isEmpty())
                            lastProbe = newProbes.lastElement();
                        // Can we merge?
                        if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.strand() == lastProbe.strand() && p.start() - lastProbe.end() <= distance) {
                            // Remove the last probe from the stored set
                            newProbes.remove(newProbes.size() - 1);
                            // Expand this probe to cover the last one and add it to the stored set
                            newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
                        } else if (lastProbe != null) {
                            // We might still remove this if it's too small
                            if (lastProbe.length() < minSize) {
                                newProbes.remove(newProbes.size() - 1);
                            }
                            // We still need to add the new probe
                            newProbes.add(p);
                        } else {
                            newProbes.add(p);
                        }
                        start = -1;
                    }
                }
                // If there's nothing there already then just add it
                if (currentSetSize == 0) {
                    currentSet.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
                    currentSetSize += reads.counts[r];
                } else // If there are reads in the current set then we need to add this read
                // so that the current set is ordered by the end positions of the
                // reads, with the earliest end first.  We therefore start from the back
                // and work our way to the front, as soon as we see an entry whose end
                // is lower than ours we add ourselves after that
                {
                    // Now we add this read at a position based on its end position
                    ListIterator<SequenceReadWithCount> it = currentSet.listIterator(currentSet.size());
                    while (true) {
                        // If we reach the front of the set then we add ourselves to the front
                        if (!it.hasPrevious()) {
                            currentSet.addFirst(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
                            currentSetSize += reads.counts[r];
                            break;
                        } else {
                            SequenceReadWithCount previousRead = it.previous();
                            if (SequenceRead.end(previousRead.read) < SequenceRead.end(reads.reads[r])) {
                                // We want to add ourselves after this element so backtrack
                                // by one position (which must exist because we just went
                                // past it
                                it.next();
                                it.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
                                currentSetSize += reads.counts[r];
                                break;
                            }
                        }
                    }
                }
                // See if we crossed the threshold for starting a new probe
                if (start < 0 && currentSetSize >= depthCutoff) {
                    start = SequenceRead.start(reads.reads[r]);
                }
            }
            // out of the so far unprocessed reads on this chromosome
            if (start > 0) {
                Probe p = new Probe(chromosomes[c], start, SequenceRead.end(currentSet.getFirst().read), strandForNewProbes);
                // Check to see if we can merge with the last probe made
                Probe lastProbe = null;
                if (!newProbes.isEmpty())
                    lastProbe = newProbes.lastElement();
                // Can we merge?
                if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.start() - lastProbe.end() <= distance) {
                    newProbes.remove(newProbes.size() - 1);
                    newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
                } else if (lastProbe != null) {
                    // We might still remove this if it's too small
                    if (lastProbe.length() < minSize) {
                        newProbes.remove(newProbes.size() - 1);
                    }
                    // final chance
                    if (p.length() > minSize) {
                        newProbes.add(p);
                    }
                } else {
                    // Add the remaining probe if it's big enough.
                    if (p.length() > minSize) {
                        newProbes.add(p);
                    }
                }
            }
        }
    }
    Probe[] finalList = newProbes.toArray(new Probe[0]);
    newProbes.clear();
    ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
    generationComplete(finalSet);
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) ReadsWithCounts(uk.ac.babraham.SeqMonk.DataTypes.Sequence.ReadsWithCounts) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) LinkedList(java.util.LinkedList) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) Vector(java.util.Vector) IntVector(uk.ac.babraham.SeqMonk.Utilities.IntVector) LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector)

Example 88 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class DeduplicationProbeGenerator method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    boolean separateStrands = separateStrandsBox.isSelected();
    int maxDistance = 0;
    if (maxDistanceField.getText().length() > 0) {
        maxDistance = Integer.parseInt(maxDistanceField.getText());
    }
    Vector<Probe> newProbes = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
        Probe[] startingProbes = initialList.getProbesForChromosome(chromosomes[c]);
        // For directional merging we use 3 probes
        Probe currentForward = null;
        Probe currentReverse = null;
        Probe currentUnknown = null;
        // For non-directional merging we use only one
        Probe currentProbe = null;
        // Now we can make the actual probes
        for (int i = 0; i < startingProbes.length; i++) {
            if (cancel) {
                generationCancelled();
                return;
            }
            if (separateStrands) {
                switch(startingProbes[i].strand()) {
                    case (Probe.FORWARD):
                        if (currentForward == null) {
                            currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        } else if (startingProbes[i].start() <= currentForward.end() + maxDistance) {
                            if (startingProbes[i].end() > currentForward.end()) {
                                // Extend the current probe
                                currentForward = new Probe(chromosomes[c], currentForward.start(), startingProbes[i].end(), currentForward.strand(), currentForward.name());
                            }
                        } else {
                            newProbes.add(currentForward);
                            currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        }
                        break;
                    case (Probe.REVERSE):
                        if (currentReverse == null) {
                            currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        } else if (startingProbes[i].start() <= currentReverse.end() + maxDistance) {
                            if (startingProbes[i].end() > currentReverse.end()) {
                                // Extend the current probe
                                currentReverse = new Probe(chromosomes[c], currentReverse.start(), startingProbes[i].end(), currentReverse.strand(), currentReverse.name());
                            }
                        } else {
                            newProbes.add(currentReverse);
                            currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        }
                        break;
                    case (Probe.UNKNOWN):
                        if (currentUnknown == null) {
                            currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        } else if (startingProbes[i].start() <= currentUnknown.end() + maxDistance) {
                            if (startingProbes[i].end() > currentUnknown.end()) {
                                // Extend the current probe
                                currentUnknown = new Probe(chromosomes[c], currentUnknown.start(), startingProbes[i].end(), currentUnknown.strand(), currentUnknown.name());
                            }
                        } else {
                            newProbes.add(currentUnknown);
                            currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                        }
                        break;
                }
            } else {
                if (currentProbe == null) {
                    currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                } else if (startingProbes[i].start() <= currentProbe.end() + maxDistance) {
                    if (startingProbes[i].end() > currentProbe.end() || startingProbes[i].strand() != currentProbe.strand()) {
                        // Update the current probe
                        int usedStrand = currentProbe.strand();
                        if (startingProbes[i].strand() != currentProbe.strand()) {
                            usedStrand = Probe.UNKNOWN;
                        }
                        currentProbe = new Probe(chromosomes[c], currentProbe.start(), Math.max(startingProbes[i].end(), currentProbe.end()), usedStrand, currentProbe.name());
                    }
                } else {
                    newProbes.add(currentProbe);
                    currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
                }
            }
        }
        // At the end of each chromosome we add any probes we have remaining
        if (currentProbe != null)
            newProbes.add(currentProbe);
        if (currentForward != null)
            newProbes.add(currentForward);
        if (currentReverse != null)
            newProbes.add(currentReverse);
        if (currentUnknown != null)
            newProbes.add(currentUnknown);
    }
    Probe[] finalList = newProbes.toArray(new Probe[0]);
    ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
    generationComplete(finalSet);
}
Also used : ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Vector(java.util.Vector)

Example 89 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class FeatureProbeGenerator method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    // We get the problem here that we won't get per chromosome updates, but this should be
    // so quick I don't think we care:
    updateGenerationProgress("Making probes...", 0, 1);
    Probe[] finalList = optionPanel.getProbes();
    if (removeDuplicates) {
        // System.out.println("Removing duplicates from original list of "+finalList.length+" probes");
        finalList = removeDuplicates(finalList);
    // System.out.println("Unique list has "+finalList.length+" probes");
    }
    ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
    generationComplete(finalSet);
}
Also used : ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)

Example 90 with Probe

use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.

the class MacsPeakCaller method run.

public void run() {
    // for (int i=0;i<selectedChIPStores.length;i++) {
    // System.err.println("Selcted ChIP="+selectedChIPStores[i]);
    // }
    // for (int i=0;i<selectedInputStores.length;i++) {
    // System.err.println("Selcted Input="+selectedInputStores[i]);
    // }
    // First find the tag offsets between the watson and crick strands
    // Work out the total average coverage for all of the combined ChIP samples
    long totalChIPCoverage = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
    }
    if (cancel) {
        generationCancelled();
        return;
    }
    double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
    double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
    double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
    System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
    // Now we go through the data to find locations for our high confidence peaks so we can
    // randomly select 1000 of these to use to find the offset between the two strands
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        if (cancel) {
            generationCancelled();
            return;
        }
        // Time for an update
        updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            long totalLength = 0;
            Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    // Check that the overall density over the region falls within our limits
                    totalLength = 0;
                    for (int s = 0; s < selectedChIPStores.length; s++) {
                        long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                        for (int j = 0; j < reads.length; j++) {
                            totalLength += SequenceRead.length(reads[j]);
                        }
                    }
                    if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                        potentialHighConfidencePeaks.add(lastValidProbe);
                    }
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            long totalLength = 0;
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                potentialHighConfidencePeaks.add(lastValidProbe);
            }
        }
    }
    if (potentialHighConfidencePeaks.size() == 0) {
        JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
        generationCancelled();
        return;
    }
    // System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
    // Now we select 1000 random probes from this set
    Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
    Collections.shuffle(Arrays.asList(highConfidencePeaks));
    Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
    for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
        randomHighConfidenceProbes[i] = highConfidencePeaks[i];
    }
    // Now find the average distance between forward / reverse reads in the candidate peaks
    int[] distances = new int[highConfidencePeaks.length];
    // Sort the candidates so we don't do stupid stuff with the cache
    Arrays.sort(randomHighConfidenceProbes);
    for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
    }
    int medianInterStrandDistance = (int) SimpleStats.median(distances);
    if (medianInterStrandDistance < 0)
        medianInterStrandDistance = 0;
    // System.err.println("Median inter strand difference = "+medianInterStrandDistance);
    // Now we find the depth cutoff for overrepresented single tags using a binomial distribution
    int totalReadCount = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalReadCount += selectedChIPStores[i].getTotalReadCount();
    }
    BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
    // We want to know what depth has a chance of less than 1^-5
    int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
    if (redundantThreshold < 1)
        redundantThreshold = 1;
    // System.err.println("Redundancy threshold is "+redundantThreshold);
    // Now we construct a poisson distribution to work out the threshold to use for
    // constructing a full candidate peak set.
    updateGenerationProgress("Counting non-redundant reads", 0, 1);
    // To do this we need to get the full non-redundant length from the whole set
    int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
    // System.err.println("Total non-redundant sequences is "+totalNonRedCount);
    // We need to know the median read length for the data
    int readLength = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
    }
    readLength /= selectedChIPStores.length;
    double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
    PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
    int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
    // System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
    // Now we go back through the whole dataset to do a search for all possible candidate probes
    // We re-use the peak vector we came up with before.
    potentialHighConfidencePeaks.clear();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            // We expand the region we're looking at by the inter-strand distance as we're going to
            // be adjusting the read positions
            Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
            long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
            mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
            SequenceRead.sort(mergedProbeReads);
            int thisProbeOverlapCount = 0;
            for (int i = 0; i < mergedProbeReads.length; i++) {
                if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
                    ++thisProbeOverlapCount;
                }
            }
            if (thisProbeOverlapCount > readCountCutoff) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    potentialHighConfidencePeaks.add(lastValidProbe);
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            potentialHighConfidencePeaks.add(lastValidProbe);
        }
    }
    // Finally we re-filter the peaks we have using local poisson distributions with densities taken
    // from either the input samples (if there are any), or the local region.  The densities are
    // estimated over 1,5 and 10kb around the peak and genome wide and the max of these is taken.
    // If there is no input then the 1kb region is not used.
    Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
    // Work out which stores we're using to validate against.
    DataStore[] validationStores;
    boolean useInput = false;
    double inputCorrection = 1;
    int validationNonRedCount;
    if (selectedInputStores != null && selectedInputStores.length > 0) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        validationStores = selectedInputStores;
        useInput = true;
        // We also need to work out the total number of nonredundant seqences
        // in the input so we can work out a scaling factor so that the densities
        // for input and chip are comparable.
        validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
        inputCorrection = totalNonRedCount / (double) validationNonRedCount;
        System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
    } else {
        validationStores = selectedChIPStores;
        validationNonRedCount = totalNonRedCount;
    }
    Vector<Probe> finalValidatedProbes = new Vector<Probe>();
    for (int p = 0; p < allCandidateProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        if (p % 100 == 0) {
            updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
        }
        // System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
        // We now need to find the maximum read density per 2*bandwidth against which
        // we're going to validate this peak
        // We're going to get all reads within 10kb of the peak, and then we can subselect from there
        int midPoint = allCandidateProbes[p].middle();
        Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        // Get the probes for the largest region
        long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
        // Deduplicate so it's a fair comparison
        // Should we recalculate the redundant threshold based on the input coverage?
        thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
        int region10kbcount = thisRegionReads.length;
        int region5kbcount = 0;
        int region1kbcount = 0;
        // Go through the reads seeing if they fit into the 5 or 1kb regions
        for (int r = 0; r < thisRegionReads.length; r++) {
            if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
                ++region5kbcount;
            if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
                ++region1kbcount;
        }
        // System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
        // Convert to densities per window and ajdust for global coverage
        double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        // Find the highest density to use for the validation
        double highestDensity = globalDensity;
        if (density10kb > highestDensity)
            highestDensity = density10kb;
        if (density5kb > highestDensity)
            highestDensity = density5kb;
        if (useInput && density1kb > highestDensity)
            highestDensity = density1kb;
        // System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
        // Construct a poisson distribution with this density
        PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
        // System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
        // Now check to see if the actual count from this peak is enough to still pass
        long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
        mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
        SequenceRead.sort(mergedProbeReads);
        int thisProbeOverlapCount = 0;
        for (int i = 0; i < mergedProbeReads.length; i++) {
            if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
                ++thisProbeOverlapCount;
            }
        }
        if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
            finalValidatedProbes.add(allCandidateProbes[p]);
        // System.err.println("Adding probe to final set");
        }
    }
    // System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
    ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
    generationComplete(finalSet);
}
Also used : PoissonDistribution(org.apache.commons.math3.distribution.PoissonDistribution) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) BinomialDistribution(org.apache.commons.math3.distribution.BinomialDistribution) Vector(java.util.Vector) LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector)

Aggregations

Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)125 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)54 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)52 Vector (java.util.Vector)48 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)47 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)26 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)21 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)20 HashSet (java.util.HashSet)9 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)9 File (java.io.File)8 PrintWriter (java.io.PrintWriter)8 ProbeTTestValue (uk.ac.babraham.SeqMonk.Analysis.Statistics.ProbeTTestValue)8 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)7 HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore)7 BufferedReader (java.io.BufferedReader)6 FileReader (java.io.FileReader)6 Hashtable (java.util.Hashtable)6 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)6 HiCHitCollection (uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection)6