Search in sources :

Example 1 with MappedGeneSet

use of uk.ac.babraham.SeqMonk.GeneSets.MappedGeneSet in project SeqMonk by s-andrews.

the class GeneSetIntensityDifferenceFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
protected void generateProbeList() {
    try {
        applyMultipleTestingCorrection = optionsPanel.multipleTestingBox.isSelected();
        calculateCustomRegression = optionsPanel.calculateCustomRegressionBox.isSelected();
        if (calculateCustomRegression == false) {
            customRegressionValues = null;
        }
        if (calculateLinearRegression) {
            simpleRegression = new SimpleRegression();
        }
        Probe[] allProbes = startingList.getAllProbes();
        // We're not allowing multiple comparisons - this is a bit of a messy workaround so it's compatible with other methods.
        fromStore = fromStores[0];
        toStore = toStores[0];
        ArrayList<Probe> probeArrayList = new ArrayList<Probe>();
        int NaNcount = 0;
        // remove the invalid probes - the ones without a value
        for (int i = 0; i < allProbes.length; i++) {
            if ((Float.isNaN(fromStore.getValueForProbe(allProbes[i]))) || (Float.isNaN(toStore.getValueForProbe(allProbes[i])))) {
                NaNcount++;
            } else {
                probeArrayList.add(allProbes[i]);
            }
        }
        System.err.println("Found " + NaNcount + " probes that were invalid.");
        probes = probeArrayList.toArray(new Probe[0]);
        if (calculateCustomRegression == true) {
            customRegressionValues = new float[2][probes.length];
        }
        // We'll pull the number of probes to sample from the preferences if they've changed it
        Integer updatedProbesPerSet = optionsPanel.probesPerSet();
        if (updatedProbesPerSet != null)
            probesPerSet = updatedProbesPerSet;
        // we want a set of z-scores using the local distribution.
        probeZScoreLookupTable = new Hashtable<Probe, Double>();
        // Put something in the progress whilst we're ordering the probe values to make the comparison.
        progressUpdated("Generating background model", 0, 1);
        Comparator<Integer> comp = new AverageIntensityComparator(fromStore, toStore, probes);
        // We need to generate a set of probe indices that can be ordered by their average intensity
        Integer[] indices = new Integer[probes.length];
        for (int i = 0; i < probes.length; i++) {
            indices[i] = i;
            /* add the data to the linear regression object */
            if (calculateLinearRegression) {
                simpleRegression.addData((double) fromStore.getValueForProbe(probes[i]), (double) toStore.getValueForProbe(probes[i]));
            }
        }
        if (calculateLinearRegression) {
            System.out.println("intercept = " + simpleRegression.getIntercept() + ", slope = " + simpleRegression.getSlope());
        }
        Arrays.sort(indices, comp);
        /* This holds the indices to get the deduplicated probe from the original list of probes */
        deduplicatedIndices = new Integer[probes.length];
        /* The number of probes with different values */
        int dedupProbeCounter = 0;
        // the probes deduplicated by value
        ArrayList<Probe> deduplicatedProbes = new ArrayList<Probe>();
        // populate the first one so that we have something to compare to in the loop
        deduplicatedIndices[0] = 0;
        deduplicatedProbes.add(probes[indices[0]]);
        progressUpdated("Made 0 of 1 comparisons", 0, 1);
        for (int i = 1; i < indices.length; i++) {
            /* indices have been sorted, now we need to check whether adjacent pair values are identical */
            if ((fromStore.getValueForProbe(probes[indices[i]]) == fromStore.getValueForProbe(probes[indices[i - 1]])) && (toStore.getValueForProbe(probes[indices[i]]) == toStore.getValueForProbe(probes[indices[i - 1]]))) {
                /* If they are identical, do not add the probe to the deduplicatedProbes object, but have a reference for it so we can look up which deduplicated probe and 
					 * therefore which distribution slice to use for the duplicated probe. */
                deduplicatedIndices[i] = dedupProbeCounter;
            } else {
                deduplicatedProbes.add(probes[indices[i]]);
                dedupProbeCounter++;
                deduplicatedIndices[i] = dedupProbeCounter;
            }
        }
        Probe[] dedupProbes = deduplicatedProbes.toArray(new Probe[0]);
        // make sure we're not trying to use more probes than we've got in the analysis
        if (probesPerSet > dedupProbes.length) {
            probesPerSet = dedupProbes.length;
        }
        System.out.println("total number of probe values = " + probes.length);
        System.out.println("number of deduplicated probe values = " + dedupProbes.length);
        System.out.println("probesPerSet = " + probesPerSet);
        // I want this to contain all the differences, then from that I'm going to get the z-scores.
        double[] currentDiffSet = new double[probesPerSet];
        for (int i = 0; i < indices.length; i++) {
            if (cancel) {
                cancel = false;
                progressCancelled();
                return;
            }
            if (i % 1000 == 0) {
                int progress = (i * 100) / indices.length;
                // progress += 100*comparisonIndex;
                progressUpdated("Made 0 out of 1 comparisons", progress, 100);
            }
            /**
             * There are +1s in here because we skip over j when j == startingIndex.
             */
            // We need to make up the set of differences to represent this probe
            int startingIndex = deduplicatedIndices[i] - (probesPerSet / 2);
            if (startingIndex < 0)
                startingIndex = 0;
            if (startingIndex + (probesPerSet + 1) >= dedupProbes.length)
                startingIndex = dedupProbes.length - (probesPerSet + 1);
            try {
                for (int j = startingIndex; j < startingIndex + (probesPerSet + 1); j++) {
                    if (j == startingIndex) {
                        // Don't include the point being tested in the background model
                        continue;
                    }
                    double diff;
                    if (calculateLinearRegression == true) {
                        if (j > dedupProbes.length) {
                            System.err.println(" j is too big, it's " + j + " and dedupProbes.length = " + dedupProbes.length + ", starting index = " + startingIndex);
                        }
                        double x = fromStore.getValueForProbe(dedupProbes[j]);
                        double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
                        diff = toStore.getValueForProbe(dedupProbes[j]) - expectedY;
                    } else {
                        diff = toStore.getValueForProbe(dedupProbes[j]) - fromStore.getValueForProbe(dedupProbes[j]);
                    }
                    if (j < startingIndex) {
                        currentDiffSet[j - startingIndex] = diff;
                    } else {
                        currentDiffSet[(j - startingIndex) - 1] = diff;
                    }
                }
                if (calculateCustomRegression == true) {
                    // the average/ kind of centre line
                    float z = ((fromStore.getValueForProbe(probes[indices[i]]) + toStore.getValueForProbe(probes[indices[i]])) / 2);
                    customRegressionValues[0][indices[i]] = z - ((float) SimpleStats.mean(currentDiffSet) / 2);
                    customRegressionValues[1][indices[i]] = z + ((float) SimpleStats.mean(currentDiffSet) / 2);
                }
                double mean = 0;
                // Get the difference for this point
                double diff;
                if (calculateLinearRegression == true) {
                    double x = fromStore.getValueForProbe(probes[indices[i]]);
                    double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
                    diff = toStore.getValueForProbe(probes[indices[i]]) - expectedY;
                } else if (calculateCustomRegression == true) {
                    diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
                    mean = SimpleStats.mean(currentDiffSet);
                } else {
                    diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
                }
                double stdev;
                stdev = SimpleStats.stdev(currentDiffSet, mean);
                // if there are no reads in the probe for either of the datasets, should we set the zscore to 0??
                double zScore = (diff - mean) / stdev;
                // modified z score
                // median absolute deviation
                /*		double[] madArray = new double[currentDiffSet.length];
						double median = SimpleStats.median(currentDiffSet);					
						
						for(int d=0; d<currentDiffSet.length; d++){							
							madArray[d] = Math.abs(currentDiffSet[d] - median);							
						}
						
						double mad = SimpleStats.median(madArray);							
						zScore = (0.6745 * (diff - median))/mad;
					}
				*/
                probeZScoreLookupTable.put(probes[indices[i]], zScore);
            } catch (SeqMonkException sme) {
                progressExceptionReceived(sme);
                return;
            }
        }
        // make this an array list as we're kicking out the mapped gene sets that have zscores with variance of 0.
        ArrayList<MappedGeneSetTTestValue> pValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
        MappedGeneSet[] mappedGeneSets = null;
        /* if we're using the gene set from a file, map the gene sets to the probes */
        if (optionsPanel.geneSetsFileRadioButton.isSelected()) {
            GeneSetCollectionParser geneSetCollectionParser = new GeneSetCollectionParser(minGenesInSet, maxGenesInSet);
            GeneSetCollection geneSetCollection = geneSetCollectionParser.parseGeneSetInformation(validGeneSetFilepath);
            MappedGeneSet[] allMappedGeneSets = geneSetCollection.getMappedGeneSets(probes);
            if (allMappedGeneSets == null) {
                JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes could be matched to probes.\nCheck that the gmt file is for the right species. \nTo use gene sets from a file, probe names must contain the gene name. \nTry defining new probes over genes (e.g. using the RNA-seq quantitation pipeline) or use existing probes lists instead of a gene set file.", "No gene sets identified", JOptionPane.ERROR_MESSAGE);
                throw new SeqMonkException("No sets of genes could be matched to probes.\nTo use gene sets from a file, probe names must contain the gene name.\nTry defining new probes over genes or use existing probes lists instead of a gene set file.");
            } else {
                ArrayList<MappedGeneSet> mgsArrayList = new ArrayList<MappedGeneSet>();
                /* get rid of those that have fewer probes in the set than minGenesInSet. We shouldn't exceed maxGenesInSet unless probes have been made over something other than genes */
                for (int i = 0; i < allMappedGeneSets.length; i++) {
                    if (allMappedGeneSets[i].getProbes().length >= minGenesInSet) {
                        mgsArrayList.add(allMappedGeneSets[i]);
                    }
                }
                mappedGeneSets = mgsArrayList.toArray(new MappedGeneSet[0]);
            }
        } else /* or if we're using existing probelists, create mappedGeneSets from them */
        if (optionsPanel.probeListRadioButton.isSelected() && selectedProbeLists != null) {
            mappedGeneSets = new MappedGeneSet[selectedProbeLists.length];
            for (int i = 0; i < selectedProbeLists.length; i++) {
                mappedGeneSets[i] = new MappedGeneSet(selectedProbeLists[i]);
            }
        } else {
            throw new SeqMonkException("Haven't got any genesets to use, shouldn't have got here without having any selected.");
        }
        if (mappedGeneSets == null || mappedGeneSets.length == 0) {
            throw new SeqMonkException("Couldn't map gene sets to the probes, try again with a different probe set");
        } else {
            System.err.println("there are " + mappedGeneSets.length + " mappedGeneSets");
            System.err.println("size of zScore lookup table = " + probeZScoreLookupTable.size());
        }
        System.err.println("total number of mapped gene sets = " + mappedGeneSets.length);
        // deduplicate our mappedGeneSets
        if (optionsPanel.deduplicateGeneSetBox.isSelected()) {
            mappedGeneSets = deduplicateMappedGeneSets(mappedGeneSets);
        }
        System.err.println("deduplicated mapped gene sets = " + mappedGeneSets.length);
        /* 
			 * we need to go through the mapped gene set and get all the values for the matched probes 
			 * 
			 */
        for (int i = 0; i < mappedGeneSets.length; i++) {
            Probe[] geneSetProbes = mappedGeneSets[i].getProbes();
            // to contain all the z-scores for the gene set
            double[] geneSetZscores = new double[geneSetProbes.length];
            // Find the z-scores for each of the probes in the mappedGeneSet
            for (int gsp = 0; gsp < geneSetProbes.length; gsp++) {
                if (probeZScoreLookupTable.containsKey(geneSetProbes[gsp])) {
                    geneSetZscores[gsp] = probeZScoreLookupTable.get(geneSetProbes[gsp]);
                }
            }
            // this is just temporary to check the stats - there's some duplication with this.... is there??
            mappedGeneSets[i].zScores = geneSetZscores;
            if (geneSetZscores.length > 1) {
                // the number of probes in the mappedGeneSet should always be > 1 anyway as the mappedGeneSet shouldn't be created if there are < 2 matched probes.
                double pVal;
                if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("t-test")) {
                    pVal = TTest.calculatePValue(geneSetZscores, 0);
                } else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov test")) {
                    double[] allZscores = getAllZScores(probeZScoreLookupTable);
                    KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
                    pVal = ksTest.kolmogorovSmirnovTest(geneSetZscores, allZscores);
                } else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov non-directional test")) {
                    double[] allZscores = getAllZScores(probeZScoreLookupTable);
                    KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
                    pVal = ksTest.kolmogorovSmirnovTest(convertToAbsoluteValues(geneSetZscores), convertToAbsoluteValues(allZscores));
                } else {
                    throw new IllegalArgumentException("Didn't recognise statistical test " + optionsPanel.statisticalTestBox.getSelectedItem().toString());
                }
                mappedGeneSets[i].meanZScore = SimpleStats.mean(geneSetZscores);
                // check the variance - we don't want variances of 0.
                double stdev = SimpleStats.stdev(geneSetZscores);
                if ((stdev * stdev) < 0.00000000000001) {
                    continue;
                } else // if all the differences between the datasets are 0 then just get rid of them
                if (Double.isNaN(pVal)) {
                    continue;
                } else {
                    pValueArrayList.add(new MappedGeneSetTTestValue(mappedGeneSets[i], pVal));
                }
            } else {
                System.err.println("Fell through the net somewhere, why does the set of zscores contain fewer than 2 values?");
                continue;
            }
        }
        MappedGeneSetTTestValue[] filterResultpValues = pValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
        ArrayList<MappedGeneSetTTestValue> filteredPValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
        // Now we've got all the p values they need to be corrected.
        if (applyMultipleTestingCorrection) {
            BenjHochFDR.calculateQValues(filterResultpValues);
        }
        System.err.println(filterResultpValues.length + " p-values calculated, multtest = " + applyMultipleTestingCorrection + ", pval limit = " + pValueLimit);
        for (int i = 0; i < filterResultpValues.length; i++) {
            double pOrQvalue;
            if (applyMultipleTestingCorrection) {
                pOrQvalue = filterResultpValues[i].q;
            } else {
                pOrQvalue = filterResultpValues[i].p;
            }
            // check whether it passes the p/q-value and z-score cut-offs
            if (optionsPanel.reportAllResults.isSelected()) {
                filteredPValueArrayList.add(filterResultpValues[i]);
            } else {
                if ((pOrQvalue < pValueLimit) && (Math.abs(filterResultpValues[i].mappedGeneSet.meanZScore) > zScoreThreshold)) {
                    filteredPValueArrayList.add(filterResultpValues[i]);
                }
            }
        }
        // convert the ArrayList to MappedGeneSetTTestValue
        filterResultpValues = filteredPValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
        if (filterResultpValues.length == 0) {
            JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes were identified using the selected parameters, \ntry changing the gene sets or relaxing the p-value/z-score thresholds.", "No gene sets identified", JOptionPane.INFORMATION_MESSAGE);
        } else {
            geneSetDisplay = new GeneSetDisplay(dataCollection, listDescription(), fromStore, toStore, probes, probeZScoreLookupTable, filterResultpValues, startingList, customRegressionValues, simpleRegression, // optionsPanel.bidirectionalRadioButton.isSelected());
            false);
            geneSetDisplay.addWindowListener(this);
        }
        // We don't want to save the probe list here, we're bringing up the intermediate display from which probe lists can be saved.
        progressCancelled();
    } catch (Exception e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : GeneSetCollection(uk.ac.babraham.SeqMonk.GeneSets.GeneSetCollection) ArrayList(java.util.ArrayList) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) MappedGeneSetTTestValue(uk.ac.babraham.SeqMonk.Analysis.Statistics.MappedGeneSetTTestValue) KolmogorovSmirnovTest(org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) GeneSetCollectionParser(uk.ac.babraham.SeqMonk.GeneSets.GeneSetCollectionParser) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) SimpleRegression(org.apache.commons.math3.stat.regression.SimpleRegression) MappedGeneSet(uk.ac.babraham.SeqMonk.GeneSets.MappedGeneSet)

Aggregations

ArrayList (java.util.ArrayList)1 KolmogorovSmirnovTest (org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest)1 SimpleRegression (org.apache.commons.math3.stat.regression.SimpleRegression)1 MappedGeneSetTTestValue (uk.ac.babraham.SeqMonk.Analysis.Statistics.MappedGeneSetTTestValue)1 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)1 GeneSetCollection (uk.ac.babraham.SeqMonk.GeneSets.GeneSetCollection)1 GeneSetCollectionParser (uk.ac.babraham.SeqMonk.GeneSets.GeneSetCollectionParser)1 MappedGeneSet (uk.ac.babraham.SeqMonk.GeneSets.MappedGeneSet)1 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)1