use of org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest in project vcell by virtualcell.
the class TimeSeriesMultitrialData method kolmogorovSmirnovTest.
public static double kolmogorovSmirnovTest(double[] rawData1, double[] rawData2) {
try {
int numBins = 1 + (int) Math.ceil(Math.sqrt(rawData1.length));
// rawData2 = ramp(0,10,rawData2.length);
Max max = new Max();
max.incrementAll(rawData1);
max.incrementAll(rawData2);
Min min = new Min();
min.incrementAll(rawData1);
min.incrementAll(rawData2);
double[] cdf1 = calculateCDF(rawData1, min.getResult(), max.getResult(), numBins);
double[] cdf2 = calculateCDF(rawData2, min.getResult(), max.getResult(), numBins);
KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
return test.kolmogorovSmirnovStatistic(cdf1, cdf2);
} catch (Exception e) {
e.printStackTrace(System.out);
return -1;
}
}
use of org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest in project vcell by virtualcell.
the class TimeSeriesMultitrialData method kolmogorovSmirnovTest.
public static double kolmogorovSmirnovTest(double[] rawData1, double[] rawData2) {
try {
int numBins = 1 + (int) Math.ceil(Math.sqrt(rawData1.length));
// rawData2 = ramp(0,10,rawData2.length);
TimeSeriesMultitrialData.MinMaxHelp minMaxHelp1 = new TimeSeriesMultitrialData.MinMaxHelp(rawData1);
TimeSeriesMultitrialData.MinMaxHelp minMaxHelp2 = new TimeSeriesMultitrialData.MinMaxHelp(rawData2);
double min = Math.min(minMaxHelp1.min, minMaxHelp2.min);
double max = Math.max(minMaxHelp1.max, minMaxHelp2.max);
double[] cdf1 = calculateCDF(rawData1, min, max, numBins);
double[] cdf2 = calculateCDF(rawData2, min, max, numBins);
KolmogorovSmirnovTest test = new KolmogorovSmirnovTest();
return test.kolmogorovSmirnovStatistic(cdf1, cdf2);
} catch (Exception e) {
e.printStackTrace(System.out);
return -1;
}
}
use of org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest in project cruise-control by linkedin.
the class AnalyzerUtils method testDifference.
/**
* Test if two clusters are significantly different in the metrics we look at for balancing.
*
* @param orig the utilization matrix from the original cluster
* @param optimized the utilization matrix from the optimized cluster
* @return The P value that the various derived resources come from the same probability distribution. The probability
* that the null hypothesis is correct.
*/
public static double[] testDifference(double[][] orig, double[][] optimized) {
int nResources = RawAndDerivedResource.values().length;
if (orig.length != nResources) {
throw new IllegalArgumentException("orig must have number of rows equal to RawAndDerivedResource.");
}
if (optimized.length != nResources) {
throw new IllegalArgumentException("optimized must have number of rows equal to RawAndDerivedResource.");
}
if (orig[0].length != optimized[0].length) {
throw new IllegalArgumentException("The number of brokers must be the same.");
}
double[] pValues = new double[orig.length];
// TODO: For small N we want to do statistical bootstrapping (not the same as bootstrapping data).
for (int resourceIndex = 0; resourceIndex < nResources; resourceIndex++) {
RandomGenerator rng = new MersenneTwister(0x5d11121018463324L);
KolmogorovSmirnovTest kolmogorovSmirnovTest = new KolmogorovSmirnovTest(rng);
pValues[resourceIndex] = kolmogorovSmirnovTest.kolmogorovSmirnovTest(orig[resourceIndex], optimized[resourceIndex]);
}
return pValues;
}
use of org.apache.commons.math3.stat.inference.KolmogorovSmirnovTest in project SeqMonk by s-andrews.
the class GeneSetIntensityDifferenceFilter method generateProbeList.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
*/
protected void generateProbeList() {
try {
applyMultipleTestingCorrection = optionsPanel.multipleTestingBox.isSelected();
calculateCustomRegression = optionsPanel.calculateCustomRegressionBox.isSelected();
if (calculateCustomRegression == false) {
customRegressionValues = null;
}
if (calculateLinearRegression) {
simpleRegression = new SimpleRegression();
}
Probe[] allProbes = startingList.getAllProbes();
// We're not allowing multiple comparisons - this is a bit of a messy workaround so it's compatible with other methods.
fromStore = fromStores[0];
toStore = toStores[0];
ArrayList<Probe> probeArrayList = new ArrayList<Probe>();
int NaNcount = 0;
// remove the invalid probes - the ones without a value
for (int i = 0; i < allProbes.length; i++) {
if ((Float.isNaN(fromStore.getValueForProbe(allProbes[i]))) || (Float.isNaN(toStore.getValueForProbe(allProbes[i])))) {
NaNcount++;
} else {
probeArrayList.add(allProbes[i]);
}
}
System.err.println("Found " + NaNcount + " probes that were invalid.");
probes = probeArrayList.toArray(new Probe[0]);
if (calculateCustomRegression == true) {
customRegressionValues = new float[2][probes.length];
}
// We'll pull the number of probes to sample from the preferences if they've changed it
Integer updatedProbesPerSet = optionsPanel.probesPerSet();
if (updatedProbesPerSet != null)
probesPerSet = updatedProbesPerSet;
// we want a set of z-scores using the local distribution.
probeZScoreLookupTable = new Hashtable<Probe, Double>();
// Put something in the progress whilst we're ordering the probe values to make the comparison.
progressUpdated("Generating background model", 0, 1);
Comparator<Integer> comp = new AverageIntensityComparator(fromStore, toStore, probes);
// We need to generate a set of probe indices that can be ordered by their average intensity
Integer[] indices = new Integer[probes.length];
for (int i = 0; i < probes.length; i++) {
indices[i] = i;
/* add the data to the linear regression object */
if (calculateLinearRegression) {
simpleRegression.addData((double) fromStore.getValueForProbe(probes[i]), (double) toStore.getValueForProbe(probes[i]));
}
}
if (calculateLinearRegression) {
System.out.println("intercept = " + simpleRegression.getIntercept() + ", slope = " + simpleRegression.getSlope());
}
Arrays.sort(indices, comp);
/* This holds the indices to get the deduplicated probe from the original list of probes */
deduplicatedIndices = new Integer[probes.length];
/* The number of probes with different values */
int dedupProbeCounter = 0;
// the probes deduplicated by value
ArrayList<Probe> deduplicatedProbes = new ArrayList<Probe>();
// populate the first one so that we have something to compare to in the loop
deduplicatedIndices[0] = 0;
deduplicatedProbes.add(probes[indices[0]]);
progressUpdated("Made 0 of 1 comparisons", 0, 1);
for (int i = 1; i < indices.length; i++) {
/* indices have been sorted, now we need to check whether adjacent pair values are identical */
if ((fromStore.getValueForProbe(probes[indices[i]]) == fromStore.getValueForProbe(probes[indices[i - 1]])) && (toStore.getValueForProbe(probes[indices[i]]) == toStore.getValueForProbe(probes[indices[i - 1]]))) {
/* If they are identical, do not add the probe to the deduplicatedProbes object, but have a reference for it so we can look up which deduplicated probe and
* therefore which distribution slice to use for the duplicated probe. */
deduplicatedIndices[i] = dedupProbeCounter;
} else {
deduplicatedProbes.add(probes[indices[i]]);
dedupProbeCounter++;
deduplicatedIndices[i] = dedupProbeCounter;
}
}
Probe[] dedupProbes = deduplicatedProbes.toArray(new Probe[0]);
// make sure we're not trying to use more probes than we've got in the analysis
if (probesPerSet > dedupProbes.length) {
probesPerSet = dedupProbes.length;
}
System.out.println("total number of probe values = " + probes.length);
System.out.println("number of deduplicated probe values = " + dedupProbes.length);
System.out.println("probesPerSet = " + probesPerSet);
// I want this to contain all the differences, then from that I'm going to get the z-scores.
double[] currentDiffSet = new double[probesPerSet];
for (int i = 0; i < indices.length; i++) {
if (cancel) {
cancel = false;
progressCancelled();
return;
}
if (i % 1000 == 0) {
int progress = (i * 100) / indices.length;
// progress += 100*comparisonIndex;
progressUpdated("Made 0 out of 1 comparisons", progress, 100);
}
/**
* There are +1s in here because we skip over j when j == startingIndex.
*/
// We need to make up the set of differences to represent this probe
int startingIndex = deduplicatedIndices[i] - (probesPerSet / 2);
if (startingIndex < 0)
startingIndex = 0;
if (startingIndex + (probesPerSet + 1) >= dedupProbes.length)
startingIndex = dedupProbes.length - (probesPerSet + 1);
try {
for (int j = startingIndex; j < startingIndex + (probesPerSet + 1); j++) {
if (j == startingIndex) {
// Don't include the point being tested in the background model
continue;
}
double diff;
if (calculateLinearRegression == true) {
if (j > dedupProbes.length) {
System.err.println(" j is too big, it's " + j + " and dedupProbes.length = " + dedupProbes.length + ", starting index = " + startingIndex);
}
double x = fromStore.getValueForProbe(dedupProbes[j]);
double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
diff = toStore.getValueForProbe(dedupProbes[j]) - expectedY;
} else {
diff = toStore.getValueForProbe(dedupProbes[j]) - fromStore.getValueForProbe(dedupProbes[j]);
}
if (j < startingIndex) {
currentDiffSet[j - startingIndex] = diff;
} else {
currentDiffSet[(j - startingIndex) - 1] = diff;
}
}
if (calculateCustomRegression == true) {
// the average/ kind of centre line
float z = ((fromStore.getValueForProbe(probes[indices[i]]) + toStore.getValueForProbe(probes[indices[i]])) / 2);
customRegressionValues[0][indices[i]] = z - ((float) SimpleStats.mean(currentDiffSet) / 2);
customRegressionValues[1][indices[i]] = z + ((float) SimpleStats.mean(currentDiffSet) / 2);
}
double mean = 0;
// Get the difference for this point
double diff;
if (calculateLinearRegression == true) {
double x = fromStore.getValueForProbe(probes[indices[i]]);
double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
diff = toStore.getValueForProbe(probes[indices[i]]) - expectedY;
} else if (calculateCustomRegression == true) {
diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
mean = SimpleStats.mean(currentDiffSet);
} else {
diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
}
double stdev;
stdev = SimpleStats.stdev(currentDiffSet, mean);
// if there are no reads in the probe for either of the datasets, should we set the zscore to 0??
double zScore = (diff - mean) / stdev;
// modified z score
// median absolute deviation
/* double[] madArray = new double[currentDiffSet.length];
double median = SimpleStats.median(currentDiffSet);
for(int d=0; d<currentDiffSet.length; d++){
madArray[d] = Math.abs(currentDiffSet[d] - median);
}
double mad = SimpleStats.median(madArray);
zScore = (0.6745 * (diff - median))/mad;
}
*/
probeZScoreLookupTable.put(probes[indices[i]], zScore);
} catch (SeqMonkException sme) {
progressExceptionReceived(sme);
return;
}
}
// make this an array list as we're kicking out the mapped gene sets that have zscores with variance of 0.
ArrayList<MappedGeneSetTTestValue> pValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
MappedGeneSet[] mappedGeneSets = null;
/* if we're using the gene set from a file, map the gene sets to the probes */
if (optionsPanel.geneSetsFileRadioButton.isSelected()) {
GeneSetCollectionParser geneSetCollectionParser = new GeneSetCollectionParser(minGenesInSet, maxGenesInSet);
GeneSetCollection geneSetCollection = geneSetCollectionParser.parseGeneSetInformation(validGeneSetFilepath);
MappedGeneSet[] allMappedGeneSets = geneSetCollection.getMappedGeneSets(probes);
if (allMappedGeneSets == null) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes could be matched to probes.\nCheck that the gmt file is for the right species. \nTo use gene sets from a file, probe names must contain the gene name. \nTry defining new probes over genes (e.g. using the RNA-seq quantitation pipeline) or use existing probes lists instead of a gene set file.", "No gene sets identified", JOptionPane.ERROR_MESSAGE);
throw new SeqMonkException("No sets of genes could be matched to probes.\nTo use gene sets from a file, probe names must contain the gene name.\nTry defining new probes over genes or use existing probes lists instead of a gene set file.");
} else {
ArrayList<MappedGeneSet> mgsArrayList = new ArrayList<MappedGeneSet>();
/* get rid of those that have fewer probes in the set than minGenesInSet. We shouldn't exceed maxGenesInSet unless probes have been made over something other than genes */
for (int i = 0; i < allMappedGeneSets.length; i++) {
if (allMappedGeneSets[i].getProbes().length >= minGenesInSet) {
mgsArrayList.add(allMappedGeneSets[i]);
}
}
mappedGeneSets = mgsArrayList.toArray(new MappedGeneSet[0]);
}
} else /* or if we're using existing probelists, create mappedGeneSets from them */
if (optionsPanel.probeListRadioButton.isSelected() && selectedProbeLists != null) {
mappedGeneSets = new MappedGeneSet[selectedProbeLists.length];
for (int i = 0; i < selectedProbeLists.length; i++) {
mappedGeneSets[i] = new MappedGeneSet(selectedProbeLists[i]);
}
} else {
throw new SeqMonkException("Haven't got any genesets to use, shouldn't have got here without having any selected.");
}
if (mappedGeneSets == null || mappedGeneSets.length == 0) {
throw new SeqMonkException("Couldn't map gene sets to the probes, try again with a different probe set");
} else {
System.err.println("there are " + mappedGeneSets.length + " mappedGeneSets");
System.err.println("size of zScore lookup table = " + probeZScoreLookupTable.size());
}
System.err.println("total number of mapped gene sets = " + mappedGeneSets.length);
// deduplicate our mappedGeneSets
if (optionsPanel.deduplicateGeneSetBox.isSelected()) {
mappedGeneSets = deduplicateMappedGeneSets(mappedGeneSets);
}
System.err.println("deduplicated mapped gene sets = " + mappedGeneSets.length);
/*
* we need to go through the mapped gene set and get all the values for the matched probes
*
*/
for (int i = 0; i < mappedGeneSets.length; i++) {
Probe[] geneSetProbes = mappedGeneSets[i].getProbes();
// to contain all the z-scores for the gene set
double[] geneSetZscores = new double[geneSetProbes.length];
// Find the z-scores for each of the probes in the mappedGeneSet
for (int gsp = 0; gsp < geneSetProbes.length; gsp++) {
if (probeZScoreLookupTable.containsKey(geneSetProbes[gsp])) {
geneSetZscores[gsp] = probeZScoreLookupTable.get(geneSetProbes[gsp]);
}
}
// this is just temporary to check the stats - there's some duplication with this.... is there??
mappedGeneSets[i].zScores = geneSetZscores;
if (geneSetZscores.length > 1) {
// the number of probes in the mappedGeneSet should always be > 1 anyway as the mappedGeneSet shouldn't be created if there are < 2 matched probes.
double pVal;
if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("t-test")) {
pVal = TTest.calculatePValue(geneSetZscores, 0);
} else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov test")) {
double[] allZscores = getAllZScores(probeZScoreLookupTable);
KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
pVal = ksTest.kolmogorovSmirnovTest(geneSetZscores, allZscores);
} else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov non-directional test")) {
double[] allZscores = getAllZScores(probeZScoreLookupTable);
KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
pVal = ksTest.kolmogorovSmirnovTest(convertToAbsoluteValues(geneSetZscores), convertToAbsoluteValues(allZscores));
} else {
throw new IllegalArgumentException("Didn't recognise statistical test " + optionsPanel.statisticalTestBox.getSelectedItem().toString());
}
mappedGeneSets[i].meanZScore = SimpleStats.mean(geneSetZscores);
// check the variance - we don't want variances of 0.
double stdev = SimpleStats.stdev(geneSetZscores);
if ((stdev * stdev) < 0.00000000000001) {
continue;
} else // if all the differences between the datasets are 0 then just get rid of them
if (Double.isNaN(pVal)) {
continue;
} else {
pValueArrayList.add(new MappedGeneSetTTestValue(mappedGeneSets[i], pVal));
}
} else {
System.err.println("Fell through the net somewhere, why does the set of zscores contain fewer than 2 values?");
continue;
}
}
MappedGeneSetTTestValue[] filterResultpValues = pValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
ArrayList<MappedGeneSetTTestValue> filteredPValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
// Now we've got all the p values they need to be corrected.
if (applyMultipleTestingCorrection) {
BenjHochFDR.calculateQValues(filterResultpValues);
}
System.err.println(filterResultpValues.length + " p-values calculated, multtest = " + applyMultipleTestingCorrection + ", pval limit = " + pValueLimit);
for (int i = 0; i < filterResultpValues.length; i++) {
double pOrQvalue;
if (applyMultipleTestingCorrection) {
pOrQvalue = filterResultpValues[i].q;
} else {
pOrQvalue = filterResultpValues[i].p;
}
// check whether it passes the p/q-value and z-score cut-offs
if (optionsPanel.reportAllResults.isSelected()) {
filteredPValueArrayList.add(filterResultpValues[i]);
} else {
if ((pOrQvalue < pValueLimit) && (Math.abs(filterResultpValues[i].mappedGeneSet.meanZScore) > zScoreThreshold)) {
filteredPValueArrayList.add(filterResultpValues[i]);
}
}
}
// convert the ArrayList to MappedGeneSetTTestValue
filterResultpValues = filteredPValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
if (filterResultpValues.length == 0) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes were identified using the selected parameters, \ntry changing the gene sets or relaxing the p-value/z-score thresholds.", "No gene sets identified", JOptionPane.INFORMATION_MESSAGE);
} else {
geneSetDisplay = new GeneSetDisplay(dataCollection, listDescription(), fromStore, toStore, probes, probeZScoreLookupTable, filterResultpValues, startingList, customRegressionValues, simpleRegression, // optionsPanel.bidirectionalRadioButton.isSelected());
false);
geneSetDisplay.addWindowListener(this);
}
// We don't want to save the probe list here, we're bringing up the intermediate display from which probe lists can be saved.
progressCancelled();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
Aggregations