use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.
the class GeneSetIntensityDifferenceFilter method generateProbeList.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
*/
protected void generateProbeList() {
try {
applyMultipleTestingCorrection = optionsPanel.multipleTestingBox.isSelected();
calculateCustomRegression = optionsPanel.calculateCustomRegressionBox.isSelected();
if (calculateCustomRegression == false) {
customRegressionValues = null;
}
if (calculateLinearRegression) {
simpleRegression = new SimpleRegression();
}
Probe[] allProbes = startingList.getAllProbes();
// We're not allowing multiple comparisons - this is a bit of a messy workaround so it's compatible with other methods.
fromStore = fromStores[0];
toStore = toStores[0];
ArrayList<Probe> probeArrayList = new ArrayList<Probe>();
int NaNcount = 0;
// remove the invalid probes - the ones without a value
for (int i = 0; i < allProbes.length; i++) {
if ((Float.isNaN(fromStore.getValueForProbe(allProbes[i]))) || (Float.isNaN(toStore.getValueForProbe(allProbes[i])))) {
NaNcount++;
} else {
probeArrayList.add(allProbes[i]);
}
}
System.err.println("Found " + NaNcount + " probes that were invalid.");
probes = probeArrayList.toArray(new Probe[0]);
if (calculateCustomRegression == true) {
customRegressionValues = new float[2][probes.length];
}
// We'll pull the number of probes to sample from the preferences if they've changed it
Integer updatedProbesPerSet = optionsPanel.probesPerSet();
if (updatedProbesPerSet != null)
probesPerSet = updatedProbesPerSet;
// we want a set of z-scores using the local distribution.
probeZScoreLookupTable = new Hashtable<Probe, Double>();
// Put something in the progress whilst we're ordering the probe values to make the comparison.
progressUpdated("Generating background model", 0, 1);
Comparator<Integer> comp = new AverageIntensityComparator(fromStore, toStore, probes);
// We need to generate a set of probe indices that can be ordered by their average intensity
Integer[] indices = new Integer[probes.length];
for (int i = 0; i < probes.length; i++) {
indices[i] = i;
/* add the data to the linear regression object */
if (calculateLinearRegression) {
simpleRegression.addData((double) fromStore.getValueForProbe(probes[i]), (double) toStore.getValueForProbe(probes[i]));
}
}
if (calculateLinearRegression) {
System.out.println("intercept = " + simpleRegression.getIntercept() + ", slope = " + simpleRegression.getSlope());
}
Arrays.sort(indices, comp);
/* This holds the indices to get the deduplicated probe from the original list of probes */
deduplicatedIndices = new Integer[probes.length];
/* The number of probes with different values */
int dedupProbeCounter = 0;
// the probes deduplicated by value
ArrayList<Probe> deduplicatedProbes = new ArrayList<Probe>();
// populate the first one so that we have something to compare to in the loop
deduplicatedIndices[0] = 0;
deduplicatedProbes.add(probes[indices[0]]);
progressUpdated("Made 0 of 1 comparisons", 0, 1);
for (int i = 1; i < indices.length; i++) {
/* indices have been sorted, now we need to check whether adjacent pair values are identical */
if ((fromStore.getValueForProbe(probes[indices[i]]) == fromStore.getValueForProbe(probes[indices[i - 1]])) && (toStore.getValueForProbe(probes[indices[i]]) == toStore.getValueForProbe(probes[indices[i - 1]]))) {
/* If they are identical, do not add the probe to the deduplicatedProbes object, but have a reference for it so we can look up which deduplicated probe and
* therefore which distribution slice to use for the duplicated probe. */
deduplicatedIndices[i] = dedupProbeCounter;
} else {
deduplicatedProbes.add(probes[indices[i]]);
dedupProbeCounter++;
deduplicatedIndices[i] = dedupProbeCounter;
}
}
Probe[] dedupProbes = deduplicatedProbes.toArray(new Probe[0]);
// make sure we're not trying to use more probes than we've got in the analysis
if (probesPerSet > dedupProbes.length) {
probesPerSet = dedupProbes.length;
}
System.out.println("total number of probe values = " + probes.length);
System.out.println("number of deduplicated probe values = " + dedupProbes.length);
System.out.println("probesPerSet = " + probesPerSet);
// I want this to contain all the differences, then from that I'm going to get the z-scores.
double[] currentDiffSet = new double[probesPerSet];
for (int i = 0; i < indices.length; i++) {
if (cancel) {
cancel = false;
progressCancelled();
return;
}
if (i % 1000 == 0) {
int progress = (i * 100) / indices.length;
// progress += 100*comparisonIndex;
progressUpdated("Made 0 out of 1 comparisons", progress, 100);
}
/**
* There are +1s in here because we skip over j when j == startingIndex.
*/
// We need to make up the set of differences to represent this probe
int startingIndex = deduplicatedIndices[i] - (probesPerSet / 2);
if (startingIndex < 0)
startingIndex = 0;
if (startingIndex + (probesPerSet + 1) >= dedupProbes.length)
startingIndex = dedupProbes.length - (probesPerSet + 1);
try {
for (int j = startingIndex; j < startingIndex + (probesPerSet + 1); j++) {
if (j == startingIndex) {
// Don't include the point being tested in the background model
continue;
}
double diff;
if (calculateLinearRegression == true) {
if (j > dedupProbes.length) {
System.err.println(" j is too big, it's " + j + " and dedupProbes.length = " + dedupProbes.length + ", starting index = " + startingIndex);
}
double x = fromStore.getValueForProbe(dedupProbes[j]);
double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
diff = toStore.getValueForProbe(dedupProbes[j]) - expectedY;
} else {
diff = toStore.getValueForProbe(dedupProbes[j]) - fromStore.getValueForProbe(dedupProbes[j]);
}
if (j < startingIndex) {
currentDiffSet[j - startingIndex] = diff;
} else {
currentDiffSet[(j - startingIndex) - 1] = diff;
}
}
if (calculateCustomRegression == true) {
// the average/ kind of centre line
float z = ((fromStore.getValueForProbe(probes[indices[i]]) + toStore.getValueForProbe(probes[indices[i]])) / 2);
customRegressionValues[0][indices[i]] = z - ((float) SimpleStats.mean(currentDiffSet) / 2);
customRegressionValues[1][indices[i]] = z + ((float) SimpleStats.mean(currentDiffSet) / 2);
}
double mean = 0;
// Get the difference for this point
double diff;
if (calculateLinearRegression == true) {
double x = fromStore.getValueForProbe(probes[indices[i]]);
double expectedY = (simpleRegression.getSlope() * x) + simpleRegression.getIntercept();
diff = toStore.getValueForProbe(probes[indices[i]]) - expectedY;
} else if (calculateCustomRegression == true) {
diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
mean = SimpleStats.mean(currentDiffSet);
} else {
diff = toStore.getValueForProbe(probes[indices[i]]) - fromStore.getValueForProbe(probes[indices[i]]);
}
double stdev;
stdev = SimpleStats.stdev(currentDiffSet, mean);
// if there are no reads in the probe for either of the datasets, should we set the zscore to 0??
double zScore = (diff - mean) / stdev;
// modified z score
// median absolute deviation
/* double[] madArray = new double[currentDiffSet.length];
double median = SimpleStats.median(currentDiffSet);
for(int d=0; d<currentDiffSet.length; d++){
madArray[d] = Math.abs(currentDiffSet[d] - median);
}
double mad = SimpleStats.median(madArray);
zScore = (0.6745 * (diff - median))/mad;
}
*/
probeZScoreLookupTable.put(probes[indices[i]], zScore);
} catch (SeqMonkException sme) {
progressExceptionReceived(sme);
return;
}
}
// make this an array list as we're kicking out the mapped gene sets that have zscores with variance of 0.
ArrayList<MappedGeneSetTTestValue> pValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
MappedGeneSet[] mappedGeneSets = null;
/* if we're using the gene set from a file, map the gene sets to the probes */
if (optionsPanel.geneSetsFileRadioButton.isSelected()) {
GeneSetCollectionParser geneSetCollectionParser = new GeneSetCollectionParser(minGenesInSet, maxGenesInSet);
GeneSetCollection geneSetCollection = geneSetCollectionParser.parseGeneSetInformation(validGeneSetFilepath);
MappedGeneSet[] allMappedGeneSets = geneSetCollection.getMappedGeneSets(probes);
if (allMappedGeneSets == null) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes could be matched to probes.\nCheck that the gmt file is for the right species. \nTo use gene sets from a file, probe names must contain the gene name. \nTry defining new probes over genes (e.g. using the RNA-seq quantitation pipeline) or use existing probes lists instead of a gene set file.", "No gene sets identified", JOptionPane.ERROR_MESSAGE);
throw new SeqMonkException("No sets of genes could be matched to probes.\nTo use gene sets from a file, probe names must contain the gene name.\nTry defining new probes over genes or use existing probes lists instead of a gene set file.");
} else {
ArrayList<MappedGeneSet> mgsArrayList = new ArrayList<MappedGeneSet>();
/* get rid of those that have fewer probes in the set than minGenesInSet. We shouldn't exceed maxGenesInSet unless probes have been made over something other than genes */
for (int i = 0; i < allMappedGeneSets.length; i++) {
if (allMappedGeneSets[i].getProbes().length >= minGenesInSet) {
mgsArrayList.add(allMappedGeneSets[i]);
}
}
mappedGeneSets = mgsArrayList.toArray(new MappedGeneSet[0]);
}
} else /* or if we're using existing probelists, create mappedGeneSets from them */
if (optionsPanel.probeListRadioButton.isSelected() && selectedProbeLists != null) {
mappedGeneSets = new MappedGeneSet[selectedProbeLists.length];
for (int i = 0; i < selectedProbeLists.length; i++) {
mappedGeneSets[i] = new MappedGeneSet(selectedProbeLists[i]);
}
} else {
throw new SeqMonkException("Haven't got any genesets to use, shouldn't have got here without having any selected.");
}
if (mappedGeneSets == null || mappedGeneSets.length == 0) {
throw new SeqMonkException("Couldn't map gene sets to the probes, try again with a different probe set");
} else {
System.err.println("there are " + mappedGeneSets.length + " mappedGeneSets");
System.err.println("size of zScore lookup table = " + probeZScoreLookupTable.size());
}
System.err.println("total number of mapped gene sets = " + mappedGeneSets.length);
// deduplicate our mappedGeneSets
if (optionsPanel.deduplicateGeneSetBox.isSelected()) {
mappedGeneSets = deduplicateMappedGeneSets(mappedGeneSets);
}
System.err.println("deduplicated mapped gene sets = " + mappedGeneSets.length);
/*
* we need to go through the mapped gene set and get all the values for the matched probes
*
*/
for (int i = 0; i < mappedGeneSets.length; i++) {
Probe[] geneSetProbes = mappedGeneSets[i].getProbes();
// to contain all the z-scores for the gene set
double[] geneSetZscores = new double[geneSetProbes.length];
// Find the z-scores for each of the probes in the mappedGeneSet
for (int gsp = 0; gsp < geneSetProbes.length; gsp++) {
if (probeZScoreLookupTable.containsKey(geneSetProbes[gsp])) {
geneSetZscores[gsp] = probeZScoreLookupTable.get(geneSetProbes[gsp]);
}
}
// this is just temporary to check the stats - there's some duplication with this.... is there??
mappedGeneSets[i].zScores = geneSetZscores;
if (geneSetZscores.length > 1) {
// the number of probes in the mappedGeneSet should always be > 1 anyway as the mappedGeneSet shouldn't be created if there are < 2 matched probes.
double pVal;
if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("t-test")) {
pVal = TTest.calculatePValue(geneSetZscores, 0);
} else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov test")) {
double[] allZscores = getAllZScores(probeZScoreLookupTable);
KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
pVal = ksTest.kolmogorovSmirnovTest(geneSetZscores, allZscores);
} else if (optionsPanel.statisticalTestBox.getSelectedItem().toString().equals("Kolmorogov-Smirnov non-directional test")) {
double[] allZscores = getAllZScores(probeZScoreLookupTable);
KolmogorovSmirnovTest ksTest = new KolmogorovSmirnovTest();
pVal = ksTest.kolmogorovSmirnovTest(convertToAbsoluteValues(geneSetZscores), convertToAbsoluteValues(allZscores));
} else {
throw new IllegalArgumentException("Didn't recognise statistical test " + optionsPanel.statisticalTestBox.getSelectedItem().toString());
}
mappedGeneSets[i].meanZScore = SimpleStats.mean(geneSetZscores);
// check the variance - we don't want variances of 0.
double stdev = SimpleStats.stdev(geneSetZscores);
if ((stdev * stdev) < 0.00000000000001) {
continue;
} else // if all the differences between the datasets are 0 then just get rid of them
if (Double.isNaN(pVal)) {
continue;
} else {
pValueArrayList.add(new MappedGeneSetTTestValue(mappedGeneSets[i], pVal));
}
} else {
System.err.println("Fell through the net somewhere, why does the set of zscores contain fewer than 2 values?");
continue;
}
}
MappedGeneSetTTestValue[] filterResultpValues = pValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
ArrayList<MappedGeneSetTTestValue> filteredPValueArrayList = new ArrayList<MappedGeneSetTTestValue>();
// Now we've got all the p values they need to be corrected.
if (applyMultipleTestingCorrection) {
BenjHochFDR.calculateQValues(filterResultpValues);
}
System.err.println(filterResultpValues.length + " p-values calculated, multtest = " + applyMultipleTestingCorrection + ", pval limit = " + pValueLimit);
for (int i = 0; i < filterResultpValues.length; i++) {
double pOrQvalue;
if (applyMultipleTestingCorrection) {
pOrQvalue = filterResultpValues[i].q;
} else {
pOrQvalue = filterResultpValues[i].p;
}
// check whether it passes the p/q-value and z-score cut-offs
if (optionsPanel.reportAllResults.isSelected()) {
filteredPValueArrayList.add(filterResultpValues[i]);
} else {
if ((pOrQvalue < pValueLimit) && (Math.abs(filterResultpValues[i].mappedGeneSet.meanZScore) > zScoreThreshold)) {
filteredPValueArrayList.add(filterResultpValues[i]);
}
}
}
// convert the ArrayList to MappedGeneSetTTestValue
filterResultpValues = filteredPValueArrayList.toArray(new MappedGeneSetTTestValue[0]);
if (filterResultpValues.length == 0) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No sets of genes were identified using the selected parameters, \ntry changing the gene sets or relaxing the p-value/z-score thresholds.", "No gene sets identified", JOptionPane.INFORMATION_MESSAGE);
} else {
geneSetDisplay = new GeneSetDisplay(dataCollection, listDescription(), fromStore, toStore, probes, probeZScoreLookupTable, filterResultpValues, startingList, customRegressionValues, simpleRegression, // optionsPanel.bidirectionalRadioButton.isSelected());
false);
geneSetDisplay.addWindowListener(this);
}
// We don't want to save the probe list here, we're bringing up the intermediate display from which probe lists can be saved.
progressCancelled();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.
the class ContigProbeGenerator method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
Vector<Probe> newProbes = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
// We'll merge together the reads for all of the selected DataStores and
// compute a single set of probes which covers all of them.
ReadsWithCounts[] v = new ReadsWithCounts[selectedStores.length];
for (int s = 0; s < selectedStores.length; s++) {
v[s] = selectedStores[s].getReadsForChromosome(chromosomes[c]);
}
ReadsWithCounts rawReads = new ReadsWithCounts(v);
v = null;
// We now want to convert this list into a non-redundant set of
// read positions with counts. If we don't do this then we get
// appalling performance where we have many reads mapped at the
// same position
// Our default is to do all strands at once
int[] strandsToTry = new int[] { 100 };
if (separateStrands.isSelected()) {
strandsToTry = new int[] { Location.FORWARD, Location.REVERSE, Location.UNKNOWN };
}
for (int strand = 0; strand < strandsToTry.length; strand++) {
ReadsWithCounts reads = getNonRedundantReads(rawReads, strandsToTry[strand]);
if (reads.totalCount() == 0) {
// System.err.println("Skipping strand "+strandsToTry[strand]+" on chr "+chromosomes[c]);
continue;
}
int strandForNewProbes = Location.UNKNOWN;
if (strandsToTry.length > 1) {
strandForNewProbes = strandsToTry[strand];
}
int start = -1;
// We now start a process where we work out at what point we cross the
// threshold of having more than depthCutoff reads overlapping at any
// point
LinkedList<SequenceReadWithCount> currentSet = new LinkedList<SequenceReadWithCount>();
int currentSetSize = 0;
for (int r = 0; r < reads.reads.length; r++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
while (currentSetSize > 0 && SequenceRead.end(currentSet.getFirst().read) < SequenceRead.start(reads.reads[r])) {
SequenceReadWithCount lastRead = currentSet.removeFirst();
currentSetSize -= lastRead.count;
if (start > 0 && currentSetSize < depthCutoff) {
// We just got to the end of a probe
Probe p = new Probe(chromosomes[c], start, SequenceRead.end(lastRead.read), strandForNewProbes);
// Check to see if we have a previous probe against which we can check
Probe lastProbe = null;
if (!newProbes.isEmpty())
lastProbe = newProbes.lastElement();
// Can we merge?
if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.strand() == lastProbe.strand() && p.start() - lastProbe.end() <= distance) {
// Remove the last probe from the stored set
newProbes.remove(newProbes.size() - 1);
// Expand this probe to cover the last one and add it to the stored set
newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
} else if (lastProbe != null) {
// We might still remove this if it's too small
if (lastProbe.length() < minSize) {
newProbes.remove(newProbes.size() - 1);
}
// We still need to add the new probe
newProbes.add(p);
} else {
newProbes.add(p);
}
start = -1;
}
}
// If there's nothing there already then just add it
if (currentSetSize == 0) {
currentSet.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
currentSetSize += reads.counts[r];
} else // If there are reads in the current set then we need to add this read
// so that the current set is ordered by the end positions of the
// reads, with the earliest end first. We therefore start from the back
// and work our way to the front, as soon as we see an entry whose end
// is lower than ours we add ourselves after that
{
// Now we add this read at a position based on its end position
ListIterator<SequenceReadWithCount> it = currentSet.listIterator(currentSet.size());
while (true) {
// If we reach the front of the set then we add ourselves to the front
if (!it.hasPrevious()) {
currentSet.addFirst(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
currentSetSize += reads.counts[r];
break;
} else {
SequenceReadWithCount previousRead = it.previous();
if (SequenceRead.end(previousRead.read) < SequenceRead.end(reads.reads[r])) {
// We want to add ourselves after this element so backtrack
// by one position (which must exist because we just went
// past it
it.next();
it.add(new SequenceReadWithCount(reads.reads[r], reads.counts[r]));
currentSetSize += reads.counts[r];
break;
}
}
}
}
// See if we crossed the threshold for starting a new probe
if (start < 0 && currentSetSize >= depthCutoff) {
start = SequenceRead.start(reads.reads[r]);
}
}
// out of the so far unprocessed reads on this chromosome
if (start > 0) {
Probe p = new Probe(chromosomes[c], start, SequenceRead.end(currentSet.getFirst().read), strandForNewProbes);
// Check to see if we can merge with the last probe made
Probe lastProbe = null;
if (!newProbes.isEmpty())
lastProbe = newProbes.lastElement();
// Can we merge?
if (lastProbe != null && p.chromosome() == lastProbe.chromosome() && p.start() - lastProbe.end() <= distance) {
newProbes.remove(newProbes.size() - 1);
newProbes.add(new Probe(p.chromosome(), lastProbe.start(), p.end(), strandForNewProbes));
} else if (lastProbe != null) {
// We might still remove this if it's too small
if (lastProbe.length() < minSize) {
newProbes.remove(newProbes.size() - 1);
}
// final chance
if (p.length() > minSize) {
newProbes.add(p);
}
} else {
// Add the remaining probe if it's big enough.
if (p.length() > minSize) {
newProbes.add(p);
}
}
}
}
}
Probe[] finalList = newProbes.toArray(new Probe[0]);
newProbes.clear();
ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
generationComplete(finalSet);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.
the class DeduplicationProbeGenerator method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
boolean separateStrands = separateStrandsBox.isSelected();
int maxDistance = 0;
if (maxDistanceField.getText().length() > 0) {
maxDistance = Integer.parseInt(maxDistanceField.getText());
}
Vector<Probe> newProbes = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Processed " + c + " chromosomes", c, chromosomes.length);
Probe[] startingProbes = initialList.getProbesForChromosome(chromosomes[c]);
// For directional merging we use 3 probes
Probe currentForward = null;
Probe currentReverse = null;
Probe currentUnknown = null;
// For non-directional merging we use only one
Probe currentProbe = null;
// Now we can make the actual probes
for (int i = 0; i < startingProbes.length; i++) {
if (cancel) {
generationCancelled();
return;
}
if (separateStrands) {
switch(startingProbes[i].strand()) {
case (Probe.FORWARD):
if (currentForward == null) {
currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentForward.end() + maxDistance) {
if (startingProbes[i].end() > currentForward.end()) {
// Extend the current probe
currentForward = new Probe(chromosomes[c], currentForward.start(), startingProbes[i].end(), currentForward.strand(), currentForward.name());
}
} else {
newProbes.add(currentForward);
currentForward = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
break;
case (Probe.REVERSE):
if (currentReverse == null) {
currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentReverse.end() + maxDistance) {
if (startingProbes[i].end() > currentReverse.end()) {
// Extend the current probe
currentReverse = new Probe(chromosomes[c], currentReverse.start(), startingProbes[i].end(), currentReverse.strand(), currentReverse.name());
}
} else {
newProbes.add(currentReverse);
currentReverse = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
break;
case (Probe.UNKNOWN):
if (currentUnknown == null) {
currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentUnknown.end() + maxDistance) {
if (startingProbes[i].end() > currentUnknown.end()) {
// Extend the current probe
currentUnknown = new Probe(chromosomes[c], currentUnknown.start(), startingProbes[i].end(), currentUnknown.strand(), currentUnknown.name());
}
} else {
newProbes.add(currentUnknown);
currentUnknown = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
break;
}
} else {
if (currentProbe == null) {
currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
} else if (startingProbes[i].start() <= currentProbe.end() + maxDistance) {
if (startingProbes[i].end() > currentProbe.end() || startingProbes[i].strand() != currentProbe.strand()) {
// Update the current probe
int usedStrand = currentProbe.strand();
if (startingProbes[i].strand() != currentProbe.strand()) {
usedStrand = Probe.UNKNOWN;
}
currentProbe = new Probe(chromosomes[c], currentProbe.start(), Math.max(startingProbes[i].end(), currentProbe.end()), usedStrand, currentProbe.name());
}
} else {
newProbes.add(currentProbe);
currentProbe = new Probe(chromosomes[c], startingProbes[i].packedPosition());
}
}
}
// At the end of each chromosome we add any probes we have remaining
if (currentProbe != null)
newProbes.add(currentProbe);
if (currentForward != null)
newProbes.add(currentForward);
if (currentReverse != null)
newProbes.add(currentReverse);
if (currentUnknown != null)
newProbes.add(currentUnknown);
}
Probe[] finalList = newProbes.toArray(new Probe[0]);
ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
generationComplete(finalSet);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.
the class FeatureProbeGenerator method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// We get the problem here that we won't get per chromosome updates, but this should be
// so quick I don't think we care:
updateGenerationProgress("Making probes...", 0, 1);
Probe[] finalList = optionPanel.getProbes();
if (removeDuplicates) {
// System.out.println("Removing duplicates from original list of "+finalList.length+" probes");
finalList = removeDuplicates(finalList);
// System.out.println("Unique list has "+finalList.length+" probes");
}
ProbeSet finalSet = new ProbeSet(getDescription(), finalList);
generationComplete(finalSet);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe in project SeqMonk by s-andrews.
the class MacsPeakCaller method run.
public void run() {
// for (int i=0;i<selectedChIPStores.length;i++) {
// System.err.println("Selcted ChIP="+selectedChIPStores[i]);
// }
// for (int i=0;i<selectedInputStores.length;i++) {
// System.err.println("Selcted Input="+selectedInputStores[i]);
// }
// First find the tag offsets between the watson and crick strands
// Work out the total average coverage for all of the combined ChIP samples
long totalChIPCoverage = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
}
if (cancel) {
generationCancelled();
return;
}
double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
// Now we go through the data to find locations for our high confidence peaks so we can
// randomly select 1000 of these to use to find the offset between the two strands
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
if (cancel) {
generationCancelled();
return;
}
// Time for an update
updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
Probe lastValidProbe = null;
for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
long totalLength = 0;
Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
} else if (lastValidProbe != null) {
// Check that the overall density over the region falls within our limits
totalLength = 0;
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
lastValidProbe = probe;
} else {
lastValidProbe = probe;
}
}
}
if (lastValidProbe != null) {
long totalLength = 0;
for (int s = 0; s < selectedChIPStores.length; s++) {
long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
for (int j = 0; j < reads.length; j++) {
totalLength += SequenceRead.length(reads[j]);
}
}
if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
}
}
if (potentialHighConfidencePeaks.size() == 0) {
JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
generationCancelled();
return;
}
// System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
// Now we select 1000 random probes from this set
Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
Collections.shuffle(Arrays.asList(highConfidencePeaks));
Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
randomHighConfidenceProbes[i] = highConfidencePeaks[i];
}
// Now find the average distance between forward / reverse reads in the candidate peaks
int[] distances = new int[highConfidencePeaks.length];
// Sort the candidates so we don't do stupid stuff with the cache
Arrays.sort(randomHighConfidenceProbes);
for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
}
int medianInterStrandDistance = (int) SimpleStats.median(distances);
if (medianInterStrandDistance < 0)
medianInterStrandDistance = 0;
// System.err.println("Median inter strand difference = "+medianInterStrandDistance);
// Now we find the depth cutoff for overrepresented single tags using a binomial distribution
int totalReadCount = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
totalReadCount += selectedChIPStores[i].getTotalReadCount();
}
BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
// We want to know what depth has a chance of less than 1^-5
int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
if (redundantThreshold < 1)
redundantThreshold = 1;
// System.err.println("Redundancy threshold is "+redundantThreshold);
// Now we construct a poisson distribution to work out the threshold to use for
// constructing a full candidate peak set.
updateGenerationProgress("Counting non-redundant reads", 0, 1);
// To do this we need to get the full non-redundant length from the whole set
int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
// System.err.println("Total non-redundant sequences is "+totalNonRedCount);
// We need to know the median read length for the data
int readLength = 0;
for (int i = 0; i < selectedChIPStores.length; i++) {
readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
}
readLength /= selectedChIPStores.length;
double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
// System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
// Now we go back through the whole dataset to do a search for all possible candidate probes
// We re-use the peak vector we came up with before.
potentialHighConfidencePeaks.clear();
for (int c = 0; c < chromosomes.length; c++) {
// Time for an update
updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
Probe lastValidProbe = null;
for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
// We expand the region we're looking at by the inter-strand distance as we're going to
// be adjusting the read positions
Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
SequenceRead.sort(mergedProbeReads);
int thisProbeOverlapCount = 0;
for (int i = 0; i < mergedProbeReads.length; i++) {
if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
++thisProbeOverlapCount;
}
}
if (thisProbeOverlapCount > readCountCutoff) {
if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
} else if (lastValidProbe != null) {
potentialHighConfidencePeaks.add(lastValidProbe);
lastValidProbe = probe;
} else {
lastValidProbe = probe;
}
}
}
if (lastValidProbe != null) {
potentialHighConfidencePeaks.add(lastValidProbe);
}
}
// Finally we re-filter the peaks we have using local poisson distributions with densities taken
// from either the input samples (if there are any), or the local region. The densities are
// estimated over 1,5 and 10kb around the peak and genome wide and the max of these is taken.
// If there is no input then the 1kb region is not used.
Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
// Work out which stores we're using to validate against.
DataStore[] validationStores;
boolean useInput = false;
double inputCorrection = 1;
int validationNonRedCount;
if (selectedInputStores != null && selectedInputStores.length > 0) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
validationStores = selectedInputStores;
useInput = true;
// We also need to work out the total number of nonredundant seqences
// in the input so we can work out a scaling factor so that the densities
// for input and chip are comparable.
validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
inputCorrection = totalNonRedCount / (double) validationNonRedCount;
System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
} else {
validationStores = selectedChIPStores;
validationNonRedCount = totalNonRedCount;
}
Vector<Probe> finalValidatedProbes = new Vector<Probe>();
for (int p = 0; p < allCandidateProbes.length; p++) {
// See if we need to quit
if (cancel) {
generationCancelled();
return;
}
if (p % 100 == 0) {
updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
}
// System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
// We now need to find the maximum read density per 2*bandwidth against which
// we're going to validate this peak
// We're going to get all reads within 10kb of the peak, and then we can subselect from there
int midPoint = allCandidateProbes[p].middle();
Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
// Get the probes for the largest region
long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
// Deduplicate so it's a fair comparison
// Should we recalculate the redundant threshold based on the input coverage?
thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
int region10kbcount = thisRegionReads.length;
int region5kbcount = 0;
int region1kbcount = 0;
// Go through the reads seeing if they fit into the 5 or 1kb regions
for (int r = 0; r < thisRegionReads.length; r++) {
if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
++region5kbcount;
if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
++region1kbcount;
}
// System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
// Convert to densities per window and ajdust for global coverage
double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
// Find the highest density to use for the validation
double highestDensity = globalDensity;
if (density10kb > highestDensity)
highestDensity = density10kb;
if (density5kb > highestDensity)
highestDensity = density5kb;
if (useInput && density1kb > highestDensity)
highestDensity = density1kb;
// System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
// Construct a poisson distribution with this density
PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
// System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
// Now check to see if the actual count from this peak is enough to still pass
long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
SequenceRead.sort(mergedProbeReads);
int thisProbeOverlapCount = 0;
for (int i = 0; i < mergedProbeReads.length; i++) {
if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
++thisProbeOverlapCount;
}
}
if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
finalValidatedProbes.add(allCandidateProbes[p]);
// System.err.println("Adding probe to final set");
}
}
// System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
generationComplete(finalSet);
}
Aggregations