Examples with DataStore - uk.ac.babraham.SeqMonk.DataTypes.DataStore

Example 41 with DataStore

use of uk.ac.babraham.SeqMonk.DataTypes.DataStore in project SeqMonk by s-andrews.

the class EdgeRForRevFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
@Override
protected void generateProbeList() {
    // We need to make a temporary directory, save the data into it, write out the R script
    // and then run it an collect the list of results, then clean up.
    // Make up the list of DataStores in each replicate set
    DataStore[] fromStores = replicateSets[0].dataStores();
    DataStore[] toStores = replicateSets[1].dataStores();
    File tempDir;
    try {
        Probe[] probes = startingList.getAllProbes();
        if (resample) {
            // We need to check that the data stores are quantitated
            for (int i = 0; i < fromStores.length; i++) {
                if (!fromStores[i].isQuantitated()) {
                    progressExceptionReceived(new SeqMonkException("Data Store " + fromStores[i].name() + " wasn't quantitated"));
                    return;
                }
                for (int p = 0; p < probes.length; p++) {
                    float value = fromStores[i].getValueForProbe(probes[p]);
                    if ((!Float.isNaN(value)) && (value < 0 || value > 100)) {
                        progressExceptionReceived(new SeqMonkException("Data Store " + fromStores[i].name() + " had a value outside the range 0-100 (" + value + ")"));
                        return;
                    }
                }
            }
            for (int i = 0; i < toStores.length; i++) {
                if (!toStores[i].isQuantitated()) {
                    progressExceptionReceived(new SeqMonkException("Data Store " + toStores[i].name() + " wasn't quantitated"));
                    return;
                }
                for (int p = 0; p < probes.length; p++) {
                    float value = toStores[i].getValueForProbe(probes[p]);
                    if ((!Float.isNaN(value)) && (value < 0 || value > 100)) {
                        progressExceptionReceived(new SeqMonkException("Data Store " + toStores[i].name() + " had a value outside the range 0-100 (" + value + ")"));
                        return;
                    }
                }
            }
        }
        progressUpdated("Creating temp directory", 0, 1);
        tempDir = TempDirectory.createTempDirectory();
        System.err.println("Temp dir is " + tempDir.getAbsolutePath());
        progressUpdated("Writing R script", 0, 1);
        // Get the template script
        Template template = new Template(ClassLoader.getSystemResource("uk/ac/babraham/SeqMonk/Filters/EdgeRFilter/edger_for_rev_template.r"));
        // Substitute in the variables we need to change
        template.setValue("WORKING", tempDir.getAbsolutePath().replace("\\", "/"));
        template.setValue("DIFFERENCE", "" + absDiffCutoff);
        template.setValue("PVALUE", "" + pValueCutoff);
        if (multiTest) {
            template.setValue("MULTITEST", "TRUE");
            template.setValue("CORRECTED", "FDR");
        } else {
            template.setValue("MULTITEST", "FALSE");
            template.setValue("CORRECTED", "PValue");
        }
        // For the conditions we just repeat "from" and "to" the number of times they occur in the
        // samples (twice for each sample since we have both meth and unmeth)
        StringBuffer conditions = new StringBuffer();
        for (int i = 0; i < fromStores.length; i++) {
            if (i > 0)
                conditions.append(",");
            conditions.append("\"from\",\"from\"");
        }
        for (int i = 0; i < toStores.length; i++) {
            conditions.append(",\"to\",\"to\"");
        }
        template.setValue("CONDITIONS", conditions.toString());
        // Write the script file
        File scriptFile = new File(tempDir.getAbsoluteFile() + "/script.r");
        PrintWriter pr = new PrintWriter(scriptFile);
        pr.print(template.toString());
        pr.close();
        // Write the count data
        File outFile = new File(tempDir.getAbsoluteFile() + "/counts.txt");
        pr = new PrintWriter(outFile);
        pr.print("id");
        for (int i = 0; i < fromStores.length; i++) {
            pr.print("\tfrom_" + i + "_meth\tfrom_" + i + "_unmeth");
        }
        for (int i = 0; i < toStores.length; i++) {
            pr.print("\tto_" + i + "_meth\tto_" + i + "_unmeth");
        }
        pr.print("\n");
        PROBE: for (int p = 0; p < probes.length; p++) {
            if (p % 1000 == 0) {
                progressUpdated("Writing data for chr" + probes[p].chromosome().name(), p, probes.length);
            }
            int[] fromMethCounts = new int[fromStores.length];
            int[] fromUnmethCounts = new int[fromStores.length];
            int[] toMethCounts = new int[toStores.length];
            int[] toUnmethCounts = new int[toStores.length];
            for (int i = 0; i < fromStores.length; i++) {
                long[] reads = fromStores[i].getReadsForProbe(probes[p]);
                int totalCount = 0;
                int methCount = 0;
                if (resample) {
                    float value = fromStores[i].getValueForProbe(probes[p]);
                    if (Float.isNaN(value)) {
                        continue PROBE;
                    }
                    totalCount = reads.length;
                    methCount = Math.round((totalCount * value) / 100f);
                } else {
                    for (int r = 0; r < reads.length; r++) {
                        totalCount++;
                        if (SequenceRead.strand(reads[r]) == Location.FORWARD) {
                            ++methCount;
                        }
                    }
                }
                fromMethCounts[i] = methCount;
                fromUnmethCounts[i] = totalCount - methCount;
            }
            for (int i = 0; i < toStores.length; i++) {
                long[] reads = toStores[i].getReadsForProbe(probes[p]);
                int totalCount = 0;
                int methCount = 0;
                if (resample) {
                    float value = toStores[i].getValueForProbe(probes[p]);
                    if (Float.isNaN(value)) {
                        continue PROBE;
                    }
                    totalCount = reads.length;
                    methCount = Math.round((totalCount * value) / 100f);
                } else {
                    for (int r = 0; r < reads.length; r++) {
                        totalCount++;
                        if (SequenceRead.strand(reads[r]) == Location.FORWARD) {
                            ++methCount;
                        }
                    }
                }
                toMethCounts[i] = methCount;
                toUnmethCounts[i] = totalCount - methCount;
            }
            // Check to see we meet the requirements for the min amount of information
            // and the min diff.
            int totalFromMeth = 0;
            int totalFrom = 0;
            int totalToMeth = 0;
            int totalTo = 0;
            int validFrom = 0;
            for (int i = 0; i < fromStores.length; i++) {
                totalFromMeth += fromMethCounts[i];
                totalFrom += fromMethCounts[i];
                totalFrom += fromUnmethCounts[i];
                if (fromMethCounts[i] > 0 || fromUnmethCounts[i] > 0) {
                    ++validFrom;
                }
            }
            int validTo = 0;
            for (int i = 0; i < toStores.length; i++) {
                totalToMeth += toMethCounts[i];
                totalTo += toMethCounts[i];
                totalTo += toUnmethCounts[i];
                if (toMethCounts[i] > 0 || toUnmethCounts[i] > 0) {
                    ++validTo;
                }
            }
            // EdgeR only requires 2 valid observations
            if (validFrom < 2 || validTo < 2) {
                // We don't have enough data to measure this one
                continue;
            }
            if (Math.abs((totalFromMeth * 100f / totalFrom) - (totalToMeth * 100f / totalTo)) < absDiffCutoff) {
                continue;
            }
            float[] fromPercentages = new float[validFrom];
            float[] toPercentages = new float[validTo];
            int lastFromIndex = 0;
            int lastToIndex = 0;
            for (int i = 0; i < fromMethCounts.length; i++) {
                if (fromMethCounts[i] + fromUnmethCounts[i] == 0)
                    continue;
                fromPercentages[lastFromIndex] = fromMethCounts[i] * 100f / (fromMethCounts[i] + fromUnmethCounts[i]);
                ++lastFromIndex;
            }
            for (int i = 0; i < toMethCounts.length; i++) {
                if (toMethCounts[i] + toUnmethCounts[i] == 0)
                    continue;
                toPercentages[lastToIndex] = toMethCounts[i] * 100f / (toMethCounts[i] + toUnmethCounts[i]);
                ++lastToIndex;
            }
            if (Math.abs(SimpleStats.mean(fromPercentages) - SimpleStats.mean(toPercentages)) < absDiffCutoff) {
                continue;
            }
            // If we get here then we're OK to use this probe so we print out its data.  We put all of the
            // data for one probe onto a single line.  The first value is the index of the probe.  The
            // rest are pairs of meth/unmeth values for the from samples then the to samples.
            pr.print(p);
            for (int i = 0; i < fromMethCounts.length; i++) {
                pr.print("\t" + fromMethCounts[i] + "\t" + fromUnmethCounts[i]);
            }
            for (int i = 0; i < toMethCounts.length; i++) {
                pr.print("\t" + toMethCounts[i] + "\t" + toUnmethCounts[i]);
            }
            pr.print("\n");
        }
        pr.close();
        progressUpdated("Running R Script", 0, 1);
        RScriptRunner runner = new RScriptRunner(tempDir);
        RProgressListener listener = new RProgressListener(runner);
        runner.addProgressListener(new ProgressRecordDialog("R Session", runner));
        runner.runScript();
        while (true) {
            if (listener.cancelled()) {
                progressCancelled();
                pr.close();
                return;
            }
            if (listener.exceptionReceived()) {
                progressExceptionReceived(new SeqMonkException("R Script failed"));
                pr.close();
                return;
            }
            if (listener.complete())
                break;
            Thread.sleep(500);
        }
        // We can now parse the results and put the hits into a new probe list
        ProbeList newList;
        if (multiTest) {
            newList = new ProbeList(startingList, "", "", "FDR");
        } else {
            newList = new ProbeList(startingList, "", "", "p-value");
        }
        File hitsFile = new File(tempDir.getAbsolutePath() + "/hits.txt");
        BufferedReader br = new BufferedReader(new FileReader(hitsFile));
        String line = br.readLine();
        while ((line = br.readLine()) != null) {
            String[] sections = line.split("\t");
            String[] indexSections = sections[0].split("\\.");
            int probeIndex = Integer.parseInt(indexSections[indexSections.length - 1]);
            float pValue = Float.parseFloat(sections[sections.length - 1]);
            newList.addProbe(probes[probeIndex], pValue);
        }
        br.close();
        runner.cleanUp();
        filterFinished(newList);
    } catch (Exception ioe) {
        progressExceptionReceived(ioe);
        return;
    }
}

Also used : ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) RProgressListener(uk.ac.babraham.SeqMonk.R.RProgressListener) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Template(uk.ac.babraham.SeqMonk.Utilities.Templates.Template) ProgressRecordDialog(uk.ac.babraham.SeqMonk.Dialogs.ProgressRecordDialog) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) File(java.io.File) RScriptRunner(uk.ac.babraham.SeqMonk.R.RScriptRunner) PrintWriter(java.io.PrintWriter)

Example 42 with DataStore

use of uk.ac.babraham.SeqMonk.DataTypes.DataStore in project SeqMonk by s-andrews.

the class MacsPeakCaller method run.

public void run() {
    // for (int i=0;i<selectedChIPStores.length;i++) {
    // System.err.println("Selcted ChIP="+selectedChIPStores[i]);
    // }
    // for (int i=0;i<selectedInputStores.length;i++) {
    // System.err.println("Selcted Input="+selectedInputStores[i]);
    // }
    // First find the tag offsets between the watson and crick strands
    // Work out the total average coverage for all of the combined ChIP samples
    long totalChIPCoverage = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalChIPCoverage += selectedChIPStores[i].getTotalReadLength();
    }
    if (cancel) {
        generationCancelled();
        return;
    }
    double averageChIPCoveragePerBase = totalChIPCoverage / (double) collection.genome().getTotalGenomeLength();
    double lowerCoverage = averageChIPCoveragePerBase * minFoldEnrichment;
    double upperCoverage = averageChIPCoveragePerBase * maxFoldEnrichment;
    System.err.println("Coverage range for high confidence peaks is " + lowerCoverage + " - " + upperCoverage);
    // Now we go through the data to find locations for our high confidence peaks so we can
    // randomly select 1000 of these to use to find the offset between the two strands
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> potentialHighConfidencePeaks = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        if (cancel) {
            generationCancelled();
            return;
        }
        // Time for an update
        updateGenerationProgress("Finding high confidence peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            long totalLength = 0;
            Probe probe = new Probe(chromosomes[c], startPosition, startPosition + fragmentSize);
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(probe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * probe.length()) && totalLength <= upperCoverage * probe.length()) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    // Check that the overall density over the region falls within our limits
                    totalLength = 0;
                    for (int s = 0; s < selectedChIPStores.length; s++) {
                        long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                        for (int j = 0; j < reads.length; j++) {
                            totalLength += SequenceRead.length(reads[j]);
                        }
                    }
                    if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                        potentialHighConfidencePeaks.add(lastValidProbe);
                    }
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            long totalLength = 0;
            for (int s = 0; s < selectedChIPStores.length; s++) {
                long[] reads = selectedChIPStores[s].getReadsForProbe(lastValidProbe);
                for (int j = 0; j < reads.length; j++) {
                    totalLength += SequenceRead.length(reads[j]);
                }
            }
            if (totalLength >= (lowerCoverage * lastValidProbe.length()) && totalLength <= upperCoverage * lastValidProbe.length()) {
                potentialHighConfidencePeaks.add(lastValidProbe);
            }
        }
    }
    if (potentialHighConfidencePeaks.size() == 0) {
        JOptionPane.showMessageDialog(SeqMonkApplication.getInstance(), "No high confidence peaks found", "Quitting generator", JOptionPane.INFORMATION_MESSAGE);
        generationCancelled();
        return;
    }
    // System.err.println("Found "+potentialHighConfidencePeaks.size()+" high confidence peaks");
    // Now we select 1000 random probes from this set
    Probe[] highConfidencePeaks = potentialHighConfidencePeaks.toArray(new Probe[0]);
    Collections.shuffle(Arrays.asList(highConfidencePeaks));
    Probe[] randomHighConfidenceProbes = new Probe[Math.min(highConfidencePeaks.length, 1000)];
    for (int i = 0; i < randomHighConfidenceProbes.length; i++) {
        randomHighConfidenceProbes[i] = highConfidencePeaks[i];
    }
    // Now find the average distance between forward / reverse reads in the candidate peaks
    int[] distances = new int[highConfidencePeaks.length];
    // Sort the candidates so we don't do stupid stuff with the cache
    Arrays.sort(randomHighConfidenceProbes);
    for (int p = 0; p < randomHighConfidenceProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        distances[p] = getInterStrandDistance(randomHighConfidenceProbes[p], selectedChIPStores);
    }
    int medianInterStrandDistance = (int) SimpleStats.median(distances);
    if (medianInterStrandDistance < 0)
        medianInterStrandDistance = 0;
    // System.err.println("Median inter strand difference = "+medianInterStrandDistance);
    // Now we find the depth cutoff for overrepresented single tags using a binomial distribution
    int totalReadCount = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        totalReadCount += selectedChIPStores[i].getTotalReadCount();
    }
    BinomialDistribution bin = new BinomialDistribution(totalReadCount, 1d / collection.genome().getTotalGenomeLength());
    // We want to know what depth has a chance of less than 1^-5
    int redundantThreshold = bin.inverseCumulativeProbability(1 - 0.00001d);
    if (redundantThreshold < 1)
        redundantThreshold = 1;
    // System.err.println("Redundancy threshold is "+redundantThreshold);
    // Now we construct a poisson distribution to work out the threshold to use for
    // constructing a full candidate peak set.
    updateGenerationProgress("Counting non-redundant reads", 0, 1);
    // To do this we need to get the full non-redundant length from the whole set
    int totalNonRedCount = getNonRedundantReadCount(selectedChIPStores, redundantThreshold);
    // System.err.println("Total non-redundant sequences is "+totalNonRedCount);
    // We need to know the median read length for the data
    int readLength = 0;
    for (int i = 0; i < selectedChIPStores.length; i++) {
        readLength += selectedChIPStores[i].getTotalReadLength() / selectedChIPStores[i].getTotalReadCount();
    }
    readLength /= selectedChIPStores.length;
    double expectedCountsPerWindow = getExpectedCountPerWindow(totalNonRedCount, collection.genome().getTotalGenomeLength(), fragmentSize, readLength);
    PoissonDistribution poisson = new PoissonDistribution(expectedCountsPerWindow);
    int readCountCutoff = poisson.inverseCumulativeProbability(1 - pValue);
    // System.err.println("Threshold for enrichment in a window is "+readCountCutoff+" reads using a p-value of "+pValue+" and a mean of "+(totalNonRedCount/(collection.genome().getTotalGenomeLength()/(double)fragmentSize)));
    // Now we go back through the whole dataset to do a search for all possible candidate probes
    // We re-use the peak vector we came up with before.
    potentialHighConfidencePeaks.clear();
    for (int c = 0; c < chromosomes.length; c++) {
        // Time for an update
        updateGenerationProgress("Finding candidate peaks on chromosome " + chromosomes[c].name(), c, chromosomes.length);
        Probe lastValidProbe = null;
        for (int startPosition = 1; startPosition < chromosomes[c].length() - fragmentSize; startPosition += fragmentSize / 2) {
            // See if we need to quit
            if (cancel) {
                generationCancelled();
                return;
            }
            // We expand the region we're looking at by the inter-strand distance as we're going to
            // be adjusting the read positions
            Probe probe = new Probe(chromosomes[c], startPosition, (startPosition + fragmentSize - 1));
            long[] mergedProbeReads = getReadsFromDataStoreCollection(probe, selectedChIPStores, medianInterStrandDistance);
            mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
            SequenceRead.sort(mergedProbeReads);
            int thisProbeOverlapCount = 0;
            for (int i = 0; i < mergedProbeReads.length; i++) {
                if (SequenceRead.overlaps(mergedProbeReads[i], probe.packedPosition())) {
                    ++thisProbeOverlapCount;
                }
            }
            if (thisProbeOverlapCount > readCountCutoff) {
                if (lastValidProbe != null && SequenceRead.overlaps(lastValidProbe.packedPosition(), probe.packedPosition())) {
                    lastValidProbe = new Probe(chromosomes[c], lastValidProbe.start(), probe.end());
                } else if (lastValidProbe != null) {
                    potentialHighConfidencePeaks.add(lastValidProbe);
                    lastValidProbe = probe;
                } else {
                    lastValidProbe = probe;
                }
            }
        }
        if (lastValidProbe != null) {
            potentialHighConfidencePeaks.add(lastValidProbe);
        }
    }
    // Finally we re-filter the peaks we have using local poisson distributions with densities taken
    // from either the input samples (if there are any), or the local region.  The densities are
    // estimated over 1,5 and 10kb around the peak and genome wide and the max of these is taken.
    // If there is no input then the 1kb region is not used.
    Probe[] allCandidateProbes = potentialHighConfidencePeaks.toArray(new Probe[0]);
    // Work out which stores we're using to validate against.
    DataStore[] validationStores;
    boolean useInput = false;
    double inputCorrection = 1;
    int validationNonRedCount;
    if (selectedInputStores != null && selectedInputStores.length > 0) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        validationStores = selectedInputStores;
        useInput = true;
        // We also need to work out the total number of nonredundant seqences
        // in the input so we can work out a scaling factor so that the densities
        // for input and chip are comparable.
        validationNonRedCount = getNonRedundantReadCount(validationStores, redundantThreshold);
        inputCorrection = totalNonRedCount / (double) validationNonRedCount;
        System.err.println("From chip=" + totalNonRedCount + " input=" + validationNonRedCount + " correction is " + inputCorrection);
    } else {
        validationStores = selectedChIPStores;
        validationNonRedCount = totalNonRedCount;
    }
    Vector<Probe> finalValidatedProbes = new Vector<Probe>();
    for (int p = 0; p < allCandidateProbes.length; p++) {
        // See if we need to quit
        if (cancel) {
            generationCancelled();
            return;
        }
        if (p % 100 == 0) {
            updateGenerationProgress("Validated " + p + " out of " + allCandidateProbes.length + " raw peaks", p, allCandidateProbes.length);
        }
        // System.err.println("Validating "+allCandidateProbes[p].chromosome()+":"+allCandidateProbes[p].start()+"-"+allCandidateProbes[p].end());
        // We now need to find the maximum read density per 2*bandwidth against which
        // we're going to validate this peak
        // We're going to get all reads within 10kb of the peak, and then we can subselect from there
        int midPoint = allCandidateProbes[p].middle();
        Probe region10kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 5000, 1), Math.min(midPoint + 4999, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region5kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 2500, 1), Math.min(midPoint + 2499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        Probe region1kb = new Probe(allCandidateProbes[p].chromosome(), Math.max(midPoint - 500, 1), Math.min(midPoint + 499, allCandidateProbes[p].chromosome().length()), allCandidateProbes[p].strand());
        // Get the probes for the largest region
        long[] thisRegionReads = getReadsFromDataStoreCollection(region10kb, validationStores, 0);
        // Deduplicate so it's a fair comparison
        // Should we recalculate the redundant threshold based on the input coverage?
        thisRegionReads = deduplicateReads(thisRegionReads, redundantThreshold);
        int region10kbcount = thisRegionReads.length;
        int region5kbcount = 0;
        int region1kbcount = 0;
        // Go through the reads seeing if they fit into the 5 or 1kb regions
        for (int r = 0; r < thisRegionReads.length; r++) {
            if (SequenceRead.overlaps(region5kb.packedPosition(), thisRegionReads[r]))
                ++region5kbcount;
            if (SequenceRead.overlaps(region1kb.packedPosition(), thisRegionReads[r]))
                ++region1kbcount;
        }
        // System.err.println("Input counts 10kb="+region10kbcount+" 5kb="+region5kbcount+" 1kb="+region1kbcount);
        // Convert to densities per window and ajdust for global coverage
        double globalDensity = getExpectedCountPerWindow(validationNonRedCount, collection.genome().getTotalGenomeLength(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density10kb = getExpectedCountPerWindow(region10kbcount, region10kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density5kb = getExpectedCountPerWindow(region5kbcount, region5kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        double density1kb = getExpectedCountPerWindow(region1kbcount, region1kb.length(), allCandidateProbes[p].length(), readLength) * inputCorrection;
        // Find the highest density to use for the validation
        double highestDensity = globalDensity;
        if (density10kb > highestDensity)
            highestDensity = density10kb;
        if (density5kb > highestDensity)
            highestDensity = density5kb;
        if (useInput && density1kb > highestDensity)
            highestDensity = density1kb;
        // System.err.println("Global="+globalDensity+" 10kb="+density10kb+" 5kb="+density5kb+" 1kb="+density1kb+" using="+highestDensity);
        // Construct a poisson distribution with this density
        PoissonDistribution localPoisson = new PoissonDistribution(highestDensity);
        // System.err.println("Cutoff from global="+(new PoissonDistribution(globalDensity)).inverseCumulativeProbability(1-pValue)+" 10kb="+(new PoissonDistribution(density10kb)).inverseCumulativeProbability(1-pValue)+" 5kb="+(new PoissonDistribution(density5kb)).inverseCumulativeProbability(1-pValue)+" 1kb="+(new PoissonDistribution(density1kb)).inverseCumulativeProbability(1-pValue));
        // Now check to see if the actual count from this peak is enough to still pass
        long[] mergedProbeReads = getReadsFromDataStoreCollection(allCandidateProbes[p], selectedChIPStores, medianInterStrandDistance);
        mergedProbeReads = deduplicateReads(mergedProbeReads, redundantThreshold);
        SequenceRead.sort(mergedProbeReads);
        int thisProbeOverlapCount = 0;
        for (int i = 0; i < mergedProbeReads.length; i++) {
            if (SequenceRead.overlaps(mergedProbeReads[i], allCandidateProbes[p].packedPosition())) {
                ++thisProbeOverlapCount;
            }
        }
        if (thisProbeOverlapCount > localPoisson.inverseCumulativeProbability(1 - pValue)) {
            finalValidatedProbes.add(allCandidateProbes[p]);
        // System.err.println("Adding probe to final set");
        }
    }
    // System.err.println("From "+allCandidateProbes.length+" candidates "+finalValidatedProbes.size()+" peaks were validated");
    ProbeSet finalSet = new ProbeSet(getDescription(), finalValidatedProbes.toArray(new Probe[0]));
    generationComplete(finalSet);
}

Also used : PoissonDistribution(org.apache.commons.math3.distribution.PoissonDistribution) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) BinomialDistribution(org.apache.commons.math3.distribution.BinomialDistribution) Vector(java.util.Vector) LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector)

Example 43 with DataStore

use of uk.ac.babraham.SeqMonk.DataTypes.DataStore in project SeqMonk by s-andrews.

the class MacsPeakCaller method getOptionsPanel.

public JPanel getOptionsPanel() {
    if (optionPanel != null) {
        // We've done this already
        return optionPanel;
    }
    optionPanel = new JPanel();
    optionPanel.setLayout(new GridBagLayout());
    GridBagConstraints gbc = new GridBagConstraints();
    gbc.gridx = 1;
    gbc.gridy = 1;
    gbc.weightx = 0.5;
    gbc.weighty = 0.1;
    gbc.fill = GridBagConstraints.BOTH;
    optionPanel.add(new JLabel("ChIP Stores to use"), gbc);
    gbc.gridy++;
    gbc.weighty = 0.5;
    DataStore[] stores = collection.getAllDataStores();
    chipStoresList = new JList(stores);
    chipStoresList.getSelectionModel().addListSelectionListener(this);
    chipStoresList.setCellRenderer(new TypeColourRenderer());
    optionPanel.add(new JScrollPane(chipStoresList), gbc);
    gbc.gridy++;
    gbc.weighty = 0.1;
    optionPanel.add(new JLabel("Input Stores to use"), gbc);
    gbc.gridy++;
    gbc.weighty = 0.5;
    inputStoresList = new JList(stores);
    inputStoresList.getSelectionModel().addListSelectionListener(this);
    inputStoresList.setCellRenderer(new TypeColourRenderer());
    optionPanel.add(new JScrollPane(inputStoresList), gbc);
    gbc.gridy++;
    JPanel bottomPanel = new JPanel();
    bottomPanel.setLayout(new GridBagLayout());
    GridBagConstraints bgbc = new GridBagConstraints();
    bgbc.gridx = 0;
    bgbc.gridy = 0;
    bgbc.weightx = 0.5;
    bgbc.weighty = 0.5;
    bgbc.fill = GridBagConstraints.HORIZONTAL;
    bottomPanel.add(new JLabel("P-value cutoff "), bgbc);
    pValueField = new JTextField("" + pValue);
    pValueField.addKeyListener(new NumberKeyListener(true, false, 1));
    bgbc.gridx = 1;
    bottomPanel.add(pValueField, bgbc);
    bgbc.gridx = 0;
    bgbc.gridy++;
    bottomPanel.add(new JLabel("Sonicated fragment size "), bgbc);
    fragmentSizeField = new JTextField("300");
    fragmentSizeField.addKeyListener(new NumberKeyListener(false, false));
    bgbc.gridx = 1;
    bottomPanel.add(fragmentSizeField, bgbc);
    bgbc.gridx = 0;
    bgbc.gridy++;
    bottomPanel.add(new JLabel("Skip deduplication step "), bgbc);
    skipDeduplicationBox = new JCheckBox();
    bgbc.gridx = 1;
    bottomPanel.add(skipDeduplicationBox, bgbc);
    optionPanel.add(bottomPanel, gbc);
    return optionPanel;
}

Also used : JScrollPane(javax.swing.JScrollPane) JCheckBox(javax.swing.JCheckBox) JPanel(javax.swing.JPanel) GridBagConstraints(java.awt.GridBagConstraints) GridBagLayout(java.awt.GridBagLayout) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) JLabel(javax.swing.JLabel) JTextField(javax.swing.JTextField) NumberKeyListener(uk.ac.babraham.SeqMonk.Utilities.NumberKeyListener) JList(javax.swing.JList) TypeColourRenderer(uk.ac.babraham.SeqMonk.Dialogs.Renderers.TypeColourRenderer)

Example 44 with DataStore

use of uk.ac.babraham.SeqMonk.DataTypes.DataStore in project SeqMonk by s-andrews.

the class LogisticRegressionSplicingFilter method generateProbeList.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
	 */
@Override
protected void generateProbeList() {
    // We need to make a temporary directory, save the data into it, write out the R script
    // and then run it an collect the list of results, then clean up.
    // Make up the list of DataStores in each replicate set
    DataStore[] fromStores = replicateSets[0].dataStores();
    DataStore[] toStores = replicateSets[1].dataStores();
    File tempDir;
    try {
        Probe[] probes = startingList.getAllProbes();
        probes = filterProbesByCount(probes, fromStores, toStores);
        progressUpdated("Pairing Probes", 0, 1);
        // Make pairs of probes to test
        ProbePair[] pairs = makeProbePairs(probes, fromStores, toStores);
        System.err.println("Found " + pairs.length + " pairs to test");
        progressUpdated("Creating temp directory", 0, 1);
        tempDir = TempDirectory.createTempDirectory();
        System.err.println("Temp dir is " + tempDir.getAbsolutePath());
        progressUpdated("Writing R script", 0, 1);
        // Get the template script
        Template template = new Template(ClassLoader.getSystemResource("uk/ac/babraham/SeqMonk/Filters/LogisticRegressionFilter/logistic_regression_template.r"));
        // Substitute in the variables we need to change
        template.setValue("WORKING", tempDir.getAbsolutePath().replace("\\", "/"));
        template.setValue("PVALUE", "" + pValueCutoff);
        template.setValue("CORRECTCOUNT", "" + pairs.length);
        if (multiTest) {
            template.setValue("MULTITEST", "TRUE");
        } else {
            template.setValue("MULTITEST", "FALSE");
        }
        // Write the script file
        File scriptFile = new File(tempDir.getAbsoluteFile() + "/script.r");
        PrintWriter pr = new PrintWriter(scriptFile);
        pr.print(template.toString());
        pr.close();
        // Write the count data
        // Sort these so we can get probes from the same chromosome together
        Arrays.sort(probes);
        pr = null;
        String lastChr = "";
        for (int p = 0; p < pairs.length; p++) {
            if (!pairs[p].probe1.chromosome().name().equals(lastChr)) {
                if (pr != null)
                    pr.close();
                File outFile = new File(tempDir.getAbsoluteFile() + "/data_chr" + pairs[p].probe1.chromosome().name() + ".txt");
                pr = new PrintWriter(outFile);
                lastChr = pairs[p].probe1.chromosome().name();
                pr.println("id\tgroup\treplicate\tstate\tcount");
            }
            if (p % 1000 == 0) {
                progressUpdated("Writing data for chr" + lastChr, p, probes.length);
            }
            int[] fromProbe1Counts = new int[fromStores.length];
            int[] fromProbe2Counts = new int[fromStores.length];
            int[] toProbe1Counts = new int[toStores.length];
            int[] toProbe2Counts = new int[toStores.length];
            for (int i = 0; i < fromStores.length; i++) {
                // TODO: For the moment we'll expect they've quantitated this themselves
                fromProbe1Counts[i] = (int) fromStores[i].getValueForProbe(pairs[p].probe1);
                fromProbe2Counts[i] = (int) fromStores[i].getValueForProbe(pairs[p].probe2);
            }
            for (int i = 0; i < toStores.length; i++) {
                // TODO: For the moment we'll expect they've quantitated this themselves
                toProbe1Counts[i] = (int) toStores[i].getValueForProbe(pairs[p].probe1);
                toProbe2Counts[i] = (int) toStores[i].getValueForProbe(pairs[p].probe2);
            }
            for (int i = 0; i < fromProbe1Counts.length; i++) {
                pr.println(p + "\tfrom\tfrom" + i + "\tmeth\t" + fromProbe1Counts[i]);
                pr.println(p + "\tfrom\tfrom" + i + "\tunmeth\t" + fromProbe2Counts[i]);
            }
            for (int i = 0; i < toProbe1Counts.length; i++) {
                pr.println(p + "\tto\tto" + i + "\tmeth\t" + toProbe1Counts[i]);
                pr.println(p + "\tto\tto" + i + "\tunmeth\t" + toProbe2Counts[i]);
            }
        }
        if (pr != null)
            pr.close();
        progressUpdated("Running R Script", 0, 1);
        RScriptRunner runner = new RScriptRunner(tempDir);
        RProgressListener listener = new RProgressListener(runner);
        runner.addProgressListener(new ProgressRecordDialog("R Session", runner));
        runner.runScript();
        while (true) {
            if (listener.cancelled()) {
                progressCancelled();
                pr.close();
                return;
            }
            if (listener.exceptionReceived()) {
                progressExceptionReceived(new SeqMonkException("R Script failed"));
                pr.close();
                return;
            }
            if (listener.complete())
                break;
            Thread.sleep(500);
        }
        // We can now parse the results and put the hits into a new probe list
        ProbeList newList;
        if (multiTest) {
            newList = new ProbeList(startingList, "", "", "FDR");
        } else {
            newList = new ProbeList(startingList, "", "", "p-value");
        }
        File hitsFile = new File(tempDir.getAbsolutePath() + "/hits.txt");
        BufferedReader br = new BufferedReader(new FileReader(hitsFile));
        String line = br.readLine();
        // In case the same probe is found multiple times we'll cache the p-values
        // we see and report the best one.
        HashMap<Probe, Float> cachedHits = new HashMap<Probe, Float>();
        while ((line = br.readLine()) != null) {
            String[] sections = line.split("\t");
            String[] indexSections = sections[0].split("\\.");
            int probeIndex = Integer.parseInt(indexSections[indexSections.length - 1]);
            float pValue = Float.parseFloat(sections[sections.length - 1]);
            if (!cachedHits.containsKey(pairs[probeIndex].probe1)) {
                cachedHits.put(pairs[probeIndex].probe1, pValue);
            }
            if (!cachedHits.containsKey(pairs[probeIndex].probe2)) {
                cachedHits.put(pairs[probeIndex].probe2, pValue);
            }
            // See if the pvalue we've got is better than the one we're storing
            if (pValue < cachedHits.get(pairs[probeIndex].probe1)) {
                cachedHits.put(pairs[probeIndex].probe1, pValue);
            }
            if (pValue < cachedHits.get(pairs[probeIndex].probe2)) {
                cachedHits.put(pairs[probeIndex].probe2, pValue);
            }
        }
        br.close();
        for (Probe probeHit : cachedHits.keySet()) {
            newList.addProbe(probeHit, cachedHits.get(probeHit));
        }
        runner.cleanUp();
        filterFinished(newList);
    } catch (Exception ioe) {
        progressExceptionReceived(ioe);
        return;
    }
}

Also used : HashMap(java.util.HashMap) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) RProgressListener(uk.ac.babraham.SeqMonk.R.RProgressListener) Template(uk.ac.babraham.SeqMonk.Utilities.Templates.Template) ProgressRecordDialog(uk.ac.babraham.SeqMonk.Dialogs.ProgressRecordDialog) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) PrintWriter(java.io.PrintWriter) ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) BufferedReader(java.io.BufferedReader) File(java.io.File) RScriptRunner(uk.ac.babraham.SeqMonk.R.RScriptRunner)

Example 45 with DataStore

use of uk.ac.babraham.SeqMonk.DataTypes.DataStore in project SeqMonk by s-andrews.

the class ReadPositionProbeGenerator method getOptionsPanel.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.ProbeGenerators.ProbeGenerator#getOptionsPanel(uk.ac.babraham.SeqMonk.SeqMonkApplication)
	 */
public JPanel getOptionsPanel() {
    if (optionPanel != null) {
        // We've done this already
        return optionPanel;
    }
    optionPanel = new JPanel();
    optionPanel.setLayout(new GridBagLayout());
    GridBagConstraints gbc = new GridBagConstraints();
    gbc.gridx = 1;
    gbc.gridy = 1;
    gbc.weightx = 0.5;
    gbc.weighty = 0.1;
    gbc.fill = GridBagConstraints.BOTH;
    optionPanel.add(new JLabel("Stores to use"), gbc);
    gbc.gridy++;
    gbc.weighty = 0.9;
    DataStore[] stores = collection.getAllDataStores();
    storesList = new JList(stores);
    storesList.getSelectionModel().addListSelectionListener(this);
    storesList.setCellRenderer(new TypeColourRenderer());
    optionPanel.add(new JScrollPane(storesList), gbc);
    gbc.gridy++;
    JPanel bottomPanel = new JPanel();
    bottomPanel.setLayout(new GridBagLayout());
    GridBagConstraints bgbc = new GridBagConstraints();
    bgbc.gridx = 0;
    bgbc.gridy = 0;
    bgbc.weightx = 0.5;
    bgbc.weighty = 0.5;
    bgbc.fill = GridBagConstraints.HORIZONTAL;
    bottomPanel.add(new JLabel("Use reads on strand "), bgbc);
    readStrandTypeBox = new JComboBox(ReadStrandType.getTypeOptions());
    bgbc.gridx = 1;
    bottomPanel.add(readStrandTypeBox, bgbc);
    bgbc.gridx = 0;
    bgbc.gridy++;
    bottomPanel.add(new JLabel("Minimum read count per position"), bgbc);
    bgbc.gridx = 1;
    minCountField = new JTextField("" + minCount, 4);
    minCountField.addKeyListener(this);
    bottomPanel.add(minCountField, bgbc);
    bgbc.gridx = 0;
    bgbc.gridy++;
    bottomPanel.add(new JLabel("Valid positions per window"), bgbc);
    bgbc.gridx = 1;
    readsPerWindowField = new JTextField("" + readsPerWindow, 4);
    readsPerWindowField.addKeyListener(new NumberKeyListener(false, false));
    readsPerWindowField.addKeyListener(this);
    bottomPanel.add(readsPerWindowField, bgbc);
    bgbc.gridx = 0;
    bgbc.gridy++;
    bottomPanel.add(new JLabel("Limit to region"), bgbc);
    bgbc.gridx = 1;
    JPanel limitPanel = new JPanel();
    limitPanel.setLayout(new BorderLayout());
    designWithinExistingBox = new JCheckBox();
    limitPanel.add(designWithinExistingBox, BorderLayout.WEST);
    if (collection.probeSet() == null) {
        limitRegionBox = new JComboBox(new String[] { "Currently Visible Region" });
    } else {
        limitRegionBox = new JComboBox(new String[] { "Active Probe List", "Currently Visible Region" });
    }
    limitRegionBox.setEnabled(false);
    designWithinExistingBox.addActionListener(new ActionListener() {

        public void actionPerformed(ActionEvent arg0) {
            limitRegionBox.setEnabled(designWithinExistingBox.isSelected());
            isReady();
        }
    });
    limitPanel.add(limitRegionBox, BorderLayout.CENTER);
    bottomPanel.add(limitPanel, bgbc);
    optionPanel.add(bottomPanel, gbc);
    bgbc.gridx = 0;
    bgbc.gridy++;
    bottomPanel.add(new JLabel("Ignore Strand "), bgbc);
    bgbc.gridx = 1;
    ignoreStrandBox = new JCheckBox();
    ignoreStrandBox.setSelected(true);
    bottomPanel.add(ignoreStrandBox, bgbc);
    optionPanel.add(bottomPanel, gbc);
    return optionPanel;
}

Also used : JScrollPane(javax.swing.JScrollPane) JPanel(javax.swing.JPanel) GridBagConstraints(java.awt.GridBagConstraints) GridBagLayout(java.awt.GridBagLayout) JComboBox(javax.swing.JComboBox) ActionEvent(java.awt.event.ActionEvent) JLabel(javax.swing.JLabel) JTextField(javax.swing.JTextField) TypeColourRenderer(uk.ac.babraham.SeqMonk.Dialogs.Renderers.TypeColourRenderer) JCheckBox(javax.swing.JCheckBox) BorderLayout(java.awt.BorderLayout) ActionListener(java.awt.event.ActionListener) DataStore(uk.ac.babraham.SeqMonk.DataTypes.DataStore) NumberKeyListener(uk.ac.babraham.SeqMonk.Utilities.NumberKeyListener) JList(javax.swing.JList)

Aggregations

DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)46 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)21 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)19 Vector (java.util.Vector)15 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)15 DataGroup (uk.ac.babraham.SeqMonk.DataTypes.DataGroup)11 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)11 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)11 JPanel (javax.swing.JPanel)10 ReplicateSet (uk.ac.babraham.SeqMonk.DataTypes.ReplicateSet)10 GridBagConstraints (java.awt.GridBagConstraints)9 JLabel (javax.swing.JLabel)9 GridBagLayout (java.awt.GridBagLayout)8 File (java.io.File)8 ActionEvent (java.awt.event.ActionEvent)7 BufferedReader (java.io.BufferedReader)7 FileReader (java.io.FileReader)7 JComboBox (javax.swing.JComboBox)7 JScrollPane (javax.swing.JScrollPane)7 HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore)7