Search in sources :

Example 1 with Report

use of uk.ac.babraham.SeqMonk.Reports.Report in project SeqMonk by s-andrews.

the class RNASeqPipeline method startPipeline.

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    boolean mergeTranscripts = optionsPanel.mergeTranscripts();
    boolean pairedEnd = optionsPanel.pairedEnd();
    boolean logTransform = optionsPanel.logTransform();
    boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
    boolean rawCounts = optionsPanel.rawCounts();
    boolean noValueForZeroCounts = optionsPanel.noValueForZeroCounts();
    boolean correctDNAContamination = optionsPanel.correctForDNAContamination();
    boolean correctDuplication = optionsPanel.correctForDNADuplication();
    if (rawCounts) {
        logTransform = false;
        applyTranscriptLengthCorrection = false;
        noValueForZeroCounts = false;
    }
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        // System.err.println("Processing chr "+chrs[c].name());
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
        Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
        Arrays.sort(features);
        FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
        for (int f = 0; f < mergedTranscripts.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            probes.add(new Probe(chrs[c], mergedTranscripts[f].start(), mergedTranscripts[f].end(), mergedTranscripts[f].strand(), mergedTranscripts[f].name()));
        }
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    Arrays.sort(allProbes);
    if (collection().probeSet() == null) {
        collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
    } else {
        Probe[] existingProbes = collection().probeSet().getAllProbes();
        Arrays.sort(existingProbes);
        if (allProbes.length != existingProbes.length) {
            collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
        } else {
            // Check the positions against the new ones
            boolean areTheyTheSame = true;
            for (int p = 0; p < allProbes.length; p++) {
                if (allProbes[p].packedPosition() != existingProbes[p].packedPosition()) {
                    areTheyTheSame = false;
                    break;
                }
            }
            if (areTheyTheSame) {
                allProbes = existingProbes;
            } else {
                collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
            }
        }
    }
    // If we're correcting for DNA contamination we need to work out the average density of
    // reads in intergenic regions
    float[] dnaDensityPerKb = new float[data.length];
    int[] correctedTotalCounts = new int[data.length];
    if (correctDNAContamination) {
        // We need to make interstitial probes to the set we already have, ignoring those at the end of chromosomes
        Vector<Probe> intergenicProbes = new Vector<Probe>();
        Chromosome lastChr = allProbes[0].chromosome();
        for (int p = 1; p < allProbes.length; p++) {
            if (allProbes[p].chromosome() != lastChr) {
                lastChr = allProbes[p].chromosome();
                continue;
            }
            // See if there's a gap back to the last probe
            if (allProbes[p].start() > allProbes[p - 1].end()) {
                if (allProbes[p].start() - allProbes[p - 1].end() < 1000) {
                    // Don't bother with really short probes
                    continue;
                }
                intergenicProbes.add(new Probe(lastChr, allProbes[p - 1].end() + 1, allProbes[p].start() - 1));
            }
        }
        Probe[] allIntergenicProbes = intergenicProbes.toArray(new Probe[0]);
        for (int d = 0; d < data.length; d++) {
            progressUpdated("Quantitating DNA contamination", 1, 2);
            float[] densities = new float[allIntergenicProbes.length];
            for (int p = 0; p < allIntergenicProbes.length; p++) {
                densities[p] = data[d].getReadsForProbe(allIntergenicProbes[p]).length / (allIntergenicProbes[p].length() / 1000f);
            }
            dnaDensityPerKb[d] = SimpleStats.median(densities);
        }
        // Work out adjusted total counts having subtracted the DNA contamination
        for (int d = 0; d < data.length; d++) {
            int predictedContamination = (int) (dnaDensityPerKb[d] * (SeqMonkApplication.getInstance().dataCollection().genome().getTotalGenomeLength() / 1000));
            int correctedTotalReadCount = data[d].getTotalReadCount() - predictedContamination;
            correctedTotalCounts[d] = correctedTotalReadCount;
        }
        // Halve the density if they're doing a directional quantitation
        if (optionsPanel.isDirectional()) {
            for (int i = 0; i < dnaDensityPerKb.length; i++) {
                dnaDensityPerKb[i] /= 2;
            }
        }
        // Halve the density if the libraries are paired end
        if (pairedEnd) {
            for (int i = 0; i < dnaDensityPerKb.length; i++) {
                dnaDensityPerKb[i] /= 2;
            }
        }
    }
    // If we're correcting for duplication we need to work out the modal count depth in
    // intergenic regions
    int[] modalDuplicationLevels = new int[data.length];
    if (correctDuplication) {
        for (int d = 0; d < data.length; d++) {
            progressUpdated("Quantitating DNA duplication", 1, 2);
            // We're not going to look at depths which are > 200.  If it's that duplicated
            // then there's no point continuing anyway.
            int[] depthCount = new int[200];
            for (int p = 0; p < allProbes.length; p++) {
                long[] reads = data[d].getReadsForProbe(allProbes[p]);
                int currentCount = 0;
                for (int r = 1; r < reads.length; r++) {
                    if (reads[r] == reads[r - 1]) {
                        ++currentCount;
                    } else {
                        if (currentCount > 0 && currentCount < 200) {
                            ++depthCount[currentCount];
                        }
                        currentCount = 1;
                    }
                }
            }
            // Find the modal depth in intergenic regions. This is the best estimate
            // of duplication
            // Since unique reads turn up all over the place even in duplicated
            // data we say that if unique reads are higher than the sum of 2-10 there
            // is no duplication
            int twoTenSum = 0;
            for (int i = 2; i <= 10; i++) {
                twoTenSum += depthCount[i];
            }
            if (depthCount[1] > twoTenSum) {
                modalDuplicationLevels[d] = 1;
            } else {
                int highestDepth = 0;
                int bestDupGuess = 1;
                for (int i = 2; i < depthCount.length; i++) {
                    // System.err.println("For depth "+i+" count was "+depthCount[i]);
                    if (depthCount[i] > highestDepth) {
                        bestDupGuess = i;
                        highestDepth = depthCount[i];
                    }
                }
                modalDuplicationLevels[d] = bestDupGuess;
            }
        }
    }
    // Having made probes we now need to quantitate them.  We'll fetch the
    // probes overlapping each sub-feature and then aggregate these together
    // to get the final quantitation.
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    int currentIndex = 0;
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
        Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
        Arrays.sort(features);
        FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
        int[] readLengths = new int[data.length];
        for (int d = 0; d < data.length; d++) {
            readLengths[d] = data[d].getMaxReadLength();
            // actual length.
            if (pairedEnd) {
                readLengths[d] *= 2;
            }
        }
        for (int f = 0; f < mergedTranscripts.length; f++) {
            Location[] subLocations = mergedTranscripts[f].getSubLocations();
            int totalLength = 0;
            // Find the total length of all of the exons
            for (int s = 0; s < subLocations.length; s++) {
                totalLength += subLocations[s].length();
            }
            for (int d = 0; d < data.length; d++) {
                if (cancel) {
                    progressCancelled();
                    return;
                }
                long totalCount = 0;
                for (int s = 0; s < subLocations.length; s++) {
                    long[] reads = data[d].getReadsForProbe(new Probe(chrs[c], subLocations[s].start(), subLocations[s].end()));
                    for (int r = 0; r < reads.length; r++) {
                        if (!readFilter.useRead(subLocations[s], reads[r])) {
                            continue;
                        }
                        int overlap = (Math.min(subLocations[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocations[s].start(), SequenceRead.start(reads[r]))) + 1;
                        totalCount += overlap;
                    }
                }
                // Now we correct the count by the total length of reads in the data and by
                // the length of the split parts of the probe, and assign this to the probe.
                // As we're correcting for read length then we work out the whole number of
                // reads which this count could comprise, rounding down to a whole number.
                totalCount /= readLengths[d];
                // We can now subtract the DNA contamination prediction.
                if (correctDNAContamination) {
                    int predictedContamination = (int) ((totalLength / 1000f) * dnaDensityPerKb[d]);
                    totalCount -= predictedContamination;
                    // Makes no sense to have negative counts
                    if (totalCount < 0)
                        totalCount = 0;
                }
                // ..and we can divide by the duplication level if we know it.
                if (correctDuplication) {
                    totalCount /= modalDuplicationLevels[d];
                }
                // System.err.println("Total read count for "+mergedTranscripts[f].name+" is "+totalCount);
                float value = totalCount;
                if (value == 0 && noValueForZeroCounts) {
                    value = Float.NaN;
                }
                // If we're log transforming then we need to set zero values to 0.9
                if (logTransform && value == 0 && !noValueForZeroCounts) {
                    value = 0.9f;
                }
                // been asked to.
                if (applyTranscriptLengthCorrection) {
                    value /= (totalLength / 1000f);
                }
                // We also correct by the total read count
                if (!rawCounts) {
                    // System.err.println("True total is "+data[d].getTotalReadCount()+" corrected total is "+correctedTotalCounts[d]);
                    // If these libraries are paired end then the total number of
                    // reads is also effectively halved.
                    float totalReadCount;
                    // calculated this already, but otherwise we'll take the total count (total length/read length)
                    if (correctDNAContamination) {
                        totalReadCount = correctedTotalCounts[d];
                    } else {
                        totalReadCount = data[d].getTotalReadLength() / readLengths[d];
                    }
                    // If we're correcting for duplication we divide by the duplication level.
                    if (correctDuplication) {
                        totalReadCount /= modalDuplicationLevels[d];
                    }
                    // Finally we work out millions of reads (single end) or fragments (paired end)
                    if (pairedEnd) {
                        totalReadCount /= 2000000f;
                    } else {
                        totalReadCount /= 1000000f;
                    }
                    // Lastly we divide the value by the total millions of reads to get the globally corrected count.
                    value /= totalReadCount;
                }
                // Finally we do the log transform if we've been asked to
                if (logTransform) {
                    value = (float) Math.log(value) / log2;
                }
                data[d].setValueForProbe(allProbes[currentIndex], value);
            }
            currentIndex++;
        }
    }
    collection().probeSet().setCurrentQuantitation(getQuantitationDescription(mergeTranscripts, applyTranscriptLengthCorrection, correctDNAContamination, logTransform, rawCounts));
    // If we estimated any parameters let's report them.
    if (correctDNAContamination || correctDuplication) {
        float[] dna = null;
        if (correctDNAContamination) {
            dna = dnaDensityPerKb;
        }
        int[] dup = null;
        if (correctDuplication) {
            dup = modalDuplicationLevels;
        }
        RNASeqParametersModel model = new RNASeqParametersModel(data, dna, dup);
        ReportTableDialog report = new ReportTableDialog(SeqMonkApplication.getInstance(), new Report(null, null) {

            @Override
            public void run() {
            }

            @Override
            public String name() {
                return "RNA-Seq parameter";
            }

            @Override
            public boolean isReady() {
                return true;
            }

            @Override
            public JPanel getOptionsPanel() {
                return null;
            }

            @Override
            public void generateReport() {
            }
        }, model);
    }
    quantitatonComplete();
}
Also used : JPanel(javax.swing.JPanel) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) ReportTableDialog(uk.ac.babraham.SeqMonk.Displays.Report.ReportTableDialog) QuantitationStrandType(uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType) Vector(java.util.Vector) Report(uk.ac.babraham.SeqMonk.Reports.Report) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Aggregations

Vector (java.util.Vector)1 JPanel (javax.swing.JPanel)1 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)1 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)1 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)1 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)1 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)1 QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType)1 ReportTableDialog (uk.ac.babraham.SeqMonk.Displays.Report.ReportTableDialog)1 Report (uk.ac.babraham.SeqMonk.Reports.Report)1