Search in sources :

Example 6 with SplitLocation

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.

the class CodonBiasPanel method run.

public void run() {
    // First we need to know the length of the feature we'll be analysing.  This isn't the
    // full length in the genome but the sum length of the exons.  We'll also make up an
    // array of offsets so that we can convert genomic positions into positions within the
    // feature easily.
    Location[] subLocations;
    if (feature.location() instanceof SplitLocation) {
        subLocations = ((SplitLocation) (feature.location())).subLocations();
    } else {
        subLocations = new Location[] { feature.location() };
    }
    System.err.println("Working with " + feature.name());
    System.err.println("There are " + subLocations.length + " sublocations");
    // First work out the total transcript length so we can make an appropriate data structure
    int totalLength = 0;
    for (int e = 0; e < subLocations.length; e++) {
        totalLength += subLocations[e].length();
    }
    System.err.println("Total exon length is " + totalLength);
    int[] abundance = new int[totalLength];
    // Now work out the exon boundary positions within the feature
    // We can also work out a mapping between relative genomic position and
    // feature position.
    int[] exonBoundaries = new int[subLocations.length];
    int[] genomeToFeatureMap = new int[1 + feature.location().end() - feature.location().start()];
    for (int j = 0; j < genomeToFeatureMap.length; j++) {
        genomeToFeatureMap[j] = -1;
    }
    System.err.println("Genome to feature map length is " + genomeToFeatureMap.length);
    if (feature.location().strand() == Location.FORWARD) {
        System.err.println("Feature is forward strand");
        int length = 0;
        int positionInFeature = 0;
        for (int i = 0; i < subLocations.length; i++) {
            System.err.println("Looking at sublocation " + i + " from " + subLocations[i].start() + " to " + subLocations[i].end());
            exonBoundaries[i] = length;
            System.err.println("Added exon boundary at " + exonBoundaries[i]);
            length += subLocations[i].length();
            for (int x = 0; x < subLocations[i].length(); x++) {
                int genomePostion = subLocations[i].start() + x;
                int relativeGenomePosition = genomePostion - feature.location().start();
                System.err.println("Sublocation Pos=" + x + " Genome Pos=" + genomePostion + " Rel Genome Pos=" + relativeGenomePosition + " Feature pos=" + positionInFeature);
                genomeToFeatureMap[relativeGenomePosition] = positionInFeature;
                positionInFeature++;
            }
        }
    } else if (feature.location().strand() == Location.REVERSE) {
        int length = 0;
        int positionInFeature = 0;
        for (int i = subLocations.length - 1; i >= 0; i--) {
            exonBoundaries[i] = length;
            length += subLocations[i].length();
            for (int x = 0; x < subLocations[i].length(); x++) {
                genomeToFeatureMap[subLocations[i].end() - x] = positionInFeature;
                positionInFeature++;
            }
        }
    }
    // Now we can get all of the reads and position them within the read.
    long[] reads = store.getReadsForProbe(new Probe(SeqMonkApplication.getInstance().dataCollection().genome().getExactChromsomeNameMatch(feature.chromosomeName()), feature.location().packedPosition()));
    for (int r = 0; r < reads.length; r++) {
        // We need to work out the position of this read in the feature.  This will depend
        // on whether the feature is forward or reverse strand, and whether we're reversing
        // the direction of reads.
        System.err.println("Looking at read " + SequenceRead.toString(reads[r]));
        int genomicPosition = 0;
        if (feature.location().strand() == Location.FORWARD) {
            System.err.println("It's a forward feature");
            if (reverse) {
                System.err.println("We're a same strand library");
                if (SequenceRead.strand(reads[r]) != Location.REVERSE)
                    continue;
                genomicPosition = SequenceRead.end(reads[r]);
            } else {
                System.err.println("We're an opposing strand library");
                if (SequenceRead.strand(reads[r]) != Location.FORWARD)
                    continue;
                genomicPosition = SequenceRead.start(reads[r]);
            }
            System.err.println("Raw genomic position is " + genomicPosition);
            genomicPosition = genomicPosition - feature.location().start();
            System.err.println("Corrected genomic position is " + genomicPosition);
        } else if (feature.location().strand() == Location.REVERSE) {
            if (reverse) {
                if (SequenceRead.strand(reads[r]) != Location.REVERSE)
                    continue;
                genomicPosition = SequenceRead.start(reads[r]);
            } else {
                if (SequenceRead.strand(reads[r]) != Location.FORWARD)
                    continue;
                genomicPosition = SequenceRead.end(reads[r]);
            }
            genomicPosition = feature.location().end() - genomicPosition;
        }
        System.err.println("Final genomic position is " + genomicPosition);
        if (genomicPosition < 0 || genomicPosition >= genomeToFeatureMap.length)
            continue;
        System.err.println("Position in feature is " + genomeToFeatureMap[genomicPosition]);
        if (genomeToFeatureMap[genomicPosition] != -1) {
            abundance[genomeToFeatureMap[genomicPosition]]++;
        }
    }
    this.abundance = abundance;
    this.exonBoundaries = exonBoundaries;
    calculated = true;
    repaint();
}
Also used : SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 7 with SplitLocation

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.

the class FeaturePositionSelectorPanel method getProbes.

/**
 * Gets the set of probes with appropriate context for the options
 * currently set.
 * @return
 */
public Probe[] getProbes() {
    Chromosome[] chromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> newProbes = new Vector<Probe>();
    for (int c = 0; c < chromosomes.length; c++) {
        Vector<Feature> allFeatures = new Vector<Feature>();
        String[] selectedFeatureTypes = selectedFeatureTypes();
        for (int f = 0; f < selectedFeatureTypes.length; f++) {
            Feature[] features = collection.genome().annotationCollection().getFeaturesForType(chromosomes[c], selectedFeatureTypes[f]);
            for (int i = 0; i < features.length; i++) {
                allFeatures.add(features[i]);
            }
        }
        Feature[] features = allFeatures.toArray(new Feature[0]);
        for (int f = 0; f < features.length; f++) {
            if (useSubFeatures()) {
                // We need to split this up so get the sub-features
                if (features[f].location() instanceof SplitLocation) {
                    SplitLocation location = (SplitLocation) features[f].location();
                    Location[] subLocations = location.subLocations();
                    if (useExonSubfeatures()) {
                        // System.err.println("Making exon probes");
                        for (int s = 0; s < subLocations.length; s++) {
                            makeProbes(features[f], chromosomes[c], subLocations[s], newProbes, false);
                        }
                    } else {
                        // We're making introns
                        for (int s = 1; s < subLocations.length; s++) {
                            makeProbes(features[f], chromosomes[c], new Location(subLocations[s - 1].end() + 1, subLocations[s].start() - 1, features[f].location().strand()), newProbes, false);
                        }
                    }
                } else {
                    if (useExonSubfeatures()) {
                        // We can still make a single probe
                        makeProbes(features[f], chromosomes[c], features[f].location(), newProbes, false);
                    }
                // If we're making introns then we're stuffed and we give up.
                }
            } else {
                makeProbes(features[f], chromosomes[c], features[f].location(), newProbes, false);
            }
        }
    }
    Probe[] finalList = newProbes.toArray(new Probe[0]);
    if (removeDuplicates()) {
        finalList = removeDuplicates(finalList);
    }
    return finalList;
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Vector(java.util.Vector) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 8 with SplitLocation

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.

the class IntronFeatureGroup method getSubLocations.

public Location[] getSubLocations() {
    if (features.size() == 1) {
        Location loc = features.elementAt(0).location();
        if (loc instanceof SplitLocation) {
            Location[] subLocs = ((SplitLocation) loc).subLocations();
            Location[] interLocs = new Location[subLocs.length - 1];
            for (int i = 0; i < subLocs.length - 1; i++) {
                interLocs[i] = new Location(subLocs[i].end() + 1, subLocs[i + 1].start() - 1, subLocs[i].strand());
            }
            return interLocs;
        } else {
            return new Location[0];
        }
    }
    LongVector allLocs = new LongVector();
    Enumeration<Feature> en = features.elements();
    while (en.hasMoreElements()) {
        Location loc = en.nextElement().location();
        if (loc instanceof SplitLocation) {
            Location[] subLocs = ((SplitLocation) loc).subLocations();
            for (int s = 0; s < subLocs.length; s++) {
                allLocs.add(subLocs[s].packedPosition());
            }
        } else {
            allLocs.add(loc.packedPosition());
        }
    }
    long[] locs = allLocs.toArray();
    SequenceRead.sort(locs);
    Vector<Location> mergedLocs = new Vector<Location>();
    long current = locs[0];
    for (int i = 1; i < locs.length; i++) {
        // if (debug) {System.err.println("Looking at "+SequenceRead.start(locs[i])+"-"+SequenceRead.end(locs[i])+" current is "+SequenceRead.start(current)+"-"+SequenceRead.end(current));}
        if (SequenceRead.overlaps(current, locs[i]) && SequenceRead.end(locs[i]) > SequenceRead.end(current)) {
            // if (debug) {System.err.println("They overlap, extending...");}
            current = SequenceRead.packPosition(SequenceRead.start(current), SequenceRead.end(locs[i]), SequenceRead.strand(current));
        } else if (SequenceRead.end(locs[i]) <= SequenceRead.end(current)) {
            // if (debug) {System.err.println("This is a subset, ignoring it");}
            continue;
        } else {
            // if (debug) {System.err.println("They don't overlap, moving on...");}
            mergedLocs.add(new Location(current));
            current = locs[i];
        }
    }
    mergedLocs.add(new Location(current));
    Location[] interLocs = new Location[mergedLocs.size() - 1];
    for (int i = 0; i < mergedLocs.size() - 1; i++) {
        interLocs[i] = new Location(mergedLocs.elementAt(i).end() + 1, mergedLocs.elementAt(i + 1).start() - 1, mergedLocs.elementAt(i).strand());
    }
    return interLocs;
}
Also used : LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) Vector(java.util.Vector) LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 9 with SplitLocation

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.

the class SplicingEfficiencyPipeline method startPipeline.

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    boolean correctReadLength = optionsPanel.correctForReadLength();
    boolean logTransform = optionsPanel.logTransform();
    boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
    // We need to make a lookup for chromosomes from their names for later on.
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    Hashtable<String, Chromosome> chrLookup = new Hashtable<String, Chromosome>();
    for (int c = 0; c < chrs.length; c++) {
        chrLookup.put(chrs[c].name(), chrs[c]);
    }
    // We're going to cache the gene to transcript mappings so we don't have to re-do them
    // later on.
    Hashtable<String, Vector<Feature>> transcriptLookup = new Hashtable<String, Vector<Feature>>();
    Vector<Feature> usedGeneFeatures = new Vector<Feature>();
    // We'll record the number of failures so we can report it
    int noTranscriptCount = 0;
    int noIntronCount = 0;
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
        Feature[] geneFeatures = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedGeneFeatureType());
        Arrays.sort(geneFeatures);
        Feature[] transcriptFeatures = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedTranscriptFeatureType());
        Arrays.sort(transcriptFeatures);
        // We need to figure out whether we can match based on name, or whether we need to do it based on location.
        boolean matchOnNames = true;
        for (int t = 0; t < transcriptFeatures.length; t++) {
            if (!transcriptFeatures[t].name().matches("-\\d\\d\\d$")) {
                matchOnNames = false;
                break;
            }
        }
        if (!matchOnNames) {
            progressWarningReceived(new SeqMonkException("Non-Ensembl transcript names found - matching based on position"));
        }
        if (matchOnNames) {
            for (int i = 0; i < transcriptFeatures.length; i++) {
                String name = transcriptFeatures[i].name();
                name = name.replaceAll("-\\d\\d\\d$", "");
                if (!transcriptLookup.containsKey(name)) {
                    transcriptLookup.put(name, new Vector<Feature>());
                }
                transcriptLookup.get(name).add(transcriptFeatures[i]);
            }
        } else {
            // We need to go through the genes and transcripts in parallel and match them up based on containment
            // and direction.
            int lastGeneIndex = 0;
            for (int t = 0; t < transcriptFeatures.length; t++) {
                for (int g = lastGeneIndex; g < geneFeatures.length; g++) {
                    if (transcriptFeatures[t].location().start() > geneFeatures[g].location().end()) {
                        lastGeneIndex = g;
                        continue;
                    }
                    // If the gene is beyond the end of the transcript we can stop looking
                    if (geneFeatures[g].location().start() > transcriptFeatures[t].location().end()) {
                        break;
                    }
                    // If we're on the same strand and contained within the gene then we have a match
                    if (geneFeatures[g].location().strand() == transcriptFeatures[t].location().strand() && transcriptFeatures[t].location().start() >= geneFeatures[g].location().start() && transcriptFeatures[t].location().end() <= geneFeatures[g].location().end()) {
                        String name = geneFeatures[g].name();
                        if (!transcriptLookup.containsKey(name)) {
                            transcriptLookup.put(name, new Vector<Feature>());
                        }
                        transcriptLookup.get(name).add(transcriptFeatures[t]);
                    }
                }
            }
        }
        for (int f = 0; f < geneFeatures.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            if (!transcriptLookup.containsKey(geneFeatures[f].name())) {
                ++noTranscriptCount;
                // No good making features for genes with no transcript.
                continue;
            }
            Feature[] localTranscripts = transcriptLookup.get(geneFeatures[f].name()).toArray(new Feature[0]);
            boolean[] exonPositions = new boolean[geneFeatures[f].location().length()];
            for (int t = 0; t < localTranscripts.length; t++) {
                if (!SequenceRead.overlaps(geneFeatures[f].location().packedPosition(), localTranscripts[t].location().packedPosition())) {
                    // It's not a match for this gene
                    continue;
                }
                if (!(localTranscripts[t].location() instanceof SplitLocation)) {
                    for (int p = localTranscripts[t].location().start() - geneFeatures[f].location().start(); p <= localTranscripts[t].location().end() - geneFeatures[f].location().start(); p++) {
                        if (p < 0)
                            continue;
                        if (p >= exonPositions.length)
                            continue;
                        exonPositions[p] = true;
                    }
                    continue;
                }
                Location[] exons = ((SplitLocation) localTranscripts[t].location()).subLocations();
                for (int e = 0; e < exons.length; e++) {
                    for (int p = exons[e].start() - geneFeatures[f].location().start(); p <= exons[e].end() - geneFeatures[f].location().start(); p++) {
                        if (p < 0)
                            continue;
                        if (p >= exonPositions.length)
                            continue;
                        exonPositions[p] = true;
                    }
                }
            }
            int exonPositionCount = 0;
            int intronPositionCount = 0;
            for (int p = 0; p < exonPositions.length; p++) {
                if (exonPositions[p])
                    exonPositionCount++;
                else
                    intronPositionCount++;
            }
            if (exonPositionCount == 0) {
                ++noTranscriptCount;
                continue;
            }
            if (intronPositionCount == 0) {
                ++noIntronCount;
                continue;
            }
            probes.add(new Probe(chrs[c], geneFeatures[f].location().start(), geneFeatures[f].location().end(), geneFeatures[f].location().strand(), geneFeatures[f].name()));
            usedGeneFeatures.add(geneFeatures[f]);
        }
    }
    if (noTranscriptCount > 0) {
        progressWarningReceived(new SeqMonkException("" + noTranscriptCount + " genes had no associated transcripts"));
    }
    if (noIntronCount > 0) {
        progressWarningReceived(new SeqMonkException("" + noIntronCount + " genes had no intron only areas"));
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    collection().setProbeSet(new ProbeSet("Gene features over " + optionsPanel.getSelectedGeneFeatureType(), allProbes));
    // Having made probes we now need to quantitate them.  We'll get the full set of reads for the full gene
    // and then work out a combined set of exons.  We can then quantitate the bases which fall into introns
    // and exons separately and then work out a ratio from there.
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    Feature[] geneFeatures = usedGeneFeatures.toArray(new Feature[0]);
    for (int g = 0; g < geneFeatures.length; g++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        if (g % 100 == 0) {
            progressUpdated("Quantitating features", g, geneFeatures.length);
        }
        int[] readLengths = new int[data.length];
        if (correctReadLength) {
            for (int d = 0; d < data.length; d++) {
                readLengths[d] = data[d].getMaxReadLength();
            }
        }
        // Find the set of transcripts which relate to this gene.
        Feature[] transcripts = transcriptLookup.get(geneFeatures[g].name()).toArray(new Feature[0]);
        Vector<Feature> validTranscripts = new Vector<Feature>();
        for (int t = 0; t < transcripts.length; t++) {
            if (SequenceRead.overlaps(geneFeatures[g].location().packedPosition(), transcripts[t].location().packedPosition())) {
                validTranscripts.add(transcripts[t]);
            }
        }
        transcripts = validTranscripts.toArray(new Feature[0]);
        // before so if we do then something has gone wrong.
        if (transcripts.length == 0) {
            throw new IllegalStateException("No transcripts for gene " + geneFeatures[g] + " on chr " + geneFeatures[g].chromosomeName());
        }
        Vector<Location> allExonsVector = new Vector<Location>();
        for (int t = 0; t < transcripts.length; t++) {
            if (transcripts[t].location() instanceof SplitLocation) {
                Location[] sublocs = ((SplitLocation) transcripts[t].location()).subLocations();
                for (int i = 0; i < sublocs.length; i++) allExonsVector.add(sublocs[i]);
            } else {
                allExonsVector.add(transcripts[t].location());
            }
        }
        // if (geneFeatures[f].name().equals("Cpa6")) {
        // System.err.println("Cpa6 had "+allExonsVector.size()+" total exons");
        // }
        Collections.sort(allExonsVector);
        // Now go through and make a merged set of exons to remove any redundancy
        Vector<Location> mergedExonsVector = new Vector<Location>();
        Location lastLocation = null;
        Enumeration<Location> en = allExonsVector.elements();
        while (en.hasMoreElements()) {
            Location l = en.nextElement();
            if (lastLocation == null) {
                // if (geneFeatures[f].name().equals("Cpa6")) {
                // System.err.println("Setting as first location");
                // }
                lastLocation = l;
                continue;
            }
            // Check if it's the same as the last, which is likely
            if (l.start() == lastLocation.start() && l.end() == lastLocation.end()) {
                continue;
            }
            // Check if they overlap and can be merged
            if (l.start() <= lastLocation.end() && l.end() >= lastLocation.start()) {
                // if (geneFeatures[f].name().equals("Cpa6")) {
                // System.err.println("Overlaps with last location");
                // }
                // It overlaps with the last location so merge them
                lastLocation = new Location(Math.min(l.start(), lastLocation.start()), Math.max(l.end(), lastLocation.end()), geneFeatures[g].location().strand());
                // }
                continue;
            }
            // Start a new location
            // if (geneFeatures[f].name().equals("Cpa6")) {
            // System.err.println("Doesn't overlap - adding last location and creating new one");
            // }
            mergedExonsVector.add(lastLocation);
            lastLocation = l;
        }
        if (lastLocation != null) {
            mergedExonsVector.add(lastLocation);
        }
        // if (geneFeatures[f].name().equals("Cpa6")) {
        // System.err.println("Cpa6 had "+mergedExonsVector.size()+" merged exons");
        // }
        // Now we can start the quantitation.
        int intronLength = geneFeatures[g].location().length();
        int exonLength = 0;
        Location[] subLocs = mergedExonsVector.toArray(new Location[0]);
        for (int l = 0; l < subLocs.length; l++) {
            exonLength += subLocs[l].length();
        }
        // if (geneFeatures[f].name().equals("Cpa6")) {
        // System.err.println("Cpa6 total intron length="+intronLength+" exon length="+exonLength);
        // }
        intronLength -= exonLength;
        if (intronLength <= 0) {
            progressWarningReceived(new IllegalStateException("Intron length of " + intronLength + " for gene " + geneFeatures[g]));
            for (int d = 0; d < data.length; d++) {
                data[d].setValueForProbe(allProbes[g], Float.NaN);
            }
            continue;
        }
        if (exonLength <= 0) {
            progressWarningReceived(new IllegalStateException("Exon length of " + exonLength + " for gene " + geneFeatures[g]));
            for (int d = 0; d < data.length; d++) {
                data[d].setValueForProbe(allProbes[g], Float.NaN);
            }
            continue;
        }
        for (int d = 0; d < data.length; d++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            int totalIntronCount = 0;
            int totalExonCount = 0;
            long[] reads = data[d].getReadsForProbe(new Probe(chrLookup.get(geneFeatures[g].chromosomeName()), geneFeatures[g].location().start(), geneFeatures[g].location().end()));
            for (int r = 0; r < reads.length; r++) {
                if (!readFilter.useRead(geneFeatures[g].location(), reads[r])) {
                    continue;
                }
                int overlap = (Math.min(geneFeatures[g].location().end(), SequenceRead.end(reads[r])) - Math.max(geneFeatures[g].location().start(), SequenceRead.start(reads[r]))) + 1;
                totalIntronCount += overlap;
                // Now we see if we overlap with any of the exons
                for (int s = 0; s < subLocs.length; s++) {
                    if (subLocs[s].start() <= SequenceRead.end(reads[r]) && subLocs[s].end() >= SequenceRead.start(reads[r])) {
                        int exonOverlap = (Math.min(subLocs[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocs[s].start(), SequenceRead.start(reads[r]))) + 1;
                        if (exonOverlap > 0) {
                            totalExonCount += exonOverlap;
                            totalIntronCount -= exonOverlap;
                        }
                    }
                }
            }
            if (totalIntronCount < 0) {
                progressWarningReceived(new SeqMonkException("Intron count of " + totalIntronCount + " for " + geneFeatures[g].name()));
                continue;
            }
            // reads which this count could comprise, rounding down to a whole number.
            if (correctReadLength) {
                totalIntronCount /= readLengths[d];
                totalExonCount /= readLengths[d];
            }
            float intronValue = totalIntronCount;
            float exonValue = totalExonCount;
            // If we're log transforming then we need to set zero values to 0.9
            if (logTransform && intronValue == 0) {
                intronValue = 0.9f;
            }
            if (logTransform && exonValue == 0) {
                exonValue = 0.9f;
            }
            // been asked to.
            if (applyTranscriptLengthCorrection) {
                intronValue /= (intronLength / 1000f);
                exonValue /= (exonLength / 1000f);
            }
            // We also correct by the total read count, or length
            if (correctReadLength) {
                intronValue /= (data[d].getTotalReadCount() / 1000000f);
                exonValue /= (data[d].getTotalReadCount() / 1000000f);
            } else {
                intronValue /= (data[d].getTotalReadLength() / 1000000f);
                exonValue /= (data[d].getTotalReadLength() / 1000000f);
            }
            // Finally we do the log transform if we've been asked to
            if (logTransform) {
                if (intronValue == 0) {
                    intronValue = 0.0001f;
                }
                if (exonValue == 0) {
                    exonValue = 0.0001f;
                }
                intronValue = (float) Math.log(intronValue) / log2;
                exonValue = (float) Math.log(exonValue) / log2;
            }
            // Now we check what value they actually wanted to record
            switch(optionsPanel.quantitationType()) {
                case (EXONS):
                    {
                        data[d].setValueForProbe(allProbes[g], exonValue);
                        break;
                    }
                case (INTRONS):
                    {
                        data[d].setValueForProbe(allProbes[g], intronValue);
                        break;
                    }
                case (RATIO):
                    {
                        if (logTransform) {
                            data[d].setValueForProbe(allProbes[g], exonValue - intronValue);
                        } else {
                            if (intronValue == 0) {
                                intronValue = 0.0001f;
                            }
                            data[d].setValueForProbe(allProbes[g], exonValue / intronValue);
                        }
                        break;
                    }
                default:
                    throw new IllegalStateException("Unknonwn quantitation type " + optionsPanel.quantitationType());
            }
        // if (geneFeatures[f].name().equals("Cpa6")) {
        // try {
        // System.err.println("Stored value was "+data[d].getValueForProbe(allProbes[currentIndex]));
        // } catch (SeqMonkException e) {
        // e.printStackTrace();
        // }
        // }
        }
    }
    StringBuffer quantitationDescription = new StringBuffer();
    quantitationDescription.append("Splicing efficiency quantitation");
    if (correctReadLength) {
        quantitationDescription.append(" counting reads");
    } else {
        quantitationDescription.append(" counting bases");
    }
    if (optionsPanel.logTransform()) {
        quantitationDescription.append(" log transformed");
    }
    if (applyTranscriptLengthCorrection) {
        quantitationDescription.append(" correcting for feature length");
    }
    // TODO: Add more description
    collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
    quantitatonComplete();
}
Also used : Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) QuantitationStrandType(uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) Hashtable(java.util.Hashtable) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 10 with SplitLocation

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.

the class IntronRegressionPipeline method startPipeline.

protected void startPipeline() {
    // We first need to generate probes over all of the features listed in
    // the feature types.  The probes should cover the whole area of the
    // feature regardless of where it splices.
    Vector<Probe> probes = new Vector<Probe>();
    int minDensity = optionsPanel.minDensity();
    int minLength = optionsPanel.minLength();
    double maxPValue = optionsPanel.maxPValue();
    int binSize = optionsPanel.measurementBinSize();
    QuantitationStrandType readFilter = optionsPanel.readFilter();
    Chromosome[] chrs = collection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        Vector<Probe> probesForThisChromosome = new Vector<Probe>();
        progressUpdated("Making probes", c, chrs.length);
        Feature[] features = getValidFeatures(chrs[c]);
        for (int f = 0; f < features.length; f++) {
            if (cancel) {
                progressCancelled();
                return;
            }
            // There are no introns here
            if (!(features[f].location() instanceof SplitLocation))
                continue;
            Location[] subLocations = ((SplitLocation) features[f].location()).subLocations();
            // TODO: Reverse the subLocations if its a reverse feature
            for (int intron = 1; intron < subLocations.length; intron++) {
                int start = subLocations[intron - 1].end();
                int end = subLocations[intron].start();
                if ((end - start) + 1 < minLength) {
                    // This intron is too short.
                    continue;
                }
                // TODO: We could throw away any probes which didn't have enough reads in any feature
                Probe p = new Probe(chrs[c], start, end, features[f].location().strand(), features[f].name() + "_" + intron);
                probesForThisChromosome.add(p);
            }
        }
        // Now we can deduplicate the probes for this chromosome and add them to the main collection
        Probe[] dupProbes = probesForThisChromosome.toArray(new Probe[0]);
        Arrays.sort(dupProbes);
        for (int p = 0; p < dupProbes.length; p++) {
            if (p > 0 && dupProbes[p].packedPosition() == dupProbes[p - 1].packedPosition())
                continue;
            probes.add(dupProbes[p]);
        }
    }
    Probe[] allProbes = probes.toArray(new Probe[0]);
    collection().setProbeSet(new ProbeSet("Features over " + optionsPanel.getSelectedFeatureType(), allProbes));
    // Now we go back through the probes and quantitate them
    for (int p = 0; p < allProbes.length; p++) {
        if (cancel) {
            progressCancelled();
            return;
        }
        if (p % 1000 == 0) {
            progressUpdated("Quantitated " + p + " out of " + allProbes.length + " probes", p, allProbes.length);
        }
        for (int d = 0; d < data.length; d++) {
            long[] reads = data[d].getReadsForProbe(allProbes[p]);
            int[] countsPerSite = new int[allProbes[p].length()];
            int usableCounts = 0;
            for (int r = 0; r < reads.length; r++) {
                if (readFilter.useRead(allProbes[p], reads[r])) {
                    ++usableCounts;
                    for (int pos = Math.max(0, SequenceRead.start(reads[r]) - allProbes[p].start()); pos <= Math.min(countsPerSite.length - 1, SequenceRead.end(reads[r]) - allProbes[p].start()); pos++) {
                        ++countsPerSite[pos];
                    }
                }
            }
            if (usableCounts / (allProbes[p].length() / 1000d) >= minDensity) {
                // We're going to do a linear regression rather than a correlation
                // We're analysing in bins so we'll work out the bin counts and
                // add them dynamically to the regression.
                SimpleRegression regression = new SimpleRegression();
                int binCount = 0;
                for (int i = 0; i < countsPerSite.length; i++) {
                    if (i > 0 && i % binSize == 0) {
                        regression.addData(i, binCount);
                        binCount = 0;
                    }
                    binCount += countsPerSite[i];
                }
                float slope = (float) (regression.getSlope() * 1000000);
                double pValue = regression.getSignificance();
                if (allProbes[p].strand() == Location.REVERSE) {
                    slope = 0 - slope;
                }
                if (pValue <= maxPValue) {
                    data[d].setValueForProbe(allProbes[p], slope);
                } else {
                    data[d].setValueForProbe(allProbes[p], Float.NaN);
                }
            } else {
                data[d].setValueForProbe(allProbes[p], Float.NaN);
            }
        }
    }
    StringBuffer quantitationDescription = new StringBuffer();
    quantitationDescription.append("Intron regression pipeline quantitation ");
    quantitationDescription.append(". Directionality was ");
    quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
    quantitationDescription.append(". Min intron length was ");
    quantitationDescription.append(minLength);
    quantitationDescription.append(". Min read density was ");
    quantitationDescription.append(minDensity);
    quantitationDescription.append(". Max slope p-value was ");
    quantitationDescription.append(maxPValue);
    collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
    quantitatonComplete();
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) SimpleRegression(org.apache.commons.math3.stat.regression.SimpleRegression) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) QuantitationStrandType(uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType) Vector(java.util.Vector) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Aggregations

SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)10 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)9 Vector (java.util.Vector)8 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)8 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)6 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)6 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)3 LongVector (uk.ac.babraham.SeqMonk.Utilities.LongVector)3 QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType)2 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)2 FileNotFoundException (java.io.FileNotFoundException)1 IOException (java.io.IOException)1 UnknownHostException (java.net.UnknownHostException)1 Enumeration (java.util.Enumeration)1 Hashtable (java.util.Hashtable)1 SimpleRegression (org.apache.commons.math3.stat.regression.SimpleRegression)1 AnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet)1