Search in sources :

Example 1 with Location

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Location in project SeqMonk by s-andrews.

the class FeaturePositionSelectorPanel method getCoreProbes.

/**
 * Builds the probes covering the core region of every selected feature.
 * The core region leaves out any extra context the options would add, but
 * does reflect context the options take away.
 *
 * @return the core probes, passed through duplicate removal when that
 *         option is switched on
 */
public Probe[] getCoreProbes() {
    Vector<Probe> coreProbes = new Vector<Probe>();
    for (Chromosome chromosome : collection.genome().getAllChromosomes()) {
        // Pool the features of every selected type on this chromosome
        Vector<Feature> pooled = new Vector<Feature>();
        for (String type : selectedFeatureTypes()) {
            for (Feature candidate : collection.genome().annotationCollection().getFeaturesForType(chromosome, type)) {
                pooled.add(candidate);
            }
        }
        for (Feature feature : pooled) {
            if (!useSubFeatures()) {
                // Whole-feature mode: one set of probes per feature
                makeProbes(feature, chromosome, feature.location(), coreProbes, true);
                continue;
            }
            if (!(feature.location() instanceof SplitLocation)) {
                // An unsplit feature can stand in for a single exon, but it
                // has no introns, so in intron mode nothing is produced.
                if (useExonSubfeatures()) {
                    makeProbes(feature, chromosome, feature.location(), coreProbes, true);
                }
                continue;
            }
            Location[] parts = ((SplitLocation) feature.location()).subLocations();
            if (useExonSubfeatures()) {
                for (Location part : parts) {
                    makeProbes(feature, chromosome, part, coreProbes, true);
                }
            } else {
                // Introns are the gaps between consecutive sub-locations
                for (int s = 1; s < parts.length; s++) {
                    Location intron = new Location(parts[s - 1].end() + 1, parts[s].start() - 1, feature.location().strand());
                    makeProbes(feature, chromosome, intron, coreProbes, true);
                }
            }
        }
    }
    Probe[] result = coreProbes.toArray(new Probe[0]);
    return removeDuplicates() ? removeDuplicates(result) : result;
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Vector(java.util.Vector) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 2 with Location

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Location in project SeqMonk by s-andrews.

the class GFF3AnnotationParser method parseAnnotation.

/**
 * Parses a tab-delimited GFF3/GTF style annotation file into one or more
 * annotation sets.
 *
 * Blank lines and lines starting with '#' are skipped.  Lines with fewer
 * than 7 tab-separated fields, unparseable positions, unknown chromosomes
 * or positions beyond the end of the chromosome are reported through
 * progressWarningReceived and skipped.  Features which share a Parent or
 * transcript_id attribute are collected into feature groups and are added
 * to the last annotation set once the whole file has been read.
 *
 * A fresh annotation set is started each time the number of lines read so
 * far is a multiple of one million greater than one million.
 *
 * @param file the annotation file; names ending in ".gz" (any case) are read through a GZIP stream
 * @param genome the genome used to look up chromosomes
 * @param prefix text prepended to every feature type; if null the user is asked for one via a dialog
 * @return the annotation sets which were built, or null if parsing was cancelled
 * @throws Exception if the file cannot be opened or read
 */
public AnnotationSet[] parseAnnotation(File file, Genome genome, String prefix) throws Exception {
    System.err.println("Parsing " + file);
    if (prefix == null) {
        featurePrefix = JOptionPane.showInputDialog(SeqMonkApplication.getInstance(), "Feature prefix", "GFFv3/GTP Options", JOptionPane.QUESTION_MESSAGE);
    } else {
        featurePrefix = prefix;
    }
    // The dialog returns null if it was dismissed
    if (featurePrefix == null)
        featurePrefix = "";
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, file.getName());
    annotationSets.add(currentAnnotation);
    Hashtable<String, FeatureGroup> groupedFeatures = new Hashtable<String, FeatureGroup>();
    BufferedReader br;
    if (file.getName().toLowerCase().endsWith(".gz")) {
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
    } else {
        br = new BufferedReader(new FileReader(file));
    }
    // The reader is closed in the finally block so that it is released on
    // cancellation, on normal completion and when anything below throws.
    try {
        String line;
        int count = 0;
        while ((line = br.readLine()) != null) {
            if (cancel) {
                progressCancelled();
                return null;
            }
            if (count % 1000 == 0) {
                progressUpdated("Read " + count + " lines from " + file.getName(), 0, 1);
            }
            if (count > 1000000 && count % 1000000 == 0) {
                progressUpdated("Caching...", 0, 1);
                currentAnnotation.finalise();
                currentAnnotation = new AnnotationSet(genome, file.getName() + "[" + annotationSets.size() + "]");
                annotationSets.add(currentAnnotation);
            }
            ++count;
            // Ignore blank lines
            if (line.trim().length() == 0)
                continue;
            // Skip comments
            if (line.startsWith("#"))
                continue;
            String[] sections = line.split("\t");
            // Check to see if we've got enough data to work with
            if (sections.length < 7) {
                progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                continue;
            }
            int start;
            int end;
            try {
                start = Integer.parseInt(sections[3]);
                end = Integer.parseInt(sections[4]);
            } catch (NumberFormatException e) {
                progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
                continue;
            }
            // End must always be later than start
            if (end < start) {
                int temp = start;
                start = end;
                end = temp;
            }
            // The strand column is guaranteed to exist by the length check above
            int strand;
            if (sections[6].equals("+")) {
                strand = Location.FORWARD;
            } else if (sections[6].equals("-")) {
                strand = Location.REVERSE;
            } else {
                strand = Location.UNKNOWN;
            }
            ChromosomeWithOffset c;
            try {
                c = genome.getChromosome(sections[0]);
            } catch (IllegalArgumentException e) {
                progressWarningReceived(new SeqMonkException("Couldn't find a chromosome called " + sections[0]));
                continue;
            }
            start = c.position(start);
            end = c.position(end);
            // We also don't allow readings which are beyond the end of the chromosome
            if (end > c.chromosome().length()) {
                int overrun = end - c.chromosome().length();
                progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                continue;
            }
            if (sections.length > 8 && sections[8].trim().length() > 0) {
                // Should check for escaped semicolons
                String[] attributes = sections[8].split(" *; *");
                // Make up a data structure of the attributes we have
                Hashtable<String, Vector<String>> keyValuePairs = new Hashtable<String, Vector<String>>();
                for (int a = 0; a < attributes.length; a++) {
                    // Should check for escaped equals
                    String[] keyValue = attributes[a].split("=", 2);
                    // See if we didn't get split
                    if (keyValue.length == 1) {
                        // This could be a GTF file which uses quoted values in space delimited fields
                        keyValue = attributes[a].split(" \"");
                        if (keyValue.length == 2) {
                            // We need to remove the quote from the end of the value
                            keyValue[1] = keyValue[1].substring(0, keyValue[1].length() - 1);
                        }
                    }
                    if (keyValue.length == 2) {
                        if (keyValuePairs.containsKey(keyValue[0])) {
                            keyValuePairs.get(keyValue[0]).add(keyValue[1]);
                        } else {
                            Vector<String> newVector = new Vector<String>();
                            newVector.add(keyValue[1]);
                            keyValuePairs.put(keyValue[0], newVector);
                        }
                    } else {
                        progressWarningReceived(new SeqMonkException("No key value delimiter in " + attributes[a]));
                    }
                }
                if (keyValuePairs.containsKey("Parent") && !sections[2].equals("mRNA")) {
                    // We change exons to mRNA so we don't end up with spliced exon objects
                    if (sections[2].equals("exon"))
                        sections[2] = "mRNA";
                    // Only the first Parent attribute is used; it may list several parents
                    String[] parents = keyValuePairs.get("Parent").elementAt(0).split(",");
                    for (int p = 0; p < parents.length; p++) {
                        String groupKey = sections[2] + "_" + parents[p];
                        if (!groupedFeatures.containsKey(groupKey)) {
                            // Make a new feature to which we can add this
                            Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                            groupedFeatures.put(groupKey, new FeatureGroup(feature, strand, feature.location()));
                            addAllAttributes(feature, keyValuePairs);
                        }
                        groupedFeatures.get(groupKey).addSublocation(new Location(start, end, strand));
                    }
                } else if (keyValuePairs.containsKey("transcript_id")) {
                    // GTF style grouping, keyed on the first transcript_id value
                    if (sections[2].equals("exon"))
                        sections[2] = "mRNA";
                    String groupKey = sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0);
                    if (!groupedFeatures.containsKey(groupKey)) {
                        Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                        addAllAttributes(feature, keyValuePairs);
                        groupedFeatures.put(groupKey, new FeatureGroup(feature, strand, feature.location()));
                    }
                    groupedFeatures.get(groupKey).addSublocation(new Location(start, end, strand));
                } else {
                    // If we get here we're making a feature with attributes
                    Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                    feature.setLocation(new Location(start, end, strand));
                    addAllAttributes(feature, keyValuePairs);
                    if (keyValuePairs.containsKey("ID")) {
                        // This is a feature which may end up having subfeatures
                        groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("ID").elementAt(0), new FeatureGroup(feature, strand, feature.location()));
                    } else {
                        // We can just add this to the annotation collection
                        currentAnnotation.addFeature(feature);
                    }
                }
            } else {
                // No group parameter to worry about
                Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                feature.setLocation(new Location(start, end, strand));
                currentAnnotation.addFeature(feature);
            }
        }
    } finally {
        br.close();
    }
    // Now go through the grouped features adding them to the annotation set
    Iterator<FeatureGroup> i = groupedFeatures.values().iterator();
    while (i.hasNext()) {
        Feature f = i.next().feature();
        currentAnnotation.addFeature(f);
    }
    return annotationSets.toArray(new AnnotationSet[0]);
}

/**
 * Copies every value of every parsed attribute onto the supplied feature.
 *
 * @param feature the feature to annotate
 * @param keyValuePairs attribute names mapped to all of the values seen for that name
 */
private void addAllAttributes(Feature feature, Hashtable<String, Vector<String>> keyValuePairs) {
    Enumeration<String> en = keyValuePairs.keys();
    while (en.hasMoreElements()) {
        String key = en.nextElement();
        String[] values = keyValuePairs.get(key).toArray(new String[0]);
        for (int v = 0; v < values.length; v++) {
            feature.addAttribute(key, values[v]);
        }
    }
}
Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) GZIPInputStream(java.util.zip.GZIPInputStream) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) Enumeration(java.util.Enumeration) InputStreamReader(java.io.InputStreamReader) Hashtable(java.util.Hashtable) FileInputStream(java.io.FileInputStream) BufferedReader(java.io.BufferedReader) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 3 with Location

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Location in project SeqMonk by s-andrews.

the class ProbeListAnnotationParser method parseAnnotation.

/*
 * Converts every probe in the probe list into an annotation feature of
 * the configured feature type.  The file argument is not read by this
 * implementation; the genome is only passed on to the annotation sets.
 *
 * A fresh annotation set is started each time the probe index is a
 * multiple of one million greater than one million.
 *
 * @see uk.ac.babraham.SeqMonk.AnnotationParsers.AnnotationParser#parseAnnotation(java.io.File, uk.ac.babraham.SeqMonk.DataTypes.Genome.Genome)
 */
protected AnnotationSet[] parseAnnotation(File file, Genome genome) throws Exception {
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, probeList.name());
    annotationSets.add(currentAnnotation);
    Probe[] probes = probeList.getAllProbes();
    // Report progress roughly once per 1% of the probes.  The "1 +" keeps
    // the interval non-zero for lists of fewer than 100 probes.  The divisor
    // must be parenthesised: "p % 1 + n" parses as "(p % 1) + n".
    int progressInterval = 1 + (probes.length / 100);
    for (int p = 0; p < probes.length; p++) {
        if (p % progressInterval == 0) {
            progressUpdated("Converted " + p + " probes", p, probes.length);
        }
        if (p > 1000000 && p % 1000000 == 0) {
            progressUpdated("Caching...", 0, 1);
            currentAnnotation.finalise();
            currentAnnotation = new AnnotationSet(genome, probeList.name() + "[" + annotationSets.size() + "]");
            annotationSets.add(currentAnnotation);
        }
        Feature feature = new Feature(featureType, probes[p].chromosome().name());
        // Only carry the name across if the probe was given one explicitly
        if (probes[p].hasDefinedName()) {
            feature.addAttribute("name", probes[p].name());
        }
        feature.setLocation(new Location(probes[p].start(), probes[p].end(), probes[p].strand()));
        currentAnnotation.addFeature(feature);
    }
    return annotationSets.toArray(new AnnotationSet[0]);
}
Also used : AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Vector(java.util.Vector) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 4 with Location

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Location in project SeqMonk by s-andrews.

the class FeaturePercentileProbeGenerator method run.

/*
 * Walks every chromosome, makes probes for each feature of the configured
 * type (per sub-location when sub-features are requested and the feature
 * has a split location), then hands the finished probe set to
 * generationComplete.  Stops early, reporting cancellation, if the cancel
 * flag is seen while features are being processed.
 *
 * @see java.lang.Runnable#run()
 */
public void run() {
    Chromosome[] allChromosomes = collection.genome().getAllChromosomes();
    Vector<Probe> generated = new Vector<Probe>();
    int done = 0;
    for (Chromosome chromosome : allChromosomes) {
        // Tell listeners how many chromosomes have been finished so far
        updateGenerationProgress("Processed " + done + " chromosomes", done, allChromosomes.length);
        done++;
        for (Feature feature : collection.genome().annotationCollection().getFeaturesForType(chromosome, featureType)) {
            // Bail out as soon as a cancellation request is noticed
            if (cancel) {
                generationCancelled();
                return;
            }
            boolean splitIntoParts = useSubfeatures && feature.location() instanceof SplitLocation;
            if (!splitIntoParts) {
                makeProbes(feature, chromosome, feature.location(), generated);
                continue;
            }
            Location[] parts = ((SplitLocation) feature.location()).subLocations();
            for (Location part : parts) {
                makeProbes(feature, chromosome, part, generated);
            }
        }
    }
    Probe[] probes = generated.toArray(new Probe[0]);
    generationComplete(new ProbeSet(getDescription(), probes));
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) ProbeSet(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Vector(java.util.Vector) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 5 with Location

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Location in project SeqMonk by s-andrews.

the class FeatureGroup method getSubLocations.

/**
 * Gets the sub-locations covered by the features in this group.
 *
 * A group holding a single feature returns that feature's own
 * sub-locations (or its whole location if it isn't split).  For larger
 * groups the locations of all features are pooled, sorted and merged so
 * that overlapping or contained regions are collapsed into one.
 *
 * @return the sub-locations; an empty array if the group holds no features
 */
public Location[] getSubLocations() {
    if (features.size() == 1) {
        Location loc = features.elementAt(0).location();
        if (loc instanceof SplitLocation) {
            return ((SplitLocation) loc).subLocations();
        } else {
            return new Location[] { loc };
        }
    }
    // Pool the packed positions of every (sub)location in the group
    LongVector allLocs = new LongVector();
    Enumeration<Feature> en = features.elements();
    while (en.hasMoreElements()) {
        Location loc = en.nextElement().location();
        if (loc instanceof SplitLocation) {
            Location[] subLocs = ((SplitLocation) loc).subLocations();
            for (int s = 0; s < subLocs.length; s++) {
                allLocs.add(subLocs[s].packedPosition());
            }
        } else {
            allLocs.add(loc.packedPosition());
        }
    }
    long[] locs = allLocs.toArray();
    // Nothing to merge: avoid indexing into an empty array below
    if (locs.length == 0) {
        return new Location[0];
    }
    // NOTE(review): the merge below relies on this ordering positions by
    // start - confirm against SequenceRead.sort
    SequenceRead.sort(locs);
    Vector<Location> mergedLocs = new Vector<Location>();
    long current = locs[0];
    for (int i = 1; i < locs.length; i++) {
        if (SequenceRead.overlaps(current, locs[i]) && SequenceRead.end(locs[i]) > SequenceRead.end(current)) {
            // Overlaps and reaches further: extend the current region, keeping its strand
            current = SequenceRead.packPosition(SequenceRead.start(current), SequenceRead.end(locs[i]), SequenceRead.strand(current));
        } else if (SequenceRead.end(locs[i]) <= SequenceRead.end(current)) {
            // Ends no later than the current region: treated as a subset and ignored
            continue;
        } else {
            // No overlap: the current region is finished, start a new one
            mergedLocs.add(new Location(current));
            current = locs[i];
        }
    }
    mergedLocs.add(new Location(current));
    return mergedLocs.toArray(new Location[0]);
}
Also used : LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) Vector(java.util.Vector) LongVector(uk.ac.babraham.SeqMonk.Utilities.LongVector) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Aggregations

Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)15 Vector (java.util.Vector)14 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)13 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)12 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)9 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)7 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)5 LongVector (uk.ac.babraham.SeqMonk.Utilities.LongVector)5 Hashtable (java.util.Hashtable)3 AnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet)3 QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType)3 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)3 BufferedReader (java.io.BufferedReader)2 FileInputStream (java.io.FileInputStream)2 FileReader (java.io.FileReader)2 InputStreamReader (java.io.InputStreamReader)2 GZIPInputStream (java.util.zip.GZIPInputStream)2 ChromosomeWithOffset (uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset)2 Enumeration (java.util.Enumeration)1 HashSet (java.util.HashSet)1