Examples with AnnotationSet - uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet

Example 1 with AnnotationSet

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet in project SeqMonk by s-andrews.

the class GFF3AnnotationParser method parseAnnotation.

/**
 * Parses a tab-delimited GFF3/GTF annotation file into one or more annotation sets.
 *
 * <p>The file is read line by line (transparently gunzipped when its name ends in
 * ".gz"). Blank lines and lines starting with '#' are skipped. Each remaining line
 * must have at least 7 tab-separated columns; the columns used here are
 * [0] chromosome name, [2] feature type, [3] start, [4] end, [6] strand and,
 * when present, [8] the attribute list.
 *
 * <p>Lines that cannot be used (too few columns, non-integer positions, unknown
 * chromosome, end position beyond the chromosome length) are reported through
 * {@code progressWarningReceived} and skipped rather than aborting the parse.
 *
 * @param file   the annotation file to read
 * @param genome the genome used to resolve chromosome names and to own the sets
 * @param prefix text prepended to every feature type; when {@code null} the user
 *               is prompted for one (a cancelled prompt yields an empty prefix)
 * @return the annotation sets that were built, or {@code null} if the parse was
 *         cancelled via the {@code cancel} flag
 * @throws Exception if the file cannot be opened or read
 */
public AnnotationSet[] parseAnnotation(File file, Genome genome, String prefix) throws Exception {
    System.err.println("Parsing " + file);
    if (prefix == null) {
        featurePrefix = JOptionPane.showInputDialog(SeqMonkApplication.getInstance(), "Feature prefix", "GFFv3/GTP Options", JOptionPane.QUESTION_MESSAGE);
    } else {
        featurePrefix = prefix;
    }
    // The dialog returns null when it is dismissed
    if (featurePrefix == null)
        featurePrefix = "";
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, file.getName());
    annotationSets.add(currentAnnotation);
    // Features which are assembled from several lines, keyed by "<type>_<id>"
    Hashtable<String, FeatureGroup> groupedFeatures = new Hashtable<String, FeatureGroup>();
    BufferedReader br;
    if (file.getName().toLowerCase().endsWith(".gz")) {
        br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
    } else {
        br = new BufferedReader(new FileReader(file));
    }
    // The finally block guarantees the reader is closed on every exit path:
    // normal completion, cancellation and any exception thrown while parsing.
    try {
        String line;
        int count = 0;
        while ((line = br.readLine()) != null) {
            if (cancel) {
                progressCancelled();
                return null;
            }
            if (count % 1000 == 0) {
                progressUpdated("Read " + count + " lines from " + file.getName(), 0, 1);
            }
            // Once more than a million lines have been read, every exact multiple of
            // a million finalises the current set and starts a fresh one.
            if (count > 1000000 && count % 1000000 == 0) {
                progressUpdated("Caching...", 0, 1);
                currentAnnotation.finalise();
                currentAnnotation = new AnnotationSet(genome, file.getName() + "[" + annotationSets.size() + "]");
                annotationSets.add(currentAnnotation);
            }
            ++count;
            // Ignore blank lines
            if (line.trim().length() == 0)
                continue;
            // Skip comments
            if (line.startsWith("#"))
                continue;
            String[] sections = line.split("\t");
            // Check to see if we've got enough data to work with
            if (sections.length < 7) {
                progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
                continue;
            }
            int strand;
            int start;
            int end;
            try {
                start = Integer.parseInt(sections[3]);
                end = Integer.parseInt(sections[4]);
            } catch (NumberFormatException e) {
                progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
                continue;
            }
            // End must always be later than start
            if (end < start) {
                int temp = start;
                start = end;
                end = temp;
            }
            // Column 6 is guaranteed to exist by the length check above.
            // Anything other than '+' or '-' is treated as an unknown strand.
            if (sections[6].equals("+")) {
                strand = Location.FORWARD;
            } else if (sections[6].equals("-")) {
                strand = Location.REVERSE;
            } else {
                strand = Location.UNKNOWN;
            }
            ChromosomeWithOffset c;
            try {
                c = genome.getChromosome(sections[0]);
            } catch (IllegalArgumentException e) {
                progressWarningReceived(new SeqMonkException("Couldn't find a chromosome called " + sections[0]));
                continue;
            }
            // Translate the file positions through the chromosome wrapper
            // (presumably applying the chromosome's offset - confirm in ChromosomeWithOffset)
            start = c.position(start);
            end = c.position(end);
            // We also don't allow readings which are beyond the end of the chromosome
            if (end > c.chromosome().length()) {
                int overrun = end - c.chromosome().length();
                progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
                continue;
            }
            if (sections.length > 8 && sections[8].trim().length() > 0) {
                // Should check for escaped semicolons
                String[] attributes = sections[8].split(" *; *");
                // Make up a data structure of the attributes we have
                Hashtable<String, Vector<String>> keyValuePairs = new Hashtable<String, Vector<String>>();
                for (int a = 0; a < attributes.length; a++) {
                    // Should check for escaped equals
                    String[] keyValue = attributes[a].split("=", 2);
                    // See if we didn't get split
                    if (keyValue.length == 1) {
                        // This could be a GTF file which uses quoted values in space delimited fields
                        keyValue = attributes[a].split(" \"");
                        if (keyValue.length == 2) {
                            // We need to remove the quote from the end of the value
                            keyValue[1] = keyValue[1].substring(0, keyValue[1].length() - 1);
                        }
                    }
                    if (keyValue.length == 2) {
                        if (keyValuePairs.containsKey(keyValue[0])) {
                            keyValuePairs.get(keyValue[0]).add(keyValue[1]);
                        } else {
                            Vector<String> newVector = new Vector<String>();
                            newVector.add(keyValue[1]);
                            keyValuePairs.put(keyValue[0], newVector);
                        }
                    } else {
                        progressWarningReceived(new SeqMonkException("No key value delimiter in " + attributes[a]));
                    }
                }
                if (keyValuePairs.containsKey("Parent") && !sections[2].equals("mRNA")) {
                    // GFF3 style sub-feature: attach its location to each listed parent.
                    // We change exons to mRNA so we don't end up with spliced exon objects
                    if (sections[2].equals("exon"))
                        sections[2] = "mRNA";
                    String[] parents = keyValuePairs.get("Parent").elementAt(0).split(",");
                    for (int p = 0; p < parents.length; p++) {
                        String groupKey = sections[2] + "_" + parents[p];
                        if (!groupedFeatures.containsKey(groupKey)) {
                            // Make a new feature to which we can add this
                            Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                            groupedFeatures.put(groupKey, new FeatureGroup(feature, strand, feature.location()));
                            addAllAttributes(feature, keyValuePairs);
                        }
                        groupedFeatures.get(groupKey).addSublocation(new Location(start, end, strand));
                    }
                } else if (keyValuePairs.containsKey("transcript_id")) {
                    // GTF style grouping: lines sharing a transcript_id form one feature
                    if (sections[2].equals("exon"))
                        sections[2] = "mRNA";
                    String groupKey = sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0);
                    if (!groupedFeatures.containsKey(groupKey)) {
                        Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                        addAllAttributes(feature, keyValuePairs);
                        groupedFeatures.put(groupKey, new FeatureGroup(feature, strand, feature.location()));
                    }
                    groupedFeatures.get(groupKey).addSublocation(new Location(start, end, strand));
                } else {
                    // If we get here we're making a feature with attributes
                    Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                    feature.setLocation(new Location(start, end, strand));
                    addAllAttributes(feature, keyValuePairs);
                    if (keyValuePairs.containsKey("ID")) {
                        // This is a feature which may end up having subfeatures
                        groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("ID").elementAt(0), new FeatureGroup(feature, strand, feature.location()));
                    } else {
                        // We can just add this to the annotation collection
                        currentAnnotation.addFeature(feature);
                    }
                }
            } else {
                // No group parameter to worry about
                Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
                feature.setLocation(new Location(start, end, strand));
                currentAnnotation.addFeature(feature);
            }
        }
    } finally {
        br.close();
    }
    // Now go through the grouped features adding them to the annotation set.
    // They all go into whichever set is current when the file ends.
    for (FeatureGroup group : groupedFeatures.values()) {
        currentAnnotation.addFeature(group.feature());
    }
    return annotationSets.toArray(new AnnotationSet[0]);
}

/**
 * Copies every parsed attribute value onto a feature, one addAttribute call per value.
 *
 * @param feature       the feature receiving the attributes
 * @param keyValuePairs attribute values grouped by key, as built from column 9 of a line
 */
private static void addAllAttributes(Feature feature, Hashtable<String, Vector<String>> keyValuePairs) {
    Enumeration<String> en = keyValuePairs.keys();
    while (en.hasMoreElements()) {
        String key = en.nextElement();
        String[] values = keyValuePairs.get(key).toArray(new String[0]);
        for (int v = 0; v < values.length; v++) {
            feature.addAttribute(key, values[v]);
        }
    }
}

Also used : ChromosomeWithOffset(uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) GZIPInputStream(java.util.zip.GZIPInputStream) FileReader(java.io.FileReader) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector) Enumeration(java.util.Enumeration) InputStreamReader(java.io.InputStreamReader) Hashtable(java.util.Hashtable) FileInputStream(java.io.FileInputStream) BufferedReader(java.io.BufferedReader) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location) SplitLocation(uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)

Example 2 with AnnotationSet

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet in project SeqMonk by s-andrews.

the class GenomeParser method parseGenomeFiles.

/**
 * Loads all of the annotation for a genome from the files in its base folder.
 *
 * <p>The chromosome list is parsed first, then every ".dat" file is passed to
 * {@code processEMBLFile}, then every GFF/GTF file (optionally gzipped) is parsed
 * and its features merged into the same core annotation set. Registered listeners
 * are told about progress; if any step throws, the exception is forwarded to the
 * listeners and loading stops without the annotation being added to the genome.
 *
 * @param genome       the genome being populated
 * @param baseLocation the folder holding the genome's files
 */
private void parseGenomeFiles(SingleGenome genome, File baseLocation) {
    // First parse the chromosome list file,
    // which defines the size and extent of the chromosomes
    try {
        parseChrListFile(genome, baseLocation);
    } catch (Exception ex) {
        Enumeration<ProgressListener> en = listeners.elements();
        while (en.hasMoreElements()) {
            en.nextElement().progressExceptionReceived(ex);
        }
        return;
    }
    // We need a list of all of the .dat files inside the baseLocation
    File[] files = baseLocation.listFiles(new FileFilter() {

        public boolean accept(File f) {
            return f.getName().toLowerCase().endsWith(".dat");
        }
    });
    // listFiles returns null if baseLocation isn't a directory or can't be read.
    // Treat that as "nothing to load" rather than failing with a NullPointerException.
    if (files == null) {
        files = new File[0];
    }
    AnnotationSet coreAnnotation = new CoreAnnotationSet(genome);
    for (int i = 0; i < files.length; i++) {
        // Update the listeners
        Enumeration<ProgressListener> e = listeners.elements();
        while (e.hasMoreElements()) {
            e.nextElement().progressUpdated("Loading Genome File " + files[i].getName(), i, files.length);
        }
        try {
            processEMBLFile(files[i], coreAnnotation, genome);
        } catch (Exception ex) {
            Enumeration<ProgressListener> en = listeners.elements();
            while (en.hasMoreElements()) {
                en.nextElement().progressExceptionReceived(ex);
            }
            return;
        }
    }
    // Update the listeners
    Enumeration<ProgressListener> e = listeners.elements();
    while (e.hasMoreElements()) {
        e.nextElement().progressUpdated("Caching annotation data", 1, 1);
    }
    // Now do the same thing for gff files.
    // We need a list of all of the .gff/gtf files inside the baseLocation
    files = baseLocation.listFiles(new FileFilter() {

        public boolean accept(File f) {
            String name = f.getName().toLowerCase();
            return name.endsWith(".gff") || name.endsWith(".gtf") || name.endsWith(".gff3") || name.endsWith(".gff.gz") || name.endsWith(".gtf.gz") || name.endsWith(".gff3.gz");
        }
    });
    // Same null guard as for the .dat listing above
    if (files == null) {
        files = new File[0];
    }
    GFF3AnnotationParser gffParser = new GFF3AnnotationParser(genome);
    for (int i = 0; i < files.length; i++) {
        // Update the listeners
        e = listeners.elements();
        while (e.hasMoreElements()) {
            e.nextElement().progressUpdated("Loading Genome File " + files[i].getName(), i, files.length);
        }
        try {
            // An empty prefix stops the parser prompting the user for one.
            // NOTE(review): if the parser returns null (e.g. it was cancelled) the
            // resulting NullPointerException is caught below and sent to the listeners.
            AnnotationSet[] newSets = gffParser.parseAnnotation(files[i], genome, "");
            for (int s = 0; s < newSets.length; s++) {
                Feature[] features = newSets[s].getAllFeatures();
                for (int f = 0; f < features.length; f++) {
                    coreAnnotation.addFeature(features[f]);
                }
            }
        } catch (Exception ex) {
            Enumeration<ProgressListener> en = listeners.elements();
            while (en.hasMoreElements()) {
                en.nextElement().progressExceptionReceived(ex);
            }
            return;
        }
    }
    // Update the listeners
    e = listeners.elements();
    while (e.hasMoreElements()) {
        e.nextElement().progressUpdated("Caching annotation data", 1, 1);
    }
    genome.annotationCollection().addAnnotationSets(new AnnotationSet[] { coreAnnotation });
}

Also used : Enumeration(java.util.Enumeration) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) CoreAnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.CoreAnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) IOException(java.io.IOException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) CoreAnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.CoreAnnotationSet) ProgressListener(uk.ac.babraham.SeqMonk.DataTypes.ProgressListener) FileFilter(java.io.FileFilter) File(java.io.File)

Example 3 with AnnotationSet

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet in project SeqMonk by s-andrews.

the class ProbeListAnnotationParser method parseAnnotation.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.AnnotationParsers.AnnotationParser#parseAnnotation(java.io.File, uk.ac.babraham.SeqMonk.DataTypes.Genome.Genome)
	 */
/**
 * Converts every probe in the parser's probe list into an annotation feature.
 *
 * <p>Each probe becomes a feature of type {@code featureType} on the probe's
 * chromosome, carrying the probe's start, end and strand, plus a "name"
 * attribute when the probe has a defined name. Once more than a million probes
 * have been converted, each exact multiple of a million finalises the current
 * annotation set and starts a new one.
 *
 * @param file   not used by this parser; the data comes from the probe list
 * @param genome the genome which owns the annotation sets that are created
 * @return the annotation sets holding the converted probes
 * @throws Exception declared by the overridden method
 */
protected AnnotationSet[] parseAnnotation(File file, Genome genome) throws Exception {
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, probeList.name());
    annotationSets.add(currentAnnotation);
    Probe[] probes = probeList.getAllProbes();
    // Report progress roughly once per percent. The "1 +" keeps the interval
    // non-zero for lists of fewer than 100 probes. The interval has to be
    // bracketed as a whole: "p % 1 + n" parses as "(p % 1) + n".
    int progressInterval = 1 + (probes.length / 100);
    for (int p = 0; p < probes.length; p++) {
        if (p % progressInterval == 0) {
            progressUpdated("Converted " + p + " probes", p, probes.length);
        }
        if (p > 1000000 && p % 1000000 == 0) {
            progressUpdated("Caching...", 0, 1);
            currentAnnotation.finalise();
            currentAnnotation = new AnnotationSet(genome, probeList.name() + "[" + annotationSets.size() + "]");
            annotationSets.add(currentAnnotation);
        }
        Feature feature = new Feature(featureType, probes[p].chromosome().name());
        if (probes[p].hasDefinedName()) {
            feature.addAttribute("name", probes[p].name());
        }
        feature.setLocation(new Location(probes[p].start(), probes[p].end(), probes[p].strand()));
        currentAnnotation.addFeature(feature);
    }
    return annotationSets.toArray(new AnnotationSet[0]);
}

Also used : AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Vector(java.util.Vector) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 4 with AnnotationSet

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet in project SeqMonk by s-andrews.

the class DataTreeRenderer method getTreeCellRendererComponent.

/* (non-Javadoc)
	 * @see javax.swing.tree.DefaultTreeCellRenderer#getTreeCellRendererComponent(javax.swing.JTree, java.lang.Object, boolean, boolean, boolean, int, boolean)
	 */
/**
 * Renders the known data tree node types as an icon plus text label, and
 * defers to the superclass renderer for everything else.
 *
 * <p>Data sets, data groups and replicate sets which are valid HiC stores get
 * a "[HiC] " prefix on their text. Selected nodes are drawn on a light grey
 * background.
 */
public Component getTreeCellRendererComponent(JTree tree, Object value, boolean selected, boolean expanded, boolean leaf, int row, boolean hasFocus) {
    // The three data store types may be tagged as HiC
    if (value instanceof DataSet) {
        return finishLabel(new JLabel(value.toString(), dataSetIcon, JLabel.LEFT), value, true, selected);
    }
    if (value instanceof DataGroup) {
        return finishLabel(new JLabel(value.toString(), dataGroupIcon, JLabel.LEFT), value, true, selected);
    }
    if (value instanceof ReplicateSet) {
        return finishLabel(new JLabel(value.toString(), replicateSetIcon, JLabel.LEFT), value, true, selected);
    }
    // Probe lists and annotation sets never get the HiC tag
    if (value instanceof ProbeList) {
        return finishLabel(new JLabel(value.toString(), probeListIcon, JLabel.LEFT), value, false, selected);
    }
    if (value instanceof AnnotationSet) {
        return finishLabel(new JLabel(value.toString(), annotationSetIcon, JLabel.LEFT), value, false, selected);
    }
    return super.getTreeCellRendererComponent(tree, value, selected, expanded, leaf, row, hasFocus);
}

/**
 * Applies the optional HiC tag and the selection highlight to a freshly made label.
 *
 * @param label       the label to decorate
 * @param value       the tree node the label represents
 * @param hicTaggable whether this kind of node is allowed to show the "[HiC] " prefix
 * @param selected    whether the node is currently selected
 * @return the same label, for convenient chaining
 */
private JLabel finishLabel(JLabel label, Object value, boolean hicTaggable, boolean selected) {
    if (hicTaggable && value instanceof HiCDataStore && ((HiCDataStore) value).isValidHiC()) {
        label.setText("[HiC] " + label.getText());
    }
    if (selected) {
        label.setOpaque(true);
        label.setBackground(Color.LIGHT_GRAY);
    }
    return label;
}

Also used : DataGroup(uk.ac.babraham.SeqMonk.DataTypes.DataGroup) ProbeList(uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList) DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) ReplicateSet(uk.ac.babraham.SeqMonk.DataTypes.ReplicateSet) JLabel(javax.swing.JLabel) HiCDataStore(uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet)

Example 5 with AnnotationSet

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet in project SeqMonk by s-andrews.

the class FindFeatureDialog method actionPerformed.

/* (non-Javadoc)
	 * @see java.awt.event.ActionListener#actionPerformed(java.awt.event.ActionEvent)
	 */
/**
 * Dispatches the dialog's button actions.
 *
 * <ul>
 * <li>"close" hides and disposes of the dialog.</li>
 * <li>"search" starts this object running on a new thread.</li>
 * <li>"save_annotation_all" copies every hit from the last search into a new
 *     annotation set, after asking for a feature type.</li>
 * <li>"save_annotation_selected" does the same for the features currently
 *     selected in the viewer.</li>
 * </ul>
 */
public void actionPerformed(ActionEvent ae) {
    String command = ae.getActionCommand();

    if (command.equals("close")) {
        setVisible(false);
        dispose();
        return;
    }

    if (command.equals("search")) {
        new Thread(this).start();
        return;
    }

    if (command.equals("save_annotation_all")) {
        // Ask which feature type the copies should be given, offering a default
        String featureName = (String) JOptionPane.showInputDialog(this, "Feature type", "Make Annotation Track", JOptionPane.QUESTION_MESSAGE, null, null, search.getText() + " " + featureType.getSelectedItem() + " search");
        if (featureName == null) {
            // The prompt was cancelled
            return;
        }
        AnnotationSet newSet = new AnnotationSet(dataCollection.genome(), search.getText() + " " + featureType.getSelectedItem() + " search");
        for (int hit = 0; hit < lastHits.length; hit++) {
            // Clone the hit under the new feature type, keeping location and tags
            Feature copy = new Feature(featureName, lastHits[hit].chromosomeName());
            copy.setLocation(lastHits[hit].location());
            AnnotationTagValue[] tagValues = lastHits[hit].getAnnotationTagValues();
            for (AnnotationTagValue tagValue : tagValues) {
                copy.addAttribute(tagValue.tag(), tagValue.value());
            }
            newSet.addFeature(copy);
        }
        dataCollection.genome().annotationCollection().addAnnotationSets(new AnnotationSet[] { newSet });
        return;
    }

    if (command.equals("save_annotation_selected")) {
        Feature[] chosen = viewer.getSelectedFeatures();
        if (chosen.length == 0) {
            JOptionPane.showMessageDialog(this, "There are no selected features from which to make a track", "Can't make track", JOptionPane.INFORMATION_MESSAGE);
            return;
        }
        // Ask which feature type the copies should be given, offering a default
        String featureName = (String) JOptionPane.showInputDialog(this, "Feature type", "Make Annotation Track", JOptionPane.QUESTION_MESSAGE, null, null, "selected " + search.getText());
        if (featureName == null) {
            // The prompt was cancelled
            return;
        }
        AnnotationSet newSet = new AnnotationSet(dataCollection.genome(), search.getText() + " search results");
        for (Feature original : chosen) {
            // Clone the selection under the new feature type, keeping location and tags
            Feature copy = new Feature(featureName, original.chromosomeName());
            copy.setLocation(original.location());
            AnnotationTagValue[] tagValues = original.getAnnotationTagValues();
            for (AnnotationTagValue tagValue : tagValues) {
                copy.addAttribute(tagValue.tag(), tagValue.value());
            }
            newSet.addFeature(copy);
        }
        dataCollection.genome().annotationCollection().addAnnotationSets(new AnnotationSet[] { newSet });
    }
}

Also used : AnnotationTagValue(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationTagValue) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)

Aggregations

AnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet): 11; Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature): 7; SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException): 6; Enumeration (java.util.Enumeration): 5; IOException (java.io.IOException): 4; Vector (java.util.Vector): 4; DataGroup (uk.ac.babraham.SeqMonk.DataTypes.DataGroup): 3; DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet): 3; CoreAnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.CoreAnnotationSet): 3; Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location): 3; ProgressListener (uk.ac.babraham.SeqMonk.DataTypes.ProgressListener): 3; ReplicateSet (uk.ac.babraham.SeqMonk.DataTypes.ReplicateSet): 3; BufferedReader (java.io.BufferedReader): 2; FileInputStream (java.io.FileInputStream): 2; FileNotFoundException (java.io.FileNotFoundException): 2; FileReader (java.io.FileReader): 2; InputStreamReader (java.io.InputStreamReader): 2; UnknownHostException (java.net.UnknownHostException): 2; GZIPInputStream (java.util.zip.GZIPInputStream): 2; AnnotationTagValue (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationTagValue): 2