Search in sources :

Example 6 with Feature

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.

the class GenomeParser method parseGenomeFiles.

private void parseGenomeFiles(SingleGenome genome, File baseLocation) {
    // which defines the size and extent of the chromosomes
    try {
        parseChrListFile(genome, baseLocation);
    } catch (Exception ex) {
        Enumeration<ProgressListener> en = listeners.elements();
        while (en.hasMoreElements()) {
            en.nextElement().progressExceptionReceived(ex);
        }
        return;
    }
    // We need a list of all of the .dat files inside the baseLocation
    File[] files = baseLocation.listFiles(new FileFilter() {

        public boolean accept(File f) {
            if (f.getName().toLowerCase().endsWith(".dat")) {
                return true;
            } else {
                return false;
            }
        }
    });
    AnnotationSet coreAnnotation = new CoreAnnotationSet(genome);
    for (int i = 0; i < files.length; i++) {
        // Update the listeners
        Enumeration<ProgressListener> e = listeners.elements();
        while (e.hasMoreElements()) {
            e.nextElement().progressUpdated("Loading Genome File " + files[i].getName(), i, files.length);
        }
        try {
            processEMBLFile(files[i], coreAnnotation, genome);
        } catch (Exception ex) {
            Enumeration<ProgressListener> en = listeners.elements();
            while (en.hasMoreElements()) {
                en.nextElement().progressExceptionReceived(ex);
            }
            return;
        }
    }
    // Update the listeners
    Enumeration<ProgressListener> e = listeners.elements();
    while (e.hasMoreElements()) {
        e.nextElement().progressUpdated("Caching annotation data", 1, 1);
    }
    // Now do the same thing for gff files.
    // We need a list of all of the .gff/gtf files inside the baseLocation
    files = baseLocation.listFiles(new FileFilter() {

        public boolean accept(File f) {
            if (f.getName().toLowerCase().endsWith(".gff") || f.getName().toLowerCase().endsWith(".gtf") || f.getName().toLowerCase().endsWith(".gff3") || f.getName().toLowerCase().endsWith(".gff.gz") || f.getName().toLowerCase().endsWith(".gtf.gz") || f.getName().toLowerCase().endsWith(".gff3.gz")) {
                return true;
            } else {
                return false;
            }
        }
    });
    GFF3AnnotationParser gffParser = new GFF3AnnotationParser(genome);
    for (int i = 0; i < files.length; i++) {
        // System.err.println("Parsing "+files[i]);
        // Update the listeners
        e = listeners.elements();
        while (e.hasMoreElements()) {
            e.nextElement().progressUpdated("Loading Genome File " + files[i].getName(), i, files.length);
        }
        try {
            AnnotationSet[] newSets = gffParser.parseAnnotation(files[i], genome, "");
            for (int s = 0; s < newSets.length; s++) {
                Feature[] features = newSets[s].getAllFeatures();
                for (int f = 0; f < features.length; f++) {
                    coreAnnotation.addFeature(features[f]);
                }
            }
        } catch (Exception ex) {
            Enumeration<ProgressListener> en = listeners.elements();
            while (en.hasMoreElements()) {
                en.nextElement().progressExceptionReceived(ex);
            }
            return;
        }
    }
    // Update the listeners
    e = listeners.elements();
    while (e.hasMoreElements()) {
        e.nextElement().progressUpdated("Caching annotation data", 1, 1);
    }
    genome.annotationCollection().addAnnotationSets(new AnnotationSet[] { coreAnnotation });
// Debugging - put out some stats
// System.err.println("Made genome with "+genome.getAllChromosomes().length+" chromosomes");
// System.err.println("There are "+genome.annotationCollection().listAvailableFeatureTypes().length+" different feature types");
}
Also used : Enumeration(java.util.Enumeration) AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) CoreAnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.CoreAnnotationSet) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) IOException(java.io.IOException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) CoreAnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.CoreAnnotationSet) ProgressListener(uk.ac.babraham.SeqMonk.DataTypes.ProgressListener) FileFilter(java.io.FileFilter) File(java.io.File)

Example 7 with Feature

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.

the class GenomeParser method processEMBLFile.

/**
 * Process embl file.
 *
 * @param f the f
 * @param annotation the annotation
 * @throws Exception the exception
 */
private void processEMBLFile(File f, AnnotationSet annotation, SingleGenome genome) throws Exception {
    BufferedReader br = new BufferedReader(new FileReader(f));
    Chromosome c = null;
    while ((c = parseChromosome(br, genome)) != null) {
        String line;
        // We can now skip through to the start of the feature table
        while ((line = br.readLine()) != null) {
            if (line.startsWith("FH") || line.startsWith("SQ")) {
                break;
            }
        }
        // We can now start reading the features one at a time by
        // concatonating them and then passing them on for processing
        StringBuffer currentAttribute = new StringBuffer();
        boolean skipping = true;
        Feature feature = null;
        while ((line = br.readLine()) != null) {
            if (line.startsWith("XX") || line.startsWith("SQ") || line.startsWith("//")) {
                skipToEntryEnd(br);
                break;
            }
            // Just a blank line.
            if (line.length() < 18)
                continue;
            String type = line.substring(5, 18).trim();
            // System.out.println("Type is "+type);
            if (type.length() > 0) {
                // Check whether we need to process the old feature
                if (skipping) {
                    // We're either on the first feature, or we've
                    // moving past this one
                    skipping = false;
                } else {
                    // We need to process the last attribute from the
                    // old feature
                    processAttributeReturnSkip(currentAttribute.toString(), feature);
                    annotation.addFeature(feature);
                }
                // We can check to see if we're bothering to load this type of feature
                if (prefs.loadAnnotation(type)) {
                    // System.err.println("Creating new feature of type "+type);
                    feature = new Feature(type, c.name());
                    currentAttribute = new StringBuffer("location=");
                    currentAttribute.append(line.substring(21).trim());
                    continue;
                } else {
                    // System.err.println("Skipping feature of type "+type);
                    genome.addUnloadedFeatureType(type);
                    skipping = true;
                }
            }
            if (skipping)
                continue;
            String data = line.substring(21).trim();
            if (data.startsWith("/")) {
                // We're at the start of a new attribute
                // Process the last attribute
                skipping = processAttributeReturnSkip(currentAttribute.toString(), feature);
                currentAttribute = new StringBuffer();
            }
            // before the next lot of text.
            if (currentAttribute.indexOf("description=") >= 0)
                currentAttribute.append(" ");
            currentAttribute.append(data);
        }
        // if there was one
        if (!skipping) {
            // We need to process the last attribute from the
            // old feature
            processAttributeReturnSkip(currentAttribute.toString(), feature);
            annotation.addFeature(feature);
        }
    }
    br.close();
}
Also used : BufferedReader(java.io.BufferedReader) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) FileReader(java.io.FileReader) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)

Example 8 with Feature

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.

the class VisibleStoresParser method processHiCDataStore.

private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
    int extendBy = prefs.extendReads();
    boolean reverse = prefs.reverseReads();
    boolean removeStrand = prefs.removeStrandInfo();
    PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
    // Now process the data
    Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
        // We make the call to get exportable reads so we don't duplicate reads
        // when we export things
        HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
        String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
        for (int c2 = 0; c2 < localChromosomes.length; c2++) {
            Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
            long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
            long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
            for (int r = 0; r < sourceReads.length; r++) {
                if (cancel) {
                    progressCancelled();
                    return null;
                }
                if (downsample && downsampleProbabilty < 1) {
                    if (Math.random() > downsampleProbabilty) {
                        continue;
                    }
                }
                if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
                    // Just add them as they are
                    newData.addData(chrs[c], sourceReads[r]);
                    newData.addData(localChromosome, hitReads[r]);
                }
                Feature[] features = null;
                if (filterByFeature) {
                    features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
                    Arrays.sort(features);
                }
                int currentFeaturePostion = 0;
                if (filterByFeature) {
                    // See if we're comparing against the right feature
                    while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
                        currentFeaturePostion++;
                    }
                    // Test to see if we overlap
                    if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
                        if (excludeFeature)
                            continue;
                    } else {
                        if (!excludeFeature)
                            continue;
                    }
                }
                int sourceStart = SequenceRead.start(sourceReads[r]);
                int sourceEend = SequenceRead.end(sourceReads[r]);
                int sourceStrand = SequenceRead.strand(sourceReads[r]);
                int hitStart = SequenceRead.start(sourceReads[r]);
                int hitEend = SequenceRead.end(hitReads[r]);
                int hitStrand = SequenceRead.strand(hitReads[r]);
                if (reverse) {
                    if (sourceStrand == Location.FORWARD) {
                        sourceStrand = Location.REVERSE;
                    } else if (sourceStrand == Location.REVERSE) {
                        sourceStrand = Location.FORWARD;
                    }
                    if (hitStrand == Location.FORWARD) {
                        hitStrand = Location.REVERSE;
                    } else if (hitStrand == Location.REVERSE) {
                        hitStrand = Location.FORWARD;
                    }
                }
                if (removeStrand) {
                    sourceStrand = Location.UNKNOWN;
                    hitStrand = Location.UNKNOWN;
                }
                if (extendBy > 0) {
                    if (sourceStrand == Location.FORWARD) {
                        sourceEend += extendBy;
                    } else if (sourceStrand == Location.REVERSE) {
                        sourceStart -= extendBy;
                    }
                    if (hitStrand == Location.FORWARD) {
                        hitEend += extendBy;
                    } else if (hitStrand == Location.REVERSE) {
                        hitStart -= extendBy;
                    }
                }
                // We also don't allow readings which are beyond the end of the chromosome
                if (sourceEend > chrs[c].length()) {
                    int overrun = sourceEend - chrs[c].length();
                    progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
                    continue;
                }
                if (hitEend > localChromosome.length()) {
                    int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
                    progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
                    continue;
                }
                // We can now make the new readings
                long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
                long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
                if (!prefs.isHiC()) {
                    // HiC additions are deferred until we know the other end is OK too.
                    newData.addData(chrs[c], sourceRead);
                    newData.addData(localChromosome, hitRead);
                }
            }
        }
    }
    return newData;
}
Also used : HiCHitCollection(uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) HiCDataStore(uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException)

Example 9 with Feature

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.

the class ProbeListAnnotationParser method parseAnnotation.

/* (non-Javadoc)
	 * @see uk.ac.babraham.SeqMonk.AnnotationParsers.AnnotationParser#parseAnnotation(java.io.File, uk.ac.babraham.SeqMonk.DataTypes.Genome.Genome)
	 */
protected AnnotationSet[] parseAnnotation(File file, Genome genome) throws Exception {
    Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
    AnnotationSet currentAnnotation = new AnnotationSet(genome, probeList.name());
    annotationSets.add(currentAnnotation);
    Probe[] probes = probeList.getAllProbes();
    for (int p = 0; p < probes.length; p++) {
        if (p % 1 + (probes.length / 100) == 0) {
            progressUpdated("Converted " + p + " probes", p, probes.length);
        }
        if (p > 1000000 && p % 1000000 == 0) {
            progressUpdated("Caching...", 0, 1);
            currentAnnotation.finalise();
            currentAnnotation = new AnnotationSet(genome, probeList.name() + "[" + annotationSets.size() + "]");
            annotationSets.add(currentAnnotation);
        }
        Feature feature = new Feature(featureType, probes[p].chromosome().name());
        if (probes[p].hasDefinedName()) {
            feature.addAttribute("name", probes[p].name());
        }
        feature.setLocation(new Location(probes[p].start(), probes[p].end(), probes[p].strand()));
        currentAnnotation.addFeature(feature);
    }
    return annotationSets.toArray(new AnnotationSet[0]);
}
Also used : AnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Vector(java.util.Vector) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) Location(uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)

Example 10 with Feature

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.

the class DistanceToFeatureQuantitation method run.

/* (non-Javadoc)
	 * @see java.lang.Runnable#run()
	 */
public void run() {
    Probe[] probes = application.dataCollection().probeSet().getAllProbes();
    Feature[] features = null;
    Chromosome lastChromsome = null;
    int lastIndex = 0;
    for (int p = 0; p < probes.length; p++) {
        // See if we need to quit
        if (cancel) {
            progressCancelled();
            return;
        }
        // See if we're on the same chromosome as last time
        if (lastChromsome == null || probes[p].chromosome() != lastChromsome) {
            lastChromsome = probes[p].chromosome();
            features = application.dataCollection().genome().annotationCollection().getFeaturesForType(lastChromsome, selectedFeature);
            System.err.println("Found " + features.length + " features of type " + selectedFeature + " on chr " + lastChromsome);
            lastIndex = 0;
        }
        int closestDistance = lastChromsome.length();
        int bestIndex = lastIndex;
        for (int i = lastIndex; i >= 0 && i < features.length; i--) {
            int thisDistance = getDistanceToFeature(probes[p], features[i]);
            if (thisDistance < closestDistance) {
                closestDistance = thisDistance;
                bestIndex = i;
            }
            if (features[i].location().end() < probes[p].start())
                break;
        }
        // Now we go forward until we hit the end or we're after the end of the last best feature
        for (int i = lastIndex + 1; i < features.length; i++) {
            int thisDistance = getDistanceToFeature(probes[p], features[i]);
            if (thisDistance < closestDistance) {
                closestDistance = thisDistance;
                bestIndex = i;
            }
            if (features[i].location().start() > Math.max(probes[p].end(), features[lastIndex].location().end()))
                break;
        }
        lastIndex = bestIndex;
        for (int d = 0; d < data.length; d++) {
            if (logTransform.isSelected()) {
                data[d].setValueForProbe(probes[p], (float) (Math.log(closestDistance + 1) / log2));
            } else {
                data[d].setValueForProbe(probes[p], closestDistance);
            }
        }
    }
    quantitatonComplete();
}
Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)

Aggregations

Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)37 Vector (java.util.Vector)23 Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)23 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)20 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)13 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)11 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)9 AnnotationSet (uk.ac.babraham.SeqMonk.DataTypes.Genome.AnnotationSet)7 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)7 QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType)5 Hashtable (java.util.Hashtable)4 AbstractTableModel (javax.swing.table.AbstractTableModel)4 TableModel (javax.swing.table.TableModel)4 LongVector (uk.ac.babraham.SeqMonk.Utilities.LongVector)4 BufferedReader (java.io.BufferedReader)3 FileReader (java.io.FileReader)3 Enumeration (java.util.Enumeration)3 HashSet (java.util.HashSet)3 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)3 FileInputStream (java.io.FileInputStream)2