use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.
the class GenomeParser method parseGenomeFiles.
private void parseGenomeFiles(SingleGenome genome, File baseLocation) {
// which defines the size and extent of the chromosomes
try {
parseChrListFile(genome, baseLocation);
} catch (Exception ex) {
Enumeration<ProgressListener> en = listeners.elements();
while (en.hasMoreElements()) {
en.nextElement().progressExceptionReceived(ex);
}
return;
}
// We need a list of all of the .dat files inside the baseLocation
File[] files = baseLocation.listFiles(new FileFilter() {
public boolean accept(File f) {
if (f.getName().toLowerCase().endsWith(".dat")) {
return true;
} else {
return false;
}
}
});
AnnotationSet coreAnnotation = new CoreAnnotationSet(genome);
for (int i = 0; i < files.length; i++) {
// Update the listeners
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressUpdated("Loading Genome File " + files[i].getName(), i, files.length);
}
try {
processEMBLFile(files[i], coreAnnotation, genome);
} catch (Exception ex) {
Enumeration<ProgressListener> en = listeners.elements();
while (en.hasMoreElements()) {
en.nextElement().progressExceptionReceived(ex);
}
return;
}
}
// Update the listeners
Enumeration<ProgressListener> e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressUpdated("Caching annotation data", 1, 1);
}
// Now do the same thing for gff files.
// We need a list of all of the .gff/gtf files inside the baseLocation
files = baseLocation.listFiles(new FileFilter() {
public boolean accept(File f) {
if (f.getName().toLowerCase().endsWith(".gff") || f.getName().toLowerCase().endsWith(".gtf") || f.getName().toLowerCase().endsWith(".gff3") || f.getName().toLowerCase().endsWith(".gff.gz") || f.getName().toLowerCase().endsWith(".gtf.gz") || f.getName().toLowerCase().endsWith(".gff3.gz")) {
return true;
} else {
return false;
}
}
});
GFF3AnnotationParser gffParser = new GFF3AnnotationParser(genome);
for (int i = 0; i < files.length; i++) {
// System.err.println("Parsing "+files[i]);
// Update the listeners
e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressUpdated("Loading Genome File " + files[i].getName(), i, files.length);
}
try {
AnnotationSet[] newSets = gffParser.parseAnnotation(files[i], genome, "");
for (int s = 0; s < newSets.length; s++) {
Feature[] features = newSets[s].getAllFeatures();
for (int f = 0; f < features.length; f++) {
coreAnnotation.addFeature(features[f]);
}
}
} catch (Exception ex) {
Enumeration<ProgressListener> en = listeners.elements();
while (en.hasMoreElements()) {
en.nextElement().progressExceptionReceived(ex);
}
return;
}
}
// Update the listeners
e = listeners.elements();
while (e.hasMoreElements()) {
e.nextElement().progressUpdated("Caching annotation data", 1, 1);
}
genome.annotationCollection().addAnnotationSets(new AnnotationSet[] { coreAnnotation });
// Debugging - put out some stats
// System.err.println("Made genome with "+genome.getAllChromosomes().length+" chromosomes");
// System.err.println("There are "+genome.annotationCollection().listAvailableFeatureTypes().length+" different feature types");
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.
the class GenomeParser method processEMBLFile.
/**
* Process embl file.
*
* @param f the f
* @param annotation the annotation
* @throws Exception the exception
*/
private void processEMBLFile(File f, AnnotationSet annotation, SingleGenome genome) throws Exception {
BufferedReader br = new BufferedReader(new FileReader(f));
Chromosome c = null;
while ((c = parseChromosome(br, genome)) != null) {
String line;
// We can now skip through to the start of the feature table
while ((line = br.readLine()) != null) {
if (line.startsWith("FH") || line.startsWith("SQ")) {
break;
}
}
// We can now start reading the features one at a time by
// concatonating them and then passing them on for processing
StringBuffer currentAttribute = new StringBuffer();
boolean skipping = true;
Feature feature = null;
while ((line = br.readLine()) != null) {
if (line.startsWith("XX") || line.startsWith("SQ") || line.startsWith("//")) {
skipToEntryEnd(br);
break;
}
// Just a blank line.
if (line.length() < 18)
continue;
String type = line.substring(5, 18).trim();
// System.out.println("Type is "+type);
if (type.length() > 0) {
// Check whether we need to process the old feature
if (skipping) {
// We're either on the first feature, or we've
// moving past this one
skipping = false;
} else {
// We need to process the last attribute from the
// old feature
processAttributeReturnSkip(currentAttribute.toString(), feature);
annotation.addFeature(feature);
}
// We can check to see if we're bothering to load this type of feature
if (prefs.loadAnnotation(type)) {
// System.err.println("Creating new feature of type "+type);
feature = new Feature(type, c.name());
currentAttribute = new StringBuffer("location=");
currentAttribute.append(line.substring(21).trim());
continue;
} else {
// System.err.println("Skipping feature of type "+type);
genome.addUnloadedFeatureType(type);
skipping = true;
}
}
if (skipping)
continue;
String data = line.substring(21).trim();
if (data.startsWith("/")) {
// We're at the start of a new attribute
// Process the last attribute
skipping = processAttributeReturnSkip(currentAttribute.toString(), feature);
currentAttribute = new StringBuffer();
}
// before the next lot of text.
if (currentAttribute.indexOf("description=") >= 0)
currentAttribute.append(" ");
currentAttribute.append(data);
}
// if there was one
if (!skipping) {
// We need to process the last attribute from the
// old feature
processAttributeReturnSkip(currentAttribute.toString(), feature);
annotation.addFeature(feature);
}
}
br.close();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.
the class VisibleStoresParser method processHiCDataStore.
private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
int extendBy = prefs.extendReads();
boolean reverse = prefs.reverseReads();
boolean removeStrand = prefs.removeStrandInfo();
PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
// Now process the data
Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
// We make the call to get exportable reads so we don't duplicate reads
// when we export things
HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
for (int c2 = 0; c2 < localChromosomes.length; c2++) {
Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
for (int r = 0; r < sourceReads.length; r++) {
if (cancel) {
progressCancelled();
return null;
}
if (downsample && downsampleProbabilty < 1) {
if (Math.random() > downsampleProbabilty) {
continue;
}
}
if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
// Just add them as they are
newData.addData(chrs[c], sourceReads[r]);
newData.addData(localChromosome, hitReads[r]);
}
Feature[] features = null;
if (filterByFeature) {
features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
Arrays.sort(features);
}
int currentFeaturePostion = 0;
if (filterByFeature) {
// See if we're comparing against the right feature
while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
currentFeaturePostion++;
}
// Test to see if we overlap
if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
if (excludeFeature)
continue;
} else {
if (!excludeFeature)
continue;
}
}
int sourceStart = SequenceRead.start(sourceReads[r]);
int sourceEend = SequenceRead.end(sourceReads[r]);
int sourceStrand = SequenceRead.strand(sourceReads[r]);
int hitStart = SequenceRead.start(sourceReads[r]);
int hitEend = SequenceRead.end(hitReads[r]);
int hitStrand = SequenceRead.strand(hitReads[r]);
if (reverse) {
if (sourceStrand == Location.FORWARD) {
sourceStrand = Location.REVERSE;
} else if (sourceStrand == Location.REVERSE) {
sourceStrand = Location.FORWARD;
}
if (hitStrand == Location.FORWARD) {
hitStrand = Location.REVERSE;
} else if (hitStrand == Location.REVERSE) {
hitStrand = Location.FORWARD;
}
}
if (removeStrand) {
sourceStrand = Location.UNKNOWN;
hitStrand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (sourceStrand == Location.FORWARD) {
sourceEend += extendBy;
} else if (sourceStrand == Location.REVERSE) {
sourceStart -= extendBy;
}
if (hitStrand == Location.FORWARD) {
hitEend += extendBy;
} else if (hitStrand == Location.REVERSE) {
hitStart -= extendBy;
}
}
// We also don't allow readings which are beyond the end of the chromosome
if (sourceEend > chrs[c].length()) {
int overrun = sourceEend - chrs[c].length();
progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
if (hitEend > localChromosome.length()) {
int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
continue;
}
// We can now make the new readings
long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
if (!prefs.isHiC()) {
// HiC additions are deferred until we know the other end is OK too.
newData.addData(chrs[c], sourceRead);
newData.addData(localChromosome, hitRead);
}
}
}
}
return newData;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.
the class ProbeListAnnotationParser method parseAnnotation.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.AnnotationParsers.AnnotationParser#parseAnnotation(java.io.File, uk.ac.babraham.SeqMonk.DataTypes.Genome.Genome)
*/
protected AnnotationSet[] parseAnnotation(File file, Genome genome) throws Exception {
Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
AnnotationSet currentAnnotation = new AnnotationSet(genome, probeList.name());
annotationSets.add(currentAnnotation);
Probe[] probes = probeList.getAllProbes();
for (int p = 0; p < probes.length; p++) {
if (p % 1 + (probes.length / 100) == 0) {
progressUpdated("Converted " + p + " probes", p, probes.length);
}
if (p > 1000000 && p % 1000000 == 0) {
progressUpdated("Caching...", 0, 1);
currentAnnotation.finalise();
currentAnnotation = new AnnotationSet(genome, probeList.name() + "[" + annotationSets.size() + "]");
annotationSets.add(currentAnnotation);
}
Feature feature = new Feature(featureType, probes[p].chromosome().name());
if (probes[p].hasDefinedName()) {
feature.addAttribute("name", probes[p].name());
}
feature.setLocation(new Location(probes[p].start(), probes[p].end(), probes[p].strand()));
currentAnnotation.addFeature(feature);
}
return annotationSets.toArray(new AnnotationSet[0]);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature in project SeqMonk by s-andrews.
the class DistanceToFeatureQuantitation method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
Probe[] probes = application.dataCollection().probeSet().getAllProbes();
Feature[] features = null;
Chromosome lastChromsome = null;
int lastIndex = 0;
for (int p = 0; p < probes.length; p++) {
// See if we need to quit
if (cancel) {
progressCancelled();
return;
}
// See if we're on the same chromosome as last time
if (lastChromsome == null || probes[p].chromosome() != lastChromsome) {
lastChromsome = probes[p].chromosome();
features = application.dataCollection().genome().annotationCollection().getFeaturesForType(lastChromsome, selectedFeature);
System.err.println("Found " + features.length + " features of type " + selectedFeature + " on chr " + lastChromsome);
lastIndex = 0;
}
int closestDistance = lastChromsome.length();
int bestIndex = lastIndex;
for (int i = lastIndex; i >= 0 && i < features.length; i--) {
int thisDistance = getDistanceToFeature(probes[p], features[i]);
if (thisDistance < closestDistance) {
closestDistance = thisDistance;
bestIndex = i;
}
if (features[i].location().end() < probes[p].start())
break;
}
// Now we go forward until we hit the end or we're after the end of the last best feature
for (int i = lastIndex + 1; i < features.length; i++) {
int thisDistance = getDistanceToFeature(probes[p], features[i]);
if (thisDistance < closestDistance) {
closestDistance = thisDistance;
bestIndex = i;
}
if (features[i].location().start() > Math.max(probes[p].end(), features[lastIndex].location().end()))
break;
}
lastIndex = bestIndex;
for (int d = 0; d < data.length; d++) {
if (logTransform.isSelected()) {
data[d].setValueForProbe(probes[p], (float) (Math.log(closestDistance + 1) / log2));
} else {
data[d].setValueForProbe(probes[p], closestDistance);
}
}
}
quantitatonComplete();
}
Aggregations