use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class GFF3AnnotationParser method parseAnnotation.
public AnnotationSet[] parseAnnotation(File file, Genome genome, String prefix) throws Exception {
System.err.println("Parsing " + file);
if (prefix == null) {
featurePrefix = JOptionPane.showInputDialog(SeqMonkApplication.getInstance(), "Feature prefix", "GFFv3/GTP Options", JOptionPane.QUESTION_MESSAGE);
} else {
featurePrefix = prefix;
}
if (featurePrefix == null)
featurePrefix = "";
Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
AnnotationSet currentAnnotation = new AnnotationSet(genome, file.getName());
annotationSets.add(currentAnnotation);
Hashtable<String, FeatureGroup> groupedFeatures = new Hashtable<String, FeatureGroup>();
BufferedReader br;
if (file.getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
} else {
br = new BufferedReader(new FileReader(file));
}
String line;
int count = 0;
while ((line = br.readLine()) != null) {
if (cancel) {
progressCancelled();
br.close();
return null;
}
if (count % 1000 == 0) {
progressUpdated("Read " + count + " lines from " + file.getName(), 0, 1);
}
if (count > 1000000 && count % 1000000 == 0) {
progressUpdated("Caching...", 0, 1);
currentAnnotation.finalise();
currentAnnotation = new AnnotationSet(genome, file.getName() + "[" + annotationSets.size() + "]");
annotationSets.add(currentAnnotation);
}
++count;
// Ignore blank lines
if (line.trim().length() == 0)
continue;
// Skip comments
if (line.startsWith("#"))
continue;
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 7) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
continue;
}
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[3]);
end = Integer.parseInt(sections[4]);
// End must always be later than start
if (end < start) {
int temp = start;
start = end;
end = temp;
}
if (sections.length >= 7) {
if (sections[6].equals("+")) {
strand = Location.FORWARD;
} else if (sections[6].equals("-")) {
strand = Location.REVERSE;
} else {
strand = Location.UNKNOWN;
}
} else {
strand = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
continue;
}
ChromosomeWithOffset c;
try {
c = genome.getChromosome(sections[0]);
} catch (IllegalArgumentException e) {
progressWarningReceived(new SeqMonkException("Couldn't find a chromosome called " + sections[0]));
continue;
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
if (sections.length > 8 && sections[8].trim().length() > 0) {
// Should check for escaped colons
String[] attributes = sections[8].split(" *; *");
// Make up a data structure of the attributes we have
Hashtable<String, Vector<String>> keyValuePairs = new Hashtable<String, Vector<String>>();
for (int a = 0; a < attributes.length; a++) {
// Should check for escaped equals
String[] keyValue = attributes[a].split("=", 2);
// See if we didn't get split
if (keyValue.length == 1) {
// This could be a GTF file which uses quoted values in space delimited fields
keyValue = attributes[a].split(" \"");
if (keyValue.length == 2) {
// We need to remove the quote from the end of the value
keyValue[1] = keyValue[1].substring(0, keyValue[1].length() - 1);
// System.out.println("Key='"+keyValue[0]+"' value='"+keyValue[1]+"'");
}
}
if (keyValue.length == 2) {
if (keyValuePairs.containsKey(keyValue[0])) {
keyValuePairs.get(keyValue[0]).add(keyValue[1]);
} else {
Vector<String> newVector = new Vector<String>();
newVector.add(keyValue[1]);
keyValuePairs.put(keyValue[0], newVector);
}
} else {
progressWarningReceived(new SeqMonkException("No key value delimiter in " + attributes[a]));
}
}
if (keyValuePairs.containsKey("Parent") && !sections[2].equals("mRNA")) {
// We change exons to mRNA so we don't end up with spliced exon objects
if (sections[2].equals("exon"))
sections[2] = "mRNA";
String[] parents = keyValuePairs.get("Parent").elementAt(0).split(",");
for (int p = 0; p < parents.length; p++) {
if (!groupedFeatures.containsKey(sections[2] + "_" + parents[p])) {
// Make a new feature to which we can add this
Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
groupedFeatures.put(sections[2] + "_" + parents[p], new FeatureGroup(feature, strand, feature.location()));
Enumeration<String> en = keyValuePairs.keys();
while (en.hasMoreElements()) {
String key = en.nextElement();
String[] values = keyValuePairs.get(key).toArray(new String[0]);
for (int v = 0; v < values.length; v++) {
feature.addAttribute(key, values[v]);
}
}
}
groupedFeatures.get(sections[2] + "_" + parents[p]).addSublocation(new Location(start, end, strand));
}
} else // parent feature
if (keyValuePairs.containsKey("transcript_id")) {
if (sections[2].equals("exon"))
sections[2] = "mRNA";
if (!groupedFeatures.containsKey(sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0))) {
Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
Enumeration<String> en = keyValuePairs.keys();
while (en.hasMoreElements()) {
String key = en.nextElement();
String[] values = keyValuePairs.get(key).toArray(new String[0]);
for (int v = 0; v < values.length; v++) {
feature.addAttribute(key, values[v]);
}
}
groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0), new FeatureGroup(feature, strand, feature.location()));
}
groupedFeatures.get(sections[2] + "_" + keyValuePairs.get("transcript_id").elementAt(0)).addSublocation(new Location(start, end, strand));
} else {
// If we get here we're making a feature with attributes
Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
feature.setLocation(new Location(start, end, strand));
Enumeration<String> en = keyValuePairs.keys();
while (en.hasMoreElements()) {
String key = en.nextElement();
String[] values = keyValuePairs.get(key).toArray(new String[0]);
for (int v = 0; v < values.length; v++) {
feature.addAttribute(key, values[v]);
}
}
if (keyValuePairs.containsKey("ID")) {
// This is a feature which may end up having subfeatures
groupedFeatures.put(sections[2] + "_" + keyValuePairs.get("ID").elementAt(0), new FeatureGroup(feature, strand, feature.location()));
// System.out.println("Making new entry for "+keyValuePairs.get("ID").elementAt(0));
} else {
// We can just add this to the annotation collection
currentAnnotation.addFeature(feature);
}
}
} else {
// No group parameter to worry about
Feature feature = new Feature(featurePrefix + sections[2], c.chromosome().name());
feature.setLocation(new Location(start, end, strand));
currentAnnotation.addFeature(feature);
}
}
br.close();
// Now go through the grouped features adding them to the annotation set
Iterator<FeatureGroup> i = groupedFeatures.values().iterator();
while (i.hasNext()) {
Feature f = i.next().feature();
currentAnnotation.addFeature(f);
}
return annotationSets.toArray(new AnnotationSet[0]);
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BedFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// System.err.println("Started parsing BED files");
int extendBy = prefs.extendReads();
try {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 3) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
try {
// The start is zero indexed so we need to add 1 to get genomic positions
start = Integer.parseInt(sections[1]) + 1;
// The end is zero indexed, but not included in the feature position so
// we need to add one to get genomic coordinates, but subtract one to not
// include the final base.
end = Integer.parseInt(sections[2]);
// End must always be later than start
if (start > end) {
progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
int temp = start;
start = end;
end = temp;
}
if (sections.length >= 6) {
if (sections[5].equals("+")) {
strand = Location.FORWARD;
} else if (sections[5].equals("-")) {
strand = Location.REVERSE;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[5] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.REVERSE) {
start -= extendBy;
} else if (strand == Location.FORWARD) {
end += extendBy;
}
}
} else {
strand = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
continue;
}
try {
ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
// We also don't allow readings which are beyond the end of the chromosome
start = c.position(start);
end = c.position(end);
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
// We can now make the new reading
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c.chromosome(), read);
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class GenericSeqReadParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
try {
// This call just makes sure that the options panel exists if
// it's never been called for before.
getOptionsPanel();
int removeDuplicates = optionsPanel.removeDuplicates();
int extendBy = optionsPanel.extendBy();
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
// First skip the header lines
for (int i = 0; i < startRowValue; i++) {
line = br.readLine();
if (line == null) {
br.close();
throw new Exception("Ran out of file before skipping all of the header lines");
}
}
int maxIndexValue = 0;
if (chrColValue > maxIndexValue)
maxIndexValue = chrColValue;
if (startColValue > maxIndexValue)
maxIndexValue = startColValue;
if (endColValue > maxIndexValue)
maxIndexValue = endColValue;
if (strandColValue > maxIndexValue)
maxIndexValue = strandColValue;
if (countColValue > maxIndexValue)
maxIndexValue = countColValue;
if (optionsPanel.isHiC()) {
int distance = 0;
if (optionsPanel.hiCDistance.getText().length() > 0) {
distance = Integer.parseInt(optionsPanel.hiCDistance.getText());
}
// TODO: Add an option to remove trans hits when importing from the generic parser.
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates, distance, false);
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates);
}
int lineCount = 0;
// Now process the rest of the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split(delimitersValue);
// Check to see if we've got enough data to work with
if (maxIndexValue >= sections.length) {
progressWarningReceived(new SeqMonkException("Not enough data (" + sections.length + ") to get a probe name on line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
int count = 1;
try {
start = Integer.parseInt(sections[startColValue].replaceAll(" ", ""));
end = Integer.parseInt(sections[endColValue].replaceAll(" ", ""));
// End must always be later than start
if (end < start) {
int temp = start;
start = end;
end = temp;
}
if (countColValue != -1 && sections[countColValue].length() > 0) {
try {
count = Integer.parseInt(sections[countColValue].replaceAll(" ", ""));
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Count value " + sections[countColValue] + " was not an integer"));
continue;
}
}
if (useStrand) {
sections[strandColValue] = sections[strandColValue].replaceAll(" ", "");
if (sections[strandColValue].equals("+") || sections[strandColValue].equals("1") || sections[strandColValue].equals("FF") || sections[strandColValue].equals("F")) {
strand = Location.FORWARD;
} else if (sections[strandColValue].equals("-") || sections[strandColValue].equals("-1") || sections[strandColValue].equals("RF") || sections[strandColValue].equals("R")) {
strand = Location.REVERSE;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[strandColValue] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.REVERSE) {
start -= extendBy;
} else {
end += extendBy;
}
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location '" + sections[startColValue] + "'-'" + sections[endColValue] + "' was not an integer"));
continue;
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(sections[chrColValue]);
} catch (IllegalArgumentException sme) {
progressWarningReceived(sme);
continue;
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
if (start < 1) {
progressWarningReceived(new SeqMonkException("Reading start position " + start + " was less than 1"));
continue;
}
// We can now make the new reading
try {
long read = SequenceRead.packPosition(start, end, strand);
for (int i = 0; i < count; i++) {
newData[f].addData(c.chromosome(), read);
}
} catch (SeqMonkException e) {
progressWarningReceived(e);
continue;
}
// System.out.println("Added probe "+newProbe.name()+" on "+newProbe.chromosome()+" at pos "+newProbe.position());
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class MethylKitFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
try {
File[] methylKitFiles = getFiles();
DataSet[] newData = new DataSet[methylKitFiles.length];
for (int f = 0; f < methylKitFiles.length; f++) {
BufferedReader br;
if (methylKitFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(methylKitFiles[f]))));
} else {
br = new BufferedReader(new FileReader(methylKitFiles[f]));
}
String line;
newData[f] = new DataSet(methylKitFiles[f].getName(), methylKitFiles[f].getCanonicalPath(), prefs.removeDuplicates());
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
// In case it has comments
if (line.startsWith("#"))
continue;
// This is the start of the header
if (line.startsWith("chrBase"))
continue;
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + methylKitFiles[f].getName(), f, methylKitFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 6) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
int position;
int totalCount;
int methCount;
int unmethCount;
try {
position = Integer.parseInt(sections[2]);
totalCount = Integer.parseInt(sections[4]);
methCount = Math.round((Float.parseFloat(sections[5]) / 100) * totalCount);
unmethCount = Math.round((Float.parseFloat(sections[6]) / 100) * totalCount);
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Failed to parse position and counts from " + line));
continue;
}
try {
ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[1]);
// We also don't allow readings which are beyond the end of the chromosome
if (position > c.chromosome().length()) {
int overrun = position - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + position + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
// We can now make the new reads
long methRead = SequenceRead.packPosition(position, position, Location.FORWARD);
long unmethRead = SequenceRead.packPosition(position, position, Location.REVERSE);
for (int i = 0; i < methCount; i++) {
newData[f].addData(c.chromosome(), methRead);
}
for (int i = 0; i < unmethCount; i++) {
newData[f].addData(c.chromosome(), unmethRead);
}
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + methylKitFiles[f].getName(), f, methylKitFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BAMFileParser method getPairedEndRead.
/**
* Gets a paired end read. This method assumes that it will only be passed reads which map
* to the reverse strand since these are the ones which contain enough information to
* unambiguously locate both ends of the pair.
*
* @param sections The tab split sections from the SAM file
* @param flag The binary flag field
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getPairedEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int end;
if (!samRecord.getReadNegativeStrandFlag()) {
throw new SeqMonkException("Read passed to parse pair was not on the negative strand");
}
if (samRecord.getMateNegativeStrandFlag()) {
throw new SeqMonkException("Ignored discordantly stranded read pair");
}
end = samRecord.getAlignmentEnd();
start = samRecord.getMateAlignmentStart();
if (start > end) {
throw new SeqMonkException("Ignored discordantly stranded read pair");
}
if (samRecord.getFirstOfPairFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
if ((end - start) + 1 > pairedEndDistance) {
throw new SeqMonkException("Distance between ends " + ((end - start) + 1) + " was larger than cutoff (" + pairedEndDistance + ")");
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
Aggregations