use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BedPEFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// System.err.println("Started parsing BED files");
int extendBy = prefs.extendReads();
try {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
// System.err.println(line);
// Thread.sleep(200);
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 6) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
// move on quickly.
if (sections[0].equals(".") || sections[3].equals("."))
continue;
int strand1;
int start1;
int end1;
int strand2;
int start2;
int end2;
try {
// The start is zero indexed so we need to add 1 to get genomic positions
start1 = Integer.parseInt(sections[1]) + 1;
start2 = Integer.parseInt(sections[4]) + 1;
// The end is zero indexed, but not included in the feature position so
// we need to add one to get genomic coordinates, but subtract one to not
// include the final base.
end1 = Integer.parseInt(sections[2]);
end2 = Integer.parseInt(sections[5]);
// End must always be later than start
if (start1 > end1) {
progressWarningReceived(new SeqMonkException("End position1 " + end1 + " was lower than start position " + start1));
int temp = start1;
start1 = end1;
end1 = temp;
}
if (start2 > end2) {
progressWarningReceived(new SeqMonkException("End position2 " + end2 + " was lower than start position " + start2));
int temp = start2;
start2 = end2;
end2 = temp;
}
if (sections.length >= 10) {
if (sections[8].equals("+")) {
strand1 = Location.FORWARD;
} else if (sections[8].equals("-")) {
strand1 = Location.REVERSE;
} else if (sections[8].equals(".")) {
strand1 = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[8] + "' marked as unknown strand"));
strand1 = Location.UNKNOWN;
}
if (sections[9].equals("+")) {
strand2 = Location.FORWARD;
} else if (sections[9].equals("-")) {
strand2 = Location.REVERSE;
} else if (sections[9].equals(".")) {
strand2 = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[9] + "' marked as unknown strand"));
strand2 = Location.UNKNOWN;
}
// if (extendBy > 0) {
// if (strand==Location.REVERSE) {
// start -=extendBy;
// }
// else if (strand==Location.FORWARD) {
// end+=extendBy;
// }
// }
} else {
strand1 = Location.UNKNOWN;
strand2 = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
continue;
}
try {
ChromosomeWithOffset c1 = dataCollection().genome().getChromosome(sections[0]);
// We also don't allow readings which are beyond the end of the chromosome
start1 = c1.position(start1);
end1 = c1.position(end1);
if (end1 > c1.chromosome().length()) {
int overrun = end1 - c1.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end1 + " was " + overrun + "bp beyond the end of chr" + c1.chromosome().name() + " (" + c1.chromosome().length() + ")"));
continue;
}
ChromosomeWithOffset c2 = dataCollection().genome().getChromosome(sections[3]);
// We also don't allow readings which are beyond the end of the chromosome
start2 = c2.position(start2);
end2 = c2.position(end2);
if (end2 > c2.chromosome().length()) {
int overrun = end2 - c2.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end2 + " was " + overrun + "bp beyond the end of chr" + c2.chromosome().name() + " (" + c2.chromosome().length() + ")"));
continue;
}
// add them. There's nothing clever to do.
if (prefs.isHiC()) {
long read1 = SequenceRead.packPosition(start1, end1, strand1);
newData[f].addData(c1.chromosome(), read1);
long read2 = SequenceRead.packPosition(start2, end2, strand2);
newData[f].addData(c2.chromosome(), read2);
} else {
// If they're on different chromosomes then we kick them out
if (!c1.chromosome().name().equals(c2.chromosome().name())) {
progressWarningReceived(new SeqMonkException("Paried reads were on different chromosomes - discarding"));
continue;
}
if (strand1 == Location.FORWARD && strand2 != Location.REVERSE) {
progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
continue;
}
if (strand1 == Location.REVERSE && strand2 != Location.FORWARD) {
progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
continue;
}
// If they're too far apart we kick them out
int start = 1;
int end = 0;
// We take the strand from read1
int strand = strand1;
if (strand == Location.FORWARD) {
start = start1;
end = end2;
} else if (strand == Location.REVERSE) {
start = start2;
end = end1;
} else if (strand == Location.UNKNOWN) {
start = Math.min(start1, start2);
end = Math.max(end1, end2);
}
if (end <= start) {
progressWarningReceived(new SeqMonkException("Incorrectly oriented reads - discarding"));
continue;
}
if ((end - start) + 1 > prefs.pairDistanceCutoff()) {
progressWarningReceived(new SeqMonkException("Distance between reads too great (" + (((end - start) + 1) - prefs.pairDistanceCutoff()) + ")"));
continue;
}
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c1.chromosome(), read);
}
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BowtieFileParser method getSingleEndRead.
/**
* Gets a single single end read.
*
* @param sections The set of tab-split strings from the bowtie output
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome getSingleEndRead(String[] sections) throws SeqMonkException {
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[3]) + 1;
end = start + (sections[4].length() - 1);
if (sections[1].equals("+")) {
strand = Location.FORWARD;
} else if (sections[1].equals("-")) {
strand = Location.REVERSE;
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
} catch (NumberFormatException e) {
throw new SeqMonkException("Location " + sections[3] + " was not an integer");
}
ChromosomeWithOffset c;
try {
c = collection.genome().getChromosome(sections[2]);
} catch (Exception iae) {
throw new SeqMonkException(iae.getLocalizedMessage());
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
throw new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
// We can now make the new reading
SequenceReadWithChromosome read = new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand));
return read;
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class GffFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
int extendBy = prefs.extendReads();
try {
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[3]);
end = Integer.parseInt(sections[4]);
// End must always be later than start
if (start > end) {
int temp = start;
start = end;
end = temp;
}
if (sections.length >= 7) {
if (sections[6].equals("+")) {
strand = Location.FORWARD;
} else if (sections[6].equals("-")) {
strand = Location.REVERSE;
} else if (sections[6].equals(".")) {
strand = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[6] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.FORWARD) {
end += extendBy;
} else if (strand == Location.REVERSE) {
start -= extendBy;
}
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[3] + "-" + sections[4] + " was not an integer"));
continue;
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(sections[0]);
} catch (IllegalArgumentException sme) {
progressWarningReceived(sme);
continue;
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
// We can now make the new reading
try {
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c.chromosome(), read);
} catch (SeqMonkException e) {
progressWarningReceived(e);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
}
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class GenericAnnotationParser method parseAnnotation.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.AnnotationParsers.AnnotationParser#parseAnnotation(java.io.File, uk.ac.babraham.SeqMonk.DataTypes.Genome.Genome)
*/
public AnnotationSet[] parseAnnotation(File file, Genome genome) throws Exception {
this.file = file;
if (options == null) {
options = new JDialog(SeqMonkApplication.getInstance());
options.setModal(true);
options.setContentPane(new GenericAnnotationParserOptions(options));
options.setSize(700, 400);
options.setLocationRelativeTo(null);
}
// We have to set cancel to true as a default so we don't try to
// proceed with processing if the user closes the options using
// the X on the window.
options.setTitle("Format for " + file.getName() + "...");
cancel = true;
options.setVisible(true);
if (cancel) {
progressCancelled();
return null;
}
// We keep track of how many types have been added
// to catch if someone sets the wrong field and makes
// loads of different feature types.
HashSet<String> addedTypes = new HashSet<String>();
BufferedReader br;
if (file.getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(file))));
} else {
br = new BufferedReader(new FileReader(file));
}
String line;
// First skip the header lines
for (int i = 0; i < startRowValue; i++) {
line = br.readLine();
if (line == null) {
throw new Exception("Ran out of file before skipping all of the header lines");
}
}
int maxIndexValue = 0;
if (chrColValue > maxIndexValue)
maxIndexValue = chrColValue;
if (startColValue > maxIndexValue)
maxIndexValue = startColValue;
if (endColValue > maxIndexValue)
maxIndexValue = endColValue;
if (strandColValue > maxIndexValue)
maxIndexValue = strandColValue;
if (typeColValue > maxIndexValue)
maxIndexValue = typeColValue;
if (nameColValue > maxIndexValue)
maxIndexValue = nameColValue;
if (descriptionColValue > maxIndexValue)
maxIndexValue = descriptionColValue;
Vector<AnnotationSet> annotationSets = new Vector<AnnotationSet>();
AnnotationSet currentAnnotation = new AnnotationSet(genome, file.getName());
annotationSets.add(currentAnnotation);
int lineCount = 0;
// Now process the rest of the file
while ((line = br.readLine()) != null) {
++lineCount;
if (cancel) {
progressCancelled();
return null;
}
if (lineCount % 1000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + file.getName(), 0, 1);
}
if (lineCount > 1000000 && lineCount % 1000000 == 0) {
progressUpdated("Caching...", 0, 1);
currentAnnotation.finalise();
currentAnnotation = new AnnotationSet(genome, file.getName() + "[" + annotationSets.size() + "]");
annotationSets.add(currentAnnotation);
}
String[] sections = line.split(delimitersValue, -1);
// Check to see if we've got enough data to work with
if (maxIndexValue >= sections.length) {
progressWarningReceived(new SeqMonkException("Not enough data (" + sections.length + ") to get a probe name on line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
try {
start = Integer.parseInt(sections[startColValue]);
end = Integer.parseInt(sections[endColValue]);
// End must always be later than start
if (end < start) {
int temp = start;
start = end;
end = temp;
}
if (useStrand) {
if (sections[strandColValue].equals("+") || sections[strandColValue].equals("1") || sections[strandColValue].equals("FF") || sections[strandColValue].equals("F")) {
strand = Location.FORWARD;
} else if (sections[strandColValue].equals("-") || sections[strandColValue].equals("-1") || sections[strandColValue].equals("RF") || sections[strandColValue].equals("R")) {
strand = Location.REVERSE;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[strandColValue] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
} else {
strand = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[startColValue] + "-" + sections[endColValue] + " was not an integer"));
continue;
}
ChromosomeWithOffset c;
try {
c = genome.getChromosome(sections[chrColValue]);
} catch (IllegalArgumentException sme) {
progressWarningReceived(sme);
continue;
}
end = c.position(end);
start = c.position(start);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
if (start < 1) {
progressWarningReceived(new SeqMonkException("Reading start position " + start + " was less than 1"));
continue;
}
// Now we can add the type, name and description
String type;
// If there's a column containing the type then use that
if (typeColValue >= 0) {
type = sections[typeColValue];
} else // If not then use the manually specified type if they set it
if (featureType != null && featureType.length() > 0) {
type = featureType;
} else // If all else fails use the file name as the type
{
type = file.getName();
}
if (!addedTypes.contains(type)) {
addedTypes.add(type);
if (addedTypes.size() > 100) {
throw new SeqMonkException("More than 100 different types of feature added - you don't want to do this!");
}
}
String name = null;
if (nameColValue >= 0) {
name = sections[nameColValue];
}
String description = null;
if (descriptionColValue >= 0) {
description = sections[descriptionColValue];
}
// We can now make the new annotation
Feature feature = new Feature(type, c.chromosome().name());
if (name != null) {
feature.addAttribute("name", name);
}
if (description != null)
feature.addAttribute("description", description);
feature.setLocation(new Location(start, end, strand));
currentAnnotation.addFeature(feature);
// System.out.println("Added probe "+newProbe.name()+" on "+newProbe.chromosome()+" at pos "+newProbe.position());
}
// We're finished with the file.
br.close();
options.dispose();
options = null;
return annotationSets.toArray(new AnnotationSet[0]);
}
use of uk.ac.babraham.SeqMonk.Utilities.ChromosomeWithOffset in project SeqMonk by s-andrews.
the class BAMFileParser method getSplitSingleEndRead.
/**
* Gets a split single end read. The only reason for asking about whether the
* import is single or paired end is that if we're doing a paired end import
* then we reverse the strand for all second read data. For single end import
* we leave the strand alone whichever read it originates from.
*
* @param samRecord The picard record entry for this read
* @param flag pairedEnd Is this a paired end import
* @return The read which was read
* @throws SeqMonkException
*/
private SequenceReadWithChromosome[] getSplitSingleEndRead(SAMRecord samRecord) throws SeqMonkException {
int strand;
int start;
int lastEnd = -1;
start = samRecord.getAlignmentStart();
if (samRecord.getReadNegativeStrandFlag()) {
if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
strand = Location.FORWARD;
} else {
strand = Location.REVERSE;
}
} else {
if (samRecord.getReadPairedFlag() && samRecord.getSecondOfPairFlag()) {
strand = Location.REVERSE;
} else {
strand = Location.FORWARD;
}
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(samRecord.getReferenceName());
} catch (Exception e) {
throw new SeqMonkException(e.getLocalizedMessage());
}
start = c.position(start);
if (start < 1) {
throw new SeqMonkException("Reading position " + start + " was before the start of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
if (!samRecord.getCigarString().contains("N")) {
if (!importIntrons) {
int end = c.position(samRecord.getAlignmentEnd());
return new SequenceReadWithChromosome[] { new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, end, strand)) };
}
return new SequenceReadWithChromosome[0];
}
String[] cigarOperations = samRecord.getCigarString().split("\\d+");
String[] cigarNumbers = samRecord.getCigarString().split("[MIDNSHP]");
if (cigarOperations.length != cigarNumbers.length + 1) {
throw new SeqMonkException("Couldn't parse CIGAR string " + samRecord.getCigarString() + " counts were " + cigarOperations.length + " vs " + cigarNumbers.length);
}
Vector<SequenceReadWithChromosome> newReads = new Vector<SequenceReadWithChromosome>();
int currentPosition = start;
for (int pos = 0; pos < cigarNumbers.length; pos++) {
if (cigarOperations[pos + 1].equals("M")) {
currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("I")) {
currentPosition += Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("D")) {
currentPosition -= Integer.parseInt(cigarNumbers[pos]) - 1;
} else if (cigarOperations[pos + 1].equals("N")) {
if (importIntrons) {
if (lastEnd > 0) {
if (start > c.chromosome().length()) {
int overrun = (start - 1) - c.chromosome().length();
throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
}
// Update the lastEnd whether we added a read or not since this
// will be the start of the next intron
lastEnd = currentPosition;
} else {
// We also don't allow readings which are beyond the end of the chromosome
if (currentPosition > c.chromosome().length()) {
int overrun = currentPosition - c.chromosome().length();
throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
}
currentPosition += Integer.parseInt(cigarNumbers[pos]) + 1;
start = currentPosition;
}
}
if (importIntrons) {
if (lastEnd > 0) {
if (start > c.chromosome().length()) {
int overrun = (start - 1) - c.chromosome().length();
throw new SeqMonkException("Reading position " + (start - 1) + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(lastEnd + 1, start - 1, strand)));
}
} else {
// We also don't allow readings which are beyond the end of the chromosome
if (currentPosition > c.chromosome().length()) {
int overrun = currentPosition - c.chromosome().length();
throw new SeqMonkException("Reading position " + currentPosition + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")");
}
newReads.add(new SequenceReadWithChromosome(c.chromosome(), SequenceRead.packPosition(start, currentPosition, strand)));
}
return newReads.toArray(new SequenceReadWithChromosome[0]);
}
Aggregations