use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.
the class BedFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// System.err.println("Started parsing BED files");
int extendBy = prefs.extendReads();
try {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 3) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
try {
// The start is zero indexed so we need to add 1 to get genomic positions
start = Integer.parseInt(sections[1]) + 1;
// The end is zero indexed, but not included in the feature position so
// we need to add one to get genomic coordinates, but subtract one to not
// include the final base.
end = Integer.parseInt(sections[2]);
// End must always be later than start
if (start > end) {
progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
int temp = start;
start = end;
end = temp;
}
if (sections.length >= 6) {
if (sections[5].equals("+")) {
strand = Location.FORWARD;
} else if (sections[5].equals("-")) {
strand = Location.REVERSE;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[5] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.REVERSE) {
start -= extendBy;
} else if (strand == Location.FORWARD) {
end += extendBy;
}
}
} else {
strand = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
continue;
}
try {
ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
// We also don't allow readings which are beyond the end of the chromosome
start = c.position(start);
end = c.position(end);
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
// We can now make the new reading
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c.chromosome(), read);
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.
the class GenericSeqReadParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
try {
// This call just makes sure that the options panel exists if
// it's never been called for before.
getOptionsPanel();
int removeDuplicates = optionsPanel.removeDuplicates();
int extendBy = optionsPanel.extendBy();
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
// First skip the header lines
for (int i = 0; i < startRowValue; i++) {
line = br.readLine();
if (line == null) {
br.close();
throw new Exception("Ran out of file before skipping all of the header lines");
}
}
int maxIndexValue = 0;
if (chrColValue > maxIndexValue)
maxIndexValue = chrColValue;
if (startColValue > maxIndexValue)
maxIndexValue = startColValue;
if (endColValue > maxIndexValue)
maxIndexValue = endColValue;
if (strandColValue > maxIndexValue)
maxIndexValue = strandColValue;
if (countColValue > maxIndexValue)
maxIndexValue = countColValue;
if (optionsPanel.isHiC()) {
int distance = 0;
if (optionsPanel.hiCDistance.getText().length() > 0) {
distance = Integer.parseInt(optionsPanel.hiCDistance.getText());
}
// TODO: Add an option to remove trans hits when importing from the generic parser.
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates, distance, false);
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), removeDuplicates);
}
int lineCount = 0;
// Now process the rest of the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split(delimitersValue);
// Check to see if we've got enough data to work with
if (maxIndexValue >= sections.length) {
progressWarningReceived(new SeqMonkException("Not enough data (" + sections.length + ") to get a probe name on line '" + line + "'"));
// Skip this line...
continue;
}
int strand;
int start;
int end;
int count = 1;
try {
start = Integer.parseInt(sections[startColValue].replaceAll(" ", ""));
end = Integer.parseInt(sections[endColValue].replaceAll(" ", ""));
// End must always be later than start
if (end < start) {
int temp = start;
start = end;
end = temp;
}
if (countColValue != -1 && sections[countColValue].length() > 0) {
try {
count = Integer.parseInt(sections[countColValue].replaceAll(" ", ""));
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Count value " + sections[countColValue] + " was not an integer"));
continue;
}
}
if (useStrand) {
sections[strandColValue] = sections[strandColValue].replaceAll(" ", "");
if (sections[strandColValue].equals("+") || sections[strandColValue].equals("1") || sections[strandColValue].equals("FF") || sections[strandColValue].equals("F")) {
strand = Location.FORWARD;
} else if (sections[strandColValue].equals("-") || sections[strandColValue].equals("-1") || sections[strandColValue].equals("RF") || sections[strandColValue].equals("R")) {
strand = Location.REVERSE;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[strandColValue] + "' marked as unknown strand"));
strand = Location.UNKNOWN;
}
} else {
strand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (strand == Location.REVERSE) {
start -= extendBy;
} else {
end += extendBy;
}
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location '" + sections[startColValue] + "'-'" + sections[endColValue] + "' was not an integer"));
continue;
}
ChromosomeWithOffset c;
try {
c = dataCollection().genome().getChromosome(sections[chrColValue]);
} catch (IllegalArgumentException sme) {
progressWarningReceived(sme);
continue;
}
start = c.position(start);
end = c.position(end);
// We also don't allow readings which are beyond the end of the chromosome
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
if (start < 1) {
progressWarningReceived(new SeqMonkException("Reading start position " + start + " was less than 1"));
continue;
}
// We can now make the new reading
try {
long read = SequenceRead.packPosition(start, end, strand);
for (int i = 0; i < count; i++) {
newData[f].addData(c.chromosome(), read);
}
} catch (SeqMonkException e) {
progressWarningReceived(e);
continue;
}
// System.out.println("Added probe "+newProbe.name()+" on "+newProbe.chromosome()+" at pos "+newProbe.position());
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.
the class VisibleStoresParser method processHiCDataStore.
private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
int extendBy = prefs.extendReads();
boolean reverse = prefs.reverseReads();
boolean removeStrand = prefs.removeStrandInfo();
PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
// Now process the data
Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
// We make the call to get exportable reads so we don't duplicate reads
// when we export things
HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
for (int c2 = 0; c2 < localChromosomes.length; c2++) {
Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
for (int r = 0; r < sourceReads.length; r++) {
if (cancel) {
progressCancelled();
return null;
}
if (downsample && downsampleProbabilty < 1) {
if (Math.random() > downsampleProbabilty) {
continue;
}
}
if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
// Just add them as they are
newData.addData(chrs[c], sourceReads[r]);
newData.addData(localChromosome, hitReads[r]);
}
Feature[] features = null;
if (filterByFeature) {
features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
Arrays.sort(features);
}
int currentFeaturePostion = 0;
if (filterByFeature) {
// See if we're comparing against the right feature
while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
currentFeaturePostion++;
}
// Test to see if we overlap
if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
if (excludeFeature)
continue;
} else {
if (!excludeFeature)
continue;
}
}
int sourceStart = SequenceRead.start(sourceReads[r]);
int sourceEend = SequenceRead.end(sourceReads[r]);
int sourceStrand = SequenceRead.strand(sourceReads[r]);
int hitStart = SequenceRead.start(sourceReads[r]);
int hitEend = SequenceRead.end(hitReads[r]);
int hitStrand = SequenceRead.strand(hitReads[r]);
if (reverse) {
if (sourceStrand == Location.FORWARD) {
sourceStrand = Location.REVERSE;
} else if (sourceStrand == Location.REVERSE) {
sourceStrand = Location.FORWARD;
}
if (hitStrand == Location.FORWARD) {
hitStrand = Location.REVERSE;
} else if (hitStrand == Location.REVERSE) {
hitStrand = Location.FORWARD;
}
}
if (removeStrand) {
sourceStrand = Location.UNKNOWN;
hitStrand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (sourceStrand == Location.FORWARD) {
sourceEend += extendBy;
} else if (sourceStrand == Location.REVERSE) {
sourceStart -= extendBy;
}
if (hitStrand == Location.FORWARD) {
hitEend += extendBy;
} else if (hitStrand == Location.REVERSE) {
hitStart -= extendBy;
}
}
// We also don't allow readings which are beyond the end of the chromosome
if (sourceEend > chrs[c].length()) {
int overrun = sourceEend - chrs[c].length();
progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
if (hitEend > localChromosome.length()) {
int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
continue;
}
// We can now make the new readings
long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
if (!prefs.isHiC()) {
// HiC additions are deferred until we know the other end is OK too.
newData.addData(chrs[c], sourceRead);
newData.addData(localChromosome, hitRead);
}
}
}
}
return newData;
}
use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.
the class BedPEFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// System.err.println("Started parsing BED files");
int extendBy = prefs.extendReads();
try {
File[] probeFiles = getFiles();
DataSet[] newData = new DataSet[probeFiles.length];
for (int f = 0; f < probeFiles.length; f++) {
BufferedReader br;
if (probeFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(probeFiles[f]))));
} else {
br = new BufferedReader(new FileReader(probeFiles[f]));
}
String line;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(probeFiles[f].getName(), probeFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
// System.err.println(line);
// Thread.sleep(200);
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + probeFiles[f].getName(), f, probeFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 6) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
// move on quickly.
if (sections[0].equals(".") || sections[3].equals("."))
continue;
int strand1;
int start1;
int end1;
int strand2;
int start2;
int end2;
try {
// The start is zero indexed so we need to add 1 to get genomic positions
start1 = Integer.parseInt(sections[1]) + 1;
start2 = Integer.parseInt(sections[4]) + 1;
// The end is zero indexed, but not included in the feature position so
// we need to add one to get genomic coordinates, but subtract one to not
// include the final base.
end1 = Integer.parseInt(sections[2]);
end2 = Integer.parseInt(sections[5]);
// End must always be later than start
if (start1 > end1) {
progressWarningReceived(new SeqMonkException("End position1 " + end1 + " was lower than start position " + start1));
int temp = start1;
start1 = end1;
end1 = temp;
}
if (start2 > end2) {
progressWarningReceived(new SeqMonkException("End position2 " + end2 + " was lower than start position " + start2));
int temp = start2;
start2 = end2;
end2 = temp;
}
if (sections.length >= 10) {
if (sections[8].equals("+")) {
strand1 = Location.FORWARD;
} else if (sections[8].equals("-")) {
strand1 = Location.REVERSE;
} else if (sections[8].equals(".")) {
strand1 = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[8] + "' marked as unknown strand"));
strand1 = Location.UNKNOWN;
}
if (sections[9].equals("+")) {
strand2 = Location.FORWARD;
} else if (sections[9].equals("-")) {
strand2 = Location.REVERSE;
} else if (sections[9].equals(".")) {
strand2 = Location.UNKNOWN;
} else {
progressWarningReceived(new SeqMonkException("Unknown strand character '" + sections[9] + "' marked as unknown strand"));
strand2 = Location.UNKNOWN;
}
// if (extendBy > 0) {
// if (strand==Location.REVERSE) {
// start -=extendBy;
// }
// else if (strand==Location.FORWARD) {
// end+=extendBy;
// }
// }
} else {
strand1 = Location.UNKNOWN;
strand2 = Location.UNKNOWN;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Location " + sections[0] + "-" + sections[1] + " was not an integer"));
continue;
}
try {
ChromosomeWithOffset c1 = dataCollection().genome().getChromosome(sections[0]);
// We also don't allow readings which are beyond the end of the chromosome
start1 = c1.position(start1);
end1 = c1.position(end1);
if (end1 > c1.chromosome().length()) {
int overrun = end1 - c1.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end1 + " was " + overrun + "bp beyond the end of chr" + c1.chromosome().name() + " (" + c1.chromosome().length() + ")"));
continue;
}
ChromosomeWithOffset c2 = dataCollection().genome().getChromosome(sections[3]);
// We also don't allow readings which are beyond the end of the chromosome
start2 = c2.position(start2);
end2 = c2.position(end2);
if (end2 > c2.chromosome().length()) {
int overrun = end2 - c2.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end2 + " was " + overrun + "bp beyond the end of chr" + c2.chromosome().name() + " (" + c2.chromosome().length() + ")"));
continue;
}
// add them. There's nothing clever to do.
if (prefs.isHiC()) {
long read1 = SequenceRead.packPosition(start1, end1, strand1);
newData[f].addData(c1.chromosome(), read1);
long read2 = SequenceRead.packPosition(start2, end2, strand2);
newData[f].addData(c2.chromosome(), read2);
} else {
// If they're on different chromosomes then we kick them out
if (!c1.chromosome().name().equals(c2.chromosome().name())) {
progressWarningReceived(new SeqMonkException("Paried reads were on different chromosomes - discarding"));
continue;
}
if (strand1 == Location.FORWARD && strand2 != Location.REVERSE) {
progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
continue;
}
if (strand1 == Location.REVERSE && strand2 != Location.FORWARD) {
progressWarningReceived(new SeqMonkException("Invalid strand orientation - discarding"));
continue;
}
// If they're too far apart we kick them out
int start = 1;
int end = 0;
// We take the strand from read1
int strand = strand1;
if (strand == Location.FORWARD) {
start = start1;
end = end2;
} else if (strand == Location.REVERSE) {
start = start2;
end = end1;
} else if (strand == Location.UNKNOWN) {
start = Math.min(start1, start2);
end = Math.max(end1, end2);
}
if (end <= start) {
progressWarningReceived(new SeqMonkException("Incorrectly oriented reads - discarding"));
continue;
}
if ((end - start) + 1 > prefs.pairDistanceCutoff()) {
progressWarningReceived(new SeqMonkException("Distance between reads too great (" + (((end - start) + 1) - prefs.pairDistanceCutoff()) + ")"));
continue;
}
long read = SequenceRead.packPosition(start, end, strand);
newData[f].addData(c1.chromosome(), read);
}
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + probeFiles[f].getName(), f, probeFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet in project SeqMonk by s-andrews.
the class SeqMonkReimportParser method parseSamples.
/**
* Parses the list of samples.
*
* @param sections The tab split initial samples line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Samples line didn't contain 2 sections");
}
if (!sections[0].equals("Samples")) {
throw new SeqMonkException("Couldn't find expected samples line");
}
int n = Integer.parseInt(sections[1]);
// We need to keep the Data Sets around to add data to later.
// We're going to start all of these datasets off, but only collect data for some
// of them.
DataSet[] dataSets = new DataSet[n];
for (int i = 0; i < n; i++) {
sections = br.readLine().split("\\t");
// anything else is assumed to be a normal dataset.
if (sections.length == 1) {
dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 2) {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 3) {
if (sections[2].equals("HiC")) {
dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
} else {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
}
}
}
// At this point we need to know which of the datasets we're actually going
// to import into the current project
DataSetSelector selector = new DataSetSelector(dataSets);
boolean[] dataSetsToParse = new boolean[dataSets.length];
int lastDataSetToParse = 0;
for (int i = 0; i < dataSetsToParse.length; i++) {
dataSetsToParse[i] = false;
}
int[] selectedIndices = selector.getSelectedIndices();
for (int i = 0; i < selectedIndices.length; i++) {
dataSetsToParse[selectedIndices[i]] = true;
if (selectedIndices[i] > lastDataSetToParse)
lastDataSetToParse = selectedIndices[i];
}
// Now we can go through the rest of the sets and add data where appropriate
// Immediately after the list of samples comes the lists of reads
String line;
// Iterate through the number of samples
for (int i = 0; i < n; i++) {
if (i > lastDataSetToParse) {
// No sense skipping through a load of data we're not going to import.
break;
}
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
}
// The first line is
line = br.readLine();
sections = line.split("\t");
if (sections.length != 2) {
throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
}
int readCount = Integer.parseInt(sections[0]);
// In versions prior to 7 we encoded everything on every line separately
if (thisDataVersion < 7) {
for (int r = 0; r < readCount; r++) {
if ((r % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
sections = line.split("\t");
Chromosome c;
try {
c = genome.getChromosome(sections[0]).chromosome();
} catch (IllegalArgumentException sme) {
progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
continue;
}
int start = Integer.parseInt(sections[1]);
int end = Integer.parseInt(sections[2]);
int strand = Integer.parseInt(sections[3]);
try {
dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
} else if (dataSets[i] instanceof PairedDataSet) {
// Paired Data sets have a different format, with a packed position for
// the first read, then a chromosome name and packed position for the second
// read.
// From v13 onwards we started entering HiC data pairs in both directions so
// the data would be able to be read back in without sorting it.
boolean hicDataIsPresorted = thisDataVersion >= 13;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
Chromosome c = genome.getChromosome(sections[0]).chromosome();
// progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
/*
* We used to have a split("\t") here, but this turned out to be the bottleneck
* which hugely restricted the speed at which the data could be read. Switching
* for a set of index calls and substring makes this *way* faster.
*/
long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
try {
dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
}
} else {
// In versions after 7 we split the section up into chromosomes
// so we don't put the chromosome on every line, and we put out
// the packed double value rather than the individual start, end
// and strand
// As of version 12 we collapse repeated reads into one line with
// a count after it, so we need to check for this.
// We keep count of reads processed to update the progress listeners
int readsRead = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
if (dataSetsToParse[i]) {
sections = line.split("\\t");
// We don't try to capture this exception since we can't then process
// any of the reads which follow.
Chromosome c = genome.getChromosome(sections[0]).chromosome();
int chrReadCount = Integer.parseInt(sections[1]);
int tabIndexPosition = 0;
for (int r = 0; r < chrReadCount; r++) {
readsRead++;
if ((readsRead % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
long packedPosition;
try {
if (line.indexOf("\t") != -1) {
// We have an index with a count
sections = line.split("\t");
tabIndexPosition = line.indexOf("\t");
packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
// TODO: Does this break the progress bar sometimes?
r += (count - 1);
readsRead += (count - 1);
for (int x = 1; x <= count; x++) {
dataSets[i].addData(c, packedPosition);
}
} else {
packedPosition = Long.parseLong(line);
dataSets[i].addData(c, packedPosition);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
}
}
}
if (dataSetsToParse[i]) {
dataSets[i].finalise();
}
}
// Finally we need to make up an array of just the datasets we actually read
Vector<DataSet> keepers = new Vector<DataSet>();
for (int i = 0; i < dataSets.length; i++) {
if (dataSetsToParse[i]) {
keepers.add(dataSets[i]);
}
}
return keepers.toArray(new DataSet[0]);
}
Aggregations