use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class RNAQCCalcualtor method run.
public void run() {
updateProgress("Getting features", 0, 1);
this.geneFeatures = collection.genome().annotationCollection().getFeaturesForType(geneFeatureName);
if (transcriptFeatureName != null) {
this.transcriptFeatures = collection.genome().annotationCollection().getFeaturesForType(transcriptFeatureName);
}
if (rRNAFeatureName != null) {
this.rRNAFeatures = collection.genome().annotationCollection().getFeaturesForType(rRNAFeatureName);
}
updateProgress("Getting merged genes", 0, 1);
// Get the merged set of gene locations
Feature[] mergedGenes = FeatureMerging.getNonOverlappingLocationsForFeatures(geneFeatures, false);
if (cancel) {
progressCancelled();
return;
}
updateProgress("Getting merged transcripts", 0, 1);
// Get the merged set of transcript features
Feature[] mergedTranscripts = null;
if (transcriptFeatureName != null) {
mergedTranscripts = FeatureMerging.getNonOverlappingLocationsForFeatures(transcriptFeatures, true);
}
if (cancel) {
progressCancelled();
return;
}
// Quantitate the genes.
long[] geneBaseCounts = new long[stores.length];
int[] measuredGenesCounts = new int[stores.length];
for (int s = 0; s < stores.length; s++) {
updateProgress("Quantitating " + stores[s].name() + " over genes", s, stores.length * 4);
String lastChr = "";
Chromosome lastChrObject = null;
for (int f = 0; f < mergedGenes.length; f++) {
if (mergedGenes[f].chromosomeName() != lastChr) {
lastChr = mergedGenes[f].chromosomeName();
lastChrObject = collection.genome().getExactChromsomeNameMatch(lastChr);
}
long[] reads = stores[s].getReadsForProbe(new Probe(lastChrObject, mergedGenes[f].location().packedPosition()));
// See if we measured anything for this gene
if (reads.length > 0) {
++measuredGenesCounts[s];
}
for (int r = 0; r < reads.length; r++) {
if (cancel) {
progressCancelled();
return;
}
// Get the length of the overlap
int overlap = 1 + (Math.min(SequenceRead.end(reads[r]), mergedGenes[f].location().end()) - Math.max(SequenceRead.start(reads[r]), mergedGenes[f].location().start()));
geneBaseCounts[s] += overlap;
}
}
}
// Quantitate the transcripts
long[] transcriptBaseCounts = null;
long[] transcriptSameStrandBaseCounts = null;
long[] transcriptOpposingStrandBaseCounts = null;
if (transcriptFeatureName != null) {
transcriptBaseCounts = new long[stores.length];
transcriptSameStrandBaseCounts = new long[stores.length];
transcriptOpposingStrandBaseCounts = new long[stores.length];
for (int s = 0; s < stores.length; s++) {
updateProgress("Quantitating " + stores[s].name() + " over transcripts", s + stores.length, stores.length * 4);
String lastChr = "";
Chromosome lastChrObject = null;
for (int f = 0; f < mergedTranscripts.length; f++) {
if (mergedTranscripts[f].chromosomeName() != lastChr) {
lastChr = mergedTranscripts[f].chromosomeName();
lastChrObject = collection.genome().getExactChromsomeNameMatch(lastChr);
}
long[] reads = stores[s].getReadsForProbe(new Probe(lastChrObject, mergedTranscripts[f].location().packedPosition()));
for (int r = 0; r < reads.length; r++) {
if (cancel) {
progressCancelled();
return;
}
// Get the length of the overlap
int overlap = 1 + (Math.min(SequenceRead.end(reads[r]), mergedTranscripts[f].location().end()) - Math.max(SequenceRead.start(reads[r]), mergedTranscripts[f].location().start()));
transcriptBaseCounts[s] += overlap;
if (SequenceRead.strand(reads[r]) == mergedTranscripts[f].location().strand()) {
transcriptSameStrandBaseCounts[s] += overlap;
} else {
transcriptOpposingStrandBaseCounts[s] += overlap;
}
}
}
}
}
// Quantitate the rRNA
long[] rRNABaseCounts = null;
if (rRNAFeatureName != null) {
rRNABaseCounts = new long[stores.length];
for (int s = 0; s < stores.length; s++) {
updateProgress("Quantitating " + stores[s].name() + " over rRNAs", s + (stores.length * 2), stores.length * 4);
String lastChr = "";
Chromosome lastChrObject = null;
for (int f = 0; f < rRNAFeatures.length; f++) {
if (rRNAFeatures[f].chromosomeName() != lastChr) {
lastChr = rRNAFeatures[f].chromosomeName();
lastChrObject = collection.genome().getExactChromsomeNameMatch(lastChr);
}
long[] reads = stores[s].getReadsForProbe(new Probe(lastChrObject, rRNAFeatures[f].location().packedPosition()));
for (int r = 0; r < reads.length; r++) {
if (cancel) {
progressCancelled();
return;
}
// Get the length of the overlap
int overlap = 1 + (Math.min(SequenceRead.end(reads[r]), rRNAFeatures[f].location().end()) - Math.max(SequenceRead.start(reads[r]), rRNAFeatures[f].location().start()));
rRNABaseCounts[s] += overlap;
}
}
}
}
// Quantitate the chromosomes
long[][] chromosomeBaseCounts = new long[chromosomes.length][stores.length];
for (int s = 0; s < stores.length; s++) {
for (int c = 0; c < chromosomes.length; c++) {
updateProgress("Quantitating " + stores[s].name() + " over " + chromosomes[c].name(), s + (stores.length * 3), stores.length * 4);
ReadsWithCounts reads = stores[s].getReadsForChromosome(chromosomes[c]);
for (int r = 0; r < reads.reads.length; r++) {
chromosomeBaseCounts[c][s] += SequenceRead.length(reads.reads[r]) * reads.counts[r];
}
}
}
// Finally we make up the data sets we're going to pass back.
RNAQCResult result = new RNAQCResult(stores);
double[] percentInGene = new double[stores.length];
for (int i = 0; i < geneBaseCounts.length; i++) {
percentInGene[i] = (geneBaseCounts[i] / (double) stores[i].getTotalReadLength()) * 100;
if (percentInGene[i] > 100) {
progressWarning("Percent in gene was >100 for " + stores[i]);
percentInGene[i] = 100;
}
}
result.addPercentageSet("Percent in Gene", percentInGene);
double[] percentInTranscript = null;
if (transcriptFeatureName != null) {
percentInTranscript = new double[stores.length];
for (int i = 0; i < geneBaseCounts.length; i++) {
percentInTranscript[i] = (transcriptBaseCounts[i] / (double) geneBaseCounts[i]) * 100;
if (percentInTranscript[i] > 100) {
progressWarning("Percent in exons was >100 for " + stores[i]);
percentInTranscript[i] = 100;
}
}
result.addPercentageSet("Percent in exons", percentInTranscript);
}
double[] percentInrRNA = null;
if (rRNAFeatureName != null) {
percentInrRNA = new double[stores.length];
for (int i = 0; i < rRNABaseCounts.length; i++) {
percentInrRNA[i] = (rRNABaseCounts[i] / (double) stores[i].getTotalReadLength()) * 100;
if (percentInrRNA[i] > 100) {
progressWarning("Percent in rRNA was >100 for " + stores[i]);
percentInrRNA[i] = 100;
}
}
result.addPercentageSet("Percent in rRNA", percentInrRNA);
}
double[] percentageMeasuredGenes = new double[stores.length];
for (int i = 0; i < measuredGenesCounts.length; i++) {
percentageMeasuredGenes[i] = measuredGenesCounts[i] / (double) mergedGenes.length * 100;
}
result.addPercentageSet("Percent Genes Measured", percentageMeasuredGenes);
// Work out the relative coverage
double[] percentageOfMaxCoverage = new double[stores.length];
long maxLength = 0;
for (int i = 0; i < stores.length; i++) {
if (stores[i].getTotalReadLength() > maxLength)
maxLength = stores[i].getTotalReadLength();
}
for (int i = 0; i < stores.length; i++) {
percentageOfMaxCoverage[i] = (stores[i].getTotalReadLength() * 100d) / maxLength;
}
result.addPercentageSet("Percentage of max data size", percentageOfMaxCoverage);
double[][] percentInChromosomes = new double[chromosomes.length][stores.length];
for (int c = 0; c < percentInChromosomes.length; c++) {
for (int i = 0; i < chromosomeBaseCounts[c].length; i++) {
percentInChromosomes[c][i] = (chromosomeBaseCounts[c][i] / (double) stores[i].getTotalReadLength()) * 100;
if (percentInChromosomes[c][i] > 100) {
progressWarning("Percent in " + chromosomes[c] + " was >100 for " + stores[i]);
percentInChromosomes[c][i] = 100;
}
}
}
for (int c = 0; c < percentInChromosomes.length; c++) {
result.addPercentageSet("Percent in " + chromosomes[c].name(), percentInChromosomes[c]);
}
double[] percentOnSenseStrand = null;
if (transcriptFeatureName != null) {
percentOnSenseStrand = new double[stores.length];
for (int i = 0; i < transcriptBaseCounts.length; i++) {
percentOnSenseStrand[i] = (transcriptSameStrandBaseCounts[i] / (double) transcriptBaseCounts[i]) * 100;
if (percentOnSenseStrand[i] > 100) {
progressWarning("Percent on sense strand was >100 for " + stores[i]);
percentOnSenseStrand[i] = 100;
}
}
result.addPercentageSet("Percent on sense strand", percentOnSenseStrand);
}
progressComplete(result);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class GenomeParser method reloadCacheFiles.
private void reloadCacheFiles(SingleGenome genome, File baseLocation) {
Enumeration<ProgressListener> el = listeners.elements();
while (el.hasMoreElements()) {
el.nextElement().progressUpdated("Reloading cache files", 0, 1);
}
CoreAnnotationSet coreAnnotation = new CoreAnnotationSet(genome);
File cacheDir = new File(baseLocation.getAbsoluteFile() + "/cache/");
// First we need to get the list of chromosomes and set those
// up before we go on to add the actual feature sets.
File chrListFile = new File(baseLocation.getAbsoluteFile() + "/cache/chr_list");
try {
BufferedReader br = new BufferedReader(new FileReader(chrListFile));
String line;
while ((line = br.readLine()) != null) {
String[] chrLen = line.split("\\t");
Chromosome c = genome.addChromosome(chrLen[0]);
c.setLength(Integer.parseInt(chrLen[1]));
}
br.close();
} catch (Exception e) {
throw new IllegalStateException(e);
}
File[] cacheFiles = cacheDir.listFiles(new FileFilter() {
public boolean accept(File pathname) {
return pathname.getName().toLowerCase().endsWith(".cache");
}
});
for (int i = 0; i < cacheFiles.length; i++) {
// Update the listeners
String name = cacheFiles[i].getName();
name = name.replaceAll("\\.cache$", "");
String[] chrType = name.split("%", 2);
if (chrType.length != 2) {
throw new IllegalStateException("Cache name '" + name + "' didn't split into chr and type");
}
// If the feature name had a forward slash in it we've replaced it with 3 underscores
chrType[1] = chrType[1].replaceAll("___", "/");
coreAnnotation.addPreCachedFile(chrType[1], chrType[0], cacheFiles[i]);
}
genome.annotationCollection().addAnnotationSets(new AnnotationSet[] { coreAnnotation });
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class GenomeParser method processEMBLFile.
/**
* Process embl file.
*
* @param f the f
* @param annotation the annotation
* @throws Exception the exception
*/
private void processEMBLFile(File f, AnnotationSet annotation, SingleGenome genome) throws Exception {
BufferedReader br = new BufferedReader(new FileReader(f));
Chromosome c = null;
while ((c = parseChromosome(br, genome)) != null) {
String line;
// We can now skip through to the start of the feature table
while ((line = br.readLine()) != null) {
if (line.startsWith("FH") || line.startsWith("SQ")) {
break;
}
}
// We can now start reading the features one at a time by
// concatonating them and then passing them on for processing
StringBuffer currentAttribute = new StringBuffer();
boolean skipping = true;
Feature feature = null;
while ((line = br.readLine()) != null) {
if (line.startsWith("XX") || line.startsWith("SQ") || line.startsWith("//")) {
skipToEntryEnd(br);
break;
}
// Just a blank line.
if (line.length() < 18)
continue;
String type = line.substring(5, 18).trim();
// System.out.println("Type is "+type);
if (type.length() > 0) {
// Check whether we need to process the old feature
if (skipping) {
// We're either on the first feature, or we've
// moving past this one
skipping = false;
} else {
// We need to process the last attribute from the
// old feature
processAttributeReturnSkip(currentAttribute.toString(), feature);
annotation.addFeature(feature);
}
// We can check to see if we're bothering to load this type of feature
if (prefs.loadAnnotation(type)) {
// System.err.println("Creating new feature of type "+type);
feature = new Feature(type, c.name());
currentAttribute = new StringBuffer("location=");
currentAttribute.append(line.substring(21).trim());
continue;
} else {
// System.err.println("Skipping feature of type "+type);
genome.addUnloadedFeatureType(type);
skipping = true;
}
}
if (skipping)
continue;
String data = line.substring(21).trim();
if (data.startsWith("/")) {
// We're at the start of a new attribute
// Process the last attribute
skipping = processAttributeReturnSkip(currentAttribute.toString(), feature);
currentAttribute = new StringBuffer();
}
// before the next lot of text.
if (currentAttribute.indexOf("description=") >= 0)
currentAttribute.append(" ");
currentAttribute.append(data);
}
// if there was one
if (!skipping) {
// We need to process the last attribute from the
// old feature
processAttributeReturnSkip(currentAttribute.toString(), feature);
annotation.addFeature(feature);
}
}
br.close();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class VisibleStoresParser method processHiCDataStore.
private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
int extendBy = prefs.extendReads();
boolean reverse = prefs.reverseReads();
boolean removeStrand = prefs.removeStrandInfo();
PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
// Now process the data
Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
// We make the call to get exportable reads so we don't duplicate reads
// when we export things
HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
for (int c2 = 0; c2 < localChromosomes.length; c2++) {
Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
for (int r = 0; r < sourceReads.length; r++) {
if (cancel) {
progressCancelled();
return null;
}
if (downsample && downsampleProbabilty < 1) {
if (Math.random() > downsampleProbabilty) {
continue;
}
}
if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
// Just add them as they are
newData.addData(chrs[c], sourceReads[r]);
newData.addData(localChromosome, hitReads[r]);
}
Feature[] features = null;
if (filterByFeature) {
features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
Arrays.sort(features);
}
int currentFeaturePostion = 0;
if (filterByFeature) {
// See if we're comparing against the right feature
while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
currentFeaturePostion++;
}
// Test to see if we overlap
if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
if (excludeFeature)
continue;
} else {
if (!excludeFeature)
continue;
}
}
int sourceStart = SequenceRead.start(sourceReads[r]);
int sourceEend = SequenceRead.end(sourceReads[r]);
int sourceStrand = SequenceRead.strand(sourceReads[r]);
int hitStart = SequenceRead.start(sourceReads[r]);
int hitEend = SequenceRead.end(hitReads[r]);
int hitStrand = SequenceRead.strand(hitReads[r]);
if (reverse) {
if (sourceStrand == Location.FORWARD) {
sourceStrand = Location.REVERSE;
} else if (sourceStrand == Location.REVERSE) {
sourceStrand = Location.FORWARD;
}
if (hitStrand == Location.FORWARD) {
hitStrand = Location.REVERSE;
} else if (hitStrand == Location.REVERSE) {
hitStrand = Location.FORWARD;
}
}
if (removeStrand) {
sourceStrand = Location.UNKNOWN;
hitStrand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (sourceStrand == Location.FORWARD) {
sourceEend += extendBy;
} else if (sourceStrand == Location.REVERSE) {
sourceStart -= extendBy;
}
if (hitStrand == Location.FORWARD) {
hitEend += extendBy;
} else if (hitStrand == Location.REVERSE) {
hitStart -= extendBy;
}
}
// We also don't allow readings which are beyond the end of the chromosome
if (sourceEend > chrs[c].length()) {
int overrun = sourceEend - chrs[c].length();
progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
if (hitEend > localChromosome.length()) {
int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
continue;
}
// We can now make the new readings
long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
if (!prefs.isHiC()) {
// HiC additions are deferred until we know the other end is OK too.
newData.addData(chrs[c], sourceRead);
newData.addData(localChromosome, hitRead);
}
}
}
}
return newData;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.
the class SeqMonkReimportParser method parseSamples.
/**
* Parses the list of samples.
*
* @param sections The tab split initial samples line
* @throws SeqMonkException
* @throws IOException Signals that an I/O exception has occurred.
*/
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
if (sections.length != 2) {
throw new SeqMonkException("Samples line didn't contain 2 sections");
}
if (!sections[0].equals("Samples")) {
throw new SeqMonkException("Couldn't find expected samples line");
}
int n = Integer.parseInt(sections[1]);
// We need to keep the Data Sets around to add data to later.
// We're going to start all of these datasets off, but only collect data for some
// of them.
DataSet[] dataSets = new DataSet[n];
for (int i = 0; i < n; i++) {
sections = br.readLine().split("\\t");
// anything else is assumed to be a normal dataset.
if (sections.length == 1) {
dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 2) {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
} else if (sections.length == 3) {
if (sections[2].equals("HiC")) {
dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
} else {
dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
}
}
}
// At this point we need to know which of the datasets we're actually going
// to import into the current project
DataSetSelector selector = new DataSetSelector(dataSets);
boolean[] dataSetsToParse = new boolean[dataSets.length];
int lastDataSetToParse = 0;
for (int i = 0; i < dataSetsToParse.length; i++) {
dataSetsToParse[i] = false;
}
int[] selectedIndices = selector.getSelectedIndices();
for (int i = 0; i < selectedIndices.length; i++) {
dataSetsToParse[selectedIndices[i]] = true;
if (selectedIndices[i] > lastDataSetToParse)
lastDataSetToParse = selectedIndices[i];
}
// Now we can go through the rest of the sets and add data where appropriate
// Immediately after the list of samples comes the lists of reads
String line;
// Iterate through the number of samples
for (int i = 0; i < n; i++) {
if (i > lastDataSetToParse) {
// No sense skipping through a load of data we're not going to import.
break;
}
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
}
// The first line is
line = br.readLine();
sections = line.split("\t");
if (sections.length != 2) {
throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
}
int readCount = Integer.parseInt(sections[0]);
// In versions prior to 7 we encoded everything on every line separately
if (thisDataVersion < 7) {
for (int r = 0; r < readCount; r++) {
if ((r % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
sections = line.split("\t");
Chromosome c;
try {
c = genome.getChromosome(sections[0]).chromosome();
} catch (IllegalArgumentException sme) {
progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
continue;
}
int start = Integer.parseInt(sections[1]);
int end = Integer.parseInt(sections[2]);
int strand = Integer.parseInt(sections[3]);
try {
dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
} else if (dataSets[i] instanceof PairedDataSet) {
// Paired Data sets have a different format, with a packed position for
// the first read, then a chromosome name and packed position for the second
// read.
// From v13 onwards we started entering HiC data pairs in both directions so
// the data would be able to be read back in without sorting it.
boolean hicDataIsPresorted = thisDataVersion >= 13;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
if (dataSetsToParse[i]) {
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
sections = line.split("\\t");
Chromosome c = genome.getChromosome(sections[0]).chromosome();
// progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
int chrReadCount = Integer.parseInt(sections[1]);
for (int r = 0; r < chrReadCount; r++) {
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
/*
* We used to have a split("\t") here, but this turned out to be the bottleneck
* which hugely restricted the speed at which the data could be read. Switching
* for a set of index calls and substring makes this *way* faster.
*/
long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
try {
dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
continue;
}
}
}
}
} else {
// In versions after 7 we split the section up into chromosomes
// so we don't put the chromosome on every line, and we put out
// the packed double value rather than the individual start, end
// and strand
// As of version 12 we collapse repeated reads into one line with
// a count after it, so we need to check for this.
// We keep count of reads processed to update the progress listeners
int readsRead = 0;
while (true) {
// The first line should be the chromosome and a number of reads
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
// A blank line indicates the end of the sample
if (line.length() == 0)
break;
if (dataSetsToParse[i]) {
sections = line.split("\\t");
// We don't try to capture this exception since we can't then process
// any of the reads which follow.
Chromosome c = genome.getChromosome(sections[0]).chromosome();
int chrReadCount = Integer.parseInt(sections[1]);
int tabIndexPosition = 0;
for (int r = 0; r < chrReadCount; r++) {
readsRead++;
if ((readsRead % (1 + (readCount / 10))) == 0) {
if (dataSetsToParse[i]) {
progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
} else {
progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
}
}
line = br.readLine();
if (line == null) {
throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
}
long packedPosition;
try {
if (line.indexOf("\t") != -1) {
// We have an index with a count
sections = line.split("\t");
tabIndexPosition = line.indexOf("\t");
packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
// TODO: Does this break the progress bar sometimes?
r += (count - 1);
readsRead += (count - 1);
for (int x = 1; x <= count; x++) {
dataSets[i].addData(c, packedPosition);
}
} else {
packedPosition = Long.parseLong(line);
dataSets[i].addData(c, packedPosition);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
}
}
}
if (dataSetsToParse[i]) {
dataSets[i].finalise();
}
}
// Finally we need to make up an array of just the datasets we actually read
Vector<DataSet> keepers = new Vector<DataSet>();
for (int i = 0; i < dataSets.length; i++) {
if (dataSetsToParse[i]) {
keepers.add(dataSets[i]);
}
}
return keepers.toArray(new DataSet[0]);
}
Aggregations