Examples with Chromosome - uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome

Example 6 with Chromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

the class RNAQCCalcualtor method run.

public void run() {
    updateProgress("Getting features", 0, 1);
    this.geneFeatures = collection.genome().annotationCollection().getFeaturesForType(geneFeatureName);
    if (transcriptFeatureName != null) {
        this.transcriptFeatures = collection.genome().annotationCollection().getFeaturesForType(transcriptFeatureName);
    }
    if (rRNAFeatureName != null) {
        this.rRNAFeatures = collection.genome().annotationCollection().getFeaturesForType(rRNAFeatureName);
    }
    updateProgress("Getting merged genes", 0, 1);
    // Get the merged set of gene locations
    Feature[] mergedGenes = FeatureMerging.getNonOverlappingLocationsForFeatures(geneFeatures, false);
    if (cancel) {
        progressCancelled();
        return;
    }
    updateProgress("Getting merged transcripts", 0, 1);
    // Get the merged set of transcript features
    Feature[] mergedTranscripts = null;
    if (transcriptFeatureName != null) {
        mergedTranscripts = FeatureMerging.getNonOverlappingLocationsForFeatures(transcriptFeatures, true);
    }
    if (cancel) {
        progressCancelled();
        return;
    }
    // Quantitate the genes.
    long[] geneBaseCounts = new long[stores.length];
    int[] measuredGenesCounts = new int[stores.length];
    for (int s = 0; s < stores.length; s++) {
        updateProgress("Quantitating " + stores[s].name() + " over genes", s, stores.length * 4);
        String lastChr = "";
        Chromosome lastChrObject = null;
        for (int f = 0; f < mergedGenes.length; f++) {
            if (mergedGenes[f].chromosomeName() != lastChr) {
                lastChr = mergedGenes[f].chromosomeName();
                lastChrObject = collection.genome().getExactChromsomeNameMatch(lastChr);
            }
            long[] reads = stores[s].getReadsForProbe(new Probe(lastChrObject, mergedGenes[f].location().packedPosition()));
            // See if we measured anything for this gene
            if (reads.length > 0) {
                ++measuredGenesCounts[s];
            }
            for (int r = 0; r < reads.length; r++) {
                if (cancel) {
                    progressCancelled();
                    return;
                }
                // Get the length of the overlap
                int overlap = 1 + (Math.min(SequenceRead.end(reads[r]), mergedGenes[f].location().end()) - Math.max(SequenceRead.start(reads[r]), mergedGenes[f].location().start()));
                geneBaseCounts[s] += overlap;
            }
        }
    }
    // Quantitate the transcripts
    long[] transcriptBaseCounts = null;
    long[] transcriptSameStrandBaseCounts = null;
    long[] transcriptOpposingStrandBaseCounts = null;
    if (transcriptFeatureName != null) {
        transcriptBaseCounts = new long[stores.length];
        transcriptSameStrandBaseCounts = new long[stores.length];
        transcriptOpposingStrandBaseCounts = new long[stores.length];
        for (int s = 0; s < stores.length; s++) {
            updateProgress("Quantitating " + stores[s].name() + " over transcripts", s + stores.length, stores.length * 4);
            String lastChr = "";
            Chromosome lastChrObject = null;
            for (int f = 0; f < mergedTranscripts.length; f++) {
                if (mergedTranscripts[f].chromosomeName() != lastChr) {
                    lastChr = mergedTranscripts[f].chromosomeName();
                    lastChrObject = collection.genome().getExactChromsomeNameMatch(lastChr);
                }
                long[] reads = stores[s].getReadsForProbe(new Probe(lastChrObject, mergedTranscripts[f].location().packedPosition()));
                for (int r = 0; r < reads.length; r++) {
                    if (cancel) {
                        progressCancelled();
                        return;
                    }
                    // Get the length of the overlap
                    int overlap = 1 + (Math.min(SequenceRead.end(reads[r]), mergedTranscripts[f].location().end()) - Math.max(SequenceRead.start(reads[r]), mergedTranscripts[f].location().start()));
                    transcriptBaseCounts[s] += overlap;
                    if (SequenceRead.strand(reads[r]) == mergedTranscripts[f].location().strand()) {
                        transcriptSameStrandBaseCounts[s] += overlap;
                    } else {
                        transcriptOpposingStrandBaseCounts[s] += overlap;
                    }
                }
            }
        }
    }
    // Quantitate the rRNA
    long[] rRNABaseCounts = null;
    if (rRNAFeatureName != null) {
        rRNABaseCounts = new long[stores.length];
        for (int s = 0; s < stores.length; s++) {
            updateProgress("Quantitating " + stores[s].name() + " over rRNAs", s + (stores.length * 2), stores.length * 4);
            String lastChr = "";
            Chromosome lastChrObject = null;
            for (int f = 0; f < rRNAFeatures.length; f++) {
                if (rRNAFeatures[f].chromosomeName() != lastChr) {
                    lastChr = rRNAFeatures[f].chromosomeName();
                    lastChrObject = collection.genome().getExactChromsomeNameMatch(lastChr);
                }
                long[] reads = stores[s].getReadsForProbe(new Probe(lastChrObject, rRNAFeatures[f].location().packedPosition()));
                for (int r = 0; r < reads.length; r++) {
                    if (cancel) {
                        progressCancelled();
                        return;
                    }
                    // Get the length of the overlap
                    int overlap = 1 + (Math.min(SequenceRead.end(reads[r]), rRNAFeatures[f].location().end()) - Math.max(SequenceRead.start(reads[r]), rRNAFeatures[f].location().start()));
                    rRNABaseCounts[s] += overlap;
                }
            }
        }
    }
    // Quantitate the chromosomes
    long[][] chromosomeBaseCounts = new long[chromosomes.length][stores.length];
    for (int s = 0; s < stores.length; s++) {
        for (int c = 0; c < chromosomes.length; c++) {
            updateProgress("Quantitating " + stores[s].name() + " over " + chromosomes[c].name(), s + (stores.length * 3), stores.length * 4);
            ReadsWithCounts reads = stores[s].getReadsForChromosome(chromosomes[c]);
            for (int r = 0; r < reads.reads.length; r++) {
                chromosomeBaseCounts[c][s] += SequenceRead.length(reads.reads[r]) * reads.counts[r];
            }
        }
    }
    // Finally we make up the data sets we're going to pass back.
    RNAQCResult result = new RNAQCResult(stores);
    double[] percentInGene = new double[stores.length];
    for (int i = 0; i < geneBaseCounts.length; i++) {
        percentInGene[i] = (geneBaseCounts[i] / (double) stores[i].getTotalReadLength()) * 100;
        if (percentInGene[i] > 100) {
            progressWarning("Percent in gene was >100 for " + stores[i]);
            percentInGene[i] = 100;
        }
    }
    result.addPercentageSet("Percent in Gene", percentInGene);
    double[] percentInTranscript = null;
    if (transcriptFeatureName != null) {
        percentInTranscript = new double[stores.length];
        for (int i = 0; i < geneBaseCounts.length; i++) {
            percentInTranscript[i] = (transcriptBaseCounts[i] / (double) geneBaseCounts[i]) * 100;
            if (percentInTranscript[i] > 100) {
                progressWarning("Percent in exons was >100 for " + stores[i]);
                percentInTranscript[i] = 100;
            }
        }
        result.addPercentageSet("Percent in exons", percentInTranscript);
    }
    double[] percentInrRNA = null;
    if (rRNAFeatureName != null) {
        percentInrRNA = new double[stores.length];
        for (int i = 0; i < rRNABaseCounts.length; i++) {
            percentInrRNA[i] = (rRNABaseCounts[i] / (double) stores[i].getTotalReadLength()) * 100;
            if (percentInrRNA[i] > 100) {
                progressWarning("Percent in rRNA was >100 for " + stores[i]);
                percentInrRNA[i] = 100;
            }
        }
        result.addPercentageSet("Percent in rRNA", percentInrRNA);
    }
    double[] percentageMeasuredGenes = new double[stores.length];
    for (int i = 0; i < measuredGenesCounts.length; i++) {
        percentageMeasuredGenes[i] = measuredGenesCounts[i] / (double) mergedGenes.length * 100;
    }
    result.addPercentageSet("Percent Genes Measured", percentageMeasuredGenes);
    // Work out the relative coverage
    double[] percentageOfMaxCoverage = new double[stores.length];
    long maxLength = 0;
    for (int i = 0; i < stores.length; i++) {
        if (stores[i].getTotalReadLength() > maxLength)
            maxLength = stores[i].getTotalReadLength();
    }
    for (int i = 0; i < stores.length; i++) {
        percentageOfMaxCoverage[i] = (stores[i].getTotalReadLength() * 100d) / maxLength;
    }
    result.addPercentageSet("Percentage of max data size", percentageOfMaxCoverage);
    double[][] percentInChromosomes = new double[chromosomes.length][stores.length];
    for (int c = 0; c < percentInChromosomes.length; c++) {
        for (int i = 0; i < chromosomeBaseCounts[c].length; i++) {
            percentInChromosomes[c][i] = (chromosomeBaseCounts[c][i] / (double) stores[i].getTotalReadLength()) * 100;
            if (percentInChromosomes[c][i] > 100) {
                progressWarning("Percent in " + chromosomes[c] + " was >100 for " + stores[i]);
                percentInChromosomes[c][i] = 100;
            }
        }
    }
    for (int c = 0; c < percentInChromosomes.length; c++) {
        result.addPercentageSet("Percent in " + chromosomes[c].name(), percentInChromosomes[c]);
    }
    double[] percentOnSenseStrand = null;
    if (transcriptFeatureName != null) {
        percentOnSenseStrand = new double[stores.length];
        for (int i = 0; i < transcriptBaseCounts.length; i++) {
            percentOnSenseStrand[i] = (transcriptSameStrandBaseCounts[i] / (double) transcriptBaseCounts[i]) * 100;
            if (percentOnSenseStrand[i] > 100) {
                progressWarning("Percent on sense strand was >100 for " + stores[i]);
                percentOnSenseStrand[i] = 100;
            }
        }
        result.addPercentageSet("Percent on sense strand", percentOnSenseStrand);
    }
    progressComplete(result);
}

Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) ReadsWithCounts(uk.ac.babraham.SeqMonk.DataTypes.Sequence.ReadsWithCounts) Probe(uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)

Example 7 with Chromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

the class GenomeParser method reloadCacheFiles.

private void reloadCacheFiles(SingleGenome genome, File baseLocation) {
    Enumeration<ProgressListener> el = listeners.elements();
    while (el.hasMoreElements()) {
        el.nextElement().progressUpdated("Reloading cache files", 0, 1);
    }
    CoreAnnotationSet coreAnnotation = new CoreAnnotationSet(genome);
    File cacheDir = new File(baseLocation.getAbsoluteFile() + "/cache/");
    // First we need to get the list of chromosomes and set those
    // up before we go on to add the actual feature sets.
    File chrListFile = new File(baseLocation.getAbsoluteFile() + "/cache/chr_list");
    try {
        BufferedReader br = new BufferedReader(new FileReader(chrListFile));
        String line;
        while ((line = br.readLine()) != null) {
            String[] chrLen = line.split("\\t");
            Chromosome c = genome.addChromosome(chrLen[0]);
            c.setLength(Integer.parseInt(chrLen[1]));
        }
        br.close();
    } catch (Exception e) {
        throw new IllegalStateException(e);
    }
    File[] cacheFiles = cacheDir.listFiles(new FileFilter() {

        public boolean accept(File pathname) {
            return pathname.getName().toLowerCase().endsWith(".cache");
        }
    });
    for (int i = 0; i < cacheFiles.length; i++) {
        // Update the listeners
        String name = cacheFiles[i].getName();
        name = name.replaceAll("\\.cache$", "");
        String[] chrType = name.split("%", 2);
        if (chrType.length != 2) {
            throw new IllegalStateException("Cache name '" + name + "' didn't split into chr and type");
        }
        // If the feature name had a forward slash in it we've replaced it with 3 underscores
        chrType[1] = chrType[1].replaceAll("___", "/");
        coreAnnotation.addPreCachedFile(chrType[1], chrType[0], cacheFiles[i]);
    }
    genome.annotationCollection().addAnnotationSets(new AnnotationSet[] { coreAnnotation });
}

Also used : Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) IOException(java.io.IOException) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) CoreAnnotationSet(uk.ac.babraham.SeqMonk.DataTypes.Genome.CoreAnnotationSet) ProgressListener(uk.ac.babraham.SeqMonk.DataTypes.ProgressListener) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) FileFilter(java.io.FileFilter) File(java.io.File)

Example 8 with Chromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

the class GenomeParser method processEMBLFile.

/**
 * Process embl file.
 *
 * @param f the f
 * @param annotation the annotation
 * @throws Exception the exception
 */
private void processEMBLFile(File f, AnnotationSet annotation, SingleGenome genome) throws Exception {
    BufferedReader br = new BufferedReader(new FileReader(f));
    Chromosome c = null;
    while ((c = parseChromosome(br, genome)) != null) {
        String line;
        // We can now skip through to the start of the feature table
        while ((line = br.readLine()) != null) {
            if (line.startsWith("FH") || line.startsWith("SQ")) {
                break;
            }
        }
        // We can now start reading the features one at a time by
        // concatonating them and then passing them on for processing
        StringBuffer currentAttribute = new StringBuffer();
        boolean skipping = true;
        Feature feature = null;
        while ((line = br.readLine()) != null) {
            if (line.startsWith("XX") || line.startsWith("SQ") || line.startsWith("//")) {
                skipToEntryEnd(br);
                break;
            }
            // Just a blank line.
            if (line.length() < 18)
                continue;
            String type = line.substring(5, 18).trim();
            // System.out.println("Type is "+type);
            if (type.length() > 0) {
                // Check whether we need to process the old feature
                if (skipping) {
                    // We're either on the first feature, or we've
                    // moving past this one
                    skipping = false;
                } else {
                    // We need to process the last attribute from the
                    // old feature
                    processAttributeReturnSkip(currentAttribute.toString(), feature);
                    annotation.addFeature(feature);
                }
                // We can check to see if we're bothering to load this type of feature
                if (prefs.loadAnnotation(type)) {
                    // System.err.println("Creating new feature of type "+type);
                    feature = new Feature(type, c.name());
                    currentAttribute = new StringBuffer("location=");
                    currentAttribute.append(line.substring(21).trim());
                    continue;
                } else {
                    // System.err.println("Skipping feature of type "+type);
                    genome.addUnloadedFeatureType(type);
                    skipping = true;
                }
            }
            if (skipping)
                continue;
            String data = line.substring(21).trim();
            if (data.startsWith("/")) {
                // We're at the start of a new attribute
                // Process the last attribute
                skipping = processAttributeReturnSkip(currentAttribute.toString(), feature);
                currentAttribute = new StringBuffer();
            }
            // before the next lot of text.
            if (currentAttribute.indexOf("description=") >= 0)
                currentAttribute.append(" ");
            currentAttribute.append(data);
        }
        // if there was one
        if (!skipping) {
            // We need to process the last attribute from the
            // old feature
            processAttributeReturnSkip(currentAttribute.toString(), feature);
            annotation.addFeature(feature);
        }
    }
    br.close();
}

Also used : BufferedReader(java.io.BufferedReader) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) FileReader(java.io.FileReader) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)

Example 9 with Chromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

the class VisibleStoresParser method processHiCDataStore.

private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
    int extendBy = prefs.extendReads();
    boolean reverse = prefs.reverseReads();
    boolean removeStrand = prefs.removeStrandInfo();
    PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
    // Now process the data
    Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
    for (int c = 0; c < chrs.length; c++) {
        progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
        // We make the call to get exportable reads so we don't duplicate reads
        // when we export things
        HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
        String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
        for (int c2 = 0; c2 < localChromosomes.length; c2++) {
            Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
            long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
            long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
            for (int r = 0; r < sourceReads.length; r++) {
                if (cancel) {
                    progressCancelled();
                    return null;
                }
                if (downsample && downsampleProbabilty < 1) {
                    if (Math.random() > downsampleProbabilty) {
                        continue;
                    }
                }
                if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
                    // Just add them as they are
                    newData.addData(chrs[c], sourceReads[r]);
                    newData.addData(localChromosome, hitReads[r]);
                }
                Feature[] features = null;
                if (filterByFeature) {
                    features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
                    Arrays.sort(features);
                }
                int currentFeaturePostion = 0;
                if (filterByFeature) {
                    // See if we're comparing against the right feature
                    while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
                        currentFeaturePostion++;
                    }
                    // Test to see if we overlap
                    if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
                        if (excludeFeature)
                            continue;
                    } else {
                        if (!excludeFeature)
                            continue;
                    }
                }
                int sourceStart = SequenceRead.start(sourceReads[r]);
                int sourceEend = SequenceRead.end(sourceReads[r]);
                int sourceStrand = SequenceRead.strand(sourceReads[r]);
                int hitStart = SequenceRead.start(sourceReads[r]);
                int hitEend = SequenceRead.end(hitReads[r]);
                int hitStrand = SequenceRead.strand(hitReads[r]);
                if (reverse) {
                    if (sourceStrand == Location.FORWARD) {
                        sourceStrand = Location.REVERSE;
                    } else if (sourceStrand == Location.REVERSE) {
                        sourceStrand = Location.FORWARD;
                    }
                    if (hitStrand == Location.FORWARD) {
                        hitStrand = Location.REVERSE;
                    } else if (hitStrand == Location.REVERSE) {
                        hitStrand = Location.FORWARD;
                    }
                }
                if (removeStrand) {
                    sourceStrand = Location.UNKNOWN;
                    hitStrand = Location.UNKNOWN;
                }
                if (extendBy > 0) {
                    if (sourceStrand == Location.FORWARD) {
                        sourceEend += extendBy;
                    } else if (sourceStrand == Location.REVERSE) {
                        sourceStart -= extendBy;
                    }
                    if (hitStrand == Location.FORWARD) {
                        hitEend += extendBy;
                    } else if (hitStrand == Location.REVERSE) {
                        hitStart -= extendBy;
                    }
                }
                // We also don't allow readings which are beyond the end of the chromosome
                if (sourceEend > chrs[c].length()) {
                    int overrun = sourceEend - chrs[c].length();
                    progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
                    continue;
                }
                if (hitEend > localChromosome.length()) {
                    int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
                    progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
                    continue;
                }
                // We can now make the new readings
                long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
                long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
                if (!prefs.isHiC()) {
                    // HiC additions are deferred until we know the other end is OK too.
                    newData.addData(chrs[c], sourceRead);
                    newData.addData(localChromosome, hitRead);
                }
            }
        }
    }
    return newData;
}

Also used : HiCHitCollection(uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) HiCDataStore(uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore) Feature(uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException)

Example 10 with Chromosome

use of uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome in project SeqMonk by s-andrews.

the class SeqMonkReimportParser method parseSamples.

/**
 * Parses the list of samples.
 *
 * @param sections The tab split initial samples line
 * @throws SeqMonkException
 * @throws IOException Signals that an I/O exception has occurred.
 */
private DataSet[] parseSamples(String[] sections) throws SeqMonkException, IOException {
    if (sections.length != 2) {
        throw new SeqMonkException("Samples line didn't contain 2 sections");
    }
    if (!sections[0].equals("Samples")) {
        throw new SeqMonkException("Couldn't find expected samples line");
    }
    int n = Integer.parseInt(sections[1]);
    // We need to keep the Data Sets around to add data to later.
    // We're going to start all of these datasets off, but only collect data for some
    // of them.
    DataSet[] dataSets = new DataSet[n];
    for (int i = 0; i < n; i++) {
        sections = br.readLine().split("\\t");
        // anything else is assumed to be a normal dataset.
        if (sections.length == 1) {
            dataSets[i] = new DataSet(sections[0], "Not known", DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 2) {
            dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
        } else if (sections.length == 3) {
            if (sections[2].equals("HiC")) {
                dataSets[i] = new PairedDataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO, 0, false);
            } else {
                dataSets[i] = new DataSet(sections[0], sections[1], DataSet.DUPLICATES_REMOVE_NO);
            }
        }
    }
    // At this point we need to know which of the datasets we're actually going
    // to import into the current project
    DataSetSelector selector = new DataSetSelector(dataSets);
    boolean[] dataSetsToParse = new boolean[dataSets.length];
    int lastDataSetToParse = 0;
    for (int i = 0; i < dataSetsToParse.length; i++) {
        dataSetsToParse[i] = false;
    }
    int[] selectedIndices = selector.getSelectedIndices();
    for (int i = 0; i < selectedIndices.length; i++) {
        dataSetsToParse[selectedIndices[i]] = true;
        if (selectedIndices[i] > lastDataSetToParse)
            lastDataSetToParse = selectedIndices[i];
    }
    // Now we can go through the rest of the sets and add data where appropriate
    // Immediately after the list of samples comes the lists of reads
    String line;
    // Iterate through the number of samples
    for (int i = 0; i < n; i++) {
        if (i > lastDataSetToParse) {
            // No sense skipping through a load of data we're not going to import.
            break;
        }
        if (dataSetsToParse[i]) {
            progressUpdated("Reading data for " + dataSets[i].name(), i * 10, n * 10);
        } else {
            progressUpdated("Skipping " + dataSets[i].name(), i * 10, n * 10);
        }
        // The first line is
        line = br.readLine();
        sections = line.split("\t");
        if (sections.length != 2) {
            throw new SeqMonkException("Read line " + i + " didn't contain 2 sections");
        }
        int readCount = Integer.parseInt(sections[0]);
        // In versions prior to 7 we encoded everything on every line separately
        if (thisDataVersion < 7) {
            for (int r = 0; r < readCount; r++) {
                if ((r % (1 + (readCount / 10))) == 0) {
                    if (dataSetsToParse[i]) {
                        progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    } else {
                        progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (r / (1 + (readCount / 10))), n * 10);
                    }
                }
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                if (dataSetsToParse[i]) {
                    sections = line.split("\t");
                    Chromosome c;
                    try {
                        c = genome.getChromosome(sections[0]).chromosome();
                    } catch (IllegalArgumentException sme) {
                        progressWarningReceived(new SeqMonkException("Read from sample " + i + " could not be mapped to chromosome '" + sections[0] + "'"));
                        continue;
                    }
                    int start = Integer.parseInt(sections[1]);
                    int end = Integer.parseInt(sections[2]);
                    int strand = Integer.parseInt(sections[3]);
                    try {
                        dataSets[i].addData(c, SequenceRead.packPosition(start, end, strand));
                    } catch (SeqMonkException ex) {
                        progressWarningReceived(ex);
                        continue;
                    }
                }
            }
        } else if (dataSets[i] instanceof PairedDataSet) {
            // Paired Data sets have a different format, with a packed position for
            // the first read, then a chromosome name and packed position for the second
            // read.
            // From v13 onwards we started entering HiC data pairs in both directions so
            // the data would be able to be read back in without sorting it.
            boolean hicDataIsPresorted = thisDataVersion >= 13;
            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                if (dataSetsToParse[i]) {
                    // A blank line indicates the end of the sample
                    if (line.length() == 0)
                        break;
                    sections = line.split("\\t");
                    Chromosome c = genome.getChromosome(sections[0]).chromosome();
                    // progressUpdated("Reading data for "+dataSets[i].name(),i*application.dataCollection().genome().getChromosomeCount()+seenChromosomeCount,n*application.dataCollection().genome().getChromosomeCount());
                    int chrReadCount = Integer.parseInt(sections[1]);
                    for (int r = 0; r < chrReadCount; r++) {
                        line = br.readLine();
                        if (line == null) {
                            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                        }
                        /*
							 * We used to have a split("\t") here, but this turned out to be the bottleneck
							 * which hugely restricted the speed at which the data could be read.  Switching
							 * for a set of index calls and substring makes this *way* faster.
							 */
                        long packedPosition = Long.parseLong(line.substring(0, line.indexOf('\t')));
                        Chromosome c2 = genome.getChromosome(line.substring(line.indexOf('\t'), line.lastIndexOf(('\t')))).chromosome();
                        long packedPosition2 = Long.parseLong(line.substring(line.lastIndexOf('\t') + 1, line.length()));
                        try {
                            dataSets[i].addData(c, packedPosition, hicDataIsPresorted);
                            dataSets[i].addData(c2, packedPosition2, hicDataIsPresorted);
                        } catch (SeqMonkException ex) {
                            progressWarningReceived(ex);
                            continue;
                        }
                    }
                }
            }
        } else {
            // In versions after 7 we split the section up into chromosomes
            // so we don't put the chromosome on every line, and we put out
            // the packed double value rather than the individual start, end
            // and strand
            // As of version 12 we collapse repeated reads into one line with
            // a count after it, so we need to check for this.
            // We keep count of reads processed to update the progress listeners
            int readsRead = 0;
            while (true) {
                // The first line should be the chromosome and a number of reads
                line = br.readLine();
                if (line == null) {
                    throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                }
                // A blank line indicates the end of the sample
                if (line.length() == 0)
                    break;
                if (dataSetsToParse[i]) {
                    sections = line.split("\\t");
                    // We don't try to capture this exception since we can't then process
                    // any of the reads which follow.
                    Chromosome c = genome.getChromosome(sections[0]).chromosome();
                    int chrReadCount = Integer.parseInt(sections[1]);
                    int tabIndexPosition = 0;
                    for (int r = 0; r < chrReadCount; r++) {
                        readsRead++;
                        if ((readsRead % (1 + (readCount / 10))) == 0) {
                            if (dataSetsToParse[i]) {
                                progressUpdated("Reading data for " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                            } else {
                                progressUpdated("Skipping " + dataSets[i].name(), i * 10 + (readsRead / (1 + (readCount / 10))), n * 10);
                            }
                        }
                        line = br.readLine();
                        if (line == null) {
                            throw new SeqMonkException("Ran out of data whilst parsing reads for sample " + i);
                        }
                        long packedPosition;
                        try {
                            if (line.indexOf("\t") != -1) {
                                // We have an index with a count
                                sections = line.split("\t");
                                tabIndexPosition = line.indexOf("\t");
                                packedPosition = Long.parseLong(line.substring(0, tabIndexPosition));
                                int count = Integer.parseInt(line.substring(tabIndexPosition + 1));
                                // TODO: Does this break the progress bar sometimes?
                                r += (count - 1);
                                readsRead += (count - 1);
                                for (int x = 1; x <= count; x++) {
                                    dataSets[i].addData(c, packedPosition);
                                }
                            } else {
                                packedPosition = Long.parseLong(line);
                                dataSets[i].addData(c, packedPosition);
                            }
                        } catch (SeqMonkException ex) {
                            progressWarningReceived(ex);
                        }
                    }
                }
            }
        }
        if (dataSetsToParse[i]) {
            dataSets[i].finalise();
        }
    }
    // Finally we need to make up an array of just the datasets we actually read
    Vector<DataSet> keepers = new Vector<DataSet>();
    for (int i = 0; i < dataSets.length; i++) {
        if (dataSetsToParse[i]) {
            keepers.add(dataSets[i]);
        }
    }
    return keepers.toArray(new DataSet[0]);
}

Also used : DataSet(uk.ac.babraham.SeqMonk.DataTypes.DataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) PairedDataSet(uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet) Chromosome(uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome) SeqMonkException(uk.ac.babraham.SeqMonk.SeqMonkException) Vector(java.util.Vector)

Aggregations

Chromosome (uk.ac.babraham.SeqMonk.DataTypes.Genome.Chromosome)78 Probe (uk.ac.babraham.SeqMonk.DataTypes.Probes.Probe)47 Vector (java.util.Vector)36 Feature (uk.ac.babraham.SeqMonk.DataTypes.Genome.Feature)23 SeqMonkException (uk.ac.babraham.SeqMonk.SeqMonkException)23 ProbeSet (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeSet)22 ProbeList (uk.ac.babraham.SeqMonk.DataTypes.Probes.ProbeList)12 DataStore (uk.ac.babraham.SeqMonk.DataTypes.DataStore)11 DataSet (uk.ac.babraham.SeqMonk.DataTypes.DataSet)8 ReadsWithCounts (uk.ac.babraham.SeqMonk.DataTypes.Sequence.ReadsWithCounts)8 Location (uk.ac.babraham.SeqMonk.DataTypes.Genome.Location)7 SplitLocation (uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation)7 ProgressListener (uk.ac.babraham.SeqMonk.DataTypes.ProgressListener)7 HiCHitCollection (uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection)7 IOException (java.io.IOException)6 File (java.io.File)5 Hashtable (java.util.Hashtable)5 HiCDataStore (uk.ac.babraham.SeqMonk.DataTypes.HiCDataStore)5 QuantitationStrandType (uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType)5 PairedDataSet (uk.ac.babraham.SeqMonk.DataTypes.PairedDataSet)4