use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.
the class CodonBiasPanel method run.
public void run() {
// First we need to know the length of the feature we'll be analysing. This isn't the
// full length in the genome but the sum length of the exons. We'll also make up an
// array of offsets so that we can convert genomic positions into positions within the
// feature easily.
Location[] subLocations;
if (feature.location() instanceof SplitLocation) {
subLocations = ((SplitLocation) (feature.location())).subLocations();
} else {
subLocations = new Location[] { feature.location() };
}
System.err.println("Working with " + feature.name());
System.err.println("There are " + subLocations.length + " sublocations");
// First work out the total transcript length so we can make an appropriate data structure
int totalLength = 0;
for (int e = 0; e < subLocations.length; e++) {
totalLength += subLocations[e].length();
}
System.err.println("Total exon length is " + totalLength);
int[] abundance = new int[totalLength];
// Now work out the exon boundary positions within the feature
// We can also work out a mapping between relative genomic position and
// feature position.
int[] exonBoundaries = new int[subLocations.length];
int[] genomeToFeatureMap = new int[1 + feature.location().end() - feature.location().start()];
for (int j = 0; j < genomeToFeatureMap.length; j++) {
genomeToFeatureMap[j] = -1;
}
System.err.println("Genome to feature map length is " + genomeToFeatureMap.length);
if (feature.location().strand() == Location.FORWARD) {
System.err.println("Feature is forward strand");
int length = 0;
int positionInFeature = 0;
for (int i = 0; i < subLocations.length; i++) {
System.err.println("Looking at sublocation " + i + " from " + subLocations[i].start() + " to " + subLocations[i].end());
exonBoundaries[i] = length;
System.err.println("Added exon boundary at " + exonBoundaries[i]);
length += subLocations[i].length();
for (int x = 0; x < subLocations[i].length(); x++) {
int genomePostion = subLocations[i].start() + x;
int relativeGenomePosition = genomePostion - feature.location().start();
System.err.println("Sublocation Pos=" + x + " Genome Pos=" + genomePostion + " Rel Genome Pos=" + relativeGenomePosition + " Feature pos=" + positionInFeature);
genomeToFeatureMap[relativeGenomePosition] = positionInFeature;
positionInFeature++;
}
}
} else if (feature.location().strand() == Location.REVERSE) {
int length = 0;
int positionInFeature = 0;
for (int i = subLocations.length - 1; i >= 0; i--) {
exonBoundaries[i] = length;
length += subLocations[i].length();
for (int x = 0; x < subLocations[i].length(); x++) {
genomeToFeatureMap[subLocations[i].end() - x] = positionInFeature;
positionInFeature++;
}
}
}
// Now we can get all of the reads and position them within the read.
long[] reads = store.getReadsForProbe(new Probe(SeqMonkApplication.getInstance().dataCollection().genome().getExactChromsomeNameMatch(feature.chromosomeName()), feature.location().packedPosition()));
for (int r = 0; r < reads.length; r++) {
// We need to work out the position of this read in the feature. This will depend
// on whether the feature is forward or reverse strand, and whether we're reversing
// the direction of reads.
System.err.println("Looking at read " + SequenceRead.toString(reads[r]));
int genomicPosition = 0;
if (feature.location().strand() == Location.FORWARD) {
System.err.println("It's a forward feature");
if (reverse) {
System.err.println("We're a same strand library");
if (SequenceRead.strand(reads[r]) != Location.REVERSE)
continue;
genomicPosition = SequenceRead.end(reads[r]);
} else {
System.err.println("We're an opposing strand library");
if (SequenceRead.strand(reads[r]) != Location.FORWARD)
continue;
genomicPosition = SequenceRead.start(reads[r]);
}
System.err.println("Raw genomic position is " + genomicPosition);
genomicPosition = genomicPosition - feature.location().start();
System.err.println("Corrected genomic position is " + genomicPosition);
} else if (feature.location().strand() == Location.REVERSE) {
if (reverse) {
if (SequenceRead.strand(reads[r]) != Location.REVERSE)
continue;
genomicPosition = SequenceRead.start(reads[r]);
} else {
if (SequenceRead.strand(reads[r]) != Location.FORWARD)
continue;
genomicPosition = SequenceRead.end(reads[r]);
}
genomicPosition = feature.location().end() - genomicPosition;
}
System.err.println("Final genomic position is " + genomicPosition);
if (genomicPosition < 0 || genomicPosition >= genomeToFeatureMap.length)
continue;
System.err.println("Position in feature is " + genomeToFeatureMap[genomicPosition]);
if (genomeToFeatureMap[genomicPosition] != -1) {
abundance[genomeToFeatureMap[genomicPosition]]++;
}
}
this.abundance = abundance;
this.exonBoundaries = exonBoundaries;
calculated = true;
repaint();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.
the class FeaturePositionSelectorPanel method getProbes.
/**
* Gets the set of probes with appropriate context for the options
* currently set.
* @return
*/
public Probe[] getProbes() {
Chromosome[] chromosomes = collection.genome().getAllChromosomes();
Vector<Probe> newProbes = new Vector<Probe>();
for (int c = 0; c < chromosomes.length; c++) {
Vector<Feature> allFeatures = new Vector<Feature>();
String[] selectedFeatureTypes = selectedFeatureTypes();
for (int f = 0; f < selectedFeatureTypes.length; f++) {
Feature[] features = collection.genome().annotationCollection().getFeaturesForType(chromosomes[c], selectedFeatureTypes[f]);
for (int i = 0; i < features.length; i++) {
allFeatures.add(features[i]);
}
}
Feature[] features = allFeatures.toArray(new Feature[0]);
for (int f = 0; f < features.length; f++) {
if (useSubFeatures()) {
// We need to split this up so get the sub-features
if (features[f].location() instanceof SplitLocation) {
SplitLocation location = (SplitLocation) features[f].location();
Location[] subLocations = location.subLocations();
if (useExonSubfeatures()) {
// System.err.println("Making exon probes");
for (int s = 0; s < subLocations.length; s++) {
makeProbes(features[f], chromosomes[c], subLocations[s], newProbes, false);
}
} else {
// We're making introns
for (int s = 1; s < subLocations.length; s++) {
makeProbes(features[f], chromosomes[c], new Location(subLocations[s - 1].end() + 1, subLocations[s].start() - 1, features[f].location().strand()), newProbes, false);
}
}
} else {
if (useExonSubfeatures()) {
// We can still make a single probe
makeProbes(features[f], chromosomes[c], features[f].location(), newProbes, false);
}
// If we're making introns then we're stuffed and we give up.
}
} else {
makeProbes(features[f], chromosomes[c], features[f].location(), newProbes, false);
}
}
}
Probe[] finalList = newProbes.toArray(new Probe[0]);
if (removeDuplicates()) {
finalList = removeDuplicates(finalList);
}
return finalList;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.
the class IntronFeatureGroup method getSubLocations.
public Location[] getSubLocations() {
if (features.size() == 1) {
Location loc = features.elementAt(0).location();
if (loc instanceof SplitLocation) {
Location[] subLocs = ((SplitLocation) loc).subLocations();
Location[] interLocs = new Location[subLocs.length - 1];
for (int i = 0; i < subLocs.length - 1; i++) {
interLocs[i] = new Location(subLocs[i].end() + 1, subLocs[i + 1].start() - 1, subLocs[i].strand());
}
return interLocs;
} else {
return new Location[0];
}
}
LongVector allLocs = new LongVector();
Enumeration<Feature> en = features.elements();
while (en.hasMoreElements()) {
Location loc = en.nextElement().location();
if (loc instanceof SplitLocation) {
Location[] subLocs = ((SplitLocation) loc).subLocations();
for (int s = 0; s < subLocs.length; s++) {
allLocs.add(subLocs[s].packedPosition());
}
} else {
allLocs.add(loc.packedPosition());
}
}
long[] locs = allLocs.toArray();
SequenceRead.sort(locs);
Vector<Location> mergedLocs = new Vector<Location>();
long current = locs[0];
for (int i = 1; i < locs.length; i++) {
// if (debug) {System.err.println("Looking at "+SequenceRead.start(locs[i])+"-"+SequenceRead.end(locs[i])+" current is "+SequenceRead.start(current)+"-"+SequenceRead.end(current));}
if (SequenceRead.overlaps(current, locs[i]) && SequenceRead.end(locs[i]) > SequenceRead.end(current)) {
// if (debug) {System.err.println("They overlap, extending...");}
current = SequenceRead.packPosition(SequenceRead.start(current), SequenceRead.end(locs[i]), SequenceRead.strand(current));
} else if (SequenceRead.end(locs[i]) <= SequenceRead.end(current)) {
// if (debug) {System.err.println("This is a subset, ignoring it");}
continue;
} else {
// if (debug) {System.err.println("They don't overlap, moving on...");}
mergedLocs.add(new Location(current));
current = locs[i];
}
}
mergedLocs.add(new Location(current));
Location[] interLocs = new Location[mergedLocs.size() - 1];
for (int i = 0; i < mergedLocs.size() - 1; i++) {
interLocs[i] = new Location(mergedLocs.elementAt(i).end() + 1, mergedLocs.elementAt(i + 1).start() - 1, mergedLocs.elementAt(i).strand());
}
return interLocs;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.
the class SplicingEfficiencyPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
boolean correctReadLength = optionsPanel.correctForReadLength();
boolean logTransform = optionsPanel.logTransform();
boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
// We need to make a lookup for chromosomes from their names for later on.
Chromosome[] chrs = collection().genome().getAllChromosomes();
Hashtable<String, Chromosome> chrLookup = new Hashtable<String, Chromosome>();
for (int c = 0; c < chrs.length; c++) {
chrLookup.put(chrs[c].name(), chrs[c]);
}
// We're going to cache the gene to transcript mappings so we don't have to re-do them
// later on.
Hashtable<String, Vector<Feature>> transcriptLookup = new Hashtable<String, Vector<Feature>>();
Vector<Feature> usedGeneFeatures = new Vector<Feature>();
// We'll record the number of failures so we can report it
int noTranscriptCount = 0;
int noIntronCount = 0;
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] geneFeatures = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedGeneFeatureType());
Arrays.sort(geneFeatures);
Feature[] transcriptFeatures = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedTranscriptFeatureType());
Arrays.sort(transcriptFeatures);
// We need to figure out whether we can match based on name, or whether we need to do it based on location.
boolean matchOnNames = true;
for (int t = 0; t < transcriptFeatures.length; t++) {
if (!transcriptFeatures[t].name().matches("-\\d\\d\\d$")) {
matchOnNames = false;
break;
}
}
if (!matchOnNames) {
progressWarningReceived(new SeqMonkException("Non-Ensembl transcript names found - matching based on position"));
}
if (matchOnNames) {
for (int i = 0; i < transcriptFeatures.length; i++) {
String name = transcriptFeatures[i].name();
name = name.replaceAll("-\\d\\d\\d$", "");
if (!transcriptLookup.containsKey(name)) {
transcriptLookup.put(name, new Vector<Feature>());
}
transcriptLookup.get(name).add(transcriptFeatures[i]);
}
} else {
// We need to go through the genes and transcripts in parallel and match them up based on containment
// and direction.
int lastGeneIndex = 0;
for (int t = 0; t < transcriptFeatures.length; t++) {
for (int g = lastGeneIndex; g < geneFeatures.length; g++) {
if (transcriptFeatures[t].location().start() > geneFeatures[g].location().end()) {
lastGeneIndex = g;
continue;
}
// If the gene is beyond the end of the transcript we can stop looking
if (geneFeatures[g].location().start() > transcriptFeatures[t].location().end()) {
break;
}
// If we're on the same strand and contained within the gene then we have a match
if (geneFeatures[g].location().strand() == transcriptFeatures[t].location().strand() && transcriptFeatures[t].location().start() >= geneFeatures[g].location().start() && transcriptFeatures[t].location().end() <= geneFeatures[g].location().end()) {
String name = geneFeatures[g].name();
if (!transcriptLookup.containsKey(name)) {
transcriptLookup.put(name, new Vector<Feature>());
}
transcriptLookup.get(name).add(transcriptFeatures[t]);
}
}
}
}
for (int f = 0; f < geneFeatures.length; f++) {
if (cancel) {
progressCancelled();
return;
}
if (!transcriptLookup.containsKey(geneFeatures[f].name())) {
++noTranscriptCount;
// No good making features for genes with no transcript.
continue;
}
Feature[] localTranscripts = transcriptLookup.get(geneFeatures[f].name()).toArray(new Feature[0]);
boolean[] exonPositions = new boolean[geneFeatures[f].location().length()];
for (int t = 0; t < localTranscripts.length; t++) {
if (!SequenceRead.overlaps(geneFeatures[f].location().packedPosition(), localTranscripts[t].location().packedPosition())) {
// It's not a match for this gene
continue;
}
if (!(localTranscripts[t].location() instanceof SplitLocation)) {
for (int p = localTranscripts[t].location().start() - geneFeatures[f].location().start(); p <= localTranscripts[t].location().end() - geneFeatures[f].location().start(); p++) {
if (p < 0)
continue;
if (p >= exonPositions.length)
continue;
exonPositions[p] = true;
}
continue;
}
Location[] exons = ((SplitLocation) localTranscripts[t].location()).subLocations();
for (int e = 0; e < exons.length; e++) {
for (int p = exons[e].start() - geneFeatures[f].location().start(); p <= exons[e].end() - geneFeatures[f].location().start(); p++) {
if (p < 0)
continue;
if (p >= exonPositions.length)
continue;
exonPositions[p] = true;
}
}
}
int exonPositionCount = 0;
int intronPositionCount = 0;
for (int p = 0; p < exonPositions.length; p++) {
if (exonPositions[p])
exonPositionCount++;
else
intronPositionCount++;
}
if (exonPositionCount == 0) {
++noTranscriptCount;
continue;
}
if (intronPositionCount == 0) {
++noIntronCount;
continue;
}
probes.add(new Probe(chrs[c], geneFeatures[f].location().start(), geneFeatures[f].location().end(), geneFeatures[f].location().strand(), geneFeatures[f].name()));
usedGeneFeatures.add(geneFeatures[f]);
}
}
if (noTranscriptCount > 0) {
progressWarningReceived(new SeqMonkException("" + noTranscriptCount + " genes had no associated transcripts"));
}
if (noIntronCount > 0) {
progressWarningReceived(new SeqMonkException("" + noIntronCount + " genes had no intron only areas"));
}
Probe[] allProbes = probes.toArray(new Probe[0]);
collection().setProbeSet(new ProbeSet("Gene features over " + optionsPanel.getSelectedGeneFeatureType(), allProbes));
// Having made probes we now need to quantitate them. We'll get the full set of reads for the full gene
// and then work out a combined set of exons. We can then quantitate the bases which fall into introns
// and exons separately and then work out a ratio from there.
QuantitationStrandType readFilter = optionsPanel.readFilter();
Feature[] geneFeatures = usedGeneFeatures.toArray(new Feature[0]);
for (int g = 0; g < geneFeatures.length; g++) {
if (cancel) {
progressCancelled();
return;
}
if (g % 100 == 0) {
progressUpdated("Quantitating features", g, geneFeatures.length);
}
int[] readLengths = new int[data.length];
if (correctReadLength) {
for (int d = 0; d < data.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
}
}
// Find the set of transcripts which relate to this gene.
Feature[] transcripts = transcriptLookup.get(geneFeatures[g].name()).toArray(new Feature[0]);
Vector<Feature> validTranscripts = new Vector<Feature>();
for (int t = 0; t < transcripts.length; t++) {
if (SequenceRead.overlaps(geneFeatures[g].location().packedPosition(), transcripts[t].location().packedPosition())) {
validTranscripts.add(transcripts[t]);
}
}
transcripts = validTranscripts.toArray(new Feature[0]);
// before so if we do then something has gone wrong.
if (transcripts.length == 0) {
throw new IllegalStateException("No transcripts for gene " + geneFeatures[g] + " on chr " + geneFeatures[g].chromosomeName());
}
Vector<Location> allExonsVector = new Vector<Location>();
for (int t = 0; t < transcripts.length; t++) {
if (transcripts[t].location() instanceof SplitLocation) {
Location[] sublocs = ((SplitLocation) transcripts[t].location()).subLocations();
for (int i = 0; i < sublocs.length; i++) allExonsVector.add(sublocs[i]);
} else {
allExonsVector.add(transcripts[t].location());
}
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Cpa6 had "+allExonsVector.size()+" total exons");
// }
Collections.sort(allExonsVector);
// Now go through and make a merged set of exons to remove any redundancy
Vector<Location> mergedExonsVector = new Vector<Location>();
Location lastLocation = null;
Enumeration<Location> en = allExonsVector.elements();
while (en.hasMoreElements()) {
Location l = en.nextElement();
if (lastLocation == null) {
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Setting as first location");
// }
lastLocation = l;
continue;
}
// Check if it's the same as the last, which is likely
if (l.start() == lastLocation.start() && l.end() == lastLocation.end()) {
continue;
}
// Check if they overlap and can be merged
if (l.start() <= lastLocation.end() && l.end() >= lastLocation.start()) {
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Overlaps with last location");
// }
// It overlaps with the last location so merge them
lastLocation = new Location(Math.min(l.start(), lastLocation.start()), Math.max(l.end(), lastLocation.end()), geneFeatures[g].location().strand());
// }
continue;
}
// Start a new location
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Doesn't overlap - adding last location and creating new one");
// }
mergedExonsVector.add(lastLocation);
lastLocation = l;
}
if (lastLocation != null) {
mergedExonsVector.add(lastLocation);
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Cpa6 had "+mergedExonsVector.size()+" merged exons");
// }
// Now we can start the quantitation.
int intronLength = geneFeatures[g].location().length();
int exonLength = 0;
Location[] subLocs = mergedExonsVector.toArray(new Location[0]);
for (int l = 0; l < subLocs.length; l++) {
exonLength += subLocs[l].length();
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Cpa6 total intron length="+intronLength+" exon length="+exonLength);
// }
intronLength -= exonLength;
if (intronLength <= 0) {
progressWarningReceived(new IllegalStateException("Intron length of " + intronLength + " for gene " + geneFeatures[g]));
for (int d = 0; d < data.length; d++) {
data[d].setValueForProbe(allProbes[g], Float.NaN);
}
continue;
}
if (exonLength <= 0) {
progressWarningReceived(new IllegalStateException("Exon length of " + exonLength + " for gene " + geneFeatures[g]));
for (int d = 0; d < data.length; d++) {
data[d].setValueForProbe(allProbes[g], Float.NaN);
}
continue;
}
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
int totalIntronCount = 0;
int totalExonCount = 0;
long[] reads = data[d].getReadsForProbe(new Probe(chrLookup.get(geneFeatures[g].chromosomeName()), geneFeatures[g].location().start(), geneFeatures[g].location().end()));
for (int r = 0; r < reads.length; r++) {
if (!readFilter.useRead(geneFeatures[g].location(), reads[r])) {
continue;
}
int overlap = (Math.min(geneFeatures[g].location().end(), SequenceRead.end(reads[r])) - Math.max(geneFeatures[g].location().start(), SequenceRead.start(reads[r]))) + 1;
totalIntronCount += overlap;
// Now we see if we overlap with any of the exons
for (int s = 0; s < subLocs.length; s++) {
if (subLocs[s].start() <= SequenceRead.end(reads[r]) && subLocs[s].end() >= SequenceRead.start(reads[r])) {
int exonOverlap = (Math.min(subLocs[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocs[s].start(), SequenceRead.start(reads[r]))) + 1;
if (exonOverlap > 0) {
totalExonCount += exonOverlap;
totalIntronCount -= exonOverlap;
}
}
}
}
if (totalIntronCount < 0) {
progressWarningReceived(new SeqMonkException("Intron count of " + totalIntronCount + " for " + geneFeatures[g].name()));
continue;
}
// reads which this count could comprise, rounding down to a whole number.
if (correctReadLength) {
totalIntronCount /= readLengths[d];
totalExonCount /= readLengths[d];
}
float intronValue = totalIntronCount;
float exonValue = totalExonCount;
// If we're log transforming then we need to set zero values to 0.9
if (logTransform && intronValue == 0) {
intronValue = 0.9f;
}
if (logTransform && exonValue == 0) {
exonValue = 0.9f;
}
// been asked to.
if (applyTranscriptLengthCorrection) {
intronValue /= (intronLength / 1000f);
exonValue /= (exonLength / 1000f);
}
// We also correct by the total read count, or length
if (correctReadLength) {
intronValue /= (data[d].getTotalReadCount() / 1000000f);
exonValue /= (data[d].getTotalReadCount() / 1000000f);
} else {
intronValue /= (data[d].getTotalReadLength() / 1000000f);
exonValue /= (data[d].getTotalReadLength() / 1000000f);
}
// Finally we do the log transform if we've been asked to
if (logTransform) {
if (intronValue == 0) {
intronValue = 0.0001f;
}
if (exonValue == 0) {
exonValue = 0.0001f;
}
intronValue = (float) Math.log(intronValue) / log2;
exonValue = (float) Math.log(exonValue) / log2;
}
// Now we check what value they actually wanted to record
switch(optionsPanel.quantitationType()) {
case (EXONS):
{
data[d].setValueForProbe(allProbes[g], exonValue);
break;
}
case (INTRONS):
{
data[d].setValueForProbe(allProbes[g], intronValue);
break;
}
case (RATIO):
{
if (logTransform) {
data[d].setValueForProbe(allProbes[g], exonValue - intronValue);
} else {
if (intronValue == 0) {
intronValue = 0.0001f;
}
data[d].setValueForProbe(allProbes[g], exonValue / intronValue);
}
break;
}
default:
throw new IllegalStateException("Unknonwn quantitation type " + optionsPanel.quantitationType());
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// try {
// System.err.println("Stored value was "+data[d].getValueForProbe(allProbes[currentIndex]));
// } catch (SeqMonkException e) {
// e.printStackTrace();
// }
// }
}
}
StringBuffer quantitationDescription = new StringBuffer();
quantitationDescription.append("Splicing efficiency quantitation");
if (correctReadLength) {
quantitationDescription.append(" counting reads");
} else {
quantitationDescription.append(" counting bases");
}
if (optionsPanel.logTransform()) {
quantitationDescription.append(" log transformed");
}
if (applyTranscriptLengthCorrection) {
quantitationDescription.append(" correcting for feature length");
}
// TODO: Add more description
collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
quantitatonComplete();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Genome.SplitLocation in project SeqMonk by s-andrews.
the class IntronRegressionPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
int minDensity = optionsPanel.minDensity();
int minLength = optionsPanel.minLength();
double maxPValue = optionsPanel.maxPValue();
int binSize = optionsPanel.measurementBinSize();
QuantitationStrandType readFilter = optionsPanel.readFilter();
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
Vector<Probe> probesForThisChromosome = new Vector<Probe>();
progressUpdated("Making probes", c, chrs.length);
Feature[] features = getValidFeatures(chrs[c]);
for (int f = 0; f < features.length; f++) {
if (cancel) {
progressCancelled();
return;
}
// There are no introns here
if (!(features[f].location() instanceof SplitLocation))
continue;
Location[] subLocations = ((SplitLocation) features[f].location()).subLocations();
// TODO: Reverse the subLocations if its a reverse feature
for (int intron = 1; intron < subLocations.length; intron++) {
int start = subLocations[intron - 1].end();
int end = subLocations[intron].start();
if ((end - start) + 1 < minLength) {
// This intron is too short.
continue;
}
// TODO: We could throw away any probes which didn't have enough reads in any feature
Probe p = new Probe(chrs[c], start, end, features[f].location().strand(), features[f].name() + "_" + intron);
probesForThisChromosome.add(p);
}
}
// Now we can deduplicate the probes for this chromosome and add them to the main collection
Probe[] dupProbes = probesForThisChromosome.toArray(new Probe[0]);
Arrays.sort(dupProbes);
for (int p = 0; p < dupProbes.length; p++) {
if (p > 0 && dupProbes[p].packedPosition() == dupProbes[p - 1].packedPosition())
continue;
probes.add(dupProbes[p]);
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
collection().setProbeSet(new ProbeSet("Features over " + optionsPanel.getSelectedFeatureType(), allProbes));
// Now we go back through the probes and quantitate them
for (int p = 0; p < allProbes.length; p++) {
if (cancel) {
progressCancelled();
return;
}
if (p % 1000 == 0) {
progressUpdated("Quantitated " + p + " out of " + allProbes.length + " probes", p, allProbes.length);
}
for (int d = 0; d < data.length; d++) {
long[] reads = data[d].getReadsForProbe(allProbes[p]);
int[] countsPerSite = new int[allProbes[p].length()];
int usableCounts = 0;
for (int r = 0; r < reads.length; r++) {
if (readFilter.useRead(allProbes[p], reads[r])) {
++usableCounts;
for (int pos = Math.max(0, SequenceRead.start(reads[r]) - allProbes[p].start()); pos <= Math.min(countsPerSite.length - 1, SequenceRead.end(reads[r]) - allProbes[p].start()); pos++) {
++countsPerSite[pos];
}
}
}
if (usableCounts / (allProbes[p].length() / 1000d) >= minDensity) {
// We're going to do a linear regression rather than a correlation
// We're analysing in bins so we'll work out the bin counts and
// add them dynamically to the regression.
SimpleRegression regression = new SimpleRegression();
int binCount = 0;
for (int i = 0; i < countsPerSite.length; i++) {
if (i > 0 && i % binSize == 0) {
regression.addData(i, binCount);
binCount = 0;
}
binCount += countsPerSite[i];
}
float slope = (float) (regression.getSlope() * 1000000);
double pValue = regression.getSignificance();
if (allProbes[p].strand() == Location.REVERSE) {
slope = 0 - slope;
}
if (pValue <= maxPValue) {
data[d].setValueForProbe(allProbes[p], slope);
} else {
data[d].setValueForProbe(allProbes[p], Float.NaN);
}
} else {
data[d].setValueForProbe(allProbes[p], Float.NaN);
}
}
}
StringBuffer quantitationDescription = new StringBuffer();
quantitationDescription.append("Intron regression pipeline quantitation ");
quantitationDescription.append(". Directionality was ");
quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
quantitationDescription.append(". Min intron length was ");
quantitationDescription.append(minLength);
quantitationDescription.append(". Min read density was ");
quantitationDescription.append(minDensity);
quantitationDescription.append(". Max slope p-value was ");
quantitationDescription.append(maxPValue);
collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
quantitatonComplete();
}
Aggregations