use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType in project SeqMonk by s-andrews.
the class TranscriptionTerminationPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
int minCount = optionsPanel.minCount();
int measurementLength = optionsPanel.measurementLength();
QuantitationStrandType readFilter = optionsPanel.readFilter();
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Creating Probes" + chrs[c].name(), c, chrs.length * 2);
Feature[] features = getValidFeatures(chrs[c], measurementLength);
for (int f = 0; f < features.length; f++) {
if (cancel) {
progressCancelled();
return;
}
if (features[f].location().strand() == Location.REVERSE) {
Probe p = new Probe(chrs[c], features[f].location().start() - measurementLength, features[f].location().start() + measurementLength, features[f].location().strand(), features[f].name());
probes.add(p);
} else {
Probe p = new Probe(chrs[c], features[f].location().end() - measurementLength, features[f].location().end() + measurementLength, features[f].location().strand(), features[f].name());
probes.add(p);
}
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
collection().setProbeSet(new ProbeSet("Features " + measurementLength + "bp around the end of " + optionsPanel.getSelectedFeatureType(), allProbes));
int probeIndex = 0;
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
Feature[] features = getValidFeatures(chrs[c], measurementLength);
for (int f = 0; f < features.length; f++) {
if (cancel) {
progressCancelled();
return;
}
for (int d = 0; d < data.length; d++) {
if (allProbes[probeIndex].strand() == Location.REVERSE) {
Probe downstreamProbe = new Probe(chrs[c], features[f].location().start() - measurementLength, features[f].location().start(), features[f].location().strand(), features[f].name());
Probe upstreamProbe = new Probe(chrs[c], features[f].location().start(), features[f].location().start() + measurementLength, features[f].location().strand(), features[f].name());
long[] upstreamReads = data[d].getReadsForProbe(upstreamProbe);
long[] downstreamReads = data[d].getReadsForProbe(downstreamProbe);
int upstreamCount = 0;
for (int i = 0; i < upstreamReads.length; i++) {
if (readFilter.useRead(allProbes[probeIndex], upstreamReads[i]))
++upstreamCount;
}
int downstreamCount = 0;
for (int i = 0; i < downstreamReads.length; i++) {
if (readFilter.useRead(allProbes[probeIndex], downstreamReads[i]))
++downstreamCount;
}
float percentage = ((upstreamCount - downstreamCount) / (float) upstreamCount) * 100f;
if (upstreamCount >= minCount) {
data[d].setValueForProbe(allProbes[probeIndex], percentage);
} else {
data[d].setValueForProbe(allProbes[probeIndex], Float.NaN);
}
} else {
Probe upstreamProbe = new Probe(chrs[c], features[f].location().end() - measurementLength, features[f].location().end(), features[f].location().strand(), features[f].name());
Probe downstreamProbe = new Probe(chrs[c], features[f].location().end(), features[f].location().end() + measurementLength, features[f].location().strand(), features[f].name());
long[] upstreamReads = data[d].getReadsForProbe(upstreamProbe);
long[] downstreamReads = data[d].getReadsForProbe(downstreamProbe);
int upstreamCount = 0;
for (int i = 0; i < upstreamReads.length; i++) {
if (readFilter.useRead(allProbes[probeIndex], upstreamReads[i]))
++upstreamCount;
}
int downstreamCount = 0;
for (int i = 0; i < downstreamReads.length; i++) {
if (readFilter.useRead(allProbes[probeIndex], downstreamReads[i]))
++downstreamCount;
}
float percentage = ((upstreamCount - downstreamCount) / (float) upstreamCount) * 100f;
if (upstreamCount >= minCount) {
data[d].setValueForProbe(allProbes[probeIndex], percentage);
} else {
data[d].setValueForProbe(allProbes[probeIndex], Float.NaN);
}
}
}
++probeIndex;
}
}
StringBuffer quantitationDescription = new StringBuffer();
quantitationDescription.append("Transcription termination pipeline quantitation ");
quantitationDescription.append(". Directionality was ");
quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
quantitationDescription.append(". Measurement length was ");
quantitationDescription.append(optionsPanel.measurementLength());
quantitationDescription.append(". Min count was ");
quantitationDescription.append(optionsPanel.minCount());
collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
quantitatonComplete();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType in project SeqMonk by s-andrews.
the class ChiSquareFilterFrontBack method generateProbeList.
/* (non-Javadoc)
* @see uk.ac.babraham.SeqMonk.Filters.ProbeFilter#generateProbeList()
*/
protected void generateProbeList() {
if (options.stringencyField.getText().length() == 0) {
stringency = 0.05;
} else {
stringency = Double.parseDouble(options.stringencyField.getText());
}
if (options.minObservationsField.getText().length() == 0) {
minObservations = 10;
} else {
minObservations = Integer.parseInt(options.minObservationsField.getText());
}
if (options.minDifferenceField.getText().length() == 0) {
minPercentShift = 10;
} else {
minPercentShift = Integer.parseInt(options.minDifferenceField.getText());
}
QuantitationStrandType readFilter = (QuantitationStrandType) options.strandLimitBox.getSelectedItem();
applyMultipleTestingCorrection = options.multiTestBox.isSelected();
ProbeList newList;
if (applyMultipleTestingCorrection) {
newList = new ProbeList(startingList, "Filtered Probes", "", "Q-value");
} else {
newList = new ProbeList(startingList, "Filtered Probes", "", "P-value");
}
Probe[] probes = startingList.getAllProbes();
int[][] frontBackCounts = new int[stores.length][2];
// This is where we'll store any hits
Vector<ProbeTTestValue> hits = new Vector<ProbeTTestValue>();
PROBE: for (int p = 0; p < probes.length; p++) {
if (p % 100 == 0) {
progressUpdated("Processed " + p + " probes", p, probes.length);
}
if (cancel) {
cancel = false;
progressCancelled();
return;
}
Probe frontProbe;
Probe backProbe;
if (probes[p].strand() == Location.REVERSE) {
backProbe = new Probe(probes[p].chromosome(), probes[p].start(), probes[p].middle(), probes[p].strand());
frontProbe = new Probe(probes[p].chromosome(), probes[p].middle(), probes[p].end(), probes[p].strand());
} else {
frontProbe = new Probe(probes[p].chromosome(), probes[p].start(), probes[p].middle(), probes[p].strand());
backProbe = new Probe(probes[p].chromosome(), probes[p].middle(), probes[p].end(), probes[p].strand());
}
// For each dataset make up a list of forward and reverse probes under this probe
for (int d = 0; d < stores.length; d++) {
long[] frontReads = stores[d].getReadsForProbe(frontProbe);
long[] backReads = stores[d].getReadsForProbe(backProbe);
frontBackCounts[d][0] = 0;
frontBackCounts[d][1] = 0;
for (int r = 0; r < frontReads.length; r++) {
if (readFilter.useRead(frontProbe, frontReads[r]))
++frontBackCounts[d][0];
}
for (int r = 0; r < backReads.length; r++) {
if (readFilter.useRead(backProbe, backReads[r]))
++frontBackCounts[d][1];
}
// System.err.println("Datset = "+stores[d].name()+" Front counts="+frontBackCounts[d][0]+" Back counts="+frontBackCounts[d][1]);
}
// See if we have enough counts and difference to go on with this
double minPercent = 0;
double maxPercent = 0;
for (int d = 0; d < stores.length; d++) {
if (frontBackCounts[d][0] < minObservations) {
// System.err.println("Not enough counts to test");
continue PROBE;
}
double percent = (((double) frontBackCounts[d][0]) / (frontBackCounts[d][0] + frontBackCounts[d][1])) * 100;
if (d == 0 || percent < minPercent)
minPercent = percent;
if (d == 0 || percent > maxPercent)
maxPercent = percent;
}
if (maxPercent - minPercent < minPercentShift) {
// System.err.println("Not enough difference to test");
continue PROBE;
}
// Now perform the Chi-Square test.
double pValue = ChiSquareTest.chiSquarePvalue(frontBackCounts);
// System.err.println("Raw p-value="+pValue);
// Store this as a potential hit (after correcting p-values later)
hits.add(new ProbeTTestValue(probes[p], pValue));
}
// Now we can correct the p-values if we need to
ProbeTTestValue[] rawHits = hits.toArray(new ProbeTTestValue[0]);
if (applyMultipleTestingCorrection) {
BenjHochFDR.calculateQValues(rawHits);
}
for (int h = 0; h < rawHits.length; h++) {
if (applyMultipleTestingCorrection) {
if (rawHits[h].q < stringency) {
newList.addProbe(rawHits[h].probe, (float) rawHits[h].q);
}
} else {
if (rawHits[h].p < stringency) {
newList.addProbe(rawHits[h].probe, (float) rawHits[h].p);
}
}
}
filterFinished(newList);
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType in project SeqMonk by s-andrews.
the class RNASeqPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
boolean mergeTranscripts = optionsPanel.mergeTranscripts();
boolean pairedEnd = optionsPanel.pairedEnd();
boolean logTransform = optionsPanel.logTransform();
boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
boolean rawCounts = optionsPanel.rawCounts();
boolean noValueForZeroCounts = optionsPanel.noValueForZeroCounts();
boolean correctDNAContamination = optionsPanel.correctForDNAContamination();
boolean correctDuplication = optionsPanel.correctForDNADuplication();
if (rawCounts) {
logTransform = false;
applyTranscriptLengthCorrection = false;
noValueForZeroCounts = false;
}
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
// System.err.println("Processing chr "+chrs[c].name());
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
Arrays.sort(features);
FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
for (int f = 0; f < mergedTranscripts.length; f++) {
if (cancel) {
progressCancelled();
return;
}
probes.add(new Probe(chrs[c], mergedTranscripts[f].start(), mergedTranscripts[f].end(), mergedTranscripts[f].strand(), mergedTranscripts[f].name()));
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
Arrays.sort(allProbes);
if (collection().probeSet() == null) {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
} else {
Probe[] existingProbes = collection().probeSet().getAllProbes();
Arrays.sort(existingProbes);
if (allProbes.length != existingProbes.length) {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
} else {
// Check the positions against the new ones
boolean areTheyTheSame = true;
for (int p = 0; p < allProbes.length; p++) {
if (allProbes[p].packedPosition() != existingProbes[p].packedPosition()) {
areTheyTheSame = false;
break;
}
}
if (areTheyTheSame) {
allProbes = existingProbes;
} else {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
}
}
}
// If we're correcting for DNA contamination we need to work out the average density of
// reads in intergenic regions
float[] dnaDensityPerKb = new float[data.length];
int[] correctedTotalCounts = new int[data.length];
if (correctDNAContamination) {
// We need to make interstitial probes to the set we already have, ignoring those at the end of chromosomes
Vector<Probe> intergenicProbes = new Vector<Probe>();
Chromosome lastChr = allProbes[0].chromosome();
for (int p = 1; p < allProbes.length; p++) {
if (allProbes[p].chromosome() != lastChr) {
lastChr = allProbes[p].chromosome();
continue;
}
// See if there's a gap back to the last probe
if (allProbes[p].start() > allProbes[p - 1].end()) {
if (allProbes[p].start() - allProbes[p - 1].end() < 1000) {
// Don't bother with really short probes
continue;
}
intergenicProbes.add(new Probe(lastChr, allProbes[p - 1].end() + 1, allProbes[p].start() - 1));
}
}
Probe[] allIntergenicProbes = intergenicProbes.toArray(new Probe[0]);
for (int d = 0; d < data.length; d++) {
progressUpdated("Quantitating DNA contamination", 1, 2);
float[] densities = new float[allIntergenicProbes.length];
for (int p = 0; p < allIntergenicProbes.length; p++) {
densities[p] = data[d].getReadsForProbe(allIntergenicProbes[p]).length / (allIntergenicProbes[p].length() / 1000f);
}
dnaDensityPerKb[d] = SimpleStats.median(densities);
}
// Work out adjusted total counts having subtracted the DNA contamination
for (int d = 0; d < data.length; d++) {
int predictedContamination = (int) (dnaDensityPerKb[d] * (SeqMonkApplication.getInstance().dataCollection().genome().getTotalGenomeLength() / 1000));
int correctedTotalReadCount = data[d].getTotalReadCount() - predictedContamination;
correctedTotalCounts[d] = correctedTotalReadCount;
}
// Halve the density if they're doing a directional quantitation
if (optionsPanel.isDirectional()) {
for (int i = 0; i < dnaDensityPerKb.length; i++) {
dnaDensityPerKb[i] /= 2;
}
}
// Halve the density if the libraries are paired end
if (pairedEnd) {
for (int i = 0; i < dnaDensityPerKb.length; i++) {
dnaDensityPerKb[i] /= 2;
}
}
}
// If we're correcting for duplication we need to work out the modal count depth in
// intergenic regions
int[] modalDuplicationLevels = new int[data.length];
if (correctDuplication) {
for (int d = 0; d < data.length; d++) {
progressUpdated("Quantitating DNA duplication", 1, 2);
// We're not going to look at depths which are > 200. If it's that duplicated
// then there's no point continuing anyway.
int[] depthCount = new int[200];
for (int p = 0; p < allProbes.length; p++) {
long[] reads = data[d].getReadsForProbe(allProbes[p]);
int currentCount = 0;
for (int r = 1; r < reads.length; r++) {
if (reads[r] == reads[r - 1]) {
++currentCount;
} else {
if (currentCount > 0 && currentCount < 200) {
++depthCount[currentCount];
}
currentCount = 1;
}
}
}
// Find the modal depth in intergenic regions. This is the best estimate
// of duplication
// Since unique reads turn up all over the place even in duplicated
// data we say that if unique reads are higher than the sum of 2-10 there
// is no duplication
int twoTenSum = 0;
for (int i = 2; i <= 10; i++) {
twoTenSum += depthCount[i];
}
if (depthCount[1] > twoTenSum) {
modalDuplicationLevels[d] = 1;
} else {
int highestDepth = 0;
int bestDupGuess = 1;
for (int i = 2; i < depthCount.length; i++) {
// System.err.println("For depth "+i+" count was "+depthCount[i]);
if (depthCount[i] > highestDepth) {
bestDupGuess = i;
highestDepth = depthCount[i];
}
}
modalDuplicationLevels[d] = bestDupGuess;
}
}
}
// Having made probes we now need to quantitate them. We'll fetch the
// probes overlapping each sub-feature and then aggregate these together
// to get the final quantitation.
QuantitationStrandType readFilter = optionsPanel.readFilter();
int currentIndex = 0;
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
Arrays.sort(features);
FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
int[] readLengths = new int[data.length];
for (int d = 0; d < data.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
// actual length.
if (pairedEnd) {
readLengths[d] *= 2;
}
}
for (int f = 0; f < mergedTranscripts.length; f++) {
Location[] subLocations = mergedTranscripts[f].getSubLocations();
int totalLength = 0;
// Find the total length of all of the exons
for (int s = 0; s < subLocations.length; s++) {
totalLength += subLocations[s].length();
}
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
long totalCount = 0;
for (int s = 0; s < subLocations.length; s++) {
long[] reads = data[d].getReadsForProbe(new Probe(chrs[c], subLocations[s].start(), subLocations[s].end()));
for (int r = 0; r < reads.length; r++) {
if (!readFilter.useRead(subLocations[s], reads[r])) {
continue;
}
int overlap = (Math.min(subLocations[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocations[s].start(), SequenceRead.start(reads[r]))) + 1;
totalCount += overlap;
}
}
// Now we correct the count by the total length of reads in the data and by
// the length of the split parts of the probe, and assign this to the probe.
// As we're correcting for read length then we work out the whole number of
// reads which this count could comprise, rounding down to a whole number.
totalCount /= readLengths[d];
// We can now subtract the DNA contamination prediction.
if (correctDNAContamination) {
int predictedContamination = (int) ((totalLength / 1000f) * dnaDensityPerKb[d]);
totalCount -= predictedContamination;
// Makes no sense to have negative counts
if (totalCount < 0)
totalCount = 0;
}
// ..and we can divide by the duplication level if we know it.
if (correctDuplication) {
totalCount /= modalDuplicationLevels[d];
}
// System.err.println("Total read count for "+mergedTranscripts[f].name+" is "+totalCount);
float value = totalCount;
if (value == 0 && noValueForZeroCounts) {
value = Float.NaN;
}
// If we're log transforming then we need to set zero values to 0.9
if (logTransform && value == 0 && !noValueForZeroCounts) {
value = 0.9f;
}
// been asked to.
if (applyTranscriptLengthCorrection) {
value /= (totalLength / 1000f);
}
// We also correct by the total read count
if (!rawCounts) {
// System.err.println("True total is "+data[d].getTotalReadCount()+" corrected total is "+correctedTotalCounts[d]);
// If these libraries are paired end then the total number of
// reads is also effectively halved.
float totalReadCount;
// calculated this already, but otherwise we'll take the total count (total length/read length)
if (correctDNAContamination) {
totalReadCount = correctedTotalCounts[d];
} else {
totalReadCount = data[d].getTotalReadLength() / readLengths[d];
}
// If we're correcting for duplication we divide by the duplication level.
if (correctDuplication) {
totalReadCount /= modalDuplicationLevels[d];
}
// Finally we work out millions of reads (single end) or fragments (paired end)
if (pairedEnd) {
totalReadCount /= 2000000f;
} else {
totalReadCount /= 1000000f;
}
// Lastly we divide the value by the total millions of reads to get the globally corrected count.
value /= totalReadCount;
}
// Finally we do the log transform if we've been asked to
if (logTransform) {
value = (float) Math.log(value) / log2;
}
data[d].setValueForProbe(allProbes[currentIndex], value);
}
currentIndex++;
}
}
collection().probeSet().setCurrentQuantitation(getQuantitationDescription(mergeTranscripts, applyTranscriptLengthCorrection, correctDNAContamination, logTransform, rawCounts));
// If we estimated any parameters let's report them.
if (correctDNAContamination || correctDuplication) {
float[] dna = null;
if (correctDNAContamination) {
dna = dnaDensityPerKb;
}
int[] dup = null;
if (correctDuplication) {
dup = modalDuplicationLevels;
}
RNASeqParametersModel model = new RNASeqParametersModel(data, dna, dup);
ReportTableDialog report = new ReportTableDialog(SeqMonkApplication.getInstance(), new Report(null, null) {
@Override
public void run() {
}
@Override
public String name() {
return "RNA-Seq parameter";
}
@Override
public boolean isReady() {
return true;
}
@Override
public JPanel getOptionsPanel() {
return null;
}
@Override
public void generateReport() {
}
}, model);
}
quantitatonComplete();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType in project SeqMonk by s-andrews.
the class SplicingEfficiencyPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
boolean correctReadLength = optionsPanel.correctForReadLength();
boolean logTransform = optionsPanel.logTransform();
boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
// We need to make a lookup for chromosomes from their names for later on.
Chromosome[] chrs = collection().genome().getAllChromosomes();
Hashtable<String, Chromosome> chrLookup = new Hashtable<String, Chromosome>();
for (int c = 0; c < chrs.length; c++) {
chrLookup.put(chrs[c].name(), chrs[c]);
}
// We're going to cache the gene to transcript mappings so we don't have to re-do them
// later on.
Hashtable<String, Vector<Feature>> transcriptLookup = new Hashtable<String, Vector<Feature>>();
Vector<Feature> usedGeneFeatures = new Vector<Feature>();
// We'll record the number of failures so we can report it
int noTranscriptCount = 0;
int noIntronCount = 0;
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] geneFeatures = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedGeneFeatureType());
Arrays.sort(geneFeatures);
Feature[] transcriptFeatures = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedTranscriptFeatureType());
Arrays.sort(transcriptFeatures);
// We need to figure out whether we can match based on name, or whether we need to do it based on location.
boolean matchOnNames = true;
for (int t = 0; t < transcriptFeatures.length; t++) {
if (!transcriptFeatures[t].name().matches("-\\d\\d\\d$")) {
matchOnNames = false;
break;
}
}
if (!matchOnNames) {
progressWarningReceived(new SeqMonkException("Non-Ensembl transcript names found - matching based on position"));
}
if (matchOnNames) {
for (int i = 0; i < transcriptFeatures.length; i++) {
String name = transcriptFeatures[i].name();
name = name.replaceAll("-\\d\\d\\d$", "");
if (!transcriptLookup.containsKey(name)) {
transcriptLookup.put(name, new Vector<Feature>());
}
transcriptLookup.get(name).add(transcriptFeatures[i]);
}
} else {
// We need to go through the genes and transcripts in parallel and match them up based on containment
// and direction.
int lastGeneIndex = 0;
for (int t = 0; t < transcriptFeatures.length; t++) {
for (int g = lastGeneIndex; g < geneFeatures.length; g++) {
if (transcriptFeatures[t].location().start() > geneFeatures[g].location().end()) {
lastGeneIndex = g;
continue;
}
// If the gene is beyond the end of the transcript we can stop looking
if (geneFeatures[g].location().start() > transcriptFeatures[t].location().end()) {
break;
}
// If we're on the same strand and contained within the gene then we have a match
if (geneFeatures[g].location().strand() == transcriptFeatures[t].location().strand() && transcriptFeatures[t].location().start() >= geneFeatures[g].location().start() && transcriptFeatures[t].location().end() <= geneFeatures[g].location().end()) {
String name = geneFeatures[g].name();
if (!transcriptLookup.containsKey(name)) {
transcriptLookup.put(name, new Vector<Feature>());
}
transcriptLookup.get(name).add(transcriptFeatures[t]);
}
}
}
}
for (int f = 0; f < geneFeatures.length; f++) {
if (cancel) {
progressCancelled();
return;
}
if (!transcriptLookup.containsKey(geneFeatures[f].name())) {
++noTranscriptCount;
// No good making features for genes with no transcript.
continue;
}
Feature[] localTranscripts = transcriptLookup.get(geneFeatures[f].name()).toArray(new Feature[0]);
boolean[] exonPositions = new boolean[geneFeatures[f].location().length()];
for (int t = 0; t < localTranscripts.length; t++) {
if (!SequenceRead.overlaps(geneFeatures[f].location().packedPosition(), localTranscripts[t].location().packedPosition())) {
// It's not a match for this gene
continue;
}
if (!(localTranscripts[t].location() instanceof SplitLocation)) {
for (int p = localTranscripts[t].location().start() - geneFeatures[f].location().start(); p <= localTranscripts[t].location().end() - geneFeatures[f].location().start(); p++) {
if (p < 0)
continue;
if (p >= exonPositions.length)
continue;
exonPositions[p] = true;
}
continue;
}
Location[] exons = ((SplitLocation) localTranscripts[t].location()).subLocations();
for (int e = 0; e < exons.length; e++) {
for (int p = exons[e].start() - geneFeatures[f].location().start(); p <= exons[e].end() - geneFeatures[f].location().start(); p++) {
if (p < 0)
continue;
if (p >= exonPositions.length)
continue;
exonPositions[p] = true;
}
}
}
int exonPositionCount = 0;
int intronPositionCount = 0;
for (int p = 0; p < exonPositions.length; p++) {
if (exonPositions[p])
exonPositionCount++;
else
intronPositionCount++;
}
if (exonPositionCount == 0) {
++noTranscriptCount;
continue;
}
if (intronPositionCount == 0) {
++noIntronCount;
continue;
}
probes.add(new Probe(chrs[c], geneFeatures[f].location().start(), geneFeatures[f].location().end(), geneFeatures[f].location().strand(), geneFeatures[f].name()));
usedGeneFeatures.add(geneFeatures[f]);
}
}
if (noTranscriptCount > 0) {
progressWarningReceived(new SeqMonkException("" + noTranscriptCount + " genes had no associated transcripts"));
}
if (noIntronCount > 0) {
progressWarningReceived(new SeqMonkException("" + noIntronCount + " genes had no intron only areas"));
}
Probe[] allProbes = probes.toArray(new Probe[0]);
collection().setProbeSet(new ProbeSet("Gene features over " + optionsPanel.getSelectedGeneFeatureType(), allProbes));
// Having made probes we now need to quantitate them. We'll get the full set of reads for the full gene
// and then work out a combined set of exons. We can then quantitate the bases which fall into introns
// and exons separately and then work out a ratio from there.
QuantitationStrandType readFilter = optionsPanel.readFilter();
Feature[] geneFeatures = usedGeneFeatures.toArray(new Feature[0]);
for (int g = 0; g < geneFeatures.length; g++) {
if (cancel) {
progressCancelled();
return;
}
if (g % 100 == 0) {
progressUpdated("Quantitating features", g, geneFeatures.length);
}
int[] readLengths = new int[data.length];
if (correctReadLength) {
for (int d = 0; d < data.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
}
}
// Find the set of transcripts which relate to this gene.
Feature[] transcripts = transcriptLookup.get(geneFeatures[g].name()).toArray(new Feature[0]);
Vector<Feature> validTranscripts = new Vector<Feature>();
for (int t = 0; t < transcripts.length; t++) {
if (SequenceRead.overlaps(geneFeatures[g].location().packedPosition(), transcripts[t].location().packedPosition())) {
validTranscripts.add(transcripts[t]);
}
}
transcripts = validTranscripts.toArray(new Feature[0]);
// before so if we do then something has gone wrong.
if (transcripts.length == 0) {
throw new IllegalStateException("No transcripts for gene " + geneFeatures[g] + " on chr " + geneFeatures[g].chromosomeName());
}
Vector<Location> allExonsVector = new Vector<Location>();
for (int t = 0; t < transcripts.length; t++) {
if (transcripts[t].location() instanceof SplitLocation) {
Location[] sublocs = ((SplitLocation) transcripts[t].location()).subLocations();
for (int i = 0; i < sublocs.length; i++) allExonsVector.add(sublocs[i]);
} else {
allExonsVector.add(transcripts[t].location());
}
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Cpa6 had "+allExonsVector.size()+" total exons");
// }
Collections.sort(allExonsVector);
// Now go through and make a merged set of exons to remove any redundancy
Vector<Location> mergedExonsVector = new Vector<Location>();
Location lastLocation = null;
Enumeration<Location> en = allExonsVector.elements();
while (en.hasMoreElements()) {
Location l = en.nextElement();
if (lastLocation == null) {
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Setting as first location");
// }
lastLocation = l;
continue;
}
// Check if it's the same as the last, which is likely
if (l.start() == lastLocation.start() && l.end() == lastLocation.end()) {
continue;
}
// Check if they overlap and can be merged
if (l.start() <= lastLocation.end() && l.end() >= lastLocation.start()) {
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Overlaps with last location");
// }
// It overlaps with the last location so merge them
lastLocation = new Location(Math.min(l.start(), lastLocation.start()), Math.max(l.end(), lastLocation.end()), geneFeatures[g].location().strand());
// }
continue;
}
// Start a new location
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Doesn't overlap - adding last location and creating new one");
// }
mergedExonsVector.add(lastLocation);
lastLocation = l;
}
if (lastLocation != null) {
mergedExonsVector.add(lastLocation);
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Cpa6 had "+mergedExonsVector.size()+" merged exons");
// }
// Now we can start the quantitation.
int intronLength = geneFeatures[g].location().length();
int exonLength = 0;
Location[] subLocs = mergedExonsVector.toArray(new Location[0]);
for (int l = 0; l < subLocs.length; l++) {
exonLength += subLocs[l].length();
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// System.err.println("Cpa6 total intron length="+intronLength+" exon length="+exonLength);
// }
intronLength -= exonLength;
if (intronLength <= 0) {
progressWarningReceived(new IllegalStateException("Intron length of " + intronLength + " for gene " + geneFeatures[g]));
for (int d = 0; d < data.length; d++) {
data[d].setValueForProbe(allProbes[g], Float.NaN);
}
continue;
}
if (exonLength <= 0) {
progressWarningReceived(new IllegalStateException("Exon length of " + exonLength + " for gene " + geneFeatures[g]));
for (int d = 0; d < data.length; d++) {
data[d].setValueForProbe(allProbes[g], Float.NaN);
}
continue;
}
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
int totalIntronCount = 0;
int totalExonCount = 0;
long[] reads = data[d].getReadsForProbe(new Probe(chrLookup.get(geneFeatures[g].chromosomeName()), geneFeatures[g].location().start(), geneFeatures[g].location().end()));
for (int r = 0; r < reads.length; r++) {
if (!readFilter.useRead(geneFeatures[g].location(), reads[r])) {
continue;
}
int overlap = (Math.min(geneFeatures[g].location().end(), SequenceRead.end(reads[r])) - Math.max(geneFeatures[g].location().start(), SequenceRead.start(reads[r]))) + 1;
totalIntronCount += overlap;
// Now we see if we overlap with any of the exons
for (int s = 0; s < subLocs.length; s++) {
if (subLocs[s].start() <= SequenceRead.end(reads[r]) && subLocs[s].end() >= SequenceRead.start(reads[r])) {
int exonOverlap = (Math.min(subLocs[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocs[s].start(), SequenceRead.start(reads[r]))) + 1;
if (exonOverlap > 0) {
totalExonCount += exonOverlap;
totalIntronCount -= exonOverlap;
}
}
}
}
if (totalIntronCount < 0) {
progressWarningReceived(new SeqMonkException("Intron count of " + totalIntronCount + " for " + geneFeatures[g].name()));
continue;
}
// reads which this count could comprise, rounding down to a whole number.
if (correctReadLength) {
totalIntronCount /= readLengths[d];
totalExonCount /= readLengths[d];
}
float intronValue = totalIntronCount;
float exonValue = totalExonCount;
// If we're log transforming then we need to set zero values to 0.9
if (logTransform && intronValue == 0) {
intronValue = 0.9f;
}
if (logTransform && exonValue == 0) {
exonValue = 0.9f;
}
// been asked to.
if (applyTranscriptLengthCorrection) {
intronValue /= (intronLength / 1000f);
exonValue /= (exonLength / 1000f);
}
// We also correct by the total read count, or length
if (correctReadLength) {
intronValue /= (data[d].getTotalReadCount() / 1000000f);
exonValue /= (data[d].getTotalReadCount() / 1000000f);
} else {
intronValue /= (data[d].getTotalReadLength() / 1000000f);
exonValue /= (data[d].getTotalReadLength() / 1000000f);
}
// Finally we do the log transform if we've been asked to
if (logTransform) {
if (intronValue == 0) {
intronValue = 0.0001f;
}
if (exonValue == 0) {
exonValue = 0.0001f;
}
intronValue = (float) Math.log(intronValue) / log2;
exonValue = (float) Math.log(exonValue) / log2;
}
// Now we check what value they actually wanted to record
switch(optionsPanel.quantitationType()) {
case (EXONS):
{
data[d].setValueForProbe(allProbes[g], exonValue);
break;
}
case (INTRONS):
{
data[d].setValueForProbe(allProbes[g], intronValue);
break;
}
case (RATIO):
{
if (logTransform) {
data[d].setValueForProbe(allProbes[g], exonValue - intronValue);
} else {
if (intronValue == 0) {
intronValue = 0.0001f;
}
data[d].setValueForProbe(allProbes[g], exonValue / intronValue);
}
break;
}
default:
throw new IllegalStateException("Unknonwn quantitation type " + optionsPanel.quantitationType());
}
// if (geneFeatures[f].name().equals("Cpa6")) {
// try {
// System.err.println("Stored value was "+data[d].getValueForProbe(allProbes[currentIndex]));
// } catch (SeqMonkException e) {
// e.printStackTrace();
// }
// }
}
}
StringBuffer quantitationDescription = new StringBuffer();
quantitationDescription.append("Splicing efficiency quantitation");
if (correctReadLength) {
quantitationDescription.append(" counting reads");
} else {
quantitationDescription.append(" counting bases");
}
if (optionsPanel.logTransform()) {
quantitationDescription.append(" log transformed");
}
if (applyTranscriptLengthCorrection) {
quantitationDescription.append(" correcting for feature length");
}
// TODO: Add more description
collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
quantitatonComplete();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.QuantitationStrandType in project SeqMonk by s-andrews.
the class AntisenseTranscriptionPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
double pValue = optionsPanel.pValue();
QuantitationStrandType readFilter = optionsPanel.readFilter();
long[] senseCounts = new long[data.length];
long[] antisenseCounts = new long[data.length];
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Getting total antisense rate for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] features = getValidFeatures(chrs[c]);
for (int f = 0; f < features.length; f++) {
if (cancel) {
progressCancelled();
return;
}
Probe p = new Probe(chrs[c], features[f].location().start(), features[f].location().end(), features[f].location().strand(), features[f].name());
probes.add(p);
for (int d = 0; d < data.length; d++) {
long[] reads = data[d].getReadsForProbe(p);
for (int r = 0; r < reads.length; r++) {
if (readFilter.useRead(p, reads[r])) {
senseCounts[d] += SequenceRead.length(reads[r]);
} else {
antisenseCounts[d] += SequenceRead.length(reads[r]);
}
}
}
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
collection().setProbeSet(new ProbeSet("Features over " + optionsPanel.getSelectedFeatureType(), allProbes));
// Now we can work out the overall antisense rate
double[] antisenseProbability = new double[data.length];
for (int d = 0; d < data.length; d++) {
System.err.println("Antisense counts are " + antisenseCounts[d] + " sense counts are " + senseCounts[d]);
antisenseProbability[d] = antisenseCounts[d] / (double) (antisenseCounts[d] + senseCounts[d]);
System.err.println("Antisense probability for " + data[d].name() + " is " + antisenseProbability[d]);
}
// Now we can quantitate each individual feature and test for whether it is significantly
// showing antisense expression
ArrayList<Vector<ProbeTTestValue>> significantProbes = new ArrayList<Vector<ProbeTTestValue>>();
for (int d = 0; d < data.length; d++) {
significantProbes.add(new Vector<ProbeTTestValue>());
}
int[] readLengths = new int[data.length];
for (int d = 0; d < readLengths.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
System.err.println("For " + data[d].name() + " max read len is " + readLengths[d]);
}
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
Probe[] thisChrProbes = collection().probeSet().getProbesForChromosome(chrs[c]);
for (int p = 0; p < thisChrProbes.length; p++) {
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
long senseCount = 0;
long antisenseCount = 0;
long[] reads = data[d].getReadsForProbe(thisChrProbes[p]);
for (int r = 0; r < reads.length; r++) {
if (readFilter.useRead(thisChrProbes[p], reads[r])) {
// TODO: Just count overlap?
senseCount += SequenceRead.length(reads[r]);
} else {
antisenseCount += SequenceRead.length(reads[r]);
}
}
// if (thisChrProbes[p].name().equals("RP4-798A10.2")) {
// System.err.println("Raw base counts are sense="+senseCount+" anti="+antisenseCount+" from "+reads.length+" reads");
// }
int senseReads = (int) (senseCount / readLengths[d]);
int antisenseReads = (int) (antisenseCount / readLengths[d]);
// if (thisChrProbes[p].name().equals("RP4-798A10.2")) {
// System.err.println("Raw read counts are sense="+senseReads+" anti="+antisenseReads+" from "+reads.length+" reads");
// }
BinomialDistribution bd = new BinomialDistribution(senseReads + antisenseReads, antisenseProbability[d]);
// Since the binomial distribution gives the probability of getting a value higher than
// this we need to subtract one so we get the probability of this or higher.
double thisPValue = 1 - bd.cumulativeProbability(antisenseReads - 1);
if (antisenseReads == 0)
thisPValue = 1;
// We have to add all results at this stage so we don't mess up the multiple
// testing correction later on.
significantProbes.get(d).add(new ProbeTTestValue(thisChrProbes[p], thisPValue));
double expected = ((senseReads + antisenseReads) * antisenseProbability[d]);
if (expected < 1)
expected = 1;
float obsExp = antisenseReads / (float) expected;
data[d].setValueForProbe(thisChrProbes[p], obsExp);
}
}
}
// filtering those which pass our p-value cutoff
for (int d = 0; d < data.length; d++) {
ProbeTTestValue[] ttestResults = significantProbes.get(d).toArray(new ProbeTTestValue[0]);
BenjHochFDR.calculateQValues(ttestResults);
ProbeList newList = new ProbeList(collection().probeSet(), "Antisense < " + pValue + " in " + data[d].name(), "Probes showing significant antisense transcription from a basal level of " + antisenseProbability[d] + " with a cutoff of " + pValue, "FDR");
for (int i = 0; i < ttestResults.length; i++) {
if (ttestResults[i].probe.name().equals("RP4-798A10.2")) {
System.err.println("Raw p=" + ttestResults[i].p + " q=" + ttestResults[i].q);
}
if (ttestResults[i].q < pValue) {
newList.addProbe(ttestResults[i].probe, (float) ttestResults[i].q);
}
}
}
StringBuffer quantitationDescription = new StringBuffer();
quantitationDescription.append("Antisense transcription pipeline quantitation ");
quantitationDescription.append(". Directionality was ");
quantitationDescription.append(optionsPanel.libraryTypeBox.getSelectedItem());
if (optionsPanel.ignoreOverlaps()) {
quantitationDescription.append(". Ignoring existing overlaps");
}
quantitationDescription.append(". P-value cutoff was ");
quantitationDescription.append(optionsPanel.pValue());
collection().probeSet().setCurrentQuantitation(quantitationDescription.toString());
quantitatonComplete();
}
Aggregations