use of uk.ac.babraham.SeqMonk.Reports.Report in project SeqMonk by s-andrews.
the class RNASeqPipeline method startPipeline.
protected void startPipeline() {
// We first need to generate probes over all of the features listed in
// the feature types. The probes should cover the whole area of the
// feature regardless of where it splices.
Vector<Probe> probes = new Vector<Probe>();
boolean mergeTranscripts = optionsPanel.mergeTranscripts();
boolean pairedEnd = optionsPanel.pairedEnd();
boolean logTransform = optionsPanel.logTransform();
boolean applyTranscriptLengthCorrection = optionsPanel.applyTranscriptLengthCorrection();
boolean rawCounts = optionsPanel.rawCounts();
boolean noValueForZeroCounts = optionsPanel.noValueForZeroCounts();
boolean correctDNAContamination = optionsPanel.correctForDNAContamination();
boolean correctDuplication = optionsPanel.correctForDNADuplication();
if (rawCounts) {
logTransform = false;
applyTranscriptLengthCorrection = false;
noValueForZeroCounts = false;
}
Chromosome[] chrs = collection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
// System.err.println("Processing chr "+chrs[c].name());
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Making features for chr" + chrs[c].name(), c, chrs.length * 2);
Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
Arrays.sort(features);
FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
for (int f = 0; f < mergedTranscripts.length; f++) {
if (cancel) {
progressCancelled();
return;
}
probes.add(new Probe(chrs[c], mergedTranscripts[f].start(), mergedTranscripts[f].end(), mergedTranscripts[f].strand(), mergedTranscripts[f].name()));
}
}
Probe[] allProbes = probes.toArray(new Probe[0]);
Arrays.sort(allProbes);
if (collection().probeSet() == null) {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
} else {
Probe[] existingProbes = collection().probeSet().getAllProbes();
Arrays.sort(existingProbes);
if (allProbes.length != existingProbes.length) {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
} else {
// Check the positions against the new ones
boolean areTheyTheSame = true;
for (int p = 0; p < allProbes.length; p++) {
if (allProbes[p].packedPosition() != existingProbes[p].packedPosition()) {
areTheyTheSame = false;
break;
}
}
if (areTheyTheSame) {
allProbes = existingProbes;
} else {
collection().setProbeSet(new ProbeSet("Transcript features over " + optionsPanel.getSelectedFeatureType(), allProbes));
}
}
}
// If we're correcting for DNA contamination we need to work out the average density of
// reads in intergenic regions
float[] dnaDensityPerKb = new float[data.length];
int[] correctedTotalCounts = new int[data.length];
if (correctDNAContamination) {
// We need to make interstitial probes to the set we already have, ignoring those at the end of chromosomes
Vector<Probe> intergenicProbes = new Vector<Probe>();
Chromosome lastChr = allProbes[0].chromosome();
for (int p = 1; p < allProbes.length; p++) {
if (allProbes[p].chromosome() != lastChr) {
lastChr = allProbes[p].chromosome();
continue;
}
// See if there's a gap back to the last probe
if (allProbes[p].start() > allProbes[p - 1].end()) {
if (allProbes[p].start() - allProbes[p - 1].end() < 1000) {
// Don't bother with really short probes
continue;
}
intergenicProbes.add(new Probe(lastChr, allProbes[p - 1].end() + 1, allProbes[p].start() - 1));
}
}
Probe[] allIntergenicProbes = intergenicProbes.toArray(new Probe[0]);
for (int d = 0; d < data.length; d++) {
progressUpdated("Quantitating DNA contamination", 1, 2);
float[] densities = new float[allIntergenicProbes.length];
for (int p = 0; p < allIntergenicProbes.length; p++) {
densities[p] = data[d].getReadsForProbe(allIntergenicProbes[p]).length / (allIntergenicProbes[p].length() / 1000f);
}
dnaDensityPerKb[d] = SimpleStats.median(densities);
}
// Work out adjusted total counts having subtracted the DNA contamination
for (int d = 0; d < data.length; d++) {
int predictedContamination = (int) (dnaDensityPerKb[d] * (SeqMonkApplication.getInstance().dataCollection().genome().getTotalGenomeLength() / 1000));
int correctedTotalReadCount = data[d].getTotalReadCount() - predictedContamination;
correctedTotalCounts[d] = correctedTotalReadCount;
}
// Halve the density if they're doing a directional quantitation
if (optionsPanel.isDirectional()) {
for (int i = 0; i < dnaDensityPerKb.length; i++) {
dnaDensityPerKb[i] /= 2;
}
}
// Halve the density if the libraries are paired end
if (pairedEnd) {
for (int i = 0; i < dnaDensityPerKb.length; i++) {
dnaDensityPerKb[i] /= 2;
}
}
}
// If we're correcting for duplication we need to work out the modal count depth in
// intergenic regions
int[] modalDuplicationLevels = new int[data.length];
if (correctDuplication) {
for (int d = 0; d < data.length; d++) {
progressUpdated("Quantitating DNA duplication", 1, 2);
// We're not going to look at depths which are > 200. If it's that duplicated
// then there's no point continuing anyway.
int[] depthCount = new int[200];
for (int p = 0; p < allProbes.length; p++) {
long[] reads = data[d].getReadsForProbe(allProbes[p]);
int currentCount = 0;
for (int r = 1; r < reads.length; r++) {
if (reads[r] == reads[r - 1]) {
++currentCount;
} else {
if (currentCount > 0 && currentCount < 200) {
++depthCount[currentCount];
}
currentCount = 1;
}
}
}
// Find the modal depth in intergenic regions. This is the best estimate
// of duplication
// Since unique reads turn up all over the place even in duplicated
// data we say that if unique reads are higher than the sum of 2-10 there
// is no duplication
int twoTenSum = 0;
for (int i = 2; i <= 10; i++) {
twoTenSum += depthCount[i];
}
if (depthCount[1] > twoTenSum) {
modalDuplicationLevels[d] = 1;
} else {
int highestDepth = 0;
int bestDupGuess = 1;
for (int i = 2; i < depthCount.length; i++) {
// System.err.println("For depth "+i+" count was "+depthCount[i]);
if (depthCount[i] > highestDepth) {
bestDupGuess = i;
highestDepth = depthCount[i];
}
}
modalDuplicationLevels[d] = bestDupGuess;
}
}
}
// Having made probes we now need to quantitate them. We'll fetch the
// probes overlapping each sub-feature and then aggregate these together
// to get the final quantitation.
QuantitationStrandType readFilter = optionsPanel.readFilter();
int currentIndex = 0;
for (int c = 0; c < chrs.length; c++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Quantitating features on chr" + chrs[c].name(), chrs.length + c, chrs.length * 2);
Feature[] features = collection().genome().annotationCollection().getFeaturesForType(chrs[c], optionsPanel.getSelectedFeatureType());
Arrays.sort(features);
FeatureGroup[] mergedTranscripts = mergeTranscripts(features, mergeTranscripts);
int[] readLengths = new int[data.length];
for (int d = 0; d < data.length; d++) {
readLengths[d] = data[d].getMaxReadLength();
// actual length.
if (pairedEnd) {
readLengths[d] *= 2;
}
}
for (int f = 0; f < mergedTranscripts.length; f++) {
Location[] subLocations = mergedTranscripts[f].getSubLocations();
int totalLength = 0;
// Find the total length of all of the exons
for (int s = 0; s < subLocations.length; s++) {
totalLength += subLocations[s].length();
}
for (int d = 0; d < data.length; d++) {
if (cancel) {
progressCancelled();
return;
}
long totalCount = 0;
for (int s = 0; s < subLocations.length; s++) {
long[] reads = data[d].getReadsForProbe(new Probe(chrs[c], subLocations[s].start(), subLocations[s].end()));
for (int r = 0; r < reads.length; r++) {
if (!readFilter.useRead(subLocations[s], reads[r])) {
continue;
}
int overlap = (Math.min(subLocations[s].end(), SequenceRead.end(reads[r])) - Math.max(subLocations[s].start(), SequenceRead.start(reads[r]))) + 1;
totalCount += overlap;
}
}
// Now we correct the count by the total length of reads in the data and by
// the length of the split parts of the probe, and assign this to the probe.
// As we're correcting for read length then we work out the whole number of
// reads which this count could comprise, rounding down to a whole number.
totalCount /= readLengths[d];
// We can now subtract the DNA contamination prediction.
if (correctDNAContamination) {
int predictedContamination = (int) ((totalLength / 1000f) * dnaDensityPerKb[d]);
totalCount -= predictedContamination;
// Makes no sense to have negative counts
if (totalCount < 0)
totalCount = 0;
}
// ..and we can divide by the duplication level if we know it.
if (correctDuplication) {
totalCount /= modalDuplicationLevels[d];
}
// System.err.println("Total read count for "+mergedTranscripts[f].name+" is "+totalCount);
float value = totalCount;
if (value == 0 && noValueForZeroCounts) {
value = Float.NaN;
}
// If we're log transforming then we need to set zero values to 0.9
if (logTransform && value == 0 && !noValueForZeroCounts) {
value = 0.9f;
}
// been asked to.
if (applyTranscriptLengthCorrection) {
value /= (totalLength / 1000f);
}
// We also correct by the total read count
if (!rawCounts) {
// System.err.println("True total is "+data[d].getTotalReadCount()+" corrected total is "+correctedTotalCounts[d]);
// If these libraries are paired end then the total number of
// reads is also effectively halved.
float totalReadCount;
// calculated this already, but otherwise we'll take the total count (total length/read length)
if (correctDNAContamination) {
totalReadCount = correctedTotalCounts[d];
} else {
totalReadCount = data[d].getTotalReadLength() / readLengths[d];
}
// If we're correcting for duplication we divide by the duplication level.
if (correctDuplication) {
totalReadCount /= modalDuplicationLevels[d];
}
// Finally we work out millions of reads (single end) or fragments (paired end)
if (pairedEnd) {
totalReadCount /= 2000000f;
} else {
totalReadCount /= 1000000f;
}
// Lastly we divide the value by the total millions of reads to get the globally corrected count.
value /= totalReadCount;
}
// Finally we do the log transform if we've been asked to
if (logTransform) {
value = (float) Math.log(value) / log2;
}
data[d].setValueForProbe(allProbes[currentIndex], value);
}
currentIndex++;
}
}
collection().probeSet().setCurrentQuantitation(getQuantitationDescription(mergeTranscripts, applyTranscriptLengthCorrection, correctDNAContamination, logTransform, rawCounts));
// If we estimated any parameters let's report them.
if (correctDNAContamination || correctDuplication) {
float[] dna = null;
if (correctDNAContamination) {
dna = dnaDensityPerKb;
}
int[] dup = null;
if (correctDuplication) {
dup = modalDuplicationLevels;
}
RNASeqParametersModel model = new RNASeqParametersModel(data, dna, dup);
ReportTableDialog report = new ReportTableDialog(SeqMonkApplication.getInstance(), new Report(null, null) {
@Override
public void run() {
}
@Override
public String name() {
return "RNA-Seq parameter";
}
@Override
public boolean isReady() {
return true;
}
@Override
public JPanel getOptionsPanel() {
return null;
}
@Override
public void generateReport() {
}
}, model);
}
quantitatonComplete();
}
Aggregations