use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection in project SeqMonk by s-andrews.
the class VisibleStoresParser method processHiCDataStore.
private DataSet processHiCDataStore(DataStore store) throws SeqMonkException {
int extendBy = prefs.extendReads();
boolean reverse = prefs.reverseReads();
boolean removeStrand = prefs.removeStrandInfo();
PairedDataSet newData = new PairedDataSet(store.name() + "_reimport", "Reimported from " + store.name(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
// Now process the data
Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
progressUpdated("Processing " + store.name() + " chr " + chrs[c].name(), c, chrs.length);
// We make the call to get exportable reads so we don't duplicate reads
// when we export things
HiCHitCollection hitCollection = ((HiCDataStore) store).getExportableReadsForChromosome(chrs[c]);
String[] localChromosomes = hitCollection.getChromosomeNamesWithHits();
for (int c2 = 0; c2 < localChromosomes.length; c2++) {
Chromosome localChromosome = SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome();
long[] sourceReads = hitCollection.getSourcePositionsForChromosome(localChromosomes[c2]);
long[] hitReads = hitCollection.getHitPositionsForChromosome(localChromosomes[c2]);
for (int r = 0; r < sourceReads.length; r++) {
if (cancel) {
progressCancelled();
return null;
}
if (downsample && downsampleProbabilty < 1) {
if (Math.random() > downsampleProbabilty) {
continue;
}
}
if ((!(reverse || removeStrand)) && extendBy == 0 && (!filterByFeature)) {
// Just add them as they are
newData.addData(chrs[c], sourceReads[r]);
newData.addData(localChromosome, hitReads[r]);
}
Feature[] features = null;
if (filterByFeature) {
features = collection.genome().annotationCollection().getFeaturesForType(chrs[c], featureType);
Arrays.sort(features);
}
int currentFeaturePostion = 0;
if (filterByFeature) {
// See if we're comparing against the right feature
while (SequenceRead.start(sourceReads[r]) > features[currentFeaturePostion].location().end() && currentFeaturePostion < (features.length - 1)) {
currentFeaturePostion++;
}
// Test to see if we overlap
if (SequenceRead.overlaps(sourceReads[r], features[currentFeaturePostion].location().packedPosition())) {
if (excludeFeature)
continue;
} else {
if (!excludeFeature)
continue;
}
}
int sourceStart = SequenceRead.start(sourceReads[r]);
int sourceEend = SequenceRead.end(sourceReads[r]);
int sourceStrand = SequenceRead.strand(sourceReads[r]);
int hitStart = SequenceRead.start(sourceReads[r]);
int hitEend = SequenceRead.end(hitReads[r]);
int hitStrand = SequenceRead.strand(hitReads[r]);
if (reverse) {
if (sourceStrand == Location.FORWARD) {
sourceStrand = Location.REVERSE;
} else if (sourceStrand == Location.REVERSE) {
sourceStrand = Location.FORWARD;
}
if (hitStrand == Location.FORWARD) {
hitStrand = Location.REVERSE;
} else if (hitStrand == Location.REVERSE) {
hitStrand = Location.FORWARD;
}
}
if (removeStrand) {
sourceStrand = Location.UNKNOWN;
hitStrand = Location.UNKNOWN;
}
if (extendBy > 0) {
if (sourceStrand == Location.FORWARD) {
sourceEend += extendBy;
} else if (sourceStrand == Location.REVERSE) {
sourceStart -= extendBy;
}
if (hitStrand == Location.FORWARD) {
hitEend += extendBy;
} else if (hitStrand == Location.REVERSE) {
hitStart -= extendBy;
}
}
// We also don't allow readings which are beyond the end of the chromosome
if (sourceEend > chrs[c].length()) {
int overrun = sourceEend - chrs[c].length();
progressWarningReceived(new SeqMonkException("Reading position " + sourceEend + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
if (hitEend > localChromosome.length()) {
int overrun = hitEend - SeqMonkApplication.getInstance().dataCollection().genome().getChromosome(localChromosomes[c2]).chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + hitEend + " was " + overrun + "bp beyond the end of chr" + localChromosome.name() + " (" + chrs[c].length() + ")"));
continue;
}
// We can now make the new readings
long sourceRead = SequenceRead.packPosition(sourceStart, sourceEend, sourceStrand);
long hitRead = SequenceRead.packPosition(hitStart, hitEend, hitStrand);
if (!prefs.isHiC()) {
// HiC additions are deferred until we know the other end is OK too.
newData.addData(chrs[c], sourceRead);
newData.addData(localChromosome, hitRead);
}
}
}
}
return newData;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection in project SeqMonk by s-andrews.
the class PairedDataSet method getHiCReadsForProbe.
public HiCHitCollection getHiCReadsForProbe(Probe p) {
HiCHitCollection hitCollection = new HiCHitCollection(p.chromosome().name());
HiCHitCollection lastCachedHits = updateCacheToChromosome(p.chromosome());
if (lastChromosomeHitNames.length != lastIndices.length) {
throw new IllegalArgumentException("Chr names " + lastChromosomeHitNames.length + " but indices " + lastIndices.length);
}
for (int c = 0; c < lastChromosomeHitNames.length; c++) {
long[] sourceReads = lastCachedHits.getSourcePositionsForChromosome(lastChromosomeHitNames[c]);
long[] hitReads = lastCachedHits.getHitPositionsForChromosome(lastChromosomeHitNames[c]);
if (sourceReads.length != hitReads.length) {
throw new IllegalStateException("Source reads and hit reads had different lengths for chr " + lastChromosomeHitNames[c] + " " + sourceReads.length + " vs " + hitReads.length + " current values are " + lastCachedHits.getSourcePositionsForChromosome(lastChromosomeHitNames[c]).length + " and " + lastCachedHits.getSourcePositionsForChromosome(lastChromosomeHitNames[c]).length);
}
if (sourceReads.length == 0)
continue;
int startPos = lastIndices[c];
if (startPos < 0) {
// System.err.println("Started looking for reads at position "+startPos+" (below 0)");
// Shouldn't happen, but better safe than sorry
startPos = 0;
}
if (startPos >= sourceReads.length) {
startPos = sourceReads.length - 1;
}
// We can now backtrack from here to find the first possible hit
boolean setCache = false;
// System.err.println("Starting chr "+chromosomes[c]+" at index "+startPos+" position "+SequenceRead.midPoint(sourceReads[startPos]));
for (int i = startPos - 1; i >= 0; i--) {
// Reads come in order, so we can stop when we've seen enough.
if (SequenceRead.end(sourceReads[i]) < p.start() - 5000) {
// System.err.println("Finished backtracking at index "+i+" position "+SequenceRead.midPoint(sourceReads[i]));
break;
}
if (SequenceRead.overlaps(sourceReads[i], p.packedPosition())) {
// They overlap.
// ++hitCount;
hitCollection.addHit(lastChromosomeHitNames[c], sourceReads[i], hitReads[i]);
if (i != startPos - 1 && !setCache) {
// This is the last read so set the cache position
lastIndices[c] = i;
setCache = true;
}
}
}
for (int i = startPos; i < sourceReads.length; i++) {
// Reads come in order, so we can stop when we've seen enough.
if (SequenceRead.start(sourceReads[i]) > p.end()) {
// System.err.println("Finished forward tracking at index "+i+" position "+SequenceRead.midPoint(sourceReads[i]));
break;
}
if (SequenceRead.overlaps(sourceReads[i], p.packedPosition())) {
// They overlap.
// ++hitCount;
hitCollection.addHit(lastChromosomeHitNames[c], sourceReads[i], hitReads[i]);
lastChromosome = p.chromosome();
lastIndices[c] = i;
}
}
}
return hitCollection;
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection in project SeqMonk by s-andrews.
the class PairedDataSet method getHiCReadsForChromosome.
public synchronized HiCHitCollection getHiCReadsForChromosome(Chromosome c) {
if (!isFinalised)
finalise();
if (readData.containsKey(c)) {
if (readData.get(c).hitCollection != null) {
// We're not caching, so just give them back the reads
return readData.get(c).hitCollection;
} else {
// Check if we've cached this data
if (lastCachedHits != null && lastCachedHits.getSourceChromosomeName() == c.name()) {
return lastCachedHits;
}
// Signal that we're accessing the cache so the cache icon can blink!
SeqMonkApplication.getInstance().cacheUsed();
// temp file
try {
ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(new FileInputStream(readData.get(c).tempFile)));
lastCachedHits = (HiCHitCollection) ois.readObject();
ois.close();
lastChromosomeHitNames = lastCachedHits.getChromosomeNamesWithHits();
lastIndices = new int[lastChromosomeHitNames.length];
return lastCachedHits;
} catch (Exception e) {
throw new IllegalStateException(e);
}
}
} else {
lastChromosomeHitNames = new String[0];
lastCachedHits = new HiCHitCollection(c.name());
lastIndices = new int[0];
return lastCachedHits;
}
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection in project SeqMonk by s-andrews.
the class HiCReadCountQuantitation method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
Probe[] probes = application.dataCollection().probeSet().getAllProbes();
float[] corrections = new float[data.length];
if (correctTotal) {
float largest = 0;
if (correctPerMillion) {
largest = 1000000;
}
for (int d = 0; d < data.length; d++) {
// if (correctOnlyInProbes) {
// corrections[d] = getTotalCountInProbes(data[d],probes);
// }
// else {
corrections[d] = ((DataStore) data[d]).getTotalReadCount();
// }
if (d == 0 && !correctPerMillion) {
largest = corrections[d];
} else {
if (!correctPerMillion && corrections[d] > largest) {
largest = corrections[d];
}
}
}
// We correct everything by the largest count
for (int d = 0; d < corrections.length; d++) {
corrections[d] = largest / corrections[d];
}
}
for (int p = 0; p < probes.length; p++) {
// See if we need to quit
if (cancel) {
progressCancelled();
return;
}
double lengthCorrection = 1;
if (correctLength) {
// We assume a 'normal' probe length of 1kb
lengthCorrection = (double) 1000 / probes[p].length();
}
progressUpdated(p, probes.length);
for (int d = 0; d < data.length; d++) {
double count = 0;
HiCHitCollection hiCHits = data[d].getHiCReadsForProbe(probes[p]);
String[] chromosomeNames = hiCHits.getChromosomeNamesWithHits();
for (int c = 0; c < chromosomeNames.length; c++) {
long[] sourceReads = hiCHits.getSourcePositionsForChromosome(chromosomeNames[c]);
long[] hitReads = hiCHits.getHitPositionsForChromosome(chromosomeNames[c]);
for (int r = 0; r < sourceReads.length; r++) {
// Check if we can ignore this one
if (removeDuplicates) {
if (r > 0 && sourceReads[r] == sourceReads[r - 1] && hitReads[r] == hitReads[r - 1])
continue;
}
if (SequenceRead.overlaps(hitReads[r], probes[p].packedPosition()) && SequenceRead.compare(hitReads[r], sourceReads[r]) > 0) {
continue;
}
++count;
}
}
if (logTransform && count == 0) {
count = 0.9;
}
if (correctTotal) {
count *= corrections[d];
}
if (correctLength) {
count *= lengthCorrection;
}
if (logTransform) {
count = (float) Math.log(count) / log2;
}
// TODO: This is icky since the inheritance between HiCDataStore and DataStore
// isn't properly sorted out.
((DataStore) data[d]).setValueForProbe(probes[p], (float) count);
}
}
quantitatonComplete();
}
use of uk.ac.babraham.SeqMonk.DataTypes.Sequence.HiCHitCollection in project SeqMonk by s-andrews.
the class ChromosomeDataTrack method getHiCPixelCounts.
private int[][] getHiCPixelCounts() {
// Check if we're able to use the last cached data.
if (lastWidth == width && viewer.currentStart() == lastStart && viewer.currentEnd() == lastEnd && cachedHiCPixelCounts != null) {
return cachedHiCPixelCounts;
}
lastWidth = width;
lastStart = viewer.currentStart();
lastEnd = viewer.currentEnd();
cachedHiCPixelCounts = new int[width + 1][width + 1];
if (lastChromosome != viewer.chromosome()) {
HiCHitCollection col = ((HiCDataStore) data).getHiCReadsForChromosome(viewer.chromosome());
lastSourceHiC = col.getSourcePositionsForChromosome(viewer.chromosome().name());
lastHitHic = col.getHitPositionsForChromosome(viewer.chromosome().name());
lastChromosome = viewer.chromosome();
// Check if the HiC collection is sorted.
// for (int i=1;i<lastSourceHiC.length;i++) {
// if (SequenceRead.start(lastSourceHiC[i-1]) > SequenceRead.start(lastSourceHiC[i])) {
// throw new IllegalStateException("HiC set isn't sorted");
// }
// }
}
// No need to do anything more if there's no data
if (lastSourceHiC.length == 0)
return cachedHiCPixelCounts;
// Go back until we're sure we're not going to see any more reads
for (int i = lastInteractionIndexStart; i >= 0; i--) {
if (i >= lastSourceHiC.length)
continue;
if (SequenceRead.start(lastSourceHiC[i]) < viewer.currentStart() - data.getMaxReadLength()) {
lastInteractionIndexStart = i;
break;
}
}
// Go forward until we hit our first read
for (int i = lastInteractionIndexStart; i < reads.length; i++) {
if (SequenceRead.start(lastSourceHiC[i]) >= viewer.currentStart() - data.getMaxReadLength()) {
lastInteractionIndexStart = i;
break;
}
}
for (int i = lastInteractionIndexStart; i < lastSourceHiC.length; i++) {
// Check if both of the ends are in the current window
int sourceMidPoint = SequenceRead.midPoint(lastSourceHiC[i]);
if (SequenceRead.start(lastSourceHiC[i]) > viewer.currentEnd()) {
break;
}
if (sourceMidPoint < viewer.currentStart() || sourceMidPoint > viewer.currentEnd()) {
continue;
}
int hitMidPoint = SequenceRead.midPoint(lastHitHic[i]);
if (hitMidPoint < viewer.currentStart() || hitMidPoint > viewer.currentEnd()) {
continue;
}
int sourcePixel = bpToPixel(Math.min(sourceMidPoint, hitMidPoint));
int hitPixel = bpToPixel(Math.max(sourceMidPoint, hitMidPoint));
// System.err.println("Found valid interaction between "+sourceMidPoint+" and "+hitMidPoint+" which is "+sourcePixel+" to "+hitPixel);
// Now we can add the interaction to the matrix
cachedHiCPixelCounts[sourcePixel][hitPixel]++;
}
return cachedHiCPixelCounts;
}
Aggregations