use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class BAMFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
pairedEndImport = prefs.pairedEnd();
pairedEndDistance = prefs.pairDistanceCutoff();
separateSplicedReads = prefs.splitSplicedReads();
importIntrons = prefs.importIntrons();
extendBy = prefs.extendReads();
minMappingQuality = prefs.minMappingQuality();
primaryAlignmentsOnly = prefs.primaryAlignmentsOnly();
File[] samFiles = getFiles();
DataSet[] newData = new DataSet[samFiles.length];
try {
for (int f = 0; f < samFiles.length; f++) {
SAMFileReader.setDefaultValidationStringency(SAMFileReader.ValidationStringency.SILENT);
SAMFileReader inputSam = new SAMFileReader(samFiles[f]);
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(samFiles[f].getName(), samFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
// Now process the file
// A flag we can set to skip the next record if we're getting
// out of sync during single end HiC import.
boolean skipNext = false;
for (SAMRecord samRecord : inputSam) {
if (skipNext) {
skipNext = false;
continue;
}
if (cancel) {
inputSam.close();
progressCancelled();
return;
}
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + samFiles[f].getName(), f, samFiles.length);
}
if (pairedEndImport && !samRecord.getReadPairedFlag()) {
progressWarningReceived(new SeqMonkException("Data was single ended during paired end import"));
continue;
}
if (samRecord.getReadUnmappedFlag()) {
// There was no match
continue;
}
if (primaryAlignmentsOnly && samRecord.getNotPrimaryAlignmentFlag()) {
// We only want primary alignments
continue;
}
if (pairedEndImport && !separateSplicedReads && samRecord.getMateUnmappedFlag()) {
// The mate was unmapped, so we can't build a pair. This doesn't matter if we're splitting spliced reads.
continue;
}
if (minMappingQuality > 0 && samRecord.getMappingQuality() < minMappingQuality) {
if (prefs.isHiC()) {
if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
// Skip the next line
skipNext = true;
}
}
continue;
}
if (pairedEndImport && !separateSplicedReads && !samRecord.getReadNegativeStrandFlag()) {
// Only process the reverse strand read of each pair; the pair's extent is reconstructed from that read's insert size (ISIZE) field.
continue;
}
if (pairedEndImport && !separateSplicedReads && !samRecord.getReferenceName().equals(samRecord.getMateReferenceName())) {
// progressWarningReceived(new SeqMonkException("Paired reads mapped to different chromosomes"));
continue;
}
// TODO: Check what this actually stores - might be a real name rather than 0/=
if (pairedEndImport && !separateSplicedReads && !prefs.isHiC() && samRecord.getMateReferenceName().equals("0")) {
if (!samRecord.getMateReferenceName().equals("=")) {
inputSam.close();
throw new SeqMonkException("Unexpected mate reference name " + samRecord.getMateReferenceName());
}
// Matches were on different chromosomes
continue;
}
try {
if (pairedEndImport && !separateSplicedReads) {
SequenceReadWithChromosome read = getPairedEndRead(samRecord);
newData[f].addData(read.chromosome, read.read);
} else if (separateSplicedReads) {
SequenceReadWithChromosome[] reads = getSplitSingleEndRead(samRecord);
for (int r = 0; r < reads.length; r++) {
newData[f].addData(reads[r].chromosome, reads[r].read);
}
} else {
SequenceReadWithChromosome read = getSingleEndRead(samRecord);
newData[f].addData(read.chromosome, read.read);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
if (prefs.isHiC()) {
if (((PairedDataSet) newData[f]).importSequenceSkipped()) {
// Skip the next line
skipNext = true;
}
}
}
}
// We're finished with the file.
inputSam.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + samFiles[f].getName(), f, samFiles.length);
newData[f].finalise();
}
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
processingFinished(newData);
}
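The SAMFileReader used above comes from the legacy samtools Java bindings, which were later deprecated in favour of htsjdk's SamReaderFactory. As a rough porting guide, here is a minimal sketch of the same open/iterate/close pattern against htsjdk (assuming a 2.x release; the mapping-quality cutoff of 20 is an arbitrary illustration, not a SeqMonk default):

// Sketch of the parser's I/O pattern on the newer htsjdk API. The filtering
// logic mirrors the early-exit checks above; only the reader layer differs.
import java.io.File;
import htsjdk.samtools.SAMRecord;
import htsjdk.samtools.SamReader;
import htsjdk.samtools.SamReaderFactory;
import htsjdk.samtools.ValidationStringency;

public class BamIterationSketch {
public static void main(String[] args) throws Exception {
File bamFile = new File(args[0]);
// SILENT matches the validation stringency the parser sets above.
SamReaderFactory factory = SamReaderFactory.makeDefault().validationStringency(ValidationStringency.SILENT);
long kept = 0;
try (SamReader reader = factory.open(bamFile)) {
for (SAMRecord rec : reader) {
if (rec.getReadUnmappedFlag()) continue; // no match
if (rec.getNotPrimaryAlignmentFlag()) continue; // primary only
if (rec.getMappingQuality() < 20) continue; // illustrative cutoff
++kept;
}
}
System.out.println("Kept " + kept + " records");
}
}

The try-with-resources block also replaces the explicit inputSam.close() calls scattered through the cancel and error paths above.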
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class BowtieFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
pairedEndImport = prefs.pairedEnd();
pairedEndDistance = prefs.pairDistanceCutoff();
if (!pairedEndImport) {
extendBy = prefs.extendReads();
}
String[] sections1;
String[] sections2 = null;
File[] bowtieFiles = getFiles();
DataSet[] newData = new DataSet[bowtieFiles.length];
try {
for (int f = 0; f < bowtieFiles.length; f++) {
BufferedReader br;
if (bowtieFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(bowtieFiles[f]))));
} else {
br = new BufferedReader(new FileReader(bowtieFiles[f]));
}
String line;
String line2 = null;
if (prefs.isHiC()) {
newData[f] = new PairedDataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates(), prefs.hiCDistance(), prefs.hiCIgnoreTrans());
} else {
newData[f] = new DataSet(bowtieFiles[f].getName(), bowtieFiles[f].getCanonicalPath(), prefs.removeDuplicates());
}
int lineCount = 0;
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (pairedEndImport) {
line2 = br.readLine();
if (line2 == null) {
++lineCount;
progressWarningReceived(new SeqMonkException("Ran out of file when looking for paired end"));
continue;
}
}
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
}
sections1 = line.split("\t");
if (pairedEndImport) {
sections2 = line2.split("\t");
}
// Check to see if we've got enough data to work with
if (sections1.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
if (pairedEndImport && sections2.length < 5) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line2 + "'"));
// Skip this line...
continue;
}
try {
if (pairedEndImport) {
SequenceReadWithChromosome read = getPairedEndRead(sections1, sections2);
newData[f].addData(read.chromosome, read.read);
} else {
SequenceReadWithChromosome read = getSingleEndRead(sections1);
newData[f].addData(read.chromosome, read.read);
}
} catch (SeqMonkException ex) {
progressWarningReceived(ex);
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + bowtieFiles[f].getName(), f, bowtieFiles.length);
newData[f].finalise();
}
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
processingFinished(newData);
}
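getSingleEndRead() and getPairedEndRead() are not shown in this snippet, but legacy bowtie output is a simple tab-delimited format, so the field mapping is worth sketching. The helper below is hypothetical and assumes bowtie's classic default layout (read name, strand, reference, 0-based offset, sequence, qualities), not SeqMonk's actual parsing code:

// Illustrative sketch of turning one legacy bowtie line into a
// chromosome/start/end/strand tuple. Assumed field layout:
//   0: read name   1: strand (+/-)   2: reference name
//   3: 0-based leftmost offset       4: read sequence   5: qualities
final class BowtieLineSketch {
static void parse(String line) {
String[] sections = line.split("\t");
String chromosome = sections[2];
boolean forward = sections[1].equals("+");
int start = Integer.parseInt(sections[3]) + 1; // convert to 1-based
int end = start + sections[4].length() - 1; // inclusive end
System.out.println(chromosome + ":" + start + "-" + end + (forward ? " (+)" : " (-)"));
}

public static void main(String[] args) {
// A made-up example line in the assumed format; prints chr1:1000-1009 (+)
parse("read_1\t+\tchr1\t999\tACGTACGTAC\tIIIIIIIIII");
}
}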
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class ActiveProbeListParser method processNormalDataStore.
private DataSet processNormalDataStore(ProbeList activeList) {
int extendBy = prefs.extendReads();
boolean reverse = prefs.reverseReads();
boolean modifyStrand = false;
int forcedStrand = 0;
if (!prefs.strandOptionBox.getSelectedItem().equals("From probes")) {
modifyStrand = true;
if (prefs.strandOptionBox.getSelectedItem().equals("Forward")) {
forcedStrand = Location.FORWARD;
} else if (prefs.strandOptionBox.getSelectedItem().equals("Reverse")) {
forcedStrand = Location.REVERSE;
} else if (prefs.strandOptionBox.getSelectedItem().equals("Unknown")) {
forcedStrand = Location.UNKNOWN;
} else {
throw new IllegalArgumentException("Unknown forced strand option " + prefs.strandOptionBox.getSelectedItem());
}
}
DataSet newData = new DataSet(activeList.name(), "Reimported from " + activeList.name(), prefs.removeDuplicates());
// Now process the data
Chromosome[] chrs = dataCollection().genome().getAllChromosomes();
for (int c = 0; c < chrs.length; c++) {
progressUpdated("Processing " + activeList.name() + " chr " + chrs[c].name(), c, chrs.length);
Probe[] probes = activeList.getProbesForChromosome(chrs[c]);
for (int r = 0; r < probes.length; r++) {
if (cancel) {
progressCancelled();
return null;
}
long read;
int start = probes[r].start();
int end = probes[r].end();
int strand = probes[r].strand();
if (reverse) {
if (strand == Location.FORWARD) {
strand = Location.REVERSE;
} else if (strand == Location.REVERSE) {
strand = Location.FORWARD;
}
}
if (extendBy != 0) {
// We now allow negative extensions to shorten reads
if (strand == Location.FORWARD || strand == Location.UNKNOWN) {
end += extendBy;
if (end < start)
end = start;
} else if (strand == Location.REVERSE) {
start -= extendBy;
if (start > end)
start = end;
}
}
// We don't allow reads before the start of the chromosome
if (start < 1) {
int overrun = (0 - start) + 1;
progressWarningReceived(new SeqMonkException("Reading position " + start + " was " + overrun + "bp before the start of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
// We also don't allow readings which are beyond the end of the chromosome
if (end > chrs[c].length()) {
int overrun = end - chrs[c].length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + chrs[c].name() + " (" + chrs[c].length() + ")"));
continue;
}
// Force the strand to what they specified if they want this.
if (modifyStrand) {
strand = forcedStrand;
}
// We can now make the new reading
try {
read = SequenceRead.packPosition(start, end, strand);
newData.addData(chrs[c], read);
} catch (SeqMonkException e) {
progressWarningReceived(e);
continue;
}
}
}
return newData;
}
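The extend-and-clamp rules in this method are easy to invert by accident, so here is a self-contained sketch of just that arithmetic. The strand constants are illustrative stand-ins, not the real values of Location.FORWARD and friends:

// Sketch of the read-extension rules above: positive extendBy lengthens a
// read in its strand direction, negative extendBy shortens it, and the read
// is clamped so it can never invert into a negative length.
final class ExtendSketch {
static final int FORWARD = 1, REVERSE = -1, UNKNOWN = 0; // illustrative codes

static int[] extend(int start, int end, int strand, int extendBy) {
if (strand == FORWARD || strand == UNKNOWN) {
end += extendBy;
if (end < start) end = start; // floor for negative extensions
} else if (strand == REVERSE) {
start -= extendBy;
if (start > end) start = end; // floor for negative extensions
}
return new int[] { start, end };
}

public static void main(String[] args) {
// Forward read 100-120 extended by 50 -> [100, 170]
System.out.println(java.util.Arrays.toString(extend(100, 120, FORWARD, 50)));
// Reverse read 100-120 shortened by -30 -> clamped to [120, 120]
System.out.println(java.util.Arrays.toString(extend(100, 120, REVERSE, -30)));
}
}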
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class BismarkCovFileParser method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
try {
File[] covFiles = getFiles();
DataSet[] newData = new DataSet[covFiles.length];
for (int f = 0; f < covFiles.length; f++) {
BufferedReader br;
if (covFiles[f].getName().toLowerCase().endsWith(".gz")) {
br = new BufferedReader(new InputStreamReader(new GZIPInputStream(new FileInputStream(covFiles[f]))));
} else {
br = new BufferedReader(new FileReader(covFiles[f]));
}
String line;
newData[f] = new DataSet(covFiles[f].getName(), covFiles[f].getCanonicalPath(), prefs.removeDuplicates());
int lineCount = 0;
// Now process the file
while ((line = br.readLine()) != null) {
if (cancel) {
br.close();
progressCancelled();
return;
}
// Ignore blank lines
if (line.trim().length() == 0)
continue;
++lineCount;
if (lineCount % 100000 == 0) {
progressUpdated("Read " + lineCount + " lines from " + covFiles[f].getName(), f, covFiles.length);
}
String[] sections = line.split("\t");
// Check to see if we've got enough data to work with
if (sections.length < 6) {
progressWarningReceived(new SeqMonkException("Not enough data from line '" + line + "'"));
// Skip this line...
continue;
}
int start;
int end;
int methCount;
int unmethCount;
try {
start = Integer.parseInt(sections[1]);
end = Integer.parseInt(sections[2]);
methCount = Integer.parseInt(sections[4]);
unmethCount = Integer.parseInt(sections[5]);
// End must always be later than start
if (start > end) {
progressWarningReceived(new SeqMonkException("End position " + end + " was lower than start position " + start));
int temp = start;
start = end;
end = temp;
}
} catch (NumberFormatException e) {
progressWarningReceived(new SeqMonkException("Position or count fields on line '" + line + "' were not integers"));
continue;
}
try {
ChromosomeWithOffset c = dataCollection().genome().getChromosome(sections[0]);
// We also don't allow readings which are beyond the end of the chromosome
start = c.position(start);
end = c.position(end);
if (end > c.chromosome().length()) {
int overrun = end - c.chromosome().length();
progressWarningReceived(new SeqMonkException("Reading position " + end + " was " + overrun + "bp beyond the end of chr" + c.chromosome().name() + " (" + c.chromosome().length() + ")"));
continue;
}
// We can now make the new reads
long methRead = SequenceRead.packPosition(start, end, Location.FORWARD);
long unmethRead = SequenceRead.packPosition(start, end, Location.REVERSE);
newData[f].addData(c.chromosome(), methRead, methCount);
newData[f].addData(c.chromosome(), unmethRead, unmethCount);
} catch (IllegalArgumentException iae) {
progressWarningReceived(iae);
} catch (SeqMonkException sme) {
progressWarningReceived(sme);
continue;
}
}
// We're finished with the file.
br.close();
// Cache the data in the new dataset
progressUpdated("Caching data from " + covFiles[f].getName(), f, covFiles.length);
newData[f].finalise();
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
return;
}
}
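For reference, a bismark coverage (.cov) record is tab-delimited as chromosome, start, end, methylation percentage, count methylated, count unmethylated. The parser above reads only the two counts (fields 4 and 5), storing methylated calls as forward reads and unmethylated calls as reverse reads; the percentage column is redundant because it can always be re-derived from the counts, as this minimal sketch shows:

// Sketch of reading one bismark .cov record and recomputing the methylation
// percentage from the two count columns the parser actually uses.
final class CovLineSketch {
public static void main(String[] args) {
// Example record in the .cov layout: chr, start, end, %meth, meth, unmeth
String line = "chr1\t3000827\t3000827\t75.0\t3\t1";
String[] sections = line.split("\t");
int methCount = Integer.parseInt(sections[4]);
int unmethCount = Integer.parseInt(sections[5]);
double percent = 100.0 * methCount / (methCount + unmethCount);
System.out.println(sections[0] + ":" + sections[1] + " methylation = " + percent + "%"); // prints 75.0%
}
}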
use of uk.ac.babraham.SeqMonk.DataTypes.DataSet in project SeqMonk by s-andrews.
the class HiCOtherEndExtractor method run.
/* (non-Javadoc)
* @see java.lang.Runnable#run()
*/
public void run() {
// We need to get the baits and other options from the preferences
Probe[] baits = prefs.getBaits();
boolean mergeIntoOne = prefs.mergeIntoOne();
boolean excludeSelf = prefs.excludeSelf();
DataSet[] newData;
if (mergeIntoOne) {
newData = new DataSet[visibleHiCDataSets.length];
} else {
newData = new DataSet[visibleHiCDataSets.length * baits.length];
}
try {
for (int d = 0; d < visibleHiCDataSets.length; d++) {
if (baits.length == 1) {
System.err.println("Only one bait");
newData[d] = new DataSet(prefs.prefix.getText() + "_" + visibleHiCDataSets[d].name(), "HiC other end of " + visibleHiCDataSets[d].name() + " for region " + baits[0].chromosome().name() + ":" + baits[0].start() + "-" + baits[0].end(), DataSet.DUPLICATES_REMOVE_NO);
} else if (mergeIntoOne) {
System.err.println("Multiple baits, but merging");
newData[d] = new DataSet(prefs.prefix.getText() + "_" + visibleHiCDataSets[d].name(), "HiC other end of " + visibleHiCDataSets[d].name() + " for " + baits.length + " regions", DataSet.DUPLICATES_REMOVE_NO);
}
for (int b = 0; b < baits.length; b++) {
if (cancel) {
progressCancelled();
return;
}
progressUpdated("Getting other ends from " + visibleHiCDataSets[d].name() + " and " + baits[b].toString(), (d * baits.length) + b, visibleHiCDataSets.length * baits.length);
if (!(baits.length == 1 || mergeIntoOne)) {
newData[(d * baits.length) + b] = new DataSet(baits[b].toString() + "_" + visibleHiCDataSets[d].name(), "HiC other end of " + visibleHiCDataSets[d].name() + " for region " + baits[b].chromosome().name() + ":" + baits[b].start() + "-" + baits[b].end(), DataSet.DUPLICATES_REMOVE_NO);
}
HiCHitCollection hiCCollection = visibleHiCDataSets[d].getHiCReadsForProbe(baits[b]);
String[] chromosomes = hiCCollection.getChromosomeNamesWithHits();
for (int c = 0; c < chromosomes.length; c++) {
Chromosome chromosome = collection.genome().getChromosome(chromosomes[c]).chromosome();
long[] reads = hiCCollection.getHitPositionsForChromosome(chromosomes[c]);
for (int r = 0; r < reads.length; r++) {
if (cancel) {
progressCancelled();
return;
}
if (excludeSelf && baits[b].chromosome().name().equals(chromosomes[c]) && SequenceRead.overlaps(reads[r], baits[b].packedPosition())) {
continue;
}
if (mergeIntoOne) {
newData[d].addData(chromosome, reads[r]);
} else {
newData[(d * baits.length) + b].addData(chromosome, reads[r]);
}
}
}
if (!mergeIntoOne) {
// Cache the data in the new dataset
progressUpdated("Caching data", (d * baits.length) + b, visibleHiCDataSets.length * baits.length);
newData[(d * baits.length) + b].finalise();
}
}
if (mergeIntoOne) {
// Cache the data in the new dataset
progressUpdated("Caching data", d, visibleHiCDataSets.length);
newData[d].finalise();
}
}
processingFinished(newData);
} catch (Exception ex) {
progressExceptionReceived(ex);
}
}
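The bookkeeping in this method flattens each (dataset, bait) pair into a single array slot via (d * baits.length) + b. A tiny sketch of that index scheme and its inverse:

// Sketch of the flattened (dataset, bait) indexing used above: the flat
// index walks bait-major within each dataset, and division/modulo by the
// bait count recover both coordinates.
final class FlatIndexSketch {
public static void main(String[] args) {
int datasetCount = 3, baitCount = 4;
for (int d = 0; d < datasetCount; d++) {
for (int b = 0; b < baitCount; b++) {
int flat = (d * baitCount) + b;
int dBack = flat / baitCount; // recover dataset index
int bBack = flat % baitCount; // recover bait index
assert dBack == d && bBack == b;
}
}
System.out.println("All " + (datasetCount * baitCount) + " indices round-trip");
}
}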