use of cz1.gbs.core.ReadBarcodeResult in project polyGembler by c-zhou.
the class ParseBarcodeRead method parseReadIntoTagAndTaxa.
/**
* The barcode libraries used for this study can include two types of
* extraneous sequence at the end of reads. The first are chimeras created
* with the free ends. These will recreate the restriction site. The second
* are short regions (less than 64bp), so that will they will contain a
* portion of site and the universal adapter. This finds the first of site
* in likelyReadEnd, keeps the restriction site overhang and then sets
* everything to polyA afterwards
*
* @param seq An unprocessed tag sequence.
* @param maxLength The maximum number of bp in the processed sequence.
* @return returnValue A ReadBarcodeResult object containing the unprocessed
* tag, Cut site position, Processed tag, and Poly-A padded tag.
*/
public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, int minQual) {
if (seqS == null)
return null;
if (minQual != 0 && seqS != null) {
try {
final char[] seqC = (char[]) field.get(seqS), qualC = (char[]) field.get(qualS);
int len = seqC.length;
for (int i = 0; i < len; i++) if (qualC[i] - 33 < minQual)
seqC[i] = 'N';
seqS = String.valueOf(seqC);
} catch (IllegalArgumentException | IllegalAccessException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
if (bestBarcode == null) {
// overhang missing so skip
return null;
}
seqS = StringUtils.strip(seqS.substring(bestBarcode.getBarLength()), "N");
if (seqS.length() < 32)
return null;
int cutSitePosition = seqS.length();
for (String end : likelyReadEnd) {
int w = seqS.indexOf(end);
if (w >= 0 && w < cutSitePosition)
cutSitePosition = w;
}
String processedSeqS = StringUtils.strip(seqS.substring(0, cutSitePosition), "N");
if (processedSeqS.length() >= 32)
return new ReadBarcodeResult(BaseEncoder.getBitSetFromSeq(processedSeqS), bestBarcode.getTaxaId());
return null;
}
use of cz1.gbs.core.ReadBarcodeResult in project polyGembler by c-zhou.
the class FastqToTagSequence method run.
@Override
public void run() {
String[] countFileNames = null;
File inputDirectory = new File(this.myInputDirName);
File[] fastqFiles = inputDirectory.listFiles(new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.matches("(?i).*\\.fq$|.*\\.fq\\.gz$|.*\\.fastq$|.*_fastq\\.txt$|.*_fastq\\.gz$|.*_fastq\\.txt\\.gz$|.*_sequence\\.txt$|.*_sequence\\.txt\\.gz$");
// (?i) denotes case insensitive; \\. denotes escape . so it doesn't mean 'any char' & escape the backslash
}
});
if (fastqFiles.length == 0 || fastqFiles == null) {
myLogger.warn("Couldn't find any files that end with \".fq\", \".fq.gz\", \".fastq\", \"_fastq.txt\", \"_fastq.gz\", \"_fastq.txt.gz\", \"_sequence.txt\", or \"_sequence.txt.gz\" in the supplied directory.");
return;
} else {
myLogger.info("Using the following FASTQ files:");
countFileNames = new String[fastqFiles.length];
for (int i = 0; i < fastqFiles.length; i++) {
countFileNames[i] = fastqFiles[i].getName().replaceAll("(?i)\\.fq$|\\.fq\\.gz$|\\.fastq$|_fastq\\.txt$|_fastq\\.gz$|_fastq\\.txt\\.gz$|_sequence\\.txt$|_sequence\\.txt\\.gz$", "");
// \\. escape . so it doesn't mean 'any char' & escape the backslash
myLogger.info(fastqFiles[i].getAbsolutePath());
}
}
for (int laneNum = 0; laneNum < fastqFiles.length; laneNum++) {
// for (int laneNum = 2; laneNum < 3; laneNum++) {
if (new File(myOutputDir + System.getProperty("file.separator") + countFileNames[laneNum] + ".cnt.gz").exists()) {
myLogger.info("Fastq file " + fastqFiles[laneNum] + " skipped.");
continue;
}
File outputFile = new File(this.myOutputDir + File.separator + countFileNames[laneNum]);
if (outputFile.isFile()) {
myLogger.warn("An output file " + countFileNames[laneNum] + "\n" + " already exists in the output directory for file " + fastqFiles[laneNum] + ". Skipping.");
continue;
}
myLogger.info("Reading FASTQ file: " + fastqFiles[laneNum]);
String[] filenameField = fastqFiles[laneNum].getName().split("_");
thePBR = new ParseBarcodeRead[this.myEnzyme.length];
for (int i = 0; i < this.myEnzyme.length; i++) {
if (filenameField.length == 3) {
thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[1]);
} else if (filenameField.length == 4) {
thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[2]);
} else // B08AAABXX_s_1_sequence.txt.gz
if (filenameField.length == 5) {
thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[1], filenameField[3]);
} else {
myLogger.error("Error in parsing file name: " + fastqFiles[laneNum]);
myLogger.error(" The filename does not contain either 3, 4, or 5 underscore-delimited values.");
myLogger.error(" Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
continue;
}
}
taxa = thePBR[0].getSortedTaxaNames();
n = taxa.length;
os = countFileNames[laneNum];
volume = 0;
myLogger.info("Total barcodes found in lane:" + thePBR[0].getBarCodeCount());
if (thePBR[0].getBarCodeCount() == 0) {
myLogger.warn("No barcodes found. Skipping this flowcell lane.");
continue;
}
String[] taxaNames = new String[thePBR[0].getBarCodeCount()];
for (int i = 0; i < taxaNames.length; i++) {
taxaNames[i] = thePBR[0].getTheBarcodes(i).getTaxaName();
}
long start = System.currentTimeMillis();
this.initial_thread_pool();
try {
BufferedReader br = Utils.getBufferedReader(fastqFiles[laneNum], 65536);
int block = 10000;
String[][] Qs = new String[block][2];
int k = 0;
allReads = 0;
goodBarcodedReads = 0;
tags = 0;
String temp = br.readLine();
while (temp != null) {
try {
Qs[k][0] = br.readLine();
br.readLine();
Qs[k][1] = br.readLine();
} catch (NullPointerException e) {
myLogger.error("Unable to correctly parse the sequence from fastq file. " + "Your fastq file may have been corrupted.");
System.exit(1);
}
k++;
temp = br.readLine();
if (k == block || temp == null) {
if (usedMemory() / maxMemory() > load) {
this.waitFor();
writeHardDisk();
this.initial_thread_pool();
}
executor.submit(new Runnable() {
private String[][] fastq;
@Override
public void run() {
// TODO Auto-generated method stub
try {
final Map<BitSet, short[]> block_tagCounts = new HashMap<BitSet, short[]>();
int block_allReads = 0, block_goodBarcodedReads = 0;
ReadBarcodeResult rr = null;
BitSet key;
for (int i = 0; i < fastq.length; i++) {
if (fastq[i][0] == null)
break;
// synchronized(lock) {
// allReads++;
// }
block_allReads++;
outerloop: for (int j = 0; j < myLeadingTrim.length; j++) {
for (int k = 0; k < thePBR.length; k++) {
rr = thePBR[k].parseReadIntoTagAndTaxa(fastq[i][0].substring(myLeadingTrim[j]), fastq[i][1].substring(myLeadingTrim[j]), myMinQualS);
if (rr != null)
break outerloop;
}
}
if (rr != null) {
key = rr.read;
/**
* synchronized(lock) {
* goodBarcodedReads++;
* if (allReads % 1000000 == 0) {
* myLogger.info("Total Reads:" + allReads +
* " Reads with barcode and cut site overhang:" +
* goodBarcodedReads);
* }
* if(!tagCounts.containsKey(key)) {
* tagCounts.put(key, new short[n]);
* tags++;
* }
* tagCounts.get(key)[rr.taxonId]++;
* }
*/
block_goodBarcodedReads++;
if (!block_tagCounts.containsKey(key)) {
block_tagCounts.put(key, new short[n]);
}
block_tagCounts.get(key)[rr.taxonId]++;
}
}
synchronized (lock) {
allReads += block_allReads;
goodBarcodedReads += block_goodBarcodedReads;
// goodBarcodedReads);
for (BitSet bs : block_tagCounts.keySet()) {
if (!tagCounts.containsKey(bs)) {
tags++;
tagCounts.put(bs, block_tagCounts.get(bs));
} else {
short[] copy = tagCounts.get(bs);
short[] block_copy = block_tagCounts.get(bs);
for (int i = 0; i < n; i++) copy[i] += block_copy[i];
}
}
}
} catch (Exception e) {
Thread t = Thread.currentThread();
t.getUncaughtExceptionHandler().uncaughtException(t, e);
e.printStackTrace();
executor.shutdown();
System.exit(1);
}
}
public Runnable init(String[][] fastq) {
this.fastq = fastq;
return (this);
}
}.init(Qs));
k = 0;
Qs = new String[block][2];
}
}
if (!tagCounts.isEmpty()) {
this.waitFor();
writeHardDisk();
}
br.close();
myLogger.info("Total number of reads in lane=" + allReads);
myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
myLogger.info("Total number of tags=" + tags);
myLogger.info("Process took " + (System.currentTimeMillis() - start) / 1000 + " seconds.");
} catch (Exception e) {
e.printStackTrace();
}
myLogger.info("Finished reading " + (laneNum + 1) + " of " + fastqFiles.length + " sequence files.");
tagCounts.clear();
}
}
Aggregations