Search in sources :

Example 1 with ReadBarcodeResult

use of cz1.gbs.core.ReadBarcodeResult in project polyGembler by c-zhou.

the class ParseBarcodeRead method parseReadIntoTagAndTaxa.

/**
 * The barcode libraries used for this study can include two types of
 * extraneous sequence at the end of reads. The first are chimeras created
 * with the free ends. These will recreate the restriction site. The second
 * are short regions (less than 64bp), so that will they will contain a
 * portion of site and the universal adapter. This finds the first of site
 * in likelyReadEnd, keeps the restriction site overhang and then sets
 * everything to polyA afterwards
 *
 * @param seq An unprocessed tag sequence.
 * @param maxLength The maximum number of bp in the processed sequence.
 * @return returnValue A ReadBarcodeResult object containing the unprocessed
 * tag, Cut site position, Processed tag, and Poly-A padded tag.
 */
public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, int minQual) {
    if (seqS == null)
        return null;
    if (minQual != 0 && seqS != null) {
        try {
            final char[] seqC = (char[]) field.get(seqS), qualC = (char[]) field.get(qualS);
            int len = seqC.length;
            for (int i = 0; i < len; i++) if (qualC[i] - 33 < minQual)
                seqC[i] = 'N';
            seqS = String.valueOf(seqC);
        } catch (IllegalArgumentException | IllegalAccessException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
    if (bestBarcode == null) {
        // overhang missing so skip
        return null;
    }
    seqS = StringUtils.strip(seqS.substring(bestBarcode.getBarLength()), "N");
    if (seqS.length() < 32)
        return null;
    int cutSitePosition = seqS.length();
    for (String end : likelyReadEnd) {
        int w = seqS.indexOf(end);
        if (w >= 0 && w < cutSitePosition)
            cutSitePosition = w;
    }
    String processedSeqS = StringUtils.strip(seqS.substring(0, cutSitePosition), "N");
    if (processedSeqS.length() >= 32)
        return new ReadBarcodeResult(BaseEncoder.getBitSetFromSeq(processedSeqS), bestBarcode.getTaxaId());
    return null;
}
Also used : Barcode(cz1.gbs.core.Barcode) ReadBarcodeResult(cz1.gbs.core.ReadBarcodeResult)

Example 2 with ReadBarcodeResult

use of cz1.gbs.core.ReadBarcodeResult in project polyGembler by c-zhou.

the class FastqToTagSequence method run.

@Override
public void run() {
    String[] countFileNames = null;
    File inputDirectory = new File(this.myInputDirName);
    File[] fastqFiles = inputDirectory.listFiles(new FilenameFilter() {

        @Override
        public boolean accept(File dir, String name) {
            return name.matches("(?i).*\\.fq$|.*\\.fq\\.gz$|.*\\.fastq$|.*_fastq\\.txt$|.*_fastq\\.gz$|.*_fastq\\.txt\\.gz$|.*_sequence\\.txt$|.*_sequence\\.txt\\.gz$");
        // (?i) denotes case insensitive;                 \\. denotes escape . so it doesn't mean 'any char' & escape the backslash
        }
    });
    if (fastqFiles.length == 0 || fastqFiles == null) {
        myLogger.warn("Couldn't find any files that end with \".fq\", \".fq.gz\", \".fastq\", \"_fastq.txt\", \"_fastq.gz\", \"_fastq.txt.gz\", \"_sequence.txt\", or \"_sequence.txt.gz\" in the supplied directory.");
        return;
    } else {
        myLogger.info("Using the following FASTQ files:");
        countFileNames = new String[fastqFiles.length];
        for (int i = 0; i < fastqFiles.length; i++) {
            countFileNames[i] = fastqFiles[i].getName().replaceAll("(?i)\\.fq$|\\.fq\\.gz$|\\.fastq$|_fastq\\.txt$|_fastq\\.gz$|_fastq\\.txt\\.gz$|_sequence\\.txt$|_sequence\\.txt\\.gz$", "");
            // \\. escape . so it doesn't mean 'any char' & escape the backslash
            myLogger.info(fastqFiles[i].getAbsolutePath());
        }
    }
    for (int laneNum = 0; laneNum < fastqFiles.length; laneNum++) {
        // for (int laneNum = 2; laneNum < 3; laneNum++) {
        if (new File(myOutputDir + System.getProperty("file.separator") + countFileNames[laneNum] + ".cnt.gz").exists()) {
            myLogger.info("Fastq file " + fastqFiles[laneNum] + " skipped.");
            continue;
        }
        File outputFile = new File(this.myOutputDir + File.separator + countFileNames[laneNum]);
        if (outputFile.isFile()) {
            myLogger.warn("An output file " + countFileNames[laneNum] + "\n" + " already exists in the output directory for file " + fastqFiles[laneNum] + ".  Skipping.");
            continue;
        }
        myLogger.info("Reading FASTQ file: " + fastqFiles[laneNum]);
        String[] filenameField = fastqFiles[laneNum].getName().split("_");
        thePBR = new ParseBarcodeRead[this.myEnzyme.length];
        for (int i = 0; i < this.myEnzyme.length; i++) {
            if (filenameField.length == 3) {
                thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[1]);
            } else if (filenameField.length == 4) {
                thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[2]);
            } else // B08AAABXX_s_1_sequence.txt.gz
            if (filenameField.length == 5) {
                thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[1], filenameField[3]);
            } else {
                myLogger.error("Error in parsing file name: " + fastqFiles[laneNum]);
                myLogger.error("   The filename does not contain either 3, 4, or 5 underscore-delimited values.");
                myLogger.error("   Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
                continue;
            }
        }
        taxa = thePBR[0].getSortedTaxaNames();
        n = taxa.length;
        os = countFileNames[laneNum];
        volume = 0;
        myLogger.info("Total barcodes found in lane:" + thePBR[0].getBarCodeCount());
        if (thePBR[0].getBarCodeCount() == 0) {
            myLogger.warn("No barcodes found.  Skipping this flowcell lane.");
            continue;
        }
        String[] taxaNames = new String[thePBR[0].getBarCodeCount()];
        for (int i = 0; i < taxaNames.length; i++) {
            taxaNames[i] = thePBR[0].getTheBarcodes(i).getTaxaName();
        }
        long start = System.currentTimeMillis();
        this.initial_thread_pool();
        try {
            BufferedReader br = Utils.getBufferedReader(fastqFiles[laneNum], 65536);
            int block = 10000;
            String[][] Qs = new String[block][2];
            int k = 0;
            allReads = 0;
            goodBarcodedReads = 0;
            tags = 0;
            String temp = br.readLine();
            while (temp != null) {
                try {
                    Qs[k][0] = br.readLine();
                    br.readLine();
                    Qs[k][1] = br.readLine();
                } catch (NullPointerException e) {
                    myLogger.error("Unable to correctly parse the sequence from fastq file.  " + "Your fastq file may have been corrupted.");
                    System.exit(1);
                }
                k++;
                temp = br.readLine();
                if (k == block || temp == null) {
                    if (usedMemory() / maxMemory() > load) {
                        this.waitFor();
                        writeHardDisk();
                        this.initial_thread_pool();
                    }
                    executor.submit(new Runnable() {

                        private String[][] fastq;

                        @Override
                        public void run() {
                            // TODO Auto-generated method stub
                            try {
                                final Map<BitSet, short[]> block_tagCounts = new HashMap<BitSet, short[]>();
                                int block_allReads = 0, block_goodBarcodedReads = 0;
                                ReadBarcodeResult rr = null;
                                BitSet key;
                                for (int i = 0; i < fastq.length; i++) {
                                    if (fastq[i][0] == null)
                                        break;
                                    // synchronized(lock) {
                                    // allReads++;
                                    // }
                                    block_allReads++;
                                    outerloop: for (int j = 0; j < myLeadingTrim.length; j++) {
                                        for (int k = 0; k < thePBR.length; k++) {
                                            rr = thePBR[k].parseReadIntoTagAndTaxa(fastq[i][0].substring(myLeadingTrim[j]), fastq[i][1].substring(myLeadingTrim[j]), myMinQualS);
                                            if (rr != null)
                                                break outerloop;
                                        }
                                    }
                                    if (rr != null) {
                                        key = rr.read;
                                        /**
                                         *											synchronized(lock) {
                                         *												goodBarcodedReads++;
                                         *												if (allReads % 1000000 == 0) {
                                         *													myLogger.info("Total Reads:" + allReads +
                                         *															" Reads with barcode and cut site overhang:" +
                                         *															goodBarcodedReads);
                                         *												}
                                         *												if(!tagCounts.containsKey(key)) {
                                         *													tagCounts.put(key, new short[n]);
                                         *													tags++;
                                         *												}
                                         *												tagCounts.get(key)[rr.taxonId]++;
                                         *											}
                                         */
                                        block_goodBarcodedReads++;
                                        if (!block_tagCounts.containsKey(key)) {
                                            block_tagCounts.put(key, new short[n]);
                                        }
                                        block_tagCounts.get(key)[rr.taxonId]++;
                                    }
                                }
                                synchronized (lock) {
                                    allReads += block_allReads;
                                    goodBarcodedReads += block_goodBarcodedReads;
                                    // goodBarcodedReads);
                                    for (BitSet bs : block_tagCounts.keySet()) {
                                        if (!tagCounts.containsKey(bs)) {
                                            tags++;
                                            tagCounts.put(bs, block_tagCounts.get(bs));
                                        } else {
                                            short[] copy = tagCounts.get(bs);
                                            short[] block_copy = block_tagCounts.get(bs);
                                            for (int i = 0; i < n; i++) copy[i] += block_copy[i];
                                        }
                                    }
                                }
                            } catch (Exception e) {
                                Thread t = Thread.currentThread();
                                t.getUncaughtExceptionHandler().uncaughtException(t, e);
                                e.printStackTrace();
                                executor.shutdown();
                                System.exit(1);
                            }
                        }

                        public Runnable init(String[][] fastq) {
                            this.fastq = fastq;
                            return (this);
                        }
                    }.init(Qs));
                    k = 0;
                    Qs = new String[block][2];
                }
            }
            if (!tagCounts.isEmpty()) {
                this.waitFor();
                writeHardDisk();
            }
            br.close();
            myLogger.info("Total number of reads in lane=" + allReads);
            myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
            myLogger.info("Total number of tags=" + tags);
            myLogger.info("Process took " + (System.currentTimeMillis() - start) / 1000 + " seconds.");
        } catch (Exception e) {
            e.printStackTrace();
        }
        myLogger.info("Finished reading " + (laneNum + 1) + " of " + fastqFiles.length + " sequence files.");
        tagCounts.clear();
    }
}
Also used : HashMap(java.util.HashMap) BitSet(java.util.BitSet) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) FilenameFilter(java.io.FilenameFilter) ParseBarcodeRead(cz1.gbs.model.ParseBarcodeRead) BufferedReader(java.io.BufferedReader) ReadBarcodeResult(cz1.gbs.core.ReadBarcodeResult) File(java.io.File)

Aggregations

ReadBarcodeResult (cz1.gbs.core.ReadBarcodeResult)2 Barcode (cz1.gbs.core.Barcode)1 ParseBarcodeRead (cz1.gbs.model.ParseBarcodeRead)1 BufferedReader (java.io.BufferedReader)1 File (java.io.File)1 FileNotFoundException (java.io.FileNotFoundException)1 FilenameFilter (java.io.FilenameFilter)1 IOException (java.io.IOException)1 BitSet (java.util.BitSet)1 HashMap (java.util.HashMap)1