Search in sources :

Example 1 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class ParseBarcodeRead method setupBarcodeFiles.

/**
 * Reads in an Illumina key file, creates a linear array of {@link Barcode} objects
 * representing the barcodes in the key file, then creates a hash map containing
 * indices from the linear array indexed by sequence.  The names of barcode objects
 * follow the pattern samplename:flowcell:lane:well, since sample names alone are not unique.
 *
 * @param keyFile Illumina key file.
 * @param flowcell Only barcodes from this flowcell will be added to the array.
 * @param lane Only barcodes from this lane will be added to the array.
 * @return Number of barcodes in the array.
 */
private int setupBarcodeFiles(File keyFile, String flowcell, String lane) {
    try {
        BufferedReader br = new BufferedReader(new FileReader(keyFile), 65536);
        ArrayList<Barcode> theBarcodesArrayList = new ArrayList<Barcode>();
        String temp;
        int k = 0;
        while (((temp = br.readLine()) != null)) {
            // split by whitespace
            String[] s = temp.split("\\t");
            Barcode theBC = null;
            if (s[0].equals(flowcell) && s[1].equals(lane)) {
                String well = (s[6].length() < 2) ? (s[5] + '0' + s[6]) : s[5] + s[6];
                if (s.length < 8 || s[7] == null || s[7].equals("")) {
                    // use the plate and well
                    theBC = new Barcode(s[2], initialCutSiteRemnant, s[3] + ":" + s[0] + ":" + s[1] + ":" + s[4] + ":" + well, flowcell, lane, k++);
                } else {
                    // use the "libraryPlateWellID" or whatever is in column H of the key file, IF it is an integer
                    try {
                        int libPrepID = Integer.parseInt(s[7]);
                        theBC = new Barcode(s[2], initialCutSiteRemnant, s[3] + ":" + s[0] + ":" + s[1] + ":" + libPrepID, flowcell, lane, k++);
                    } catch (NumberFormatException nfe) {
                        theBC = new Barcode(s[2], initialCutSiteRemnant, s[3] + ":" + s[0] + ":" + s[1] + ":" + s[4] + ":" + well, flowcell, lane, k++);
                    }
                }
                theBarcodesArrayList.add(theBC);
                System.out.println(theBC.getBarcodeString() + " " + theBC.getTaxaName());
            }
        }
        br.close();
        theBarcodes = new Barcode[theBarcodesArrayList.size()];
        theBarcodesArrayList.toArray(theBarcodes);
        Arrays.sort(theBarcodes);
        int nBL = theBarcodes[0].getBarOverLong().length;
        quickBarcodeList = new long[theBarcodes.length * nBL];
        quickMap = new HashMap<Long, Integer>();
        for (int i = 0; i < theBarcodes.length; i++) {
            for (int j = 0; j < nBL; j++) {
                quickBarcodeList[i * nBL + j] = theBarcodes[i].getBarOverLong()[j];
                quickMap.put(theBarcodes[i].getBarOverLong()[j], i);
            }
        }
        Arrays.sort(quickBarcodeList);
    } catch (Exception e) {
        System.out.println("Error with setupBarcodeFiles: " + e);
    }
    return theBarcodes.length;
}
Also used : ArrayList(java.util.ArrayList) Barcode(cz1.gbs.core.Barcode) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader)

Example 2 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class ParseBarcodeRead method parseReadIntoTagAndTaxa.

/**
 * The barcode libraries used for this study can include two types of
 * extraneous sequence at the end of reads. The first are chimeras created
 * with the free ends. These will recreate the restriction site. The second
 * are short regions (less than 64bp), so that will they will contain a
 * portion of site and the universal adapter. This finds the first of site
 * in likelyReadEnd, keeps the restriction site overhang and then sets
 * everything to polyA afterwards
 *
 * @param seq An unprocessed tag sequence.
 * @param maxLength The maximum number of bp in the processed sequence.
 * @return returnValue A ReadBarcodeResult object containing the unprocessed
 * tag, Cut site position, Processed tag, and Poly-A padded tag.
 */
public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, int minQual) {
    if (seqS == null)
        return null;
    if (minQual != 0 && seqS != null) {
        try {
            final char[] seqC = (char[]) field.get(seqS), qualC = (char[]) field.get(qualS);
            int len = seqC.length;
            for (int i = 0; i < len; i++) if (qualC[i] - 33 < minQual)
                seqC[i] = 'N';
            seqS = String.valueOf(seqC);
        } catch (IllegalArgumentException | IllegalAccessException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
    }
    Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
    if (bestBarcode == null) {
        // overhang missing so skip
        return null;
    }
    seqS = StringUtils.strip(seqS.substring(bestBarcode.getBarLength()), "N");
    if (seqS.length() < 32)
        return null;
    int cutSitePosition = seqS.length();
    for (String end : likelyReadEnd) {
        int w = seqS.indexOf(end);
        if (w >= 0 && w < cutSitePosition)
            cutSitePosition = w;
    }
    String processedSeqS = StringUtils.strip(seqS.substring(0, cutSitePosition), "N");
    if (processedSeqS.length() >= 32)
        return new ReadBarcodeResult(BaseEncoder.getBitSetFromSeq(processedSeqS), bestBarcode.getTaxaId());
    return null;
}
Also used : Barcode(cz1.gbs.core.Barcode) ReadBarcodeResult(cz1.gbs.core.ReadBarcodeResult)

Example 3 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class FastqToTagSequence method run.

@Override
public void run() {
    String[] countFileNames = null;
    File inputDirectory = new File(this.myInputDirName);
    File[] fastqFiles = inputDirectory.listFiles(new FilenameFilter() {

        @Override
        public boolean accept(File dir, String name) {
            return name.matches("(?i).*\\.fq$|.*\\.fq\\.gz$|.*\\.fastq$|.*_fastq\\.txt$|.*_fastq\\.gz$|.*_fastq\\.txt\\.gz$|.*_sequence\\.txt$|.*_sequence\\.txt\\.gz$");
        // (?i) denotes case insensitive;                 \\. denotes escape . so it doesn't mean 'any char' & escape the backslash
        }
    });
    if (fastqFiles.length == 0 || fastqFiles == null) {
        myLogger.warn("Couldn't find any files that end with \".fq\", \".fq.gz\", \".fastq\", \"_fastq.txt\", \"_fastq.gz\", \"_fastq.txt.gz\", \"_sequence.txt\", or \"_sequence.txt.gz\" in the supplied directory.");
        return;
    } else {
        myLogger.info("Using the following FASTQ files:");
        countFileNames = new String[fastqFiles.length];
        for (int i = 0; i < fastqFiles.length; i++) {
            countFileNames[i] = fastqFiles[i].getName().replaceAll("(?i)\\.fq$|\\.fq\\.gz$|\\.fastq$|_fastq\\.txt$|_fastq\\.gz$|_fastq\\.txt\\.gz$|_sequence\\.txt$|_sequence\\.txt\\.gz$", "");
            // \\. escape . so it doesn't mean 'any char' & escape the backslash
            myLogger.info(fastqFiles[i].getAbsolutePath());
        }
    }
    for (int laneNum = 0; laneNum < fastqFiles.length; laneNum++) {
        // for (int laneNum = 2; laneNum < 3; laneNum++) {
        if (new File(myOutputDir + System.getProperty("file.separator") + countFileNames[laneNum] + ".cnt.gz").exists()) {
            myLogger.info("Fastq file " + fastqFiles[laneNum] + " skipped.");
            continue;
        }
        File outputFile = new File(this.myOutputDir + File.separator + countFileNames[laneNum]);
        if (outputFile.isFile()) {
            myLogger.warn("An output file " + countFileNames[laneNum] + "\n" + " already exists in the output directory for file " + fastqFiles[laneNum] + ".  Skipping.");
            continue;
        }
        myLogger.info("Reading FASTQ file: " + fastqFiles[laneNum]);
        String[] filenameField = fastqFiles[laneNum].getName().split("_");
        thePBR = new ParseBarcodeRead[this.myEnzyme.length];
        for (int i = 0; i < this.myEnzyme.length; i++) {
            if (filenameField.length == 3) {
                thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[1]);
            } else if (filenameField.length == 4) {
                thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[2]);
            } else // B08AAABXX_s_1_sequence.txt.gz
            if (filenameField.length == 5) {
                thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[1], filenameField[3]);
            } else {
                myLogger.error("Error in parsing file name: " + fastqFiles[laneNum]);
                myLogger.error("   The filename does not contain either 3, 4, or 5 underscore-delimited values.");
                myLogger.error("   Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
                continue;
            }
        }
        taxa = thePBR[0].getSortedTaxaNames();
        n = taxa.length;
        os = countFileNames[laneNum];
        volume = 0;
        myLogger.info("Total barcodes found in lane:" + thePBR[0].getBarCodeCount());
        if (thePBR[0].getBarCodeCount() == 0) {
            myLogger.warn("No barcodes found.  Skipping this flowcell lane.");
            continue;
        }
        String[] taxaNames = new String[thePBR[0].getBarCodeCount()];
        for (int i = 0; i < taxaNames.length; i++) {
            taxaNames[i] = thePBR[0].getTheBarcodes(i).getTaxaName();
        }
        long start = System.currentTimeMillis();
        this.initial_thread_pool();
        try {
            BufferedReader br = Utils.getBufferedReader(fastqFiles[laneNum], 65536);
            int block = 10000;
            String[][] Qs = new String[block][2];
            int k = 0;
            allReads = 0;
            goodBarcodedReads = 0;
            tags = 0;
            String temp = br.readLine();
            while (temp != null) {
                try {
                    Qs[k][0] = br.readLine();
                    br.readLine();
                    Qs[k][1] = br.readLine();
                } catch (NullPointerException e) {
                    myLogger.error("Unable to correctly parse the sequence from fastq file.  " + "Your fastq file may have been corrupted.");
                    System.exit(1);
                }
                k++;
                temp = br.readLine();
                if (k == block || temp == null) {
                    if (usedMemory() / maxMemory() > load) {
                        this.waitFor();
                        writeHardDisk();
                        this.initial_thread_pool();
                    }
                    executor.submit(new Runnable() {

                        private String[][] fastq;

                        @Override
                        public void run() {
                            // TODO Auto-generated method stub
                            try {
                                final Map<BitSet, short[]> block_tagCounts = new HashMap<BitSet, short[]>();
                                int block_allReads = 0, block_goodBarcodedReads = 0;
                                ReadBarcodeResult rr = null;
                                BitSet key;
                                for (int i = 0; i < fastq.length; i++) {
                                    if (fastq[i][0] == null)
                                        break;
                                    // synchronized(lock) {
                                    // allReads++;
                                    // }
                                    block_allReads++;
                                    outerloop: for (int j = 0; j < myLeadingTrim.length; j++) {
                                        for (int k = 0; k < thePBR.length; k++) {
                                            rr = thePBR[k].parseReadIntoTagAndTaxa(fastq[i][0].substring(myLeadingTrim[j]), fastq[i][1].substring(myLeadingTrim[j]), myMinQualS);
                                            if (rr != null)
                                                break outerloop;
                                        }
                                    }
                                    if (rr != null) {
                                        key = rr.read;
                                        /**
                                         *											synchronized(lock) {
                                         *												goodBarcodedReads++;
                                         *												if (allReads % 1000000 == 0) {
                                         *													myLogger.info("Total Reads:" + allReads +
                                         *															" Reads with barcode and cut site overhang:" +
                                         *															goodBarcodedReads);
                                         *												}
                                         *												if(!tagCounts.containsKey(key)) {
                                         *													tagCounts.put(key, new short[n]);
                                         *													tags++;
                                         *												}
                                         *												tagCounts.get(key)[rr.taxonId]++;
                                         *											}
                                         */
                                        block_goodBarcodedReads++;
                                        if (!block_tagCounts.containsKey(key)) {
                                            block_tagCounts.put(key, new short[n]);
                                        }
                                        block_tagCounts.get(key)[rr.taxonId]++;
                                    }
                                }
                                synchronized (lock) {
                                    allReads += block_allReads;
                                    goodBarcodedReads += block_goodBarcodedReads;
                                    // goodBarcodedReads);
                                    for (BitSet bs : block_tagCounts.keySet()) {
                                        if (!tagCounts.containsKey(bs)) {
                                            tags++;
                                            tagCounts.put(bs, block_tagCounts.get(bs));
                                        } else {
                                            short[] copy = tagCounts.get(bs);
                                            short[] block_copy = block_tagCounts.get(bs);
                                            for (int i = 0; i < n; i++) copy[i] += block_copy[i];
                                        }
                                    }
                                }
                            } catch (Exception e) {
                                Thread t = Thread.currentThread();
                                t.getUncaughtExceptionHandler().uncaughtException(t, e);
                                e.printStackTrace();
                                executor.shutdown();
                                System.exit(1);
                            }
                        }

                        public Runnable init(String[][] fastq) {
                            this.fastq = fastq;
                            return (this);
                        }
                    }.init(Qs));
                    k = 0;
                    Qs = new String[block][2];
                }
            }
            if (!tagCounts.isEmpty()) {
                this.waitFor();
                writeHardDisk();
            }
            br.close();
            myLogger.info("Total number of reads in lane=" + allReads);
            myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
            myLogger.info("Total number of tags=" + tags);
            myLogger.info("Process took " + (System.currentTimeMillis() - start) / 1000 + " seconds.");
        } catch (Exception e) {
            e.printStackTrace();
        }
        myLogger.info("Finished reading " + (laneNum + 1) + " of " + fastqFiles.length + " sequence files.");
        tagCounts.clear();
    }
}
Also used : HashMap(java.util.HashMap) BitSet(java.util.BitSet) IOException(java.io.IOException) FileNotFoundException(java.io.FileNotFoundException) FilenameFilter(java.io.FilenameFilter) ParseBarcodeRead(cz1.gbs.model.ParseBarcodeRead) BufferedReader(java.io.BufferedReader) ReadBarcodeResult(cz1.gbs.core.ReadBarcodeResult) File(java.io.File)

Example 4 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class Consensus method parseScaffold.

private final void parseScaffold(final String contig_fa, final String agp_map, final int minL, final String out_fa) {
    final Map<String, Sequence> contig_map = Sequence.parseFastaFileAsMap(contig_fa);
    try {
        BufferedReader br_agp = Utils.getBufferedReader(agp_map);
        final List<Sequence> scaffolds = new ArrayList<Sequence>();
        final List<Segment> segments = new ArrayList<Segment>();
        final StringBuilder str_buf = new StringBuilder();
        String line = br_agp.readLine();
        String[] s;
        String seq_sn;
        Sequence contig;
        final Set<String> anchored_seqs = new HashSet<String>();
        while (line != null) {
            s = line.split("\\s+");
            seq_sn = s[5];
            segments.clear();
            segments.add(new Segment(s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), Integer.parseInt(s[3]), s[4], s[5], Integer.parseInt(s[6]), Integer.parseInt(s[7])));
            while ((line = br_agp.readLine()) != null) {
                s = line.split("\\s+");
                if (s[5].equals(seq_sn))
                    segments.add(new Segment(s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), Integer.parseInt(s[3]), s[4], s[5], Integer.parseInt(s[6]), Integer.parseInt(s[7])));
                else
                    break;
            }
            str_buf.setLength(0);
            for (Segment seg : segments) {
                if (seg.type == MAP_ENUM.GAP) {
                    str_buf.append(Sequence.polyN(seg.seq_ln));
                } else {
                    contig = contig_map.get(seg.seq_sn);
                    str_buf.append(seg.seq_rev ? Sequence.revCompSeq(contig.seq_str().substring(seg.seq_start, seg.seq_end)) : contig.seq_str().substring(seg.seq_start, seg.seq_end));
                    anchored_seqs.add(seg.seq_sn);
                }
            }
            scaffolds.add(new Sequence(str_buf.toString().replaceAll("N{1,}$", "").replaceAll("^N{1,}", "")));
        }
        br_agp.close();
        for (String seq : anchored_seqs) contig_map.remove(seq);
        BufferedWriter bw_fa = Utils.getBufferedWriter(out_fa);
        System.err.println("Buffer contigs...");
        for (Map.Entry<String, Sequence> entry : contig_map.entrySet()) scaffolds.add(entry.getValue());
        System.err.println("Sort scaffolds...");
        Collections.sort(scaffolds);
        Collections.reverse(scaffolds);
        int n = scaffolds.size(), rm = 0, seq_ln;
        String seq_str;
        System.err.println(n + " scaffolds to parse...");
        for (int i = 0; i != n; i++) {
            if (i % 10000 == 0)
                System.err.println(i + " scaffolds parsed...");
            contig = scaffolds.get(i);
            if (contig.seq_ln() < minL || contig.seq_ln() == 0)
                break;
            rm++;
            bw_fa.write(contig.seq_str().contains("N") ? ">S" : ">C");
            bw_fa.write(String.format("%08d\n", i + 1));
            /**
             *				// not too slow as string buffer insertion is O(n)
             *				str_buf.setLength(0);
             *				str_buf.append(contig.seq_str);
             *
             *				int offset=0, seq_ln=str_buf.length();
             *				while(offset<seq_ln) {
             *					offset += lineWidth;
             *					str_buf.insert( Math.min(offset, seq_ln), "\n" );
             *					seq_ln++;
             *					offset++;
             *				}
             *
             *				bw_fa.write(str_buf.toString());
             */
            seq_str = contig.seq_str();
            seq_ln = contig.seq_ln();
            bw_fa.write(seq_str.charAt(0));
            for (int j = 1; j != seq_ln; j++) {
                if (j % lineWidth == 0)
                    bw_fa.write("\n");
                bw_fa.write(seq_str.charAt(j));
            }
            bw_fa.write("\n");
        }
        bw_fa.close();
        System.err.println("Calculate statistics...");
        // for(int i=rm; i!=n; i++) scaffolds.remove(i);
        scaffolds.subList(rm, scaffolds.size()).clear();
        int gap_ln = 0;
        seq_ln = 0;
        for (int i = 0; i != rm; i++) {
            seq_ln += scaffolds.get(i).seq_ln();
            gap_ln += StringUtils.countMatches(scaffolds.get(i).seq_str(), "N");
        }
        System.err.println("# SEQ_LEN\t" + seq_ln);
        System.err.println("# GAP_LEN\t" + gap_ln);
        System.err.println("#     N50\t" + calcWeightedMedianSatistic(scaffolds, 0.5));
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : ArrayList(java.util.ArrayList) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) BufferedReader(java.io.BufferedReader) Map(java.util.Map) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) HashSet(java.util.HashSet)

Example 5 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class Consensus method setParameters.

@Override
public void setParameters(String[] args) {
    // TODO Auto-generated method stub
    if (myArgsEngine == null) {
        myArgsEngine = new ArgsEngine();
        myArgsEngine.add("-s", "--sequence", true);
        myArgsEngine.add("-m", "--map", true);
        myArgsEngine.add("-as", "--block-size", true);
        myArgsEngine.add("-bs", "--batch-size", true);
        myArgsEngine.add("-t", "--threads", true);
        myArgsEngine.add("-l", "--min-size", true);
        myArgsEngine.add("-o", "--out", true);
        myArgsEngine.addWildOptions(bam_lib, true);
        myArgsEngine.addWildOptions(ins_lib, true);
        myArgsEngine.addWildOptions(w_lib, true);
        myArgsEngine.parse(args);
    }
    if (myArgsEngine.getBoolean("-s")) {
        this.sequence_file = myArgsEngine.getString("-s");
    } else {
        printUsage();
        throw new IllegalArgumentException("Please specify the contig file.");
    }
    if (myArgsEngine.getBoolean("-m")) {
        this.map_file = myArgsEngine.getString("-m");
    } else {
        printUsage();
        throw new IllegalArgumentException("Please specify the map file.");
    }
    if (myArgsEngine.getBoolean("-as")) {
        this.min_as = Integer.parseInt(myArgsEngine.getString("-as"));
    }
    if (myArgsEngine.getBoolean("-bs")) {
        this.batch_size = Integer.parseInt(myArgsEngine.getString("-bs"));
    }
    if (myArgsEngine.getBoolean("-t")) {
        this.THREADS = Integer.parseInt(myArgsEngine.getString("-t"));
    }
    if (myArgsEngine.getBoolean("-l")) {
        this.min_size = Integer.parseInt(myArgsEngine.getString("-l"));
    }
    if (myArgsEngine.getBoolean("-o")) {
        this.out_prefix = myArgsEngine.getString("-o");
    } else {
        printUsage();
        throw new IllegalArgumentException("Please specify the output file.");
    }
    this.parseDataLibrary(args);
}
Also used : ArgsEngine(cz1.util.ArgsEngine)

Aggregations

HashMap (java.util.HashMap)16 Sequence (cz1.ngs.model.Sequence)15 IOException (java.io.IOException)13 BufferedReader (java.io.BufferedReader)11 HashSet (java.util.HashSet)11 ArrayList (java.util.ArrayList)8 BufferedWriter (java.io.BufferedWriter)7 Map (java.util.Map)7 TreeRangeSet (com.google.common.collect.TreeRangeSet)6 AlignmentSegment (cz1.ngs.model.AlignmentSegment)5 Blast6Segment (cz1.ngs.model.Blast6Segment)5 FileReader (java.io.FileReader)5 List (java.util.List)5 Set (java.util.Set)5 File (java.io.File)4 Barcode (cz1.gbs.core.Barcode)3 SamReader (htsjdk.samtools.SamReader)3 BidiMap (org.apache.commons.collections4.BidiMap)3 ReadBarcodeResult (cz1.gbs.core.ReadBarcodeResult)2 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)2