use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
the class ParseBarcodeRead method setupBarcodeFiles.
/**
* Reads in an Illumina key file, creates a linear array of {@link Barcode} objects
* representing the barcodes in the key file, then creates a hash map containing
* indices from the linear array indexed by sequence. The names of barcode objects
* follow the pattern samplename:flowcell:lane:well, since sample names alone are not unique.
*
* @param keyFile Illumina key file.
* @param flowcell Only barcodes from this flowcell will be added to the array.
* @param lane Only barcodes from this lane will be added to the array.
* @return Number of barcodes in the array.
*/
private int setupBarcodeFiles(File keyFile, String flowcell, String lane) {
try {
BufferedReader br = new BufferedReader(new FileReader(keyFile), 65536);
ArrayList<Barcode> theBarcodesArrayList = new ArrayList<Barcode>();
String temp;
int k = 0;
while (((temp = br.readLine()) != null)) {
// split by whitespace
String[] s = temp.split("\\t");
Barcode theBC = null;
if (s[0].equals(flowcell) && s[1].equals(lane)) {
String well = (s[6].length() < 2) ? (s[5] + '0' + s[6]) : s[5] + s[6];
if (s.length < 8 || s[7] == null || s[7].equals("")) {
// use the plate and well
theBC = new Barcode(s[2], initialCutSiteRemnant, s[3] + ":" + s[0] + ":" + s[1] + ":" + s[4] + ":" + well, flowcell, lane, k++);
} else {
// use the "libraryPlateWellID" or whatever is in column H of the key file, IF it is an integer
try {
int libPrepID = Integer.parseInt(s[7]);
theBC = new Barcode(s[2], initialCutSiteRemnant, s[3] + ":" + s[0] + ":" + s[1] + ":" + libPrepID, flowcell, lane, k++);
} catch (NumberFormatException nfe) {
theBC = new Barcode(s[2], initialCutSiteRemnant, s[3] + ":" + s[0] + ":" + s[1] + ":" + s[4] + ":" + well, flowcell, lane, k++);
}
}
theBarcodesArrayList.add(theBC);
System.out.println(theBC.getBarcodeString() + " " + theBC.getTaxaName());
}
}
br.close();
theBarcodes = new Barcode[theBarcodesArrayList.size()];
theBarcodesArrayList.toArray(theBarcodes);
Arrays.sort(theBarcodes);
int nBL = theBarcodes[0].getBarOverLong().length;
quickBarcodeList = new long[theBarcodes.length * nBL];
quickMap = new HashMap<Long, Integer>();
for (int i = 0; i < theBarcodes.length; i++) {
for (int j = 0; j < nBL; j++) {
quickBarcodeList[i * nBL + j] = theBarcodes[i].getBarOverLong()[j];
quickMap.put(theBarcodes[i].getBarOverLong()[j], i);
}
}
Arrays.sort(quickBarcodeList);
} catch (Exception e) {
System.out.println("Error with setupBarcodeFiles: " + e);
}
return theBarcodes.length;
}
use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
the class ParseBarcodeRead method parseReadIntoTagAndTaxa.
/**
* The barcode libraries used for this study can include two types of
* extraneous sequence at the end of reads. The first are chimeras created
* with the free ends. These will recreate the restriction site. The second
* are short regions (less than 64bp), so that will they will contain a
* portion of site and the universal adapter. This finds the first of site
* in likelyReadEnd, keeps the restriction site overhang and then sets
* everything to polyA afterwards
*
* @param seq An unprocessed tag sequence.
* @param maxLength The maximum number of bp in the processed sequence.
* @return returnValue A ReadBarcodeResult object containing the unprocessed
* tag, Cut site position, Processed tag, and Poly-A padded tag.
*/
public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, int minQual) {
if (seqS == null)
return null;
if (minQual != 0 && seqS != null) {
try {
final char[] seqC = (char[]) field.get(seqS), qualC = (char[]) field.get(qualS);
int len = seqC.length;
for (int i = 0; i < len; i++) if (qualC[i] - 33 < minQual)
seqC[i] = 'N';
seqS = String.valueOf(seqC);
} catch (IllegalArgumentException | IllegalAccessException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
if (bestBarcode == null) {
// overhang missing so skip
return null;
}
seqS = StringUtils.strip(seqS.substring(bestBarcode.getBarLength()), "N");
if (seqS.length() < 32)
return null;
int cutSitePosition = seqS.length();
for (String end : likelyReadEnd) {
int w = seqS.indexOf(end);
if (w >= 0 && w < cutSitePosition)
cutSitePosition = w;
}
String processedSeqS = StringUtils.strip(seqS.substring(0, cutSitePosition), "N");
if (processedSeqS.length() >= 32)
return new ReadBarcodeResult(BaseEncoder.getBitSetFromSeq(processedSeqS), bestBarcode.getTaxaId());
return null;
}
use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
the class FastqToTagSequence method run.
@Override
public void run() {
String[] countFileNames = null;
File inputDirectory = new File(this.myInputDirName);
File[] fastqFiles = inputDirectory.listFiles(new FilenameFilter() {
@Override
public boolean accept(File dir, String name) {
return name.matches("(?i).*\\.fq$|.*\\.fq\\.gz$|.*\\.fastq$|.*_fastq\\.txt$|.*_fastq\\.gz$|.*_fastq\\.txt\\.gz$|.*_sequence\\.txt$|.*_sequence\\.txt\\.gz$");
// (?i) denotes case insensitive; \\. denotes escape . so it doesn't mean 'any char' & escape the backslash
}
});
if (fastqFiles.length == 0 || fastqFiles == null) {
myLogger.warn("Couldn't find any files that end with \".fq\", \".fq.gz\", \".fastq\", \"_fastq.txt\", \"_fastq.gz\", \"_fastq.txt.gz\", \"_sequence.txt\", or \"_sequence.txt.gz\" in the supplied directory.");
return;
} else {
myLogger.info("Using the following FASTQ files:");
countFileNames = new String[fastqFiles.length];
for (int i = 0; i < fastqFiles.length; i++) {
countFileNames[i] = fastqFiles[i].getName().replaceAll("(?i)\\.fq$|\\.fq\\.gz$|\\.fastq$|_fastq\\.txt$|_fastq\\.gz$|_fastq\\.txt\\.gz$|_sequence\\.txt$|_sequence\\.txt\\.gz$", "");
// \\. escape . so it doesn't mean 'any char' & escape the backslash
myLogger.info(fastqFiles[i].getAbsolutePath());
}
}
for (int laneNum = 0; laneNum < fastqFiles.length; laneNum++) {
// for (int laneNum = 2; laneNum < 3; laneNum++) {
if (new File(myOutputDir + System.getProperty("file.separator") + countFileNames[laneNum] + ".cnt.gz").exists()) {
myLogger.info("Fastq file " + fastqFiles[laneNum] + " skipped.");
continue;
}
File outputFile = new File(this.myOutputDir + File.separator + countFileNames[laneNum]);
if (outputFile.isFile()) {
myLogger.warn("An output file " + countFileNames[laneNum] + "\n" + " already exists in the output directory for file " + fastqFiles[laneNum] + ". Skipping.");
continue;
}
myLogger.info("Reading FASTQ file: " + fastqFiles[laneNum]);
String[] filenameField = fastqFiles[laneNum].getName().split("_");
thePBR = new ParseBarcodeRead[this.myEnzyme.length];
for (int i = 0; i < this.myEnzyme.length; i++) {
if (filenameField.length == 3) {
thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[1]);
} else if (filenameField.length == 4) {
thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[0], filenameField[2]);
} else // B08AAABXX_s_1_sequence.txt.gz
if (filenameField.length == 5) {
thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], filenameField[1], filenameField[3]);
} else {
myLogger.error("Error in parsing file name: " + fastqFiles[laneNum]);
myLogger.error(" The filename does not contain either 3, 4, or 5 underscore-delimited values.");
myLogger.error(" Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
continue;
}
}
taxa = thePBR[0].getSortedTaxaNames();
n = taxa.length;
os = countFileNames[laneNum];
volume = 0;
myLogger.info("Total barcodes found in lane:" + thePBR[0].getBarCodeCount());
if (thePBR[0].getBarCodeCount() == 0) {
myLogger.warn("No barcodes found. Skipping this flowcell lane.");
continue;
}
String[] taxaNames = new String[thePBR[0].getBarCodeCount()];
for (int i = 0; i < taxaNames.length; i++) {
taxaNames[i] = thePBR[0].getTheBarcodes(i).getTaxaName();
}
long start = System.currentTimeMillis();
this.initial_thread_pool();
try {
BufferedReader br = Utils.getBufferedReader(fastqFiles[laneNum], 65536);
int block = 10000;
String[][] Qs = new String[block][2];
int k = 0;
allReads = 0;
goodBarcodedReads = 0;
tags = 0;
String temp = br.readLine();
while (temp != null) {
try {
Qs[k][0] = br.readLine();
br.readLine();
Qs[k][1] = br.readLine();
} catch (NullPointerException e) {
myLogger.error("Unable to correctly parse the sequence from fastq file. " + "Your fastq file may have been corrupted.");
System.exit(1);
}
k++;
temp = br.readLine();
if (k == block || temp == null) {
if (usedMemory() / maxMemory() > load) {
this.waitFor();
writeHardDisk();
this.initial_thread_pool();
}
executor.submit(new Runnable() {
private String[][] fastq;
@Override
public void run() {
// TODO Auto-generated method stub
try {
final Map<BitSet, short[]> block_tagCounts = new HashMap<BitSet, short[]>();
int block_allReads = 0, block_goodBarcodedReads = 0;
ReadBarcodeResult rr = null;
BitSet key;
for (int i = 0; i < fastq.length; i++) {
if (fastq[i][0] == null)
break;
// synchronized(lock) {
// allReads++;
// }
block_allReads++;
outerloop: for (int j = 0; j < myLeadingTrim.length; j++) {
for (int k = 0; k < thePBR.length; k++) {
rr = thePBR[k].parseReadIntoTagAndTaxa(fastq[i][0].substring(myLeadingTrim[j]), fastq[i][1].substring(myLeadingTrim[j]), myMinQualS);
if (rr != null)
break outerloop;
}
}
if (rr != null) {
key = rr.read;
/**
* synchronized(lock) {
* goodBarcodedReads++;
* if (allReads % 1000000 == 0) {
* myLogger.info("Total Reads:" + allReads +
* " Reads with barcode and cut site overhang:" +
* goodBarcodedReads);
* }
* if(!tagCounts.containsKey(key)) {
* tagCounts.put(key, new short[n]);
* tags++;
* }
* tagCounts.get(key)[rr.taxonId]++;
* }
*/
block_goodBarcodedReads++;
if (!block_tagCounts.containsKey(key)) {
block_tagCounts.put(key, new short[n]);
}
block_tagCounts.get(key)[rr.taxonId]++;
}
}
synchronized (lock) {
allReads += block_allReads;
goodBarcodedReads += block_goodBarcodedReads;
// goodBarcodedReads);
for (BitSet bs : block_tagCounts.keySet()) {
if (!tagCounts.containsKey(bs)) {
tags++;
tagCounts.put(bs, block_tagCounts.get(bs));
} else {
short[] copy = tagCounts.get(bs);
short[] block_copy = block_tagCounts.get(bs);
for (int i = 0; i < n; i++) copy[i] += block_copy[i];
}
}
}
} catch (Exception e) {
Thread t = Thread.currentThread();
t.getUncaughtExceptionHandler().uncaughtException(t, e);
e.printStackTrace();
executor.shutdown();
System.exit(1);
}
}
public Runnable init(String[][] fastq) {
this.fastq = fastq;
return (this);
}
}.init(Qs));
k = 0;
Qs = new String[block][2];
}
}
if (!tagCounts.isEmpty()) {
this.waitFor();
writeHardDisk();
}
br.close();
myLogger.info("Total number of reads in lane=" + allReads);
myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
myLogger.info("Total number of tags=" + tags);
myLogger.info("Process took " + (System.currentTimeMillis() - start) / 1000 + " seconds.");
} catch (Exception e) {
e.printStackTrace();
}
myLogger.info("Finished reading " + (laneNum + 1) + " of " + fastqFiles.length + " sequence files.");
tagCounts.clear();
}
}
use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
the class Consensus method parseScaffold.
private final void parseScaffold(final String contig_fa, final String agp_map, final int minL, final String out_fa) {
final Map<String, Sequence> contig_map = Sequence.parseFastaFileAsMap(contig_fa);
try {
BufferedReader br_agp = Utils.getBufferedReader(agp_map);
final List<Sequence> scaffolds = new ArrayList<Sequence>();
final List<Segment> segments = new ArrayList<Segment>();
final StringBuilder str_buf = new StringBuilder();
String line = br_agp.readLine();
String[] s;
String seq_sn;
Sequence contig;
final Set<String> anchored_seqs = new HashSet<String>();
while (line != null) {
s = line.split("\\s+");
seq_sn = s[5];
segments.clear();
segments.add(new Segment(s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), Integer.parseInt(s[3]), s[4], s[5], Integer.parseInt(s[6]), Integer.parseInt(s[7])));
while ((line = br_agp.readLine()) != null) {
s = line.split("\\s+");
if (s[5].equals(seq_sn))
segments.add(new Segment(s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), Integer.parseInt(s[3]), s[4], s[5], Integer.parseInt(s[6]), Integer.parseInt(s[7])));
else
break;
}
str_buf.setLength(0);
for (Segment seg : segments) {
if (seg.type == MAP_ENUM.GAP) {
str_buf.append(Sequence.polyN(seg.seq_ln));
} else {
contig = contig_map.get(seg.seq_sn);
str_buf.append(seg.seq_rev ? Sequence.revCompSeq(contig.seq_str().substring(seg.seq_start, seg.seq_end)) : contig.seq_str().substring(seg.seq_start, seg.seq_end));
anchored_seqs.add(seg.seq_sn);
}
}
scaffolds.add(new Sequence(str_buf.toString().replaceAll("N{1,}$", "").replaceAll("^N{1,}", "")));
}
br_agp.close();
for (String seq : anchored_seqs) contig_map.remove(seq);
BufferedWriter bw_fa = Utils.getBufferedWriter(out_fa);
System.err.println("Buffer contigs...");
for (Map.Entry<String, Sequence> entry : contig_map.entrySet()) scaffolds.add(entry.getValue());
System.err.println("Sort scaffolds...");
Collections.sort(scaffolds);
Collections.reverse(scaffolds);
int n = scaffolds.size(), rm = 0, seq_ln;
String seq_str;
System.err.println(n + " scaffolds to parse...");
for (int i = 0; i != n; i++) {
if (i % 10000 == 0)
System.err.println(i + " scaffolds parsed...");
contig = scaffolds.get(i);
if (contig.seq_ln() < minL || contig.seq_ln() == 0)
break;
rm++;
bw_fa.write(contig.seq_str().contains("N") ? ">S" : ">C");
bw_fa.write(String.format("%08d\n", i + 1));
/**
* // not too slow as string buffer insertion is O(n)
* str_buf.setLength(0);
* str_buf.append(contig.seq_str);
*
* int offset=0, seq_ln=str_buf.length();
* while(offset<seq_ln) {
* offset += lineWidth;
* str_buf.insert( Math.min(offset, seq_ln), "\n" );
* seq_ln++;
* offset++;
* }
*
* bw_fa.write(str_buf.toString());
*/
seq_str = contig.seq_str();
seq_ln = contig.seq_ln();
bw_fa.write(seq_str.charAt(0));
for (int j = 1; j != seq_ln; j++) {
if (j % lineWidth == 0)
bw_fa.write("\n");
bw_fa.write(seq_str.charAt(j));
}
bw_fa.write("\n");
}
bw_fa.close();
System.err.println("Calculate statistics...");
// for(int i=rm; i!=n; i++) scaffolds.remove(i);
scaffolds.subList(rm, scaffolds.size()).clear();
int gap_ln = 0;
seq_ln = 0;
for (int i = 0; i != rm; i++) {
seq_ln += scaffolds.get(i).seq_ln();
gap_ln += StringUtils.countMatches(scaffolds.get(i).seq_str(), "N");
}
System.err.println("# SEQ_LEN\t" + seq_ln);
System.err.println("# GAP_LEN\t" + gap_ln);
System.err.println("# N50\t" + calcWeightedMedianSatistic(scaffolds, 0.5));
} catch (IOException e) {
e.printStackTrace();
}
}
use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
the class Consensus method setParameters.
@Override
public void setParameters(String[] args) {
// TODO Auto-generated method stub
if (myArgsEngine == null) {
myArgsEngine = new ArgsEngine();
myArgsEngine.add("-s", "--sequence", true);
myArgsEngine.add("-m", "--map", true);
myArgsEngine.add("-as", "--block-size", true);
myArgsEngine.add("-bs", "--batch-size", true);
myArgsEngine.add("-t", "--threads", true);
myArgsEngine.add("-l", "--min-size", true);
myArgsEngine.add("-o", "--out", true);
myArgsEngine.addWildOptions(bam_lib, true);
myArgsEngine.addWildOptions(ins_lib, true);
myArgsEngine.addWildOptions(w_lib, true);
myArgsEngine.parse(args);
}
if (myArgsEngine.getBoolean("-s")) {
this.sequence_file = myArgsEngine.getString("-s");
} else {
printUsage();
throw new IllegalArgumentException("Please specify the contig file.");
}
if (myArgsEngine.getBoolean("-m")) {
this.map_file = myArgsEngine.getString("-m");
} else {
printUsage();
throw new IllegalArgumentException("Please specify the map file.");
}
if (myArgsEngine.getBoolean("-as")) {
this.min_as = Integer.parseInt(myArgsEngine.getString("-as"));
}
if (myArgsEngine.getBoolean("-bs")) {
this.batch_size = Integer.parseInt(myArgsEngine.getString("-bs"));
}
if (myArgsEngine.getBoolean("-t")) {
this.THREADS = Integer.parseInt(myArgsEngine.getString("-t"));
}
if (myArgsEngine.getBoolean("-l")) {
this.min_size = Integer.parseInt(myArgsEngine.getString("-l"));
}
if (myArgsEngine.getBoolean("-o")) {
this.out_prefix = myArgsEngine.getString("-o");
} else {
printUsage();
throw new IllegalArgumentException("Please specify the output file.");
}
this.parseDataLibrary(args);
}
Aggregations