use of cz1.gbs.core.Barcode in project polyGembler by c-zhou.
the class ParseBarcodeRead method setupBarcodeFiles.
/**
 * Reads a tab-delimited key file and builds the barcode lookup structures for a
 * single flowcell lane.
 *
 * <p>Every line whose first column equals {@code flowcell} and whose second column
 * equals {@code lane} yields one {@link Barcode}. The barcode is named
 * {@code col3:col0:col1:col7} when column 7 is present and parses as an integer,
 * and {@code col3:col0:col1:col4:well} otherwise, where {@code well} is column 5
 * followed by column 6 left-padded with a zero to two characters. (Column meaning,
 * e.g. sample / plate / row / column, is presumed from the key file layout and is
 * not verified here.)
 *
 * <p>On success the method fills {@code theBarcodes} (sorted), {@code quickBarcodeList}
 * (sorted) and {@code quickMap} (barcode-with-overhang value to index in
 * {@code theBarcodes}). Lines that are too short to be a key file record are skipped.
 * Any exception is reported on standard output and leaves the structures as they
 * were before the failing step.
 *
 * @param keyFile  tab-delimited key file.
 * @param flowcell only records of this flowcell are used.
 * @param lane     only records of this lane are used.
 * @return number of barcodes in {@code theBarcodes}; 0 if none could be loaded.
 */
private int setupBarcodeFiles(File keyFile, String flowcell, String lane) {
    // try-with-resources closes the key file even when parsing fails half way.
    try (BufferedReader br = new BufferedReader(new FileReader(keyFile), 65536)) {
        ArrayList<Barcode> theBarcodesArrayList = new ArrayList<Barcode>();
        String temp;
        int k = 0;
        while ((temp = br.readLine()) != null) {
            // split by tab
            String[] s = temp.split("\\t");
            if (s.length < 2) {
                // blank or truncated line: cannot name a flowcell and lane
                continue;
            }
            if (!s[0].equals(flowcell) || !s[1].equals(lane)) {
                continue;
            }
            if (s.length < 7) {
                // the well columns (5 and 6) are needed to build the barcode name
                System.out.println("Skipping malformed key file line: " + temp);
                continue;
            }
            String well = (s[6].length() < 2) ? (s[5] + '0' + s[6]) : s[5] + s[6];
            String namePrefix = s[3] + ":" + s[0] + ":" + s[1] + ":";
            // default: use the plate and well
            String taxaName = namePrefix + s[4] + ":" + well;
            if (s.length >= 8 && s[7] != null && !s[7].equals("")) {
                // use the "libraryPlateWellID" or whatever is in column H of the
                // key file, IF it is an integer
                try {
                    taxaName = namePrefix + Integer.parseInt(s[7]);
                } catch (NumberFormatException nfe) {
                    // not an integer: keep the plate and well based name
                }
            }
            Barcode theBC = new Barcode(s[2], initialCutSiteRemnant, taxaName, flowcell, lane, k++);
            theBarcodesArrayList.add(theBC);
            System.out.println(theBC.getBarcodeString() + " " + theBC.getTaxaName());
        }
        theBarcodes = new Barcode[theBarcodesArrayList.size()];
        theBarcodesArrayList.toArray(theBarcodes);
        Arrays.sort(theBarcodes);
        // With no matching record there is no first barcode to size the table from;
        // fall back to empty lookup structures instead of failing.
        int nBL = (theBarcodes.length == 0) ? 0 : theBarcodes[0].getBarOverLong().length;
        quickBarcodeList = new long[theBarcodes.length * nBL];
        quickMap = new HashMap<Long, Integer>();
        for (int i = 0; i < theBarcodes.length; i++) {
            long[] barOverLong = theBarcodes[i].getBarOverLong();
            for (int j = 0; j < nBL; j++) {
                quickBarcodeList[i * nBL + j] = barOverLong[j];
                quickMap.put(barOverLong[j], i);
            }
        }
        Arrays.sort(quickBarcodeList);
    } catch (Exception e) {
        System.out.println("Error with setupBarcodeFiles: " + e);
    }
    // theBarcodes is still unset when the very first load failed (e.g. missing file).
    return (theBarcodes == null) ? 0 : theBarcodes.length;
}
use of cz1.gbs.core.Barcode in project polyGembler by c-zhou.
the class ParseBarcodeRead method parseReadIntoTagAndTaxa.
/**
 * Assigns a read to a barcode and extracts the tag sequence that follows it.
 *
 * <p>Steps, in order:
 * <ol>
 * <li>If {@code minQual} is non-zero, every base whose quality character minus 33
 * is below {@code minQual} is replaced by {@code 'N'}.</li>
 * <li>The best matching barcode is looked up with {@code findBestBarcode}; the
 * barcode prefix is removed and leading/trailing {@code 'N'}s are stripped.</li>
 * <li>The read is cut at the first occurrence of any sequence in
 * {@code likelyReadEnd}, and leading/trailing {@code 'N'}s are stripped again.</li>
 * </ol>
 *
 * @param seqS    the raw read sequence; {@code null} yields {@code null}.
 * @param qualS   the quality string for {@code seqS}; only read when
 *                {@code minQual} is non-zero, and must then be at least as long
 *                as {@code seqS}.
 * @param minQual minimum quality (quality character minus 33) for a base to be
 *                kept; 0 disables quality masking.
 * @return the encoded tag together with the taxon id of the matched barcode, or
 *         {@code null} if {@code seqS} is {@code null}, no barcode matches, or
 *         fewer than 32 bases remain after either trimming step.
 */
public ReadBarcodeResult parseReadIntoTagAndTaxa(String seqS, String qualS, int minQual) {
    if (seqS == null) {
        return null;
    }
    if (minQual != 0) {
        // Mask on a private copy of the bases. Reaching into the String through
        // reflection is avoided: Strings are immutable and may be shared, and their
        // internal storage is not a char[] on current JDKs.
        final char[] seqC = seqS.toCharArray();
        final int len = seqC.length;
        for (int i = 0; i < len; i++) {
            if (qualS.charAt(i) - 33 < minQual) {
                seqC[i] = 'N';
            }
        }
        seqS = String.valueOf(seqC);
    }
    Barcode bestBarcode = findBestBarcode(seqS, maximumMismatchInBarcodeAndOverhang);
    if (bestBarcode == null) {
        // overhang missing so skip
        return null;
    }
    seqS = StringUtils.strip(seqS.substring(bestBarcode.getBarLength()), "N");
    if (seqS.length() < 32) {
        return null;
    }
    // earliest position of any likely read end; the whole read if none is found
    int cutSitePosition = seqS.length();
    for (String end : likelyReadEnd) {
        int w = seqS.indexOf(end);
        if (w >= 0 && w < cutSitePosition) {
            cutSitePosition = w;
        }
    }
    String processedSeqS = StringUtils.strip(seqS.substring(0, cutSitePosition), "N");
    if (processedSeqS.length() >= 32) {
        return new ReadBarcodeResult(BaseEncoder.getBitSetFromSeq(processedSeqS), bestBarcode.getTaxaId());
    }
    return null;
}
use of cz1.gbs.core.Barcode in project polyGembler by c-zhou.
the class FastqToTagSequence method run.
/**
 * Processes every FASTQ file found in {@code myInputDirName}.
 *
 * <p>For each file the flowcell and lane are taken from the underscore-delimited
 * file name, one {@code ParseBarcodeRead} is created per enzyme, and the reads are
 * handed to the thread pool in blocks of 10000. Each worker counts tags per taxon
 * for its block and merges the result into {@code tagCounts} under {@code lock}.
 * Files are skipped when a matching output already exists, when the file name
 * cannot be parsed, or when the key file holds no barcode for the lane.
 */
@Override
public void run() {
    String[] countFileNames = null;
    File inputDirectory = new File(this.myInputDirName);
    File[] fastqFiles = inputDirectory.listFiles(new FilenameFilter() {
        @Override
        public boolean accept(File dir, String name) {
            return name.matches("(?i).*\\.fq$|.*\\.fq\\.gz$|.*\\.fastq$|.*_fastq\\.txt$|.*_fastq\\.gz$|.*_fastq\\.txt\\.gz$|.*_sequence\\.txt$|.*_sequence\\.txt\\.gz$");
            // (?i) denotes case insensitive; \\. denotes escape . so it doesn't mean 'any char' & escape the backslash
        }
    });
    // listFiles returns null when the path is not a readable directory, so the
    // null test has to come before the length test.
    if (fastqFiles == null || fastqFiles.length == 0) {
        myLogger.warn("Couldn't find any files that end with \".fq\", \".fq.gz\", \".fastq\", \"_fastq.txt\", \"_fastq.gz\", \"_fastq.txt.gz\", \"_sequence.txt\", or \"_sequence.txt.gz\" in the supplied directory.");
        return;
    } else {
        myLogger.info("Using the following FASTQ files:");
        countFileNames = new String[fastqFiles.length];
        for (int i = 0; i < fastqFiles.length; i++) {
            countFileNames[i] = fastqFiles[i].getName().replaceAll("(?i)\\.fq$|\\.fq\\.gz$|\\.fastq$|_fastq\\.txt$|_fastq\\.gz$|_fastq\\.txt\\.gz$|_sequence\\.txt$|_sequence\\.txt\\.gz$", "");
            // \\. escape . so it doesn't mean 'any char' & escape the backslash
            myLogger.info(fastqFiles[i].getAbsolutePath());
        }
    }
    for (int laneNum = 0; laneNum < fastqFiles.length; laneNum++) {
        if (new File(myOutputDir + System.getProperty("file.separator") + countFileNames[laneNum] + ".cnt.gz").exists()) {
            myLogger.info("Fastq file " + fastqFiles[laneNum] + " skipped.");
            continue;
        }
        File outputFile = new File(this.myOutputDir + File.separator + countFileNames[laneNum]);
        if (outputFile.isFile()) {
            myLogger.warn("An output file " + countFileNames[laneNum] + "\n" + " already exists in the output directory for file " + fastqFiles[laneNum] + ". Skipping.");
            continue;
        }
        myLogger.info("Reading FASTQ file: " + fastqFiles[laneNum]);
        String[] filenameField = fastqFiles[laneNum].getName().split("_");
        // Resolve flowcell and lane once, before the enzyme loop, so that an
        // unparseable file name skips the whole file instead of leaving thePBR
        // filled with nulls.
        String flowcell;
        String lane;
        if (filenameField.length == 3) {
            // flowcell_lane_fastq.txt.gz
            flowcell = filenameField[0];
            lane = filenameField[1];
        } else if (filenameField.length == 4) {
            // B08AAABXX_s_1_sequence.txt.gz
            flowcell = filenameField[0];
            lane = filenameField[2];
        } else if (filenameField.length == 5) {
            // code_flowcell_s_lane_fastq.txt.gz
            flowcell = filenameField[1];
            lane = filenameField[3];
        } else {
            myLogger.error("Error in parsing file name: " + fastqFiles[laneNum]);
            myLogger.error("   The filename does not contain either 3, 4, or 5 underscore-delimited values.");
            myLogger.error("   Expect: flowcell_lane_fastq.txt.gz OR flowcell_s_lane_fastq.txt.gz OR code_flowcell_s_lane_fastq.txt.gz");
            continue;
        }
        thePBR = new ParseBarcodeRead[this.myEnzyme.length];
        for (int i = 0; i < this.myEnzyme.length; i++) {
            thePBR[i] = new ParseBarcodeRead(this.myKeyfile, this.myEnzyme[i], flowcell, lane);
        }
        taxa = thePBR[0].getSortedTaxaNames();
        n = taxa.length;
        os = countFileNames[laneNum];
        volume = 0;
        myLogger.info("Total barcodes found in lane:" + thePBR[0].getBarCodeCount());
        if (thePBR[0].getBarCodeCount() == 0) {
            myLogger.warn("No barcodes found.  Skipping this flowcell lane.");
            continue;
        }
        String[] taxaNames = new String[thePBR[0].getBarCodeCount()];
        for (int i = 0; i < taxaNames.length; i++) {
            taxaNames[i] = thePBR[0].getTheBarcodes(i).getTaxaName();
        }
        long start = System.currentTimeMillis();
        this.initial_thread_pool();
        // try-with-resources closes the FASTQ reader on every exit path of the block.
        try (BufferedReader br = Utils.getBufferedReader(fastqFiles[laneNum], 65536)) {
            int block = 10000;
            // Qs[r][0] holds the sequence line and Qs[r][1] the quality line of record r
            String[][] Qs = new String[block][2];
            int k = 0;
            allReads = 0;
            goodBarcodedReads = 0;
            tags = 0;
            String temp = br.readLine();
            while (temp != null) {
                try {
                    Qs[k][0] = br.readLine();
                    br.readLine();
                    Qs[k][1] = br.readLine();
                } catch (NullPointerException e) {
                    // NOTE(review): BufferedReader.readLine() returns null at end of
                    // stream rather than throwing, so this handler is not expected to
                    // fire for a truncated record - confirm the intended behaviour.
                    myLogger.error("Unable to correctly parse the sequence from fastq file.  " + "Your fastq file may have been corrupted.");
                    System.exit(1);
                }
                k++;
                temp = br.readLine();
                if (k == block || temp == null) {
                    // flush the in-memory counts before queuing more work when memory is tight
                    if (usedMemory() / maxMemory() > load) {
                        this.waitFor();
                        writeHardDisk();
                        this.initial_thread_pool();
                    }
                    executor.submit(new Runnable() {
                        private String[][] fastq;

                        @Override
                        public void run() {
                            try {
                                // Counts are collected per block and merged under the
                                // lock once, instead of locking for every read.
                                final Map<BitSet, short[]> block_tagCounts = new HashMap<BitSet, short[]>();
                                int block_allReads = 0, block_goodBarcodedReads = 0;
                                ReadBarcodeResult rr = null;
                                BitSet key;
                                for (int i = 0; i < fastq.length; i++) {
                                    // a partially filled last block ends at the first empty slot
                                    if (fastq[i][0] == null)
                                        break;
                                    block_allReads++;
                                    // try every leading trim with every enzyme until one parses
                                    outerloop: for (int j = 0; j < myLeadingTrim.length; j++) {
                                        for (int k = 0; k < thePBR.length; k++) {
                                            rr = thePBR[k].parseReadIntoTagAndTaxa(fastq[i][0].substring(myLeadingTrim[j]), fastq[i][1].substring(myLeadingTrim[j]), myMinQualS);
                                            if (rr != null)
                                                break outerloop;
                                        }
                                    }
                                    if (rr != null) {
                                        key = rr.read;
                                        block_goodBarcodedReads++;
                                        if (!block_tagCounts.containsKey(key)) {
                                            block_tagCounts.put(key, new short[n]);
                                        }
                                        block_tagCounts.get(key)[rr.taxonId]++;
                                    }
                                }
                                synchronized (lock) {
                                    allReads += block_allReads;
                                    goodBarcodedReads += block_goodBarcodedReads;
                                    for (BitSet bs : block_tagCounts.keySet()) {
                                        if (!tagCounts.containsKey(bs)) {
                                            tags++;
                                            tagCounts.put(bs, block_tagCounts.get(bs));
                                        } else {
                                            short[] copy = tagCounts.get(bs);
                                            short[] block_copy = block_tagCounts.get(bs);
                                            for (int i = 0; i < n; i++) copy[i] += block_copy[i];
                                        }
                                    }
                                }
                            } catch (Exception e) {
                                Thread t = Thread.currentThread();
                                t.getUncaughtExceptionHandler().uncaughtException(t, e);
                                e.printStackTrace();
                                executor.shutdown();
                                System.exit(1);
                            }
                        }

                        // hands the block to the task; the array is not captured from the
                        // enclosing scope because Qs is reassigned after each submit
                        public Runnable init(String[][] fastq) {
                            this.fastq = fastq;
                            return (this);
                        }
                    }.init(Qs));
                    k = 0;
                    Qs = new String[block][2];
                }
            }
            if (!tagCounts.isEmpty()) {
                this.waitFor();
                writeHardDisk();
            }
            myLogger.info("Total number of reads in lane=" + allReads);
            myLogger.info("Total number of good barcoded reads=" + goodBarcodedReads);
            myLogger.info("Total number of tags=" + tags);
            myLogger.info("Process took " + (System.currentTimeMillis() - start) / 1000 + " seconds.");
        } catch (Exception e) {
            e.printStackTrace();
        }
        myLogger.info("Finished reading " + (laneNum + 1) + " of " + fastqFiles.length + " sequence files.");
        tagCounts.clear();
    }
}
use of cz1.gbs.core.Barcode in project polyGembler by c-zhou.
the class GBSpileup method setParameters.
/**
 * Parses the command line of the pileup tool and stores the settings in fields.
 *
 * <p>Required options are {@code -i} (FASTQ location), {@code -k} (key file) and
 * {@code -f} (reference). When any of the {@code .amb/.ann/.bwt/.pac/.sa} files is
 * missing next to the reference, a {@code bwa index} command is run for it. Without
 * {@code -e} the enzyme is read from the key file: the column whose header is
 * "enzyme" (case-insensitive) is looked up in the first line and its value is taken
 * from the second line. Enzyme names are split on {@code '-'}.
 *
 * <p>{@code -T n} with {@code n > 0} sets {@code myLeadingTrim} to {@code n} followed
 * by the non-negative values among {@code n-1, n+1, n-2, n+2, n-3, n+3}.
 *
 * @param args the command line arguments.
 * @throws IllegalArgumentException if {@code args} is empty, a required option is
 *         missing, or the enzyme cannot be determined from the key file.
 */
@Override
public void setParameters(String[] args) {
    if (args.length == 0) {
        printUsage();
        throw new IllegalArgumentException("\n\nPlease use the above arguments/options.\n\n");
    }
    if (myArgsEngine == null) {
        myArgsEngine = new ArgsEngine();
        myArgsEngine.add("-i", "--input-fastq", true);
        myArgsEngine.add("-k", "--key-file", true);
        myArgsEngine.add("-e", "--enzyme", true);
        myArgsEngine.add("-q", "--min-qualS", true);
        myArgsEngine.add("-p", "--ploidy", true);
        myArgsEngine.add("-t", "--threads", true);
        myArgsEngine.add("-T", "--trim-leading", true);
        myArgsEngine.add("-b", "--unassgined-reads", true);
        myArgsEngine.add("-f", "--reference", true);
        myArgsEngine.add("-z", "--skip-freebayes", false);
        myArgsEngine.add("-x", "--max-coverage", true);
        myArgsEngine.add("-o", "--prefix", true);
        myArgsEngine.parse(args);
    }
    if (myArgsEngine.getBoolean("-i")) {
        myInputDirName = myArgsEngine.getString("-i");
    } else {
        printUsage();
        throw new IllegalArgumentException("Please specify the location of your FASTQ files.");
    }
    if (myArgsEngine.getBoolean("-k")) {
        myKeyfile = myArgsEngine.getString("-k");
    } else {
        printUsage();
        throw new IllegalArgumentException("Please specify a barcode key file.");
    }
    if (myArgsEngine.getBoolean("-z")) {
        this.skipFB = true;
    } else {
        this.require("freebayes");
    }
    if (myArgsEngine.getBoolean("-f")) {
        myReference = myArgsEngine.getString("-f");
        // build the index when any of the expected index files is missing
        if (!new File(myReference + ".amb").exists() || !new File(myReference + ".ann").exists() || !new File(myReference + ".bwt").exists() || !new File(myReference + ".pac").exists() || !new File(myReference + ".sa").exists()) {
            String index = "bwa index -p " + myReference + " -a bwtsw " + myReference;
            this.consume(this.bash(index));
        }
    } else {
        printUsage();
        throw new IllegalArgumentException("Please specify the reference.");
    }
    if (myArgsEngine.getBoolean("-e")) {
        myEnzyme = myArgsEngine.getString("-e").split("-");
    } else {
        myLogger.warn("No enzyme specified.  Using enzyme listed in key file.");
        // try-with-resources closes the key file on every path, including the
        // IllegalArgumentException thrown below.
        try (BufferedReader br = Utils.getBufferedReader(myKeyfile)) {
            String header = br.readLine();
            if (header == null) {
                throw new IllegalArgumentException("The key file " + myKeyfile + " is empty. " + "Please specify the enzyme with -e option.\n\n");
            }
            String[] s = header.split("\\s+");
            int k = -1;
            // the last column named "enzyme" wins, as before
            for (int i = 0; i < s.length; i++) {
                if (s[i].equalsIgnoreCase("enzyme")) {
                    k = i;
                }
            }
            if (k < 0) {
                throw new IllegalArgumentException("No enzyme found in the key file. " + "Please specify the enzyme with -e option.\n\n");
            }
            String firstRow = br.readLine();
            if (firstRow == null) {
                throw new IllegalArgumentException("The key file " + myKeyfile + " has no data line to read the enzyme from. " + "Please specify the enzyme with -e option.\n\n");
            }
            s = firstRow.split("\\s+");
            if (k >= s.length) {
                throw new IllegalArgumentException("The first data line of the key file " + myKeyfile + " has no enzyme column. " + "Please specify the enzyme with -e option.\n\n");
            }
            myEnzyme = s[k].split("-");
        } catch (IOException e) {
            // kept best-effort as before: the failure is reported and myEnzyme is left unchanged
            e.printStackTrace();
        }
    }
    if (myArgsEngine.getBoolean("-q")) {
        myMinQualS = Integer.parseInt(myArgsEngine.getString("-q"));
    }
    if (myArgsEngine.getBoolean("-p")) {
        myPloidy = Integer.parseInt(myArgsEngine.getString("-p"));
    }
    if (myArgsEngine.getBoolean("-t")) {
        THREADS = Integer.parseInt(myArgsEngine.getString("-t"));
    }
    if (myArgsEngine.getBoolean("-x")) {
        maxCov = Long.parseLong(myArgsEngine.getString("-x"));
    }
    if (myArgsEngine.getBoolean("-T")) {
        int leading = Integer.parseInt(myArgsEngine.getString("-T"));
        if (leading > 0) {
            // candidate trims: the requested value first, then its neighbours within 3
            List<Integer> leadings = new ArrayList<Integer>();
            leadings.add(leading);
            for (int i = 1; i < 4; i++) {
                if (leading - i >= 0) {
                    leadings.add(leading - i);
                }
                leadings.add(leading + i);
            }
            myLeadingTrim = new int[leadings.size()];
            for (int i = 0; i < myLeadingTrim.length; i++) {
                myLeadingTrim[i] = leadings.get(i);
            }
        }
    }
    if (myArgsEngine.getBoolean("-o")) {
        myOutputDir = myArgsEngine.getString("-o");
    }
    File out = new File(myOutputDir);
    if (out.exists() && out.isDirectory()) {
        myLogger.warn("Output directory " + myOutputDir + " exsits. " + "We strongly recommend a new location.");
    }
}
use of cz1.gbs.core.Barcode in project polyGembler by c-zhou.
the class TenXSamtools method setParameters.
/**
 * Parses the command line: the first argument selects the task and the remaining
 * arguments are the options of that task.
 *
 * <p>The only task is {@code sort} (matched case-insensitively). It requires
 * {@code -i} (input BAM) and {@code -o} (output BAM), accepts {@code -s} (batch
 * size) and {@code -t} (threads), and takes at most one of {@code -b} (order by the
 * "BX" attribute, records without it last) or {@code -n} (order by read name).
 * Both orderings put {@code null} records last and break ties with
 * {@code compareSAMRecord}.
 *
 * @param args the command line arguments, task name first.
 * @throws IllegalArgumentException if {@code args} is empty, the task is unknown,
 *         a required option is missing, or {@code -b} and {@code -n} are combined.
 */
@Override
public void setParameters(String[] args) {
    if (args.length == 0) {
        printUsage();
        throw new IllegalArgumentException("\n\nPlease use the above arguments/options.\n\n");
    }

    // task selection: anything but "sort" is rejected
    if (!"SORT".equals(args[0].toUpperCase())) {
        printUsage();
        throw new IllegalArgumentException("\n\nPlease use the above arguments/options.\n\n");
    }
    this.task = Task.sort;

    // everything after the task name belongs to the task itself
    String[] taskArgs = new String[args.length - 1];
    System.arraycopy(args, 1, taskArgs, 0, taskArgs.length);

    switch (this.task) {
        case sort:
            if (myArgsEngine == null) {
                myArgsEngine = new ArgsEngine();
                myArgsEngine.add("-i", "--in-bam", true);
                myArgsEngine.add("-b", "--barcode", false);
                myArgsEngine.add("-n", "--name", false);
                myArgsEngine.add("-s", "--batch-size", true);
                myArgsEngine.add("-t", "--threads", true);
                myArgsEngine.add("-o", "--out-bam", true);
                myArgsEngine.parse(taskArgs);
            }

            if (!myArgsEngine.getBoolean("-i")) {
                printUsage();
                throw new IllegalArgumentException("Please specify the input BAM file.");
            }
            this.bam_in = myArgsEngine.getString("-i");

            if (myArgsEngine.getBoolean("-b") && myArgsEngine.getBoolean("-n")) {
                printUsage();
                throw new IllegalArgumentException("Options -b and -n are exculsive!!!");
            }

            if (myArgsEngine.getBoolean("-b")) {
                this.sort_order = Order.barcode;
                comprator = new Comparator<SAMRecord>() {
                    @Override
                    public int compare(SAMRecord left, SAMRecord right) {
                        // null records go last
                        if (left == null) {
                            return right == null ? 0 : 1;
                        }
                        if (right == null) {
                            return -1;
                        }
                        String leftBx = left.getStringAttribute("BX");
                        String rightBx = right.getStringAttribute("BX");
                        // none barcode record to the end
                        if (leftBx == null) {
                            return rightBx == null ? compareSAMRecord(left, right) : 1;
                        }
                        if (rightBx == null) {
                            return -1;
                        }
                        int byBarcode = leftBx.compareTo(rightBx);
                        if (byBarcode != 0) {
                            return byBarcode;
                        }
                        return compareSAMRecord(left, right);
                    }
                };
            }

            if (myArgsEngine.getBoolean("-n")) {
                this.sort_order = Order.queryname;
                comprator = new Comparator<SAMRecord>() {
                    @Override
                    public int compare(SAMRecord left, SAMRecord right) {
                        // null records go last
                        if (left == null) {
                            return right == null ? 0 : 1;
                        }
                        if (right == null) {
                            return -1;
                        }
                        int byName = left.getReadName().compareTo(right.getReadName());
                        if (byName != 0) {
                            return byName;
                        }
                        return compareSAMRecord(left, right);
                    }
                };
            }

            if (myArgsEngine.getBoolean("-s")) {
                this.batch_size = Integer.parseInt(myArgsEngine.getString("-s"));
            }
            if (myArgsEngine.getBoolean("-t")) {
                this.THREADS = Integer.parseInt(myArgsEngine.getString("-t"));
            }

            if (!myArgsEngine.getBoolean("-o")) {
                printUsage();
                throw new IllegalArgumentException("Please specify the output BAM file.");
            }
            this.bam_out = myArgsEngine.getString("-o");
            break;
        default:
            throw new RuntimeException("!!!");
    }
}
Aggregations