Use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
From the class Consensus, method parseContig.
public Map<String, Sequence> parseContig(final String contig_index) {
    final Map<String, Sequence> contig_list = new HashMap<String, Sequence>();
    try {
        BufferedReader ctg_br = new BufferedReader(new FileReader(contig_index));
        String line;
        String[] s;
        int index = 0;
        while ((line = ctg_br.readLine()) != null) {
            s = line.split("\\s+");
            // the second and third fields are colon-delimited key:value pairs holding
            // the sequence name and the sequence length
            contig_list.put(s[1].split(":")[1],
                    new Sequence(index, Integer.parseInt(s[2].split(":")[1])));
            index++;
        }
        ctg_br.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return contig_list;
}
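The colon-delimited second and third fields suggest an index whose records resemble SAM sequence-dictionary lines (for example @SQ SN:<name> LN:<length>); the exact file layout is an assumption here, not confirmed by the snippet. A minimal sketch of the same field extraction on one hypothetical record:

import java.util.HashMap;
import java.util.Map;

public class ContigIndexLineDemo {
    public static void main(String[] args) {
        // hypothetical index record; the real file format is assumed, not confirmed
        String line = "@SQ\tSN:ctg0001\tLN:45231";
        String[] s = line.split("\\s+");
        String name = s[1].split(":")[1];                   // "ctg0001"
        int length = Integer.parseInt(s[2].split(":")[1]);  // 45231
        Map<String, Integer> demo = new HashMap<String, Integer>();
        demo.put(name, length);
        System.out.println(name + " -> " + length);
    }
}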
Use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
From the class Consensus, method parseLink.
public void parseLink(final String[] links_in, final String segment_in, final String link_out) {
    Map<String, Sequence> contig_list = parseContigFromBam();
    List<List<Segment>> segment_list = parseSegment(segment_in);
    Map<Integer, Set<Integer>> contig_seg_map = new HashMap<Integer, Set<Integer>>();
    int chr_index = 0, seg_index, seq_index;
    // seg_key packs the pseudo-molecule index into the upper 8 bits
    // and the segment index into the lower 24 bits
    int seg_key;
    // List<String> pseudo_mol_sn = new ArrayList<String>();
    List<double[]> pseudo_mol_sf = new ArrayList<double[]>();
    // pseudo_mol_sn.add("");
    for (List<Segment> seg_list : segment_list) {
        // pseudo_mol_sn.add("Chr"+(chr_index<10?"0":"")+chr_index);
        seg_index = 0;
        for (Segment seg : seg_list) {
            if (seg.type == MAP_ENUM.CONTIG) {
                seg_key = chr_index;
                seg_key <<= 24;
                seg_key += seg_index;
                seq_index = contig_list.get(seg.seq_sn).seq_no();
                if (!contig_seg_map.containsKey(seq_index))
                    contig_seg_map.put(seq_index, new HashSet<Integer>());
                contig_seg_map.get(seq_index).add(seg_key);
            }
            seg_index++;
        }
        pseudo_mol_sf.add(new double[seg_index - 1]);
        chr_index++;
    }
    myLogger.info("Segments: " + segment_list.size());
    myLogger.info("SegMap: " + contig_seg_map.size());
    // for(Map.Entry<Integer, Integer> entry : contig_seg_map.entrySet())
    //     System.err.println(entry.getKey()+" "+entry.getValue());
    Map<Long, Integer> links = new HashMap<Long, Integer>();
    myLogger.info("Insert size threshold: " + strcat(this.ins_thres, ","));
    myLogger.info("Link weights: " + strcat(link_w, ","));
    long key;
    int val;
    try {
        for (int i = 0; i < links_in.length; i++) {
            links.clear();
            System.gc();
            System.runFinalization();
            String link_i = links_in[i];
            BufferedReader link_br = new BufferedReader(new FileReader(link_i));
            String line;
            String[] s;
            Set<Integer> C1, C2;
            int z1, z2;
            while ((line = link_br.readLine()) != null) {
                s = line.split("\\s+");
                val = Integer.parseInt(s[2]);
                z1 = Integer.parseInt(s[0]);
                z2 = Integer.parseInt(s[1]);
                if (!contig_seg_map.containsKey(z1) || !contig_seg_map.containsKey(z2))
                    continue;
                C1 = contig_seg_map.get(z1);
                C2 = contig_seg_map.get(z2);
                for (Integer c1 : C1) {
                    for (Integer c2 : C2) {
                        // pack the two 32-bit segment keys into one 64-bit key,
                        // smaller key in the upper half so the pair is order-independent
                        if (c1 < c2) {
                            key = c1;
                            key <<= 32;
                            key += c2;
                        } else {
                            key = c2;
                            key <<= 32;
                            key += c1;
                        }
                        if (links.containsKey(key)) {
                            link_br.close();
                            throw new RuntimeException("!!!");
                            // links.put(key, val+links.get(key));
                        } else {
                            links.put(key, val);
                        }
                    }
                }
            }
            link_br.close();
            double insz_ub = ins_thres[i];
            double w = link_w[i];
            List<Segment> seg_list;
            double[] mol_sf;
            BufferedWriter link_bw = new BufferedWriter(
                    new FileWriter(link_out + ".link" + String.format("%04d", i + 1)));
            int n1, n2, ins;
            Segment seg;
            outerloop:
            for (Map.Entry<Long, Integer> entry : links.entrySet()) {
                // unpack the 64-bit key: [z1:8][n1:24][z2:8][n2:24]
                key = entry.getKey();
                n2 = (int) (key & mask_24bits);
                key >>= 24;
                z2 = (int) (key & mask_08bits);
                key >>= 8;
                n1 = (int) (key & mask_24bits);
                key >>= 24;
                z1 = (int) key;
                val = entry.getValue();
                link_bw.write(n1 + " " + n2 + " " + z1 + " " + z2 + " " + val + "\n");
                // if(c1!=c2 || val<link_thres) continue;
                if (z1 != z2)
                    continue;
                // System.out.println(n1+" "+n2+" "+c1+" "+c2+" "+val+"\n");
                seg_list = segment_list.get(z1);
                mol_sf = pseudo_mol_sf.get(z1);
                ins = 0;
                if (n1 > n2)
                    throw new RuntimeException("!!!");
                // sum the lengths of the non-gap segments between the two endpoints;
                // skip this link if it spans more than the insert-size upper bound
                for (int j = n1 + 1; j < n2; j++) {
                    seg = seg_list.get(j);
                    if (seg.type == MAP_ENUM.GAP)
                        continue;
                    ins += seg.seq_ln;
                    if (ins > insz_ub)
                        continue outerloop;
                }
                for (int j = n1; j < n2; j++)
                    mol_sf[j] += w * val;
            }
            link_bw.close();
            BufferedWriter map_bw = new BufferedWriter(
                    new FileWriter(link_out + ".map" + String.format("%04d", i + 1)));
            int scaff_i = 0;
            for (int j = 0; j != pseudo_mol_sf.size(); j++) {
                mol_sf = pseudo_mol_sf.get(j);
                seg_list = segment_list.get(j);
                if (mol_sf.length != seg_list.size() - 1) {
                    map_bw.close();
                    throw new RuntimeException("!!!");
                }
                // a new scaffold starts wherever the accumulated link support
                // between two consecutive segments falls below 1.0
                String scaff = "Scaffold" + String.format("%08d", ++scaff_i);
                seg = seg_list.get(0);
                map_bw.write(seg.seq_sn + "\t" + seg.seq_ln + "\t" + seg.seq_start + "\t" + seg.seq_end + "\t"
                        + (seg.seq_rev ? "-" : "+") + "\t" + scaff + "\t" + seg.mol_start + "\t" + seg.mol_end + "\t"
                        + df2.format(-1) + "\n");
                int n = seg_list.size();
                for (int k = 1; k != n; k++) {
                    seg = seg_list.get(k);
                    if (mol_sf[k - 1] < 1.0)
                        scaff = "Scaffold" + String.format("%08d", ++scaff_i);
                    map_bw.write(seg.seq_sn + "\t" + seg.seq_ln + "\t" + seg.seq_start + "\t" + seg.seq_end + "\t"
                            + (seg.seq_rev ? "-" : "+") + "\t" + scaff + "\t" + seg.mol_start + "\t" + seg.mol_end + "\t"
                            + df2.format(mol_sf[k - 1]) + "\n");
                }
            }
            map_bw.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
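The link keys above combine two 32-bit segment identifiers, each an 8-bit pseudo-molecule index over a 24-bit segment index, into a single 64-bit map key. The sketch below reproduces that packing and the matching unpacking order; the mask constants are assumptions standing in for the class fields mask_08bits and mask_24bits:

public class LinkKeyDemo {
    static final long MASK_08BITS = 0xFFL;      // assumed value of mask_08bits
    static final long MASK_24BITS = 0xFFFFFFL;  // assumed value of mask_24bits

    // pack an 8-bit pseudo-molecule index and a 24-bit segment index into 32 bits
    static int segKey(int chrIndex, int segIndex) {
        return (chrIndex << 24) + segIndex;
    }

    public static void main(String[] args) {
        int c1 = segKey(3, 1021);  // pseudo-molecule 3, segment 1021
        int c2 = segKey(3, 1789);  // pseudo-molecule 3, segment 1789

        // smaller 32-bit key goes into the upper half so the pair is order-independent
        long key = (c1 < c2) ? (((long) c1 << 32) + c2) : (((long) c2 << 32) + c1);

        // unpack in the same order as parseLink
        long k = key;
        int n2 = (int) (k & MASK_24BITS); k >>= 24;
        int z2 = (int) (k & MASK_08BITS); k >>= 8;
        int n1 = (int) (k & MASK_24BITS); k >>= 24;
        int z1 = (int) k;

        System.out.println(z1 + ":" + n1 + " <-> " + z2 + ":" + n2); // 3:1021 <-> 3:1789
    }
}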
Use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
From the class Redundas, method run.
@Override
public void run() {
    Map<String, Sequence> sequence_map = Sequence.parseFastaFileAsMap(query_file);
    String line;
    String[] s;
    Set<String> seq_rm = new HashSet<String>();
    try {
        BufferedReader br_blast6 = new BufferedReader(new FileReader(blast_out));
        Set<Blast6Segment> buffer_b6 = new HashSet<Blast6Segment>();
        line = br_blast6.readLine();
        String qseqid;
        while (line != null) {
            // collect all BLAST tabular (outfmt 6) records for the same query sequence
            s = line.split("\\s+");
            qseqid = s[0];
            buffer_b6.clear();
            buffer_b6.add(Blast6Segment.blast6Segment(line));
            while ((line = br_blast6.readLine()) != null && line.startsWith(qseqid))
                buffer_b6.add(Blast6Segment.blast6Segment(line));
            int sz = sequence_map.get(qseqid).seq_ln();
            // union of the query intervals covered by qualifying alignments
            RangeSet<Integer> range_covered = TreeRangeSet.create();
            for (Blast6Segment record : buffer_b6) {
                if (!seq_rm.contains(record.sseqid())
                        && !record.qseqid().equals(record.sseqid())
                        && record.pident() >= min_ident
                        && record.length() >= min_match
                        && (record.qstart() <= max_overhang || record.qend() > sz - max_overhang))
                    range_covered.add(Range.closedOpen(record.qstart(), record.qend())
                            .canonical(DiscreteDomain.integers()));
            }
            int unique_cvg = 0;
            for (Range<Integer> range : range_covered.asRanges())
                unique_cvg += range.upperEndpoint() - range.lowerEndpoint();
            // a query is redundant if a large enough fraction of it is covered by other sequences
            if ((double) unique_cvg / sz >= min_frac) {
                seq_rm.add(qseqid);
                myLogger.info("Redundant sequence " + qseqid + "\t" + sz + "\t" + unique_cvg + "\t"
                        + (double) unique_cvg / sz);
            }
        }
        br_blast6.close();
        BufferedWriter bw_unique = Utils.getBufferedWriter(this.out_prefix + ".fa");
        BufferedWriter bw_redundas = Utils.getBufferedWriter(this.out_prefix + "_redundas.fa");
        for (Map.Entry<String, Sequence> entry : sequence_map.entrySet()) {
            if (seq_rm.contains(entry.getKey()))
                bw_redundas.write(entry.getValue().formatOutput());
            else
                bw_unique.write(entry.getValue().formatOutput());
        }
        bw_unique.close();
        bw_redundas.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
}
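The redundancy test merges overlapping query intervals with Guava's TreeRangeSet before measuring how much of the query is covered. A self-contained sketch of that interval-union step, using hypothetical alignment coordinates:

import com.google.common.collect.DiscreteDomain;
import com.google.common.collect.Range;
import com.google.common.collect.RangeSet;
import com.google.common.collect.TreeRangeSet;

public class CoverageUnionDemo {
    public static void main(String[] args) {
        // hypothetical query intervals, treated as half-open [start, end) as in Redundas.run
        int[][] hits = { {1, 500}, {300, 900}, {1500, 2000} };
        RangeSet<Integer> covered = TreeRangeSet.create();
        for (int[] h : hits)
            covered.add(Range.closedOpen(h[0], h[1]).canonical(DiscreteDomain.integers()));
        int uniqueCvg = 0;
        for (Range<Integer> r : covered.asRanges())
            uniqueCvg += r.upperEndpoint() - r.lowerEndpoint();
        // [1,500) and [300,900) overlap and merge to [1,900): 899 + 500 = 1399
        System.out.println("unique coverage: " + uniqueCvg);
    }
}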
Use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.
From the class ParseBarcodeRead, method findBestBarcode.
/**
 * Returns the best barcode match for a given sequence.
 * @param queryS query sequence to be tested against all barcodes
 * @param maxDivergence maximum divergence to permit
 * @return best barcode match (null if no good match)
 */
Barcode findBestBarcode(String queryS, int maxDivergence) {
    long query = BaseEncoder.getLongFromSeq(queryS.substring(0, chunkSize));
    // note because the barcodes are polyA after the sequence, they should always
    // sort ahead of the hit, this is the reason for the -(closestHit+2)
    int closestHit = Arrays.binarySearch(quickBarcodeList, query);
    // Below is the old pipeline approach, which works (at least for maxDivergence of 0)
    if (closestHit < -1) {
        // should always be true, as the barcode+overhang is padded to 32 bases with polyA
        int index = quickMap.get(quickBarcodeList[-(closestHit + 2)]);
        if (theBarcodes[index].compareSequence(query, 1) == 0) {
            return theBarcodes[index];
        } else if (maxDivergence == 0) {
            // return null if not a perfect match
            return null;
        }
    } else {
        // should never go to this line
        return null;
    }
    // fall back to an exhaustive scan, keeping the least-divergent barcode
    // and breaking ties by barcode+overhang length
    int maxLength = 0, minDiv = maxDivergence + 1;
    Barcode bestBC = null;
    for (Barcode bc : theBarcodes) {
        int div = bc.compareSequence(query, maxDivergence + 1);
        if (div <= minDiv) {
            if ((div < minDiv) || (bc.getBarOverLength() > maxLength)) {
                minDiv = div;
                maxLength = bc.getBarOverLength();
                bestBC = bc;
            } else {
                // it is a tie, so return that not resolvable
                bestBC = null;
            }
        }
    }
    return bestBC;
}
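The fast path relies on the barcode+overhang keys being padded with polyA to 32 bases, so each key sorts immediately ahead of any read key that starts with it; the negative insertion point returned by Arrays.binarySearch then recovers the barcode via -(closestHit + 2). The sketch below illustrates that trick with a hypothetical 2-bit encoder (A=0, C=1, G=2, T=3); it stands in for BaseEncoder.getLongFromSeq, whose exact encoding is assumed here:

import java.util.Arrays;

public class BarcodeLookupDemo {
    // hypothetical 2-bit encoder, padding with A (=0) to 32 bases;
    // the real pipeline uses BaseEncoder.getLongFromSeq
    static long encode(String seq) {
        long v = 0L;
        for (int i = 0; i < 32; i++) {
            char c = i < seq.length() ? seq.charAt(i) : 'A';
            v <<= 2;
            v += "ACGT".indexOf(c);
        }
        return v;
    }

    public static void main(String[] args) {
        // hypothetical barcode+overhang keys, sorted as quickBarcodeList would be
        long ccga = encode("CCGA");
        long[] quickBarcodeList = { encode("AACT"), ccga, encode("GGTA") };
        Arrays.sort(quickBarcodeList);

        // a read that begins with the barcode CCGA; the polyA padding makes the
        // barcode key sort ahead of the read key
        long query = encode("CCGATTGCAGTTCCAAGGTTACAGCTTGACCA");
        int closestHit = Arrays.binarySearch(quickBarcodeList, query);
        if (closestHit < -1) {
            long matched = quickBarcodeList[-(closestHit + 2)];
            System.out.println(matched == ccga); // true: the CCGA barcode is recovered
        }
    }
}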