Use of com.google.common.collect.TreeRangeSet in project polyGembler by c-zhou.
In class Anchor, method run():
@Override
public void run() {
// TODO Auto-generated method stub
// read assembly graph file
final GFA gfa = new GFA(query_file, asm_graph);
qry_seqs = gfa.getSequenceMap();
sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
myLogger.info(" GFA vertices: " + gfa.vertexSet().size());
myLogger.info(" GFA edges : " + gfa.edgeSet().size());
// myLogger.info(" GFA edges --- ");
// for(OverlapEdge olap : gfa.edgeSet())
// myLogger.info(olap.olapInfo().toString());
// find 'N'/'n' positions in the subject/reference sequences,
// which could affect parsing of the blast records
sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
String seq_sn = entry.getKey();
String seq_str = entry.getValue().seq_str();
final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
for (int j = 0; j < seq_str.length(); j++) {
if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
// blast records use 1-based closed coordinates
tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
}
int seq_ln = seq_str.length();
final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
for (Range<Integer> range : tmp_rangeSet.asRanges()) {
int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
}
sub_gaps.put(seq_sn, range_set);
}
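// note on the idiom above: Range.closed(p, p).canonical(DiscreteDomain.integers())
// stores each position in closed-open form, so adjacent 'N' positions coalesce
// automatically, e.g. positions 5,6,7 become the single range [5..8)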
// read alignment file and place the query sequences
final Map<String, Set<SAMSegment>> initPlace = new HashMap<String, Set<SAMSegment>>();
final Map<String, List<SAMSegment>> initPseudoAssembly = new HashMap<String, List<SAMSegment>>();
for (String sub_seq : sub_seqs.keySet()) initPseudoAssembly.put(sub_seq, new ArrayList<SAMSegment>());
try {
final SamReaderFactory factory = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS).validationStringency(ValidationStringency.SILENT);
final SamReader in1 = factory.open(new File(align_file));
final SAMRecordIterator iter1 = in1.iterator();
String qry;
int qry_ln;
double min_aln;
final List<SAMSegment> buff = new ArrayList<SAMSegment>();
// guard each advance with hasNext(): SAMRecordIterator follows standard
// Iterator semantics and throws NoSuchElementException when exhausted
SAMRecord rc = iter1.hasNext() ? iter1.next() : null;
while (rc != null) {
qry = rc.getReadName();
qry_ln = qry_seqs.get(qry).seq_ln();
buff.clear();
if (!rc.getReadUnmappedFlag())
buff.add(SAMSegment.samRecord(rc, true, qry_ln));
while ((rc = iter1.hasNext() ? iter1.next() : null) != null && rc.getReadName().equals(qry)) {
buff.add(SAMSegment.samRecord(rc, true, qry_ln));
}
if (buff.isEmpty())
continue;
min_aln = 0.9 * buff.get(0).qlength();
// keep alignment fragments that have qual>0 or span at least min_aln (90% of the first record's aligned length)
Set<SAMSegment> init_f = new HashSet<SAMSegment>();
Set<SAMSegment> init_r = new HashSet<SAMSegment>();
for (SAMSegment record : buff) {
if (record.qual() == 0 && record.qlength() < min_aln)
continue;
if (record.qseqid().equals(qry))
init_f.add(record);
else
init_r.add(record);
initPseudoAssembly.get(record.sseqid()).add(record);
}
if (!init_f.isEmpty())
initPlace.put(qry, init_f);
if (!init_r.isEmpty())
initPlace.put(qry + "'", init_r);
}
iter1.close();
in1.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
// Collections.sort(initPseudoAssembly.get("1_pilon"), new AlignmentSegment.SubjectCoordinationComparator());
// if(debug) {
// for(SAMSegment record : initPseudoAssembly.get("1_pilon")) {
// System.out.println(record.qseqid()+":"+record.sstart()+"-"+record.send());
// }
// }
final Set<SAMSegment> contained = new HashSet<SAMSegment>();
final Set<SAMSegment> placed = new HashSet<SAMSegment>();
final int flank_size = 10000;
int distance;
for (String sub_seq : sub_seqs.keySet()) {
// sub_seq = "Chr10";
if (sub_seq.equals("Chr00"))
continue;
myLogger.info(">>>>>>>>>>>>>" + sub_seq + "<<<<<<<<<<<<<<<<");
final List<SAMSegment> seq_by_sub = initPseudoAssembly.get(sub_seq);
Collections.sort(seq_by_sub, new AlignmentSegment.SubjectCoordinationComparator());
placed.clear();
int nSeq = seq_by_sub.size();
double edge_penalty, edge_score;
SAMSegment root_seq, source_seq, target_seq;
Set<SAMSegment> target_seqs;
Set<OverlapEdge> outgoing;
TraceableEdge edge;
String root_seqid, source_seqid, target_seqid;
TraceableVertex<String> root_vertex, source_vertex, target_vertex;
Deque<SAMSegment> deque = new ArrayDeque<SAMSegment>();
final List<TraceableVertex<String>> traceable = new ArrayList<TraceableVertex<String>>();
for (int i = 0; i < nSeq; i++) {
root_seq = seq_by_sub.get(i);
root_seqid = root_seq.qseqid();
if (placed.contains(root_seq))
continue;
final TraceableDirectedWeightedPseudograph<String> razor = new TraceableDirectedWeightedPseudograph<String>(TraceableEdge.class);
// final ListenableDirectedWeightedGraph<TraceableVertex<String>, DefaultWeightedEdge> razor =
// new ListenableDirectedWeightedGraph<TraceableVertex<String>, DefaultWeightedEdge>(DefaultWeightedEdge.class);
// JGraphModelAdapter<TraceableVertex<String>, DefaultWeightedEdge> jgAdapter =
// new JGraphModelAdapter<TraceableVertex<String>, DefaultWeightedEdge>(razor);
// JGraph jgraph = new JGraph(jgAdapter);
deque.clear();
deque.push(root_seq);
contained.clear();
while (!deque.isEmpty()) {
source_seq = deque.pop();
source_seqid = source_seq.qseqid();
if (contained.contains(source_seq))
continue;
contained.add(source_seq);
source_vertex = new TraceableVertex<String>(source_seqid);
source_vertex.setSAMSegment(source_seq);
if (!razor.containsVertex(source_vertex))
razor.addVertex(source_vertex);
outgoing = gfa.outgoingEdgesOf(source_seqid);
for (OverlapEdge out : outgoing) {
target_seqid = gfa.getEdgeTarget(out);
if (!initPlace.containsKey(target_seqid))
continue;
target_seqs = initPlace.get(target_seqid);
distance = Integer.MAX_VALUE;
target_seq = null;
for (SAMSegment seq : target_seqs) {
int d = AlignmentSegment.sdistance(source_seq, seq);
if (d < distance) {
distance = d;
target_seq = seq;
}
}
if (distance <= flank_size) {
target_vertex = new TraceableVertex<String>(target_seqid);
target_vertex.setSAMSegment(target_seq);
if (!razor.containsVertex(target_vertex))
razor.addVertex(target_vertex);
if (razor.containsEdge(source_vertex, target_vertex))
continue;
edge = razor.addEdge(source_vertex, target_vertex);
// calculate edge weight; prefer edges with
// 1. long alignment-segment vertices (edge_score: summed contig lengths minus their assembly-graph overlap)
// 2. small gaps on the reference (edge_penalty: distance between the segments in subject coordinates)
edge_penalty = AlignmentSegment.sdistance(source_seq, target_seq);
edge.setPenalty(edge_penalty);
edge_score = qry_seqs.get(source_seqid).seq_ln() + qry_seqs.get(target_seqid).seq_ln() - gfa.getEdge(source_seqid, target_seqid).olap();
edge.setScore(edge_score);
deque.push(target_seq);
}
}
}
if (ddebug)
myLogger.info(root_seqid + " " + razor.vertexSet().size() + " " + razor.edgeSet().size() + " done");
// JFrame frame = new JFrame();
// frame.getContentPane().add(jgraph);
// frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
// frame.pack();
// frame.setVisible(true);
// "pseudo"-DFS to find the route with the highest score
final Map<String, TraceableVertex<String>> razv_map = new HashMap<String, TraceableVertex<String>>();
for (TraceableVertex<String> v : razor.vertexSet()) razv_map.put(v.getId(), v);
// a bidirectional map stands in for the deque
// because entries may need to be deleted by key or by value
// Deque<TraceableVertex<String>> queue = new ArrayDeque<TraceableVertex<String>>();
final TreeBidiMap<Long, TraceableVertex<String>> bidiQ = new TreeBidiMap<Long, TraceableVertex<String>>();
root_vertex = razv_map.get(root_seqid);
root_vertex.setSAMSegment(root_seq);
root_vertex.setScore(qry_seqs.get(root_seqid).seq_ln());
root_vertex.setPenalty(0);
root_vertex.setStatus(true);
bidiQ.put(0L, root_vertex);
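// bidiQ acts as a LIFO work queue keyed by a growing counter: lastKey()
// pops the most recently queued vertex, while the value->key index lets
// an already-queued vertex be located and replaced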
double max_ws = Double.NEGATIVE_INFINITY, source_penalty, target_penalty, source_score, target_score, penalty, score, target_ws, source_ws, ws;
int source_ln;
Set<TraceableEdge> out_edges;
TraceableVertex<String> opt_vertex = null;
long sizeQ;
boolean isLeaf;
if (ddebug)
for (TraceableEdge e : razor.edgeSet()) myLogger.info(e.toString() + "(" + razor.getEdgeSource(e).getSAMSegment().toString() + "|" + razor.getEdgeTarget(e).getSAMSegment().toString() + "|" + e.getScore() + "-" + e.getPenalty() + ")");
while (!bidiQ.isEmpty()) {
sizeQ = bidiQ.lastKey();
source_vertex = bidiQ.get(sizeQ);
bidiQ.remove(sizeQ);
source_ln = qry_seqs.get(source_vertex.getId()).seq_ln();
source_score = source_vertex.getScore() - source_ln;
source_penalty = source_vertex.getPenalty();
source_ws = source_score - source_penalty;
isLeaf = true;
out_edges = razor.outgoingEdgesOf(source_vertex);
for (TraceableEdge out : out_edges) {
// this is not right because graph edges are immutable?
// target_vertex = razor.getEdgeTarget(out);
target_vertex = razv_map.get(razor.getEdgeTarget(out).getId());
target_score = target_vertex.getScore();
target_penalty = target_vertex.getPenalty();
target_ws = target_score - target_penalty;
edge_penalty = out.getPenalty();
penalty = source_penalty + edge_penalty;
edge_score = out.getScore();
score = source_score + edge_score;
ws = score - penalty;
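// skip this edge if its gap on the reference exceeds flank_size, or if the
// target was already visited and either shows no improvement in weighted
// score or would close a loop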
if (edge_penalty > flank_size || target_vertex.getStatus() && (ws <= target_ws || isLoopback(razor, source_vertex, target_vertex)))
continue;
isLeaf = false;
target_vertex.setBackTrace(source_vertex);
target_vertex.setScore(score);
target_vertex.setPenalty(penalty);
target_vertex.setStatus(true);
bidiQ.put(sizeQ++, target_vertex);
}
if (isLeaf && source_ws > max_ws) {
penalty = source_vertex.getPenalty();
score = source_vertex.getScore();
max_ws = source_ws;
opt_vertex = source_vertex;
if (ddebug) {
String trace = opt_vertex.toString() + ":" + opt_vertex.getSAMSegment().sstart() + "-" + opt_vertex.getSAMSegment().send() + "(" + opt_vertex.getScore() + "-" + opt_vertex.getPenalty() + ")";
TraceableVertex<String> optx = opt_vertex;
while ((optx = optx.getBackTrace()) != null) {
trace += "," + optx.toString() + ":" + optx.getSAMSegment().sstart() + "-" + optx.getSAMSegment().send() + "(" + optx.getScore() + "-" + optx.getPenalty() + ")";
}
myLogger.info("trace back [" + score + ", " + penalty + "]: " + trace);
}
}
}
traceable.add(opt_vertex);
Set<TraceableVertex<String>> optx = new HashSet<TraceableVertex<String>>();
optx.add(opt_vertex);
while ((opt_vertex = opt_vertex.getBackTrace()) != null) optx.add(opt_vertex);
for (TraceableVertex<String> v : optx) placed.add(v.getSAMSegment());
}
// sort traceable by size
Collections.sort(traceable, new Comparator<TraceableVertex<String>>() {
@Override
public int compare(TraceableVertex<String> t0, TraceableVertex<String> t1) {
// TODO Auto-generated method stub
return Double.compare(t1.getScore(), t0.getScore());
}
});
if (debug) {
for (TraceableVertex<String> opt_vertex : traceable) {
double score = opt_vertex.getScore();
double penalty = opt_vertex.getPenalty();
String trace = opt_vertex.toString() + ":" + opt_vertex.getSAMSegment().sstart() + "-" + opt_vertex.getSAMSegment().send() + "(" + opt_vertex.getScore() + "-" + opt_vertex.getPenalty() + ")";
while ((opt_vertex = opt_vertex.getBackTrace()) != null) {
trace += "," + opt_vertex.toString() + ":" + opt_vertex.getSAMSegment().sstart() + "-" + opt_vertex.getSAMSegment().send() + "(" + opt_vertex.getScore() + "-" + opt_vertex.getPenalty() + ")";
}
myLogger.info("trace back [" + score + ", " + penalty + "]: " + trace);
}
}
// we generate a compound alignment record for each traceable
for (TraceableVertex<String> opt_vertex : traceable) {
}
}
}
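A minimal, self-contained sketch of the 'N'-gap masking idiom shared by the run() methods in this listing (Guava assumed; the class name, toy sequence, and buffer value are illustrative):
import com.google.common.collect.DiscreteDomain;
import com.google.common.collect.Range;
import com.google.common.collect.TreeRangeSet;

public class GapMaskSketch {
    public static void main(String[] args) {
        String seq = "ACGTNNNACGT";  // toy reference with one run of 'N's
        int gapBuff = 2;             // padding added around each gap
        // collect 1-based positions of 'N'/'n'; canonical closed-open form
        // lets adjacent single-position ranges coalesce automatically
        TreeRangeSet<Integer> gaps = TreeRangeSet.create();
        for (int j = 0; j < seq.length(); j++) {
            char c = seq.charAt(j);
            if (c == 'N' || c == 'n')
                gaps.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        // widen every gap by gapBuff on both sides, clamped to the sequence;
        // upperEndpoint() is exclusive here, hence the -1, as in the original
        TreeRangeSet<Integer> buffered = TreeRangeSet.create();
        for (Range<Integer> r : gaps.asRanges()) {
            int lo = Math.max(0, r.lowerEndpoint() - gapBuff);
            int hi = Math.min(seq.length(), r.upperEndpoint() + gapBuff - 1);
            buffered.add(Range.closed(lo, hi).canonical(DiscreteDomain.integers()));
        }
        System.out.println(buffered.asRanges());   // prints [[3..10)]
        System.out.println(buffered.contains(4));  // true: inside the buffered gap
    }
}
The resulting set supports the contains() and rangeContaining() queries used later when deciding whether two blast segments were split by the same reference gap.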
Use of com.google.common.collect.TreeRangeSet in project polyGembler by c-zhou.
In class Anchor, method run():
// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
// TODO Auto-generated method stub
long tic = System.nanoTime();
sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
// this.makeAssemblyGraph(query_file);
this.makeAssemblyGraph(query_file);
bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
// dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
// find 'N'/'n' positions in the subject/reference sequences,
// which could affect parsing of the blast records
final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
String seq_sn = entry.getKey();
String seq_str = entry.getValue().seq_str();
final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
for (int j = 0; j < seq_str.length(); j++) {
if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
// blast records use 1-based closed coordinates
tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
}
int seq_ln = seq_str.length();
final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
for (Range<Integer> range : tmp_rangeSet.asRanges()) {
int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
}
sub_gaps.put(seq_sn, range_set);
// initialise an array list for each reference chromosome
anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
}
// parse blast records
// blast records buffer
final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
// selected blast records
final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
// temp list
final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
try {
BufferedReader br_blast = Utils.getBufferedReader(blast_out);
Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
Blast6Segment primary_record, secondary_record;
String qry;
double qry_ln, aln_frac;
while (tmp_record != null) {
qry = tmp_record.qseqid();
qry_ln = qry_seqs.get(qry).seq_ln();
buff.clear();
buff.add(tmp_record);
while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
// filter by identity
if (tmp_record.pident() >= this.min_ident)
buff.add(tmp_record);
}
sel_recs.clear();
// merge collinear records
for (String sub : sub_seqs.keySet()) {
// get all records for subject/reference sequence sub_seq
tmp_records.clear();
for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
tmp_records.add(record);
if (tmp_records.isEmpty())
continue;
// find alignment segments that can be deleted
// those that are subsets of larger alignment segments
// (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
final Set<int[][]> ranges = new HashSet<int[][]>();
outerloop: for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
int[][] range = new int[2][2];
if (primary_record.sstart() < primary_record.send()) {
range[0][0] = primary_record.sstart();
range[0][1] = primary_record.send();
} else {
range[0][0] = primary_record.send();
range[0][1] = primary_record.sstart();
}
if (primary_record.qstart() < primary_record.qend()) {
range[1][0] = primary_record.qstart();
range[1][1] = primary_record.qend();
} else {
range[1][0] = primary_record.qend();
range[1][1] = primary_record.qstart();
}
for (int[][] r : ranges) {
if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
tmp_records.remove(i--);
continue outerloop;
}
}
ranges.add(range);
}
// TODO rewrite this part
/**
* // find collinear alignment segments that can be merged
* Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
*
* collinear_merged.clear();
* final List<Blast6Record> temp = new ArrayList<Blast6Record>();
* for(int i=0; i<tmp_records.size(); ) {
* Blast6Record record = tmp_records.get(i);
* double max_shift;
* temp.clear();
* temp.add(record);
*
* // find collinear alignment segments
* outerloop:
* while( (++i)<tmp_records.size() ) {
* record = tmp_records.get(i);
* // check if is collinear with other alignment segments
* for(Blast6Record r : temp) {
* max_shift = collinear_shift*
* Math.min(r.length(), record.length());
* if(BlastRecord.sdistance(r, record)<=max_shift &&
* BlastRecord.qdistance(r, record)<=max_shift &&
* BlastRecord.pdistance(r, record)<=max_shift) {
* temp.add(record);
* continue outerloop;
* }
* }
* break;
* }
*
* // merge collinear alignment segments
* int qstart = Integer.MAX_VALUE;
* int qend = Integer.MIN_VALUE;
* int sstart = Integer.MAX_VALUE;
* int send = Integer.MIN_VALUE;
* double pident = 0;
* int length = 0;
*
* // qstart is always smaller than qend
* for(int j=0; j<temp.size(); j++) {
* record = temp.get(j);
* if(record.qstart()<qstart) {
* qstart = record.qstart();
* sstart = record.sstart();
* }
* if(record.qend()>qend) {
* qend = record.qend();
* send = record.send();
* }
* if(record.pident()>pident)
* pident = record.pident();
* length += record.length();
* }
*
* collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
* }
*/
// find collinear alignment segments that can be merged
// more accurate but slower
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
Blast6Segment record;
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
for (int j = i + 1; j < tmp_records.size(); j++) {
secondary_record = tmp_records.get(j);
double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
tmp_records.set(i, record);
tmp_records.remove(j);
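// step back one position so the merged record is re-compared against later records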
--i;
break;
}
}
}
// process blast records that are clipped by gaps
// (sstart, send)---(start2, send2)
// (sstart ... --- ... send2)
TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
if (sub_gap.contains(primary_record.true_send())) {
secondary_record = null;
int sec_j = -1;
for (int j = i + 1; j < tmp_records.size(); j++) {
if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
secondary_record = tmp_records.get(j);
sec_j = j;
break;
}
}
if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
// reverse alignment segments
continue;
}
if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
// clipping
// merge two alignment segments
double pident = Math.max(primary_record.pident(), secondary_record.pident());
int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
int send = Math.max(primary_record.true_send(), secondary_record.true_send());
int length = qend - qstart + 1;
// replace primary record with merged record
// delete secondary record
Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
tmp_records.set(i, merged_record);
tmp_records.remove(sec_j);
// the merged records need to be processed
--i;
}
}
}
// add to sel_recs
sel_recs.addAll(tmp_records);
}
// filter by alignment fraction
buff.clear();
buff.addAll(sel_recs);
sel_recs.clear();
for (Blast6Segment record : buff) {
if (record.length() / qry_ln >= this.min_frac)
sel_recs.add(record);
}
if (sel_recs.isEmpty()) {
// continue
continue;
}
// filter blast records
Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
// process primary alignment
primary_record = sel_recs.get(0);
anchored_records.get(primary_record.sseqid()).add(primary_record);
aln_frac = primary_record.length() / qry_ln;
// and process
for (int i = 1; i < sel_recs.size(); i++) {
secondary_record = sel_recs.get(i);
if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
break;
} else {
anchored_records.get(secondary_record.sseqid()).add(secondary_record);
}
}
}
br_blast.close();
for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue().size());
}
final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
final Set<String> anchored_seqs = new HashSet<String>();
final List<String> sub_list = Sequence.parseSeqList(subject_file);
for (String sub_sn : sub_list) {
blast6_records = anchored_records.get(sub_sn);
int nV = blast6_records.size(), count = 0;
// sort blast records
Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
// consensus
int posUpto = 0, send_clip = 0;
int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
// distance to last 'N', start from next position to find longest common suffix-prefix
int prev_n = Integer.MAX_VALUE;
int mol_len = 0;
String qseq;
int nS, nQ;
// first step: construct super scaffold
// will form a tree graph indicating the path through the contigs
// convert contig names to integer indices
// one contig could end up with multiple indices due to repeats
Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
int index = 0;
for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
StringBuilder seq_str = new StringBuilder();
for (int v = 0; v < nV - 1; v++) {
if (++count % 10000 == 0)
myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
Blast6Segment record = blast6_records.get(v);
qlen = qry_seqs.get(record.qseqid()).seq_ln();
sstart = record.sstart();
send = record.send();
qstart = record.qstart();
qend = record.qend();
if (sstart > send) {
// make sure sstart<send
tmp_int = sstart;
sstart = send;
send = tmp_int;
tmp_int = qstart;
qstart = qend;
qend = tmp_int;
}
if (qstart > qend) {
qstart_clip = qlen - qstart;
qend_clip = qend - 1;
qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
} else {
qstart_clip = qstart - 1;
qend_clip = qlen - qend;
qseq = qry_seqs.get(record.qseqid()).seq_str();
}
if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
// TODO process
continue;
}
// find longest suffix-prefix
nS = seq_str.length();
nQ = qseq.length();
int nO = Math.min(prev_n, Math.min(nS, nQ));
outerloop: for (; nO >= min_overlap; nO--) {
int nS_i = nS - nO;
for (int i = 0; i < nO; i++) {
if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
continue outerloop;
}
break outerloop;
}
if (nO < min_overlap) {
// no adequate overlap found; insert a large gap of size max(pseudo_distance, max_gap)
if (posUpto > 0) {
if (sstart <= posUpto) {
// if too much overlap, then treat it as a contradiction
if (posUpto - sstart > min_overlap) {
// discard
continue;
} else {
// insert a min_gap
gap_size = min_gap;
}
} else {
// estimate gap size
gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
if (gap_size < max_gap)
gap_size = max_gap;
}
bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
seq_str.append(Sequence.polyN(gap_size));
mol_len += gap_size;
}
bw_map.write(record.qseqid() + "\t" + qlen + "\t");
if (qstart > qend) {
// reverse
bw_map.write("0\t" + qlen + "\t-\t");
} else {
// forward
bw_map.write("0\t" + qlen + "\t+\t");
}
seq_str.append(qseq);
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
mol_len += qlen;
prev_n = qlen;
anchored_seqs.add(record.qseqid());
} else {
// overlap found
// will not insert gap
// ====================
// /-------\
// /----\
// calculate overlaps
// process overlap
qstart = nO;
if (qstart == qlen)
continue;
bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
if (qstart > qend) {
// reverse
bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
} else {
// forward
bw_map.write(qstart + "\t" + qlen + "\t+\t");
}
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
mol_len += qlen - qstart;
prev_n += qlen - qstart;
seq_str.append(qseq.substring(qstart));
anchored_seqs.add(record.qseqid());
}
posUpto = send;
send_clip = qend_clip;
}
if (seq_str.length() > 0)
bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
}
bw_fa.close();
bw_map.close();
final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
bw_ufa.write(qry_seqs.get(seq).formatOutput());
bw_ufa.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
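A standalone sketch of the longest suffix-prefix scan used in the consensus stitching above (class and method names are illustrative; quadratic worst case, as in the original):
public class OverlapSketch {
    // Length of the longest suffix of a that is also a prefix of b,
    // capped at maxO and at least minO (returns 0 if none qualifies).
    static int longestSuffixPrefix(CharSequence a, CharSequence b, int minO, int maxO) {
        int nO = Math.min(maxO, Math.min(a.length(), b.length()));
        outerloop:
        for (; nO >= minO; nO--) {
            int off = a.length() - nO;
            for (int i = 0; i < nO; i++)
                if (a.charAt(off + i) != b.charAt(i))
                    continue outerloop;
            return nO;
        }
        return 0;
    }

    public static void main(String[] args) {
        // the assembled suffix ...ACGT overlaps the next contig's prefix ACGT...
        System.out.println(longestSuffixPrefix("ACGTACGT", "ACGTTTTT", 3, 8)); // 4
    }
}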
Use of com.google.common.collect.TreeRangeSet in project polyGembler by c-zhou.
In class Anchor0, method run():
@Override
public void run() {
// TODO Auto-generated method stub
final Map<String, Sequence> sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
final Map<String, Sequence> qry_seqs = Sequence.parseFastaFileAsMap(query_file);
// find 'N'/'n' positions in the subject/reference sequences,
// which could affect parsing of the blast records
final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
String seq_sn = entry.getKey();
String seq_str = entry.getValue().seq_str();
final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
for (int j = 0; j < seq_str.length(); j++) {
if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
// blast records use 1-based closed coordinates
tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
}
int seq_ln = seq_str.length();
final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
for (Range<Integer> range : tmp_rangeSet.asRanges()) {
int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
}
sub_gaps.put(seq_sn, range_set);
// initialise an array list for each reference chromosome
anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
}
// parse blast records
// blast records buffer
final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
// selected blast records
final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
// temp list
final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
try {
BufferedReader br_blast = Utils.getBufferedReader(blast_out);
Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
Blast6Segment primary_record, secondary_record;
String qry;
double qry_ln, aln_frac;
while (tmp_record != null) {
qry = tmp_record.qseqid();
qry_ln = qry_seqs.get(qry).seq_ln();
buff.clear();
buff.add(tmp_record);
while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
// filter by identity
if (tmp_record.pident() >= this.min_ident)
buff.add(tmp_record);
}
sel_recs.clear();
// merge collinear records
for (String sub : sub_seqs.keySet()) {
// get all records for subject/reference sequence sub_seq
tmp_records.clear();
for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
tmp_records.add(record);
if (tmp_records.isEmpty())
continue;
// find alignment segments that can be deleted
// those that are subsets of larger alignment segments
// (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
final Set<int[][]> ranges = new HashSet<int[][]>();
outerloop: for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
int[][] range = new int[2][2];
if (primary_record.sstart() < primary_record.send()) {
range[0][0] = primary_record.sstart();
range[0][1] = primary_record.send();
} else {
range[0][0] = primary_record.send();
range[0][1] = primary_record.sstart();
}
if (primary_record.qstart() < primary_record.qend()) {
range[1][0] = primary_record.qstart();
range[1][1] = primary_record.qend();
} else {
range[1][0] = primary_record.qend();
range[1][1] = primary_record.qstart();
}
for (int[][] r : ranges) {
if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
tmp_records.remove(i--);
continue outerloop;
}
}
ranges.add(range);
}
// TODO rewrite this part
/**
* // find collinear alignment segments that can be merged
* Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
*
* collinear_merged.clear();
* final List<Blast6Record> temp = new ArrayList<Blast6Record>();
* for(int i=0; i<tmp_records.size(); ) {
* Blast6Record record = tmp_records.get(i);
* double max_shift;
* temp.clear();
* temp.add(record);
*
* // find collinear alignment segments
* outerloop:
* while( (++i)<tmp_records.size() ) {
* record = tmp_records.get(i);
* // check if is collinear with other alignment segments
* for(Blast6Record r : temp) {
* max_shift = collinear_shift*
* Math.min(r.length(), record.length());
* if(BlastRecord.sdistance(r, record)<=max_shift &&
* BlastRecord.qdistance(r, record)<=max_shift &&
* BlastRecord.pdistance(r, record)<=max_shift) {
* temp.add(record);
* continue outerloop;
* }
* }
* break;
* }
*
* // merge collinear alignment segments
* int qstart = Integer.MAX_VALUE;
* int qend = Integer.MIN_VALUE;
* int sstart = Integer.MAX_VALUE;
* int send = Integer.MIN_VALUE;
* double pident = 0;
* int length = 0;
*
* // qstart is always smaller than qend
* for(int j=0; j<temp.size(); j++) {
* record = temp.get(j);
* if(record.qstart()<qstart) {
* qstart = record.qstart();
* sstart = record.sstart();
* }
* if(record.qend()>qend) {
* qend = record.qend();
* send = record.send();
* }
* if(record.pident()>pident)
* pident = record.pident();
* length += record.length();
* }
*
* collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
* }
*/
// find collinear alignment segments that can be merged
// more accurate but slower
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
Blast6Segment record;
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
for (int j = i + 1; j < tmp_records.size(); j++) {
secondary_record = tmp_records.get(j);
double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
tmp_records.set(i, record);
tmp_records.remove(j);
--i;
break;
}
}
}
// process blast records that are clipped by gaps
// (sstart, send)---(start2, send2)
// (sstart ... --- ... send2)
TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
if (sub_gap.contains(primary_record.true_send())) {
secondary_record = null;
int sec_j = -1;
for (int j = i + 1; j < tmp_records.size(); j++) {
if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
secondary_record = tmp_records.get(j);
sec_j = j;
break;
}
}
if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
// reverse alignment segments
continue;
}
if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
// clipping
// merge two alignment segments
double pident = Math.max(primary_record.pident(), secondary_record.pident());
int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
int send = Math.max(primary_record.true_send(), secondary_record.true_send());
int length = qend - qstart + 1;
// replace primary record with merged record
// delete secondary record
Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
tmp_records.set(i, merged_record);
tmp_records.remove(sec_j);
// the merged records need to be processed
--i;
}
}
}
// add to sel_recs
sel_recs.addAll(tmp_records);
}
// filter by alignment fraction
buff.clear();
buff.addAll(sel_recs);
sel_recs.clear();
for (Blast6Segment record : buff) {
if (record.length() / qry_ln >= this.min_frac)
sel_recs.add(record);
}
if (sel_recs.isEmpty()) {
// continue
continue;
}
// filter blast records
Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
// process primary alignment
primary_record = sel_recs.get(0);
anchored_records.get(primary_record.sseqid()).add(primary_record);
aln_frac = primary_record.length() / qry_ln;
// and process
for (int i = 1; i < sel_recs.size(); i++) {
secondary_record = sel_recs.get(i);
if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
break;
} else {
anchored_records.get(secondary_record.sseqid()).add(secondary_record);
}
}
}
br_blast.close();
for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue().size());
}
final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
final Set<String> anchored_seqs = new HashSet<String>();
final List<String> sub_list = Sequence.parseSeqList(subject_file);
for (String sub_sn : sub_list) {
List<Blast6Segment> blast6_records = anchored_records.get(sub_sn);
int total = blast6_records.size(), count = 0;
// sort blast records
Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
// consensus
int posUpto = 0, send_clip = 0;
int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
// distance to last 'N', start from next position to find longest common suffix-prefix
int prev_n = Integer.MAX_VALUE;
int mol_len = 0;
String qseq;
int nS, nQ;
// will form a pseudomolecule
StringBuilder seq_str = new StringBuilder();
for (Blast6Segment record : blast6_records) {
if (count++ % 10000 == 0)
myLogger.info(sub_sn + " " + count + "/" + total + " done.");
qlen = qry_seqs.get(record.qseqid()).seq_ln();
sstart = record.sstart();
send = record.send();
qstart = record.qstart();
qend = record.qend();
if (sstart > send) {
// make sure sstart<send
tmp_int = sstart;
sstart = send;
send = tmp_int;
tmp_int = qstart;
qstart = qend;
qend = tmp_int;
}
if (qstart > qend) {
qstart_clip = qlen - qstart;
qend_clip = qend - 1;
qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
} else {
qstart_clip = qstart - 1;
qend_clip = qlen - qend;
qseq = qry_seqs.get(record.qseqid()).seq_str();
}
if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
// TODO process
continue;
}
// find longest suffix-prefix
nS = seq_str.length();
nQ = qseq.length();
int nO = Math.min(prev_n, Math.min(nS, nQ));
outerloop: for (; nO >= min_overlap; nO--) {
int nS_i = nS - nO;
for (int i = 0; i < nO; i++) {
if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
continue outerloop;
}
break outerloop;
}
if (nO < min_overlap) {
// no adequate overlap found; insert a large gap of size max(pseudo_distance, max_gap)
if (posUpto > 0) {
if (sstart <= posUpto) {
// if too much overlap, then treat it as a contradiction
if (posUpto - sstart > min_overlap) {
// discard
continue;
} else {
// insert a min_gap
gap_size = min_gap;
}
} else {
// estimate gap size
gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
if (gap_size < max_gap)
gap_size = max_gap;
}
bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
seq_str.append(Sequence.polyN(gap_size));
mol_len += gap_size;
}
bw_map.write(record.qseqid() + "\t" + qlen + "\t");
if (qstart > qend) {
// reverse
bw_map.write("0\t" + qlen + "\t-\t");
} else {
// forward
bw_map.write("0\t" + qlen + "\t+\t");
}
seq_str.append(qseq);
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
mol_len += qlen;
prev_n = qlen;
anchored_seqs.add(record.qseqid());
} else {
// overlap found
// will not insert gap
// ====================
// /-------\
// /----\
// calculate overlaps
// process overlap
qstart = nO;
if (qstart == qlen)
continue;
bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
if (qstart > qend) {
// reverse
bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
} else {
// forward
bw_map.write(qstart + "\t" + qlen + "\t+\t");
}
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
mol_len += qlen - qstart;
prev_n += qlen - qstart;
seq_str.append(qseq.substring(qstart));
anchored_seqs.add(record.qseqid());
}
posUpto = send;
send_clip = qend_clip;
}
if (seq_str.length() > 0)
bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
}
bw_fa.close();
bw_map.close();
final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
bw_ufa.write(qry_seqs.get(seq).formatOutput());
bw_ufa.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
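The pairwise merge-and-restart loop above generalises to any interval type; a toy sketch with subject coordinates only (Java 16+ records; the simplified gap test stands in for the project's Blast6Segment.collinear, and all names are illustrative):
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class CollinearMergeSketch {
    // Toy 1-D stand-in for an alignment segment on the subject sequence.
    record Seg(int sstart, int send) { }

    // Merge segments whose subject-coordinate gap is within maxShift,
    // mirroring the merge-and-restart loop in Anchor0.run().
    static List<Seg> mergeCollinear(List<Seg> in, int maxShift) {
        List<Seg> recs = new ArrayList<>(in);
        recs.sort(Comparator.comparingInt(Seg::sstart));
        for (int i = 0; i < recs.size(); i++) {
            for (int j = i + 1; j < recs.size(); j++) {
                Seg a = recs.get(i), b = recs.get(j);
                int gap = Math.max(a.sstart(), b.sstart()) - Math.min(a.send(), b.send());
                if (gap <= maxShift) {
                    recs.set(i, new Seg(Math.min(a.sstart(), b.sstart()),
                                        Math.max(a.send(), b.send())));
                    recs.remove(j);
                    // step back so the merged record is re-examined
                    --i;
                    break;
                }
            }
        }
        return recs;
    }

    public static void main(String[] args) {
        System.out.println(mergeCollinear(
                List.of(new Seg(1, 100), new Seg(105, 200), new Seg(500, 600)), 10));
        // prints [Seg[sstart=1, send=200], Seg[sstart=500, send=600]]
    }
}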
Use of com.google.common.collect.TreeRangeSet in project polyGembler by c-zhou.
In class Anchor1, method run():
// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
// TODO Auto-generated method stub
long tic = System.nanoTime();
sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
// this.makeAssemblyGraph(query_file);
this.makeAssemblyGraph(query_file);
bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
// dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
// find 'N'/'n' positions in the subject/reference sequences,
// which could affect parsing of the blast records
final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
String seq_sn = entry.getKey();
String seq_str = entry.getValue().seq_str();
final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
for (int j = 0; j < seq_str.length(); j++) {
if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
// blast records use 1-based closed coordinates
tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
}
int seq_ln = seq_str.length();
final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
for (Range<Integer> range : tmp_rangeSet.asRanges()) {
int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
}
sub_gaps.put(seq_sn, range_set);
// initialise an array list for each reference chromosome
anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
}
// parse blast records
// blast records buffer
final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
// selected blast records
final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
// temp list
final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
try {
BufferedReader br_blast = Utils.getBufferedReader(blast_out);
Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
Blast6Segment primary_record, secondary_record;
String qry;
double qry_ln, aln_frac;
while (tmp_record != null) {
qry = tmp_record.qseqid();
qry_ln = qry_seqs.get(qry).seq_ln();
buff.clear();
buff.add(tmp_record);
while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
// filter by identity
if (tmp_record.pident() >= this.min_ident)
buff.add(tmp_record);
}
sel_recs.clear();
// merge collinear records
for (String sub : sub_seqs.keySet()) {
// get all records for subject/reference sequence sub_seq
tmp_records.clear();
for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
tmp_records.add(record);
if (tmp_records.isEmpty())
continue;
// find alignment segments that can be deleted
// those that are subsets of larger alignment segments
// (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
final Set<int[][]> ranges = new HashSet<int[][]>();
outerloop: for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
int[][] range = new int[2][2];
if (primary_record.sstart() < primary_record.send()) {
range[0][0] = primary_record.sstart();
range[0][1] = primary_record.send();
} else {
range[0][0] = primary_record.send();
range[0][1] = primary_record.sstart();
}
if (primary_record.qstart() < primary_record.qend()) {
range[1][0] = primary_record.qstart();
range[1][1] = primary_record.qend();
} else {
range[1][0] = primary_record.qend();
range[1][1] = primary_record.qstart();
}
for (int[][] r : ranges) {
if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
tmp_records.remove(i--);
continue outerloop;
}
}
ranges.add(range);
}
// TODO rewrite this part
/**
* // find collinear alignment segments that can be merged
* Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
*
* collinear_merged.clear();
* final List<Blast6Record> temp = new ArrayList<Blast6Record>();
* for(int i=0; i<tmp_records.size(); ) {
* Blast6Record record = tmp_records.get(i);
* double max_shift;
* temp.clear();
* temp.add(record);
*
* // find collinear alignment segments
* outerloop:
* while( (++i)<tmp_records.size() ) {
* record = tmp_records.get(i);
* // check if is collinear with other alignment segments
* for(Blast6Record r : temp) {
* max_shift = collinear_shift*
* Math.min(r.length(), record.length());
* if(BlastRecord.sdistance(r, record)<=max_shift &&
* BlastRecord.qdistance(r, record)<=max_shift &&
* BlastRecord.pdistance(r, record)<=max_shift) {
* temp.add(record);
* continue outerloop;
* }
* }
* break;
* }
*
* // merge collinear alignment segments
* int qstart = Integer.MAX_VALUE;
* int qend = Integer.MIN_VALUE;
* int sstart = Integer.MAX_VALUE;
* int send = Integer.MIN_VALUE;
* double pident = 0;
* int length = 0;
*
* // qstart is always smaller than qend
* for(int j=0; j<temp.size(); j++) {
* record = temp.get(j);
* if(record.qstart()<qstart) {
* qstart = record.qstart();
* sstart = record.sstart();
* }
* if(record.qend()>qend) {
* qend = record.qend();
* send = record.send();
* }
* if(record.pident()>pident)
* pident = record.pident();
* length += record.length();
* }
*
* collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
* }
*/
// find collinear alignment segments that can be merged
// more accurate but slower
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
Blast6Segment record;
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
for (int j = i + 1; j < tmp_records.size(); j++) {
secondary_record = tmp_records.get(j);
double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
tmp_records.set(i, record);
tmp_records.remove(j);
--i;
break;
}
}
}
// process blast records that are clipped by gaps
// (sstart, send)---(start2, send2)
// (sstart ... --- ... send2)
TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
for (int i = 0; i < tmp_records.size(); i++) {
primary_record = tmp_records.get(i);
if (sub_gap.contains(primary_record.true_send())) {
secondary_record = null;
int sec_j = -1;
for (int j = i + 1; j < tmp_records.size(); j++) {
if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
secondary_record = tmp_records.get(j);
sec_j = j;
break;
}
}
if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
// reverse alignment segments
continue;
}
if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
// clipping
// merge two alignment segments
double pident = Math.max(primary_record.pident(), secondary_record.pident());
int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
int send = Math.max(primary_record.true_send(), secondary_record.true_send());
int length = qend - qstart + 1;
// replace primary record with merged record
// delete secondary record
Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
tmp_records.set(i, merged_record);
tmp_records.remove(sec_j);
// the merged records need to be processed
--i;
}
}
}
// add to sel_recs
sel_recs.addAll(tmp_records);
}
// filter by alignment fraction
buff.clear();
buff.addAll(sel_recs);
sel_recs.clear();
for (Blast6Segment record : buff) {
if (record.length() / qry_ln >= this.min_frac)
sel_recs.add(record);
}
if (sel_recs.isEmpty()) {
// continue
continue;
}
// filter blast records
Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
// process primary alignment
primary_record = sel_recs.get(0);
anchored_records.get(primary_record.sseqid()).add(primary_record);
aln_frac = primary_record.length() / qry_ln;
// and process
for (int i = 1; i < sel_recs.size(); i++) {
secondary_record = sel_recs.get(i);
if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
break;
} else {
anchored_records.get(secondary_record.sseqid()).add(secondary_record);
}
}
}
br_blast.close();
for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
System.out.println(entry.getKey() + ": " + entry.getValue().size());
}
final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
final Set<String> anchored_seqs = new HashSet<String>();
final List<String> sub_list = Sequence.parseSeqList(subject_file);
final List<GraphPath<Integer, DefaultWeightedEdge>> paths = new ArrayList<GraphPath<Integer, DefaultWeightedEdge>>();
for (String sub_sn : sub_list) {
blast6_records = anchored_records.get(sub_sn);
int nV = blast6_records.size(), count = 0;
// sort blast records
Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
// consensus
int posUpto = 0, send_clip = 0;
int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
// distance to last 'N', start from next position to find longest common suffix-prefix
int prev_n = Integer.MAX_VALUE;
int mol_len = 0;
int bfs_dist;
String qseq;
int nS, nQ;
// first step: construct super scaffold
// will form a tree graph indicating the path through the contigs
// convert contig names to integer indices
// one contig could end up with multiple indices due to repeats
Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
int index = 0;
for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
int qseqid1, qseqid2;
String os;
List<Double> dist_stats = new ArrayList<Double>();
for (int i = 0; i != nV; i++) {
qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
os = blast6_records.get(i).qseqid() + (blast6_records.get(i).forward() ? "" : "'");
System.out.print(Utils.fixedLengthPaddingString(os, 40) + "\t");
double distance = Double.MAX_VALUE;
for (int j = 1; j <= window_size; j++) {
if (i + j >= nV) {
System.out.print(Utils.fixedLengthPaddingString("", 10) + ",\t");
continue;
}
qseqid2 = contig_coordinate.getKey(ss_coordinate.get(i + j));
GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
// GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
if (e != null) {
double d = e.getLength();
if (distance > d)
distance = d;
}
int l = 0;
if (e != null) {
List<Integer> vertices = e.getVertexList();
for (int k = 1; k < vertices.size() - 1; k++) l += qry_seqs.get(contig_coordinate.get(vertices.get(k))).seq_ln();
l -= (vertices.size() - 2) * this.kmer_size;
}
System.out.print((e == null ? Utils.fixedLengthPaddingString("Inf " + l, 10) + ",\t" : Utils.fixedLengthPaddingString(e.getLength() + " " + l, 10) + ",\t"));
}
System.out.println();
dist_stats.add(distance);
}
double[] dist_arr = ArrayUtils.toPrimitive(dist_stats.toArray(new Double[dist_stats.size()]));
Arrays.sort(dist_arr);
Percentile percentile = new Percentile();
double q1 = percentile.evaluate(dist_arr, 25);
double q3 = percentile.evaluate(dist_arr, 75);
// upper bound for outliers
// q3+1.5*IQR
double outlier = q3 + 1.5 * (q3 - q1);
System.out.println("Outlier upper bound: " + outlier);
// construct a tree graph
pseudo_tree = new DirectedWeightedPseudograph<Integer, DefaultWeightedEdge>(DefaultWeightedEdge.class);
for (int i : ss_coordinate.keySet()) pseudo_tree.addVertex(i);
Blast6Segment record1, record2;
// select the nearest neighbour(s)
for (int i = 0; i != nV; i++) {
System.out.println(i);
record1 = blast6_records.get(i);
send = record1.true_send();
qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
paths.clear();
bfs_dist = Integer.MAX_VALUE;
for (int j = i + 1; j < nV; j++) {
record2 = blast6_records.get(j);
sstart = record2.true_sstart();
if (send < sstart)
break;
qseqid2 = contig_coordinate.getKey(ss_coordinate.get(j));
GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
// GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
if (e != null) {
tmp_int = e.getLength();
if (tmp_int == 0)
continue;
if (tmp_int < bfs_dist)
paths.clear();
paths.add(e);
bfs_dist = tmp_int;
}
}
if (paths.isEmpty())
continue;
for (GraphPath<Integer, DefaultWeightedEdge> path : paths) pseudo_tree.setEdgeWeight(pseudo_tree.addEdge(path.getStartVertex(), path.getEndVertex()), 1d);
}
// secondly fill gaps
long toc = System.nanoTime();
System.out.println(toc - tic + " nano secs.");
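// NOTE: debugging guard below — the exception always fires, so the
// pseudo-chromosome construction that follows never executes at runtime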
if (true)
throw new RuntimeException("!!!");
// second step: construct pseudo chromosomes
// 1. choose the first contig
int firstv = 0;
while (true) {
// int v2 = next(firstv+1);
if (true)
break;
}
// 2. extend pseudo chromosomes
StringBuilder seq_str = new StringBuilder();
for (int v = firstv; v < nV - 1; v++) {
if (++count % 10000 == 0)
myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
Blast6Segment record = blast6_records.get(v);
qlen = qry_seqs.get(record.qseqid()).seq_ln();
sstart = record.sstart();
send = record.send();
qstart = record.qstart();
qend = record.qend();
if (sstart > send) {
// make sure sstart<send
tmp_int = sstart;
sstart = send;
send = tmp_int;
tmp_int = qstart;
qstart = qend;
qend = tmp_int;
}
if (qstart > qend) {
qstart_clip = qlen - qstart;
qend_clip = qend - 1;
qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
} else {
qstart_clip = qstart - 1;
qend_clip = qlen - qend;
qseq = qry_seqs.get(record.qseqid()).seq_str();
}
if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
// TODO process
continue;
}
// find longest suffix-prefix
nS = seq_str.length();
nQ = qseq.length();
int nO = Math.min(prev_n, Math.min(nS, nQ));
outerloop: for (; nO >= min_overlap; nO--) {
int nS_i = nS - nO;
for (int i = 0; i < nO; i++) {
if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
continue outerloop;
}
break outerloop;
}
if (nO < min_overlap) {
// no adequate overlap found; insert a large gap of size max(pseudo_distance, max_gap)
if (posUpto > 0) {
if (sstart <= posUpto) {
// if too much overlap, then treat it as a contradiction
if (posUpto - sstart > min_overlap) {
// discard
continue;
} else {
// insert a min_gap
gap_size = min_gap;
}
} else {
// estimate gap size
gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
if (gap_size < max_gap)
gap_size = max_gap;
}
bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
seq_str.append(Sequence.polyN(gap_size));
mol_len += gap_size;
}
bw_map.write(record.qseqid() + "\t" + qlen + "\t");
if (qstart > qend) {
// reverse
bw_map.write("0\t" + qlen + "\t-\t");
} else {
// forward
bw_map.write("0\t" + qlen + "\t+\t");
}
seq_str.append(qseq);
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
mol_len += qlen;
prev_n = qlen;
anchored_seqs.add(record.qseqid());
} else {
// overlap found
// will not insert gap
// ====================
// /-------\
// /----\
// calculate overlaps
// process overlap
qstart = nO;
if (qstart == qlen)
continue;
bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
if (qstart > qend) {
// reverse
bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
} else {
// forward
bw_map.write(qstart + "\t" + qlen + "\t+\t");
}
bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
mol_len += qlen - qstart;
prev_n += qlen - qstart;
seq_str.append(qseq.substring(qstart));
anchored_seqs.add(record.qseqid());
}
posUpto = send;
send_clip = qend_clip;
}
if (seq_str.length() > 0)
bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
}
bw_fa.close();
bw_map.close();
final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
bw_ufa.write(qry_seqs.get(seq).formatOutput());
bw_ufa.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
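A minimal sketch of the Tukey outlier bound computed above (commons-math3 Percentile assumed, matching the class this snippet uses; the data values are made up):
import java.util.Arrays;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

public class OutlierBoundSketch {
    public static void main(String[] args) {
        // Tukey's rule, as in Anchor1.run(): values beyond q3 + 1.5*IQR are outliers.
        double[] dist = { 2, 3, 3, 4, 5, 6, 40 };
        Arrays.sort(dist);
        Percentile percentile = new Percentile();
        double q1 = percentile.evaluate(dist, 25);
        double q3 = percentile.evaluate(dist, 75);
        double outlier = q3 + 1.5 * (q3 - q1);
        System.out.println("Outlier upper bound: " + outlier);
    }
}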
Use of com.google.common.collect.TreeRangeSet in project calcite by apache.
In class DruidDateTimeUtils, method createInterval():
/**
* Generates a list of {@link Interval}s equivalent to a given
* expression. Assumes that all the predicates in the input
* reference a single column: the timestamp column.
*/
@Nullable
public static List<Interval> createInterval(RexNode e, String timeZone) {
final List<Range<TimestampString>> ranges = extractRanges(e, TimeZone.getTimeZone(timeZone), false);
if (ranges == null) {
// We did not succeed, bail out
return null;
}
final TreeRangeSet<TimestampString> condensedRanges = TreeRangeSet.create();
for (Range<TimestampString> r : ranges) {
condensedRanges.add(r);
}
if (LOGGER.isDebugEnabled()) {
LOGGER.debug("Inferred ranges on interval : " + condensedRanges);
}
return toInterval(ImmutableList.<Range>copyOf(condensedRanges.asRanges()));
}
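The condensing step relies on TreeRangeSet merging connected ranges on insertion; a toy sketch with Integer endpoints in place of TimestampString (names illustrative):
import com.google.common.collect.Range;
import com.google.common.collect.TreeRangeSet;

public class CondenseSketch {
    public static void main(String[] args) {
        // TreeRangeSet merges overlapping/connected ranges as they are added,
        // which is all the "condensing" above needs to do.
        TreeRangeSet<Integer> condensed = TreeRangeSet.create();
        condensed.add(Range.closedOpen(0, 10));
        condensed.add(Range.closedOpen(5, 20));   // overlaps the first range
        condensed.add(Range.closedOpen(30, 40));  // disjoint, stays separate
        System.out.println(condensed.asRanges()); // [[0..20), [30..40)]
    }
}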