Search in sources :

Example 1 with Blast6Segment

use of cz1.ngs.model.Blast6Segment in project polyGembler by c-zhou.

the class Redundas method run.

@Override
public void run() {
    // TODO Auto-generated method stub
    Map<String, Sequence> sequence_map = Sequence.parseFastaFileAsMap(query_file);
    String line;
    String[] s;
    Set<String> seq_rm = new HashSet<String>();
    try {
        BufferedReader br_blast6 = new BufferedReader(new FileReader(blast_out));
        Set<Blast6Segment> buffer_b6 = new HashSet<Blast6Segment>();
        line = br_blast6.readLine();
        String qseqid;
        while (line != null) {
            s = line.split("\\s+");
            qseqid = s[0];
            buffer_b6.clear();
            buffer_b6.add(Blast6Segment.blast6Segment(line));
            while ((line = br_blast6.readLine()) != null && line.startsWith(qseqid)) buffer_b6.add(Blast6Segment.blast6Segment(line));
            int sz = sequence_map.get(qseqid).seq_ln();
            RangeSet<Integer> range_covered = TreeRangeSet.create();
            for (Blast6Segment record : buffer_b6) {
                if (!seq_rm.contains(record.sseqid()) && !record.qseqid().equals(record.sseqid()) && record.pident() >= min_ident && record.length() >= min_match && (record.qstart() <= max_overhang || record.qend() > sz - max_overhang))
                    range_covered.add(Range.closedOpen(record.qstart(), record.qend()).canonical(DiscreteDomain.integers()));
            }
            int unique_cvg = 0;
            for (Range<Integer> range : range_covered.asRanges()) unique_cvg += range.upperEndpoint() - range.lowerEndpoint();
            if ((double) unique_cvg / sz >= min_frac) {
                seq_rm.add(qseqid);
                myLogger.info("Redundant sequence " + qseqid + "\t" + sz + "\t" + unique_cvg + "\t" + (double) unique_cvg / sz);
            }
        }
        br_blast6.close();
        BufferedWriter bw_unique = Utils.getBufferedWriter(this.out_prefix + ".fa");
        BufferedWriter bw_redundas = Utils.getBufferedWriter(this.out_prefix + "_redundas.fa");
        for (Map.Entry<String, Sequence> entry : sequence_map.entrySet()) {
            if (seq_rm.contains(entry.getKey()))
                bw_redundas.write(entry.getValue().formatOutput());
            else
                bw_unique.write(entry.getValue().formatOutput());
        }
        bw_unique.close();
        bw_redundas.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : Blast6Segment(cz1.ngs.model.Blast6Segment) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet)

Example 2 with Blast6Segment

use of cz1.ngs.model.Blast6Segment in project polyGembler by c-zhou.

the class Anchor method run.

// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
    // TODO Auto-generated method stub
    long tic = System.nanoTime();
    sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
    // this.makeAssemblyGraph(query_file);
    this.makeAssemblyGraph(query_file);
    bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // find 'N/n's in subject/reference sequences
    // which could have impact on parsing the blast records
    final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast record is 1-based closed coordination
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
        // initialise an array list for each reference chromosome
        anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
    }
    // parse blast records
    // blast records buffer
    final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
    // selected blast records
    final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
    // temp list
    final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
    try {
        BufferedReader br_blast = Utils.getBufferedReader(blast_out);
        Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
        Blast6Segment primary_record, secondary_record;
        String qry;
        double qry_ln, aln_frac;
        while (tmp_record != null) {
            qry = tmp_record.qseqid();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            buff.add(tmp_record);
            while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
                // filter by identity
                if (tmp_record.pident() >= this.min_ident)
                    buff.add(tmp_record);
            }
            sel_recs.clear();
            // merge collinear records
            for (String sub : sub_seqs.keySet()) {
                // get all records for subject/reference sequence sub_seq
                tmp_records.clear();
                for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
                    tmp_records.add(record);
                if (tmp_records.isEmpty())
                    continue;
                // find alignment segments that can be deleted
                // those that are subsets of larger alignment segments
                // (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
                Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
                final Set<int[][]> ranges = new HashSet<int[][]>();
                outerloop: for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    int[][] range = new int[2][2];
                    if (primary_record.sstart() < primary_record.send()) {
                        range[0][0] = primary_record.sstart();
                        range[0][1] = primary_record.send();
                    } else {
                        range[0][0] = primary_record.send();
                        range[0][1] = primary_record.sstart();
                    }
                    if (primary_record.qstart() < primary_record.qend()) {
                        range[1][0] = primary_record.qstart();
                        range[1][1] = primary_record.qend();
                    } else {
                        range[1][0] = primary_record.qend();
                        range[1][1] = primary_record.qstart();
                    }
                    for (int[][] r : ranges) {
                        if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
                            tmp_records.remove(i--);
                            continue outerloop;
                        }
                    }
                    ranges.add(range);
                }
                // TODO rewrite this part
                /**
                 *					// find collinear alignment segments that can be merged
                 *					Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
                 *
                 *					collinear_merged.clear();
                 *					final List<Blast6Record> temp = new ArrayList<Blast6Record>();
                 *					for(int i=0; i<tmp_records.size(); ) {
                 *						Blast6Record record = tmp_records.get(i);
                 *						double max_shift;
                 *						temp.clear();
                 *						temp.add(record);
                 *
                 *						// find collinear alignment segments
                 *						outerloop:
                 *							while( (++i)<tmp_records.size() ) {
                 *								record = tmp_records.get(i);
                 *								// check if is collinear with other alignment segments
                 *								for(Blast6Record r : temp) {
                 *									max_shift = collinear_shift*
                 *											Math.min(r.length(), record.length());
                 *									if(BlastRecord.sdistance(r, record)<=max_shift &&
                 *											BlastRecord.qdistance(r, record)<=max_shift &&
                 *											BlastRecord.pdistance(r, record)<=max_shift) {
                 *										temp.add(record);
                 *										continue outerloop;
                 *									}
                 *								}
                 *								break;
                 *							}
                 *
                 *						// merge collinear alignment segments
                 *						int qstart = Integer.MAX_VALUE;
                 *						int qend = Integer.MIN_VALUE;
                 *						int sstart = Integer.MAX_VALUE;
                 *						int send = Integer.MIN_VALUE;
                 *						double pident = 0;
                 *						int length = 0;
                 *
                 *						// qstart is always smaller than qend
                 *						for(int j=0; j<temp.size(); j++) {
                 *							record = temp.get(j);
                 *							if(record.qstart()<qstart) {
                 *								qstart = record.qstart();
                 *								sstart = record.sstart();
                 *							}
                 *							if(record.qend()>qend) {
                 *								qend = record.qend();
                 *								send = record.send();
                 *							}
                 *							if(record.pident()>pident)
                 *								pident = record.pident();
                 *							length += record.length();
                 *						}
                 *
                 *						collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
                 *					}
                 */
                // find collinear alignment segments that can be merged
                // more accurate but slower
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                Blast6Segment record;
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    for (int j = i + 1; j < tmp_records.size(); j++) {
                        secondary_record = tmp_records.get(j);
                        double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
                        if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
                            tmp_records.set(i, record);
                            tmp_records.remove(j);
                            --i;
                            break;
                        }
                    }
                }
                // process blast records that clipped by gaps
                // (sstart, send)---(start2, send2)
                // (sstart  ...  ---  ...    send2)
                TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    if (sub_gap.contains(primary_record.true_send())) {
                        secondary_record = null;
                        int sec_j = -1;
                        for (int j = i + 1; j < tmp_records.size(); j++) {
                            if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
                                secondary_record = tmp_records.get(j);
                                sec_j = j;
                                break;
                            }
                        }
                        if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
                            // reverse alignment segments
                            continue;
                        }
                        if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
                            // clipping
                            // merge two alignment segments
                            double pident = Math.max(primary_record.pident(), secondary_record.pident());
                            int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
                            int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
                            int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
                            int send = Math.max(primary_record.true_send(), secondary_record.true_send());
                            int length = qend - qstart + 1;
                            // replace primary record with merged record
                            // delete secondary record
                            Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
                            tmp_records.set(i, merged_record);
                            tmp_records.remove(sec_j);
                            // the merged records need to be processed
                            --i;
                        }
                    }
                }
                // add to sel_recs
                sel_recs.addAll(tmp_records);
            }
            // filter by alignment fraction
            buff.clear();
            buff.addAll(sel_recs);
            sel_recs.clear();
            for (Blast6Segment record : buff) {
                if (record.length() / qry_ln >= this.min_frac)
                    sel_recs.add(record);
            }
            if (sel_recs.isEmpty()) {
                // continue
                continue;
            }
            // filter blast records
            Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
            // process primary alignment
            primary_record = sel_recs.get(0);
            anchored_records.get(primary_record.sseqid()).add(primary_record);
            aln_frac = primary_record.length() / qry_ln;
            // and process
            for (int i = 1; i < sel_recs.size(); i++) {
                secondary_record = sel_recs.get(i);
                if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
                    break;
                } else {
                    anchored_records.get(secondary_record.sseqid()).add(secondary_record);
                }
            }
        }
        br_blast.close();
        for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue().size());
        }
        final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
        final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
        final Set<String> anchored_seqs = new HashSet<String>();
        final List<String> sub_list = Sequence.parseSeqList(subject_file);
        for (String sub_sn : sub_list) {
            blast6_records = anchored_records.get(sub_sn);
            int nV = blast6_records.size(), count = 0;
            // sort blast records
            Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
            // consensus
            int posUpto = 0, send_clip = 0;
            int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
            // distance to last 'N', start from next position to find longest common suffix-prefix
            int prev_n = Integer.MAX_VALUE;
            int mol_len = 0;
            String qseq;
            int nS, nQ;
            // first step: construct super scaffold
            // will form a tree graph indicating the path through the contigs
            // convert contig names to integer indices
            // one contig could end up with multiple indices due to repeats
            Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
            int index = 0;
            for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
            StringBuilder seq_str = new StringBuilder();
            for (int v = 0; v < nV - 1; v++) {
                if (++count % 10000 == 0)
                    myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
                Blast6Segment record = blast6_records.get(v);
                qlen = qry_seqs.get(record.qseqid()).seq_ln();
                sstart = record.sstart();
                send = record.send();
                qstart = record.qstart();
                qend = record.qend();
                if (sstart > send) {
                    // make sure sstart<send
                    tmp_int = sstart;
                    sstart = send;
                    send = tmp_int;
                    tmp_int = qstart;
                    qstart = qend;
                    qend = tmp_int;
                }
                if (qstart > qend) {
                    qstart_clip = qlen - qstart;
                    qend_clip = qend - 1;
                    qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
                } else {
                    qstart_clip = qstart - 1;
                    qend_clip = qlen - qend;
                    qseq = qry_seqs.get(record.qseqid()).seq_str();
                }
                if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
                    // TODO process
                    continue;
                }
                // find longest suffix-prefix
                nS = seq_str.length();
                nQ = qseq.length();
                int nO = Math.min(prev_n, Math.min(nS, nQ));
                outerloop: for (; nO >= min_overlap; nO--) {
                    int nS_i = nS - nO;
                    for (int i = 0; i < nO; i++) {
                        if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
                            continue outerloop;
                    }
                    break outerloop;
                }
                if (nO < min_overlap) {
                    // otherwise will insert a large GAP max(pseduo_distance, max_gap)
                    if (posUpto > 0) {
                        if (sstart <= posUpto) {
                            // if too much overlap, then treat it as a contradiction
                            if (posUpto - sstart > min_overlap) {
                                // discard
                                continue;
                            } else {
                                // insert a min_gap
                                gap_size = min_gap;
                            }
                        } else {
                            // estimate gap size
                            gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
                            if (gap_size < max_gap)
                                gap_size = max_gap;
                        }
                        bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
                        seq_str.append(Sequence.polyN(gap_size));
                        mol_len += gap_size;
                    }
                    bw_map.write(record.qseqid() + "\t" + qlen + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write("0\t" + qlen + "\t-\t");
                    } else {
                        // forward
                        bw_map.write("0\t" + qlen + "\t+\t");
                    }
                    seq_str.append(qseq);
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
                    mol_len += qlen;
                    prev_n = qlen;
                    anchored_seqs.add(record.qseqid());
                } else {
                    // overlap found
                    // will not insert gap
                    // ====================
                    // /-------\
                    // /----\
                    // calculate overlaps
                    // process overlap
                    qstart = nO;
                    if (qstart == qlen)
                        continue;
                    bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
                    } else {
                        // forward
                        bw_map.write(qstart + "\t" + qlen + "\t+\t");
                    }
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
                    mol_len += qlen - qstart;
                    prev_n += qlen - qstart;
                    seq_str.append(qseq.substring(qstart));
                    anchored_seqs.add(record.qseqid());
                }
                posUpto = send;
                send_clip = qend_clip;
            }
            if (seq_str.length() > 0)
                bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
        }
        bw_fa.close();
        bw_map.close();
        final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
        for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
            bw_ufa.write(qry_seqs.get(seq).formatOutput());
        bw_ufa.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : TreeRangeSet(com.google.common.collect.TreeRangeSet) HashMap(java.util.HashMap) Blast6Segment(cz1.ngs.model.Blast6Segment) ArrayList(java.util.ArrayList) BufferedWriter(java.io.BufferedWriter) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) DefaultWeightedEdge(org.jgrapht.graph.DefaultWeightedEdge) AlignmentSegment(cz1.ngs.model.AlignmentSegment) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) Map(java.util.Map) BidiMap(org.apache.commons.collections4.BidiMap) DualHashBidiMap(org.apache.commons.collections4.bidimap.DualHashBidiMap)

Example 3 with Blast6Segment

use of cz1.ngs.model.Blast6Segment in project polyGembler by c-zhou.

the class Anchor0 method run.

@Override
public void run() {
    // TODO Auto-generated method stub
    final Map<String, Sequence> sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    final Map<String, Sequence> qry_seqs = Sequence.parseFastaFileAsMap(query_file);
    // find 'N/n's in subject/reference sequences
    // which could have impact on parsing the blast records
    final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast record is 1-based closed coordination
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
        // initialise an array list for each reference chromosome
        anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
    }
    // parse blast records
    // blast records buffer
    final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
    // selected blast records
    final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
    // temp list
    final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
    try {
        BufferedReader br_blast = Utils.getBufferedReader(blast_out);
        Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
        Blast6Segment primary_record, secondary_record;
        String qry;
        double qry_ln, aln_frac;
        while (tmp_record != null) {
            qry = tmp_record.qseqid();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            buff.add(tmp_record);
            while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
                // filter by identity
                if (tmp_record.pident() >= this.min_ident)
                    buff.add(tmp_record);
            }
            sel_recs.clear();
            // merge collinear records
            for (String sub : sub_seqs.keySet()) {
                // get all records for subject/reference sequence sub_seq
                tmp_records.clear();
                for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
                    tmp_records.add(record);
                if (tmp_records.isEmpty())
                    continue;
                // find alignment segments that can be deleted
                // those that are subsets of larger alignment segments
                // (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
                Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
                final Set<int[][]> ranges = new HashSet<int[][]>();
                outerloop: for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    int[][] range = new int[2][2];
                    if (primary_record.sstart() < primary_record.send()) {
                        range[0][0] = primary_record.sstart();
                        range[0][1] = primary_record.send();
                    } else {
                        range[0][0] = primary_record.send();
                        range[0][1] = primary_record.sstart();
                    }
                    if (primary_record.qstart() < primary_record.qend()) {
                        range[1][0] = primary_record.qstart();
                        range[1][1] = primary_record.qend();
                    } else {
                        range[1][0] = primary_record.qend();
                        range[1][1] = primary_record.qstart();
                    }
                    for (int[][] r : ranges) {
                        if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
                            tmp_records.remove(i--);
                            continue outerloop;
                        }
                    }
                    ranges.add(range);
                }
                // TODO rewrite this part
                /**
                 *					// find collinear alignment segments that can be merged
                 *					Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
                 *
                 *					collinear_merged.clear();
                 *					final List<Blast6Record> temp = new ArrayList<Blast6Record>();
                 *					for(int i=0; i<tmp_records.size(); ) {
                 *						Blast6Record record = tmp_records.get(i);
                 *						double max_shift;
                 *						temp.clear();
                 *						temp.add(record);
                 *
                 *						// find collinear alignment segments
                 *						outerloop:
                 *							while( (++i)<tmp_records.size() ) {
                 *								record = tmp_records.get(i);
                 *								// check if is collinear with other alignment segments
                 *								for(Blast6Record r : temp) {
                 *									max_shift = collinear_shift*
                 *											Math.min(r.length(), record.length());
                 *									if(BlastRecord.sdistance(r, record)<=max_shift &&
                 *											BlastRecord.qdistance(r, record)<=max_shift &&
                 *											BlastRecord.pdistance(r, record)<=max_shift) {
                 *										temp.add(record);
                 *										continue outerloop;
                 *									}
                 *								}
                 *								break;
                 *							}
                 *
                 *						// merge collinear alignment segments
                 *						int qstart = Integer.MAX_VALUE;
                 *						int qend = Integer.MIN_VALUE;
                 *						int sstart = Integer.MAX_VALUE;
                 *						int send = Integer.MIN_VALUE;
                 *						double pident = 0;
                 *						int length = 0;
                 *
                 *						// qstart is always smaller than qend
                 *						for(int j=0; j<temp.size(); j++) {
                 *							record = temp.get(j);
                 *							if(record.qstart()<qstart) {
                 *								qstart = record.qstart();
                 *								sstart = record.sstart();
                 *							}
                 *							if(record.qend()>qend) {
                 *								qend = record.qend();
                 *								send = record.send();
                 *							}
                 *							if(record.pident()>pident)
                 *								pident = record.pident();
                 *							length += record.length();
                 *						}
                 *
                 *						collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
                 *					}
                 */
                // find collinear alignment segments that can be merged
                // more accurate but slower
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                Blast6Segment record;
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    for (int j = i + 1; j < tmp_records.size(); j++) {
                        secondary_record = tmp_records.get(j);
                        double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
                        if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
                            tmp_records.set(i, record);
                            tmp_records.remove(j);
                            --i;
                            break;
                        }
                    }
                }
                // process blast records that clipped by gaps
                // (sstart, send)---(start2, send2)
                // (sstart  ...  ---  ...    send2)
                TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    if (sub_gap.contains(primary_record.true_send())) {
                        secondary_record = null;
                        int sec_j = -1;
                        for (int j = i + 1; j < tmp_records.size(); j++) {
                            if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
                                secondary_record = tmp_records.get(j);
                                sec_j = j;
                                break;
                            }
                        }
                        if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
                            // reverse alignment segments
                            continue;
                        }
                        if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
                            // clipping
                            // merge two alignment segments
                            double pident = Math.max(primary_record.pident(), secondary_record.pident());
                            int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
                            int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
                            int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
                            int send = Math.max(primary_record.true_send(), secondary_record.true_send());
                            int length = qend - qstart + 1;
                            // replace primary record with merged record
                            // delete secondary record
                            Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
                            tmp_records.set(i, merged_record);
                            tmp_records.remove(sec_j);
                            // the merged records need to be processed
                            --i;
                        }
                    }
                }
                // add to sel_recs
                sel_recs.addAll(tmp_records);
            }
            // filter by alignment fraction
            buff.clear();
            buff.addAll(sel_recs);
            sel_recs.clear();
            for (Blast6Segment record : buff) {
                if (record.length() / qry_ln >= this.min_frac)
                    sel_recs.add(record);
            }
            if (sel_recs.isEmpty()) {
                // continue
                continue;
            }
            // filter blast records
            Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
            // process primary alignment
            primary_record = sel_recs.get(0);
            anchored_records.get(primary_record.sseqid()).add(primary_record);
            aln_frac = primary_record.length() / qry_ln;
            // and process
            for (int i = 1; i < sel_recs.size(); i++) {
                secondary_record = sel_recs.get(i);
                if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
                    break;
                } else {
                    anchored_records.get(secondary_record.sseqid()).add(secondary_record);
                }
            }
        }
        br_blast.close();
        for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue().size());
        }
        final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
        final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
        final Set<String> anchored_seqs = new HashSet<String>();
        final List<String> sub_list = Sequence.parseSeqList(subject_file);
        for (String sub_sn : sub_list) {
            List<Blast6Segment> blast6_records = anchored_records.get(sub_sn);
            int total = blast6_records.size(), count = 0;
            // sort blast records
            Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
            // consensus
            int posUpto = 0, send_clip = 0;
            int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
            // distance to last 'N', start from next position to find longest common suffix-prefix
            int prev_n = Integer.MAX_VALUE;
            int mol_len = 0;
            String qseq;
            int nS, nQ;
            // will form a pseudomolecule
            StringBuilder seq_str = new StringBuilder();
            for (Blast6Segment record : blast6_records) {
                if (count++ % 10000 == 0)
                    myLogger.info(sub_sn + " " + count + "/" + total + " done.");
                qlen = qry_seqs.get(record.qseqid()).seq_ln();
                sstart = record.sstart();
                send = record.send();
                qstart = record.qstart();
                qend = record.qend();
                if (sstart > send) {
                    // make sure sstart<send
                    tmp_int = sstart;
                    sstart = send;
                    send = tmp_int;
                    tmp_int = qstart;
                    qstart = qend;
                    qend = tmp_int;
                }
                if (qstart > qend) {
                    qstart_clip = qlen - qstart;
                    qend_clip = qend - 1;
                    qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
                } else {
                    qstart_clip = qstart - 1;
                    qend_clip = qlen - qend;
                    qseq = qry_seqs.get(record.qseqid()).seq_str();
                }
                if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
                    // TODO process
                    continue;
                }
                // find longest suffix-prefix
                nS = seq_str.length();
                nQ = qseq.length();
                int nO = Math.min(prev_n, Math.min(nS, nQ));
                outerloop: for (; nO >= min_overlap; nO--) {
                    int nS_i = nS - nO;
                    for (int i = 0; i < nO; i++) {
                        if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
                            continue outerloop;
                    }
                    break outerloop;
                }
                if (nO < min_overlap) {
                    // otherwise will insert a large GAP max(pseduo_distance, max_gap)
                    if (posUpto > 0) {
                        if (sstart <= posUpto) {
                            // if too much overlap, then treat it as a contradiction
                            if (posUpto - sstart > min_overlap) {
                                // discard
                                continue;
                            } else {
                                // insert a min_gap
                                gap_size = min_gap;
                            }
                        } else {
                            // estimate gap size
                            gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
                            if (gap_size < max_gap)
                                gap_size = max_gap;
                        }
                        bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
                        seq_str.append(Sequence.polyN(gap_size));
                        mol_len += gap_size;
                    }
                    bw_map.write(record.qseqid() + "\t" + qlen + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write("0\t" + qlen + "\t-\t");
                    } else {
                        // forward
                        bw_map.write("0\t" + qlen + "\t+\t");
                    }
                    seq_str.append(qseq);
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
                    mol_len += qlen;
                    prev_n = qlen;
                    anchored_seqs.add(record.qseqid());
                } else {
                    // overlap found
                    // will not insert gap
                    // ====================
                    // /-------\
                    // /----\
                    // calculate overlaps
                    // process overlap
                    qstart = nO;
                    if (qstart == qlen)
                        continue;
                    bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
                    } else {
                        // forward
                        bw_map.write(qstart + "\t" + qlen + "\t+\t");
                    }
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
                    mol_len += qlen - qstart;
                    prev_n += qlen - qstart;
                    seq_str.append(qseq.substring(qstart));
                    anchored_seqs.add(record.qseqid());
                }
                posUpto = send;
                send_clip = qend_clip;
            }
            if (seq_str.length() > 0)
                bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
        }
        bw_fa.close();
        bw_map.close();
        final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
        for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
            bw_ufa.write(qry_seqs.get(seq).formatOutput());
        bw_ufa.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : TreeRangeSet(com.google.common.collect.TreeRangeSet) HashMap(java.util.HashMap) Blast6Segment(cz1.ngs.model.Blast6Segment) ArrayList(java.util.ArrayList) BufferedWriter(java.io.BufferedWriter) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) AlignmentSegment(cz1.ngs.model.AlignmentSegment) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) Map(java.util.Map)

Example 4 with Blast6Segment

use of cz1.ngs.model.Blast6Segment in project polyGembler by c-zhou.

the class Anchor1 method run.

// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
    // TODO Auto-generated method stub
    long tic = System.nanoTime();
    sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
    // this.makeAssemblyGraph(query_file);
    this.makeAssemblyGraph(query_file);
    bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // find 'N/n's in subject/reference sequences
    // which could have impact on parsing the blast records
    final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast record is 1-based closed coordination
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
        // initialise an array list for each reference chromosome
        anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
    }
    // parse blast records
    // blast records buffer
    final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
    // selected blast records
    final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
    // temp list
    final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
    try {
        BufferedReader br_blast = Utils.getBufferedReader(blast_out);
        Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
        Blast6Segment primary_record, secondary_record;
        String qry;
        double qry_ln, aln_frac;
        while (tmp_record != null) {
            qry = tmp_record.qseqid();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            buff.add(tmp_record);
            while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
                // filter by identity
                if (tmp_record.pident() >= this.min_ident)
                    buff.add(tmp_record);
            }
            sel_recs.clear();
            // merge collinear records
            for (String sub : sub_seqs.keySet()) {
                // get all records for subject/reference sequence sub_seq
                tmp_records.clear();
                for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
                    tmp_records.add(record);
                if (tmp_records.isEmpty())
                    continue;
                // find alignment segments that can be deleted
                // those that are subsets of larger alignment segments
                // (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
                Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
                final Set<int[][]> ranges = new HashSet<int[][]>();
                outerloop: for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    int[][] range = new int[2][2];
                    if (primary_record.sstart() < primary_record.send()) {
                        range[0][0] = primary_record.sstart();
                        range[0][1] = primary_record.send();
                    } else {
                        range[0][0] = primary_record.send();
                        range[0][1] = primary_record.sstart();
                    }
                    if (primary_record.qstart() < primary_record.qend()) {
                        range[1][0] = primary_record.qstart();
                        range[1][1] = primary_record.qend();
                    } else {
                        range[1][0] = primary_record.qend();
                        range[1][1] = primary_record.qstart();
                    }
                    for (int[][] r : ranges) {
                        if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
                            tmp_records.remove(i--);
                            continue outerloop;
                        }
                    }
                    ranges.add(range);
                }
                // TODO rewrite this part
                /**
                 *					// find collinear alignment segments that can be merged
                 *					Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
                 *
                 *					collinear_merged.clear();
                 *					final List<Blast6Record> temp = new ArrayList<Blast6Record>();
                 *					for(int i=0; i<tmp_records.size(); ) {
                 *						Blast6Record record = tmp_records.get(i);
                 *						double max_shift;
                 *						temp.clear();
                 *						temp.add(record);
                 *
                 *						// find collinear alignment segments
                 *						outerloop:
                 *							while( (++i)<tmp_records.size() ) {
                 *								record = tmp_records.get(i);
                 *								// check if is collinear with other alignment segments
                 *								for(Blast6Record r : temp) {
                 *									max_shift = collinear_shift*
                 *											Math.min(r.length(), record.length());
                 *									if(BlastRecord.sdistance(r, record)<=max_shift &&
                 *											BlastRecord.qdistance(r, record)<=max_shift &&
                 *											BlastRecord.pdistance(r, record)<=max_shift) {
                 *										temp.add(record);
                 *										continue outerloop;
                 *									}
                 *								}
                 *								break;
                 *							}
                 *
                 *						// merge collinear alignment segments
                 *						int qstart = Integer.MAX_VALUE;
                 *						int qend = Integer.MIN_VALUE;
                 *						int sstart = Integer.MAX_VALUE;
                 *						int send = Integer.MIN_VALUE;
                 *						double pident = 0;
                 *						int length = 0;
                 *
                 *						// qstart is always smaller than qend
                 *						for(int j=0; j<temp.size(); j++) {
                 *							record = temp.get(j);
                 *							if(record.qstart()<qstart) {
                 *								qstart = record.qstart();
                 *								sstart = record.sstart();
                 *							}
                 *							if(record.qend()>qend) {
                 *								qend = record.qend();
                 *								send = record.send();
                 *							}
                 *							if(record.pident()>pident)
                 *								pident = record.pident();
                 *							length += record.length();
                 *						}
                 *
                 *						collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
                 *					}
                 */
                // find collinear alignment segments that can be merged
                // more accurate but slower
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                Blast6Segment record;
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    for (int j = i + 1; j < tmp_records.size(); j++) {
                        secondary_record = tmp_records.get(j);
                        double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
                        if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
                            tmp_records.set(i, record);
                            tmp_records.remove(j);
                            --i;
                            break;
                        }
                    }
                }
                // process blast records that clipped by gaps
                // (sstart, send)---(start2, send2)
                // (sstart  ...  ---  ...    send2)
                TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    if (sub_gap.contains(primary_record.true_send())) {
                        secondary_record = null;
                        int sec_j = -1;
                        for (int j = i + 1; j < tmp_records.size(); j++) {
                            if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
                                secondary_record = tmp_records.get(j);
                                sec_j = j;
                                break;
                            }
                        }
                        if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
                            // reverse alignment segments
                            continue;
                        }
                        if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
                            // clipping
                            // merge two alignment segments
                            double pident = Math.max(primary_record.pident(), secondary_record.pident());
                            int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
                            int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
                            int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
                            int send = Math.max(primary_record.true_send(), secondary_record.true_send());
                            int length = qend - qstart + 1;
                            // replace primary record with merged record
                            // delete secondary record
                            Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
                            tmp_records.set(i, merged_record);
                            tmp_records.remove(sec_j);
                            // the merged records need to be processed
                            --i;
                        }
                    }
                }
                // add to sel_recs
                sel_recs.addAll(tmp_records);
            }
            // filter by alignment fraction
            buff.clear();
            buff.addAll(sel_recs);
            sel_recs.clear();
            for (Blast6Segment record : buff) {
                if (record.length() / qry_ln >= this.min_frac)
                    sel_recs.add(record);
            }
            if (sel_recs.isEmpty()) {
                // continue
                continue;
            }
            // filter blast records
            Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
            // process primary alignment
            primary_record = sel_recs.get(0);
            anchored_records.get(primary_record.sseqid()).add(primary_record);
            aln_frac = primary_record.length() / qry_ln;
            // and process
            for (int i = 1; i < sel_recs.size(); i++) {
                secondary_record = sel_recs.get(i);
                if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
                    break;
                } else {
                    anchored_records.get(secondary_record.sseqid()).add(secondary_record);
                }
            }
        }
        br_blast.close();
        for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue().size());
        }
        final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
        final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
        final Set<String> anchored_seqs = new HashSet<String>();
        final List<String> sub_list = Sequence.parseSeqList(subject_file);
        final List<GraphPath<Integer, DefaultWeightedEdge>> paths = new ArrayList<GraphPath<Integer, DefaultWeightedEdge>>();
        for (String sub_sn : sub_list) {
            blast6_records = anchored_records.get(sub_sn);
            int nV = blast6_records.size(), count = 0;
            // sort blast records
            Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
            // consensus
            int posUpto = 0, send_clip = 0;
            int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
            // distance to last 'N', start from next position to find longest common suffix-prefix
            int prev_n = Integer.MAX_VALUE;
            int mol_len = 0;
            int bfs_dist;
            String qseq;
            int nS, nQ;
            // first step: construct super scaffold
            // will form a tree graph indicating the path through the contigs
            // convert contig names to integer indices
            // one contig could end up with multiple indices due to repeats
            Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
            int index = 0;
            for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
            int qseqid1, qseqid2;
            String os;
            List<Double> dist_stats = new ArrayList<Double>();
            for (int i = 0; i != nV; i++) {
                qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
                os = blast6_records.get(i).qseqid() + (blast6_records.get(i).forward() ? "" : "'");
                System.out.print(Utils.fixedLengthPaddingString(os, 40) + "\t");
                double distance = Double.MAX_VALUE;
                for (int j = 1; j <= window_size; j++) {
                    if (i + j >= nV) {
                        System.out.print(Utils.fixedLengthPaddingString("", 10) + ",\t");
                        continue;
                    }
                    qseqid2 = contig_coordinate.getKey(ss_coordinate.get(i + j));
                    GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
                    // GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
                    if (e != null) {
                        double d = e.getLength();
                        if (distance > d)
                            distance = d;
                    }
                    int l = 0;
                    if (e != null) {
                        List<Integer> vertices = e.getVertexList();
                        for (int k = 1; k < vertices.size() - 1; k++) l += qry_seqs.get(contig_coordinate.get(vertices.get(k))).seq_ln();
                        l -= (vertices.size() - 2) * this.kmer_size;
                    }
                    System.out.print((e == null ? Utils.fixedLengthPaddingString("Inf " + l, 10) + ",\t" : Utils.fixedLengthPaddingString(e.getLength() + " " + l, 10) + ",\t"));
                }
                System.out.println();
                dist_stats.add(distance);
            }
            double[] dist_arr = ArrayUtils.toPrimitive(dist_stats.toArray(new Double[dist_stats.size()]));
            Arrays.sort(dist_arr);
            Percentile percentile = new Percentile();
            double q1 = percentile.evaluate(dist_arr, 25);
            double q3 = percentile.evaluate(dist_arr, 75);
            // upper bound for outliers
            // q3+1.5*IQR
            double outlier = q3 + 1.5 * (q3 - q1);
            System.out.println("Outlier upper bound: " + outlier);
            // construct a tree graph
            pseudo_tree = new DirectedWeightedPseudograph<Integer, DefaultWeightedEdge>(DefaultWeightedEdge.class);
            for (int i : ss_coordinate.keySet()) pseudo_tree.addVertex(i);
            Blast6Segment record1, record2;
            // select the nearest neighbour(s)
            for (int i = 0; i != nV; i++) {
                System.out.println(i);
                record1 = blast6_records.get(i);
                send = record1.true_send();
                qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
                paths.clear();
                bfs_dist = Integer.MAX_VALUE;
                for (int j = i + 1; j < nV; j++) {
                    record2 = blast6_records.get(j);
                    sstart = record2.true_sstart();
                    if (send < sstart)
                        break;
                    qseqid2 = contig_coordinate.getKey(ss_coordinate.get(j));
                    GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
                    // GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
                    if (e != null) {
                        tmp_int = e.getLength();
                        if (tmp_int == 0)
                            continue;
                        if (tmp_int < bfs_dist)
                            paths.clear();
                        paths.add(e);
                        bfs_dist = tmp_int;
                    }
                }
                if (paths.isEmpty())
                    continue;
                for (GraphPath<Integer, DefaultWeightedEdge> path : paths) pseudo_tree.setEdgeWeight(pseudo_tree.addEdge(path.getStartVertex(), path.getEndVertex()), 1d);
            }
            // secondly fill gaps
            long toc = System.nanoTime();
            System.out.println(toc - tic + " nano secs.");
            if (true)
                throw new RuntimeException("!!!");
            // second step: construct pseudo chromosomes
            // 1. choose the first contig
            int firstv = 0;
            while (true) {
                // int v2 = next(firstv+1);
                if (true)
                    break;
            }
            // 2. extend pseudo chromosomes
            StringBuilder seq_str = new StringBuilder();
            for (int v = firstv; v < nV - 1; v++) {
                if (++count % 10000 == 0)
                    myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
                Blast6Segment record = blast6_records.get(v);
                qlen = qry_seqs.get(record.qseqid()).seq_ln();
                sstart = record.sstart();
                send = record.send();
                qstart = record.qstart();
                qend = record.qend();
                if (sstart > send) {
                    // make sure sstart<send
                    tmp_int = sstart;
                    sstart = send;
                    send = tmp_int;
                    tmp_int = qstart;
                    qstart = qend;
                    qend = tmp_int;
                }
                if (qstart > qend) {
                    qstart_clip = qlen - qstart;
                    qend_clip = qend - 1;
                    qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
                } else {
                    qstart_clip = qstart - 1;
                    qend_clip = qlen - qend;
                    qseq = qry_seqs.get(record.qseqid()).seq_str();
                }
                if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
                    // TODO process
                    continue;
                }
                // find longest suffix-prefix
                nS = seq_str.length();
                nQ = qseq.length();
                int nO = Math.min(prev_n, Math.min(nS, nQ));
                outerloop: for (; nO >= min_overlap; nO--) {
                    int nS_i = nS - nO;
                    for (int i = 0; i < nO; i++) {
                        if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
                            continue outerloop;
                    }
                    break outerloop;
                }
                if (nO < min_overlap) {
                    // otherwise will insert a large GAP max(pseduo_distance, max_gap)
                    if (posUpto > 0) {
                        if (sstart <= posUpto) {
                            // if too much overlap, then treat it as a contradiction
                            if (posUpto - sstart > min_overlap) {
                                // discard
                                continue;
                            } else {
                                // insert a min_gap
                                gap_size = min_gap;
                            }
                        } else {
                            // estimate gap size
                            gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
                            if (gap_size < max_gap)
                                gap_size = max_gap;
                        }
                        bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
                        seq_str.append(Sequence.polyN(gap_size));
                        mol_len += gap_size;
                    }
                    bw_map.write(record.qseqid() + "\t" + qlen + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write("0\t" + qlen + "\t-\t");
                    } else {
                        // forward
                        bw_map.write("0\t" + qlen + "\t+\t");
                    }
                    seq_str.append(qseq);
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
                    mol_len += qlen;
                    prev_n = qlen;
                    anchored_seqs.add(record.qseqid());
                } else {
                    // overlap found
                    // will not insert gap
                    // ====================
                    // /-------\
                    // /----\
                    // calculate overlaps
                    // process overlap
                    qstart = nO;
                    if (qstart == qlen)
                        continue;
                    bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
                    } else {
                        // forward
                        bw_map.write(qstart + "\t" + qlen + "\t+\t");
                    }
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
                    mol_len += qlen - qstart;
                    prev_n += qlen - qstart;
                    seq_str.append(qseq.substring(qstart));
                    anchored_seqs.add(record.qseqid());
                }
                posUpto = send;
                send_clip = qend_clip;
            }
            if (seq_str.length() > 0)
                bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
        }
        bw_fa.close();
        bw_map.close();
        final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
        for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
            bw_ufa.write(qry_seqs.get(seq).formatOutput());
        bw_ufa.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) TreeRangeSet(com.google.common.collect.TreeRangeSet) HashMap(java.util.HashMap) Blast6Segment(cz1.ngs.model.Blast6Segment) ArrayList(java.util.ArrayList) GraphPath(org.jgrapht.GraphPath) BufferedWriter(java.io.BufferedWriter) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) DefaultWeightedEdge(org.jgrapht.graph.DefaultWeightedEdge) AlignmentSegment(cz1.ngs.model.AlignmentSegment) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) Map(java.util.Map) BidiMap(org.apache.commons.collections4.BidiMap) DualHashBidiMap(org.apache.commons.collections4.bidimap.DualHashBidiMap)

Aggregations

Blast6Segment (cz1.ngs.model.Blast6Segment)4 Sequence (cz1.ngs.model.Sequence)4 BufferedReader (java.io.BufferedReader)4 BufferedWriter (java.io.BufferedWriter)4 IOException (java.io.IOException)4 HashMap (java.util.HashMap)4 HashSet (java.util.HashSet)4 Map (java.util.Map)4 TreeRangeSet (com.google.common.collect.TreeRangeSet)3 AlignmentSegment (cz1.ngs.model.AlignmentSegment)3 ArrayList (java.util.ArrayList)3 List (java.util.List)3 BidiMap (org.apache.commons.collections4.BidiMap)2 DualHashBidiMap (org.apache.commons.collections4.bidimap.DualHashBidiMap)2 DefaultWeightedEdge (org.jgrapht.graph.DefaultWeightedEdge)2 FileReader (java.io.FileReader)1 Percentile (org.apache.commons.math3.stat.descriptive.rank.Percentile)1 GraphPath (org.jgrapht.GraphPath)1