Search in sources :

Example 6 with Identity

use of org.apache.commons.math3.analysis.function.Identity in project polyGembler by c-zhou.

the class Anchor1 method run.

// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
    // TODO Auto-generated method stub
    long tic = System.nanoTime();
    sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
    // this.makeAssemblyGraph(query_file);
    this.makeAssemblyGraph(query_file);
    bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // find 'N/n's in subject/reference sequences
    // which could have impact on parsing the blast records
    final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast record is 1-based closed coordination
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
        // initialise an array list for each reference chromosome
        anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
    }
    // parse blast records
    // blast records buffer
    final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
    // selected blast records
    final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
    // temp list
    final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
    try {
        BufferedReader br_blast = Utils.getBufferedReader(blast_out);
        Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
        Blast6Segment primary_record, secondary_record;
        String qry;
        double qry_ln, aln_frac;
        while (tmp_record != null) {
            qry = tmp_record.qseqid();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            buff.add(tmp_record);
            while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
                // filter by identity
                if (tmp_record.pident() >= this.min_ident)
                    buff.add(tmp_record);
            }
            sel_recs.clear();
            // merge collinear records
            for (String sub : sub_seqs.keySet()) {
                // get all records for subject/reference sequence sub_seq
                tmp_records.clear();
                for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
                    tmp_records.add(record);
                if (tmp_records.isEmpty())
                    continue;
                // find alignment segments that can be deleted
                // those that are subsets of larger alignment segments
                // (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
                Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
                final Set<int[][]> ranges = new HashSet<int[][]>();
                outerloop: for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    int[][] range = new int[2][2];
                    if (primary_record.sstart() < primary_record.send()) {
                        range[0][0] = primary_record.sstart();
                        range[0][1] = primary_record.send();
                    } else {
                        range[0][0] = primary_record.send();
                        range[0][1] = primary_record.sstart();
                    }
                    if (primary_record.qstart() < primary_record.qend()) {
                        range[1][0] = primary_record.qstart();
                        range[1][1] = primary_record.qend();
                    } else {
                        range[1][0] = primary_record.qend();
                        range[1][1] = primary_record.qstart();
                    }
                    for (int[][] r : ranges) {
                        if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
                            tmp_records.remove(i--);
                            continue outerloop;
                        }
                    }
                    ranges.add(range);
                }
                // TODO rewrite this part
                /**
                 *					// find collinear alignment segments that can be merged
                 *					Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
                 *
                 *					collinear_merged.clear();
                 *					final List<Blast6Record> temp = new ArrayList<Blast6Record>();
                 *					for(int i=0; i<tmp_records.size(); ) {
                 *						Blast6Record record = tmp_records.get(i);
                 *						double max_shift;
                 *						temp.clear();
                 *						temp.add(record);
                 *
                 *						// find collinear alignment segments
                 *						outerloop:
                 *							while( (++i)<tmp_records.size() ) {
                 *								record = tmp_records.get(i);
                 *								// check if is collinear with other alignment segments
                 *								for(Blast6Record r : temp) {
                 *									max_shift = collinear_shift*
                 *											Math.min(r.length(), record.length());
                 *									if(BlastRecord.sdistance(r, record)<=max_shift &&
                 *											BlastRecord.qdistance(r, record)<=max_shift &&
                 *											BlastRecord.pdistance(r, record)<=max_shift) {
                 *										temp.add(record);
                 *										continue outerloop;
                 *									}
                 *								}
                 *								break;
                 *							}
                 *
                 *						// merge collinear alignment segments
                 *						int qstart = Integer.MAX_VALUE;
                 *						int qend = Integer.MIN_VALUE;
                 *						int sstart = Integer.MAX_VALUE;
                 *						int send = Integer.MIN_VALUE;
                 *						double pident = 0;
                 *						int length = 0;
                 *
                 *						// qstart is always smaller than qend
                 *						for(int j=0; j<temp.size(); j++) {
                 *							record = temp.get(j);
                 *							if(record.qstart()<qstart) {
                 *								qstart = record.qstart();
                 *								sstart = record.sstart();
                 *							}
                 *							if(record.qend()>qend) {
                 *								qend = record.qend();
                 *								send = record.send();
                 *							}
                 *							if(record.pident()>pident)
                 *								pident = record.pident();
                 *							length += record.length();
                 *						}
                 *
                 *						collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
                 *					}
                 */
                // find collinear alignment segments that can be merged
                // more accurate but slower
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                Blast6Segment record;
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    for (int j = i + 1; j < tmp_records.size(); j++) {
                        secondary_record = tmp_records.get(j);
                        double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
                        if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
                            tmp_records.set(i, record);
                            tmp_records.remove(j);
                            --i;
                            break;
                        }
                    }
                }
                // process blast records that clipped by gaps
                // (sstart, send)---(start2, send2)
                // (sstart  ...  ---  ...    send2)
                TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    if (sub_gap.contains(primary_record.true_send())) {
                        secondary_record = null;
                        int sec_j = -1;
                        for (int j = i + 1; j < tmp_records.size(); j++) {
                            if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
                                secondary_record = tmp_records.get(j);
                                sec_j = j;
                                break;
                            }
                        }
                        if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
                            // reverse alignment segments
                            continue;
                        }
                        if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
                            // clipping
                            // merge two alignment segments
                            double pident = Math.max(primary_record.pident(), secondary_record.pident());
                            int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
                            int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
                            int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
                            int send = Math.max(primary_record.true_send(), secondary_record.true_send());
                            int length = qend - qstart + 1;
                            // replace primary record with merged record
                            // delete secondary record
                            Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
                            tmp_records.set(i, merged_record);
                            tmp_records.remove(sec_j);
                            // the merged records need to be processed
                            --i;
                        }
                    }
                }
                // add to sel_recs
                sel_recs.addAll(tmp_records);
            }
            // filter by alignment fraction
            buff.clear();
            buff.addAll(sel_recs);
            sel_recs.clear();
            for (Blast6Segment record : buff) {
                if (record.length() / qry_ln >= this.min_frac)
                    sel_recs.add(record);
            }
            if (sel_recs.isEmpty()) {
                // continue
                continue;
            }
            // filter blast records
            Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
            // process primary alignment
            primary_record = sel_recs.get(0);
            anchored_records.get(primary_record.sseqid()).add(primary_record);
            aln_frac = primary_record.length() / qry_ln;
            // and process
            for (int i = 1; i < sel_recs.size(); i++) {
                secondary_record = sel_recs.get(i);
                if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
                    break;
                } else {
                    anchored_records.get(secondary_record.sseqid()).add(secondary_record);
                }
            }
        }
        br_blast.close();
        for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue().size());
        }
        final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
        final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
        final Set<String> anchored_seqs = new HashSet<String>();
        final List<String> sub_list = Sequence.parseSeqList(subject_file);
        final List<GraphPath<Integer, DefaultWeightedEdge>> paths = new ArrayList<GraphPath<Integer, DefaultWeightedEdge>>();
        for (String sub_sn : sub_list) {
            blast6_records = anchored_records.get(sub_sn);
            int nV = blast6_records.size(), count = 0;
            // sort blast records
            Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
            // consensus
            int posUpto = 0, send_clip = 0;
            int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
            // distance to last 'N', start from next position to find longest common suffix-prefix
            int prev_n = Integer.MAX_VALUE;
            int mol_len = 0;
            int bfs_dist;
            String qseq;
            int nS, nQ;
            // first step: construct super scaffold
            // will form a tree graph indicating the path through the contigs
            // convert contig names to integer indices
            // one contig could end up with multiple indices due to repeats
            Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
            int index = 0;
            for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
            int qseqid1, qseqid2;
            String os;
            List<Double> dist_stats = new ArrayList<Double>();
            for (int i = 0; i != nV; i++) {
                qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
                os = blast6_records.get(i).qseqid() + (blast6_records.get(i).forward() ? "" : "'");
                System.out.print(Utils.fixedLengthPaddingString(os, 40) + "\t");
                double distance = Double.MAX_VALUE;
                for (int j = 1; j <= window_size; j++) {
                    if (i + j >= nV) {
                        System.out.print(Utils.fixedLengthPaddingString("", 10) + ",\t");
                        continue;
                    }
                    qseqid2 = contig_coordinate.getKey(ss_coordinate.get(i + j));
                    GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
                    // GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
                    if (e != null) {
                        double d = e.getLength();
                        if (distance > d)
                            distance = d;
                    }
                    int l = 0;
                    if (e != null) {
                        List<Integer> vertices = e.getVertexList();
                        for (int k = 1; k < vertices.size() - 1; k++) l += qry_seqs.get(contig_coordinate.get(vertices.get(k))).seq_ln();
                        l -= (vertices.size() - 2) * this.kmer_size;
                    }
                    System.out.print((e == null ? Utils.fixedLengthPaddingString("Inf " + l, 10) + ",\t" : Utils.fixedLengthPaddingString(e.getLength() + " " + l, 10) + ",\t"));
                }
                System.out.println();
                dist_stats.add(distance);
            }
            double[] dist_arr = ArrayUtils.toPrimitive(dist_stats.toArray(new Double[dist_stats.size()]));
            Arrays.sort(dist_arr);
            Percentile percentile = new Percentile();
            double q1 = percentile.evaluate(dist_arr, 25);
            double q3 = percentile.evaluate(dist_arr, 75);
            // upper bound for outliers
            // q3+1.5*IQR
            double outlier = q3 + 1.5 * (q3 - q1);
            System.out.println("Outlier upper bound: " + outlier);
            // construct a tree graph
            pseudo_tree = new DirectedWeightedPseudograph<Integer, DefaultWeightedEdge>(DefaultWeightedEdge.class);
            for (int i : ss_coordinate.keySet()) pseudo_tree.addVertex(i);
            Blast6Segment record1, record2;
            // select the nearest neighbour(s)
            for (int i = 0; i != nV; i++) {
                System.out.println(i);
                record1 = blast6_records.get(i);
                send = record1.true_send();
                qseqid1 = contig_coordinate.getKey(ss_coordinate.get(i));
                paths.clear();
                bfs_dist = Integer.MAX_VALUE;
                for (int j = i + 1; j < nV; j++) {
                    record2 = blast6_records.get(j);
                    sstart = record2.true_sstart();
                    if (send < sstart)
                        break;
                    qseqid2 = contig_coordinate.getKey(ss_coordinate.get(j));
                    GraphPath<Integer, DefaultWeightedEdge> e = bfs.getPath(qseqid1, qseqid2);
                    // GraphPath<Integer, DefaultWeightedEdge> e = dijkstra_paths.getPath( qseqid1, qseqid2 );
                    if (e != null) {
                        tmp_int = e.getLength();
                        if (tmp_int == 0)
                            continue;
                        if (tmp_int < bfs_dist)
                            paths.clear();
                        paths.add(e);
                        bfs_dist = tmp_int;
                    }
                }
                if (paths.isEmpty())
                    continue;
                for (GraphPath<Integer, DefaultWeightedEdge> path : paths) pseudo_tree.setEdgeWeight(pseudo_tree.addEdge(path.getStartVertex(), path.getEndVertex()), 1d);
            }
            // secondly fill gaps
            long toc = System.nanoTime();
            System.out.println(toc - tic + " nano secs.");
            if (true)
                throw new RuntimeException("!!!");
            // second step: construct pseudo chromosomes
            // 1. choose the first contig
            int firstv = 0;
            while (true) {
                // int v2 = next(firstv+1);
                if (true)
                    break;
            }
            // 2. extend pseudo chromosomes
            StringBuilder seq_str = new StringBuilder();
            for (int v = firstv; v < nV - 1; v++) {
                if (++count % 10000 == 0)
                    myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
                Blast6Segment record = blast6_records.get(v);
                qlen = qry_seqs.get(record.qseqid()).seq_ln();
                sstart = record.sstart();
                send = record.send();
                qstart = record.qstart();
                qend = record.qend();
                if (sstart > send) {
                    // make sure sstart<send
                    tmp_int = sstart;
                    sstart = send;
                    send = tmp_int;
                    tmp_int = qstart;
                    qstart = qend;
                    qend = tmp_int;
                }
                if (qstart > qend) {
                    qstart_clip = qlen - qstart;
                    qend_clip = qend - 1;
                    qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
                } else {
                    qstart_clip = qstart - 1;
                    qend_clip = qlen - qend;
                    qseq = qry_seqs.get(record.qseqid()).seq_str();
                }
                if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
                    // TODO process
                    continue;
                }
                // find longest suffix-prefix
                nS = seq_str.length();
                nQ = qseq.length();
                int nO = Math.min(prev_n, Math.min(nS, nQ));
                outerloop: for (; nO >= min_overlap; nO--) {
                    int nS_i = nS - nO;
                    for (int i = 0; i < nO; i++) {
                        if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
                            continue outerloop;
                    }
                    break outerloop;
                }
                if (nO < min_overlap) {
                    // otherwise will insert a large GAP max(pseduo_distance, max_gap)
                    if (posUpto > 0) {
                        if (sstart <= posUpto) {
                            // if too much overlap, then treat it as a contradiction
                            if (posUpto - sstart > min_overlap) {
                                // discard
                                continue;
                            } else {
                                // insert a min_gap
                                gap_size = min_gap;
                            }
                        } else {
                            // estimate gap size
                            gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
                            if (gap_size < max_gap)
                                gap_size = max_gap;
                        }
                        bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
                        seq_str.append(Sequence.polyN(gap_size));
                        mol_len += gap_size;
                    }
                    bw_map.write(record.qseqid() + "\t" + qlen + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write("0\t" + qlen + "\t-\t");
                    } else {
                        // forward
                        bw_map.write("0\t" + qlen + "\t+\t");
                    }
                    seq_str.append(qseq);
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
                    mol_len += qlen;
                    prev_n = qlen;
                    anchored_seqs.add(record.qseqid());
                } else {
                    // overlap found
                    // will not insert gap
                    // ====================
                    // /-------\
                    // /----\
                    // calculate overlaps
                    // process overlap
                    qstart = nO;
                    if (qstart == qlen)
                        continue;
                    bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
                    } else {
                        // forward
                        bw_map.write(qstart + "\t" + qlen + "\t+\t");
                    }
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
                    mol_len += qlen - qstart;
                    prev_n += qlen - qstart;
                    seq_str.append(qseq.substring(qstart));
                    anchored_seqs.add(record.qseqid());
                }
                posUpto = send;
                send_clip = qend_clip;
            }
            if (seq_str.length() > 0)
                bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
        }
        bw_fa.close();
        bw_map.close();
        final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
        for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
            bw_ufa.write(qry_seqs.get(seq).formatOutput());
        bw_ufa.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : Percentile(org.apache.commons.math3.stat.descriptive.rank.Percentile) TreeRangeSet(com.google.common.collect.TreeRangeSet) HashMap(java.util.HashMap) Blast6Segment(cz1.ngs.model.Blast6Segment) ArrayList(java.util.ArrayList) GraphPath(org.jgrapht.GraphPath) BufferedWriter(java.io.BufferedWriter) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) DefaultWeightedEdge(org.jgrapht.graph.DefaultWeightedEdge) AlignmentSegment(cz1.ngs.model.AlignmentSegment) BufferedReader(java.io.BufferedReader) HashMap(java.util.HashMap) Map(java.util.Map) BidiMap(org.apache.commons.collections4.BidiMap) DualHashBidiMap(org.apache.commons.collections4.bidimap.DualHashBidiMap)

Aggregations

RealMatrix (org.apache.commons.math3.linear.RealMatrix)3 ArrayRealVector (org.apache.commons.math3.linear.ArrayRealVector)2 TreeRangeSet (com.google.common.collect.TreeRangeSet)1 AlignmentSegment (cz1.ngs.model.AlignmentSegment)1 Blast6Segment (cz1.ngs.model.Blast6Segment)1 Sequence (cz1.ngs.model.Sequence)1 BufferedReader (java.io.BufferedReader)1 BufferedWriter (java.io.BufferedWriter)1 IOException (java.io.IOException)1 ArrayList (java.util.ArrayList)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 BidiMap (org.apache.commons.collections4.BidiMap)1 DualHashBidiMap (org.apache.commons.collections4.bidimap.DualHashBidiMap)1 NormalDistribution (org.apache.commons.math3.distribution.NormalDistribution)1 DimensionMismatchException (org.apache.commons.math3.exception.DimensionMismatchException)1 Array2DRowRealMatrix (org.apache.commons.math3.linear.Array2DRowRealMatrix)1 CholeskyDecomposition (org.apache.commons.math3.linear.CholeskyDecomposition)1