Search in sources :

Example 11 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

The class Anchor, method run().

// Anchors query contigs onto subject/reference sequences. Stages:
//   1. load the GFA assembly graph and the subject FASTA;
//   2. index buffered 'N'-gap ranges per subject sequence (sub_gaps);
//   3. read the SAM alignment file, bucketing retained alignment segments
//      by query (initPlace) and by subject (initPseudoAssembly);
//   4. per subject sequence, grow a local overlap graph ("razor") from each
//      unplaced seed segment and trace its best-scoring path.
@Override
public void run() {
    // TODO Auto-generated method stub
    // read assembly graph file
    final GFA gfa = new GFA(query_file, asm_graph);
    qry_seqs = gfa.getSequenceMap();
    sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    myLogger.info("  GFA vertices: " + gfa.vertexSet().size());
    myLogger.info("  GFA edges   : " + gfa.edgeSet().size());
    // myLogger.info("  GFA edges --- ");
    // for(OverlapEdge olap : gfa.edgeSet())
    // myLogger.info(olap.olapInfo().toString());
    // find 'N/n's in subject/reference sequences
    // which could have impact on parsing the blast records
    sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        // every 'N'/'n' position becomes a 1-based singleton range
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast record is 1-based closed coordination
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        // pad each gap by gap_buff on both sides (clamped to [0, seq_ln]);
        // overlapping padded ranges coalesce inside the TreeRangeSet
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
    }
    // read alignment file and place the query sequences
    // initPlace: query id -> retained segments (reverse-complement entries
    //            are keyed with a trailing "'", see below)
    // initPseudoAssembly: subject id -> all retained segments on it
    final Map<String, Set<SAMSegment>> initPlace = new HashMap<String, Set<SAMSegment>>();
    final Map<String, List<SAMSegment>> initPseudoAssembly = new HashMap<String, List<SAMSegment>>();
    for (String sub_seq : sub_seqs.keySet()) initPseudoAssembly.put(sub_seq, new ArrayList<SAMSegment>());
    try {
        final SamReaderFactory factory = SamReaderFactory.makeDefault().enable(SamReaderFactory.Option.INCLUDE_SOURCE_IN_RECORDS, SamReaderFactory.Option.VALIDATE_CRC_CHECKSUMS).validationStringency(ValidationStringency.SILENT);
        final SamReader in1 = factory.open(new File(align_file));
        final SAMRecordIterator iter1 = in1.iterator();
        String qry;
        int qry_ln;
        double min_aln;
        final List<SAMSegment> buff = new ArrayList<SAMSegment>();
        // NOTE(review): this loop assumes iter1.next() returns null once
        // exhausted and that records are grouped by read name (queryname
        // sorted input) — confirm; htsjdk iterators normally require a
        // hasNext() guard and throw when exhausted.
        SAMRecord rc = iter1.next();
        while (rc != null) {
            qry = rc.getReadName();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            if (!rc.getReadUnmappedFlag())
                buff.add(SAMSegment.samRecord(rc, true, qry_ln));
            // drain the remaining records of the same query into buff
            while ((rc = iter1.next()) != null && rc.getReadName().equals(qry)) {
                buff.add(SAMSegment.samRecord(rc, true, qry_ln));
            }
            if (buff.isEmpty())
                continue;
            // minimum aligned length: 90% of the first buffered segment's
            // query length (presumably identical for all records of a
            // query — TODO confirm)
            min_aln = 0.9 * buff.get(0).qlength();
            // keep alignment fragment that has qual>0
            Set<SAMSegment> init_f = new HashSet<SAMSegment>();
            Set<SAMSegment> init_r = new HashSet<SAMSegment>();
            for (SAMSegment record : buff) {
                // NOTE(review): discards only when qual==0 AND too short;
                // verify the conjunction (rather than ||) is intended
                if (record.qual() == 0 && record.qlength() < min_aln)
                    continue;
                // segments whose qseqid equals the read name are forward;
                // anything else is treated as the reverse-complement copy
                if (record.qseqid().equals(qry))
                    init_f.add(record);
                else
                    init_r.add(record);
                initPseudoAssembly.get(record.sseqid()).add(record);
            }
            if (!init_f.isEmpty())
                initPlace.put(qry, init_f);
            if (!init_r.isEmpty())
                initPlace.put(qry + "'", init_r);
        }
        iter1.close();
        in1.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
    // Collections.sort(initPseudoAssembly.get("1_pilon"), new AlignmentSegment.SubjectCoordinationComparator());
    // if(debug) {
    // for(SAMSegment record : initPseudoAssembly.get("1_pilon")) {
    // System.out.println(record.qseqid()+":"+record.sstart()+"-"+record.send());
    // }
    // }
    final Set<SAMSegment> contained = new HashSet<SAMSegment>();
    final Set<SAMSegment> placed = new HashSet<SAMSegment>();
    // maximum subject-coordinate distance allowed when linking two segments
    final int flank_size = 10000;
    int distance;
    for (String sub_seq : sub_seqs.keySet()) {
        // sub_seq = "Chr10";
        // "Chr00" is skipped — presumably the bin for unplaced contigs;
        // TODO confirm against the reference naming convention
        if (sub_seq.equals("Chr00"))
            continue;
        myLogger.info(">>>>>>>>>>>>>" + sub_seq + "<<<<<<<<<<<<<<<<");
        final List<SAMSegment> seq_by_sub = initPseudoAssembly.get(sub_seq);
        Collections.sort(seq_by_sub, new AlignmentSegment.SubjectCoordinationComparator());
        placed.clear();
        int nSeq = seq_by_sub.size();
        double edge_penalty, edge_score;
        SAMSegment root_seq, source_seq, target_seq;
        Set<SAMSegment> target_seqs;
        Set<OverlapEdge> outgoing;
        TraceableEdge edge;
        String root_seqid, source_seqid, target_seqid;
        TraceableVertex<String> root_vertex, source_vertex, target_vertex;
        Deque<SAMSegment> deque = new ArrayDeque<SAMSegment>();
        final List<TraceableVertex<String>> traceable = new ArrayList<TraceableVertex<String>>();
        for (int i = 0; i < nSeq; i++) {
            root_seq = seq_by_sub.get(i);
            root_seqid = root_seq.qseqid();
            if (placed.contains(root_seq))
                continue;
            // "razor": local overlap graph grown from root_seq by following
            // GFA edges whose targets also align nearby on this subject
            final TraceableDirectedWeightedPseudograph<String> razor = new TraceableDirectedWeightedPseudograph<String>(TraceableEdge.class);
            // final ListenableDirectedWeightedGraph<TraceableVertex<String>, DefaultWeightedEdge> razor =
            // new ListenableDirectedWeightedGraph<TraceableVertex<String>, DefaultWeightedEdge>(DefaultWeightedEdge.class);
            // JGraphModelAdapter<TraceableVertex<String>, DefaultWeightedEdge> jgAdapter =
            // new JGraphModelAdapter<TraceableVertex<String>, DefaultWeightedEdge>(razor);
            // JGraph jgraph = new JGraph(jgAdapter);
            deque.clear();
            deque.push(root_seq);
            contained.clear();
            while (!deque.isEmpty()) {
                source_seq = deque.pop();
                source_seqid = source_seq.qseqid();
                if (contained.contains(source_seq))
                    continue;
                contained.add(source_seq);
                source_vertex = new TraceableVertex<String>(source_seqid);
                source_vertex.setSAMSegment(source_seq);
                if (!razor.containsVertex(source_vertex))
                    razor.addVertex(source_vertex);
                outgoing = gfa.outgoingEdgesOf(source_seqid);
                for (OverlapEdge out : outgoing) {
                    target_seqid = gfa.getEdgeTarget(out);
                    if (!initPlace.containsKey(target_seqid))
                        continue;
                    // pick the target's alignment segment closest (in
                    // subject coordinates) to the current source segment
                    target_seqs = initPlace.get(target_seqid);
                    distance = Integer.MAX_VALUE;
                    target_seq = null;
                    for (SAMSegment seq : target_seqs) {
                        int d = AlignmentSegment.sdistance(source_seq, seq);
                        if (d < distance) {
                            distance = d;
                            target_seq = seq;
                        }
                    }
                    if (distance <= flank_size) {
                        target_vertex = new TraceableVertex<String>(target_seqid);
                        target_vertex.setSAMSegment(target_seq);
                        if (!razor.containsVertex(target_vertex))
                            razor.addVertex(target_vertex);
                        if (razor.containsEdge(source_vertex, target_vertex))
                            continue;
                        edge = razor.addEdge(source_vertex, target_vertex);
                        // calculate edge weight
                        // higher weight edges are those,
                        /**
                         **
                         *							//       1.  large/long alignment segments vertices
                         *							// TODO: 2*. small gaps on the reference
                         *							edge_weight = qry_seqs.get(source_seqid).seq_ln()+
                         *									qry_seqs.get(target_seqid).seq_ln()-
                         *									gfa.getEdge(source_seqid, target_seqid).olap();
                         */
                        // TODO: 1*. large/long alignment segments vertices
                        // 2.  small gaps on the reference
                        // penalty = subject-coordinate gap between the two
                        // segments; score = combined query length minus the
                        // GFA overlap (i.e. non-redundant query bases)
                        edge_penalty = AlignmentSegment.sdistance(source_seq, target_seq);
                        edge.setPenalty(edge_penalty);
                        edge_score = qry_seqs.get(source_seqid).seq_ln() + qry_seqs.get(target_seqid).seq_ln() - gfa.getEdge(source_seqid, target_seqid).olap();
                        edge.setScore(edge_score);
                        deque.push(target_seq);
                    }
                }
            }
            if (ddebug)
                myLogger.info(root_seqid + " " + razor.vertexSet().size() + " " + razor.edgeSet().size() + " done");
            // JFrame frame = new JFrame();
            // frame.getContentPane().add(jgraph);
            // frame.setDefaultCloseOperation(JFrame.EXIT_ON_CLOSE);
            // frame.pack();
            // frame.setVisible(true);
            // "pseudo"-DFS to find the route with the highest score
            final Map<String, TraceableVertex<String>> razv_map = new HashMap<String, TraceableVertex<String>>();
            for (TraceableVertex<String> v : razor.vertexSet()) razv_map.put(v.getId(), v);
            // we use a bidirectional hashmap to simulate the deque
            // this is because we may need to do deletions
            // Deque<TraceableVertex<String>> queue = new ArrayDeque<TraceableVertex<String>>();
            final TreeBidiMap<Long, TraceableVertex<String>> bidiQ = new TreeBidiMap<Long, TraceableVertex<String>>();
            root_vertex = razv_map.get(root_seqid);
            root_vertex.setSAMSegment(root_seq);
            root_vertex.setScore(qry_seqs.get(root_seqid).seq_ln());
            root_vertex.setPenalty(0);
            root_vertex.setStatus(true);
            bidiQ.put(0L, root_vertex);
            double max_ws = Double.NEGATIVE_INFINITY, source_penalty, target_penalty, source_score, target_score, penalty, score, target_ws, source_ws, ws;
            int source_ln;
            Set<TraceableEdge> out_edges;
            TraceableVertex<String> opt_vertex = null;
            long sizeQ;
            boolean isLeaf;
            if (ddebug)
                for (TraceableEdge e : razor.edgeSet()) myLogger.info(e.toString() + "(" + razor.getEdgeSource(e).getSAMSegment().toString() + "|" + razor.getEdgeTarget(e).getSAMSegment().toString() + "|" + e.getScore() + "-" + e.getPenalty() + ")");
            // pop the entry with the largest key, relax its outgoing edges,
            // and re-queue improved targets; score/penalty accumulate along
            // the back-trace chain
            while (!bidiQ.isEmpty()) {
                sizeQ = bidiQ.lastKey();
                source_vertex = bidiQ.get(sizeQ);
                bidiQ.remove(sizeQ);
                source_ln = qry_seqs.get(source_vertex.getId()).seq_ln();
                source_score = source_vertex.getScore() - source_ln;
                source_penalty = source_vertex.getPenalty();
                source_ws = source_score - source_penalty;
                isLeaf = true;
                out_edges = razor.outgoingEdgesOf(source_vertex);
                for (TraceableEdge out : out_edges) {
                    // this is not right because graph edges are immutable?
                    // target_vertex = razor.getEdgeTarget(out);
                    target_vertex = razv_map.get(razor.getEdgeTarget(out).getId());
                    target_score = target_vertex.getScore();
                    target_penalty = target_vertex.getPenalty();
                    target_ws = target_score - target_penalty;
                    edge_penalty = out.getPenalty();
                    penalty = source_penalty + edge_penalty;
                    edge_score = out.getScore();
                    score = source_score + edge_score;
                    ws = score - penalty;
                    // skip when the edge gap is too wide, or the target was
                    // already reached with an equal-or-better weighted score
                    // or relaxing would close a cycle; note && binds tighter
                    // than ||, so getStatus() guards only the latter checks
                    if (edge_penalty > flank_size || target_vertex.getStatus() && (ws <= target_ws || isLoopback(razor, source_vertex, target_vertex)))
                        continue;
                    isLeaf = false;
                    target_vertex.setBackTrace(source_vertex);
                    target_vertex.setScore(score);
                    target_vertex.setPenalty(penalty);
                    target_vertex.setStatus(true);
                    bidiQ.put(sizeQ++, target_vertex);
                }
                // a vertex that relaxed no edge terminates a path; keep the
                // terminal with the best weighted score seen so far
                if (isLeaf && source_ws > max_ws) {
                    penalty = source_vertex.getPenalty();
                    score = source_vertex.getScore();
                    max_ws = source_ws;
                    opt_vertex = source_vertex;
                    if (ddebug) {
                        String trace = opt_vertex.toString() + ":" + opt_vertex.getSAMSegment().sstart() + "-" + opt_vertex.getSAMSegment().send() + "(" + opt_vertex.getScore() + "-" + opt_vertex.getPenalty() + ")";
                        TraceableVertex<String> optx = opt_vertex;
                        while ((optx = optx.getBackTrace()) != null) {
                            trace += "," + optx.toString() + ":" + optx.getSAMSegment().sstart() + "-" + optx.getSAMSegment().send() + "(" + optx.getScore() + "-" + optx.getPenalty() + ")";
                        }
                        myLogger.info("trace back [" + score + ", " + penalty + "]: " + trace);
                    }
                }
            }
            // NOTE(review): if no leaf was ever recorded opt_vertex stays
            // null and the dereferences below would NPE — confirm this
            // cannot happen for a non-empty razor graph
            traceable.add(opt_vertex);
            // mark every segment on the optimal back-trace as placed so it
            // cannot seed another razor graph
            Set<TraceableVertex<String>> optx = new HashSet<TraceableVertex<String>>();
            optx.add(opt_vertex);
            while ((opt_vertex = opt_vertex.getBackTrace()) != null) optx.add(opt_vertex);
            for (TraceableVertex<String> v : optx) placed.add(v.getSAMSegment());
        }
        // sort traceable by size
        Collections.sort(traceable, new Comparator<TraceableVertex<String>>() {

            @Override
            public int compare(TraceableVertex<String> t0, TraceableVertex<String> t1) {
                // TODO Auto-generated method stub
                // descending order by score
                return Double.compare(t1.getScore(), t0.getScore());
            }
        });
        if (debug) {
            for (TraceableVertex<String> opt_vertex : traceable) {
                double score = opt_vertex.getScore();
                double penalty = opt_vertex.getPenalty();
                String trace = opt_vertex.toString() + ":" + opt_vertex.getSAMSegment().sstart() + "-" + opt_vertex.getSAMSegment().send() + "(" + opt_vertex.getScore() + "-" + opt_vertex.getPenalty() + ")";
                while ((opt_vertex = opt_vertex.getBackTrace()) != null) {
                    trace += "," + opt_vertex.toString() + ":" + opt_vertex.getSAMSegment().sstart() + "-" + opt_vertex.getSAMSegment().send() + "(" + opt_vertex.getScore() + "-" + opt_vertex.getPenalty() + ")";
                }
                myLogger.info("trace back [" + score + ", " + penalty + "]: " + trace);
            }
        }
        // we generate a compound alignment record for each traceable
        // TODO: loop body not yet implemented
        for (TraceableVertex<String> opt_vertex : traceable) {
        }
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ArrayList(java.util.ArrayList) List(java.util.List) HashSet(java.util.HashSet) OverlapEdge(cz1.ngs.model.OverlapEdge) Sequence(cz1.ngs.model.Sequence) SAMSegment(cz1.ngs.model.SAMSegment) AlignmentSegment(cz1.ngs.model.AlignmentSegment) TraceableVertex(cz1.ngs.model.TraceableVertex) SAMRecord(htsjdk.samtools.SAMRecord) HashMap(java.util.HashMap) Map(java.util.Map) BidiMap(org.apache.commons.collections4.BidiMap) TreeBidiMap(org.apache.commons.collections4.bidimap.TreeBidiMap) File(java.io.File) GFA(cz1.ngs.model.GFA) TreeRangeSet(com.google.common.collect.TreeRangeSet) HashSet(java.util.HashSet) Set(java.util.Set) TreeRangeSet(com.google.common.collect.TreeRangeSet) TraceableEdge(cz1.ngs.model.TraceableEdge) SamReader(htsjdk.samtools.SamReader) TraceableDirectedWeightedPseudograph(cz1.ngs.model.TraceableDirectedWeightedPseudograph) TreeBidiMap(org.apache.commons.collections4.bidimap.TreeBidiMap) SamReaderFactory(htsjdk.samtools.SamReaderFactory) IOException(java.io.IOException) ArrayDeque(java.util.ArrayDeque)

Example 12 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

The class Consensus, method parseScaffold().

// Stitches contigs into scaffold sequences according to an AGP-style map
// file, writes every scaffold of length >= minL (plus unplaced contigs) to
// out_fa, and prints basic statistics (total length, gap length, N50).
// assumes agp_map whitespace-separated columns are: contig, start, end,
// index, orientation, scaffold, anchor_start, anchor_end[, ., ref] — TODO
// confirm the exact column layout
private final void parseScaffold(final String contig_fa, final String agp_map, final int minL, final String out_fa) {
    final Map<String, Sequence> contig_map = Sequence.parseFastaFileAsMap(contig_fa);
    try {
        BufferedReader br_agp = Utils.getBufferedReader(agp_map);
        final List<Sequence> scaffolds = new ArrayList<Sequence>();
        final List<Segment> segments = new ArrayList<Segment>();
        final StringBuilder str_buf = new StringBuilder();
        String line = br_agp.readLine();
        String[] s;
        String seq_sn, ref_sn, anchor_start, anchor_end;
        Sequence contig;
        final Set<String> anchored_seqs = new HashSet<String>();
        while (line != null) {
            // start a new scaffold group keyed by column 5 (scaffold id)
            s = line.split("\\s+");
            seq_sn = s[5];
            anchor_start = s[6];
            segments.clear();
            segments.add(new Segment(s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), Integer.parseInt(s[3]), s[4], s[5], Integer.parseInt(s[6]), Integer.parseInt(s[7])));
            // collect the remaining rows of the same scaffold
            while ((line = br_agp.readLine()) != null) {
                s = line.split("\\s+");
                if (s[5].equals(seq_sn))
                    segments.add(new Segment(s[0], Integer.parseInt(s[1]), Integer.parseInt(s[2]), Integer.parseInt(s[3]), s[4], s[5], Integer.parseInt(s[6]), Integer.parseInt(s[7])));
                else
                    break;
            }
            // NOTE(review): when the inner loop exits via 'break', s already
            // holds the NEXT scaffold's first row, so anchor_end/ref_sn
            // below come from the wrong record — confirm whether intended
            anchor_end = s[7];
            ref_sn = s.length > 9 ? s[9] + ":" + anchor_start + "_" + anchor_end : null;
            str_buf.setLength(0);
            // concatenate segments: gaps become runs of N, contig segments
            // are copied (reverse-complemented when seq_rev is set)
            for (Segment seg : segments) {
                if (seg.type == MAP_ENUM.GAP) {
                    str_buf.append(Sequence.polyN(seg.seq_ln));
                } else {
                    // NOTE(review): a contig named in the map but absent
                    // from contig_fa would NPE here — confirm inputs agree
                    contig = contig_map.get(seg.seq_sn);
                    str_buf.append(seg.seq_rev ? Sequence.revCompSeq(contig.seq_str().substring(seg.seq_start, seg.seq_end)) : contig.seq_str().substring(seg.seq_start, seg.seq_end));
                    anchored_seqs.add(seg.seq_sn);
                }
            }
            // trim leading/trailing runs of N before storing the scaffold
            scaffolds.add(new Sequence(ref_sn, str_buf.toString().replaceAll("N{1,}$", "").replaceAll("^N{1,}", "")));
        }
        br_agp.close();
        // contigs used in a scaffold are removed; the rest stay unplaced
        for (String seq : anchored_seqs) contig_map.remove(seq);
        BufferedWriter bw_fa = Utils.getBufferedWriter(out_fa);
        // whether to add unplaced contigs ?
        // alternatively can refer to the map file and do it later
        System.err.println("Buffer contigs...");
        for (Map.Entry<String, Sequence> entry : contig_map.entrySet()) scaffolds.add(entry.getValue());
        System.err.println("Sort scaffolds...");
        // sort descending by Sequence natural order (presumably length —
        // TODO confirm Sequence.compareTo)
        Collections.sort(scaffolds);
        Collections.reverse(scaffolds);
        int n = scaffolds.size(), rm = 0, seq_ln;
        String seq_str;
        System.err.println(n + " scaffolds to parse...");
        for (int i = 0; i != n; i++) {
            if (i % 10000 == 0)
                System.err.println(i + " scaffolds parsed...");
            contig = scaffolds.get(i);
            // list is sorted descending, so the first too-short entry ends
            // the output loop; rm counts the sequences actually written
            if (contig.seq_ln() < minL || contig.seq_ln() == 0)
                break;
            rm++;
            // bw_fa.write(contig.seq_str().contains("N") ? ">S" : ">C");
            bw_fa.write(">S");
            bw_fa.write(String.format("%08d", i + 1));
            if (contig.seq_sn() != null)
                bw_fa.write("|" + contig.seq_sn());
            bw_fa.write("\n");
            /**
             *				// not too slow as string buffer insertion is O(n)
             *				str_buf.setLength(0);
             *				str_buf.append(contig.seq_str);
             *
             *				int offset=0, seq_ln=str_buf.length();
             *				while(offset<seq_ln) {
             *					offset += lineWidth;
             *					str_buf.insert( Math.min(offset, seq_ln), "\n" );
             *					seq_ln++;
             *					offset++;
             *				}
             *
             *				bw_fa.write(str_buf.toString());
             */
            // wrap the sequence at lineWidth columns, one char at a time
            seq_str = contig.seq_str();
            seq_ln = contig.seq_ln();
            bw_fa.write(seq_str.charAt(0));
            for (int j = 1; j != seq_ln; j++) {
                if (j % lineWidth == 0)
                    bw_fa.write("\n");
                bw_fa.write(seq_str.charAt(j));
            }
            bw_fa.write("\n");
        }
        bw_fa.close();
        System.err.println("Calculate statistics...");
        // for(int i=rm; i!=n; i++) scaffolds.remove(i);
        // drop entries that were not written before computing statistics
        scaffolds.subList(rm, scaffolds.size()).clear();
        int gap_ln = 0;
        seq_ln = 0;
        for (int i = 0; i != rm; i++) {
            seq_ln += scaffolds.get(i).seq_ln();
            gap_ln += StringUtils.countMatches(scaffolds.get(i).seq_str(), "N");
        }
        System.err.println("# SEQ_LEN\t" + seq_ln);
        System.err.println("# GAP_LEN\t" + gap_ln);
        System.err.println("#     N50\t" + calcWeightedMedianSatistic(scaffolds, 0.5));
    } catch (IOException e) {
        e.printStackTrace();
    }
}
Also used : ArrayList(java.util.ArrayList) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) AlignmentSegment(cz1.ngs.model.AlignmentSegment) Blast6Segment(cz1.ngs.model.Blast6Segment) BufferedWriter(java.io.BufferedWriter) BufferedReader(java.io.BufferedReader) Map(java.util.Map) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) HashSet(java.util.HashSet)

Example 13 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

The class Consensus, method parseContigFromBam().

/**
 * Builds a map from contig name to a {@link Sequence} (constructed from the
 * contig's index and length) using the sequence dictionary in the header of
 * the first BAM file in {@code bam_list}.
 *
 * @return contig name -> Sequence; empty if reading the header fails
 */
private Map<String, Sequence> parseContigFromBam() {
    final Map<String, Sequence> contig_list = new HashMap<String, Sequence>();
    // try-with-resources: the original opened the reader outside the try
    // block, so an exception while reading the header leaked the open
    // file handle and close() was skipped entirely on failure
    try (final SamReader in1 = factory.open(new File(this.bam_list[0]))) {
        final List<SAMSequenceRecord> seqs = in1.getFileHeader().getSequenceDictionary().getSequences();
        for (SAMSequenceRecord seq : seqs)
            contig_list.put(seq.getSequenceName(), new Sequence(seq.getSequenceIndex(), seq.getSequenceLength()));
    } catch (IOException e) {
        // best-effort: an unreadable BAM yields an empty map (as before)
        e.printStackTrace();
    }
    return contig_list;
}
Also used : SamReader(htsjdk.samtools.SamReader) HashMap(java.util.HashMap) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) File(java.io.File)

Example 14 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

The class Anchor, method makeAssemblyGraph().

/**
 * Builds the assembly overlap graph from the query FASTA (forward and
 * reverse-complement copies): every sequence becomes a vertex, and an edge
 * i -> j (weight 1.0) is added whenever the length-{@code kmer_size} suffix
 * of sequence i equals the prefix of sequence j.
 *
 * @param query_file path to the query FASTA file
 */
private void makeAssemblyGraph(String query_file) {
    List<Sequence> qry_seqs = Sequence.parseFastaFileWithRevCmpAsList(query_file);
    // k-mer -> indices of sequences starting (pref) / ending (suff) with it
    Map<String, Set<Integer>> pref_mer = new HashMap<String, Set<Integer>>();
    Map<String, Set<Integer>> suff_mer = new HashMap<String, Set<Integer>>();
    String seq_str, kmer;
    int seq_ln;
    Sequence seq;
    for (int i = 0; i < qry_seqs.size(); i++) {
        seq = qry_seqs.get(i);
        contig_coordinate.put(i, seq.seq_sn());
        assembly_graph.addVertex(i);
        seq_str = seq.seq_str();
        seq_ln = seq.seq_ln();
        // FIX: guard sequences shorter than kmer_size — the unguarded
        // substring calls threw StringIndexOutOfBoundsException; such
        // sequences keep their vertex but are not k-mer indexed
        if (seq_ln < kmer_size)
            continue;
        kmer = seq_str.substring(0, kmer_size);
        if (!pref_mer.containsKey(kmer))
            pref_mer.put(kmer, new HashSet<Integer>());
        pref_mer.get(kmer).add(i);
        kmer = seq_str.substring(seq_ln - kmer_size, seq_ln);
        if (!suff_mer.containsKey(kmer))
            suff_mer.put(kmer, new HashSet<Integer>());
        suff_mer.get(kmer).add(i);
    }
    Set<String> pref_set = pref_mer.keySet();
    Set<String> suff_set = suff_mer.keySet();
    // link every suffix occurrence to every matching prefix occurrence
    for (String mer : suff_set) // from a suff_mer to a pref_mer
    if (pref_set.contains(mer))
        for (Integer i : suff_mer.get(mer)) for (Integer j : pref_mer.get(mer)) assembly_graph.setEdgeWeight(assembly_graph.addEdge(i, j), 1.0);
    // FIX: corrected "vetices" typo in the log message
    myLogger.info("Assembly graph " + assembly_graph.vertexSet().size() + " vertices and " + assembly_graph.edgeSet().size() + " edges.");
    return;
}
Also used : TreeRangeSet(com.google.common.collect.TreeRangeSet) HashSet(java.util.HashSet) Set(java.util.Set) HashMap(java.util.HashMap) Sequence(cz1.ngs.model.Sequence) HashSet(java.util.HashSet)

Example 15 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

The class Anchor, method run().

// private static DijkstraShortestPath<Integer, DefaultWeightedEdge> dijkstra_paths;
@Override
public void run() {
    // TODO Auto-generated method stub
    long tic = System.nanoTime();
    sub_seqs = Sequence.parseFastaFileAsMap(subject_file);
    qry_seqs = Sequence.parseFastaFileWithRevCmpAsMap(query_file);
    // this.makeAssemblyGraph(query_file);
    this.makeAssemblyGraph(query_file);
    bfs = new BFSShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // dijkstra_paths = new DijkstraShortestPath<Integer, DefaultWeightedEdge>(assembly_graph);
    // find 'N/n's in subject/reference sequences
    // which could have impact on parsing the blast records
    final Map<String, TreeRangeSet<Integer>> sub_gaps = new HashMap<String, TreeRangeSet<Integer>>();
    final Map<String, List<Blast6Segment>> anchored_records = new HashMap<String, List<Blast6Segment>>();
    for (Map.Entry<String, Sequence> entry : sub_seqs.entrySet()) {
        String seq_sn = entry.getKey();
        String seq_str = entry.getValue().seq_str();
        final TreeRangeSet<Integer> tmp_rangeSet = TreeRangeSet.create();
        for (int j = 0; j < seq_str.length(); j++) {
            if (seq_str.charAt(j) == 'N' || seq_str.charAt(j) == 'n')
                // blast record is 1-based closed coordination
                tmp_rangeSet.add(Range.closed(j + 1, j + 1).canonical(DiscreteDomain.integers()));
        }
        int seq_ln = seq_str.length();
        final TreeRangeSet<Integer> range_set = TreeRangeSet.create();
        for (Range<Integer> range : tmp_rangeSet.asRanges()) {
            int lowerend = range.hasLowerBound() ? Math.max(0, range.lowerEndpoint() - gap_buff) : 0;
            int upperend = range.hasUpperBound() ? Math.min(seq_ln, range.upperEndpoint() + gap_buff - 1) : seq_ln;
            range_set.add(Range.closed(lowerend, upperend).canonical(DiscreteDomain.integers()));
        }
        sub_gaps.put(seq_sn, range_set);
        // initialise an array list for each reference chromosome
        anchored_records.put(seq_sn, new ArrayList<Blast6Segment>());
    }
    // parse blast records
    // blast records buffer
    final List<Blast6Segment> buff = new ArrayList<Blast6Segment>();
    // selected blast records
    final List<Blast6Segment> sel_recs = new ArrayList<Blast6Segment>();
    // temp list
    final List<Blast6Segment> tmp_records = new ArrayList<Blast6Segment>();
    try {
        BufferedReader br_blast = Utils.getBufferedReader(blast_out);
        Blast6Segment tmp_record = Blast6Segment.blast6Segment(br_blast.readLine());
        Blast6Segment primary_record, secondary_record;
        String qry;
        double qry_ln, aln_frac;
        while (tmp_record != null) {
            qry = tmp_record.qseqid();
            qry_ln = qry_seqs.get(qry).seq_ln();
            buff.clear();
            buff.add(tmp_record);
            while ((tmp_record = Blast6Segment.blast6Segment(br_blast.readLine())) != null && tmp_record.qseqid().equals(qry)) {
                // filter by identity
                if (tmp_record.pident() >= this.min_ident)
                    buff.add(tmp_record);
            }
            sel_recs.clear();
            // merge collinear records
            for (String sub : sub_seqs.keySet()) {
                // get all records for subject/reference sequence sub_seq
                tmp_records.clear();
                for (Blast6Segment record : buff) if (record.sseqid().equals(sub))
                    tmp_records.add(record);
                if (tmp_records.isEmpty())
                    continue;
                // find alignment segments that can be deleted
                // those that are subsets of larger alignment segments
                // (Sstart, (sstart, send), Send) and (Qstart, (qstart, qend), Qend)
                Collections.sort(tmp_records, new Blast6Segment.SegmentSizeComparator());
                final Set<int[][]> ranges = new HashSet<int[][]>();
                outerloop: for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    int[][] range = new int[2][2];
                    if (primary_record.sstart() < primary_record.send()) {
                        range[0][0] = primary_record.sstart();
                        range[0][1] = primary_record.send();
                    } else {
                        range[0][0] = primary_record.send();
                        range[0][1] = primary_record.sstart();
                    }
                    if (primary_record.qstart() < primary_record.qend()) {
                        range[1][0] = primary_record.qstart();
                        range[1][1] = primary_record.qend();
                    } else {
                        range[1][0] = primary_record.qend();
                        range[1][1] = primary_record.qstart();
                    }
                    for (int[][] r : ranges) {
                        if (r[0][0] <= range[0][0] && r[0][1] >= range[0][1] && r[1][0] <= range[1][0] && r[1][1] >= range[1][1]) {
                            tmp_records.remove(i--);
                            continue outerloop;
                        }
                    }
                    ranges.add(range);
                }
                // TODO rewrite this part
                /**
                 *					// find collinear alignment segments that can be merged
                 *					Collections.sort(tmp_records, new BlastRecord.SInterceptComparator());
                 *
                 *					collinear_merged.clear();
                 *					final List<Blast6Record> temp = new ArrayList<Blast6Record>();
                 *					for(int i=0; i<tmp_records.size(); ) {
                 *						Blast6Record record = tmp_records.get(i);
                 *						double max_shift;
                 *						temp.clear();
                 *						temp.add(record);
                 *
                 *						// find collinear alignment segments
                 *						outerloop:
                 *							while( (++i)<tmp_records.size() ) {
                 *								record = tmp_records.get(i);
                 *								// check if is collinear with other alignment segments
                 *								for(Blast6Record r : temp) {
                 *									max_shift = collinear_shift*
                 *											Math.min(r.length(), record.length());
                 *									if(BlastRecord.sdistance(r, record)<=max_shift &&
                 *											BlastRecord.qdistance(r, record)<=max_shift &&
                 *											BlastRecord.pdistance(r, record)<=max_shift) {
                 *										temp.add(record);
                 *										continue outerloop;
                 *									}
                 *								}
                 *								break;
                 *							}
                 *
                 *						// merge collinear alignment segments
                 *						int qstart = Integer.MAX_VALUE;
                 *						int qend = Integer.MIN_VALUE;
                 *						int sstart = Integer.MAX_VALUE;
                 *						int send = Integer.MIN_VALUE;
                 *						double pident = 0;
                 *						int length = 0;
                 *
                 *						// qstart is always smaller than qend
                 *						for(int j=0; j<temp.size(); j++) {
                 *							record = temp.get(j);
                 *							if(record.qstart()<qstart) {
                 *								qstart = record.qstart();
                 *								sstart = record.sstart();
                 *							}
                 *							if(record.qend()>qend) {
                 *								qend = record.qend();
                 *								send = record.send();
                 *							}
                 *							if(record.pident()>pident)
                 *								pident = record.pident();
                 *							length += record.length();
                 *						}
                 *
                 *						collinear_merged.add(new Blast6Record(qry,sub,pident,length,-1,-1,qstart,qend,sstart,send,-1,-1));
                 *					}
                 */
                // find collinear alignment segments that can be merged
                // more accurate but slower
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                Blast6Segment record;
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    for (int j = i + 1; j < tmp_records.size(); j++) {
                        secondary_record = tmp_records.get(j);
                        double max_shift = collinear_shift * Math.min(primary_record.length(), secondary_record.length());
                        if ((record = Blast6Segment.collinear(primary_record, secondary_record, max_shift)) != null) {
                            tmp_records.set(i, record);
                            tmp_records.remove(j);
                            --i;
                            break;
                        }
                    }
                }
                // process blast records that clipped by gaps
                // (sstart, send)---(start2, send2)
                // (sstart  ...  ---  ...    send2)
                TreeRangeSet<Integer> sub_gap = sub_gaps.get(sub);
                Collections.sort(tmp_records, new AlignmentSegment.SubjectCoordinationComparator());
                for (int i = 0; i < tmp_records.size(); i++) {
                    primary_record = tmp_records.get(i);
                    if (sub_gap.contains(primary_record.true_send())) {
                        secondary_record = null;
                        int sec_j = -1;
                        for (int j = i + 1; j < tmp_records.size(); j++) {
                            if (tmp_records.get(j).true_sstart() >= primary_record.true_send()) {
                                secondary_record = tmp_records.get(j);
                                sec_j = j;
                                break;
                            }
                        }
                        if (secondary_record == null || AlignmentSegment.reverse(primary_record, secondary_record)) {
                            // reverse alignment segments
                            continue;
                        }
                        if (sub_gap.contains(secondary_record.true_sstart()) && sub_gap.rangeContaining(primary_record.true_send()).equals(sub_gap.rangeContaining(secondary_record.true_sstart()))) {
                            // clipping
                            // merge two alignment segments
                            double pident = Math.max(primary_record.pident(), secondary_record.pident());
                            int qstart = Math.min(primary_record.true_qstart(), secondary_record.true_qstart());
                            int qend = Math.max(primary_record.true_qend(), secondary_record.true_qend());
                            int sstart = Math.min(primary_record.true_sstart(), secondary_record.true_sstart());
                            int send = Math.max(primary_record.true_send(), secondary_record.true_send());
                            int length = qend - qstart + 1;
                            // replace primary record with merged record
                            // delete secondary record
                            Blast6Segment merged_record = primary_record.forward() ? new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, sstart, send, -1, -1) : new Blast6Segment(qry, sub, pident, length, -1, -1, qstart, qend, send, sstart, -1, -1);
                            tmp_records.set(i, merged_record);
                            tmp_records.remove(sec_j);
                            // the merged records need to be processed
                            --i;
                        }
                    }
                }
                // add to sel_recs
                sel_recs.addAll(tmp_records);
            }
            // filter by alignment fraction
            buff.clear();
            buff.addAll(sel_recs);
            sel_recs.clear();
            for (Blast6Segment record : buff) {
                if (record.length() / qry_ln >= this.min_frac)
                    sel_recs.add(record);
            }
            if (sel_recs.isEmpty()) {
                // continue
                continue;
            }
            // filter blast records
            Collections.sort(sel_recs, new Blast6Segment.MatchIndentityComparator());
            // process primary alignment
            primary_record = sel_recs.get(0);
            anchored_records.get(primary_record.sseqid()).add(primary_record);
            aln_frac = primary_record.length() / qry_ln;
            // and process
            for (int i = 1; i < sel_recs.size(); i++) {
                secondary_record = sel_recs.get(i);
                if (secondary_record.pident() + this.diff_ident < primary_record.pident() || secondary_record.length() / qry_ln + this.diff_frac < aln_frac) {
                    break;
                } else {
                    anchored_records.get(secondary_record.sseqid()).add(secondary_record);
                }
            }
        }
        br_blast.close();
        for (Map.Entry<String, List<Blast6Segment>> entry : anchored_records.entrySet()) {
            System.out.println(entry.getKey() + ": " + entry.getValue().size());
        }
        final BufferedWriter bw_map = Utils.getBufferedWriter(out_prefix + ".map");
        final BufferedWriter bw_fa = Utils.getBufferedWriter(out_prefix + ".fa");
        final Set<String> anchored_seqs = new HashSet<String>();
        final List<String> sub_list = Sequence.parseSeqList(subject_file);
        for (String sub_sn : sub_list) {
            blast6_records = anchored_records.get(sub_sn);
            int nV = blast6_records.size(), count = 0;
            // sort blast records
            Collections.sort(blast6_records, new Blast6Segment.SubjectCoordinationComparator());
            // consensus
            int posUpto = 0, send_clip = 0;
            int sstart, send, qstart, qend, qlen, tmp_int, qstart_clip, qend_clip, gap_size;
            // distance to last 'N', start from next position to find longest common suffix-prefix
            int prev_n = Integer.MAX_VALUE;
            int mol_len = 0;
            String qseq;
            int nS, nQ;
            // first step: construct super scaffold
            // will form a tree graph indicating the path through the contigs
            // convert contig names to integer indices
            // one contig could end up with multiple indices due to repeats
            Map<Integer, String> ss_coordinate = new HashMap<Integer, String>();
            int index = 0;
            for (Blast6Segment record : blast6_records) ss_coordinate.put(index++, record.qseqid() + (record.forward() ? "" : "'"));
            StringBuilder seq_str = new StringBuilder();
            for (int v = 0; v < nV - 1; v++) {
                if (++count % 10000 == 0)
                    myLogger.info(sub_sn + " " + count + "/" + nV + " done.");
                Blast6Segment record = blast6_records.get(v);
                qlen = qry_seqs.get(record.qseqid()).seq_ln();
                sstart = record.sstart();
                send = record.send();
                qstart = record.qstart();
                qend = record.qend();
                if (sstart > send) {
                    // make sure sstart<send
                    tmp_int = sstart;
                    sstart = send;
                    send = tmp_int;
                    tmp_int = qstart;
                    qstart = qend;
                    qend = tmp_int;
                }
                if (qstart > qend) {
                    qstart_clip = qlen - qstart;
                    qend_clip = qend - 1;
                    qseq = Sequence.revCompSeq(qry_seqs.get(record.qseqid()).seq_str());
                } else {
                    qstart_clip = qstart - 1;
                    qend_clip = qlen - qend;
                    qseq = qry_seqs.get(record.qseqid()).seq_str();
                }
                if (send < posUpto || sstart < posUpto && qstart_clip > max_clip) {
                    // TODO process
                    continue;
                }
                // find longest suffix-prefix
                nS = seq_str.length();
                nQ = qseq.length();
                int nO = Math.min(prev_n, Math.min(nS, nQ));
                outerloop: for (; nO >= min_overlap; nO--) {
                    int nS_i = nS - nO;
                    for (int i = 0; i < nO; i++) {
                        if (seq_str.charAt(nS_i + i) != qseq.charAt(i))
                            continue outerloop;
                    }
                    break outerloop;
                }
                if (nO < min_overlap) {
                    // otherwise will insert a large GAP max(pseduo_distance, max_gap)
                    if (posUpto > 0) {
                        if (sstart <= posUpto) {
                            // if too much overlap, then treat it as a contradiction
                            if (posUpto - sstart > min_overlap) {
                                // discard
                                continue;
                            } else {
                                // insert a min_gap
                                gap_size = min_gap;
                            }
                        } else {
                            // estimate gap size
                            gap_size = (sstart - posUpto) - (send_clip + qstart_clip);
                            if (gap_size < max_gap)
                                gap_size = max_gap;
                        }
                        bw_map.write("GAP\t" + gap_size + "\t0\t" + gap_size + "\t+\t" + sub_sn + "\t" + mol_len + "\t" + (mol_len + gap_size) + "\n");
                        seq_str.append(Sequence.polyN(gap_size));
                        mol_len += gap_size;
                    }
                    bw_map.write(record.qseqid() + "\t" + qlen + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write("0\t" + qlen + "\t-\t");
                    } else {
                        // forward
                        bw_map.write("0\t" + qlen + "\t+\t");
                    }
                    seq_str.append(qseq);
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen) + "\n");
                    mol_len += qlen;
                    prev_n = qlen;
                    anchored_seqs.add(record.qseqid());
                } else {
                    // overlap found
                    // will not insert gap
                    // ====================
                    // /-------\
                    // /----\
                    // calculate overlaps
                    // process overlap
                    qstart = nO;
                    if (qstart == qlen)
                        continue;
                    bw_map.write(record.qseqid() + "\t" + (qlen - qstart) + "\t");
                    if (qstart > qend) {
                        // reverse
                        bw_map.write(0 + "\t" + (qlen - qstart) + "\t-\t");
                    } else {
                        // forward
                        bw_map.write(qstart + "\t" + qlen + "\t+\t");
                    }
                    bw_map.write(sub_sn + "\t" + mol_len + "\t" + (mol_len + qlen - qstart) + "\n");
                    mol_len += qlen - qstart;
                    prev_n += qlen - qstart;
                    seq_str.append(qseq.substring(qstart));
                    anchored_seqs.add(record.qseqid());
                }
                posUpto = send;
                send_clip = qend_clip;
            }
            if (seq_str.length() > 0)
                bw_fa.write(Sequence.formatOutput(sub_sn, seq_str.toString()));
        }
        bw_fa.close();
        bw_map.close();
        final BufferedWriter bw_ufa = Utils.getBufferedWriter(out_prefix + "_unplaced.fa");
        for (String seq : qry_seqs.keySet()) if (!anchored_seqs.contains(seq))
            bw_ufa.write(qry_seqs.get(seq).formatOutput());
        bw_ufa.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used: TreeRangeSet (com.google.common.collect.TreeRangeSet), HashMap (java.util.HashMap), Blast6Segment (cz1.ngs.model.Blast6Segment), ArrayList (java.util.ArrayList), BufferedWriter (java.io.BufferedWriter), ArrayList (java.util.ArrayList), List (java.util.List), HashSet (java.util.HashSet), Sequence (cz1.ngs.model.Sequence), IOException (java.io.IOException), DefaultWeightedEdge (org.jgrapht.graph.DefaultWeightedEdge), AlignmentSegment (cz1.ngs.model.AlignmentSegment), BufferedReader (java.io.BufferedReader), HashMap (java.util.HashMap), Map (java.util.Map), BidiMap (org.apache.commons.collections4.BidiMap), DualHashBidiMap (org.apache.commons.collections4.bidimap.DualHashBidiMap)

Aggregations

HashMap (java.util.HashMap): 16, Sequence (cz1.ngs.model.Sequence): 15, IOException (java.io.IOException): 13, BufferedReader (java.io.BufferedReader): 11, HashSet (java.util.HashSet): 11, ArrayList (java.util.ArrayList): 8, BufferedWriter (java.io.BufferedWriter): 7, Map (java.util.Map): 7, TreeRangeSet (com.google.common.collect.TreeRangeSet): 6, AlignmentSegment (cz1.ngs.model.AlignmentSegment): 5, Blast6Segment (cz1.ngs.model.Blast6Segment): 5, FileReader (java.io.FileReader): 5, List (java.util.List): 5, Set (java.util.Set): 5, File (java.io.File): 4, Barcode (cz1.gbs.core.Barcode): 3, SamReader (htsjdk.samtools.SamReader): 3, BidiMap (org.apache.commons.collections4.BidiMap): 3, ReadBarcodeResult (cz1.gbs.core.ReadBarcodeResult): 2, SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord): 2