Search in sources :

Example 6 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class Consensus method parseContig.

public Map<String, Sequence> parseContig(final String contig_index) {
    final Map<String, Sequence> contig_list = new HashMap<String, Sequence>();
    try {
        BufferedReader ctg_br = new BufferedReader(new FileReader(contig_index));
        String line;
        String[] s;
        int index = 0;
        while ((line = ctg_br.readLine()) != null) {
            s = line.split("\\s+");
            contig_list.put(s[1].split(":")[1], new Sequence(index, Integer.parseInt(s[2].split(":")[1])));
            index++;
        }
        ctg_br.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return contig_list;
}
Also used : HashMap(java.util.HashMap) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException)

Example 7 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class Consensus method parseLink.

public void parseLink(final String[] links_in, final String segment_in, final String link_out) {
    Map<String, Sequence> contig_list = parseContigFromBam();
    List<List<Segment>> segment_list = parseSegment(segment_in);
    Map<Integer, Set<Integer>> contig_seg_map = new HashMap<Integer, Set<Integer>>();
    int chr_index = 0, seg_index, seq_index;
    // 8  bits pseudo molecule indices
    int seg_key;
    // 24 bits segment indices
    // List<String> pseudo_mol_sn = new ArrayList<String>();
    List<double[]> pseudo_mol_sf = new ArrayList<double[]>();
    // pseudo_mol_sn.add("");
    for (List<Segment> seg_list : segment_list) {
        // pseudo_mol_sn.add("Chr"+(chr_index<10?"0":"")+chr_index);
        seg_index = 0;
        for (Segment seg : seg_list) {
            if (seg.type == MAP_ENUM.CONTIG) {
                seg_key = chr_index;
                seg_key <<= 24;
                seg_key += seg_index;
                seq_index = contig_list.get(seg.seq_sn).seq_no();
                if (!contig_seg_map.containsKey(seq_index))
                    contig_seg_map.put(seq_index, new HashSet<Integer>());
                contig_seg_map.get(seq_index).add(seg_key);
            }
            seg_index++;
        }
        pseudo_mol_sf.add(new double[seg_index - 1]);
        chr_index++;
    }
    myLogger.info("Segments: " + segment_list.size());
    myLogger.info("SegMap:   " + contig_seg_map.size());
    // for(Map.Entry<Integer, Integer> entry : contig_seg_map.entrySet())
    // System.err.println(entry.getKey()+" "+entry.getValue());
    Map<Long, Integer> links = new HashMap<Long, Integer>();
    myLogger.info("Insert size threshold: " + strcat(this.ins_thres, ","));
    myLogger.info("Link weights: " + strcat(link_w, ","));
    long key;
    int val;
    try {
        for (int i = 0; i < links_in.length; i++) {
            links.clear();
            System.gc();
            System.runFinalization();
            String link_i = links_in[i];
            BufferedReader link_br = new BufferedReader(new FileReader(link_i));
            String line;
            String[] s;
            Set<Integer> C1, C2;
            int z1, z2;
            while ((line = link_br.readLine()) != null) {
                s = line.split("\\s+");
                val = Integer.parseInt(s[2]);
                z1 = Integer.parseInt(s[0]);
                z2 = Integer.parseInt(s[1]);
                if (!contig_seg_map.containsKey(z1) || !contig_seg_map.containsKey(z2))
                    continue;
                C1 = contig_seg_map.get(z1);
                C2 = contig_seg_map.get(z2);
                for (Integer c1 : C1) {
                    for (Integer c2 : C2) {
                        if (c1 < c2) {
                            key = c1;
                            key <<= 32;
                            key += c2;
                        } else {
                            key = c2;
                            key <<= 32;
                            key += c1;
                        }
                        if (links.containsKey(key)) {
                            link_br.close();
                            throw new RuntimeException("!!!");
                        // links.put(key, val+links.get(key));
                        } else {
                            links.put(key, val);
                        }
                    }
                }
            }
            link_br.close();
            double insz_ub = ins_thres[i];
            double w = link_w[i];
            List<Segment> seg_list;
            double[] mol_sf;
            BufferedWriter link_bw = new BufferedWriter(new FileWriter(link_out + ".link" + String.format("%04d", i + 1)));
            int n1, n2, ins;
            Segment seg;
            outerloop: for (Map.Entry<Long, Integer> entry : links.entrySet()) {
                key = entry.getKey();
                n2 = (int) (key & mask_24bits);
                key >>= 24;
                z2 = (int) (key & mask_08bits);
                key >>= 8;
                n1 = (int) (key & mask_24bits);
                key >>= 24;
                z1 = (int) key;
                val = entry.getValue();
                link_bw.write(n1 + " " + n2 + " " + z1 + " " + z2 + " " + val + "\n");
                // if(c1!=c2 || val<link_thres) continue;
                if (z1 != z2)
                    continue;
                // System.out.println(n1+" "+n2+" "+c1+" "+c2+" "+val+"\n");
                seg_list = segment_list.get(z1);
                mol_sf = pseudo_mol_sf.get(z1);
                ins = 0;
                if (n1 > n2)
                    throw new RuntimeException("!!!");
                for (int j = n1 + 1; j < n2; j++) {
                    seg = seg_list.get(j);
                    if (seg.type == MAP_ENUM.GAP)
                        continue;
                    ins += seg.seq_ln;
                    if (ins > insz_ub)
                        continue outerloop;
                }
                for (int j = n1; j < n2; j++) mol_sf[j] += w * val;
            }
            link_bw.close();
            BufferedWriter map_bw = new BufferedWriter(new FileWriter(link_out + ".map" + String.format("%04d", i + 1)));
            int scaff_i = 0;
            for (int j = 0; j != pseudo_mol_sf.size(); j++) {
                mol_sf = pseudo_mol_sf.get(j);
                seg_list = segment_list.get(j);
                if (mol_sf.length != seg_list.size() - 1) {
                    map_bw.close();
                    throw new RuntimeException("!!!");
                }
                String scaff = "Scaffold" + String.format("%08d", ++scaff_i);
                seg = seg_list.get(0);
                map_bw.write(seg.seq_sn + "\t" + seg.seq_ln + "\t" + seg.seq_start + "\t" + seg.seq_end + "\t" + (seg.seq_rev ? "-" : "+") + "\t" + scaff + "\t" + seg.mol_start + "\t" + seg.mol_end + "\t" + df2.format(-1) + "\n");
                int n = seg_list.size();
                for (int k = 1; k != n; k++) {
                    seg = seg_list.get(k);
                    if (mol_sf[k - 1] < 1.0)
                        scaff = "Scaffold" + String.format("%08d", ++scaff_i);
                    map_bw.write(seg.seq_sn + "\t" + seg.seq_ln + "\t" + seg.seq_start + "\t" + seg.seq_end + "\t" + (seg.seq_rev ? "-" : "+") + "\t" + scaff + "\t" + seg.mol_start + "\t" + seg.mol_end + "\t" + df2.format(mol_sf[k - 1]) + "\n");
                }
            }
            map_bw.close();
        }
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : SortedSet(java.util.SortedSet) Set(java.util.Set) TreeSet(java.util.TreeSet) HashSet(java.util.HashSet) HashMap(java.util.HashMap) FileWriter(java.io.FileWriter) ArrayList(java.util.ArrayList) BufferedWriter(java.io.BufferedWriter) Entry(java.util.Map.Entry) List(java.util.List) ArrayList(java.util.ArrayList) FileReader(java.io.FileReader) HashSet(java.util.HashSet) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) BufferedReader(java.io.BufferedReader)

Example 8 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class Consensus method parseContig.

public Map<String, Sequence> parseContig(final String contig_index) {
    final Map<String, Sequence> contig_list = new HashMap<String, Sequence>();
    try {
        BufferedReader ctg_br = new BufferedReader(new FileReader(contig_index));
        String line;
        String[] s;
        int index = 0;
        while ((line = ctg_br.readLine()) != null) {
            s = line.split("\\s+");
            contig_list.put(s[1].split(":")[1], new Sequence(index, Integer.parseInt(s[2].split(":")[1])));
            index++;
        }
        ctg_br.close();
    } catch (IOException e) {
        e.printStackTrace();
    }
    return contig_list;
}
Also used : HashMap(java.util.HashMap) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException)

Example 9 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class Redundas method run.

@Override
public void run() {
    // TODO Auto-generated method stub
    Map<String, Sequence> sequence_map = Sequence.parseFastaFileAsMap(query_file);
    String line;
    String[] s;
    Set<String> seq_rm = new HashSet<String>();
    try {
        BufferedReader br_blast6 = new BufferedReader(new FileReader(blast_out));
        Set<Blast6Segment> buffer_b6 = new HashSet<Blast6Segment>();
        line = br_blast6.readLine();
        String qseqid;
        while (line != null) {
            s = line.split("\\s+");
            qseqid = s[0];
            buffer_b6.clear();
            buffer_b6.add(Blast6Segment.blast6Segment(line));
            while ((line = br_blast6.readLine()) != null && line.startsWith(qseqid)) buffer_b6.add(Blast6Segment.blast6Segment(line));
            int sz = sequence_map.get(qseqid).seq_ln();
            RangeSet<Integer> range_covered = TreeRangeSet.create();
            for (Blast6Segment record : buffer_b6) {
                if (!seq_rm.contains(record.sseqid()) && !record.qseqid().equals(record.sseqid()) && record.pident() >= min_ident && record.length() >= min_match && (record.qstart() <= max_overhang || record.qend() > sz - max_overhang))
                    range_covered.add(Range.closedOpen(record.qstart(), record.qend()).canonical(DiscreteDomain.integers()));
            }
            int unique_cvg = 0;
            for (Range<Integer> range : range_covered.asRanges()) unique_cvg += range.upperEndpoint() - range.lowerEndpoint();
            if ((double) unique_cvg / sz >= min_frac) {
                seq_rm.add(qseqid);
                myLogger.info("Redundant sequence " + qseqid + "\t" + sz + "\t" + unique_cvg + "\t" + (double) unique_cvg / sz);
            }
        }
        br_blast6.close();
        BufferedWriter bw_unique = Utils.getBufferedWriter(this.out_prefix + ".fa");
        BufferedWriter bw_redundas = Utils.getBufferedWriter(this.out_prefix + "_redundas.fa");
        for (Map.Entry<String, Sequence> entry : sequence_map.entrySet()) {
            if (seq_rm.contains(entry.getKey()))
                bw_redundas.write(entry.getValue().formatOutput());
            else
                bw_unique.write(entry.getValue().formatOutput());
        }
        bw_unique.close();
        bw_redundas.close();
    } catch (IOException e) {
        // TODO Auto-generated catch block
        e.printStackTrace();
    }
}
Also used : Blast6Segment(cz1.ngs.model.Blast6Segment) Sequence(cz1.ngs.model.Sequence) IOException(java.io.IOException) BufferedWriter(java.io.BufferedWriter) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) HashMap(java.util.HashMap) Map(java.util.Map) HashSet(java.util.HashSet)

Example 10 with Sequence

use of cz1.ngs.model.Sequence in project polyGembler by c-zhou.

the class ParseBarcodeRead method findBestBarcode.

/**
 * Returns the best barcode match for a given sequence.
 * @param queryS query sequence to be tested against all barcodes
 * @param maxDivergence maximum divergence to permit
 * @return best barcode match (null if no good match)
 */
Barcode findBestBarcode(String queryS, int maxDivergence) {
    long query = BaseEncoder.getLongFromSeq(queryS.substring(0, chunkSize));
    // note because the barcodes are polyA after the sequence, they should always
    // sort ahead of the hit, this is the reason for the -(closestHit+2)
    int closestHit = Arrays.binarySearch(quickBarcodeList, query);
    // Below is the old pipeline approach, which works (at least for maxDivergence of 0)
    if (closestHit < -1) {
        // should always be true, as the barcode+overhang is padded to 32 bases with polyA
        int index = quickMap.get(quickBarcodeList[-(closestHit + 2)]);
        if (theBarcodes[index].compareSequence(query, 1) == 0) {
            return theBarcodes[index];
        } else if (maxDivergence == 0) {
            // return null if not a perfect match
            return null;
        }
    } else {
        // should never go to this line
        return null;
    }
    int maxLength = 0, minDiv = maxDivergence + 1;
    Barcode bestBC = null;
    for (Barcode bc : theBarcodes) {
        int div = bc.compareSequence(query, maxDivergence + 1);
        if (div <= minDiv) {
            if ((div < minDiv) || (bc.getBarOverLength() > maxLength)) {
                minDiv = div;
                maxLength = bc.getBarOverLength();
                bestBC = bc;
            } else {
                // it is a tie, so return that not resolvable
                bestBC = null;
            }
        }
    }
    return bestBC;
}
Also used : Barcode(cz1.gbs.core.Barcode)

Aggregations

HashMap (java.util.HashMap)16 Sequence (cz1.ngs.model.Sequence)15 IOException (java.io.IOException)13 BufferedReader (java.io.BufferedReader)11 HashSet (java.util.HashSet)11 ArrayList (java.util.ArrayList)8 BufferedWriter (java.io.BufferedWriter)7 Map (java.util.Map)7 TreeRangeSet (com.google.common.collect.TreeRangeSet)6 AlignmentSegment (cz1.ngs.model.AlignmentSegment)5 Blast6Segment (cz1.ngs.model.Blast6Segment)5 FileReader (java.io.FileReader)5 List (java.util.List)5 Set (java.util.Set)5 File (java.io.File)4 Barcode (cz1.gbs.core.Barcode)3 SamReader (htsjdk.samtools.SamReader)3 BidiMap (org.apache.commons.collections4.BidiMap)3 ReadBarcodeResult (cz1.gbs.core.ReadBarcodeResult)2 SAMSequenceRecord (htsjdk.samtools.SAMSequenceRecord)2