Search in sources :

Example 1 with SubSequence

use of com.github.lindenb.jvarkit.lang.SubSequence in project jvarkit by lindenb.

the class GenomicJaspar method digest.

private void digest(PrintWriter out, String seqName, int position0, final StringBuilder sequence) {
    for (final Matrix matrix : this.jasparDb) {
        if (matrix.length() > sequence.length())
            continue;
        CharSequence forward = new SubSequence(sequence, 0, matrix.length());
        CharSequence revcomp = new RevCompCharSequence(forward);
        for (int strand = 0; strand < 2; ++strand) {
            double score = matrix.score(strand == 0 ? forward : revcomp);
            if (score <= 0)
                continue;
            if (score >= matrix.max() * this.fraction_of_max) {
                out.print(seqName);
                out.print('\t');
                out.print(position0);
                out.print('\t');
                out.print(position0 + matrix.length());
                out.print('\t');
                out.print(matrix.getName());
                out.print('\t');
                out.print((int) (1000.0 * (score / matrix.max())));
                out.print('\t');
                out.print(strand == 1 ? '-' : '+');
                out.print('\t');
                out.print(matrix.length());
                out.print('\t');
                out.print(matrix.getArchetype());
                out.print('\t');
                out.print(strand == 0 ? forward : revcomp);
                out.println();
                break;
            }
        }
    }
}
Also used : SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence)

Example 2 with SubSequence

use of com.github.lindenb.jvarkit.lang.SubSequence in project jvarkit by lindenb.

the class VcfJaspar method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
    final String ATT = "JASPAR";
    GenomicSequence genomicSequence = null;
    final VCFHeader header = new VCFHeader(in.getHeader());
    addMetaData(header);
    header.addMetaDataLine(new VCFInfoHeaderLine(ATT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Jaspar pattern overlapping: Format: (Name|length|Score/1000|pos|strand)"));
    out.writeHeader(header);
    while (in.hasNext()) {
        VariantContext var = in.next();
        if (genomicSequence == null || !genomicSequence.getChrom().equals(var.getContig())) {
            LOG.info("Loading sequence " + var.getContig());
            genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, var.getContig());
        }
        final Set<String> hits = new HashSet<String>();
        for (final Matrix matrix : this.jasparDb) {
            int start0 = Math.max(0, var.getStart() - matrix.length());
            for (int y = start0; y < var.getStart() && y + matrix.length() <= genomicSequence.length(); ++y) {
                CharSequence forward = new SubSequence(genomicSequence, y, y + matrix.length());
                CharSequence revcomp = new RevCompCharSequence(forward);
                // run each strand
                for (int strand = 0; strand < 2; ++strand) {
                    double score = matrix.score(strand == 0 ? forward : revcomp);
                    if (score <= 0)
                        continue;
                    if (score >= matrix.max() * this.fraction_of_max) {
                        StringBuilder b = new StringBuilder("(");
                        b.append(matrix.getName().replaceAll("[ \t;=]+", "/"));
                        b.append("|");
                        b.append(matrix.length());
                        b.append("|");
                        b.append((int) (1000.0 * (score / matrix.max())));
                        b.append("|");
                        b.append(y + 1);
                        b.append("|");
                        b.append(strand == 0 ? '+' : '-');
                        b.append(")");
                        hits.add(b.toString());
                        break;
                    }
                }
            }
        }
        if (hits.isEmpty()) {
            out.add(var);
            continue;
        }
        final VariantContextBuilder vcb = new VariantContextBuilder(var);
        vcb.attribute(ATT, hits.toArray(new String[hits.size()]));
        out.add(vcb.make());
    }
    return RETURN_OK;
}
Also used : GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 3 with SubSequence

use of com.github.lindenb.jvarkit.lang.SubSequence in project jvarkit by lindenb.

the class LocalRealignReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidxFile == null) {
        LOG.error("REFerence file missing;");
        return -1;
    }
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samReader = null;
    SAMFileWriter w = null;
    SAMRecordIterator iter = null;
    GenomicSequence genomicSequence = null;
    LongestCommonSequence matrix = new LongestCommonSequence();
    try {
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
        samReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header1 = samReader.getFileHeader();
        final SAMFileHeader header2 = header1.clone();
        header2.setSortOrder(SortOrder.unsorted);
        w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = samReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
                w.addAlignment(rec);
                continue;
            }
            final Cigar cigar = rec.getCigar();
            final int nCigarElement = cigar.numCigarElements();
            if (nCigarElement < 2) {
                w.addAlignment(rec);
                continue;
            }
            final CigarElement ce5 = cigar.getCigarElement(0);
            final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
            if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
                w.addAlignment(rec);
                continue;
            }
            if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
            }
            final CharSequence readseq = rec.getReadString();
            /**
             * short invert
             */
            if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
                }
            /*
					
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}
					*/
            }
            if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
                }
            /*
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}*/
            }
            /* other */
            List<LongestCommonSequence.Hit> hits = new ArrayList<>();
            align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
            if (hits.size() < 2000)
                continue;
            for (LongestCommonSequence.Hit hit : hits) {
                int readstart = 0;
                boolean in_M = false;
                for (int i = 0; i < nCigarElement; ++i) {
                    final CigarElement ce = cigar.getCigarElement(i);
                    if (ce.getOperator().consumesReadBases()) {
                        int readend = readstart + ce.getLength();
                        if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
                            if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
                                in_M = true;
                                break;
                            }
                        }
                        readstart = readend;
                    }
                }
                if (in_M)
                    continue;
                int align_size = hit.size();
                System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
                System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
                System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
                System.err.print("REF  :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(genomicSequence.charAt(hit.getStartX() + i));
                }
                System.err.println();
                System.err.print("READ :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(readseq.charAt(hit.getStartY() + i));
                }
                System.err.println();
            }
            System.err.println();
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        return wrapException(e);
    } finally {
        genomicSequence = null;
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
        CloserUtil.close(w);
        CloserUtil.close(indexedFastaSequenceFile);
        matrix = null;
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) ArrayList(java.util.ArrayList) LongestCommonSequence(com.github.lindenb.jvarkit.util.align.LongestCommonSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) SAMRecord(htsjdk.samtools.SAMRecord) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

SubSequence (com.github.lindenb.jvarkit.lang.SubSequence)3 RevCompCharSequence (com.github.lindenb.jvarkit.util.bio.RevCompCharSequence)3 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)2 LongestCommonSequence (com.github.lindenb.jvarkit.util.align.LongestCommonSequence)1 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)1 Cigar (htsjdk.samtools.Cigar)1 CigarElement (htsjdk.samtools.CigarElement)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMFileWriter (htsjdk.samtools.SAMFileWriter)1 SAMRecord (htsjdk.samtools.SAMRecord)1 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)1 SamReader (htsjdk.samtools.SamReader)1 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)1 VariantContext (htsjdk.variant.variantcontext.VariantContext)1 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)1 VCFHeader (htsjdk.variant.vcf.VCFHeader)1 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)1 ArrayList (java.util.ArrayList)1 HashSet (java.util.HashSet)1