Search in sources :

Example 1 with LongestCommonSequence

use of com.github.lindenb.jvarkit.util.align.LongestCommonSequence in project jvarkit by lindenb.

the class LocalRealignReads method doWork.

@Override
public int doWork(List<String> args) {
    if (this.faidxFile == null) {
        LOG.error("REFerence file missing;");
        return -1;
    }
    IndexedFastaSequenceFile indexedFastaSequenceFile = null;
    SamReader samReader = null;
    SAMFileWriter w = null;
    SAMRecordIterator iter = null;
    GenomicSequence genomicSequence = null;
    LongestCommonSequence matrix = new LongestCommonSequence();
    try {
        indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
        samReader = openSamReader(oneFileOrNull(args));
        final SAMFileHeader header1 = samReader.getFileHeader();
        final SAMFileHeader header2 = header1.clone();
        header2.setSortOrder(SortOrder.unsorted);
        w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
        iter = samReader.iterator();
        while (iter.hasNext()) {
            final SAMRecord rec = progress.watch(iter.next());
            if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
                w.addAlignment(rec);
                continue;
            }
            final Cigar cigar = rec.getCigar();
            final int nCigarElement = cigar.numCigarElements();
            if (nCigarElement < 2) {
                w.addAlignment(rec);
                continue;
            }
            final CigarElement ce5 = cigar.getCigarElement(0);
            final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
            if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
                w.addAlignment(rec);
                continue;
            }
            if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
                genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
            }
            final CharSequence readseq = rec.getReadString();
            /**
             * short invert
             */
            if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
                }
            /*
					
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}
					*/
            }
            if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
                CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
                CharSequence revcomp = new RevCompCharSequence(clipseq);
                LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
                if (hit.size() >= MIN_ALIGN_LEN) {
                    System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
                }
            /*
					hit = matrix.align(
							readseq, 0, readseq.length(),
							revcomp,
							0,
							revcomp.length()
							);
					if(hit.size()>=MIN_ALIGN_LEN)
						{
						System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
						}*/
            }
            /* other */
            List<LongestCommonSequence.Hit> hits = new ArrayList<>();
            align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
            if (hits.size() < 2000)
                continue;
            for (LongestCommonSequence.Hit hit : hits) {
                int readstart = 0;
                boolean in_M = false;
                for (int i = 0; i < nCigarElement; ++i) {
                    final CigarElement ce = cigar.getCigarElement(i);
                    if (ce.getOperator().consumesReadBases()) {
                        int readend = readstart + ce.getLength();
                        if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
                            if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
                                in_M = true;
                                break;
                            }
                        }
                        readstart = readend;
                    }
                }
                if (in_M)
                    continue;
                int align_size = hit.size();
                System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
                System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
                System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
                System.err.print("REF  :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(genomicSequence.charAt(hit.getStartX() + i));
                }
                System.err.println();
                System.err.print("READ :");
                for (int i = 0; i < align_size; ++i) {
                    System.err.print(readseq.charAt(hit.getStartY() + i));
                }
                System.err.println();
            }
            System.err.println();
        }
        progress.finish();
        return RETURN_OK;
    } catch (Exception e) {
        return wrapException(e);
    } finally {
        genomicSequence = null;
        CloserUtil.close(iter);
        CloserUtil.close(samReader);
        CloserUtil.close(w);
        CloserUtil.close(indexedFastaSequenceFile);
        matrix = null;
    }
}
Also used : SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) SAMFileWriter(htsjdk.samtools.SAMFileWriter) GenomicSequence(com.github.lindenb.jvarkit.util.picard.GenomicSequence) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) ArrayList(java.util.ArrayList) LongestCommonSequence(com.github.lindenb.jvarkit.util.align.LongestCommonSequence) CigarElement(htsjdk.samtools.CigarElement) IndexedFastaSequenceFile(htsjdk.samtools.reference.IndexedFastaSequenceFile) SamReader(htsjdk.samtools.SamReader) Cigar(htsjdk.samtools.Cigar) SubSequence(com.github.lindenb.jvarkit.lang.SubSequence) SAMRecord(htsjdk.samtools.SAMRecord) RevCompCharSequence(com.github.lindenb.jvarkit.util.bio.RevCompCharSequence) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

SubSequence (com.github.lindenb.jvarkit.lang.SubSequence)1 LongestCommonSequence (com.github.lindenb.jvarkit.util.align.LongestCommonSequence)1 RevCompCharSequence (com.github.lindenb.jvarkit.util.bio.RevCompCharSequence)1 GenomicSequence (com.github.lindenb.jvarkit.util.picard.GenomicSequence)1 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)1 Cigar (htsjdk.samtools.Cigar)1 CigarElement (htsjdk.samtools.CigarElement)1 SAMFileHeader (htsjdk.samtools.SAMFileHeader)1 SAMFileWriter (htsjdk.samtools.SAMFileWriter)1 SAMRecord (htsjdk.samtools.SAMRecord)1 SAMRecordIterator (htsjdk.samtools.SAMRecordIterator)1 SamReader (htsjdk.samtools.SamReader)1 IndexedFastaSequenceFile (htsjdk.samtools.reference.IndexedFastaSequenceFile)1 ArrayList (java.util.ArrayList)1