use of com.github.lindenb.jvarkit.util.align.LongestCommonSequence in project jvarkit by lindenb.
the class LocalRealignReads method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidxFile == null) {
LOG.error("REFerence file missing;");
return -1;
}
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samReader = null;
SAMFileWriter w = null;
SAMRecordIterator iter = null;
GenomicSequence genomicSequence = null;
LongestCommonSequence matrix = new LongestCommonSequence();
try {
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
samReader = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = samReader.getFileHeader();
final SAMFileHeader header2 = header1.clone();
header2.setSortOrder(SortOrder.unsorted);
w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = samReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
w.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final int nCigarElement = cigar.numCigarElements();
if (nCigarElement < 2) {
w.addAlignment(rec);
continue;
}
final CigarElement ce5 = cigar.getCigarElement(0);
final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
w.addAlignment(rec);
continue;
}
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final CharSequence readseq = rec.getReadString();
/**
* short invert
*/
if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}
*/
}
if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}*/
}
/* other */
List<LongestCommonSequence.Hit> hits = new ArrayList<>();
align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
if (hits.size() < 2000)
continue;
for (LongestCommonSequence.Hit hit : hits) {
int readstart = 0;
boolean in_M = false;
for (int i = 0; i < nCigarElement; ++i) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator().consumesReadBases()) {
int readend = readstart + ce.getLength();
if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
in_M = true;
break;
}
}
readstart = readend;
}
}
if (in_M)
continue;
int align_size = hit.size();
System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
System.err.print("REF :");
for (int i = 0; i < align_size; ++i) {
System.err.print(genomicSequence.charAt(hit.getStartX() + i));
}
System.err.println();
System.err.print("READ :");
for (int i = 0; i < align_size; ++i) {
System.err.print(readseq.charAt(hit.getStartY() + i));
}
System.err.println();
}
System.err.println();
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
return wrapException(e);
} finally {
genomicSequence = null;
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(w);
CloserUtil.close(indexedFastaSequenceFile);
matrix = null;
}
}
Aggregations