use of com.github.lindenb.jvarkit.lang.SubSequence in project jvarkit by lindenb.
the class GenomicJaspar method digest.
private void digest(PrintWriter out, String seqName, int position0, final StringBuilder sequence) {
for (final Matrix matrix : this.jasparDb) {
if (matrix.length() > sequence.length())
continue;
CharSequence forward = new SubSequence(sequence, 0, matrix.length());
CharSequence revcomp = new RevCompCharSequence(forward);
for (int strand = 0; strand < 2; ++strand) {
double score = matrix.score(strand == 0 ? forward : revcomp);
if (score <= 0)
continue;
if (score >= matrix.max() * this.fraction_of_max) {
out.print(seqName);
out.print('\t');
out.print(position0);
out.print('\t');
out.print(position0 + matrix.length());
out.print('\t');
out.print(matrix.getName());
out.print('\t');
out.print((int) (1000.0 * (score / matrix.max())));
out.print('\t');
out.print(strand == 1 ? '-' : '+');
out.print('\t');
out.print(matrix.length());
out.print('\t');
out.print(matrix.getArchetype());
out.print('\t');
out.print(strand == 0 ? forward : revcomp);
out.println();
break;
}
}
}
}
use of com.github.lindenb.jvarkit.lang.SubSequence in project jvarkit by lindenb.
the class VcfJaspar method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
final String ATT = "JASPAR";
GenomicSequence genomicSequence = null;
final VCFHeader header = new VCFHeader(in.getHeader());
addMetaData(header);
header.addMetaDataLine(new VCFInfoHeaderLine(ATT, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Jaspar pattern overlapping: Format: (Name|length|Score/1000|pos|strand)"));
out.writeHeader(header);
while (in.hasNext()) {
VariantContext var = in.next();
if (genomicSequence == null || !genomicSequence.getChrom().equals(var.getContig())) {
LOG.info("Loading sequence " + var.getContig());
genomicSequence = new GenomicSequence(this.indexedFastaSequenceFile, var.getContig());
}
final Set<String> hits = new HashSet<String>();
for (final Matrix matrix : this.jasparDb) {
int start0 = Math.max(0, var.getStart() - matrix.length());
for (int y = start0; y < var.getStart() && y + matrix.length() <= genomicSequence.length(); ++y) {
CharSequence forward = new SubSequence(genomicSequence, y, y + matrix.length());
CharSequence revcomp = new RevCompCharSequence(forward);
// run each strand
for (int strand = 0; strand < 2; ++strand) {
double score = matrix.score(strand == 0 ? forward : revcomp);
if (score <= 0)
continue;
if (score >= matrix.max() * this.fraction_of_max) {
StringBuilder b = new StringBuilder("(");
b.append(matrix.getName().replaceAll("[ \t;=]+", "/"));
b.append("|");
b.append(matrix.length());
b.append("|");
b.append((int) (1000.0 * (score / matrix.max())));
b.append("|");
b.append(y + 1);
b.append("|");
b.append(strand == 0 ? '+' : '-');
b.append(")");
hits.add(b.toString());
break;
}
}
}
}
if (hits.isEmpty()) {
out.add(var);
continue;
}
final VariantContextBuilder vcb = new VariantContextBuilder(var);
vcb.attribute(ATT, hits.toArray(new String[hits.size()]));
out.add(vcb.make());
}
return RETURN_OK;
}
use of com.github.lindenb.jvarkit.lang.SubSequence in project jvarkit by lindenb.
the class LocalRealignReads method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidxFile == null) {
LOG.error("REFerence file missing;");
return -1;
}
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samReader = null;
SAMFileWriter w = null;
SAMRecordIterator iter = null;
GenomicSequence genomicSequence = null;
LongestCommonSequence matrix = new LongestCommonSequence();
try {
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
samReader = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = samReader.getFileHeader();
final SAMFileHeader header2 = header1.clone();
header2.setSortOrder(SortOrder.unsorted);
w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = samReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
w.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final int nCigarElement = cigar.numCigarElements();
if (nCigarElement < 2) {
w.addAlignment(rec);
continue;
}
final CigarElement ce5 = cigar.getCigarElement(0);
final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
w.addAlignment(rec);
continue;
}
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final CharSequence readseq = rec.getReadString();
/**
* short invert
*/
if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}
*/
}
if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}*/
}
/* other */
List<LongestCommonSequence.Hit> hits = new ArrayList<>();
align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
if (hits.size() < 2000)
continue;
for (LongestCommonSequence.Hit hit : hits) {
int readstart = 0;
boolean in_M = false;
for (int i = 0; i < nCigarElement; ++i) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator().consumesReadBases()) {
int readend = readstart + ce.getLength();
if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
in_M = true;
break;
}
}
readstart = readend;
}
}
if (in_M)
continue;
int align_size = hit.size();
System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
System.err.print("REF :");
for (int i = 0; i < align_size; ++i) {
System.err.print(genomicSequence.charAt(hit.getStartX() + i));
}
System.err.println();
System.err.print("READ :");
for (int i = 0; i < align_size; ++i) {
System.err.print(readseq.charAt(hit.getStartY() + i));
}
System.err.println();
}
System.err.println();
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
return wrapException(e);
} finally {
genomicSequence = null;
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(w);
CloserUtil.close(indexedFastaSequenceFile);
matrix = null;
}
}
Aggregations