use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfBurden method doWork2.
private int doWork2(List<String> args) {
ZipOutputStream zout = null;
FileOutputStream fout = null;
VcfIterator in = null;
try {
if (args.isEmpty()) {
LOG.info("reading from stdin.");
in = VCFUtils.createVcfIteratorStdin();
} else if (args.size() == 1) {
String filename = args.get(0);
LOG.info("reading from " + filename);
in = VCFUtils.createVcfIterator(filename);
} else {
LOG.error("Illegal number of arguments.");
return -1;
}
if (outputFile == null) {
LOG.error("undefined output");
return -1;
} else if (!outputFile.getName().endsWith(".zip")) {
LOG.error("output " + outputFile + " should end with .zip");
return -1;
} else {
fout = new FileOutputStream(outputFile);
zout = new ZipOutputStream(fout);
}
final List<String> samples = in.getHeader().getSampleNamesInOrder();
final VCFHeader header = in.getHeader();
String prev_chrom = null;
final VepPredictionParser vepPredParser = new VepPredictionParserFactory().header(header).get();
final Map<GeneTranscript, List<VariantAndCsq>> gene2variants = new HashMap<>();
final SequenceOntologyTree soTree = SequenceOntologyTree.getInstance();
final Set<SequenceOntologyTree.Term> acn = new HashSet<>();
/* mail solena *SO en remplacement des SO actuels (VEP HIGH + MODERATE) - pas la peine de faire retourner les analyses mais servira pour les futures analyses burden */
String[] acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889", "SO:0001821", "SO:0001822", "SO:0001583", "SO:0001818" /*
"SO:0001589", "SO:0001587", "SO:0001582", "SO:0001583",
"SO:0001575", "SO:0001578", "SO:0001574", "SO:0001889",
"SO:0001821", "SO:0001822", "SO:0001893"*/
};
if (this.highdamage) {
acn_list = new String[] { "SO:0001893", "SO:0001574", "SO:0001575", "SO:0001587", "SO:0001589", "SO:0001578", "SO:0002012", "SO:0001889" };
}
for (final String acns : acn_list) {
final SequenceOntologyTree.Term tacn = soTree.getTermByAcn(acns);
if (tacn == null) {
in.close();
throw new NullPointerException("tacn == null pour " + acns);
}
acn.addAll(tacn.getAllDescendants());
}
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(in.getHeader());
for (; ; ) {
VariantContext ctx1 = null;
if (in.hasNext()) {
ctx1 = progress.watch(in.next());
if (ctx1.getAlternateAlleles().size() != 1) {
// info("count(ALT)!=1 in "+ctx1.getChr()+":"+ctx1.getStart());
continue;
}
if (ctx1.getAlternateAlleles().get(0).equals(Allele.SPAN_DEL)) {
continue;
}
}
if (ctx1 == null || !ctx1.getContig().equals(prev_chrom)) {
LOG.info("DUMP to zip n=" + gene2variants.size());
final Set<String> geneNames = new HashSet<>();
for (GeneTranscript gene_transcript : gene2variants.keySet()) {
geneNames.add(gene_transcript.geneName);
final Set<VariantAndCsq> uniq = new TreeSet<>(this.variantAndCsqComparator);
uniq.addAll(gene2variants.get(gene_transcript));
dumpVariants(zout, prev_chrom, gene_transcript.geneName + "_" + gene_transcript.transcriptName, samples, new ArrayList<VariantAndCsq>(uniq));
LOG.info("dumped" + gene_transcript.geneName);
}
LOG.info("loop over geneName");
for (final String geneName : geneNames) {
final SortedSet<VariantAndCsq> lis_nm = new TreeSet<>(this.variantAndCsqComparator);
final SortedSet<VariantAndCsq> lis_all = new TreeSet<>(this.variantAndCsqComparator);
final SortedSet<VariantAndCsq> lis_refseq = new TreeSet<>(this.variantAndCsqComparator);
final SortedSet<VariantAndCsq> lis_enst = new TreeSet<>(this.variantAndCsqComparator);
LOG.info("loop over gene2variants");
for (final GeneTranscript gene_transcript : gene2variants.keySet()) {
if (!geneName.equals(gene_transcript.geneName))
continue;
lis_all.addAll(gene2variants.get(gene_transcript));
if (gene_transcript.transcriptName.startsWith("NM_")) {
lis_nm.addAll(gene2variants.get(gene_transcript));
}
if (!gene_transcript.transcriptName.startsWith("ENST")) {
lis_refseq.addAll(gene2variants.get(gene_transcript));
}
if (gene_transcript.transcriptName.startsWith("ENST")) {
lis_enst.addAll(gene2variants.get(gene_transcript));
}
}
LOG.info("dump_ALL_TRANSCRIPTS");
dumpVariants(zout, prev_chrom, geneName + "_ALL_TRANSCRIPTS", samples, new ArrayList<VariantAndCsq>(lis_all));
LOG.info("dump_ALL_NM");
dumpVariants(zout, prev_chrom, geneName + "_ALL_NM", samples, new ArrayList<VariantAndCsq>(lis_nm));
LOG.info("dump_ALL_REFSEQ");
dumpVariants(zout, prev_chrom, geneName + "_ALL_REFSEQ", samples, new ArrayList<VariantAndCsq>(lis_refseq));
LOG.info("dump_ALL_ENST");
dumpVariants(zout, prev_chrom, geneName + "_ALL_ENST", samples, new ArrayList<VariantAndCsq>(lis_enst));
}
if (ctx1 == null)
break;
LOG.info("gene2variants");
gene2variants.clear();
LOG.info("System.gc();");
System.gc();
prev_chrom = ctx1.getContig();
LOG.info("prev_chrom=" + prev_chrom);
}
final Set<GeneTranscript> seen_names = new HashSet<>();
for (final VepPredictionParser.VepPrediction pred : vepPredParser.getPredictions(ctx1)) {
String geneName = pred.getSymbol();
if (geneName == null || geneName.trim().isEmpty())
continue;
if (this._gene2seen != null) {
if (!this._gene2seen.containsKey(geneName))
continue;
}
final String transcriptName = pred.getFeature();
if (transcriptName == null || transcriptName.trim().isEmpty()) {
LOG.info("No transcript in " + ctx1);
continue;
}
final GeneTranscript geneTranscript = new GeneTranscript(geneName, transcriptName);
if (seen_names.contains(geneTranscript))
continue;
boolean ok = false;
for (SequenceOntologyTree.Term so : pred.getSOTerms()) {
if (acn.contains(so)) {
ok = true;
}
}
if (!printSOTerms && !ok)
continue;
List<VariantAndCsq> L = gene2variants.get(geneTranscript);
if (L == null) {
L = new ArrayList<>();
gene2variants.put(geneTranscript, L);
}
Float vqslod = null;
if (this.printVQSLOD && ctx1.hasAttribute("VQSLOD")) {
vqslod = (float) ctx1.getAttributeAsDouble("VQSLOD", -9999999.0);
}
L.add(new VariantAndCsq(ctx1, pred.getSOTerms(), pred.getPositionInCDS(), vqslod));
seen_names.add(geneTranscript);
if (this._gene2seen != null) {
this._gene2seen.put(geneTranscript.geneName, Boolean.TRUE);
}
}
}
if (this._gene2seen != null) {
final List<VariantAndCsq> emptylist = Collections.emptyList();
for (final String gene : this._gene2seen.keySet()) {
if (this._gene2seen.get(gene).equals(Boolean.TRUE))
continue;
LOG.warning("Gene not found : " + gene);
dumpVariants(zout, "UNDEFINED", gene + "_000000000000000.txt", samples, emptylist);
}
}
progress.finish();
zout.finish();
fout.flush();
zout.flush();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(in);
CloserUtil.close(zout);
CloserUtil.close(fout);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfCadd method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator in, VariantContextWriter out) {
try {
VCFHeader header = in.getHeader();
if (header.getSequenceDictionary() != null) {
SAMSequenceDictionary dict = header.getSequenceDictionary();
Set<String> vcfchr = new HashSet<String>();
for (SAMSequenceRecord ssr : dict.getSequences()) vcfchr.add(ssr.getSequenceName());
if (// nothing changed
!vcfchr.retainAll(this.tabix.getChromosomes())) {
LOG.warning("#### !!!! NO common chromosomes between tabix and vcf file. Check chromosome 'chr' prefix ? tabix chroms:" + this.tabix.getChromosomes());
}
}
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header.getSequenceDictionary());
header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
header.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
header.addMetaDataLine(new VCFInfoHeaderLine(CADD_FLAG, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "(Allele|Score|Phred) Score suggests that that variant is likely to be observed (negative values) vs simulated(positive values)." + "However, raw values do have relative meaning, with higher values indicating that a variant is more likely to be simulated (or -not observed-) and therefore more likely to have deleterious effects." + "PHRED expressing the rank in order of magnitude terms. For example, reference genome single nucleotide variants at the 10th-% of CADD scores are assigned to CADD-10, top 1% to CADD-20, top 0.1% to CADD-30, etc"));
out.writeHeader(header);
List<VariantContext> buffer = new ArrayList<>();
for (; ; ) {
VariantContext ctx = null;
if (in.hasNext()) {
ctx = progress.watch(in.next());
}
if (ctx == null || (!buffer.isEmpty() && (buffer.get(0).getContig().equals(ctx.getContig())) && (ctx.getEnd() - buffer.get(0).getStart()) > buffer_distance)) {
if (!buffer.isEmpty()) {
runTabix(buffer);
for (VariantContext c : buffer) {
out.add(c);
}
}
if (ctx == null)
break;
buffer.clear();
}
buffer.add(ctx);
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfGeneSplitter method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, File outputFile) {
SortingCollection<KeyAndLine> sortingcollection = null;
BufferedReader in = null;
FileOutputStream fos = null;
ZipOutputStream zout = null;
CloseableIterator<KeyAndLine> iter = null;
PrintWriter pw = null;
try {
in = inputName == null ? IOUtils.openStreamForBufferedReader(stdin()) : IOUtils.openURIForBufferedReading(inputName);
final VCFUtils.CodecAndHeader cah = VCFUtils.parseHeader(in);
/**
* find splitter by name
*/
final VepPredictionParser vepPredictionParser = new VepPredictionParserFactory().header(cah.header).get();
sortingcollection = SortingCollection.newInstance(KeyAndLine.class, new KeyAndLineCodec(), new KeyAndLineComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
sortingcollection.setDestructiveIteration(true);
// read variants
final SAMSequenceDictionaryProgress progess = new SAMSequenceDictionaryProgress(cah.header);
String line;
while ((line = in.readLine()) != null) {
final VariantContext ctx = progess.watch(cah.codec.decode(line));
// no check for ctx.ifFiltered here, we do this later.
for (final String key : this.getVariantKeys(vepPredictionParser, ctx)) {
sortingcollection.add(new KeyAndLine(key, line));
}
}
progess.finish();
sortingcollection.doneAdding();
LOG.info("creating zip " + outputFile);
fos = new FileOutputStream(outputFile);
zout = new ZipOutputStream(fos);
final File tmpReportFile = File.createTempFile("_tmp.", ".txt", writingSortingCollection.getTmpDirectories().get(0));
tmpReportFile.deleteOnExit();
pw = IOUtils.openFileForPrintWriter(tmpReportFile);
pw.println("#chrom\tstart\tend\tkey\tCount_Variants");
iter = sortingcollection.iterator();
final EqualRangeIterator<KeyAndLine> eqiter = new EqualRangeIterator<>(iter, new Comparator<KeyAndLine>() {
@Override
public int compare(final KeyAndLine o1, final KeyAndLine o2) {
return o1.key.compareTo(o2.key);
}
});
while (eqiter.hasNext()) {
final List<KeyAndLine> buffer = eqiter.next();
final KeyAndLine first = buffer.get(0);
LOG.info(first.key);
final List<VariantContext> variants = new ArrayList<>(buffer.size());
String contig = null;
int chromStart = Integer.MAX_VALUE;
int chromEnd = 0;
for (final KeyAndLine kal : buffer) {
final VariantContext ctx = cah.codec.decode(kal.ctx);
variants.add(ctx);
contig = ctx.getContig();
chromStart = Math.min(chromStart, ctx.getStart());
chromEnd = Math.max(chromEnd, ctx.getEnd());
}
pw.println(contig + "\t" + (chromStart - 1) + // -1 for bed compatibility
"\t" + chromEnd + "\t" + first.key + "\t" + variants.size());
// save vcf file
final ZipEntry ze = new ZipEntry(this.baseZipDir + "/" + first.key + ".vcf");
zout.putNextEntry(ze);
final VariantContextWriter out = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
final VCFHeader header2 = addMetaData(new VCFHeader(cah.header));
header2.addMetaDataLine(new VCFHeaderLine("VcfGeneSplitter.Name", String.valueOf(first.key)));
out.writeHeader(header2);
for (final VariantContext ctx : variants) {
out.add(ctx);
}
// yes because wrapped into IOUtils.encloseableOutputSream
out.close();
zout.closeEntry();
}
eqiter.close();
iter.close();
iter = null;
progess.finish();
LOG.info("saving report");
pw.flush();
pw.close();
final ZipEntry entry = new ZipEntry(this.baseZipDir + "/manifest.bed");
zout.putNextEntry(entry);
IOUtils.copyTo(tmpReportFile, zout);
zout.closeEntry();
zout.finish();
zout.close();
return RETURN_OK;
} catch (final Exception err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(iter);
if (sortingcollection != null)
sortingcollection.cleanup();
CloserUtil.close(in);
CloserUtil.close(fos);
CloserUtil.close(pw);
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class VcfGroupByPopulation method doVcfToVcf.
@Override
protected int doVcfToVcf(String inputName, VcfIterator vcfIn, VariantContextWriter out) {
try {
Reader r = IOUtils.openFileForBufferedReading(this.mappingFile);
parsePopulationMapping(r);
r.close();
VCFHeader header = vcfIn.getHeader();
Set<String> samplesInVcf = new HashSet<>(header.getSampleNamesInOrder());
this.sample2population.keySet().retainAll(samplesInVcf);
Map<String, Set<String>> population2samples = new HashMap<>();
for (String sample : this.sample2population.keySet()) {
String pop = this.sample2population.get(sample);
Set<String> samples = population2samples.get(pop);
if (samples == null) {
samples = new HashSet<>();
population2samples.put(pop, samples);
}
samples.add(sample);
}
for (String sample : header.getSampleNamesInOrder()) {
if (!this.sample2population.containsKey(sample)) {
throw new IOException("Sample " + sample + " not affected to a population");
}
}
Set<VCFHeaderLine> metaData = new LinkedHashSet<>();
for (VCFHeaderLine vhl : header.getMetaDataInInputOrder()) {
if (!(vhl instanceof VCFContigHeaderLine))
continue;
metaData.add(vhl);
}
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
/* FORMAT */
metaData.add(new VCFFormatHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
metaData.add(new VCFFormatHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
metaData.add(new VCFFormatHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
metaData.add(new VCFFormatHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
metaData.add(new VCFFormatHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
metaData.add(new VCFFormatHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
/* INFO */
metaData.add(new VCFInfoHeaderLine("NS", 1, VCFHeaderLineType.Integer, "Total Number of Samples"));
metaData.add(new VCFInfoHeaderLine("R", 1, VCFHeaderLineType.Integer, "Number of alleles REF (hom:=2,het:=1)"));
metaData.add(new VCFInfoHeaderLine("A", 1, VCFHeaderLineType.Integer, "Number of alleles ALT (hom:=2,het:=1)"));
metaData.add(new VCFInfoHeaderLine("UNC", 1, VCFHeaderLineType.Integer, "Number of NON-called samples"));
metaData.add(new VCFInfoHeaderLine("F", 1, VCFHeaderLineType.Float, "Allele Frequency A/(R+A)"));
metaData.add(new VCFInfoHeaderLine("DP", 1, VCFHeaderLineType.Integer, "Depth"));
metaData.add(new VCFFormatHeaderLine("GT", 1, VCFHeaderLineType.String, "Genotype"));
VCFHeader h2 = new VCFHeader(metaData, population2samples.keySet());
out.writeHeader(h2);
SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(vcfIn.getHeader());
while (vcfIn.hasNext()) {
VariantContext ctx = progress.watch(vcfIn.next());
VariantContextBuilder vcb = new VariantContextBuilder(ctx.getSource(), ctx.getContig(), ctx.getStart(), ctx.getEnd(), ctx.getAlleles());
if (ctx.hasID())
vcb.id(ctx.getID());
GCount count_ctx = new GCount();
List<Genotype> genotypes = new ArrayList<>(population2samples.size());
for (String pop : population2samples.keySet()) {
GCount count = new GCount();
Set<String> samples = population2samples.get(pop);
for (String sample : samples) {
Genotype g = ctx.getGenotype(sample);
count.watch(g);
}
GenotypeBuilder gb = new GenotypeBuilder(pop);
gb.attribute("NS", samples.size());
gb.attribute("R", count.R);
gb.attribute("A", count.A);
gb.attribute("UNC", count.uncalled);
if (count.R + count.A > 0) {
gb.attribute("F", (float) count.A / (float) (count.R + count.A));
}
if (count.dp >= 0) {
gb.attribute("DP", count.dp);
if (count_ctx.dp == -1)
count_ctx.dp = 0;
}
genotypes.add(gb.make());
count_ctx.R += count.R;
count_ctx.A += count.A;
count_ctx.uncalled += count.uncalled;
count_ctx.dp += count.dp;
}
vcb.attribute("R", count_ctx.R);
vcb.attribute("A", count_ctx.A);
vcb.attribute("UNC", count_ctx.uncalled);
if (count_ctx.R + count_ctx.A > 0) {
vcb.attribute("F", (float) count_ctx.A / (float) (count_ctx.R + count_ctx.A));
}
if (count_ctx.dp >= 0) {
vcb.attribute("DP", count_ctx.dp);
}
vcb.attribute("NS", this.sample2population.keySet().size());
vcb.genotypes(genotypes);
out.add(vcb.make());
}
progress.finish();
return 0;
} catch (Exception err) {
LOG.error(err);
return -1;
}
}
use of com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress in project jvarkit by lindenb.
the class LocalRealignReads method doWork.
@Override
public int doWork(List<String> args) {
if (this.faidxFile == null) {
LOG.error("REFerence file missing;");
return -1;
}
IndexedFastaSequenceFile indexedFastaSequenceFile = null;
SamReader samReader = null;
SAMFileWriter w = null;
SAMRecordIterator iter = null;
GenomicSequence genomicSequence = null;
LongestCommonSequence matrix = new LongestCommonSequence();
try {
indexedFastaSequenceFile = new IndexedFastaSequenceFile(this.faidxFile);
samReader = openSamReader(oneFileOrNull(args));
final SAMFileHeader header1 = samReader.getFileHeader();
final SAMFileHeader header2 = header1.clone();
header2.setSortOrder(SortOrder.unsorted);
w = this.writingBamArgs.setReferenceFile(faidxFile).openSAMFileWriter(outputFile, header2, true);
final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header1);
iter = samReader.iterator();
while (iter.hasNext()) {
final SAMRecord rec = progress.watch(iter.next());
if (rec.getReadUnmappedFlag() || rec.isSecondaryOrSupplementary() || rec.getReadFailsVendorQualityCheckFlag() || rec.getDuplicateReadFlag()) {
w.addAlignment(rec);
continue;
}
final Cigar cigar = rec.getCigar();
final int nCigarElement = cigar.numCigarElements();
if (nCigarElement < 2) {
w.addAlignment(rec);
continue;
}
final CigarElement ce5 = cigar.getCigarElement(0);
final CigarElement ce3 = cigar.getCigarElement(nCigarElement - 1);
if (!((ce3.getOperator().equals(CigarOperator.S) && ce3.getLength() >= MIN_ALIGN_LEN) || (ce5.getOperator().equals(CigarOperator.S) && ce5.getLength() >= MIN_ALIGN_LEN))) {
w.addAlignment(rec);
continue;
}
if (genomicSequence == null || !genomicSequence.getChrom().equals(rec.getReferenceName())) {
genomicSequence = new GenomicSequence(indexedFastaSequenceFile, rec.getReferenceName());
}
final CharSequence readseq = rec.getReadString();
/**
* short invert
*/
if (ce5.getOperator() == CigarOperator.S && ce5.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, 0, ce5.getLength());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP5' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName() + " " + rec.getCigarString());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP5' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}
*/
}
if (ce3.getOperator() == CigarOperator.S && ce3.getLength() >= MIN_ALIGN_LEN) {
CharSequence clipseq = new SubSequence(readseq, readseq.length() - ce3.getLength(), readseq.length());
CharSequence revcomp = new RevCompCharSequence(clipseq);
LongestCommonSequence.Hit hit = matrix.align(genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), revcomp, 0, revcomp.length());
if (hit.size() >= MIN_ALIGN_LEN) {
System.err.println("REVCOMP3' " + hit.getMatchingSequence() + " " + clipseq + " for " + readseq + " " + rec.getReadName());
}
/*
hit = matrix.align(
readseq, 0, readseq.length(),
revcomp,
0,
revcomp.length()
);
if(hit.size()>=MIN_ALIGN_LEN)
{
System.err.println("REVCOMP3' vs ITSELF: "+hit.getMatchingSequence()+" "+clipseq+" for "+readseq+" "+rec.getReadName()+" "+rec.getCigarString());
}*/
}
/* other */
List<LongestCommonSequence.Hit> hits = new ArrayList<>();
align(hits, matrix, genomicSequence, Math.max(0, rec.getUnclippedStart() - rec.getReadLength()), Math.min(rec.getUnclippedEnd() + rec.getReadLength(), genomicSequence.length()), readseq, 0, readseq.length(), -1);
if (hits.size() < 2000)
continue;
for (LongestCommonSequence.Hit hit : hits) {
int readstart = 0;
boolean in_M = false;
for (int i = 0; i < nCigarElement; ++i) {
final CigarElement ce = cigar.getCigarElement(i);
if (ce.getOperator().consumesReadBases()) {
int readend = readstart + ce.getLength();
if (ce.getOperator() == CigarOperator.M || ce.getOperator() == CigarOperator.X || ce.getOperator() == CigarOperator.EQ) {
if (!(hit.getEndY() <= readstart || readend <= hit.getStartY())) {
in_M = true;
break;
}
}
readstart = readend;
}
}
if (in_M)
continue;
int align_size = hit.size();
System.err.println(rec.getReadName() + " " + rec.getCigarString() + " " + rec.getAlignmentStart() + "-" + rec.getAlignmentEnd());
System.err.println("REF: " + hit.getStartX() + "-" + hit.getEndX());
System.err.println("READ " + hit.getStartY() + "-" + hit.getEndY());
System.err.print("REF :");
for (int i = 0; i < align_size; ++i) {
System.err.print(genomicSequence.charAt(hit.getStartX() + i));
}
System.err.println();
System.err.print("READ :");
for (int i = 0; i < align_size; ++i) {
System.err.print(readseq.charAt(hit.getStartY() + i));
}
System.err.println();
}
System.err.println();
}
progress.finish();
return RETURN_OK;
} catch (Exception e) {
return wrapException(e);
} finally {
genomicSequence = null;
CloserUtil.close(iter);
CloserUtil.close(samReader);
CloserUtil.close(w);
CloserUtil.close(indexedFastaSequenceFile);
matrix = null;
}
}
Aggregations