use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class KnownDeletion method doWork.
@Override
public int doWork(final List<String> args) {
if (this.extendDistance <= 0) {
LOG.error("bad extend factor " + this.extendDistance);
return -1;
}
final Function<SAMRecord, Integer> mateEnd = REC -> SAMUtils.getMateCigar(REC) != null ? SAMUtils.getMateAlignmentEnd(REC) : REC.getMateAlignmentStart();
PrintWriter pw = null;
SAMFileWriter sfw = null;
try {
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
final SamReaderFactory srf = SamReaderFactory.makeDefault().referenceSequence(this.faidx).validationStringency(ValidationStringency.LENIENT);
final List<String> filenames = IOUtils.unrollStrings2018(args);
if (this.bamOut != null && !filenames.isEmpty()) {
SAMSequenceDictionary theDict = null;
final Set<String> samples = new TreeSet<>();
for (final String filename : filenames) {
final SamInputResource sri = SamInputResource.of(filename);
try (final SamReader samReader = srf.open(sri)) {
final SAMFileHeader header = samReader.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
if (theDict == null) {
theDict = dict;
} else if (!SequenceUtil.areSequenceDictionariesEqual(theDict, dict)) {
throw new JvarkitException.DictionariesAreNotTheSame(theDict, dict);
}
final String sampleName = header.getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(filename);
samples.add(sampleName);
}
}
final SAMFileHeader header = new SAMFileHeader(theDict);
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
JVarkitVersion.getInstance().addMetaData(this, header);
for (final String sn : samples) {
final SAMReadGroupRecord srg = new SAMReadGroupRecord(sn);
srg.setSample(sn);
header.addReadGroup(srg);
}
final SAMFileWriterFactory swf = new SAMFileWriterFactory();
sfw = swf.makeSAMOrBAMWriter(header, true, this.bamOut);
}
for (final String filename : IOUtils.unrollStrings2018(args)) {
final SamInputResource sri = SamInputResource.of(filename);
try (final SamReader samReader = srf.open(sri)) {
final SAMFileHeader header = samReader.getFileHeader();
final SAMSequenceDictionary dict = header.getSequenceDictionary();
final String sampleName = header.getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(filename);
final Function<String, Optional<SimpleInterval>> parser = IntervalParserFactory.newInstance().dictionary(dict).make();
final SimpleInterval interval = parser.apply(this.regionStr).orElseThrow(IntervalParserFactory.exception(this.regionStr));
final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(interval.getContig()));
final int tid = ssr.getSequenceIndex();
final QueryInterval qi1 = new QueryInterval(tid, Math.max(0, interval.getStart() - this.extendDistance), Math.min(interval.getStart() + this.extendDistance, ssr.getSequenceLength()));
final QueryInterval qi2 = new QueryInterval(tid, Math.max(0, interval.getEnd() - this.extendDistance), Math.min(interval.getEnd() + this.extendDistance, ssr.getSequenceLength()));
if (CoordMath.overlaps(qi1.start, qi1.end, qi2.start, qi2.end)) {
LOG.error("after extending the boundaries with " + this.extendDistance + ". They both overlap...");
return -1;
}
long count_disc = 0L;
long count_split = 0L;
long count_del = 0L;
final QueryInterval[] intervals = QueryInterval.optimizeIntervals(new QueryInterval[] { qi1, qi2 });
try (final CloseableIterator<SAMRecord> iter = samReader.query(intervals, false)) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.samRecordFilter.filterOut(rec))
continue;
if (rec.getStart() <= qi1.end && rec.getEnd() >= qi2.start) {
count_del++;
if (sfw != null) {
rec.setAttribute("RG", sampleName);
sfw.addAlignment(rec);
}
continue;
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getMateReferenceIndex().equals(tid) && ((CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi1.start, qi1.end) && CoordMath.overlaps(rec.getMateAlignmentStart(), mateEnd.apply(rec), qi2.start, qi2.end)) || (CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi2.start, qi2.end) && CoordMath.overlaps(rec.getMateAlignmentStart(), mateEnd.apply(rec), qi1.start, qi1.end)))) {
count_disc++;
if (sfw != null) {
rec.setAttribute("RG", sampleName);
sfw.addAlignment(rec);
}
continue;
}
for (final SAMRecord rec2 : SAMUtils.getOtherCanonicalAlignments(rec)) {
if (rec2.getReferenceIndex().equals(tid) && ((CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi1.start, qi1.end) && CoordMath.overlaps(rec2.getStart(), rec2.getEnd(), qi2.start, qi2.end)) || (CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi2.start, qi2.end) && CoordMath.overlaps(rec2.getStart(), rec2.getEnd(), qi1.start, qi1.end)))) {
count_split++;
if (sfw != null) {
rec.setAttribute("RG", sampleName);
sfw.addAlignment(rec);
}
break;
}
}
}
}
pw.println(filename + "\t" + sampleName + "\t" + this.regionStr + "\t" + count_disc + "\t" + count_split + "\t" + count_del + "\t" + (count_del + count_disc + count_split));
}
}
pw.flush();
if (sfw != null)
sfw.close();
sfw = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfw);
}
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class SamShortInvertion method dump.
private void dump(final SAMSequenceDictionary dict, final IntervalTreeMap<List<Arc>> database, final VariantContextWriter vcw, final Set<String> samples, final Integer before) {
if (this.debug)
LOG.debug("dump");
final Allele REF = Allele.create("N", true);
final Allele SPLIT = Allele.create("<INV>", false);
final Comparator<Locatable> cmp = new ContigDictComparator(dict).createLocatableComparator();
final List<SimpleInterval> intervals = database.keySet().stream().filter(R -> (before == null ? true : R.getEnd() > before.intValue())).map(R -> new SimpleInterval(R)).sorted(cmp).collect(Collectors.toList());
for (final SimpleInterval interval0 : intervals) {
final List<Arc> arcs = database.getOverlapping(interval0).stream().flatMap(L -> L.stream()).filter(A -> !A.consummed).filter(A -> testOverlapping(interval0, A)).collect(Collectors.toList());
if (arcs.isEmpty())
continue;
if (!keep_flag)
arcs.forEach(A -> A.consummed = true);
int maxdp = 0;
final VariantContextBuilder vcb = new VariantContextBuilder();
final Set<Allele> alleles = new HashSet<>();
alleles.add(REF);
final List<Genotype> genotypes = new ArrayList<>(samples.size());
vcb.chr(dict.getSequence(arcs.get(0).tid).getSequenceName());
final int chromStart = arcs.stream().mapToInt(A -> A.chromStart).min().getAsInt();
vcb.start(chromStart);
final int chromEnd = arcs.stream().mapToInt(A -> A.chromEnd).max().getAsInt();
vcb.stop(chromEnd);
vcb.attribute(VCFConstants.END_KEY, chromEnd);
vcb.attribute("SVLEN", CoordMath.getLength(chromStart, chromEnd));
int depth = 0;
final Set<String> gotSamples = new TreeSet<>();
for (final String sample : samples) {
final List<Arc> sampleArcs = arcs.stream().filter(A -> A.sample.equals(sample)).collect(Collectors.toList());
if (sampleArcs.isEmpty()) {
genotypes.add(GenotypeBuilder.createMissing(sample, 2));
} else {
final GenotypeBuilder gb = new GenotypeBuilder(sample);
alleles.add(SPLIT);
gb.alleles(Arrays.asList(REF, SPLIT));
final int countCat1 = (int) sampleArcs.stream().filter(A -> A.type == SUPPORTING_LEFT).count();
final int countCat2 = (int) sampleArcs.stream().filter(A -> A.type == SUPPORTING_RIGHT).count();
gb.DP(countCat1 + countCat2);
gb.attribute("N5", countCat1);
gb.attribute("N3", countCat2);
maxdp = Math.max(maxdp, countCat1 + countCat2);
depth += countCat1 + countCat2;
genotypes.add(gb.make());
gotSamples.add(sample);
}
}
if (depth <= this.min_supporting_reads)
continue;
vcb.genotypes(genotypes);
vcb.alleles(alleles);
vcb.attribute(VCFConstants.DEPTH_KEY, depth);
vcb.attribute("NSAMPLES", gotSamples.size());
vcb.attribute("SAMPLES", new ArrayList<>(gotSamples));
vcb.attribute(VCFConstants.SVTYPE, "INV");
vcb.attribute("DPMAX", maxdp);
vcw.add(vcb.make());
}
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VCFBed method doVcfToVcf.
@Override
protected int doVcfToVcf(final String inputName, final VCFIterator vcfin, final VariantContextWriter out) {
final VCFHeader header = vcfin.getHeader();
final VCFHeader h2 = new VCFHeader(header);
final VCFInfoHeaderLine infoHeader = new VCFInfoHeaderLine(this.infoName, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "metadata added from " + this.inputBedFile + " . Format was " + this.formatPattern.replaceAll("[\"\'\\\\]", " "));
final VCFInfoHeaderLine infoCountBasicOverlap = new VCFInfoHeaderLine(this.infoName + "_N", 1, VCFHeaderLineType.Integer, "Number of raw overlap within distance " + this.within_distance + "bp.");
final VCFInfoHeaderLine infoCountFinerOverlap = new VCFInfoHeaderLine(this.infoName + "_C", 1, VCFHeaderLineType.Integer, "Number of finer overlap within distance " + inputBedFile + " and overlap bed:" + this.min_overlap_bed_fraction + " and overlap vcf: " + this.min_overlap_vcf_fraction);
if (infoCountBasicOverlap != null)
h2.addMetaDataLine(infoCountBasicOverlap);
if (infoCountFinerOverlap != null)
h2.addMetaDataLine(infoCountFinerOverlap);
h2.addMetaDataLine(infoHeader);
JVarkitVersion.getInstance().addMetaData(this, h2);
out.writeHeader(h2);
while (vcfin.hasNext()) {
final VariantContext ctx = vcfin.next();
final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
vcb.rmAttribute(infoCountBasicOverlap.getID());
vcb.rmAttribute(infoCountFinerOverlap.getID());
vcb.rmAttribute(infoHeader.getID());
if (this.ignoreFILTERed && ctx.isFiltered()) {
out.add(vcb.make());
continue;
}
final String normalizedContig = this.contigNameConverter.apply(ctx.getContig());
if (StringUtil.isBlank(normalizedContig)) {
out.add(ctx);
continue;
}
final Optional<? extends Locatable> bndOpt = this.disable_bnd_eval ? Optional.empty() : Breakend.parseInterval(ctx);
final SimpleInterval theInterval;
if (bndOpt.isPresent()) {
theInterval = new SimpleInterval(normalizedContig, bndOpt.get().getStart(), bndOpt.get().getEnd());
} else {
theInterval = new SimpleInterval(normalizedContig, ctx.getStart(), ctx.getEnd());
}
final SimpleInterval extendedInterval;
if (this.within_distance <= 0) {
extendedInterval = theInterval;
} else {
extendedInterval = new SimpleInterval(theInterval.getContig(), Math.max(1, theInterval.getStart() + this.within_distance), theInterval.getEnd() + this.within_distance);
}
int count_basic_overlap = 0;
int count_finer_overlap = 0;
final Set<String> annotations = new LinkedHashSet<>();
if (this.intervalTreeMap != null) {
for (final Set<BedLine> bedLines : this.intervalTreeMap.getOverlapping(extendedInterval)) {
for (final BedLine bedLine : bedLines) {
count_basic_overlap++;
if (!testFinerIntersection(theInterval, bedLine))
continue;
count_finer_overlap++;
final String newannot = this.bedJexlToString.apply(new BedJEXLContext(bedLine, ctx));
if (!StringUtil.isBlank(newannot)) {
annotations.add(VCFUtils.escapeInfoField(newannot));
}
}
}
} else {
try (CloseableIterator<BedLine> iter = this.bedReader.iterator(theInterval.getContig(), Math.max(0, theInterval.getStart() - 1), theInterval.getEnd() + 1)) {
while (iter.hasNext()) {
final BedLine bedLine = iter.next();
if (!theInterval.contigsMatch(bedLine))
continue;
if (!extendedInterval.withinDistanceOf(bedLine, this.within_distance))
continue;
count_basic_overlap++;
if (!testFinerIntersection(theInterval, bedLine))
continue;
count_finer_overlap++;
final String newannot = this.bedJexlToString.apply(new BedJEXLContext(bedLine, ctx));
if (!StringUtil.isBlank(newannot))
annotations.add(VCFUtils.escapeInfoField(newannot));
}
} catch (final IOException ioe) {
LOG.error(ioe);
throw new RuntimeIOException(ioe);
}
}
if (!annotations.isEmpty()) {
vcb.attribute(infoHeader.getID(), annotations.toArray());
}
vcb.attribute(infoCountBasicOverlap.getID(), count_basic_overlap);
vcb.attribute(infoCountFinerOverlap.getID(), count_finer_overlap);
out.add(vcb.make());
}
out.close();
return 0;
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfGnomad method findOverlapping.
/**
* find matching variant in tabix file, use a buffer to avoid multiple random accesses
*/
private List<VariantContext> findOverlapping(final VariantContext userVariantCtx) {
final String normContig = this.ctgNameConverter.apply(userVariantCtx.getContig());
final Locatable loc;
if (StringUtil.isBlank(normContig))
return Collections.emptyList();
if (normContig.equals(userVariantCtx.getContig())) {
loc = userVariantCtx;
} else {
loc = new SimpleInterval(normContig, userVariantCtx.getStart(), userVariantCtx.getEnd());
}
final List<VariantContext> list = new ArrayList<>();
try (CloseableIterator<VariantContext> iter = this.gnomadReader.query(loc)) {
while (iter.hasNext()) {
final VariantContext ctx = iter.next();
list.add(ctx);
}
}
return list;
}
use of com.github.lindenb.jvarkit.samtools.util.SimpleInterval in project jvarkit by lindenb.
the class VcfGnomadExomeVsGenome method isVariantIn.
boolean isVariantIn(final VariantContext ctx, final BufferedVCFReader reader) {
final String ctg = this.ctgNameConverter.apply(ctx.getContig());
if (StringUtils.isBlank(ctg))
return false;
try (CloseableIterator<VariantContext> iterE = reader.query(new SimpleInterval(ctg, ctx.getStart(), ctx.getEnd()))) {
while (iterE.hasNext()) {
final VariantContext ctx2 = iterE.next();
if (ctx.getStart() != ctx2.getStart())
continue;
if (!ctx.getReference().equals(ctx2.getReference()))
continue;
final Set<Allele> alt2 = ctx2.getAlternateAlleles().stream().filter(A -> AcidNucleics.isATGC(A)).collect(Collectors.toSet());
if (ctx.getAlternateAlleles().stream().anyMatch(A -> alt2.contains(A)))
return true;
}
}
return false;
}
Aggregations