use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class VcfStrechToSvg method run.
private void run(final ArchiveFactory archive, final BedLine bed, final VCFHeader header, final VCFReader in, final PrintWriter manifest) {
LOG.info("processing " + bed);
final Set<String> limitSamples;
if (StringUtils.isBlank(this.sampleStr)) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
} else if (this.sampleStr.startsWith("^")) {
limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
limitSamples.removeAll(Arrays.stream(this.sampleStr.substring(1).split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
} else {
limitSamples = Arrays.stream(this.sampleStr.split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
}
final SAMSequenceDictionary dict = header.getSequenceDictionary();
try (CloseableIterator<VariantContext> iter = in.query(bed)) {
final List<VariantSet> L = iter.stream().filter(V -> acceptVariant(V)).map(V -> new VariantSet(V)).collect(Collectors.toCollection(ArrayList::new));
if (L.isEmpty()) {
LOG.warn("No valid variant found for \"" + bed + "\"");
return;
}
int i = 0;
while (i + 1 < L.size()) {
if (L.get(i).withinDistanceOf(L.get(i + 1), this.withinDistance)) {
L.get(i).variants.addAll(L.get(i + 1).variants);
L.remove(i + 1);
} else {
i++;
}
}
final int margin_left = 50;
final int margin_right = 10;
final double drawingAreaWidth = image_width_pixel - (margin_left + margin_right);
final int intervalLength = L.stream().mapToInt(V -> V.getLengthOnReference()).sum();
double x = 0;
for (i = 0; i < L.size(); i++) {
L.get(i).x = x;
x += (L.get(i).getLengthOnReference() / (double) intervalLength) * drawingAreaWidth;
}
for (i = 0; i < L.size(); i++) {
L.get(i).width = (i + 1 < L.size() ? L.get(i + 1).x : drawingAreaWidth) - L.get(i).x;
}
try {
final DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
final DocumentBuilder dom = db.newDocumentBuilder();
this.document = dom.newDocument();
final Element svgRoot = element("svg");
this.document.appendChild(svgRoot);
final String mainTitleStr = SequenceDictionaryUtils.getBuildName(dict).orElse("") + " " + new SimpleInterval(bed).toNiceString() + " length:" + StringUtils.niceInt(bed.getLengthOnReference());
/* SVG title */
{
final Element title = element("title");
svgRoot.appendChild(title);
title.appendChild(text(mainTitleStr));
}
/* SVG style */
{
final String gtopacity = this.dynamicParams.getOrDefault("gt.opacity", "0.7");
final Element style = element("style");
svgRoot.appendChild(style);
style.appendChild(text(".maintitle {text-anchor:middle;fill:blue} " + ".vc {stroke-width:0.5px;} " + ".transcript {stroke:black;stroke-width:1px;opacity:1;}" + ".exon {stroke:black;stroke-width:0.5px;fill:blue;opacity:1;}" + ".sample {fill:blue;font-size:7px;} " + ".samplelabel {stroke:gray;stroke-width:0.5px;font-size:" + this.dynamicParams.getOrDefault("sample.fontsize", "7") + "px;}\n" + ".coverage { fill:gray; stroke:yellow;opacity:0.2;} " + ".frame { fill:none; stroke: darkgray;} " + ".area0 {fill:white;}\n" + ".area1 {fill:floralwhite;}\n" + "circle.HOM_REF {fill:green;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HET {fill:blue;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HOM_VAR {fill:red;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "a {cursor: pointer;}\n"));
}
/* desc */
{
final Element descr = element("desc");
svgRoot.appendChild(descr);
descr.appendChild(text("Author: Pierre Lindenbaum\n" + JVarkitVersion.getInstance().getCompilationDate() + "\n" + JVarkitVersion.getInstance().getGitHash()));
}
// main title
{
final Element gtitle = element("text", mainTitleStr);
gtitle.setAttribute("class", "maintitle");
gtitle.setAttribute("x", format(this.image_width_pixel / 2.0));
gtitle.setAttribute("y", "15");
svgRoot.appendChild(wrapLoc(gtitle, bed));
}
int margin_top = 50;
double y = margin_top;
final double min_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r1", "1"));
final double max_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r2", "7"));
final Element main_g = element("g");
svgRoot.appendChild(main_g);
/**
* plot genes
*/
if (!this.all_genes.isEmpty()) {
final double transcript_height = 5;
final double exon_height = (transcript_height - 1);
final double save_y = y;
final Element g_genes = element("g");
g_genes.setAttribute("transform", "translate(" + margin_left + ",0)");
main_g.appendChild(g_genes);
/* loop over each vset */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
// find transcript in this vset
final List<Transcript> transcripts = this.all_genes.getOverlapping(vset).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(vset)).collect(Collectors.toList());
if (transcripts.isEmpty())
continue;
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_genes.appendChild(g_vset);
// y in this vset
double y2 = save_y;
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
/* loop over transcripts */
for (final Transcript tr : transcripts) {
final Element g_tr = element("g");
g_tr.setAttribute("transform", "translate(0," + format(y2) + ")");
g_vset.appendChild(g_tr);
final Element line = element("line");
line.setAttribute("class", "transcript");
line.setAttribute("x1", format(Math.max(0, base2pixel.applyAsDouble(tr.getStart()))));
line.setAttribute("y1", format(transcript_height / 2.0));
line.setAttribute("x2", format(Math.min(vset.width, base2pixel.applyAsDouble(tr.getEnd()))));
line.setAttribute("y2", format(transcript_height / 2.0));
line.appendChild(element("title", tr.getId()));
g_tr.appendChild(wrapLoc(line, tr));
/* loop over exons */
for (final Exon exon : tr.getExons()) {
if (!exon.overlaps(vset))
continue;
final Element exRect = element("rect");
exRect.setAttribute("class", "exon");
final double x_start = Math.max(0, base2pixel.applyAsDouble(exon.getStart()));
exRect.setAttribute("x", format(x_start));
exRect.setAttribute("y", format(transcript_height / 2.0 - exon_height / 2.0));
final double x_end = Math.min(vset.width, base2pixel.applyAsDouble(exon.getEnd()));
exRect.setAttribute("width", format(x_end - x_start));
exRect.setAttribute("height", format(exon_height));
exRect.appendChild(element("title", exon.getName()));
g_tr.appendChild(wrapLoc(exRect, exon));
}
y2 += transcript_height + 0.5;
}
y = Math.max(y, y2);
}
y++;
}
final double sample_height = Double.parseDouble(this.dynamicParams.getOrDefault("sample.height", "25"));
final double sample_height2 = sample_height - (max_circle_radius * 2.0);
int space_between_samples = 2;
int got_n_samples = 0;
for (final String sn : header.getSampleNamesInOrder()) {
if (!limitSamples.contains(sn))
continue;
boolean got_this_sample = false;
final Element g_sample = element("g");
g_sample.setAttribute("transform", "translate(" + margin_left + "," + format(y) + ")");
/* get coverage */
final int maxCoverage;
if (this.sample2bam.containsKey(sn)) {
final CoverageFactory coverageFactory = new CoverageFactory();
try (SamReader sr = this.openSamReader(this.sample2bam.get(sn))) {
/* loop over each variant set */
for (final VariantSet vset : L) {
vset.coverage = coverageFactory.getSimpleCoverage(sr, vset, sn);
}
}
maxCoverage = L.stream().flatMapToInt(V -> V.coverage.stream()).max().orElse(0);
} else {
maxCoverage = 0;
for (final VariantSet vset : L) {
vset.coverage = null;
}
}
/* loop over each variant set */
for (i = 0; i < L.size(); i++) {
final VariantSet vset = L.get(i);
final Element g_vset = element("g");
g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
g_sample.appendChild(g_vset);
/* convert base to pixel */
final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
// plot set length
final Element rect = element("rect");
rect.setAttribute("class", "area" + (i % 2));
rect.setAttribute("x", "0");
rect.setAttribute("y", "0");
rect.setAttribute("width", format(vset.width));
rect.setAttribute("height", format(sample_height));
if (!remove_tooltip)
rect.appendChild(element("title", vset.toString()));
g_vset.appendChild(rect);
// plot coverage
if (maxCoverage > 0 && this.sample2bam.containsKey(sn)) {
final double[] scaled = vset.coverage.scaleAverage((int) vset.width);
final StringBuilder sb = new StringBuilder();
sb.append("0," + sample_height);
for (int t = 0; t < scaled.length; t++) {
if (t > 1 && t + 1 < scaled.length && format(scaled[t - 1]).equals(format(scaled[t + 1])) && format(scaled[t - 1]).equals(format(scaled[t])))
continue;
sb.append(" ").append(t).append(",");
sb.append(format(sample_height * (1.0 - scaled[t] / maxCoverage)));
}
sb.append(" " + format(vset.width) + "," + sample_height);
final Element polyline = element("polyline");
polyline.setAttribute("class", "coverage");
polyline.setAttribute("points", sb.toString());
g_vset.appendChild(polyline);
vset.coverage = null;
}
// plot vertical line if colorTag defined
if (!StringUtils.isBlank(this.colorTag)) {
for (final VariantContext vc : vset.variants) {
if (!vc.hasAttribute(this.colorTag))
continue;
final String cssColor = vc.getAttributeAsString(this.colorTag, "");
if (StringUtils.isBlank(cssColor))
continue;
final double x0 = base2pixel.applyAsDouble(vc.getStart());
final Element line = element("line");
line.setAttribute("class", "vc");
line.setAttribute("style", "stroke:" + cssColor);
line.setAttribute("x1", format(x0));
line.setAttribute("y1", "0");
line.setAttribute("x2", format(x0));
line.setAttribute("y2", format(sample_height));
g_vset.appendChild(line);
}
}
// print all variants in this vcfset for this sample
for (final VariantContext vc : vset.variants) {
final Genotype gt = vc.getGenotype(sn);
if (gt.isNoCall())
continue;
if (hide_hom_ref && gt.isHomRef())
continue;
if (gt.hasGQ() && gt.getGQ() < this.minGQ)
continue;
final OptionalDouble alt_ratio = getAltRatio(gt);
if (!alt_ratio.isPresent())
continue;
final OptionalDouble af = getAF(vc);
final double circle_radius = min_circle_radius + (max_circle_radius - min_circle_radius) * (1.0 - af.orElse(1.0));
// HOMREF=0; HET =0.5; HOMVAR = 1;
final double gtx = base2pixel.applyAsDouble(vc.getStart());
final double gty = sample_height - (sample_height2 * alt_ratio.getAsDouble() + (sample_height - sample_height2) / 2.0);
final Element circle = element("circle");
circle.setAttribute("class", gt.getType().name());
circle.setAttribute("cx", format(gtx));
circle.setAttribute("cy", format(gty));
circle.setAttribute("r", format(circle_radius));
if (!remove_tooltip)
circle.appendChild(element("title", vc.getStart() + " " + (vc.hasID() ? vc.getID() : "") + " " + vc.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/")) + " " + gt.getType().name() + " AF=" + format(af.orElse(-1))));
g_vset.appendChild(wrapLoc(circle, vc));
got_this_sample = true;
}
}
final Element frame_sample = element("rect");
frame_sample.setAttribute("class", "frame");
frame_sample.setAttribute("x", "0");
frame_sample.setAttribute("y", "0");
frame_sample.setAttribute("width", format(drawingAreaWidth));
frame_sample.setAttribute("height", format(sample_height));
g_sample.appendChild(frame_sample);
final Element label = element("text", sn + (maxCoverage == 0 ? "" : " Max Cov. " + maxCoverage));
label.setAttribute("class", "samplelabel");
label.setAttribute("x", "0");
label.setAttribute("y", "0");
// label.setAttribute("transform", "translate("+format(-10)+","+0+") rotate(90) ");
label.setAttribute("transform", "translate(12,12)");
if (!remove_tooltip)
label.appendChild(element("title", sn));
g_sample.appendChild(label);
if (got_this_sample) {
got_n_samples++;
main_g.appendChild(g_sample);
y += sample_height + space_between_samples;
} else {
LOG.warn("no valid data for sample " + sn + " in " + bed);
}
}
// remove extra sample space
y -= space_between_samples;
svgRoot.setAttribute("width", format(this.image_width_pixel + 1));
svgRoot.setAttribute("height", format(y + 1));
if (got_n_samples == 0) {
LOG.info("no sample/genotype found for " + bed);
return;
}
// save
final Transformer tr = TransformerFactory.newInstance().newTransformer();
final String filename = bed.getContig() + "_" + bed.getStart() + "_" + bed.getEnd() + ".svg" + (this.compressed_svg ? ".gz" : "");
LOG.info("writing " + filename);
if (this.compressed_svg) {
try (final OutputStream pw = archive.openOuputStream(filename)) {
try (GZIPOutputStream gzout = new GZIPOutputStream(pw)) {
tr.transform(new DOMSource(this.document), new StreamResult(gzout));
gzout.finish();
gzout.flush();
}
pw.flush();
}
} else {
try (final PrintWriter pw = archive.openWriter(filename)) {
tr.transform(new DOMSource(this.document), new StreamResult(pw));
pw.flush();
}
}
manifest.print(bed.getContig());
manifest.print("\t");
manifest.print(bed.getStart() - 1);
manifest.print("\t");
manifest.print(bed.getEnd());
manifest.print("\t");
manifest.print(filename);
manifest.println();
} catch (final Throwable err) {
throw new RuntimeException(err);
} finally {
this.document = null;
}
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class KnownDeletion method doWork.
@Override
public int doWork(final List<String> args) {
if (this.extendDistance <= 0) {
LOG.error("bad extend factor " + this.extendDistance);
return -1;
}
final Function<SAMRecord, Integer> mateEnd = REC -> SAMUtils.getMateCigar(REC) != null ? SAMUtils.getMateAlignmentEnd(REC) : REC.getMateAlignmentStart();
PrintWriter pw = null;
SAMFileWriter sfw = null;
try {
pw = super.openPathOrStdoutAsPrintWriter(this.outputFile);
final SamReaderFactory srf = SamReaderFactory.makeDefault().referenceSequence(this.faidx).validationStringency(ValidationStringency.LENIENT);
final List<String> filenames = IOUtils.unrollStrings2018(args);
if (this.bamOut != null && !filenames.isEmpty()) {
SAMSequenceDictionary theDict = null;
final Set<String> samples = new TreeSet<>();
for (final String filename : filenames) {
final SamInputResource sri = SamInputResource.of(filename);
try (final SamReader samReader = srf.open(sri)) {
final SAMFileHeader header = samReader.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(header);
if (theDict == null) {
theDict = dict;
} else if (!SequenceUtil.areSequenceDictionariesEqual(theDict, dict)) {
throw new JvarkitException.DictionariesAreNotTheSame(theDict, dict);
}
final String sampleName = header.getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(filename);
samples.add(sampleName);
}
}
final SAMFileHeader header = new SAMFileHeader(theDict);
header.setSortOrder(SAMFileHeader.SortOrder.unsorted);
JVarkitVersion.getInstance().addMetaData(this, header);
for (final String sn : samples) {
final SAMReadGroupRecord srg = new SAMReadGroupRecord(sn);
srg.setSample(sn);
header.addReadGroup(srg);
}
final SAMFileWriterFactory swf = new SAMFileWriterFactory();
sfw = swf.makeSAMOrBAMWriter(header, true, this.bamOut);
}
for (final String filename : IOUtils.unrollStrings2018(args)) {
final SamInputResource sri = SamInputResource.of(filename);
try (final SamReader samReader = srf.open(sri)) {
final SAMFileHeader header = samReader.getFileHeader();
final SAMSequenceDictionary dict = header.getSequenceDictionary();
final String sampleName = header.getReadGroups().stream().map(R -> R.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(filename);
final Function<String, Optional<SimpleInterval>> parser = IntervalParserFactory.newInstance().dictionary(dict).make();
final SimpleInterval interval = parser.apply(this.regionStr).orElseThrow(IntervalParserFactory.exception(this.regionStr));
final SAMSequenceRecord ssr = Objects.requireNonNull(dict.getSequence(interval.getContig()));
final int tid = ssr.getSequenceIndex();
final QueryInterval qi1 = new QueryInterval(tid, Math.max(0, interval.getStart() - this.extendDistance), Math.min(interval.getStart() + this.extendDistance, ssr.getSequenceLength()));
final QueryInterval qi2 = new QueryInterval(tid, Math.max(0, interval.getEnd() - this.extendDistance), Math.min(interval.getEnd() + this.extendDistance, ssr.getSequenceLength()));
if (CoordMath.overlaps(qi1.start, qi1.end, qi2.start, qi2.end)) {
LOG.error("after extending the boundaries with " + this.extendDistance + ". They both overlap...");
return -1;
}
long count_disc = 0L;
long count_split = 0L;
long count_del = 0L;
final QueryInterval[] intervals = QueryInterval.optimizeIntervals(new QueryInterval[] { qi1, qi2 });
try (final CloseableIterator<SAMRecord> iter = samReader.query(intervals, false)) {
while (iter.hasNext()) {
final SAMRecord rec = iter.next();
if (rec.getReadUnmappedFlag())
continue;
if (this.samRecordFilter.filterOut(rec))
continue;
if (rec.getStart() <= qi1.end && rec.getEnd() >= qi2.start) {
count_del++;
if (sfw != null) {
rec.setAttribute("RG", sampleName);
sfw.addAlignment(rec);
}
continue;
}
if (rec.getReadPairedFlag() && !rec.getMateUnmappedFlag() && rec.getMateReferenceIndex().equals(tid) && ((CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi1.start, qi1.end) && CoordMath.overlaps(rec.getMateAlignmentStart(), mateEnd.apply(rec), qi2.start, qi2.end)) || (CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi2.start, qi2.end) && CoordMath.overlaps(rec.getMateAlignmentStart(), mateEnd.apply(rec), qi1.start, qi1.end)))) {
count_disc++;
if (sfw != null) {
rec.setAttribute("RG", sampleName);
sfw.addAlignment(rec);
}
continue;
}
for (final SAMRecord rec2 : SAMUtils.getOtherCanonicalAlignments(rec)) {
if (rec2.getReferenceIndex().equals(tid) && ((CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi1.start, qi1.end) && CoordMath.overlaps(rec2.getStart(), rec2.getEnd(), qi2.start, qi2.end)) || (CoordMath.overlaps(rec.getStart(), rec.getEnd(), qi2.start, qi2.end) && CoordMath.overlaps(rec2.getStart(), rec2.getEnd(), qi1.start, qi1.end)))) {
count_split++;
if (sfw != null) {
rec.setAttribute("RG", sampleName);
sfw.addAlignment(rec);
}
break;
}
}
}
}
pw.println(filename + "\t" + sampleName + "\t" + this.regionStr + "\t" + count_disc + "\t" + count_split + "\t" + count_del + "\t" + (count_del + count_disc + count_split));
}
}
pw.flush();
if (sfw != null)
sfw.close();
sfw = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(sfw);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class SamInversions method doWork.
@Override
public int doWork(final List<String> args) {
if (this.max_distance < 100) {
LOG.error("max size inversion must be >=100");
return -1;
}
VariantContextWriter vcw = null;
try {
final Allele REF = Allele.create("N", true);
final Allele INV = Allele.create("<INV>", false);
final String input = super.oneFileOrNull(args);
final Path inputPath = input == null ? null : Paths.get(input);
try (SamReader samReader = super.createSamReaderFactory().validationStringency(ValidationStringency.LENIENT).referenceSequence(this.referenceFaidx).open(input == null ? SamInputResource.of(stdin()) : SamInputResource.of(inputPath))) {
final SAMFileHeader samHeader = samReader.getFileHeader();
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(samHeader);
final String sampleName = samHeader.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).findFirst().orElse(inputPath == null ? "SAMPLE" : IOUtils.getFilenameWithoutCommonSuffixes(inputPath));
final ToIntFunction<SAMRecord> mateEnd = REC -> SAMUtils.getMateCigar(REC) != null ? SAMUtils.getMateAlignmentEnd(REC) : REC.getMateAlignmentStart();
final ToIntFunction<SAMRecord> recordLen = REC -> {
final int start = Math.min(REC.getStart(), REC.getMateAlignmentStart());
final int end = Math.max(REC.getEnd(), mateEnd.applyAsInt(REC));
return CoordMath.getLength(start, end);
};
final QueryInterval[] queryIntervals = this.intervallistProvider == null ? null : this.intervallistProvider.dictionary(dict).optimizedQueryIntervals();
final Set<VCFHeaderLine> meta = new HashSet<>();
VCFStandardHeaderLines.addStandardFormatLines(meta, true, VCFConstants.GENOTYPE_KEY, VCFConstants.DEPTH_KEY);
VCFStandardHeaderLines.addStandardInfoLines(meta, true, VCFConstants.DEPTH_KEY, VCFConstants.END_KEY);
meta.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
meta.add(new VCFInfoHeaderLine("SVTYPE", 1, VCFHeaderLineType.String, "Structural variant type"));
final VCFHeader header = new VCFHeader(meta, Collections.singleton(sampleName));
JVarkitVersion.getInstance().addMetaData(this, header);
header.setSequenceDictionary(dict);
vcw = this.writingVariants.dictionary(dict).open(this.outputFile);
vcw.writeHeader(header);
final Predicate<SAMRecord> acceptRecord = rec -> {
if (!SAMRecordDefaultFilter.accept(rec, this.mapq))
return false;
if (!rec.getReadPairedFlag())
return false;
if (rec.getMateUnmappedFlag())
return false;
if (!rec.getReferenceIndex().equals(rec.getMateReferenceIndex()))
return false;
if (rec.getReadNegativeStrandFlag() != rec.getMateNegativeStrandFlag())
return false;
final int len = recordLen.applyAsInt(rec);
if (len > max_distance)
return false;
return true;
};
final List<Allele> alleles = Arrays.asList(REF, INV);
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
try (final CloseableIterator<SAMRecord> iter0 = (queryIntervals == null ? samReader.iterator() : samReader.query(queryIntervals, false))) {
final PeekIterator<SAMRecord> iter = new PeekIterator<>(iter0);
while (iter.hasNext()) {
final SAMRecord rec = progress.apply(iter.next());
if (!acceptRecord.test(rec))
continue;
final int invStart = rec.getAlignmentStart();
int invEnd = Math.max(rec.getEnd(), mateEnd.applyAsInt(rec));
if (CoordMath.getLength(invStart, invEnd) > this.max_distance)
continue;
int dp = 1;
while (iter.hasNext()) {
final SAMRecord rec2 = iter.peek();
if (!acceptRecord.test(rec2)) {
iter.next();
continue;
}
if (rec2.getAlignmentStart() > invEnd || !rec2.contigsMatch(rec) || CoordMath.getLength(invStart, rec2.getAlignmentStart()) > this.max_distance) {
break;
}
// consumme
iter.next();
invEnd = Math.max(invEnd, Math.max(rec2.getEnd(), mateEnd.applyAsInt(rec2)));
dp++;
}
if (dp >= this.min_supporting_reads) {
final VariantContextBuilder vcb = new VariantContextBuilder();
vcb.chr(rec.getContig());
vcb.start(invStart);
vcb.stop(invEnd);
vcb.alleles(alleles);
vcb.attribute(VCFConstants.END_KEY, invEnd);
final GenotypeBuilder gb = new GenotypeBuilder(sampleName, alleles);
gb.DP(dp);
vcb.genotypes(Collections.singletonList(gb.make()));
vcb.alleles(alleles);
vcb.attribute(VCFConstants.DEPTH_KEY, dp);
vcb.attribute(VCFConstants.SVTYPE, "INV");
vcb.attribute("SVLEN", CoordMath.getLength(invStart, invEnd));
vcw.add(vcb.make());
}
}
}
progress.close();
vcw.close();
}
return 0;
} catch (final Throwable e) {
LOG.error(e);
return -1;
} finally {
CloserUtil.close(vcw);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class SamTranslocations method doWork.
@Override
public int doWork(final List<String> args) {
if (this.fuzzy_distance <= 0) {
LOG.error("fuzzy_distance <=0 (" + fuzzy_distance + ")");
return -1;
}
final List<SamReader> samReaders = new ArrayList<>();
final List<CloseableIterator<SAMRecord>> samRecordIterators = new ArrayList<>();
VariantContextWriter out = null;
try {
final List<Path> inputBams = IOUtils.unrollPaths(args);
final SamReaderFactory srf = super.createSamReaderFactory();
if (inputBams.isEmpty()) {
LOG.error("No bam was defined");
return -1;
}
samReaders.addAll(inputBams.stream().map(P -> srf.open(P)).collect(Collectors.toList()));
final SAMFileHeader header = new SamFileHeaderMerger(SortOrder.coordinate, samReaders.stream().map(SR -> SR.getFileHeader()).collect(Collectors.toList()), false).getMergedHeader();
samRecordIterators.addAll(samReaders.stream().map(S -> makeIterator(S)).collect(Collectors.toList()));
final CloseableIterator<SAMRecord> iter = new MergingIterator<>(this.coordinateComparator, samRecordIterators);
final Set<String> sampleNames = header.getReadGroups().stream().map(RG -> RG.getSample()).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toSet());
final SAMSequenceDictionary refDict = SequenceDictionaryUtils.extractRequired(header);
final boolean[] allowedContigs = new boolean[refDict.size()];
Arrays.fill(allowedContigs, true);
if (!StringUtils.isBlank(this.chromRegex)) {
final Pattern pat = Pattern.compile(this.chromRegex);
refDict.getSequences().stream().filter(SR -> !pat.matcher(SR.getSequenceName()).matches()).forEach(SR -> allowedContigs[SR.getSequenceIndex()] = false);
}
final ProgressFactory.Watcher<SAMRecord> progress = ProgressFactory.newInstance().dictionary(refDict).logger(LOG).build();
final Set<VCFHeaderLine> metaData = new HashSet<>();
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_ALLELE_DEPTHS, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.DEPTH_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_NUMBER_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_COUNT_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.ALLELE_FREQUENCY_KEY, true));
metaData.add(VCFStandardHeaderLines.getInfoLine(VCFConstants.END_KEY, true));
metaData.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Variation type"));
final VCFFormatHeaderLine supportingReadsFormat = new VCFFormatHeaderLine("SR", 4, VCFHeaderLineType.Integer, "Supporting reads: contig1-forward,contig1-reverse,contig2-forward,contig2-reverse");
metaData.add(supportingReadsFormat);
final VCFInfoHeaderLine stdDevContig1Info = new VCFInfoHeaderLine("STDDEV_POS1", 1, VCFHeaderLineType.Integer, "std deviation to position 1");
metaData.add(stdDevContig1Info);
final VCFInfoHeaderLine stdDevContig2Info = new VCFInfoHeaderLine("STDDEV_POS2", 1, VCFHeaderLineType.Integer, "std deviation to position 2");
metaData.add(stdDevContig2Info);
final VCFInfoHeaderLine chrom2Info = new VCFInfoHeaderLine("CHROM2", 1, VCFHeaderLineType.String, "other chromosome");
metaData.add(chrom2Info);
final VCFInfoHeaderLine pos2Info = new VCFInfoHeaderLine("POS2", 1, VCFHeaderLineType.Integer, "other position");
metaData.add(pos2Info);
final VCFInfoHeaderLine highQualSampleInfo = new VCFInfoHeaderLine("highQualSamples", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "High Quality Samples");
metaData.add(highQualSampleInfo);
final VCFInfoHeaderLine lowQualSampleInfo = new VCFInfoHeaderLine("lowQualSamples", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Low Quality Samples");
metaData.add(lowQualSampleInfo);
final VCFInfoHeaderLine maxEventInfo = new VCFInfoHeaderLine("MAX_DP", 1, VCFHeaderLineType.Integer, "Max number of event per sample");
metaData.add(maxEventInfo);
final VCFHeader vcfHeader = new VCFHeader(metaData, sampleNames);
vcfHeader.setSequenceDictionary(refDict);
JVarkitVersion.getInstance().addMetaData(this, vcfHeader);
out = VCFUtils.createVariantContextWriterToPath(this.outputFile);
out.writeHeader(vcfHeader);
final Function<SAMRecord, Integer> mateEnd = REC -> SAMUtils.getMateCigar(REC) != null ? SAMUtils.getMateAlignmentEnd(REC) : REC.getMateAlignmentStart();
final Allele REF = Allele.create("N", true);
final Allele ALT = Allele.create("<TRANSLOC>", false);
/**
* buffer or reads
*/
final LinkedList<SAMRecord> buffer = new LinkedList<>();
int prev_tid = -1;
for (; ; ) {
final SAMRecord rec;
if (!buffer.isEmpty()) {
rec = buffer.pollFirst();
} else if (iter.hasNext()) {
rec = progress.apply(iter.next());
} else {
rec = null;
}
if (rec == null || (rec.getReferenceIndex() != prev_tid)) {
final int final_prev_tid = prev_tid;
buffer.removeIf(SR -> SR.getReferenceIndex() <= final_prev_tid);
if (rec == null)
break;
prev_tid = rec.getReferenceIndex();
}
buffer.removeIf(SR -> SR.getUnclippedEnd() < rec.getUnclippedStart());
if (rec.getReadNegativeStrandFlag())
continue;
final List<SAMRecord> candidates = new ArrayList<>();
candidates.add(rec);
if (rec.getReadPairedFlag() && !rec.getReadNegativeStrandFlag() && !rec.getMateUnmappedFlag() && !rec.getReferenceIndex().equals(rec.getMateReferenceIndex())) {
long end = rec.getUnclippedEnd() + this.fuzzy_distance;
int buffer_index = 0;
while (buffer_index < buffer.size()) {
final SAMRecord rec2 = buffer.get(buffer_index);
if (!rec2.getReferenceIndex().equals(rec.getReferenceIndex())) {
break;
}
if (rec2.getAlignmentStart() > end) {
// not unclipped to avoid side effect
break;
}
if (rec2.getMateUnmappedFlag()) {
buffer_index++;
continue;
}
if (!rec2.getMateReferenceIndex().equals(rec.getMateReferenceIndex())) {
buffer_index++;
continue;
}
final int mate1 = rec.getMateNegativeStrandFlag() ? rec.getMateAlignmentStart() : mateEnd.apply(rec);
final int mate2 = rec2.getMateNegativeStrandFlag() ? rec2.getMateAlignmentStart() : mateEnd.apply(rec2);
if (Math.abs(mate1 - mate2) > this.fuzzy_distance) {
buffer_index++;
continue;
}
buffer.remove(buffer_index);
candidates.add(rec2);
}
while (iter.hasNext()) {
final SAMRecord rec2 = iter.next();
if (rec2 == null)
break;
if (!rec2.getReferenceIndex().equals(rec.getReferenceIndex())) {
buffer.add(rec2);
break;
}
if (rec2.getAlignmentStart() > end) {
// not unclipped to avoid side effect
buffer.add(rec2);
break;
}
if (rec2.getMateUnmappedFlag()) {
buffer.add(rec2);
continue;
}
if (!rec2.getMateReferenceIndex().equals(rec.getMateReferenceIndex())) {
buffer.add(rec2);
continue;
}
final int mate1 = rec.getMateNegativeStrandFlag() ? rec.getMateAlignmentStart() : mateEnd.apply(rec);
final int mate2 = rec2.getMateNegativeStrandFlag() ? rec2.getMateAlignmentStart() : mateEnd.apply(rec2);
if (Math.abs(mate1 - mate2) > this.fuzzy_distance) {
buffer.add(rec2);
continue;
}
candidates.add(rec2);
}
} else {
continue;
}
if ((long) buffer.size() > this.max_complexity_buffer * samReaders.size()) {
// final long end = rec.getUnclippedEnd();
// buffer.remove(0);
// buffer.removeIf(R->R.getReferenceIndex().equals(rec.getReferenceIndex()) && R.getStart()<=end);
buffer.clear();
LOG.warn("zone of low complexity: clearing buffer near " + rec.getContig() + ":" + rec.getStart());
continue;
}
if (candidates.isEmpty() || candidates.size() < this.min_number_of_events)
continue;
// check in both sides
final int count_plus = (int) candidates.stream().filter(SR -> !SR.getReadNegativeStrandFlag()).count();
final int count_minus = (int) candidates.stream().filter(SR -> SR.getReadNegativeStrandFlag()).count();
if (count_plus < this.min_number_of_events || count_minus < this.min_number_of_events) {
putbackInBuffer(candidates, buffer);
continue;
}
final int pos_ctg1 = (int) candidates.stream().mapToInt(SR -> SR.getReadNegativeStrandFlag() ? SR.getAlignmentStart() : SR.getAlignmentEnd()).average().orElse(-1);
if (pos_ctg1 < 1) {
putbackInBuffer(candidates, buffer);
continue;
}
final int stddev_ctg1 = (int) candidates.stream().mapToInt(SR -> SR.getReadNegativeStrandFlag() ? SR.getAlignmentStart() : SR.getAlignmentEnd()).map(X -> Math.abs(X - pos_ctg1)).average().orElse(0.0);
final int pos_ctg2 = (int) candidates.stream().mapToInt(SR -> SR.getMateNegativeStrandFlag() ? SR.getMateAlignmentStart() : mateEnd.apply(SR)).average().orElse(-1);
if (pos_ctg2 < 1) {
putbackInBuffer(candidates, buffer);
continue;
}
final int stddev_ctg2 = (int) candidates.stream().mapToInt(SR -> SR.getMateNegativeStrandFlag() ? SR.getMateAlignmentStart() : mateEnd.apply(SR)).map(X -> Math.abs(X - pos_ctg2)).average().orElse(0.0);
final List<String> lowQualSamples = new ArrayList<>();
final List<String> hiQualSamples = new ArrayList<>();
final VariantContextBuilder vcb = new VariantContextBuilder(null, rec.getContig(), pos_ctg1, pos_ctg1, Arrays.asList(REF, ALT));
final List<Genotype> genotypes = new ArrayList<>(sampleNames.size());
int max_dp = 0;
for (final String sample : sampleNames) {
final List<SAMRecord> readSample = candidates.stream().filter(SR -> sample.equals(SR.getReadGroup().getSample())).collect(Collectors.toList());
if (readSample.isEmpty()) {
genotypes.add(GenotypeBuilder.createMissing(sample, 2));
} else {
final GenotypeBuilder gb = new GenotypeBuilder(sample, Arrays.asList(REF, ALT));
final int sn_contig1_count_plus = (int) readSample.stream().filter(SR -> !SR.getReadNegativeStrandFlag()).count();
final int sn_contig1_count_minus = (int) readSample.stream().filter(SR -> SR.getReadNegativeStrandFlag()).count();
final int sn_contig2_count_plus = (int) readSample.stream().filter(SR -> !SR.getMateNegativeStrandFlag()).count();
final int sn_contig2_count_minus = (int) readSample.stream().filter(SR -> SR.getMateNegativeStrandFlag()).count();
max_dp = Math.max(max_dp, readSample.size());
gb.DP(readSample.size());
gb.attribute(supportingReadsFormat.getID(), new int[] { sn_contig1_count_plus, sn_contig1_count_minus, sn_contig2_count_plus, sn_contig2_count_minus });
genotypes.add(gb.make());
if (sn_contig1_count_plus > this.min_number_of_events && sn_contig1_count_minus > this.min_number_of_events && sn_contig2_count_plus > this.min_number_of_events && sn_contig2_count_minus > this.min_number_of_events) {
hiQualSamples.add(sample);
} else {
lowQualSamples.add(sample);
}
}
}
if (hiQualSamples.isEmpty()) {
putbackInBuffer(candidates, buffer);
continue;
}
vcb.id(rec.getReferenceName() + ":" + pos_ctg1 + ":" + rec.getMateReferenceName() + ":" + pos_ctg2);
vcb.attribute(stdDevContig1Info.getID(), stddev_ctg1);
vcb.attribute(stdDevContig2Info.getID(), stddev_ctg2);
vcb.attribute(chrom2Info.getID(), rec.getMateReferenceName());
vcb.attribute(pos2Info.getID(), pos_ctg2);
vcb.attribute(maxEventInfo.getID(), max_dp);
vcb.attribute(VCFConstants.SVTYPE, StructuralVariantType.BND.name());
vcb.attribute(VCFConstants.DEPTH_KEY, candidates.size());
vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, (hiQualSamples.size() + lowQualSamples.size()));
vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, sampleNames.size() * 2);
vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, (hiQualSamples.size() + lowQualSamples.size()) / (sampleNames.size() * 2.0));
if (!lowQualSamples.isEmpty())
vcb.attribute(lowQualSampleInfo.getID(), lowQualSamples);
if (!hiQualSamples.isEmpty())
vcb.attribute(highQualSampleInfo.getID(), hiQualSamples);
vcb.genotypes(genotypes);
vcb.log10PError(candidates.size() / -10.0);
vcb.alleles(Arrays.asList(REF, ALT));
out.add(vcb.make());
}
progress.close();
iter.close();
samRecordIterators.forEach(S -> S.close());
samRecordIterators.clear();
samReaders.forEach(CloserUtil::close);
samReaders.clear();
out.close();
out = null;
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
samRecordIterators.forEach(S -> S.close());
samReaders.forEach(S -> CloserUtil.close(S));
CloserUtil.close(out);
}
}
use of htsjdk.samtools.util.CloseableIterator in project jvarkit by lindenb.
the class ScanStructuralVariants method doWork.
@Override
public int doWork(final List<String> args) {
final List<VCFReader> casesFiles = new ArrayList<>();
if (this.svComparator.getBndDistance() < 0) {
LOG.error("bad max_distance :" + this.svComparator.getBndDistance());
return -1;
}
VariantContextWriter out = null;
try {
final List<Path> casesPaths = (IOUtils.unrollPaths(args));
if (casesPaths.isEmpty()) {
LOG.error("cases list is empty");
return -1;
}
if (!print_all_ctx && casesPaths.size() == 1) {
LOG.warning("One case: switching to --all");
print_all_ctx = true;
}
if (this.controlsPath.size() == 1 && this.controlsPath.get(0).toString().endsWith(".list")) {
this.controlsPath = Files.lines(this.controlsPath.get(0)).filter(L -> !(L.startsWith("#") || StringUtils.isBlank(L))).map(L -> Paths.get(L)).collect(Collectors.toList());
}
SAMSequenceDictionary dict = null;
final Set<VCFHeaderLine> metadata = new HashSet<>();
for (int side = 0; side < 2; side++) {
for (final Path input : (side == 0 ? casesPaths : this.controlsPath)) {
final VCFReader vcfInput = VCFReaderFactory.makeDefault().open(input);
final VCFHeader header = vcfInput.getHeader();
if (side == 0) {
casesFiles.add(vcfInput);
} else {
vcfInput.close();
}
final SAMSequenceDictionary dict2 = SequenceDictionaryUtils.extractRequired(header);
if (dict == null) {
dict = dict2;
} else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict2)) {
LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict2, dict));
return -1;
}
}
}
final IntervalTreeMap<Boolean> intervalTreeMap;
if (intervalListProvider != null) {
intervalTreeMap = new IntervalTreeMap<>();
intervalListProvider.dictionary(dict).stream().forEach(R -> intervalTreeMap.put(new Interval(R), true));
} else {
intervalTreeMap = null;
}
casesFiles.stream().flatMap(F -> F.getHeader().getMetaDataInInputOrder().stream()).forEach(H -> metadata.add(H));
VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.GENOTYPE_KEY);
VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.END_KEY);
metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the SV"));
metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
metadata.add(new VCFInfoHeaderLine("CIPOS", 2, VCFHeaderLineType.Integer, "Confidence interval around POS for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("CIEND", 2, VCFHeaderLineType.Integer, "Confidence interval around END for imprecise variants"));
metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
metadata.add(new VCFInfoHeaderLine(ATT_FILENAME, 1, VCFHeaderLineType.String, "Source of variant"));
metadata.add(new VCFInfoHeaderLine(ATT_CLUSTER, 1, VCFHeaderLineType.String, "Variant cluster"));
/*metadata.add(new VCFFormatHeaderLine(
"OV",1,
VCFHeaderLineType.Integer,
"Number calls (with different sample) overlapping this genotype"
));*/
metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "SV type"));
metadata.add(new VCFFilterHeaderLine(ATT_CONTROL, "Variant is found in controls (max MAF=" + this.max_maf + ")"));
final VCFHeader header = new VCFHeader(metadata);
header.setSequenceDictionary(dict);
JVarkitVersion.getInstance().addMetaData(this, header);
final List<ShadowedVcfReader> controlShadowReaders = new ArrayList<>(this.controlsPath.size());
for (int i = 0; i < this.controlsPath.size(); i++) {
boolean large_flag = this.max_control_large_flag < 0 || i >= this.max_control_large_flag;
controlShadowReaders.add(new ShadowedVcfReader(this.controlsPath.get(i), large_flag));
}
out = super.openVariantContextWriter(this.outputFile);
out.writeHeader(header);
final CloseableIterator<VariantContext> iter = casesFiles.get(0).iterator();
final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
final Decoy decoy = Decoy.getDefaultInstance();
VariantContext prevCtx = null;
while (iter.hasNext()) {
final VariantContext ctx = progress.apply(iter.next());
if (decoy.isDecoy(ctx.getContig()))
continue;
if (Breakend.parse(ctx).stream().anyMatch(B -> decoy.isDecoy(B.getContig())))
continue;
if (intervalTreeMap != null && !intervalTreeMap.containsOverlapping(ctx))
continue;
// in manta, I see the same variant multiple times in the same vcf
if (prevCtx != null && ctx.getContig().equals(prevCtx.getContig()) && ctx.getStart() == prevCtx.getStart() && ctx.getEnd() == prevCtx.getEnd())
continue;
prevCtx = ctx;
final List<VariantContext> candidate = new ArrayList<>(casesFiles.size());
candidate.add(ctx);
recursive(ctx, candidate, casesFiles, controlShadowReaders, out);
}
iter.close();
progress.close();
out.close();
out = null;
casesFiles.stream().forEach(F -> {
try {
F.close();
} catch (Exception err) {
}
});
controlShadowReaders.stream().forEach(F -> F.realClose());
return 0;
} catch (final Throwable err) {
LOG.error(err);
return -1;
} finally {
CloserUtil.close(out);
}
}
Aggregations