Search in sources :

Example 56 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class VcfStrechToSvg method run.

private void run(final ArchiveFactory archive, final BedLine bed, final VCFHeader header, final VCFReader in, final PrintWriter manifest) {
    LOG.info("processing " + bed);
    final Set<String> limitSamples;
    if (StringUtils.isBlank(this.sampleStr)) {
        limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
    } else if (this.sampleStr.startsWith("^")) {
        limitSamples = new TreeSet<>(header.getSampleNamesInOrder());
        limitSamples.removeAll(Arrays.stream(this.sampleStr.substring(1).split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toSet()));
    } else {
        limitSamples = Arrays.stream(this.sampleStr.split("[, ]+")).filter(S -> !StringUtils.isBlank(S)).collect(Collectors.toCollection(TreeSet::new));
    }
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    try (CloseableIterator<VariantContext> iter = in.query(bed)) {
        final List<VariantSet> L = iter.stream().filter(V -> acceptVariant(V)).map(V -> new VariantSet(V)).collect(Collectors.toCollection(ArrayList::new));
        if (L.isEmpty()) {
            LOG.warn("No valid variant found for \"" + bed + "\"");
            return;
        }
        int i = 0;
        while (i + 1 < L.size()) {
            if (L.get(i).withinDistanceOf(L.get(i + 1), this.withinDistance)) {
                L.get(i).variants.addAll(L.get(i + 1).variants);
                L.remove(i + 1);
            } else {
                i++;
            }
        }
        final int margin_left = 50;
        final int margin_right = 10;
        final double drawingAreaWidth = image_width_pixel - (margin_left + margin_right);
        final int intervalLength = L.stream().mapToInt(V -> V.getLengthOnReference()).sum();
        double x = 0;
        for (i = 0; i < L.size(); i++) {
            L.get(i).x = x;
            x += (L.get(i).getLengthOnReference() / (double) intervalLength) * drawingAreaWidth;
        }
        for (i = 0; i < L.size(); i++) {
            L.get(i).width = (i + 1 < L.size() ? L.get(i + 1).x : drawingAreaWidth) - L.get(i).x;
        }
        try {
            final DocumentBuilderFactory db = DocumentBuilderFactory.newInstance();
            final DocumentBuilder dom = db.newDocumentBuilder();
            this.document = dom.newDocument();
            final Element svgRoot = element("svg");
            this.document.appendChild(svgRoot);
            final String mainTitleStr = SequenceDictionaryUtils.getBuildName(dict).orElse("") + " " + new SimpleInterval(bed).toNiceString() + " length:" + StringUtils.niceInt(bed.getLengthOnReference());
            /* SVG title */
            {
                final Element title = element("title");
                svgRoot.appendChild(title);
                title.appendChild(text(mainTitleStr));
            }
            /* SVG style */
            {
                final String gtopacity = this.dynamicParams.getOrDefault("gt.opacity", "0.7");
                final Element style = element("style");
                svgRoot.appendChild(style);
                style.appendChild(text(".maintitle {text-anchor:middle;fill:blue} " + ".vc {stroke-width:0.5px;} " + ".transcript {stroke:black;stroke-width:1px;opacity:1;}" + ".exon {stroke:black;stroke-width:0.5px;fill:blue;opacity:1;}" + ".sample {fill:blue;font-size:7px;} " + ".samplelabel {stroke:gray;stroke-width:0.5px;font-size:" + this.dynamicParams.getOrDefault("sample.fontsize", "7") + "px;}\n" + ".coverage { fill:gray; stroke:yellow;opacity:0.2;} " + ".frame { fill:none; stroke: darkgray;} " + ".area0 {fill:white;}\n" + ".area1 {fill:floralwhite;}\n" + "circle.HOM_REF {fill:green;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HET {fill:blue;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "circle.HOM_VAR {fill:red;opacity:" + gtopacity + ";stroke-width:0.5px;}\n" + "a {cursor: pointer;}\n"));
            }
            /* desc */
            {
                final Element descr = element("desc");
                svgRoot.appendChild(descr);
                descr.appendChild(text("Author: Pierre Lindenbaum\n" + JVarkitVersion.getInstance().getCompilationDate() + "\n" + JVarkitVersion.getInstance().getGitHash()));
            }
            // main title
            {
                final Element gtitle = element("text", mainTitleStr);
                gtitle.setAttribute("class", "maintitle");
                gtitle.setAttribute("x", format(this.image_width_pixel / 2.0));
                gtitle.setAttribute("y", "15");
                svgRoot.appendChild(wrapLoc(gtitle, bed));
            }
            int margin_top = 50;
            double y = margin_top;
            final double min_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r1", "1"));
            final double max_circle_radius = Double.parseDouble(this.dynamicParams.getOrDefault("gt.r2", "7"));
            final Element main_g = element("g");
            svgRoot.appendChild(main_g);
            /**
             * plot genes
             */
            if (!this.all_genes.isEmpty()) {
                final double transcript_height = 5;
                final double exon_height = (transcript_height - 1);
                final double save_y = y;
                final Element g_genes = element("g");
                g_genes.setAttribute("transform", "translate(" + margin_left + ",0)");
                main_g.appendChild(g_genes);
                /* loop over each vset */
                for (i = 0; i < L.size(); i++) {
                    final VariantSet vset = L.get(i);
                    // find transcript in this vset
                    final List<Transcript> transcripts = this.all_genes.getOverlapping(vset).stream().flatMap(G -> G.getTranscripts().stream()).filter(T -> T.overlaps(vset)).collect(Collectors.toList());
                    if (transcripts.isEmpty())
                        continue;
                    final Element g_vset = element("g");
                    g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
                    g_genes.appendChild(g_vset);
                    // y in this vset
                    double y2 = save_y;
                    /* convert base to pixel */
                    final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
                    /* loop over transcripts */
                    for (final Transcript tr : transcripts) {
                        final Element g_tr = element("g");
                        g_tr.setAttribute("transform", "translate(0," + format(y2) + ")");
                        g_vset.appendChild(g_tr);
                        final Element line = element("line");
                        line.setAttribute("class", "transcript");
                        line.setAttribute("x1", format(Math.max(0, base2pixel.applyAsDouble(tr.getStart()))));
                        line.setAttribute("y1", format(transcript_height / 2.0));
                        line.setAttribute("x2", format(Math.min(vset.width, base2pixel.applyAsDouble(tr.getEnd()))));
                        line.setAttribute("y2", format(transcript_height / 2.0));
                        line.appendChild(element("title", tr.getId()));
                        g_tr.appendChild(wrapLoc(line, tr));
                        /* loop over exons */
                        for (final Exon exon : tr.getExons()) {
                            if (!exon.overlaps(vset))
                                continue;
                            final Element exRect = element("rect");
                            exRect.setAttribute("class", "exon");
                            final double x_start = Math.max(0, base2pixel.applyAsDouble(exon.getStart()));
                            exRect.setAttribute("x", format(x_start));
                            exRect.setAttribute("y", format(transcript_height / 2.0 - exon_height / 2.0));
                            final double x_end = Math.min(vset.width, base2pixel.applyAsDouble(exon.getEnd()));
                            exRect.setAttribute("width", format(x_end - x_start));
                            exRect.setAttribute("height", format(exon_height));
                            exRect.appendChild(element("title", exon.getName()));
                            g_tr.appendChild(wrapLoc(exRect, exon));
                        }
                        y2 += transcript_height + 0.5;
                    }
                    y = Math.max(y, y2);
                }
                y++;
            }
            final double sample_height = Double.parseDouble(this.dynamicParams.getOrDefault("sample.height", "25"));
            final double sample_height2 = sample_height - (max_circle_radius * 2.0);
            int space_between_samples = 2;
            int got_n_samples = 0;
            for (final String sn : header.getSampleNamesInOrder()) {
                if (!limitSamples.contains(sn))
                    continue;
                boolean got_this_sample = false;
                final Element g_sample = element("g");
                g_sample.setAttribute("transform", "translate(" + margin_left + "," + format(y) + ")");
                /* get coverage */
                final int maxCoverage;
                if (this.sample2bam.containsKey(sn)) {
                    final CoverageFactory coverageFactory = new CoverageFactory();
                    try (SamReader sr = this.openSamReader(this.sample2bam.get(sn))) {
                        /* loop over each variant set */
                        for (final VariantSet vset : L) {
                            vset.coverage = coverageFactory.getSimpleCoverage(sr, vset, sn);
                        }
                    }
                    maxCoverage = L.stream().flatMapToInt(V -> V.coverage.stream()).max().orElse(0);
                } else {
                    maxCoverage = 0;
                    for (final VariantSet vset : L) {
                        vset.coverage = null;
                    }
                }
                /* loop over each variant set */
                for (i = 0; i < L.size(); i++) {
                    final VariantSet vset = L.get(i);
                    final Element g_vset = element("g");
                    g_vset.setAttribute("transform", "translate(" + format(vset.x) + ",0)");
                    g_sample.appendChild(g_vset);
                    /* convert base to pixel */
                    final ToDoubleFunction<Integer> base2pixel = vset.createBaseToPixelFunction();
                    // plot set length
                    final Element rect = element("rect");
                    rect.setAttribute("class", "area" + (i % 2));
                    rect.setAttribute("x", "0");
                    rect.setAttribute("y", "0");
                    rect.setAttribute("width", format(vset.width));
                    rect.setAttribute("height", format(sample_height));
                    if (!remove_tooltip)
                        rect.appendChild(element("title", vset.toString()));
                    g_vset.appendChild(rect);
                    // plot coverage
                    if (maxCoverage > 0 && this.sample2bam.containsKey(sn)) {
                        final double[] scaled = vset.coverage.scaleAverage((int) vset.width);
                        final StringBuilder sb = new StringBuilder();
                        sb.append("0," + sample_height);
                        for (int t = 0; t < scaled.length; t++) {
                            if (t > 1 && t + 1 < scaled.length && format(scaled[t - 1]).equals(format(scaled[t + 1])) && format(scaled[t - 1]).equals(format(scaled[t])))
                                continue;
                            sb.append(" ").append(t).append(",");
                            sb.append(format(sample_height * (1.0 - scaled[t] / maxCoverage)));
                        }
                        sb.append(" " + format(vset.width) + "," + sample_height);
                        final Element polyline = element("polyline");
                        polyline.setAttribute("class", "coverage");
                        polyline.setAttribute("points", sb.toString());
                        g_vset.appendChild(polyline);
                        vset.coverage = null;
                    }
                    // plot vertical line if colorTag defined
                    if (!StringUtils.isBlank(this.colorTag)) {
                        for (final VariantContext vc : vset.variants) {
                            if (!vc.hasAttribute(this.colorTag))
                                continue;
                            final String cssColor = vc.getAttributeAsString(this.colorTag, "");
                            if (StringUtils.isBlank(cssColor))
                                continue;
                            final double x0 = base2pixel.applyAsDouble(vc.getStart());
                            final Element line = element("line");
                            line.setAttribute("class", "vc");
                            line.setAttribute("style", "stroke:" + cssColor);
                            line.setAttribute("x1", format(x0));
                            line.setAttribute("y1", "0");
                            line.setAttribute("x2", format(x0));
                            line.setAttribute("y2", format(sample_height));
                            g_vset.appendChild(line);
                        }
                    }
                    // print all variants in this vcfset for this sample
                    for (final VariantContext vc : vset.variants) {
                        final Genotype gt = vc.getGenotype(sn);
                        if (gt.isNoCall())
                            continue;
                        if (hide_hom_ref && gt.isHomRef())
                            continue;
                        if (gt.hasGQ() && gt.getGQ() < this.minGQ)
                            continue;
                        final OptionalDouble alt_ratio = getAltRatio(gt);
                        if (!alt_ratio.isPresent())
                            continue;
                        final OptionalDouble af = getAF(vc);
                        final double circle_radius = min_circle_radius + (max_circle_radius - min_circle_radius) * (1.0 - af.orElse(1.0));
                        // HOMREF=0;  HET =0.5; HOMVAR = 1;
                        final double gtx = base2pixel.applyAsDouble(vc.getStart());
                        final double gty = sample_height - (sample_height2 * alt_ratio.getAsDouble() + (sample_height - sample_height2) / 2.0);
                        final Element circle = element("circle");
                        circle.setAttribute("class", gt.getType().name());
                        circle.setAttribute("cx", format(gtx));
                        circle.setAttribute("cy", format(gty));
                        circle.setAttribute("r", format(circle_radius));
                        if (!remove_tooltip)
                            circle.appendChild(element("title", vc.getStart() + " " + (vc.hasID() ? vc.getID() : "") + " " + vc.getAlleles().stream().map(A -> A.getDisplayString()).collect(Collectors.joining("/")) + " " + gt.getType().name() + " AF=" + format(af.orElse(-1))));
                        g_vset.appendChild(wrapLoc(circle, vc));
                        got_this_sample = true;
                    }
                }
                final Element frame_sample = element("rect");
                frame_sample.setAttribute("class", "frame");
                frame_sample.setAttribute("x", "0");
                frame_sample.setAttribute("y", "0");
                frame_sample.setAttribute("width", format(drawingAreaWidth));
                frame_sample.setAttribute("height", format(sample_height));
                g_sample.appendChild(frame_sample);
                final Element label = element("text", sn + (maxCoverage == 0 ? "" : " Max Cov. " + maxCoverage));
                label.setAttribute("class", "samplelabel");
                label.setAttribute("x", "0");
                label.setAttribute("y", "0");
                // label.setAttribute("transform", "translate("+format(-10)+","+0+") rotate(90) ");
                label.setAttribute("transform", "translate(12,12)");
                if (!remove_tooltip)
                    label.appendChild(element("title", sn));
                g_sample.appendChild(label);
                if (got_this_sample) {
                    got_n_samples++;
                    main_g.appendChild(g_sample);
                    y += sample_height + space_between_samples;
                } else {
                    LOG.warn("no valid data for sample " + sn + " in " + bed);
                }
            }
            // remove extra sample space
            y -= space_between_samples;
            svgRoot.setAttribute("width", format(this.image_width_pixel + 1));
            svgRoot.setAttribute("height", format(y + 1));
            if (got_n_samples == 0) {
                LOG.info("no sample/genotype found for " + bed);
                return;
            }
            // save
            final Transformer tr = TransformerFactory.newInstance().newTransformer();
            final String filename = bed.getContig() + "_" + bed.getStart() + "_" + bed.getEnd() + ".svg" + (this.compressed_svg ? ".gz" : "");
            LOG.info("writing " + filename);
            if (this.compressed_svg) {
                try (final OutputStream pw = archive.openOuputStream(filename)) {
                    try (GZIPOutputStream gzout = new GZIPOutputStream(pw)) {
                        tr.transform(new DOMSource(this.document), new StreamResult(gzout));
                        gzout.finish();
                        gzout.flush();
                    }
                    pw.flush();
                }
            } else {
                try (final PrintWriter pw = archive.openWriter(filename)) {
                    tr.transform(new DOMSource(this.document), new StreamResult(pw));
                    pw.flush();
                }
            }
            manifest.print(bed.getContig());
            manifest.print("\t");
            manifest.print(bed.getStart() - 1);
            manifest.print("\t");
            manifest.print(bed.getEnd());
            manifest.print("\t");
            manifest.print(filename);
            manifest.println();
        } catch (final Throwable err) {
            throw new RuntimeException(err);
        } finally {
            this.document = null;
        }
    }
}
Also used : Transformer(javax.xml.transform.Transformer) Text(org.w3c.dom.Text) Arrays(java.util.Arrays) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) VCFHeader(htsjdk.variant.vcf.VCFHeader) StreamResult(javax.xml.transform.stream.StreamResult) Gene(com.github.lindenb.jvarkit.util.bio.structure.Gene) SVG(com.github.lindenb.jvarkit.util.svg.SVG) SAMFileHeader(htsjdk.samtools.SAMFileHeader) Hyperlink(com.github.lindenb.jvarkit.net.Hyperlink) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) Document(org.w3c.dom.Document) Map(java.util.Map) Path(java.nio.file.Path) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) SAMException(htsjdk.samtools.SAMException) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) Collectors(java.util.stream.Collectors) AFExtractorFactory(com.github.lindenb.jvarkit.util.vcf.AFExtractorFactory) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) ToDoubleFunction(java.util.function.ToDoubleFunction) VariantContext(htsjdk.variant.variantcontext.VariantContext) GZIPOutputStream(java.util.zip.GZIPOutputStream) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) Genotype(htsjdk.variant.variantcontext.Genotype) DOMSource(javax.xml.transform.dom.DOMSource) CloseableIterator(htsjdk.samtools.util.CloseableIterator) SequenceUtil(htsjdk.samtools.util.SequenceUtil) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Parameter(com.beust.jcommander.Parameter) NullOuputStream(com.github.lindenb.jvarkit.io.NullOuputStream) BedLineCodec(com.github.lindenb.jvarkit.util.bio.bed.BedLineCodec) OptionalDouble(java.util.OptionalDouble) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) HashMap(java.util.HashMap) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) Interval(htsjdk.samtools.util.Interval) NoSplitter(com.github.lindenb.jvarkit.util.jcommander.NoSplitter) Node(org.w3c.dom.Node) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) VCFConstants(htsjdk.variant.vcf.VCFConstants) OutputStream(java.io.OutputStream) Locatable(htsjdk.samtools.util.Locatable) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) DecimalFormat(java.text.DecimalFormat) VCFReader(htsjdk.variant.vcf.VCFReader) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) SamReader(htsjdk.samtools.SamReader) GtfReader(com.github.lindenb.jvarkit.util.bio.structure.GtfReader) Element(org.w3c.dom.Element) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) DocumentBuilder(javax.xml.parsers.DocumentBuilder) DynamicParameter(com.beust.jcommander.DynamicParameter) BufferedReader(java.io.BufferedReader) TransformerFactory(javax.xml.transform.TransformerFactory) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) DOMSource(javax.xml.transform.dom.DOMSource) DocumentBuilderFactory(javax.xml.parsers.DocumentBuilderFactory) Transformer(javax.xml.transform.Transformer) Element(org.w3c.dom.Element) GZIPOutputStream(java.util.zip.GZIPOutputStream) OutputStream(java.io.OutputStream) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) SamReader(htsjdk.samtools.SamReader) Exon(com.github.lindenb.jvarkit.util.bio.structure.Exon) GZIPOutputStream(java.util.zip.GZIPOutputStream) TreeSet(java.util.TreeSet) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) PrintWriter(java.io.PrintWriter) Transcript(com.github.lindenb.jvarkit.util.bio.structure.Transcript) CoverageFactory(com.github.lindenb.jvarkit.samtools.CoverageFactory) StreamResult(javax.xml.transform.stream.StreamResult) Genotype(htsjdk.variant.variantcontext.Genotype) OptionalDouble(java.util.OptionalDouble) DocumentBuilder(javax.xml.parsers.DocumentBuilder)

Example 57 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class ScanStructuralVariants method doWork.

@Override
public int doWork(final List<String> args) {
    final List<VCFReader> casesFiles = new ArrayList<>();
    if (this.svComparator.getBndDistance() < 0) {
        LOG.error("bad max_distance :" + this.svComparator.getBndDistance());
        return -1;
    }
    VariantContextWriter out = null;
    try {
        final List<Path> casesPaths = (IOUtils.unrollPaths(args));
        if (casesPaths.isEmpty()) {
            LOG.error("cases list is empty");
            return -1;
        }
        if (!print_all_ctx && casesPaths.size() == 1) {
            LOG.warning("One case: switching to --all");
            print_all_ctx = true;
        }
        if (this.controlsPath.size() == 1 && this.controlsPath.get(0).toString().endsWith(".list")) {
            this.controlsPath = Files.lines(this.controlsPath.get(0)).filter(L -> !(L.startsWith("#") || StringUtils.isBlank(L))).map(L -> Paths.get(L)).collect(Collectors.toList());
        }
        SAMSequenceDictionary dict = null;
        final Set<VCFHeaderLine> metadata = new HashSet<>();
        for (int side = 0; side < 2; side++) {
            for (final Path input : (side == 0 ? casesPaths : this.controlsPath)) {
                final VCFReader vcfInput = VCFReaderFactory.makeDefault().open(input);
                final VCFHeader header = vcfInput.getHeader();
                if (side == 0) {
                    casesFiles.add(vcfInput);
                } else {
                    vcfInput.close();
                }
                final SAMSequenceDictionary dict2 = SequenceDictionaryUtils.extractRequired(header);
                if (dict == null) {
                    dict = dict2;
                } else if (!SequenceUtil.areSequenceDictionariesEqual(dict, dict2)) {
                    LOG.error(JvarkitException.DictionariesAreNotTheSame.getMessage(dict2, dict));
                    return -1;
                }
            }
        }
        final IntervalTreeMap<Boolean> intervalTreeMap;
        if (intervalListProvider != null) {
            intervalTreeMap = new IntervalTreeMap<>();
            intervalListProvider.dictionary(dict).stream().forEach(R -> intervalTreeMap.put(new Interval(R), true));
        } else {
            intervalTreeMap = null;
        }
        casesFiles.stream().flatMap(F -> F.getHeader().getMetaDataInInputOrder().stream()).forEach(H -> metadata.add(H));
        VCFStandardHeaderLines.addStandardFormatLines(metadata, true, VCFConstants.GENOTYPE_KEY);
        VCFStandardHeaderLines.addStandardInfoLines(metadata, true, VCFConstants.END_KEY);
        metadata.add(new VCFInfoHeaderLine("SAMPLES", VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples carrying the SV"));
        metadata.add(new VCFInfoHeaderLine("NSAMPLES", 1, VCFHeaderLineType.Integer, "Number of Samples carrying the SV"));
        metadata.add(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "SV length"));
        metadata.add(new VCFInfoHeaderLine("CIPOS", 2, VCFHeaderLineType.Integer, "Confidence interval around POS for imprecise variants"));
        metadata.add(new VCFInfoHeaderLine("CIEND", 2, VCFHeaderLineType.Integer, "Confidence interval around END for imprecise variants"));
        metadata.add(new VCFInfoHeaderLine("IMPRECISE", 0, VCFHeaderLineType.Flag, "Imprecise structural variation"));
        metadata.add(new VCFInfoHeaderLine(ATT_FILENAME, 1, VCFHeaderLineType.String, "Source of variant"));
        metadata.add(new VCFInfoHeaderLine(ATT_CLUSTER, 1, VCFHeaderLineType.String, "Variant cluster"));
        /*metadata.add(new VCFFormatHeaderLine(
					"OV",1,
					VCFHeaderLineType.Integer,
					"Number calls (with different sample) overlapping this genotype"
					));*/
        metadata.add(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "SV type"));
        metadata.add(new VCFFilterHeaderLine(ATT_CONTROL, "Variant is found in controls (max MAF=" + this.max_maf + ")"));
        final VCFHeader header = new VCFHeader(metadata);
        header.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, header);
        final List<ShadowedVcfReader> controlShadowReaders = new ArrayList<>(this.controlsPath.size());
        for (int i = 0; i < this.controlsPath.size(); i++) {
            boolean large_flag = this.max_control_large_flag < 0 || i >= this.max_control_large_flag;
            controlShadowReaders.add(new ShadowedVcfReader(this.controlsPath.get(i), large_flag));
        }
        out = super.openVariantContextWriter(this.outputFile);
        out.writeHeader(header);
        final CloseableIterator<VariantContext> iter = casesFiles.get(0).iterator();
        final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(dict).logger(LOG).build();
        final Decoy decoy = Decoy.getDefaultInstance();
        VariantContext prevCtx = null;
        while (iter.hasNext()) {
            final VariantContext ctx = progress.apply(iter.next());
            if (decoy.isDecoy(ctx.getContig()))
                continue;
            if (Breakend.parse(ctx).stream().anyMatch(B -> decoy.isDecoy(B.getContig())))
                continue;
            if (intervalTreeMap != null && !intervalTreeMap.containsOverlapping(ctx))
                continue;
            // in manta, I see the same variant multiple times in the same vcf
            if (prevCtx != null && ctx.getContig().equals(prevCtx.getContig()) && ctx.getStart() == prevCtx.getStart() && ctx.getEnd() == prevCtx.getEnd())
                continue;
            prevCtx = ctx;
            final List<VariantContext> candidate = new ArrayList<>(casesFiles.size());
            candidate.add(ctx);
            recursive(ctx, candidate, casesFiles, controlShadowReaders, out);
        }
        iter.close();
        progress.close();
        out.close();
        out = null;
        casesFiles.stream().forEach(F -> {
            try {
                F.close();
            } catch (Exception err) {
            }
        });
        controlShadowReaders.stream().forEach(F -> F.realClose());
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(out);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) IntervalListProvider(com.github.lindenb.jvarkit.samtools.util.IntervalListProvider) htsjdk.samtools.util(htsjdk.samtools.util) Function(java.util.function.Function) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Breakend(com.github.lindenb.jvarkit.variant.variantcontext.Breakend) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) Files(java.nio.file.Files) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) StructuralVariantComparator(com.github.lindenb.jvarkit.variant.sv.StructuralVariantComparator) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) List(java.util.List) Paths(java.nio.file.Paths) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Path(java.nio.file.Path) Decoy(com.github.lindenb.jvarkit.samtools.Decoy) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException)

Example 58 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class TestNg01 method streamVcf.

static Stream<VariantContext> streamVcf(final File f) {
    final VCFReader r = VCFReaderFactory.makeDefault().open(f, false);
    final CloseableIterator<VariantContext> iter = r.iterator();
    return StreamSupport.stream(new IterableAdapter<VariantContext>(iter).spliterator(), false).onClose(() -> {
        iter.close();
        CloserUtil.close(r);
    });
}
Also used : VCFReader(htsjdk.variant.vcf.VCFReader) VariantContext(htsjdk.variant.variantcontext.VariantContext)

Example 59 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class VCFCompareGT method doWork.

@Override
public int doWork(final List<String> arguments) {
    if (arguments.isEmpty()) {
        LOG.error("VCFs missing.");
        return -1;
    }
    final List<String> vcfLabelList;
    if (!StringUtil.isBlank(this.vcfLabelsStr)) {
        vcfLabelList = Arrays.stream(this.vcfLabelsStr.split("[,]")).filter(S -> !StringUtil.isBlank(S)).collect(Collectors.toList());
        if (new HashSet<>(vcfLabelList).size() != arguments.size()) {
            LOG.error("bad number of labels in : " + this.vcfLabelsStr);
            return -1;
        }
        if (vcfLabelList.stream().anyMatch(S -> !S.matches("[0-9A-Za-z]+"))) {
            LOG.error("bad label in : " + this.vcfLabelsStr);
            return -1;
        }
    } else {
        vcfLabelList = new ArrayList<>();
        for (int i = 0; i < arguments.size(); ++i) {
            vcfLabelList.add(String.valueOf(i + 1));
        }
    }
    VariantComparator varcmp = new VariantComparator();
    SortingCollection<Variant> variants = null;
    final Set<String> sampleNames = new LinkedHashSet<>();
    try {
        variants = SortingCollection.newInstance(Variant.class, new VariantCodec(), varcmp, writingSortingCollection.getMaxRecordsInRam(), writingSortingCollection.getTmpPaths());
        variants.setDestructiveIteration(true);
        final Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        metaData.add(new VCFHeaderLine(getClass().getSimpleName(), "version:" + getVersion() + " command:" + getProgramCommandLine()));
        final BiFunction<String, Integer, String> createName = (SN, IDX) -> SN + "_" + vcfLabelList.get(IDX);
        for (int i = 0; i < arguments.size(); ++i) {
            final File vcfFile = new File(arguments.get(i));
            LOG.info("Opening " + vcfFile);
            final VCFReader vcfFileReader = VCFReaderFactory.makeDefault().open(vcfFile, false);
            final CloseableIterator<VariantContext> iter = vcfFileReader.iterator();
            final VCFHeader header = vcfFileReader.getHeader();
            sampleNames.addAll(header.getSampleNamesInOrder());
            metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "_" + vcfLabelList.get(i), "File: " + vcfFile.getPath()));
            long nLines = 0;
            while (iter.hasNext()) {
                final VariantContext var = iter.next();
                if (nLines++ % 10000 == 0) {
                    LOG.info(vcfFile + " " + nLines);
                }
                if (!this.variantFilter.test(var))
                    continue;
                if (!var.isVariant())
                    continue;
                if (!var.hasGenotypes())
                    continue;
                for (final Genotype genotype : var.getGenotypes()) {
                    final Variant rec = new Variant();
                    rec.file_index1 = i + 1;
                    rec.sampleName = genotype.getSampleName();
                    rec.chrom = var.getContig();
                    rec.start = var.getStart();
                    rec.end = var.getEnd();
                    rec.ref = var.getReference().getDisplayString().toUpperCase();
                    if (var.hasID()) {
                        rec.id = var.getID();
                    }
                    if (genotype.hasDP()) {
                        rec.dp = genotype.getDP();
                    }
                    if (genotype.hasGQ()) {
                        rec.gq = genotype.getGQ();
                    }
                    final List<Allele> alleles = genotype.getAlleles();
                    if (genotype.isNoCall() || !genotype.isAvailable() || alleles == null) {
                        if (!this.convertNoCallToHomRef)
                            continue;
                        rec.a1 = var.getReference().getDisplayString().toUpperCase();
                        rec.a2 = rec.a1;
                    } else if (alleles.size() == 1) {
                        rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
                        rec.a2 = rec.a1;
                    } else if (alleles.size() == 2) {
                        rec.a1 = alleles.get(0).getDisplayString().toUpperCase();
                        rec.a2 = alleles.get(1).getDisplayString().toUpperCase();
                        if (rec.a1.compareTo(rec.a2) > 0) {
                            final String tmp = rec.a2;
                            rec.a2 = rec.a1;
                            rec.a1 = tmp;
                        }
                    } else {
                        continue;
                    }
                    variants.add(rec);
                }
            }
            iter.close();
            vcfFileReader.close();
        }
        variants.doneAdding();
        LOG.info("Done Adding");
        final String GenpotypeChangedKey = "GCH";
        final String GenpotypeCreated = "GNW";
        final String GenpotypeDiff = "GDF";
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
        metaData.add(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_QUALITY_KEY));
        metaData.add(new VCFFormatHeaderLine(GenpotypeChangedKey, 1, VCFHeaderLineType.Integer, "Changed Genotype"));
        metaData.add(new VCFFormatHeaderLine(GenpotypeCreated, 1, VCFHeaderLineType.Integer, "Genotype Created/Deleted"));
        metaData.add(new VCFInfoHeaderLine(GenpotypeDiff, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "Samples with Genotype Difference"));
        metaData.add(new VCFFilterHeaderLine("DISCORDANCE", "something has changed."));
        final Set<String> newSampleNames = new TreeSet<>();
        for (int i = 0; i < vcfLabelList.size(); ++i) {
            for (final String sample : sampleNames) {
                newSampleNames.add(createName.apply(sample, i));
            }
        }
        final VCFHeader header = new VCFHeader(metaData, new ArrayList<>(newSampleNames));
        final VariantContextWriter w = super.openVariantContextWriter(outputFile);
        w.writeHeader(header);
        final PosComparator posCompare = new PosComparator();
        final EqualRangeIterator<Variant> iter = new EqualRangeIterator<>(variants.iterator(), posCompare);
        while (iter.hasNext()) {
            final List<Variant> row = iter.next();
            /**
             * this sample is not always the same
             */
            final Set<String> samplesModified = new TreeSet<>();
            /**
             * the number of sample is different from vcflist.size()
             */
            final Set<String> samplesCreates = new TreeSet<>();
            final Map<String, List<Variant>> sample2variants = row.stream().collect(Collectors.groupingBy(T -> T.sampleName));
            for (final String sn : sample2variants.keySet()) {
                boolean all_hom_ref = true;
                final List<Variant> sampleVariants = sample2variants.get(sn);
                for (int x = 0; x < /*+1 non, besoin de tester hom_ref */
                sampleVariants.size(); ++x) {
                    final Variant var1 = sampleVariants.get(x);
                    if (!var1.isHomRef())
                        all_hom_ref = false;
                    for (int y = x + 1; y < sampleVariants.size(); ++y) {
                        final Variant var2 = sampleVariants.get(y);
                        if (var1.a1.equals(var2.a1) && var1.a2.equals(var2.a2))
                            continue;
                        samplesModified.add(var1.sampleName);
                    }
                }
                if (sampleVariants.size() != arguments.size()) {
                    if (!convertNoCallToHomRef || (this.convertNoCallToHomRef && !all_hom_ref)) {
                        samplesCreates.add(sn);
                    }
                }
            }
            final Variant first = row.get(0);
            final Set<Allele> alleles = new HashSet<>();
            alleles.add(Allele.create(first.ref, true));
            for (final Variant var : row) {
                alleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
                alleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
            }
            final VariantContextBuilder b = new VariantContextBuilder(getClass().getName(), first.chrom, first.start, first.end, alleles);
            // build genotypes
            final List<Genotype> genotypes = new ArrayList<Genotype>();
            for (final Variant var : row) {
                // alleles for this genotype
                final List<Allele> galleles = new ArrayList<Allele>();
                galleles.add(Allele.create(var.a1, var.a1.equalsIgnoreCase(var.ref)));
                galleles.add(Allele.create(var.a2, var.a2.equalsIgnoreCase(var.ref)));
                final GenotypeBuilder gb = new GenotypeBuilder();
                gb.DP(var.dp);
                gb.alleles(galleles);
                gb.name(createName.apply(var.sampleName, var.file_index1 - 1));
                gb.GQ(var.gq);
                gb.attribute(GenpotypeChangedKey, samplesModified.contains(var.sampleName) ? 1 : 0);
                gb.attribute(GenpotypeCreated, samplesCreates.contains(var.sampleName) ? 1 : 0);
                genotypes.add(gb.make());
            }
            b.genotypes(genotypes);
            b.id(first.id);
            if (!(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
                final Set<String> set2 = new TreeSet<String>(samplesModified);
                set2.addAll(samplesCreates);
                b.attribute(GenpotypeDiff, set2.toArray());
                b.filter("DISCORDANCE");
            } else {
                b.passFilters();
            }
            if (!only_print_modified || !(samplesModified.isEmpty() && samplesCreates.isEmpty())) {
                w.add(b.make());
            }
        }
        iter.close();
        w.close();
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        if (variants != null)
            try {
                variants.cleanup();
            } catch (Exception err) {
            }
    }
    return 0;
}
Also used : LinkedHashSet(java.util.LinkedHashSet) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) DataInputStream(java.io.DataInputStream) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) JexlVariantPredicate(com.github.lindenb.jvarkit.util.vcf.JexlVariantPredicate) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) BiFunction(java.util.function.BiFunction) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) DataOutputStream(java.io.DataOutputStream) StringUtil(htsjdk.samtools.util.StringUtil) AbstractDataCodec(com.github.lindenb.jvarkit.util.picard.AbstractDataCodec) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) LinkedHashSet(java.util.LinkedHashSet) VCFConstants(htsjdk.variant.vcf.VCFConstants) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) File(java.io.File) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Comparator(java.util.Comparator) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VCFReader(htsjdk.variant.vcf.VCFReader) TreeSet(java.util.TreeSet) ArrayList(java.util.ArrayList) List(java.util.List) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) File(java.io.File)

Example 60 with VCFReader

use of htsjdk.variant.vcf.VCFReader in project jvarkit by lindenb.

the class VCFMerge method doWork.

@Override
public int doWork(final List<String> args) {
    final List<Path> userVcfFiles = new ArrayList<Path>();
    final Set<String> genotypeSampleNames = new TreeSet<String>();
    SAMSequenceDictionary dict = null;
    VariantContextWriter w = null;
    SortingCollection<VariantContext> array = null;
    CloseableIterator<VariantContext> iter = null;
    try {
        userVcfFiles.addAll(IOUtils.unrollPaths(args));
        final Set<String> fieldsSet = Arrays.stream(this.fieldsString.split("[,; ]")).collect(Collectors.toSet());
        final boolean with_ac = fieldsSet.contains(VCFConstants.ALLELE_COUNT_KEY);
        final boolean with_an = fieldsSet.contains(VCFConstants.ALLELE_NUMBER_KEY);
        final boolean with_af = fieldsSet.contains(VCFConstants.ALLELE_FREQUENCY_KEY);
        final boolean with_dp = fieldsSet.contains(VCFConstants.DEPTH_KEY);
        final boolean with_gq = fieldsSet.contains(VCFConstants.GENOTYPE_QUALITY_KEY);
        final boolean with_ad = fieldsSet.contains(VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        final boolean with_pl = fieldsSet.contains(VCFConstants.GENOTYPE_PL_KEY);
        if (userVcfFiles.isEmpty()) {
            LOG.error("No input");
            return -1;
        }
        final boolean requireIndex = !StringUtils.isBlank(this.regionStr);
        for (final Path vcfFile : userVcfFiles) {
            try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
                final VCFHeader header = in.getHeader();
                for (final String sn : header.getSampleNamesInOrder()) {
                    if (genotypeSampleNames.contains(sn)) {
                        LOG.error("duplicate sample name " + sn);
                        return -1;
                    }
                    genotypeSampleNames.add(sn);
                }
                final SAMSequenceDictionary dict1 = SequenceDictionaryUtils.extractRequired(header);
                if (dict == null) {
                    dict = dict1;
                } else {
                    SequenceUtil.assertSequenceDictionariesEqual(dict, dict1);
                }
            }
        }
        if (dict == null) {
            LOG.error("No sequence dictionary defined");
            return -1;
        }
        final SAMSequenceDictionary finalDict = dict;
        final Function<String, Integer> contig2tid = C -> {
            final int tid = finalDict.getSequenceIndex(C);
            if (tid == -1)
                throw new JvarkitException.ContigNotFoundInDictionary(C, finalDict);
            return tid;
        };
        final Comparator<String> compareContigs = (C1, C2) -> {
            if (C1.equals(C2))
                return 0;
            return contig2tid.apply(C1) - contig2tid.apply(C2);
        };
        final Comparator<VariantContext> compareChromPos = (V1, V2) -> {
            int i = compareContigs.compare(V1.getContig(), V2.getContig());
            if (i != 0)
                return i;
            return V1.getStart() - V2.getStart();
        };
        final Comparator<VariantContext> compareChromPosRef = (V1, V2) -> {
            int i = compareChromPos.compare(V1, V2);
            if (i != 0)
                return i;
            return V1.getReference().compareTo(V2.getReference());
        };
        final SimpleInterval rgn;
        final Predicate<VariantContext> accept;
        if (!StringUtil.isBlank(VCFMerge.this.regionStr)) {
            rgn = IntervalParserFactory.newInstance().dictionary(dict).enableWholeContig().make().apply(VCFMerge.this.regionStr).orElseThrow(IntervalParserFactory.exception(VCFMerge.this.regionStr));
            accept = (CTX) -> {
                return rgn.overlaps(CTX);
            };
        } else {
            accept = (VOL) -> true;
            rgn = null;
        }
        final Set<VCFHeaderLine> metaData = new HashSet<>();
        VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_KEY);
        if (with_dp)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.DEPTH_KEY);
        if (with_gq)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_QUALITY_KEY);
        if (with_pl)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_PL_KEY);
        if (with_ad)
            VCFStandardHeaderLines.addStandardFormatLines(metaData, true, VCFConstants.GENOTYPE_ALLELE_DEPTHS);
        if (with_ac)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_COUNT_KEY);
        if (with_an)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_NUMBER_KEY);
        if (with_af)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.ALLELE_FREQUENCY_KEY);
        if (with_dp)
            VCFStandardHeaderLines.addStandardInfoLines(metaData, true, VCFConstants.DEPTH_KEY);
        final VCFHeader mergedHeader = new VCFHeader(metaData, genotypeSampleNames);
        mergedHeader.setSequenceDictionary(dict);
        JVarkitVersion.getInstance().addMetaData(this, mergedHeader);
        array = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(mergedHeader), compareChromPosRef, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        array.setDestructiveIteration(true);
        for (final Path vcfFile : userVcfFiles) {
            try (VCFReader in = VCFReaderFactory.makeDefault().open(vcfFile, requireIndex)) {
                try (CloseableIterator<VariantContext> lit = (in.isQueryable() && rgn != null ? in.query(rgn) : in.iterator())) {
                    while (lit.hasNext()) {
                        final VariantContext ctx = lit.next();
                        if (!accept.test(ctx))
                            continue;
                        array.add(new VariantContextBuilder(ctx).unfiltered().genotypes(ctx.getGenotypes().stream().filter(G -> G.isCalled()).map(G -> {
                            final GenotypeBuilder gb = new GenotypeBuilder(G);
                            gb.noAttributes();
                            return gb.make();
                        }).collect(Collectors.toList())).rmAttributes(new ArrayList<>(ctx.getAttributes().keySet())).make());
                    }
                }
            }
        }
        array.doneAdding();
        LOG.info("merging..." + userVcfFiles.size() + " vcfs");
        // create the context writer
        w = this.writingVariantsDelegate.open(outputFile);
        w.writeHeader(mergedHeader);
        iter = array.iterator();
        EqualRangeIterator<VariantContext> eqiter = new EqualRangeIterator<>(iter, compareChromPosRef);
        while (eqiter.hasNext()) {
            final List<VariantContext> row = eqiter.next();
            final VariantContext first = row.get(0);
            final List<Allele> alleles = new ArrayList<>();
            alleles.add(first.getReference());
            alleles.addAll(row.stream().flatMap(VC -> VC.getAlternateAlleles().stream()).filter(A -> !A.isNoCall()).collect(Collectors.toSet()));
            final VariantContextBuilder vcb = new VariantContextBuilder(null, first.getContig(), first.getStart(), first.getEnd(), alleles);
            final String id = row.stream().filter(V -> V.hasID()).map(ST -> ST.getID()).findFirst().orElse(null);
            if (!StringUtils.isBlank(id)) {
                vcb.id(id);
            }
            final Map<String, Genotype> sample2genotypes = new HashMap<>(genotypeSampleNames.size());
            final Set<String> remainingSamples = new HashSet<String>(genotypeSampleNames);
            int an = 0;
            int dp = -1;
            final Counter<Allele> ac = new Counter<>();
            for (final VariantContext ctx : row) {
                for (final Genotype gt : ctx.getGenotypes()) {
                    if (gt.isNoCall())
                        continue;
                    for (Allele a : gt.getAlleles()) {
                        ac.incr(a);
                        an++;
                    }
                    final GenotypeBuilder gb = new GenotypeBuilder(gt.getSampleName(), gt.getAlleles());
                    if (with_dp && gt.hasDP()) {
                        if (dp < 0)
                            dp = 0;
                        dp += gt.getDP();
                        gb.DP(gt.getDP());
                    }
                    if (with_gq && gt.hasGQ())
                        gb.GQ(gt.getGQ());
                    if (with_pl && gt.hasPL())
                        gb.PL(gt.getPL());
                    if (with_ad && gt.hasAD()) {
                        final int[] src_ad = gt.getAD();
                        final int[] dest_ad = new int[alleles.size()];
                        Arrays.fill(dest_ad, 0);
                        for (int i = 0; i < src_ad.length && i < ctx.getAlleles().size(); i++) {
                            final Allele a1 = ctx.getAlleles().get(i);
                            final int dest_idx = alleles.indexOf(a1);
                            if (dest_idx >= 0 && dest_idx < dest_ad.length) {
                                dest_ad[dest_idx] = src_ad[i];
                            }
                            gb.AD(dest_ad);
                        }
                    }
                    sample2genotypes.put(gt.getSampleName(), gb.make());
                }
            }
            remainingSamples.removeAll(sample2genotypes.keySet());
            for (String sampleName : remainingSamples) {
                final Genotype gt;
                if (this.useHomRefForUnknown) {
                    final List<Allele> list = new ArrayList<>(ploidy);
                    for (int i = 0; i < ploidy; i++) list.add(first.getReference());
                    an += ploidy;
                    gt = GenotypeBuilder.create(sampleName, list);
                } else {
                    gt = GenotypeBuilder.createMissing(sampleName, this.ploidy);
                }
                sample2genotypes.put(sampleName, gt);
            }
            if (with_an)
                vcb.attribute(VCFConstants.ALLELE_NUMBER_KEY, an);
            if (with_ac)
                vcb.attribute(VCFConstants.ALLELE_COUNT_KEY, alleles.subList(1, alleles.size()).stream().mapToInt(A -> (int) ac.count(A)).toArray());
            if (with_af && an > 0) {
                final double finalAn = an;
                vcb.attribute(VCFConstants.ALLELE_FREQUENCY_KEY, alleles.subList(1, alleles.size()).stream().mapToDouble(A -> ac.count(A) / finalAn).toArray());
            }
            if (with_dp && dp >= 0) {
                vcb.attribute(VCFConstants.DEPTH_KEY, dp);
            }
            vcb.genotypes(sample2genotypes.values());
            w.add(vcb.make());
        }
        eqiter.close();
        CloserUtil.close(w);
        w = null;
        array.cleanup();
        array = null;
        CloserUtil.close(iter);
        iter = null;
        return 0;
    } catch (Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(w);
        CloserUtil.close(iter);
        if (array != null)
            array.cleanup();
    }
}
Also used : WritingVariantsDelegate(com.github.lindenb.jvarkit.variant.variantcontext.writer.WritingVariantsDelegate) Genotype(htsjdk.variant.variantcontext.Genotype) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Arrays(java.util.Arrays) SequenceUtil(htsjdk.samtools.util.SequenceUtil) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashMap(java.util.HashMap) IntervalParserFactory(com.github.lindenb.jvarkit.samtools.util.IntervalParserFactory) Function(java.util.function.Function) TreeSet(java.util.TreeSet) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) Path(java.nio.file.Path) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) Counter(com.github.lindenb.jvarkit.util.Counter) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) SortingCollection(htsjdk.samtools.util.SortingCollection) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) Comparator(java.util.Comparator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) EqualRangeIterator(com.github.lindenb.jvarkit.util.iterator.EqualRangeIterator) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Counter(com.github.lindenb.jvarkit.util.Counter) TreeSet(java.util.TreeSet) VCFReader(htsjdk.variant.vcf.VCFReader) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Path(java.nio.file.Path) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Aggregations

VCFReader (htsjdk.variant.vcf.VCFReader)60 VariantContext (htsjdk.variant.variantcontext.VariantContext)45 Path (java.nio.file.Path)41 VCFHeader (htsjdk.variant.vcf.VCFHeader)37 VCFReaderFactory (com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory)34 CloseableIterator (htsjdk.samtools.util.CloseableIterator)32 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)31 Collectors (java.util.stream.Collectors)30 ArrayList (java.util.ArrayList)29 List (java.util.List)29 Logger (com.github.lindenb.jvarkit.util.log.Logger)28 Parameter (com.beust.jcommander.Parameter)27 Program (com.github.lindenb.jvarkit.util.jcommander.Program)26 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)24 Set (java.util.Set)22 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)21 IOException (java.io.IOException)21 CloserUtil (htsjdk.samtools.util.CloserUtil)20 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)20 StringUtils (com.github.lindenb.jvarkit.lang.StringUtils)19