Search in sources :

Example 36 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class VcfConcat method fromFiles.

private int fromFiles(final VariantContextWriter out) throws IOException {
    List<VcfIterator> inputs = new ArrayList<VcfIterator>(this.inputFiles.size());
    List<String> inputFiles = new ArrayList<>(this.inputFiles.size());
    List<String> samples = new ArrayList<>();
    SAMSequenceDictionary dict = null;
    try {
        Set<VCFHeaderLine> metaData = new HashSet<VCFHeaderLine>();
        /* open each vcf file */
        for (String vcfFile : this.inputFiles) {
            LOG.info("Opening " + vcfFile);
            VcfIterator r = VCFUtils.createVcfIterator(vcfFile);
            /* check VCF dict */
            VCFHeader header = r.getHeader();
            if (dict == null && inputs.isEmpty()) {
                dict = header.getSequenceDictionary();
            } else if (!inputs.isEmpty() && ((dict == null && header.getSequenceDictionary() != null) || (dict != null && header.getSequenceDictionary() == null))) {
                LOG.error("not.the.same.sequence.dictionaries");
                return -1;
            } else if (!inputs.isEmpty() && dict != null && !SequenceUtil.areSequenceDictionariesEqual(dict, header.getSequenceDictionary())) {
                LOG.error("not.the.same.sequence.dictionaries");
                return -1;
            }
            /* check samples */
            if (inputs.isEmpty()) {
                samples = header.getSampleNamesInOrder();
            } else if (!header.getSampleNamesInOrder().equals(samples)) {
                LOG.error("No same samples");
                return -1;
            }
            metaData.addAll(header.getMetaDataInInputOrder());
            inputs.add(r);
            inputFiles.add(VCFUtils.escapeInfoField(vcfFile));
        }
        /* create comparator according to dict*/
        final Comparator<VariantContext> comparator = (dict == null ? VCFUtils.createChromPosRefComparator() : VCFUtils.createTidPosRefComparator(dict));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "CmdLine", String.valueOf(getProgramCommandLine())));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "Version", String.valueOf(getVersion())));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkVersion", HtsjdkVersion.getVersion()));
        metaData.add(new VCFHeaderLine(getClass().getSimpleName() + "HtsJdkHome", HtsjdkVersion.getHome()));
        metaData.add(new VCFInfoHeaderLine(VARIANTSOURCE, 1, VCFHeaderLineType.String, "Origin File of Varant"));
        VCFHeader h2 = new VCFHeader(metaData, samples);
        out.writeHeader(h2);
        SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
        for (; ; ) {
            /* get 'smallest' variant */
            VariantContext smallest = null;
            int idx = 0;
            int best_idx = -1;
            while (idx < inputs.size()) {
                VcfIterator in = inputs.get(idx);
                if (!in.hasNext()) {
                    CloserUtil.close(in);
                    inputs.remove(idx);
                    inputFiles.remove(idx);
                } else {
                    VariantContext ctx = in.peek();
                    if (smallest == null || comparator.compare(smallest, ctx) > 0) {
                        smallest = ctx;
                        best_idx = idx;
                    }
                    ++idx;
                }
            }
            if (smallest == null)
                break;
            final VariantContext ctx = progress.watch(inputs.get(best_idx).next());
            final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
            vcb.attribute(VARIANTSOURCE, inputFiles.get(best_idx));
            out.add(vcb.make());
        }
        progress.finish();
        return 0;
    } catch (Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(inputs);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) IOException(java.io.IOException) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet)

Example 37 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class VcfToZip method doWork.

@Override
public int doWork(List<String> args) {
    if (this.outputFile != null && this.outputFile.getName().endsWith(".zip")) {
        LOG.error("Filename must end with '.zip' " + outputFile);
        return -1;
    }
    LineIterator lr = null;
    VcfIterator in = null;
    ZipOutputStream zout = null;
    FileOutputStream fout = null;
    int num_vcfs = 0;
    VariantContextWriter vcw = null;
    args = new ArrayList<>(IOUtils.unrollFiles(args));
    try {
        int optind = 0;
        if (this.outputFile != null) {
            fout = new FileOutputStream(this.outputFile);
            zout = new ZipOutputStream(fout);
        } else {
            zout = new ZipOutputStream(stdout());
        }
        do {
            if (args.isEmpty()) {
                LOG.info("reading concatenated vcf from stdin");
                lr = IOUtils.openStreamForLineIterator(stdin());
            } else {
                LOG.info("reading concatenated vcf from " + args.get(optind));
                lr = IOUtils.openURIForLineIterator(args.get(optind));
            }
            while (lr.hasNext()) {
                ++num_vcfs;
                in = VCFUtils.createVcfIteratorFromLineIterator(lr, true);
                final VCFHeader header = in.getHeader();
                String filename = null;
                if (this.titleHeaderStr != null && !this.titleHeaderStr.isEmpty()) {
                    final VCFHeaderLine h = header.getOtherHeaderLine(this.titleHeaderStr);
                    if (h != null && !h.getValue().trim().isEmpty())
                        filename = h.getValue().trim();
                }
                if (filename == null || filename.trim().isEmpty()) {
                    // create title
                    filename = String.format("vcf2zip.%05d.vcf", num_vcfs);
                    // set name in header
                    if (this.titleHeaderStr != null && !this.titleHeaderStr.isEmpty()) {
                        header.addMetaDataLine(new VCFHeaderLine(this.titleHeaderStr.trim(), filename));
                    }
                }
                if (!filename.endsWith(".vcf")) {
                    filename += ".vcf";
                }
                if (!this.zipPrefix.isEmpty()) {
                    filename = this.zipPrefix + (this.zipPrefix.endsWith("/") ? "" : "/") + filename;
                }
                LOG.info(filename);
                final ZipEntry entry = new ZipEntry(filename);
                entry.setComment("Created with " + getProgramName());
                zout.putNextEntry(entry);
                vcw = VCFUtils.createVariantContextWriterToOutputStream(IOUtils.uncloseableOutputStream(zout));
                vcw.writeHeader(header);
                final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
                while (in.hasNext()) {
                    vcw.add(progress.watch(in.next()));
                }
                vcw.close();
                progress.finish();
                zout.closeEntry();
                in.close();
            }
            ++optind;
        } while (optind < args.size());
        zout.finish();
        zout.flush();
        zout.close();
        zout = null;
        CloserUtil.close(fout);
        LOG.info("done. Number of VCFs:" + num_vcfs);
        return RETURN_OK;
    } catch (final Exception e) {
        LOG.error(e);
        return -1;
    } finally {
        CloserUtil.close(in);
        CloserUtil.close(lr);
        CloserUtil.close(zout);
        CloserUtil.close(fout);
    }
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) ZipEntry(java.util.zip.ZipEntry) LineIterator(htsjdk.tribble.readers.LineIterator) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) ZipOutputStream(java.util.zip.ZipOutputStream) FileOutputStream(java.io.FileOutputStream) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Example 38 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class LumpyVcfToCircos method doWork.

@Override
public int doWork(List<String> args) {
    VcfIterator r = null;
    ArchiveFactory archiveFactory = null;
    PrintWriter pw = null;
    PrintWriter conf = null;
    try {
        r = super.openVcfIterator(oneFileOrNull(args));
        final VCFHeader header = r.getHeader();
        final SAMSequenceDictionary dict = header.getSequenceDictionary();
        if (dict == null) {
            LOG.error("No dictionary in vcf");
            return -1;
        }
        archiveFactory = ArchiveFactory.open(outputFile);
        conf = archiveFactory.openWriter(createFilename("ideogram.conf"));
        conf.println("<ideogram>");
        conf.println("show = yes");
        conf.println("<spacing>");
        conf.println("default = 1u");
        conf.println("<pairwise \"/hs/ /hs/\">");
        conf.println("spacing = 0.5r");
        conf.println("</pairwise>");
        conf.println("");
        conf.println("</spacing>");
        conf.println("");
        conf.println("thickness         = 25p");
        conf.println("");
        conf.println("#stroke_thickness = 1");
        conf.println("#stroke_color     = black");
        conf.println("");
        conf.println("fill           = yes");
        conf.println("fill_color     = black");
        conf.println("");
        conf.println("radius         = 1r");
        conf.println("show_label     = yes");
        conf.println("label_font     = default");
        conf.println("label_radius   = dims(ideogram,radius_outer) + 10p");
        conf.println("label_size     = 12p");
        conf.println("label_parallel = yes");
        conf.println("");
        conf.println("show_bands            = yes");
        conf.println("fill_bands            = yes");
        conf.println("band_stroke_thickness = 0");
        conf.println("band_stroke_color     = black");
        conf.println("band_transparency     = 4");
        conf.println("");
        conf.println("</ideogram>");
        conf.close();
        conf = archiveFactory.openWriter(createFilename("ticks.conf"));
        conf.println("");
        conf.println("show_ticks        = no");
        conf.println("show_tick_labels  = no");
        conf.println("show_grid         = no");
        conf.println("");
        conf.println("<ticks>");
        conf.println("tick_label_font  = light");
        conf.println("radius           = dims(ideogram,radius_outer) + 180p");
        conf.println("label_offset     = 5p");
        conf.println("label_size       = 16p");
        conf.println("multiplier       = 1e-6");
        conf.println("color            = black");
        conf.println("thickness        = 1p");
        conf.println("");
        conf.println("# 25 Mb ticks, all chromosomes, with labels");
        conf.println("");
        conf.println("<tick>");
        conf.println("spacing        = 25u");
        conf.println("size           = 12p");
        conf.println("show_label     = yes");
        conf.println("format         = %d");
        conf.println("</tick>");
        conf.println("");
        conf.println("# 5 Mb ticks, all chromosomes, with labels");
        conf.println("# labels must be separated by at least 1px, which");
        conf.println("#   avoids overlap on human chrs");
        conf.println("<tick>");
        conf.println("label_separation = 1p");
        conf.println("spacing          = 5u");
        conf.println("size             = 7p");
        conf.println("show_label       = yes");
        conf.println("format           = %d");
        conf.println("</tick>");
        conf.println("");
        conf.println("# 5 Mb ticks with grid on rn1 and mm1, drawn only");
        conf.println("# for their grid (note size=0p)");
        conf.println("<tick>");
        conf.println("chromosomes_display_default = no");
        conf.println("chromosomes    = rn1;mm1");
        conf.println("spacing        = 5u");
        conf.println("size           = 0p");
        conf.println("force_display  = yes");
        conf.println("grid_start     = 0.45r");
        conf.println("grid_end       = dims(ideogram,radius_outer) + 180p");
        conf.println("grid_color     = grey");
        conf.println("grid_thickness = 1p");
        conf.println("grid           = yes");
        conf.println("</tick>");
        conf.println("");
        conf.println("# 20% relative ticks on human chromosomes");
        conf.println("");
        conf.println("<tick>");
        conf.println("chromosomes    = -rn1;-mm1");
        conf.println("radius         = 0.95r");
        conf.println("spacing_type   = relative");
        conf.println("rspacing       = 0.20");
        conf.println("size           = 6p");
        conf.println("show_label     = yes");
        conf.println("label_relative = yes");
        conf.println("rmultiplier    = 100");
        conf.println("format         = %d");
        conf.println("suffix         = %");
        conf.println("");
        conf.println("skip_last_label= yes");
        conf.println("");
        conf.println("grid_start     = 0.885r");
        conf.println("grid_end       = 0.95r");
        conf.println("grid_color     = grey");
        conf.println("grid_thickness = 1p");
        conf.println("grid           = yes");
        conf.println("</tick>");
        conf.println("");
        conf.println("# relative ticks at start and end of human chromosomes");
        conf.println("# with grid");
        conf.println("");
        conf.println("<tick>");
        conf.println("spacing_type   = relative");
        conf.println("rspacing       = 1");
        conf.println("size           = 6p");
        conf.println("grid_start     = 0.45r");
        conf.println("grid_end       = dims(ideogram,radius_outer) + 180p");
        conf.println("grid_color     = grey");
        conf.println("grid_thickness = 1p");
        conf.println("grid           = yes");
        conf.println("</tick>");
        conf.println("");
        conf.println("<tick>");
        conf.println("chromosomes    = -rn1;-mm1");
        conf.println("radius         = 0.95r");
        conf.println("spacing_type   = relative");
        conf.println("rspacing       = 0.10");
        conf.println("size           = 3p");
        conf.println("show_label     = no");
        conf.println("");
        conf.println("grid_start     = 0.885r");
        conf.println("grid_end       = 0.95r");
        conf.println("grid_color     = lgrey");
        conf.println("grid_thickness = 1p");
        conf.println("grid           = yes");
        conf.println("</tick>");
        conf.println("");
        conf.println("# 25% relative ticks on human chromosomes");
        conf.println("");
        conf.println("<tick>");
        conf.println("chromosomes    = -rn1;-mm1");
        conf.println("radius         = 0.82r");
        conf.println("spacing_type   = relative");
        conf.println("rspacing       = 0.25");
        conf.println("size           = 6p");
        conf.println("show_label     = yes");
        conf.println("label_relative = yes");
        conf.println("rmultiplier    = 100");
        conf.println("format         = %d");
        conf.println("");
        conf.println("skip_last_label= yes");
        conf.println("");
        conf.println("grid_start     = 0.755r");
        conf.println("grid_end       = 0.82r");
        conf.println("grid_color     = grey");
        conf.println("grid_thickness = 1p");
        conf.println("grid           = yes");
        conf.println("</tick>");
        conf.println("");
        conf.println("</ticks>");
        conf.close();
        conf = archiveFactory.openWriter(createFilename("circos.conf"));
        conf.println("karyotype=" + createFilename("karyotype.txt"));
        conf.println("chromosomes_units=1000000");
        conf.println("<<include " + createFilename("ideogram.conf") + ">>");
        conf.println("<<include " + createFilename("ticks.conf") + ">>");
        final Map<String, SampleWriters> sample2writers = new HashMap<>(header.getNGenotypeSamples());
        for (final String sampleName : header.getSampleNamesInOrder()) {
            sample2writers.put(sampleName, new SampleWriters(sampleName));
        }
        while (r.hasNext()) {
            final VariantContext vc = r.next();
            for (final SampleWriters sw : sample2writers.values()) {
                sw.visit(vc);
            }
        }
        r.close();
        r = null;
        LOG.info("writing conf");
        final Set<String> seen_chromosomes = new HashSet<>(dict.size());
        for (final SampleWriters sw : sample2writers.values()) {
            seen_chromosomes.addAll(sw.bdns.stream().map(SV -> SV.contig).collect(Collectors.toSet()));
            seen_chromosomes.addAll(sw.bdns.stream().map(SV -> SV.bnb_contig).collect(Collectors.toSet()));
            seen_chromosomes.addAll(sw.duplications.stream().map(SV -> SV.contig).collect(Collectors.toSet()));
            seen_chromosomes.addAll(sw.invertions.stream().map(SV -> SV.contig).collect(Collectors.toSet()));
            seen_chromosomes.addAll(sw.deletions.stream().map(SV -> SV.contig).collect(Collectors.toSet()));
        }
        pw = archiveFactory.openWriter(createFilename("karyotype.txt"));
        for (int i = 0; i < dict.size(); ++i) {
            final SAMSequenceRecord ssr = dict.getSequence(i);
            if (!seen_chromosomes.contains(ssr.getSequenceName()))
                continue;
            pw.print("chr - ");
            pw.print(ssr.getSequenceName());
            pw.print(" ");
            pw.print(ssr.getSequenceName());
            pw.print(" 0 ");
            pw.print(ssr.getSequenceLength());
            pw.print(" ");
            pw.println(i % 2 == 0 ? "blue" : "grey");
        }
        pw.flush();
        pw.close();
        Function<String, String> sample2color = S -> {
            int i = header.getSampleNamesInOrder().indexOf(S);
            if (i != -1) {
                if (i % 2 == 0)
                    i = ((header.getSampleNamesInOrder().size() - 1) - i);
                int g = 30 + (int) ((i / (double) header.getSampleNamesInOrder().size()) * 50.0);
                return "gray" + (int) g;
            }
            return (i % 2 == 0 ? "green" : "red");
        };
        final double minR = 0.1;
        final double maxR = 0.95;
        final double numberOfTracks = sample2writers.values().stream().mapToInt(X -> X.getNumberOfTracks()).sum();
        // final double maxSu = sample2writers.values().stream().mapToInt(X->X.getMaxSupportingEvidences()).max().orElse(1);
        double radius = minR;
        final double dr = (maxR - minR) / numberOfTracks;
        LOG.info("N tracks = " + numberOfTracks + " dr=" + dr);
        conf.println("<plots>");
        int maxSu = sample2writers.values().stream().flatMap(S -> S.deletions.stream()).mapToInt(SV -> SV.su).max().orElse(0);
        for (final SampleWriters sw : sample2writers.values()) {
            if (maxSu == 0 || sw.deletions.isEmpty())
                continue;
            String file = sw.getDeletionFilename();
            pw = archiveFactory.openWriter(file);
            for (final SVDel sv : sw.deletions) sv.print(pw);
            pw.flush();
            pw.close();
            conf.println("<plot>");
            conf.println("type = histogram");
            conf.println("file = " + file);
            conf.println("min = 0");
            conf.println("max = " + maxSu);
            conf.println("orientation = out");
            conf.println("layers = 1");
            conf.println("margin = 0.02u");
            conf.println("color = black_a4");
            conf.println("r0 = " + radius + "r");
            conf.println("r1 = " + (radius + dr) + "r");
            conf.println("fill_color = " + sample2color.apply(sw.sampleName));
            // +sample2color.apply(sw.sampleName));
            conf.println("stroke_color = grey");
            conf.println("thickness = 1");
            conf.println("extend_bin  = no");
            // conf.println("<backgrounds>\n<background>\ncolor ="+(zz%2==0?"pink":"yellow")+"\n</background>\n</backgrounds>");
            // conf.println("<axes>\n<axis>\nspacing   = "+(dr/10.0)+"\ncolor=lgrey\nthickness=2\n</axis>\n</axes>");
            conf.println("</plot>");
            radius += dr;
        }
        maxSu = sample2writers.values().stream().flatMap(S -> S.invertions.stream()).mapToInt(SV -> SV.su).max().orElse(0);
        for (final SampleWriters sw : sample2writers.values()) {
            if (maxSu == 0 || sw.invertions.isEmpty())
                continue;
            String file = sw.getInvertionFilename();
            pw = archiveFactory.openWriter(file);
            for (final SVInv sv : sw.invertions) sv.print(pw);
            pw.flush();
            pw.close();
            conf.println("<plot>");
            conf.println("type = histogram");
            conf.println("file = " + file);
            conf.println("min = 0");
            conf.println("max = " + maxSu);
            conf.println("color = black_a4");
            conf.println("r0 = " + radius + "r");
            conf.println("r1 = " + (radius + dr) + "r");
            conf.println("fill_color = " + sample2color.apply(sw.sampleName));
            conf.println("stroke_color = " + sample2color.apply(sw.sampleName));
            conf.println("thickness = 1");
            conf.println("extend_bin  = no");
            conf.println("</plot>");
            radius += dr;
        }
        maxSu = sample2writers.values().stream().flatMap(S -> S.duplications.stream()).mapToInt(SV -> SV.su).max().orElse(0);
        for (final SampleWriters sw : sample2writers.values()) {
            if (maxSu == 0 || sw.duplications.isEmpty())
                continue;
            String file = sw.getDuplicationFilename();
            pw = archiveFactory.openWriter(file);
            for (final SVDup sv : sw.duplications) sv.print(pw);
            pw.flush();
            pw.close();
            conf.println("<plot>");
            conf.println("type = histogram");
            conf.println("file = " + file);
            conf.println("min = 0");
            conf.println("max = " + maxSu);
            conf.println("color = black_a4");
            conf.println("r0 = " + radius + "r");
            conf.println("r1 = " + (radius + dr) + "r");
            conf.println("fill_color = " + sample2color.apply(sw.sampleName));
            conf.println("stroke_color = " + sample2color.apply(sw.sampleName));
            conf.println("thickness = 1");
            conf.println("extend_bin  = no");
            conf.println("</plot>");
            radius += dr;
        }
        conf.println("</plots>");
        conf.println("<links>");
        for (final SampleWriters sw : sample2writers.values()) {
            if (sw.bdns.isEmpty())
                continue;
            pw = archiveFactory.openWriter(sw.getBnbFilename());
            for (final SVBnB sv : sw.bdns) sv.print(pw);
            pw.flush();
            pw.close();
            conf.println("<link>");
            conf.println("file=" + sw.getBnbFilename());
            conf.println("radius=" + radius + "r");
            conf.println("bezier_radius = " + (radius + dr) + "r");
            conf.println("fill_color = " + sample2color.apply(sw.sampleName));
            conf.println("stroke_color = " + sample2color.apply(sw.sampleName));
            conf.println("</link>");
            radius += dr;
        }
        conf.println("</links>");
        sample2writers.clear();
        conf.println("");
        conf.println("<image>");
        conf.println("# Included from Circos distribution.");
        conf.println("<<include etc/image.conf>>");
        conf.println("</image>");
        conf.println("");
        conf.println("<<include etc/colors_fonts_patterns.conf>>");
        conf.println("");
        conf.println("# Debugging, I/O an dother system parameters");
        conf.println("# Included from Circos distribution.");
        conf.println("<<include etc/housekeeping.conf>>");
        conf.println("");
        conf.flush();
        conf.close();
        conf = null;
        archiveFactory.close();
        archiveFactory = null;
        LOG.info("done");
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(conf);
        CloserUtil.close(archiveFactory);
        CloserUtil.close(r);
    }
}
Also used : PrintWriter(java.io.PrintWriter) Genotype(htsjdk.variant.variantcontext.Genotype) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) Logger(com.github.lindenb.jvarkit.util.log.Logger) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Set(java.util.Set) IOException(java.io.IOException) HashMap(java.util.HashMap) Function(java.util.function.Function) Collectors(java.util.stream.Collectors) File(java.io.File) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) List(java.util.List) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) CloserUtil(htsjdk.samtools.util.CloserUtil) ArchiveFactory(com.github.lindenb.jvarkit.io.ArchiveFactory) HashMap(java.util.HashMap) VariantContext(htsjdk.variant.variantcontext.VariantContext) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) IOException(java.io.IOException) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet)

Example 39 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class FixVCF method doWork.

private int doWork(String filenameIn, InputStream vcfStream, VariantContextWriter w) throws IOException {
    final AbstractVCFCodec vcfCodec = VCFUtils.createDefaultVCFCodec();
    LineIterator r = new LineIteratorImpl(new SynchronousLineReader(vcfStream));
    final VCFHeader header = (VCFHeader) vcfCodec.readActualHeader(r);
    // samples names have been changed by picard api and reordered !!!
    // re-create the original order
    List<String> sampleNamesInSameOrder = new ArrayList<String>(header.getSampleNamesInOrder().size());
    for (int col = 0; col < header.getSampleNamesInOrder().size(); ++col) {
        for (String sample : header.getSampleNameToOffset().keySet()) {
            if (header.getSampleNameToOffset().get(sample) == col) {
                sampleNamesInSameOrder.add(sample);
                break;
            }
        }
    }
    if (sampleNamesInSameOrder.size() != header.getSampleNamesInOrder().size()) {
        throw new IllegalStateException();
    }
    VCFHeader h2 = new VCFHeader(header.getMetaDataInInputOrder(), sampleNamesInSameOrder);
    File tmp = IOUtil.newTempFile("tmp", ".vcf.gz", new File[] { tmpDir });
    tmp.deleteOnExit();
    PrintWriter pw = new PrintWriter(new GZIPOutputStream(new FileOutputStream(tmp)));
    while (r.hasNext()) {
        String line = r.next();
        pw.println(line);
        VariantContext ctx = null;
        try {
            ctx = vcfCodec.decode(line);
        } catch (Exception err) {
            pw.close();
            LOG.error(line);
            LOG.error(err);
            return -1;
        }
        for (String f : ctx.getFilters()) {
            if (h2.getFilterHeaderLine(f) != null)
                continue;
            // if(f.equals(VCFConstants.PASSES_FILTERS_v4)) continue; hum...
            if (f.isEmpty() || f.equals(VCFConstants.UNFILTERED))
                continue;
            LOG.info("Fixing missing Filter:" + f);
            h2.addMetaDataLine(new VCFFilterHeaderLine(f));
        }
        for (String tag : ctx.getAttributes().keySet()) {
            if (h2.getInfoHeaderLine(tag) != null)
                continue;
            LOG.info("Fixing missing INFO:" + tag);
            h2.addMetaDataLine(new VCFInfoHeaderLine(tag, VCFHeaderLineCount.UNBOUNDED, VCFHeaderLineType.String, "undefined. Saved by " + getClass()));
        }
    }
    pw.flush();
    pw.close();
    pw = null;
    LOG.info("re-reading VCF frm tmpFile:" + tmp);
    h2.addMetaDataLine(new VCFHeaderLine(getClass().getSimpleName(), "Saved VCF FILTER AND INFO from " + filenameIn));
    // save header in memory
    ByteArrayOutputStream baos = new ByteArrayOutputStream();
    VariantContextWriter w2 = VCFUtils.createVariantContextWriterToOutputStream(baos);
    w2.writeHeader(h2);
    w2.close();
    baos.close();
    // reopen tmp file
    @SuppressWarnings("resource") VcfIterator in = new VcfIteratorImpl(new SequenceInputStream(new ByteArrayInputStream(baos.toByteArray()), new GZIPInputStream(new FileInputStream(tmp))));
    w.writeHeader(h2);
    while (in.hasNext()) {
        w.add(in.next());
    }
    in.close();
    tmp.delete();
    return 0;
}
Also used : VCFHeaderLine(htsjdk.variant.vcf.VCFHeaderLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) LineIterator(htsjdk.tribble.readers.LineIterator) GZIPInputStream(java.util.zip.GZIPInputStream) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) GZIPOutputStream(java.util.zip.GZIPOutputStream) SynchronousLineReader(htsjdk.tribble.readers.SynchronousLineReader) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) LineIteratorImpl(htsjdk.tribble.readers.LineIteratorImpl) VCFHeader(htsjdk.variant.vcf.VCFHeader) PrintWriter(java.io.PrintWriter) VcfIteratorImpl(com.github.lindenb.jvarkit.util.vcf.VcfIteratorImpl) ByteArrayOutputStream(java.io.ByteArrayOutputStream) AbstractVCFCodec(htsjdk.variant.vcf.AbstractVCFCodec) IOException(java.io.IOException) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) FileInputStream(java.io.FileInputStream) SequenceInputStream(java.io.SequenceInputStream) ByteArrayInputStream(java.io.ByteArrayInputStream) FileOutputStream(java.io.FileOutputStream) File(java.io.File)

Example 40 with VcfIterator

use of com.github.lindenb.jvarkit.util.vcf.VcfIterator in project jvarkit by lindenb.

the class FixVcfMissingGenotypes method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VcfIterator in, final VariantContextWriter out) {
    final Set<String> bamFiles = IOUtils.unrollFiles(bamList);
    final Map<String, List<SamReader>> sample2bam = new HashMap<>(bamFiles.size());
    try {
        final VCFHeader header = in.getHeader();
        for (final String bamFile : bamFiles) {
            LOG.info("Reading header for " + bamFile);
            final SamReader reader = super.openSamReader(bamFile);
            if (!reader.hasIndex()) {
                LOG.error("No BAM index available for " + bamFile);
                return -1;
            }
            final SAMFileHeader samHeader = reader.getFileHeader();
            for (final SAMReadGroupRecord g : samHeader.getReadGroups()) {
                if (g.getSample() == null)
                    continue;
                final String sample = g.getSample();
                if (StringUtil.isBlank(sample))
                    continue;
                List<SamReader> readers = sample2bam.get(sample);
                if (readers == null) {
                    readers = new ArrayList<>();
                    sample2bam.put(sample, readers);
                }
                readers.add(reader);
            }
        }
        final VCFHeader h2 = new VCFHeader(header);
        if (h2.getFormatHeaderLine(VCFConstants.DEPTH_KEY) == null) {
            h2.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.DEPTH_KEY));
        }
        if (h2.getFormatHeaderLine(VCFConstants.GENOTYPE_KEY) == null) {
            h2.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_KEY));
        }
        if (h2.getFormatHeaderLine(VCFConstants.GENOTYPE_FILTER_KEY) == null) {
            h2.addMetaDataLine(VCFStandardHeaderLines.getFormatLine(VCFConstants.GENOTYPE_FILTER_KEY));
        }
        h2.addMetaDataLine(new VCFFormatHeaderLine(this.fixedTag, 1, VCFHeaderLineType.Integer, "Genotype was set as homozygous (min depth =" + this.minDepth + ")"));
        final SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(header);
        this.recalculator.setHeader(h2);
        out.writeHeader(h2);
        while (in.hasNext()) {
            final VariantContext ctx = progress.watch(in.next());
            boolean somethingWasChanged = false;
            final List<Genotype> genotypes = new ArrayList<>(ctx.getNSamples());
            for (int i = 0; i < ctx.getNSamples(); ++i) {
                Genotype genotype = ctx.getGenotype(i);
                final String sample = genotype.getSampleName();
                if (!genotype.isCalled() || (!genotype.hasDP() && this.fixDP)) {
                    List<SamReader> samReaders = sample2bam.get(sample);
                    if (samReaders == null)
                        samReaders = Collections.emptyList();
                    final int depth = fetchDP(ctx, sample, samReaders);
                    if (genotype.isCalled() && !genotype.hasDP()) {
                        genotype = new GenotypeBuilder(genotype).DP(depth).make();
                        somethingWasChanged = true;
                    } else // genotype was not called
                    {
                        if (depth >= this.minDepth) {
                            final List<Allele> homozygous = new ArrayList<>(2);
                            homozygous.add(ctx.getReference());
                            homozygous.add(ctx.getReference());
                            final GenotypeBuilder gb = new GenotypeBuilder(genotype);
                            gb.alleles(homozygous);
                            gb.attribute(this.fixedTag, 1);
                            gb.DP(depth);
                            if (!StringUtil.isBlank(this.fixedGenotypesAreFiltered))
                                gb.filter(this.fixedGenotypesAreFiltered);
                            somethingWasChanged = true;
                            genotype = gb.make();
                        } else if (// cannot fix but we can update DP
                        !genotype.hasDP()) {
                            genotype = new GenotypeBuilder(genotype).DP(depth).make();
                            somethingWasChanged = true;
                        }
                    }
                }
                genotypes.add(genotype);
            }
            if (somethingWasChanged) {
                final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
                vcb.genotypes(genotypes);
                out.add(this.recalculator.apply(vcb.make()));
            } else {
                out.add(ctx);
            }
        }
        progress.finish();
        return 0;
    } catch (final Exception err) {
        LOG.error(err);
        return -1;
    } finally {
        sample2bam.values().stream().flatMap(L -> L.stream()).forEach(R -> CloserUtil.close(R));
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) Cigar(htsjdk.samtools.Cigar) Allele(htsjdk.variant.variantcontext.Allele) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFStandardHeaderLines(htsjdk.variant.vcf.VCFStandardHeaderLines) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) CigarElement(htsjdk.samtools.CigarElement) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) SAMFileHeader(htsjdk.samtools.SAMFileHeader) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) StringUtil(htsjdk.samtools.util.StringUtil) Map(java.util.Map) IOUtils(com.github.lindenb.jvarkit.io.IOUtils) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) SAMRecordIterator(htsjdk.samtools.SAMRecordIterator) VcfIterator(com.github.lindenb.jvarkit.util.vcf.VcfIterator) Logger(com.github.lindenb.jvarkit.util.log.Logger) Set(java.util.Set) SamFilterParser(com.github.lindenb.jvarkit.util.bio.samfilter.SamFilterParser) SamReader(htsjdk.samtools.SamReader) SAMRecord(htsjdk.samtools.SAMRecord) File(java.io.File) SamRecordFilter(htsjdk.samtools.filter.SamRecordFilter) List(java.util.List) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) VariantAttributesRecalculator(com.github.lindenb.jvarkit.util.vcf.VariantAttributesRecalculator) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VariantContext(htsjdk.variant.variantcontext.VariantContext) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) HashMap(java.util.HashMap) SAMReadGroupRecord(htsjdk.samtools.SAMReadGroupRecord) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) SamReader(htsjdk.samtools.SamReader) ArrayList(java.util.ArrayList) List(java.util.List) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFFormatHeaderLine(htsjdk.variant.vcf.VCFFormatHeaderLine) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) Genotype(htsjdk.variant.variantcontext.Genotype) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) SAMFileHeader(htsjdk.samtools.SAMFileHeader)

Aggregations

VcfIterator (com.github.lindenb.jvarkit.util.vcf.VcfIterator)55 VariantContext (htsjdk.variant.variantcontext.VariantContext)39 VCFHeader (htsjdk.variant.vcf.VCFHeader)35 SAMSequenceDictionaryProgress (com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress)30 ArrayList (java.util.ArrayList)28 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)26 IOException (java.io.IOException)24 File (java.io.File)22 HashSet (java.util.HashSet)19 List (java.util.List)19 Genotype (htsjdk.variant.variantcontext.Genotype)18 Parameter (com.beust.jcommander.Parameter)17 Launcher (com.github.lindenb.jvarkit.util.jcommander.Launcher)17 Program (com.github.lindenb.jvarkit.util.jcommander.Program)17 Logger (com.github.lindenb.jvarkit.util.log.Logger)17 Set (java.util.Set)17 Allele (htsjdk.variant.variantcontext.Allele)16 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)15 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)15 JvarkitException (com.github.lindenb.jvarkit.lang.JvarkitException)14