Search in sources :

Example 6 with VCFIterator

use of htsjdk.variant.vcf.VCFIterator in project jvarkit by lindenb.

the class VcfGnomadOld method doVcfToVcf.

@Override
protected int doVcfToVcf(final String inputName, final VCFIterator iter, final VariantContextWriter out) {
    final VCFHeader h0 = iter.getHeader();
    if (!SequenceDictionaryUtils.isGRCh37(h0)) {
        LOG.warn("Input is NOT GRCh37 ?");
    // can be lift over
    }
    final ProgressFactory.Watcher<VariantContext> progress = ProgressFactory.newInstance().dictionary(h0).logger(LOG).build();
    final VCFHeader h2 = new VCFHeader(h0);
    final Set<String> all_filters_from_gnomad = new HashSet<>();
    final List<InfoField> infoFields = new ArrayList<>();
    for (final OmeType ome : OmeType.values()) {
        final ManifestEntry entry = this.manifestEntries.stream().filter(M -> M.omeType.equals(ome)).findFirst().orElse(null);
        if (entry == null)
            continue;
        entry.open();
        final VCFHeader header = entry.getHeader();
        this.gnomadVersion = header.getInfoHeaderLines().stream().anyMatch(V -> V.getID().equals("non_neuro_AC_nfe_male")) ? GnomadVersion.v2_1 : GnomadVersion.v2_0;
        LOG.debug("identified as gnomad version " + this.gnomadVersion);
        if (!StringUtil.isBlank(this.filteredInGnomadFilterPrefix)) {
            for (final VCFFilterHeaderLine fh : header.getFilterLines()) {
                if (fh.getID().equals(VCFConstants.PASSES_FILTERS_v4))
                    continue;
                final String fid = this.filteredInGnomadFilterPrefix + "_" + ome.name().toUpperCase() + "_" + fh.getID();
                final VCFFilterHeaderLine fh2 = new VCFFilterHeaderLine(fid, "[gnomad-" + ome.name() + "]" + fh.getDescription());
                h2.addMetaDataLine(fh2);
                all_filters_from_gnomad.add(fid);
            }
        }
        final Predicate<VCFInfoHeaderLine> acceptInfoTag;
        if (StringUtil.isBlank(this.excludePatternStr)) {
            acceptInfoTag = T -> true;
        } else {
            final Pattern pat = Pattern.compile(this.excludePatternStr);
            acceptInfoTag = T -> !pat.matcher(T.getID()).find();
        }
        switch(this.gnomadVersion) {
            case v2_0:
                header.getInfoHeaderLines().stream().filter(acceptInfoTag).filter(FH -> FH.getID().equals("AC") || FH.getID().equals("AN") || FH.getID().equals("AF") || FH.getID().startsWith("AC_") || FH.getID().startsWith("AN_") || FH.getID().startsWith("AF_")).map(FH -> new InfoField(FH, ome)).forEach(FH -> infoFields.add(FH));
                break;
            case v2_1:
                header.getInfoHeaderLines().stream().filter(acceptInfoTag).filter(FH -> !FH.getID().contains("MEDIAN")).filter(FH -> FH.getID().contains("AC") || FH.getID().contains("AN") || FH.getID().contains("AF")).map(FH -> new InfoField(FH, ome)).forEach(FH -> infoFields.add(FH));
                break;
            default:
                throw new IllegalStateException("TODO " + this.gnomadVersion);
        }
        entry.close();
    }
    for (final InfoField f : infoFields) h2.addMetaDataLine(f.geOutputHeaderLine());
    if (!StringUtil.isBlank(this.inGnomadFilterName)) {
        h2.addMetaDataLine(new VCFFilterHeaderLine(this.inGnomadFilterName, "Variant CHROM/POS/REF was found in gnomad"));
    }
    if (!StringUtil.isBlank(this.overlapGnomadFilterName)) {
        h2.addMetaDataLine(new VCFFilterHeaderLine(this.overlapGnomadFilterName, "Gnomad Variant was found overlapping the variant, not necessarily at the same CHROM/POS"));
    }
    JVarkitVersion.getInstance().addMetaData(this, h2);
    out.writeHeader(h2);
    final ManifestEntry[] om2manifest = new ManifestEntry[] { null, null };
    while (iter.hasNext()) {
        final VariantContext ctx = progress.apply(iter.next());
        final Set<String> filters = new HashSet<>(ctx.getFilters());
        final VariantContextBuilder vcb = new VariantContextBuilder(ctx);
        for (final InfoField f : infoFields) vcb.rmAttribute(f.getOutputTag());
        if (!StringUtil.isBlank(this.inGnomadFilterName)) {
            filters.remove(this.inGnomadFilterName);
        }
        filters.removeAll(all_filters_from_gnomad);
        if (!StringUtil.isBlank(this.overlapGnomadFilterName)) {
            filters.remove(this.overlapGnomadFilterName);
        }
        if (this.skipFiltered && ctx.isFiltered()) {
            vcb.filters(filters);
            out.add(vcb.make());
            continue;
        }
        if (ctx.getContig().equals("MT") || ctx.getContig().equals("chrM") || ctx.getContig().contains("_")) {
            vcb.filters(filters);
            out.add(vcb.make());
            continue;
        }
        final List<Allele> alternateAlleles = ctx.getAlternateAlleles();
        String newid = null;
        boolean set_filter_ctx_is_in_gnomad = false;
        boolean found_gnomad_overlapping_variant = false;
        for (int omeIndex = 0; omeIndex < 2; omeIndex++) {
            final OmeType omeType = omeIndex == 0 ? OmeType.exome : OmeType.genome;
            if (this.useGenomeOnly && !omeType.equals(OmeType.genome))
                continue;
            if (om2manifest[omeIndex] != null && !om2manifest[omeIndex].acceptContig(ctx.getContig())) {
                om2manifest[omeIndex].close();
                om2manifest[omeIndex] = null;
            }
            if (om2manifest[omeIndex] == null) {
                om2manifest[omeIndex] = this.manifestEntries.stream().filter(M -> M.omeType.equals(omeType) && M.acceptContig(ctx.getContig())).findFirst().orElse(null);
                if (om2manifest[omeIndex] == null)
                    continue;
                LOG.debug("Opening " + om2manifest[omeIndex].uri);
                om2manifest[omeIndex].open();
            }
            // variant overlapping 'ctx'
            final List<VariantContext> overlappingVariants = om2manifest[omeIndex].findOverlapping(ctx);
            if (!overlappingVariants.isEmpty())
                found_gnomad_overlapping_variant = true;
            final List<VariantContext> gnomadVariants = overlappingVariants.stream().filter(V -> V.getStart() == ctx.getStart() && V.getReference().equals(ctx.getReference())).collect(Collectors.toList());
            if (!gnomadVariants.isEmpty()) {
                set_filter_ctx_is_in_gnomad = true;
                // set new id ?
                if (newid == null) {
                    newid = gnomadVariants.stream().filter(V -> V.hasID()).map(V -> V.getID()).findFirst().orElse(null);
                }
                // add FILTER(s)
                if (!StringUtil.isBlank(this.filteredInGnomadFilterPrefix)) {
                    filters.addAll(gnomadVariants.stream().filter(V -> V.isFiltered()).flatMap(V -> V.getFilters().stream()).filter(F -> !F.equals(VCFConstants.PASSES_FILTERS_v4)).map(F -> this.filteredInGnomadFilterPrefix + "_" + omeType.name().toUpperCase() + "_" + F).collect(Collectors.toList()));
                }
            }
            // loop over each field
            for (final InfoField infoField : infoFields) {
                if (!infoField.ome.equals(omeType))
                    continue;
                if (gnomadVariants.stream().noneMatch(V -> V.hasAttribute(infoField.original.getID())))
                    continue;
                if (infoField.getOutputLineCount().equals(VCFHeaderLineCount.INTEGER) && infoField.original.getCountType().equals(VCFHeaderLineCount.INTEGER)) {
                    final Set<Object> set = gnomadVariants.stream().filter(V -> V.hasAttribute(infoField.original.getID())).map(V -> infoField.parse(V.getAttribute(infoField.original.getID()))).collect(Collectors.toSet());
                    if (set.isEmpty())
                        continue;
                    if (set.size() == 1) {
                        vcb.attribute(infoField.getOutputTag(), set.iterator().next());
                    } else if (this.ignore_info_found_twice) {
                        LOG.warn("Found more than one value (" + set + ") for " + infoField + " " + ctx.getContig() + ":" + ctx.getStart());
                        vcb.attribute(infoField.getOutputTag(), set.iterator().next());
                    } else {
                        LOG.error("Found more than one value (" + set + ") for " + infoField + " " + ctx.getContig() + ":" + ctx.getStart() + ". Are you using a lift-overed gnomad ? Use option --ignore-error0 to skip this error");
                        progress.close();
                        return -1;
                    }
                } else if (infoField.original.getCountType() == VCFHeaderLineCount.A) {
                    final Object[] numbers = new Object[alternateAlleles.size()];
                    Arrays.fill(numbers, infoField.getDefault());
                    for (int x = 0; x < alternateAlleles.size(); ++x) {
                        final Allele alt = alternateAlleles.get(x);
                        if (alt.equals(Allele.SPAN_DEL))
                            continue;
                        for (final VariantContext gv : gnomadVariants) {
                            int idx = gv.getAlternateAlleles().indexOf(alt);
                            if (idx == -1)
                                continue;
                            if (!gv.hasAttribute(infoField.original.getID()))
                                continue;
                            final List<Object> array = gv.getAttributeAsList(infoField.original.getID());
                            if (idx >= array.size())
                                continue;
                            numbers[x] = infoField.parse(array.get(idx));
                        }
                    }
                    vcb.attribute(infoField.getOutputTag(), numbers);
                } else if (infoField.original.isFixedCount() && infoField.original.getCount() == 1) {
                    final Object[] numbers = new Object[alternateAlleles.size()];
                    Arrays.fill(numbers, infoField.getDefault());
                    boolean something_found_for_an_allele = false;
                    for (int x = 0; x < alternateAlleles.size(); ++x) {
                        final Allele alt = alternateAlleles.get(x);
                        if (alt.equals(Allele.SPAN_DEL))
                            continue;
                        for (final VariantContext gv : gnomadVariants) {
                            int idx = gv.getAlternateAlleles().indexOf(alt);
                            if (idx == -1)
                                continue;
                            if (!gv.hasAttribute(infoField.original.getID()))
                                continue;
                            something_found_for_an_allele = true;
                            final Object val = infoField.parse(gv.getAttribute(infoField.original.getID()));
                            if (val == null)
                                continue;
                            numbers[x] = val;
                        }
                    }
                    if (something_found_for_an_allele) {
                        vcb.attribute(infoField.getOutputTag(), numbers);
                    }
                } else {
                    LOG.error("not handled " + infoField);
                    progress.close();
                    return -1;
                }
            }
        }
        if (set_filter_ctx_is_in_gnomad && !StringUtil.isBlank(this.inGnomadFilterName)) {
            filters.add(this.inGnomadFilterName);
        }
        if (found_gnomad_overlapping_variant && !StringUtil.isBlank(this.overlapGnomadFilterName)) {
            filters.add(this.overlapGnomadFilterName);
        }
        if (!this.doNotUpdateId && !ctx.hasID() && !StringUtil.isBlank(newid))
            vcb.id(newid);
        vcb.filters(filters);
        out.add(vcb.make());
    }
    out.close();
    progress.close();
    for (int omeIndex = 0; omeIndex < 2; omeIndex++) CloserUtil.close(om2manifest[omeIndex]);
    return 0;
}
Also used : CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) Interval(htsjdk.samtools.util.Interval) DistanceParser(com.github.lindenb.jvarkit.util.bio.DistanceParser) StringUtil(htsjdk.samtools.util.StringUtil) RuntimeIOException(htsjdk.samtools.util.RuntimeIOException) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) VCFReaderFactory(com.github.lindenb.jvarkit.variant.vcf.VCFReaderFactory) CloserUtil(htsjdk.samtools.util.CloserUtil) VCFConstants(htsjdk.variant.vcf.VCFConstants) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) Files(java.nio.file.Files) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) VCFReader(htsjdk.variant.vcf.VCFReader) Set(java.util.Set) IOException(java.io.IOException) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) File(java.io.File) Objects(java.util.Objects) List(java.util.List) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) Closeable(java.io.Closeable) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) Pattern(java.util.regex.Pattern) VCFHeaderLineCount(htsjdk.variant.vcf.VCFHeaderLineCount) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) ProgressFactory(com.github.lindenb.jvarkit.util.log.ProgressFactory) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) ArrayList(java.util.ArrayList) List(java.util.List) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) HashSet(java.util.HashSet) Pattern(java.util.regex.Pattern) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) Allele(htsjdk.variant.variantcontext.Allele) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder)

Example 7 with VCFIterator

use of htsjdk.variant.vcf.VCFIterator in project jvarkit by lindenb.

the class IranomeScrapper method doWork.

@Override
public int doWork(final List<String> args) {
    TabixReader tabixReader = null;
    VariantContextWriter failedWriter = null;
    try {
        /**
         * create http client
         */
        // createDefault();
        this.httpClient = HttpClients.createSystem();
        try (VCFIterator iter = super.openVCFIterator(oneFileOrNull(args))) {
            if (this.tabixDatabase != null) {
                tabixReader = new TabixReader(this.tabixDatabase);
            }
            if (this.failedVcfPath != null) {
                failedWriter = VCFUtils.createVariantContextWriterToPath(failedVcfPath);
                failedWriter.writeHeader(iter.getHeader());
            }
            try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
                pw.print("CHROM\tPOS\tREF\tALT\tAC\tAN\tAF");
                for (int i = 0; i < populations.size(); i++) {
                    pw.print("\tAC_" + populations.get(i).toUpperCase().replace(' ', '_'));
                }
                for (int i = 0; i < populations.size(); i++) {
                    pw.print("\tAN_" + populations.get(i).toUpperCase().replace(' ', '_'));
                }
                pw.println("\tdate");
                while (iter.hasNext()) {
                    final VariantContext ctx = iter.next();
                    if (!AcidNucleics.isATGC(ctx.getReference()))
                        continue;
                    final String iranomeCtg = toIranomeContig(ctx.getContig());
                    for (final Allele alt : ctx.getAlternateAlleles()) {
                        if (!AcidNucleics.isATGC(alt))
                            continue;
                        if (tabixReader != null) {
                            boolean found = false;
                            final TabixReader.Iterator iter2 = tabixReader.query(iranomeCtg, ctx.getStart(), ctx.getEnd());
                            for (; ; ) {
                                final String line2 = iter2.next();
                                if (line2 == null)
                                    break;
                                final String[] tokens = CharSplitter.TAB.split(line2);
                                if (tokens.length < 4)
                                    throw new JvarkitException.TokenErrors(4, tokens);
                                if (tokens[0].equals(iranomeCtg) && Integer.parseInt(tokens[1]) == ctx.getStart() && tokens[2].equals(ctx.getReference().getDisplayString()) && tokens[3].equals(alt.getDisplayString())) {
                                    LOG.info("already in database " + String.join("-", Arrays.asList(tokens).subList(0, 4)));
                                    found = true;
                                    break;
                                }
                                if (found)
                                    continue;
                            }
                        // end tabix terator
                        }
                        // end tabix!=null
                        final Map<String, Object> hash = this.getIranomeVariant(ctx, alt);
                        if (hash == null) {
                            if (failedWriter != null) {
                                failedWriter.add(ctx);
                            }
                            continue;
                        }
                        for (int i = 0; i < fields.size(); i++) {
                            pw.print(i > 0 ? "\t" : "");
                            pw.print(hash.get(fields.get(i)));
                        }
                        for (int i = 0; i < populations.size(); i++) {
                            pw.print("\t" + hash.get("AC_" + populations.get(i)));
                        }
                        for (int i = 0; i < populations.size(); i++) {
                            pw.print("\t" + hash.get("AN_" + populations.get(i)));
                        }
                        pw.print("\t");
                        pw.println(StringUtils.now());
                    }
                }
                pw.flush();
            }
        // end print
        }
        // end iter
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
        CloserUtil.close(tabixReader);
        CloserUtil.close(failedWriter);
        CloserUtil.close(this.httpClient);
    }
}
Also used : TabixReader(htsjdk.tribble.readers.TabixReader) VariantContext(htsjdk.variant.variantcontext.VariantContext) JvarkitException(com.github.lindenb.jvarkit.lang.JvarkitException) Allele(htsjdk.variant.variantcontext.Allele) JsonObject(com.google.gson.JsonObject) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) VCFIterator(htsjdk.variant.vcf.VCFIterator) PrintWriter(java.io.PrintWriter)

Example 8 with VCFIterator

use of htsjdk.variant.vcf.VCFIterator in project jvarkit by lindenb.

the class VcfFisherCombinatorics method doWork.

@Override
public int doWork(final List<String> args) {
    try {
        final Pedigree pedigree = new PedigreeParser().parse(this.pedigreeFile);
        final IntervalTreeMap<List<GeneInfo>> geneInfoMap = new IntervalTreeMap<>();
        final Predicate<Genotype> isGtCnv = G -> G.isHet() || G.isHomVar();
        try (VCFIterator vcfIter = super.openVCFIterator(super.oneFileOrNull(args))) {
            final VCFHeader header = vcfIter.getHeader();
            final ContigNameConverter ctgConverter = ContigNameConverter.fromOneDictionary(SequenceDictionaryUtils.extractRequired(header));
            if (!header.hasGenotypingData()) {
                LOG.error("No Genotype data in " + header);
                return -1;
            }
            final Map<String, Integer> sample2index = header.getSampleNameToOffset();
            final List<Integer> casesIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isAffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
            LOG.info("cases N=" + casesIdx.size());
            if (casesIdx.isEmpty()) {
                LOG.error("No affected/cases sample in the input VCF");
                return -1;
            }
            final List<Integer> controlsIdx = pedigree.getSamplesInVcfHeader(header).filter(S -> S.isUnaffected()).map(S -> S.getId()).map(S -> sample2index.get(S)).sorted().collect(Collectors.toList());
            LOG.info("controls N=" + controlsIdx.size());
            if (controlsIdx.isEmpty()) {
                LOG.error("No unaffected/controls sample in the input VCF");
                return -1;
            }
            final Predicate<VariantContext> acceptCtx = V -> {
                if (discard_bnd && V.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))
                    return false;
                if (discard_filtered && V.isFiltered())
                    return false;
                if (max_af < 1.0) {
                    final Iterator<Integer> it = Stream.concat(controlsIdx.stream(), casesIdx.stream()).iterator();
                    int ac = 0;
                    int an = 0;
                    while (it.hasNext()) {
                        switch(V.getGenotype(it.next()).getType()) {
                            case HOM_VAR:
                                ac += 2;
                                an += 2;
                                break;
                            case HOM_REF:
                                ac += 0;
                                an += 2;
                                break;
                            case HET:
                                ac += 1;
                                an += 2;
                                break;
                            default:
                                break;
                        }
                    }
                    if (an == 0)
                        return false;
                    if (ac / (double) an > max_af)
                        return false;
                }
                return true;
            };
            try (BedLineReader br = new BedLineReader(this.bedPath)) {
                while (br.hasNext()) {
                    final BedLine bed = br.next();
                    final String ctg = ctgConverter.apply(bed.getContig());
                    if (StringUtil.isBlank(ctg))
                        continue;
                    final GeneInfo geneInfo = new GeneInfo();
                    geneInfo.casesflags = new BitSet(casesIdx.size());
                    geneInfo.controlsflags = new BitSet(controlsIdx.size());
                    geneInfo.contig = ctg;
                    geneInfo.name = bed.getOrDefault(3, "undefined");
                    geneInfo.parts.add(new SimpleInterval(bed).renameContig(ctg));
                    final Interval key = new Interval(geneInfo);
                    List<GeneInfo> L = geneInfoMap.get(key);
                    if (L == null) {
                        L = new ArrayList<>();
                        geneInfoMap.put(key, L);
                    }
                    L.add(geneInfo);
                }
            }
            if (geneInfoMap.isEmpty()) {
                LOG.error("no gene found in " + this.bedPath);
                return -1;
            }
            LOG.info("reading variants...");
            while (vcfIter.hasNext()) {
                final VariantContext ctx = vcfIter.next();
                if (!acceptCtx.test(ctx))
                    continue;
                for (List<GeneInfo> gil : geneInfoMap.getOverlapping(ctx)) {
                    for (GeneInfo gi : gil) {
                        for (int y = 0; y < casesIdx.size(); ++y) {
                            final Genotype gt = ctx.getGenotype(casesIdx.get(y));
                            if (!isGtCnv.test(gt))
                                continue;
                            gi.casesflags.set(y);
                        }
                        for (int y = 0; y < controlsIdx.size(); ++y) {
                            final Genotype gt = ctx.getGenotype(controlsIdx.get(y));
                            if (!isGtCnv.test(gt))
                                continue;
                            gi.controlsflags.set(y);
                        }
                    }
                }
            }
            /* remove Genes without variant , count sample per gene*/
            for (List<GeneInfo> gil : geneInfoMap.values()) {
                gil.removeIf(GI -> GI.casesflags.nextSetBit(0) == -1 && GI.controlsflags.nextSetBit(0) == -1);
            }
            // end remove
            /* load bed file */
            final List<GeneInfo> geneList = geneInfoMap.values().stream().filter(L -> !L.isEmpty()).flatMap(L -> L.stream()).sorted().collect(Collectors.toList());
            if (geneList.isEmpty()) {
                LOG.error("no gene found in " + this.bedPath);
                return -1;
            }
            LOG.info("N Genes=" + geneList.size());
            Solution best = null;
            for (int i = 2; i < Math.min(this.max_num_genes, geneList.size()); i++) {
                LOG.info("sarting loop from " + i);
                best = recursion(geneList, casesIdx, controlsIdx, new ArrayList<>(), i, best);
            }
            try (PrintWriter pw = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
                pw.println(best);
                pw.flush();
            }
        }
        return 0;
    } catch (final Throwable err) {
        LOG.error(err);
        return -1;
    } finally {
    }
}
Also used : Genotype(htsjdk.variant.variantcontext.Genotype) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) ArrayList(java.util.ArrayList) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) Interval(htsjdk.samtools.util.Interval) StringUtil(htsjdk.samtools.util.StringUtil) FisherExactTest(com.github.lindenb.jvarkit.math.stats.FisherExactTest) PedigreeParser(com.github.lindenb.jvarkit.pedigree.PedigreeParser) Map(java.util.Map) Launcher(com.github.lindenb.jvarkit.util.jcommander.Launcher) Path(java.nio.file.Path) VCFConstants(htsjdk.variant.vcf.VCFConstants) PrintWriter(java.io.PrintWriter) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) SequenceDictionaryUtils(com.github.lindenb.jvarkit.util.bio.SequenceDictionaryUtils) Locatable(htsjdk.samtools.util.Locatable) Iterator(java.util.Iterator) Pedigree(com.github.lindenb.jvarkit.pedigree.Pedigree) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Predicate(java.util.function.Predicate) Logger(com.github.lindenb.jvarkit.util.log.Logger) Collectors(java.util.stream.Collectors) List(java.util.List) Stream(java.util.stream.Stream) FractionConverter(com.github.lindenb.jvarkit.jcommander.converter.FractionConverter) VariantContext(htsjdk.variant.variantcontext.VariantContext) BitSet(java.util.BitSet) Collections(java.util.Collections) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) BedLineReader(com.github.lindenb.jvarkit.bed.BedLineReader) PedigreeParser(com.github.lindenb.jvarkit.pedigree.PedigreeParser) VCFIterator(htsjdk.variant.vcf.VCFIterator) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) List(java.util.List) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval) VCFHeader(htsjdk.variant.vcf.VCFHeader) ContigNameConverter(com.github.lindenb.jvarkit.util.bio.fasta.ContigNameConverter) VCFIterator(htsjdk.variant.vcf.VCFIterator) PrintWriter(java.io.PrintWriter) BitSet(java.util.BitSet) Genotype(htsjdk.variant.variantcontext.Genotype) BedLine(com.github.lindenb.jvarkit.util.bio.bed.BedLine) Pedigree(com.github.lindenb.jvarkit.pedigree.Pedigree) IntervalTreeMap(htsjdk.samtools.util.IntervalTreeMap) Interval(htsjdk.samtools.util.Interval) SimpleInterval(com.github.lindenb.jvarkit.samtools.util.SimpleInterval)

Example 9 with VCFIterator

use of htsjdk.variant.vcf.VCFIterator in project jvarkit by lindenb.

the class VcfPostProcessSV method doVcfToVcf.

@Override
protected int doVcfToVcf(String inputName, VCFIterator iterin, VariantContextWriter out) {
    SortingCollection<VariantContext> sorter1 = null;
    SortingCollection<VariantContext> sorter2 = null;
    try {
        final Counter<String> counter = new Counter<>();
        if (StringUtils.isBlank(this.keys)) {
            LOG.error("empty --keys");
            return -1;
        }
        if (StringUtils.isBlank(this.newAlternateSymbol)) {
            LOG.error("empty --allele");
            return -1;
        }
        final VCFHeader header = iterin.getHeader();
        if (header.getInfoHeaderLine(VCFConstants.SVTYPE) == null) {
            header.addMetaDataLine(new VCFInfoHeaderLine(VCFConstants.SVTYPE, 1, VCFHeaderLineType.String, "Type of structural variant"));
        }
        if (header.getInfoHeaderLine("SVLEN") == null) {
            header.addMetaDataLine(new VCFInfoHeaderLine("SVLEN", 1, VCFHeaderLineType.Integer, "Difference in length between REF and ALT alleles"));
        }
        if (header.getInfoHeaderLine(VCFConstants.END_KEY) == null) {
            header.addMetaDataLine(new VCFInfoHeaderLine(VCFConstants.END_KEY, 1, VCFHeaderLineType.Integer, "End position of the variant described in this record"));
        }
        final String mateID = Arrays.stream(CharSplitter.COMMA.split(this.keys)).map(S -> S.trim()).filter(S -> !StringUtils.isBlank(S)).map(S -> header.getInfoHeaderLine(S)).filter(H -> H != null && H.getType().equals(VCFHeaderLineType.String)).map(H -> H.getID()).findFirst().orElse(null);
        if (StringUtils.isBlank(mateID)) {
            LOG.error("Cannot find INFO for mate-id (was " + this.keys + ")");
            return -1;
        }
        LOG.info("Mate key is '" + mateID + "'. Reading input.");
        final Comparator<VariantContext> comparator = (A, B) -> A.getAttributeAsString(mateID, "").compareTo(B.getAttributeAsString(mateID, ""));
        sorter1 = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(header, true), comparator, this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        sorter2 = SortingCollection.newInstance(VariantContext.class, new VCFRecordCodec(header, true), header.getVCFRecordComparator(), this.writingSortingCollection.getMaxRecordsInRam(), this.writingSortingCollection.getTmpPaths());
        while (iterin.hasNext()) {
            final VariantContext ctx = iterin.next();
            if (ctx.hasAttribute(mateID) && (!ctx.hasAttribute(VCFConstants.SVTYPE) || ctx.getAttributeAsString(VCFConstants.SVTYPE, "").equals("BND"))) {
                sorter1.add(ctx);
            } else {
                counter.incr("Not a BND variant");
                sorter2.add(ctx);
            }
        }
        sorter1.doneAdding();
        sorter1.setDestructiveIteration(true);
        CloseableIterator<VariantContext> iter2 = sorter1.iterator();
        PeekIterator<VariantContext> peek = new PeekIterator<>(iter2);
        while (peek.hasNext()) {
            final VariantContext first = peek.next();
            final List<VariantContext> variants = new ArrayList<>();
            variants.add(first);
            while (peek.hasNext()) {
                final VariantContext second = peek.peek();
                if (first.hasID() && first.getID().equals(second.getAttributeAsString(mateID, ""))) {
                    variants.add(peek.next());
                } else if (second.hasID() && second.getID().equals(first.getAttributeAsString(mateID, ""))) {
                    variants.add(peek.next());
                } else if (first.getAttributeAsString(mateID, "").equals(second.getAttributeAsString(mateID, ""))) {
                    variants.add(peek.next());
                } else {
                    break;
                }
            }
            if (variants.size() != 2) {
                counter.incr("Not a pair of Mate (" + variants.size() + ")", variants.size());
                for (final VariantContext ctx : variants) {
                    sorter2.add(ctx);
                }
                continue;
            }
            Collections.sort(variants, (A, B) -> Integer.compare(A.getStart(), B.getStart()));
            final VariantContext ctx1 = variants.get(0);
            final VariantContext ctx2 = variants.get(1);
            if (!(ctx1.getNAlleles() == 2 && ctx1.getNAlleles() == 2)) {
                counter.incr("expected two alleles.", 2L);
                sorter2.add(ctx1);
                sorter2.add(ctx2);
                continue;
            }
            final Breakend brk1 = Breakend.parse(ctx1.getAlleles().get(1)).orElse(null);
            final Breakend brk2 = Breakend.parse(ctx2.getAlleles().get(1)).orElse(null);
            if (brk1 == null || brk2 == null) {
                counter.incr("ALT is not breakend.", 2L);
                sorter2.add(ctx1);
                sorter2.add(ctx2);
            }
            /* should be the same breakend 
				difference can be large use CIPOS / CIEND ?
				if( !ctx1.getContig().equals(brk2.getContig()) ||
					!ctx2.getContig().equals(brk1.getContig()) ||
					Math.abs(ctx1.getStart()-brk2.getStart())>1  ||
					Math.abs(ctx2.getStart()-brk1.getStart())>1) {
					counter.incr("mate is not the same position.",2L);
					sorter2.add(ctx1);
					sorter2.add(ctx2);
					continue;
					}
				*/
            final VariantContextBuilder vcb1 = new VariantContextBuilder(ctx1);
            final VariantContextBuilder vcb2 = new VariantContextBuilder(ctx2);
            if (!ctx1.contigsMatch(ctx2)) {
                vcb1.attribute(VCFConstants.SVTYPE, "TRANSLOC");
                vcb2.attribute(VCFConstants.SVTYPE, "TRANSLOC");
                sorter2.add(vcb1.make());
                sorter2.add(vcb2.make());
                counter.incr("translocation.", 2L);
                continue;
            }
            final Allele ctx1_alt = ctx1.getAlleles().get(1);
            final List<Allele> alleles = Arrays.asList(ctx1.getReference(), Allele.create("<" + this.newAlternateSymbol.trim() + ">", false));
            vcb1.attribute(VCFConstants.SVTYPE, this.newAlternateSymbol.trim());
            vcb1.alleles(alleles);
            final int ctx_end = Math.max(ctx1.getEnd(), ctx2.getEnd());
            vcb1.stop(ctx_end);
            vcb1.attribute("END", ctx_end);
            vcb1.attribute("SVLEN", CoordMath.getLength(ctx1.getStart(), ctx_end));
            vcb1.rmAttribute(mateID);
            vcb1.genotypes(ctx1.getGenotypes().stream().map(GT -> new GenotypeBuilder(GT).alleles(GT.getAlleles().stream().map(A -> A.equals(ctx1_alt) ? alleles.get(1) : A).collect(Collectors.toList())).make()).collect(Collectors.toList()));
            sorter2.add(vcb1.make());
            counter.incr("Two BND variants converted.", 2L);
        }
        iter2.close();
        sorter1.cleanup();
        sorter1 = null;
        sorter2.doneAdding();
        sorter2.setDestructiveIteration(true);
        iter2 = sorter2.iterator();
        final VCFHeader header2 = new VCFHeader(header);
        JVarkitVersion.getInstance().addMetaData(this, header2);
        out.writeHeader(header2);
        while (iter2.hasNext()) {
            out.add(iter2.next());
        }
        iter2.close();
        sorter2.cleanup();
        sorter2 = null;
        if (!disable_summary) {
            LOG.info("SUMMARY COUNT");
            for (final String key : counter.keySet()) {
                LOG.info("\t" + key + " : " + counter.count(key));
            }
        }
        return 0;
    } catch (final Throwable err) {
        err.printStackTrace();
        LOG.error(err);
        return -1;
    } finally {
        if (sorter1 != null)
            sorter1.cleanup();
        if (sorter2 != null)
            sorter2.cleanup();
    }
}
Also used : CloseableIterator(htsjdk.samtools.util.CloseableIterator) Allele(htsjdk.variant.variantcontext.Allele) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Arrays(java.util.Arrays) CharSplitter(com.github.lindenb.jvarkit.lang.CharSplitter) Program(com.github.lindenb.jvarkit.util.jcommander.Program) VCFIterator(htsjdk.variant.vcf.VCFIterator) Parameter(com.beust.jcommander.Parameter) VCFHeader(htsjdk.variant.vcf.VCFHeader) ParametersDelegate(com.beust.jcommander.ParametersDelegate) ArrayList(java.util.ArrayList) Breakend(com.github.lindenb.jvarkit.variant.variantcontext.Breakend) VCFConstants(htsjdk.variant.vcf.VCFConstants) Counter(com.github.lindenb.jvarkit.util.Counter) SortingCollection(htsjdk.samtools.util.SortingCollection) VCFHeaderLineType(htsjdk.variant.vcf.VCFHeaderLineType) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) Logger(com.github.lindenb.jvarkit.util.log.Logger) JVarkitVersion(com.github.lindenb.jvarkit.util.JVarkitVersion) Collectors(java.util.stream.Collectors) List(java.util.List) StringUtils(com.github.lindenb.jvarkit.lang.StringUtils) VariantContextWriter(htsjdk.variant.variantcontext.writer.VariantContextWriter) CoordMath(htsjdk.samtools.util.CoordMath) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VariantContext(htsjdk.variant.variantcontext.VariantContext) OnePassVcfLauncher(com.github.lindenb.jvarkit.jcommander.OnePassVcfLauncher) Comparator(java.util.Comparator) PeekIterator(htsjdk.samtools.util.PeekIterator) Collections(java.util.Collections) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) PeekIterator(htsjdk.samtools.util.PeekIterator) ArrayList(java.util.ArrayList) VariantContext(htsjdk.variant.variantcontext.VariantContext) Breakend(com.github.lindenb.jvarkit.variant.variantcontext.Breakend) GenotypeBuilder(htsjdk.variant.variantcontext.GenotypeBuilder) VCFInfoHeaderLine(htsjdk.variant.vcf.VCFInfoHeaderLine) VCFRecordCodec(htsjdk.variant.vcf.VCFRecordCodec) Allele(htsjdk.variant.variantcontext.Allele) Counter(com.github.lindenb.jvarkit.util.Counter) VariantContextBuilder(htsjdk.variant.variantcontext.VariantContextBuilder) VCFHeader(htsjdk.variant.vcf.VCFHeader)

Example 10 with VCFIterator

use of htsjdk.variant.vcf.VCFIterator in project jvarkit by lindenb.

the class VcfToSql method read.

private void read(File filename) throws IOException {
    /* insert ATGC */
    this.alleleTable.insert(outputWriter, null, "A");
    this.alleleTable.insert(outputWriter, null, "C");
    this.alleleTable.insert(outputWriter, null, "G");
    this.alleleTable.insert(outputWriter, null, "T");
    /* insert this sample */
    this.vcfFileTable.insert(outputWriter, null, filename);
    final SelectStmt vcffile_id = new SelectStmt(this.vcfFileTable);
    final Map<String, SelectStmt> sample2sampleid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> filter2filterid = new HashMap<String, SelectStmt>();
    final Map<String, SelectStmt> chrom2chromId = new HashMap<String, SelectStmt>();
    final VCFIterator r = VCFUtils.createVCFIteratorFromFile(filename);
    final VCFHeader header = r.getHeader();
    /* parse samples */
    for (final String sampleName : header.getSampleNamesInOrder()) {
        this.sampleTable.insert(outputWriter, null, sampleName);
        SelectStmt sample_id = new SelectStmt(this.sampleTable, "name", sampleName);
        sample2sampleid.put(sampleName, sample_id);
        this.sample2fileTable.insert(outputWriter, null, vcffile_id, sample_id);
    }
    /* parse filters */
    for (final VCFFilterHeaderLine filter : header.getFilterLines()) {
        this.filterTable.insert(outputWriter, null, vcffile_id, filter.getID(), filter.getValue());
        filter2filterid.put(filter.getID(), new SelectStmt(this.filterTable, "name", filter.getID()));
    }
    filter2filterid.put(VCFConstants.PASSES_FILTERS_v4, new SelectStmt(this.filterTable, "name", VCFConstants.PASSES_FILTERS_v4));
    final SAMSequenceDictionary dict = header.getSequenceDictionary();
    if (dict == null) {
        throw new RuntimeException("dictionary missing in VCF");
    }
    /* parse sequence dict */
    for (final SAMSequenceRecord ssr : dict.getSequences()) {
        this.chromosomeTable.insert(outputWriter, null, vcffile_id, ssr.getSequenceName(), ssr.getSequenceLength());
        chrom2chromId.put(ssr.getSequenceName(), new SelectStmt(this.chromosomeTable, "name", ssr.getSequenceName()));
    }
    VepPredictionParser vepPredictionParser = new VepPredictionParserFactory(header).get();
    SAMSequenceDictionaryProgress progress = new SAMSequenceDictionaryProgress(dict);
    int nVariants = 0;
    while (r.hasNext()) {
        if (this.outputWriter.checkError())
            break;
        VariantContext var = progress.watch(r.next());
        ++nVariants;
        /* insert ref allele */
        this.alleleTable.insert(outputWriter, null, var.getReference().getBaseString());
        /* insert variant */
        this.variantTable.insert(outputWriter, null, vcffile_id, nVariants, chrom2chromId.get(var.getContig()), var.getStart(), (var.hasID() ? var.getID() : null), new SelectStmt(this.alleleTable, "bases", var.getReference().getBaseString()), (var.hasLog10PError() ? var.getPhredScaledQual() : null));
        SelectStmt variant_id = new SelectStmt(variantTable);
        /* insert alternate alleles */
        for (Allele alt : var.getAlternateAlleles()) {
            /* insert alt allele */
            this.alleleTable.insert(outputWriter, null, alt.getBaseString());
            this.variant2altTable.insert(outputWriter, null, variant_id, new SelectStmt(this.alleleTable, "bases", alt.getBaseString()));
        }
        /* insert filters */
        for (final String filter : var.getFilters()) {
            if (filter2filterid.get(filter) == null) {
                throw new IOException("VCF Error: filter " + filter + " is not defined in the VCF header.");
            }
            this.variant2filters.insert(outputWriter, null, variant_id, filter2filterid.get(filter));
        }
        if (!this.ignore_info) {
            for (final VepPrediction pred : vepPredictionParser.getPredictions(var)) {
            /*
					vepPrediction.insert(
							outputWriter,
							null,
							variant_id,
							pred.getEnsemblGene(),
							pred.getEnsemblTranscript(),
							pred.getEnsemblProtein(),
							pred.getSymbol()
							);
					SelectStmt pred_id = new SelectStmt(vepPrediction);
			
					for(SequenceOntologyTree.Term t: pred.getSOTerms())
						{
						String term=t.getAcn().replace(':', '_');
						soTermTable.insert(
								outputWriter,
								null,
								term,
								t.getAcn()
								);//for bioportal compatibility
						SelectStmt term_id = new SelectStmt(soTermTable,"acn",term);
						
						vepPrediction2so.insert(
							outputWriter,
							null,
							pred_id,
							term_id
							);
						}
					*/
            }
        }
        /* insert genotypes */
        for (final String sampleName : sample2sampleid.keySet()) {
            final Genotype g = var.getGenotype(sampleName);
            if (!g.isAvailable() || g.isNoCall())
                continue;
            genotypeTable.insert(outputWriter, null, variant_id, sample2sampleid.get(sampleName), g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(0).getBaseString()) : null, g.isCalled() ? new SelectStmt(this.alleleTable, "bases", g.getAllele(1).getBaseString()) : null, g.hasDP() ? g.getDP() : null, g.hasGQ() ? g.getGQ() : null);
        }
    }
    r.close();
}
Also used : VepPrediction(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser.VepPrediction) SAMSequenceDictionaryProgress(com.github.lindenb.jvarkit.util.picard.SAMSequenceDictionaryProgress) HashMap(java.util.HashMap) VariantContext(htsjdk.variant.variantcontext.VariantContext) Genotype(htsjdk.variant.variantcontext.Genotype) SAMSequenceRecord(htsjdk.samtools.SAMSequenceRecord) IOException(java.io.IOException) SAMSequenceDictionary(htsjdk.samtools.SAMSequenceDictionary) Allele(htsjdk.variant.variantcontext.Allele) VepPredictionParser(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParser) VCFFilterHeaderLine(htsjdk.variant.vcf.VCFFilterHeaderLine) VCFHeader(htsjdk.variant.vcf.VCFHeader) VCFIterator(htsjdk.variant.vcf.VCFIterator) VepPredictionParserFactory(com.github.lindenb.jvarkit.util.vcf.predictions.VepPredictionParserFactory)

Aggregations

VCFIterator (htsjdk.variant.vcf.VCFIterator)98 VariantContext (htsjdk.variant.variantcontext.VariantContext)83 VCFHeader (htsjdk.variant.vcf.VCFHeader)76 VariantContextWriter (htsjdk.variant.variantcontext.writer.VariantContextWriter)62 Parameter (com.beust.jcommander.Parameter)56 Program (com.github.lindenb.jvarkit.util.jcommander.Program)56 Logger (com.github.lindenb.jvarkit.util.log.Logger)56 ArrayList (java.util.ArrayList)52 List (java.util.List)52 Set (java.util.Set)49 VariantContextBuilder (htsjdk.variant.variantcontext.VariantContextBuilder)46 Collectors (java.util.stream.Collectors)45 HashSet (java.util.HashSet)42 Path (java.nio.file.Path)39 IOException (java.io.IOException)37 Allele (htsjdk.variant.variantcontext.Allele)36 CloserUtil (htsjdk.samtools.util.CloserUtil)35 Genotype (htsjdk.variant.variantcontext.Genotype)33 SAMSequenceDictionary (htsjdk.samtools.SAMSequenceDictionary)32 VCFInfoHeaderLine (htsjdk.variant.vcf.VCFInfoHeaderLine)32